diff --git a/.buildkite/pyproject.toml b/.buildkite/pyproject.toml
index 083bb795caf5..d5cad1c73c6f 100644
--- a/.buildkite/pyproject.toml
+++ b/.buildkite/pyproject.toml
@@ -6,11 +6,6 @@
 
 [tool.ruff]
 line-length = 88
-exclude = [
-    # External file, leaving license intact
-    "examples/other/fp8/quantizer/quantize.py",
-    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
-]
 
 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]
diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml
index 2118cf4595eb..b3c27e2c99c2 100644
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
@@ -64,7 +64,7 @@ steps:
       - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
     plugins:
       - docker-login#v3.0.0:
-          username: vllm
+          username: vllmbot
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"
diff --git a/.buildkite/scripts/hardware_ci/run-hpu-test.sh b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
index 95b6ac37f185..5efac3ddf469 100644
--- a/.buildkite/scripts/hardware_ci/run-hpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-hpu-test.sh
@@ -10,15 +10,17 @@ docker build -t hpu-test-env -f docker/Dockerfile.hpu .
 # Setup cleanup
 # certain versions of HPU software stack have a bug that can
 # override the exit code of the script, so we need to use
-# separate remove_docker_container and remove_docker_container_and_exit
+# separate remove_docker_containers and remove_docker_containers_and_exit
 # functions, while other platforms only need one remove_docker_container
 # function.
 EXITCODE=1
-remove_docker_container() { docker rm -f hpu-test || true; }
-remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
-trap remove_docker_container_and_exit EXIT
-remove_docker_container
+remove_docker_containers() { docker rm -f hpu-test || true; docker rm -f hpu-test-tp2 || true; }
+remove_docker_containers_and_exit() { remove_docker_containers; exit $EXITCODE; }
+trap remove_docker_containers_and_exit EXIT
+remove_docker_containers
 
 # Run the image and launch offline inference
 docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+docker run --runtime=habana --name=hpu-test-tp2 --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --tensor-parallel-size 2
+
 EXITCODE=$?
diff --git a/.buildkite/scripts/hardware_ci/run-neuron-test.sh b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
index ec6a080eb499..3d294ea5f8a7 100644
--- a/.buildkite/scripts/hardware_ci/run-neuron-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-neuron-test.sh
@@ -11,13 +11,14 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
 HF_CACHE="$(realpath ~)/huggingface"
 mkdir -p "${HF_CACHE}"
 HF_MOUNT="/root/.cache/huggingface"
+HF_TOKEN=$(aws secretsmanager get-secret-value  --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
 
 NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
 mkdir -p "${NEURON_COMPILE_CACHE_URL}"
 NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 
 # Try building the docker image
-aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
 
 # prune old image and containers to save disk space, and only once a day
 # by using a timestamp file in tmp.
@@ -47,8 +48,16 @@ trap remove_docker_container EXIT
 docker run --rm -it --device=/dev/neuron0 --network bridge \
        -v "${HF_CACHE}:${HF_MOUNT}" \
        -e "HF_HOME=${HF_MOUNT}" \
+       -e "HF_TOKEN=${HF_TOKEN}" \
        -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
        -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
        --name "${container_name}" \
        ${image_name} \
-       /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
+       /bin/bash -c "
+            python3 /workspace/vllm/examples/offline_inference/neuron.py;
+            python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
+            for f in /workspace/vllm/tests/neuron/2_core/*.py; do
+                echo 'Running test file: '$f;
+                python3 -m pytest \$f -v --capture=tee-sys;
+            done
+       "
\ No newline at end of file
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 21f0c4c39a0c..728bf2a9b1cc 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -33,14 +33,13 @@ steps:
 
 - label: Documentation Build # 2min
   mirror_hardwares: [amdexperimental]
-  working_dir: "/vllm-workspace/test_docs/docs"
+  working_dir: "/vllm-workspace/test_docs"
   fast_check: true
   no_gpu: True
   commands:
-  - pip install -r ../../requirements/docs.txt
-  - SPHINXOPTS=\"-W\" make html
-  # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/api/vllm/vllm.sampling_params.html
+  - pip install -r ../requirements/docs.txt
+  # TODO: add `--strict` once warnings in docstrings are fixed
+  - mkdocs build
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
   mirror_hardwares: [amdexperimental]
@@ -59,6 +58,7 @@ steps:
   - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
+  - pytest -v -s test_outputs.py
   - pytest -v -s multimodal
   - pytest -v -s test_utils.py # Utils
   - pytest -v -s worker # Worker
@@ -128,7 +128,7 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py  --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
   - pytest -v -s entrypoints/test_chat_utils.py
   - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
@@ -141,6 +141,7 @@ steps:
   - vllm/core/
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
+  - tests/distributed/test_events
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile/test_basic_correctness
   - examples/offline_inference/rlhf.py
@@ -159,6 +160,7 @@ steps:
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests
@@ -224,6 +226,7 @@ steps:
     - pytest -v -s v1/test_serial_utils.py
     - pytest -v -s v1/test_utils.py
     - pytest -v -s v1/test_oracle.py
+    - pytest -v -s v1/test_metrics_reader.py
     # TODO: accuracy does not match, whether setting
     # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
     - pytest -v -s v1/e2e
@@ -248,7 +251,7 @@ steps:
     - python3 offline_inference/vision_language.py --seed 0
     - python3 offline_inference/vision_language_embedding.py --seed 0
     - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
     - python3 offline_inference/encoder_decoder.py
     - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
     - python3 offline_inference/basic/classify.py
@@ -320,6 +323,7 @@ steps:
     - pytest -v -s compile/test_fusion.py
     - pytest -v -s compile/test_silu_mul_quant_fusion.py
     - pytest -v -s compile/test_sequence_parallelism.py
+    - pytest -v -s compile/test_async_tp.py
 
 - label: PyTorch Fullgraph Smoke Test # 9min
   mirror_hardwares: [amdexperimental, amdproduction]
@@ -397,10 +401,12 @@ steps:
   source_file_dependencies:
   - vllm/model_executor/model_loader
   - tests/tensorizer_loader
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
   commands:
     - apt-get update && apt-get install -y curl libsodium23
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - pytest -v -s tensorizer_loader
+    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
 
 - label: Benchmarks # 9min
   mirror_hardwares: [amdexperimental, amdproduction]
@@ -479,10 +485,7 @@ steps:
     - pytest -v -s models/test_registry.py
     - pytest -v -s models/test_utils.py
     - pytest -v -s models/test_vision.py
-    # V1 Test: https://github.com/vllm-project/vllm/issues/14531
-    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
-    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
-    - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
+    - pytest -v -s models/test_initialization.py
 
 - label: Language Models Test (Standard)
   mirror_hardwares: [amdexperimental]
@@ -496,16 +499,25 @@ steps:
     - pip freeze | grep -E 'torch'
     - pytest -v -s models/language -m core_model
 
-- label: Language Models Test (Extended)
+- label: Language Models Test (Extended Generation) # 1hr20min
   mirror_hardwares: [amdexperimental]
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/language
+  - tests/models/language/generation
   commands:
     # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
     - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
-    - pytest -v -s models/language -m 'not core_model'
+    - pytest -v -s models/language/generation -m 'not core_model'
+
+- label: Language Models Test (Extended Pooling)  # 36min
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/language/pooling
+  commands:
+    - pytest -v -s models/language/pooling -m 'not core_model'
 
 - label: Multi-Modal Models Test (Standard)
   mirror_hardwares: [amdexperimental]
diff --git a/.github/ISSUE_TEMPLATE/400-bug-report.yml b/.github/ISSUE_TEMPLATE/400-bug-report.yml
index 00b0f024c0da..f05be2ba8707 100644
--- a/.github/ISSUE_TEMPLATE/400-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug-report.yml
@@ -81,14 +81,14 @@ body:
     required: true
 - type: markdown
   attributes:
-    value: >
-      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
+    value: |
+      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
 
       - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
 
       - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
 
-      Thanks for contributing 🎉!
+      Thanks for reporting 🙏!
 - type: checkboxes
   id: askllm
   attributes:
diff --git a/.github/ISSUE_TEMPLATE/450-ci-failure.yml b/.github/ISSUE_TEMPLATE/450-ci-failure.yml
new file mode 100644
index 000000000000..7af0e0673a2f
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/450-ci-failure.yml
@@ -0,0 +1,69 @@
+name: 🧪 CI failure report
+description: Report a failing test.
+title: "[CI Failure]: "
+labels: ["ci-failure"]
+
+body:
+- type: markdown
+  attributes:
+    value: >
+      #### Include the name of the failing Buildkite step and test file in the title.
+- type: input
+  attributes:
+    label: Name of failing test
+    description: |
+      Paste in the fully-qualified name of the failing test from the logs.
+    placeholder: |
+      `path/to/test_file.py::test_name[params]`
+  validations:
+    required: true
+- type: checkboxes
+  attributes:
+    label: Basic information
+    description: Select all items that apply to the failing test.
+    options:
+      - label: Flaky test
+      - label: Can reproduce locally
+      - label: Caused by external libraries (e.g. bug in `transformers`)
+- type: textarea
+  attributes:
+    label: 🧪 Describe the failing test
+    description: |
+      Please provide a clear and concise description of the failing test.
+    placeholder: |
+      A clear and concise description of the failing test.
+  
+      ```
+      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
+      ```
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: 📝 History of failing test
+    description: |
+      Since when did the test start to fail?
+      You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
+
+      If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
+
+      - Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
+
+      - Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
+
+      - Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
+    placeholder: |
+      Approximate timeline and/or problematic PRs
+
+      A link to the Buildkite analytics of the failing test (if available)
+  validations:
+    required: true
+- type: textarea
+  attributes:
+    label: CC List.
+    description: >
+      The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
+- type: markdown
+  attributes:
+    value: >
+      Thanks for reporting 🙏!
diff --git a/.github/mergify.yml b/.github/mergify.yml
index ccfd571625b5..e595060c325a 100644
--- a/.github/mergify.yml
+++ b/.github/mergify.yml
@@ -58,7 +58,7 @@ pull_request_rules:
       - files~=^benchmarks/structured_schemas/
       - files=benchmarks/benchmark_serving_structured_output.py
       - files=benchmarks/run_structured_output_benchmark.sh
-      - files=docs/source/features/structured_outputs.md
+      - files=docs/features/structured_outputs.md
       - files=examples/offline_inference/structured_outputs.py
       - files=examples/online_serving/openai_chat_completion_structured_outputs.py
       - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
@@ -135,9 +135,7 @@ pull_request_rules:
       - files~=^tests/entrypoints/openai/tool_parsers/
       - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
       - files~=^vllm/entrypoints/openai/tool_parsers/
-      - files=docs/source/features/tool_calling.md
-      - files=docs/source/getting_started/examples/openai_chat_completion_client_with_tools.md
-      - files=docs/source/getting_started/examples/chat_with_tools.md
+      - files=docs/features/tool_calling.md
       - files~=^examples/tool_chat_*
       - files=examples/offline_inference/chat_with_tools.py
       - files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh
index 3246c6f9bc4b..8d65936fba1d 100755
--- a/.github/scripts/cleanup_pr_body.sh
+++ b/.github/scripts/cleanup_pr_body.sh
@@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
 
 # Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
 python3 - <<EOF
-import re
+import regex as re
 
 with open("${NEW}", "r") as file:
     content = file.read()
diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml
index 50fea0c43cb8..d5c6b8d43a6e 100644
--- a/.github/workflows/cleanup_pr_body.yml
+++ b/.github/workflows/cleanup_pr_body.yml
@@ -20,7 +20,12 @@ jobs:
         with:
           python-version: '3.12'
 
+      - name: Install Python dependencies
+        run: |
+          python3 -m pip install --upgrade pip
+          python3 -m pip install regex
+
       - name: Update PR description
         env:
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
+        run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
diff --git a/.gitignore b/.gitignore
index 2756c612b82f..e49d1d6ba619 100644
--- a/.gitignore
+++ b/.gitignore
@@ -77,11 +77,6 @@ instance/
 # Scrapy stuff:
 .scrapy
 
-# Sphinx documentation
-docs/_build/
-docs/source/getting_started/examples/
-docs/source/api/vllm
-
 # PyBuilder
 .pybuilder/
 target/
@@ -151,6 +146,7 @@ venv.bak/
 
 # mkdocs documentation
 /site
+docs/examples
 
 # mypy
 .mypy_cache/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index dd3d600e6a75..2a34c5117c94 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
   - id: ruff
     args: [--output-format, github, --fix]
   - id: ruff-format
-    files: ^(.buildkite|benchmarks)/.*
+    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/codespell-project/codespell
   rev: v2.4.1
   hooks:
@@ -39,6 +39,7 @@ repos:
   rev: v0.9.29
   hooks:
   - id: pymarkdown
+    exclude: '.*\.inc\.md'
     args: [fix]
 - repo: https://github.com/rhysd/actionlint
   rev: v1.7.7
@@ -127,6 +128,21 @@ repos:
     name: Update Dockerfile dependency graph
     entry: tools/update-dockerfile-graph.sh
     language: script
+  - id: enforce-import-regex-instead-of-re
+    name: Enforce import regex as re
+    entry: python tools/enforce_regex_import.py
+    language: python
+    types: [python]
+    pass_filenames: false
+    additional_dependencies: [regex]
+  # Forbid importing triton directly
+  - id: forbid-direct-triton-import
+    name: "Forbid direct 'import triton'"
+    entry: python tools/check_triton_import.py
+    language: python
+    types: [python]
+    pass_filenames: false
+    additional_dependencies: [regex]
   # Keep `suggestion` last
   - id: suggestion
     name: Suggestion
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 2781ec223b66..98c3be25f7e7 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -8,12 +8,8 @@ build:
   tools:
     python: "3.12"
 
-sphinx:
-  configuration: docs/source/conf.py
-  fail_on_warning: true
-
-# If using Sphinx, optionally build your docs in additional formats such as PDF
-formats: []
+mkdocs:
+  configuration: mkdocs.yaml
 
 # Optionally declare the Python requirements required to build your docs
 python:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 40977bd075d9..df7881bd0fb7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,9 +29,6 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
 #
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
 
-# Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
-
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
 
@@ -79,6 +76,15 @@ endif()
 #
 find_package(Torch REQUIRED)
 
+# Supported NVIDIA architectures.
+# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
+if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
+  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
+else()
+  set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
+endif()
+
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
@@ -227,6 +233,8 @@ endif()
 #
 
 set(VLLM_EXT_SRC
+  "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
+  "csrc/mamba/causal_conv1d/causal_conv1d.cu"
   "csrc/cache_kernels.cu"
   "csrc/attention/paged_attention_v1.cu"
   "csrc/attention/paged_attention_v2.cu"
@@ -282,8 +290,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   FetchContent_MakeAvailable(cutlass)
 
   list(APPEND VLLM_EXT_SRC
-    "csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
-    "csrc/mamba/causal_conv1d/causal_conv1d.cu"
     "csrc/quantization/aqlm/gemm_kernels.cu"
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/permute_cols.cu"
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6d46a6dca371..2947aad75ee5 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,3 +1,3 @@
 # Contributing to vLLM
 
-You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
+You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
diff --git a/README.md b/README.md
index 5b87ae838885..67f6b957ec55 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 <p align="center">
   <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
-    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
+    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
+    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-light.png" width=55%>
   </picture>
 </p>
 
@@ -58,7 +58,7 @@ vLLM is fast with:
 - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
-- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
+- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8.
 - Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
 - Speculative decoding
 - Chunked prefill
@@ -100,14 +100,14 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
 ## Contributing
 
 We welcome and value any contributions and collaborations.
-Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
+Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.
 
 ## Sponsors
 
 vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
 
 <!-- Note: Please sort them in alphabetical order. -->
-<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
+<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
 Cash Donations:
 - a16z
 - Dropbox
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 4a8ab895e18e..ecab570bb31c 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -146,10 +146,9 @@ python3 vllm/benchmarks/benchmark_serving.py \
 
 ``` bash
 VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
-    --speculative-model "[ngram]" \
     --ngram_prompt_lookup_min 2 \
     --ngram-prompt-lookup-max 5 \
-    --num_speculative_tokens 5
+    --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}'
 ```
 
 ``` bash
@@ -274,10 +273,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \
     --output-len=100 \
     --num-prompts=2048 \
     --async-engine \
-    --speculative-model="[ngram]" \
     --ngram_prompt_lookup_min=2 \
     --ngram-prompt-lookup-max=5 \
-    --num_speculative_tokens=5
+    --speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}'
 ```
 
 ```
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 800d426c6d11..88616e1108c5 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -194,6 +194,11 @@ async def async_request_deepspeed_mii(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
 ) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(("completions", "profile")), (
+        "OpenAI Completions API URL must end with 'completions' or 'profile'."
+    )
+
     async with aiohttp.ClientSession(
         trust_env=True, timeout=AIOHTTP_TIMEOUT
     ) as session:
@@ -204,6 +209,8 @@ async def async_request_deepspeed_mii(
             "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
             "top_p": 1.0,
         }
+        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
 
@@ -215,7 +222,7 @@ async def async_request_deepspeed_mii(
         st = time.perf_counter()
         try:
             async with session.post(
-                url=request_func_input.api_url, json=payload
+                url=api_url, json=payload, headers=headers
             ) as response:
                 if response.status == 200:
                     parsed_resp = await response.json()
diff --git a/benchmarks/benchmark_dataset.py b/benchmarks/benchmark_dataset.py
index d8f48644cc00..5513a5f78f1c 100644
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -35,6 +35,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
+from vllm.multimodal.image import convert_image_mode
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 
 logger = logging.getLogger(__name__)
@@ -257,7 +258,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
     if isinstance(image, dict) and "bytes" in image:
         image = Image.open(BytesIO(image["bytes"]))
     if isinstance(image, Image.Image):
-        image = image.convert("RGB")
+        image = convert_image_mode(image, "RGB")
         with io.BytesIO() as image_data:
             image.save(image_data, format="JPEG")
             image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index d5aaceeb8c9c..84759c5c354d 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -189,5 +189,8 @@ def run_to_completion(profile_dir: Optional[str] = None):
     )
 
     parser = EngineArgs.add_cli_args(parser)
+    # V1 enables prefix caching by default which skews the latency
+    # numbers. We need to disable prefix caching by default.
+    parser.set_defaults(enable_prefix_caching=False)
     args = parser.parse_args()
     main(args)
diff --git a/benchmarks/benchmark_serving_structured_output.py b/benchmarks/benchmark_serving_structured_output.py
index 5088c805f53e..6a50f47d3951 100644
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -672,7 +672,7 @@ def process_one_metric(
 def evaluate(ret, args):
     def _eval_correctness_json(expected, actual):
         # extract json string from string using regex
-        import re
+        import regex as re
 
         actual = actual.replace("\n", "").replace(" ", "").strip()
         try:
@@ -687,7 +687,7 @@ def _eval_correctness_choice(expected, actual):
         return actual in args.choice
 
     def _eval_correctness_regex(expected, actual):
-        import re
+        import regex as re
 
         return re.match(args.regex, actual) is not None
 
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index 2d62f7bdcd23..99808d8058b1 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -84,7 +84,10 @@ def main(
     if version == "v2":
         if current_platform.is_rocm():
             global PARTITION_SIZE
-            PARTITION_SIZE = 1024 if not args.custom_paged_attn else PARTITION_SIZE_ROCM
+            if not args.custom_paged_attn and not current_platform.is_navi():
+                PARTITION_SIZE = 1024
+            else:
+                PARTITION_SIZE = PARTITION_SIZE_ROCM
         num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
         tmp_output = torch.empty(
             size=(num_seqs, num_query_heads, num_partitions, head_size),
@@ -159,6 +162,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
                         scale,
                         block_tables,
                         seq_lens,
+                        None,
                         block_size,
                         max_seq_len,
                         alibi_slopes,
diff --git a/benchmarks/kernels/graph_machete_bench.py b/benchmarks/kernels/graph_machete_bench.py
index ab364a84d6cb..0c86e4072957 100644
--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@@ -2,11 +2,11 @@
 
 import math
 import pickle
-import re
 from collections import defaultdict
 
 import matplotlib.pyplot as plt
 import pandas as pd
+import regex as re
 import seaborn as sns
 from torch.utils.benchmark import Measurement as TMeasurement
 
diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml
index f825cb203269..65b1e09a247e 100644
--- a/benchmarks/pyproject.toml
+++ b/benchmarks/pyproject.toml
@@ -6,11 +6,6 @@
 
 [tool.ruff]
 line-length = 88
-exclude = [
-    # External file, leaving license intact
-    "examples/other/fp8/quantizer/quantize.py",
-    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
-]
 
 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]
diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp
index cf67847b45ba..9a613ba588dd 100644
--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -19,6 +19,7 @@ namespace vec_op {
 #define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...)        \
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
   AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)     \
   AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)
 
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
diff --git a/csrc/cutlass_extensions/common.hpp b/csrc/cutlass_extensions/common.hpp
index 0877da52435e..195872e8edd3 100644
--- a/csrc/cutlass_extensions/common.hpp
+++ b/csrc/cutlass_extensions/common.hpp
@@ -15,15 +15,6 @@
                 cutlassGetStatusString(error));     \
   }
 
-/**
- * Panic wrapper for unwinding CUDA runtime errors
- */
-#define CUDA_CHECK(status)                                        \
-  {                                                               \
-    cudaError_t error = status;                                   \
-    TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \
-  }
-
 inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
   int max_shared_mem_per_block_opt_in = 0;
   cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu
index 98daf1a1b8e6..f62d08c17c6d 100644
--- a/csrc/mamba/causal_conv1d/causal_conv1d.cu
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -13,6 +13,10 @@
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_store.cuh>
 
+#ifdef USE_ROCM
+    namespace cub = hipcub;
+#endif
+
 #include "static_switch.h"
 
 
@@ -501,15 +505,12 @@ void causal_conv1d_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
         auto kernel = &causal_conv1d_fwd_kernel<Ktraits>;
 
         if (kSmemSize >= 48 * 1024) {
-            #ifndef USE_ROCM
-            C10_CUDA_CHECK(cudaFuncSetAttribute(
-                kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
-            #else
-            // There is a slight signature discrepancy in HIP and CUDA "FuncSetAttribute" function.
             C10_CUDA_CHECK(cudaFuncSetAttribute(
                 (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
+            // The warning below describes AMD GPUs only, so keep it out of CUDA builds.
+            #ifdef USE_ROCM
             std::cerr << "Warning (causal_conv1d fwd launch): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl;
             #endif
         }
         kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
 
diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
index bd0a34119c82..0c9df925bdbf 100644
--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@@ -321,7 +321,7 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
             auto kernel = &selective_scan_fwd_kernel<Ktraits>;
             if (kSmemSize >= 48 * 1024) {
                 C10_CUDA_CHECK(cudaFuncSetAttribute(
-                    kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
+                    (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
             }
             kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
             C10_CUDA_KERNEL_LAUNCH_CHECK();
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index 0bae119a7c46..8fda434d452f 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -28,4 +28,6 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                              torch::Tensor num_tokens_post_pad, int64_t top_k,
                              int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
                              int64_t BLOCK_SIZE_K, int64_t bit);
-#endif
\ No newline at end of file
+#endif
+
+bool moe_permute_unpermute_supported();
\ No newline at end of file
diff --git a/csrc/moe/moe_permute_unpermute_op.cu b/csrc/moe/moe_permute_unpermute_op.cu
index 76d5f0eab021..9a7465261abf 100644
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -5,6 +5,9 @@
 #include "permute_unpermute_kernels/dispatch.h"
 #include "core/registration.h"
 
+// moe_permute kernels require at least CUDA 12.0
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
+
 void moe_permute(
     const torch::Tensor& input,                      // [n_token, hidden]
     const torch::Tensor& topk_weights,               //[n_token, topk]
@@ -127,7 +130,45 @@ void moe_unpermute(
   });
 }
 
+#else
+
+void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
+                 torch::Tensor& topk_ids,
+                 const torch::Tensor& token_expert_indicies,
+                 const std::optional<torch::Tensor>& expert_map,
+                 int64_t n_expert, int64_t n_local_expert, int64_t topk,
+                 const std::optional<int64_t>& align_block_size,
+                 torch::Tensor& permuted_input,
+                 torch::Tensor& expert_first_token_offset,
+                 torch::Tensor& src_row_id2dst_row_id_map,
+                 torch::Tensor& m_indices) {  // fallback stub: always fails
+  TORCH_CHECK(false, "moe_permute is not supported on CUDA < 12.0");
+}  // compiled only when the CUDA_VERSION >= 12000 guard above is not met
+
+void moe_unpermute(const torch::Tensor& input,
+                   const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
+                   const torch::Tensor& token_expert_indicies,
+                   const std::optional<torch::Tensor>& expert_map,
+                   int64_t n_expert, int64_t n_local_expert, int64_t topk,
+                   const std::optional<int64_t>& align_block_size,
+                   torch::Tensor& permuted_input,
+                   torch::Tensor& expert_first_token_offset,
+                   torch::Tensor& src_row_id2dst_row_id_map,
+                   torch::Tensor& m_indices) {  // fallback stub: always fails
+  TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
+}  // NOTE(review): parameter list duplicates the moe_permute stub; confirm it matches the registered moe_unpermute schema, since m.impl binds this function
+
+#endif
+
+bool moe_permute_unpermute_supported() {  // compile-time capability probe
+#if !defined(CUDA_VERSION) || (CUDA_VERSION < 12000)
+  return false;  // the permute/unpermute kernels were compiled out
+#else
+  return true;  // built with CUDA_VERSION >= 12000
+#endif
+}
+
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
   m.impl("moe_permute", &moe_permute);
   m.impl("moe_unpermute", &moe_unpermute);
-}
\ No newline at end of file
+}
diff --git a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
index aa353d0f0437..de2c153882d9 100644
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
@@ -1,6 +1,9 @@
 
 #include "moe_permute_unpermute_kernel.h"
 
+// moe_permute kernels require at least CUDA 12.0
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
+
 // CubKeyValueSorter definition begin
 CubKeyValueSorter::CubKeyValueSorter()
     : num_experts_(0), num_bits_(sizeof(int) * 8) {}
@@ -131,9 +134,6 @@ __global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size,
                                        int num_experts) {
   auto tidx = threadIdx.x;
   auto bidx = blockIdx.x;
-  auto lidx = tidx & 31;
-  auto widx = tidx >> 5;
-  auto warp_count = (blockDim.x + 31) >> 5;
   auto offset = bidx * blockDim.x;
   auto bound = min(offset + blockDim.x, size);
   extern __shared__ int smem_expert_map[];
@@ -226,4 +226,6 @@ void getMIndices(int64_t* expert_first_token_offset,
         expert_first_token_offset, align_expert_first_token_offset, m_indices,
         num_local_expert, align_block_size);
   }
-}
\ No newline at end of file
+}
+
+#endif
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index 810026d034c0..7d35ec79ead4 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -10,7 +10,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
 
   // Calculate the result of moe by summing up the partial results
   // from all selected experts.
-  m.def("moe_sum(Tensor! input, Tensor output) -> ()");
+  m.def("moe_sum(Tensor input, Tensor! output) -> ()");
   m.impl("moe_sum", torch::kCUDA, &moe_sum);
 
   // Aligning the number of tokens to be processed by each expert such
@@ -77,7 +77,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
       "expert_first_token_offset, int n_expert, int n_local_expert,int "
       "topk, Tensor! hidden_states)->()");
-  // conditionally compiled so impl registration is in source file
+
+  m.def("moe_permute_unpermute_supported() -> bool");
+  m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
 
 #endif
 }
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
index 3c258ddce61e..e9b408fbf2ee 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -123,7 +123,7 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) {
 }
 
 bool cutlass_group_gemm_supported(int64_t cuda_device_capability) {
-  // CUTLASS groped FP8 kernels need at least CUDA 12.3
+  // CUTLASS grouped FP8 kernels need at least CUDA 12.3
   // and SM90 (Hopper)
 
 #if defined CUDA_VERSION
diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu
index c5f0b113421c..76bbec3c6774 100644
--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
@@ -30,6 +30,14 @@
   #define __HIP__GFX9__
 #endif
 
+#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__))
+  #define __HIP__GFX11__
+#endif
+
+#if defined(__HIPCC__) && (defined(__gfx1200__) || defined(__gfx1201__))
+  #define __HIP__GFX12__
+#endif
+
 #if defined(NDEBUG)
   #undef NDEBUG
   #include <assert.h>
@@ -43,7 +51,7 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
 
-#if defined(__HIP__GFX9__)  // TODO: Add NAVI support
+#if defined(__HIP__GFX9__)
 
   #define GCN_MFMA_INSTR1 __builtin_amdgcn_mfma_f32_16x16x4f32
   #define GCN_MFMA_INSTR __builtin_amdgcn_mfma_f32_4x4x4f16
@@ -1486,190 +1494,1689 @@ __launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
   }
 }
 
-#else  // !defined(__HIP__GFX9__) TODO: Add NAVI support
+#elif defined(__HIP__GFX11__)
 
-// clang-format off
-template <typename scalar_t, typename cache_t,
-          vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
-          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED,
-          int GQA_RATIO>
-__global__
-__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel(
-    const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
-    const cache_t* __restrict__ k_cache,    // [num_blocks, num_kv_heads, head_size/x, block_size, x]
-    const cache_t* __restrict__ v_cache,    // [num_blocks, num_kv_heads, head_size, block_size]
-    const int num_kv_heads,
-    const float scale,
-    const int* __restrict__ block_tables,    // [num_seqs, max_num_blocks_per_seq]
-    const int* __restrict__ context_lens,    // [num_seqs]
-    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
-    const int max_num_blocks_per_seq,
-    const float* __restrict__ alibi_slopes,  // [num_heads]
-    const int q_stride,
-    const int kv_block_stride,
-    const int kv_head_stride,
-    float* __restrict__ exp_sums,             // [num_seqs, num_heads, max_num_partitions]
-    float* __restrict__ max_logits,           // [num_seqs, num_heads, max_num_partitions]
-    scalar_t* __restrict__ out,               // [num_seqs, num_heads, max_num_partitions, head_size]
-    OUTT* __restrict__ final_out,             // [num_seqs, num_heads, head_size]
-    int max_ctx_blocks, const float* k_scale, const float* v_scale,
-    const float* __restrict__ fp8_out_scale_ptr) {
-  UNREACHABLE_CODE
+using floatx8 = __attribute__((__vector_size__(8 * sizeof(float)))) float;
+
+using bit16_t = uint16_t;  // raw 16-bit storage element
+using bit16x4 = __attribute__((__vector_size__(4 * sizeof(uint16_t)))) uint16_t;
+typedef bit16x4 _B16x4;
+
+using bit16x8 = __attribute__((__vector_size__(8 * sizeof(uint16_t)))) uint16_t;
+union b16x8_u {
+  bit16x8 u16x8;  // all 8 elements as one vector
+  _B16x4 xy[2];   // the same storage viewed as two 4-element halves
+};
+typedef b16x8_u _B16x8;
+
+using bit16x16 =
+    __attribute__((__vector_size__(16 * sizeof(uint16_t)))) uint16_t;
+union b16x16_u {
+  bit16x16 u16x16;  // all 16 elements as one vector
+  _B16x8 xy[2];     // the same storage viewed as two 8-element halves
+};
+typedef b16x16_u _B16x16;
+
+using _B8x8 = uint2;
+using bit8_t = uint8_t;  // raw 8-bit storage element
+
+typedef struct _B8x16 {
+  _B8x8 xy[2];  // two _B8x8 halves
+} _B8x16;
+
+template <typename T, int absz, int cbid, int blgp>  // absz/cbid/blgp unused
+__device__ __forceinline__ floatx8 gcn_wmma16x16x16_instr(const bit16x16& inpA,
+                                                          const bit16x16& inpB,
+                                                          const floatx8& inpC) {
+  if constexpr (std::is_same<T, _Float16>::value) {  // T picks the builtin
+    return __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(inpA, inpB, inpC);
+  } else if constexpr (std::is_same<T, __hip_bfloat16>::value) {
+    return __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(inpA, inpB, inpC);
+  } else {
+    static_assert(false, "unsupported 16b dtype");  // any other T is rejected
+  }
+}
+
+template <typename T>  // T: _Float16 or __hip_bfloat16
+__device__ __forceinline__ float to_float(const T& inp) {
+  if constexpr (std::is_same_v<T, __hip_bfloat16>) {
+    return __bfloat162float(inp);
+  } else if constexpr (std::is_same_v<T, _Float16>) {
+    return static_cast<float>(inp);
+  } else {
+    static_assert(false, "unsupported 16b dtype");
+  }
+}
+
+template <typename T>  // T: _Float16 or __hip_bfloat16
+__device__ __forceinline__ T from_float(const float& inp) {
+  if constexpr (std::is_same_v<T, __hip_bfloat16>) {
+    return __float2bfloat16(inp);
+  } else if constexpr (std::is_same_v<T, _Float16>) {
+    return static_cast<_Float16>(inp);
+  } else {
+    static_assert(false, "unsupported 16b dtype");
+  }
+}
+
+template <typename T>  // T selects the 16-bit output format
+__device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) {
+  if constexpr (std::is_same<T, _Float16>::value) {
+    union h2cvt {
+      __half2 h2[4];  // four packed pairs, filled below
+      _B16x8 b16x8;   // same storage, read back as the return value
+    } u;
+    u.h2[0] = __float22half2_rn(make_float2(inp[0], inp[1]));  // pair k <- inp[2k], inp[2k+1]
+    u.h2[1] = __float22half2_rn(make_float2(inp[2], inp[3]));
+    u.h2[2] = __float22half2_rn(make_float2(inp[4], inp[5]));
+    u.h2[3] = __float22half2_rn(make_float2(inp[6], inp[7]));
+    return u.b16x8;
+  } else if constexpr (std::is_same<T, __hip_bfloat16>::value) {
+    union b2cvt {
+      __hip_bfloat162 b2[4];  // four packed pairs, filled below
+      _B16x8 b16x8;           // same storage, read back as the return value
+    } u;
+
+    u.b2[0] = __float22bfloat162_rn(make_float2(inp[0], inp[1]));  // pair k <- inp[2k], inp[2k+1]
+    u.b2[1] = __float22bfloat162_rn(make_float2(inp[2], inp[3]));
+    u.b2[2] = __float22bfloat162_rn(make_float2(inp[4], inp[5]));
+    u.b2[3] = __float22bfloat162_rn(make_float2(inp[6], inp[7]));
+
+    return u.b16x8;
+  } else {
+    static_assert(false, "unsupported 16b dtype");  // any other T is rejected
+  }
 }
 
+// clang-format off
 template <typename scalar_t, typename cache_t,
           vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
-          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED,
-          int GQA_RATIO>
+          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO>
 __global__
-__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
-    const scalar_t* __restrict__ q,          // [num_seqs, num_heads, head_size]
-    const cache_t* __restrict__ k_cache,     // [num_blocks, num_kv_heads, head_size/x, block_size, x]
-    const cache_t* __restrict__ v_cache,     // [num_blocks, num_kv_heads, head_size, block_size]
-    const int num_kv_heads,
-    const float scale,
-    const int* __restrict__ block_tables,    // [num_seqs, max_num_blocks_per_seq]
-    const int* __restrict__ context_lens,    // [num_seqs]
-    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
+__launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
+    const scalar_t* __restrict__ q,       // [num_seqs, num_heads, head_size]
+    const cache_t* __restrict__ k_cache,  // [num_blocks, num_kv_heads,
+                                          // head_size/x, block_size, x]
+    const cache_t* __restrict__ v_cache,  // [num_blocks, num_kv_heads,
+                                          // head_size, block_size]
+    const int num_kv_heads, const float scale,
+    const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
+    const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,   // [num_seqs]
     const int max_num_blocks_per_seq,
     const float* __restrict__ alibi_slopes,  // [num_heads]
-    const int q_stride,
-    const int kv_block_stride,
-    const int kv_head_stride,
-    float* __restrict__ exp_sums,            // [num_seqs, num_heads, max_num_partitions]
-    float* __restrict__ max_logits,          // [num_seqs, num_heads, max_num_partitions]
-    scalar_t* __restrict__ out,              // [num_seqs, num_heads, max_num_partitions, head_size]
-    OUTT* __restrict__ final_out,            // [num_seqs, num_heads, head_size]
-    int max_ctx_blocks, const float* k_scale, const float* v_scale,
-    const float* __restrict__ fp8_out_scale_ptr) {
-  UNREACHABLE_CODE
-}
+    const int q_stride, const int kv_block_stride, const int kv_head_stride,
+    float* __restrict__ exp_sums,  // [num_seqs, num_heads, max_num_partitions]
+    float* __restrict__ max_logits,  // [num_seqs, num_heads,
+                                     // max_num_partitions]
+    scalar_t* __restrict__ out,    // [num_seqs, num_heads, max_num_partitions,
+                                   // head_size]
+    OUTT* __restrict__ final_out,  // [num_seqs, num_heads, head_size]
+    int max_ctx_blocks, const float* k_scale, const float* v_scale) {
+  // clang-format on
+  constexpr int NWARPS = NUM_THREADS / WARP_SIZE;  // 8 warps on gfx11
+  const int warpid = threadIdx.x / WARP_SIZE;
+  const int laneid = threadIdx.x % WARP_SIZE;
+  const int lane2id = laneid % 2;
+  const int lane4id = laneid % 4;
+  const int lane16id = laneid % 16;
+  const int rowid = laneid / 16;
 
-// Grid: (num_heads, num_seqs).
-template <typename scalar_t, typename OUTT, int HEAD_SIZE, int NUM_THREADS,
-          int PARTITION_SIZE, int NPAR_LOOPS>
-__global__
-__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
-    OUTT* __restrict__ out,                // [num_seqs, num_heads, head_size]
-    const float* __restrict__ exp_sums,    // [num_seqs, num_heads, max_num_partitions]
-    const float* __restrict__ max_logits,  // [num_seqs, num_heads, max_num_partitions]
-    const scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads, max_num_partitions, head_size]
-    const int* __restrict__ context_lens,  // [num_seqs]
-    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
-    const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) {
-  UNREACHABLE_CODE
-}
-// clang-format on
+  const int seq_idx = blockIdx.x;
+  // NOTE: queries with sequence len > 1 are prefills and are handled by
+  // another kernel.
+  if (query_start_loc_ptr != nullptr &&
+      (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx]) != 1) {
+    return;
+  }
 
-#endif  // defined(__HIP__GFX9__) TODO: Add NAVI support
+  const int partition_idx = blockIdx.y;
 
-#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO)                              \
-  paged_attention_ll4mi_QKV_mfma16_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE,  \
-                                          HEAD_SIZE, NTHR, ALIBI_ENABLED,      \
-                                          GQA_RATIO>                           \
-      <<<grid, block, 0, stream>>>(                                            \
-          query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale,      \
-          block_tables_ptr, context_lens_ptr, query_start_loc_ptr,             \
-          max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \
-          kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr,  \
-          max_ctx_blocks, k_scale_ptr, v_scale_ptr, fp8_out_scale_ptr);
+  constexpr int T_PAR_SIZE = 256;  // token partition size set to 256
 
-#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO)                               \
-  paged_attention_ll4mi_QKV_mfma4_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE,   \
-                                         HEAD_SIZE, NTHR, ALIBI_ENABLED,       \
-                                         GQA_RATIO>                            \
-      <<<grid, block, 0, stream>>>(                                            \
-          query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale,      \
-          block_tables_ptr, context_lens_ptr, query_start_loc_ptr,             \
-          max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \
-          kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr,  \
-          max_ctx_blocks, k_scale_ptr, v_scale_ptr, fp8_out_scale_ptr);
+  const int max_num_partitions = gridDim.y;
 
-#define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS)                          \
-  paged_attention_ll4mi_reduce_kernel<T, OUTT, HEAD_SIZE, HEAD_SIZE, \
-                                      PARTITION_SIZE, NPAR_LOOPS>    \
-      <<<reduce_grid, reduce_block, 0, stream>>>(                    \
-          out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr,        \
-          context_lens_ptr, query_start_loc_ptr, max_num_partitions, \
-          fp8_out_scale_ptr);
+  const int context_len = context_lens[seq_idx];  // length of a seq
 
-template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE,
-          int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD,
-          bool ALIBI_ENABLED>
-void paged_attention_custom_launcher(
-    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
-    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, const int num_kv_heads, float scale,
-    torch::Tensor& block_tables, torch::Tensor& context_lens,
-    const std::optional<torch::Tensor>& query_start_loc, int max_context_len,
-    const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
-    torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale) {
-  int num_seqs = block_tables.size(0);
-  int num_heads = query.size(1);
-  int head_size = query.size(2);
-  int max_num_blocks_per_seq = block_tables.size(1);
-  int q_stride = query.stride(0);
-  int kv_block_stride = key_cache.stride(0);
-  int kv_head_stride = key_cache.stride(1);
+  const int partition_start_token_idx = partition_idx * T_PAR_SIZE;
+  // exit if partition is out of context for seq
+  if (partition_start_token_idx >= context_len) {
+    return;
+  }
 
-  // NOTE: query start location is optional for V0 decode should not be used.
-  // If batch contains mix of prefills and decode, prefills should be skipped.
-  const int* query_start_loc_ptr =
-      query_start_loc
-          ? reinterpret_cast<const int*>(query_start_loc.value().data_ptr())
-          : nullptr;
+  constexpr int GQA_RATIO2 = DIVIDE_ROUND_UP(GQA_RATIO, 2);
 
-  // NOTE: alibi_slopes is optional.
-  const float* alibi_slopes_ptr =
-      alibi_slopes
-          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
-          : nullptr;
+  __shared__ float shared_qk_max[NWARPS][16 + 1];
+  __shared__ float shared_exp_sum[NWARPS][16 + 1];
+  // shared_logits is used for multiple purposes
+  __shared__ _B16x16 shared_logits[NWARPS][2][16][2];
 
-  float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
-  float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
-  T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
-  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
-  KVT* key_cache_ptr = reinterpret_cast<KVT*>(key_cache.data_ptr());
-  KVT* value_cache_ptr = reinterpret_cast<KVT*>(value_cache.data_ptr());
-  int* block_tables_ptr = block_tables.data_ptr<int>();
-  int* context_lens_ptr = context_lens.data_ptr<int>();
-  const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
-  const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
-  // NOTE: fp8_out_scale is optional.
-  const auto fp8_out_scale_ptr =
-      fp8_out_scale
-          ? static_cast<const float*>(fp8_out_scale.value().data_ptr())
-          : nullptr;
-  OUTT* out_ptr = reinterpret_cast<OUTT*>(out.data_ptr());
+  // for QK wmma16x16, layout is QHead/Tokenx16 across every 16 lanes,
+  // 32 Bytes HeadElements in each lane, 2x16B HeadElements across a row of warp
+  constexpr int ROWS_PER_WARP =
+      WARP_SIZE / 16 / 2;  // rows refers to 16 lanes; refer dpp terminology
+  constexpr int CONTIGUOUS_KV_ELEMS_16B_LOAD =
+      16 / sizeof(cache_t);  // 8 for 16 bit cache type, 16 for 8 bit types
+  constexpr int QKHE_PER_FETCH =
+      CONTIGUOUS_KV_ELEMS_16B_LOAD *
+      ROWS_PER_WARP;  // each fetch across a warp fetches these many elements
+  constexpr int QKHELOOP = HEAD_SIZE / QKHE_PER_FETCH;  // 2xQKHE_16B across
+                                                        // warp
 
-  const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE);
+  _B16x16 Qlocal[QKHELOOP / 2];  // note that 16 contiguous elements of Q should
+                                 // be fetched per lane for 16 bit cache types
 
-  // partition size is fixed at 256 since both mfma4 and mfma16 kernels support
-  // it mfma4 kernel also supports partition size 512
-  constexpr int PARTITION_SIZE = 256;
-  const int max_num_partitions =
-      DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
-  const int gqa_ratio = num_heads / num_kv_heads;
-  assert(num_heads % num_kv_heads == 0);
-  assert(head_size == HEAD_SIZE);
+  constexpr int CONTIGUOUS_SCALAR_ELEMS_16B = 16 / sizeof(scalar_t);
 
-  constexpr int NTHR = 256;
-  dim3 grid(num_seqs, max_num_partitions, num_kv_heads);
-  dim3 block(NTHR);
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
-  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  constexpr int TOKENS_PER_WARP =
+      T_PAR_SIZE /
+      NWARPS;  // sub partition of tokens per warp for qk calculation
+  constexpr int TLOOP =
+      TOKENS_PER_WARP /
+      16;  // each wmma16x16x16 instruction processes 16 tokens
 
-  // mfma4 kernel is faster than mfma16 for gqa_ratio <= 4
-  switch (gqa_ratio) {
-    case 1:
-      LAUNCH_CUSTOM_ATTENTION_MFMA4(1);
-      break;
-    case 2:
-      LAUNCH_CUSTOM_ATTENTION_MFMA4(2);
-      break;
-    case 3:
-      LAUNCH_CUSTOM_ATTENTION_MFMA4(3);
-      break;
+  _B16x16 Klocal[TLOOP]
+                [QKHELOOP / 2];  // can be interpreted as B8x16 for 8 bit types
+
+  const int wg_start_head_idx = blockIdx.z * GQA_RATIO;
+  const int wg_start_kv_head_idx = blockIdx.z;
+  const int total_num_heads = gridDim.z * GQA_RATIO;
+
+  // for QK wmma, tokens in multiples of TOKENS_PER_WARP are spread across warps
+  // each wmma takes QH16xT16x16HE across warp
+  // repeat wmma across QKHELOOP dimension
+  // output layout from QKwmma : QH16xT8x2 16 qheads across 16 lanes, 16 tokens
+  // across 2 rows x 8 tokens per lane
+
+  const int64_t query_start_off = static_cast<int64_t>(
+      query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
+
+  if (GQA_RATIO == 1) {
+    const int local_qhead_idx = lane16id % GQA_RATIO;
+    const int global_qhead_idx = wg_start_head_idx + local_qhead_idx;
+    const scalar_t* q_ptr =
+        q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE;
+    if (lane16id < GQA_RATIO) {
+  #pragma unroll
+      for (int qkhe_depth = 0; qkhe_depth < QKHELOOP / 2; qkhe_depth++) {
+        const scalar_t* q_fetch_ptr = q_ptr + qkhe_depth * QKHE_PER_FETCH * 2;
+        const _B16x16* q_fetch_ptr_32B =
+            reinterpret_cast<const _B16x16*>(q_fetch_ptr);
+        Qlocal[qkhe_depth] = *q_fetch_ptr_32B;
+      }
+    }
+  } else {
+    // fetch Q in shared across warps and then write to registers
+    const int local_qhead_idx = 2 * warpid + rowid;
+    const int global_qhead_idx = wg_start_head_idx + local_qhead_idx;
+    const scalar_t* q_ptr =
+        q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE;
+
+    const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B;
+    if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) {
+      const scalar_t* q_fetch_ptr = q_ptr + qhead_element;
+      const _B16x8* q_fetch_ptr_16B =
+          reinterpret_cast<const _B16x8*>(q_fetch_ptr);
+      _B16x8 tmp = *q_fetch_ptr_16B;
+
+      const int offset1 =
+          lane16id /
+          2;  // 16 contiguous chunks of head elems are spread across 8x2lanes
+      shared_logits[offset1][lane2id][local_qhead_idx][0].xy[0] = tmp;
+    }
+
+    __syncthreads();
+
+  #pragma unroll
+    for (int qkhe_depth = 0; qkhe_depth < QKHELOOP / 2; qkhe_depth++) {
+      Qlocal[qkhe_depth].xy[0] =
+          shared_logits[qkhe_depth][0][lane16id % GQA_RATIO][0].xy[0];
+      Qlocal[qkhe_depth].xy[1] =
+          shared_logits[qkhe_depth][1][lane16id % GQA_RATIO][0].xy[0];
+    }
+  }
+
+  const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
+  const int last_ctx_block = num_context_blocks - 1;
+
+  const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq;
+
+  int kphysical_block_number[TLOOP];
+
+  // fetch k physical block numbers
+  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
+    const int klocal_token_idx =
+        TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
+    const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
+    const int kblock_idx = (kglobal_token_idx < context_len)
+                               ? kglobal_token_idx / BLOCK_SIZE
+                               : last_ctx_block;
+    kphysical_block_number[token_depth] = block_table_seq[kblock_idx];
+  }
+
+  constexpr int KX = 16 / sizeof(cache_t);
+  const cache_t* k_ptr = k_cache + wg_start_kv_head_idx * kv_head_stride;
+
+  const int row_head_elem = 0;
+
+  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
+    const int64_t kblock_number =
+        static_cast<int64_t>(kphysical_block_number[token_depth]);
+    const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
+    const int klocal_token_idx =
+        TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
+    const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
+    const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
+    const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;
+
+    for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) {
+      const int head_elem = row_head_elem + qkhe_depth * QKHE_PER_FETCH;
+      const int offset1 = head_elem / KX;
+      const int offset2 = head_elem % KX;
+      const cache_t* k_fetch_ptr = k_ptr3 + offset1 * BLOCK_SIZE * KX + offset2;
+      const _B16x8* k_fetch_ptr_16B =
+          reinterpret_cast<const _B16x8*>(k_fetch_ptr);
+      Klocal[token_depth][qkhe_depth / 2].xy[qkhe_depth % 2] = *k_fetch_ptr_16B;
+    }
+  }
+
+  constexpr int VTOKENS_PER_LANE =
+      TOKENS_PER_WARP / ROWS_PER_WARP;  // 32/1 = 32 vtokens per lane
+  constexpr int VBLOCKS_PER_LANE = 2;   // assumes block size >=16
+  constexpr int VTLOOP = NWARPS;        // corresponds to tokens across warps
+  constexpr int VTLANELOOP = DIVIDE_ROUND_UP(
+      VTOKENS_PER_LANE,
+      CONTIGUOUS_KV_ELEMS_16B_LOAD);  // optimized for 16B fetches; assumes
+                                      // minimum block size is 16
+  constexpr int VHELOOP = DIVIDE_ROUND_UP(
+      (HEAD_SIZE / 16), NWARPS);  // head_size distributed across warps; each
+                                  // wmma instr works on 16 head elements
+
+  int vphysical_block_number[VTLOOP][VBLOCKS_PER_LANE];
+
+  // fetch v physical block numbers
+  for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) {
+    for (int vblock_depth = 0; vblock_depth < VBLOCKS_PER_LANE;
+         vblock_depth++) {
+      const int vlocal_token_idx =
+          vtoken_depth * VTOKENS_PER_LANE * ROWS_PER_WARP +
+          vblock_depth * BLOCK_SIZE;
+      const int vglobal_token_idx =
+          partition_start_token_idx + vlocal_token_idx;
+      const int vblock_idx = (vglobal_token_idx < context_len)
+                                 ? vglobal_token_idx / BLOCK_SIZE
+                                 : last_ctx_block;
+      vphysical_block_number[vtoken_depth][vblock_depth] =
+          block_table_seq[vblock_idx];
+    }
+  }
+
+  _B16x16 Vlocal[VTLOOP][VHELOOP]
+                [VTLANELOOP / 2];  // this can be interpreted as B8x16 too
+
+  const cache_t* v_ptr = v_cache + wg_start_kv_head_idx * kv_head_stride;
+  // v fetches are 16head elems across lanes x (16x2) tokens per lane
+  for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) {
+    const int vhead_elem = vhe_depth * NWARPS * 16 + warpid * 16 + lane16id;
+    const cache_t* v_ptr2 = v_ptr + vhead_elem * BLOCK_SIZE;
+
+    for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) {
+      for (int vfetch_depth = 0; vfetch_depth < VTLANELOOP; vfetch_depth++) {
+        const int64_t vblock_number = static_cast<int64_t>(
+            vphysical_block_number[vtoken_depth]
+                                  [vfetch_depth / VBLOCKS_PER_LANE]);
+        const cache_t* v_ptr3 = v_ptr2 + (vblock_number * kv_block_stride);
+
+        const cache_t* v_fetch_ptr =
+            v_ptr3 +
+            (vfetch_depth % VBLOCKS_PER_LANE) * CONTIGUOUS_KV_ELEMS_16B_LOAD;
+        const _B16x8* v_fetch_ptr_16B =
+            reinterpret_cast<const _B16x8*>(v_fetch_ptr);
+        Vlocal[vtoken_depth][vhe_depth][vfetch_depth / 2].xy[vfetch_depth % 2] =
+            *v_fetch_ptr_16B;
+      }
+    }
+  }
+
+  floatx8 dout[TLOOP];
+  // qk wmma
+  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
+    dout[token_depth] = {0};
+    for (int qkhe_depth = 0; qkhe_depth < QKHELOOP / 2; qkhe_depth++) {
+      dout[token_depth] = gcn_wmma16x16x16_instr<scalar_t, 0, 0, 0>(
+          Klocal[token_depth][qkhe_depth].u16x16, Qlocal[qkhe_depth].u16x16,
+          dout[token_depth]);
+    }
+    dout[token_depth] *= scale;
+  }
+
+  // calculate qk_max and exp_sum per warp and write to shared memory
+  float qk_max = -FLT_MAX;
+  float exp_sum = 0.0f;
+  const int qkout_token_idx =
+      partition_start_token_idx + TOKENS_PER_WARP * warpid + rowid;
+  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
+    const int local_token_idx = qkout_token_idx + token_depth * 16;
+    for (int i = 0; i < 8; i++) {
+      const float tmp = (local_token_idx + 2 * i < context_len)
+                            ? dout[token_depth][i]
+                            : -FLT_MAX;
+      qk_max = fmaxf(qk_max, tmp);
+    }
+  }
+
+  qk_max = fmaxf(qk_max, __shfl_xor(qk_max, 16));
+
+  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
+    const int local_token_idx = qkout_token_idx + token_depth * 16;
+    for (int i = 0; i < 8; i++) {
+      const float tmp = (local_token_idx + 2 * i < context_len)
+                            ? __expf(dout[token_depth][i] - qk_max)
+                            : 0.0f;
+      dout[token_depth][i] = tmp;
+      exp_sum += tmp;
+    }
+  }
+
+  exp_sum += __shfl_xor(exp_sum, 16);
+
+  __syncthreads();
+
+  if (laneid < 16) {
+    shared_qk_max[warpid][lane16id] = qk_max;
+    shared_exp_sum[warpid][lane16id] = exp_sum;
+  }
+
+  __syncthreads();
+
+  // calculate partition qk_max and exp_sum
+  float partition_qk_max = -FLT_MAX;
+  float warp_qk_max_exp[NWARPS];
+  float partition_exp_sum = 0.0f;
+
+  #pragma unroll
+  for (int w = 0; w < NWARPS; w++) {
+    warp_qk_max_exp[w] = shared_qk_max[w][lane16id];
+    partition_qk_max = fmaxf(partition_qk_max, warp_qk_max_exp[w]);
+  }
+
+  for (int w = 0; w < NWARPS; w++) {
+    warp_qk_max_exp[w] = __expf(warp_qk_max_exp[w] - partition_qk_max);
+    partition_exp_sum += shared_exp_sum[w][lane16id] * warp_qk_max_exp[w];
+  }
+
+  const float inv_sum_scale =
+      __fdividef(1.f, partition_exp_sum + 1e-6f) * warp_qk_max_exp[warpid];
+
+  __syncthreads();
+
+  // write logits to shared mem
+  #pragma unroll
+  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
+    dout[token_depth] *= inv_sum_scale;
+    shared_logits[warpid][token_depth][lane16id][0].xy[rowid] =
+        from_floatx8<scalar_t>(dout[token_depth]);
+  }
+  __syncthreads();
+
+  _B16x8 swp_buf[TLOOP][2];
+  #pragma unroll
+  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
+    swp_buf[token_depth][0] =
+        shared_logits[warpid][token_depth][lane16id][0].xy[0];
+    swp_buf[token_depth][1] =
+        shared_logits[warpid][token_depth][lane16id][0].xy[1];
+  }
+
+  #pragma unroll
+  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
+  #pragma unroll
+    for (int i = 0; i < 8; i++) {
+      shared_logits[warpid][token_depth][lane16id][0].xy[rowid].u16x8[i] =
+          swp_buf[token_depth][i % 2].u16x8[4 * rowid + (i / 2)];
+    }
+  }
+
+  // write out partition max_logits and exp_sum
+  if (threadIdx.x < GQA_RATIO) {
+    const int qhead_idx = lane16id;
+    const int offset = seq_idx * total_num_heads * max_num_partitions +
+                       (wg_start_head_idx + qhead_idx) * max_num_partitions +
+                       partition_idx;
+    max_logits[offset] = partition_qk_max;
+    exp_sums[offset] = partition_exp_sum;
+  }
+
+  __syncthreads();
+
+  _B16x8 outelems[VHELOOP];
+  // Softmax V wmma
+  // v layout: 16he across lanes x (16x2) tokens per lane
+  for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) {
+    floatx8 tmp_out = {0};
+    for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) {
+      for (int vfetch_depth = 0; vfetch_depth < VTLANELOOP / 2;
+           vfetch_depth++) {
+        const int offset = vfetch_depth;
+        // if output format is 16 qheads across 16 lanes, 16 head elems spread
+        // across rows
+        tmp_out = gcn_wmma16x16x16_instr<scalar_t, 0, 0, 0>(
+            Vlocal[vtoken_depth][vhe_depth][vfetch_depth].u16x16,
+            shared_logits[vtoken_depth][offset][lane16id][0].u16x16, tmp_out);
+      }
+    }
+    outelems[vhe_depth] = from_floatx8<scalar_t>(tmp_out);
+  }
+
+  __syncthreads();
+
+  #pragma unroll
+  for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) {
+    shared_logits[warpid][vhe_depth][lane16id][0].xy[rowid] =
+        outelems[vhe_depth];  // lane16 id head dimension; rowid head element
+                              // dimension
+  }
+
+  __syncthreads();
+
+  #pragma unroll
+  for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) {
+    swp_buf[vhe_depth][0] = shared_logits[warpid][vhe_depth][lane16id][0].xy[0];
+    swp_buf[vhe_depth][1] = shared_logits[warpid][vhe_depth][lane16id][0].xy[1];
+  }
+
+  #pragma unroll
+  for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) {
+  #pragma unroll
+    for (int i = 0; i < 8; i++) {
+      shared_logits[warpid][vhe_depth][lane16id][0].xy[rowid].u16x8[i] =
+          swp_buf[vhe_depth][i % 2].u16x8[4 * rowid + (i / 2)];
+    }
+  }
+
+  __syncthreads();
+
+  // write to tmp_out with coalesced writes after reading from shared mem
+  if (warpid == 0) {
+    _B16x8 vout[GQA_RATIO2];
+    // each lane writes out 16Bytes of tmp_out along head elem dimension
+    const int head_elem_idx = lane16id * 8;
+    if (head_elem_idx < HEAD_SIZE) {
+      for (int h = 0; h < GQA_RATIO2; h++) {
+        const int local_head_idx = 2 * h + rowid;
+        const int offset1 = (head_elem_idx / 16) % NWARPS;
+        const int offset2 = head_elem_idx / 16 / NWARPS;
+        const int offset3 = (head_elem_idx / 8) % 2;  // num_he % num_row
+        vout[h] =
+            shared_logits[offset1][offset2][local_head_idx][0].xy[offset3];
+      }
+
+      const int hsz_maxp_mult = HEAD_SIZE * max_num_partitions;
+      scalar_t* out_ptr = out + seq_idx * total_num_heads * hsz_maxp_mult +
+                          partition_idx * HEAD_SIZE;
+      for (int h = 0; h < GQA_RATIO2; h++) {
+        const int local_head_idx = 2 * h + rowid;
+        if (local_head_idx < GQA_RATIO) {
+          const int out_head_idx = wg_start_head_idx + local_head_idx;
+          scalar_t* out_ptr2 = out_ptr + out_head_idx * hsz_maxp_mult;
+          scalar_t* out_ptr3 = out_ptr2 + head_elem_idx;
+          _B16x8* out_ptr_B16x8 = reinterpret_cast<_B16x8*>(out_ptr3);
+          *out_ptr_B16x8 = vout[h];
+        }
+      }
+    }
+  }
+}
+
+// mfma4 variant of the paged-attention QKV kernel for this architecture
+// branch. The body consists solely of UNREACHABLE_CODE (a macro defined
+// outside this section), so no attention computation is implemented here and
+// none of the parameters are read.
+// NOTE(review): presumably the symbol is kept so that launch code shared with
+// other architectures still compiles -- confirm against the launcher.
+template <typename scalar_t, typename cache_t,
+          vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
+          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED,
+          int GQA_RATIO>
+__global__
+__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
+    const scalar_t* __restrict__ q,       // [num_seqs, num_heads, head_size]
+    const cache_t* __restrict__ k_cache,  // [num_blocks, num_kv_heads,
+                                          // head_size/x, block_size, x]
+    const cache_t* __restrict__ v_cache,  // [num_blocks, num_kv_heads,
+                                          // head_size, block_size]
+    const int num_kv_heads, const float scale,
+    const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
+    const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
+    const int max_num_blocks_per_seq,
+    const float* __restrict__ alibi_slopes,  // [num_heads]
+    const int q_stride, const int kv_block_stride, const int kv_head_stride,
+    float* __restrict__ exp_sums,  // [num_seqs, num_heads, max_num_partitions]
+    float* __restrict__ max_logits,  // [num_seqs, num_heads,
+                                     // max_num_partitions]
+    scalar_t* __restrict__ out,    // [num_seqs, num_heads, max_num_partitions,
+                                   // head_size]
+    OUTT* __restrict__ final_out,  // [num_seqs, num_heads, head_size]
+    int max_ctx_blocks, const float* k_scale, const float* v_scale) {
+  // Intentionally empty apart from the unreachable marker.
+  UNREACHABLE_CODE
+}
+
+// Grid: (num_heads, num_seqs).
+//
+// Reduction step of partitioned paged attention: combines the per-partition
+// partial outputs in tmp_out into the final output of one (sequence, head)
+// pair. Every partition is weighted by its exp-sum rescaled to the maximum
+// logit over all partitions, and the result is divided by the sum of those
+// rescaled exp-sums.
+//
+// Thread threadIdx.x reads and writes head element threadIdx.x.
+// NOTE(review): this presumably requires NUM_THREADS == HEAD_SIZE -- confirm
+// at the launch site.
+// At most NPAR_LOOPS * WARP_SIZE partitions are supported (size of
+// shared_exp_sums). fp8_out_scale_ptr is accepted but not referenced by this
+// implementation.
+template <typename scalar_t, typename OUTT, int HEAD_SIZE, int NUM_THREADS,
+          int PARTITION_SIZE, int NPAR_LOOPS>
+__global__
+__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
+    OUTT* __restrict__ out,                // [num_seqs, num_heads, head_size]
+    const float* __restrict__ exp_sums,    // [num_seqs, num_heads,
+                                           // max_num_partitions]
+    const float* __restrict__ max_logits,  // [num_seqs, num_heads,
+                                           // max_num_partitions]
+    const scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads,
+                                           // max_num_partitions, head_size]
+    const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs]
+    const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) {
+  const auto num_heads = gridDim.x;
+  const auto head_idx = blockIdx.x;
+  const auto seq_idx = blockIdx.y;
+
+  // NOTE queries with sequence len > 1 are prefills and taken care by another
+  // kernel.
+  if (query_start_loc_ptr != nullptr &&
+      (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) {
+    return;
+  }
+
+  const int context_len = context_lens[seq_idx];
+  const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
+  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  const int warpid = threadIdx.x / WARP_SIZE;
+  [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
+
+  // Element offset of this (seq, head) row inside the
+  // [num_seqs, num_heads, max_num_partitions] buffers. It is computed in 64
+  // bits, like the final output offset at the end of the kernel, so that large
+  // batches with many partitions cannot wrap 32-bit arithmetic (the tmp_out
+  // offset additionally multiplies this value by HEAD_SIZE).
+  const int64_t head_partition_base =
+      static_cast<int64_t>(seq_idx) * num_heads * max_num_partitions +
+      static_cast<int64_t>(head_idx) * max_num_partitions;
+
+  __shared__ float shared_global_exp_sum;
+  // max num partitions supported is warp_size * NPAR_LOOPS
+  __shared__ float shared_exp_sums[NPAR_LOOPS * WARP_SIZE];
+
+  // Warp 0 computes the global max logit, the rescaled per-partition exp-sums
+  // and their total; the results are published through shared memory.
+  if (warpid == 0) {
+    const float* max_logits_ptr = max_logits + head_partition_base;
+
+    // valid partition is the last valid partition in case threadid > num
+    // partitions
+    int valid_partition[NPAR_LOOPS];
+    float reg_max_logit[NPAR_LOOPS];
+    const int last_valid_partition = num_partitions - 1;
+
+  #pragma unroll
+    for (int i = 0; i < NPAR_LOOPS; i++) {
+      const int partition_no = i * WARP_SIZE + threadIdx.x;
+      valid_partition[i] =
+          (partition_no < num_partitions) ? partition_no : last_valid_partition;
+    }
+  #pragma unroll
+    for (int i = 0; i < NPAR_LOOPS; i++) {
+      reg_max_logit[i] = max_logits_ptr[valid_partition[i]];
+    }
+    float max_logit = reg_max_logit[0];
+  #pragma unroll
+    for (int i = 1; i < NPAR_LOOPS; i++) {
+      max_logit = fmaxf(max_logit, reg_max_logit[i]);
+    }
+
+    // Butterfly reduction: afterwards every lane holds the warp-wide maximum.
+  #pragma unroll
+    for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+      max_logit = fmaxf(max_logit, __shfl_xor(max_logit, mask));
+    }
+
+    const float* exp_sums_ptr = exp_sums + head_partition_base;
+
+    float rescaled_exp_sum[NPAR_LOOPS];
+  #pragma unroll
+    for (int i = 0; i < NPAR_LOOPS; i++) {
+      rescaled_exp_sum[i] = exp_sums_ptr[valid_partition[i]];
+    }
+    // Rescale each partition's exp-sum to the global max; slots beyond
+    // num_partitions get weight 0 so the clamped loads above do not contribute.
+  #pragma unroll
+    for (int i = 0; i < NPAR_LOOPS; i++) {
+      const int partition_no = i * WARP_SIZE + threadIdx.x;
+      rescaled_exp_sum[i] *= (partition_no < num_partitions)
+                                 ? expf(reg_max_logit[i] - max_logit)
+                                 : 0.0f;
+    }
+    float global_exp_sum = rescaled_exp_sum[0];
+  #pragma unroll
+    for (int i = 1; i < NPAR_LOOPS; i++) {
+      global_exp_sum += rescaled_exp_sum[i];
+    }
+  #pragma unroll
+    for (int i = 0; i < NPAR_LOOPS; i++) {
+      const int partition_no = i * WARP_SIZE + threadIdx.x;
+      shared_exp_sums[partition_no] = rescaled_exp_sum[i];
+    }
+
+  #pragma unroll
+    for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+      global_exp_sum += __shfl_xor(global_exp_sum, mask);
+    }
+    if (threadIdx.x == 0) {
+      shared_global_exp_sum = global_exp_sum;
+    }
+  }  // warpid == 0
+
+  // Each thread walks the partitions of its own head element; consecutive
+  // partitions are HEAD_SIZE elements apart in tmp_out.
+  const scalar_t* tmp_out_ptr =
+      tmp_out + head_partition_base * HEAD_SIZE + threadIdx.x;
+  constexpr int MAX_NPAR = 32;
+  scalar_t tmps[MAX_NPAR];
+  const float dzero = 0.0f;
+  #pragma unroll
+  for (int j = 0; j < MAX_NPAR; j++) {
+    tmps[j] = from_float<scalar_t>(dzero);
+  }
+  const int last_partition_offset = (num_partitions - 1) * HEAD_SIZE;
+  const int num_partition_offset = (num_partitions)*HEAD_SIZE;
+  int idx = 0;
+
+  constexpr int JCHUNK = 16;
+
+  // The first JCHUNK partitions are always loaded; offsets past the last valid
+  // partition are clamped to it (their weight in shared_exp_sums is zero).
+  #pragma unroll
+  for (int j = 0; j < JCHUNK * HEAD_SIZE; j += HEAD_SIZE) {
+    // lastj is last valid partition
+    const int lastj_offset =
+        (j < num_partition_offset) ? j : last_partition_offset;
+    tmps[idx] = tmp_out_ptr[lastj_offset];
+    idx++;
+  }
+  // Barrier between warp 0's writes to shared memory above and the reads of
+  // shared_exp_sums / shared_global_exp_sum below.
+  __syncthreads();
+
+  if (num_partitions > JCHUNK) {
+  #pragma unroll
+    for (int j = JCHUNK * HEAD_SIZE; j < 2 * JCHUNK * HEAD_SIZE;
+         j += HEAD_SIZE) {
+      const int lastj_offset =
+          (j < num_partition_offset) ? j : last_partition_offset;
+      tmps[idx] = tmp_out_ptr[lastj_offset];
+      idx++;
+    }
+
+    if (num_partitions > 2 * JCHUNK) {
+  #pragma unroll
+      for (int j = 2 * JCHUNK * HEAD_SIZE; j < MAX_NPAR * HEAD_SIZE;
+           j += HEAD_SIZE) {
+        const int lastj_offset =
+            (j < num_partition_offset) ? j : last_partition_offset;
+        tmps[idx] = tmp_out_ptr[lastj_offset];
+        idx++;
+      }
+    }
+  }  // num_partitions > JCHUNK
+
+  // Aggregate tmp_out to out.
+  float acc = 0.0f;
+  #pragma unroll
+  for (int j = 0; j < JCHUNK; j++) {
+    acc += to_float<scalar_t>(tmps[j]) * shared_exp_sums[j];
+  }
+  if (num_partitions > JCHUNK) {
+  #pragma unroll
+    for (int j = JCHUNK; j < 2 * JCHUNK; j++) {
+      acc += to_float<scalar_t>(tmps[j]) * shared_exp_sums[j];
+    }
+    if (num_partitions > 2 * JCHUNK) {
+  #pragma unroll
+      for (int j = 2 * JCHUNK; j < MAX_NPAR; j++) {
+        acc += to_float<scalar_t>(tmps[j]) * shared_exp_sums[j];
+      }
+    }
+  }
+
+  // Remaining groups of MAX_NPAR partitions reuse tmps as a staging buffer.
+  for (int p = 1; p < NPAR_LOOPS; p++) {
+    if (num_partitions > p * MAX_NPAR) {
+      idx = 0;
+  #pragma unroll
+      for (int j = p * MAX_NPAR * HEAD_SIZE; j < (p + 1) * MAX_NPAR * HEAD_SIZE;
+           j += HEAD_SIZE) {
+        // lastj is last valid partition
+        const int lastj_offset =
+            (j < num_partition_offset) ? j : last_partition_offset;
+        tmps[idx] = tmp_out_ptr[lastj_offset];
+        idx++;
+      }
+
+  #pragma unroll
+      for (int j = 0; j < MAX_NPAR; j++) {
+        acc += to_float<scalar_t>(tmps[j]) * shared_exp_sums[j + p * MAX_NPAR];
+      }
+    }
+  }
+
+  // Normalize by the total rescaled exp-sum; the 1e-6f term keeps the divisor
+  // away from zero.
+  const float inv_global_exp_sum =
+      __fdividef(1.0f, shared_global_exp_sum + 1e-6f);
+  acc *= inv_global_exp_sum;
+
+  // With query_start_loc_ptr the output row is the sequence's query start
+  // offset, otherwise it is the sequence index itself.
+  const int64_t query_start_off = static_cast<int64_t>(
+      query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
+  OUTT* out_ptr = out + query_start_off * num_heads * HEAD_SIZE +
+                  static_cast<int64_t>(head_idx) * HEAD_SIZE;
+  out_ptr[threadIdx.x] = from_float<scalar_t>(acc);
+}
+
+#elif defined(__HIP__GFX12__)
+
+// Vector of 8 floats (32 bytes); used as the accumulator/result type of
+// gcn_wmma16x16x16_instr below.
+using floatx8 = __attribute__((__vector_size__(8 * sizeof(float)))) float;
+
+// Raw 16-bit storage unit and a vector of four of them (8 bytes).
+using bit16_t = uint16_t;
+using bit16x4 = __attribute__((__vector_size__(4 * sizeof(uint16_t)))) uint16_t;
+typedef bit16x4 _B16x4;
+
+// Vector of eight 16-bit values (16 bytes). The union exposes the same 16
+// bytes either as one 8-lane vector (u16x8) or as two 4-lane halves (xy[0],
+// xy[1]).
+using bit16x8 = __attribute__((__vector_size__(8 * sizeof(uint16_t)))) uint16_t;
+union b16x8_u {
+  bit16x8 u16x8;
+  _B16x4 xy[2];
+};
+typedef b16x8_u _B16x8;
+
+// 8-byte and 1-byte raw storage aliases.
+// NOTE(review): presumably these hold 8-bit (fp8) cache elements -- confirm
+// against their users.
+using _B8x8 = uint2;
+using bit8_t = uint8_t;
+
+// Two _B8x8 halves, 16 bytes in total.
+typedef struct _B8x16 {
+  _B8x8 xy[2];
+} _B8x16;
+
+// Dispatches one 16x16x16 WMMA (wave32, gfx12 builtin) for the 16-bit element
+// type T and returns the float accumulator: inpA and inpB are the packed
+// 16-bit operands, inpC is the incoming accumulator.
+// The absz/cbid/blgp template parameters are not referenced by this
+// implementation. Any T other than _Float16 or __hip_bfloat16 is rejected at
+// compile time.
+template <typename T, int absz, int cbid, int blgp>
+__device__ __forceinline__ floatx8 gcn_wmma16x16x16_instr(const bit16x8& inpA,
+                                                          const bit16x8& inpB,
+                                                          const floatx8& inpC) {
+  constexpr bool kIsFp16 = std::is_same_v<T, _Float16>;
+  constexpr bool kIsBf16 = std::is_same_v<T, __hip_bfloat16>;
+
+  if constexpr (kIsFp16) {
+    return __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(inpA, inpB, inpC);
+  } else if constexpr (kIsBf16) {
+    return __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(inpA, inpB, inpC);
+  } else {
+    static_assert(false, "unsupported 16b dtype");
+  }
+}
+
+// Converts one 16-bit floating-point value (_Float16 or __hip_bfloat16) to
+// float. Other types are rejected at compile time.
+template <typename T>
+__device__ __forceinline__ float to_float(const T& inp) {
+  constexpr bool kIsFp16 = std::is_same_v<T, _Float16>;
+  constexpr bool kIsBf16 = std::is_same_v<T, __hip_bfloat16>;
+
+  if constexpr (kIsFp16) {
+    return static_cast<float>(inp);
+  } else if constexpr (kIsBf16) {
+    return __bfloat162float(inp);
+  } else {
+    static_assert(false, "unsupported 16b dtype");
+  }
+}
+
+// Reinterprets the raw 16-bit pattern `inp` as a value of type T (_Float16 or
+// __hip_bfloat16) and converts that value to float. The reinterpretation goes
+// through a union that overlays the three 16-bit representations.
+template <typename T>
+__device__ __forceinline__ float to_float_b16(const bit16_t& inp) {
+  union Bits16View {
+    bit16_t raw;
+    _Float16 as_fp16;
+    __hip_bfloat16 as_bf16;
+  } view;
+  view.raw = inp;
+
+  constexpr bool kIsFp16 = std::is_same_v<T, _Float16>;
+  constexpr bool kIsBf16 = std::is_same_v<T, __hip_bfloat16>;
+
+  if constexpr (kIsFp16) {
+    return static_cast<float>(view.as_fp16);
+  } else if constexpr (kIsBf16) {
+    return __bfloat162float(view.as_bf16);
+  } else {
+    static_assert(false, "unsupported 16b dtype");
+  }
+}
+
+// Converts a float to the 16-bit floating-point type T (_Float16 or
+// __hip_bfloat16). Other types are rejected at compile time.
+template <typename T>
+__device__ __forceinline__ T from_float(const float& inp) {
+  constexpr bool kIsFp16 = std::is_same_v<T, _Float16>;
+  constexpr bool kIsBf16 = std::is_same_v<T, __hip_bfloat16>;
+
+  if constexpr (kIsFp16) {
+    return static_cast<_Float16>(inp);
+  } else if constexpr (kIsBf16) {
+    return __float2bfloat16(inp);
+  } else {
+    static_assert(false, "unsupported 16b dtype");
+  }
+}
+
+template <typename T>
+__device__ __forceinline__ _B16x8 from_floatx8(const floatx8& inp) {
+  if constexpr (std::is_same<T, _Float16>::value) {
+    union h2cvt {
+      __half2 h2[4];
+      _B16x8 b16x8;
+    } u;
+    u.h2[0] = __float22half2_rn(make_float2(inp[0], inp[1]));
+    u.h2[1] = __float22half2_rn(make_float2(inp[2], inp[3]));
+    u.h2[2] = __float22half2_rn(make_float2(inp[4], inp[5]));
+    u.h2[3] = __float22half2_rn(make_float2(inp[6], inp[7]));
+    return u.b16x8;
+  } else if constexpr (std::is_same<T, __hip_bfloat16>::value) {
+    union b2cvt {
+      __hip_bfloat162 b2[4];
+      _B16x8 b16x8;
+    } u;
+
+    u.b2[0] = __float22bfloat162_rn(make_float2(inp[0], inp[1]));
+    u.b2[1] = __float22bfloat162_rn(make_float2(inp[2], inp[3]));
+    u.b2[2] = __float22bfloat162_rn(make_float2(inp[4], inp[5]));
+    u.b2[3] = __float22bfloat162_rn(make_float2(inp[6], inp[7]));
+
+    return u.b16x8;
+  } else {
+    static_assert(false, "unsupported 16b dtype");
+  }
+}
+
+// clang-format off
+template <typename scalar_t, typename cache_t,
+          vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
+          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO>
+__global__
+__launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
+    const scalar_t* __restrict__ q,       // [num_seqs, num_heads, head_size]
+    const cache_t* __restrict__ k_cache,  // [num_blocks, num_kv_heads,
+                                          // head_size/x, block_size, x]
+    const cache_t* __restrict__ v_cache,  // [num_blocks, num_kv_heads,
+                                          // head_size, block_size]
+    const int num_kv_heads, const float scale,
+    const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
+    const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,   // [num_seqs]
+    const int max_num_blocks_per_seq,
+    const float* __restrict__ alibi_slopes,  // [num_heads]
+    const int q_stride, const int kv_block_stride, const int kv_head_stride,
+    float* __restrict__ exp_sums,  // [num_seqs, num_heads, max_num_partitions]
+    float* __restrict__ max_logits,  // [num_seqs, num_heads,
+                                     // max_num_partitions]
+    scalar_t* __restrict__ out,    // [num_seqs, num_heads, max_num_partitions,
+                                   // head_size]
+    OUTT* __restrict__ final_out,  // [num_seqs, num_heads, head_size]
+    int max_ctx_blocks, const float* k_scale, const float* v_scale) {
+  // clang-format on
+  constexpr int NWARPS = NUM_THREADS / WARP_SIZE;  // 8 warps on gfx11
+  const int warpid = threadIdx.x / WARP_SIZE;
+  const int laneid = threadIdx.x % WARP_SIZE;
+  const int lane2id = laneid % 2;
+  const int lane4id = laneid % 4;
+  const int lane16id = laneid % 16;
+  const int rowid = laneid / 16;
+
+  const int seq_idx = blockIdx.x;
+  // NOTE queries with sequence len > 1 are prefills and taken care by another
+  // kernel.
+  if (query_start_loc_ptr != nullptr &&
+      (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) {
+    return;
+  }
+  const int partition_idx = blockIdx.y;
+
+  constexpr int T_PAR_SIZE = 256;  // token partition size set to 256
+
+  const int max_num_partitions = gridDim.y;
+
+  const int context_len = context_lens[seq_idx];  // length of a seq
+
+  const int partition_start_token_idx = partition_idx * T_PAR_SIZE;
+  // exit if partition is out of context for seq
+  if (partition_start_token_idx >= context_len) {
+    return;
+  }
+
+  constexpr int GQA_RATIO2 = DIVIDE_ROUND_UP(GQA_RATIO, 2);
+
+  __shared__ float shared_qk_max[NWARPS][16 + 1];
+  __shared__ float shared_exp_sum[NWARPS][16 + 1];
+  // shared_logits is used for multiple purposes
+  __shared__ _B16x8 shared_logits[NWARPS][2][16][2];
+
+  // for QK wmma16x16_gfx12, layout is QHead/Tokenx16 across every 16 lanes,
+  // 16 Bytes HeadElements in each lane, 2x16B HeadElements across 2 rows of
+  // warp
+  constexpr int ROWS_PER_WARP =
+      WARP_SIZE / 16;  // rows refers to 16 lanes; refer dpp terminology
+  constexpr int CONTIGUOUS_KV_ELEMS_16B_LOAD =
+      16 / sizeof(cache_t);  // 8 for 16 bit cache type, 16 for 8 bit types
+  constexpr int QKHE_PER_FETCH =
+      CONTIGUOUS_KV_ELEMS_16B_LOAD *
+      ROWS_PER_WARP;  // each fetch across a warp fetches these many elements
+  constexpr int QKHELOOP = HEAD_SIZE / QKHE_PER_FETCH;  // 2xQKHE_16B across
+                                                        // warp
+
+  _B16x8 Qlocal[QKHELOOP];  // note that 16 contiguous elements of Q should
+                            // be fetched per lane for 16 bit cache types
+
+  constexpr int CONTIGUOUS_SCALAR_ELEMS_16B = 16 / sizeof(scalar_t);
+
+  constexpr int TOKENS_PER_WARP =
+      T_PAR_SIZE /
+      NWARPS;  // sub partition of tokens per warp for qk calculation
+  constexpr int TLOOP =
+      TOKENS_PER_WARP /
+      16;  // each wmma16x16x16 instruction processes 16 tokens
+
+  _B16x8 Klocal[TLOOP]
+               [QKHELOOP];  // can be interpreted as B8x16 for 8 bit types
+
+  const int wg_start_head_idx = blockIdx.z * GQA_RATIO;
+  const int wg_start_kv_head_idx = blockIdx.z;
+  const int total_num_heads = gridDim.z * GQA_RATIO;
+
+  // for QK wmma, tokens in multiples of TOKENS_PER_WARP are spread across warps
+  // each wmma takes QH16xT16x16HE across warp
+  // repeat wmma across QKHELOOP dimension
+  // output layout from QKwmma : QH16xT8x2 16 qheads across 16 lanes, 16 tokens
+  // across 2 rows x 8 tokens per lane
+
+  const int64_t query_start_off = static_cast<int64_t>(
+      query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
+
+  if (GQA_RATIO == 1) {
+    const int local_qhead_idx = lane16id % GQA_RATIO;
+    const int global_qhead_idx = wg_start_head_idx + local_qhead_idx;
+    const scalar_t* q_ptr = q + query_start_off * q_stride +
+                            global_qhead_idx * HEAD_SIZE +
+                            rowid * CONTIGUOUS_KV_ELEMS_16B_LOAD;
+    if (lane16id < GQA_RATIO) {
+  #pragma unroll
+      for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) {
+        const scalar_t* q_fetch_ptr = q_ptr + qkhe_depth * QKHE_PER_FETCH;
+        const _B16x8* q_fetch_ptr_16B =
+            reinterpret_cast<const _B16x8*>(q_fetch_ptr);
+        Qlocal[qkhe_depth] = *q_fetch_ptr_16B;
+      }
+    }
+  } else {
+    // fetch Q in shared across warps and then write to registers
+    const int local_qhead_idx = 2 * warpid + rowid;
+    const int global_qhead_idx = wg_start_head_idx + local_qhead_idx;
+    const scalar_t* q_ptr =
+        q + query_start_off * q_stride + global_qhead_idx * HEAD_SIZE;
+
+    const int qhead_element = lane16id * CONTIGUOUS_SCALAR_ELEMS_16B;
+    if ((local_qhead_idx < GQA_RATIO) && (qhead_element < HEAD_SIZE)) {
+      const scalar_t* q_fetch_ptr = q_ptr + qhead_element;
+      const _B16x8* q_fetch_ptr_16B =
+          reinterpret_cast<const _B16x8*>(q_fetch_ptr);
+      _B16x8 tmp = *q_fetch_ptr_16B;
+
+      const int offset1 =
+          lane16id /
+          2;  // 16 contiguous chunks of head elems are spread across 8x2lanes
+      shared_logits[offset1][lane2id][local_qhead_idx][0] = tmp;
+    }
+
+    __syncthreads();
+
+  #pragma unroll
+    for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) {
+      Qlocal[qkhe_depth] =
+          shared_logits[qkhe_depth][rowid][lane16id % GQA_RATIO][0];
+    }
+  }
+
+  const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
+  const int last_ctx_block = num_context_blocks - 1;
+
+  const int* block_table_seq = block_tables + seq_idx * max_num_blocks_per_seq;
+
+  int kphysical_block_number[TLOOP];
+
+  // fetch k physical block numbers
+  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
+    const int klocal_token_idx =
+        TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
+    const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
+    const int kblock_idx = (kglobal_token_idx < context_len)
+                               ? kglobal_token_idx / BLOCK_SIZE
+                               : last_ctx_block;
+    kphysical_block_number[token_depth] = block_table_seq[kblock_idx];
+  }
+
+  constexpr int KX = 16 / sizeof(cache_t);
+  const cache_t* k_ptr = k_cache + wg_start_kv_head_idx * kv_head_stride;
+
+  const int row_head_elem = rowid * CONTIGUOUS_KV_ELEMS_16B_LOAD;
+
+  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
+    const int64_t kblock_number =
+        static_cast<int64_t>(kphysical_block_number[token_depth]);
+    const cache_t* k_ptr2 = k_ptr + kblock_number * kv_block_stride;
+    const int klocal_token_idx =
+        TOKENS_PER_WARP * warpid + token_depth * 16 + lane16id;
+    const int kglobal_token_idx = partition_start_token_idx + klocal_token_idx;
+    const int kphysical_block_offset = klocal_token_idx % BLOCK_SIZE;
+    const cache_t* k_ptr3 = k_ptr2 + kphysical_block_offset * KX;
+
+    for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) {
+      const int head_elem = row_head_elem + qkhe_depth * QKHE_PER_FETCH;
+      const int offset1 = head_elem / KX;
+      const int offset2 = head_elem % KX;
+      const cache_t* k_fetch_ptr = k_ptr3 + offset1 * BLOCK_SIZE * KX + offset2;
+      const _B16x8* k_fetch_ptr_16B =
+          reinterpret_cast<const _B16x8*>(k_fetch_ptr);
+      Klocal[token_depth][qkhe_depth] = *k_fetch_ptr_16B;
+    }
+  }
+
+  constexpr int VTOKENS_PER_LANE =
+      TOKENS_PER_WARP / ROWS_PER_WARP;  // 32/2 = 16 vtokens per lane
+  constexpr int VBLOCKS_PER_LANE = 1;   // assumes block size >=16
+  constexpr int VTLOOP = NWARPS;        // corresponds to tokens across warps
+  constexpr int VTLANELOOP = DIVIDE_ROUND_UP(
+      VTOKENS_PER_LANE,
+      CONTIGUOUS_KV_ELEMS_16B_LOAD);  // optimized for 16B fetches; assumes
+                                      // minimum block size is 16
+  constexpr int VHELOOP = DIVIDE_ROUND_UP(
+      (HEAD_SIZE / 16), NWARPS);  // head_size distributed across warps; each
+                                  // wmma instr works on 16 head elements
+
+  int vphysical_block_number[VTLOOP][VBLOCKS_PER_LANE];
+
+  // fetch v physical block numbers
+  for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) {
+    for (int vblock_depth = 0; vblock_depth < VBLOCKS_PER_LANE;
+         vblock_depth++) {
+      const int vlocal_token_idx =
+          vtoken_depth * VTOKENS_PER_LANE * ROWS_PER_WARP +
+          rowid * VTOKENS_PER_LANE + vblock_depth * BLOCK_SIZE;
+      const int vglobal_token_idx =
+          partition_start_token_idx + vlocal_token_idx;
+      const int vblock_idx = (vglobal_token_idx < context_len)
+                                 ? vglobal_token_idx / BLOCK_SIZE
+                                 : last_ctx_block;
+      vphysical_block_number[vtoken_depth][vblock_depth] =
+          block_table_seq[vblock_idx];
+    }
+  }
+
+  _B16x8 Vlocal[VTLOOP][VHELOOP]
+               [VTLANELOOP];  // this can be interpreted as B8x16 too
+
+  const cache_t* v_ptr = v_cache + wg_start_kv_head_idx * kv_head_stride +
+                         ((rowid * VTOKENS_PER_LANE) % BLOCK_SIZE);
+
+  // v fetches are 16head elems across lanes x 16 tokens per lane
+  for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) {
+    const int vhead_elem = vhe_depth * NWARPS * 16 + warpid * 16 + lane16id;
+    const cache_t* v_ptr2 = v_ptr + vhead_elem * BLOCK_SIZE;
+
+    for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) {
+      for (int vfetch_depth = 0; vfetch_depth < VTLANELOOP; vfetch_depth++) {
+        const int vblock_depth = 0;
+        const int64_t vblock_number = static_cast<int64_t>(
+            vphysical_block_number[vtoken_depth][vblock_depth]);
+        const cache_t* v_ptr3 = v_ptr2 + (vblock_number * kv_block_stride);
+
+        const cache_t* v_fetch_ptr =
+            v_ptr3 + vfetch_depth * CONTIGUOUS_KV_ELEMS_16B_LOAD;
+        const _B16x8* v_fetch_ptr_16B =
+            reinterpret_cast<const _B16x8*>(v_fetch_ptr);
+        Vlocal[vtoken_depth][vhe_depth][vfetch_depth] = *v_fetch_ptr_16B;
+      }
+    }
+  }
+
+  floatx8 dout[TLOOP];
+  // qk wmma
+  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
+    dout[token_depth] = {0};
+    for (int qkhe_depth = 0; qkhe_depth < QKHELOOP; qkhe_depth++) {
+      dout[token_depth] = gcn_wmma16x16x16_instr<scalar_t, 0, 0, 0>(
+          Klocal[token_depth][qkhe_depth].u16x8, Qlocal[qkhe_depth].u16x8,
+          dout[token_depth]);
+    }
+    dout[token_depth] *= scale;
+  }
+
+  // calculate qk_max and exp_sum per warp and write to shared memory
+  float qk_max = -FLT_MAX;
+  float exp_sum = 0.0f;
+  const int qkout_token_idx =
+      partition_start_token_idx + TOKENS_PER_WARP * warpid + rowid * 8;
+  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
+    const int local_token_idx = qkout_token_idx + token_depth * 16;
+    for (int i = 0; i < 8; i++) {
+      const float tmp =
+          (local_token_idx + i < context_len) ? dout[token_depth][i] : -FLT_MAX;
+      qk_max = fmaxf(qk_max, tmp);
+    }
+  }
+
+  qk_max = fmaxf(qk_max, __shfl_xor(qk_max, 16));
+
+  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
+    const int local_token_idx = qkout_token_idx + token_depth * 16;
+    for (int i = 0; i < 8; i++) {
+      const float tmp = (local_token_idx + i < context_len)
+                            ? __expf(dout[token_depth][i] - qk_max)
+                            : 0.0f;
+      dout[token_depth][i] = tmp;
+      exp_sum += tmp;
+    }
+  }
+
+  exp_sum += __shfl_xor(exp_sum, 16);
+
+  __syncthreads();
+
+  if (laneid < 16) {
+    shared_qk_max[warpid][lane16id] = qk_max;
+    shared_exp_sum[warpid][lane16id] = exp_sum;
+  }
+
+  __syncthreads();
+
+  // calculate partition qk_max and exp_sum
+  float partition_qk_max = -FLT_MAX;
+  float warp_qk_max_exp[NWARPS];
+  float partition_exp_sum = 0.0f;
+
+  #pragma unroll
+  for (int w = 0; w < NWARPS; w++) {
+    warp_qk_max_exp[w] = shared_qk_max[w][lane16id];
+    partition_qk_max = fmaxf(partition_qk_max, warp_qk_max_exp[w]);
+  }
+
+  for (int w = 0; w < NWARPS; w++) {
+    warp_qk_max_exp[w] = __expf(warp_qk_max_exp[w] - partition_qk_max);
+    partition_exp_sum += shared_exp_sum[w][lane16id] * warp_qk_max_exp[w];
+  }
+
+  const float inv_sum_scale =
+      __fdividef(1.f, partition_exp_sum + 1e-6f) * warp_qk_max_exp[warpid];
+
+  __syncthreads();
+
+  // write logits to shared mem
+  #pragma unroll
+  for (int token_depth = 0; token_depth < TLOOP; token_depth++) {
+    dout[token_depth] *= inv_sum_scale;
+    shared_logits[warpid][token_depth][lane16id][rowid] =
+        from_floatx8<scalar_t>(dout[token_depth]);
+  }
+
+  // write out partition max_logits and exp_sum
+  if (threadIdx.x < GQA_RATIO) {
+    const int qhead_idx = lane16id;
+    const int offset = seq_idx * total_num_heads * max_num_partitions +
+                       (wg_start_head_idx + qhead_idx) * max_num_partitions +
+                       partition_idx;
+    max_logits[offset] = partition_qk_max;
+    exp_sums[offset] = partition_exp_sum;
+  }
+
+  __syncthreads();
+
+  _B16x8 outelems[VHELOOP];
+  // Softmax V wmma
+  // v layout: 16he across lanes x 16 tokens per lane
+  for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) {
+    floatx8 tmp_out = {0};
+
+    for (int vtoken_depth = 0; vtoken_depth < VTLOOP; vtoken_depth++) {
+      for (int vfetch_depth = 0; vfetch_depth < VTLANELOOP; vfetch_depth++) {
+        const int offset = rowid * VTLANELOOP + vfetch_depth;
+        const int offset1 = offset % ROWS_PER_WARP;
+        const int offset2 = offset / ROWS_PER_WARP;
+        // if output format is 16 qheads across 16 lanes, 16 head elems spread
+        // across rows
+        tmp_out = gcn_wmma16x16x16_instr<scalar_t, 0, 0, 0>(
+            Vlocal[vtoken_depth][vhe_depth][vfetch_depth].u16x8,
+            shared_logits[vtoken_depth][offset2][lane16id][offset1].u16x8,
+            tmp_out);
+      }
+    }
+    outelems[vhe_depth] = from_floatx8<scalar_t>(tmp_out);
+  }
+
+  __syncthreads();
+
+  #pragma unroll
+  for (int vhe_depth = 0; vhe_depth < VHELOOP; vhe_depth++) {
+    shared_logits[warpid][vhe_depth][lane16id][rowid] =
+        outelems[vhe_depth];  // lane16 id head dimension; rowid head element
+                              // dimension
+  }
+
+  __syncthreads();
+
+  // write to tmp_out with coalesced writes after reading from shared mem
+  if (warpid == 0) {
+    _B16x8 vout[GQA_RATIO2];
+    // each lane writes out 16Bytes of tmp_out along head elem dimension
+    const int head_elem_idx = lane16id * 8;
+    if (head_elem_idx < HEAD_SIZE) {
+      for (int h = 0; h < GQA_RATIO2; h++) {
+        const int local_head_idx = 2 * h + rowid;
+        const int offset1 = (head_elem_idx / 16) % NWARPS;
+        const int offset2 = head_elem_idx / 16 / NWARPS;
+        const int offset3 = (head_elem_idx / 8) % 2;  // num_he % num_row
+        vout[h] = shared_logits[offset1][offset2][local_head_idx][offset3];
+      }
+
+      const int hsz_maxp_mult = HEAD_SIZE * max_num_partitions;
+      scalar_t* out_ptr = out + seq_idx * total_num_heads * hsz_maxp_mult +
+                          partition_idx * HEAD_SIZE;
+      for (int h = 0; h < GQA_RATIO2; h++) {
+        const int local_head_idx = 2 * h + rowid;
+        if (local_head_idx < GQA_RATIO) {
+          const int out_head_idx = wg_start_head_idx + local_head_idx;
+          scalar_t* out_ptr2 = out_ptr + out_head_idx * hsz_maxp_mult;
+          scalar_t* out_ptr3 = out_ptr2 + head_elem_idx;
+          _B16x8* out_ptr_B16x8 = reinterpret_cast<_B16x8*>(out_ptr3);
+          *out_ptr_B16x8 = vout[h];
+        }
+      }
+    }
+  }
+}
+
+// Unreachable stub; must accept the LAUNCH_CUSTOM_ATTENTION_MFMA4 arguments.
+template <typename scalar_t, typename cache_t,
+          vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
+          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED, int GQA_RATIO>
+__global__
+__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
+    const scalar_t* __restrict__ q,       // [num_seqs, num_heads, head_size]
+    const cache_t* __restrict__ k_cache,  // [num_blocks, num_kv_heads,
+                                          // head_size/x, block_size, x]
+    const cache_t* __restrict__ v_cache,  // [num_blocks, num_kv_heads,
+                                          // head_size, block_size]
+    const int num_kv_heads, const float scale,
+    const int* __restrict__ block_tables,  // [num_seqs, max_num_blocks_per_seq]
+    const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs + 1] or nullptr
+    const int max_num_blocks_per_seq,
+    const float* __restrict__ alibi_slopes,  // [num_heads]
+    const int q_stride, const int kv_block_stride, const int kv_head_stride,
+    float* __restrict__ exp_sums,  // [num_seqs, num_heads, max_num_partitions]
+    float* __restrict__ max_logits,  // same shape as exp_sums
+    scalar_t* __restrict__ out,    // [num_seqs, num_heads, max_num_partitions,
+                                   // head_size]
+    OUTT* __restrict__ final_out,  // [num_seqs, num_heads, head_size]
+    int max_ctx_blocks, const float* k_scale, const float* v_scale,
+    const float* __restrict__ fp8_out_scale_ptr) {  // unused; keeps arity
+  UNREACHABLE_CODE
+}
+
+// Grid: (num_heads, num_seqs). Each thread produces one element of the head.
+template <typename scalar_t, typename OUTT, int HEAD_SIZE, int NUM_THREADS,
+          int PARTITION_SIZE, int NPAR_LOOPS>
+__global__
+__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
+    OUTT* __restrict__ out,                // [num_seqs, num_heads, head_size]
+    const float* __restrict__ exp_sums,    // [num_seqs, num_heads,
+                                           // max_num_partitions]
+    const float* __restrict__ max_logits,  // [num_seqs, num_heads,
+                                           // max_num_partitions]
+    const scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads,
+                                           // max_num_partitions, head_size]
+    const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs + 1] or nullptr
+    const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) {
+  const auto num_heads = gridDim.x;
+  const auto head_idx = blockIdx.x;
+  const auto seq_idx = blockIdx.y;
+
+  // NOTE: queries whose length is not 1 are prefills, handled by another
+  // kernel; this block returns without writing any output.
+  if (query_start_loc_ptr != nullptr &&
+      (query_start_loc_ptr[seq_idx + 1] - query_start_loc_ptr[seq_idx] != 1)) {
+    return;
+  }
+
+  const int context_len = context_lens[seq_idx];
+  const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
+  [[maybe_unused]] constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  const int warpid = threadIdx.x / WARP_SIZE;
+  [[maybe_unused]] const int laneid = threadIdx.x % WARP_SIZE;
+
+  __shared__ float shared_global_exp_sum;
+  // max num partitions supported is WARP_SIZE * NPAR_LOOPS
+  __shared__ float shared_exp_sums[NPAR_LOOPS * WARP_SIZE];
+
+  if (warpid == 0) {
+    const float* max_logits_ptr = max_logits +
+                                  seq_idx * num_heads * max_num_partitions +
+                                  head_idx * max_num_partitions;
+
+    // lanes beyond num_partitions are clamped to the last valid partition so
+    // their loads stay in bounds; their exp-sum is zeroed further down
+    int valid_partition[NPAR_LOOPS];
+    float reg_max_logit[NPAR_LOOPS];
+    const int last_valid_partition = num_partitions - 1;
+
+  #pragma unroll
+    for (int i = 0; i < NPAR_LOOPS; i++) {
+      const int partition_no = i * WARP_SIZE + threadIdx.x;
+      valid_partition[i] =
+          (partition_no < num_partitions) ? partition_no : last_valid_partition;
+    }
+  #pragma unroll
+    for (int i = 0; i < NPAR_LOOPS; i++) {
+      reg_max_logit[i] = max_logits_ptr[valid_partition[i]];
+    }
+    float max_logit = reg_max_logit[0];
+  #pragma unroll
+    for (int i = 1; i < NPAR_LOOPS; i++) {
+      max_logit = fmaxf(max_logit, reg_max_logit[i]);
+    }
+
+  #pragma unroll
+    for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+      max_logit = fmaxf(max_logit, __shfl_xor(max_logit, mask));
+    }
+
+    const float* exp_sums_ptr = exp_sums +
+                                seq_idx * num_heads * max_num_partitions +
+                                head_idx * max_num_partitions;
+
+    float rescaled_exp_sum[NPAR_LOOPS];
+  #pragma unroll
+    for (int i = 0; i < NPAR_LOOPS; i++) {
+      rescaled_exp_sum[i] = exp_sums_ptr[valid_partition[i]];
+    }
+  #pragma unroll
+    for (int i = 0; i < NPAR_LOOPS; i++) {
+      const int partition_no = i * WARP_SIZE + threadIdx.x;
+      rescaled_exp_sum[i] *= (partition_no < num_partitions)
+                                 ? expf(reg_max_logit[i] - max_logit)
+                                 : 0.0f;
+    }
+    float global_exp_sum = rescaled_exp_sum[0];
+  #pragma unroll
+    for (int i = 1; i < NPAR_LOOPS; i++) {
+      global_exp_sum += rescaled_exp_sum[i];
+    }
+  #pragma unroll
+    for (int i = 0; i < NPAR_LOOPS; i++) {
+      const int partition_no = i * WARP_SIZE + threadIdx.x;
+      shared_exp_sums[partition_no] = rescaled_exp_sum[i];
+    }
+
+  #pragma unroll
+    for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) {
+      global_exp_sum += __shfl_xor(global_exp_sum, mask);
+    }
+    if (threadIdx.x == 0) {
+      shared_global_exp_sum = global_exp_sum;
+    }
+  }  // warpid == 0
+  const scalar_t* tmp_out_ptr =
+      tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
+      head_idx * max_num_partitions * HEAD_SIZE + threadIdx.x;
+  constexpr int MAX_NPAR = 32;
+  scalar_t tmps[MAX_NPAR];
+  const float dzero = 0.0f;
+  #pragma unroll
+  for (int j = 0; j < MAX_NPAR; j++) {
+    tmps[j] = from_float<scalar_t>(dzero);
+  }
+  const int last_partition_offset = (num_partitions - 1) * HEAD_SIZE;
+  const int num_partition_offset = (num_partitions)*HEAD_SIZE;
+  int idx = 0;
+
+  constexpr int JCHUNK = 16;
+
+  #pragma unroll
+  for (int j = 0; j < JCHUNK * HEAD_SIZE; j += HEAD_SIZE) {
+    // past-the-end offsets are clamped to the last valid partition
+    const int lastj_offset =
+        (j < num_partition_offset) ? j : last_partition_offset;
+    tmps[idx] = tmp_out_ptr[lastj_offset];
+    idx++;
+  }
+  __syncthreads();
+
+  if (num_partitions > JCHUNK) {
+  #pragma unroll
+    for (int j = JCHUNK * HEAD_SIZE; j < 2 * JCHUNK * HEAD_SIZE;
+         j += HEAD_SIZE) {
+      const int lastj_offset =
+          (j < num_partition_offset) ? j : last_partition_offset;
+      tmps[idx] = tmp_out_ptr[lastj_offset];
+      idx++;
+    }
+
+    if (num_partitions > 2 * JCHUNK) {
+  #pragma unroll
+      for (int j = 2 * JCHUNK * HEAD_SIZE; j < MAX_NPAR * HEAD_SIZE;
+           j += HEAD_SIZE) {
+        const int lastj_offset =
+            (j < num_partition_offset) ? j : last_partition_offset;
+        tmps[idx] = tmp_out_ptr[lastj_offset];
+        idx++;
+      }
+    }
+  }  // num_partitions > JCHUNK
+
+  // Weighted sum of the partition outputs, using the rescaled exp sums.
+  float acc = 0.0f;
+  #pragma unroll
+  for (int j = 0; j < JCHUNK; j++) {
+    acc += to_float<scalar_t>(tmps[j]) * shared_exp_sums[j];
+  }
+  if (num_partitions > JCHUNK) {
+  #pragma unroll
+    for (int j = JCHUNK; j < 2 * JCHUNK; j++) {
+      acc += to_float<scalar_t>(tmps[j]) * shared_exp_sums[j];
+    }
+    if (num_partitions > 2 * JCHUNK) {
+  #pragma unroll
+      for (int j = 2 * JCHUNK; j < MAX_NPAR; j++) {
+        acc += to_float<scalar_t>(tmps[j]) * shared_exp_sums[j];
+      }
+    }
+  }
+
+  for (int p = 1; p < NPAR_LOOPS; p++) {
+    if (num_partitions > p * MAX_NPAR) {
+      idx = 0;
+  #pragma unroll
+      for (int j = p * MAX_NPAR * HEAD_SIZE; j < (p + 1) * MAX_NPAR * HEAD_SIZE;
+           j += HEAD_SIZE) {
+        // past-the-end offsets are clamped to the last valid partition
+        const int lastj_offset =
+            (j < num_partition_offset) ? j : last_partition_offset;
+        tmps[idx] = tmp_out_ptr[lastj_offset];
+        idx++;
+      }
+
+  #pragma unroll
+      for (int j = 0; j < MAX_NPAR; j++) {
+        acc += to_float<scalar_t>(tmps[j]) * shared_exp_sums[j + p * MAX_NPAR];
+      }
+    }
+  }
+
+  const float inv_global_exp_sum =
+      __fdividef(1.0f, shared_global_exp_sum + 1e-6f);
+  acc *= inv_global_exp_sum;
+
+  const int64_t query_start_off = static_cast<int64_t>(
+      query_start_loc_ptr ? query_start_loc_ptr[seq_idx] : seq_idx);
+  OUTT* out_ptr = out + query_start_off * num_heads * HEAD_SIZE +
+                  static_cast<int64_t>(head_idx) * HEAD_SIZE;
+  out_ptr[threadIdx.x] = from_float<scalar_t>(acc);
+}
+
+#else
+
+// clang-format off
+template <typename scalar_t, typename cache_t,
+          vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
+          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED,
+          int GQA_RATIO>
+__global__
+__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma16_kernel(
+    const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
+    const cache_t* __restrict__ k_cache,    // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+    const cache_t* __restrict__ v_cache,    // [num_blocks, num_kv_heads, head_size, block_size]
+    const int num_kv_heads,
+    const float scale,
+    const int* __restrict__ block_tables,    // [num_seqs, max_num_blocks_per_seq]
+    const int* __restrict__ context_lens,    // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs + 1] or nullptr
+    const int max_num_blocks_per_seq,
+    const float* __restrict__ alibi_slopes,  // [num_heads]
+    const int q_stride,
+    const int kv_block_stride,
+    const int kv_head_stride,
+    float* __restrict__ exp_sums,             // [num_seqs, num_heads, max_num_partitions]
+    float* __restrict__ max_logits,           // [num_seqs, num_heads, max_num_partitions]
+    scalar_t* __restrict__ out,               // [num_seqs, num_heads, max_num_partitions, head_size]
+    OUTT* __restrict__ final_out,             // [num_seqs, num_heads, head_size]
+    int max_ctx_blocks, const float* k_scale, const float* v_scale,
+    const float* __restrict__ fp8_out_scale_ptr) {
+  UNREACHABLE_CODE  // #else-branch stub; presumably traps if ever run - TODO confirm
+}
+
+template <typename scalar_t, typename cache_t,
+          vllm::Fp8KVCacheDataType KV_DTYPE, typename OUTT, int BLOCK_SIZE,
+          int HEAD_SIZE, int NUM_THREADS, bool ALIBI_ENABLED,
+          int GQA_RATIO>
+__global__
+__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_QKV_mfma4_kernel(
+    const scalar_t* __restrict__ q,          // [num_seqs, num_heads, head_size]
+    const cache_t* __restrict__ k_cache,     // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+    const cache_t* __restrict__ v_cache,     // [num_blocks, num_kv_heads, head_size, block_size]
+    const int num_kv_heads,
+    const float scale,
+    const int* __restrict__ block_tables,    // [num_seqs, max_num_blocks_per_seq]
+    const int* __restrict__ context_lens,    // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs + 1] or nullptr
+    const int max_num_blocks_per_seq,
+    const float* __restrict__ alibi_slopes,  // [num_heads]
+    const int q_stride,
+    const int kv_block_stride,
+    const int kv_head_stride,
+    float* __restrict__ exp_sums,            // [num_seqs, num_heads, max_num_partitions]
+    float* __restrict__ max_logits,          // [num_seqs, num_heads, max_num_partitions]
+    scalar_t* __restrict__ out,              // [num_seqs, num_heads, max_num_partitions, head_size]
+    OUTT* __restrict__ final_out,            // [num_seqs, num_heads, head_size]
+    int max_ctx_blocks, const float* k_scale, const float* v_scale,
+    const float* __restrict__ fp8_out_scale_ptr) {
+  UNREACHABLE_CODE  // #else-branch stub; presumably traps if ever run - TODO confirm
+}
+
+// Grid: (num_heads, num_seqs). Stub with the same signature as the real kernel.
+template <typename scalar_t, typename OUTT, int HEAD_SIZE, int NUM_THREADS,
+          int PARTITION_SIZE, int NPAR_LOOPS>
+__global__
+__launch_bounds__(NUM_THREADS) void paged_attention_ll4mi_reduce_kernel(
+    OUTT* __restrict__ out,                // [num_seqs, num_heads, head_size]
+    const float* __restrict__ exp_sums,    // [num_seqs, num_heads, max_num_partitions]
+    const float* __restrict__ max_logits,  // [num_seqs, num_heads, max_num_partitions]
+    const scalar_t* __restrict__ tmp_out,  // [num_seqs, num_heads, max_num_partitions, head_size]
+    const int* __restrict__ context_lens,  // [num_seqs]
+    const int* __restrict__ query_start_loc_ptr,  // [num_seqs + 1] or nullptr
+    const int max_num_partitions, const float* __restrict__ fp8_out_scale_ptr) {
+  UNREACHABLE_CODE  // #else-branch stub; presumably traps if ever run - TODO confirm
+}
+// clang-format on
+
+#endif
+
+#define LAUNCH_CUSTOM_ATTENTION_MFMA16(GQA_RATIO)                              \
+  paged_attention_ll4mi_QKV_mfma16_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE,  \
+                                          HEAD_SIZE, NTHR, ALIBI_ENABLED,      \
+                                          GQA_RATIO>                           \
+      <<<grid, block, 0, stream>>>( /* relies on caller-scope locals */        \
+          query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale,      \
+          block_tables_ptr, context_lens_ptr, query_start_loc_ptr,             \
+          max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \
+          kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr,  \
+          max_ctx_blocks, k_scale_ptr, v_scale_ptr, fp8_out_scale_ptr);
+
+#define LAUNCH_CUSTOM_ATTENTION_MFMA4(GQA_RATIO)                               \
+  paged_attention_ll4mi_QKV_mfma4_kernel<T, KVT, KV_DTYPE, OUTT, BLOCK_SIZE,   \
+                                         HEAD_SIZE, NTHR, ALIBI_ENABLED,       \
+                                         GQA_RATIO>                            \
+      <<<grid, block, 0, stream>>>( /* relies on caller-scope locals */        \
+          query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale,      \
+          block_tables_ptr, context_lens_ptr, query_start_loc_ptr,             \
+          max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, kv_block_stride, \
+          kv_head_stride, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, out_ptr,  \
+          max_ctx_blocks, k_scale_ptr, v_scale_ptr, fp8_out_scale_ptr);
+
+#define LAUNCH_CUSTOM_REDUCTION(NPAR_LOOPS)                          \
+  paged_attention_ll4mi_reduce_kernel<T, OUTT, HEAD_SIZE, HEAD_SIZE, \
+                                      PARTITION_SIZE, NPAR_LOOPS>    \
+      <<<reduce_grid, reduce_block, 0, stream>>>(                    \
+          out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr,        \
+          context_lens_ptr, query_start_loc_ptr, max_num_partitions, \
+          fp8_out_scale_ptr);  // NUM_THREADS is instantiated as HEAD_SIZE
+
+template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE,
+          int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD,
+          bool ALIBI_ENABLED>
+void paged_attention_custom_launcher(
+    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
+    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
+    torch::Tensor& value_cache, const int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& context_lens,
+    const std::optional<torch::Tensor>& query_start_loc, int max_context_len,
+    const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
+    torch::Tensor& v_scale, const std::optional<torch::Tensor>& fp8_out_scale) {
+  int num_seqs = block_tables.size(0);
+  int num_heads = query.size(1);
+  int head_size = query.size(2);
+  int max_num_blocks_per_seq = block_tables.size(1);
+  int q_stride = query.stride(0);
+  int kv_block_stride = key_cache.stride(0);
+  int kv_head_stride = key_cache.stride(1);
+
+  // NOTE: query start location is optional; V0 decode should not use it.
+  // If the batch contains a mix of prefills and decodes, prefills are skipped.
+  const int* query_start_loc_ptr =
+      query_start_loc
+          ? reinterpret_cast<const int*>(query_start_loc.value().data_ptr())
+          : nullptr;
+
+  // NOTE: alibi_slopes is optional.
+  const float* alibi_slopes_ptr =
+      alibi_slopes
+          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
+          : nullptr;
+
+  float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
+  float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
+  T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
+  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
+  KVT* key_cache_ptr = reinterpret_cast<KVT*>(key_cache.data_ptr());
+  KVT* value_cache_ptr = reinterpret_cast<KVT*>(value_cache.data_ptr());
+  int* block_tables_ptr = block_tables.data_ptr<int>();
+  int* context_lens_ptr = context_lens.data_ptr<int>();
+  const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
+  const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
+  // NOTE: fp8_out_scale is optional.
+  const auto fp8_out_scale_ptr =
+      fp8_out_scale
+          ? static_cast<const float*>(fp8_out_scale.value().data_ptr())
+          : nullptr;
+  OUTT* out_ptr = reinterpret_cast<OUTT*>(out.data_ptr());
+
+  const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE);
+
+  // Partition size is fixed at 256 since both the mfma4 and mfma16 kernels
+  // support it (the mfma4 kernel also supports a partition size of 512).
+  constexpr int PARTITION_SIZE = 256;
+  const int max_num_partitions =
+      DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
+  const int gqa_ratio = num_heads / num_kv_heads;
+  assert(num_heads % num_kv_heads == 0);
+  assert(head_size == HEAD_SIZE);
+
+  constexpr int NTHR = 256;
+  dim3 grid(num_seqs, max_num_partitions, num_kv_heads);
+  dim3 block(NTHR);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  // mfma4 kernel is faster than mfma16 for gqa_ratio <= 4
+  switch (gqa_ratio) {
+    case 1:
+      LAUNCH_CUSTOM_ATTENTION_MFMA4(1);
+      break;
+    case 2:
+      LAUNCH_CUSTOM_ATTENTION_MFMA4(2);
+      break;
+    case 3:
+      LAUNCH_CUSTOM_ATTENTION_MFMA4(3);
+      break;
     case 4:
       LAUNCH_CUSTOM_ATTENTION_MFMA4(4);
       break;
@@ -1750,13 +3257,195 @@ void paged_attention_custom_launcher(
   }
 }
 
-#define CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT,      \
-                             PSIZE, ALIBI_ENABLED)                             \
-  paged_attention_custom_launcher<T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, \
-                                  PSIZE, ALIBI_ENABLED>(                       \
-      out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,       \
-      num_kv_heads, scale, block_tables, context_lens, query_start_loc,        \
-      max_context_len, alibi_slopes, k_scale, v_scale, fp8_out_scale);
+template <typename T, typename KVT, vllm::Fp8KVCacheDataType KV_DTYPE,
+          int BLOCK_SIZE, int HEAD_SIZE, typename OUTT, int PARTITION_SIZE_OLD,
+          bool ALIBI_ENABLED>
+void paged_attention_custom_launcher_navi(
+    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
+    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
+    torch::Tensor& value_cache, const int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& context_lens,
+    const std::optional<torch::Tensor>& query_start_loc, int max_context_len,
+    const std::optional<torch::Tensor>& alibi_slopes, torch::Tensor& k_scale,
+    torch::Tensor& v_scale) {
+  int num_seqs = block_tables.size(0);
+  int num_heads = query.size(1);
+  int head_size = query.size(2);
+  int max_num_blocks_per_seq = block_tables.size(1);
+  int q_stride = query.stride(0);
+  int kv_block_stride = key_cache.stride(0);
+  int kv_head_stride = key_cache.stride(1);
+
+  // NOTE: query start location is optional; V0 decode should not use it.
+  // If the batch contains a mix of prefills and decodes, prefills are skipped.
+  const int* query_start_loc_ptr =
+      query_start_loc
+          ? reinterpret_cast<const int*>(query_start_loc.value().data_ptr())
+          : nullptr;
+
+  // NOTE: Navi does not support alibi_slopes; the argument is ignored here.
+  const float* alibi_slopes_ptr = nullptr;
+
+  float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
+  float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
+  T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
+  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
+  KVT* key_cache_ptr = reinterpret_cast<KVT*>(key_cache.data_ptr());
+  KVT* value_cache_ptr = reinterpret_cast<KVT*>(value_cache.data_ptr());
+  int* block_tables_ptr = block_tables.data_ptr<int>();
+  int* context_lens_ptr = context_lens.data_ptr<int>();
+
+  const float* k_scale_ptr = reinterpret_cast<const float*>(k_scale.data_ptr());
+  const float* v_scale_ptr = reinterpret_cast<const float*>(v_scale.data_ptr());
+  // NOTE: Navi does not support fp8, so a null output scale is passed on.
+  const auto fp8_out_scale_ptr = nullptr;
+  OUTT* out_ptr = reinterpret_cast<OUTT*>(out.data_ptr());
+
+  const int max_ctx_blocks = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE);
+
+  constexpr int PARTITION_SIZE = 256;
+  const int max_num_partitions =
+      DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
+  const int gqa_ratio = num_heads / num_kv_heads;
+  assert(num_heads % num_kv_heads == 0);
+  assert(head_size == HEAD_SIZE);
+
+  constexpr int NTHR = 256;
+  dim3 grid(num_seqs, max_num_partitions, num_kv_heads);
+  dim3 block(NTHR);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  switch (gqa_ratio) {
+    case 1:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(1);
+      break;
+    case 2:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(2);
+      break;
+    case 3:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(3);
+      break;
+    case 4:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(4);
+      break;
+    case 5:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(5);
+      break;
+    case 6:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(6);
+      break;
+    case 7:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(7);
+      break;
+    case 8:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(8);
+      break;
+    case 9:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(9);
+      break;
+    case 10:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(10);
+      break;
+    case 11:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(11);
+      break;
+    case 12:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(12);
+      break;
+    case 13:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(13);
+      break;
+    case 14:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(14);
+      break;
+    case 15:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(15);
+      break;
+    case 16:
+      LAUNCH_CUSTOM_ATTENTION_MFMA16(16);
+      break;
+    default:
+      TORCH_CHECK(false, "Unsupported gqa ratio: ", gqa_ratio);
+      break;
+  }
+
+  dim3 reduce_grid(num_heads, num_seqs);
+  dim3 reduce_block(head_size);  // one thread per head element
+  const int warp_size = 32;  // assumed to equal the reduce kernel's WARP_SIZE
+  const int npar_loops = DIVIDE_ROUND_UP(max_num_partitions, warp_size);
+  // reduction kernel supports up to 16 NPAR_LOOPS * 32 (warp_size) * 256
+  // (partition size) = 128K context length
+  switch (npar_loops) {
+    case 1:
+      LAUNCH_CUSTOM_REDUCTION(1);
+      break;
+    case 2:
+      LAUNCH_CUSTOM_REDUCTION(2);
+      break;
+    case 3:
+      LAUNCH_CUSTOM_REDUCTION(3);
+      break;
+    case 4:
+      LAUNCH_CUSTOM_REDUCTION(4);
+      break;
+    case 5:
+      LAUNCH_CUSTOM_REDUCTION(5);
+      break;
+    case 6:
+      LAUNCH_CUSTOM_REDUCTION(6);
+      break;
+    case 7:
+      LAUNCH_CUSTOM_REDUCTION(7);
+      break;
+    case 8:
+      LAUNCH_CUSTOM_REDUCTION(8);
+      break;
+    case 9:
+      LAUNCH_CUSTOM_REDUCTION(9);
+      break;
+    case 10:
+      LAUNCH_CUSTOM_REDUCTION(10);
+      break;
+    case 11:
+      LAUNCH_CUSTOM_REDUCTION(11);
+      break;
+    case 12:
+      LAUNCH_CUSTOM_REDUCTION(12);
+      break;
+    case 13:
+      LAUNCH_CUSTOM_REDUCTION(13);
+      break;
+    case 14:
+      LAUNCH_CUSTOM_REDUCTION(14);
+      break;
+    case 15:
+      LAUNCH_CUSTOM_REDUCTION(15);
+      break;
+    case 16:
+      LAUNCH_CUSTOM_REDUCTION(16);
+      break;
+    default:
+      TORCH_CHECK(false, "Unsupported npar_loops: ", npar_loops);
+      break;
+  }
+}
+
+#define CALL_CUSTOM_LAUNCHER(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT,   \
+                             PSIZE, ALIBI_ENABLED)                          \
+  if (!is_navi) { /* is_navi: bool local at the expansion site */           \
+    paged_attention_custom_launcher<T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE,  \
+                                    OUTT, PSIZE, ALIBI_ENABLED>(            \
+        out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,  \
+        num_kv_heads, scale, block_tables, context_lens, query_start_loc,   \
+        max_context_len, alibi_slopes, k_scale, v_scale, fp8_out_scale);    \
+  } else {                                                                  \
+    paged_attention_custom_launcher_navi<                                   \
+        T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE, OUTT, PSIZE, ALIBI_ENABLED>( \
+        out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache,  \
+        num_kv_heads, scale, block_tables, context_lens, query_start_loc,   \
+        max_context_len, alibi_slopes, k_scale, v_scale);                   \
+  }
 
 #define CALL_CUSTOM_LAUNCHER_ALIBI(T, KVT, KV_DTYPE, BLK_SIZE, HEAD_SIZE,    \
                                    OUTT, PSIZE)                              \
@@ -1813,6 +3502,24 @@ void paged_attention_custom_launcher(
       break;                                                    \
   }
 
+bool is_navi_gpu() {
+  // Queried once for the device that is current on the first call; the static
+  // initializer is thread-safe, unlike a hand-rolled is_cached flag.
+  static const bool result = [] {
+    int device_id = 0;
+    hipDeviceProp_t deviceProp;
+    TORCH_CHECK(hipGetDevice(&device_id) == hipSuccess,
+                "is_navi_gpu: hipGetDevice failed");
+    TORCH_CHECK(hipGetDeviceProperties(&deviceProp, device_id) == hipSuccess,
+                "is_navi_gpu: hipGetDeviceProperties failed");
+
+    // Treated as Navi when the architecture name starts with gfx11 or gfx12.
+    const std::string arch = deviceProp.gcnArchName;
+    return arch.rfind("gfx11", 0) == 0 || arch.rfind("gfx12", 0) == 0;
+  }();
+  return result;
+}
+
 // clang-format off
 void paged_attention(
     torch::Tensor& out,         // [num_seqs, num_heads, head_size]
@@ -1833,6 +3540,8 @@ void paged_attention(
     torch::Tensor& v_scale,
     const std::optional<torch::Tensor>& fp8_out_scale) {
   // clang-format on
+  bool is_navi = is_navi_gpu();
+
   const int head_size = query.size(2);
   if (kv_cache_dtype == "auto") {
     if (query.dtype() == at::ScalarType::Half) {
diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
index 9c8a50332ad0..c22523da4e43 100644
--- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
@@ -8,6 +8,8 @@
 
 #include <ATen/cuda/CUDAContext.h>
 
+#include "cuda_utils.h"
+
 #include "cutlass/cutlass.h"
 
 #include "cutlass/gemm/device/gemm_universal_adapter.h"
@@ -95,9 +97,9 @@ struct cutlass_sparse_3x_gemm {
   // clang-format off
   using CollectiveMainloop =
       typename cutlass::gemm::collective::CollectiveBuilder<
-          cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp, 
-          ElementAB, cutlass::layout::RowMajor, AlignmentAB, 
-          ElementAB, cutlass::layout::ColumnMajor, AlignmentAB, 
+          cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
+          ElementAB, cutlass::layout::RowMajor, AlignmentAB,
+          ElementAB, cutlass::layout::ColumnMajor, AlignmentAB,
           ElementAcc, TileShape, ClusterShape,
           Stages,
           KernelSchedule>::CollectiveOp;
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
index 83d512d97dbe..65e2e9fa7990 100644
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -488,41 +488,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       "                   Tensor page_table, float scale) -> ()");
   ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);
 
-  // Mamba selective scan kernel
-  ops.def(
-      "selective_scan_fwd(Tensor! u, Tensor! delta,"
-      "Tensor! A, Tensor! B, Tensor! C,"
-      "Tensor? D_, Tensor!? z_, Tensor? delta_bias_,"
-      "bool delta_softplus,"
-      "Tensor? query_start_loc,"
-      "Tensor? cache_indices,"
-      "Tensor? has_initial_state,"
-      "Tensor! ssm_states,"
-      "int pad_slot_id) -> ()");
-  ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
-
-  ops.def(
-      "causal_conv1d_update(Tensor! x,"
-      "Tensor! conv_state,"
-      "Tensor! weight,"
-      "Tensor? bias_,"
-      "bool silu_activation,"
-      "Tensor? cache_seqlens_,"
-      "Tensor? conv_state_indices,"
-      "int pad_slot_id) -> ()");
-  ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);
-
-  ops.def(
-      "causal_conv1d_fwd(Tensor! x, Tensor! weight,"
-      "Tensor? bias_,"
-      "Tensor!? conv_states,"
-      "Tensor? query_start_loc,"
-      "Tensor? cache_indices,"
-      "Tensor? has_initial_state,"
-      "bool silu_activation,"
-      "int pad_slot_id) -> ()");
-  ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
-
   // Compute NVFP4 block quantized tensor.
   ops.def(
       "scaled_fp4_quant(Tensor! output, Tensor input,"
@@ -590,6 +555,41 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
   ops.impl("dynamic_scaled_int8_quant", torch::kCUDA,
            &dynamic_scaled_int8_quant);
 
+  // Mamba selective scan kernel
+  ops.def(
+      "selective_scan_fwd(Tensor! u, Tensor! delta,"
+      "Tensor! A, Tensor! B, Tensor! C,"
+      "Tensor? D_, Tensor!? z_, Tensor? delta_bias_,"
+      "bool delta_softplus,"
+      "Tensor? query_start_loc,"
+      "Tensor? cache_indices,"
+      "Tensor? has_initial_state,"
+      "Tensor! ssm_states,"
+      "int pad_slot_id) -> ()");
+  ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
+
+  ops.def(
+      "causal_conv1d_update(Tensor! x,"
+      "Tensor! conv_state,"
+      "Tensor! weight,"
+      "Tensor? bias_,"
+      "bool silu_activation,"
+      "Tensor? cache_seqlens_,"
+      "Tensor? conv_state_indices,"
+      "int pad_slot_id) -> ()");
+  ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);
+
+  ops.def(
+      "causal_conv1d_fwd(Tensor! x, Tensor! weight,"
+      "Tensor? bias_,"
+      "Tensor!? conv_states,"
+      "Tensor? query_start_loc,"
+      "Tensor? cache_indices,"
+      "Tensor? has_initial_state,"
+      "bool silu_activation,"
+      "int pad_slot_id) -> ()");
+  ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
+
 #ifndef USE_ROCM
   // reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
   ops.def(
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 97a7879da876..24986a1b73b1 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -2,8 +2,8 @@
 # to run the OpenAI compatible server.
 
 # Please update any changes made here to
-# docs/source/contributing/dockerfile/dockerfile.md and
-# docs/source/assets/contributing/dockerfile-stages-dependency.png
+# docs/contributing/dockerfile/dockerfile.md and
+# docs/assets/contributing/dockerfile-stages-dependency.png
 
 ARG CUDA_VERSION=12.8.1
 #################### BASE BUILD IMAGE ####################
@@ -189,6 +189,8 @@ WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM
 
+SHELL ["/bin/bash", "-c"]
+
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
     echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
 
@@ -255,15 +257,17 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-    # uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.4/flashinfer_python-0.2.4+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
-    # TESTING: install FlashInfer from source to test 2.7.0 final RC
+    # FlashInfer already has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
     if [[ "$CUDA_VERSION" == 12.8* ]]; then \
-        export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'; \
+        uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl; \
     else \
         export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \
-    fi && \
-    export FLASHINFER_ENABLE_AOT=1; \
-    uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
+        CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
+        if [ "$CUDA_MAJOR" -lt 12 ]; then \
+            export FLASHINFER_ENABLE_SM90=0; \
+        fi; \
+        uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
+    fi \
 fi
 COPY examples examples
 COPY benchmarks benchmarks
@@ -273,7 +277,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
 uv pip list
 
-# Although we build Flashinfer with AOT mode, there's still
+# Even when we build Flashinfer with AOT mode, there are still
 # some issues w.r.t. JIT compilation. Therefore we need to
 # install build dependencies for JIT compilation.
 # TODO: Remove this once FlashInfer AOT wheel is fixed
@@ -301,8 +305,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
 
 # install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/dev.txt
+RUN --mount=type=cache,target=/root/.cache/uv \   
+    CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
+    if [ "$CUDA_MAJOR" -ge 12 ]; then \
+        uv pip install --system -r requirements/dev.txt; \
+    fi
 
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -321,7 +328,9 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
 # will not be imported by other tests
 RUN mkdir test_docs
 RUN mv docs test_docs/
+RUN cp -r examples test_docs/
 RUN mv vllm test_docs/
+RUN mv mkdocs.yaml test_docs/
 #################### TEST IMAGE ####################
 
 #################### OPENAI API SERVER ####################
diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu
index c647d9036f40..5395b3884fb5 100644
--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -51,9 +51,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install --upgrade pip && \
     uv pip install -r requirements/cpu.txt
 
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0
-
 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
 
 RUN echo 'ulimit -c 0' >> ~/.bashrc
diff --git a/docker/Dockerfile.neuron b/docker/Dockerfile.neuron
index 2b63fe301bac..259dc5a23f78 100644
--- a/docker/Dockerfile.neuron
+++ b/docker/Dockerfile.neuron
@@ -1,6 +1,6 @@
 # default base image
 # https://gallery.ecr.aws/neuron/pytorch-inference-neuronx
-ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.5.1-neuronx-py310-sdk2.22.0-ubuntu22.04"
+ARG BASE_IMAGE="public.ecr.aws/neuron/pytorch-inference-neuronx:2.6.0-neuronx-py310-sdk2.23.0-ubuntu22.04"
 
 FROM $BASE_IMAGE
 
@@ -22,8 +22,7 @@ WORKDIR ${APP_MOUNT}/vllm
 
 RUN python3 -m pip install --upgrade pip
 RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas tenacity
-RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
-RUN python3 -m pip install neuronx-cc==2.17.194.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install neuronx-cc==2.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
 RUN python3 -m pip install pytest
 
 # uninstall transformers-neuronx package explicitly to avoid version conflict
@@ -49,6 +48,8 @@ RUN python3 -m pip install -e tests/vllm_test_utils
 # FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
 RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
 
+RUN python3 -m pip install sentencepiece transformers==4.48.0 -U
+
 # overwrite entrypoint to run bash script
 RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py
 
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
index 39b4dd9640d9..fe8d9cf23d7b 100644
--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -114,12 +114,6 @@ ENV TOKENIZERS_PARALLELISM=false
 # ENV that can improve safe tensor loading, and end-to-end time
 ENV SAFETENSORS_FAST_GPU=1
 
-# User-friendly environment setting for multi-processing to avoid below RuntimeError.
-# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
-# you must use the 'spawn' start method 
-# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
-ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
-
 # Performance environment variable.
 ENV HIP_FORCE_DEV_KERNARG=1
 
diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base
index 8300228fe3f2..2a23ee5a9db1 100644
--- a/docker/Dockerfile.rocm_base
+++ b/docker/Dockerfile.rocm_base
@@ -10,7 +10,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
 ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
 ARG FA_BRANCH="1a7f4dfa"
 ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
-ARG AITER_BRANCH="5a77249"
+ARG AITER_BRANCH="c1debd8"
 ARG AITER_REPO="https://github.com/ROCm/aiter.git"
 
 FROM ${BASE_IMAGE} AS base
diff --git a/docker/Dockerfile.s390x b/docker/Dockerfile.s390x
index 9c10cd56b594..4e89bb3057c5 100644
--- a/docker/Dockerfile.s390x
+++ b/docker/Dockerfile.s390x
@@ -84,16 +84,40 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
     rustup default stable && \
     rustup show
 
+# Build the PyTorch wheel from source; the tag is selected by TORCH_VERSION.
+FROM python-install AS torch
+ARG TORCH_VERSION=2.7.0
+ENV _GLIBCXX_USE_CXX11_ABI=1
+ENV CARGO_HOME=/root/.cargo
+ENV RUSTUP_HOME=/root/.rustup
+ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
+
+WORKDIR /tmp
+
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
+    --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
+    git clone https://github.com/pytorch/pytorch.git && \
+    cd pytorch && \
+    git checkout v${TORCH_VERSION} && \
+    git submodule sync && \
+    git submodule update --init --recursive && \
+    uv pip install cmake ninja && \
+    uv pip install -r requirements.txt && \
+    python setup.py bdist_wheel
+
 FROM python-install AS torch-vision
 # Install torchvision
-ARG TORCH_VERSION=2.7.0.dev20250304
+ARG TORCH_VERSION=2.7.0
 ARG TORCH_VISION_VERSION=v0.20.1
 WORKDIR /tmp
 RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
     git clone https://github.com/pytorch/vision.git && \
     cd vision && \
     git checkout $TORCH_VISION_VERSION && \
-    uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
+    TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
+    uv pip install -v $TORCH_WHL_FILE && \
     python setup.py bdist_wheel
 
 FROM python-install AS hf-xet-builder
@@ -138,15 +162,17 @@ RUN --mount=type=cache,target=/root/.cache/uv \
     --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
     --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
     --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
+    --mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
      sed -i '/^torch/d' requirements/build.txt && \
      ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
      VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
      HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl | head -n 1) && \
+     TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \
     uv pip install -v \    
         $ARROW_WHL_FILE  \
         $VISION_WHL_FILE \
         $HF_XET_WHL_FILE \
-        --extra-index-url https://download.pytorch.org/whl/nightly/cpu \
+        $TORCH_WHL_FILE \
         --index-strategy unsafe-best-match \
         -r requirements/build.txt \
         -r requirements/cpu.txt 
diff --git a/docs/.nav.yml b/docs/.nav.yml
new file mode 100644
index 000000000000..42aba9775360
--- /dev/null
+++ b/docs/.nav.yml
@@ -0,0 +1,63 @@
+nav:
+  - Home: 
+    - vLLM: README.md
+    - Getting Started:
+      - getting_started/quickstart.md
+      - getting_started/installation
+    - Examples:
+      - Offline Inference: examples/offline_inference
+      - Online Serving: examples/online_serving
+      - Others: examples/others
+    - Quick Links:
+      - User Guide: usage/README.md
+      - Developer Guide: contributing/README.md
+      - API Reference: api/README.md
+    - Timeline:
+      - Roadmap: https://roadmap.vllm.ai
+      - Releases: https://github.com/vllm-project/vllm/releases
+  - User Guide:
+    - Summary: usage/README.md
+    - usage/v1_guide.md
+    - General:
+      - usage/*
+    - Inference and Serving:
+      - serving/offline_inference.md
+      - serving/openai_compatible_server.md
+      - serving/*
+      - serving/integrations
+    - Deployment:
+      - deployment/*
+      - deployment/frameworks
+      - deployment/integrations
+    - Training: training
+    - Configuration:
+      - Summary: configuration/README.md
+      - configuration/*
+    - Models:
+      - models/supported_models.md
+      - models/generative_models.md
+      - models/pooling_models.md
+      - models/extensions
+    - Features:
+      - features/compatibility_matrix.md
+      - features/*
+      - features/quantization
+  - Developer Guide:
+    - Summary: contributing/README.md
+    - General:
+      - glob: contributing/*
+        flatten_single_child_sections: true
+    - Model Implementation: contributing/model
+    - Design Documents:
+      - V0: design
+      - V1: design/v1
+  - API Reference:
+    - Summary: api/README.md
+    - Contents:
+      - glob: api/vllm/*
+        preserve_directory_names: true
+  - Community:
+    - community/*
+    - Blog: https://blog.vllm.ai
+    - Forum: https://discuss.vllm.ai
+    - Slack: https://slack.vllm.ai
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index d3b429dfb925..000000000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS    ?=
-SPHINXBUILD   ?= sphinx-build
-SOURCEDIR     = source
-BUILDDIR      = build
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-clean:
-	@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-	rm -rf "$(SOURCEDIR)/getting_started/examples"
-	rm -rf "$(SOURCEDIR)/api/vllm"
diff --git a/docs/README.md b/docs/README.md
index dcd5e759dfa8..57b1d03deee2 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,43 +1,50 @@
-# vLLM documents
-
-## Build the docs
-
-- Make sure in `docs` directory
-
-```bash
-cd docs
-```
-
-- Install the dependencies:
-
-```bash
-pip install -r ../requirements/docs.txt
-```
-
-- Clean the previous build (optional but recommended):
-
-```bash
-make clean
-```
-
-- Generate the HTML documentation:
-
-```bash
-make html
-```
-
-## Open the docs with your browser
-
-- Serve the documentation locally:
-
-```bash
-python -m http.server -d build/html/
-```
-
-This will start a local server at http://localhost:8000. You can now open your browser and view the documentation.
-
-If port 8000 is already in use, you can specify a different port, for example:
-
-```bash
-python -m http.server 3000 -d build/html/
-```
+# Welcome to vLLM
+
+<figure markdown="span">
+  ![](./assets/logos/vllm-logo-text-light.png){ align="center" alt="vLLM" class="no-scaled-link" width="60%" }
+</figure>
+
+<p style="text-align:center">
+<strong>Easy, fast, and cheap LLM serving for everyone
+</strong>
+</p>
+
+<p style="text-align:center">
+<script async defer src="https://buttons.github.io/buttons.js"></script>
+<a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a>
+<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
+<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
+</p>
+
+vLLM is a fast and easy-to-use library for LLM inference and serving.
+
+Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
+
+vLLM is fast with:
+
+- State-of-the-art serving throughput
+- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
+- Continuous batching of incoming requests
+- Fast model execution with CUDA/HIP graph
+- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8
+- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
+- Speculative decoding
+- Chunked prefill
+
+vLLM is flexible and easy to use with:
+
+- Seamless integration with popular HuggingFace models
+- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
+- Tensor parallelism and pipeline parallelism support for distributed inference
+- Streaming outputs
+- OpenAI-compatible API server
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
+- Prefix caching support
+- Multi-LoRA support
+
+For more information, check out the following:
+
+- [vLLM announcement blog post](https://vllm.ai) (intro to PagedAttention)
+- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023)
+- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al.
+- [vLLM Meetups][meetups]
diff --git a/docs/api/README.md b/docs/api/README.md
new file mode 100644
index 000000000000..5c7b2ca79ee2
--- /dev/null
+++ b/docs/api/README.md
@@ -0,0 +1,107 @@
+# Summary
+
+[](){ #configuration }
+
+## Configuration
+
+API documentation for vLLM's configuration classes.
+
+- [vllm.config.ModelConfig][]
+- [vllm.config.CacheConfig][]
+- [vllm.config.TokenizerPoolConfig][]
+- [vllm.config.LoadConfig][]
+- [vllm.config.ParallelConfig][]
+- [vllm.config.SchedulerConfig][]
+- [vllm.config.DeviceConfig][]
+- [vllm.config.SpeculativeConfig][]
+- [vllm.config.LoRAConfig][]
+- [vllm.config.PromptAdapterConfig][]
+- [vllm.config.MultiModalConfig][]
+- [vllm.config.PoolerConfig][]
+- [vllm.config.DecodingConfig][]
+- [vllm.config.ObservabilityConfig][]
+- [vllm.config.KVTransferConfig][]
+- [vllm.config.CompilationConfig][]
+- [vllm.config.VllmConfig][]
+
+[](){ #offline-inference-api }
+
+## Offline Inference
+
+LLM Class.
+
+- [vllm.LLM][]
+
+LLM Inputs.
+
+- [vllm.inputs.PromptType][]
+- [vllm.inputs.TextPrompt][]
+- [vllm.inputs.TokensPrompt][]
+
+## vLLM Engines
+
+Engine classes for offline and online inference.
+
+- [vllm.LLMEngine][]
+- [vllm.AsyncLLMEngine][]
+
+## Inference Parameters
+
+Inference parameters for vLLM APIs.
+
+[](){ #sampling-params }
+[](){ #pooling-params }
+
+- [vllm.SamplingParams][]
+- [vllm.PoolingParams][]
+
+[](){ #multi-modality }
+
+## Multi-Modality
+
+vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
+
+Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models]
+via the `multi_modal_data` field in [vllm.inputs.PromptType][].
+
+Looking to add your own multi-modal model? Please follow the instructions listed [here][supports-multimodal].
+
+- [vllm.multimodal.MULTIMODAL_REGISTRY][]
+
+### Inputs
+
+User-facing inputs.
+
+- [vllm.multimodal.inputs.MultiModalDataDict][]
+
+Internal data structures.
+
+- [vllm.multimodal.inputs.PlaceholderRange][]
+- [vllm.multimodal.inputs.NestedTensors][]
+- [vllm.multimodal.inputs.MultiModalFieldElem][]
+- [vllm.multimodal.inputs.MultiModalFieldConfig][]
+- [vllm.multimodal.inputs.MultiModalKwargsItem][]
+- [vllm.multimodal.inputs.MultiModalKwargs][]
+- [vllm.multimodal.inputs.MultiModalInputs][]
+
+### Data Parsing
+
+- [vllm.multimodal.parse][]
+
+### Data Processing
+
+- [vllm.multimodal.processing][]
+
+### Memory Profiling
+
+- [vllm.multimodal.profiling][]
+
+### Registry
+
+- [vllm.multimodal.registry][]
+
+## Model Development
+
+- [vllm.model_executor.models.interfaces_base][]
+- [vllm.model_executor.models.interfaces][]
+- [vllm.model_executor.models.adapters][]
diff --git a/docs/api/vllm/.meta.yml b/docs/api/vllm/.meta.yml
new file mode 100644
index 000000000000..c15adfec644c
--- /dev/null
+++ b/docs/api/vllm/.meta.yml
@@ -0,0 +1,2 @@
+search:
+  boost: 0.5
diff --git a/docs/source/assets/contributing/dockerfile-stages-dependency.png b/docs/assets/contributing/dockerfile-stages-dependency.png
similarity index 100%
rename from docs/source/assets/contributing/dockerfile-stages-dependency.png
rename to docs/assets/contributing/dockerfile-stages-dependency.png
diff --git a/docs/source/assets/deployment/anything-llm-chat-with-doc.png b/docs/assets/deployment/anything-llm-chat-with-doc.png
similarity index 100%
rename from docs/source/assets/deployment/anything-llm-chat-with-doc.png
rename to docs/assets/deployment/anything-llm-chat-with-doc.png
diff --git a/docs/source/assets/deployment/anything-llm-chat-without-doc.png b/docs/assets/deployment/anything-llm-chat-without-doc.png
similarity index 100%
rename from docs/source/assets/deployment/anything-llm-chat-without-doc.png
rename to docs/assets/deployment/anything-llm-chat-without-doc.png
diff --git a/docs/source/assets/deployment/anything-llm-provider.png b/docs/assets/deployment/anything-llm-provider.png
similarity index 100%
rename from docs/source/assets/deployment/anything-llm-provider.png
rename to docs/assets/deployment/anything-llm-provider.png
diff --git a/docs/source/assets/deployment/anything-llm-upload-doc.png b/docs/assets/deployment/anything-llm-upload-doc.png
similarity index 100%
rename from docs/source/assets/deployment/anything-llm-upload-doc.png
rename to docs/assets/deployment/anything-llm-upload-doc.png
diff --git a/docs/source/assets/deployment/architecture_helm_deployment.png b/docs/assets/deployment/architecture_helm_deployment.png
similarity index 100%
rename from docs/source/assets/deployment/architecture_helm_deployment.png
rename to docs/assets/deployment/architecture_helm_deployment.png
diff --git a/docs/source/assets/deployment/chatbox-chat.png b/docs/assets/deployment/chatbox-chat.png
similarity index 100%
rename from docs/source/assets/deployment/chatbox-chat.png
rename to docs/assets/deployment/chatbox-chat.png
diff --git a/docs/source/assets/deployment/chatbox-settings.png b/docs/assets/deployment/chatbox-settings.png
similarity index 100%
rename from docs/source/assets/deployment/chatbox-settings.png
rename to docs/assets/deployment/chatbox-settings.png
diff --git a/docs/source/assets/deployment/dify-chat.png b/docs/assets/deployment/dify-chat.png
similarity index 100%
rename from docs/source/assets/deployment/dify-chat.png
rename to docs/assets/deployment/dify-chat.png
diff --git a/docs/source/assets/deployment/dify-create-chatbot.png b/docs/assets/deployment/dify-create-chatbot.png
similarity index 100%
rename from docs/source/assets/deployment/dify-create-chatbot.png
rename to docs/assets/deployment/dify-create-chatbot.png
diff --git a/docs/source/assets/deployment/dify-settings.png b/docs/assets/deployment/dify-settings.png
similarity index 100%
rename from docs/source/assets/deployment/dify-settings.png
rename to docs/assets/deployment/dify-settings.png
diff --git a/docs/source/assets/deployment/open_webui.png b/docs/assets/deployment/open_webui.png
similarity index 100%
rename from docs/source/assets/deployment/open_webui.png
rename to docs/assets/deployment/open_webui.png
diff --git a/docs/source/assets/deployment/streamlit-chat.png b/docs/assets/deployment/streamlit-chat.png
similarity index 100%
rename from docs/source/assets/deployment/streamlit-chat.png
rename to docs/assets/deployment/streamlit-chat.png
diff --git a/docs/source/assets/design/arch_overview/entrypoints.excalidraw.png b/docs/assets/design/arch_overview/entrypoints.excalidraw.png
similarity index 100%
rename from docs/source/assets/design/arch_overview/entrypoints.excalidraw.png
rename to docs/assets/design/arch_overview/entrypoints.excalidraw.png
diff --git a/docs/source/assets/design/arch_overview/llm_engine.excalidraw.png b/docs/assets/design/arch_overview/llm_engine.excalidraw.png
similarity index 100%
rename from docs/source/assets/design/arch_overview/llm_engine.excalidraw.png
rename to docs/assets/design/arch_overview/llm_engine.excalidraw.png
diff --git a/docs/source/assets/design/hierarchy.png b/docs/assets/design/hierarchy.png
similarity index 100%
rename from docs/source/assets/design/hierarchy.png
rename to docs/assets/design/hierarchy.png
diff --git a/docs/source/assets/design/v1/metrics/intervals-1.png b/docs/assets/design/v1/metrics/intervals-1.png
similarity index 100%
rename from docs/source/assets/design/v1/metrics/intervals-1.png
rename to docs/assets/design/v1/metrics/intervals-1.png
diff --git a/docs/source/assets/design/v1/metrics/intervals-2.png b/docs/assets/design/v1/metrics/intervals-2.png
similarity index 100%
rename from docs/source/assets/design/v1/metrics/intervals-2.png
rename to docs/assets/design/v1/metrics/intervals-2.png
diff --git a/docs/source/assets/design/v1/metrics/intervals-3.png b/docs/assets/design/v1/metrics/intervals-3.png
similarity index 100%
rename from docs/source/assets/design/v1/metrics/intervals-3.png
rename to docs/assets/design/v1/metrics/intervals-3.png
diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-1.png b/docs/assets/design/v1/prefix_caching/example-time-1.png
similarity index 100%
rename from docs/source/assets/design/v1/prefix_caching/example-time-1.png
rename to docs/assets/design/v1/prefix_caching/example-time-1.png
diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-3.png b/docs/assets/design/v1/prefix_caching/example-time-3.png
similarity index 100%
rename from docs/source/assets/design/v1/prefix_caching/example-time-3.png
rename to docs/assets/design/v1/prefix_caching/example-time-3.png
diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-4.png b/docs/assets/design/v1/prefix_caching/example-time-4.png
similarity index 100%
rename from docs/source/assets/design/v1/prefix_caching/example-time-4.png
rename to docs/assets/design/v1/prefix_caching/example-time-4.png
diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-5.png b/docs/assets/design/v1/prefix_caching/example-time-5.png
similarity index 100%
rename from docs/source/assets/design/v1/prefix_caching/example-time-5.png
rename to docs/assets/design/v1/prefix_caching/example-time-5.png
diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-6.png b/docs/assets/design/v1/prefix_caching/example-time-6.png
similarity index 100%
rename from docs/source/assets/design/v1/prefix_caching/example-time-6.png
rename to docs/assets/design/v1/prefix_caching/example-time-6.png
diff --git a/docs/source/assets/design/v1/prefix_caching/example-time-7.png b/docs/assets/design/v1/prefix_caching/example-time-7.png
similarity index 100%
rename from docs/source/assets/design/v1/prefix_caching/example-time-7.png
rename to docs/assets/design/v1/prefix_caching/example-time-7.png
diff --git a/docs/source/assets/design/v1/prefix_caching/free.png b/docs/assets/design/v1/prefix_caching/free.png
similarity index 100%
rename from docs/source/assets/design/v1/prefix_caching/free.png
rename to docs/assets/design/v1/prefix_caching/free.png
diff --git a/docs/source/assets/design/v1/prefix_caching/overview.png b/docs/assets/design/v1/prefix_caching/overview.png
similarity index 100%
rename from docs/source/assets/design/v1/prefix_caching/overview.png
rename to docs/assets/design/v1/prefix_caching/overview.png
diff --git a/docs/source/assets/features/disagg_prefill/abstraction.jpg b/docs/assets/features/disagg_prefill/abstraction.jpg
similarity index 100%
rename from docs/source/assets/features/disagg_prefill/abstraction.jpg
rename to docs/assets/features/disagg_prefill/abstraction.jpg
diff --git a/docs/source/assets/features/disagg_prefill/overview.jpg b/docs/assets/features/disagg_prefill/overview.jpg
similarity index 100%
rename from docs/source/assets/features/disagg_prefill/overview.jpg
rename to docs/assets/features/disagg_prefill/overview.jpg
diff --git a/docs/source/assets/kernel/k_vecs.png b/docs/assets/kernel/k_vecs.png
similarity index 100%
rename from docs/source/assets/kernel/k_vecs.png
rename to docs/assets/kernel/k_vecs.png
diff --git a/docs/source/assets/kernel/key.png b/docs/assets/kernel/key.png
similarity index 100%
rename from docs/source/assets/kernel/key.png
rename to docs/assets/kernel/key.png
diff --git a/docs/source/assets/kernel/logits_vec.png b/docs/assets/kernel/logits_vec.png
similarity index 100%
rename from docs/source/assets/kernel/logits_vec.png
rename to docs/assets/kernel/logits_vec.png
diff --git a/docs/source/assets/kernel/q_vecs.png b/docs/assets/kernel/q_vecs.png
similarity index 100%
rename from docs/source/assets/kernel/q_vecs.png
rename to docs/assets/kernel/q_vecs.png
diff --git a/docs/source/assets/kernel/query.png b/docs/assets/kernel/query.png
similarity index 100%
rename from docs/source/assets/kernel/query.png
rename to docs/assets/kernel/query.png
diff --git a/docs/source/assets/kernel/v_vec.png b/docs/assets/kernel/v_vec.png
similarity index 100%
rename from docs/source/assets/kernel/v_vec.png
rename to docs/assets/kernel/v_vec.png
diff --git a/docs/source/assets/kernel/value.png b/docs/assets/kernel/value.png
similarity index 100%
rename from docs/source/assets/kernel/value.png
rename to docs/assets/kernel/value.png
diff --git a/docs/source/assets/logos/vllm-logo-only-light.ico b/docs/assets/logos/vllm-logo-only-light.ico
similarity index 100%
rename from docs/source/assets/logos/vllm-logo-only-light.ico
rename to docs/assets/logos/vllm-logo-only-light.ico
diff --git a/docs/source/assets/logos/vllm-logo-only-light.png b/docs/assets/logos/vllm-logo-only-light.png
similarity index 100%
rename from docs/source/assets/logos/vllm-logo-only-light.png
rename to docs/assets/logos/vllm-logo-only-light.png
diff --git a/docs/source/assets/logos/vllm-logo-text-dark.png b/docs/assets/logos/vllm-logo-text-dark.png
similarity index 100%
rename from docs/source/assets/logos/vllm-logo-text-dark.png
rename to docs/assets/logos/vllm-logo-text-dark.png
diff --git a/docs/source/assets/logos/vllm-logo-text-light.png b/docs/assets/logos/vllm-logo-text-light.png
similarity index 100%
rename from docs/source/assets/logos/vllm-logo-text-light.png
rename to docs/assets/logos/vllm-logo-text-light.png
diff --git a/docs/source/community/meetups.md b/docs/community/meetups.md
similarity index 98%
rename from docs/source/community/meetups.md
rename to docs/community/meetups.md
index aa1a71c86c0a..8ea42e3cad18 100644
--- a/docs/source/community/meetups.md
+++ b/docs/community/meetups.md
@@ -1,6 +1,7 @@
-(meetups)=
-
-# vLLM Meetups
+---
+title: Meetups
+---
+[](){ #meetups }
 
 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
 
diff --git a/docs/source/community/sponsors.md b/docs/community/sponsors.md
similarity index 100%
rename from docs/source/community/sponsors.md
rename to docs/community/sponsors.md
diff --git a/docs/configuration/README.md b/docs/configuration/README.md
new file mode 100644
index 000000000000..6a8fbc79f4af
--- /dev/null
+++ b/docs/configuration/README.md
@@ -0,0 +1,9 @@
+# Configuration Options
+
+This section lists the most common options for running vLLM.
+
+There are three main levels of configuration, from highest priority to lowest priority:
+
+- [Request parameters][completions-api] and [input arguments][sampling-params]
+- [Engine arguments](./engine_args.md)
+- [Environment variables](./env_vars.md)
diff --git a/docs/configuration/conserving_memory.md b/docs/configuration/conserving_memory.md
new file mode 100644
index 000000000000..a1283a503a6d
--- /dev/null
+++ b/docs/configuration/conserving_memory.md
@@ -0,0 +1,144 @@
+# Conserving Memory
+
+Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem.
+
+## Tensor Parallelism (TP)
+
+Tensor parallelism (`tensor_parallel_size` option) can be used to split the model across multiple GPUs.
+
+The following code splits the model across 2 GPUs.
+
+```python
+from vllm import LLM
+
+llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
+          tensor_parallel_size=2)
+```
+
+!!! warning
+    To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][])
+    before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
+
+    To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.
+
+!!! note
+    With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
+
+    You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
+
+## Quantization
+
+Quantized models take less memory at the cost of lower precision.
+
+Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Red Hat AI](https://huggingface.co/RedHatAI))
+and used directly without extra configuration.
+
+Dynamic quantization is also supported via the `quantization` option -- see [here][quantization-index] for more details.
+
+## Context length and batch size
+
+You can further reduce memory usage by limiting the context length of the model (`max_model_len` option)
+and the maximum batch size (`max_num_seqs` option).
+
+```python
+from vllm import LLM
+
+llm = LLM(model="adept/fuyu-8b",
+          max_model_len=2048,
+          max_num_seqs=2)
+```
+
+## Reduce CUDA Graphs
+
+By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU.
+
+!!! warning
+    CUDA graph capture takes up more memory in V1 than in V0.
+
+You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
+
+```python
+from vllm import LLM
+from vllm.config import CompilationConfig, CompilationLevel
+
+llm = LLM(
+    model="meta-llama/Llama-3.1-8B-Instruct",
+    compilation_config=CompilationConfig(
+        level=CompilationLevel.PIECEWISE,
+        # By default, it goes up to max_num_seqs
+        cudagraph_capture_sizes=[1, 2, 4, 8, 16],
+    ),
+)
+```
+
+You can disable graph capturing completely via the `enforce_eager` flag:
+
+```python
+from vllm import LLM
+
+llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
+          enforce_eager=True)
+```
+
+## Adjust cache size
+
+If you run out of CPU RAM, try the following options:
+
+- (Multi-modal models only) you can set the size of the multi-modal input cache using the `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB).
+- (CPU backend only) you can set the size of the KV cache using the `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).
+
+## Multi-modal input limits
+
+You can allow a smaller number of multi-modal items per prompt to reduce the memory footprint of the model:
+
+```python
+from vllm import LLM
+
+# Accept up to 3 images and 1 video per prompt
+llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
+          limit_mm_per_prompt={"image": 3, "video": 1})
+```
+
+You can go a step further and disable unused modalities completely by setting their limits to zero.
+For example, if your application only accepts image input, there is no need to allocate any memory for videos.
+
+```python
+from vllm import LLM
+
+# Accept any number of images but no videos
+llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
+          limit_mm_per_prompt={"video": 0})
+```
+
+You can even run a multi-modal model for text-only inference:
+
+```python
+from vllm import LLM
+
+# Don't accept images. Just text.
+llm = LLM(model="google/gemma-3-27b-it",
+          limit_mm_per_prompt={"image": 0})
+```
+
+## Multi-modal processor arguments
+
+For certain models, you can adjust the multi-modal processor arguments to
+reduce the size of the processed multi-modal inputs, which in turn saves memory.
+
+Here are some examples:
+
+```python
+from vllm import LLM
+
+# Available for Qwen2-VL series models
+llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
+          mm_processor_kwargs={
+              "max_pixels": 768 * 768,  # Default is 1280 * 28 * 28
+          })
+
+# Available for InternVL series models
+llm = LLM(model="OpenGVLab/InternVL2-2B",
+          mm_processor_kwargs={
+              "max_dynamic_patch": 4,  # Default is 12
+          })
+```
diff --git a/docs/configuration/engine_args.md b/docs/configuration/engine_args.md
new file mode 100644
index 000000000000..fb2689a56391
--- /dev/null
+++ b/docs/configuration/engine_args.md
@@ -0,0 +1,18 @@
+---
+title: Engine Arguments
+---
+[](){ #engine-args }
+
+Engine arguments control the behavior of the vLLM engine.
+
+- For [offline inference][offline-inference], they are part of the arguments to the [LLM][vllm.LLM] class.
+- For [online serving][openai-compatible-server], they are part of the arguments to `vllm serve`.
+
+You can look at [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs] to see the available engine arguments.
+
+However, these classes are a combination of the configuration classes defined in [vllm.config][]. Therefore, we would recommend you read about them there where they are best documented.
+
+For offline inference you will have access to these configuration classes and for online serving you can cross-reference the configs with `vllm serve --help`, which has its arguments grouped by config.
+
+!!! note
+    Additional arguments are available to the [AsyncLLMEngine][vllm.engine.async_llm_engine.AsyncLLMEngine] which is used for online serving. These can be found by running `vllm serve --help`.
diff --git a/docs/configuration/env_vars.md b/docs/configuration/env_vars.md
new file mode 100644
index 000000000000..f6d548a19d91
--- /dev/null
+++ b/docs/configuration/env_vars.md
@@ -0,0 +1,12 @@
+# Environment Variables
+
+vLLM uses the following environment variables to configure the system:
+
+!!! warning
+    Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and IP for vLLM's **internal usage**. They are not the port and IP for the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work.
+
+    All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).
+
+```python
+--8<-- "vllm/envs.py:env-vars-definition"
+```
diff --git a/docs/configuration/model_resolution.md b/docs/configuration/model_resolution.md
new file mode 100644
index 000000000000..8757c257d3e9
--- /dev/null
+++ b/docs/configuration/model_resolution.md
@@ -0,0 +1,23 @@
+# Model Resolution
+
+vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository
+and finding the corresponding implementation that is registered to vLLM.
+Nevertheless, our model resolution may fail for the following reasons:
+
+- The `config.json` of the model repository lacks the `architectures` field.
+- Unofficial repositories refer to a model using alternative names which are not recorded in vLLM.
+- The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded.
+
+To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option.
+For example:
+
+```python
+from vllm import LLM
+
+model = LLM(
+    model="cerebras/Cerebras-GPT-1.3B",
+    hf_overrides={"architectures": ["GPT2LMHeadModel"]},  # GPT-2
+)
+```
+
+Our [list of supported models][supported-models] shows the model architectures that are recognized by vLLM.
diff --git a/docs/source/performance/optimization.md b/docs/configuration/optimization.md
similarity index 99%
rename from docs/source/performance/optimization.md
rename to docs/configuration/optimization.md
index 4160f0784962..811925c19e63 100644
--- a/docs/source/performance/optimization.md
+++ b/docs/configuration/optimization.md
@@ -1,5 +1,3 @@
-(optimization-and-tuning)=
-
 # Optimization and Tuning
 
 This guide covers optimization strategies and performance tuning for vLLM V1.
@@ -26,7 +24,7 @@ You can monitor the number of preemption requests through Prometheus metrics exp
 
 In vLLM V1, the default preemption mode is `RECOMPUTE` rather than `SWAP`, as recomputation has lower overhead in the V1 architecture.
 
-(chunked-prefill)=
+[](){ #chunked-prefill }
 
 ## Chunked Prefill
 
diff --git a/docs/configuration/serve_args.md b/docs/configuration/serve_args.md
new file mode 100644
index 000000000000..16b4b29f45d9
--- /dev/null
+++ b/docs/configuration/serve_args.md
@@ -0,0 +1,38 @@
+---
+title: Server Arguments
+---
+[](){ #serve-args }
+
+The `vllm serve` command is used to launch the OpenAI-compatible server.
+
+## CLI Arguments
+
+The `vllm serve` command is used to launch the OpenAI-compatible server.
+To see the available CLI arguments, run `vllm serve --help`!
+
+## Configuration file
+
+You can load CLI arguments via a [YAML](https://yaml.org/) config file.
+The argument names must be the long form of those outlined [above][serve-args].
+
+For example:
+
+```yaml
+# config.yaml
+
+model: meta-llama/Llama-3.1-8B-Instruct
+host: "127.0.0.1"
+port: 6379
+uvicorn-log-level: "info"
+```
+
+To use the above config file:
+
+```bash
+vllm serve --config config.yaml
+```
+
+!!! note
+    In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence.
+    The order of priorities is `command line > config file values > defaults`.
+    For example, in `vllm serve SOME_MODEL --config config.yaml`, `SOME_MODEL` takes precedence over `model` in the config file.
diff --git a/docs/source/contributing/overview.md b/docs/contributing/README.md
similarity index 84%
rename from docs/source/contributing/overview.md
rename to docs/contributing/README.md
index 89b31f0311e2..2517436afcc1 100644
--- a/docs/source/contributing/overview.md
+++ b/docs/contributing/README.md
@@ -16,9 +16,9 @@ Finally, one of the most impactful ways to support us is by raising awareness ab
 Unsure on where to start? Check out the following links for tasks to work on:
 
 - [Good first issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22)
-  - [Selected onboarding tasks](gh-project:6)
+    - [Selected onboarding tasks](gh-project:6)
 - [New model requests](https://github.com/vllm-project/vllm/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22new-model%22)
-  - [Models with multi-modal capabilities](gh-project:10)
+    - [Models with multi-modal capabilities](gh-project:10)
 
 ## License
 
@@ -27,7 +27,21 @@ See <gh-file:LICENSE>.
 ## Developing
 
 Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
-Check out the [building from source](#build-from-source) documentation for details.
+Check out the [building from source][build-from-source] documentation for details.
+
+### Building the docs
+
+Install the dependencies:
+
+```bash
+pip install -r requirements/docs.txt
+```
+
+Start the autoreloading MkDocs server:
+
+```bash
+mkdocs serve
+```
 
 ## Testing
 
@@ -48,29 +62,25 @@ pre-commit run mypy-3.9 --hook-stage manual --all-files
 pytest tests/
 ```
 
-:::{tip}
-Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
+!!! tip
+    Since the <gh-file:docker/Dockerfile> ships with Python 3.12, all tests in CI (except `mypy`) are run with Python 3.12.
 
-Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
-:::
+    Therefore, we recommend developing with Python 3.12 to minimise the chance of your local environment clashing with our CI environment.
 
-:::{note}
-Currently, the repository is not fully checked by `mypy`.
-:::
+!!! note
+    Currently, the repository is not fully checked by `mypy`.
 
-:::{note}
-Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU
-platform to run unit tests locally, rely on the continuous integration system to run the tests for
-now.
-:::
+!!! note
+    Currently, not all unit tests pass when run on CPU platforms. If you don't have access to a GPU
+    platform to run unit tests locally, rely on the continuous integration system to run the tests for
+    now.
 
 ## Issues
 
 If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
 
-:::{important}
-If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
-:::
+!!! warning
+    If you discover a security vulnerability, please follow the instructions [here](gh-file:SECURITY.md#reporting-a-vulnerability).
 
 ## Pull Requests & Code Reviews
 
@@ -106,9 +116,8 @@ appropriately to indicate the type of change. Please use one of the following:
 - `[Misc]` for PRs that do not fit the above categories. Please use this
   sparingly.
 
-:::{note}
-If the PR spans more than one category, please include all relevant prefixes.
-:::
+!!! note
+    If the PR spans more than one category, please include all relevant prefixes.
 
 ### Code Quality
 
@@ -121,9 +130,8 @@ The PR needs to meet the following code quality standards:
   understand the code.
 - Include sufficient tests to ensure the project stays correct and robust. This
   includes both unit tests and integration tests.
-- Please add documentation to `docs/source/` if the PR modifies the
-  user-facing behaviors of vLLM. It helps vLLM users understand and utilize the
-  new features or changes.
+- Please add documentation to `docs/` if the PR modifies the user-facing behaviors of vLLM.
+  It helps vLLM users understand and utilize the new features or changes.
 
 ### Adding or Changing Kernels
 
diff --git a/docs/source/performance/benchmarks.md b/docs/contributing/benchmarks.md
similarity index 86%
rename from docs/source/performance/benchmarks.md
rename to docs/contributing/benchmarks.md
index 39dc470a1c70..00505fc6f2a9 100644
--- a/docs/source/performance/benchmarks.md
+++ b/docs/contributing/benchmarks.md
@@ -1,13 +1,14 @@
-(benchmarks)=
-
-# Benchmark Suites
+---
+title: Benchmark Suites
+---
+[](){ #benchmarks }
 
 vLLM contains two sets of benchmarks:
 
-- [Performance benchmarks](#performance-benchmarks)
-- [Nightly benchmarks](#nightly-benchmarks)
+- [Performance benchmarks][performance-benchmarks]
+- [Nightly benchmarks][nightly-benchmarks]
 
-(performance-benchmarks)=
+[](){ #performance-benchmarks }
 
 ## Performance Benchmarks
 
@@ -17,7 +18,7 @@ The latest performance results are hosted on the public [vLLM Performance Dashbo
 
 More information on the performance benchmarks and their parameters can be found [here](gh-file:.buildkite/nightly-benchmarks/performance-benchmarks-descriptions.md).
 
-(nightly-benchmarks)=
+[](){ #nightly-benchmarks }
 
 ## Nightly Benchmarks
 
diff --git a/docs/source/contributing/deprecation_policy.md b/docs/contributing/deprecation_policy.md
similarity index 100%
rename from docs/source/contributing/deprecation_policy.md
rename to docs/contributing/deprecation_policy.md
diff --git a/docs/source/contributing/dockerfile/dockerfile.md b/docs/contributing/dockerfile/dockerfile.md
similarity index 82%
rename from docs/source/contributing/dockerfile/dockerfile.md
rename to docs/contributing/dockerfile/dockerfile.md
index 90b9a33cfbe6..a39f335c87b8 100644
--- a/docs/source/contributing/dockerfile/dockerfile.md
+++ b/docs/contributing/dockerfile/dockerfile.md
@@ -1,7 +1,7 @@
 # Dockerfile
 
 We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
-More information about deploying with Docker can be found [here](#deployment-docker).
+More information about deploying with Docker can be found [here][deployment-docker].
 
 Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
 
@@ -17,18 +17,21 @@ The edges of the build graph represent:
 
 - `RUN --mount=(.\*)from=...` dependencies (with a dotted line and an empty diamond arrow head)
 
-  > :::{figure} /assets/contributing/dockerfile-stages-dependency.png
-  > :align: center
-  > :alt: query
-  > :width: 100%
-  > :::
+  > <figure markdown="span">
+  >   ![](../../assets/contributing/dockerfile-stages-dependency.png){ align="center" alt="query" width="100%" }
+  > </figure>
   >
   > Made using: <https://github.com/patrickhoefler/dockerfilegraph>
   >
   > Commands to regenerate the build graph (make sure to run it **from the \`root\` directory of the vLLM repository** where the dockerfile is present):
   >
   > ```bash
-  > dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename docker/Dockerfile
+  > dockerfilegraph \
+  >   -o png \
+  >   --legend \
+  >   --dpi 200 \
+  >   --max-label-length 50 \
+  >   --filename docker/Dockerfile
   > ```
   >
   > or in case you want to run it directly with the docker image:
diff --git a/docs/contributing/model/README.md b/docs/contributing/model/README.md
new file mode 100644
index 000000000000..b7727f02c11b
--- /dev/null
+++ b/docs/contributing/model/README.md
@@ -0,0 +1,23 @@
+---
+title: Adding a New Model
+---
+[](){ #new-model }
+
+This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM.
+
+Contents:
+
+- [Basic](basic.md)
+- [Registration](registration.md)
+- [Tests](tests.md)
+- [Multimodal](multimodal.md)
+
+!!! note
+    The complexity of adding a new model depends heavily on the model's architecture.
+    The process is fairly straightforward if the model shares a similar architecture with an existing model in vLLM.
+    However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
+
+!!! tip
+    If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
+    or ask on our [developer slack](https://slack.vllm.ai).
+    We will be happy to help you out!
diff --git a/docs/source/contributing/model/basic.md b/docs/contributing/model/basic.md
similarity index 82%
rename from docs/source/contributing/model/basic.md
rename to docs/contributing/model/basic.md
index ad31995f76be..0c0ba3379257 100644
--- a/docs/source/contributing/model/basic.md
+++ b/docs/contributing/model/basic.md
@@ -1,6 +1,7 @@
-(new-model-basic)=
-
-# Implementing a Basic Model
+---
+title: Implementing a Basic Model
+---
+[](){ #new-model-basic }
 
 This guide walks you through the steps to implement a basic vLLM model.
 
@@ -10,9 +11,8 @@ First, clone the PyTorch model code from the source repository.
 For instance, vLLM's [OPT model](gh-file:vllm/model_executor/models/opt.py) was adapted from
 HuggingFace's [modeling_opt.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py) file.
 
-:::{warning}
-Make sure to review and adhere to the original code's copyright and licensing terms!
-:::
+!!! warning
+    Make sure to review and adhere to the original code's copyright and licensing terms!
 
 ## 2. Make your code compatible with vLLM
 
@@ -67,7 +67,7 @@ class MyModel(nn.Module):
         ... 
 ```
 
-- Rewrite the {meth}`~torch.nn.Module.forward` method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
+- Rewrite the [forward][torch.nn.Module.forward] method of your model to remove any unnecessary code, such as training-specific code. Modify the input parameters to treat `input_ids` and `positions` as flattened tensors with a single batch size dimension, without a max-sequence length dimension.
 
 ```python
 def forward(
@@ -78,10 +78,9 @@ def forward(
     ...
 ```
 
-:::{note}
-Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
-If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
-:::
+!!! note
+    Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings.
+    If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM.
 
 For reference, check out our [Llama implementation](gh-file:vllm/model_executor/models/llama.py). vLLM already supports a large number of models. It is recommended to find a model similar to yours and adapt it to your model's architecture. Check out <gh-dir:vllm/model_executor/models> for more examples.
 
@@ -89,7 +88,7 @@ For reference, check out our [Llama implementation](gh-file:vllm/model_executor/
 
 If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
 To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
-For the embedding layer, you can simply replace {class}`torch.nn.Embedding` with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`.
+For the embedding layer, you can simply replace [torch.nn.Embedding][] with `VocabParallelEmbedding`. For the output LM head, you can use `ParallelLMHead`.
 When it comes to the linear layers, we provide the following options to parallelize them:
 
 - `ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
@@ -107,7 +106,7 @@ This method should load the weights from the HuggingFace's checkpoint file and a
 
 ## 5. Register your model
 
-See [this page](#new-model-registration) for instructions on how to register your new model to be used by vLLM.
+See [this page][new-model-registration] for instructions on how to register your new model to be used by vLLM.
 
 ## Frequently Asked Questions
 
@@ -117,7 +116,7 @@ For models with interleaving sliding windows (e.g. `google/gemma-2-2b-it` and `m
 
 To support a model with interleaving sliding windows, we need to take care of the following details:
 
-- Make sure [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/config.py#L308) evaluates `has_interleaved_attention` to `True` for this model, and set `self.hf_text_config.interleaved_sliding_window` to the format of interleaving sliding windows the model can understand. Then, `self.hf_text_config.sliding_window` will be deleted, and the model will be treated as a full-attention model.
+- Make sure the model's `config.json` contains `sliding_window_pattern`. vLLM then sets `self.hf_text_config.interleaved_sliding_window` to the value of `self.hf_text_config.sliding_window` and deletes `sliding_window` from `self.hf_text_config`. The model will then be treated as a full-attention model.
 - In the modeling code, parse the correct sliding window value for every layer, and pass it to the attention layer's `per_layer_sliding_window` argument. For reference, check [this line](https://github.com/vllm-project/vllm/blob/996357e4808ca5eab97d4c97c7d25b3073f46aab/vllm/model_executor/models/llama.py#L171).
 
 With these two steps, interleave sliding windows should work with the model.
diff --git a/docs/contributing/model/multimodal.md b/docs/contributing/model/multimodal.md
new file mode 100644
index 000000000000..892ab9098407
--- /dev/null
+++ b/docs/contributing/model/multimodal.md
@@ -0,0 +1,803 @@
+---
+title: Multi-Modal Support
+---
+[](){ #supports-multimodal }
+
+This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs][multimodal-inputs].
+
+## 1. Update the base vLLM model
+
+It is assumed that you have already implemented the model in vLLM according to [these steps][new-model-basic].
+Further update the model as follows:
+
+- Reserve a keyword parameter in [forward][torch.nn.Module.forward] for each input tensor that corresponds to a multi-modal input, as shown in the following example:
+
+  ```diff
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+  +     pixel_values: torch.Tensor,
+    ) -> SamplerOutput:
+  ```
+  
+  More conveniently, you can simply pass `**kwargs` to the [forward][torch.nn.Module.forward] method and retrieve the keyword parameters for multimodal inputs from it.
+
+- Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
+
+    ```python
+    class YourModelForImage2Seq(nn.Module):
+        ...
+
+        def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
+
+            assert self.vision_encoder is not None
+            image_features = self.vision_encoder(image_input)
+            return self.multi_modal_projector(image_features)
+
+        def get_multimodal_embeddings(
+                self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
+
+            # Validate the multimodal input keyword arguments
+            image_input = self._parse_and_validate_image_input(**kwargs)
+            if image_input is None:
+                return None
+
+            # Run multimodal inputs through encoder and projector
+            vision_embeddings = self._process_image_input(image_input)
+            return vision_embeddings
+    ```
+
+!!! warning
+        The returned `multimodal_embeddings` must be either a **3D [torch.Tensor][]** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D [torch.Tensor][]'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request.
+
+- Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
+
+    ```python
+    from .utils import merge_multimodal_embeddings
+
+    class YourModelForImage2Seq(nn.Module):
+        ...
+
+        def get_input_embeddings(
+            self,
+            input_ids: torch.Tensor,
+            multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
+        ) -> torch.Tensor:
+
+            # `get_input_embeddings` should already be implemented for the language 
+            # model as one of the requirements of basic vLLM model implementation.
+            inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+
+            if multimodal_embeddings is not None:
+                inputs_embeds = merge_multimodal_embeddings(
+                    input_ids=input_ids, 
+                    inputs_embeds=inputs_embeds, 
+                    multimodal_embeddings=multimodal_embeddings,
+                    placeholder_token_id=self.config.image_token_index)
+
+            return inputs_embeds
+    ```
+
+- Implement [get_language_model][vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model] getter to provide stable access to the underlying language model.
+
+    ```python
+    class YourModelForImage2Seq(nn.Module):
+        ...
+
+        def get_language_model(self) -> torch.nn.Module:
+            # Change `language_model` according to your implementation.
+            return self.language_model
+    ```
+
+- Once the above steps are done, update the model class with the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface.
+
+  ```diff
+  + from vllm.model_executor.models.interfaces import SupportsMultiModal
+
+  - class YourModelForImage2Seq(nn.Module):
+  + class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+  ```
+
+!!! note
+      The model class does not have to be named `*ForCausalLM`.
+      Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples.
+
+## 2. Specify processing information
+
+Next, create a subclass of [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo]
+to provide basic information related to HF processing.
+
+### Maximum number of input items
+
+You need to override the abstract method [get_supported_mm_limits][vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits]
+to return the maximum number of input items for each modality supported by the model.
+
+For example, if the model supports any number of images but only one video per prompt:
+
+```python
+def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
+    return {"image": None, "video": 1}
+```
+
+## 3. Specify dummy inputs
+
+Then, inherit [BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] to construct dummy inputs for
+HF processing as well as memory profiling.
+
+### For memory profiling
+
+Override the abstract methods [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text] and [get_dummy_mm_data][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data] to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it.
+
+Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same as the number of placeholder feature tokens.
+
+=== "Basic example: LLaVA"
+
+    Looking at the code of HF's `LlavaForConditionalGeneration`:
+
+    ```python
+    # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
+    n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
+    n_image_features = image_features.shape[0] * image_features.shape[1]
+
+    if n_image_tokens != n_image_features:
+        raise ValueError(
+            f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+        )
+    special_image_mask = (
+        (input_ids == self.config.image_token_index)
+        .unsqueeze(-1)
+        .expand_as(inputs_embeds)
+        .to(inputs_embeds.device)
+    )
+    image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+    inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+    ```
+
+    The number of placeholder feature tokens per image is `image_features.shape[1]`.
+    `image_features` is calculated inside the `get_image_features` method:
+
+    ```python
+    # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
+    image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+
+    selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
+    if vision_feature_select_strategy == "default":
+        selected_image_feature = selected_image_feature[:, 1:]
+    elif vision_feature_select_strategy == "full":
+        selected_image_feature = selected_image_feature
+    else:
+        raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
+    image_features = self.multi_modal_projector(selected_image_feature)
+    return image_features
+    ```
+
+    We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
+    (`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
+    Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`.
+    The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention
+    mechanism doesn't change the sequence length of the output hidden states.
+
+    ```python
+    # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102
+    hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+    hidden_states = self.pre_layrnorm(hidden_states)
+
+    encoder_outputs = self.encoder(
+        inputs_embeds=hidden_states,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+    )
+    ```
+
+    To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:
+
+    ```python
+    # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
+    target_dtype = self.patch_embedding.weight.dtype
+    patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+    patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+    class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+    embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+    if interpolate_pos_encoding:
+        embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+    else:
+        embeddings = embeddings + self.position_embedding(self.position_ids)
+    return embeddings
+    ```
+
+    We can infer that `embeddings.shape[1] == self.num_positions`, where
+
+    ```python
+    # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196
+    self.num_patches = (self.image_size // self.patch_size) ** 2
+    self.num_positions = self.num_patches + 1
+    ```
+
+    Overall, the number of placeholder feature tokens for an image can be calculated as:
+
+    ```python
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> int:
+        hf_config = self.get_hf_config()
+        hf_processor = self.get_hf_processor()
+
+        image_size = hf_config.vision_config.image_size
+        patch_size = hf_config.vision_config.patch_size
+
+        num_image_tokens = (image_size // patch_size) ** 2 + 1
+        if hf_processor.vision_feature_select_strategy == "default":
+            num_image_tokens -= 1
+
+        return num_image_tokens
+    ```
+
+    Notice that the number of image tokens doesn't depend on the image width and height.
+    We can simply use a dummy `image_size` to calculate the multimodal profiling data:
+
+    ```python
+    # NOTE: In actuality, this is usually implemented as part of the
+    # model's subclass of `BaseProcessingInfo`, but we show it as is
+    # here for simplicity.
+    def get_image_size_with_most_features(self) -> ImageSize:
+        hf_config = self.get_hf_config()
+        width = height = hf_config.image_size
+        return ImageSize(width=width, height=height)
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalDataDict:
+        num_images = mm_counts.get("image", 0)
+
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+
+        return {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images)
+        }
+    ```
+
+    For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
+
+    ```python
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_images = mm_counts.get("image", 0)
+
+        processor = self.info.get_hf_processor()
+        image_token = processor.image_token
+
+        return image_token * num_images
+    ```
+
+=== "No input placeholders: Fuyu"
+
+    Looking at the code of HF's `FuyuForCausalLM`:
+
+    ```python
+    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
+    if image_patches is not None and past_key_values is None:
+        patch_embeddings = [
+            self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
+            .squeeze(0)
+            .to(inputs_embeds.device)
+            for patch in image_patches
+        ]
+        inputs_embeds = self.gather_continuous_embeddings(
+            word_embeddings=inputs_embeds,
+            continuous_embeddings=patch_embeddings,
+            image_patch_input_indices=image_patches_indices,
+        )
+    ```
+
+    The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`,
+    which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`.
+
+    Unlike LLaVA, Fuyu does not define the number of patches inside the modeling file. Where can we get more information?
+    Considering that the model input comes from the output of `FuyuProcessor`, let's **look at the preprocessing files**.
+
+    The image outputs are obtained by calling `FuyuImageProcessor.preprocess` and then
+    `FuyuImageProcessor.preprocess_with_tokenizer_info` inside `FuyuProcessor`.
+
+    In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
+    returning the dimensions after resizing (but before padding) as metadata.
+
+    ```python
+    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
+    image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
+    batch_images = image_encoding["images"]
+    image_unpadded_heights = image_encoding["image_unpadded_heights"]
+    image_unpadded_widths = image_encoding["image_unpadded_widths"]
+
+    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
+    if do_resize:
+        batch_images = [
+            [self.resize(image, size=size, input_data_format=input_data_format) for image in images]
+            for images in batch_images
+        ]
+
+    image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
+    image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
+    image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
+
+    if do_pad:
+        batch_images = [
+            [
+                self.pad_image(
+                    image,
+                    size=size,
+                    mode=padding_mode,
+                    constant_values=padding_value,
+                    input_data_format=input_data_format,
+                )
+                for image in images
+            ]
+            for images in batch_images
+        ]
+    ```
+
+    In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:
+
+    ```python
+    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
+    model_image_input = self.image_processor.preprocess_with_tokenizer_info(
+        image_input=tensor_batch_images,
+        image_present=image_present,
+        image_unpadded_h=image_unpadded_heights,
+        image_unpadded_w=image_unpadded_widths,
+        image_placeholder_id=image_placeholder_id,
+        image_newline_id=image_newline_id,
+        variable_sized=True,
+    )
+
+    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
+    image_height, image_width = image.shape[1], image.shape[2]
+    if variable_sized:  # variable_sized=True
+        new_h = min(
+            image_height,
+            math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
+        )
+        new_w = min(
+            image_width,
+            math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
+        )
+        image = image[:, :new_h, :new_w]
+        image_height, image_width = new_h, new_w
+
+    num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
+    tensor_of_image_ids = torch.full(
+        [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
+    )
+    patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
+    assert num_patches == patches.shape[0]
+    ```
+
+    The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:
+
+    ```python
+    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
+    patch_size = patch_size if patch_size is not None else self.patch_size
+    patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
+
+    if image_height % patch_height != 0:
+        raise ValueError(f"{image_height=} must be divisible by {patch_height}")
+    if image_width % patch_width != 0:
+        raise ValueError(f"{image_width=} must be divisible by {patch_width}")
+
+    num_patches_per_dim_h = image_height // patch_height
+    num_patches_per_dim_w = image_width // patch_width
+    num_patches = num_patches_per_dim_h * num_patches_per_dim_w
+    ```
+
+    These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
+    to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
+
+    ```python
+    def get_image_size_with_most_features(self) -> ImageSize:
+        image_processor = self.get_image_processor()
+        return ImageSize(width=image_processor.size["width"],
+                            height=image_processor.size["height"])
+    ```
+
+    Fuyu does not expect image placeholders in the inputs to HF processor, so
+    the dummy prompt text is empty regardless of the number of images.
+
+    ```python
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        return ""
+    ```
+
+    For the multimodal image profiling data, the logic is very similar to LLaVA:
+
+    ```python
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalDataDict:
+        target_width, target_height = \
+            self.info.get_image_size_with_most_features()
+        num_images = mm_counts.get("image", 0)
+
+        return {
+            "image":
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
+                                   num_images=num_images)
+        }
+    ```
+
+## 4. Specify processing details
+
+Afterwards, create a subclass of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]
+to fill in the missing details about HF processing.
+
+!!! info
+    [Multi-Modal Data Processing][mm-processing]
+
+### Multi-modal fields
+
+Override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] to
+return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items.
+
+=== "Basic example: LLaVA"
+
+    The output of `CLIPImageProcessor` is a simple tensor with shape
+    `(num_images, num_channels, image_height, image_width)`:
+
+    ```python
+    # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345
+    images = [
+        to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+        for image in all_images
+    ]
+
+    data = {"pixel_values": images}
+    return BatchFeature(data=data, tensor_type=return_tensors)
+    ```
+
+    So, we override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows:
+
+    ```python
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(
+            pixel_values=MultiModalFieldConfig.batched("image"),
+        )
+    ```
+
+    !!! note
+        Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
+        pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
+
+=== "With postprocessing: Fuyu"
+
+    The `image_patches` output of `FuyuImageProcessor.preprocess_with_tokenizer_info` concatenates
+    the patches from each image belonging to an item in the batch:
+
+    ```python
+    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L673-L679
+            image_input_ids.append(tensor_of_image_ids)
+            image_patches.append(patches)
+        else:
+            image_input_ids.append(torch.tensor([], dtype=torch.int32, device=image_input.device))
+
+    batch_image_input_ids.append(image_input_ids)
+    batch_image_patches.append(image_patches)
+    ```
+
+    The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore
+    `(1, num_images, num_patches, patch_width * patch_height * num_channels)`.
+
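+    In order to support the use of [MultiModalFieldConfig.batched][vllm.multimodal.inputs.MultiModalFieldConfig.batched] like in LLaVA,
+    we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor]: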
+
+    ```python
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> BatchFeature:
+        processed_outputs = super()._call_hf_processor(
+            prompt=prompt,
+            mm_data=mm_data,
+            mm_kwargs=mm_kwargs,
+        )
+
+        image_patches = processed_outputs.get("image_patches")
+        if image_patches is not None:
+            images = mm_data["images"]
+            assert isinstance(images, list)
+
+            # Original output: (1, num_images, Pn, Px * Py * C)
+            # New output: (num_images, Pn, Px * Py * C)
+            assert (isinstance(image_patches, list)
+                    and len(image_patches) == 1)
+            assert (isinstance(image_patches[0], torch.Tensor)
+                    and len(image_patches[0]) == len(images))
+
+            processed_outputs["image_patches"] = image_patches[0]
+
+        return processed_outputs
+    ```
+
+    !!! note
+        Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
+        for text-only inputs to prevent unnecessary warnings from HF processor.
+
+    This lets us override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows:
+
+    ```python
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: BatchFeature,
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        return dict(image_patches=MultiModalFieldConfig.batched("image"))
+    ```
+
+### Prompt updates
+
+Override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] to
+return a list of [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instances.
+
+Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies an update operation
+(e.g.: insertion, replacement) performed by the HF processor.
+
+=== "Basic example: LLaVA"
+
+    Looking at HF's `LlavaProcessor`:
+
+    ```python
+    # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170
+    prompt_strings = []
+    for sample in text:
+        sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
+        prompt_strings.append(sample)
+    ```
+
+    It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
+    Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows:
+
+    ```python
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> Sequence[PromptUpdate]:
+        hf_config = self.info.get_hf_config()
+        image_token_id = hf_config.image_token_index
+
+        def get_replacement(item_idx: int):
+            images = mm_items.get_items("image", ImageProcessorItems)
+
+            image_size = images.get_image_size(item_idx)
+            num_image_tokens = self.info.get_num_image_tokens(
+                image_width=image_size.width,
+                image_height=image_size.height,
+            )
+
+            return [image_token_id] * num_image_tokens
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[image_token_id],
+                replacement=get_replacement,
+            ),
+        ]
+    ```
+
+=== "Handling additional tokens: Fuyu"
+
+    Recall the layout of feature tokens from Step 2:
+
+    ```
+    |SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
+    |SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
+    ...
+    |SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
+    ```
+
+    We define a helper function to return `ncols` and `nrows` directly:
+
+    ```python
+    def get_image_feature_grid_size(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+    ) -> tuple[int, int]:
+        image_processor = self.get_image_processor()
+        target_width = image_processor.size["width"]
+        target_height = image_processor.size["height"]
+        patch_width = image_processor.patch_size["width"]
+        patch_height = image_processor.patch_size["height"]
+
+        if not (image_width <= target_width and image_height <= target_height):
+            height_scale_factor = target_height / image_height
+            width_scale_factor = target_width / image_width
+            optimal_scale_factor = min(height_scale_factor, width_scale_factor)
+
+            image_height = int(image_height * optimal_scale_factor)
+            image_width = int(image_width * optimal_scale_factor)
+
+        ncols = math.ceil(image_width / patch_width)
+        nrows = math.ceil(image_height / patch_height)
+        return ncols, nrows
+    ```
+
+    Based on this, we can initially define our replacement tokens as:
+
+    ```python
+    def get_replacement(item_idx: int):
+        images = mm_items.get_items("image", ImageProcessorItems)
+        image_size = images.get_image_size(item_idx)
+
+        ncols, nrows = self.info.get_image_feature_grid_size(
+            image_width=image_size.width,
+            image_height=image_size.height,
+        )
+
+        # `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
+        # `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
+        return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
+    ```
+
+    However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
+    a BOS token (`<s>`) is also added to the prompt:
+
+    ```python
+    # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
+    model_image_input = self.image_processor.preprocess_with_tokenizer_info(
+        image_input=tensor_batch_images,
+        image_present=image_present,
+        image_unpadded_h=image_unpadded_heights,
+        image_unpadded_w=image_unpadded_widths,
+        image_placeholder_id=image_placeholder_id,
+        image_newline_id=image_newline_id,
+        variable_sized=True,
+    )
+    prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
+        tokenizer=self.tokenizer,
+        prompts=prompts,
+        scale_factors=scale_factors,
+        max_tokens_to_generate=self.max_tokens_to_generate,
+        max_position_embeddings=self.max_position_embeddings,
+        add_BOS=True,
+        add_beginning_of_answer_token=True,
+    )
+    ```
+
+    To assign the vision embeddings to only the image tokens, instead of a string
+    you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]:
+
+    ```python
+    hf_config = self.info.get_hf_config()
+    bos_token_id = hf_config.bos_token_id  # `<s>`
+    assert isinstance(bos_token_id, int)
+
+    def get_replacement_fuyu(item_idx: int):
+        images = mm_items.get_items("image", ImageProcessorItems)
+        image_size = images.get_image_size(item_idx)
+
+        ncols, nrows = self.info.get_image_feature_grid_size(
+            image_width=image_size.width,
+            image_height=image_size.height,
+        )
+        image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
+                        [_NEWLINE_TOKEN_ID]) * nrows
+
+        return PromptUpdateDetails.select_token_id(
+            image_tokens + [bos_token_id],
+            embed_token_id=_IMAGE_TOKEN_ID,
+        )
+    ```
+
+    Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
+    we can search for it to conduct the replacement at the start of the string:
+
+    ```python
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> Sequence[PromptUpdate]:
+        hf_config = self.info.get_hf_config()
+        bos_token_id = hf_config.bos_token_id
+        assert isinstance(bos_token_id, int)
+
+        tokenizer = self.info.get_tokenizer()
+        eot_token_id = tokenizer.bos_token_id
+        assert isinstance(eot_token_id, int)
+
+        def get_replacement_fuyu(item_idx: int):
+            images = mm_items.get_items("image", ImageProcessorItems)
+            image_size = images.get_image_size(item_idx)
+
+            ncols, nrows = self.info.get_image_feature_grid_size(
+                image_width=image_size.width,
+                image_height=image_size.height,
+            )
+            image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
+                            [_NEWLINE_TOKEN_ID]) * nrows
+
+            return PromptUpdateDetails.select_token_id(
+                image_tokens + [bos_token_id],
+                embed_token_id=_IMAGE_TOKEN_ID,
+            )
+
+        return [
+            PromptReplacement(
+                modality="image",
+                target=[eot_token_id],
+                replacement=get_replacement_fuyu,
+            )
+        ]
+    ```
+
+## 5. Register processor-related classes
+
+After you have defined [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] (Step 2),
+[BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] (Step 3),
+and [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] (Step 4),
+decorate the model class with [MULTIMODAL_REGISTRY.register_processor][vllm.multimodal.registry.MultiModalRegistry.register_processor]
+to register them to the multi-modal registry:
+
+```diff
+  from vllm.model_executor.models.interfaces import SupportsMultiModal
++ from vllm.multimodal import MULTIMODAL_REGISTRY
+
++ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
++                                         info=YourProcessingInfo,
++                                         dummy_inputs=YourDummyInputsBuilder)
+  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
+```
+
+## Notes
+
+### Inserting feature tokens without replacement
+
+Some HF processors directly insert feature tokens without replacing anything in the original prompt. In that case, you can use [PromptInsertion][vllm.multimodal.processing.PromptInsertion] instead of [PromptReplacement][vllm.multimodal.processing.PromptReplacement] inside [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates].
+
+Examples:
+
+- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
+- Florence2 (insert at start of prompt): <gh-file:vllm/model_executor/models/florence2.py>
+- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
+
+### Handling prompt updates unrelated to multi-modal data
+
+[_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] assumes that each application of a prompt update corresponds to one multi-modal item. If the HF processor performs additional processing regardless of how many multi-modal items there are, you should override [_apply_hf_processor_tokens_only][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_tokens_only] so that the processed token inputs are consistent with the result of applying the HF processor on text inputs. This is because token inputs bypass the HF processor according to [our design][mm-processing].
+
+Examples:
+
+- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
+- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
+- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
+
+### Custom HF processor
+
+Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to [_call_hf_processor][vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor].
+
+Examples:
+
+- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
+- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
+- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>
diff --git a/docs/contributing/model/registration.md b/docs/contributing/model/registration.md
new file mode 100644
index 000000000000..7a7bd7914058
--- /dev/null
+++ b/docs/contributing/model/registration.md
@@ -0,0 +1,54 @@
+---
+title: Registering a Model to vLLM
+---
+[](){ #new-model-registration }
+
+vLLM relies on a model registry to determine how to run each model.
+A list of pre-registered architectures can be found [here][supported-models].
+
+If your model is not on this list, you must register it to vLLM.
+This page provides detailed instructions on how to do so.
+
+## Built-in models
+
+To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
+This gives you the ability to modify the codebase and test your model.
+
+After you have implemented your model (see [tutorial][new-model-basic]), put it into the <gh-dir:vllm/model_executor/models> directory.
+Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
+Finally, update our [list of supported models][supported-models] to promote your model!
+
+!!! warning
+    The list of models in each section should be maintained in alphabetical order.
+
+## Out-of-tree models
+
+You can load an external model [using a plugin][plugin-system] without modifying the vLLM codebase.
+
+To register the model, use the following code:
+
+```python
+# The entrypoint of your plugin
+def register():
+    from vllm import ModelRegistry
+    from your_code import YourModelForCausalLM
+
+    ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
+```
+
+If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
+
+```python
+# The entrypoint of your plugin
+def register():
+    from vllm import ModelRegistry
+
+    ModelRegistry.register_model(
+        "YourModelForCausalLM",
+        "your_code:YourModelForCausalLM"
+    )
+```
+
+!!! warning
+    If your model is a multimodal model, ensure the model class implements the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface.
+    Read more about that [here][supports-multimodal].
diff --git a/docs/source/contributing/model/tests.md b/docs/contributing/model/tests.md
similarity index 74%
rename from docs/source/contributing/model/tests.md
rename to docs/contributing/model/tests.md
index 68d51d89f7cf..67f8eda61dc5 100644
--- a/docs/source/contributing/model/tests.md
+++ b/docs/contributing/model/tests.md
@@ -1,6 +1,7 @@
-(new-model-tests)=
-
-# Writing Unit Tests
+---
+title: Writing Unit Tests
+---
+[](){ #new-model-tests }
 
 This page explains how to write unit tests to verify the implementation of your model.
 
@@ -14,14 +15,12 @@ Without them, the CI for your PR will fail.
 Include an example HuggingFace repository for your model in <gh-file:tests/models/registry.py>.
 This enables a unit test that loads dummy weights to ensure that the model can be initialized in vLLM.
 
-:::{important}
-The list of models in each section should be maintained in alphabetical order.
-:::
+!!! warning
+    The list of models in each section should be maintained in alphabetical order.
 
-:::{tip}
-If your model requires a development version of HF Transformers, you can set
-`min_transformers_version` to skip the test in CI until the model is released.
-:::
+!!! tip
+    If your model requires a development version of HF Transformers, you can set
+    `min_transformers_version` to skip the test in CI until the model is released.
 
 ## Optional Tests
 
@@ -34,16 +33,16 @@ These tests compare the model outputs of vLLM against [HF Transformers](https://
 
 #### Generative models
 
-For [generative models](#generative-models), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>:
+For [generative models](../../models/generative_models.md), there are two levels of correctness tests, as defined in <gh-file:tests/models/utils.py>:
 
 - Exact correctness (`check_outputs_equal`): The text outputted by vLLM should exactly match the text outputted by HF.
 - Logprobs similarity (`check_logprobs_close`): The logprobs outputted by vLLM should be in the top-k logprobs outputted by HF, and vice versa.
 
 #### Pooling models
 
-For [pooling models](#pooling-models), we simply check the cosine similarity, as defined in <gh-file:tests/models/embedding/utils.py>.
+For [pooling models](../../models/pooling_models.md), we simply check the cosine similarity, as defined in <gh-file:tests/models/utils.py>.
 
-(mm-processing-tests)=
+[](){ #mm-processing-tests }
 
 ### Multi-modal processing
 
diff --git a/docs/source/contributing/profiling/profiling_index.md b/docs/contributing/profiling.md
similarity index 90%
rename from docs/source/contributing/profiling/profiling_index.md
rename to docs/contributing/profiling.md
index ce25daa39c5c..be01b9b65f65 100644
--- a/docs/source/contributing/profiling/profiling_index.md
+++ b/docs/contributing/profiling.md
@@ -1,8 +1,7 @@
 # Profiling vLLM
 
-:::{warning}
-Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference.
-:::
+!!! warning
+    Profiling is only intended for vLLM developers and maintainers to understand the proportion of time spent in different parts of the codebase. **vLLM end-users should never turn on profiling** as it will significantly slow down the inference.
 
 ## Profile with PyTorch Profiler
 
@@ -14,15 +13,13 @@ When using `benchmarks/benchmark_serving.py`, you can enable profiling by passin
 
 Traces can be visualized using <https://ui.perfetto.dev/>.
 
-:::{tip}
-Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
-:::
+!!! tip
+    Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, there is no need to untar the traces; they can be viewed directly.
 
-:::{tip}
-To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
-Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
-`export VLLM_RPC_TIMEOUT=1800000`
-:::
+!!! tip
+    When you stop the profiler, it flushes all the profile trace files to the directory. This takes time: for example, for about 100 requests' worth of data for a Llama 70B model, it takes about 10 minutes to flush on an H100.
+    Set the env variable VLLM_RPC_TIMEOUT to a big number before you start the server. Say something like 30 minutes.
+    `export VLLM_RPC_TIMEOUT=1800000`
 
 ### Example commands and usage
 
diff --git a/docs/source/contributing/vulnerability_management.md b/docs/contributing/vulnerability_management.md
similarity index 100%
rename from docs/source/contributing/vulnerability_management.md
rename to docs/contributing/vulnerability_management.md
diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md
new file mode 100644
index 000000000000..516640f6fd3c
--- /dev/null
+++ b/docs/deployment/docker.md
@@ -0,0 +1,129 @@
+---
+title: Using Docker
+---
+[](){ #deployment-docker }
+
+[](){ #deployment-docker-pre-built-image }
+
+## Use vLLM's Official Docker Image
+
+vLLM offers an official Docker image for deployment.
+The image can be used to run an OpenAI-compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).
+
+```console
+docker run --runtime nvidia --gpus all \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
+    -p 8000:8000 \
+    --ipc=host \
+    vllm/vllm-openai:latest \
+    --model mistralai/Mistral-7B-v0.1
+```
+
+This image can also be used with other container engines such as [Podman](https://podman.io/).
+
+```console
+podman run --gpus all \
+  -v ~/.cache/huggingface:/root/.cache/huggingface \
+  --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+  -p 8000:8000 \
+  --ipc=host \
+  vllm/vllm-openai:latest \
+  --model mistralai/Mistral-7B-v0.1
+```
+
+You can add any other [engine-args][engine-args] you need after the image tag (`vllm/vllm-openai:latest`).
+
+!!! note
+    You can either use the `--ipc=host` flag or `--shm-size` flag to allow the
+    container to access the host's shared memory. vLLM uses PyTorch, which uses shared
+    memory to share data between processes under the hood, particularly for tensor parallel inference.
+
+!!! note
+    Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>).
+
+    If you need to use those dependencies (having accepted the license terms),
+    create a custom Dockerfile on top of the base image with an extra layer that installs them:
+
+    ```Dockerfile
+    FROM vllm/vllm-openai:v0.8.3
+
+    # e.g. install the `audio` optional dependencies
+    # NOTE: Make sure the version of vLLM matches the base image!
+    RUN uv pip install --system vllm[audio]==0.8.3
+    ```
+
+!!! tip
+    Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers).
+
+    To use the development version of `transformers`, create a custom Dockerfile on top of the base image
+    with an extra layer that installs their code from source:
+
+    ```Dockerfile
+    FROM vllm/vllm-openai:latest
+
+    RUN uv pip install --system git+https://github.com/huggingface/transformers.git
+    ```
+
+[](){ #deployment-docker-build-image-from-source }
+
+## Building vLLM's Docker Image from Source
+
+You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
+
+```console
+# optionally specify: --build-arg max_jobs=8 --build-arg nvcc_threads=2
+DOCKER_BUILDKIT=1 docker build . \
+    --target vllm-openai \
+    --tag vllm/vllm-openai \
+    --file docker/Dockerfile
+```
+
+!!! note
+    By default vLLM will build for all GPU types for widest distribution. If you are just building for the
+    current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
+    for vLLM to find the current GPU type and build for that.
+
+    If you are using Podman instead of Docker, you might need to disable SELinux labeling by
+    adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
+
+## Building for Arm64/aarch64
+
+A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At the time of this writing, this requires the use
+of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
+
+!!! note
+    Multiple modules must be compiled, so this process can take a while. We recommend using the `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
+    flags to speed up the build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
+    Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
+
+```console
+# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
+python3 use_existing_torch.py
+DOCKER_BUILDKIT=1 docker build . \
+  --file docker/Dockerfile \
+  --target vllm-openai \
+  --platform "linux/arm64" \
+  -t vllm/vllm-gh200-openai:latest \
+  --build-arg max_jobs=66 \
+  --build-arg nvcc_threads=2 \
+  --build-arg torch_cuda_arch_list="9.0+PTX" \
+  --build-arg vllm_fa_cmake_gpu_arches="90-real"
+```
+
+## Use the custom-built vLLM Docker image
+
+To run vLLM with the custom-built Docker image:
+
+```console
+docker run --runtime nvidia --gpus all \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    -p 8000:8000 \
+    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
+    vllm/vllm-openai <args...>
+```
+
+The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
+
+!!! note
+    **For versions 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with the environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`.
diff --git a/docs/source/deployment/frameworks/anything-llm.md b/docs/deployment/frameworks/anything-llm.md
similarity index 78%
rename from docs/source/deployment/frameworks/anything-llm.md
rename to docs/deployment/frameworks/anything-llm.md
index d430c170ef54..a89e633c086e 100644
--- a/docs/source/deployment/frameworks/anything-llm.md
+++ b/docs/deployment/frameworks/anything-llm.md
@@ -1,6 +1,7 @@
-(deployment-anything-llm)=
-
-# Anything LLM
+---
+title: Anything LLM
+---
+[](){ #deployment-anything-llm }
 
 [Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting.
 
@@ -25,23 +26,19 @@ vllm serve Qwen/Qwen1.5-32B-Chat-AWQ --max-model-len 4096
   - Base URL: http://{vllm server host}:{vllm server port}/v1
   - Chat Model Name: `Qwen/Qwen1.5-32B-Chat-AWQ`
 
-:::{image} /assets/deployment/anything-llm-provider.png
-:::
+![](../../assets/deployment/anything-llm-provider.png)
 
 - Back to home page, New Workspace --> create `vllm` workspace, and start to chat:
 
-:::{image} /assets/deployment/anything-llm-chat-without-doc.png
-:::
+![](../../assets/deployment/anything-llm-chat-without-doc.png)
 
 - Click the upload button:
   - upload the doc
   - select the doc and move to the workspace
   - save and embed
 
-:::{image} /assets/deployment/anything-llm-upload-doc.png
-:::
+![](../../assets/deployment/anything-llm-upload-doc.png)
 
 - Chat again:
 
-:::{image} /assets/deployment/anything-llm-chat-with-doc.png
-:::
+![](../../assets/deployment/anything-llm-chat-with-doc.png)
diff --git a/docs/deployment/frameworks/autogen.md b/docs/deployment/frameworks/autogen.md
new file mode 100644
index 000000000000..ad8c167659ef
--- /dev/null
+++ b/docs/deployment/frameworks/autogen.md
@@ -0,0 +1,83 @@
+---
+title: AutoGen
+---
+[](){ #deployment-autogen }
+
+[AutoGen](https://github.com/microsoft/autogen) is a framework for creating multi-agent AI applications that can act autonomously or work alongside humans.
+
+## Prerequisites
+
+- Setup vLLM environment
+
+- Setup [AutoGen](https://microsoft.github.io/autogen/0.2/docs/installation/) environment
+
+```console
+pip install vllm
+
+# Install AgentChat and OpenAI client from Extensions
+# AutoGen requires Python 3.10 or later.
+pip install -U "autogen-agentchat" "autogen-ext[openai]"
+```
+
+## Deploy
+
+- Start the vLLM server with the supported chat completion model, e.g.
+
+```console
+python -m vllm.entrypoints.openai.api_server \
+    --model mistralai/Mistral-7B-Instruct-v0.2
+```
+
+- Call it with AutoGen:
+
+```python
+import asyncio
+from autogen_core.models import UserMessage
+from autogen_ext.models.openai import OpenAIChatCompletionClient
+from autogen_core.models import ModelFamily
+
+
+async def main() -> None:
+    # Create a model client
+    model_client = OpenAIChatCompletionClient(
+        model="mistralai/Mistral-7B-Instruct-v0.2",
+        base_url="http://{your-vllm-host-ip}:{your-vllm-host-port}/v1",
+        api_key="EMPTY",
+        model_info={
+            "vision": False,
+            "function_calling": False,
+            "json_output": False,
+            "family": ModelFamily.MISTRAL,
+            "structured_output": True,
+        },
+    )
+
+    messages = [UserMessage(content="Write a very short story about a dragon.", source="user")]
+
+    # Create a stream.
+    stream = model_client.create_stream(messages=messages)
+
+    # Iterate over the stream and print the responses.
+    print("Streamed responses:")
+    async for response in stream:
+        if isinstance(response, str):
+            # A partial response is a string.
+            print(response, flush=True, end="")
+        else:
+            # The last response is a CreateResult object with the complete message.
+            print("\n\n------------\n")
+            print("The complete response:", flush=True)
+            print(response.content, flush=True)
+
+    # Close the client when done.
+    await model_client.close()
+
+
+asyncio.run(main())
+```
+
+For details, see the tutorial:
+
+- [Using vLLM in AutoGen](https://microsoft.github.io/autogen/0.2/docs/topics/non-openai-models/local-vllm/)
+
+- [OpenAI-compatible API examples](https://microsoft.github.io/autogen/stable/reference/python/autogen_ext.models.openai.html#autogen_ext.models.openai.OpenAIChatCompletionClient)
diff --git a/docs/source/deployment/frameworks/bentoml.md b/docs/deployment/frameworks/bentoml.md
similarity index 89%
rename from docs/source/deployment/frameworks/bentoml.md
rename to docs/deployment/frameworks/bentoml.md
index 2bf435bda838..7e64b6eb6fb0 100644
--- a/docs/source/deployment/frameworks/bentoml.md
+++ b/docs/deployment/frameworks/bentoml.md
@@ -1,6 +1,7 @@
-(deployment-bentoml)=
-
-# BentoML
+---
+title: BentoML
+---
+[](){ #deployment-bentoml }
 
 [BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.
 
diff --git a/docs/source/deployment/frameworks/cerebrium.md b/docs/deployment/frameworks/cerebrium.md
similarity index 98%
rename from docs/source/deployment/frameworks/cerebrium.md
rename to docs/deployment/frameworks/cerebrium.md
index b20c95137b6e..84cb2304fac2 100644
--- a/docs/source/deployment/frameworks/cerebrium.md
+++ b/docs/deployment/frameworks/cerebrium.md
@@ -1,12 +1,11 @@
-(deployment-cerebrium)=
+---
+title: Cerebrium
+---
+[](){ #deployment-cerebrium }
 
-# Cerebrium
-
-:::{raw} html
 <p align="center">
     <img src="https://i.ibb.co/hHcScTT/Screenshot-2024-06-13-at-10-14-54.png" alt="vLLM_plus_cerebrium"/>
 </p>
-:::
 
 vLLM can be run on a cloud based GPU machine with [Cerebrium](https://www.cerebrium.ai/), a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications.
 
diff --git a/docs/source/deployment/frameworks/chatbox.md b/docs/deployment/frameworks/chatbox.md
similarity index 84%
rename from docs/source/deployment/frameworks/chatbox.md
rename to docs/deployment/frameworks/chatbox.md
index e62f4647150f..10da2fc71002 100644
--- a/docs/source/deployment/frameworks/chatbox.md
+++ b/docs/deployment/frameworks/chatbox.md
@@ -1,6 +1,7 @@
-(deployment-chatbox)=
-
-# Chatbox
+---
+title: Chatbox
+---
+[](){ #deployment-chatbox }
 
 [Chatbox](https://github.com/chatboxai/chatbox) is a desktop client for LLMs, available on Windows, Mac, Linux.
 
@@ -27,10 +28,8 @@ vllm serve qwen/Qwen1.5-0.5B-Chat
   - API Path: `/chat/completions`
   - Model: `qwen/Qwen1.5-0.5B-Chat`
 
-:::{image} /assets/deployment/chatbox-settings.png
-:::
+![](../../assets/deployment/chatbox-settings.png)
 
 - Go to `Just chat`, and start to chat:
 
-:::{image} /assets/deployment/chatbox-chat.png
-:::
+![](../../assets/deployment/chatbox-chat.png)
diff --git a/docs/source/deployment/frameworks/dify.md b/docs/deployment/frameworks/dify.md
similarity index 90%
rename from docs/source/deployment/frameworks/dify.md
rename to docs/deployment/frameworks/dify.md
index 5cdf6a387637..886484b54347 100644
--- a/docs/source/deployment/frameworks/dify.md
+++ b/docs/deployment/frameworks/dify.md
@@ -1,6 +1,7 @@
-(deployment-dify)=
-
-# Dify
+---
+title: Dify
+---
+[](){ #deployment-dify }
 
 [Dify](https://github.com/langgenius/dify) is an open-source LLM app development platform. Its intuitive interface combines agentic AI workflow, RAG pipeline, agent capabilities, model management, observability features, and more, allowing you to quickly move from prototype to production.
 
@@ -42,15 +43,12 @@ docker compose up -d
   - **Model Name for API Endpoint**: `Qwen/Qwen1.5-7B-Chat`
   - **Completion Mode**: `Completion`
 
-:::{image} /assets/deployment/dify-settings.png
-:::
+![](../../assets/deployment/dify-settings.png)
 
 - To create a test chatbot, go to `Studio → Chatbot → Create from Blank`, then select Chatbot as the type:
 
-:::{image} /assets/deployment/dify-create-chatbot.png
-:::
+![](../../assets/deployment/dify-create-chatbot.png)
 
 - Click the chatbot you just created to open the chat interface and start interacting with the model:
 
-:::{image} /assets/deployment/dify-chat.png
-:::
+![](../../assets/deployment/dify-chat.png)
diff --git a/docs/source/deployment/frameworks/dstack.md b/docs/deployment/frameworks/dstack.md
similarity index 83%
rename from docs/source/deployment/frameworks/dstack.md
rename to docs/deployment/frameworks/dstack.md
index a16e28f2d898..7de92855745b 100644
--- a/docs/source/deployment/frameworks/dstack.md
+++ b/docs/deployment/frameworks/dstack.md
@@ -1,12 +1,11 @@
-(deployment-dstack)=
+---
+title: dstack
+---
+[](){ #deployment-dstack }
 
-# dstack
-
-:::{raw} html
 <p align="center">
     <img src="https://i.ibb.co/71kx6hW/vllm-dstack.png" alt="vLLM_plus_dstack"/>
 </p>
-:::
 
 vLLM can be run on a cloud based GPU machine with [dstack](https://dstack.ai/), an open-source framework for running LLMs on any cloud. This tutorial assumes that you have already configured credentials, gateway, and GPU quotas on your cloud environment.
 
@@ -97,6 +96,5 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message.content)
 ```
 
-:::{note}
-dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision dstack `Task` instead of `Service`. The `Task` is for development purpose only. If you want to know more about hands-on materials how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm)
-:::
+!!! note
+    dstack automatically handles authentication on the gateway using dstack's tokens. Meanwhile, if you don't want to configure a gateway, you can provision a dstack `Task` instead of a `Service`. The `Task` is for development purposes only. For hands-on materials on how to serve vLLM using dstack, check out [this repository](https://github.com/dstackai/dstack-examples/tree/main/deployment/vllm).
diff --git a/docs/deployment/frameworks/haystack.md b/docs/deployment/frameworks/haystack.md
new file mode 100644
index 000000000000..2eac4a5279fd
--- /dev/null
+++ b/docs/deployment/frameworks/haystack.md
@@ -0,0 +1,60 @@
+---
+title: Haystack
+---
+[](){ #deployment-haystack }
+
+# Haystack
+
+[Haystack](https://github.com/deepset-ai/haystack) is an end-to-end LLM framework that allows you to build applications powered by LLMs, Transformer models, vector search and more. Whether you want to perform retrieval-augmented generation (RAG), document search, question answering or answer generation, Haystack can orchestrate state-of-the-art embedding models and LLMs into pipelines to build end-to-end NLP applications and solve your use case.
+
+It allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints.
+
+## Prerequisites
+
+- Setup vLLM and Haystack environment
+
+```console
+pip install vllm haystack-ai
+```
+
+## Deploy
+
+- Start the vLLM server with the supported chat completion model, e.g.
+
+```console
+vllm serve mistralai/Mistral-7B-Instruct-v0.1
+```
+
+- Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.
+
+```python
+from haystack.components.generators.chat import OpenAIChatGenerator
+from haystack.dataclasses import ChatMessage
+from haystack.utils import Secret
+
+generator = OpenAIChatGenerator(
+    # for compatibility with the OpenAI API, a placeholder api_key is needed
+    api_key=Secret.from_token("VLLM-PLACEHOLDER-API-KEY"),
+    model="mistralai/Mistral-7B-Instruct-v0.1",
+    api_base_url="http://{your-vLLM-host-ip}:{your-vLLM-host-port}/v1",
+    generation_kwargs = {"max_tokens": 512}
+)
+
+response = generator.run(
+  messages=[ChatMessage.from_user("Hi. Can you help me plan my next trip to Italy?")]
+)
+
+print("-"*30)
+print(response)
+print("-"*30)
+```
+
+Output e.g.:
+
+```console
+------------------------------
+{'replies': [ChatMessage(_role=<ChatRole.ASSISTANT: 'assistant'>, _content=[TextContent(text=' Of course! Where in Italy would you like to go and what type of trip are you looking to plan?')], _name=None, _meta={'model': 'mistralai/Mistral-7B-Instruct-v0.1', 'index': 0, 'finish_reason': 'stop', 'usage': {'completion_tokens': 23, 'prompt_tokens': 21, 'total_tokens': 44, 'completion_tokens_details': None, 'prompt_tokens_details': None}})]}
+------------------------------
+```
+
+For details, see the tutorial [Using vLLM in Haystack](https://github.com/deepset-ai/haystack-integrations/blob/main/integrations/vllm.md).
diff --git a/docs/deployment/frameworks/helm.md b/docs/deployment/frameworks/helm.md
new file mode 100644
index 000000000000..192b90438acf
--- /dev/null
+++ b/docs/deployment/frameworks/helm.md
@@ -0,0 +1,95 @@
+---
+title: Helm
+---
+[](){ #deployment-helm }
+
+A Helm chart to deploy vLLM for Kubernetes
+
+Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.
+
+This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for Helm installation, and documentation on the architecture and values file.
+
+## Prerequisites
+
+Before you begin, ensure that you have the following:
+
+- A running Kubernetes cluster
+- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
+- Available GPU resources in your cluster
+- An S3 bucket containing the model to be deployed
+
+## Installing the chart
+
+To install the chart with the release name `test-vllm`:
+
+```console
+helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY
+```
+
+## Uninstalling the chart
+
+To uninstall the `test-vllm` deployment:
+
+```console
+helm uninstall test-vllm --namespace=ns-vllm
+```
+
+The command removes all the Kubernetes components associated with the
+chart **including persistent volumes** and deletes the release.
+
+## Architecture
+
+![](../../assets/deployment/architecture_helm_deployment.png)
+
+## Values
+
+| Key                                        | Type    | Default                                                                                                                                                  | Description                                                                                                                               |
+|--------------------------------------------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------|
+| autoscaling                                | object  | {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}                                                                  | Autoscaling configuration                                                                                                                 |
+| autoscaling.enabled                        | bool    | false                                                                                                                                                    | Enable autoscaling                                                                                                                        |
+| autoscaling.maxReplicas                    | int     | 100                                                                                                                                                      | Maximum replicas                                                                                                                          |
+| autoscaling.minReplicas                    | int     | 1                                                                                                                                                        | Minimum replicas                                                                                                                          |
+| autoscaling.targetCPUUtilizationPercentage | int     | 80                                                                                                                                                       | Target CPU utilization for autoscaling                                                                                                    |
+| configs                                    | object  | {}                                                                                                                                                       | Configmap                                                                                                                                 |
+| containerPort                              | int     | 8000                                                                                                                                                     | Container port                                                                                                                            |
+| customObjects                              | list    | []                                                                                                                                                       | Custom Objects configuration                                                                                                              |
+| deploymentStrategy                         | object  | {}                                                                                                                                                       | Deployment strategy configuration                                                                                                         |
+| externalConfigs                            | list    | []                                                                                                                                                       | External configuration                                                                                                                    |
+| extraContainers                            | list    | []                                                                                                                                                       | Additional containers configuration                                                                                                       |
+| extraInit                                  | object  | {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}                                                     | Additional configuration for the init container                                                                                           |
+| extraInit.pvcStorage                       | string  | "50Gi"                                                                                                                                                   | Storage size of the s3                                                                                                                    |
+| extraInit.s3modelpath                      | string  | "relative_s3_model_path/opt-125m"                                                                                                                        | Path of the model on the s3 which hosts model weights and config files                                                                    |
+| extraInit.awsEc2MetadataDisabled           | boolean | true                                                                                                                                                     | Disables the use of the Amazon EC2 instance metadata service                                                                              |
+| extraPorts                                 | list    | []                                                                                                                                                       | Additional ports configuration                                                                                                            |
+| gpuModels                                  | list    | ["TYPE_GPU_USED"]                                                                                                                                        | Type of GPU used                                                                                                                          |
+| image                                      | object  | {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"} | Image configuration                                                                                                                       |
+| image.command                              | list    | ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]                                                            | Container launch command                                                                                                                  |
+| image.repository                           | string  | "vllm/vllm-openai"                                                                                                                                       | Image repository                                                                                                                          |
+| image.tag                                  | string  | "latest"                                                                                                                                                 | Image tag                                                                                                                                 |
+| livenessProbe                              | object  | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}                                              | Liveness probe configuration                                                                                                              |
+| livenessProbe.failureThreshold             | int     | 3                                                                                                                                                        | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive |
+| livenessProbe.httpGet                      | object  | {"path":"/health","port":8000}                                                                                                                           | Configuration of the Kubelet http request on the server                                                                                   |
+| livenessProbe.httpGet.path                 | string  | "/health"                                                                                                                                                | Path to access on the HTTP server                                                                                                         |
+| livenessProbe.httpGet.port                 | int     | 8000                                                                                                                                                     | Name or number of the port to access on the container, on which the server is listening                                                   |
+| livenessProbe.initialDelaySeconds          | int     | 15                                                                                                                                                       | Number of seconds after the container has started before liveness probe is initiated                                                      |
+| livenessProbe.periodSeconds                | int     | 10                                                                                                                                                       | How often (in seconds) to perform the liveness probe                                                                                      |
+| maxUnavailablePodDisruptionBudget          | string  | ""                                                                                                                                                       | Disruption Budget Configuration                                                                                                           |
+| readinessProbe                             | object  | {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}                                                | Readiness probe configuration                                                                                                             |
+| readinessProbe.failureThreshold            | int     | 3                                                                                                                                                        | Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready |
+| readinessProbe.httpGet                     | object  | {"path":"/health","port":8000}                                                                                                                           | Configuration of the Kubelet http request on the server                                                                                   |
+| readinessProbe.httpGet.path                | string  | "/health"                                                                                                                                                | Path to access on the HTTP server                                                                                                         |
+| readinessProbe.httpGet.port                | int     | 8000                                                                                                                                                     | Name or number of the port to access on the container, on which the server is listening                                                   |
+| readinessProbe.initialDelaySeconds         | int     | 5                                                                                                                                                        | Number of seconds after the container has started before readiness probe is initiated                                                     |
+| readinessProbe.periodSeconds               | int     | 5                                                                                                                                                        | How often (in seconds) to perform the readiness probe                                                                                     |
+| replicaCount                               | int     | 1                                                                                                                                                        | Number of replicas                                                                                                                        |
+| resources                                  | object  | {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}                                          | Resource configuration                                                                                                                    |
+| resources.limits."nvidia.com/gpu"          | int     | 1                                                                                                                                                        | Number of gpus used                                                                                                                       |
+| resources.limits.cpu                       | int     | 4                                                                                                                                                        | Number of CPUs                                                                                                                            |
+| resources.limits.memory                    | string  | "16Gi"                                                                                                                                                   | CPU memory configuration                                                                                                                  |
+| resources.requests."nvidia.com/gpu"        | int     | 1                                                                                                                                                        | Number of gpus used                                                                                                                       |
+| resources.requests.cpu                     | int     | 4                                                                                                                                                        | Number of CPUs                                                                                                                            |
+| resources.requests.memory                  | string  | "16Gi"                                                                                                                                                   | CPU memory configuration                                                                                                                  |
+| secrets                                    | object  | {}                                                                                                                                                       | Secrets configuration                                                                                                                     |
+| serviceName                                | string  | Service name                                                                                                                                             |                                                                                                                                           |
+| servicePort                                | int     | 80                                                                                                                                                       | Service port                                                                                                                              |
+| labels.environment                         | string  | test                                                                                                                                                     | Environment name                                                                                                                          |
diff --git a/docs/source/deployment/frameworks/litellm.md b/docs/deployment/frameworks/litellm.md
similarity index 97%
rename from docs/source/deployment/frameworks/litellm.md
rename to docs/deployment/frameworks/litellm.md
index 6dd3607ca5e3..3011cde83018 100644
--- a/docs/source/deployment/frameworks/litellm.md
+++ b/docs/deployment/frameworks/litellm.md
@@ -1,6 +1,7 @@
-(deployment-litellm)=
-
-# LiteLLM
+---
+title: LiteLLM
+---
+[](){ #deployment-litellm }
 
 [LiteLLM](https://github.com/BerriAI/litellm) call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.]
 
diff --git a/docs/source/deployment/frameworks/lobe-chat.md b/docs/deployment/frameworks/lobe-chat.md
similarity index 89%
rename from docs/source/deployment/frameworks/lobe-chat.md
rename to docs/deployment/frameworks/lobe-chat.md
index 6d86b7fa9cce..cd95c028155e 100644
--- a/docs/source/deployment/frameworks/lobe-chat.md
+++ b/docs/deployment/frameworks/lobe-chat.md
@@ -1,6 +1,7 @@
-(deployment-lobe-chat)=
-
-# Lobe Chat
+---
+title: Lobe Chat
+---
+[](){ #deployment-lobe-chat }
 
 [Lobe Chat](https://github.com/lobehub/lobe-chat) is an open-source, modern-design ChatGPT/LLMs UI/Framework.
 
diff --git a/docs/source/deployment/frameworks/lws.md b/docs/deployment/frameworks/lws.md
similarity index 99%
rename from docs/source/deployment/frameworks/lws.md
rename to docs/deployment/frameworks/lws.md
index 4e9a03b5c4c1..18282a89ddff 100644
--- a/docs/source/deployment/frameworks/lws.md
+++ b/docs/deployment/frameworks/lws.md
@@ -1,6 +1,7 @@
-(deployment-lws)=
-
-# LWS
+---
+title: LWS
+---
+[](){ #deployment-lws }
 
 LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads.
 A major use case is for multi-host/multi-node distributed inference.
diff --git a/docs/source/deployment/frameworks/modal.md b/docs/deployment/frameworks/modal.md
similarity index 85%
rename from docs/source/deployment/frameworks/modal.md
rename to docs/deployment/frameworks/modal.md
index e7c42088e36a..dbdb739a1000 100644
--- a/docs/source/deployment/frameworks/modal.md
+++ b/docs/deployment/frameworks/modal.md
@@ -1,6 +1,7 @@
-(deployment-modal)=
-
-# Modal
+---
+title: Modal
+---
+[](){ #deployment-modal }
 
 vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling.
 
diff --git a/docs/source/deployment/frameworks/open-webui.md b/docs/deployment/frameworks/open-webui.md
similarity index 87%
rename from docs/source/deployment/frameworks/open-webui.md
rename to docs/deployment/frameworks/open-webui.md
index 83e5303a00ef..1ab1931068fa 100644
--- a/docs/source/deployment/frameworks/open-webui.md
+++ b/docs/deployment/frameworks/open-webui.md
@@ -1,6 +1,7 @@
-(deployment-open-webui)=
-
-# Open WebUI
+---
+title: Open WebUI
+---
+[](){ #deployment-open-webui }
 
 1. Install the [Docker](https://docs.docker.com/engine/install/)
 
@@ -25,5 +26,4 @@ ghcr.io/open-webui/open-webui:main
 
 On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.
 
-:::{image} /assets/deployment/open_webui.png
-:::
+![](../../assets/deployment/open_webui.png)
diff --git a/docs/source/deployment/frameworks/retrieval_augmented_generation.md b/docs/deployment/frameworks/retrieval_augmented_generation.md
similarity index 96%
rename from docs/source/deployment/frameworks/retrieval_augmented_generation.md
rename to docs/deployment/frameworks/retrieval_augmented_generation.md
index f84451fafe91..cb26c8378dee 100644
--- a/docs/source/deployment/frameworks/retrieval_augmented_generation.md
+++ b/docs/deployment/frameworks/retrieval_augmented_generation.md
@@ -1,6 +1,7 @@
-(deployment-retrieval-augmented-generation)=
-
-# Retrieval-Augmented Generation
+---
+title: Retrieval-Augmented Generation
+---
+[](){ #deployment-retrieval-augmented-generation }
 
 [Retrieval-augmented generation (RAG)](https://en.wikipedia.org/wiki/Retrieval-augmented_generation) is a technique that enables generative artificial intelligence (Gen AI) models to retrieve and incorporate new information. It modifies interactions with a large language model (LLM) so that the model responds to user queries with reference to a specified set of documents, using this information to supplement information from its pre-existing training data. This allows LLMs to use domain-specific and/or updated information. Use cases include providing chatbot access to internal company data or generating responses based on authoritative sources.
 
diff --git a/docs/source/deployment/frameworks/skypilot.md b/docs/deployment/frameworks/skypilot.md
similarity index 94%
rename from docs/source/deployment/frameworks/skypilot.md
rename to docs/deployment/frameworks/skypilot.md
index 5e101b900103..9763745f2378 100644
--- a/docs/source/deployment/frameworks/skypilot.md
+++ b/docs/deployment/frameworks/skypilot.md
@@ -1,12 +1,11 @@
-(deployment-skypilot)=
+---
+title: SkyPilot
+---
+[](){ #deployment-skypilot }
 
-# SkyPilot
-
-:::{raw} html
 <p align="center">
   <img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
 </p>
-:::
 
 vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with [SkyPilot](https://github.com/skypilot-org/skypilot), an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc, can be found in [SkyPilot AI gallery](https://skypilot.readthedocs.io/en/latest/gallery/index.html).
 
@@ -83,7 +82,11 @@ Check the output of the command. There will be a shareable gradio link (like the
 **Optional**: Serve the 70B model instead of the default 8B and use more GPU:
 
 ```console
-HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
+HF_TOKEN="your-huggingface-token" \
+  sky launch serving.yaml \
+  --gpus A100:8 \
+  --env HF_TOKEN \
+  --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
 ```
 
 ## Scale up to multiple replicas
@@ -104,10 +107,8 @@ service:
   max_completion_tokens: 1
 ```
 
-:::{raw} html
 <details>
 <summary>Click to see the full recipe YAML</summary>
-:::
 
 ```yaml
 service:
@@ -153,14 +154,14 @@ run: |
     2>&1 | tee api_server.log
 ```
 
-:::{raw} html
 </details>
-:::
 
 Start the serving the Llama-3 8B model on multiple replicas:
 
 ```console
-HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN
+HF_TOKEN="your-huggingface-token" \
+  sky serve up -n vllm serving.yaml \
+  --env HF_TOKEN
 ```
 
 Wait until the service is ready:
@@ -169,10 +170,8 @@ Wait until the service is ready:
 watch -n10 sky serve status vllm
 ```
 
-:::{raw} html
 <details>
 <summary>Example outputs:</summary>
-:::
 
 ```console
 Services
@@ -185,9 +184,7 @@ vllm          1   1        xx.yy.zz.121  18 mins ago  1x GCP([Spot]{'L4': 1})  R
 vllm          2   1        xx.yy.zz.245  18 mins ago  1x GCP([Spot]{'L4': 1})  READY   us-east4
 ```
 
-:::{raw} html
 </details>
-:::
 
 After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
 
@@ -223,10 +220,8 @@ service:
 
 This will scale the service up to when the QPS exceeds 2 for each replica.
 
-:::{raw} html
 <details>
 <summary>Click to see the full recipe YAML</summary>
-:::
 
 ```yaml
 service:
@@ -275,9 +270,7 @@ run: |
     2>&1 | tee api_server.log
 ```
 
-:::{raw} html
 </details>
-:::
 
 To update the service with the new config:
 
@@ -295,10 +288,8 @@ sky serve down vllm
 
 It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas.
 
-:::{raw} html
 <details>
 <summary>Click to see the full GUI YAML</summary>
-:::
 
 ```yaml
 envs:
@@ -328,14 +319,14 @@ run: |
     --stop-token-ids 128009,128001 | tee ~/gradio.log
 ```
 
-:::{raw} html
 </details>
-:::
 
 1. Start the chat web UI:
 
     ```console
-    sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
+    sky launch \
+      -c gui ./gui.yaml \
+      --env ENDPOINT=$(sky serve status --endpoint vllm)
     ```
 
 2. Then, we can access the GUI at the returned gradio link:
diff --git a/docs/source/deployment/frameworks/streamlit.md b/docs/deployment/frameworks/streamlit.md
similarity index 81%
rename from docs/source/deployment/frameworks/streamlit.md
rename to docs/deployment/frameworks/streamlit.md
index 084550ec991e..33ed8c5f5b54 100644
--- a/docs/source/deployment/frameworks/streamlit.md
+++ b/docs/deployment/frameworks/streamlit.md
@@ -1,6 +1,7 @@
-(deployment-streamlit)=
-
-# Streamlit
+---
+title: Streamlit
+---
+[](){ #deployment-streamlit }
 
 [Streamlit](https://github.com/streamlit/streamlit) lets you transform Python scripts into interactive web apps in minutes, instead of weeks. Build dashboards, generate reports, or create chat apps.
 
@@ -32,11 +33,11 @@ pip install streamlit openai
 streamlit run streamlit_openai_chatbot_webserver.py
 
 # or specify the VLLM_API_BASE or VLLM_API_KEY
-VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" streamlit run streamlit_openai_chatbot_webserver.py
+VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" \
+    streamlit run streamlit_openai_chatbot_webserver.py
 
 # start with debug mode to view more details
 streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug
 ```
 
-:::{image} /assets/deployment/streamlit-chat.png
-:::
+![](../../assets/deployment/streamlit-chat.png)
diff --git a/docs/source/deployment/frameworks/triton.md b/docs/deployment/frameworks/triton.md
similarity index 87%
rename from docs/source/deployment/frameworks/triton.md
rename to docs/deployment/frameworks/triton.md
index 94d87120159c..082bc24d85aa 100644
--- a/docs/source/deployment/frameworks/triton.md
+++ b/docs/deployment/frameworks/triton.md
@@ -1,5 +1,6 @@
-(deployment-triton)=
-
-# NVIDIA Triton
+---
+title: NVIDIA Triton
+---
+[](){ #deployment-triton }
 
 The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details.
diff --git a/docs/source/deployment/integrations/kserve.md b/docs/deployment/integrations/kserve.md
similarity index 85%
rename from docs/source/deployment/integrations/kserve.md
rename to docs/deployment/integrations/kserve.md
index c780fd74e8f5..754b983dee92 100644
--- a/docs/source/deployment/integrations/kserve.md
+++ b/docs/deployment/integrations/kserve.md
@@ -1,6 +1,7 @@
-(deployment-kserve)=
-
-# KServe
+---
+title: KServe
+---
+[](){ #deployment-kserve }
 
 vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.
 
diff --git a/docs/source/deployment/integrations/kubeai.md b/docs/deployment/integrations/kubeai.md
similarity index 93%
rename from docs/source/deployment/integrations/kubeai.md
rename to docs/deployment/integrations/kubeai.md
index 2f5772e075d8..ba0a3c52cca7 100644
--- a/docs/source/deployment/integrations/kubeai.md
+++ b/docs/deployment/integrations/kubeai.md
@@ -1,6 +1,7 @@
-(deployment-kubeai)=
-
-# KubeAI
+---
+title: KubeAI
+---
+[](){ #deployment-kubeai }
 
 [KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.
 
diff --git a/docs/source/deployment/integrations/llamastack.md b/docs/deployment/integrations/llamastack.md
similarity index 94%
rename from docs/source/deployment/integrations/llamastack.md
rename to docs/deployment/integrations/llamastack.md
index a6c3569637ab..2ae600a423ff 100644
--- a/docs/source/deployment/integrations/llamastack.md
+++ b/docs/deployment/integrations/llamastack.md
@@ -1,6 +1,7 @@
-(deployment-llamastack)=
-
-# Llama Stack
+---
+title: Llama Stack
+---
+[](){ #deployment-llamastack }
 
 vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) .
 
diff --git a/docs/source/deployment/integrations/llmaz.md b/docs/deployment/integrations/llmaz.md
similarity index 87%
rename from docs/source/deployment/integrations/llmaz.md
rename to docs/deployment/integrations/llmaz.md
index cd4a76353d26..03d284c34769 100644
--- a/docs/source/deployment/integrations/llmaz.md
+++ b/docs/deployment/integrations/llmaz.md
@@ -1,6 +1,7 @@
-(deployment-llmaz)=
-
-# llmaz
+---
+title: llmaz
+---
+[](){ #deployment-llmaz }
 
 [llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed for production use. It uses vLLM as the default model serving backend.
 
diff --git a/docs/source/deployment/integrations/production-stack.md b/docs/deployment/integrations/production-stack.md
similarity index 98%
rename from docs/source/deployment/integrations/production-stack.md
rename to docs/deployment/integrations/production-stack.md
index 05f1568306cc..8288a4b6e6be 100644
--- a/docs/source/deployment/integrations/production-stack.md
+++ b/docs/deployment/integrations/production-stack.md
@@ -1,6 +1,7 @@
-(deployment-production-stack)=
-
-# Production stack
+---
+title: Production stack
+---
+[](){ #deployment-production-stack }
 
 Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using the [vLLM production stack](https://github.com/vllm-project/production-stack). Born out of a Berkeley-UChicago collaboration, [vLLM production stack](https://github.com/vllm-project/production-stack) is an officially released, production-optimized codebase under the [vLLM project](https://github.com/vllm-project), designed for LLM deployment with:
 
@@ -114,7 +115,7 @@ To remove the deployment, run:
 sudo helm uninstall vllm
 ```
 
-------
+---
 
 ### (Advanced) Configuring vLLM production stack
 
diff --git a/docs/source/deployment/k8s.md b/docs/deployment/k8s.md
similarity index 98%
rename from docs/source/deployment/k8s.md
rename to docs/deployment/k8s.md
index 9079cfa8e1b6..6b08c4960d02 100644
--- a/docs/source/deployment/k8s.md
+++ b/docs/deployment/k8s.md
@@ -1,6 +1,7 @@
-(deployment-k8s)=
-
-# Using Kubernetes
+---
+title: Using Kubernetes
+---
+[](){ #deployment-k8s }
 
 Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.
 
@@ -8,6 +9,7 @@ Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine le
 * [Deployment with GPUs](#deployment-with-gpus)
 
 Alternatively, you can deploy vLLM to Kubernetes using any of the following:
+
 * [Helm](frameworks/helm.md)
 * [InftyAI/llmaz](integrations/llmaz.md)
 * [KServe](integrations/kserve.md)
@@ -19,9 +21,8 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
 
 ## Deployment with CPUs
 
-:::{note}
-The use of CPUs here is for demonstration and testing purposes only and its performance will not be on par with GPUs.
-:::
+!!! note
+    The use of CPUs here is for demonstration and testing purposes only and its performance will not be on par with GPUs.
 
 First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model:
 
diff --git a/docs/source/deployment/nginx.md b/docs/deployment/nginx.md
similarity index 60%
rename from docs/source/deployment/nginx.md
rename to docs/deployment/nginx.md
index bf404f1098c3..80242919ba5b 100644
--- a/docs/source/deployment/nginx.md
+++ b/docs/deployment/nginx.md
@@ -1,20 +1,21 @@
-(nginxloadbalancer)=
-
-# Using Nginx
+---
+title: Using Nginx
+---
+[](){ #nginxloadbalancer }
 
 This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
 
 Table of contents:
 
-1. [Build Nginx Container](#nginxloadbalancer-nginx-build)
-2. [Create Simple Nginx Config file](#nginxloadbalancer-nginx-conf)
-3. [Build vLLM Container](#nginxloadbalancer-nginx-vllm-container)
-4. [Create Docker Network](#nginxloadbalancer-nginx-docker-network)
-5. [Launch vLLM Containers](#nginxloadbalancer-nginx-launch-container)
-6. [Launch Nginx](#nginxloadbalancer-nginx-launch-nginx)
-7. [Verify That vLLM Servers Are Ready](#nginxloadbalancer-nginx-verify-nginx)
+1. [Build Nginx Container][nginxloadbalancer-nginx-build]
+2. [Create Simple Nginx Config file][nginxloadbalancer-nginx-conf]
+3. [Build vLLM Container][nginxloadbalancer-nginx-vllm-container]
+4. [Create Docker Network][nginxloadbalancer-nginx-docker-network]
+5. [Launch vLLM Containers][nginxloadbalancer-nginx-launch-container]
+6. [Launch Nginx][nginxloadbalancer-nginx-launch-nginx]
+7. [Verify That vLLM Servers Are Ready][nginxloadbalancer-nginx-verify-nginx]
 
-(nginxloadbalancer-nginx-build)=
+[](){ #nginxloadbalancer-nginx-build }
 
 ## Build Nginx Container
 
@@ -39,7 +40,7 @@ Build the container:
 docker build . -f Dockerfile.nginx --tag nginx-lb
 ```
 
-(nginxloadbalancer-nginx-conf)=
+[](){ #nginxloadbalancer-nginx-conf }
 
 ## Create Simple Nginx Config file
 
@@ -63,7 +64,7 @@ server {
 }
 ```
 
-(nginxloadbalancer-nginx-vllm-container)=
+[](){ #nginxloadbalancer-nginx-vllm-container }
 
 ## Build vLLM Container
 
@@ -76,10 +77,14 @@ If you are behind proxy, you can pass the proxy settings to the docker build com
 
 ```console
 cd $vllm_root
-docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
+docker build \
+    -f docker/Dockerfile . \
+    --tag vllm \
+    --build-arg http_proxy=$http_proxy \
+    --build-arg https_proxy=$https_proxy
 ```
 
-(nginxloadbalancer-nginx-docker-network)=
+[](){ #nginxloadbalancer-nginx-docker-network }
 
 ## Create Docker Network
 
@@ -87,7 +92,7 @@ docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_prox
 docker network create vllm_nginx
 ```
 
-(nginxloadbalancer-nginx-launch-container)=
+[](){ #nginxloadbalancer-nginx-launch-container }
 
 ## Launch vLLM Containers
 
@@ -101,23 +106,45 @@ Notes:
 ```console
 mkdir -p ~/.cache/huggingface/hub/
 hf_cache_dir=~/.cache/huggingface/
-docker run -itd --ipc host --network vllm_nginx --gpus device=0 --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf
-docker run -itd --ipc host --network vllm_nginx --gpus device=1 --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf
+docker run \
+    -itd \
+    --ipc host \
+    --network vllm_nginx \
+    --gpus device=0 \
+    --shm-size=10.24gb \
+    -v $hf_cache_dir:/root/.cache/huggingface/ \
+    -p 8081:8000 \
+    --name vllm0 vllm \
+    --model meta-llama/Llama-2-7b-chat-hf
+docker run \
+    -itd \
+    --ipc host \
+    --network vllm_nginx \
+    --gpus device=1 \
+    --shm-size=10.24gb \
+    -v $hf_cache_dir:/root/.cache/huggingface/ \
+    -p 8082:8000 \
+    --name vllm1 vllm \
+    --model meta-llama/Llama-2-7b-chat-hf
 ```
 
-:::{note}
-If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
-:::
+!!! note
+    If you are behind a proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
 
-(nginxloadbalancer-nginx-launch-nginx)=
+[](){ #nginxloadbalancer-nginx-launch-nginx }
 
 ## Launch Nginx
 
 ```console
-docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest
+docker run \
+    -itd \
+    -p 8000:80 \
+    --network vllm_nginx \
+    -v ./nginx_conf/:/etc/nginx/conf.d/ \
+    --name nginx-lb nginx-lb:latest
 ```
 
-(nginxloadbalancer-nginx-verify-nginx)=
+[](){ #nginxloadbalancer-nginx-verify-nginx }
 
 ## Verify That vLLM Servers Are Ready
 
diff --git a/docs/source/design/arch_overview.md b/docs/design/arch_overview.md
similarity index 81%
rename from docs/source/design/arch_overview.md
rename to docs/design/arch_overview.md
index 94bda8b5c58d..75d3e1b7ccc7 100644
--- a/docs/source/design/arch_overview.md
+++ b/docs/design/arch_overview.md
@@ -1,22 +1,18 @@
-(arch-overview)=
-
-# Architecture Overview
+---
+title: Architecture Overview
+---
+[](){ #arch-overview }
 
 This document provides an overview of the vLLM architecture.
 
-:::{contents} Table of Contents
-:depth: 2
-:local: true
-:::
+[TOC]
 
 ## Entrypoints
 
 vLLM provides a number of entrypoints for interacting with the system. The
 following diagram shows the relationship between them.
 
-:::{image} /assets/design/arch_overview/entrypoints.excalidraw.png
-:alt: Entrypoints Diagram
-:::
+![Entrypoints Diagram](../assets/design/arch_overview/entrypoints.excalidraw.png)
 
 ### LLM Class
 
@@ -77,16 +73,14 @@ python -m vllm.entrypoints.openai.api_server --model <model>
 
 That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
 
-More details on the API server can be found in the [OpenAI-Compatible Server](#openai-compatible-server) document.
+More details on the API server can be found in the [OpenAI-Compatible Server][openai-compatible-server] document.
 
 ## LLM Engine
 
 The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of
 the vLLM system, handling model inference and asynchronous request processing.
 
-:::{image} /assets/design/arch_overview/llm_engine.excalidraw.png
-:alt: LLMEngine Diagram
-:::
+![LLMEngine Diagram](../assets/design/arch_overview/llm_engine.excalidraw.png)
 
 ### LLMEngine
 
@@ -137,18 +131,16 @@ input tensors and capturing cudagraphs.
 ## Model
 
 Every model runner object has one model object, which is the actual
-`torch.nn.Module` instance. See [huggingface_integration](#huggingface-integration) for how various
+`torch.nn.Module` instance. See [huggingface_integration][huggingface-integration] for how various
 configurations affect the class we ultimately get.
 
 ## Class Hierarchy
 
 The following figure shows the class hierarchy of vLLM:
 
-> :::{figure} /assets/design/hierarchy.png
-> :align: center
-> :alt: query
-> :width: 100%
-> :::
+> <figure markdown="span">
+>   ![](../assets/design/hierarchy.png){ align="center" alt="query" width="100%" }
+> </figure>
 
 There are several important design choices behind this class hierarchy:
 
@@ -178,44 +170,43 @@ of a vision model and a language model. By making the constructor uniform, we
 can easily create a vision model and a language model and compose them into a
 vision-language model.
 
-:::{note}
-To support this change, all vLLM models' signatures have been updated to:
-
-```python
-def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-```
-
-To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
+!!! note
+    To support this change, all vLLM models' signatures have been updated to:
 
-```python
-class MyOldModel(nn.Module):
-    def __init__(
-        self,
-        config,
-        cache_config: Optional[CacheConfig] = None,
-        quant_config: Optional[QuantizationConfig] = None,
-        lora_config: Optional[LoRAConfig] = None,
-        prefix: str = "",
-    ) -> None:
-        ...
-
-from vllm.config import VllmConfig
-class MyNewModel(MyOldModel):
+    ```python
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
-        config = vllm_config.model_config.hf_config
-        cache_config = vllm_config.cache_config
-        quant_config = vllm_config.quant_config
-        lora_config = vllm_config.lora_config
-        super().__init__(config, cache_config, quant_config, lora_config, prefix)
-
-if __version__ >= "0.6.4":
-    MyModel = MyNewModel
-else:
-    MyModel = MyOldModel
-```
-
-This way, the model can work with both old and new versions of vLLM.
-:::
+    ```
+
+    To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
+
+    ```python
+    class MyOldModel(nn.Module):
+        def __init__(
+            self,
+            config,
+            cache_config: Optional[CacheConfig] = None,
+            quant_config: Optional[QuantizationConfig] = None,
+            lora_config: Optional[LoRAConfig] = None,
+            prefix: str = "",
+        ) -> None:
+            ...
+
+    from vllm.config import VllmConfig
+    class MyNewModel(MyOldModel):
+        def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+            config = vllm_config.model_config.hf_config
+            cache_config = vllm_config.cache_config
+            quant_config = vllm_config.quant_config
+            lora_config = vllm_config.lora_config
+            super().__init__(config, cache_config, quant_config, lora_config, prefix)
+
+    if __version__ >= "0.6.4":
+        MyModel = MyNewModel
+    else:
+        MyModel = MyOldModel
+    ```
+
+    This way, the model can work with both old and new versions of vLLM.
 
 3\. **Sharding and Quantization at Initialization**: Certain features require
 changing the model weights. For example, tensor parallelism needs to shard the
diff --git a/docs/source/design/automatic_prefix_caching.md b/docs/design/automatic_prefix_caching.md
similarity index 98%
rename from docs/source/design/automatic_prefix_caching.md
rename to docs/design/automatic_prefix_caching.md
index 3928e0c16568..80883bb1d90d 100644
--- a/docs/source/design/automatic_prefix_caching.md
+++ b/docs/design/automatic_prefix_caching.md
@@ -1,6 +1,7 @@
-(design-automatic-prefix-caching)=
-
-# Automatic Prefix Caching
+---
+title: Automatic Prefix Caching
+---
+[](){ #design-automatic-prefix-caching }
 
 The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.
 
diff --git a/docs/source/design/huggingface_integration.md b/docs/design/huggingface_integration.md
similarity index 64%
rename from docs/source/design/huggingface_integration.md
rename to docs/design/huggingface_integration.md
index 7d271b1cfb3a..2d462ccb6535 100644
--- a/docs/source/design/huggingface_integration.md
+++ b/docs/design/huggingface_integration.md
@@ -1,23 +1,22 @@
-(huggingface-integration)=
-
-# Integration with HuggingFace
+---
+title: Integration with HuggingFace
+---
+[](){ #huggingface-integration }
 
 This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`.
 
 Let's say we want to serve the popular QWen model by running `vllm serve Qwen/Qwen2-7B`.
 
 1. The `model` argument is `Qwen/Qwen2-7B`. vLLM determines whether this model exists by checking for the corresponding config file `config.json`. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182) for the implementation. Within this process:
-
-   - If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path.
-   - If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works.
-   - If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file.
+    - If the `model` argument corresponds to an existing local path, vLLM will load the config file directly from this path.
+    - If the `model` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the `model` argument as the model name and the `--revision` argument as the revision. See [their website](https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome) for more information on how the HuggingFace cache works.
+    - If the `model` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to [this function](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91) for the implementation. The input arguments include the `model` argument as the model name, the `--revision` argument as the revision, and the environment variable `HF_TOKEN` as the token to access the model hub. In our case, vLLM will download the [config.json](https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json) file.
 
 2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this [code snippet](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186) for the implementation.
 
 3. Next, vLLM [inspects](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189) the `model_type` field in the config dictionary to [generate](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L190-L216) the config object to use. There are some `model_type` values that vLLM directly supports; see [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48) for the list. If the `model_type` is not in the list, vLLM will use [AutoConfig.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained) to load the config class, with `model`, `--revision`, and `--trust_remote_code` as the arguments. Please note that:
-
-   - HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example.
-   - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled.
+    - HuggingFace also has its own logic to determine the config class to use. It will again use the `model_type` field to search for the class name in the transformers library; see [here](https://github.com/huggingface/transformers/tree/main/src/transformers/models) for the list of supported models. If the `model_type` is not found, HuggingFace will use the `auto_map` field from the config JSON file to determine the class name. Specifically, it is the `AutoConfig` field under `auto_map`. See [DeepSeek](https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json) for an example.
+    - The `AutoConfig` field under `auto_map` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the `from_pretrained` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when `--trust_remote_code` is enabled.
 
 4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see [here](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244) for the implementation.
 
@@ -28,8 +27,7 @@ Beyond that, there are two more things vLLM depends on HuggingFace for.
 1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using [AutoTokenizer.from_pretrained](https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained) with the `model` argument as the model name and the `--revision` argument as the revision. It is also possible to use a tokenizer from another model by specifying the `--tokenizer` argument in the `vllm serve` command. Other relevant arguments are `--tokenizer-revision` and `--tokenizer-mode`. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the [get_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87) function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in [get_cached_tokenizer](https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24).
 
 2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the `model` argument as the model name and the `--revision` argument as the revision. vLLM provides the argument `--load-format` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass `--load-format dummy` to skip downloading the weights.
-
-   - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that:
+    - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the [documentation](https://huggingface.co/docs/safetensors/en/index) for more information on the safetensors format. This part of the logic can be found [here](https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385). Please note that:
 
 This completes the integration between vLLM and HuggingFace.
 
diff --git a/docs/design/kernel/paged_attention.md b/docs/design/kernel/paged_attention.md
new file mode 100644
index 000000000000..6ebe1ee48acf
--- /dev/null
+++ b/docs/design/kernel/paged_attention.md
@@ -0,0 +1,498 @@
+---
+title: vLLM Paged Attention
+---
+[](){ #design-paged-attention }
+
+Currently, vLLM utilizes its own implementation of a multi-head query
+attention kernel (`csrc/attention/attention_kernels.cu`).
+This kernel is designed to be compatible with
+vLLM's paged KV caches, where the key and value cache are stored in
+separate blocks (note that this block concept differs from the GPU
+thread block. So in the rest of this document, I will refer to the vLLM
+paged attention block as "block", and to the GPU thread block as
+"thread block").
+
+To achieve high performance, this kernel relies on a specially
+designed memory layout and access method, specifically when threads
+read data from global memory to shared memory. The purpose of this
+document is to provide a high-level explanation of the kernel
+implementation step by step, aiding those who wish to learn about the
+vLLM multi-head query attention kernel. After going through this
+document, users will likely have a better understanding and find it
+easier to follow the actual implementation.
+
+Please note that this document may not cover all details, such as how
+to calculate the correct index for the corresponding data or the dot
+multiplication implementation. However, after reading this document
+and becoming familiar with the high-level logic flow, it should be
+easier for you to read the actual code and understand the details.
+
+## Inputs
+
+The kernel function takes a list of arguments for the current thread
+to perform its assigned work. The three most important arguments are
+the input pointers `q`, `k_cache`, and `v_cache`, which point
+to query, key, and value data on global memory that need to be read
+and processed. The output pointer `out` points to global memory
+where the result should be written. These four pointers actually
+refer to multi-dimensional arrays, but each thread only accesses the
+portion of data assigned to it. I have omitted all other runtime
+parameters here for simplicity.
+
+```cpp
+template<typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int NUM_THREADS, int PARTITION_SIZE = 0>
+__device__ void paged_attention_kernel(
+    ... // Other side args.
+    const scalar_t* __restrict__ out,       // [num_seqs, num_heads, max_num_partitions, head_size]
+    const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
+    const scalar_t* __restrict__ k_cache,   // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+    const scalar_t* __restrict__ v_cache,   // [num_blocks, num_kv_heads, head_size, block_size]
+    ... // Other side args.
+)
+```
+
+There is also a list of template arguments above the function
+signature that are determined during compilation time. `scalar_t`
+represents the data type of the query, key, and value data elements,
+such as FP16. `HEAD_SIZE` indicates the number of elements in each
+head. `BLOCK_SIZE` refers to the number of tokens in each block.
+`NUM_THREADS` denotes the number of threads in each thread block.
+`PARTITION_SIZE` represents the number of tensor parallel GPUs (For
+simplicity, we assume this is 0 and tensor parallel is disabled).
+
+With these arguments, we need to perform a sequence of preparations.
+This includes calculating the current head index, block index, and
+other necessary variables. However, for now, we can ignore these
+preparations and proceed directly to the actual calculations. It will
+be easier to understand them once we grasp the entire flow.
+
+## Concepts
+
+Just before we dive into the calculation flow, I want to describe a
+few concepts that are needed for later sections. However, you may
+skip this section and return later if you encounter any confusing
+terminologies.
+
+- **Sequence**: A sequence represents a client request. For example,
+  the data pointed to by `q` has a shape of
+  `[num_seqs, num_heads, head_size]`. This means that `q` points to a
+  total of `num_seqs` query sequences. Since this
+  kernel is a single query attention kernel, each sequence only has one
+  query token. Hence, the `num_seqs` equals the total number of tokens
+  that are processed in the batch.
+- **Context**: The context consists of the generated tokens from the
+  sequence. For instance, `["What", "is", "your"]` are the context
+  tokens, and the input query token is `"name"`. The model might
+  generate the token `"?"`.
+- **Vec**: The vec is a list of elements that are fetched and
+  calculated together. For query and key data, the vec size
+  (`VEC_SIZE`) is determined so that each thread group can fetch and
+  calculate 16 bytes of data at a time. For value data, the vec size
+  (`V_VEC_SIZE`) is determined so that each thread can fetch and
+  calculate 16 bytes of data at a time. For example, if the
+  `scalar_t` is FP16 (2 bytes) and `THREAD_GROUP_SIZE` is 2, the
+  `VEC_SIZE` will be 4, while the `V_VEC_SIZE` will be 8.
+- **Thread group**: The thread group is a small group of
+  threads (`THREAD_GROUP_SIZE`) that fetches and calculates one
+  query token and one key token at a time. Each thread handles only a
+  portion of the token data. The total number of elements processed by
+  one thread group is referred to as `x`. For example, if the thread
+  group contains 2 threads and the head size is 8, then thread 0
+  handles the query and key elements at index 0, 2, 4, 6, while thread
+  1 handles the elements at index 1, 3, 5, 7.
+- **Block**: The key and value cache data in vLLM are split into
+  blocks. Each block stores data for a fixed number(`BLOCK_SIZE`)
+  of tokens at one head. Each block may contain only a portion of the
+  whole context tokens. For example, if the block size is 16 and the
+  head size is 128, then for one head, one block can store 16 * 128 =
+  2048 elements.
+- **Warp**: A warp is a group of 32 threads (`WARP_SIZE`) that
+  execute simultaneously on a streaming multiprocessor (SM). In this
+  kernel, each warp processes the calculation between one query token
+  and key tokens of one entire block at a time (it may process multiple
+  blocks in multiple iterations). For example, if there are 4 warps and
+  6 blocks for one context, the assignment would be like warp 0 handles
+  the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2
+  handles the 2nd block and warp 3 handles the 3rd block.
+- **Thread block**: A thread block is a group of
+  threads(`NUM_THREADS`) that can access the same shared memory.
+  Each thread block contains multiple warps(`NUM_WARPS`), and in
+  this kernel, each thread block processes the calculation between one
+  query token and key tokens of a whole context.
+- **Grid**: A grid is a collection of thread blocks and defines the
+  shape of the collection. In this kernel, the shape is
+  `(num_heads, num_seqs, max_num_partitions)`. Therefore, each thread
+  block only handles the calculation for one head, one sequence, and
+  one partition.
+
+## Query
+
+This section will introduce how query data is stored in memory and
+fetched by each thread. As mentioned above, each thread group fetches
+one query token data, while each thread itself only handles a part of
+one query token data. Within each warp, every thread group will fetch
+the same query token data, but will multiply it with different key
+token data.
+
+```cpp
+const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
+```
+
+<figure markdown="span">
+  ![](../../assets/kernel/query.png){ align="center" alt="query" width="70%" }
+</figure>
+
+Each thread defines its own `q_ptr` which points to the assigned
+query token data on global memory. For example, if `VEC_SIZE` is 4
+and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
+a total of 128 elements divided into 128 / 4 = 32 vecs.
+
+<figure markdown="span">
+  ![](../../assets/kernel/q_vecs.png){ align="center" alt="q_vecs" width="70%" }
+</figure>
+
+```cpp
+__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
+```
+
+Next, we need to read the global memory data pointed to by `q_ptr`
+into shared memory as `q_vecs`. It is important to note that the
+vecs of each thread are assigned to a different row. For example, if the
+`THREAD_GROUP_SIZE` is 2, thread 0 will handle the 0th row vecs,
+while thread 1 handles the 1st row vecs. By reading the query data in
+this way, neighboring threads like thread 0 and thread 1 can read
+neighbor memory, achieving the memory coalescing to improve
+performance.
+
+## Key
+
+Similar to the "Query" section, this section introduces memory layout
+and assignment for keys. While each thread group only handles one
+query token per kernel run, it may handle multiple key tokens across
+multiple iterations. Meanwhile, each warp will process multiple blocks
+of key tokens in multiple iterations, ensuring that all context
+tokens are processed by the entire thread group after the kernel run.
+In this context, "handle" refers to performing the dot multiplication
+between query data and key data.
+
+```cpp
+const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride
+                    + kv_head_idx * kv_head_stride
+                    + physical_block_offset * x;
+```
+
+Unlike `q_ptr`, `k_ptr` in each thread will point to a different
+key token at different iterations. As shown above, `k_ptr`
+points to key token data based on `k_cache` at assigned block,
+assigned head and assigned token.
+
+<figure markdown="span">
+  ![](../../assets/kernel/key.png){ align="center" alt="key" width="70%" }
+</figure>
+
+The diagram above illustrates the memory layout for key data. It
+assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
+8, `THREAD_GROUP_SIZE` is 2, and there are a total of 4 warps. Each
+rectangle represents all the elements for one key token at one head,
+which will be processed by one thread group. The left half shows the
+total 16 blocks of key token data for warp 0, while the right half
+represents the remaining key token data for other warps or
+iterations. Inside each rectangle, there are a total of 32 vecs (128
+elements for one token) that will be processed by 2 threads (one
+thread group) separately.
+
+<figure markdown="span">
+  ![](../../assets/kernel/k_vecs.png){ align="center" alt="k_vecs" width="70%" }
+</figure>
+
+```cpp
+K_vec k_vecs[NUM_VECS_PER_THREAD]
+```
+
+Next, we need to read the key token data from `k_ptr` and store
+them on register memory as `k_vecs`. We use register memory for
+`k_vecs` because it will only be accessed by one thread once,
+whereas `q_vecs` will be accessed by multiple threads multiple
+times. Each `k_vecs` will contain multiple vectors for later
+calculation. Each vec will be set at each inner iteration. The
+assignment of vecs allows neighboring threads in a warp to read
+neighboring memory together, which again promotes the memory
+coalescing. For instance, thread 0 will read vec 0, while thread 1
+will read vec 1. In the next inner loop, thread 0 will read vec 2,
+while thread 1 will read vec 3, and so on.
+
+You may still be a little confused about the overall flow. Don't
+worry, please keep reading the next "QK" section. It will illustrate
+the query and key calculation flow in a clearer and higher-level
+manner.
+
+## QK
+
+As shown in the pseudocode below, before the entire for loop block, we
+fetch the query data for one token and store it in `q_vecs`. Then,
+in the outer for loop, we iterate through different `k_ptrs` that
+point to different tokens and prepare the `k_vecs` in the inner for
+loop. Finally, we perform the dot multiplication between the
+`q_vecs` and each `k_vecs`.
+
+```cpp
+q_vecs = ...
+for ... {
+    k_ptr = ...
+    for ... {
+        k_vecs[i] = ...
+    }
+    ...
+    float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
+}
+```
+
+As mentioned before, for each thread, it only fetches part of the
+query and key token data at a time. However, a cross-thread-group
+reduction happens inside `Qk_dot<>::dot`. So the `qk`
+returned here is not just the dot product of part of the query and key
+token data, but actually the full result over the entire query and
+key token data.
+
+For example, if the value of `HEAD_SIZE` is 128 and
+`THREAD_GROUP_SIZE` is 2, each thread's `k_vecs` will contain
+total 64 elements. However, the returned `qk` is actually the
+result of dot multiplication between 128 query elements and 128 key
+elements. If you want to learn more about the details of the dot
+multiplication and reduction, you may refer to the implementation of
+`Qk_dot<>::dot`. However, for the sake of simplicity, I will not
+cover it in this document.
+
+## Softmax
+
+Next, we need to calculate the normalized softmax for all `qk`s,
+as shown below, where each $x$ represents a `qk`. To do this,
+we must obtain the reduced value of `qk_max`($m(x)$) and
+the `exp_sum`($\ell(x)$) of all `qk`s. The reduction
+should be performed across the entire thread block, encompassing
+results between the query token and all context key tokens.
+
+$$
+\begin{gather*}
+m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\
+\quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
+\end{gather*}
+$$
+
+### `qk_max` and `logits`
+
+Just right after we get the `qk` result, we can set the temporary
+`logits` result with `qk` (In the end, the `logits` should
+store the normalized softmax result). Also we can compare and collect
+the `qk_max` for all `qk`s that are calculated by current
+thread group.
+
+```cpp
+if (thread_group_offset == 0) {
+    const bool mask = token_idx >= context_len;
+    logits[token_idx - start_token_idx] = mask ? 0.f : qk;
+    qk_max = mask ? qk_max : fmaxf(qk_max, qk);
+}
+```
+
+Please note that the `logits` here is in shared memory, so each
+thread group will set the fields for its own assigned context tokens.
+Overall, the size of `logits` should be the number of context tokens.
+
+```cpp
+for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
+    qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
+}
+
+if (lane == 0) {
+    red_smem[warp_idx] = qk_max;
+}
+```
+
+Then we need to get the reduced `qk_max` within each warp. The main
+idea is to make the threads in a warp communicate with each other
+and get the final max `qk`.
+
+```cpp
+for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
+    qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
+}
+qk_max = VLLM_SHFL_SYNC(qk_max, 0);
+```
+
+Finally, we can get the reduced `qk_max` of the whole thread block by
+comparing the `qk_max` from all warps in this thread block. Then we
+need to broadcast the final result to each thread.
+
+### `exp_sum`
+
+Similar to `qk_max`, we need to get the reduced sum value from the
+entire thread block too.
+
+```cpp
+for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+    float val = __expf(logits[i] - qk_max);
+    logits[i] = val;
+    exp_sum += val;
+}
+...
+exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum);
+```
+
+First, sum all exp values from each thread group, and meanwhile,
+convert each entry of `logits` from `qk` to `exp(qk - qk_max)`.
+Please note, the `qk_max` here is already the max `qk` across the
+whole thread block. Then we can do the reduction for `exp_sum`
+across the whole thread block, just like for `qk_max`.
+
+```cpp
+const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
+for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
+    logits[i] *= inv_sum;
+}
+```
+
+Finally, with the reduced `qk_max` and `exp_sum`, we can obtain
+the final normalized softmax result as `logits`. This `logits`
+variable will be used for dot multiplication with the value data in
+later steps. Now, it should store the normalized softmax result of
+`qk` for all assigned context tokens.
+
+## Value
+
+<figure markdown="span">
+  ![](../../assets/kernel/value.png){ align="center" alt="value" width="70%" }
+</figure>
+
+<figure markdown="span">
+  ![](../../assets/kernel/logits_vec.png){ align="center" alt="logits_vec" width="50%" }
+</figure>
+
+<figure markdown="span">
+  ![](../../assets/kernel/v_vec.png){ align="center" alt="v_vec" width="70%" }
+</figure>
+
+Now we need to retrieve the value data and perform dot multiplication
+with `logits`. Unlike query and key, there is no thread group
+concept for value data. As shown in the diagram, unlike the key token
+memory layout, elements from the same column correspond to the same
+value token. For one block of value data, there are `HEAD_SIZE`
+rows and `BLOCK_SIZE` columns that are split into multiple
+`v_vecs`.
+
+Each thread always fetches `V_VEC_SIZE` elements from the same
+`V_VEC_SIZE` tokens at a time. As a result, a single thread
+retrieves multiple `v_vec`s from different rows and the same
+columns through multiple inner iterations. Each `v_vec`
+needs to be dot multiplied with the corresponding `logits_vec`,
+which is also `V_VEC_SIZE` elements from `logits`. Overall, with
+multiple inner iterations, each warp will process one block of value
+tokens. And with multiple outer iterations, the whole context value
+tokens are processed.
+
+```cpp
+float accs[NUM_ROWS_PER_THREAD];
+for ... { // Iteration over different blocks.
+    logits_vec = ...
+    for ... { // Iteration over different rows.
+        v_vec = ...
+        ...
+        accs[i] += dot(logits_vec, v_vec);
+    }
+}
+```
+
+As shown in the above pseudo code, in the outer loop, similar to
+`k_ptr`, `logits_vec` iterates over different blocks and reads
+`V_VEC_SIZE` elements from `logits`. In the inner loop, each
+thread reads `V_VEC_SIZE` elements from the same tokens as a
+`v_vec` and performs dot multiplication. It is important to note
+that in each inner iteration, the thread fetches different head
+position elements for the same tokens. The dot result is then
+accumulated in `accs`. Therefore, each entry of `accs` is mapped
+to a head position assigned to the current thread.
+
+For example, if `BLOCK_SIZE` is 16 and `V_VEC_SIZE` is 8, each
+thread fetches 8 value elements for 8 tokens at a time. Each element
+is from a different token at the same head position. If `HEAD_SIZE`
+is 128 and `WARP_SIZE` is 32, for each inner loop, a warp needs to
+fetch `WARP_SIZE * V_VEC_SIZE = 256` elements. This means there are
+a total of 128 * 16 / 256 = 8 inner iterations for a warp to handle
+a whole block of value tokens. And each `accs` in each thread
+contains 8 elements that are accumulated at 8 different head positions.
+For thread 0, the `accs` variable will have 8 elements, which
+are the 0th, 32nd, …, 224th elements of a value head that are accumulated
+from all 8 assigned tokens.
+
+## LV
+
+Now, we need to perform reduction for `accs` within each warp. This
+process allows each thread to accumulate the `accs` for the
+assigned head positions of all tokens in one block.
+
+```cpp
+for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+    float acc = accs[i];
+    for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
+        acc += VLLM_SHFL_XOR_SYNC(acc, mask);
+    }
+    accs[i] = acc;
+}
+```
+
+Next, we perform reduction for `accs` across all warps, allowing
+each thread to have the accumulation of `accs` for the assigned
+head positions of all context tokens. Please note that each `accs`
+in every thread only stores the accumulation for a portion of
+elements of the entire head for all context tokens. However, overall,
+all results for output have been calculated but are just stored in
+different thread register memory.
+
+```cpp
+float* out_smem = reinterpret_cast<float*>(shared_mem);
+for (int i = NUM_WARPS; i > 1; i /= 2) {
+    // Upper warps write to shared memory.
+    ...
+    float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
+    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+        ...
+        dst[row_idx] = accs[i];
+    }
+
+    // Lower warps update the output.
+    const float* src = &out_smem[warp_idx * HEAD_SIZE];
+    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+        ...
+        accs[i] += src[row_idx];
+    }
+
+    // Write out the accs.
+}
+```
+
+## Output
+
+Now we can write all of the calculated results from local register memory
+to the final output global memory.
+
+```cpp
+scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
+                + head_idx * max_num_partitions * HEAD_SIZE
+                + partition_idx * HEAD_SIZE;
+```
+
+First, we need to define the `out_ptr` variable, which points to
+the start address of the assigned sequence and assigned head.
+
+```cpp
+for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
+    const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
+    if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
+        from_float(*(out_ptr + row_idx), accs[i]);
+    }
+}
+```
+
+Finally, we need to iterate over different assigned head positions
+and write out the corresponding accumulated result based on the
+`out_ptr`.
diff --git a/docs/source/design/mm_processing.md b/docs/design/mm_processing.md
similarity index 61%
rename from docs/source/design/mm_processing.md
rename to docs/design/mm_processing.md
index dc92a3c2c511..f3685ce76a4b 100644
--- a/docs/source/design/mm_processing.md
+++ b/docs/design/mm_processing.md
@@ -1,10 +1,11 @@
-(mm-processing)=
+---
+title: Multi-Modal Data Processing
+---
+[](){ #mm-processing }
 
-# Multi-Modal Data Processing
+To enable various optimizations in vLLM such as [chunked prefill][chunked-prefill] and [prefix caching][automatic-prefix-caching], we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
 
-To enable various optimizations in vLLM such as [chunked prefill](#chunked-prefill) and [prefix caching](#automatic-prefix-caching), we use {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
-
-Here are the main features of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`:
+Here are the main features of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]:
 
 ## Prompt Update Detection
 
@@ -15,7 +16,7 @@ One of the main responsibilities of HF processor is to update the prompt with pl
 
 The information about which tokens have been updated is key to finding the correspondence between placeholder feature tokens and multi-modal inputs.
 
-In vLLM, this information is specified using {class}`~vllm.multimodal.processing.PromptUpdate` in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`. We can automatically detect whether HF has updated the prompt by checking the existence of the updated tokens.
+In vLLM, this information is specified using [PromptUpdate][vllm.multimodal.processing.PromptUpdate] in [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates]. We can automatically detect whether HF has updated the prompt by checking the existence of the updated tokens.
 
 ## Tokenized Prompt Inputs
 
@@ -43,22 +44,22 @@ While HF processors support text + multi-modal inputs natively, this is not so f
 
 Moreover, since the tokenized text has not passed through the HF processor, we have to apply Step 3 by ourselves to keep the output tokens and multi-modal data consistent with each other.
 
-(mm-dummy-text)=
+[](){ #mm-dummy-text }
 
 ### Dummy text
 
-We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text`. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
+We work around the first issue by requiring each model to define how to generate dummy text based on the number of multi-modal inputs, via [get_dummy_text][vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text]. This lets us generate dummy text corresponding to the multi-modal inputs and input them together to obtain the processed multi-modal data.
 
-(mm-automatic-prompt-updating)=
+[](){ #mm-automatic-prompt-updating }
 
 ### Automatic prompt updating
 
 We address the second issue by implementing model-agnostic code in
-{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_updates` to automatically update the prompt with feature placeholder tokens based on the specification outputted by {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`.
+[_apply_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._apply_prompt_updates] to automatically update the prompt with feature placeholder tokens based on the specification outputted by [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates].
 
 ### Summary
 
-With the help of dummy text and automatic prompt updating, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main`.
+With the help of dummy text and automatic prompt updating, our multi-modal processor can finally accept both text and token prompts with multi-modal data. The detailed logic is shown in [_apply_hf_processor_main][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_main].
 
 ## Processor Output Caching
 
@@ -66,4 +67,4 @@ Some HF processors, such as the one for Qwen2-VL, are [very slow](gh-issue:9238)
 
 When new data is passed in, we first check which items are in the cache, and which ones are missing. The missing items are passed into the HF processor in a single batch and cached, before being merged with the existing items in the cache.
 
-Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text](#mm-dummy-text) to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating](#mm-automatic-prompt-updating) afterwards to keep the output tokens and multi-modal data consistent with each other.
+Since we only process the missing multi-modal data items, the number of input placeholder tokens no longer corresponds to the number of the multi-modal inputs, so they can't be passed alongside the text prompt to HF processor. Therefore, we process the text and multi-modal inputs separately, using [dummy text][mm-dummy-text] to avoid HF errors. Since this skips HF's prompt updating code, we apply [automatic prompt updating][mm-automatic-prompt-updating] afterwards to keep the output tokens and multi-modal data consistent with each other.
diff --git a/docs/source/design/multiprocessing.md b/docs/design/multiprocessing.md
similarity index 96%
rename from docs/source/design/multiprocessing.md
rename to docs/design/multiprocessing.md
index 43fe5fe2e5e9..412c42fd580e 100644
--- a/docs/source/design/multiprocessing.md
+++ b/docs/design/multiprocessing.md
@@ -2,14 +2,13 @@
 
 ## Debugging
 
-Please see the [Troubleshooting](#troubleshooting-python-multiprocessing)
+Please see the [Troubleshooting][troubleshooting-python-multiprocessing]
 page for information on known issues and how to solve them.
 
 ## Introduction
 
-:::{important}
-The source code references are to the state of the code at the time of writing in December, 2024.
-:::
+!!! warning
+    The source code references are to the state of the code at the time of writing in December, 2024.
 
 The use of Python multiprocessing in vLLM is complicated by:
 
@@ -124,7 +123,7 @@ what is happening. First, a log message from vLLM:
 WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
     initialized. We must use the `spawn` multiprocessing start method. Setting
     VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
-    https://docs.vllm.ai/en/latest/getting_started/debugging.html#python-multiprocessing
+    https://docs.vllm.ai/en/latest/usage/debugging.html#python-multiprocessing
     for more information.
 ```
 
diff --git a/docs/source/design/plugin_system.md b/docs/design/plugin_system.md
similarity index 83%
rename from docs/source/design/plugin_system.md
rename to docs/design/plugin_system.md
index 225030885f62..0764dfb6501b 100644
--- a/docs/source/design/plugin_system.md
+++ b/docs/design/plugin_system.md
@@ -1,12 +1,13 @@
-(plugin-system)=
-
-# vLLM's Plugin System
+---
+title: vLLM's Plugin System
+---
+[](){ #plugin-system }
 
 The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM.
 
 ## How Plugins Work in vLLM
 
-Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [](#arch-overview)), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work.
+Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [Arch Overview][arch-overview]), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work.
 
 ## How vLLM Discovers Plugins
 
@@ -29,8 +30,10 @@ def register():
     from vllm import ModelRegistry
 
     if "MyLlava" not in ModelRegistry.get_supported_archs():
-        ModelRegistry.register_model("MyLlava",
-                                        "vllm_add_dummy_model.my_llava:MyLlava")
+        ModelRegistry.register_model(
+            "MyLlava",
+            "vllm_add_dummy_model.my_llava:MyLlava",
+        )
 ```
 
 For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
diff --git a/docs/source/design/v1/metrics.md b/docs/design/v1/metrics.md
similarity index 97%
rename from docs/source/design/v1/metrics.md
rename to docs/design/v1/metrics.md
index de8022655372..7156ee9dd3ec 100644
--- a/docs/source/design/v1/metrics.md
+++ b/docs/design/v1/metrics.md
@@ -57,11 +57,11 @@ In v0, the following metrics are exposed via a Prometheus-compatible `/metrics`
 - `vllm:spec_decode_num_draft_tokens_total` (Counter)
 - `vllm:spec_decode_num_emitted_tokens_total` (Counter)
 
-These are documented under [Inferencing and Serving -> Production Metrics](project:../../serving/metrics.md).
+These are documented under [Inferencing and Serving -> Production Metrics](../../usage/metrics.md).
 
 ### Grafana Dashboard
 
-vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/getting_started/examples/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
+vLLM also provides [a reference example](https://docs.vllm.ai/en/latest/examples/prometheus_grafana.html) for how to collect and store these metrics using Prometheus and visualize them using a Grafana dashboard.
 
 The subset of metrics exposed in the Grafana dashboard gives us an indication of which metrics are especially important:
 
@@ -222,9 +222,7 @@ And the calculated intervals are:
 
 Put another way:
 
-:::{image} /assets/design/v1/metrics/intervals-1.png
-:alt: Interval calculations - common case
-:::
+![Interval calculations - common case](../../assets/design/v1/metrics/intervals-1.png)
 
 We explored the possibility of having the frontend calculate these
 intervals using the timing of events visible by the frontend. However,
@@ -239,17 +237,13 @@ When a preemption occurs during decode, since any already generated
 tokens are reused, we consider the preemption as affecting the
 inter-token, decode, and inference intervals.
 
-:::{image} /assets/design/v1/metrics/intervals-2.png
-:alt: Interval calculations - preempted decode
-:::
+![Interval calculations - preempted decode](../../assets/design/v1/metrics/intervals-2.png)
 
 When a preemption occurs during prefill (assuming such an event
 is possible), we consider the preemption as affecting the
 time-to-first-token and prefill intervals.
 
-:::{image} /assets/design/v1/metrics/intervals-3.png
-:alt: Interval calculations - preempted prefill
-:::
+![Interval calculations - preempted prefill](../../assets/design/v1/metrics/intervals-3.png)
 
 ### Frontend Stats Collection
 
@@ -467,7 +461,7 @@ In general:
    hatch](https://kubernetes.io/docs/concepts/cluster-administration/system-metrics/#show-hidden-metrics)
    for some time before deleting them.
 
-See the [deprecation policy](project:../../contributing/deprecation_policy.md) for
+See the [deprecation policy](../../contributing/deprecation_policy.md) for
 the project-wide deprecation policy.
 
 ### Unimplemented - `vllm:tokens_total`
@@ -679,7 +673,7 @@ v0 has support for OpenTelemetry tracing:
 - [OpenTelemetry blog
   post](https://opentelemetry.io/blog/2024/llm-observability/)
 - [User-facing
-  docs](https://docs.vllm.ai/en/latest/getting_started/examples/opentelemetry.html)
+  docs](https://docs.vllm.ai/en/latest/examples/opentelemetry.html)
 - [Blog
   post](https://medium.com/@ronen.schaffer/follow-the-trail-supercharging-vllm-with-opentelemetry-distributed-tracing-aa655229b46f)
 - [IBM product
diff --git a/docs/source/design/v1/prefix_caching.md b/docs/design/v1/prefix_caching.md
similarity index 94%
rename from docs/source/design/v1/prefix_caching.md
rename to docs/design/v1/prefix_caching.md
index 0f7475777797..ad041b0059f5 100644
--- a/docs/source/design/v1/prefix_caching.md
+++ b/docs/design/v1/prefix_caching.md
@@ -122,9 +122,7 @@ There are two design points to highlight:
 
 As a result, we will have the following components when the KV cache manager is initialized:
 
-:::{image} /assets/design/v1/prefix_caching/overview.png
-:alt: Component Overview
-:::
+![Component Overview](../../assets/design/v1/prefix_caching/overview.png)
 
 * Block Pool: A list of KVCacheBlock.  
 * Free Block Queue: Only store the pointers of head and tail blocks for manipulations.  
@@ -194,9 +192,7 @@ As can be seen, block 3 is a new full block and is cached. However, it is redund
 
 When a request is finished, we free all its blocks if no other requests are using them (reference count = 0). In this example, we free request 1 and block 2, 3, 4, 8 associated with it. We can see that the freed blocks are added to the tail of the free queue in the *reverse* order. This is because the last block of a request must hash more tokens and is less likely to be reused by other requests. As a result, it should be evicted first.
 
-:::{image} /assets/design/v1/prefix_caching/free.png
-:alt: Free Queue after Free a Request
-:::
+![Free queue after a request is freed](../../assets/design/v1/prefix_caching/free.png)
 
 ### Eviction (LRU)
 
@@ -212,36 +208,24 @@ In this example, we assume the block size is 4 (each block can cache 4 tokens),
 
 **Time 1: The cache is empty and a new request comes in.** We allocate 4 blocks. 3 of them are already full and cached. The fourth block is partially full with 3 of 4 tokens.
 
-:::{image} /assets/design/v1/prefix_caching/example-time-1.png
-:alt: Example Time 1
-:::
+![Example Time 1](../../assets/design/v1/prefix_caching/example-time-1.png)
 
 **Time 3: Request 0 makes the block 3 full and asks for a new block to keep decoding.** We cache block 3 and allocate block 4.
 
-:::{image} /assets/design/v1/prefix_caching/example-time-3.png
-:alt: Example Time 3
-:::
+![Example Time 3](../../assets/design/v1/prefix_caching/example-time-3.png)
 
 **Time 4: Request 1 comes in with the 14 prompt tokens, where the first 10 tokens are the same as request 0.** We can see that only the first 2 blocks (8 tokens) hit the cache, because the 3rd block only matches 2 of 4 tokens.
 
-:::{image} /assets/design/v1/prefix_caching/example-time-4.png
-:alt: Example Time 4
-:::
+![Example Time 4](../../assets/design/v1/prefix_caching/example-time-4.png)
 
 **Time 5: Request 0 is finished and free.** Blocks 2, 3 and 4 are added to the free queue in the reverse order (but block 2 and 3 are still cached). Block 0 and 1 are not added to the free queue because they are being used by Request 1.
 
-:::{image} /assets/design/v1/prefix_caching/example-time-5.png
-:alt: Example Time 5
-:::
+![Example Time 5](../../assets/design/v1/prefix_caching/example-time-5.png)
 
 **Time 6: Request 1 is finished and free.**
 
-:::{image} /assets/design/v1/prefix_caching/example-time-6.png
-:alt: Example Time 6
-:::
+![Example Time 6](../../assets/design/v1/prefix_caching/example-time-6.png)
 
 **Time 7: Request 2 comes in with the 29 prompt tokens, where the first 12 tokens are the same as request 0\.** Note that even the block order in the free queue was `7 - 8 - 9 - 4 - 3 - 2 - 6 - 5 - 1 - 0`, the cache hit blocks (i.e., 0, 1, 2) are touched and removed from the queue before allocation, so the free queue becomes `7 - 8 - 9 - 4 - 3 - 6 - 5`. As a result, the allocated blocks are 0 (cached), 1 (cached), 2 (cached), 7, 8, 9, 4, 3 (evicted).
 
-:::{image} /assets/design/v1/prefix_caching/example-time-7.png
-:alt: Example Time 7
-:::
+![Example Time 7](../../assets/design/v1/prefix_caching/example-time-7.png)
diff --git a/docs/source/design/v1/torch_compile.md b/docs/design/v1/torch_compile.md
similarity index 98%
rename from docs/source/design/v1/torch_compile.md
rename to docs/design/v1/torch_compile.md
index 4d8ce0fd9227..64b6f0cc0a9b 100644
--- a/docs/source/design/v1/torch_compile.md
+++ b/docs/design/v1/torch_compile.md
@@ -99,7 +99,9 @@ This time, Inductor compilation is completely bypassed, and we will load from di
 
 The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example:
 
-`vllm serve meta-llama/Llama-3.2-1B --compilation_config "{'compile_sizes': [1, 2, 4, 8]}"`
+```
+vllm serve meta-llama/Llama-3.2-1B --compilation-config '{"compile_sizes": [1, 2, 4, 8]}'
+```
 
 Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel.
 
@@ -134,12 +136,14 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh
 
 By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`:
 
-`vllm serve meta-llama/Llama-3.2-1B --compilation-config "{'cudagraph_capture_sizes': [1, 2, 4, 8]}"`
+```
+vllm serve meta-llama/Llama-3.2-1B --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
+```
 
 Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture.
 
 ### Full Cudagraph capture
 
-It is possible to include attention as part of the cudagraph if using an attention backend that is cudagraph compatible. This can improve performance in some cases such as decode speed for smaller models. Enable this using `--compilation-config "{'full_cuda_graph': True}"`
+It is possible to include attention as part of the cudagraph if using an attention backend that is cudagraph compatible. This can improve performance in some cases such as decode speed for smaller models. Enable this using `--compilation-config '{"full_cuda_graph": true}'`.
 
 Currently only FlashAttention 3 is compatible, and only when cascade attention is disabled.
diff --git a/docs/features/automatic_prefix_caching.md b/docs/features/automatic_prefix_caching.md
new file mode 100644
index 000000000000..5e92796ddda7
--- /dev/null
+++ b/docs/features/automatic_prefix_caching.md
@@ -0,0 +1,28 @@
+---
+title: Automatic Prefix Caching
+---
+[](){ #automatic-prefix-caching }
+
+## Introduction
+
+Automatic Prefix Caching (APC for short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
+
+!!! note
+    Technical details on how vLLM implements APC can be found [here][design-automatic-prefix-caching].
+
+## Enabling APC in vLLM
+
+Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example:
+
+<gh-file:examples/offline_inference/automatic_prefix_caching.py>
+
+## Example workloads
+
+We describe two example workloads where APC can provide a huge performance benefit:
+
+- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency.
+- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency.
+
+## Limits
+
+APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. when the length of the answer is long), or new queries do not share the same prefix with any of the existing queries (so that the computation cannot be reused).
diff --git a/docs/features/compatibility_matrix.md b/docs/features/compatibility_matrix.md
new file mode 100644
index 000000000000..77ceea49f173
--- /dev/null
+++ b/docs/features/compatibility_matrix.md
@@ -0,0 +1,77 @@
+---
+title: Compatibility Matrix
+---
+[](){ #compatibility-matrix }
+
+The tables below show mutually exclusive features and the support on some hardware.
+
+The symbols used have the following meanings:
+
+- ✅ = Full compatibility
+- 🟠 = Partial compatibility
+- ❌ = No compatibility
+
+!!! note
+    Check the ❌ or 🟠 with links to see the tracking issue for the unsupported feature/hardware combination.
+
+## Feature x Feature
+
+<style>
+td:not(:first-child) {
+  text-align: center !important;
+}
+td {
+  padding: 0.5rem !important;
+  white-space: nowrap;
+}
+
+th {
+  padding: 0.5rem !important;
+  min-width: 0 !important;
+}
+
+th:not(:first-child) {
+  writing-mode: vertical-lr;
+  transform: rotate(180deg)
+}
+</style>
+
+| Feature                                                   | [CP][chunked-prefill]   | [APC][automatic-prefix-caching]   | [LoRA][lora-adapter]   | <abbr title="Prompt Adapter">prmpt adptr</abbr>   | [SD][spec-decode]   | CUDA graph   | <abbr title="Pooling Models">pooling</abbr>   | <abbr title="Encoder-Decoder Models">enc-dec</abbr>   | <abbr title="Logprobs">logP</abbr>   | <abbr title="Prompt Logprobs">prmpt logP</abbr>   | <abbr title="Async Output Processing">async output</abbr>   | multi-step         | <abbr title="Multimodal Inputs">mm</abbr>   | best-of   | beam-search   |
+|-----------------------------------------------------------|-------------------------|-----------------------------------|------------------------|---------------------------------------------------|---------------------|--------------|-----------------------------------------------|-------------------------------------------------------|--------------------------------------|---------------------------------------------------|-------------------------------------------------------------|--------------------|---------------------------------------------|-----------|---------------|
+| [CP][chunked-prefill]                                     | ✅                       |                                   |                        |                                                   |                     |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
+| [APC][automatic-prefix-caching]                           | ✅                       | ✅                                 |                        |                                                   |                     |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
+| [LoRA][lora-adapter]                                      | ✅                       | ✅                                 | ✅                      |                                                   |                     |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
+| <abbr title="Prompt Adapter">prmpt adptr</abbr>           | ✅                       | ✅                                 | ✅                      | ✅                                                 |                     |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
+| [SD][spec-decode]                                         | ✅                       | ✅                                 | ❌                      | ✅                                                 | ✅                   |              |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
+| CUDA graph                                                | ✅                       | ✅                                 | ✅                      | ✅                                                 | ✅                   | ✅            |                                               |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
+| <abbr title="Pooling Models">pooling</abbr>               | ❌                       | ❌                                 | ❌                      | ❌                                                 | ❌                   | ❌            | ✅                                             |                                                       |                                      |                                                   |                                                             |                    |                                             |           |               |
+| <abbr title="Encoder-Decoder Models">enc-dec</abbr>       | ❌                       | [❌](gh-issue:7366)                | ❌                      | ❌                                                 | [❌](gh-issue:7366)  | ✅            | ✅                                             | ✅                                                     |                                      |                                                   |                                                             |                    |                                             |           |               |
+| <abbr title="Logprobs">logP</abbr>                        | ✅                       | ✅                                 | ✅                      | ✅                                                 | ✅                   | ✅            | ❌                                             | ✅                                                     | ✅                                    |                                                   |                                                             |                    |                                             |           |               |
+| <abbr title="Prompt Logprobs">prmpt logP</abbr>           | ✅                       | ✅                                 | ✅                      | ✅                                                 | ✅                   | ✅            | ❌                                             | ✅                                                     | ✅                                    | ✅                                                 |                                                             |                    |                                             |           |               |
+| <abbr title="Async Output Processing">async output</abbr> | ✅                       | ✅                                 | ✅                      | ✅                                                 | ❌                   | ✅            | ❌                                             | ❌                                                     | ✅                                    | ✅                                                 | ✅                                                           |                    |                                             |           |               |
+| multi-step                                                | ❌                       | ✅                                 | ❌                      | ✅                                                 | ❌                   | ✅            | ❌                                             | ❌                                                     | ✅                                    | ✅                                                 | ✅                                                           | ✅                  |                                             |           |               |
+| <abbr title="Multimodal Inputs">mm</abbr>                 | ✅                       | [🟠](gh-pr:8348)                   | [🟠](gh-pr:4194)        | ❔                                                 | ❔                   | ✅            | ✅                                             | ✅                                                     | ✅                                    | ✅                                                 | ✅                                                           | ❔                  | ✅                                           |           |               |
+| best-of                                                   | ✅                       | ✅                                 | ✅                      | ✅                                                 | [❌](gh-issue:6137)  | ✅            | ❌                                             | ✅                                                     | ✅                                    | ✅                                                 | ❔                                                           | [❌](gh-issue:7968) | ✅                                           | ✅         |               |
+| beam-search                                               | ✅                       | ✅                                 | ✅                      | ✅                                                 | [❌](gh-issue:6137)  | ✅            | ❌                                             | ✅                                                     | ✅                                    | ✅                                                 | ❔                                                           | [❌](gh-issue:7968) | ❔                                           | ✅         | ✅             |
+
+[](){ #feature-x-hardware }
+
+## Feature x Hardware
+
+| Feature                                                   | Volta              | Turing   | Ampere   | Ada   | Hopper   | CPU                | AMD   |
+|-----------------------------------------------------------|--------------------|----------|----------|-------|----------|--------------------|-------|
+| [CP][chunked-prefill]                                     | [❌](gh-issue:2729) | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     |
+| [APC][automatic-prefix-caching]                           | [❌](gh-issue:3687) | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     |
+| [LoRA][lora-adapter]                                      | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     |
+| <abbr title="Prompt Adapter">prmpt adptr</abbr>           | ✅                  | ✅        | ✅        | ✅     | ✅        | [❌](gh-issue:8475) | ✅     |
+| [SD][spec-decode]                                         | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     |
+| CUDA graph                                                | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ✅     |
+| <abbr title="Pooling Models">pooling</abbr>               | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❔     |
+| <abbr title="Encoder-Decoder Models">enc-dec</abbr>       | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ❌     |
+| <abbr title="Multimodal Inputs">mm</abbr>                 | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     |
+| <abbr title="Logprobs">logP</abbr>                        | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     |
+| <abbr title="Prompt Logprobs">prmpt logP</abbr>           | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     |
+| <abbr title="Async Output Processing">async output</abbr> | ✅                  | ✅        | ✅        | ✅     | ✅        | ❌                  | ❌     |
+| multi-step                                                | ✅                  | ✅        | ✅        | ✅     | ✅        | [❌](gh-issue:8477) | ✅     |
+| best-of                                                   | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     |
+| beam-search                                               | ✅                  | ✅        | ✅        | ✅     | ✅        | ✅                  | ✅     |
diff --git a/docs/source/features/disagg_prefill.md b/docs/features/disagg_prefill.md
similarity index 87%
rename from docs/source/features/disagg_prefill.md
rename to docs/features/disagg_prefill.md
index 2fa20140c086..54be05647d94 100644
--- a/docs/source/features/disagg_prefill.md
+++ b/docs/features/disagg_prefill.md
@@ -1,12 +1,12 @@
-(disagg-prefill)=
-
-# Disaggregated Prefilling (experimental)
+---
+title: Disaggregated Prefilling (experimental)
+---
+[](){ #disagg-prefill }
 
 This page introduces you the disaggregated prefilling feature in vLLM.
 
-:::{note}
-This feature is experimental and subject to change.
-:::
+!!! note
+    This feature is experimental and subject to change.
 
 ## Why disaggregated prefilling?
 
@@ -15,9 +15,8 @@ Two main reasons:
 - **Tuning time-to-first-token (TTFT) and inter-token-latency (ITL) separately**. Disaggregated prefilling put prefill and decode phase of LLM inference inside different vLLM instances. This gives you the flexibility to assign different parallel strategies (e.g. `tp` and `pp`) to tune TTFT without affecting ITL, or to tune ITL without affecting TTFT.
 - **Controlling tail ITL**. Without disaggregated prefilling, vLLM may insert some prefill jobs during the decoding of one request. This results in higher tail latency. Disaggregated prefilling helps you solve this issue and control tail ITL. Chunked prefill with a proper chunk size also can achieve the same goal, but in practice it's hard to figure out the correct chunk size value. So disaggregated prefilling is a much more reliable way to control tail ITL.
 
-:::{note}
-Disaggregated prefill DOES NOT improve throughput.
-:::
+!!! note
+    Disaggregated prefill DOES NOT improve throughput.
 
 ## Usage example
 
@@ -39,21 +38,16 @@ Key abstractions for disaggregated prefilling:
 - **LookupBuffer**: LookupBuffer provides two API: `insert` KV cache and `drop_select` KV cache. The semantics of `insert` and `drop_select` are similar to SQL, where `insert` inserts a KV cache into the buffer, and `drop_select` returns the KV cache that matches the given condition and drop it from the buffer.
 - **Pipe**: A single-direction FIFO pipe for tensor transmission. It supports `send_tensor` and `recv_tensor`.
 
-:::{note}
-`insert` is non-blocking operation but `drop_select` is blocking operation.
-:::
+!!! note
+    `insert` is a non-blocking operation but `drop_select` is a blocking operation.
 
 Here is a figure illustrating how the above 3 abstractions are organized:
 
-:::{image} /assets/features/disagg_prefill/abstraction.jpg
-:alt: Disaggregated prefilling abstractions
-:::
+![Disaggregated prefilling abstractions](../assets/features/disagg_prefill/abstraction.jpg)
 
 The workflow of disaggregated prefilling is as follows:
 
-:::{image} /assets/features/disagg_prefill/overview.jpg
-:alt: Disaggregated prefilling workflow
-:::
+![Disaggregated prefilling workflow](../assets/features/disagg_prefill/overview.jpg)
 
 The `buffer` corresponds to `insert` API in LookupBuffer, and the `drop_select` corresponds to `drop_select` API in LookupBuffer.
 
diff --git a/docs/source/features/lora.md b/docs/features/lora.md
similarity index 96%
rename from docs/source/features/lora.md
rename to docs/features/lora.md
index 5a3ce0c01f3f..642462f7c455 100644
--- a/docs/source/features/lora.md
+++ b/docs/features/lora.md
@@ -1,10 +1,11 @@
-(lora-adapter)=
-
-# LoRA Adapters
+---
+title: LoRA Adapters
+---
+[](){ #lora-adapter }
 
 This document shows you how to use [LoRA adapters](https://arxiv.org/abs/2106.09685) with vLLM on top of a base model.
 
-LoRA adapters can be used with any vLLM model that implements {class}`~vllm.model_executor.models.interfaces.SupportsLoRA`.
+LoRA adapters can be used with any vLLM model that implements [SupportsLoRA][vllm.model_executor.models.interfaces.SupportsLoRA].
 
 Adapters can be efficiently served on a per request basis with minimal overhead. First we download the adapter(s) and save
 them locally with
@@ -60,9 +61,8 @@ vllm serve meta-llama/Llama-2-7b-hf \
     --lora-modules sql-lora=$HOME/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/
 ```
 
-:::{note}
-The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one.
-:::
+!!! note
+    The commit ID `0dfa347e8877a4d4ed19ee56c140fa518470028c` may change over time. Please check the latest commit ID in your environment to ensure you are using the correct one.
 
 The server entrypoint accepts all other LoRA configuration parameters (`max_loras`, `max_lora_rank`, `max_cpu_loras`,
 etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
diff --git a/docs/source/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md
similarity index 84%
rename from docs/source/features/multimodal_inputs.md
rename to docs/features/multimodal_inputs.md
index bb2997f008ed..19b668172902 100644
--- a/docs/source/features/multimodal_inputs.md
+++ b/docs/features/multimodal_inputs.md
@@ -1,20 +1,20 @@
-(multimodal-inputs)=
+---
+title: Multimodal Inputs
+---
+[](){ #multimodal-inputs }
 
-# Multimodal Inputs
+This page teaches you how to pass multi-modal inputs to [multi-modal models][supported-mm-models] in vLLM.
 
-This page teaches you how to pass multi-modal inputs to [multi-modal models](#supported-mm-models) in vLLM.
-
-:::{note}
-We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
-and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
-:::
+!!! note
+    We are actively iterating on multi-modal support. See [this RFC](gh-issue:4194) for upcoming changes,
+    and [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) if you have any feedback or feature requests.
 
 ## Offline Inference
 
-To input multi-modal data, follow this schema in {class}`vllm.inputs.PromptType`:
+To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
 
 - `prompt`: The prompt should follow the format that is documented on HuggingFace.
-- `multi_modal_data`: This is a dictionary that follows the schema defined in {class}`vllm.multimodal.inputs.MultiModalDataDict`.
+- `multi_modal_data`: This is a dictionary that follows the schema defined in [vllm.multimodal.inputs.MultiModalDataDict][].
 
 ### Image Inputs
 
@@ -211,16 +211,15 @@ for o in outputs:
 
 Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
 
-:::{important}
-A chat template is **required** to use Chat Completions API.
-For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.
+!!! warning
+    A chat template is **required** to use Chat Completions API.
+    For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.
 
-If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
-If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
+    If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
+    If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
 
-For certain models, we provide alternative chat templates inside <gh-dir:vllm/examples>.
-For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
-:::
+    For certain models, we provide alternative chat templates inside <gh-dir:vllm/examples>.
+    For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
 
 ### Image Inputs
 
@@ -284,25 +283,21 @@ print("Chat completion output:", chat_response.choices[0].message.content)
 
 Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
 
-:::{tip}
-Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
-and pass the file path as `url` in the API request.
-:::
-
-:::{tip}
-There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
-In fact, you can place image placeholders in the middle of the text by interleaving text and image content.
-:::
+!!! tip
+    Loading from local file paths is also supported on vLLM: You can specify the allowed local media path via `--allowed-local-media-path` when launching the API server/engine,
+    and pass the file path as `url` in the API request.
 
-:::{note}
-By default, the timeout for fetching images through HTTP URL is `5` seconds.
-You can override this by setting the environment variable:
+!!! tip
+    There is no need to place image placeholders in the text content of the API request - they are already represented by the image content.
+    In fact, you can place image placeholders in the middle of the text by interleaving text and image content.
 
-```console
-export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
-```
+!!! note
+    By default, the timeout for fetching images through HTTP URL is `5` seconds.
+    You can override this by setting the environment variable:
 
-:::
+    ```console
+    export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
+    ```
 
 ### Video Inputs
 
@@ -357,15 +352,13 @@ print("Chat completion output from image url:", result)
 
 Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
 
-:::{note}
-By default, the timeout for fetching videos through HTTP URL is `30` seconds.
-You can override this by setting the environment variable:
+!!! note
+    By default, the timeout for fetching videos through HTTP URL is `30` seconds.
+    You can override this by setting the environment variable:
 
-```console
-export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
-```
-
-:::
+    ```console
+    export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
+    ```
 
 ### Audio Inputs
 
@@ -461,15 +454,13 @@ print("Chat completion output from audio url:", result)
 
 Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
 
-:::{note}
-By default, the timeout for fetching audios through HTTP URL is `10` seconds.
-You can override this by setting the environment variable:
-
-```console
-export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
-```
+!!! note
+    By default, the timeout for fetching audios through HTTP URL is `10` seconds.
+    You can override this by setting the environment variable:
 
-:::
+    ```console
+    export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
+    ```
 
 ### Embedding Inputs
 
@@ -535,7 +526,6 @@ chat_completion = client.chat.completions.create(
 )
 ```
 
-:::{note}
-Only one message can contain `{"type": "image_embeds"}`.
-If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc.
-:::
+!!! note
+    Only one message can contain `{"type": "image_embeds"}`.
+    If used with a model that requires additional parameters, you must also provide a tensor for each of them, e.g. `image_grid_thw`, `image_sizes`, etc.
diff --git a/docs/features/prompt_embeds.md b/docs/features/prompt_embeds.md
new file mode 100644
index 000000000000..6f5616e05d8c
--- /dev/null
+++ b/docs/features/prompt_embeds.md
@@ -0,0 +1,43 @@
+# Prompt Embedding Inputs
+
+This page teaches you how to pass prompt embedding inputs to vLLM.
+
+## What are prompt embeddings?
+
+The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer) then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary.
+
+!!! note
+    Prompt embeddings are currently only supported in the v0 engine.
+
+## Offline Inference
+
+To input prompt embeddings, follow this schema in [vllm.inputs.EmbedsPrompt][]:
+
+- `prompt_embeds`: A torch tensor representing a sequence of prompt/token embeddings. This has the shape `(sequence_length, hidden_size)`, where `sequence_length` is the number of token embeddings and `hidden_size` is the hidden size (embedding size) of the model.
+
+### Hugging Face Transformers Inputs
+
+You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following example:
+
+<gh-file:examples/offline_inference/prompt_embed_inference.py>
+
+## Online Serving
+
+Our OpenAI-compatible server accepts prompt embedding inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embedding inputs are added via a new `'prompt_embeds'` key in the JSON payload.
+
+When a mixture of `'prompt_embeds'` and `'prompt'` inputs are provided in a single request, the prompt embeds are always returned first.
+
+Prompt embeddings are passed in as base64 encoded torch tensors.
+
+### Transformers Inputs via OpenAI Client
+
+First, launch the OpenAI-compatible server:
+
+```bash
+vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \
+  --max-model-len 4096 --enable-prompt-embeds
+```
+
+Then, you can use the OpenAI client as follows:
+
+<gh-file:examples/online_serving/prompt_embed_inference_with_openai_client.py>
diff --git a/docs/features/quantization/README.md b/docs/features/quantization/README.md
new file mode 100644
index 000000000000..71f62065f63d
--- /dev/null
+++ b/docs/features/quantization/README.md
@@ -0,0 +1,22 @@
+---
+title: Quantization
+---
+[](){ #quantization-index }
+
+Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices.
+
+Contents:
+
+- [Supported Hardware](supported_hardware.md)
+- [AutoAWQ](auto_awq.md)
+- [BitsAndBytes](bnb.md)
+- [BitBLAS](bitblas.md)
+- [GGUF](gguf.md)
+- [GPTQModel](gptqmodel.md)
+- [INT4](int4.md)
+- [INT8](int8.md)
+- [FP8](fp8.md)
+- [ModelOpt](modelopt.md)
+- [Quark](quark.md)
+- [Quantized KV Cache](quantized_kvcache.md)
+- [TorchAO](torchao.md)
diff --git a/docs/source/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md
similarity index 93%
rename from docs/source/features/quantization/auto_awq.md
rename to docs/features/quantization/auto_awq.md
index b4ac597f5a79..4366a080f52c 100644
--- a/docs/source/features/quantization/auto_awq.md
+++ b/docs/features/quantization/auto_awq.md
@@ -1,6 +1,7 @@
-(auto-awq)=
-
-# AutoAWQ
+---
+title: AutoAWQ
+---
+[](){ #auto-awq }
 
 To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
 Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint.
@@ -41,7 +42,9 @@ print(f'Model is quantized and saved at "{quant_path}"')
 To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
 
 ```console
-python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
+python examples/offline_inference/llm_engine_example.py \
+    --model TheBloke/Llama-2-7b-Chat-AWQ \
+    --quantization awq
 ```
 
 AWQ models are also supported directly through the LLM entrypoint:
diff --git a/docs/source/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md
similarity index 62%
rename from docs/source/features/quantization/bitblas.md
rename to docs/features/quantization/bitblas.md
index d0b2bf858c9b..9001725d9c02 100644
--- a/docs/source/features/quantization/bitblas.md
+++ b/docs/features/quantization/bitblas.md
@@ -1,14 +1,14 @@
-(bitblas)=
-
-# BitBLAS
+---
+title: BitBLAS
+---
+[](){ #bitblas }
 
 vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more efficient and flexible model inference. Compared to other quantization frameworks, BitBLAS provides more precision combinations.
 
-:::{note}
-Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`).
-Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper.
-For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html).
-:::
+!!! note
+    Ensure your hardware supports the selected `dtype` (`torch.bfloat16` or `torch.float16`).
+    Most recent NVIDIA GPUs support `float16`, while `bfloat16` is more common on newer architectures like Ampere or Hopper.
+    For details see [supported hardware](https://docs.vllm.ai/en/latest/features/quantization/supported_hardware.html).
 
 Below are the steps to utilize BitBLAS with vLLM.
 
@@ -33,7 +33,12 @@ import torch
 
 # "hxbgsyxh/llama-13b-4bit-g-1-bitblas" is a pre-quantized checkpoint.
 model_id = "hxbgsyxh/llama-13b-4bit-g-1-bitblas"
-llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, quantization="bitblas")
+llm = LLM(
+    model=model_id,
+    dtype=torch.bfloat16,
+    trust_remote_code=True,
+    quantization="bitblas"
+)
 ```
 
 ## Read gptq format checkpoint
@@ -44,5 +49,11 @@ import torch
 
 # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
 model_id = "hxbgsyxh/llama-13b-4bit-g-1"
-llm = LLM(model=model_id, dtype=torch.float16, trust_remote_code=True, quantization="bitblas", max_model_len=1024)
+llm = LLM(
+    model=model_id,
+    dtype=torch.float16,
+    trust_remote_code=True,
+    quantization="bitblas",
+    max_model_len=1024
+)
 ```
diff --git a/docs/source/features/quantization/bnb.md b/docs/features/quantization/bnb.md
similarity index 79%
rename from docs/source/features/quantization/bnb.md
rename to docs/features/quantization/bnb.md
index 1843a33a3dfd..a8dc2476f30a 100644
--- a/docs/source/features/quantization/bnb.md
+++ b/docs/features/quantization/bnb.md
@@ -1,6 +1,7 @@
-(bits-and-bytes)=
-
-# BitsAndBytes
+---
+title: BitsAndBytes
+---
+[](){ #bits-and-bytes }
 
 vLLM now supports [BitsAndBytes](https://github.com/TimDettmers/bitsandbytes) for more efficient model inference.
 BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy.
@@ -14,7 +15,7 @@ pip install bitsandbytes>=0.45.3
 
 vLLM reads the model's config file and supports both in-flight quantization and pre-quantized checkpoint.
 
-You can find bitsandbytes quantized models on <https://huggingface.co/models?search=bitsandbytes>.
+You can find bitsandbytes quantized models on [Hugging Face](https://huggingface.co/models?search=bitsandbytes).
 And usually, these repositories have a config.json file that includes a quantization_config section.
 
 ## Read quantized checkpoint
@@ -26,7 +27,11 @@ from vllm import LLM
 import torch
 # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
 model_id = "unsloth/tinyllama-bnb-4bit"
-llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True)
+llm = LLM(
+    model=model_id,
+    dtype=torch.bfloat16,
+    trust_remote_code=True
+)
 ```
 
 ## Inflight quantization: load as 4bit quantization
@@ -37,8 +42,12 @@ For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify
 from vllm import LLM
 import torch
 model_id = "huggyllama/llama-7b"
-llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-quantization="bitsandbytes")
+llm = LLM(
+    model=model_id,
+    dtype=torch.bfloat16,
+    trust_remote_code=True,
+    quantization="bitsandbytes"
+)
 ```
 
 ## OpenAI Compatible Server
diff --git a/docs/source/features/quantization/fp8.md b/docs/features/quantization/fp8.md
similarity index 88%
rename from docs/source/features/quantization/fp8.md
rename to docs/features/quantization/fp8.md
index cb304d54726c..01d5d9da046d 100644
--- a/docs/source/features/quantization/fp8.md
+++ b/docs/features/quantization/fp8.md
@@ -1,6 +1,7 @@
-(fp8)=
-
-# FP8 W8A8
+---
+title: FP8 W8A8
+---
+[](){ #fp8 }
 
 vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x.
 Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8.
@@ -14,10 +15,9 @@ The FP8 types typically supported in hardware have two distinct representations,
 - **E4M3**: Consists of 1 sign bit, 4 exponent bits, and 3 bits of mantissa. It can store values up to +/-448 and `nan`.
 - **E5M2**: Consists of 1 sign bit, 5 exponent bits, and 2 bits of mantissa. It can store values up to +/-57344, +/- `inf`, and `nan`. The tradeoff for the increased dynamic range is lower precision of the stored values.
 
-:::{note}
-FP8 computation is supported on NVIDIA GPUs with compute capability > 8.9 (Ada Lovelace, Hopper).
-FP8 models will run on compute capability > 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin.
-:::
+!!! note
+    FP8 computation is supported on NVIDIA GPUs with compute capability >= 8.9 (Ada Lovelace, Hopper).
+    FP8 models will run on compute capability >= 8.0 (Ampere) as weight-only W8A16, utilizing FP8 Marlin.
 
 ## Installation
 
@@ -94,9 +94,8 @@ print(result[0].outputs[0].text)
 
 Evaluate accuracy with `lm_eval` (for example on 250 samples of `gsm8k`):
 
-:::{note}
-Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations.
-:::
+!!! note
+    Quantized models can be sensitive to the presence of the `bos` token. `lm_eval` does not add a `bos` token by default, so make sure to include the `add_bos_token=True` argument when running your evaluations.
 
 ```console
 $ MODEL=$PWD/Meta-Llama-3-8B-Instruct-FP8-Dynamic
@@ -133,6 +132,5 @@ result = model.generate("Hello, my name is")
 print(result[0].outputs[0].text)
 ```
 
-:::{warning}
-Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model.
-:::
+!!! warning
+    Currently, we load the model at original precision before quantizing down to 8-bits, so you need enough memory to load the whole model.
diff --git a/docs/source/features/quantization/gguf.md b/docs/features/quantization/gguf.md
similarity index 64%
rename from docs/source/features/quantization/gguf.md
rename to docs/features/quantization/gguf.md
index e93e4dcd3b57..72f758f653a8 100644
--- a/docs/source/features/quantization/gguf.md
+++ b/docs/features/quantization/gguf.md
@@ -1,39 +1,42 @@
-(gguf)=
+---
+title: GGUF
+---
+[](){ #gguf }
 
-# GGUF
+!!! warning
+    Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, and it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team.
 
-:::{warning}
-Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team.
-:::
-
-:::{warning}
-Currently, vllm only supports loading single-file GGUF models. If you have a multi-files GGUF model, you can use [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge them to a single-file model.
-:::
+!!! warning
+    Currently, vLLM only supports loading single-file GGUF models. If you have a multi-file GGUF model, you can use the [gguf-split](https://github.com/ggerganov/llama.cpp/pull/6135) tool to merge it into a single-file model.
 
 To run a GGUF model with vLLM, you can download and use the local GGUF model from [TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF) with the following command:
 
 ```console
 wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
 # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
+vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
+   --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
 ```
 
 You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs:
 
 ```console
 # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
+vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
+   --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+   --tensor-parallel-size 2
 ```
 
-:::{warning}
-We recommend using the tokenizer from base model instead of GGUF model. Because the tokenizer conversion from GGUF is time-consuming and unstable, especially for some models with large vocab size.
-:::
+!!! warning
+    We recommend using the tokenizer from the base model instead of the GGUF model, because the tokenizer conversion from GGUF is time-consuming and unstable, especially for models with a large vocabulary size.
 
 GGUF assumes that huggingface can convert the metadata to a config file. In case huggingface doesn't support your model you can manually create a config and pass it as hf-config-path
 
 ```console
 # If you model is not supported by huggingface you can manually provide a huggingface compatible config path
-vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0
+vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
+   --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+   --hf-config-path TinyLlama/TinyLlama-1.1B-Chat-v1.0
 ```
 
 You can also use the GGUF model directly through the LLM entrypoint:
diff --git a/docs/source/features/quantization/gptqmodel.md b/docs/features/quantization/gptqmodel.md
similarity index 95%
rename from docs/source/features/quantization/gptqmodel.md
rename to docs/features/quantization/gptqmodel.md
index 9771d5a4fe9e..53e938d2cbd7 100644
--- a/docs/source/features/quantization/gptqmodel.md
+++ b/docs/features/quantization/gptqmodel.md
@@ -1,6 +1,7 @@
-(gptqmodel)=
-
-# GPTQModel
+---
+title: GPTQModel
+---
+[](){ #gptqmodel }
 
 To create a new 4-bit or 8-bit GPTQ quantized model, you can leverage [GPTQModel](https://github.com/ModelCloud/GPTQModel) from ModelCloud.AI.
 
@@ -58,7 +59,8 @@ model.save(quant_path)
 To run an GPTQModel quantized model with vLLM, you can use [DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2](https://huggingface.co/ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2) with the following command:
 
 ```console
-python examples/offline_inference/llm_engine_example.py --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
+python examples/offline_inference/llm_engine_example.py \
+    --model ModelCloud/DeepSeek-R1-Distill-Qwen-7B-gptqmodel-4bit-vortex-v2
 ```
 
 ## Using GPTQModel with vLLM's Python API
diff --git a/docs/source/features/quantization/int4.md b/docs/features/quantization/int4.md
similarity index 94%
rename from docs/source/features/quantization/int4.md
rename to docs/features/quantization/int4.md
index 7a0ab4ad229e..b7d09206365f 100644
--- a/docs/source/features/quantization/int4.md
+++ b/docs/features/quantization/int4.md
@@ -1,14 +1,14 @@
-(int4)=
-
-# INT4 W4A16
+---
+title: INT4 W4A16
+---
+[](){ #int4 }
 
 vLLM supports quantizing weights to INT4 for memory savings and inference acceleration. This quantization method is particularly useful for reducing model size and maintaining low latency in workloads with low queries per second (QPS).
 
 Please visit the HF collection of [quantized INT4 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int4-llms-for-vllm-668ec34bf3c9fa45f857df2c).
 
-:::{note}
-INT4 computation is supported on NVIDIA GPUs with compute capability > 8.0 (Ampere, Ada Lovelace, Hopper, Blackwell).
-:::
+!!! note
+    INT4 computation is supported on NVIDIA GPUs with compute capability >= 8.0 (Ampere, Ada Lovelace, Hopper, Blackwell).
 
 ## Prerequisites
 
@@ -121,9 +121,8 @@ $ lm_eval --model vllm \
   --batch_size 'auto'
 ```
 
-:::{note}
-Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations.
-:::
+!!! note
+    Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations.
 
 ## Best Practices
 
diff --git a/docs/source/features/quantization/int8.md b/docs/features/quantization/int8.md
similarity index 92%
rename from docs/source/features/quantization/int8.md
rename to docs/features/quantization/int8.md
index 1e4b01d35575..1d9fba9dc87f 100644
--- a/docs/source/features/quantization/int8.md
+++ b/docs/features/quantization/int8.md
@@ -1,15 +1,15 @@
-(int8)=
-
-# INT8 W8A8
+---
+title: INT8 W8A8
+---
+[](){ #int8 }
 
 vLLM supports quantizing weights and activations to INT8 for memory savings and inference acceleration.
 This quantization method is particularly useful for reducing model size while maintaining good performance.
 
 Please visit the HF collection of [quantized INT8 checkpoints of popular LLMs ready to use with vLLM](https://huggingface.co/collections/neuralmagic/int8-llms-for-vllm-668ec32c049dca0369816415).
 
-:::{note}
-INT8 computation is supported on NVIDIA GPUs with compute capability > 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell).
-:::
+!!! note
+    INT8 computation is supported on NVIDIA GPUs with compute capability >= 7.5 (Turing, Ampere, Ada Lovelace, Hopper, Blackwell).
 
 ## Prerequisites
 
@@ -125,9 +125,8 @@ $ lm_eval --model vllm \
   --batch_size 'auto'
 ```
 
-:::{note}
-Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations.
-:::
+!!! note
+    Quantized models can be sensitive to the presence of the `bos` token. Make sure to include the `add_bos_token=True` argument when running evaluations.
 
 ## Best Practices
 
diff --git a/docs/source/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md
similarity index 100%
rename from docs/source/features/quantization/modelopt.md
rename to docs/features/quantization/modelopt.md
diff --git a/docs/source/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md
similarity index 98%
rename from docs/source/features/quantization/quantized_kvcache.md
rename to docs/features/quantization/quantized_kvcache.md
index 86e6354ec82e..e3ebd024bab3 100644
--- a/docs/source/features/quantization/quantized_kvcache.md
+++ b/docs/features/quantization/quantized_kvcache.md
@@ -1,6 +1,7 @@
-(quantized-kvcache)=
-
-# Quantized KV Cache
+---
+title: Quantized KV Cache
+---
+[](){ #quantized-kvcache }
 
 ## FP8 KV Cache
 
diff --git a/docs/source/features/quantization/quark.md b/docs/features/quantization/quark.md
similarity index 94%
rename from docs/source/features/quantization/quark.md
rename to docs/features/quantization/quark.md
index 955890dbc75b..51da98cc09d3 100644
--- a/docs/source/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -1,6 +1,7 @@
-(quark)=
-
-# AMD QUARK
+---
+title: AMD QUARK
+---
+[](){ #quark }
 
 Quantization can effectively reduce memory and bandwidth usage, accelerate computation and improve
 throughput while with minimal accuracy loss. vLLM can leverage [Quark](https://quark.docs.amd.com/latest/),
@@ -86,13 +87,12 @@ We need to set the quantization configuration, you can check
 for further details. Here we use FP8 per-tensor quantization on weight, activation,
 kv-cache and the quantization algorithm is AutoSmoothQuant.
 
-:::{note}
-Note the quantization algorithm needs a JSON config file and the config file is located in
-[Quark Pytorch examples](https://quark.docs.amd.com/latest/pytorch/pytorch_examples.html),
-under the directory `examples/torch/language_modeling/llm_ptq/models`. For example,
-AutoSmoothQuant config file for Llama is
-`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
-:::
+!!! note
+    Note the quantization algorithm needs a JSON config file and the config file is located in
+    [Quark Pytorch examples](https://quark.docs.amd.com/latest/pytorch/pytorch_examples.html),
+    under the directory `examples/torch/language_modeling/llm_ptq/models`. For example,
+    AutoSmoothQuant config file for Llama is
+    `examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.
 
 ```python
 from quark.torch.quantization import (Config, QuantizationConfig,
diff --git a/docs/features/quantization/supported_hardware.md b/docs/features/quantization/supported_hardware.md
new file mode 100644
index 000000000000..2967bf9c7504
--- /dev/null
+++ b/docs/features/quantization/supported_hardware.md
@@ -0,0 +1,28 @@
+---
+title: Supported Hardware
+---
+[](){ #quantization-supported-hardware }
+
+The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
+
+| Implementation        | Volta   | Turing   | Ampere   | Ada   | Hopper   | AMD GPU   | Intel GPU   | x86 CPU   | AWS Inferentia   | Google TPU   |
+|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-----------|------------------|--------------|
+| AWQ                   | ❌       | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ✅︎        | ❌                | ❌            |
+| GPTQ                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ✅︎          | ✅︎        | ❌                | ❌            |
+| Marlin (GPTQ/AWQ/FP8) | ❌       | ❌        | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
+| INT8 (W8A8)           | ❌       | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ✅︎        | ❌                | ✅︎           |
+| FP8 (W8A8)            | ❌       | ❌        | ❌        | ✅︎    | ✅︎       | ✅︎        | ❌           | ❌         | ❌                | ❌            |
+| BitBLAS (GPTQ)        | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
+| AQLM                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
+| bitsandbytes          | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
+| DeepSpeedFP           | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ❌         | ❌           | ❌         | ❌                | ❌            |
+| GGUF                  | ✅︎      | ✅︎       | ✅︎       | ✅︎    | ✅︎       | ✅︎        | ❌           | ❌         | ❌                | ❌            |
+
+- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
+- ✅︎ indicates that the quantization method is supported on the specified hardware.
+- ❌ indicates that the quantization method is not supported on the specified hardware.
+
+!!! note
+    This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
+
+    For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
diff --git a/docs/source/features/quantization/torchao.md b/docs/features/quantization/torchao.md
similarity index 86%
rename from docs/source/features/quantization/torchao.md
rename to docs/features/quantization/torchao.md
index 82100c6ddcac..a7a517af85aa 100644
--- a/docs/source/features/quantization/torchao.md
+++ b/docs/features/quantization/torchao.md
@@ -7,7 +7,9 @@ We recommend installing the latest torchao nightly with
 ```console
 # Install the latest TorchAO nightly build
 # Choose the CUDA version that matches your system (cu126, cu128, etc.)
-pip install --pre torchao>=10.0.0 --index-url https://download.pytorch.org/whl/nightly/cu126
+pip install \
+    --pre "torchao>=10.0.0" \
+    --index-url https://download.pytorch.org/whl/nightly/cu126
 ```
 
 ## Quantizing HuggingFace Models
@@ -20,7 +22,12 @@ from torchao.quantization import Int8WeightOnlyConfig
 
 model_name = "meta-llama/Meta-Llama-3-8B"
 quantization_config = TorchAoConfig(Int8WeightOnlyConfig())
-quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype="auto",
+    device_map="auto",
+    quantization_config=quantization_config
+)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 input_text = "What are we having for dinner?"
 input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
diff --git a/docs/source/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
similarity index 96%
rename from docs/source/features/reasoning_outputs.md
rename to docs/features/reasoning_outputs.md
index bf4f8901a11a..cbcb246912f4 100644
--- a/docs/source/features/reasoning_outputs.md
+++ b/docs/features/reasoning_outputs.md
@@ -1,6 +1,7 @@
-(reasoning-outputs)=
-
-# Reasoning Outputs
+---
+title: Reasoning Outputs
+---
+[](){ #reasoning-outputs }
 
 vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions.
 
@@ -17,17 +18,17 @@ vLLM currently supports the following reasoning models:
 | [IBM Granite 3.2 language models](https://huggingface.co/collections/ibm-granite/granite-32-language-models-67b3bc8c13508f6d064cff9a) | `granite` | ❌ | ❌ |
 | [Qwen3 series](https://huggingface.co/collections/Qwen/qwen3-67dd247413f0e2e4f653967f) | `qwen3` | `guided_json`, `guided_regex` | ✅ |
 
-:::{note}
-IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
-The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
-:::
+!!! note
+    IBM Granite 3.2 reasoning is disabled by default; to enable it, you must also pass `thinking=True` in your `chat_template_kwargs`.
+    The reasoning feature for the Qwen3 series is enabled by default. To disable it, you must pass `enable_thinking=False` in your `chat_template_kwargs`.
 
 ## Quickstart
 
 To use reasoning models, you need to specify the `--reasoning-parser` flags when making a request to the chat completion endpoint. The `--reasoning-parser` flag specifies the reasoning parser to use for extracting reasoning content from the model output.
 
 ```bash
-vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B --reasoning-parser deepseek_r1
+vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
+    --reasoning-parser deepseek_r1
 ```
 
 Next, make a request to the model that should return the reasoning content in the response.
@@ -167,12 +168,10 @@ client = OpenAI(
 models = client.models.list()
 model = models.data[0].id
 
-
 class People(BaseModel):
     name: str
     age: int
 
-
 json_schema = People.model_json_schema()
 
 prompt = ("Generate a JSON with the name and age of one random person.")
diff --git a/docs/source/features/spec_decode.md b/docs/features/spec_decode.md
similarity index 91%
rename from docs/source/features/spec_decode.md
rename to docs/features/spec_decode.md
index f16e0d96522d..5080960f72dd 100644
--- a/docs/source/features/spec_decode.md
+++ b/docs/features/spec_decode.md
@@ -1,16 +1,15 @@
-(spec-decode)=
+---
+title: Speculative Decoding
+---
+[](){ #spec-decode }
 
-# Speculative Decoding
+!!! warning
+    Please note that speculative decoding in vLLM is not yet optimized and does
+    not usually yield inter-token latency reductions for all prompt datasets or sampling parameters.
+    The work to optimize it is ongoing and can be followed here: <gh-issue:4630>
 
-:::{warning}
-Please note that speculative decoding in vLLM is not yet optimized and does
-not usually yield inter-token latency reductions for all prompt datasets or sampling parameters.
-The work to optimize it is ongoing and can be followed here: <gh-issue:4630>
-:::
-
-:::{warning}
-Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
-:::
+!!! warning
+    Currently, speculative decoding in vLLM is not compatible with pipeline parallelism.
 
 This document shows how to use [Speculative Decoding](https://x.com/karpathy/status/1697318534555336961) with vLLM.
 Speculative decoding is a technique which improves inter-token latency in memory-bound LLM inference.
@@ -46,14 +45,18 @@ for output in outputs:
 To perform the same with an online mode launch the server:
 
 ```bash
-python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \
-    --seed 42 -tp 1 --gpu_memory_utilization 0.8 \
+python -m vllm.entrypoints.openai.api_server \
+    --host 0.0.0.0 \
+    --port 8000 \
+    --model facebook/opt-6.7b \
+    --seed 42 \
+    -tp 1 \
+    --gpu_memory_utilization 0.8 \
     --speculative_config '{"model": "facebook/opt-125m", "num_speculative_tokens": 5}'
 ```
 
-:::{warning}
-Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately has been deprecated now.
-:::
+!!! warning
+    Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately is now deprecated.
 
 Then use a client:
 
@@ -172,7 +175,7 @@ A variety of speculative models of this type are available on HF hub:
 ## Speculating using EAGLE based draft models
 
 The following code configures vLLM to use speculative decoding where proposals are generated by
-an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](<gh-file:examples/offline_inference/eagle.py>).
+an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).
 
 ```python
 from vllm import LLM, SamplingParams
@@ -255,7 +258,7 @@ speculative decoding, breaking down the guarantees into three key areas:
 3. **vLLM Logprob Stability**
    \- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the
    same request across runs. For more details, see the FAQ section
-   titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq).
+   titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs][faq].
 
 While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding
 can occur due to following factors:
@@ -264,7 +267,7 @@ can occur due to following factors:
 - **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially
   due to non-deterministic behavior in batched operations or numerical instability.
 
-For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](#faq).
+For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs][faq].
 
 ## Resources for vLLM contributors
 
diff --git a/docs/source/features/structured_outputs.md b/docs/features/structured_outputs.md
similarity index 96%
rename from docs/source/features/structured_outputs.md
rename to docs/features/structured_outputs.md
index 03119ec7441c..f96b598cff98 100644
--- a/docs/source/features/structured_outputs.md
+++ b/docs/features/structured_outputs.md
@@ -1,6 +1,7 @@
-(structured-outputs)=
-
-# Structured Outputs
+---
+title: Structured Outputs
+---
+[](){ #structured-outputs }
 
 vLLM supports the generation of structured outputs using
 [xgrammar](https://github.com/mlc-ai/xgrammar) or
@@ -20,7 +21,7 @@ The following parameters are supported, which must be added as extra parameters:
 - `guided_grammar`: the output will follow the context free grammar.
 - `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text.
 
-You can see the complete list of supported parameters on the [OpenAI-Compatible Server](#openai-compatible-server) page.
+You can see the complete list of supported parameters on the [OpenAI-Compatible Server][openai-compatible-server] page.
 
 Structured outputs are supported by default in the OpenAI-Compatible Server. You
 may choose to specify the backend to use by setting the
@@ -83,13 +84,11 @@ class CarType(str, Enum):
     truck = "Truck"
     coupe = "Coupe"
 
-
 class CarDescription(BaseModel):
     brand: str
     model: str
     car_type: CarType
 
-
 json_schema = CarDescription.model_json_schema()
 
 completion = client.chat.completions.create(
@@ -105,11 +104,10 @@ completion = client.chat.completions.create(
 print(completion.choices[0].message.content)
 ```
 
-:::{tip}
-While not strictly necessary, normally it´s better to indicate in the prompt the
-JSON schema and how the fields should be populated.  This can improve the
-results notably in most cases.
-:::
+!!! tip
+    While not strictly necessary, it's normally better to indicate in the prompt the
+    JSON schema and how the fields should be populated. This can improve the
+    results notably in most cases.
 
 Finally we have the `guided_grammar` option, which is probably the most
 difficult to use, but it´s really powerful. It allows us to define complete
@@ -160,12 +158,10 @@ Here is a simple example demonstrating how to get structured output using Pydant
 from pydantic import BaseModel
 from openai import OpenAI
 
-
 class Info(BaseModel):
     name: str
     age: int
 
-
 client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
 completion = client.beta.chat.completions.parse(
     model="meta-llama/Llama-3.1-8B-Instruct",
@@ -199,17 +195,14 @@ from typing import List
 from pydantic import BaseModel
 from openai import OpenAI
 
-
 class Step(BaseModel):
     explanation: str
     output: str
 
-
 class MathResponse(BaseModel):
     steps: list[Step]
     final_answer: str
 
-
 client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="dummy")
 completion = client.beta.chat.completions.parse(
     model="meta-llama/Llama-3.1-8B-Instruct",
diff --git a/docs/source/features/tool_calling.md b/docs/features/tool_calling.md
similarity index 95%
rename from docs/source/features/tool_calling.md
rename to docs/features/tool_calling.md
index 2795b769345e..6ee1060dd050 100644
--- a/docs/source/features/tool_calling.md
+++ b/docs/features/tool_calling.md
@@ -93,7 +93,7 @@ specify the `name` of one of the tools in the `tool_choice` parameter of the cha
 
 ## Required Function Calling
 
-vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends are on the [roadmap](https://docs.vllm.ai/en/latest/getting_started/v1_user_guide.html#feature-model) for the V1 engine.
+vLLM supports the `tool_choice='required'` option in the chat completion API. Similar to the named function calling, it also uses guided decoding, so this is enabled by default and will work with any supported model. The required guided decoding features (JSON schema with `anyOf`) are currently only supported in the V0 engine with the guided decoding backend `outlines`. However, support for alternative decoding backends is on the [roadmap](https://docs.vllm.ai/en/latest/usage/v1_guide.html#feature-model) for the V1 engine.
 
 When tool_choice='required' is set, the model is guaranteed to generate one or more tool calls based on the specified tool list in the `tools` parameter. The number of tool calls depends on the user's query. The output format strictly follows the schema defined in the `tools` parameter.
 
@@ -158,13 +158,13 @@ All Llama 3.1, 3.2 and 4 models should be supported.
 * `meta-llama/Llama-3.2-*`
 * `meta-llama/Llama-4-*`
 
-The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below.
+The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. For Llama 4 models, it is recommended to use the `llama4_pythonic` tool parser.
 
 Other tool calling formats like the built in python tool calling or custom tool calling are not supported.
 
 Known issues:
 
-1. Parallel tool calls are not supported.
+1. Parallel tool calls are not supported for Llama 3, but they are supported for Llama 4 models.
 2. The model can generate parameters with a wrong format, such as generating
    an array serialized as string instead of an array.
 
@@ -177,11 +177,10 @@ images.
 
 Recommended flags: `--tool-call-parser llama3_json --chat-template {see_above}`
 
-VLLM also provides a JSON based chat template for Llama 4:
-* <gh-file:examples/tool_chat_template_llama4_json.jinja> - this is based on the "official" chat template for the Llama 4
-models, but tweaked so that it works better with vLLM.
+vLLM also provides pythonic and JSON-based chat templates for Llama 4, but pythonic tool calling is recommended:
+* <gh-file:examples/tool_chat_template_llama4_pythonic.jinja> - this is based on the [official chat template](https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/) for the Llama 4 models.
 
-For Llama 4 use `--tool-call-parser llama4_json examples/tool_chat_template_llama4_json.jinja`.
+For Llama 4 models, use `--tool-call-parser llama4_pythonic --chat-template examples/tool_chat_template_llama4_pythonic.jinja`.
 
 #### IBM Granite
 
@@ -323,7 +322,6 @@ class ExampleToolParser(ToolParser):
                                             tool_calls=[],
                                             content=text)
 
-
 ```
 
 Then you can use this plugin in the command line like this.
diff --git a/docs/getting_started/installation/.nav.yml b/docs/getting_started/installation/.nav.yml
new file mode 100644
index 000000000000..7acfc015ff50
--- /dev/null
+++ b/docs/getting_started/installation/.nav.yml
@@ -0,0 +1,5 @@
+nav:
+  - README.md
+  - gpu.md
+  - cpu.md
+  - ai_accelerator.md
\ No newline at end of file
diff --git a/docs/getting_started/installation/README.md b/docs/getting_started/installation/README.md
new file mode 100644
index 000000000000..36bb16cc0224
--- /dev/null
+++ b/docs/getting_started/installation/README.md
@@ -0,0 +1,20 @@
+---
+title: Installation
+---
+[](){ #installation-index }
+
+vLLM supports the following hardware platforms:
+
+- [GPU](gpu.md)
+    - [NVIDIA CUDA](gpu.md#nvidia-cuda)
+    - [AMD ROCm](gpu.md#amd-rocm)
+    - [Intel XPU](gpu.md#intel-xpu)
+- [CPU](cpu.md)
+    - [Intel/AMD x86](cpu.md#intelamd-x86)
+    - [ARM AArch64](cpu.md#arm-aarch64)
+    - [Apple silicon](cpu.md#apple-silicon)
+    - [IBM Z (S390X)](cpu.md#ibm-z-s390x)
+- [Other AI accelerators](ai_accelerator.md)
+    - [Google TPU](ai_accelerator.md#google-tpu)
+    - [Intel Gaudi](ai_accelerator.md#intel-gaudi)
+    - [AWS Neuron](ai_accelerator.md#aws-neuron)
diff --git a/docs/getting_started/installation/ai_accelerator.md b/docs/getting_started/installation/ai_accelerator.md
new file mode 100644
index 000000000000..a4f136a172fe
--- /dev/null
+++ b/docs/getting_started/installation/ai_accelerator.md
@@ -0,0 +1,117 @@
+# Other AI accelerators
+
+vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions:
+
+=== "Google TPU"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:installation"
+
+=== "Intel Gaudi"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:installation"
+
+=== "AWS Neuron"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:installation"
+
+## Requirements
+
+=== "Google TPU"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:requirements"
+
+=== "Intel Gaudi"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:requirements"
+
+=== "AWS Neuron"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:requirements"
+
+## Configure a new environment
+
+=== "Google TPU"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:configure-a-new-environment"
+
+=== "Intel Gaudi"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:configure-a-new-environment"
+
+=== "AWS Neuron"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:configure-a-new-environment"
+
+## Set up using Python
+
+### Pre-built wheels
+
+=== "Google TPU"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:pre-built-wheels"
+
+=== "Intel Gaudi"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:pre-built-wheels"
+
+=== "AWS Neuron"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:pre-built-wheels"
+
+### Build wheel from source
+
+=== "Google TPU"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:build-wheel-from-source"
+
+=== "Intel Gaudi"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:build-wheel-from-source"
+
+=== "AWS Neuron"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:build-wheel-from-source"
+
+## Set up using Docker
+
+### Pre-built images
+
+=== "Google TPU"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:pre-built-images"
+
+=== "Intel Gaudi"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:pre-built-images"
+
+=== "AWS Neuron"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:pre-built-images"
+
+### Build image from source
+
+=== "Google TPU"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:build-image-from-source"
+
+=== "Intel Gaudi"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:build-image-from-source"
+
+=== "AWS Neuron"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:build-image-from-source"
+
+## Extra information
+
+=== "Google TPU"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/tpu.inc.md:extra-information"
+
+=== "Intel Gaudi"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md:extra-information"
+
+=== "AWS Neuron"
+
+    --8<-- "docs/getting_started/installation/ai_accelerator/neuron.inc.md:extra-information"
diff --git a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md b/docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
similarity index 83%
rename from docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
rename to docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
index 78938de317c4..00935a37417e 100644
--- a/docs/source/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
+++ b/docs/getting_started/installation/ai_accelerator/hpu-gaudi.inc.md
@@ -1,12 +1,12 @@
-# Installation
+# --8<-- [start:installation]
 
 This tab provides instructions on running vLLM with Intel Gaudi devices.
 
-:::{attention}
-There are no pre-built wheels or images for this device, so you must build vLLM from source.
-:::
+!!! warning
+    There are no pre-built wheels or images for this device, so you must build vLLM from source.
 
-## Requirements
+# --8<-- [end:installation]
+# --8<-- [start:requirements]
 
 - OS: Ubuntu 22.04 LTS
 - Python: 3.10
@@ -45,16 +45,27 @@ Use the following commands to run a Docker image:
 
 ```console
 docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
-docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+docker run \
+  -it \
+  --runtime=habana \
+  -e HABANA_VISIBLE_DEVICES=all \
+  -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+  --cap-add=sys_nice \
+  --net=host \
+  --ipc=host \
+  vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
 ```
 
-## Set up using Python
+# --8<-- [end:requirements]
+# --8<-- [start:set-up-using-python]
 
-### Pre-built wheels
+# --8<-- [end:set-up-using-python]
+# --8<-- [start:pre-built-wheels]
 
 Currently, there are no pre-built Intel Gaudi wheels.
 
-### Build wheel from source
+# --8<-- [end:pre-built-wheels]
+# --8<-- [start:build-wheel-from-source]
 
 To build and install vLLM from source, run:
 
@@ -75,29 +86,39 @@ pip install -r requirements/hpu.txt
 python setup.py develop
 ```
 
-## Set up using Docker
+# --8<-- [end:build-wheel-from-source]
+# --8<-- [start:set-up-using-docker]
 
-### Pre-built images
+# --8<-- [end:set-up-using-docker]
+# --8<-- [start:pre-built-images]
 
 Currently, there are no pre-built Intel Gaudi images.
 
-### Build image from source
+# --8<-- [end:pre-built-images]
+# --8<-- [start:build-image-from-source]
 
 ```console
 docker build -f docker/Dockerfile.hpu -t vllm-hpu-env  .
-docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env
+docker run \
+  -it \
+  --runtime=habana \
+  -e HABANA_VISIBLE_DEVICES=all \
+  -e OMPI_MCA_btl_vader_single_copy_mechanism=none \
+  --cap-add=sys_nice \
+  --net=host \
+  --rm vllm-hpu-env
 ```
 
-:::{tip}
-If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered.
-:::
+!!! tip
+    If you're observing the following error: `docker: Error response from daemon: Unknown runtime specified habana.`, please refer to "Install Using Containers" section of [Intel Gaudi Software Stack and Driver Installation](https://docs.habana.ai/en/v1.18.0/Installation_Guide/Bare_Metal_Fresh_OS.html). Make sure you have `habana-container-runtime` package installed and that `habana` container runtime is registered.
 
-## Extra information
+# --8<-- [end:build-image-from-source]
+# --8<-- [start:extra-information]
 
 ## Supported features
 
-- [Offline inference](#offline-inference)
-- Online serving via [OpenAI-Compatible Server](#openai-compatible-server)
+- [Offline inference][offline-inference]
+- Online serving via [OpenAI-Compatible Server][openai-compatible-server]
 - HPU autodetection - no need to manually select device within vLLM
 - Paged KV cache with algorithms enabled for Intel Gaudi accelerators
 - Custom Intel Gaudi implementations of Paged Attention, KV cache ops,
@@ -157,41 +178,26 @@ Gaudi2 devices. Configurations that are not listed may or may not work.
 
 Currently in vLLM for HPU we support four execution modes, depending on selected HPU PyTorch Bridge backend (via `PT_HPU_LAZY_MODE` environment variable), and `--enforce-eager` flag.
 
-:::{list-table} vLLM execution modes
-:widths: 25 25 50
-:header-rows: 1
-
-- * `PT_HPU_LAZY_MODE`
-  * `enforce_eager`
-  * execution mode
-- * 0
-  * 0
-  * torch.compile
-- * 0
-  * 1
-  * PyTorch eager mode
-- * 1
-  * 0
-  * HPU Graphs
-- * 1
-  * 1
-  * PyTorch lazy mode
-:::
-
-:::{warning}
-In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
-:::
-
-(gaudi-bucketing-mechanism)=
+|   `PT_HPU_LAZY_MODE` |   `enforce_eager` | execution mode     |
+|----------------------|-------------------|--------------------|
+|                    0 |                 0 | torch.compile      |
+|                    0 |                 1 | PyTorch eager mode |
+|                    1 |                 0 | HPU Graphs         |
+|                    1 |                 1 | PyTorch lazy mode  |
+  <figcaption>vLLM execution modes</figcaption>
+
+!!! warning
+    In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
+
+[](){ #gaudi-bucketing-mechanism }
 
 ### Bucketing mechanism
 
 Intel Gaudi accelerators work best when operating on models with fixed tensor shapes. [Intel Gaudi Graph Compiler](https://docs.habana.ai/en/latest/Gaudi_Overview/Intel_Gaudi_Software_Suite.html#graph-compiler-and-runtime) is responsible for generating optimized binary code that implements the given model topology on Gaudi. In its default configuration, the produced binary code may be heavily dependent on input and output tensor shapes, and can require graph recompilation when encountering differently shaped tensors within the same topology. While the resulting binaries utilize Gaudi efficiently, the compilation itself may introduce a noticeable overhead in end-to-end execution.
 In a dynamic inference serving scenario, there is a need to minimize the number of graph compilations and reduce the risk of graph compilation occurring during server runtime. Currently it is achieved by "bucketing" model's forward pass across two dimensions - `batch_size` and `sequence_length`.
 
-:::{note}
-Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase.
-:::
+!!! note
+    Bucketing allows us to reduce the number of required graphs significantly, but it does not handle any graph compilation and device code generation - this is done in warmup and HPUGraph capture phase.
 
 Bucketing ranges are determined with 3 parameters - `min`, `step` and `max`. They can be set separately for prompt and decode phase, and for batch size and sequence length dimension. These parameters can be observed in logs during vLLM startup:
 
@@ -224,15 +229,13 @@ min = 128, step = 128, max = 512
 
 In the logged scenario, 24 buckets were generated for prompt (prefill) runs, and 48 buckets for decode runs. Each bucket corresponds to a separate optimized device binary for a given model with specified tensor shapes. Whenever a batch of requests is processed, it is padded across batch and sequence length dimension to the smallest possible bucket.
 
-:::{warning}
-If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario.
-:::
+!!! warning
+    If a request exceeds maximum bucket size in any dimension, it will be processed without padding, and its processing may require a graph compilation, potentially significantly increasing end-to-end latency. The boundaries of the buckets are user-configurable via environment variables, and upper bucket boundaries can be increased to avoid such scenario.
 
 As an example, if a request of 3 sequences, with max sequence length of 412 comes in to an idle vLLM server, it will be padded executed as `(4, 512)` prefill bucket, as `batch_size` (number of sequences) will be padded to 4 (closest batch_size dimension higher than 3), and max sequence length will be padded to 512 (closest sequence length dimension higher than 412). After prefill stage, it will be executed as `(4, 512)` decode bucket and will continue as that bucket until either batch dimension changes (due to request being finished) - in which case it will become a `(2, 512)` bucket, or context length increases above 512 tokens, in which case it will become `(4, 640)` bucket.
 
-:::{note}
-Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
-:::
+!!! note
+    Bucketing is transparent to a client -- padding in sequence length dimension is never returned to the client, and padding in batch dimension does not create new requests.
 
 ### Warmup
 
@@ -252,11 +255,10 @@ INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][47/48] batch_size
 INFO 08-01 22:27:16 hpu_model_runner.py:1066] [Warmup][Decode][48/48] batch_size:1 seq_len:128 free_mem:55.43 GiB
 ```
 
-This example uses the same buckets as in the [Bucketing Mechanism](#gaudi-bucketing-mechanism) section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
+This example uses the same buckets as in the [Bucketing Mechanism][gaudi-bucketing-mechanism] section. Each output line corresponds to execution of a single bucket. When bucket is executed for the first time, its graph is compiled and can be reused later on, skipping further graph compilations.
 
-:::{tip}
-Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
-:::
+!!! tip
+    Compiling all the buckets might take some time and can be turned off with `VLLM_SKIP_WARMUP=true` environment variable. Keep in mind that if you do that, you may face graph compilations once executing a given bucket for the first time. It is fine to disable warmup for development, but it's highly recommended to enable it in deployment.
 
 ### HPU Graph capture
 
@@ -271,9 +273,8 @@ With its default value (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory wil
 Environment variable `VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (`VLLM_GRAPH_PROMPT_RATIO=0.3`), both stages have equal memory constraints.
 Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. `VLLM_GRAPH_PROMPT_RATIO=0.2` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs.
 
-:::{note}
-`gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory.
-:::
+!!! note
+    `gpu_memory_utilization` does not correspond to the absolute memory usage across HPU. It specifies the memory margin after loading the model and performing a profile run. If device has 100 GiB of total memory, and 50 GiB of free memory after loading model weights and executing profiling run, `gpu_memory_utilization` at its default value will mark 90% of 50 GiB as usable, leaving 5 GiB of margin, regardless of total device memory.
 
 User can also configure the strategy for capturing HPU Graphs for prompt and decode stages separately. Strategy affects the order of capturing graphs. There are two strategies implemented:
 
@@ -282,9 +283,8 @@ User can also configure the strategy for capturing HPU Graphs for prompt and dec
 
 When there's large amount of requests pending, vLLM scheduler will attempt to fill the maximum batch size for decode as soon as possible. When a request is finished, decode batch size decreases. When that happens, vLLM will attempt to schedule a prefill iteration for requests in the waiting queue, to fill the decode batch size to its previous state. This means that in a full load scenario, decode batch size is often at its maximum, which makes large batch size HPU Graphs crucial to capture, as reflected by `max_bs` strategy. On the other hand, prefills will be executed most frequently with very low batch sizes (1-4), which is reflected in `min_tokens` strategy.
 
-:::{note}
-`VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below.
-:::
+!!! note
+    `VLLM_GRAPH_PROMPT_RATIO` does not set a hard limit on memory taken by graphs for each stage (prefill and decode). vLLM will first attempt to use up entirety of usable prefill graph memory (usable graph memory * `VLLM_GRAPH_PROMPT_RATIO`) for capturing prefill HPU Graphs, next it will attempt do the same for decode graphs and usable decode graph memory pool. If one stage is fully captured, and there is unused memory left within usable graph memory pool, vLLM will attempt further graph capture for the other stage, until no more HPU Graphs can be captured without exceeding reserved memory pool. The behavior on that mechanism can be observed in the example below.
 
 Each described step is logged by vLLM server, as follows (negative values correspond to memory being released):
 
@@ -401,3 +401,4 @@ the below:
   higher batches. You can do that by adding `--enforce-eager` flag to
   server (for online serving), or by passing `enforce_eager=True`
   argument to LLM constructor (for offline inference).
+# --8<-- [end:extra-information]
diff --git a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md b/docs/getting_started/installation/ai_accelerator/neuron.inc.md
similarity index 74%
rename from docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
rename to docs/getting_started/installation/ai_accelerator/neuron.inc.md
index b4bfb696faa2..f08c78fba6c8 100644
--- a/docs/source/getting_started/installation/ai_accelerator/neuron.inc.md
+++ b/docs/getting_started/installation/ai_accelerator/neuron.inc.md
@@ -1,14 +1,14 @@
-# Installation
+# --8<-- [start:installation]
 
 vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
 Paged Attention and Chunked Prefill are currently in development and will be available soon.
 Data types currently supported in Neuron SDK are FP16 and BF16.
 
-:::{attention}
-There are no pre-built wheels or images for this device, so you must build vLLM from source.
-:::
+!!! warning
+    There are no pre-built wheels or images for this device, so you must build vLLM from source.
 
-## Requirements
+# --8<-- [end:installation]
+# --8<-- [start:requirements]
 
 - OS: Linux
 - Python: 3.9 -- 3.11
@@ -38,7 +38,8 @@ The installation of drivers and tools wouldn't be necessary, if [Deep Learning A
 sudo tee /etc/apt/sources.list.d/neuron.list > /dev/null <<EOF
 deb https://apt.repos.neuron.amazonaws.com ${VERSION_CODENAME} main
 EOF
-wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
+wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB \
+    | sudo apt-key add -
 
 # Update OS packages
 sudo apt-get update -y
@@ -63,17 +64,19 @@ sudo apt-get install aws-neuronx-tools=2.* -y
 export PATH=/opt/aws/neuron/bin:$PATH
 ```
 
-## Set up using Python
+# --8<-- [end:requirements]
+# --8<-- [start:set-up-using-python]
 
-### Pre-built wheels
+# --8<-- [end:set-up-using-python]
+# --8<-- [start:pre-built-wheels]
 
 Currently, there are no pre-built Neuron wheels.
 
-### Build wheel from source
+# --8<-- [end:pre-built-wheels]
+# --8<-- [start:build-wheel-from-source]
 
-:::{note}
-The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
-:::
+!!! note
+    The currently supported version of Pytorch for Neuron installs `triton` version `2.1.0`. This is incompatible with `vllm >= 0.5.3`. You may see an error `cannot import name 'default_dump_dir...`. To work around this, run a `pip install --upgrade triton==3.0.0` after installing the vLLM wheel.
 
 Following instructions are applicable to Neuron SDK 2.16 and beyond.
 
@@ -94,12 +97,17 @@ source aws_neuron_venv_pytorch/bin/activate
 
 # Install Jupyter notebook kernel
 pip install ipykernel
-python3.10 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
+python3.10 -m ipykernel install \
+    --user \
+    --name aws_neuron_venv_pytorch \
+    --display-name "Python (torch-neuronx)"
 pip install jupyter notebook
 pip install environment_kernels
 
 # Set pip repository pointing to the Neuron repository
-python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+python -m pip config set \
+    global.extra-index-url \
+    https://pip.repos.neuron.amazonaws.com
 
 # Install wget, awscli
 python -m pip install wget
@@ -122,18 +130,23 @@ VLLM_TARGET_DEVICE="neuron" pip install .
 
 If neuron packages are detected correctly in the installation process, `vllm-0.3.0+neuron212` will be installed.
 
-## Set up using Docker
+# --8<-- [end:build-wheel-from-source]
+# --8<-- [start:set-up-using-docker]
 
-### Pre-built images
+# --8<-- [end:set-up-using-docker]
+# --8<-- [start:pre-built-images]
 
 Currently, there are no pre-built Neuron images.
 
-### Build image from source
+# --8<-- [end:pre-built-images]
+# --8<-- [start:build-image-from-source]
 
-See <project:#deployment-docker-build-image-from-source> for instructions on building the Docker image.
+See [the Docker deployment guide][deployment-docker-build-image-from-source] for instructions on building the Docker image.
 
 Make sure to use <gh-file:docker/Dockerfile.neuron> in place of the default Dockerfile.
 
-## Extra information
+# --8<-- [end:build-image-from-source]
+# --8<-- [start:extra-information]
 
 There is no extra information for this device.
+# --8<-- [end:extra-information]
diff --git a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md b/docs/getting_started/installation/ai_accelerator/tpu.inc.md
similarity index 55%
rename from docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
rename to docs/getting_started/installation/ai_accelerator/tpu.inc.md
index 4459cc61e1cd..d0b168120137 100644
--- a/docs/source/getting_started/installation/ai_accelerator/tpu.inc.md
+++ b/docs/getting_started/installation/ai_accelerator/tpu.inc.md
@@ -1,4 +1,4 @@
-# Installation
+# --8<-- [start:installation]
 
 Tensor Processing Units (TPUs) are Google's custom-developed application-specific
 integrated circuits (ASICs) used to accelerate machine learning workloads. TPUs
@@ -30,11 +30,11 @@ For TPU pricing information, see [Cloud TPU pricing](https://cloud.google.com/tp
 You may need additional persistent storage for your TPU VMs. For more
 information, see [Storage options for Cloud TPU data](https://cloud.devsite.corp.google.com/tpu/docs/storage-options).
 
-:::{attention}
-There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
-:::
+!!! warning
+    There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
 
-## Requirements
+# --8<-- [end:installation]
+# --8<-- [start:requirements]
 
 - Google Cloud TPU VM
 - TPU versions: v6e, v5e, v5p, v4
@@ -51,10 +51,9 @@ When you request queued resources, the request is added to a queue maintained by
 the Cloud TPU service. When the requested resource becomes available, it's
 assigned to your Google Cloud project for your immediate exclusive use.
 
-:::{note}
-In all of the following commands, replace the ALL CAPS parameter names with
-appropriate values. See the parameter descriptions table for more information.
-:::
+!!! note
+    In all of the following commands, replace the ALL CAPS parameter names with
+    appropriate values. See the parameter descriptions table for more information.
 
 ### Provision Cloud TPUs with GKE
 
@@ -79,33 +78,16 @@ gcloud alpha compute tpus queued-resources create QUEUED_RESOURCE_ID \
 --service-account SERVICE_ACCOUNT
 ```
 
-:::{list-table} Parameter descriptions
-:header-rows: 1
-
-- * Parameter name
-  * Description
-- * QUEUED_RESOURCE_ID
-  * The user-assigned ID of the queued resource request.
-- * TPU_NAME
-  * The user-assigned name of the TPU which is created when the queued
-    resource request is allocated.
-- * PROJECT_ID
-  * Your Google Cloud project
-- * ZONE
-  * The GCP zone where you want to create your Cloud TPU. The value you use
-    depends on the version of TPUs you are using. For more information, see
-    `TPU regions and zones <https://cloud.google.com/tpu/docs/regions-zones>`_
-- * ACCELERATOR_TYPE
-  * The TPU version you want to use. Specify the TPU version, for example
-    `v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. For more information,
-    see [TPU versions](https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions).
-- * RUNTIME_VERSION
-  * The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images](https://cloud.google.com/tpu/docs/runtimes).
-- * SERVICE_ACCOUNT
-  * The email address for your service account. You can find it in the IAM
-    Cloud Console under *Service Accounts*. For example:
-    `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com`
-:::
+| Parameter name     | Description                                                                                                                                                                                              |
+|--------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| QUEUED_RESOURCE_ID | The user-assigned ID of the queued resource request.                                                                                                                                                     |
+| TPU_NAME           | The user-assigned name of the TPU which is created when the queued resource request is allocated. |
+| PROJECT_ID         | Your Google Cloud project                                                                                                                                                                                |
+| ZONE               | The GCP zone where you want to create your Cloud TPU. The value you use depends on the version of TPUs you are using. For more information, see [TPU regions and zones](https://cloud.google.com/tpu/docs/regions-zones). |
+| ACCELERATOR_TYPE   | The TPU version you want to use. Specify the TPU version, for example `v5litepod-4` specifies a v5e TPU with 4 cores, `v6e-1` specifies a v6e TPU with 1 core. For more information, see [TPU versions](https://cloud.devsite.corp.google.com/tpu/docs/system-architecture-tpu-vm#versions). |
+| RUNTIME_VERSION    | The TPU VM runtime version to use. For example, use `v2-alpha-tpuv6e` for a VM loaded with one or more v6e TPU(s). For more information see [TPU VM images](https://cloud.google.com/tpu/docs/runtimes). |
+| SERVICE_ACCOUNT    | The email address for your service account. You can find it in the IAM Cloud Console under *Service Accounts*. For example: `tpu-service-account@<your_project_ID>.iam.gserviceaccount.com` |
+  <figcaption>Parameter descriptions</figcaption>
 
 Connect to your TPU using SSH:
 
@@ -113,13 +94,16 @@ Connect to your TPU using SSH:
 gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE
 ```
 
-## Set up using Python
+# --8<-- [end:requirements]
+# --8<-- [start:set-up-using-python]
 
-### Pre-built wheels
+# --8<-- [end:set-up-using-python]
+# --8<-- [start:pre-built-wheels]
 
 Currently, there are no pre-built TPU wheels.
 
-### Build wheel from source
+# --8<-- [end:pre-built-wheels]
+# --8<-- [start:build-wheel-from-source]
 
 Install Miniconda:
 
@@ -161,13 +145,16 @@ Run the setup script:
 VLLM_TARGET_DEVICE="tpu" python -m pip install -e .
 ```
 
-## Set up using Docker
+# --8<-- [end:build-wheel-from-source]
+# --8<-- [start:set-up-using-docker]
 
-### Pre-built images
+# --8<-- [end:set-up-using-docker]
+# --8<-- [start:pre-built-images]
 
-See <project:#deployment-docker-pre-built-image> for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.
+See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image, making sure to substitute the image name `vllm/vllm-openai` with `vllm/vllm-tpu`.
 
-### Build image from source
+# --8<-- [end:pre-built-images]
+# --8<-- [start:build-image-from-source]
 
 You can use <gh-file:docker/Dockerfile.tpu> to build a Docker image with TPU support.
 
@@ -182,31 +169,30 @@ Run the Docker image with the following command:
 docker run --privileged --net host --shm-size=16G -it vllm-tpu
 ```
 
-:::{note}
-Since TPU relies on XLA which requires static shapes, vLLM bucketizes the
-possible input shapes and compiles an XLA graph for each shape. The
-compilation time may take 20~30 minutes in the first run. However, the
-compilation time reduces to ~5 minutes afterwards because the XLA graphs are
-cached in the disk (in {code}`VLLM_XLA_CACHE_PATH` or {code}`~/.cache/vllm/xla_cache` by default).
-:::
+!!! note
+    Since TPU relies on XLA which requires static shapes, vLLM bucketizes the
+    possible input shapes and compiles an XLA graph for each shape. The
+    compilation time may take 20~30 minutes in the first run. However, the
+    compilation time reduces to ~5 minutes afterwards because the XLA graphs are
+    cached on disk (in `VLLM_XLA_CACHE_PATH` or `~/.cache/vllm/xla_cache` by default).
 
-:::{tip}
-If you encounter the following error:
+!!! tip
+    If you encounter the following error:
 
-```console
-from torch._C import *  # noqa: F403
-ImportError: libopenblas.so.0: cannot open shared object file: No such
-file or directory
-```
-
-Install OpenBLAS with the following command:
+    ```console
+    from torch._C import *  # noqa: F403
+    ImportError: libopenblas.so.0: cannot open shared object file: No such
+    file or directory
+    ```
 
-```console
-sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
-```
+    Install OpenBLAS with the following command:
 
-:::
+    ```console
+    sudo apt-get install libopenblas-base libopenmpi-dev libomp-dev
+    ```
 
-## Extra information
+# --8<-- [end:build-image-from-source]
+# --8<-- [start:extra-information]
 
 There is no extra information for this device.
+# --8<-- [end:extra-information]
diff --git a/docs/source/getting_started/installation/cpu.md b/docs/getting_started/installation/cpu.md
similarity index 74%
rename from docs/source/getting_started/installation/cpu.md
rename to docs/getting_started/installation/cpu.md
index 2c0ec60d7100..18c96b264ad8 100644
--- a/docs/source/getting_started/installation/cpu.md
+++ b/docs/getting_started/installation/cpu.md
@@ -2,107 +2,47 @@
 
 vLLM is a Python library that supports the following CPU variants. Select your CPU type to see vendor-specific instructions:
 
-:::::{tab-set}
-:sync-group: device
+=== "Intel/AMD x86"
 
-::::{tab-item} Intel/AMD x86
-:selected:
-:sync: x86
+    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:installation"
 
-:::{include} cpu/x86.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
+=== "ARM AArch64"
 
-::::
+    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:installation"
 
-::::{tab-item} ARM AArch64
-:sync: arm
+=== "Apple silicon"
 
-:::{include} cpu/arm.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
+    --8<-- "docs/getting_started/installation/cpu/apple.inc.md:installation"
 
-::::
+=== "IBM Z (S390X)"
 
-::::{tab-item} Apple silicon
-:sync: apple
-
-:::{include} cpu/apple.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-
-::::
-
-::::{tab-item} IBM Z (S390X)
-:sync: s390x
-
-:::{include} cpu/s390x.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-
-::::
-
-:::::
+    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:installation"
 
 ## Requirements
 
 - Python: 3.9 -- 3.12
 
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} Intel/AMD x86
-:sync: x86
-
-:::{include} cpu/x86.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-:::
-
-::::
-
-::::{tab-item} ARM AArch64
-:sync: arm
-
-:::{include} cpu/arm.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-:::
+=== "Intel/AMD x86"
 
-::::
+    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:requirements"
 
-::::{tab-item} Apple silicon
-:sync: apple
+=== "ARM AArch64"
 
-:::{include} cpu/apple.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-:::
+    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:requirements"
 
-::::
+=== "Apple silicon"
 
-::::{tab-item} IBM Z (S390X)
-:sync: s390x
+    --8<-- "docs/getting_started/installation/cpu/apple.inc.md:requirements"
 
-:::{include} cpu/s390x.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-:::
+=== "IBM Z (S390X)"
 
-::::
-
-:::::
+    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:requirements"
 
 ## Set up using Python
 
 ### Create a new Python environment
 
-:::{include} python_env_setup.inc.md
-:::
+--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
 
 ### Pre-built wheels
 
@@ -110,69 +50,29 @@ Currently, there are no pre-built CPU wheels.
 
 ### Build wheel from source
 
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} Intel/AMD x86
-:sync: x86
-
-:::{include} cpu/x86.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-
-::::
-
-::::{tab-item} ARM AArch64
-:sync: arm
+=== "Intel/AMD x86"
 
-:::{include} cpu/arm.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
+    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:build-wheel-from-source"
 
-::::
+=== "ARM AArch64"
 
-::::{tab-item} Apple silicon
-:sync: apple
+    --8<-- "docs/getting_started/installation/cpu/arm.inc.md:build-wheel-from-source"
 
-:::{include} cpu/apple.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
+=== "Apple silicon"
 
-::::
+    --8<-- "docs/getting_started/installation/cpu/apple.inc.md:build-wheel-from-source"
 
-::::{tab-item} IBM Z (s390x)
-:sync: s390x
+=== "IBM Z (S390X)"
 
-:::{include} cpu/s390x.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-
-::::
-
-:::::
+    --8<-- "docs/getting_started/installation/cpu/s390x.inc.md:build-wheel-from-source"
 
 ## Set up using Docker
 
 ### Pre-built images
 
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} Intel/AMD x86
-:sync: x86
-
-:::{include} cpu/x86.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-
-::::
+=== "Intel/AMD x86"
 
-:::::
+    --8<-- "docs/getting_started/installation/cpu/x86.inc.md:pre-built-images"
 
 ### Build image from source
 
@@ -192,13 +92,11 @@ $ docker run --rm \
              other vLLM OpenAI server arguments
 ```
 
-::::{tip}
-For ARM or Apple silicon, use `docker/Dockerfile.arm`
-::::
+!!! tip
+    For ARM or Apple silicon, use `docker/Dockerfile.arm`
 
-::::{tip}
-For IBM Z (s390x), use `docker/Dockerfile.s390x` and in `docker run` use flag `--dtype float`
-::::
+!!! tip
+    For IBM Z (s390x), use `docker/Dockerfile.s390x` and in `docker run` use flag `--dtype float`
 
 ## Supported features
 
diff --git a/docs/source/getting_started/installation/cpu/apple.inc.md b/docs/getting_started/installation/cpu/apple.inc.md
similarity index 58%
rename from docs/source/getting_started/installation/cpu/apple.inc.md
rename to docs/getting_started/installation/cpu/apple.inc.md
index 7bc9e85ecd96..7a91e3ce5e5b 100644
--- a/docs/source/getting_started/installation/cpu/apple.inc.md
+++ b/docs/getting_started/installation/cpu/apple.inc.md
@@ -1,24 +1,27 @@
-# Installation
+# --8<-- [start:installation]
 
 vLLM has experimental support for macOS with Apple silicon. For now, users must build vLLM from source to run natively on macOS.
 
 Currently the CPU implementation for macOS supports FP32 and FP16 datatypes.
 
-:::{attention}
-There are no pre-built wheels or images for this device, so you must build vLLM from source.
-:::
+!!! warning
+    There are no pre-built wheels or images for this device, so you must build vLLM from source.
 
-## Requirements
+# --8<-- [end:installation]
+# --8<-- [start:requirements]
 
 - OS: `macOS Sonoma` or later
 - SDK: `XCode 15.4` or later with Command Line Tools
 - Compiler: `Apple Clang >= 15.0.0`
 
-## Set up using Python
+# --8<-- [end:requirements]
+# --8<-- [start:set-up-using-python]
 
-### Pre-built wheels
+# --8<-- [end:set-up-using-python]
+# --8<-- [start:pre-built-wheels]
 
-### Build wheel from source
+# --8<-- [end:pre-built-wheels]
+# --8<-- [start:build-wheel-from-source]
 
 After installation of XCode and the Command Line Tools, which include Apple Clang, execute the following commands to build and install vLLM from the source.
 
@@ -29,9 +32,8 @@ pip install -r requirements/cpu.txt
 pip install -e . 
 ```
 
-:::{note}
-On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
-:::
+!!! note
+    On macOS the `VLLM_TARGET_DEVICE` is automatically set to `cpu`, which currently is the only supported device.
 
 #### Troubleshooting
 
@@ -51,10 +53,15 @@ If the build has error like the following snippet where standard C++ headers can
       1 error generated.
 ```
 
-## Set up using Docker
+# --8<-- [end:build-wheel-from-source]
+# --8<-- [start:set-up-using-docker]
 
-### Pre-built images
+# --8<-- [end:set-up-using-docker]
+# --8<-- [start:pre-built-images]
 
-### Build image from source
+# --8<-- [end:pre-built-images]
+# --8<-- [start:build-image-from-source]
 
-## Extra information
+# --8<-- [end:build-image-from-source]
+# --8<-- [start:extra-information]
+# --8<-- [end:extra-information]
diff --git a/docs/getting_started/installation/cpu/arm.inc.md b/docs/getting_started/installation/cpu/arm.inc.md
new file mode 100644
index 000000000000..59b71dcaf911
--- /dev/null
+++ b/docs/getting_started/installation/cpu/arm.inc.md
@@ -0,0 +1,41 @@
+# --8<-- [start:installation]
+
+vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform.
+
+ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
+
+!!! warning
+    There are no pre-built wheels or images for this device, so you must build vLLM from source.
+
+# --8<-- [end:installation]
+# --8<-- [start:requirements]
+
+- OS: Linux
+- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
+- Instruction Set Architecture (ISA): NEON support is required
+
+# --8<-- [end:requirements]
+# --8<-- [start:set-up-using-python]
+
+# --8<-- [end:set-up-using-python]
+# --8<-- [start:pre-built-wheels]
+
+# --8<-- [end:pre-built-wheels]
+# --8<-- [start:build-wheel-from-source]
+
+--8<-- "docs/getting_started/installation/cpu/build.inc.md"
+
+Testing has been conducted on AWS Graviton3 instances for compatibility.
+
+# --8<-- [end:build-wheel-from-source]
+# --8<-- [start:set-up-using-docker]
+
+# --8<-- [end:set-up-using-docker]
+# --8<-- [start:pre-built-images]
+
+# --8<-- [end:pre-built-images]
+# --8<-- [start:build-image-from-source]
+
+# --8<-- [end:build-image-from-source]
+# --8<-- [start:extra-information]
+# --8<-- [end:extra-information]
diff --git a/docs/source/getting_started/installation/cpu/build.inc.md b/docs/getting_started/installation/cpu/build.inc.md
similarity index 96%
rename from docs/source/getting_started/installation/cpu/build.inc.md
rename to docs/getting_started/installation/cpu/build.inc.md
index f385f3d5b198..7d6472afa7ea 100644
--- a/docs/source/getting_started/installation/cpu/build.inc.md
+++ b/docs/getting_started/installation/cpu/build.inc.md
@@ -32,3 +32,5 @@ If you want to develop vllm, install it in editable mode instead.
 ```console
 VLLM_TARGET_DEVICE=cpu python setup.py develop
 ```
+
+# --8<-- [end:extra-information]
diff --git a/docs/source/getting_started/installation/cpu/s390x.inc.md b/docs/getting_started/installation/cpu/s390x.inc.md
similarity index 64%
rename from docs/source/getting_started/installation/cpu/s390x.inc.md
rename to docs/getting_started/installation/cpu/s390x.inc.md
index 9b41173b44ce..670485feefb6 100644
--- a/docs/source/getting_started/installation/cpu/s390x.inc.md
+++ b/docs/getting_started/installation/cpu/s390x.inc.md
@@ -1,25 +1,28 @@
-# Installation
+# --8<-- [start:installation]
 
 vLLM has experimental support for the s390x architecture on the IBM Z platform. For now, users must build vLLM from source to run natively on the IBM Z platform.
 
 Currently the CPU implementation for s390x architecture supports FP32 datatype only.
 
-:::{attention}
-There are no pre-built wheels or images for this device, so you must build vLLM from source.
-:::
+!!! warning
+    There are no pre-built wheels or images for this device, so you must build vLLM from source.
 
-## Requirements
+# --8<-- [end:installation]
+# --8<-- [start:requirements]
 
 - OS: `Linux`
 - SDK: `gcc/g++ >= 12.3.0` or later with Command Line Tools
 - Instruction Set Architecture (ISA): VXE support is required. Works with Z14 and above.
 - Build install python packages: `pyarrow`, `torch` and `torchvision`
 
-## Set up using Python
+# --8<-- [end:requirements]
+# --8<-- [start:set-up-using-python]
 
-### Pre-built wheels
+# --8<-- [end:set-up-using-python]
+# --8<-- [start:pre-built-wheels]
 
-### Build wheel from source
+# --8<-- [end:pre-built-wheels]
+# --8<-- [start:build-wheel-from-source]
 
 Install the following packages from the package manager before building vLLM. For example on RHEL 9.4:
 
@@ -39,9 +42,8 @@ curl https://sh.rustup.rs -sSf | sh -s -- -y && \
 
 Execute the following commands to build and install vLLM from the source.
 
-::::{tip}
-Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM.
-::::
+!!! tip
+    Please build the following dependencies, `torchvision`, `pyarrow` from the source before building vLLM.
 
 ```console
     sed -i '/^torch/d' requirements-build.txt    # remove torch from requirements-build.txt since we use nightly builds
@@ -53,10 +55,15 @@ Please build the following dependencies, `torchvision`, `pyarrow` from the sourc
     pip install dist/*.whl
 ```
 
-## Set up using Docker
+# --8<-- [end:build-wheel-from-source]
+# --8<-- [start:set-up-using-docker]
 
-### Pre-built images
+# --8<-- [end:set-up-using-docker]
+# --8<-- [start:pre-built-images]
 
-### Build image from source
+# --8<-- [end:pre-built-images]
+# --8<-- [start:build-image-from-source]
 
-## Extra information
+# --8<-- [end:build-image-from-source]
+# --8<-- [start:extra-information]
+# --8<-- [end:extra-information]
diff --git a/docs/getting_started/installation/cpu/x86.inc.md b/docs/getting_started/installation/cpu/x86.inc.md
new file mode 100644
index 000000000000..9434eeea8b4a
--- /dev/null
+++ b/docs/getting_started/installation/cpu/x86.inc.md
@@ -0,0 +1,46 @@
+# --8<-- [start:installation]
+
+vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16.
+
+!!! warning
+    There are no pre-built wheels or images for this device, so you must build vLLM from source.
+
+# --8<-- [end:installation]
+# --8<-- [start:requirements]
+
+- OS: Linux
+- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
+- Instruction Set Architecture (ISA): AVX512 (optional, recommended)
+
+!!! tip
+    [Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
+
+# --8<-- [end:requirements]
+# --8<-- [start:set-up-using-python]
+
+# --8<-- [end:set-up-using-python]
+# --8<-- [start:pre-built-wheels]
+
+# --8<-- [end:pre-built-wheels]
+# --8<-- [start:build-wheel-from-source]
+
+--8<-- "docs/getting_started/installation/cpu/build.inc.md"
+
+!!! note
+    - AVX512_BF16 is an ISA extension that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
+    - If you want to force-enable AVX512_BF16 for cross-compilation, please set the environment variable `VLLM_CPU_AVX512BF16=1` before building.
+
+# --8<-- [end:build-wheel-from-source]
+# --8<-- [start:set-up-using-docker]
+
+# --8<-- [end:set-up-using-docker]
+# --8<-- [start:pre-built-images]
+
+See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
+
+# --8<-- [end:pre-built-images]
+# --8<-- [start:build-image-from-source]
+
+# --8<-- [end:build-image-from-source]
+# --8<-- [start:extra-information]
+# --8<-- [end:extra-information]
diff --git a/docs/source/getting_started/installation/device.template.md b/docs/getting_started/installation/device.template.md
similarity index 100%
rename from docs/source/getting_started/installation/device.template.md
rename to docs/getting_started/installation/device.template.md
diff --git a/docs/getting_started/installation/gpu.md b/docs/getting_started/installation/gpu.md
new file mode 100644
index 000000000000..3c983f600673
--- /dev/null
+++ b/docs/getting_started/installation/gpu.md
@@ -0,0 +1,124 @@
+# GPU
+
+vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor-specific instructions:
+
+=== "NVIDIA CUDA"
+
+    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:installation"
+
+=== "AMD ROCm"
+
+    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:installation"
+
+=== "Intel XPU"
+
+    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:installation"
+
+## Requirements
+
+- OS: Linux
+- Python: 3.9 -- 3.12
+
+=== "NVIDIA CUDA"
+
+    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:requirements"
+
+=== "AMD ROCm"
+
+    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:requirements"
+
+=== "Intel XPU"
+
+    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:requirements"
+
+## Set up using Python
+
+### Create a new Python environment
+
+--8<-- "docs/getting_started/installation/python_env_setup.inc.md"
+
+=== "NVIDIA CUDA"
+
+    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:create-a-new-python-environment"
+
+=== "AMD ROCm"
+
+    There is no extra information on creating a new Python environment for this device.
+
+=== "Intel XPU"
+
+    There is no extra information on creating a new Python environment for this device.
+
+### Pre-built wheels
+
+=== "NVIDIA CUDA"
+
+    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-wheels"
+
+=== "AMD ROCm"
+
+    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-wheels"
+
+=== "Intel XPU"
+
+    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-wheels"
+
+[](){ #build-from-source }
+
+### Build wheel from source
+
+=== "NVIDIA CUDA"
+
+    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-wheel-from-source"
+
+=== "AMD ROCm"
+
+    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-wheel-from-source"
+
+=== "Intel XPU"
+
+    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-wheel-from-source"
+
+## Set up using Docker
+
+### Pre-built images
+
+=== "NVIDIA CUDA"
+
+    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:pre-built-images"
+
+=== "AMD ROCm"
+
+    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:pre-built-images"
+
+=== "Intel XPU"
+
+    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:pre-built-images"
+
+### Build image from source
+
+=== "NVIDIA CUDA"
+
+    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:build-image-from-source"
+
+=== "AMD ROCm"
+
+    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:build-image-from-source"
+
+=== "Intel XPU"
+
+    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:build-image-from-source"
+
+## Supported features
+
+=== "NVIDIA CUDA"
+
+    --8<-- "docs/getting_started/installation/gpu/cuda.inc.md:supported-features"
+
+=== "AMD ROCm"
+
+    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:supported-features"
+
+=== "Intel XPU"
+
+    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:supported-features"
diff --git a/docs/source/getting_started/installation/gpu/cuda.inc.md b/docs/getting_started/installation/gpu/cuda.inc.md
similarity index 62%
rename from docs/source/getting_started/installation/gpu/cuda.inc.md
rename to docs/getting_started/installation/gpu/cuda.inc.md
index 06915f09dd51..64dccef63d73 100644
--- a/docs/source/getting_started/installation/gpu/cuda.inc.md
+++ b/docs/getting_started/installation/gpu/cuda.inc.md
@@ -1,43 +1,52 @@
-# Installation
+# --8<-- [start:installation]
 
-vLLM contains pre-compiled C++ and CUDA (12.6) binaries.
+vLLM contains pre-compiled C++ and CUDA (12.8) binaries.
 
-## Requirements
+# --8<-- [end:installation]
+# --8<-- [start:requirements]
 
 - GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
 
-## Set up using Python
+# --8<-- [end:requirements]
+# --8<-- [start:set-up-using-python]
 
 ### Create a new Python environment
 
+# --8<-- [end:set-up-using-python]
+# --8<-- [start:create-a-new-python-environment]
+
-:::{note}
-PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
-:::
+!!! note
+    PyTorch installed via `conda` will statically link `NCCL` library, which can cause issues when vLLM tries to use `NCCL`. See <gh-issue:8420> for more details.
 
 In order to be performant, vLLM has to compile many cuda kernels. The compilation unfortunately introduces binary incompatibility with other CUDA versions and PyTorch versions, even for the same PyTorch version with different building configurations.
 
-Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below](#build-from-source) for more details.
+Therefore, it is recommended to install vLLM with a **fresh new** environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See [below][build-from-source] for more details.
 
-### Pre-built wheels
+# --8<-- [end:create-a-new-python-environment]
+# --8<-- [start:pre-built-wheels]
 
 You can install vLLM using either `pip` or `uv pip`:
 
 ```console
-# Install vLLM with CUDA 12.6.
-pip install vllm # If you are using pip.
-uv pip install vllm # If you are using uv.
+# Install vLLM with CUDA 12.8.
+# If you are using pip.
+pip install vllm --extra-index-url https://download.pytorch.org/whl/cu128
+# If you are using uv.
+uv pip install vllm --torch-backend=auto
 ```
 
-As of now, vLLM's binaries are compiled with CUDA 12.6 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.8, 11.8, and public PyTorch release versions:
+We recommend leveraging `uv` to [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`). If this doesn't work, try running `uv self update` to update `uv` first.
+
+!!! note
+    NVIDIA Blackwell GPUs (B200, GB200) require a minimum of CUDA 12.8, so make sure you are installing PyTorch wheels with at least that version. PyTorch itself offers a [dedicated interface](https://pytorch.org/get-started/locally/) to determine the appropriate pip command to run for a given target configuration.
+
+As of now, vLLM's binaries are compiled with CUDA 12.8 and public PyTorch release versions by default. We also provide vLLM binaries compiled with CUDA 12.6, 11.8, and public PyTorch release versions:
 
 ```console
 # Install vLLM with CUDA 11.8.
 export VLLM_VERSION=0.6.1.post1
-export PYTHON_VERSION=310
-pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
+export PYTHON_VERSION=312
+uv pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
 ```
 
-(install-the-latest-code)=
+[](){ #install-the-latest-code }
 
 #### Install the latest code
 
@@ -46,40 +55,47 @@ LLM inference is a fast-evolving field, and the latest code may contain bug fixe
 ##### Install the latest code using `pip`
 
 ```console
-pip install -U vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
+pip install -U vllm \
+    --pre \
+    --extra-index-url https://wheels.vllm.ai/nightly
 ```
 
 `--pre` is required for `pip` to consider pre-released versions.
 
-If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL:
+Another way to install the latest code is to use `uv`:
 
 ```console
-export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
-pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+uv pip install -U vllm \
+    --torch-backend=auto \
+    --extra-index-url https://wheels.vllm.ai/nightly
 ```
 
-Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
-
-##### Install the latest code using `uv`
+##### Install specific revisions using `pip`
 
-Another way to install the latest code is to use `uv`:
+If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), due to the limitation of `pip`, you have to specify the full URL of the wheel file by embedding the commit hash in the URL:
 
 ```console
-uv pip install -U vllm --extra-index-url https://wheels.vllm.ai/nightly
+export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+pip install https://wheels.vllm.ai/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
 ```
 
+Note that the wheels are built with Python 3.8 ABI (see [PEP 425](https://peps.python.org/pep-0425/) for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (`1.0.0.dev`) is just a placeholder to have a unified URL for the wheels, the actual versions of wheels are contained in the wheel metadata (the wheels listed in the extra index url have correct versions). Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before.
+
 ##### Install specific revisions using `uv`
 
 If you want to access the wheels for previous commits (e.g. to bisect the behavior change, performance regression), you can specify the commit hash in the URL:
 
 ```console
 export VLLM_COMMIT=72d9c316d3f6ede485146fe5aabd4e61dbc59069 # use full commit hash from the main branch
-uv pip install vllm --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
+uv pip install vllm \
+    --torch-backend=auto \
+    --extra-index-url https://wheels.vllm.ai/${VLLM_COMMIT}
 ```
 
 The `uv` approach works for vLLM `v0.6.6` and later and offers an easy-to-remember command. A unique feature of `uv` is that packages in `--extra-index-url` have [higher priority than the default index](https://docs.astral.sh/uv/pip/compatibility/#packages-that-exist-on-multiple-indexes). If the latest public release is `v0.6.6.post1`, `uv`'s behavior allows installing a commit before `v0.6.6.post1` by specifying the `--extra-index-url`. In contrast, `pip` combines packages from `--extra-index-url` and the default index, choosing only the latest version, which makes it difficult to install a development version prior to the released version.
 
-### Build wheel from source
+# --8<-- [end:pre-built-wheels]
+# --8<-- [start:build-wheel-from-source]
 
 #### Set up using Python-only build (without compilation)
 
@@ -92,15 +108,15 @@ VLLM_USE_PRECOMPILED=1 pip install --editable .
 ```
 
 This command will do the following:
+
 1. Look for the current branch in your vLLM clone.
-2. Identify the corresponding base commit in the main branch.
-3. Download the pre-built wheel of the base commit.
-4. Use its compiled libraries in the installation.
+1. Identify the corresponding base commit in the main branch.
+1. Download the pre-built wheel of the base commit.
+1. Use its compiled libraries in the installation.
 
-:::{note}
-1. If you change C++ or kernel code, you cannot use Python-only build; otherwise you will see an import error about library not found or undefined symbol.
-2. If you rebase your dev branch, it is recommended to uninstall vllm and re-run the above command to make sure your libraries are up to date.
-:::
+!!! note
+    1. If you change C++ or kernel code, you cannot use Python-only build; otherwise you will see an import error about library not found or undefined symbol.
+    2. If you rebase your dev branch, it is recommended to uninstall vllm and re-run the above command to make sure your libraries are up to date.
 
 In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the main branch was just merged and the wheel is being built. In this case, you can wait for around an hour to try again, or manually assign the previous commit in the installation using the `VLLM_PRECOMPILED_WHEEL_LOCATION` environment variable.
 
@@ -110,12 +126,11 @@ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://wheels.vllm.ai/${VLLM_COMMIT}/vll
 pip install --editable .
 ```
 
-You can find more information about vLLM's wheels in <project:#install-the-latest-code>.
+You can find more information about vLLM's wheels in [Install the latest code][install-the-latest-code].
 
-:::{note}
-There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
-It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to <project:#install-the-latest-code> for instructions on how to install a specified wheel.
-:::
+!!! note
+    There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
+    It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to [Install the latest code][install-the-latest-code] for instructions on how to install a specified wheel.
 
 #### Full build (with compilation)
 
@@ -127,17 +142,16 @@ cd vllm
 pip install -e .
 ```
 
-:::{tip}
-Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
+!!! tip
+    Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
 
-For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` .
-As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
+    For example, you can install [ccache](https://github.com/ccache/ccache) using `conda install ccache` or `apt install ccache` .
+    As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
 
-When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built.
+    When using `ccache` with `pip install -e .`, you should run `CCACHE_NOHASHDIR="true" pip install --no-build-isolation -e .`. This is because `pip` creates a new folder with a random name for each build, preventing `ccache` from recognizing that the same files are being built.
 
-[sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
-The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
-:::
+    [sccache](https://github.com/mozilla/sccache) works similarly to `ccache`, but has the capability to utilize caching in remote storage environments.
+    The following environment variables can be set to configure the vLLM `sccache` remote: `SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1`. We also recommend setting `SCCACHE_IDLE_TIMEOUT=0`.
 
 ##### Use an existing PyTorch installation
 
@@ -184,7 +198,11 @@ Additionally, if you have trouble building vLLM, we recommend using the NVIDIA P
 
 ```console
 # Use `--ipc=host` to make sure the shared memory is large enough.
-docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
+docker run \
+    --gpus all \
+    -it \
+    --rm \
+    --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
 ```
 
 If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from [the official website](https://developer.nvidia.com/cuda-toolkit-archive). After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.:
@@ -212,11 +230,13 @@ export VLLM_TARGET_DEVICE=empty
 pip install -e .
 ```
 
-## Set up using Docker
+# --8<-- [end:build-wheel-from-source]
+# --8<-- [start:set-up-using-docker]
 
-### Pre-built images
+# --8<-- [end:set-up-using-docker]
+# --8<-- [start:pre-built-images]
 
-See <project:#deployment-docker-pre-built-image> for instructions on using the official Docker image.
+See [Pre-built Docker image][deployment-docker-pre-built-image] for instructions on using the official Docker image.
 
 Another way to access the latest code is to use the docker images:
 
@@ -229,10 +249,12 @@ These docker images are used for CI and testing only, and they are not intended
 
 The latest code can contain bugs and may not be stable. Please use it with caution.
 
-### Build image from source
+# --8<-- [end:pre-built-images]
+# --8<-- [start:build-image-from-source]
 
-See <project:#deployment-docker-build-image-from-source> for instructions on building the Docker image.
+See [Build Docker image from source][deployment-docker-build-image-from-source] for instructions on building the Docker image.
 
 ## Supported features
 
-See <project:#feature-x-hardware> compatibility matrix for feature support information.
+See the [Feature x Hardware][feature-x-hardware] compatibility matrix for feature support information.
+# --8<-- [end:extra-information]
diff --git a/docs/source/getting_started/installation/gpu/rocm.inc.md b/docs/getting_started/installation/gpu/rocm.inc.md
similarity index 66%
rename from docs/source/getting_started/installation/gpu/rocm.inc.md
rename to docs/getting_started/installation/gpu/rocm.inc.md
index dc74368fe2c9..0029b3a24496 100644
--- a/docs/source/getting_started/installation/gpu/rocm.inc.md
+++ b/docs/getting_started/installation/gpu/rocm.inc.md
@@ -1,28 +1,31 @@
-# Installation
+# --8<-- [start:installation]
 
 vLLM supports AMD GPUs with ROCm 6.3.
 
-:::{attention}
-There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
-:::
+!!! warning
+    There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
 
-## Requirements
+# --8<-- [end:installation]
+# --8<-- [start:requirements]
 
 - GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100/1101), Radeon RX 9000 series (gfx1200/1201)
 - ROCm 6.3
 
-## Set up using Python
+# --8<-- [end:requirements]
+# --8<-- [start:set-up-using-python]
 
-### Pre-built wheels
+# --8<-- [end:set-up-using-python]
+# --8<-- [start:pre-built-wheels]
 
 Currently, there are no pre-built ROCm wheels.
 
-### Build wheel from source
+# --8<-- [end:pre-built-wheels]
+# --8<-- [start:build-wheel-from-source]
 
 0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
 
-- [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html)
-- [PyTorch](https://pytorch.org/)
+    - [ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html)
+    - [PyTorch](https://pytorch.org/)
 
     For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.3_ubuntu24.04_py3.12_pytorch_release_2.4.0`, `rocm/pytorch-nightly`. If you are using docker image, you can skip to Step 3.
 
@@ -49,9 +52,8 @@ Currently, there are no pre-built ROCm wheels.
     cd ../..
     ```
 
-    :::{note}
-    If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
-    :::
+    !!! note
+        If you see HTTP issue related to downloading packages during building triton, please try again as the HTTP error is intermittent.
 
 2. Optionally, if you choose to use CK flash attention, you can install [flash attention for ROCm](https://github.com/ROCm/flash-attention)
 
@@ -69,9 +71,8 @@ Currently, there are no pre-built ROCm wheels.
     cd ..
     ```
 
-    :::{note}
-    You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
-    :::
+    !!! note
+        You might need to downgrade the "ninja" version to 1.10 as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
 
 3. If you choose to build AITER yourself to use a certain branch or commit, you can build AITER using the following steps:
 
@@ -84,55 +85,56 @@ Currently, there are no pre-built ROCm wheels.
     python3 setup.py develop
     ```
 
-    :::{note}
-    You will need to config the `$AITER_BRANCH_OR_COMMIT` for your purpose.
-    :::
+    !!! note
+        You will need to set `$AITER_BRANCH_OR_COMMIT` to the branch or commit you want to build.
 
 4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps:
 
     ```bash
-    $ pip install --upgrade pip
+    pip install --upgrade pip
 
     # Build & install AMD SMI
-    $ pip install /opt/rocm/share/amd_smi
+    pip install /opt/rocm/share/amd_smi
 
     # Install dependencies
-    $ pip install --upgrade numba scipy huggingface-hub[cli,hf_transfer] setuptools_scm
-    $ pip install "numpy<2"
-    $ pip install -r requirements/rocm.txt
+    pip install --upgrade numba \
+        scipy \
+        huggingface-hub[cli,hf_transfer] \
+        setuptools_scm
+    pip install "numpy<2"
+    pip install -r requirements/rocm.txt
 
     # Build vLLM for MI210/MI250/MI300.
-    $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
-    $ python3 setup.py develop
+    export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+    python3 setup.py develop
     ```
 
     This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
 
-    :::{tip}
-   - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
-   - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
-   - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
-   - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
-    :::
+    !!! tip
+        - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
+        - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
+        - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
+        - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
 
-:::{tip}
-- For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
-  For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
-:::
+!!! tip
+    - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
+      For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
 
 ## Set up using Docker (Recommended)
 
-### Pre-built images
+# --8<-- [end:set-up-using-docker]
+# --8<-- [start:pre-built-images]
 
 The [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
 docker image designed for validating inference performance on the AMD Instinct™ MI300X accelerator.
 
-:::{tip}
-Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
-for instructions on how to use this prebuilt docker image.
-:::
+!!! tip
+    Please check [LLM inference performance validation on AMD Instinct MI300X](https://rocm.docs.amd.com/en/latest/how-to/performance-validation/mi300x/vllm-benchmark.html)
+    for instructions on how to use this prebuilt docker image.
 
-### Build image from source
+# --8<-- [end:pre-built-images]
+# --8<-- [start:build-image-from-source]
 
 Building the Docker image from source is the recommended way to use vLLM with ROCm.
 
@@ -155,7 +157,9 @@ It is important that the user kicks off the docker build using buildkit. Either
 To build vllm on ROCm 6.3 for MI200 and MI300 series, you can use the default:
 
 ```console
-DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm_base -t rocm/vllm-dev:base .
+DOCKER_BUILDKIT=1 docker build \
+    -f docker/Dockerfile.rocm_base \
+    -t rocm/vllm-dev:base .
 ```
 
 #### Build an image with vLLM
@@ -190,7 +194,11 @@ DOCKER_BUILDKIT=1 docker build -f docker/Dockerfile.rocm -t vllm-rocm .
 To build vllm on ROCm 6.3 for Radeon RX7900 series (gfx1100), you should pick the alternative base image:
 
 ```console
-DOCKER_BUILDKIT=1 docker build --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" -f docker/Dockerfile.rocm -t vllm-rocm .
+DOCKER_BUILDKIT=1 docker build \
+    --build-arg BASE_IMAGE="rocm/vllm-dev:navi_base" \
+    -f docker/Dockerfile.rocm \
+    -t vllm-rocm \
+    .
 ```
 
 To run the above docker image `vllm-rocm`, use the below command:
@@ -213,4 +221,5 @@ Where the `<path/to/model>` is the location where the model is stored, for examp
 
 ## Supported features
 
-See <project:#feature-x-hardware> compatibility matrix for feature support information.
+See the [Feature x Hardware][feature-x-hardware] compatibility matrix for feature support information.
+# --8<-- [end:extra-information]
diff --git a/docs/source/getting_started/installation/gpu/xpu.inc.md b/docs/getting_started/installation/gpu/xpu.inc.md
similarity index 67%
rename from docs/source/getting_started/installation/gpu/xpu.inc.md
rename to docs/getting_started/installation/gpu/xpu.inc.md
index 4ab41a21c2a1..bee9a7ebb717 100644
--- a/docs/source/getting_started/installation/gpu/xpu.inc.md
+++ b/docs/getting_started/installation/gpu/xpu.inc.md
@@ -1,23 +1,26 @@
-# Installation
+# --8<-- [start:installation]
 
 vLLM initially supports basic model inference and serving on Intel GPU platform.
 
-:::{attention}
-There are no pre-built wheels or images for this device, so you must build vLLM from source.
-:::
+!!! warning
+    There are no pre-built wheels or images for this device, so you must build vLLM from source.
 
-## Requirements
+# --8<-- [end:installation]
+# --8<-- [start:requirements]
 
 - Supported Hardware: Intel Data Center GPU, Intel ARC GPU
 - OneAPI requirements: oneAPI 2025.0
 
-## Set up using Python
+# --8<-- [end:requirements]
+# --8<-- [start:set-up-using-python]
 
-### Pre-built wheels
+# --8<-- [end:set-up-using-python]
+# --8<-- [start:pre-built-wheels]
 
 Currently, there are no pre-built XPU wheels.
 
-### Build wheel from source
+# --8<-- [end:pre-built-wheels]
+# --8<-- [start:build-wheel-from-source]
 
 - First, install required driver and Intel OneAPI 2025.0 or later.
 - Second, install Python packages for vLLM XPU backend building:
@@ -35,18 +38,20 @@ pip install -v -r requirements/xpu.txt
 VLLM_TARGET_DEVICE=xpu python setup.py install
 ```
 
-:::{note}
-- FP16 is the default data type in the current XPU backend. The BF16 data
-  type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet.
-:::
+!!! note
+    - FP16 is the default data type in the current XPU backend. The BF16 data
+      type is supported on Intel Data Center GPU, not supported on Intel Arc GPU yet.
 
-## Set up using Docker
+# --8<-- [end:build-wheel-from-source]
+# --8<-- [start:set-up-using-docker]
 
-### Pre-built images
+# --8<-- [end:set-up-using-docker]
+# --8<-- [start:pre-built-images]
 
 Currently, there are no pre-built XPU images.
 
-### Build image from source
+# --8<-- [end:pre-built-images]
+# --8<-- [start:build-image-from-source]
 
 ```console
 $ docker build -f docker/Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
@@ -66,7 +71,6 @@ XPU platform supports **tensor parallel** inference/serving and also supports **
 python -m vllm.entrypoints.openai.api_server \
      --model=facebook/opt-13b \
      --dtype=bfloat16 \
-     --device=xpu \
      --max_model_len=1024 \
      --distributed-executor-backend=ray \
      --pipeline-parallel-size=2 \
@@ -74,3 +78,4 @@ python -m vllm.entrypoints.openai.api_server \
 ```
 
 By default, a ray instance will be launched automatically if no existing one is detected in the system, with `num-gpus` equals to `parallel_config.world_size`. We recommend properly starting a ray cluster before execution, referring to the <gh-file:examples/online_serving/run_cluster.sh> helper script.
+# --8<-- [end:extra-information]
diff --git a/docs/getting_started/installation/python_env_setup.inc.md b/docs/getting_started/installation/python_env_setup.inc.md
new file mode 100644
index 000000000000..911301d68335
--- /dev/null
+++ b/docs/getting_started/installation/python_env_setup.inc.md
@@ -0,0 +1,6 @@
+It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python environment manager, to create and manage Python environments. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following commands:
+
+```console
+uv venv --python 3.12 --seed
+source .venv/bin/activate
+```
diff --git a/docs/source/getting_started/quickstart.md b/docs/getting_started/quickstart.md
similarity index 68%
rename from docs/source/getting_started/quickstart.md
rename to docs/getting_started/quickstart.md
index 298ba59f7d8b..d24e75e8141d 100644
--- a/docs/source/getting_started/quickstart.md
+++ b/docs/getting_started/quickstart.md
@@ -1,11 +1,12 @@
-(quickstart)=
-
-# Quickstart
+---
+title: Quickstart
+---
+[](){ #quickstart }
 
 This guide will help you quickly get started with vLLM to perform:
 
-- [Offline batched inference](#quickstart-offline)
-- [Online serving using OpenAI-compatible server](#quickstart-online)
+- [Offline batched inference][quickstart-offline]
+- [Online serving using OpenAI-compatible server][quickstart-online]
 
 ## Prerequisites
 
@@ -21,48 +22,49 @@ It's recommended to use [uv](https://docs.astral.sh/uv/), a very fast Python env
 ```console
 uv venv --python 3.12 --seed
 source .venv/bin/activate
-uv pip install vllm
+uv pip install vllm --torch-backend=auto
 ```
 
-Another delightful way is to use `uv run` with `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating an environment:
+`uv` can [automatically select the appropriate PyTorch index at runtime](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection) by inspecting the installed CUDA driver version via `--torch-backend=auto` (or `UV_TORCH_BACKEND=auto`). To select a specific backend (e.g., `cu126`), set `--torch-backend=cu126` (or `UV_TORCH_BACKEND=cu126`).
+
+Another delightful way is to use `uv run` with the `--with [dependency]` option, which allows you to run commands such as `vllm serve` without creating any permanent environment:
 
 ```console
 uv run --with vllm vllm --help
 ```
 
-You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments.
+You can also use [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) to create and manage Python environments. You can install `uv` into the conda environment through `pip` if you want to manage it within the environment.
 
 ```console
 conda create -n myenv python=3.12 -y
 conda activate myenv
-pip install vllm
+pip install --upgrade uv
+uv pip install vllm --torch-backend=auto
 ```
 
-:::{note}
-For non-CUDA platforms, please refer [here](#installation-index) for specific instructions on how to install vLLM.
-:::
+!!! note
+    For more details, and for non-CUDA platforms, please see [here][installation-index] for specific instructions on how to install vLLM.
 
-(quickstart-offline)=
+[](){ #quickstart-offline }
 
 ## Offline Batched Inference
 
 With vLLM installed, you can start generating texts for list of input prompts (i.e. offline batch inferencing). See the example script: <gh-file:examples/offline_inference/basic/basic.py>
 
-The first line of this example imports the classes {class}`~vllm.LLM` and {class}`~vllm.SamplingParams`:
+The first line of this example imports the classes [LLM][vllm.LLM] and [SamplingParams][vllm.SamplingParams]:
 
-- {class}`~vllm.LLM` is the main class for running offline inference with vLLM engine.
-- {class}`~vllm.SamplingParams` specifies the parameters for the sampling process.
+- [LLM][vllm.LLM] is the main class for running offline inference with vLLM engine.
+- [SamplingParams][vllm.SamplingParams] specifies the parameters for the sampling process.
 
 ```python
 from vllm import LLM, SamplingParams
 ```
 
-The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params).
-:::{important}
-By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if {class}`~vllm.SamplingParams` is not specified.
+The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here][sampling-params].
+!!! warning
+    By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified.
 
-However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance.
-:::
+    However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance.
 
 ```python
 prompts = [
@@ -74,15 +76,18 @@ prompts = [
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 ```
 
-The {class}`~vllm.LLM` class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found [here](#supported-models).
+The [LLM][vllm.LLM] class initializes vLLM's engine and the [OPT-125M model](https://arxiv.org/abs/2205.01068) for offline inference. The list of supported models can be found [here][supported-models].
 
 ```python
 llm = LLM(model="facebook/opt-125m")
 ```
 
-:::{note}
-By default, vLLM downloads models from [Hugging Face](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
-:::
+!!! note
+    By default, vLLM downloads models from [Hugging Face](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
+
+    ```shell
+    export VLLM_USE_MODELSCOPE=True
+    ```
 
 Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens.
 
@@ -95,7 +100,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-(quickstart-online)=
+[](){ #quickstart-online }
 
 ## OpenAI-Compatible Server
 
@@ -108,15 +113,13 @@ Run the following command to start the vLLM server with the [Qwen2.5-1.5B-Instru
 vllm serve Qwen/Qwen2.5-1.5B-Instruct
 ```
 
-:::{note}
-By default, the server uses a predefined chat template stored in the tokenizer.
-You can learn about overriding it [here](#chat-template).
-:::
-:::{important}
-By default, the server applies `generation_config.json` from the huggingface model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
+!!! note
+    By default, the server uses a predefined chat template stored in the tokenizer.
+    You can learn about overriding it [here][chat-template].
+!!! warning
+    By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
 
-To disable this behavior, please pass `--generation-config vllm` when launching the server.
-:::
+    To disable this behavior, please pass `--generation-config vllm` when launching the server.
 
 This server can be queried in the same format as OpenAI API. For example, to list the models:
 
@@ -207,6 +210,5 @@ Currently, vLLM supports multiple backends for efficient Attention computation a
 
 If desired, you can also manually set the backend of your choice by configuring the environment variable `VLLM_ATTENTION_BACKEND` to one of the following options: `FLASH_ATTN`, `FLASHINFER` or `XFORMERS`.
 
-```{attention}
-There are no pre-built vllm wheels containing Flash Infer, so you must install it in your environment first. Refer to the [Flash Infer official docs](https://docs.flashinfer.ai/) or see <gh-file:docker/Dockerfile> for instructions on how to install it.
-```
+!!! warning
+    There are no pre-built vllm wheels containing FlashInfer, so you must install it in your environment first. Refer to the [FlashInfer official docs](https://docs.flashinfer.ai/) or see <gh-file:docker/Dockerfile> for instructions on how to install it.
diff --git a/docs/make.bat b/docs/make.bat
deleted file mode 100644
index 747ffb7b3033..000000000000
--- a/docs/make.bat
+++ /dev/null
@@ -1,35 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=source
-set BUILDDIR=build
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.https://www.sphinx-doc.org/
-	exit /b 1
-)
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
diff --git a/docs/mkdocs/hooks/generate_examples.py b/docs/mkdocs/hooks/generate_examples.py
new file mode 100644
index 000000000000..6f290efe45c2
--- /dev/null
+++ b/docs/mkdocs/hooks/generate_examples.py
@@ -0,0 +1,162 @@
+# SPDX-License-Identifier: Apache-2.0
+import itertools
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Literal
+
+import regex as re
+
+ROOT_DIR = Path(__file__).parent.parent.parent.parent
+ROOT_DIR_RELATIVE = '../../../../..'
+EXAMPLE_DIR = ROOT_DIR / "examples"
+EXAMPLE_DOC_DIR = ROOT_DIR / "docs/examples"
+print(ROOT_DIR.resolve())
+print(EXAMPLE_DIR.resolve())
+print(EXAMPLE_DOC_DIR.resolve())
+
+
+def fix_case(text: str) -> str:
+    subs = {
+        "api": "API",
+        "cli": "CLI",
+        "cpu": "CPU",
+        "llm": "LLM",
+        "mae": "MAE",
+        "tpu": "TPU",
+        "aqlm": "AQLM",
+        "gguf": "GGUF",
+        "lora": "LoRA",
+        "rlhf": "RLHF",
+        "vllm": "vLLM",
+        "openai": "OpenAI",
+        "lmcache": "LMCache",
+        "multilora": "MultiLoRA",
+        "mlpspeculator": "MLPSpeculator",
+        r"fp\d+": lambda x: x.group(0).upper(),  # e.g. fp16, fp32
+        r"int\d+": lambda x: x.group(0).upper(),  # e.g. int8, int16
+    }
+    for pattern, repl in subs.items():
+        text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE)
+    return text
+
+
+@dataclass
+class Example:
+    """
+    Example class for generating documentation content from a given path.
+
+    Attributes:
+        path (Path): The path to the main directory or file.
+        category (str): The category of the document.
+        main_file (Path): The main file in the directory.
+        other_files (list[Path]): list of other files in the directory.
+        title (str): The title of the document.
+
+    Methods:
+        __post_init__(): Initializes the main_file, other_files, and title attributes.
+        determine_main_file() -> Path: Determines the main file in the given path.
+        determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
+        determine_title() -> str: Determines the title of the document.
+        generate() -> str: Generates the documentation content.
+    """ # noqa: E501
+    path: Path
+    category: str = None
+    main_file: Path = field(init=False)
+    other_files: list[Path] = field(init=False)
+    title: str = field(init=False)
+
+    def __post_init__(self):
+        self.main_file = self.determine_main_file()
+        self.other_files = self.determine_other_files()
+        self.title = self.determine_title()
+
+    def determine_main_file(self) -> Path:
+        """
+        Determines the main file in the given path.
+        If the path is a file, it returns the path itself. Otherwise, it searches
+        for Markdown files (*.md) in the directory and returns the first one found.
+        Returns:
+            Path: The main file path, either the original path if it's a file or the first
+            Markdown file found in the directory.
+        Raises:
+            IndexError: If no Markdown files are found in the directory.
+        """ # noqa: E501
+        return self.path if self.path.is_file() else list(
+            self.path.glob("*.md")).pop()
+
+    def determine_other_files(self) -> list[Path]:
+        """
+        Determine other files in the directory excluding the main file.
+
+        This method checks if the given path is a file. If it is, it returns an empty list.
+        Otherwise, it recursively searches through the directory and returns a list of all
+        files that are not the main file.
+
+        Returns:
+            list[Path]: A list of Path objects representing the other files in the directory.
+        """ # noqa: E501
+        if self.path.is_file():
+            return []
+        is_other_file = lambda file: file.is_file() and file != self.main_file
+        return [file for file in self.path.rglob("*") if is_other_file(file)]
+
+    def determine_title(self) -> str:
+        return fix_case(self.path.stem.replace("_", " ").title())
+
+    def generate(self) -> str:
+        content = f"---\ntitle: {self.title}\n---\n\n"
+        content += f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
+
+        # Use long code fence to avoid issues with
+        # included files containing code fences too
+        code_fence = "``````"
+        is_code = self.main_file.suffix != ".md"
+        if is_code:
+            content += f"{code_fence}{self.main_file.suffix[1:]}\n"
+        content += f'--8<-- "{self.main_file}"\n'
+        if is_code:
+            content += f"{code_fence}\n"
+        content += "\n"
+
+        if not self.other_files:
+            return content
+
+        content += "## Example materials\n\n"
+        for file in sorted(self.other_files):
+            content += f'??? abstract "{file.relative_to(self.path)}"\n'
+            if file.suffix != ".md":
+                content += f"    {code_fence}{file.suffix[1:]}\n"
+            content += f'    --8<-- "{file}"\n'
+            if file.suffix != ".md":
+                content += f"    {code_fence}\n"
+
+        return content
+
+
+def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
+    # Create the EXAMPLE_DOC_DIR if it doesn't exist
+    if not EXAMPLE_DOC_DIR.exists():
+        EXAMPLE_DOC_DIR.mkdir(parents=True)
+
+    categories = sorted(p for p in EXAMPLE_DIR.iterdir() if p.is_dir())
+
+    examples = []
+    glob_patterns = ["*.py", "*.md", "*.sh"]
+    # Find categorised examples
+    for category in categories:
+        globs = [category.glob(pattern) for pattern in glob_patterns]
+        for path in itertools.chain(*globs):
+            examples.append(Example(path, category.stem))
+        # Find examples in subdirectories
+        for path in category.glob("*/*.md"):
+            examples.append(Example(path.parent, category.stem))
+
+    # Generate the example documentation
+    for example in sorted(examples, key=lambda e: e.path.stem):
+        example_name = f"{example.path.stem}.md"
+        doc_path = EXAMPLE_DOC_DIR / example.category / example_name
+        print(doc_path)
+        if not doc_path.parent.exists():
+            doc_path.parent.mkdir(parents=True)
+        with open(doc_path, "w+") as f:
+            f.write(example.generate())
diff --git a/docs/mkdocs/hooks/remove_announcement.py b/docs/mkdocs/hooks/remove_announcement.py
new file mode 100644
index 000000000000..e5f8549d8383
--- /dev/null
+++ b/docs/mkdocs/hooks/remove_announcement.py
@@ -0,0 +1,16 @@
+# SPDX-License-Identifier: Apache-2.0
+import os
+from typing import Literal
+
+
+def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool):
+    """Remove the "latest docs" banner override for tagged-release builds."""
+    # see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
+    if os.getenv('READTHEDOCS_VERSION_TYPE') == "tag":
+        # this hook lives in <mkdocs>/hooks; the banner is <mkdocs>/overrides
+        mkdocs_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+        announcement_path = os.path.join(mkdocs_dir, "overrides/main.html")
+        # The file might be removed already if the build is triggered multiple
+        # times (readthedocs builds both HTML and PDF versions separately)
+        if os.path.exists(announcement_path):
+            os.remove(announcement_path)
diff --git a/docs/mkdocs/hooks/url_schemes.py b/docs/mkdocs/hooks/url_schemes.py
new file mode 100644
index 000000000000..c738828085ba
--- /dev/null
+++ b/docs/mkdocs/hooks/url_schemes.py
@@ -0,0 +1,53 @@
+# SPDX-License-Identifier: Apache-2.0
+import regex as re
+from mkdocs.config.defaults import MkDocsConfig
+from mkdocs.structure.files import Files
+from mkdocs.structure.pages import Page
+
+
+def on_page_markdown(markdown: str, *, page: Page, config: MkDocsConfig,
+                     files: Files):
+    """Expand `gh-<type>:<path>[#fragment]` shorthand into GitHub links.
+
+    Two spellings are rewritten: inline links, `[title](gh-pr:123)`, which
+    keep their title, and autolinks, `<gh-file:setup.py>`, whose title is
+    built from the link type and path. Every link is prefixed with the
+    GitHub octicon. An unrecognised type raises `KeyError`.
+
+    Returns the Markdown text with all shorthand links expanded.
+    """
+    gh_icon = ":octicons-mark-github-16:"
+    gh_url = "https://github.com"
+    repo_url = f"{gh_url}/vllm-project/vllm"
+    org_url = f"{gh_url}/orgs/vllm-project"
+    # type -> (base URL, title prefix shown for autolinks)
+    kinds = {
+        "issue": (f"{repo_url}/issues", "Issue #"),
+        "pr": (f"{repo_url}/pull", "Pull Request #"),
+        "project": (f"{org_url}/projects", "Project #"),
+        "dir": (f"{repo_url}/tree/main", ""),
+        "file": (f"{repo_url}/blob/main", ""),
+    }
+
+    scheme = r"gh-(?P<type>.+?):(?P<path>.+?)(#(?P<fragment>.+?))?"
+    inline_link = re.compile(r"\[(?P<title>[^\[]+?)\]\(" + scheme + r"\)")
+    auto_link = re.compile(f"<{scheme}>")
+
+    def build_url(match: re.Match) -> str:
+        # Shared by both link styles: base URL, path, optional fragment
+        base_url, _ = kinds[match.group("type")]
+        target = f'{base_url}/{match.group("path")}'
+        fragment = match.group("fragment")
+        return f"{target}#{fragment}" if fragment else target
+
+    def replace_inline_link(match: re.Match) -> str:
+        # Inline links keep the author's title
+        return f'[{gh_icon} {match.group("title")}]({build_url(match)})'
+
+    def replace_auto_link(match: re.Match) -> str:
+        # Autolinks have no title, so synthesise one from type and path
+        _, prefix = kinds[match.group("type")]
+        return f'[{gh_icon} {prefix}{match.group("path")}]({build_url(match)})'
+
+    expanded = inline_link.sub(replace_inline_link, markdown)
+    return auto_link.sub(replace_auto_link, expanded)
diff --git a/docs/source/_static/custom.js b/docs/mkdocs/javascript/run_llm_widget.js
similarity index 54%
rename from docs/source/_static/custom.js
rename to docs/mkdocs/javascript/run_llm_widget.js
index 58bc2ebb9614..d0e5560e92b4 100644
--- a/docs/source/_static/custom.js
+++ b/docs/mkdocs/javascript/run_llm_widget.js
@@ -17,22 +17,3 @@ document.addEventListener("DOMContentLoaded", function () {
     script.async = true;
     document.head.appendChild(script);
   });
-
-// Update URL search params when tab is clicked
-  document.addEventListener("DOMContentLoaded", function () {
-    const tabs = document.querySelectorAll(".sd-tab-label");
-
-    function updateURL(tab) {
-      const syncGroup = tab.getAttribute("data-sync-group");
-      const syncId = tab.getAttribute("data-sync-id");
-      if (syncGroup && syncId) {
-          const url = new URL(window.location);
-          url.searchParams.set(syncGroup, syncId);
-          window.history.replaceState(null, "", url);
-      }
-    }
-
-    tabs.forEach(tab => {
-        tab.addEventListener("click", () => updateURL(tab));
-    });
-});
diff --git a/docs/mkdocs/overrides/main.html b/docs/mkdocs/overrides/main.html
new file mode 100644
index 000000000000..bdd62ebc158d
--- /dev/null
+++ b/docs/mkdocs/overrides/main.html
@@ -0,0 +1,5 @@
+{% extends "base.html" %}
+
+{% block announce %}
+  <p>You are viewing the latest developer preview docs. <a href="https://docs.vllm.ai/en/stable/">Click here</a> to view docs for the latest stable release.</p>
+{% endblock %}
diff --git a/docs/mkdocs/stylesheets/extra.css b/docs/mkdocs/stylesheets/extra.css
new file mode 100644
index 000000000000..088143ed5956
--- /dev/null
+++ b/docs/mkdocs/stylesheets/extra.css
@@ -0,0 +1,36 @@
+/* Warning for latest docs */
+.md-banner {
+    background-color: var(--md-warning-bg-color);
+    color: var(--md-warning-fg-color);
+}
+
+/* Add an external-link icon after absolute links (//, http://, https://); technique from https://christianoliff.com/blog/styling-external-links-with-an-icon-in-css/ */
+a:not(:has(svg)):not(.md-icon):not(.autorefs-external) {
+    align-items: center;
+
+    &[href^="//"]::after,
+    &[href^="http://"]::after,
+    &[href^="https://"]::after {
+        content: "";
+        width: 12px;
+        height: 12px;
+        margin-left: 4px;
+        background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='16' height='16' stroke='gray' viewBox='0 0 16 16'%3E%3Cpath fill-rule='evenodd' d='M8.636 3.5a.5.5 0 0 0-.5-.5H1.5A1.5 1.5 0 0 0 0 4.5v10A1.5 1.5 0 0 0 1.5 16h10a1.5 1.5 0 0 0 1.5-1.5V7.864a.5.5 0 0 0-1 0V14.5a.5.5 0 0 1-.5.5h-10a.5.5 0 0 1-.5-.5v-10a.5.5 0 0 1 .5-.5h6.636a.5.5 0 0 0 .5-.5z'/%3E%3Cpath fill-rule='evenodd' d='M16 .5a.5.5 0 0 0-.5-.5h-5a.5.5 0 0 0 0 1h3.793L6.146 9.146a.5.5 0 1 0 .708.708L15 1.707V5.5a.5.5 0 0 0 1 0v-5z'/%3E%3C/svg%3E");
+        background-position: center;
+        background-repeat: no-repeat;
+        background-size: contain;
+        display: inline-block;
+    }
+}
+
+/* Light mode: darker section titles */
+body[data-md-color-scheme="default"] .md-nav__item--section > label.md-nav__link .md-ellipsis {
+  color: rgba(0, 0, 0, 0.7) !important;
+  font-weight: 700;
+}
+
+/* Dark mode: lighter gray section titles */
+body[data-md-color-scheme="slate"] .md-nav__item--section > label.md-nav__link .md-ellipsis {
+  color: rgba(255, 255, 255, 0.75) !important;
+  font-weight: 700;
+}
diff --git a/docs/source/models/extensions/fastsafetensor.md b/docs/models/extensions/fastsafetensor.md
similarity index 100%
rename from docs/source/models/extensions/fastsafetensor.md
rename to docs/models/extensions/fastsafetensor.md
diff --git a/docs/source/models/extensions/runai_model_streamer.md b/docs/models/extensions/runai_model_streamer.md
similarity index 61%
rename from docs/source/models/extensions/runai_model_streamer.md
rename to docs/models/extensions/runai_model_streamer.md
index e0daa6f86dde..6755b574ea67 100644
--- a/docs/source/models/extensions/runai_model_streamer.md
+++ b/docs/models/extensions/runai_model_streamer.md
@@ -1,6 +1,7 @@
-(runai-model-streamer)=
-
-# Loading models with Run:ai Model Streamer
+---
+title: Loading models with Run:ai Model Streamer
+---
+[](){ #runai-model-streamer }
 
 Run:ai Model Streamer is a library to read tensors in concurrency, while streaming it to GPU memory.
 Further reading can be found in [Run:ai Model Streamer Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/README.md).
@@ -15,19 +16,25 @@ pip3 install vllm[runai]
 To run it as an OpenAI-compatible server, add the `--load-format runai_streamer` flag:
 
 ```console
-vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer
+vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \
+    --load-format runai_streamer
 ```
 
 To run model from AWS S3 object store run:
 
 ```console
-vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer
+vllm serve s3://core-llm/Llama-3-8b \
+    --load-format runai_streamer
 ```
 
 To run model from a S3 compatible object store run:
 
 ```console
-RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 AWS_EC2_METADATA_DISABLED=true AWS_ENDPOINT_URL=https://storage.googleapis.com vllm serve s3://core-llm/Llama-3-8b --load-format runai_streamer
+RUNAI_STREAMER_S3_USE_VIRTUAL_ADDRESSING=0 \
+AWS_EC2_METADATA_DISABLED=true \
+AWS_ENDPOINT_URL=https://storage.googleapis.com \
+vllm serve s3://core-llm/Llama-3-8b \
+    --load-format runai_streamer
 ```
 
 ## Tunable parameters
@@ -38,19 +45,22 @@ You can tune `concurrency` that controls the level of concurrency and number of
 For reading from S3, it will be the number of client instances the host is opening to the S3 server.
 
 ```console
-vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"concurrency":16}'
+vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \
+    --load-format runai_streamer \
+    --model-loader-extra-config '{"concurrency":16}'
 ```
 
 You can control the size of the CPU Memory buffer to which tensors are read from the file, and limit this size.
 You can read further about CPU buffer memory limiting [here](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md#runai_streamer_memory_limit).
 
 ```console
-vllm serve /home/meta-llama/Llama-3.2-3B-Instruct --load-format runai_streamer --model-loader-extra-config '{"memory_limit":5368709120}'
+vllm serve /home/meta-llama/Llama-3.2-3B-Instruct \
+    --load-format runai_streamer \
+    --model-loader-extra-config '{"memory_limit":5368709120}'
 ```
 
-:::{note}
-For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md).
-:::
+!!! note
+    For further instructions about tunable parameters and additional parameters configurable through environment variables, read the [Environment Variables Documentation](https://github.com/run-ai/runai-model-streamer/blob/master/docs/src/env-vars.md).
 
 ## Sharded Model Loading
 
@@ -63,7 +73,9 @@ vllm serve /path/to/sharded/model --load-format runai_streamer_sharded
 The sharded loader expects model files to follow the same naming pattern as the regular sharded state loader: `model-rank-{rank}-part-{part}.safetensors`. You can customize this pattern using the `pattern` parameter in `--model-loader-extra-config`:
 
 ```console
-vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}'
+vllm serve /path/to/sharded/model \
+    --load-format runai_streamer_sharded \
+    --model-loader-extra-config '{"pattern":"custom-model-rank-{rank}-part-{part}.safetensors"}'
 ```
 
 To create sharded model files, you can use the script provided in <gh-file:examples/offline_inference/save_sharded_state.py>. This script demonstrates how to save a model in the sharded format that is compatible with the Run:ai Model Streamer sharded loader.
@@ -71,9 +83,10 @@ To create sharded model files, you can use the script provided in <gh-file:examp
 The sharded loader supports all the same tunable parameters as the regular Run:ai Model Streamer, including `concurrency` and `memory_limit`. These can be configured in the same way:
 
 ```console
-vllm serve /path/to/sharded/model --load-format runai_streamer_sharded --model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}'
+vllm serve /path/to/sharded/model \
+    --load-format runai_streamer_sharded \
+    --model-loader-extra-config '{"concurrency":16, "memory_limit":5368709120}'
 ```
 
-:::{note}
-The sharded loader is particularly efficient for tensor or pipeline parallel models where each worker only needs to read its own shard rather than the entire checkpoint.
-:::
+!!! note
+    The sharded loader is particularly efficient for tensor or pipeline parallel models where each worker only needs to read its own shard rather than the entire checkpoint.
diff --git a/docs/source/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md
similarity index 69%
rename from docs/source/models/extensions/tensorizer.md
rename to docs/models/extensions/tensorizer.md
index cd94c81e620a..b6feb405c6ca 100644
--- a/docs/source/models/extensions/tensorizer.md
+++ b/docs/models/extensions/tensorizer.md
@@ -1,6 +1,7 @@
-(tensorizer)=
-
-# Loading models with CoreWeave's Tensorizer
+---
+title: Loading models with CoreWeave's Tensorizer
+---
+[](){ #tensorizer }
 
 vLLM supports loading models with [CoreWeave's Tensorizer](https://docs.coreweave.com/coreweave-machine-learning-and-ai/inference/tensorizer).
 vLLM model tensors that have been serialized to disk, an HTTP/HTTPS endpoint, or S3 endpoint can be deserialized
@@ -9,8 +10,7 @@ shorter Pod startup times and CPU memory usage. Tensor encryption is also suppor
 
 For more information on CoreWeave's Tensorizer, please refer to
 [CoreWeave's Tensorizer documentation](https://github.com/coreweave/tensorizer). For more information on serializing a vLLM model, as well a general usage guide to using Tensorizer with vLLM, see
-the [vLLM example script](https://docs.vllm.ai/en/latest/getting_started/examples/tensorize_vllm_model.html).
+the [vLLM example script](https://docs.vllm.ai/en/latest/examples/tensorize_vllm_model.html).
 
-:::{note}
-Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
-:::
+!!! note
+    Note that to use this feature you will need to install `tensorizer` by running `pip install vllm[tensorizer]`.
diff --git a/docs/source/models/generative_models.md b/docs/models/generative_models.md
similarity index 63%
rename from docs/source/models/generative_models.md
rename to docs/models/generative_models.md
index dd765e4a9765..566b1c29fca9 100644
--- a/docs/source/models/generative_models.md
+++ b/docs/models/generative_models.md
@@ -1,24 +1,25 @@
-(generative-models)=
-
-# Generative Models
+---
+title: Generative Models
+---
+[](){ #generative-models }
 
 vLLM provides first-class support for generative models, which covers most of LLMs.
 
-In vLLM, generative models implement the {class}`~vllm.model_executor.models.VllmModelForTextGeneration` interface.
+In vLLM, generative models implement the [VllmModelForTextGeneration][vllm.model_executor.models.VllmModelForTextGeneration] interface.
 Based on the final hidden states of the input, these models output log probabilities of the tokens to generate,
-which are then passed through {class}`~vllm.model_executor.layers.Sampler` to obtain the final text.
+which are then passed through [Sampler][vllm.model_executor.layers.Sampler] to obtain the final text.
 
 For generative models, the only supported `--task` option is `"generate"`.
 Usually, this is automatically inferred so you don't have to specify it.
 
 ## Offline Inference
 
-The {class}`~vllm.LLM` class provides various methods for offline inference.
-See <project:#configuration> for a list of options when initializing the model.
+The [LLM][vllm.LLM] class provides various methods for offline inference.
+See [configuration][configuration] for a list of options when initializing the model.
 
 ### `LLM.generate`
 
-The {class}`~vllm.LLM.generate` method is available to all generative models in vLLM.
+The [generate][vllm.LLM.generate] method is available to all generative models in vLLM.
 It is similar to [its counterpart in HF Transformers](https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate),
 except that tokenization and detokenization are also performed automatically.
 
@@ -34,7 +35,7 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-You can optionally control the language generation by passing {class}`~vllm.SamplingParams`.
+You can optionally control the language generation by passing [SamplingParams][vllm.SamplingParams].
 For example, you can use greedy sampling by setting `temperature=0`:
 
 ```python
@@ -50,16 +51,15 @@ for output in outputs:
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 
-:::{important}
-By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if {class}`~vllm.SamplingParams` is not specified.
+!!! warning
+    By default, vLLM will use sampling parameters recommended by the model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if [SamplingParams][vllm.SamplingParams] is not specified.
 
-However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance.
-:::
+    However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the [LLM][vllm.LLM] instance.
 A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py>
 
 ### `LLM.beam_search`
 
-The {class}`~vllm.LLM.beam_search` method implements [beam search](https://huggingface.co/docs/transformers/en/generation_strategies#beam-search) on top of {class}`~vllm.LLM.generate`.
+The [beam_search][vllm.LLM.beam_search] method implements [beam search](https://huggingface.co/docs/transformers/en/generation_strategies#beam-search) on top of [generate][vllm.LLM.generate].
 For example, to search using 5 beams and output at most 50 tokens:
 
 ```python
@@ -77,14 +77,13 @@ for output in outputs:
 
 ### `LLM.chat`
 
-The {class}`~vllm.LLM.chat` method implements chat functionality on top of {class}`~vllm.LLM.generate`.
+The [chat][vllm.LLM.chat] method implements chat functionality on top of [generate][vllm.LLM.generate].
 In particular, it accepts input similar to [OpenAI Chat Completions API](https://platform.openai.com/docs/api-reference/chat)
 and automatically applies the model's [chat template](https://huggingface.co/docs/transformers/en/chat_templating) to format the prompt.
 
-:::{important}
-In general, only instruction-tuned models have a chat template.
-Base models may perform poorly as they are not trained to respond to the chat conversation.
-:::
+!!! warning
+    In general, only instruction-tuned models have a chat template.
+    Base models may perform poorly as they are not trained to respond to the chat conversation.
 
 ```python
 from vllm import LLM
@@ -133,7 +132,7 @@ outputs = llm.chat(conversation, chat_template=custom_template)
 
 ## Online Serving
 
-Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs:
+Our [OpenAI-Compatible Server][openai-compatible-server] provides endpoints that correspond to the offline APIs:
 
-- [Completions API](#completions-api) is similar to `LLM.generate` but only accepts text.
-- [Chat API](#chat-api)  is similar to `LLM.chat`, accepting both text and [multi-modal inputs](#multimodal-inputs) for models with a chat template.
+- [Completions API][completions-api] is similar to `LLM.generate` but only accepts text.
+- [Chat API][chat-api] is similar to `LLM.chat`, accepting both text and [multi-modal inputs][multimodal-inputs] for models with a chat template.
diff --git a/docs/source/models/pooling_models.md b/docs/models/pooling_models.md
similarity index 62%
rename from docs/source/models/pooling_models.md
rename to docs/models/pooling_models.md
index 3fd35e2e8bd1..89a128915a76 100644
--- a/docs/source/models/pooling_models.md
+++ b/docs/models/pooling_models.md
@@ -1,70 +1,48 @@
-(pooling-models)=
-
-# Pooling Models
+---
+title: Pooling Models
+---
+[](){ #pooling-models }
 
 vLLM also supports pooling models, including embedding, reranking and reward models.
 
-In vLLM, pooling models implement the {class}`~vllm.model_executor.models.VllmModelForPooling` interface.
-These models use a {class}`~vllm.model_executor.layers.Pooler` to extract the final hidden states of the input
+In vLLM, pooling models implement the [VllmModelForPooling][vllm.model_executor.models.VllmModelForPooling] interface.
+These models use a [Pooler][vllm.model_executor.layers.Pooler] to extract the final hidden states of the input
 before returning them.
 
-:::{note}
-We currently support pooling models primarily as a matter of convenience.
-As shown in the [Compatibility Matrix](#compatibility-matrix), most vLLM features are not applicable to
-pooling models as they only work on the generation or decode stage, so performance may not improve as much.
-:::
+!!! note
+    We currently support pooling models primarily as a matter of convenience.
+    As shown in the [Compatibility Matrix][compatibility-matrix], most vLLM features are not applicable to
+    pooling models as they only work on the generation or decode stage, so performance may not improve as much.
 
 For pooling models, we support the following `--task` options.
 The selected option sets the default pooler used to extract the final hidden states:
 
-:::{list-table}
-:widths: 50 25 25 25
-:header-rows: 1
-
-- * Task
-  * Pooling Type
-  * Normalization
-  * Softmax
-- * Embedding (`embed`)
-  * `LAST`
-  * ✅︎
-  * ❌
-- * Classification (`classify`)
-  * `LAST`
-  * ❌
-  * ✅︎
-- * Sentence Pair Scoring (`score`)
-  * \*
-  * \*
-  * \*
-- * Reward Modeling (`reward`)
-  * `ALL`
-  * ❌
-  * ❌
-:::
+| Task                            | Pooling Type   | Normalization   | Softmax   |
+|---------------------------------|----------------|-----------------|-----------|
+| Embedding (`embed`)             | `LAST`         | ✅︎              | ❌         |
+| Classification (`classify`)     | `LAST`         | ❌               | ✅︎        |
+| Sentence Pair Scoring (`score`) | \*             | \*              | \*        |
 
 \*The default pooler is always defined by the model.
 
-:::{note}
-If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table.
-:::
+!!! note
+    If the model's implementation in vLLM defines its own pooler, the default pooler is set to that instead of the one specified in this table.
 
 When loading [Sentence Transformers](https://huggingface.co/sentence-transformers) models,
 we attempt to override the default pooler based on its Sentence Transformers configuration file (`modules.json`).
 
-:::{tip}
-You can customize the model's pooling method via the `--override-pooler-config` option,
-which takes priority over both the model's and Sentence Transformers's defaults.
-:::
+!!! tip
+    You can customize the model's pooling method via the `--override-pooler-config` option,
+    which takes priority over both the model's and Sentence Transformers' defaults.
 
 ## Offline Inference
 
-The {class}`~vllm.LLM` class provides various methods for offline inference.
-See <project:#configuration> for a list of options when initializing the model.
+The [LLM][vllm.LLM] class provides various methods for offline inference.
+See [configuration][configuration] for a list of options when initializing the model.
 
 ### `LLM.encode`
 
-The {class}`~vllm.LLM.encode` method is available to all pooling models in vLLM.
+The [encode][vllm.LLM.encode] method is available to all pooling models in vLLM.
 It returns the extracted hidden states directly, which is useful for reward models.
 
 ```python
@@ -79,7 +57,7 @@ print(f"Data: {data!r}")
 
 ### `LLM.embed`
 
-The {class}`~vllm.LLM.embed` method outputs an embedding vector for each prompt.
+The [embed][vllm.LLM.embed] method outputs an embedding vector for each prompt.
 It is primarily designed for embedding models.
 
 ```python
@@ -96,7 +74,7 @@ A code example can be found here: <gh-file:examples/offline_inference/basic/embe
 
 ### `LLM.classify`
 
-The {class}`~vllm.LLM.classify` method outputs a probability vector for each prompt.
+The [classify][vllm.LLM.classify] method outputs a probability vector for each prompt.
 It is primarily designed for classification models.
 
 ```python
@@ -113,13 +91,12 @@ A code example can be found here: <gh-file:examples/offline_inference/basic/clas
 
 ### `LLM.score`
 
-The {class}`~vllm.LLM.score` method outputs similarity scores between sentence pairs.
+The [score][vllm.LLM.score] method outputs similarity scores between sentence pairs.
 It is designed for embedding models and cross encoder models. Embedding models use cosine similarity, and [cross-encoder models](https://www.sbert.net/examples/applications/cross-encoder/README.html) serve as rerankers between candidate query-document pairs in RAG systems.
 
-:::{note}
-vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG.
-To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain).
-:::
+!!! note
+    vLLM can only perform the model inference component (e.g. embedding, reranking) of RAG.
+    To handle RAG at a higher level, you should use integration frameworks such as [LangChain](https://github.com/langchain-ai/langchain).
 
 ```python
 from vllm import LLM
@@ -136,27 +113,25 @@ A code example can be found here: <gh-file:examples/offline_inference/basic/scor
 
 ## Online Serving
 
-Our [OpenAI-Compatible Server](#openai-compatible-server) provides endpoints that correspond to the offline APIs:
+Our [OpenAI-Compatible Server][openai-compatible-server] provides endpoints that correspond to the offline APIs:
 
-- [Pooling API](#pooling-api) is similar to `LLM.encode`, being applicable to all types of pooling models.
-- [Embeddings API](#embeddings-api) is similar to `LLM.embed`, accepting both text and [multi-modal inputs](#multimodal-inputs) for embedding models.
-- [Classification API](#classification-api) is similar to `LLM.classify` and is applicable to sequence classification models.
-- [Score API](#score-api) is similar to `LLM.score` for cross-encoder models.
+- [Pooling API][pooling-api] is similar to `LLM.encode`, being applicable to all types of pooling models.
+- [Embeddings API][embeddings-api] is similar to `LLM.embed`, accepting both text and [multi-modal inputs][multimodal-inputs] for embedding models.
+- [Classification API][classification-api] is similar to `LLM.classify` and is applicable to sequence classification models.
+- [Score API][score-api] is similar to `LLM.score` for cross-encoder models.
 
 ## Matryoshka Embeddings
 
 [Matryoshka Embeddings](https://sbert.net/examples/sentence_transformer/training/matryoshka/README.html#matryoshka-embeddings) or [Matryoshka Representation Learning (MRL)](https://arxiv.org/abs/2205.13147) is a technique used in training embedding models. It allows user to trade off between performance and cost.
 
-:::{warning}
-Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings.
-
-For example, setting `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error.
+!!! warning
+    Not all embedding models are trained using Matryoshka Representation Learning. To avoid misuse of the `dimensions` parameter, vLLM returns an error for requests that attempt to change the output dimension of models that do not support Matryoshka Embeddings.
 
-```json
-{"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400}
-```
+    For example, setting the `dimensions` parameter while using the `BAAI/bge-m3` model will result in the following error.
 
-:::
+    ```json
+    {"object":"error","message":"Model \"BAAI/bge-m3\" does not support matryoshka representation, changing output dimensions will lead to poor results.","type":"BadRequestError","param":null,"code":400}
+    ```
 
 ### Manually enable Matryoshka Embeddings
 
@@ -172,7 +147,7 @@ vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf_overrides '{"matryoshka_
 
 ### Offline Inference
 
-You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in {class}`~vllm.PoolingParams`.
+You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter in [PoolingParams][vllm.PoolingParams].
 
 ```python
 from vllm import LLM, PoolingParams
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
new file mode 100644
index 000000000000..7594c6e6fbf1
--- /dev/null
+++ b/docs/models/supported_models.md
@@ -0,0 +1,690 @@
+---
+title: Supported Models
+---
+[](){ #supported-models }
+
+vLLM supports [generative](./generative_models.md) and [pooling](./pooling_models.md) models across various tasks.
+If a model supports more than one task, you can set the task via the `--task` argument.
+
+For each task, we list the model architectures that have been implemented in vLLM.
+Alongside each architecture, we include some popular models that use it.
+
+## Model Implementation
+
+### vLLM
+
+If vLLM natively supports a model, its implementation can be found in <gh-file:vllm/model_executor/models>.
+
+These models are what we list in the [list of text-only language models][supported-text-models] and the [list of multimodal language models][supported-mm-models].
+
+[](){ #transformers-backend }
+
+### Transformers
+
+vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned!
+
+To check if the modeling backend is Transformers, you can simply do this:
+
+```python
+from vllm import LLM
+llm = LLM(model=..., task="generate")  # Name or path of your model
+llm.apply_model(lambda model: print(type(model)))
+```
+
+If it is `TransformersForCausalLM` then it means it's based on Transformers!
+
+!!! tip
+    You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for [offline inference][offline-inference] or `--model-impl transformers` for the [OpenAI-compatible server][openai-compatible-server].
+
+!!! note
+    vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM.
+
+#### Custom models
+
+If a model is natively supported by neither vLLM nor Transformers, it can still be used in vLLM!
+
+For a model to be compatible with the Transformers backend for vLLM it must:
+
+- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)):
+    * The model directory must have the correct structure (e.g. `config.json` is present).
+    * `config.json` must contain `auto_map.AutoModel`.
+- be compatible with the Transformers backend for vLLM (see [Writing custom models][writing-custom-models]):
+    * Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`).
+
+If the compatible model is:
+
+- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for [offline inference][offline-inference] or `--trust-remote-code` for the [OpenAI-compatible server][openai-compatible-server].
+- in a local directory, simply pass the directory path to `model=<MODEL_DIR>` for [offline inference][offline-inference] or `vllm serve <MODEL_DIR>` for the [OpenAI-compatible server][openai-compatible-server].
+
+This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
+
+[](){ #writing-custom-models }
+
+#### Writing custom models
+
+This section details the modifications needed to make a Transformers compatible custom model compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created; see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models).)
+
+To make your model compatible with the Transformers backend, the following requirements must be met:
+
+1. `kwargs` must be passed down through all modules from `MyModel` to `MyAttention`.
+2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
+3. `MyModel` must contain `_supports_attention_backend = True`.
+
+```python title="modeling_my_model.py"
+
+from transformers import PreTrainedModel
+from torch import nn
+
+class MyAttention(nn.Module):
+
+    def forward(self, hidden_states, **kwargs):
+        ...
+        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            **kwargs,
+        )
+        ...
+
+class MyModel(PreTrainedModel):
+    _supports_attention_backend = True
+```
+
+Here is what happens in the background when this model is loaded:
+
+1. The config is loaded.
+2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
+3. `MyModel` is loaded into `TransformersForCausalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
+
+That's it!
+
+For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
+
+```python title="configuration_my_model.py"
+
+from transformers import PretrainedConfig
+
+class MyConfig(PretrainedConfig):
+    base_model_tp_plan = {
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+```
+
+- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported).
+- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s:
+    * You only need to do this for layers which are not present on all pipeline stages
+    * vLLM assumes that there will be only one `nn.ModuleList`, which is distributed across the pipeline stages
+    * The `list` in the first element of the `tuple` contains the names of the input arguments
+    * The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code
+
+## Loading a Model
+
+### Hugging Face Hub
+
+By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models). To change the download path for models, you can set the `HF_HOME` environment variable; for more details, refer to [their official documentation](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome).
+
+To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository.
+If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
+
+Models do not _need_ to be natively supported to be used in vLLM.
+The [Transformers backend][transformers-backend] enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
+
+!!! tip
+    The easiest way to check if your model is really supported at runtime is to run the program below:
+
+    ```python
+    from vllm import LLM
+
+    # For generative models (task=generate) only
+    llm = LLM(model=..., task="generate")  # Name or path of your model
+    output = llm.generate("Hello, my name is")
+    print(output)
+
+    # For pooling models (task={embed,classify,reward,score}) only
+    llm = LLM(model=..., task="embed")  # Name or path of your model
+    output = llm.encode("Hello, my name is")
+    print(output)
+    ```
+
+    If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
+
+Otherwise, please refer to [Adding a New Model][new-model] for instructions on how to implement your model in vLLM.
+Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
+
+#### Download a model
+
+If you prefer, you can use the Hugging Face CLI to [download a model](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-download) or specific files from a model repository:
+
+```console
+# Download a model
+huggingface-cli download HuggingFaceH4/zephyr-7b-beta
+
+# Specify a custom cache directory
+huggingface-cli download HuggingFaceH4/zephyr-7b-beta --cache-dir ./path/to/cache
+
+# Download a specific file from a model repo
+huggingface-cli download HuggingFaceH4/zephyr-7b-beta eval_results.json
+```
+
+#### List the downloaded models
+
+Use the Hugging Face CLI to [manage models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#scan-your-cache) stored in local cache:
+
+```console
+# List cached models
+huggingface-cli scan-cache
+
+# Show detailed (verbose) output
+huggingface-cli scan-cache -v
+
+# Specify a custom cache directory
+huggingface-cli scan-cache --dir ~/.cache/huggingface/hub
+```
+
+#### Delete a cached model
+
+Use the Hugging Face CLI to interactively [delete downloaded models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#clean-your-cache) from the cache:
+
+```console
+# The `delete-cache` command requires extra dependencies to work with the TUI.
+# Please run `pip install huggingface_hub[cli]` to install them.
+
+# Launch the interactive TUI to select models to delete
+$ huggingface-cli delete-cache
+? Select revisions to delete: 1 revisions selected counting for 438.9M.
+  ○ None of the following (if selected, nothing will be deleted).
+Model BAAI/bge-base-en-v1.5 (438.9M, used 1 week ago)
+❯ ◉ a5beb1e3: main # modified 1 week ago
+
+Model BAAI/bge-large-en-v1.5 (1.3G, used 1 week ago)
+  ○ d4aa6901: main # modified 1 week ago
+
+Model BAAI/bge-reranker-base (1.1G, used 4 weeks ago)
+  ○ 2cfc18c9: main # modified 4 weeks ago
+
+Press <space> to select, <enter> to validate and <ctrl+c> to quit without modification.
+
+# Confirm the deletion after making your selection
+? Select revisions to delete: 1 revision(s) selected.
+? 1 revisions selected counting for 438.9M. Confirm deletion ? Yes
+Start deletion.
+Done. Deleted 1 repo(s) and 0 revision(s) for a total of 438.9M.
+```
+
+#### Using a proxy
+
+Here are some tips for loading/downloading models from Hugging Face using a proxy:
+
+- Set the proxy globally for your session (or set it in the profile file):
+
+```shell
+export http_proxy=http://your.proxy.server:port
+export https_proxy=http://your.proxy.server:port
+```
+
+- Set the proxy for just the current command:
+
+```shell
+https_proxy=http://your.proxy.server:port huggingface-cli download <model_name>
+
+# or use vllm cmd directly
+https_proxy=http://your.proxy.server:port  vllm serve <model_name> --disable-log-requests
+```
+
+- Set the proxy in the Python interpreter:
+
+```python
+import os
+
+os.environ['http_proxy'] = 'http://your.proxy.server:port'
+os.environ['https_proxy'] = 'http://your.proxy.server:port'
+```
+
+### ModelScope
+
+To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable:
+
+```shell
+export VLLM_USE_MODELSCOPE=True
+```
+
+Then load the model with `trust_remote_code=True`:
+
+```python
+from vllm import LLM
+
+llm = LLM(model=..., revision=..., task=..., trust_remote_code=True)
+
+# For generative models (task=generate) only
+output = llm.generate("Hello, my name is")
+print(output)
+
+# For pooling models (task={embed,classify,reward,score}) only
+output = llm.encode("Hello, my name is")
+print(output)
+```
+
+[](){ #feature-status-legend }
+
+## Feature Status Legend
+
+- ✅︎ indicates that the feature is supported for the model.
+
+- 🚧 indicates that the feature is planned but not yet supported for the model.
+
+- ⚠️ indicates that the feature is available but may have known issues or limitations.
+
+[](){ #supported-text-models }
+
+## List of Text-only Language Models
+
+### Generative Models
+
+See [this page][generative-models] for more information on how to use generative models.
+
+#### Text Generation
+
+Specified using `--task generate`.
+
+| Architecture                                      | Models                                              | Example HF Models                                                                                                                                                            | [LoRA][lora-adapter]   | [PP][distributed-serving]   |
+|---------------------------------------------------|-----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|
+| `AquilaForCausalLM`                               | Aquila, Aquila2                                     | `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.                                                                                                                                 | ✅︎                     | ✅︎                          |
+| `ArcticForCausalLM`                               | Arctic                                              | `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc.                                                                                               | ✅︎                     |                             |
+| `BaiChuanForCausalLM`                             | Baichuan2, Baichuan                                 | `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.                                                                                                          | ✅︎                     | ✅︎                          |
+| `BambaForCausalLM`                                | Bamba                                               | `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B`                                                                                                                   |                        |                             |
+| `BloomForCausalLM`                                | BLOOM, BLOOMZ, BLOOMChat                            | `bigscience/bloom`, `bigscience/bloomz`, etc.                                                                                                                                | ✅︎                     |                             |
+| `BartForConditionalGeneration`                    | BART                                                | `facebook/bart-base`, `facebook/bart-large-cnn`, etc.                                                                                                                        |                        |                             |
+| `ChatGLMModel`, `ChatGLMForConditionalGeneration` | ChatGLM                                             | `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc.                                                                                                       | ✅︎                     | ✅︎                          |
+| `CohereForCausalLM`, `Cohere2ForCausalLM`         | Command-R                                           | `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc.                                                                                               | ✅︎                     | ✅︎                          |
+| `DbrxForCausalLM`                                 | DBRX                                                | `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.                                                                                                                     | ✅︎                     |                             |
+| `DeciLMForCausalLM`                               | DeciLM                                              | `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc.                                                                                                                               | ✅︎                     |                             |
+| `DeepseekForCausalLM`                             | DeepSeek                                            | `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat`, etc.                                                                                                | ✅︎                     |                             |
+| `DeepseekV2ForCausalLM`                           | DeepSeek-V2                                         | `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat`, etc.                                                                                                              | ✅︎                     |                             |
+| `DeepseekV3ForCausalLM`                           | DeepSeek-V3                                         | `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3`, etc.                                                                                                              | ✅︎                     |                             |
+| `ExaoneForCausalLM`                               | EXAONE-3                                            | `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc.                                                                                                                                 | ✅︎                     | ✅︎                          |
+| `FalconForCausalLM`                               | Falcon                                              | `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.                                                                                                         | ✅︎                     |                             |
+| `FalconMambaForCausalLM`                          | FalconMamba                                         | `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc.                                                                                                            | ✅︎                     | ✅︎                          |
+| `FalconH1ForCausalLM`                             | Falcon-H1                                           | `tiiuae/Falcon-H1-34B-Base`, `tiiuae/Falcon-H1-34B-Instruct`, etc.                                                                                                           | ✅︎                     | ✅︎                          |
+| `GemmaForCausalLM`                                | Gemma                                               | `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc.                                                                                                                            | ✅︎                     | ✅︎                          |
+| `Gemma2ForCausalLM`                               | Gemma 2                                             | `google/gemma-2-9b`, `google/gemma-2-27b`, etc.                                                                                                                              | ✅︎                     | ✅︎                          |
+| `Gemma3ForCausalLM`                               | Gemma 3                                             | `google/gemma-3-1b-it`, etc.                                                                                                                                                 | ✅︎                     | ✅︎                          |
+| `GlmForCausalLM`                                  | GLM-4                                               | `THUDM/glm-4-9b-chat-hf`, etc.                                                                                                                                               | ✅︎                     | ✅︎                          |
+| `Glm4ForCausalLM`                                 | GLM-4-0414                                          | `THUDM/GLM-4-32B-0414`, etc.                                                                                                                                                 | ✅︎                     | ✅︎                          |
+| `GPT2LMHeadModel`                                 | GPT-2                                               | `gpt2`, `gpt2-xl`, etc.                                                                                                                                                      | ✅︎                     |                             |
+| `GPTBigCodeForCausalLM`                           | StarCoder, SantaCoder, WizardCoder                  | `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc.                                                                                 | ✅︎                     | ✅︎                          |
+| `GPTJForCausalLM`                                 | GPT-J                                               | `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.                                                                                                                            | ✅︎                     |                             |
+| `GPTNeoXForCausalLM`                              | GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM | `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc. | ✅︎                     |                             |
+| `GraniteForCausalLM`                              | Granite 3.0, Granite 3.1, PowerLM                   | `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc.                                                                             | ✅︎                     | ✅︎                          |
+| `GraniteMoeForCausalLM`                           | Granite 3.0 MoE, PowerMoE                           | `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc.                                                                | ✅︎                     | ✅︎                          |
+| `GraniteMoeHybridForCausalLM`                     | Granite 4.0 MoE Hybrid                              | `ibm-granite/granite-4.0-tiny-preview`, etc.                                                                                                                                 | ✅︎                     | ✅︎                          |
+| `GraniteMoeSharedForCausalLM`                     | Granite MoE Shared                                  | `ibm-research/moe-7b-1b-active-shared-experts` (test model)                                                                                                                  | ✅︎                     | ✅︎                          |
+| `GritLM`                                          | GritLM                                              | `parasail-ai/GritLM-7B-vllm`.                                                                                                                                                | ✅︎                     | ✅︎                          |
+| `Grok1ModelForCausalLM`                           | Grok1                                               | `hpcai-tech/grok-1`.                                                                                                                                                         | ✅︎                     | ✅︎                          |
+| `InternLMForCausalLM`                             | InternLM                                            | `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.                                                                                                                    | ✅︎                     | ✅︎                          |
+| `InternLM2ForCausalLM`                            | InternLM2                                           | `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.                                                                                                                  | ✅︎                     | ✅︎                          |
+| `InternLM3ForCausalLM`                            | InternLM3                                           | `internlm/internlm3-8b-instruct`, etc.                                                                                                                                       | ✅︎                     | ✅︎                          |
+| `JAISLMHeadModel`                                 | Jais                                                | `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc.                                                         | ✅︎                     |                             |
+| `JambaForCausalLM`                                | Jamba                                               | `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc.                                                                                 | ✅︎                     | ✅︎                          |
+| `LlamaForCausalLM`                                | Llama 3.1, Llama 3, Llama 2, LLaMA, Yi              | `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc.        | ✅︎                     | ✅︎                          |
+| `MambaForCausalLM`                                | Mamba                                               | `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc.                                                                               | ✅︎                     |                             |
+| `MiniCPMForCausalLM`                              | MiniCPM                                             | `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc.                                                                               | ✅︎                     | ✅︎                          |
+| `MiniCPM3ForCausalLM`                             | MiniCPM3                                            | `openbmb/MiniCPM3-4B`, etc.                                                                                                                                                  | ✅︎                     | ✅︎                          |
+| `MistralForCausalLM`                              | Mistral, Mistral-Instruct                           | `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.                                                                                                      | ✅︎                     | ✅︎                          |
+| `MixtralForCausalLM`                              | Mixtral-8x7B, Mixtral-8x7B-Instruct                 | `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.                                                          | ✅︎                     | ✅︎                          |
+| `MPTForCausalLM`                                  | MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter        | `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc.                                                                                                   | ✅︎                     |                             |
+| `NemotronForCausalLM`                             | Nemotron-3, Nemotron-4, Minitron                    | `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc.                                                                                                         | ✅︎                     | ✅︎                          |
+| `OLMoForCausalLM`                                 | OLMo                                                | `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.                                                                                                                             | ✅︎                     |                             |
+| `OLMo2ForCausalLM`                                | OLMo2                                               | `allenai/OLMo-2-0425-1B`, etc.                                                                                                                                               | ✅︎                     |                             |
+| `OLMoEForCausalLM`                                | OLMoE                                               | `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc.                                                                                                        | ✅︎                     | ✅︎                          |
+| `OPTForCausalLM`                                  | OPT, OPT-IML                                        | `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.                                                                                                                         | ✅︎                     |                             |
+| `OrionForCausalLM`                                | Orion                                               | `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.                                                                                                             | ✅︎                     |                             |
+| `PhiForCausalLM`                                  | Phi                                                 | `microsoft/phi-1_5`, `microsoft/phi-2`, etc.                                                                                                                                 | ✅︎                     | ✅︎                          |
+| `Phi3ForCausalLM`                                 | Phi-4, Phi-3                                        | `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.   | ✅︎                     | ✅︎                          |
+| `Phi3SmallForCausalLM`                            | Phi-3-Small                                         | `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc.                                                                                             | ✅︎                     |                             |
+| `PhiMoEForCausalLM`                               | Phi-3.5-MoE                                         | `microsoft/Phi-3.5-MoE-instruct`, etc.                                                                                                                                       | ✅︎                     | ✅︎                          |
+| `PersimmonForCausalLM`                            | Persimmon                                           | `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc.                                                                                                                   | ✅︎                     |                             |
+| `Plamo2ForCausalLM`                               | PLaMo2                                              | `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc.                                                                                                                                 |                        |                             |
+| `QWenLMHeadModel`                                 | Qwen                                                | `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.                                                                                                                                    | ✅︎                     | ✅︎                          |
+| `Qwen2ForCausalLM`                                | QwQ, Qwen2                                          | `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc.                                                                                                      | ✅︎                     | ✅︎                          |
+| `Qwen2MoeForCausalLM`                             | Qwen2MoE                                            | `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.                                                                                                                | ✅︎                     |                             |
+| `Qwen3ForCausalLM`                                | Qwen3                                               | `Qwen/Qwen3-8B`, etc.                                                                                                                                                        | ✅︎                     | ✅︎                          |
+| `Qwen3MoeForCausalLM`                             | Qwen3MoE                                            | `Qwen/Qwen3-30B-A3B`, etc.                                                                                                                                                   | ✅︎                     |                             |
+| `StableLmForCausalLM`                             | StableLM                                            | `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.                                                                                                | ✅︎                     |                             |
+| `Starcoder2ForCausalLM`                           | Starcoder2                                          | `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.                                                                                             | ✅︎                     |                             |
+| `SolarForCausalLM`                                | Solar Pro                                           | `upstage/solar-pro-preview-instruct`, etc.                                                                                                                                   | ✅︎                     | ✅︎                          |
+| `TeleChat2ForCausalLM`                            | TeleChat2                                           | `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc.                                                                                                | ✅︎                     | ✅︎                          |
+| `TeleFLMForCausalLM`                              | TeleFLM                                             | `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc.                                                                                                                    | ✅︎                     | ✅︎                          |
+| `XverseForCausalLM`                               | XVERSE                                              | `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.                                                                                            | ✅︎                     | ✅︎                          |
+| `MiniMaxText01ForCausalLM`                        | MiniMax-Text                                        | `MiniMaxAI/MiniMax-Text-01`, etc.                                                                                                                                            | ✅︎                     |                             |
+| `Zamba2ForCausalLM`                               | Zamba2                                              | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc.                                                                              |                        |                             |
+
+!!! note
+    Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
+
+### Pooling Models
+
+See [this page](./pooling_models.md) for more information on how to use pooling models.
+
+!!! warning
+    Since some model architectures support both generative and pooling tasks,
+    you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode.
+
+#### Text Embedding
+
+Specified using `--task embed`.
+
+| Architecture                                           | Models              | Example HF Models                                                                                                   | [LoRA][lora-adapter]   | [PP][distributed-serving]   |
+|--------------------------------------------------------|---------------------|---------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|
+| `BertModel`                                            | BERT-based          | `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc.                                                |                        |                             |
+| `Gemma2Model`                                          | Gemma 2-based       | `BAAI/bge-multilingual-gemma2`, etc.                                                                                | ✅︎                     |                             |
+| `GritLM`                                               | GritLM              | `parasail-ai/GritLM-7B-vllm`.                                                                                       | ✅︎                     | ✅︎                          |
+| `GteModel`                                             | Arctic-Embed-2.0-M  | `Snowflake/snowflake-arctic-embed-m-v2.0`.                                                                          |                        |                             |
+| `GteNewModel`                                          | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-base`, etc.                                                                           |                        |                             |
+| `ModernBertModel`                                      | ModernBERT-based    | `Alibaba-NLP/gte-modernbert-base`, etc.                                                                             |                        |                             |
+| `NomicBertModel`                                       | Nomic BERT          | `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc. |                        |                             |
+| `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc. | Llama-based         | `intfloat/e5-mistral-7b-instruct`, etc.                                                                             | ✅︎                     | ✅︎                          |
+| `Qwen2Model`, `Qwen2ForCausalLM`                       | Qwen2-based         | `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc.              | ✅︎                     | ✅︎                          |
+| `RobertaModel`, `RobertaForMaskedLM`                   | RoBERTa-based       | `sentence-transformers/all-roberta-large-v1`, etc.                                                                  |                        |                             |
+
+!!! note
+    `ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
+    You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
+
+!!! note
+    For `Alibaba-NLP/gte-Qwen2-*`, you need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
+    See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).
+
+!!! note
+    `jinaai/jina-embeddings-v3` supports multiple tasks through LoRA, while vLLM currently only supports text-matching tasks by merging LoRA weights.
+
+!!! note
+    The second-generation GTE model (mGTE-TRM) is named `NewModel`. Because the name `NewModel` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewModel"]}'` to specify the use of the `GteNewModel` architecture.
+
+If your model is not in the above list, we will try to automatically convert the model using
+[as_embedding_model][vllm.model_executor.models.adapters.as_embedding_model]. By default, the embeddings
+of the whole prompt are extracted from the normalized hidden state corresponding to the last token.
+
+#### Reward Modeling
+
+Specified using `--task reward`.
+
+| Architecture              | Models          | Example HF Models                                                      | [LoRA][lora-adapter]   | [PP][distributed-serving]   |
+|---------------------------|-----------------|------------------------------------------------------------------------|------------------------|-----------------------------|
+| `InternLM2ForRewardModel` | InternLM2-based | `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc. | ✅︎                     | ✅︎                          |
+| `LlamaForCausalLM`        | Llama-based     | `peiyi9979/math-shepherd-mistral-7b-prm`, etc.                         | ✅︎                     | ✅︎                          |
+| `Qwen2ForRewardModel`     | Qwen2-based     | `Qwen/Qwen2.5-Math-RM-72B`, etc.                                       | ✅︎                     | ✅︎                          |
+
+If your model is not in the above list, we will try to automatically convert the model using
+[as_reward_model][vllm.model_executor.models.adapters.as_reward_model]. By default, we return the hidden states of each token directly.
+
+!!! warning
+    For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
+    e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
+
+#### Classification
+
+Specified using `--task classify`.
+
+| Architecture                     | Models   | Example HF Models                      | [LoRA][lora-adapter]   | [PP][distributed-serving]   |
+|----------------------------------|----------|----------------------------------------|------------------------|-----------------------------|
+| `JambaForSequenceClassification` | Jamba    | `ai21labs/Jamba-tiny-reward-dev`, etc. | ✅︎                     | ✅︎                          |
+
+If your model is not in the above list, we will try to automatically convert the model using
+[as_classification_model][vllm.model_executor.models.adapters.as_classification_model]. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
+
+#### Sentence Pair Scoring
+
+Specified using `--task score`.
+
+| Architecture                          | Models            | Example HF Models                            |
+|---------------------------------------|-------------------|----------------------------------------------|
+| `BertForSequenceClassification`       | BERT-based        | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. |
+| `RobertaForSequenceClassification`    | RoBERTa-based     | `cross-encoder/quora-roberta-base`, etc.     |
+| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc.              |
+
+[](){ #supported-mm-models }
+
+## List of Multimodal Language Models
+
+The following modalities are supported depending on the model:
+
+- **T**ext
+- **I**mage
+- **V**ideo
+- **A**udio
+
+Any combination of modalities joined by `+` is supported.
+
+- e.g.: `T + I` means that the model supports text-only, image-only, and text-with-image inputs.
+
+On the other hand, modalities separated by `/` are mutually exclusive.
+
+- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
+
+See [this page][multimodal-inputs] on how to pass multi-modal inputs to the model.
+
+!!! warning
+    **To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference)
+    or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt:
+
+    Offline inference:
+
+    ```python
+    from vllm import LLM
+
+    llm = LLM(
+        model="Qwen/Qwen2-VL-7B-Instruct",
+        limit_mm_per_prompt={"image": 4},
+    )
+    ```
+
+    Online serving:
+
+    ```bash
+    vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}'
+    ```
+
+    **This is no longer required if you are using vLLM V1.**
+
+!!! note
+    vLLM currently only supports adding LoRA to the language backbone of multimodal models.
+
+### Generative Models
+
+See [this page][generative-models] for more information on how to use generative models.
+
+#### Text Generation
+
+Specified using `--task generate`.
+
+| Architecture                                 | Models                                                                   | Inputs                                                                | Example HF Models                                                                                                                                       | [LoRA][lora-adapter]   | [PP][distributed-serving]   | [V1](gh-issue:8779)   |
+|----------------------------------------------|--------------------------------------------------------------------------|-----------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------|-----------------------------|-----------------------|
+| `AriaForConditionalGeneration`               | Aria                                                                     | T + I<sup>+</sup>                                                     | `rhymes-ai/Aria`                                                                                                                                        | ✅︎                     | ✅︎                          |                       |
+| `AyaVisionForConditionalGeneration`          | Aya Vision                                                               | T + I<sup>+</sup>                                                     | `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc.                                                                                         | ✅︎                     | ✅︎                          |                       |
+| `Blip2ForConditionalGeneration`              | BLIP-2                                                                   | T + I<sup>E</sup>                                                     | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.                                                                                          | ✅︎                     | ✅︎                          |                       |
+| `ChameleonForConditionalGeneration`          | Chameleon                                                                | T + I                                                                 | `facebook/chameleon-7b` etc.                                                                                                                            | ✅︎                     | ✅︎                          |                       |
+| `DeepseekVLV2ForCausalLM`<sup>^</sup>        | DeepSeek-VL2                                                             | T + I<sup>+</sup>                                                     | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc.                                                      | ✅︎                     | ✅︎                          |                       |
+| `Florence2ForConditionalGeneration`          | Florence-2                                                               | T + I                                                                 | `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc.                                                                                          |                        |                             |                       |
+| `FuyuForCausalLM`                            | Fuyu                                                                     | T + I                                                                 | `adept/fuyu-8b` etc.                                                                                                                                    | ✅︎                     | ✅︎                          |                       |
+| `Gemma3ForConditionalGeneration`             | Gemma 3                                                                  | T + I<sup>+</sup>                                                     | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.                                                                                                   | ✅︎                     | ✅︎                          | ⚠️                    |
+| `GLM4VForCausalLM`<sup>^</sup>               | GLM-4V                                                                   | T + I                                                                 | `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc.                                                                                                    | ✅︎                     | ✅︎                          | ✅︎                    |
+| `GraniteSpeechForConditionalGeneration`      | Granite Speech                                                           | T + A                                                                 | `ibm-granite/granite-speech-3.3-8b`                                                                                                                     | ✅︎                     | ✅︎                          | ✅︎                    |
+| `H2OVLChatModel`                             | H2OVL                                                                    | T + I<sup>E+</sup>                                                    | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.                                                                                      | ✅︎                     | ✅︎\*                        |                       |
+| `Idefics3ForConditionalGeneration`           | Idefics3                                                                 | T + I                                                                 | `HuggingFaceM4/Idefics3-8B-Llama3` etc.                                                                                                                 | ✅︎                     | ✅︎                          |                       |
+| `InternVLChatModel`                          | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>)                                                    | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.  | ✅︎                     | ✅︎                          |                       |
+| `KimiVLForConditionalGeneration`             | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking                               | T + I<sup>+</sup>                                                     | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking`                                                                                    | ✅︎                     |                             |                       |
+| `Llama4ForConditionalGeneration`             | Llama 4                                                                  | T + I<sup>+</sup>                                                     | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎                     | ✅︎                          |                       |
+| `LlavaForConditionalGeneration`              | LLaVA-1.5                                                                | T + I<sup>E+</sup>                                                    | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.                                                                        | ✅︎                     | ✅︎                          |                       |
+| `LlavaNextForConditionalGeneration`          | LLaVA-NeXT                                                               | T + I<sup>E+</sup>                                                    | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.                                                                           | ✅︎                     | ✅︎                          |                       |
+| `LlavaNextVideoForConditionalGeneration`     | LLaVA-NeXT-Video                                                         | T + V                                                                 | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.                                                                                                                 | ✅︎                     | ✅︎                          |                       |
+| `LlavaOnevisionForConditionalGeneration`     | LLaVA-Onevision                                                          | T + I<sup>+</sup> + V<sup>+</sup>                                     | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.                                                            | ✅︎                     | ✅︎                          |                       |
+| `MiniCPMO`                                   | MiniCPM-O                                                                | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup>                  | `openbmb/MiniCPM-o-2_6`, etc.                                                                                                                           | ✅︎                     | ✅︎                          | ✅︎                    |
+| `MiniCPMV`                                   | MiniCPM-V                                                                | T + I<sup>E+</sup> + V<sup>E+</sup>                                   | `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc.                                                         | ✅︎                     | ✅︎                          | ✅︎                    |
+| `MiniMaxVL01ForConditionalGeneration`        | MiniMax-VL                                                               | T + I<sup>E+</sup>                                                    | `MiniMaxAI/MiniMax-VL-01`, etc.                                                                                                                         | ✅︎                     | ✅︎                          |                       |
+| `Mistral3ForConditionalGeneration`           | Mistral3                                                                 | T + I<sup>+</sup>                                                     | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc.                                                                                                   | ✅︎                     | ✅︎                          | ✅︎                    |
+| `MllamaForConditionalGeneration`             | Llama 3.2                                                                | T + I<sup>+</sup>                                                     | `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc.                                                                     |                        |                             |                       |
+| `MolmoForCausalLM`                           | Molmo                                                                    | T + I<sup>+</sup>                                                     | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc.                                                                                              | ✅︎                     | ✅︎                          | ✅︎                    |
+| `NVLM_D_Model`                               | NVLM-D 1.0                                                               | T + I<sup>+</sup>                                                     | `nvidia/NVLM-D-72B`, etc.                                                                                                                               | ✅︎                     | ✅︎                          |                       |
+| `Ovis`                                       | Ovis2, Ovis1.6                                                           | T + I<sup>+</sup>                                                     | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc.                                                                                                 | ✅︎                     |                             |                       |
+| `PaliGemmaForConditionalGeneration`          | PaliGemma, PaliGemma 2                                                   | T + I<sup>E</sup>                                                     | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc.                                                  | ✅︎                     | ⚠️                          |                       |
+| `Phi3VForCausalLM`                           | Phi-3-Vision, Phi-3.5-Vision                                             | T + I<sup>E+</sup>                                                    | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc.                                                                       | ✅︎                     | ✅︎                          |                       |
+| `Phi4MMForCausalLM`                          | Phi-4-multimodal                                                         | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc.                                                                                                             | ✅︎                     | ✅︎                          |                       |
+| `PixtralForConditionalGeneration`            | Pixtral                                                                  | T + I<sup>+</sup>                                                     | `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc.                                                                  | ✅︎                     | ✅︎                          |                       |
+| `QwenVLForConditionalGeneration`<sup>^</sup> | Qwen-VL                                                                  | T + I<sup>E+</sup>                                                    | `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.                                                                                                               | ✅︎                     | ✅︎                          | ✅︎                    |
+| `Qwen2AudioForConditionalGeneration`         | Qwen2-Audio                                                              | T + A<sup>+</sup>                                                     | `Qwen/Qwen2-Audio-7B-Instruct`                                                                                                                          | ✅︎                     | ✅︎                          |                       |
+| `Qwen2VLForConditionalGeneration`            | QVQ, Qwen2-VL                                                            | T + I<sup>E+</sup> + V<sup>E+</sup>                                   | `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc.                                                                 | ✅︎                     | ✅︎                          | ✅︎                    |
+| `Qwen2_5_VLForConditionalGeneration`         | Qwen2.5-VL                                                               | T + I<sup>E+</sup> + V<sup>E+</sup>                                   | `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc.                                                                                     | ✅︎                     | ✅︎                          | ✅︎                    |
+| `Qwen2_5OmniThinkerForConditionalGeneration` | Qwen2.5-Omni                                                             | T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup>                   | `Qwen/Qwen2.5-Omni-7B`                                                                                                                                  | ✅︎                     | ✅︎\*                        |                       |
+| `SkyworkR1VChatModel`                        | Skywork-R1V-38B                                                          | T + I                                                                 | `Skywork/Skywork-R1V-38B`                                                                                                                               | ✅︎                     | ✅︎                          |                       |
+| `SmolVLMForConditionalGeneration`            | SmolVLM2                                                                 | T + I                                                                 | `HuggingFaceTB/SmolVLM2-2.2B-Instruct`                                                                                                                  | ✅︎                     | ✅︎                          |                       |
+
+<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.  
+&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:  
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`  
+<sup>E</sup> Pre-computed embeddings can be inputted for this modality.  
+<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
+
+!!! warning
+    Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs.
+    However, there are differences in how they handle text + image inputs:
+
+    V0 correctly implements the model's attention pattern:
+    - Uses bidirectional attention between the image tokens corresponding to the same image
+    - Uses causal attention for other tokens
+    - Implemented via (naive) PyTorch SDPA with masking tensors
+    - Note: May use significant memory for long prompts with images
+
+    V1 currently uses a simplified attention pattern:
+    - Uses causal attention for all tokens, including image tokens
+    - Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}` is set
+    - Will be updated in the future to support the correct behavior
+
+    This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
+
+!!! note
+    Currently, only `InternVLChatModel` with a Qwen2.5 text backbone (`OpenGVLab/InternVL3-2B`, `OpenGVLab/InternVL2.5-1B`, etc.) supports video inputs.
+
+!!! note
+    `h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80.
+
+!!! note
+    To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf-overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
+
+!!! warning
+    The output quality of `AllenAI/Molmo-7B-D-0924` (especially in object localization tasks) has deteriorated in recent updates.
+
+    For the best results, we recommend using the following dependency versions (tested on A10 and L40):
+
+    ```text
+    # Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40)
+    torch==2.5.1
+    torchvision==0.20.1
+    transformers==4.48.1
+    tokenizers==0.21.0
+    tiktoken==0.7.0
+    vllm==0.7.0
+
+    # Optional but recommended for improved performance and stability
+    triton==3.1.0
+    xformers==0.0.28.post3
+    uvloop==0.21.0
+    protobuf==5.29.3
+    openai==1.60.2
+    opencv-python-headless==4.11.0.86
+    pillow==10.4.0
+
+    # Installed FlashAttention (for float16 only)
+    flash-attn>=2.5.6  # Not used in float32, but should be documented
+    ```
+
+    **Note:** Make sure you understand the security implications of using outdated packages.
+
+!!! note
+    The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
+    For more details, please see: <gh-pr:4087#issuecomment-2250397630>
+
+!!! warning
+    Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
+
+!!! note
+    To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via
+    `pip install git+https://github.com/huggingface/transformers.git`.
+
+    Reading audio from video during pre-processing is currently supported on V0 (but not V1), because overlapping modalities are not yet supported in V1.
+    To enable it, pass `--mm-processor-kwargs '{"use_audio_in_video": true}'`.
+
+### Pooling Models
+
+See [this page](./pooling_models.md) for more information on how to use pooling models.
+
+!!! warning
+    Since some model architectures support both generative and pooling tasks,
+    you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode.
+
+#### Text Embedding
+
+Specified using `--task embed`.
+
+Any text generation model can be converted into an embedding model by passing `--task embed`.
+
+!!! note
+    To get the best results, you should use pooling models that are specifically trained as such.
+
+The following table lists those that are tested in vLLM.
+
+| Architecture                        | Models             | Inputs   | Example HF Models        | [LoRA][lora-adapter]   | [PP][distributed-serving]   |
+|-------------------------------------|--------------------|----------|--------------------------|------------------------|-----------------------------|
+| `LlavaNextForConditionalGeneration` | LLaVA-NeXT-based   | T / I    | `royokong/e5-v`          | ✅︎                     |                             |
+| `Phi3VForCausalLM`                  | Phi-3-Vision-based | T + I    | `TIGER-Lab/VLM2Vec-Full` | 🚧                      | ✅︎                          |
+
+#### Transcription
+
+Specified using `--task transcription`.
+
+Speech2Text models trained specifically for Automatic Speech Recognition.
+
+| Architecture   | Models   | Example HF Models   | [LoRA][lora-adapter]   | [PP][distributed-serving]   |
+|----------------|----------|---------------------|------------------------|-----------------------------|
+
+---
+
+## Model Support Policy
+
+At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
+
+1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated!
+
+2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.
+
+    !!! tip
+        When comparing the output of `model.generate` from Hugging Face Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
+
+3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.
+
+4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use.
+
+5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement.
+
+Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem.
+
+Note that, as an inference engine, vLLM does not introduce new models. Therefore, all models supported by vLLM are third-party models in this regard.
+
+We have the following levels of testing for models:
+
+1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test.
+2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
+3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test.
+4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
diff --git a/docs/seed_parameter_behavior.md b/docs/seed_parameter_behavior.md
deleted file mode 100644
index ff17525cf8e2..000000000000
--- a/docs/seed_parameter_behavior.md
+++ /dev/null
@@ -1,51 +0,0 @@
-# Seed Parameter Behavior in vLLM
-
-## Overview
-
-The `seed` parameter in vLLM is used to control the random states for various random number generators. This parameter can affect the behavior of random operations in user code, especially when working with models in vLLM.
-
-## Default Behavior
-
-By default, the `seed` parameter is set to `None`. When the `seed` parameter is `None`, the global random states for `random`, `np.random`, and `torch.manual_seed` are not set. This means that the random operations will behave as expected, without any fixed random states.
-
-## Specifying a Seed
-
-If a specific seed value is provided, the global random states for `random`, `np.random`, and `torch.manual_seed` will be set accordingly. This can be useful for reproducibility, as it ensures that the random operations produce the same results across multiple runs.
-
-## Example Usage
-
-### Without Specifying a Seed
-
-```python
-import random
-from vllm import LLM
-
-# Initialize a vLLM model without specifying a seed
-model = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")
-
-# Try generating random numbers
-print(random.randint(0, 100))  # Outputs different numbers across runs
-```
-
-### Specifying a Seed
-
-```python
-import random
-from vllm import LLM
-
-# Initialize a vLLM model with a specific seed
-model = LLM(model="Qwen/Qwen2.5-0.5B-Instruct", seed=42)
-
-# Try generating random numbers
-print(random.randint(0, 100))  # Outputs the same number across runs
-```
-
-## Important Notes
-
-- If the `seed` parameter is not specified, the behavior of global random states remains unaffected.
-- If a specific seed value is provided, the global random states for `random`, `np.random`, and `torch.manual_seed` will be set to that value.
-- This behavior can be useful for reproducibility but may lead to non-intuitive behavior if the user is not explicitly aware of it.
-
-## Conclusion
-
-Understanding the behavior of the `seed` parameter in vLLM is crucial for ensuring the expected behavior of random operations in your code. By default, the `seed` parameter is set to `None`, which means that the global random states are not affected. However, specifying a seed value can help achieve reproducibility in your experiments.
diff --git a/docs/source/serving/distributed_serving.md b/docs/serving/distributed_serving.md
similarity index 73%
rename from docs/source/serving/distributed_serving.md
rename to docs/serving/distributed_serving.md
index c285ef3e8e1c..259af5cabcb8 100644
--- a/docs/source/serving/distributed_serving.md
+++ b/docs/serving/distributed_serving.md
@@ -1,6 +1,7 @@
-(distributed-serving)=
-
-# Distributed Inference and Serving
+---
+title: Distributed Inference and Serving
+---
+[](){ #distributed-serving }
 
 ## How to decide the distributed inference strategy?
 
@@ -14,9 +15,8 @@ In short, you should increase the number of GPUs and the number of nodes until y
 
 After adding enough GPUs and nodes to hold the model, you can run vLLM first, which will print some logs like `# GPU blocks: 790`. Multiply the number by `16` (the block size), and you can get roughly the maximum number of tokens that can be served on the current configuration. If this number is not satisfying, e.g. you want higher throughput, you can further increase the number of GPUs or nodes, until the number of blocks is enough.
 
-:::{note}
-There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs.
-:::
+!!! note
+    There is one edge case: if the model fits in a single node with multiple GPUs, but the number of GPUs cannot divide the model size evenly, you can use pipeline parallelism, which splits the model along layers and supports uneven splits. In this case, the tensor parallel size should be 1 and the pipeline parallel size should be the number of GPUs.
 
 ## Running vLLM on a single node
 
@@ -77,13 +77,11 @@ bash run_cluster.sh \
 
 Then you get a ray cluster of **containers**. Note that you need to keep the shells running these commands alive to hold the cluster. Any shell disconnect will terminate the cluster. In addition, please note that the argument `ip_of_head_node` should be the IP address of the head node, which is accessible by all the worker nodes. The IP addresses of each worker node should be specified in the `VLLM_HOST_IP` environment variable, and should be different for each worker node. Please check the network configuration of your cluster to make sure the nodes can communicate with each other through the specified IP addresses.
 
-:::{warning}
-It is considered best practice to set `VLLM_HOST_IP` to an address on a private network segment for the vLLM cluster. The traffic sent here is not encrypted. The endpoints are also exchanging data in a format that could be exploited to execute arbitrary code should a malicious party gain access to the network. Please ensure that this network is not reachable by any untrusted parties.
-:::
+!!! warning
+    It is considered best practice to set `VLLM_HOST_IP` to an address on a private network segment for the vLLM cluster. The traffic sent here is not encrypted. The endpoints are also exchanging data in a format that could be exploited to execute arbitrary code should a malicious party gain access to the network. Please ensure that this network is not reachable by any untrusted parties.
 
-:::{warning}
-Since this is a ray cluster of **containers**, all the following commands should be executed in the **containers**, otherwise you are executing the commands on the host machine, which is not connected to the ray cluster. To enter the container, you can use `docker exec -it node /bin/bash`.
-:::
+!!! warning
+    Since this is a ray cluster of **containers**, all the following commands should be executed in the **containers**, otherwise you are executing the commands on the host machine, which is not connected to the ray cluster. To enter the container, you can use `docker exec -it node /bin/bash`.
 
 Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` and `ray list nodes` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
 
@@ -104,16 +102,13 @@ vllm serve /path/to/the/model/in/the/container \
 
 To make tensor parallel performant, you should make sure the communication between nodes is efficient, e.g. using high-speed network cards like Infiniband. To correctly set up the cluster to use Infiniband, append additional arguments like `--privileged -e NCCL_IB_HCA=mlx5` to the `run_cluster.sh` script. Please contact your system administrator for more information on how to set up the flags. One way to confirm if the Infiniband is working is to run vLLM with `NCCL_DEBUG=TRACE` environment variable set, e.g. `NCCL_DEBUG=TRACE vllm serve ...` and check the logs for the NCCL version and the network used. If you find `[send] via NET/Socket` in the logs, it means NCCL uses raw TCP Socket, which is not efficient for cross-node tensor parallel. If you find `[send] via NET/IB/GDRDMA` in the logs, it means NCCL uses Infiniband with GPU-Direct RDMA, which is efficient.
 
-:::{warning}
-After you start the Ray cluster, you'd better also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script](#troubleshooting-incorrect-hardware-driver) for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information.
-:::
+!!! warning
+    After you start the Ray cluster, you should also check the GPU-GPU communication between nodes. It can be non-trivial to set up. Please refer to the [sanity check script][troubleshooting-incorrect-hardware-driver] for more information. If you need to set some environment variables for the communication configuration, you can append them to the `run_cluster.sh` script, e.g. `-e NCCL_SOCKET_IFNAME=eth0`. Note that setting environment variables in the shell (e.g. `NCCL_SOCKET_IFNAME=eth0 vllm serve ...`) only works for the processes in the same node, not for the processes in the other nodes. Setting environment variables when you create the cluster is the recommended way. See <gh-issue:6803> for more information.
 
-:::{warning}
-Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes.
+!!! warning
+    Please make sure you downloaded the model to all the nodes (with the same path), or the model is downloaded to some distributed file system that is accessible by all nodes.
 
-When you use huggingface repo id to refer to the model, you should append your huggingface token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model.
-:::
+    When you use a Hugging Face repo ID to refer to the model, you should append your Hugging Face token to the `run_cluster.sh` script, e.g. `-e HF_TOKEN=`. The recommended way is to download the model first, and then use the path to refer to the model.
 
-:::{warning}
-If you keep receiving the error message `Error: No available node types can fulfill resource request` but you have enough GPUs in the cluster, chances are your nodes have multiple IP addresses and vLLM cannot find the right one, especially when you are using multi-node inference. Please make sure vLLM and ray use the same IP address. You can set the `VLLM_HOST_IP` environment variable to the right IP address in the `run_cluster.sh` script (different for each node!), and check `ray status` and `ray list nodes` to see the IP address used by Ray. See <gh-issue:7815> for more information.
-:::
+!!! warning
+    If you keep receiving the error message `Error: No available node types can fulfill resource request` but you have enough GPUs in the cluster, chances are your nodes have multiple IP addresses and vLLM cannot find the right one, especially when you are using multi-node inference. Please make sure vLLM and ray use the same IP address. You can set the `VLLM_HOST_IP` environment variable to the right IP address in the `run_cluster.sh` script (different for each node!), and check `ray status` and `ray list nodes` to see the IP address used by Ray. See <gh-issue:7815> for more information.
diff --git a/docs/source/serving/integrations/langchain.md b/docs/serving/integrations/langchain.md
similarity index 93%
rename from docs/source/serving/integrations/langchain.md
rename to docs/serving/integrations/langchain.md
index 03142d23b145..14ea6a044341 100644
--- a/docs/source/serving/integrations/langchain.md
+++ b/docs/serving/integrations/langchain.md
@@ -1,6 +1,7 @@
-(serving-langchain)=
-
-# LangChain
+---
+title: LangChain
+---
+[](){ #serving-langchain }
 
 vLLM is also available via [LangChain](https://github.com/langchain-ai/langchain) .
 
diff --git a/docs/source/serving/integrations/llamaindex.md b/docs/serving/integrations/llamaindex.md
similarity index 91%
rename from docs/source/serving/integrations/llamaindex.md
rename to docs/serving/integrations/llamaindex.md
index 8c72605202cf..251b7155c556 100644
--- a/docs/source/serving/integrations/llamaindex.md
+++ b/docs/serving/integrations/llamaindex.md
@@ -1,6 +1,7 @@
-(serving-llamaindex)=
-
-# LlamaIndex
+---
+title: LlamaIndex
+---
+[](){ #serving-llamaindex }
 
 vLLM is also available via [LlamaIndex](https://github.com/run-llama/llama_index) .
 
diff --git a/docs/serving/offline_inference.md b/docs/serving/offline_inference.md
new file mode 100644
index 000000000000..b238199e4144
--- /dev/null
+++ b/docs/serving/offline_inference.md
@@ -0,0 +1,29 @@
+---
+title: Offline Inference
+---
+[](){ #offline-inference }
+
+You can run vLLM in your own code on a list of prompts.
+
+The offline API is based on the [LLM][vllm.LLM] class.
+To initialize the vLLM engine, create a new instance of `LLM` and specify the model to run.
+
+For example, the following code downloads the [`facebook/opt-125m`](https://huggingface.co/facebook/opt-125m) model from HuggingFace
+and runs it in vLLM using the default configuration.
+
+```python
+from vllm import LLM
+
+llm = LLM(model="facebook/opt-125m")
+```
+
+After initializing the `LLM` instance, you can perform model inference using various APIs.
+The available APIs depend on the type of model that is being run:
+
+- [Generative models][generative-models] output log-probabilities, from which the final output text is sampled.
+- [Pooling models][pooling-models] output their hidden states directly.
+
+Please refer to the above pages for more details about each API.
+
+!!! info
+    [API Reference][offline-inference-api]
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md
similarity index 60%
rename from docs/source/serving/openai_compatible_server.md
rename to docs/serving/openai_compatible_server.md
index 61f7e98bf108..c2e39d029dd5 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/serving/openai_compatible_server.md
@@ -1,13 +1,16 @@
-(openai-compatible-server)=
-
-# OpenAI-Compatible Server
+---
+title: OpenAI-Compatible Server
+---
+[](){ #openai-compatible-server }
 
 vLLM provides an HTTP server that implements OpenAI's [Completions API](https://platform.openai.com/docs/api-reference/completions), [Chat API](https://platform.openai.com/docs/api-reference/chat), and more! This functionality lets you serve models and interact with them using an HTTP client.
 
-In your terminal, you can [install](../getting_started/installation.md) vLLM, then start the server with the [`vllm serve`](#serve-args) command. (You can also use our [Docker](#deployment-docker) image.)
+In your terminal, you can [install](../getting_started/installation/README.md) vLLM, then start the server with the [`vllm serve`][serve-args] command. (You can also use our [Docker][deployment-docker] image.)
 
 ```bash
-vllm serve NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
+vllm serve NousResearch/Meta-Llama-3-8B-Instruct \
+  --dtype auto \
+  --api-key token-abc123
 ```
 
 To call the server, in your preferred text editor, create a script that uses an HTTP client. Include any messages that you want to send to the model. Then run that script. Below is an example script using the [official OpenAI Python client](https://github.com/openai/openai-python).
@@ -20,58 +23,56 @@ client = OpenAI(
 )
 
 completion = client.chat.completions.create(
-  model="NousResearch/Meta-Llama-3-8B-Instruct",
-  messages=[
-    {"role": "user", "content": "Hello!"}
-  ]
+    model="NousResearch/Meta-Llama-3-8B-Instruct",
+    messages=[
+        {"role": "user", "content": "Hello!"}
+    ]
 )
 
 print(completion.choices[0].message)
 ```
 
-:::{tip}
-vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
-You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`.
-:::
+!!! tip
+    vLLM supports some parameters that are not supported by OpenAI, `top_k` for example.
+    You can pass these parameters to vLLM using the OpenAI client in the `extra_body` parameter of your requests, i.e. `extra_body={"top_k": 50}` for `top_k`.
 
-:::{important}
-By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
+!!! warning
+    By default, the server applies `generation_config.json` from the Hugging Face model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
 
-To disable this behavior, please pass `--generation-config vllm` when launching the server.
-:::
+    To disable this behavior, please pass `--generation-config vllm` when launching the server.
 
 ## Supported APIs
 
 We currently support the following OpenAI APIs:
 
-- [Completions API](#completions-api) (`/v1/completions`)
-  - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`).
-  - *Note: `suffix` parameter is not supported.*
-- [Chat Completions API](#chat-api) (`/v1/chat/completions`)
-  - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`) with a [chat template](#chat-template).
-  - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
-- [Embeddings API](#embeddings-api) (`/v1/embeddings`)
-  - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`).
-- [Transcriptions API](#transcriptions-api) (`/v1/audio/transcriptions`)
-  - Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`).
+- [Completions API][completions-api] (`/v1/completions`)
+    - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`).
+    - *Note: `suffix` parameter is not supported.*
+- [Chat Completions API][chat-api] (`/v1/chat/completions`)
+    - Only applicable to [text generation models](../models/generative_models.md) (`--task generate`) with a [chat template][chat-template].
+    - *Note: `parallel_tool_calls` and `user` parameters are ignored.*
+- [Embeddings API][embeddings-api] (`/v1/embeddings`)
+    - Only applicable to [embedding models](../models/pooling_models.md) (`--task embed`).
+- [Transcriptions API][transcriptions-api] (`/v1/audio/transcriptions`)
+    - Only applicable to Automatic Speech Recognition (ASR) models (OpenAI Whisper) (`--task generate`).
 
 In addition, we have the following custom APIs:
 
-- [Tokenizer API](#tokenizer-api) (`/tokenize`, `/detokenize`)
-  - Applicable to any model with a tokenizer.
-- [Pooling API](#pooling-api) (`/pooling`)
-  - Applicable to all [pooling models](../models/pooling_models.md).
-- [Classification API](#classification-api) (`/classify`)
-  - Only applicable to [classification models](../models/pooling_models.md) (`--task classify`).
-- [Score API](#score-api) (`/score`)
-  - Applicable to embedding models and [cross-encoder models](../models/pooling_models.md) (`--task score`).
-- [Re-rank API](#rerank-api) (`/rerank`, `/v1/rerank`, `/v2/rerank`)
-  - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
-  - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
-  - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response.
-  - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`).
-
-(chat-template)=
+- [Tokenizer API][tokenizer-api] (`/tokenize`, `/detokenize`)
+    - Applicable to any model with a tokenizer.
+- [Pooling API][pooling-api] (`/pooling`)
+    - Applicable to all [pooling models](../models/pooling_models.md).
+- [Classification API][classification-api] (`/classify`)
+    - Only applicable to [classification models](../models/pooling_models.md) (`--task classify`).
+- [Score API][score-api] (`/score`)
+    - Applicable to embedding models and [cross-encoder models](../models/pooling_models.md) (`--task score`).
+- [Re-rank API][rerank-api] (`/rerank`, `/v1/rerank`, `/v2/rerank`)
+    - Implements [Jina AI's v1 re-rank API](https://jina.ai/reranker/)
+    - Also compatible with [Cohere's v1 & v2 re-rank APIs](https://docs.cohere.com/v2/reference/rerank)
+    - Jina and Cohere's APIs are very similar; Jina's includes extra information in the rerank endpoint's response.
+    - Only applicable to [cross-encoder models](../models/pooling_models.md) (`--task score`).
+
+[](){ #chat-template }
 
 ## Chat Template
 
@@ -97,10 +98,10 @@ both a `type` and a `text` field. An example is provided below:
 
 ```python
 completion = client.chat.completions.create(
-  model="NousResearch/Meta-Llama-3-8B-Instruct",
-  messages=[
-    {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]}
-  ]
+    model="NousResearch/Meta-Llama-3-8B-Instruct",
+    messages=[
+        {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]}
+    ]
 )
 ```
 
@@ -111,9 +112,9 @@ request. vLLM provides best-effort support to detect this automatically, which i
 the detected format, which can be one of:
 
 - `"string"`: A string.
-  - Example: `"Hello world"`
+    - Example: `"Hello world"`
 - `"openai"`: A list of dictionaries, similar to OpenAI schema.
-  - Example: `[{"type": "text", "text": "Hello world!"}]`
+    - Example: `[{"type": "text", "text": "Hello world!"}]`
 
 If the result is not what you expect, you can set the `--chat-template-content-format` CLI argument
 to override which format to use.
@@ -126,13 +127,13 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
 
 ```python
 completion = client.chat.completions.create(
-  model="NousResearch/Meta-Llama-3-8B-Instruct",
-  messages=[
-    {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
-  ],
-  extra_body={
-    "guided_choice": ["positive", "negative"]
-  }
+    model="NousResearch/Meta-Llama-3-8B-Instruct",
+    messages=[
+        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+    ],
+    extra_body={
+        "guided_choice": ["positive", "negative"]
+    }
 )
 ```
 
@@ -148,29 +149,29 @@ with `--enable-request-id-headers`.
 
 ```python
 completion = client.chat.completions.create(
-  model="NousResearch/Meta-Llama-3-8B-Instruct",
-  messages=[
-    {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
-  ],
-  extra_headers={
-    "x-request-id": "sentiment-classification-00001",
-  }
+    model="NousResearch/Meta-Llama-3-8B-Instruct",
+    messages=[
+        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+    ],
+    extra_headers={
+        "x-request-id": "sentiment-classification-00001",
+    }
 )
 print(completion._request_id)
 
 completion = client.completions.create(
-  model="NousResearch/Meta-Llama-3-8B-Instruct",
-  prompt="A robot may not injure a human being",
-  extra_headers={
-    "x-request-id": "completion-test",
-  }
+    model="NousResearch/Meta-Llama-3-8B-Instruct",
+    prompt="A robot may not injure a human being",
+    extra_headers={
+        "x-request-id": "completion-test",
+    }
 )
 print(completion._request_id)
 ```
 
 ## API Reference
 
-(completions-api)=
+[](){ #completions-api }
 
 ### Completions API
 
@@ -181,23 +182,19 @@ Code example: <gh-file:examples/online_serving/openai_completion_client.py>
 
 #### Extra parameters
 
-The following [sampling parameters](#sampling-params) are supported.
+The following [sampling parameters][sampling-params] are supported.
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-completion-sampling-params
-:end-before: end-completion-sampling-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:completion-sampling-params"
+```
 
 The following extra parameters are supported:
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-completion-extra-params
-:end-before: end-completion-extra-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:completion-extra-params"
+```
 
-(chat-api)=
+[](){ #chat-api }
 
 ### Chat API
 
@@ -206,37 +203,33 @@ you can use the [official OpenAI Python client](https://github.com/openai/openai
 
 We support both [Vision](https://platform.openai.com/docs/guides/vision)- and
 [Audio](https://platform.openai.com/docs/guides/audio?audio-generation-quickstart-example=audio-in)-related parameters;
-see our [Multimodal Inputs](#multimodal-inputs) guide for more information.
+see our [Multimodal Inputs][multimodal-inputs] guide for more information.
 - *Note: `image_url.detail` parameter is not supported.*
 
 Code example: <gh-file:examples/online_serving/openai_chat_completion_client.py>
 
 #### Extra parameters
 
-The following [sampling parameters](#sampling-params) are supported.
+The following [sampling parameters][sampling-params] are supported.
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-chat-completion-sampling-params
-:end-before: end-chat-completion-sampling-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-sampling-params"
+```
 
 The following extra parameters are supported:
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-chat-completion-extra-params
-:end-before: end-chat-completion-extra-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:chat-completion-extra-params"
+```
 
-(embeddings-api)=
+[](){ #embeddings-api }
 
 ### Embeddings API
 
 Our Embeddings API is compatible with [OpenAI's Embeddings API](https://platform.openai.com/docs/api-reference/embeddings);
 you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
 
-If the model has a [chat template](#chat-template), you can replace `inputs` with a list of `messages` (same schema as [Chat API](#chat-api))
+If the model has a [chat template][chat-template], you can replace `inputs` with a list of `messages` (same schema as [Chat API][chat-api])
 which will be treated as a single prompt to the model.
 
 Code example: <gh-file:examples/online_serving/openai_embedding_client.py>
@@ -246,138 +239,121 @@ Code example: <gh-file:examples/online_serving/openai_embedding_client.py>
 You can pass multi-modal inputs to embedding models by defining a custom chat template for the server
 and passing a list of `messages` in the request. Refer to the examples below for illustration.
 
-:::::{tab-set}
-::::{tab-item} VLM2Vec
-
-To serve the model:
+=== "VLM2Vec"
 
-```bash
-vllm serve TIGER-Lab/VLM2Vec-Full --task embed \
-  --trust-remote-code --max-model-len 4096 --chat-template examples/template_vlm2vec.jinja
-```
+    To serve the model:
 
-:::{important}
-Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed`
-to run this model in embedding mode instead of text generation mode.
+    ```bash
+    vllm serve TIGER-Lab/VLM2Vec-Full --task embed \
+      --trust-remote-code \
+      --max-model-len 4096 \
+      --chat-template examples/template_vlm2vec.jinja
+    ```
 
-The custom chat template is completely different from the original one for this model,
-and can be found here: <gh-file:examples/template_vlm2vec.jinja>
-:::
+    !!! warning
+        Since VLM2Vec has the same model architecture as Phi-3.5-Vision, we have to explicitly pass `--task embed`
+        to run this model in embedding mode instead of text generation mode.
 
-Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
+        The custom chat template is completely different from the original one for this model,
+        and can be found here: <gh-file:examples/template_vlm2vec.jinja>
 
-```python
-import requests
-
-image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-
-response = requests.post(
-    "http://localhost:8000/v1/embeddings",
-    json={
-        "model": "TIGER-Lab/VLM2Vec-Full",
-        "messages": [{
-            "role": "user",
-            "content": [
-                {"type": "image_url", "image_url": {"url": image_url}},
-                {"type": "text", "text": "Represent the given image."},
-            ],
-        }],
-        "encoding_format": "float",
-    },
-)
-response.raise_for_status()
-response_json = response.json()
-print("Embedding output:", response_json["data"][0]["embedding"])
-```
+    Since the request schema is not defined by the OpenAI client, we post a request to the server using the lower-level `requests` library:
 
-::::
+    ```python
+    import requests
 
-::::{tab-item} DSE-Qwen2-MRL
+    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
 
-To serve the model:
+    response = requests.post(
+        "http://localhost:8000/v1/embeddings",
+        json={
+            "model": "TIGER-Lab/VLM2Vec-Full",
+            "messages": [{
+                "role": "user",
+                "content": [
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                    {"type": "text", "text": "Represent the given image."},
+                ],
+            }],
+            "encoding_format": "float",
+        },
+    )
+    response.raise_for_status()
+    response_json = response.json()
+    print("Embedding output:", response_json["data"][0]["embedding"])
+    ```
 
-```bash
-vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \
-  --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja
-```
+=== "DSE-Qwen2-MRL"
 
-:::{important}
-Like with VLM2Vec, we have to explicitly pass `--task embed`.
+    To serve the model:
 
-Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
-by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja>
-:::
+    ```bash
+    vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embed \
+      --trust-remote-code \
+      --max-model-len 8192 \
+      --chat-template examples/template_dse_qwen2_vl.jinja
+    ```
 
-:::{important}
-`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
-example below for details.
-:::
+    !!! warning
+        As with VLM2Vec, we have to explicitly pass `--task embed`.
 
-::::
+        Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
+        by a custom chat template: <gh-file:examples/template_dse_qwen2_vl.jinja>
 
-:::::
+    !!! warning
+        `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
+        example below for details.
 
 Full example: <gh-file:examples/online_serving/openai_chat_embedding_client_for_multimodal.py>
 
 #### Extra parameters
 
-The following [pooling parameters](#pooling-params) are supported.
+The following [pooling parameters][pooling-params] are supported.
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-embedding-pooling-params
-:end-before: end-embedding-pooling-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:embedding-pooling-params"
+```
 
 The following extra parameters are supported by default:
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-embedding-extra-params
-:end-before: end-embedding-extra-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:embedding-extra-params"
+```
 
 For chat-like input (i.e. if `messages` is passed), these extra parameters are supported instead:
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-chat-embedding-extra-params
-:end-before: end-chat-embedding-extra-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:chat-embedding-extra-params"
+```
 
-(transcriptions-api)=
+[](){ #transcriptions-api }
 
 ### Transcriptions API
 
 Our Transcriptions API is compatible with [OpenAI's Transcriptions API](https://platform.openai.com/docs/api-reference/audio/createTranscription);
 you can use the [official OpenAI Python client](https://github.com/openai/openai-python) to interact with it.
 
-:::{note}
-To use the Transcriptions API, please install with extra audio dependencies using `pip install vllm[audio]`.
-:::
+!!! note
+    To use the Transcriptions API, please install with extra audio dependencies using `pip install vllm[audio]`.
 
 Code example: <gh-file:examples/online_serving/openai_transcription_client.py>
 <!-- TODO: api enforced limits + uploading audios -->
 
 #### Extra Parameters
 
-The following [sampling parameters](#sampling-params) are supported.
+The following [sampling parameters][sampling-params] are supported.
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-transcription-sampling-params
-:end-before: end-transcription-sampling-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params"
+```
 
 The following extra parameters are supported:
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-transcription-extra-params
-:end-before: end-transcription-extra-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
+```
 
-(tokenizer-api)=
+[](){ #tokenizer-api }
 
 ### Tokenizer API
 
@@ -387,17 +363,17 @@ It consists of two endpoints:
 - `/tokenize` corresponds to calling `tokenizer.encode()`.
 - `/detokenize` corresponds to calling `tokenizer.decode()`.
 
-(pooling-api)=
+[](){ #pooling-api }
 
 ### Pooling API
 
 Our Pooling API encodes input prompts using a [pooling model](../models/pooling_models.md) and returns the corresponding hidden states.
 
-The input format is the same as [Embeddings API](#embeddings-api), but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
+The input format is the same as [Embeddings API][embeddings-api], but the output data can contain an arbitrary nested list, not just a 1-D list of floats.
 
 Code example: <gh-file:examples/online_serving/openai_pooling_client.py>
 
-(classification-api)=
+[](){ #classification-api }
 
 ### Classification API
 
@@ -505,23 +481,19 @@ Response:
 
 #### Extra parameters
 
-The following [pooling parameters](#pooling-params) are supported.
+The following [pooling parameters][pooling-params] are supported.
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-classification-pooling-params
-:end-before: end-classification-pooling-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:classification-pooling-params"
+```
 
 The following extra parameters are supported:
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-classification-extra-params
-:end-before: end-classification-extra-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:classification-extra-params"
+```
 
-(score-api)=
+[](){ #score-api }
 
 ### Score API
 
@@ -668,23 +640,19 @@ Response:
 
 #### Extra parameters
 
-The following [pooling parameters](#pooling-params) are supported.
+The following [pooling parameters][pooling-params] are supported.
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-score-pooling-params
-:end-before: end-score-pooling-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:score-pooling-params"
+```
 
 The following extra parameters are supported:
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-score-extra-params
-:end-before: end-score-extra-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:score-extra-params"
+```
 
-(rerank-api)=
+[](){ #rerank-api }
 
 ### Re-rank API
 
@@ -755,18 +723,14 @@ Response:
 
 #### Extra parameters
 
-The following [pooling parameters](#pooling-params) are supported.
+The following [pooling parameters][pooling-params] are supported.
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-rerank-pooling-params
-:end-before: end-rerank-pooling-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:rerank-pooling-params"
+```
 
 The following extra parameters are supported:
 
-:::{literalinclude} ../../../vllm/entrypoints/openai/protocol.py
-:language: python
-:start-after: begin-rerank-extra-params
-:end-before: end-rerank-extra-params
-:::
+```python
+--8<-- "vllm/entrypoints/openai/protocol.py:rerank-extra-params"
+```
diff --git a/docs/source/_static/custom.css b/docs/source/_static/custom.css
deleted file mode 100644
index 79bd2082b49e..000000000000
--- a/docs/source/_static/custom.css
+++ /dev/null
@@ -1,8 +0,0 @@
-.vertical-table-header th.head:not(.stub) {
-    writing-mode: sideways-lr;
-    white-space: nowrap;
-    max-width: 0;
-    p {
-       margin: 0;
-    }
-}
diff --git a/docs/source/_templates/sections/header.html b/docs/source/_templates/sections/header.html
deleted file mode 100644
index 7174431b1027..000000000000
--- a/docs/source/_templates/sections/header.html
+++ /dev/null
@@ -1,39 +0,0 @@
-<style>
-  .notification-bar {
-    width: 100vw;
-    display: flex;
-    justify-content: center;
-    align-items: center;
-    font-size: 16px;
-    padding: 0 6px 0 6px;
-  }
-  .notification-bar p {
-    margin: 0;
-  }
-  .notification-bar a {
-    font-weight: bold;
-    text-decoration: none;
-  }
-
-  /* Light mode styles (default) */
-  .notification-bar {
-    background-color: #fff3cd;
-    color: #856404;
-  }
-  .notification-bar a {
-    color: #d97706;
-  }
-
-  /* Dark mode styles */
-  html[data-theme=dark] .notification-bar {
-    background-color: #333;
-    color: #ddd;
-  }
-  html[data-theme=dark] .notification-bar a {
-    color: #ffa500; /* Brighter color for visibility */
-  }
-</style>
-
-<div class="notification-bar">
-  <p>You are viewing the latest developer preview docs. <a href="https://docs.vllm.ai/en/stable/">Click here</a> to view docs for the latest stable release.</p>
-</div>
diff --git a/docs/source/api/summary.md b/docs/source/api/summary.md
deleted file mode 100644
index 46de545f9ded..000000000000
--- a/docs/source/api/summary.md
+++ /dev/null
@@ -1,133 +0,0 @@
-# Summary
-
-(configuration)=
-
-## Configuration
-
-API documentation for vLLM's configuration classes.
-
-```{autodoc2-summary}
-    vllm.config.ModelConfig
-    vllm.config.CacheConfig
-    vllm.config.TokenizerPoolConfig
-    vllm.config.LoadConfig
-    vllm.config.ParallelConfig
-    vllm.config.SchedulerConfig
-    vllm.config.DeviceConfig
-    vllm.config.SpeculativeConfig
-    vllm.config.LoRAConfig
-    vllm.config.PromptAdapterConfig
-    vllm.config.MultiModalConfig
-    vllm.config.PoolerConfig
-    vllm.config.DecodingConfig
-    vllm.config.ObservabilityConfig
-    vllm.config.KVTransferConfig
-    vllm.config.CompilationConfig
-    vllm.config.VllmConfig
-```
-
-(offline-inference-api)=
-
-## Offline Inference
-
-LLM Class.
-
-```{autodoc2-summary}
-    vllm.LLM
-```
-
-LLM Inputs.
-
-```{autodoc2-summary}
-    vllm.inputs.PromptType
-    vllm.inputs.TextPrompt
-    vllm.inputs.TokensPrompt
-```
-
-## vLLM Engines
-
-Engine classes for offline and online inference.
-
-```{autodoc2-summary}
-    vllm.LLMEngine
-    vllm.AsyncLLMEngine
-```
-
-## Inference Parameters
-
-Inference parameters for vLLM APIs.
-
-(sampling-params)=
-(pooling-params)=
-
-```{autodoc2-summary}
-    vllm.SamplingParams
-    vllm.PoolingParams
-```
-
-(multi-modality)=
-
-## Multi-Modality
-
-vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
-
-Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
-via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
-
-Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal).
-
-```{autodoc2-summary}
-    vllm.multimodal.MULTIMODAL_REGISTRY
-```
-
-### Inputs
-
-User-facing inputs.
-
-```{autodoc2-summary}
-    vllm.multimodal.inputs.MultiModalDataDict
-```
-
-Internal data structures.
-
-```{autodoc2-summary}
-    vllm.multimodal.inputs.PlaceholderRange
-    vllm.multimodal.inputs.NestedTensors
-    vllm.multimodal.inputs.MultiModalFieldElem
-    vllm.multimodal.inputs.MultiModalFieldConfig
-    vllm.multimodal.inputs.MultiModalKwargsItem
-    vllm.multimodal.inputs.MultiModalKwargs
-    vllm.multimodal.inputs.MultiModalInputs
-```
-
-### Data Parsing
-
-```{autodoc2-summary}
-    vllm.multimodal.parse
-```
-
-### Data Processing
-
-```{autodoc2-summary}
-    vllm.multimodal.processing
-```
-
-### Memory Profiling
-
-```{autodoc2-summary}
-    vllm.multimodal.profiling
-```
-
-### Registry
-
-```{autodoc2-summary}
-    vllm.multimodal.registry
-```
-
-## Model Development
-
-```{autodoc2-summary}
-    vllm.model_executor.models.interfaces_base
-    vllm.model_executor.models.interfaces
-    vllm.model_executor.models.adapters
-```
diff --git a/docs/source/autodoc2_docstring_parser.py b/docs/source/autodoc2_docstring_parser.py
deleted file mode 100644
index 41c49ed1c545..000000000000
--- a/docs/source/autodoc2_docstring_parser.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-from docutils import nodes
-from myst_parser.parsers.sphinx_ import MystParser
-from sphinx.ext.napoleon import docstring
-
-
-class NapoleonParser(MystParser):
-
-    def parse(self, input_string: str, document: nodes.document) -> None:
-        # Get the Sphinx configuration
-        config = document.settings.env.config
-
-        parsed_content = str(
-            docstring.GoogleDocstring(
-                str(docstring.NumpyDocstring(input_string, config)),
-                config,
-            ))
-        return super().parse(parsed_content, document)
-
-
-Parser = NapoleonParser
diff --git a/docs/source/community/blog.md b/docs/source/community/blog.md
deleted file mode 100644
index e8030edfa02e..000000000000
--- a/docs/source/community/blog.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# vLLM Blog
-
-vLLM blog posts are published [here](https://blog.vllm.ai/).
diff --git a/docs/source/conf.py b/docs/source/conf.py
deleted file mode 100644
index 5620d6de2c59..000000000000
--- a/docs/source/conf.py
+++ /dev/null
@@ -1,263 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-
-import datetime
-import logging
-import os
-import re
-import sys
-from pathlib import Path
-
-import requests
-
-logger = logging.getLogger(__name__)
-REPO_ROOT = Path(__file__).resolve().parent.parent.parent
-sys.path.append(os.path.abspath(REPO_ROOT))
-
-# -- Project information -----------------------------------------------------
-
-project = 'vLLM'
-copyright = f'{datetime.datetime.now().year}, vLLM Team'
-author = 'the vLLM Team'
-
-# -- General configuration ---------------------------------------------------
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
-    "sphinx.ext.napoleon",
-    "sphinx.ext.linkcode",
-    "sphinx.ext.intersphinx",
-    "sphinx_copybutton",
-    "autodoc2",
-    "myst_parser",
-    "sphinxarg.ext",
-    "sphinx_design",
-    "sphinx_togglebutton",
-]
-myst_enable_extensions = [
-    "colon_fence",
-    "fieldlist",
-]
-autodoc2_packages = [
-    {
-        "path": "../../vllm",
-        "exclude_dirs": ["__pycache__", "third_party"],
-    },
-]
-autodoc2_output_dir = "api"
-autodoc2_render_plugin = "myst"
-autodoc2_hidden_objects = ["dunder", "private", "inherited"]
-autodoc2_sort_names = True
-autodoc2_index_template = None
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns: list[str] = ["**/*.template.md", "**/*.inc.md"]
-
-# Exclude the prompt "$" when copying code
-copybutton_prompt_text = r"\$ "
-copybutton_prompt_is_regexp = True
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-#
-html_title = project
-html_theme = 'sphinx_book_theme'
-html_logo = 'assets/logos/vllm-logo-text-light.png'
-html_favicon = 'assets/logos/vllm-logo-only-light.ico'
-html_theme_options = {
-    'path_to_docs': 'docs/source',
-    'repository_url': 'https://github.com/vllm-project/vllm',
-    'use_repository_button': True,
-    'use_edit_page_button': True,
-    # Prevents the full API being added to the left sidebar of every page.
-    # Reduces build time by 2.5x and reduces build size from ~225MB to ~95MB.
-    'collapse_navbar': True,
-    # Makes API visible in the right sidebar on API reference pages.
-    'show_toc_level': 3,
-}
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ["_static"]
-html_js_files = ["custom.js"]
-html_css_files = ["custom.css"]
-
-myst_heading_anchors = 2
-myst_url_schemes = {
-    'http': None,
-    'https': None,
-    'mailto': None,
-    'ftp': None,
-    "gh-issue": {
-        "url":
-        "https://github.com/vllm-project/vllm/issues/{{path}}#{{fragment}}",
-        "title": "Issue #{{path}}",
-        "classes": ["github"],
-    },
-    "gh-pr": {
-        "url":
-        "https://github.com/vllm-project/vllm/pull/{{path}}#{{fragment}}",
-        "title": "Pull Request #{{path}}",
-        "classes": ["github"],
-    },
-    "gh-project": {
-        "url": "https://github.com/orgs/vllm-project/projects/{{path}}",
-        "title": "Project #{{path}}",
-        "classes": ["github"],
-    },
-    "gh-dir": {
-        "url": "https://github.com/vllm-project/vllm/tree/main/{{path}}",
-        "title": "{{path}}",
-        "classes": ["github"],
-    },
-    "gh-file": {
-        "url": "https://github.com/vllm-project/vllm/blob/main/{{path}}",
-        "title": "{{path}}",
-        "classes": ["github"],
-    },
-}
-
-# see https://docs.readthedocs.io/en/stable/reference/environment-variables.html # noqa
-READTHEDOCS_VERSION_TYPE = os.environ.get('READTHEDOCS_VERSION_TYPE')
-if READTHEDOCS_VERSION_TYPE == "tag":
-    # remove the warning banner if the version is a tagged release
-    header_file = os.path.join(os.path.dirname(__file__),
-                               "_templates/sections/header.html")
-    # The file might be removed already if the build is triggered multiple times
-    # (readthedocs build both HTML and PDF versions separately)
-    if os.path.exists(header_file):
-        os.remove(header_file)
-
-
-# Generate additional rst documentation here.
-def setup(app):
-    from docs.source.generate_examples import generate_examples
-    generate_examples()
-
-
-_cached_base: str = ""
-_cached_branch: str = ""
-
-
-def get_repo_base_and_branch(pr_number):
-    global _cached_base, _cached_branch
-    if _cached_base and _cached_branch:
-        return _cached_base, _cached_branch
-
-    url = f"https://api.github.com/repos/vllm-project/vllm/pulls/{pr_number}"
-    response = requests.get(url)
-    if response.status_code == 200:
-        data = response.json()
-        _cached_base = data['head']['repo']['full_name']
-        _cached_branch = data['head']['ref']
-        return _cached_base, _cached_branch
-    else:
-        logger.error("Failed to fetch PR details: %s", response)
-        return None, None
-
-
-def linkcode_resolve(domain, info):
-    if domain != 'py':
-        return None
-    if not info['module']:
-        return None
-
-    # Get path from module name
-    file = Path(f"{info['module'].replace('.', '/')}.py")
-    path = REPO_ROOT / file
-    if not path.exists():
-        path = REPO_ROOT / file.with_suffix("") / "__init__.py"
-    if not path.exists():
-        return None
-
-    # Get the line number of the object
-    with open(path) as f:
-        lines = f.readlines()
-    name = info['fullname'].split(".")[-1]
-    pattern = fr"^( {{4}})*((def|class) )?{name}\b.*"
-    for lineno, line in enumerate(lines, 1):
-        if not line or line.startswith("#"):
-            continue
-        if re.match(pattern, line):
-            break
-
-    # If the line number is not found, return None
-    if lineno == len(lines):
-        return None
-
-    # If the line number is found, create the URL
-    filename = path.relative_to(REPO_ROOT)
-    if "checkouts" in path.parts:
-        # a PR build on readthedocs
-        pr_number = REPO_ROOT.name
-        base, branch = get_repo_base_and_branch(pr_number)
-        if base and branch:
-            return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}"
-    # Otherwise, link to the source file on the main branch
-    return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}"
-
-
-# Mock out external dependencies here, otherwise sphinx-argparse won't work.
-autodoc_mock_imports = [
-    "huggingface_hub",
-    "pydantic",
-    "zmq",
-    "cloudpickle",
-    "aiohttp",
-    "starlette",
-    "blake3",
-    "cpuinfo",
-    "transformers",
-    "psutil",
-    "vllm._C",
-    "PIL",
-    "numpy",
-    "tqdm",
-    # The mocks below are required by
-    # docs/source/serving/openai_compatible_server.md's
-    # vllm.entrypoints.openai.cli_args
-    "openai",
-    "fastapi",
-    "partial_json_parser",
-]
-
-for mock_target in autodoc_mock_imports:
-    if mock_target in sys.modules:
-        logger.info(
-            "Potentially problematic mock target (%s) found; "
-            "autodoc_mock_imports cannot mock modules that have already "
-            "been loaded into sys.modules when the sphinx build starts.",
-            mock_target)
-
-intersphinx_mapping = {
-    "python": ("https://docs.python.org/3", None),
-    "typing_extensions":
-    ("https://typing-extensions.readthedocs.io/en/latest", None),
-    "aiohttp": ("https://docs.aiohttp.org/en/stable", None),
-    "pillow": ("https://pillow.readthedocs.io/en/stable", None),
-    "numpy": ("https://numpy.org/doc/stable", None),
-    "torch": ("https://pytorch.org/docs/stable", None),
-    "psutil": ("https://psutil.readthedocs.io/en/stable", None),
-}
-
-navigation_with_keys = False
diff --git a/docs/source/contributing/model/index.md b/docs/source/contributing/model/index.md
deleted file mode 100644
index 721ee3cd2047..000000000000
--- a/docs/source/contributing/model/index.md
+++ /dev/null
@@ -1,27 +0,0 @@
-(new-model)=
-
-# Adding a New Model
-
-This section provides more information on how to integrate a [PyTorch](https://pytorch.org/) model into vLLM.
-
-:::{toctree}
-:caption: Contents
-:maxdepth: 1
-
-basic
-registration
-tests
-multimodal
-:::
-
-:::{note}
-The complexity of adding a new model depends heavily on the model's architecture.
-The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
-However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
-:::
-
-:::{tip}
-If you are encountering issues while integrating your model into vLLM, feel free to open a [GitHub issue](https://github.com/vllm-project/vllm/issues)
-or ask on our [developer slack](https://slack.vllm.ai).
-We will be happy to help you out!
-:::
diff --git a/docs/source/contributing/model/multimodal.md b/docs/source/contributing/model/multimodal.md
deleted file mode 100644
index b42536f054d7..000000000000
--- a/docs/source/contributing/model/multimodal.md
+++ /dev/null
@@ -1,834 +0,0 @@
-(supports-multimodal)=
-
-# Multi-Modal Support
-
-This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](#multimodal-inputs).
-
-## 1. Update the base vLLM model
-
-It is assumed that you have already implemented the model in vLLM according to [these steps](#new-model-basic).
-Further update the model as follows:
-
-- Reserve a keyword parameter in {meth}`~torch.nn.Module.forward` for each input tensor that corresponds to a multi-modal input, as shown in the following example:
-
-  ```diff
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-  +     pixel_values: torch.Tensor,
-    ) -> SamplerOutput:
-  ```
-  
-  More conveniently, you can simply pass `**kwargs` to the {meth}`~torch.nn.Module.forward` method and retrieve the keyword parameters for multimodal inputs from it.
-
-- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings` that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.
-
-    ```python
-    class YourModelForImage2Seq(nn.Module):
-        ...
-
-        def _process_image_input(self, image_input: YourModelImageInputs) -> torch.Tensor:
-
-            assert self.vision_encoder is not None
-            image_features = self.vision_encoder(image_input)
-            return self.multi_modal_projector(image_features)
-
-        def get_multimodal_embeddings(
-                self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
-
-            # Validate the multimodal input keyword arguments
-            image_input = self._parse_and_validate_image_input(**kwargs)
-            if image_input is None:
-                return None
-
-            # Run multimodal inputs through encoder and projector
-            vision_embeddings = self._process_image_input(image_input)
-            return vision_embeddings
-    ```
-
-    :::{important}
-    The returned `multimodal_embeddings` must be either a **3D {class}`torch.Tensor`** of shape `(num_items, feature_size, hidden_size)`, or a **list / tuple of 2D {class}`torch.Tensor`'s** of shape `(feature_size, hidden_size)`, so that `multimodal_embeddings[i]` retrieves the embeddings generated from the `i`-th multimodal data item (e.g., image) of the request.
-    :::
-
-- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings` to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.
-
-    ```python
-    from .utils import merge_multimodal_embeddings
-
-    class YourModelForImage2Seq(nn.Module):
-        ...
-
-        def get_input_embeddings(
-            self,
-            input_ids: torch.Tensor,
-            multimodal_embeddings: Optional[MultiModalEmbeddings] = None,
-        ) -> torch.Tensor:
-
-            # `get_input_embeddings` should already be implemented for the language 
-            # model as one of the requirements of basic vLLM model implementation.
-            inputs_embeds = self.language_model.get_input_embeddings(input_ids)
-
-            if multimodal_embeddings is not None:
-                inputs_embeds = merge_multimodal_embeddings(
-                    input_ids=input_ids, 
-                    inputs_embeds=inputs_embeds, 
-                    multimodal_embeddings=multimodal_embeddings,
-                    placeholder_token_id=self.config.image_token_index)
-
-            return inputs_embeds
-    ```
-
-- Implement {meth}`~vllm.model_executor.models.interfaces.SupportsMultiModal.get_language_model` getter to provide stable access to the underlying language model.
-
-    ```python
-    class YourModelForImage2Seq(nn.Module):
-        ...
-
-        def get_language_model(self) -> torch.nn.Module:
-            # Change `language_model` according to your implementation.
-            return self.language_model
-    ```
-
-- Once the above steps are done, update the model class with the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
-
-  ```diff
-  + from vllm.model_executor.models.interfaces import SupportsMultiModal
-
-  - class YourModelForImage2Seq(nn.Module):
-  + class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
-  ```
-
-  :::{note}
-  The model class does not have to be named {code}`*ForCausalLM`.
-  Check out [the HuggingFace Transformers documentation](https://huggingface.co/docs/transformers/model_doc/auto#multimodal) for some examples.
-  :::
-
-## 2. Specify processing information
-
-Next, create a subclass of {class}`~vllm.multimodal.processing.BaseProcessingInfo`
-to provide basic information related to HF processing.
-
-### Maximum number of input items
-
-You need to override the abstract method {meth}`~vllm.multimodal.processing.BaseProcessingInfo.get_supported_mm_limits`
-to return the maximum number of input items for each modality supported by the model.
-
-For example, if the model supports any number of images but only one video per prompt:
-
-```python
-def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
-    return {"image": None, "video": 1}
-```
-
-## 3. Specify dummy inputs
-
-Then, inherit {class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` to construct dummy inputs for
-HF processing as well as memory profiling.
-
-### For memory profiling
-
-Override the abstract methods {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_text` and {meth}`~vllm.multimodal.profiling.BaseDummyInputsBuilder.get_dummy_mm_data` to construct dummy inputs for memory profiling. These dummy inputs should result in the worst-case memory usage of the model so that vLLM can reserve the correct amount of memory for it.
-
-Assuming that the memory usage increases with the number of tokens, the dummy inputs can be constructed to maximize the number of output embeddings, which is the same number as placeholder feature tokens.
-
-::::{tab-set}
-:::{tab-item} Basic example: LLaVA
-:sync: llava
-
-Looking at the code of HF's `LlavaForConditionalGeneration`:
-
-```python
-# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544
-n_image_tokens = (input_ids == self.config.image_token_index).sum().item()
-n_image_features = image_features.shape[0] * image_features.shape[1]
-
-if n_image_tokens != n_image_features:
-    raise ValueError(
-        f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
-    )
-special_image_mask = (
-    (input_ids == self.config.image_token_index)
-    .unsqueeze(-1)
-    .expand_as(inputs_embeds)
-    .to(inputs_embeds.device)
-)
-image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
-inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
-```
-
-The number of placeholder feature tokens per image is `image_features.shape[1]`.
-`image_features` is calculated inside the `get_image_features` method:
-
-```python
-# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300
-image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
-
-selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
-if vision_feature_select_strategy == "default":
-    selected_image_feature = selected_image_feature[:, 1:]
-elif vision_feature_select_strategy == "full":
-    selected_image_feature = selected_image_feature
-else:
-    raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
-image_features = self.multi_modal_projector(selected_image_feature)
-return image_features
-```
-
-We can infer that `image_features.shape[1]` is based on `image_outputs.hidden_states.shape[1]` from the vision tower
-(`CLIPVisionModel` for the [`llava-hf/llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) model).
-Moreover, we only need the sequence length (the second dimension of the tensor) to get `image_features.shape[1]`.
-The sequence length is determined by the initial hidden states in `CLIPVisionTransformer` since the attention
-mechanism doesn't change the sequence length of the output hidden states.
-
-```python
-# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L1094-L1102
-hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
-hidden_states = self.pre_layrnorm(hidden_states)
-
-encoder_outputs = self.encoder(
-    inputs_embeds=hidden_states,
-    output_attentions=output_attentions,
-    output_hidden_states=output_hidden_states,
-    return_dict=return_dict,
-)
-```
-
-To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:
-
-```python
-# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257
-target_dtype = self.patch_embedding.weight.dtype
-patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
-patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
-
-class_embeds = self.class_embedding.expand(batch_size, 1, -1)
-embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
-if interpolate_pos_encoding:
-    embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
-else:
-    embeddings = embeddings + self.position_embedding(self.position_ids)
-return embeddings
-```
-
-We can infer that `embeddings.shape[1] == self.num_positions`, where
-
-```python
-# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L195-L196
-self.num_patches = (self.image_size // self.patch_size) ** 2
-self.num_positions = self.num_patches + 1
-```
-
-Overall, the number of placeholder feature tokens for an image can be calculated as:
-
-```python
-def get_num_image_tokens(
-    self,
-    *,
-    image_width: int,
-    image_height: int,
-) -> int:
-    hf_config = self.get_hf_config()
-    hf_processor = self.get_hf_processor()
-
-    image_size = hf_config.vision_config.image_size
-    patch_size = hf_config.vision_config.patch_size
-
-    num_image_tokens = (image_size // patch_size) ** 2 + 1
-    if hf_processor.vision_feature_select_strategy == "default":
-        num_image_tokens -= 1
-
-    return num_image_tokens
-```
-
-Notice that the number of image tokens doesn't depend on the image width and height.
-We can simply use a dummy `image_size` to calculate the multimodal profiling data:
-
-```python
-# NOTE: In actuality, this is usually implemented as part of the
-# model's subclass of `BaseProcessingInfo`, but we show it as is
-# here for simplicity.
-def get_image_size_with_most_features(self) -> ImageSize:
-    hf_config = self.get_hf_config()
-    width = height = hf_config.image_size
-    return ImageSize(width=width, height=height)
-
-def get_dummy_mm_data(
-    self,
-    seq_len: int,
-    mm_counts: Mapping[str, int],
-) -> MultiModalDataDict:
-    num_images = mm_counts.get("image", 0)
-
-    target_width, target_height = \
-        self.info.get_image_size_with_most_features()
-
-    return {
-        "image":
-        self._get_dummy_images(width=target_width,
-                               height=target_height,
-                               num_images=num_images)
-    }
-```
-
-For the text, we simply expand the multimodal image token from the model config to match the desired number of images.
-
-```python
-def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
-    num_images = mm_counts.get("image", 0)
-
-    processor = self.info.get_hf_processor()
-    image_token = processor.image_token
-
-    return image_token * num_images
-```
-
-:::
-
-:::{tab-item} No input placeholders: Fuyu
-:sync: fuyu
-
-Looking at the code of HF's `FuyuForCausalLM`:
-
-```python
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322
-if image_patches is not None and past_key_values is None:
-    patch_embeddings = [
-        self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype))
-        .squeeze(0)
-        .to(inputs_embeds.device)
-        for patch in image_patches
-    ]
-    inputs_embeds = self.gather_continuous_embeddings(
-        word_embeddings=inputs_embeds,
-        continuous_embeddings=patch_embeddings,
-        image_patch_input_indices=image_patches_indices,
-    )
-```
-
-The number of placeholder feature tokens for the `i`th item in the batch is `patch_embeddings[i].shape[0]`,
-which is the same as `image_patches[i].shape[0]`, i.e. `num_total_patches`.
-
-Unlike LLaVA, Fuyu does not define the number of patches inside the modeling file. Where can we get more information?
-Considering that the model input comes from the output of `FuyuProcessor`, let's **look at the preprocessing files**.
-
-The image outputs are obtained by calling `FuyuImageProcessor.preprocess` and then
-`FuyuImageProcessor.preprocess_with_tokenizer_info` inside `FuyuProcessor`.
-
-In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
-returning the dimensions after resizing (but before padding) as metadata.
-
-```python
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544
-image_encoding = self.image_processor.preprocess(images, **output_kwargs["images_kwargs"])
-batch_images = image_encoding["images"]
-image_unpadded_heights = image_encoding["image_unpadded_heights"]
-image_unpadded_widths = image_encoding["image_unpadded_widths"]
-
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L480-L
-if do_resize:
-    batch_images = [
-        [self.resize(image, size=size, input_data_format=input_data_format) for image in images]
-        for images in batch_images
-    ]
-
-image_sizes = [get_image_size(images[0], channel_dim=input_data_format) for images in batch_images]
-image_unpadded_heights = [[image_size[0]] for image_size in image_sizes]
-image_unpadded_widths = [[image_size[1]] for image_size in image_sizes]
-
-if do_pad:
-    batch_images = [
-        [
-            self.pad_image(
-                image,
-                size=size,
-                mode=padding_mode,
-                constant_values=padding_value,
-                input_data_format=input_data_format,
-            )
-            for image in images
-        ]
-        for images in batch_images
-    ]
-```
-
-In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:
-
-```python
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425
-model_image_input = self.image_processor.preprocess_with_tokenizer_info(
-    image_input=tensor_batch_images,
-    image_present=image_present,
-    image_unpadded_h=image_unpadded_heights,
-    image_unpadded_w=image_unpadded_widths,
-    image_placeholder_id=image_placeholder_id,
-    image_newline_id=image_newline_id,
-    variable_sized=True,
-)
-
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L638-L658
-image_height, image_width = image.shape[1], image.shape[2]
-if variable_sized:  # variable_sized=True
-    new_h = min(
-        image_height,
-        math.ceil(image_unpadded_h[batch_index, subseq_index] / patch_height) * patch_height,
-    )
-    new_w = min(
-        image_width,
-        math.ceil(image_unpadded_w[batch_index, subseq_index] / patch_width) * patch_width,
-    )
-    image = image[:, :new_h, :new_w]
-    image_height, image_width = new_h, new_w
-
-num_patches = self.get_num_patches(image_height=image_height, image_width=image_width)
-tensor_of_image_ids = torch.full(
-    [num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device
-)
-patches = self.patchify_image(image=image.unsqueeze(0)).squeeze(0)
-assert num_patches == patches.shape[0]
-```
-
-The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:
-
-```python
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562
-patch_size = patch_size if patch_size is not None else self.patch_size
-patch_height, patch_width = self.patch_size["height"], self.patch_size["width"]
-
-if image_height % patch_height != 0:
-    raise ValueError(f"{image_height=} must be divisible by {patch_height}")
-if image_width % patch_width != 0:
-    raise ValueError(f"{image_width=} must be divisible by {patch_width}")
-
-num_patches_per_dim_h = image_height // patch_height
-num_patches_per_dim_w = image_width // patch_width
-num_patches = num_patches_per_dim_h * num_patches_per_dim_w
-```
-
-These image patches correspond to placeholder tokens (`|SPEAKER|`). So, we just need to maximize the number of image patches. Since input images are first resized
-to fit within `image_processor.size`, we can maximize the number of image patches by inputting an image with size equal to `image_processor.size`.
-
-```python
-def get_image_size_with_most_features(self) -> ImageSize:
-    image_processor = self.get_image_processor()
-    return ImageSize(width=image_processor.size["width"],
-                        height=image_processor.size["height"])
-```
-
-Fuyu does not expect image placeholders in the inputs to HF processor, so
-the dummy prompt text is empty regardless of the number of images.
-
-```python
-def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
-    return ""
-```
-
-For the multimodal image profiling data, the logic is very similar to LLaVA:
-
-```python
-def get_dummy_mm_data(
-    self,
-    seq_len: int,
-    mm_counts: Mapping[str, int],
-) -> MultiModalDataDict:
-    target_width, target_height = \
-        self.info.get_image_size_with_most_features()
-    num_images = mm_counts.get("image", 0)
-
-    return {
-        "image":
-        self._get_dummy_images(width=target_width,
-                               height=target_height,
-                               num_images=num_images)
-    }
-```
-
-:::
-
-::::
-
-## 4. Specify processing details
-
-Afterwards, create a subclass of {class}`~vllm.multimodal.processing.BaseMultiModalProcessor`
-to fill in the missing details about HF processing.
-
-:::{seealso}
-[Multi-Modal Data Processing](#mm-processing)
-:::
-
-### Multi-modal fields
-
-Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` to
-return a schema of the tensors outputted by the HF processor that are related to the input multi-modal items.
-
-:::::{tab-set}
-::::{tab-item} Basic example: LLaVA
-:sync: llava
-
-The output of `CLIPImageProcessor` is a simple tensor with shape
-`(num_images, num_channels, image_height, image_width)`:
-
-```python
-# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/image_processing_clip.py#L339-L345
-images = [
-    to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
-    for image in all_images
-]
-
-data = {"pixel_values": images}
-return BatchFeature(data=data, tensor_type=return_tensors)
-```
-
-So, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` as follows:
-
-```python
-def _get_mm_fields_config(
-    self,
-    hf_inputs: BatchFeature,
-    hf_processor_mm_kwargs: Mapping[str, object],
-) -> Mapping[str, MultiModalFieldConfig]:
-    return dict(
-        pixel_values=MultiModalFieldConfig.batched("image"),
-    )
-```
-
-:::{note}
-Our [actual code](gh-file:vllm/model_executor/models/llava.py) additionally supports
-pre-computed image embeddings, which can be passed to the model via the `image_embeds` argument.
-:::
-
-::::
-
-::::{tab-item} With postprocessing: Fuyu
-:sync: fuyu
-
-The `image_patches` output of `FuyuImageProcessor.preprocess_with_tokenizer_info` concatenates
-the patches from each image belonging to an item in the batch:
-
-```python
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L673-L679
-        image_input_ids.append(tensor_of_image_ids)
-        image_patches.append(patches)
-    else:
-        image_input_ids.append(torch.tensor([], dtype=torch.int32, device=image_input.device))
-
-batch_image_input_ids.append(image_input_ids)
-batch_image_patches.append(image_patches)
-```
-
-The shape of `image_patches` outputted by `FuyuImageProcessor` is therefore
-`(1, num_images, num_patches, patch_width * patch_height * num_channels)`.
-
-In order to support the use of {func}`MultiModalFieldConfig.batched` like in LLaVA,
-we remove the extra batch dimension by overriding {meth}`BaseMultiModalProcessor._call_hf_processor`:
-
-```python
-def _call_hf_processor(
-    self,
-    prompt: str,
-    mm_data: Mapping[str, object],
-    mm_kwargs: Mapping[str, object],
-) -> BatchFeature:
-    processed_outputs = super()._call_hf_processor(
-        prompt=prompt,
-        mm_data=mm_data,
-        mm_kwargs=mm_kwargs,
-    )
-
-    image_patches = processed_outputs.get("image_patches")
-    if image_patches is not None:
-        images = mm_data["images"]
-        assert isinstance(images, list)
-
-        # Original output: (1, num_images, Pn, Px * Py * C)
-        # New output: (num_images, Pn, Px * Py * C)
-        assert (isinstance(image_patches, list)
-                and len(image_patches) == 1)
-        assert (isinstance(image_patches[0], torch.Tensor)
-                and len(image_patches[0]) == len(images))
-
-        processed_outputs["image_patches"] = image_patches[0]
-
-    return processed_outputs
-```
-
-:::{note}
-Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
-for text-only inputs to prevent unnecessary warnings from HF processor.
-:::
-
-This lets us override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config` as follows:
-
-```python
-def _get_mm_fields_config(
-    self,
-    hf_inputs: BatchFeature,
-    hf_processor_mm_kwargs: Mapping[str, object],
-) -> Mapping[str, MultiModalFieldConfig]:
-    return dict(image_patches=MultiModalFieldConfig.batched("image"))
-```
-
-::::
-
-:::::
-
-### Prompt updates
-
-Override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` to
-return a list of {class}`~vllm.multimodal.processing.PromptUpdate` instances.
-
-Each {class}`~vllm.multimodal.processing.PromptUpdate` instance specifies an update operation
-(e.g.: insertion, replacement) performed by the HF processor.
-
-::::{tab-set}
-:::{tab-item} Basic example: LLaVA
-:sync: llava
-
-Looking at HF's `LlavaProcessor`:
-
-```python
-# https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/processing_llava.py#L167-L170
-prompt_strings = []
-for sample in text:
-    sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
-    prompt_strings.append(sample)
-```
-
-It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
-Based on this, we override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` as follows:
-
-```python
-def _get_prompt_updates(
-    self,
-    mm_items: MultiModalDataItems,
-    hf_processor_mm_kwargs: Mapping[str, object],
-    out_mm_kwargs: MultiModalKwargs,
-) -> Sequence[PromptUpdate]:
-    hf_config = self.info.get_hf_config()
-    image_token_id = hf_config.image_token_index
-
-    def get_replacement(item_idx: int):
-        images = mm_items.get_items("image", ImageProcessorItems)
-
-        image_size = images.get_image_size(item_idx)
-        num_image_tokens = self.info.get_num_image_tokens(
-            image_width=image_size.width,
-            image_height=image_size.height,
-        )
-
-        return [image_token_id] * num_image_tokens
-
-    return [
-        PromptReplacement(
-            modality="image",
-            target=[image_token_id],
-            replacement=get_replacement,
-        ),
-    ]
-```
-
-:::
-
-:::{tab-item} Handling additional tokens: Fuyu
-:sync: fuyu
-
-Recall the layout of feature tokens from Step 2:
-
-```
-|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
-|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
-...
-|SPEAKER||SPEAKER|...|SPEAKER||NEWLINE|
-```
-
-We define a helper function to return `ncols` and `nrows` directly:
-
-```python
-def get_image_feature_grid_size(
-    self,
-    *,
-    image_width: int,
-    image_height: int,
-) -> tuple[int, int]:
-    image_processor = self.get_image_processor()
-    target_width = image_processor.size["width"]
-    target_height = image_processor.size["height"]
-    patch_width = image_processor.patch_size["width"]
-    patch_height = image_processor.patch_size["height"]
-
-    if not (image_width <= target_width and image_height <= target_height):
-        height_scale_factor = target_height / image_height
-        width_scale_factor = target_width / image_width
-        optimal_scale_factor = min(height_scale_factor, width_scale_factor)
-
-        image_height = int(image_height * optimal_scale_factor)
-        image_width = int(image_width * optimal_scale_factor)
-
-    ncols = math.ceil(image_width / patch_width)
-    nrows = math.ceil(image_height / patch_height)
-    return ncols, nrows
-```
-
-Based on this, we can initially define our replacement tokens as:
-
-```python
-def get_replacement(item_idx: int):
-    images = mm_items.get_items("image", ImageProcessorItems)
-    image_size = images.get_image_size(item_idx)
-
-    ncols, nrows = self.info.get_image_feature_grid_size(
-        image_width=image_size.width,
-        image_height=image_size.height,
-    )
-
-    # `_IMAGE_TOKEN_ID` corresponds to `|SPEAKER|`
-    # `_NEWLINE_TOKEN_ID` corresponds to `|NEWLINE|`
-    return ([_IMAGE_TOKEN_ID] * ncols + [_NEWLINE_TOKEN_ID]) * nrows
-```
-
-However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
-a BOS token (`<s>`) is also added to the prompt:
-
-```python
-# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
-model_image_input = self.image_processor.preprocess_with_tokenizer_info(
-    image_input=tensor_batch_images,
-    image_present=image_present,
-    image_unpadded_h=image_unpadded_heights,
-    image_unpadded_w=image_unpadded_widths,
-    image_placeholder_id=image_placeholder_id,
-    image_newline_id=image_newline_id,
-    variable_sized=True,
-)
-prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
-    tokenizer=self.tokenizer,
-    prompts=prompts,
-    scale_factors=scale_factors,
-    max_tokens_to_generate=self.max_tokens_to_generate,
-    max_position_embeddings=self.max_position_embeddings,
-    add_BOS=True,
-    add_beginning_of_answer_token=True,
-)
-```
-
-To assign the vision embeddings to only the image tokens, instead of a string
-you can return an instance of {class}`~vllm.multimodal.processing.PromptUpdateDetails`:
-
-```python
-hf_config = self.info.get_hf_config()
-bos_token_id = hf_config.bos_token_id  # `<s>`
-assert isinstance(bos_token_id, int)
-
-def get_replacement_fuyu(item_idx: int):
-    images = mm_items.get_items("image", ImageProcessorItems)
-    image_size = images.get_image_size(item_idx)
-
-    ncols, nrows = self.info.get_image_feature_grid_size(
-        image_width=image_size.width,
-        image_height=image_size.height,
-    )
-    image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
-                    [_NEWLINE_TOKEN_ID]) * nrows
-
-    return PromptUpdateDetails.select_token_id(
-        image_tokens + [bos_token_id],
-        embed_token_id=_IMAGE_TOKEN_ID,
-    )
-```
-
-Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
-we can search for it to conduct the replacement at the start of the string:
-
-```python
-def _get_prompt_updates(
-    self,
-    mm_items: MultiModalDataItems,
-    hf_processor_mm_kwargs: Mapping[str, object],
-    out_mm_kwargs: MultiModalKwargs,
-) -> Sequence[PromptUpdate]:
-    hf_config = self.info.get_hf_config()
-    bos_token_id = hf_config.bos_token_id
-    assert isinstance(bos_token_id, int)
-
-    tokenizer = self.info.get_tokenizer()
-    eot_token_id = tokenizer.bos_token_id
-    assert isinstance(eot_token_id, int)
-
-    def get_replacement_fuyu(item_idx: int):
-        images = mm_items.get_items("image", ImageProcessorItems)
-        image_size = images.get_image_size(item_idx)
-
-        ncols, nrows = self.info.get_image_feature_grid_size(
-            image_width=image_size.width,
-            image_height=image_size.height,
-        )
-        image_tokens = ([_IMAGE_TOKEN_ID] * ncols +
-                        [_NEWLINE_TOKEN_ID]) * nrows
-
-        return PromptUpdateDetails.select_token_id(
-            image_tokens + [bos_token_id],
-            embed_token_id=_IMAGE_TOKEN_ID,
-        )
-
-    return [
-        PromptReplacement(
-            modality="image",
-            target=[eot_token_id],
-            replacement=get_replacement_fuyu,
-        )
-    ]
-```
-
-:::
-
-::::
-
-## 5. Register processor-related classes
-
-After you have defined {class}`~vllm.multimodal.processing.BaseProcessingInfo` (Step 2),
-{class}`~vllm.multimodal.profiling.BaseDummyInputsBuilder` (Step 3),
-and {class}`~vllm.multimodal.processing.BaseMultiModalProcessor` (Step 4),
-decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor <vllm.multimodal.registry.MultiModalRegistry.register_processor>`
-to register them to the multi-modal registry:
-
-```diff
-  from vllm.model_executor.models.interfaces import SupportsMultiModal
-+ from vllm.multimodal import MULTIMODAL_REGISTRY
-
-+ @MULTIMODAL_REGISTRY.register_processor(YourMultiModalProcessor,
-+                                         info=YourProcessingInfo,
-+                                         dummy_inputs=YourDummyInputsBuilder)
-  class YourModelForImage2Seq(nn.Module, SupportsMultiModal):
-```
-
-## Notes
-
-### Inserting feature tokens without replacement
-
-Some HF processors directly insert feature tokens without replacing anything in the original prompt. In that case, you can use {class}`~vllm.multimodal.processing.PromptInsertion` instead of {class}`~vllm.multimodal.processing.PromptReplacement` inside {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates`.
-
-Examples:
-
-- BLIP-2 (insert at start of prompt): <gh-file:vllm/model_executor/models/blip2.py>
-- Florence2 (insert at start of prompt): <gh-file:vllm/model_executor/models/florence2.py>
-- Molmo (insert after `<|endoftext|>` token): <gh-file:vllm/model_executor/models/molmo.py>
-
-### Handling prompt updates unrelated to multi-modal data
-
-{meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates` assumes that each application of prompt update corresponds to one multi-modal item. If the HF processor performs additional processing regardless of how many multi-modal items there are, you should override {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_tokens_only` so that the processed token inputs are consistent with the result of applying the HF processor on text inputs. This is because token inputs bypass the HF processor according to [our design](#mm-processing).
-
-Examples:
-
-- Chameleon (appends `sep_token`): <gh-file:vllm/model_executor/models/chameleon.py>
-- Fuyu (appends `boa_token`): <gh-file:vllm/model_executor/models/fuyu.py>
-- Molmo (applies chat template which is not defined elsewhere): <gh-file:vllm/model_executor/models/molmo.py>
-
-### Custom HF processor
-
-Some models don't define a HF processor class on HF Hub. In that case, you can define a custom HF processor that has the same call signature as HF processors and pass it to {meth}`~vllm.multimodal.processing.BaseMultiModalProcessor._call_hf_processor`.
-
-Examples:
-
-- DeepSeek-VL2: <gh-file:vllm/model_executor/models/deepseek_vl2.py>
-- InternVL: <gh-file:vllm/model_executor/models/internvl.py>
-- Qwen-VL: <gh-file:vllm/model_executor/models/qwen_vl.py>
diff --git a/docs/source/contributing/model/registration.md b/docs/source/contributing/model/registration.md
deleted file mode 100644
index 64cd25b53807..000000000000
--- a/docs/source/contributing/model/registration.md
+++ /dev/null
@@ -1,55 +0,0 @@
-(new-model-registration)=
-
-# Registering a Model to vLLM
-
-vLLM relies on a model registry to determine how to run each model.
-A list of pre-registered architectures can be found [here](#supported-models).
-
-If your model is not on this list, you must register it to vLLM.
-This page provides detailed instructions on how to do so.
-
-## Built-in models
-
-To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source](#build-from-source).
-This gives you the ability to modify the codebase and test your model.
-
-After you have implemented your model (see [tutorial](#new-model-basic)), put it into the <gh-dir:vllm/model_executor/models> directory.
-Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
-Finally, update our [list of supported models](#supported-models) to promote your model!
-
-:::{important}
-The list of models in each section should be maintained in alphabetical order.
-:::
-
-## Out-of-tree models
-
-You can load an external model using a plugin without modifying the vLLM codebase.
-
-:::{seealso}
-[vLLM's Plugin System](#plugin-system)
-:::
-
-To register the model, use the following code:
-
-```python
-from vllm import ModelRegistry
-from your_code import YourModelForCausalLM
-ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
-```
-
-If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`:
-
-```python
-from vllm import ModelRegistry
-
-ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
-```
-
-:::{important}
-If your model is a multimodal model, ensure the model class implements the {class}`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface.
-Read more about that [here](#supports-multimodal).
-:::
-
-:::{note}
-Although you can directly put these code snippets in your script using `vllm.LLM`, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server.
-:::
diff --git a/docs/source/deployment/docker.md b/docs/source/deployment/docker.md
deleted file mode 100644
index ca56710bc2ef..000000000000
--- a/docs/source/deployment/docker.md
+++ /dev/null
@@ -1,133 +0,0 @@
-(deployment-docker)=
-
-# Using Docker
-
-(deployment-docker-pre-built-image)=
-
-## Use vLLM's Official Docker Image
-
-vLLM offers an official Docker image for deployment.
-The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).
-
-```console
-$ docker run --runtime nvidia --gpus all \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
-    -p 8000:8000 \
-    --ipc=host \
-    vllm/vllm-openai:latest \
-    --model mistralai/Mistral-7B-v0.1
-```
-
-This image can also be used with other container engines such as [Podman](https://podman.io/).
-
-```console
-$ podman run --gpus all \
-  -v ~/.cache/huggingface:/root/.cache/huggingface \
-  --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
-  -p 8000:8000 \
-  --ipc=host \
-  vllm/vllm-openai:latest \
-  --model mistralai/Mistral-7B-v0.1
-```
-
-You can add any other <project:#engine-args> you need after the image tag (`vllm/vllm-openai:latest`).
-
-:::{note}
-You can either use the `--ipc=host` flag or `--shm-size` flag to allow the
-container to access the host's shared memory. vLLM uses PyTorch, which uses shared
-memory to share data between processes under the hood, particularly for tensor parallel inference.
-:::
-
-:::{note}
-Optional dependencies are not included in order to avoid licensing issues (e.g. <gh-issue:8030>).
-
-If you need to use those dependencies (having accepted the license terms),
-create a custom Dockerfile on top of the base image with an extra layer that installs them:
-
-```Dockerfile
-FROM vllm/vllm-openai:v0.8.3
-
-# e.g. install the `audio` optional dependencies
-# NOTE: Make sure the version of vLLM matches the base image!
-RUN uv pip install --system vllm[audio]==0.8.3
-```
-
-:::
-
-:::{tip}
-Some new models may only be available on the main branch of [HF Transformers](https://github.com/huggingface/transformers).
-
-To use the development version of `transformers`, create a custom Dockerfile on top of the base image
-with an extra layer that installs their code from source:
-
-```Dockerfile
-FROM vllm/vllm-openai:latest
-
-RUN uv pip install --system git+https://github.com/huggingface/transformers.git
-```
-
-:::
-
-(deployment-docker-build-image-from-source)=
-
-## Building vLLM's Docker Image from Source
-
-You can build and run vLLM from source via the provided <gh-file:docker/Dockerfile>. To build vLLM:
-
-```console
-# optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
-DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile
-```
-
-:::{note}
-By default vLLM will build for all GPU types for widest distribution. If you are just building for the
-current GPU type the machine is running on, you can add the argument `--build-arg torch_cuda_arch_list=""`
-for vLLM to find the current GPU type and build for that.
-
-If you are using Podman instead of Docker, you might need to disable SELinux labeling by
-adding `--security-opt label=disable` when running `podman build` command to avoid certain [existing issues](https://github.com/containers/buildah/discussions/4184).
-:::
-
-## Building for Arm64/aarch64
-
-A docker container can be built for aarch64 systems such as the Nvidia Grace-Hopper. At the time of this writing, this requires the use
-of PyTorch Nightly and should be considered **experimental**. Using the flag `--platform "linux/arm64"` will attempt to build for arm64.
-
-:::{note}
-Multiple modules must be compiled, so this process can take a while. We recommend using the `--build-arg max_jobs=` & `--build-arg nvcc_threads=`
-flags to speed up the build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
-Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).
-:::
-
-```console
-# Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
-$ python3 use_existing_torch.py
-$ DOCKER_BUILDKIT=1 docker build . \
-  --file docker/Dockerfile \
-  --target vllm-openai \
-  --platform "linux/arm64" \
-  -t vllm/vllm-gh200-openai:latest \
-  --build-arg max_jobs=66 \
-  --build-arg nvcc_threads=2 \
-  --build-arg torch_cuda_arch_list="9.0+PTX" \
-  --build-arg vllm_fa_cmake_gpu_arches="90-real"
-```
-
-## Use the custom-built vLLM Docker image
-
-To run vLLM with the custom-built Docker image:
-
-```console
-$ docker run --runtime nvidia --gpus all \
-    -v ~/.cache/huggingface:/root/.cache/huggingface \
-    -p 8000:8000 \
-    --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
-    vllm/vllm-openai <args...>
-```
-
-The argument `vllm/vllm-openai` specifies the image to run, and should be replaced with the name of the custom-built image (the `-t` tag from the build command).
-
-:::{note}
-**For version 0.4.1 and 0.4.2 only** - the vLLM docker images under these versions are supposed to be run under the root user since a library under the root user's home directory, i.e. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` is required to be loaded during runtime. If you are running the container under a different user, you may need to first change the permissions of the library (and all the parent directories) to allow the user to access it, then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` .
-:::
diff --git a/docs/source/deployment/frameworks/helm.md b/docs/source/deployment/frameworks/helm.md
deleted file mode 100644
index 7320d727fbaa..000000000000
--- a/docs/source/deployment/frameworks/helm.md
+++ /dev/null
@@ -1,250 +0,0 @@
-(deployment-helm)=
-
-# Helm
-
-A Helm chart to deploy vLLM for Kubernetes
-
-Helm is a package manager for Kubernetes. It will help you to deploy vLLM on k8s and automate the deployment of vLLM Kubernetes applications. With Helm, you can deploy the same framework architecture with different configurations to multiple namespaces by overriding variable values.
-
-This guide will walk you through the process of deploying vLLM with Helm, including the necessary prerequisites, steps for helm installation and documentation on architecture and values file.
-
-## Prerequisites
-
-Before you begin, ensure that you have the following:
-
-- A running Kubernetes cluster
-- NVIDIA Kubernetes Device Plugin (`k8s-device-plugin`): This can be found at [https://github.com/NVIDIA/k8s-device-plugin](https://github.com/NVIDIA/k8s-device-plugin)
-- Available GPU resources in your cluster
-- S3 with the model which will be deployed
-
-## Installing the chart
-
-To install the chart with the release name `test-vllm`:
-
-```console
-helm upgrade --install --create-namespace --namespace=ns-vllm test-vllm . -f values.yaml --set secrets.s3endpoint=$ACCESS_POINT --set secrets.s3bucketname=$BUCKET --set secrets.s3accesskeyid=$ACCESS_KEY --set secrets.s3accesskey=$SECRET_KEY
-```
-
-## Uninstalling the Chart
-
-To uninstall the `test-vllm` deployment:
-
-```console
-helm uninstall test-vllm --namespace=ns-vllm
-```
-
-The command removes all the Kubernetes components associated with the
-chart **including persistent volumes** and deletes the release.
-
-## Architecture
-
-:::{image} /assets/deployment/architecture_helm_deployment.png
-:::
-
-## Values
-
-:::{list-table}
-:widths: 25 25 25 25
-:header-rows: 1
-
-- * Key
-  * Type
-  * Default
-  * Description
-- * autoscaling
-  * object
-  * {"enabled":false,"maxReplicas":100,"minReplicas":1,"targetCPUUtilizationPercentage":80}
-  * Autoscaling configuration
-- * autoscaling.enabled
-  * bool
-  * false
-  * Enable autoscaling
-- * autoscaling.maxReplicas
-  * int
-  * 100
-  * Maximum replicas
-- * autoscaling.minReplicas
-  * int
-  * 1
-  * Minimum replicas
-- * autoscaling.targetCPUUtilizationPercentage
-  * int
-  * 80
-  * Target CPU utilization for autoscaling
-- * configs
-  * object
-  * {}
-  * Configmap
-- * containerPort
-  * int
-  * 8000
-  * Container port
-- * customObjects
-  * list
-  * []
-  * Custom Objects configuration
-- * deploymentStrategy
-  * object
-  * {}
-  * Deployment strategy configuration
-- * externalConfigs
-  * list
-  * []
-  * External configuration
-- * extraContainers
-  * list
-  * []
-  * Additional containers configuration
-- * extraInit
-  * object
-  * {"pvcStorage":"1Gi","s3modelpath":"relative_s3_model_path/opt-125m", "awsEc2MetadataDisabled": true}
-  * Additional configuration for the init container
-- * extraInit.pvcStorage
-  * string
-  * "50Gi"
-  * Storage size of the s3
-- * extraInit.s3modelpath
-  * string
-  * "relative_s3_model_path/opt-125m"
-  * Path of the model on the s3 which hosts model weights and config files
-- * extraInit.awsEc2MetadataDisabled
-  * boolean
-  * true
-  * Disables the use of the Amazon EC2 instance metadata service
-- * extraPorts
-  * list
-  * []
-  * Additional ports configuration
-- * gpuModels
-  * list
-  * ["TYPE_GPU_USED"]
-  * Type of gpu used
-- * image
-  * object
-  * {"command":["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"],"repository":"vllm/vllm-openai","tag":"latest"}
-  * Image configuration
-- * image.command
-  * list
-  * ["vllm","serve","/data/","--served-model-name","opt-125m","--host","0.0.0.0","--port","8000"]
-  * Container launch command
-- * image.repository
-  * string
-  * "vllm/vllm-openai"
-  * Image repository
-- * image.tag
-  * string
-  * "latest"
-  * Image tag
-- * livenessProbe
-  * object
-  * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":15,"periodSeconds":10}
-  * Liveness probe configuration
-- * livenessProbe.failureThreshold
-  * int
-  * 3
-  * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not alive
-- * livenessProbe.httpGet
-  * object
-  * {"path":"/health","port":8000}
-  * Configuration of the Kubelet http request on the server
-- * livenessProbe.httpGet.path
-  * string
-  * "/health"
-  * Path to access on the HTTP server
-- * livenessProbe.httpGet.port
-  * int
-  * 8000
-  * Name or number of the port to access on the container, on which the server is listening
-- * livenessProbe.initialDelaySeconds
-  * int
-  * 15
-  * Number of seconds after the container has started before liveness probe is initiated
-- * livenessProbe.periodSeconds
-  * int
-  * 10
-  * How often (in seconds) to perform the liveness probe
-- * maxUnavailablePodDisruptionBudget
-  * string
-  * ""
-  * Disruption Budget Configuration
-- * readinessProbe
-  * object
-  * {"failureThreshold":3,"httpGet":{"path":"/health","port":8000},"initialDelaySeconds":5,"periodSeconds":5}
-  * Readiness probe configuration
-- * readinessProbe.failureThreshold
-  * int
-  * 3
-  * Number of times after which if a probe fails in a row, Kubernetes considers that the overall check has failed: the container is not ready
-- * readinessProbe.httpGet
-  * object
-  * {"path":"/health","port":8000}
-  * Configuration of the Kubelet http request on the server
-- * readinessProbe.httpGet.path
-  * string
-  * "/health"
-  * Path to access on the HTTP server
-- * readinessProbe.httpGet.port
-  * int
-  * 8000
-  * Name or number of the port to access on the container, on which the server is listening
-- * readinessProbe.initialDelaySeconds
-  * int
-  * 5
-  * Number of seconds after the container has started before readiness probe is initiated
-- * readinessProbe.periodSeconds
-  * int
-  * 5
-  * How often (in seconds) to perform the readiness probe
-- * replicaCount
-  * int
-  * 1
-  * Number of replicas
-- * resources
-  * object
-  * {"limits":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1},"requests":{"cpu":4,"memory":"16Gi","nvidia.com/gpu":1}}
-  * Resource configuration
-- * resources.limits."nvidia.com/gpu"
-  * int
-  * 1
-  * Number of gpus used
-- * resources.limits.cpu
-  * int
-  * 4
-  * Number of CPUs
-- * resources.limits.memory
-  * string
-  * "16Gi"
-  * CPU memory configuration
-- * resources.requests."nvidia.com/gpu"
-  * int
-  * 1
-  * Number of gpus used
-- * resources.requests.cpu
-  * int
-  * 4
-  * Number of CPUs
-- * resources.requests.memory
-  * string
-  * "16Gi"
-  * CPU memory configuration
-- * secrets
-  * object
-  * {}
-  * Secrets configuration
-- * serviceName
-  * string
-  *
-  * Service name
-- * servicePort
-  * int
-  * 80
-  * Service port
-- * labels.environment
-  * string
-  * test
-  * Environment name
-- * labels.release
-  * string
-  * test
-  * Release name
-:::
diff --git a/docs/source/deployment/frameworks/index.md b/docs/source/deployment/frameworks/index.md
deleted file mode 100644
index 3408c6c10ede..000000000000
--- a/docs/source/deployment/frameworks/index.md
+++ /dev/null
@@ -1,22 +0,0 @@
-# Using other frameworks
-
-:::{toctree}
-:maxdepth: 1
-
-anything-llm
-bentoml
-cerebrium
-chatbox
-dify
-dstack
-helm
-litellm
-lobe-chat
-lws
-modal
-open-webui
-retrieval_augmented_generation
-skypilot
-streamlit
-triton
-:::
diff --git a/docs/source/deployment/integrations/index.md b/docs/source/deployment/integrations/index.md
deleted file mode 100644
index 410742b88c73..000000000000
--- a/docs/source/deployment/integrations/index.md
+++ /dev/null
@@ -1,11 +0,0 @@
-# External Integrations
-
-:::{toctree}
-:maxdepth: 1
-
-kserve
-kubeai
-llamastack
-llmaz
-production-stack
-:::
diff --git a/docs/source/design/kernel/paged_attention.md b/docs/source/design/kernel/paged_attention.md
deleted file mode 100644
index e1770c822643..000000000000
--- a/docs/source/design/kernel/paged_attention.md
+++ /dev/null
@@ -1,529 +0,0 @@
-(design-paged-attention)=
-
-# vLLM Paged Attention
-
-- Currently, vLLM utilizes its own implementation of a multi-head query
-  attention kernel (`csrc/attention/attention_kernels.cu`).
-  This kernel is designed to be compatible with
-  vLLM's paged KV caches, where the key and value cache are stored in
-  separate blocks (note that this block concept differs from the GPU
-  thread block, so later in this document I will refer to a vLLM paged
-  attention block as "block" and to a GPU thread block as
-  "thread block").
-- To achieve high performance, this kernel relies on a specially
-  designed memory layout and access method, specifically when threads
-  read data from global memory to shared memory. The purpose of this
-  document is to provide a high-level explanation of the kernel
-  implementation step by step, aiding those who wish to learn about the
-  vLLM multi-head query attention kernel. After going through this
-  document, users will likely have a better understanding and find it easier
-  to follow the actual implementation.
-- Please note that this document may not cover all details, such as how
-  to calculate the correct index for the corresponding data or the dot
-  multiplication implementation. However, after reading this document
-  and becoming familiar with the high-level logic flow, it should be
-  easier for you to read the actual code and understand the details.
-
-## Inputs
-
-- The kernel function takes a list of arguments for the current thread
-  to perform its assigned work. The three most important arguments are
-  the input pointers `q`, `k_cache`, and `v_cache`, which point
-  to query, key, and value data on global memory that need to be read
-  and processed. The output pointer `out` points to global memory
-  where the result should be written. These four pointers actually
-  refer to multi-dimensional arrays, but each thread only accesses the
-  portion of data assigned to it. I have omitted all other runtime
-  parameters here for simplicity.
-
-  ```cpp
-  template<
-  typename scalar_t,
-  int HEAD_SIZE,
-  int BLOCK_SIZE,
-  int NUM_THREADS,
-  int PARTITION_SIZE = 0>
-  __device__ void paged_attention_kernel(
-  ... // Other side args.
-  const scalar_t* __restrict__ out,       // [num_seqs, num_heads, max_num_partitions, head_size]
-  const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
-  const scalar_t* __restrict__ k_cache,   // [num_blocks, num_kv_heads, head_size/x, block_size, x]
-  const scalar_t* __restrict__ v_cache,   // [num_blocks, num_kv_heads, head_size, block_size]
-  ... // Other side args.
-  )
-  ```
-
-- There is also a list of template arguments above the function
-  signature that are determined during compilation time. `scalar_t`
-  represents the data type of the query, key, and value data elements,
-  such as FP16. `HEAD_SIZE` indicates the number of elements in each
-  head. `BLOCK_SIZE` refers to the number of tokens in each block.
-  `NUM_THREADS` denotes the number of threads in each thread block.
-  `PARTITION_SIZE` represents the number of tensor parallel GPUs (For
-  simplicity, we assume this is 0 and tensor parallel is disabled).
-
-- With these arguments, we need to perform a sequence of preparations.
-  This includes calculating the current head index, block index, and
-  other necessary variables. However, for now, we can ignore these
-  preparations and proceed directly to the actual calculations. It will
-  be easier to understand them once we grasp the entire flow.
-
-## Concepts
-
-- Just before we dive into the calculation flow, I want to describe a
-  few concepts that are needed for later sections. However, you may
-  skip this section and return later if you encounter any confusing
-  terminologies.
-- **Sequence**: A sequence represents a client request. For example,
-  the data pointed to by `q` has a shape of
-  `[num_seqs, num_heads, head_size]`. This means that `q` points to a
-  total of `num_seqs` query sequences. Since this
-  kernel is a single query attention kernel, each sequence only has one
-  query token. Hence, the `num_seqs` equals the total number of tokens
-  that are processed in the batch.
-- **Context**: The context consists of the generated tokens from the
-  sequence. For instance, `["What", "is", "your"]` are the context
-  tokens, and the input query token is `"name"`. The model might
-  generate the token `"?"`.
-- **Vec**: The vec is a list of elements that are fetched and
-  calculated together. For query and key data, the vec size
-  (`VEC_SIZE`) is determined so that each thread group can fetch and
-  calculate 16 bytes of data at a time. For value data, the vec size
-  (`V_VEC_SIZE`) is determined so that each thread can fetch and
-  calculate 16 bytes of data at a time. For example, if the
-  `scalar_t` is FP16 (2 bytes) and `THREAD_GROUP_SIZE` is 2, the
-  `VEC_SIZE` will be 4, while the `V_VEC_SIZE` will be 8.
-- **Thread group**: The thread group is a small group of
-  threads(`THREAD_GROUP_SIZE`) that fetches and calculates one
-  query token and one key token at a time. Each thread handles only a
-  portion of the token data. The total number of elements processed by
-  one thread group is referred to as `x`. For example, if the thread
-  group contains 2 threads and the head size is 8, then thread 0
-  handles the query and key elements at index 0, 2, 4, 6, while thread
-  1 handles the elements at index 1, 3, 5, 7.
-- **Block**: The key and value cache data in vLLM are split into
-  blocks. Each block stores data for a fixed number(`BLOCK_SIZE`)
-  of tokens at one head. Each block may contain only a portion of the
-  whole context tokens. For example, if the block size is 16 and the
-  head size is 128, then for one head, one block can store 16 * 128 =
-  2048 elements.
-- **Warp**: A warp is a group of 32 threads(`WARP_SIZE`) that
-  execute simultaneously on a streaming multiprocessor (SM). In this
-  kernel, each warp processes the calculation between one query token
-  and key tokens of one entire block at a time (it may process multiple
-  blocks in multiple iterations). For example, if there are 4 warps and
-  6 blocks for one context, the assignment would be like warp 0 handles
-  the 0th, 4th blocks, warp 1 handles the 1st, 5th blocks, warp 2
-  handles the 2nd block and warp 3 handles the 3rd block.
-- **Thread block**: A thread block is a group of
-  threads(`NUM_THREADS`) that can access the same shared memory.
-  Each thread block contains multiple warps(`NUM_WARPS`), and in
-  this kernel, each thread block processes the calculation between one
-  query token and key tokens of a whole context.
-- **Grid**: A grid is a collection of thread blocks and defines the
-  shape of the collection. In this kernel, the shape is
-  `(num_heads, num_seqs, max_num_partitions)`. Therefore, each thread
-  block only handles the calculation for one head, one sequence, and
-  one partition.
-
-## Query
-
-- This section will introduce how query data is stored in memory and
-  fetched by each thread. As mentioned above, each thread group fetches
-  one query token data, while each thread itself only handles a part of
-  one query token data. Within each warp, every thread group will fetch
-  the same query token data, but will multiply it with different key
-  token data.
-
-  ```cpp
-  const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
-  ```
-
-  :::{figure} ../../assets/kernel/query.png
-  :align: center
-  :alt: query
-  :width: 70%
-
-  Query data of one token at one head
-  :::
-
-- Each thread defines its own `q_ptr` which points to the assigned
-  query token data on global memory. For example, if `VEC_SIZE` is 4
-  and `HEAD_SIZE` is 128, the `q_ptr` points to data that contains
-  total of 128 elements divided into 128 / 4 = 32 vecs.
-
-  :::{figure} ../../assets/kernel/q_vecs.png
-  :align: center
-  :alt: q_vecs
-  :width: 70%
-
-  `q_vecs` for one thread group
-  :::
-
-  ```cpp
-  __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
-  ```
-
-- Next, we need to read the global memory data pointed to by `q_ptr`
-  into shared memory as `q_vecs`. It is important to note that each
-  vecs is assigned to a different row. For example, if the
-  `THREAD_GROUP_SIZE` is 2, thread 0 will handle the 0th row vecs,
-  while thread 1 handles the 1st row vecs. By reading the query data in
-  this way, neighboring threads like thread 0 and thread 1 can read
-  neighbor memory, achieving the memory coalescing to improve
-  performance.
-
-## Key
-
-- Similar to the "Query" section, this section introduces memory layout
-  and assignment for keys. While each thread group only handles one
-  query token per kernel run, it may handle multiple key tokens across
-  multiple iterations. Meanwhile, each warp will process multiple blocks
-  of key tokens in multiple iterations, ensuring that all context
-  tokens are processed by the entire thread group after the kernel run.
-  In this context, "handle" refers to performing the dot multiplication
-  between query data and key data.
-
-  ```cpp
-  const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride
-                      + kv_head_idx * kv_head_stride
-                      + physical_block_offset * x;
-  ```
-
-- Unlike `q_ptr`, `k_ptr` in each thread will point to a different
-  key token at different iterations. As shown above, `k_ptr`
-  points to key token data based on `k_cache` at the assigned block,
-  assigned head and assigned token.
-
-  :::{figure} ../../assets/kernel/key.png
-  :align: center
-  :alt: key
-  :width: 70%
-
-  Key data of all context tokens at one head
-  :::
-
-- The diagram above illustrates the memory layout for key data. It
-  assumes that the `BLOCK_SIZE` is 16, `HEAD_SIZE` is 128, `x` is
-  8, `THREAD_GROUP_SIZE` is 2, and there are a total of 4 warps. Each
-  rectangle represents all the elements for one key token at one head,
-  which will be processed by one thread group. The left half shows the
-  total 16 blocks of key token data for warp 0, while the right half
-  represents the remaining key token data for other warps or
-  iterations. Inside each rectangle, there are a total 32 vecs (128
-  elements for one token) that will be processed by 2 threads (one
-  thread group) separately.
-
-  :::{figure} ../../assets/kernel/k_vecs.png
-  :align: center
-  :alt: k_vecs
-  :width: 70%
-
-  `k_vecs` for one thread
-  :::
-
-  ```cpp
-  K_vec k_vecs[NUM_VECS_PER_THREAD]
-  ```
-
-- Next, we need to read the key token data from `k_ptr` and store
-  them on register memory as `k_vecs`. We use register memory for
-  `k_vecs` because it will only be accessed by one thread once,
-  whereas `q_vecs` will be accessed by multiple threads multiple
-  times. Each `k_vecs` will contain multiple vectors for later
-  calculation. Each vec will be set at each inner iteration. The
-  assignment of vecs allows neighboring threads in a warp to read
-  neighboring memory together, which again promotes the memory
-  coalescing. For instance, thread 0 will read vec 0, while thread 1
-  will read vec 1. In the next inner loop, thread 0 will read vec 2,
-  while thread 1 will read vec 3, and so on.
-
-- You may still be a little confused about the overall flow. Don't
-  worry, please keep reading the next "QK" section. It will illustrate
-  the query and key calculation flow in a clearer and higher-level
-  manner.
-
-## QK
-
-- As shown in the pseudocode below, before the entire for loop block, we
-  fetch the query data for one token and store it in `q_vecs`. Then,
-  in the outer for loop, we iterate through different `k_ptrs` that
-  point to different tokens and prepare the `k_vecs` in the inner for
-  loop. Finally, we perform the dot multiplication between the
-  `q_vecs` and each `k_vecs`.
-
-  ```cpp
-  q_vecs = ...
-  for ... {
-     k_ptr = ...
-     for ... {
-        k_vecs[i] = ...
-     }
-     ...
-     float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
-  }
-  ```
-
-- As mentioned before, each thread only fetches part of the
-  query and key token data at a time. However, a reduction across the
-  thread group happens inside `Qk_dot<>::dot`. So the `qk`
-  returned here is not just the dot product of part of the query and
-  key token data, but the full result over the entire query and
-  key token data.
-
-- For example, if the value of `HEAD_SIZE` is 128 and
-  `THREAD_GROUP_SIZE` is 2, each thread's `k_vecs` will contain
-  total 64 elements. However, the returned `qk` is actually the
-  result of dot multiplication between 128 query elements and 128 key
-  elements. If you want to learn more about the details of the dot
-  multiplication and reduction, you may refer to the implementation of
-  `Qk_dot<>::dot`. However, for the sake of simplicity, I will not
-  cover it in this document.
-
-## Softmax
-
-- Next, we need to calculate the normalized softmax for all `qk`s,
-  as shown below, where each $x$ represents a `qk`. To do this,
-  we must obtain the reduced value of `qk_max`($m(x)$) and
-  the `exp_sum`($\ell(x)$) of all `qk`s. The reduction
-  should be performed across the entire thread block, encompassing
-  results between the query token and all context key tokens.
-
-  :::{math}
-  :nowrap: true
-
-  \begin{gather*}
-  m(x):=\max _i \quad x_i \\ \quad f(x):=\left[\begin{array}{lll}e^{x_1-m(x)} & \ldots & e^{x_B-m(x)}\end{array}\right]\\ \quad \ell(x):=\sum_i f(x)_i \\
-  \quad \operatorname{softmax}(x):=\frac{f(x)}{\ell(x)}
-  \end{gather*}
-  :::
-
-### `qk_max` and `logits`
-
-- Just right after we get the `qk` result, we can set the temporary
-  `logits` result with `qk` (In the end, the `logits` should
-  store the normalized softmax result). Also we can compare and collect
-  the `qk_max` for all `qk`s that are calculated by current
-  thread group.
-
-  ```cpp
-  if (thread_group_offset == 0) {
-     const bool mask = token_idx >= context_len;
-     logits[token_idx - start_token_idx] = mask ? 0.f : qk;
-     qk_max = mask ? qk_max : fmaxf(qk_max, qk);
-  }
-  ```
-
-- Please note that the `logits` here is on shared memory, so each
-  thread group will set the fields for its own assigned context tokens.
-  Overall, the size of logits should be number of context tokens.
-
-  ```cpp
-  for (int mask = WARP_SIZE / 2; mask >= THREAD_GROUP_SIZE; mask /= 2) {
-      qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
-  }
-
-  if (lane == 0) {
-     red_smem[warp_idx] = qk_max;
-  }
-  ```
-
-- Then we need to get the reduced `qk_max` across each warp. The main
-  idea is to have the threads in a warp communicate with each other and
-  get the final max `qk`.
-
-  ```cpp
-  for (int mask = NUM_WARPS / 2; mask >= 1; mask /= 2) {
-      qk_max = fmaxf(qk_max, VLLM_SHFL_XOR_SYNC(qk_max, mask));
-  }
-  qk_max = VLLM_SHFL_SYNC(qk_max, 0);
-  ```
-
-- Finally, we can get the reduced `qk_max` from the whole thread block by
-  comparing the `qk_max` from all warps in this thread block. Then we
-  need to broadcast the final result to each thread.
-
-### `exp_sum`
-
-- Similar to `qk_max`, we need to get the reduced sum value from the
-  entire thread block too.
-
-  ```cpp
-  for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
-      float val = __expf(logits[i] - qk_max);
-      logits[i] = val;
-      exp_sum += val;
-  }
-  ...
-  exp_sum = block_sum<NUM_WARPS>(&red_smem[NUM_WARPS], exp_sum);
-  ```
-
-- Firstly, sum all exp values from each thread group, and meanwhile,
-  convert each entry of `logits` from `qk` to `exp(qk - qk_max)`.
-  Please note, the `qk_max` here is already the max `qk` across the
-  whole thread block. And then we can do reduction for `exp_sum`
-  across whole thread block just like the `qk_max`.
-
-  ```cpp
-  const float inv_sum = __fdividef(1.f, exp_sum + 1e-6f);
-  for (int i = thread_idx; i < num_tokens; i += NUM_THREADS) {
-     logits[i] *= inv_sum;
-  }
-  ```
-
-- Finally, with the reduced `qk_max` and `exp_sum`, we can obtain
-  the final normalized softmax result as `logits`. This `logits`
-  variable will be used for dot multiplication with the value data in
-  later steps. Now, it should store the normalized softmax result of
-  `qk` for all assigned context tokens.
-
-## Value
-
-:::{figure} ../../assets/kernel/value.png
-:align: center
-:alt: value
-:width: 70%
-
-Value data of all context tokens at one head
-:::
-
-:::{figure} ../../assets/kernel/logits_vec.png
-:align: center
-:alt: logits_vec
-:width: 50%
-
-`logits_vec` for one thread
-:::
-
-:::{figure} ../../assets/kernel/v_vec.png
-:align: center
-:alt: v_vec
-:width: 70%
-
-List of `v_vec` for one thread
-:::
-
-- Now we need to retrieve the value data and perform dot multiplication
-  with `logits`. Unlike query and key, there is no thread group
-  concept for value data. As shown in diagram, different from key token
-  memory layout, elements from the same column correspond to the same
-  value token. For one block of value data, there are `HEAD_SIZE` of
-  rows and `BLOCK_SIZE` of columns that are split into multiple
-  `v_vecs`.
-
-- Each thread always fetches `V_VEC_SIZE` elements from the same
-  `V_VEC_SIZE` of tokens at a time. As a result, a single thread
-  retrieves multiple `v_vec`s from different rows and the same
-  columns through multiple inner iterations. For each `v_vec`, it
-  needs to be dot multiplied with the corresponding `logits_vec`,
-  which is also `V_VEC_SIZE` elements from `logits`. Overall, with
-  multiple inner iterations, each warp will process one block of value
-  tokens. And with multiple outer iterations, all of the context's value
-  tokens are processed.
-
-  ```cpp
-  float accs[NUM_ROWS_PER_THREAD];
-  for ... { // Iteration over different blocks.
-      logits_vec = ...
-      for ... { // Iteration over different rows.
-          v_vec = ...
-          ...
-          accs[i] += dot(logits_vec, v_vec);
-      }
-  }
-  ```
-
-- As shown in the above pseudo code, in the outer loop, similar to
-  `k_ptr`, `logits_vec` iterates over different blocks and reads
-  `V_VEC_SIZE` elements from `logits`. In the inner loop, each
-  thread reads `V_VEC_SIZE` elements from the same tokens as a
-  `v_vec` and performs dot multiplication. It is important to note
-  that in each inner iteration, the thread fetches different head
-  position elements for the same tokens. The dot result is then
-  accumulated in `accs`. Therefore, each entry of `accs` is mapped
-  to a head position assigned to the current thread.
-
-- For example, if `BLOCK_SIZE` is 16 and `V_VEC_SIZE` is 8, each
-  thread fetches 8 value elements for 8 tokens at a time. Each element
-  is from different tokens at the same head position. If `HEAD_SIZE`
-  is 128 and `WARP_SIZE` is 32, for each inner loop, a warp needs to
-  fetch `WARP_SIZE * V_VEC_SIZE = 256` elements. This means there are
-  a total of 128 * 16 / 256 = 8 inner iterations for a warp to handle
-  a whole block of value tokens. And each `accs` in each thread
-  contains 8 elements that accumulated at 8 different head positions.
-  For thread 0, the `accs` variable will have 8 elements, which
-  are the 0th, 32nd … 224th elements of a value head, accumulated
-  from all 8 assigned tokens.
-
-## LV
-
-- Now, we need to perform reduction for `accs` within each warp. This
-  process allows each thread to accumulate the `accs` for the
-  assigned head positions of all tokens in one block.
-
-  ```cpp
-  for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-     float acc = accs[i];
-     for (int mask = NUM_V_VECS_PER_ROW / 2; mask >= 1; mask /= 2) {
-        acc += VLLM_SHFL_XOR_SYNC(acc, mask);
-     }
-     accs[i] = acc;
-  }
-  ```
-
-- Next, we perform reduction for `accs` across all warps, allowing
-  each thread to have the accumulation of `accs` for the assigned
-  head positions of all context tokens. Please note that each `accs`
-  in every thread only stores the accumulation for a portion of
-  elements of the entire head for all context tokens. However, overall,
-  all results for output have been calculated but are just stored in
-  different thread register memory.
-
-  ```cpp
-  float* out_smem = reinterpret_cast<float*>(shared_mem);
-  for (int i = NUM_WARPS; i > 1; i /= 2) {
-      // Upper warps write to shared memory.
-      ...
-          float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
-          for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-                  ...
-          dst[row_idx] = accs[i];
-      }
-
-      // Lower warps update the output.
-          const float* src = &out_smem[warp_idx * HEAD_SIZE];
-      for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-                  ...
-          accs[i] += src[row_idx];
-      }
-
-          // Write out the accs.
-  }
-  ```
-
-## Output
-
-- Now we can write all of the calculated results from local register
-  memory to the final output in global memory.
-
-  ```cpp
-  scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
-                  + head_idx * max_num_partitions * HEAD_SIZE
-                  + partition_idx * HEAD_SIZE;
-  ```
-
-- First, we need to define the `out_ptr` variable, which points to
-  the start address of the assigned sequence and assigned head.
-
-  ```cpp
-  for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
-  const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
-  if (row_idx < HEAD_SIZE && lane % NUM_V_VECS_PER_ROW == 0) {
-      from_float(*(out_ptr + row_idx), accs[i]);
-  }
-  }
-  ```
-
-- Finally, we need to iterate over different assigned head positions
-  and write out the corresponding accumulated result based on the
-  `out_ptr`.
diff --git a/docs/source/features/compatibility_matrix.md b/docs/source/features/compatibility_matrix.md
deleted file mode 100644
index 8865d26deaed..000000000000
--- a/docs/source/features/compatibility_matrix.md
+++ /dev/null
@@ -1,476 +0,0 @@
-(compatibility-matrix)=
-
-# Compatibility Matrix
-
-The tables below show mutually exclusive features and the support on some hardware.
-
-The symbols used have the following meanings:
-
-- ✅ = Full compatibility
-- 🟠 = Partial compatibility
-- ❌ = No compatibility
-
-:::{note}
-Cells marked ❌ or 🟠 that carry a link point to the tracking issue for that unsupported feature/hardware combination.
-:::
-
-## Feature x Feature
-
-:::{raw} html
-<style>
-  /* Make smaller to try to improve readability  */
-  td {
-    font-size: 0.8rem;
-    text-align: center;
-  }
-
-  th {
-    text-align: center;
-    font-size: 0.8rem;
-  }
-</style>
-:::
-
-:::{list-table}
-:header-rows: 1
-:stub-columns: 1
-:widths: auto
-:class: vertical-table-header
-
-- * Feature
-  * [CP](#chunked-prefill)
-  * [APC](#automatic-prefix-caching)
-  * [LoRA](#lora-adapter)
-  * <abbr title="Prompt Adapter">prmpt adptr</abbr>
-  * [SD](#spec-decode)
-  * CUDA graph
-  * <abbr title="Pooling Models">pooling</abbr>
-  * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
-  * <abbr title="Logprobs">logP</abbr>
-  * <abbr title="Prompt Logprobs">prmpt logP</abbr>
-  * <abbr title="Async Output Processing">async output</abbr>
-  * multi-step
-  * <abbr title="Multimodal Inputs">mm</abbr>
-  * best-of
-  * beam-search
-  * <abbr title="Guided Decoding">guided dec</abbr>
-- * [CP](#chunked-prefill)
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-- * [APC](#automatic-prefix-caching)
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-- * [LoRA](#lora-adapter)
-  * ✅
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-- * <abbr title="Prompt Adapter">prmpt adptr</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-- * [SD](#spec-decode)
-  * ✅
-  * ✅
-  * ❌
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-- * CUDA graph
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-- * <abbr title="Pooling Models">pooling</abbr>
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-- * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
-  * ❌
-  * [❌](gh-issue:7366)
-  * ❌
-  * ❌
-  * [❌](gh-issue:7366)
-  * ✅
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-- * <abbr title="Logprobs">logP</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❌
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-  *
-- * <abbr title="Prompt Logprobs">prmpt logP</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❌
-  * ✅
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-  *
-- * <abbr title="Async Output Processing">async output</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❌
-  * ✅
-  * ❌
-  * ❌
-  * ✅
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-  *
-- * multi-step
-  * ❌
-  * ✅
-  * ❌
-  * ✅
-  * ❌
-  * ✅
-  * ❌
-  * ❌
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  *
-  *
-  *
-  *
-- * <abbr title="Multimodal Inputs">mm</abbr>
-  * ✅
-  * [🟠](gh-pr:8348)
-  * [🟠](gh-pr:4194)
-  * ❔
-  * ❔
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❔
-  * ✅
-  *
-  *
-  *
-- * best-of
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * [❌](gh-issue:6137)
-  * ✅
-  * ❌
-  * ✅
-  * ✅
-  * ✅
-  * ❔
-  * [❌](gh-issue:7968)
-  * ✅
-  * ✅
-  *
-  *
-- * beam-search
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * [❌](gh-issue:6137)
-  * ✅
-  * ❌
-  * ✅
-  * ✅
-  * ✅
-  * ❔
-  * [❌](gh-issue:7968)
-  * ❔
-  * ✅
-  * ✅
-  *
-- * <abbr title="Guided Decoding">guided dec</abbr>
-  * ✅
-  * ✅
-  * ❔
-  * ❔
-  * [❌](gh-issue:11484)
-  * ✅
-  * ❌
-  * ❔
-  * ✅
-  * ✅
-  * ✅
-  * [❌](gh-issue:9893)
-  * ❔
-  * ✅
-  * ✅
-  * ✅
-:::
-
-(feature-x-hardware)=
-
-## Feature x Hardware
-
-:::{list-table}
-:header-rows: 1
-:stub-columns: 1
-:widths: auto
-
-- * Feature
-  * Volta
-  * Turing
-  * Ampere
-  * Ada
-  * Hopper
-  * CPU
-  * AMD
-- * [CP](#chunked-prefill)
-  * [❌](gh-issue:2729)
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-- * [APC](#automatic-prefix-caching)
-  * [❌](gh-issue:3687)
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-- * [LoRA](#lora-adapter)
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-- * <abbr title="Prompt Adapter">prmpt adptr</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * [❌](gh-issue:8475)
-  * ✅
-- * [SD](#spec-decode)
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-- * CUDA graph
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❌
-  * ✅
-- * <abbr title="Pooling Models">pooling</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❔
-- * <abbr title="Encoder-Decoder Models">enc-dec</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❌
-- * <abbr title="Multimodal Inputs">mm</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-- * <abbr title="Logprobs">logP</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-- * <abbr title="Prompt Logprobs">prmpt logP</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-- * <abbr title="Async Output Processing">async output</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ❌
-  * ❌
-- * multi-step
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * [❌](gh-issue:8477)
-  * ✅
-- * best-of
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-- * beam-search
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-- * <abbr title="Guided Decoding">guided dec</abbr>
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-  * ✅
-:::
diff --git a/docs/source/features/prompt_embeds.md b/docs/source/features/prompt_embeds.md
deleted file mode 100644
index 4e4648d171d5..000000000000
--- a/docs/source/features/prompt_embeds.md
+++ /dev/null
@@ -1,144 +0,0 @@
-# Prompt Embedding Inputs
-
-This page teaches you how to pass prompt embedding inputs to vLLM.
-
-## What are prompt embeddings?
-
-The traditional flow of text data for a Large Language Model goes from text to token ids (via a tokenizer) then from token ids to prompt embeddings. For a traditional decoder-only model (such as meta-llama/Llama-3.1-8B-Instruct), this step of converting token ids to prompt embeddings happens via a look-up from a learned embedding matrix, but the model is not limited to processing only the embeddings corresponding to its token vocabulary.
-
-:::{note}
-Prompt embeddings are currently only supported in the v0 engine.
-:::
-
-## Offline Inference
-
-To input prompt embeddings, follow this schema in {class}`vllm.inputs.EmbedsPrompt`:
-
-- `prompt_embeds`: A torch tensor representing a sequence of prompt/token embeddings. This has the shape (sequence_length, hidden_size), where sequence_length is the number of token embeddings and hidden_size is the hidden size (embedding size) of the model.
-
-### Hugging Face Transformers Inputs
-
-You can pass prompt embeddings from Hugging Face Transformers models to the `'prompt_embeds'` field of the prompt embedding dictionary, as shown in the following examples:
-
-```python
-from vllm import LLM
-import transformers
-
-model_name = "meta-llama/Llama-3.2-1B-Instruct"
-
-# Transformers
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
-
-llm = LLM(model=model_name, enable_prompt_embeds=True)
-
-# Refer to the HuggingFace repo for the correct format to use
-chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
-token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')
-
-embedding_layer = transformers_model.get_input_embeddings()
-prompt_embeds = embedding_layer(token_ids).squeeze(0)
-
-# Single prompt inference
-outputs = llm.generate({
-    "prompt_embeds": prompt_embeds,
-})
-
-for o in outputs:
-    generated_text = o.outputs[0].text
-    print(generated_text)
-
-# Batch inference
-
-chats = [
-    [{"role": "user", "content": "Please tell me about the capital of France."}],
-    [{"role": "user", "content": "When is the day longest during the year?"}],
-    [{"role": "user", "content": "Where is bigger, the moon or the sun?"}]
-]
-
-token_ids_list = [
-    tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt') for chat in chats
-]
-prompt_embeds_list = [embedding_layer(token_ids).squeeze(0) for token_ids in token_ids_list]
-
-outputs = llm.generate(
-    [
-        {
-            "prompt_embeds": prompt_embeds,
-        } for prompt_embeds in prompt_embeds_list
-    ]
-)
-
-for o in outputs:
-    generated_text = o.outputs[0].text
-    print(generated_text)
-```
-
-## Online Serving
-
-Our OpenAI-compatible server accepts prompt embeddings inputs via the [Completions API](https://platform.openai.com/docs/api-reference/completions). Prompt embeddings inputs are added via a new `'prompt_embeds'` key in the JSON package.
-
-When a mixture of `'prompt_embeds'` and `'prompt'` inputs are provided in a single request, the prompt embeds are always returned first.
-
-Prompt embeddings are passed in as base64 encoded torch tensors.
-
-### Transformers Inputs via OpenAI Client
-
-First, launch the OpenAI-compatible server:
-
-```bash
-vllm serve meta-llama/Llama-3.2-1B-Instruct --task generate \
-  --max-model-len 4096 --enable-prompt-embeds
-```
-
-Then, you can use the OpenAI client as follows:
-
-```python
-import base64
-import io
-
-import torch
-import transformers
-from openai import OpenAI
-
-openai_api_key = "EMPTY"
-openai_api_base = "http://localhost:8000/v1"
-
-client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
-
-model_name = "meta-llama/Llama-3.2-1B-Instruct"
-
-# Transformers
-tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
-transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
-
-
-# Refer to the HuggingFace repo for the correct format to use
-chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
-token_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors='pt')
-
-embedding_layer = transformers_model.get_input_embeddings()
-prompt_embeds = embedding_layer(token_ids).squeeze(0)
-
-# Prompt embeddings
-buffer = io.BytesIO()
-torch.save(prompt_embeds, buffer)
-buffer.seek(0)
-binary_data = buffer.read()
-encoded_embeds = base64.b64encode(binary_data).decode('utf-8')
-
-
-completion = client.completions.create(
-    model=model_name,
-    # NOTE: The OpenAI client does not allow `None` as an input to 
-    # `prompt`. Use an empty string if you have no text prompts.
-    prompt="",  
-    max_tokens=5,
-    temperature=0.0,
-    # NOTE: The OpenAI client allows passing in extra JSON body via the
-    # `extra_body` argument.
-    extra_body={"prompt_embeds": encoded_embeds}
-)
-
-print(completion.choices[0].text)
-```
diff --git a/docs/source/features/quantization/index.md b/docs/source/features/quantization/index.md
deleted file mode 100644
index 7ad46b7094ee..000000000000
--- a/docs/source/features/quantization/index.md
+++ /dev/null
@@ -1,24 +0,0 @@
-(quantization-index)=
-
-# Quantization
-
-Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices.
-
-:::{toctree}
-:caption: Contents
-:maxdepth: 1
-
-supported_hardware
-auto_awq
-bnb
-bitblas
-gguf
-gptqmodel
-int4
-int8
-fp8
-modelopt
-quark
-quantized_kvcache
-torchao
-:::
diff --git a/docs/source/features/quantization/supported_hardware.md b/docs/source/features/quantization/supported_hardware.md
deleted file mode 100644
index f8af1ba60b12..000000000000
--- a/docs/source/features/quantization/supported_hardware.md
+++ /dev/null
@@ -1,153 +0,0 @@
-(quantization-supported-hardware)=
-
-# Supported Hardware
-
-The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
-
-:::{list-table}
-:header-rows: 1
-:widths: 20 8 8 8 8 8 8 8 8 8 8
-
-- * Implementation
-  * Volta
-  * Turing
-  * Ampere
-  * Ada
-  * Hopper
-  * AMD GPU
-  * Intel GPU
-  * x86 CPU
-  * AWS Inferentia
-  * Google TPU
-- * AWQ
-  * ❌
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-- * GPTQ
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-- * Marlin (GPTQ/AWQ/FP8)
-  * ❌
-  * ❌
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-- * INT8 (W8A8)
-  * ❌
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ✅︎
-  * ❌
-  * ✅︎
-- * FP8 (W8A8)
-  * ❌
-  * ❌
-  * ❌
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-- * BitBLAS (GPTQ)
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-- * AQLM
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-- * bitsandbytes
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-- * DeepSpeedFP
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-- * GGUF
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-- * modelopt
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ✅︎
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-  * ❌
-:::
-
-- Volta refers to SM 7.0, Turing to SM 7.5, Ampere to SM 8.0/8.6, Ada to SM 8.9, and Hopper to SM 9.0.
-- ✅︎ indicates that the quantization method is supported on the specified hardware.
-- ❌ indicates that the quantization method is not supported on the specified hardware.
-
-:::{note}
-This compatibility chart is subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.
-
-For the most up-to-date information on hardware support and quantization methods, please refer to <gh-dir:vllm/model_executor/layers/quantization> or consult with the vLLM development team.
-:::
diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py
deleted file mode 100644
index f77dbefb0a01..000000000000
--- a/docs/source/generate_examples.py
+++ /dev/null
@@ -1,244 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-import itertools
-import re
-from dataclasses import dataclass, field
-from pathlib import Path
-
-ROOT_DIR = Path(__file__).parent.parent.parent.resolve()
-ROOT_DIR_RELATIVE = '../../../..'
-EXAMPLE_DIR = ROOT_DIR / "examples"
-EXAMPLE_DOC_DIR = ROOT_DIR / "docs/source/getting_started/examples"
-
-
-def fix_case(text: str) -> str:
-    subs = {
-        "api": "API",
-        "cli": "CLI",
-        "cpu": "CPU",
-        "llm": "LLM",
-        "mae": "MAE",
-        "tpu": "TPU",
-        "aqlm": "AQLM",
-        "gguf": "GGUF",
-        "lora": "LoRA",
-        "rlhf": "RLHF",
-        "vllm": "vLLM",
-        "openai": "OpenAI",
-        "lmcache": "LMCache",
-        "multilora": "MultiLoRA",
-        "mlpspeculator": "MLPSpeculator",
-        r"fp\d+": lambda x: x.group(0).upper(),  # e.g. fp16, fp32
-        r"int\d+": lambda x: x.group(0).upper(),  # e.g. int8, int16
-    }
-    for pattern, repl in subs.items():
-        text = re.sub(rf'\b{pattern}\b', repl, text, flags=re.IGNORECASE)
-    return text
-
-
-@dataclass
-class Index:
-    """
-    Index class to generate a structured document index.
-
-    Attributes:
-        path (Path): The path save the index file to.
-        title (str): The title of the index.
-        description (str): A brief description of the index.
-        caption (str): An optional caption for the table of contents.
-        maxdepth (int): The maximum depth of the table of contents. Defaults to 1.
-        documents (list[str]): A list of document paths to include in the index. Defaults to an empty list.
-
-    Methods:
-        generate() -> str:
-            Generates the index content as a string in the specified format.
-    """ # noqa: E501
-    path: Path
-    title: str
-    description: str
-    caption: str
-    maxdepth: int = 1
-    documents: list[str] = field(default_factory=list)
-
-    def generate(self) -> str:
-        content = f"# {self.title}\n\n{self.description}\n\n"
-        content += ":::{toctree}\n"
-        content += f":caption: {self.caption}\n:maxdepth: {self.maxdepth}\n"
-        content += "\n".join(self.documents) + "\n:::\n"
-        return content
-
-
-@dataclass
-class Example:
-    """
-    Example class for generating documentation content from a given path.
-
-    Attributes:
-        path (Path): The path to the main directory or file.
-        category (str): The category of the document.
-        main_file (Path): The main file in the directory.
-        other_files (list[Path]): list of other files in the directory.
-        title (str): The title of the document.
-
-    Methods:
-        __post_init__(): Initializes the main_file, other_files, and title attributes.
-        determine_main_file() -> Path: Determines the main file in the given path.
-        determine_other_files() -> list[Path]: Determines other files in the directory excluding the main file.
-        determine_title() -> str: Determines the title of the document.
-        generate() -> str: Generates the documentation content.
-    """ # noqa: E501
-    path: Path
-    category: str = None
-    main_file: Path = field(init=False)
-    other_files: list[Path] = field(init=False)
-    title: str = field(init=False)
-
-    def __post_init__(self):
-        self.main_file = self.determine_main_file()
-        self.other_files = self.determine_other_files()
-        self.title = self.determine_title()
-
-    def determine_main_file(self) -> Path:
-        """
-        Determines the main file in the given path.
-        If the path is a file, it returns the path itself. Otherwise, it searches
-        for Markdown files (*.md) in the directory and returns the first one found.
-        Returns:
-            Path: The main file path, either the original path if it's a file or the first
-            Markdown file found in the directory.
-        Raises:
-            IndexError: If no Markdown files are found in the directory.
-        """ # noqa: E501
-        return self.path if self.path.is_file() else list(
-            self.path.glob("*.md")).pop()
-
-    def determine_other_files(self) -> list[Path]:
-        """
-        Determine other files in the directory excluding the main file.
-
-        This method checks if the given path is a file. If it is, it returns an empty list.
-        Otherwise, it recursively searches through the directory and returns a list of all
-        files that are not the main file.
-
-        Returns:
-            list[Path]: A list of Path objects representing the other files in the directory.
-        """ # noqa: E501
-        if self.path.is_file():
-            return []
-        is_other_file = lambda file: file.is_file() and file != self.main_file
-        return [file for file in self.path.rglob("*") if is_other_file(file)]
-
-    def determine_title(self) -> str:
-        return fix_case(self.path.stem.replace("_", " ").title())
-
-    def generate(self) -> str:
-        # Convert the path to a relative path from __file__
-        make_relative = lambda path: ROOT_DIR_RELATIVE / path.relative_to(
-            ROOT_DIR)
-
-        content = f"Source <gh-file:{self.path.relative_to(ROOT_DIR)}>.\n\n"
-        include = "include" if self.main_file.suffix == ".md" else \
-            "literalinclude"
-        if include == "literalinclude":
-            content += f"# {self.title}\n\n"
-        content += f":::{{{include}}} {make_relative(self.main_file)}\n"
-        if include == "literalinclude":
-            content += f":language: {self.main_file.suffix[1:]}\n"
-        content += ":::\n\n"
-
-        if not self.other_files:
-            return content
-
-        content += "## Example materials\n\n"
-        for file in sorted(self.other_files):
-            include = "include" if file.suffix == ".md" else "literalinclude"
-            content += f":::{{admonition}} {file.relative_to(self.path)}\n"
-            content += ":class: dropdown\n\n"
-            content += f":::{{{include}}} {make_relative(file)}\n:::\n"
-            content += ":::\n\n"
-
-        return content
-
-
-def generate_examples():
-    # Create the EXAMPLE_DOC_DIR if it doesn't exist
-    if not EXAMPLE_DOC_DIR.exists():
-        EXAMPLE_DOC_DIR.mkdir(parents=True)
-
-    # Create empty indices
-    examples_index = Index(
-        path=EXAMPLE_DOC_DIR / "examples_index.md",
-        title="Examples",
-        description=
-        "A collection of examples demonstrating usage of vLLM.\nAll documented examples are autogenerated using <gh-file:docs/source/generate_examples.py> from examples found in <gh-file:examples>.",  # noqa: E501
-        caption="Examples",
-        maxdepth=2)
-    # Category indices stored in reverse order because they are inserted into
-    # examples_index.documents at index 0 in order
-    category_indices = {
-        "other":
-        Index(
-            path=EXAMPLE_DOC_DIR / "examples_other_index.md",
-            title="Other",
-            description=
-            "Other examples that don't strongly fit into the online or offline serving categories.",  # noqa: E501
-            caption="Examples",
-        ),
-        "online_serving":
-        Index(
-            path=EXAMPLE_DOC_DIR / "examples_online_serving_index.md",
-            title="Online Serving",
-            description=
-            "Online serving examples demonstrate how to use vLLM in an online setting, where the model is queried for predictions in real-time.",  # noqa: E501
-            caption="Examples",
-        ),
-        "offline_inference":
-        Index(
-            path=EXAMPLE_DOC_DIR / "examples_offline_inference_index.md",
-            title="Offline Inference",
-            description=
-            "Offline inference examples demonstrate how to use vLLM in an offline setting, where the model is queried for predictions in batches. We recommend starting with <project:basic.md>.",  # noqa: E501
-            caption="Examples",
-        ),
-    }
-
-    examples = []
-    glob_patterns = ["*.py", "*.md", "*.sh"]
-    # Find categorised examples
-    for category in category_indices:
-        category_dir = EXAMPLE_DIR / category
-        globs = [category_dir.glob(pattern) for pattern in glob_patterns]
-        for path in itertools.chain(*globs):
-            examples.append(Example(path, category))
-        # Find examples in subdirectories
-        for path in category_dir.glob("*/*.md"):
-            examples.append(Example(path.parent, category))
-    # Find uncategorised examples
-    globs = [EXAMPLE_DIR.glob(pattern) for pattern in glob_patterns]
-    for path in itertools.chain(*globs):
-        examples.append(Example(path))
-    # Find examples in subdirectories
-    for path in EXAMPLE_DIR.glob("*/*.md"):
-        # Skip categorised examples
-        if path.parent.name in category_indices:
-            continue
-        examples.append(Example(path.parent))
-
-    # Generate the example documentation
-    for example in sorted(examples, key=lambda e: e.path.stem):
-        doc_path = EXAMPLE_DOC_DIR / f"{example.path.stem}.md"
-        with open(doc_path, "w+") as f:
-            f.write(example.generate())
-        # Add the example to the appropriate index
-        index = category_indices.get(example.category, examples_index)
-        index.documents.append(example.path.stem)
-
-    # Generate the index files
-    for category_index in category_indices.values():
-        if category_index.documents:
-            examples_index.documents.insert(0, category_index.path.name)
-            with open(category_index.path, "w+") as f:
-                f.write(category_index.generate())
-
-    with open(examples_index.path, "w+") as f:
-        f.write(examples_index.generate())
diff --git a/docs/source/getting_started/installation.md b/docs/source/getting_started/installation.md
deleted file mode 100644
index 44134bf01b76..000000000000
--- a/docs/source/getting_started/installation.md
+++ /dev/null
@@ -1,28 +0,0 @@
-(installation-index)=
-
-# Installation
-
-vLLM supports the following hardware platforms:
-
-:::{toctree}
-:maxdepth: 1
-:hidden:
-
-installation/gpu
-installation/cpu
-installation/ai_accelerator
-:::
-
-- <project:installation/gpu.md>
-  - NVIDIA CUDA
-  - AMD ROCm
-  - Intel XPU
-- <project:installation/cpu.md>
-  - Intel/AMD x86
-  - ARM AArch64
-  - Apple silicon
-  - IBM Z (S390X)
-- <project:installation/ai_accelerator.md>
-  - Google TPU
-  - Intel Gaudi
-  - AWS Neuron
diff --git a/docs/source/getting_started/installation/ai_accelerator.md b/docs/source/getting_started/installation/ai_accelerator.md
deleted file mode 100644
index 0a207af1a4c7..000000000000
--- a/docs/source/getting_started/installation/ai_accelerator.md
+++ /dev/null
@@ -1,299 +0,0 @@
-# Other AI accelerators
-
-vLLM is a Python library that supports the following AI accelerators. Select your AI accelerator type to see vendor specific instructions:
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} Google TPU
-:selected:
-:sync: tpu
-
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-
-::::
-
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-
-::::
-
-::::{tab-item} AWS Neuron
-:sync: neuron
-
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-
-::::
-
-:::::
-
-## Requirements
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} Google TPU
-:sync: tpu
-
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "## Requirements"
-:end-before: "## Configure a new environment"
-:::
-
-::::
-
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "## Requirements"
-:end-before: "## Configure a new environment"
-:::
-
-::::
-
-::::{tab-item} AWS Neuron
-:sync: neuron
-
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "## Requirements"
-:end-before: "## Configure a new environment"
-:::
-
-::::
-
-:::::
-
-## Configure a new environment
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} Google TPU
-:sync: tpu
-
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "## Configure a new environment"
-:end-before: "## Set up using Python"
-:::
-
-::::
-
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "## Configure a new environment"
-:end-before: "## Set up using Python"
-:::
-
-::::
-
-::::{tab-item} AWS Neuron
-:sync: neuron
-
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "## Configure a new environment"
-:end-before: "## Set up using Python"
-:::
-
-::::
-
-:::::
-
-## Set up using Python
-
-### Pre-built wheels
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} Google TPU
-:sync: tpu
-
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-:::
-
-::::
-
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-:::
-
-::::
-
-::::{tab-item} AWS Neuron
-:sync: neuron
-
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-:::
-
-::::
-
-:::::
-
-### Build wheel from source
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} Google TPU
-:sync: tpu
-
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-
-::::
-
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-
-::::
-
-::::{tab-item} AWS Neuron
-:sync: neuron
-
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-
-::::
-
-:::::
-
-## Set up using Docker
-
-### Pre-built images
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} Google TPU
-:sync: tpu
-
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-
-::::
-
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-
-::::
-
-::::{tab-item} AWS Neuron
-:sync: neuron
-
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-
-::::
-
-:::::
-
-### Build image from source
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} Google TPU
-:sync: tpu
-
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Extra information"
-:::
-
-::::
-
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Extra information"
-:::
-
-::::
-
-::::{tab-item} AWS Neuron
-:sync: neuron
-
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Extra information"
-:::
-
-::::
-
-:::::
-
-## Extra information
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} Google TPU
-:sync: tpu
-
-:::{include} ai_accelerator/tpu.inc.md
-:start-after: "## Extra information"
-:::
-
-::::
-
-::::{tab-item} Intel Gaudi
-:sync: hpu-gaudi
-
-:::{include} ai_accelerator/hpu-gaudi.inc.md
-:start-after: "## Extra information"
-:::
-
-::::
-
-::::{tab-item} AWS Neuron
-:sync: neuron
-
-:::{include} ai_accelerator/neuron.inc.md
-:start-after: "## Extra information"
-:::
-
-::::
-
-:::::
diff --git a/docs/source/getting_started/installation/cpu/arm.inc.md b/docs/source/getting_started/installation/cpu/arm.inc.md
deleted file mode 100644
index e7d8d60630dc..000000000000
--- a/docs/source/getting_started/installation/cpu/arm.inc.md
+++ /dev/null
@@ -1,34 +0,0 @@
-# Installation
-
-vLLM has been adapted to work on ARM64 CPUs with NEON support, leveraging the CPU backend initially developed for the x86 platform.
-
-ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
-
-:::{attention}
-There are no pre-built wheels or images for this device, so you must build vLLM from source.
-:::
-
-## Requirements
-
-- OS: Linux
-- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
-- Instruction Set Architecture (ISA): NEON support is required
-
-## Set up using Python
-
-### Pre-built wheels
-
-### Build wheel from source
-
-:::{include} cpu/build.inc.md
-:::
-
-Testing has been conducted on AWS Graviton3 instances for compatibility.
-
-## Set up using Docker
-
-### Pre-built images
-
-### Build image from source
-
-## Extra information
diff --git a/docs/source/getting_started/installation/cpu/x86.inc.md b/docs/source/getting_started/installation/cpu/x86.inc.md
deleted file mode 100644
index 9ae2035db543..000000000000
--- a/docs/source/getting_started/installation/cpu/x86.inc.md
+++ /dev/null
@@ -1,41 +0,0 @@
-# Installation
-
-vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16.
-
-:::{attention}
-There are no pre-built wheels or images for this device, so you must build vLLM from source.
-:::
-
-## Requirements
-
-- OS: Linux
-- Compiler: `gcc/g++ >= 12.3.0` (optional, recommended)
-- Instruction Set Architecture (ISA): AVX512 (optional, recommended)
-
-:::{tip}
-[Intel Extension for PyTorch (IPEX)](https://github.com/intel/intel-extension-for-pytorch) extends PyTorch with up-to-date feature optimizations for an extra performance boost on Intel hardware.
-:::
-
-## Set up using Python
-
-### Pre-built wheels
-
-### Build wheel from source
-
-:::{include} cpu/build.inc.md
-:::
-
-:::{note}
-- AVX512_BF16 is an extension ISA that provides native BF16 data type conversion and vector product instructions, which brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16.
-- If you want to force-enable AVX512_BF16 for cross-compilation, please set the environment variable `VLLM_CPU_AVX512BF16=1` before building.
-:::
-
-## Set up using Docker
-
-### Pre-built images
-
-See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
-
-### Build image from source
-
-## Extra information
diff --git a/docs/source/getting_started/installation/gpu.md b/docs/source/getting_started/installation/gpu.md
deleted file mode 100644
index 22db992354fb..000000000000
--- a/docs/source/getting_started/installation/gpu.md
+++ /dev/null
@@ -1,301 +0,0 @@
-# GPU
-
-vLLM is a Python library that supports the following GPU variants. Select your GPU type to see vendor specific instructions:
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} NVIDIA CUDA
-:selected:
-:sync: cuda
-
-:::{include} gpu/cuda.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-
-::::
-
-::::{tab-item} AMD ROCm
-:sync: rocm
-
-:::{include} gpu/rocm.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-
-::::
-
-::::{tab-item} Intel XPU
-:sync: xpu
-
-:::{include} gpu/xpu.inc.md
-:start-after: "# Installation"
-:end-before: "## Requirements"
-:::
-
-::::
-
-:::::
-
-## Requirements
-
-- OS: Linux
-- Python: 3.9 -- 3.12
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} NVIDIA CUDA
-:sync: cuda
-
-:::{include} gpu/cuda.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-:::
-
-::::
-
-::::{tab-item} AMD ROCm
-:sync: rocm
-
-:::{include} gpu/rocm.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-:::
-
-::::
-
-::::{tab-item} Intel XPU
-:sync: xpu
-
-:::{include} gpu/xpu.inc.md
-:start-after: "## Requirements"
-:end-before: "## Set up using Python"
-:::
-
-::::
-
-:::::
-
-## Set up using Python
-
-### Create a new Python environment
-
-:::{include} python_env_setup.inc.md
-:::
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} NVIDIA CUDA
-:sync: cuda
-
-:::{include} gpu/cuda.inc.md
-:start-after: "## Create a new Python environment"
-:end-before: "### Pre-built wheels"
-:::
-
-::::
-
-::::{tab-item} AMD ROCm
-:sync: rocm
-
-There is no extra information on creating a new Python environment for this device.
-
-::::
-
-::::{tab-item} Intel XPU
-:sync: xpu
-
-There is no extra information on creating a new Python environment for this device.
-
-::::
-
-:::::
-
-### Pre-built wheels
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} NVIDIA CUDA
-:sync: cuda
-
-:::{include} gpu/cuda.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-:::
-
-::::
-
-::::{tab-item} AMD ROCm
-:sync: rocm
-
-:::{include} gpu/rocm.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-:::
-
-::::
-
-::::{tab-item} Intel XPU
-:sync: xpu
-
-:::{include} gpu/xpu.inc.md
-:start-after: "### Pre-built wheels"
-:end-before: "### Build wheel from source"
-:::
-
-::::
-
-:::::
-
-(build-from-source)=
-
-### Build wheel from source
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} NVIDIA CUDA
-:sync: cuda
-
-:::{include} gpu/cuda.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-
-::::
-
-::::{tab-item} AMD ROCm
-:sync: rocm
-
-:::{include} gpu/rocm.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-
-::::
-
-::::{tab-item} Intel XPU
-:sync: xpu
-
-:::{include} gpu/xpu.inc.md
-:start-after: "### Build wheel from source"
-:end-before: "## Set up using Docker"
-:::
-
-::::
-
-:::::
-
-## Set up using Docker
-
-### Pre-built images
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} NVIDIA CUDA
-:sync: cuda
-
-:::{include} gpu/cuda.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-
-::::
-
-::::{tab-item} AMD ROCm
-:sync: rocm
-
-:::{include} gpu/rocm.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-
-::::
-
-::::{tab-item} Intel XPU
-:sync: xpu
-
-:::{include} gpu/xpu.inc.md
-:start-after: "### Pre-built images"
-:end-before: "### Build image from source"
-:::
-
-::::
-
-:::::
-
-### Build image from source
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} NVIDIA CUDA
-:sync: cuda
-
-:::{include} gpu/cuda.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Supported features"
-:::
-
-::::
-
-::::{tab-item} AMD ROCm
-:sync: rocm
-
-:::{include} gpu/rocm.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Supported features"
-:::
-
-::::
-
-::::{tab-item} Intel XPU
-:sync: xpu
-
-:::{include} gpu/xpu.inc.md
-:start-after: "### Build image from source"
-:end-before: "## Supported features"
-:::
-
-::::
-
-:::::
-
-## Supported features
-
-:::::{tab-set}
-:sync-group: device
-
-::::{tab-item} NVIDIA CUDA
-:sync: cuda
-
-:::{include} gpu/cuda.inc.md
-:start-after: "## Supported features"
-:::
-
-::::
-
-::::{tab-item} AMD ROCm
-:sync: rocm
-
-:::{include} gpu/rocm.inc.md
-:start-after: "## Supported features"
-:::
-
-::::
-
-::::{tab-item} Intel XPU
-:sync: xpu
-
-:::{include} gpu/xpu.inc.md
-:start-after: "## Supported features"
-:::
-
-::::
-
-:::::
diff --git a/docs/source/getting_started/installation/python_env_setup.inc.md b/docs/source/getting_started/installation/python_env_setup.inc.md
deleted file mode 100644
index 00b61ea5c826..000000000000
--- a/docs/source/getting_started/installation/python_env_setup.inc.md
+++ /dev/null
@@ -1,19 +0,0 @@
-You can create a new Python environment using [conda](https://docs.conda.io/projects/conda/en/stable/user-guide/getting-started.html):
-
-```console
-# (Recommended) Create a new conda environment.
-conda create -n vllm python=3.12 -y
-conda activate vllm
-```
-
-:::{note}
-[PyTorch has deprecated the conda release channel](https://github.com/pytorch/pytorch/issues/138506). If you use `conda`, please only use it to create the Python environment rather than to install packages.
-:::
-
-Or you can create a new Python environment using [uv](https://docs.astral.sh/uv/), a very fast Python environment manager. Please follow the [documentation](https://docs.astral.sh/uv/#getting-started) to install `uv`. After installing `uv`, you can create a new Python environment using the following command:
-
-```console
-# (Recommended) Create a new uv environment. Use `--seed` to install `pip` and `setuptools` in the environment.
-uv venv --python 3.12 --seed
-source .venv/bin/activate
-```
diff --git a/docs/source/index.md b/docs/source/index.md
deleted file mode 100644
index db2192e87dcf..000000000000
--- a/docs/source/index.md
+++ /dev/null
@@ -1,217 +0,0 @@
-# Welcome to vLLM
-
-:::{figure} ./assets/logos/vllm-logo-text-light.png
-:align: center
-:alt: vLLM
-:class: no-scaled-link
-:width: 60%
-:::
-
-:::{raw} html
-<p style="text-align:center">
-<strong>Easy, fast, and cheap LLM serving for everyone
-</strong>
-</p>
-
-<p style="text-align:center">
-<script async defer src="https://buttons.github.io/buttons.js"></script>
-<a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a>
-<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
-<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
-</p>
-:::
-
-vLLM is a fast and easy-to-use library for LLM inference and serving.
-
-Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
-
-vLLM is fast with:
-
-- State-of-the-art serving throughput
-- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
-- Continuous batching of incoming requests
-- Fast model execution with CUDA/HIP graph
-- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8
-- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
-- Speculative decoding
-- Chunked prefill
-
-vLLM is flexible and easy to use with:
-
-- Seamless integration with popular HuggingFace models
-- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
-- Tensor parallelism and pipeline parallelism support for distributed inference
-- Streaming outputs
-- OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, IBM Power CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
-- Prefix caching support
-- Multi-LoRA support
-
-For more information, check out the following:
-
-- [vLLM announcement blog post](https://vllm.ai) (intro to PagedAttention)
-- [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023)
-- [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al.
-- [vLLM Meetups](#meetups)
-
-## Documentation
-
-% How to start using vLLM?
-
-:::{toctree}
-:caption: Getting Started
-:maxdepth: 1
-
-getting_started/installation
-getting_started/quickstart
-getting_started/examples/examples_index
-getting_started/troubleshooting
-getting_started/faq
-getting_started/v1_user_guide
-
-:::
-
-% What does vLLM support?
-
-:::{toctree}
-:caption: Models
-:maxdepth: 1
-
-models/supported_models
-models/generative_models
-models/pooling_models
-models/extensions/index
-:::
-
-% Additional capabilities
-
-:::{toctree}
-:caption: Features
-:maxdepth: 1
-
-features/quantization/index
-features/multimodal_inputs
-features/prompt_embeds
-features/lora
-features/tool_calling
-features/reasoning_outputs
-features/structured_outputs
-features/automatic_prefix_caching
-features/disagg_prefill
-features/spec_decode
-features/compatibility_matrix
-:::
-
-% Details about running vLLM
-
-:::{toctree}
-:caption: Training
-:maxdepth: 1
-
-training/trl.md
-training/rlhf.md
-
-:::
-
-:::{toctree}
-:caption: Inference and Serving
-:maxdepth: 1
-
-serving/offline_inference
-serving/openai_compatible_server
-serving/serve_args
-serving/distributed_serving
-serving/metrics
-serving/engine_args
-serving/env_vars
-serving/usage_stats
-serving/integrations/index
-:::
-
-% Scaling up vLLM for production
-
-:::{toctree}
-:caption: Deployment
-:maxdepth: 1
-
-deployment/security
-deployment/docker
-deployment/k8s
-deployment/nginx
-deployment/frameworks/index
-deployment/integrations/index
-:::
-
-% Making the most out of vLLM
-
-:::{toctree}
-:caption: Performance
-:maxdepth: 1
-
-performance/optimization
-performance/benchmarks
-:::
-
-% Explanation of vLLM internals
-
-:::{toctree}
-:caption: Design Documents
-:maxdepth: 2
-
-design/arch_overview
-design/huggingface_integration
-design/plugin_system
-design/kernel/paged_attention
-design/mm_processing
-design/automatic_prefix_caching
-design/multiprocessing
-:::
-
-:::{toctree}
-:caption: V1 Design Documents
-:maxdepth: 2
-
-design/v1/torch_compile
-design/v1/prefix_caching
-design/v1/metrics
-:::
-
-% How to contribute to the vLLM project
-
-:::{toctree}
-:caption: Developer Guide
-:maxdepth: 2
-
-contributing/overview
-contributing/deprecation_policy
-contributing/profiling/profiling_index
-contributing/dockerfile/dockerfile
-contributing/model/index
-contributing/vulnerability_management
-:::
-
-% Technical API specifications
-
-:::{toctree}
-:caption: API Reference
-:maxdepth: 2
-
-api/summary
-api/vllm/vllm
-:::
-
-% Latest news and acknowledgements
-
-:::{toctree}
-:caption: Community
-:maxdepth: 1
-
-community/blog
-community/meetups
-community/sponsors
-:::
-
-## Indices and tables
-
-- {ref}`genindex`
-- {ref}`modindex`
diff --git a/docs/source/models/extensions/index.md b/docs/source/models/extensions/index.md
deleted file mode 100644
index cdcdaa5b3501..000000000000
--- a/docs/source/models/extensions/index.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# Built-in Extensions
-
-:::{toctree}
-:maxdepth: 1
-
-runai_model_streamer
-tensorizer
-fastsafetensor
-:::
diff --git a/docs/source/models/supported_models.md b/docs/source/models/supported_models.md
deleted file mode 100644
index 4d574216242b..000000000000
--- a/docs/source/models/supported_models.md
+++ /dev/null
@@ -1,1401 +0,0 @@
-(supported-models)=
-
-# Supported Models
-
-vLLM supports [generative](generative-models) and [pooling](pooling-models) models across various tasks.
-If a model supports more than one task, you can set the task via the `--task` argument.
-
-For each task, we list the model architectures that have been implemented in vLLM.
-Alongside each architecture, we include some popular models that use it.
-
-## Model Implementation
-
-### vLLM
-
-If vLLM natively supports a model, its implementation can be found in <gh-file:vllm/model_executor/models>.
-
-These models are what we list in <project:#supported-text-models> and <project:#supported-mm-models>.
-
-(transformers-backend)=
-
-### Transformers
-
-vLLM also supports model implementations that are available in Transformers. This does not currently work for all models, but most decoder language models are supported, and vision language model support is planned!
-
-To check if the modeling backend is Transformers, you can simply do this:
-
-```python
-from vllm import LLM
-llm = LLM(model=..., task="generate")  # Name or path of your model
-llm.apply_model(lambda model: print(type(model)))
-```
-
-If it is `TransformersForCausalLM` then it means it's based on Transformers!
-
-:::{tip}
-You can force the use of `TransformersForCausalLM` by setting `model_impl="transformers"` for <project:#offline-inference> or `--model-impl transformers` for the <project:#openai-compatible-server>.
-:::
-
-:::{note}
-vLLM may not fully optimise the Transformers implementation so you may see degraded performance if comparing a native model to a Transformers model in vLLM.
-:::
-
-#### Custom models
-
-If a model is supported natively by neither vLLM nor Transformers, it can still be used in vLLM!
-
-For a model to be compatible with the Transformers backend for vLLM it must:
-
-- be a Transformers compatible custom model (see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)):
-  * The model directory must have the correct structure (e.g. `config.json` is present).
-  * `config.json` must contain `auto_map.AutoModel`.
-- be a Transformers backend for vLLM compatible model (see <project:#writing-custom-models>):
-  * Customisation should be done in the base model (e.g. in `MyModel`, not `MyModelForCausalLM`).
-
-If the compatible model is:
-
-- on the Hugging Face Model Hub, simply set `trust_remote_code=True` for <project:#offline-inference> or `--trust-remote-code` for the <project:#openai-compatible-server>.
-- in a local directory, simply pass the directory path to `model=<MODEL_DIR>` for <project:#offline-inference> or `vllm serve <MODEL_DIR>` for the <project:#openai-compatible-server>.
-
-This means that, with the Transformers backend for vLLM, new models can be used before they are officially supported in Transformers or vLLM!
-
-(writing-custom-models)=
-
-#### Writing custom models
-
-This section details the modifications needed to make a Transformers compatible custom model compatible with the Transformers backend for vLLM. (We assume that a Transformers compatible custom model has already been created, see [Transformers - Customizing models](https://huggingface.co/docs/transformers/en/custom_models)).
-
-To make your model compatible with the Transformers backend, it needs:
-
-1. `kwargs` passed down through all modules from `MyModel` to `MyAttention`.
-2. `MyAttention` must use `ALL_ATTENTION_FUNCTIONS` to call attention.
-3. `MyModel` must contain `_supports_attention_backend = True`.
-
-```{code-block} python
-:caption: modeling_my_model.py
-
-from transformers import PreTrainedModel
-from torch import nn
-
-class MyAttention(nn.Module):
-
-  def forward(self, hidden_states, **kwargs):
-    ...
-    attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
-    attn_output, attn_weights = attention_interface(
-      self,
-      query_states,
-      key_states,
-      value_states,
-      **kwargs,
-    )
-    ...
-
-class MyModel(PreTrainedModel):
-  _supports_attention_backend = True
-```
-
-Here is what happens in the background when this model is loaded:
-
-1. The config is loaded.
-2. `MyModel` Python class is loaded from the `auto_map` in config, and we check that the model `is_backend_compatible()`.
-3. `MyModel` is loaded into `TransformersForCausalLM` (see <gh-file:vllm/model_executor/models/transformers.py>) which sets `self.config._attn_implementation = "vllm"` so that vLLM's attention layer is used.
-
-That's it!
-
-For your model to be compatible with vLLM's tensor parallel and/or pipeline parallel features, you must add `base_model_tp_plan` and/or `base_model_pp_plan` to your model's config class:
-
-```{code-block} python
-:caption: configuration_my_model.py
-
-from transformers import PretrainedConfig
-
-class MyConfig(PretrainedConfig):
-  base_model_tp_plan = {
-    "layers.*.self_attn.k_proj": "colwise",
-    "layers.*.self_attn.v_proj": "colwise",
-    "layers.*.self_attn.o_proj": "rowwise",
-    "layers.*.mlp.gate_proj": "colwise",
-    "layers.*.mlp.up_proj": "colwise",
-    "layers.*.mlp.down_proj": "rowwise",
-  }
-  base_model_pp_plan = {
-    "embed_tokens": (["input_ids"], ["inputs_embeds"]),
-    "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
-    "norm": (["hidden_states"], ["hidden_states"]),
-  }
-```
-
-- `base_model_tp_plan` is a `dict` that maps fully qualified layer name patterns to tensor parallel styles (currently only `"colwise"` and `"rowwise"` are supported).
-- `base_model_pp_plan` is a `dict` that maps direct child layer names to `tuple`s of `list`s of `str`s:
-  * You only need to do this for layers which are not present on all pipeline stages
-  * vLLM assumes that there will be only one `nn.ModuleList`, which is distributed across the pipeline stages
-  * The `list` in the first element of the `tuple` contains the names of the input arguments
-  * The `list` in the last element of the `tuple` contains the names of the variables the layer outputs to in your modeling code
-
-## Loading a Model
-
-### Hugging Face Hub
-
-By default, vLLM loads models from [Hugging Face (HF) Hub](https://huggingface.co/models). To change the download path for models, you can set the `HF_HOME` environment variable; for more details, refer to [their official documentation](https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables#hfhome).
-
-To determine whether a given model is natively supported, you can check the `config.json` file inside the HF repository.
-If the `"architectures"` field contains a model architecture listed below, then it should be natively supported.
-
-Models do not _need_ to be natively supported to be used in vLLM.
-The [Transformers backend](#transformers-backend) enables you to run models directly using their Transformers implementation (or even remote code on the Hugging Face Model Hub!).
-
-:::{tip}
-The easiest way to check if your model is really supported at runtime is to run the program below:
-
-```python
-from vllm import LLM
-
-# For generative models (task=generate) only
-llm = LLM(model=..., task="generate")  # Name or path of your model
-output = llm.generate("Hello, my name is")
-print(output)
-
-# For pooling models (task={embed,classify,reward,score}) only
-llm = LLM(model=..., task="embed")  # Name or path of your model
-output = llm.encode("Hello, my name is")
-print(output)
-```
-
-If vLLM successfully returns text (for generative models) or hidden states (for pooling models), it indicates that your model is supported.
-:::
-
-Otherwise, please refer to [Adding a New Model](#new-model) for instructions on how to implement your model in vLLM.
-Alternatively, you can [open an issue on GitHub](https://github.com/vllm-project/vllm/issues/new/choose) to request vLLM support.
-
-#### Download a model
-
-If you prefer, you can use the Hugging Face CLI to [download a model](https://huggingface.co/docs/huggingface_hub/guides/cli#huggingface-cli-download) or specific files from a model repository:
-
-```console
-# Download a model
-huggingface-cli download HuggingFaceH4/zephyr-7b-beta
-
-# Specify a custom cache directory
-huggingface-cli download HuggingFaceH4/zephyr-7b-beta --cache-dir ./path/to/cache
-
-# Download a specific file from a model repo
-huggingface-cli download HuggingFaceH4/zephyr-7b-beta eval_results.json
-```
-
-#### List the downloaded models
-
-Use the Hugging Face CLI to [manage models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#scan-your-cache) stored in local cache:
-
-```console
-# List cached models
-huggingface-cli scan-cache
-
-# Show detailed (verbose) output
-huggingface-cli scan-cache -v
-
-# Specify a custom cache directory
-huggingface-cli scan-cache --dir ~/.cache/huggingface/hub
-```
-
-#### Delete a cached model
-
-Use the Hugging Face CLI to interactively [delete downloaded models](https://huggingface.co/docs/huggingface_hub/guides/manage-cache#clean-your-cache) from the cache:
-
-```console
-# The `delete-cache` command requires extra dependencies to work with the TUI.
-# Please run `pip install huggingface_hub[cli]` to install them.
-
-# Launch the interactive TUI to select models to delete
-$ huggingface-cli delete-cache
-? Select revisions to delete: 1 revisions selected counting for 438.9M.
-  ○ None of the following (if selected, nothing will be deleted).
-Model BAAI/bge-base-en-v1.5 (438.9M, used 1 week ago)
-❯ ◉ a5beb1e3: main # modified 1 week ago
-
-Model BAAI/bge-large-en-v1.5 (1.3G, used 1 week ago)
-  ○ d4aa6901: main # modified 1 week ago
-
-Model BAAI/bge-reranker-base (1.1G, used 4 weeks ago)
-  ○ 2cfc18c9: main # modified 4 weeks ago
-
-Press <space> to select, <enter> to validate and <ctrl+c> to quit without modification.
-
-# Need to confirm after selected
-? Select revisions to delete: 1 revision(s) selected.
-? 1 revisions selected counting for 438.9M. Confirm deletion ? Yes
-Start deletion.
-Done. Deleted 1 repo(s) and 0 revision(s) for a total of 438.9M.
-```
-
-#### Using a proxy
-
-Here are some tips for loading/downloading models from Hugging Face using a proxy:
-
-- Set the proxy globally for your session (or set it in the profile file):
-
-```shell
-export http_proxy=http://your.proxy.server:port
-export https_proxy=http://your.proxy.server:port
-```
-
-- Set the proxy for just the current command:
-
-```shell
-https_proxy=http://your.proxy.server:port huggingface-cli download <model_name>
-
-# or use vllm cmd directly
-https_proxy=http://your.proxy.server:port  vllm serve <model_name> --disable-log-requests
-```
-
-- Set the proxy in Python interpreter:
-
-```python
-import os
-
-os.environ['http_proxy'] = 'http://your.proxy.server:port'
-os.environ['https_proxy'] = 'http://your.proxy.server:port'
-```
-
-### ModelScope
-
-To use models from [ModelScope](https://www.modelscope.cn) instead of Hugging Face Hub, set an environment variable:
-
-```shell
-export VLLM_USE_MODELSCOPE=True
-```
-
-And use with `trust_remote_code=True`.
-
-```python
-from vllm import LLM
-
-llm = LLM(model=..., revision=..., task=..., trust_remote_code=True)
-
-# For generative models (task=generate) only
-output = llm.generate("Hello, my name is")
-print(output)
-
-# For pooling models (task={embed,classify,reward,score}) only
-output = llm.encode("Hello, my name is")
-print(output)
-```
-
-(feature-status-legend)=
-
-## Feature Status Legend
-
-- ✅︎ indicates that the feature is supported for the model.
-
-- 🚧 indicates that the feature is planned but not yet supported for the model.
-
-- ⚠️ indicates that the feature is available but may have known issues or limitations.
-
-(supported-text-models)=
-
-## List of Text-only Language Models
-
-### Generative Models
-
-See [this page](#generative-models) for more information on how to use generative models.
-
-#### Text Generation
-
-Specified using `--task generate`.
-
-:::{list-table}
-:widths: 25 25 50 5 5
-:header-rows: 1
-
-- * Architecture
-  * Models
-  * Example HF Models
-  * [LoRA](#lora-adapter)
-  * [PP](#distributed-serving)
-- * `AquilaForCausalLM`
-  * Aquila, Aquila2
-  * `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.
-  * ✅︎
-  * ✅︎
-- * `ArcticForCausalLM`
-  * Arctic
-  * `Snowflake/snowflake-arctic-base`, `Snowflake/snowflake-arctic-instruct`, etc.
-  *
-  * ✅︎
-- * `BaiChuanForCausalLM`
-  * Baichuan2, Baichuan
-  * `baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.
-  * ✅︎
-  * ✅︎
-- * `BambaForCausalLM`
-  * Bamba
-  * `ibm-ai-platform/Bamba-9B-fp8`, `ibm-ai-platform/Bamba-9B`
-  *
-  *
-- * `BloomForCausalLM`
-  * BLOOM, BLOOMZ, BLOOMChat
-  * `bigscience/bloom`, `bigscience/bloomz`, etc.
-  *
-  * ✅︎
-- * `BartForConditionalGeneration`
-  * BART
-  * `facebook/bart-base`, `facebook/bart-large-cnn`, etc.
-  *
-  *
-- * `ChatGLMModel`, `ChatGLMForConditionalGeneration`
-  * ChatGLM
-  * `THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, `ShieldLM-6B-chatglm3`, etc.
-  * ✅︎
-  * ✅︎
-- * `CohereForCausalLM`, `Cohere2ForCausalLM`
-  * Command-R
-  * `CohereForAI/c4ai-command-r-v01`, `CohereForAI/c4ai-command-r7b-12-2024`, etc.
-  * ✅︎
-  * ✅︎
-- * `DbrxForCausalLM`
-  * DBRX
-  * `databricks/dbrx-base`, `databricks/dbrx-instruct`, etc.
-  *
-  * ✅︎
-- * `DeciLMForCausalLM`
-  * DeciLM
-  * `nvidia/Llama-3_3-Nemotron-Super-49B-v1`, etc.
-  *
-  * ✅︎
-- * `DeepseekForCausalLM`
-  * DeepSeek
-  * `deepseek-ai/deepseek-llm-67b-base`, `deepseek-ai/deepseek-llm-7b-chat` etc.
-  *
-  * ✅︎
-- * `DeepseekV2ForCausalLM`
-  * DeepSeek-V2
-  * `deepseek-ai/DeepSeek-V2`, `deepseek-ai/DeepSeek-V2-Chat` etc.
-  *
-  * ✅︎
-- * `DeepseekV3ForCausalLM`
-  * DeepSeek-V3
-  * `deepseek-ai/DeepSeek-V3-Base`, `deepseek-ai/DeepSeek-V3` etc.
-  *
-  * ✅︎
-- * `ExaoneForCausalLM`
-  * EXAONE-3
-  * `LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct`, etc.
-  * ✅︎
-  * ✅︎
-- * `FalconForCausalLM`
-  * Falcon
-  * `tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.
-  *
-  * ✅︎
-- * `FalconMambaForCausalLM`
-  * FalconMamba
-  * `tiiuae/falcon-mamba-7b`, `tiiuae/falcon-mamba-7b-instruct`, etc.
-  * ✅︎
-  * ✅︎
-- * `GemmaForCausalLM`
-  * Gemma
-  * `google/gemma-2b`, `google/gemma-1.1-2b-it`, etc.
-  * ✅︎
-  * ✅︎
-- * `Gemma2ForCausalLM`
-  * Gemma 2
-  * `google/gemma-2-9b`, `google/gemma-2-27b`, etc.
-  * ✅︎
-  * ✅︎
-- * `Gemma3ForCausalLM`
-  * Gemma 3
-  * `google/gemma-3-1b-it`, etc.
-  * ✅︎
-  * ✅︎
-- * `GlmForCausalLM`
-  * GLM-4
-  * `THUDM/glm-4-9b-chat-hf`, etc.
-  * ✅︎
-  * ✅︎
-- * `Glm4ForCausalLM`
-  * GLM-4-0414
-  * `THUDM/GLM-4-32B-0414`, etc.
-  * ✅︎
-  * ✅︎
-- * `GPT2LMHeadModel`
-  * GPT-2
-  * `gpt2`, `gpt2-xl`, etc.
-  *
-  * ✅︎
-- * `GPTBigCodeForCausalLM`
-  * StarCoder, SantaCoder, WizardCoder
-  * `bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, `WizardLM/WizardCoder-15B-V1.0`, etc.
-  * ✅︎
-  * ✅︎
-- * `GPTJForCausalLM`
-  * GPT-J
-  * `EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.
-  *
-  * ✅︎
-- * `GPTNeoXForCausalLM`
-  * GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
-  * `EleutherAI/gpt-neox-20b`, `EleutherAI/pythia-12b`, `OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.
-  *
-  * ✅︎
-- * `GraniteForCausalLM`
-  * Granite 3.0, Granite 3.1, PowerLM
-  * `ibm-granite/granite-3.0-2b-base`, `ibm-granite/granite-3.1-8b-instruct`, `ibm/PowerLM-3b`, etc.
-  * ✅︎
-  * ✅︎
-- * `GraniteMoeForCausalLM`
-  * Granite 3.0 MoE, PowerMoE
-  * `ibm-granite/granite-3.0-1b-a400m-base`, `ibm-granite/granite-3.0-3b-a800m-instruct`, `ibm/PowerMoE-3b`, etc.
-  * ✅︎
-  * ✅︎
-- * `GraniteMoeHybridForCausalLM`
-  * Granite 4.0 MoE Hybrid
-  * `ibm-granite/granite-4.0-tiny-preview`, etc.
-  * ✅︎
-  * ✅︎
-- * `GraniteMoeSharedForCausalLM`
-  * Granite MoE Shared
-  * `ibm-research/moe-7b-1b-active-shared-experts` (test model)
-  * ✅︎
-  * ✅︎
-- * `GritLM`
-  * GritLM
-  * `parasail-ai/GritLM-7B-vllm`.
-  * ✅︎
-  * ✅︎
-- * `Grok1ModelForCausalLM`
-  * Grok1
-  * `hpcai-tech/grok-1`.
-  * ✅︎
-  * ✅︎
-- * `InternLMForCausalLM`
-  * InternLM
-  * `internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.
-  * ✅︎
-  * ✅︎
-- * `InternLM2ForCausalLM`
-  * InternLM2
-  * `internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.
-  * ✅︎
-  * ✅︎
-- * `InternLM3ForCausalLM`
-  * InternLM3
-  * `internlm/internlm3-8b-instruct`, etc.
-  * ✅︎
-  * ✅︎
-- * `JAISLMHeadModel`
-  * Jais
-  * `inceptionai/jais-13b`, `inceptionai/jais-13b-chat`, `inceptionai/jais-30b-v3`, `inceptionai/jais-30b-chat-v3`, etc.
-  *
-  * ✅︎
-- * `JambaForCausalLM`
-  * Jamba
-  * `ai21labs/AI21-Jamba-1.5-Large`, `ai21labs/AI21-Jamba-1.5-Mini`, `ai21labs/Jamba-v0.1`, etc.
-  * ✅︎
-  * ✅︎
-- * `LlamaForCausalLM`
-  * Llama 3.1, Llama 3, Llama 2, LLaMA, Yi
-  * `meta-llama/Meta-Llama-3.1-405B-Instruct`, `meta-llama/Meta-Llama-3.1-70B`, `meta-llama/Meta-Llama-3-70B-Instruct`, `meta-llama/Llama-2-70b-hf`, `01-ai/Yi-34B`, etc.
-  * ✅︎
-  * ✅︎
-- * `MambaForCausalLM`
-  * Mamba
-  * `state-spaces/mamba-130m-hf`, `state-spaces/mamba-790m-hf`, `state-spaces/mamba-2.8b-hf`, etc.
-  *
-  * ✅︎
-- * `MiniCPMForCausalLM`
-  * MiniCPM
-  * `openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, `openbmb/MiniCPM-S-1B-sft`, etc.
-  * ✅︎
-  * ✅︎
-- * `MiniCPM3ForCausalLM`
-  * MiniCPM3
-  * `openbmb/MiniCPM3-4B`, etc.
-  * ✅︎
-  * ✅︎
-- * `MistralForCausalLM`
-  * Mistral, Mistral-Instruct
-  * `mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.
-  * ✅︎
-  * ✅︎
-- * `MixtralForCausalLM`
-  * Mixtral-8x7B, Mixtral-8x7B-Instruct
-  * `mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.
-  * ✅︎
-  * ✅︎
-- * `MPTForCausalLM`
-  * MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
-  * `mosaicml/mpt-7b`, `mosaicml/mpt-7b-storywriter`, `mosaicml/mpt-30b`, etc.
-  *
-  * ✅︎
-- * `NemotronForCausalLM`
-  * Nemotron-3, Nemotron-4, Minitron
-  * `nvidia/Minitron-8B-Base`, `mgoin/Nemotron-4-340B-Base-hf-FP8`, etc.
-  * ✅︎
-  * ✅︎
-- * `OLMoForCausalLM`
-  * OLMo
-  * `allenai/OLMo-1B-hf`, `allenai/OLMo-7B-hf`, etc.
-  *
-  * ✅︎
-- * `OLMo2ForCausalLM`
-  * OLMo2
-  * `allenai/OLMo-2-0425-1B`, etc.
-  *
-  * ✅︎
-- * `OLMoEForCausalLM`
-  * OLMoE
-  * `allenai/OLMoE-1B-7B-0924`, `allenai/OLMoE-1B-7B-0924-Instruct`, etc.
-  * ✅︎
-  * ✅︎
-- * `OPTForCausalLM`
-  * OPT, OPT-IML
-  * `facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.
-  *
-  * ✅︎
-- * `OrionForCausalLM`
-  * Orion
-  * `OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.
-  *
-  * ✅︎
-- * `PhiForCausalLM`
-  * Phi
-  * `microsoft/phi-1_5`, `microsoft/phi-2`, etc.
-  * ✅︎
-  * ✅︎
-- * `Phi3ForCausalLM`
-  * Phi-4, Phi-3
-  * `microsoft/Phi-4-mini-instruct`, `microsoft/Phi-4`, `microsoft/Phi-3-mini-4k-instruct`, `microsoft/Phi-3-mini-128k-instruct`, `microsoft/Phi-3-medium-128k-instruct`, etc.
-  * ✅︎
-  * ✅︎
-- * `Phi3SmallForCausalLM`
-  * Phi-3-Small
-  * `microsoft/Phi-3-small-8k-instruct`, `microsoft/Phi-3-small-128k-instruct`, etc.
-  *
-  * ✅︎
-- * `PhiMoEForCausalLM`
-  * Phi-3.5-MoE
-  * `microsoft/Phi-3.5-MoE-instruct`, etc.
-  * ✅︎
-  * ✅︎
-- * `PersimmonForCausalLM`
-  * Persimmon
-  * `adept/persimmon-8b-base`, `adept/persimmon-8b-chat`, etc.
-  *
-  * ✅︎
-- * `Plamo2ForCausalLM`
-  * PLaMo2
-  * `pfnet/plamo-2-1b`, `pfnet/plamo-2-8b`, etc.
-  *
-  *
-- * `QWenLMHeadModel`
-  * Qwen
-  * `Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.
-  * ✅︎
-  * ✅︎
-- * `Qwen2ForCausalLM`
-  * QwQ, Qwen2
-  * `Qwen/QwQ-32B-Preview`, `Qwen/Qwen2-7B-Instruct`, `Qwen/Qwen2-7B`, etc.
-  * ✅︎
-  * ✅︎
-- * `Qwen2MoeForCausalLM`
-  * Qwen2MoE
-  * `Qwen/Qwen1.5-MoE-A2.7B`, `Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
-  *
-  * ✅︎
-- * `Qwen3ForCausalLM`
-  * Qwen3
-  * `Qwen/Qwen3-8B`, etc.
-  * ✅︎
-  * ✅︎
-- * `Qwen3MoeForCausalLM`
-  * Qwen3MoE
-  * `Qwen/Qwen3-30B-A3B`, etc.
-  *
-  * ✅︎
-- * `StableLmForCausalLM`
-  * StableLM
-  * `stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.
-  *
-  * ✅︎
-- * `Starcoder2ForCausalLM`
-  * Starcoder2
-  * `bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.
-  *
-  * ✅︎
-- * `SolarForCausalLM`
-  * Solar Pro
-  * `upstage/solar-pro-preview-instruct`, etc.
-  * ✅︎
-  * ✅︎
-- * `TeleChat2ForCausalLM`
-  * TeleChat2
-  * `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc.
-  * ✅︎
-  * ✅︎
-- * `TeleFLMForCausalLM`
-  * TeleFLM
-  * `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc.
-  * ✅︎
-  * ✅︎
-- * `XverseForCausalLM`
-  * XVERSE
-  * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
-  * ✅︎
-  * ✅︎
-- * `MiniMaxText01ForCausalLM`
-  * MiniMax-Text
-  * `MiniMaxAI/MiniMax-Text-01`, etc.
-  *
-  * ✅︎
-- * `Zamba2ForCausalLM`
-  * Zamba2
-  * `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc.
-  *
-  *
-- * `MiMoForCausalLM`
-  * MiMo
-  * `XiaomiMiMo/MiMo-7B-RL`, etc.
-  *
-  *
-:::
-
-:::{note}
-Currently, the ROCm version of vLLM supports Mistral and Mixtral only for context lengths up to 4096.
-:::
-
-### Pooling Models
-
-See [this page](pooling-models) for more information on how to use pooling models.
-
-:::{important}
-Since some model architectures support both generative and pooling tasks,
-you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode.
-:::
-
-#### Text Embedding
-
-Specified using `--task embed`.
-
-:::{list-table}
-:widths: 25 25 50 5 5
-:header-rows: 1
-
-- * Architecture
-  * Models
-  * Example HF Models
-  * [LoRA](#lora-adapter)
-  * [PP](#distributed-serving)
-- * `BertModel`
-  * BERT-based
-  * `BAAI/bge-base-en-v1.5`, `Snowflake/snowflake-arctic-embed-xs`, etc.
-  *
-  *
-- * `Gemma2Model`
-  * Gemma 2-based
-  * `BAAI/bge-multilingual-gemma2`, etc.
-  *
-  * ✅︎
-- * `GritLM`
-  * GritLM
-  * `parasail-ai/GritLM-7B-vllm`.
-  * ✅︎
-  * ✅︎
-- * `GteModel`
-  * Arctic-Embed-2.0-M
-  * `Snowflake/snowflake-arctic-embed-m-v2.0`.
-  *
-  *
-- * `GteNewModel`
-  * mGTE-TRM (see note)
-  * `Alibaba-NLP/gte-multilingual-base`, etc.
-  *
-  *
-- * `ModernBertModel`
-  * ModernBERT-based
-  * `Alibaba-NLP/gte-modernbert-base`, etc.
-  *
-  *
-- * `NomicBertModel`
-  * Nomic BERT
-  * `nomic-ai/nomic-embed-text-v1`, `nomic-ai/nomic-embed-text-v2-moe`, `Snowflake/snowflake-arctic-embed-m-long`, etc.
-  *
-  *
-- * `LlamaModel`, `LlamaForCausalLM`, `MistralModel`, etc.
-  * Llama-based
-  * `intfloat/e5-mistral-7b-instruct`, etc.
-  * ✅︎
-  * ✅︎
-- * `Qwen2Model`, `Qwen2ForCausalLM`
-  * Qwen2-based
-  * `ssmits/Qwen2-7B-Instruct-embed-base` (see note), `Alibaba-NLP/gte-Qwen2-7B-instruct` (see note), etc.
-  * ✅︎
-  * ✅︎
-- * `RobertaModel`, `RobertaForMaskedLM`
-  * RoBERTa-based
-  * `sentence-transformers/all-roberta-large-v1`, etc.
-  *
-  *
-- * `XLMRobertaModel`
-  * XLM-RoBERTa-based
-  * `intfloat/multilingual-e5-large`, `jinaai/jina-reranker-v2-base-multilingual`, `Snowflake/snowflake-arctic-embed-l-v2.0`, `jinaai/jina-embeddings-v3`(see note), etc.
-  *
-  *
-:::
-
-:::{note}
-`ssmits/Qwen2-7B-Instruct-embed-base` has an improperly defined Sentence Transformers config.
-You should manually set mean pooling by passing `--override-pooler-config '{"pooling_type": "MEAN"}'`.
-:::
-
-:::{note}
-The HF implementation of `Alibaba-NLP/gte-Qwen2-1.5B-instruct` is hardcoded to use causal attention despite what is shown in `config.json`. To compare vLLM vs HF results,
-you should set `--hf-overrides '{"is_causal": true}'` in vLLM so that the two implementations are consistent with each other.
-
-For both the 1.5B and 7B variants, you also need to enable `--trust-remote-code` for the correct tokenizer to be loaded.
-See [relevant issue on HF Transformers](https://github.com/huggingface/transformers/issues/34882).
-:::
-
-:::{note}
-`jinaai/jina-embeddings-v3` supports multiple tasks through LoRA, while vLLM currently only supports the text-matching task by merging the LoRA weights.
-:::
-
-:::{note}
-The second-generation GTE model (mGTE-TRM) is named `NewModel`. Because the name `NewModel` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewModel"]}'` to specify the use of the `GteNewModel` architecture.
-:::
-
-If your model is not in the above list, we will try to automatically convert the model using
-{func}`~vllm.model_executor.models.adapters.as_embedding_model`. By default, the embeddings
-of the whole prompt are extracted from the normalized hidden state corresponding to the last token.
-
-#### Reward Modeling
-
-Specified using `--task reward`.
-
-:::{list-table}
-:widths: 25 25 50 5 5
-:header-rows: 1
-
-- * Architecture
-  * Models
-  * Example HF Models
-  * [LoRA](#lora-adapter)
-  * [PP](#distributed-serving)
-- * `InternLM2ForRewardModel`
-  * InternLM2-based
-  * `internlm/internlm2-1_8b-reward`, `internlm/internlm2-7b-reward`, etc.
-  * ✅︎
-  * ✅︎
-- * `LlamaForCausalLM`
-  * Llama-based
-  * `peiyi9979/math-shepherd-mistral-7b-prm`, etc.
-  * ✅︎
-  * ✅︎
-- * `Qwen2ForRewardModel`
-  * Qwen2-based
-  * `Qwen/Qwen2.5-Math-RM-72B`, etc.
-  * ✅︎
-  * ✅︎
-- * `Qwen2ForProcessRewardModel`
-  * Qwen2-based
-  * `Qwen/Qwen2.5-Math-PRM-7B`, `Qwen/Qwen2.5-Math-PRM-72B`, etc.
-  * ✅︎
-  * ✅︎
-:::
-
-If your model is not in the above list, we will try to automatically convert the model using
-{func}`~vllm.model_executor.models.adapters.as_reward_model`. By default, we return the hidden states of each token directly.
-
-:::{important}
-For process-supervised reward models such as `peiyi9979/math-shepherd-mistral-7b-prm`, the pooling config should be set explicitly,
-e.g.: `--override-pooler-config '{"pooling_type": "STEP", "step_tag_id": 123, "returned_token_ids": [456, 789]}'`.
-:::
-
-#### Classification
-
-Specified using `--task classify`.
-
-:::{list-table}
-:widths: 25 25 50 5 5
-:header-rows: 1
-
-- * Architecture
-  * Models
-  * Example HF Models
-  * [LoRA](#lora-adapter)
-  * [PP](#distributed-serving)
-- * `JambaForSequenceClassification`
-  * Jamba
-  * `ai21labs/Jamba-tiny-reward-dev`, etc.
-  * ✅︎
-  * ✅︎
-- * `Qwen2ForSequenceClassification`
-  * Qwen2-based
-  * `jason9693/Qwen2.5-1.5B-apeach`, etc.
-  * ✅︎
-  * ✅︎
-:::
-
-If your model is not in the above list, we will try to automatically convert the model using
-{func}`~vllm.model_executor.models.adapters.as_classification_model`. By default, the class probabilities are extracted from the softmaxed hidden state corresponding to the last token.
-
-#### Sentence Pair Scoring
-
-Specified using `--task score`.
-
-:::{list-table}
-:widths: 25 25 50 5 5
-:header-rows: 1
-
-- * Architecture
-  * Models
-  * Example HF Models
-  * [LoRA](#lora-adapter)
-  * [PP](#distributed-serving)
-- * `BertForSequenceClassification`
-  * BERT-based
-  * `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc.
-  *
-  *
-- * `RobertaForSequenceClassification`
-  * RoBERTa-based
-  * `cross-encoder/quora-roberta-base`, etc.
-  *
-  *
-- * `XLMRobertaForSequenceClassification`
-  * XLM-RoBERTa-based
-  * `BAAI/bge-reranker-v2-m3`, etc.
-  *
-  *
-- * `ModernBertForSequenceClassification`
-  * ModernBert-based
-  * `Alibaba-NLP/gte-reranker-modernbert-base`, etc.
-  *
-  *
-:::
-
-(supported-mm-models)=
-
-## List of Multimodal Language Models
-
-The following modalities are supported depending on the model:
-
-- **T**ext
-- **I**mage
-- **V**ideo
-- **A**udio
-
-Any combination of modalities joined by `+` is supported.
-
-- e.g.: `T + I` means that the model supports text-only, image-only, and text-with-image inputs.
-
-On the other hand, modalities separated by `/` are mutually exclusive.
-
-- e.g.: `T / I` means that the model supports text-only and image-only inputs, but not text-with-image inputs.
-
-See [this page](#multimodal-inputs) on how to pass multi-modal inputs to the model.
-
-:::{important}
-**To enable multiple multi-modal items per text prompt in vLLM V0**, you have to set `limit_mm_per_prompt` (offline inference)
-or `--limit-mm-per-prompt` (online serving). For example, to enable passing up to 4 images per text prompt:
-
-Offline inference:
-
-```python
-from vllm import LLM
-
-llm = LLM(
-    model="Qwen/Qwen2-VL-7B-Instruct",
-    limit_mm_per_prompt={"image": 4},
-)
-```
-
-Online serving:
-
-```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct --limit-mm-per-prompt '{"image":4}'
-```
-
-**This is no longer required if you are using vLLM V1.**
-
-:::
-
-:::{note}
-vLLM currently only supports adding LoRA to the language backbone of multimodal models.
-:::
-
-### Generative Models
-
-See [this page](#generative-models) for more information on how to use generative models.
-
-#### Text Generation
-
-Specified using `--task generate`.
-
-:::{list-table}
-:widths: 25 25 15 20 5 5 5
-:header-rows: 1
-
-- * Architecture
-  * Models
-  * Inputs
-  * Example HF Models
-  * [LoRA](#lora-adapter)
-  * [PP](#distributed-serving)
-  * [V1](gh-issue:8779)
-- * `AriaForConditionalGeneration`
-  * Aria
-  * T + I<sup>+</sup>
-  * `rhymes-ai/Aria`
-  *
-  * ✅︎
-  * ✅︎
-- * `AyaVisionForConditionalGeneration`
-  * Aya Vision
-  * T + I<sup>+</sup>
-  * `CohereForAI/aya-vision-8b`, `CohereForAI/aya-vision-32b`, etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `Blip2ForConditionalGeneration`
-  * BLIP-2
-  * T + I<sup>E</sup>
-  * `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `ChameleonForConditionalGeneration`
-  * Chameleon
-  * T + I
-  * `facebook/chameleon-7b` etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `DeepseekVLV2ForCausalLM`<sup>^</sup>
-  * DeepSeek-VL2
-  * T + I<sup>+</sup>
-  * `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2` etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `Florence2ForConditionalGeneration`
-  * Florence-2
-  * T + I
-  * `microsoft/Florence-2-base`, `microsoft/Florence-2-large` etc.
-  *
-  *
-  *
-- * `FuyuForCausalLM`
-  * Fuyu
-  * T + I
-  * `adept/fuyu-8b` etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `Gemma3ForConditionalGeneration`
-  * Gemma 3
-  * T + I<sup>+</sup>
-  * `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc.
-  * ✅︎
-  * ✅︎
-  * ⚠️
-- * `GLM4VForCausalLM`<sup>^</sup>
-  * GLM-4V
-  * T + I
-  * `THUDM/glm-4v-9b`, `THUDM/cogagent-9b-20241220` etc.
-  * ✅︎
-  * ✅︎
-  * ✅︎
-- * `GraniteSpeechForConditionalGeneration`
-  * Granite Speech
-  * T + A
-  * `ibm-granite/granite-speech-3.3-8b`
-  * ✅︎
-  * ✅︎
-  * ✅︎
-- * `H2OVLChatModel`
-  * H2OVL
-  * T + I<sup>E+</sup>
-  * `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc.
-  *
-  * ✅︎
-  * ✅︎\*
-- * `Idefics3ForConditionalGeneration`
-  * Idefics3
-  * T + I
-  * `HuggingFaceM4/Idefics3-8B-Llama3` etc.
-  * ✅︎
-  *
-  * ✅︎
-- * `InternVLChatModel`
-  * InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0
-  * T + I<sup>E+</sup>
-  * `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `KimiVLForConditionalGeneration`
-  * Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking
-  * T + I<sup>+</sup>
-  * `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking`
-  *
-  *
-  * ✅︎
-- * `Llama4ForConditionalGeneration`
-  * Llama 4
-  * T + I<sup>+</sup>
-  * `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `LlavaForConditionalGeneration`
-  * LLaVA-1.5
-  * T + I<sup>E+</sup>
-  * `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `LlavaNextForConditionalGeneration`
-  * LLaVA-NeXT
-  * T + I<sup>E+</sup>
-  * `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `LlavaNextVideoForConditionalGeneration`
-  * LLaVA-NeXT-Video
-  * T + V
-  * `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `LlavaOnevisionForConditionalGeneration`
-  * LLaVA-Onevision
-  * T + I<sup>+</sup> + V<sup>+</sup>
-  * `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `MiniCPMO`
-  * MiniCPM-O
-  * T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>E+</sup>
-  * `openbmb/MiniCPM-o-2_6`, etc.
-  * ✅︎
-  * ✅︎
-  * ✅︎
-- * `MiniCPMV`
-  * MiniCPM-V
-  * T + I<sup>E+</sup> + V<sup>E+</sup>
-  * `openbmb/MiniCPM-V-2` (see note), `openbmb/MiniCPM-Llama3-V-2_5`, `openbmb/MiniCPM-V-2_6`, etc.
-  * ✅︎
-  * ✅︎
-  * ✅︎
-- * `MiniMaxVL01ForConditionalGeneration`
-  * MiniMax-VL
-  * T + I<sup>E+</sup>
-  * `MiniMaxAI/MiniMax-VL-01`, etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `Mistral3ForConditionalGeneration`
-  * Mistral3
-  * T + I<sup>+</sup>
-  * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, etc.
-  * ✅︎
-  * ✅︎
-  * ✅︎
-- * `MllamaForConditionalGeneration`
-  * Llama 3.2
-  * T + I<sup>+</sup>
-  * `meta-llama/Llama-3.2-90B-Vision-Instruct`, `meta-llama/Llama-3.2-11B-Vision`, etc.
-  *
-  *
-  *
-- * `MolmoForCausalLM`
-  * Molmo
-  * T + I<sup>+</sup>
-  * `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc.
-  * ✅︎
-  * ✅︎
-  * ✅︎
-- * `NVLM_D_Model`
-  * NVLM-D 1.0
-  * T + I<sup>+</sup>
-  * `nvidia/NVLM-D-72B`, etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `Ovis`
-  * Ovis2, Ovis1.6
-  * T + I<sup>+</sup>
-  * `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc.
-  *
-  *
-  * ✅︎
-- * `PaliGemmaForConditionalGeneration`
-  * PaliGemma, PaliGemma 2
-  * T + I<sup>E</sup>
-  * `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc.
-  *
-  * ✅︎
-  * ⚠️
-- * `Phi3VForCausalLM`
-  * Phi-3-Vision, Phi-3.5-Vision
-  * T + I<sup>E+</sup>
-  * `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `Phi4MMForCausalLM`
-  * Phi-4-multimodal
-  * T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup>
-  * `microsoft/Phi-4-multimodal-instruct`, etc.
-  * ✅︎
-  *
-  * ✅︎
-- * `PixtralForConditionalGeneration`
-  * Pixtral
-  * T + I<sup>+</sup>
-  * `mistralai/Mistral-Small-3.1-24B-Instruct-2503`, `mistral-community/pixtral-12b`, etc.
-  *
-  * ✅︎
-  * ✅︎
-- * `QwenVLForConditionalGeneration`<sup>^</sup>
-  * Qwen-VL
-  * T + I<sup>E+</sup>
-  * `Qwen/Qwen-VL`, `Qwen/Qwen-VL-Chat`, etc.
-  * ✅︎
-  * ✅︎
-  * ✅︎
-- * `Qwen2AudioForConditionalGeneration`
-  * Qwen2-Audio
-  * T + A<sup>+</sup>
-  * `Qwen/Qwen2-Audio-7B-Instruct`
-  *
-  * ✅︎
-  * ✅︎
-- * `Qwen2VLForConditionalGeneration`
-  * QVQ, Qwen2-VL
-  * T + I<sup>E+</sup> + V<sup>E+</sup>
-  * `Qwen/QVQ-72B-Preview`, `Qwen/Qwen2-VL-7B-Instruct`, `Qwen/Qwen2-VL-72B-Instruct`, etc.
-  * ✅︎
-  * ✅︎
-  * ✅︎
-- * `Qwen2_5_VLForConditionalGeneration`
-  * Qwen2.5-VL
-  * T + I<sup>E+</sup> + V<sup>E+</sup>
-  * `Qwen/Qwen2.5-VL-3B-Instruct`, `Qwen/Qwen2.5-VL-72B-Instruct`, etc.
-  * ✅︎
-  * ✅︎
-  * ✅︎
-- * `Qwen2_5OmniThinkerForConditionalGeneration`
-  * Qwen2.5-Omni
-  * T + I<sup>E+</sup> + V<sup>E+</sup> + A<sup>+</sup>
-  * `Qwen/Qwen2.5-Omni-7B`
-  *
-  * ✅︎
-  * ✅︎\*
-- * `SkyworkR1VChatModel`
-  * Skywork-R1V-38B
-  * T + I
-  * `Skywork/Skywork-R1V-38B`
-  *
-  * ✅︎
-  * ✅︎
-- * `SmolVLMForConditionalGeneration`
-  * SmolVLM2
-  * T + I
-  * `SmolVLM2-2.2B-Instruct`
-  *
-  * ✅︎
-  * ✅︎
-- * `UltravoxModel`
-  * Ultravox
-  * T + A<sup>E+</sup>
-  * `fixie-ai/ultravox-v0_5-llama-3_2-1b`
-  * ✅︎
-  * ✅︎
-  * ✅︎
-:::
-
-<sup>^</sup> You need to set the architecture name via `--hf-overrides` to match the one in vLLM.  
-&nbsp;&nbsp;&nbsp;&nbsp;• For example, to use DeepSeek-VL2 series models:  
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;`--hf-overrides '{"architectures": ["DeepseekVLV2ForCausalLM"]}'`  
-<sup>E</sup> Pre-computed embeddings can be inputted for this modality.  
-<sup>+</sup> Multiple items can be inputted per text prompt for this modality.
-
-:::{warning}
-Both V0 and V1 support `Gemma3ForConditionalGeneration` for text-only inputs.
-However, there are differences in how they handle text + image inputs:
-
-V0 correctly implements the model's attention pattern:
-- Uses bidirectional attention between the image tokens corresponding to the same image
-- Uses causal attention for other tokens
-- Implemented via (naive) PyTorch SDPA with masking tensors
-- Note: May use significant memory for long prompts with image
-
-V1 currently uses a simplified attention pattern:
-- Uses causal attention for all tokens, including image tokens
-- Generates reasonable outputs but does not match the original model's attention for text + image inputs, especially when `{"do_pan_and_scan": true}`
-- Will be updated in the future to support the correct behavior
-
-This limitation exists because the model's mixed attention pattern (bidirectional for images, causal otherwise) is not yet supported by vLLM's attention backends.
-:::
-
-:::{note}
-`h2oai/h2ovl-mississippi-2b` will be available in V1 once we support head size 80.
-:::
-
-:::{note}
-To use `TIGER-Lab/Mantis-8B-siglip-llama3`, you have to pass `--hf-overrides '{"architectures": ["MantisForConditionalGeneration"]}'` when running vLLM.
-:::
-
-:::{warning}
-The output quality of `AllenAI/Molmo-7B-D-0924` (especially in object localization tasks) has deteriorated in recent updates.
-
-For the best results, we recommend using the following dependency versions (tested on A10 and L40):
-
-```text
-# Core vLLM-compatible dependencies with Molmo accuracy setup (tested on L40)
-torch==2.5.1
-torchvision==0.20.1
-transformers==4.48.1
-tokenizers==0.21.0
-tiktoken==0.7.0
-vllm==0.7.0
-
-# Optional but recommended for improved performance and stability
-triton==3.1.0
-xformers==0.0.28.post3
-uvloop==0.21.0
-protobuf==5.29.3
-openai==1.60.2
-opencv-python-headless==4.11.0.86
-pillow==10.4.0
-
-# Installed FlashAttention (for float16 only)
-flash-attn>=2.5.6  # Not used in float32, but should be documented
-```
-
-**Note:** Make sure you understand the security implications of using outdated packages.
-:::
-
-:::{note}
-The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now.
-For more details, please see: <gh-pr:4087#issuecomment-2250397630>
-:::
-
-:::{warning}
-Our PaliGemma implementations have the same problem as Gemma 3 (see above) for both V0 and V1.
-:::
-
-:::{note}
-To use Qwen2.5-Omni, you have to install Hugging Face Transformers library from source via
-`pip install git+https://github.com/huggingface/transformers.git`.
-
-Reading audio from video during pre-processing is currently supported on V0 (but not V1), because overlapping modalities are not yet supported in V1. The relevant option is
-`--mm-processor-kwargs '{"use_audio_in_video": true}'`.
-:::
-
-### Pooling Models
-
-See [this page](pooling-models) for more information on how to use pooling models.
-
-:::{important}
-Since some model architectures support both generative and pooling tasks,
-you should explicitly specify the task type to ensure that the model is used in pooling mode instead of generative mode.
-:::
-
-#### Text Embedding
-
-Specified using `--task embed`.
-
-Any text generation model can be converted into an embedding model by passing `--task embed`.
-
-:::{note}
-To get the best results, you should use pooling models that are specifically trained as such.
-:::
-
-The following table lists those that are tested in vLLM.
-
-:::{list-table}
-:widths: 25 25 15 25 5 5
-:header-rows: 1
-
-- * Architecture
-  * Models
-  * Inputs
-  * Example HF Models
-  * [LoRA](#lora-adapter)
-  * [PP](#distributed-serving)
-- * `LlavaNextForConditionalGeneration`
-  * LLaVA-NeXT-based
-  * T / I
-  * `royokong/e5-v`
-  *
-  * ✅︎
-- * `Phi3VForCausalLM`
-  * Phi-3-Vision-based
-  * T + I
-  * `TIGER-Lab/VLM2Vec-Full`
-  * 🚧
-  * ✅︎
-- * `Qwen2VLForConditionalGeneration`
-  * Qwen2-VL-based
-  * T + I
-  * `MrLight/dse-qwen2-2b-mrl-v1`
-  *
-  * ✅︎
-:::
-
-#### Transcription
-
-Specified using `--task transcription`.
-
-Speech2Text models trained specifically for Automatic Speech Recognition.
-
-:::{list-table}
-:widths: 25 25 25 5 5
-:header-rows: 1
-
-- * Architecture
-  * Models
-  * Example HF Models
-  * [LoRA](#lora-adapter)
-  * [PP](#distributed-serving)
-- * `Whisper`
-  * Whisper-based
-  * `openai/whisper-large-v3-turbo`
-  * 🚧
-  * 🚧
-:::
-
-_________________
-
-## Model Support Policy
-
-At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
-
-1. **Community-Driven Support**: We encourage community contributions for adding new models. When a user requests support for a new model, we welcome pull requests (PRs) from the community. These contributions are evaluated primarily on the sensibility of the output they generate, rather than strict consistency with existing implementations such as those in transformers. **Call for contribution:** PRs coming directly from model vendors are greatly appreciated!
-
-2. **Best-Effort Consistency**: While we aim to maintain a level of consistency between the models implemented in vLLM and other frameworks like transformers, complete alignment is not always feasible. Factors like acceleration techniques and the use of low-precision computations can introduce discrepancies. Our commitment is to ensure that the implemented models are functional and produce sensible results.
-
-    :::{tip}
-    When comparing the output of `model.generate` from Hugging Face Transformers with the output of `llm.generate` from vLLM, note that the former reads the model's generation config file (i.e., [generation_config.json](https://github.com/huggingface/transformers/blob/19dabe96362803fb0a9ae7073d03533966598b17/src/transformers/generation/utils.py#L1945)) and applies the default parameters for generation, while the latter only uses the parameters passed to the function. Ensure all sampling parameters are identical when comparing outputs.
-    :::
-
-3. **Issue Resolution and Model Updates**: Users are encouraged to report any bugs or issues they encounter with third-party models. Proposed fixes should be submitted via PRs, with a clear explanation of the problem and the rationale behind the proposed solution. If a fix for one model impacts another, we rely on the community to highlight and address these cross-model dependencies. Note: for bugfix PRs, it is good etiquette to inform the original author to seek their feedback.
-
-4. **Monitoring and Updates**: Users interested in specific models should monitor the commit history for those models (e.g., by tracking changes in the main/vllm/model_executor/models directory). This proactive approach helps users stay informed about updates and changes that may affect the models they use.
-
-5. **Selective Focus**: Our resources are primarily directed towards models with significant user interest and impact. Models that are less frequently used may receive less attention, and we rely on the community to play a more active role in their upkeep and improvement.
-
-Through this approach, vLLM fosters a collaborative environment where both the core development team and the broader community contribute to the robustness and diversity of the third-party models supported in our ecosystem.
-
-Note that, as an inference engine, vLLM does not introduce new models. Therefore, all models supported by vLLM are third-party models in this regard.
-
-We have the following levels of testing for models:
-
-1. **Strict Consistency**: We compare the output of the model with the output of the model in the HuggingFace Transformers library under greedy decoding. This is the most stringent test. Please refer to [models tests](https://github.com/vllm-project/vllm/blob/main/tests/models) for the models that have passed this test.
-2. **Output Sensibility**: We check if the output of the model is sensible and coherent, by measuring the perplexity of the output and checking for any obvious errors. This is a less stringent test.
-3. **Runtime Functionality**: We check if the model can be loaded and run without errors. This is the least stringent test. Please refer to [functionality tests](gh-dir:tests) and [examples](gh-dir:examples) for the models that have passed this test.
-4. **Community Feedback**: We rely on the community to provide feedback on the models. If a model is broken or not working as expected, we encourage users to raise issues to report it or open pull requests to fix it. The rest of the models fall under this category.
diff --git a/docs/source/serving/engine_args.md b/docs/source/serving/engine_args.md
deleted file mode 100644
index 9325a2406e8c..000000000000
--- a/docs/source/serving/engine_args.md
+++ /dev/null
@@ -1,36 +0,0 @@
-(engine-args)=
-
-# Engine Arguments
-
-Engine arguments control the behavior of the vLLM engine.
-
-- For [offline inference](#offline-inference), they are part of the arguments to `LLM` class.
-- For [online serving](#openai-compatible-server), they are part of the arguments to `vllm serve`.
-
-For references to all arguments available from `vllm serve` see the [serve args](#serve-args) documentation.
-
-Below, you can find an explanation of every engine argument:
-
-<!--- pyml disable-num-lines 7 no-space-in-emphasis -->
-```{eval-rst}
-.. argparse::
-    :module: vllm.engine.arg_utils
-    :func: _engine_args_parser
-    :prog: vllm serve
-    :nodefaultconst:
-    :markdownhelp:
-```
-
-## Async Engine Arguments
-
-Additional arguments are available to the asynchronous engine which is used for online serving:
-
-<!--- pyml disable-num-lines 7 no-space-in-emphasis -->
-```{eval-rst}
-.. argparse::
-    :module: vllm.engine.arg_utils
-    :func: _async_engine_args_parser
-    :prog: vllm serve
-    :nodefaultconst:
-    :markdownhelp:
-```
diff --git a/docs/source/serving/env_vars.md b/docs/source/serving/env_vars.md
deleted file mode 100644
index 9845241930a4..000000000000
--- a/docs/source/serving/env_vars.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Environment Variables
-
-vLLM uses the following environment variables to configure the system:
-
-:::{warning}
-Please note that `VLLM_PORT` and `VLLM_HOST_IP` set the port and ip for vLLM's **internal usage**. It is not the port and ip for the API server. If you use `--host $VLLM_HOST_IP` and `--port $VLLM_PORT` to start the API server, it will not work.
-
-All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).
-:::
-
-:::{literalinclude} ../../../vllm/envs.py
-:end-before: end-env-vars-definition
-:language: python
-:start-after: begin-env-vars-definition
-:::
diff --git a/docs/source/serving/integrations/index.md b/docs/source/serving/integrations/index.md
deleted file mode 100644
index e2b4c0814605..000000000000
--- a/docs/source/serving/integrations/index.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# External Integrations
-
-:::{toctree}
-:maxdepth: 1
-
-langchain
-llamaindex
-:::
diff --git a/docs/source/serving/offline_inference.md b/docs/source/serving/offline_inference.md
deleted file mode 100644
index 433d2e894dd8..000000000000
--- a/docs/source/serving/offline_inference.md
+++ /dev/null
@@ -1,217 +0,0 @@
-(offline-inference)=
-
-# Offline Inference
-
-You can run vLLM in your own code on a list of prompts.
-
-The offline API is based on the {class}`~vllm.LLM` class.
-To initialize the vLLM engine, create a new instance of `LLM` and specify the model to run.
-
-For example, the following code downloads the [`facebook/opt-125m`](https://huggingface.co/facebook/opt-125m) model from HuggingFace
-and runs it in vLLM using the default configuration.
-
-```python
-from vllm import LLM
-
-llm = LLM(model="facebook/opt-125m")
-```
-
-After initializing the `LLM` instance, you can perform model inference using various APIs.
-The available APIs depend on the type of model that is being run:
-
-- [Generative models](#generative-models) output logprobs which are sampled from to obtain the final output text.
-- [Pooling models](#pooling-models) output their hidden states directly.
-
-Please refer to the above pages for more details about each API.
-
-:::{seealso}
-[API Reference](#offline-inference-api)
-:::
-
-(configuration-options)=
-
-## Configuration Options
-
-This section lists the most common options for running the vLLM engine.
-For a full list, refer to the <project:#configuration> page.
-
-(model-resolution)=
-
-### Model resolution
-
-vLLM loads HuggingFace-compatible models by inspecting the `architectures` field in `config.json` of the model repository
-and finding the corresponding implementation that is registered to vLLM.
-Nevertheless, our model resolution may fail for the following reasons:
-
-- The `config.json` of the model repository lacks the `architectures` field.
-- Unofficial repositories refer to a model using alternative names which are not recorded in vLLM.
-- The same architecture name is used for multiple models, creating ambiguity as to which model should be loaded.
-
-To fix this, explicitly specify the model architecture by passing `config.json` overrides to the `hf_overrides` option.
-For example:
-
-```python
-from vllm import LLM
-
-model = LLM(
-    model="cerebras/Cerebras-GPT-1.3B",
-    hf_overrides={"architectures": ["GPT2LMHeadModel"]},  # GPT-2
-)
-```
-
-Our [list of supported models](#supported-models) shows the model architectures that are recognized by vLLM.
-
-(reducing-memory-usage)=
-
-### Reducing memory usage
-
-Large models might cause your machine to run out of memory (OOM). Here are some options that help alleviate this problem.
-
-#### Tensor Parallelism (TP)
-
-Tensor parallelism (`tensor_parallel_size` option) can be used to split the model across multiple GPUs.
-
-The following code splits the model across 2 GPUs.
-
-```python
-from vllm import LLM
-
-llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
-          tensor_parallel_size=2)
-```
-
-:::{important}
-To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. {func}`torch.cuda.set_device`)
-before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
-
-To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.
-:::
-
-:::{note}
-With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).
-
-You can convert the model checkpoint to a sharded checkpoint using <gh-file:examples/offline_inference/save_sharded_state.py>. The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
-:::
-
-#### Quantization
-
-Quantized models take less memory at the cost of lower precision.
-
-Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Red Hat AI](https://huggingface.co/RedHatAI))
-and used directly without extra configuration.
-
-Dynamic quantization is also supported via the `quantization` option -- see [here](#quantization-index) for more details.
-
-#### Context length and batch size
-
-You can further reduce memory usage by limiting the context length of the model (`max_model_len` option)
-and the maximum batch size (`max_num_seqs` option).
-
-```python
-from vllm import LLM
-
-llm = LLM(model="adept/fuyu-8b",
-          max_model_len=2048,
-          max_num_seqs=2)
-```
-
-#### Reduce CUDA Graphs
-
-By default, we optimize model inference using CUDA graphs which take up extra memory in the GPU.
-
-:::{important}
-CUDA graph capture takes up more memory in V1 than in V0.
-:::
-
-You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:
-
-```python
-from vllm import LLM
-from vllm.config import CompilationConfig, CompilationLevel
-
-llm = LLM(
-    model="meta-llama/Llama-3.1-8B-Instruct",
-    compilation_config=CompilationConfig(
-        level=CompilationLevel.PIECEWISE,
-        # By default, it goes up to max_num_seqs
-        cudagraph_capture_sizes=[1, 2, 4, 8, 16],
-    ),
-)
-```
-
-You can disable graph capturing completely via the `enforce_eager` flag:
-
-```python
-from vllm import LLM
-
-llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
-          enforce_eager=True)
-```
-
-#### Adjust cache size
-
-If you run out of CPU RAM, try the following options:
-
-- (Multi-modal models only) you can set the size of multi-modal input cache using `VLLM_MM_INPUT_CACHE_GIB` environment variable (default 4 GiB).
-- (CPU backend only) you can set the size of KV cache using `VLLM_CPU_KVCACHE_SPACE` environment variable (default 4 GiB).
-
-#### Multi-modal input limits
-
-You can allow a smaller number of multi-modal items per prompt to reduce the memory footprint of the model:
-
-```python
-from vllm import LLM
-
-# Accept up to 3 images and 1 video per prompt
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          limit_mm_per_prompt={"image": 3, "video": 1})
-```
-
-You can go a step further and disable unused modalities completely by setting its limit to zero.
-For example, if your application only accepts image input, there is no need to allocate any memory for videos.
-
-```python
-from vllm import LLM
-
-# Accept any number of images but no videos
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          limit_mm_per_prompt={"video": 0})
-```
-
-You can even run a multi-modal model for text-only inference:
-
-```python
-from vllm import LLM
-
-# Don't accept images. Just text.
-llm = LLM(model="google/gemma-3-27b-it",
-          limit_mm_per_prompt={"image": 0})
-```
-
-#### Multi-modal processor arguments
-
-For certain models, you can adjust the multi-modal processor arguments to
-reduce the size of the processed multi-modal inputs, which in turn saves memory.
-
-Here are some examples:
-
-```python
-from vllm import LLM
-
-# Available for Qwen2-VL series models
-llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
-          mm_processor_kwargs={
-              "max_pixels": 768 * 768,  # Default is 1280 * 28 * 28
-          })
-
-# Available for InternVL series models
-llm = LLM(model="OpenGVLab/InternVL2-2B",
-          mm_processor_kwargs={
-              "max_dynamic_patch": 4,  # Default is 12
-          })
-```
-
-### Performance optimization and tuning
-
-You can potentially improve the performance of vLLM by finetuning various options.
-Please refer to [this guide](#optimization-and-tuning) for more details.
diff --git a/docs/source/serving/serve_args.md b/docs/source/serving/serve_args.md
deleted file mode 100644
index edb49f4ba6de..000000000000
--- a/docs/source/serving/serve_args.md
+++ /dev/null
@@ -1,47 +0,0 @@
-(serve-args)=
-
-# Server Arguments
-
-The `vllm serve` command is used to launch the OpenAI-compatible server.
-
-## CLI Arguments
-
-The following are all arguments available from the `vllm serve` command:
-
-<!--- pyml disable-num-lines 7 no-space-in-emphasis -->
-```{eval-rst}
-.. argparse::
-    :module: vllm.entrypoints.openai.cli_args
-    :func: create_parser_for_docs
-    :prog: vllm serve
-    :nodefaultconst:
-    :markdownhelp:
-```
-
-## Configuration file
-
-You can load CLI arguments via a [YAML](https://yaml.org/) config file.
-The argument names must be the long form of those outlined [above](#serve-args).
-
-For example:
-
-```yaml
-# config.yaml
-
-model: meta-llama/Llama-3.1-8B-Instruct
-host: "127.0.0.1"
-port: 6379
-uvicorn-log-level: "info"
-```
-
-To use the above config file:
-
-```bash
-vllm serve --config config.yaml
-```
-
-:::{note}
-In case an argument is supplied simultaneously using command line and the config file, the value from the command line will take precedence.
-The order of priorities is `command line > config file values > defaults`.
-e.g. `vllm serve SOME_MODEL --config config.yaml`, SOME_MODEL takes precedence over `model` in config file.
-:::
diff --git a/docs/source/training/rlhf.md b/docs/training/rlhf.md
similarity index 69%
rename from docs/source/training/rlhf.md
rename to docs/training/rlhf.md
index 72e89c0c7478..4f75e4e01495 100644
--- a/docs/source/training/rlhf.md
+++ b/docs/training/rlhf.md
@@ -6,6 +6,6 @@ vLLM can be used to generate the completions for RLHF. The best way to do this i
 
 See the following basic examples to get started if you don't want to use an existing library:
 
-- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf.html)
-- [Training and inference processes are colocated on the same GPUs using Ray](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf_colocate.html)
-- [Utilities for performing RLHF with vLLM](https://docs.vllm.ai/en/latest/getting_started/examples/rlhf_utils.html)
+- [Training and inference processes are located on separate GPUs (inspired by OpenRLHF)](../examples/offline_inference/rlhf.md)
+- [Training and inference processes are colocated on the same GPUs using Ray](../examples/offline_inference/rlhf_colocate.md)
+- [Utilities for performing RLHF with vLLM](../examples/offline_inference/rlhf_utils.md)
diff --git a/docs/source/training/trl.md b/docs/training/trl.md
similarity index 66%
rename from docs/source/training/trl.md
rename to docs/training/trl.md
index ebdf593dbde5..c7c1a5a3bbd1 100644
--- a/docs/source/training/trl.md
+++ b/docs/training/trl.md
@@ -6,8 +6,8 @@ Online methods such as GRPO or Online DPO require the model to generate completi
 
 See the guide [vLLM for fast generation in online methods](https://huggingface.co/docs/trl/main/en/speeding_up_training#vllm-for-fast-generation-in-online-methods) in the TRL documentation for more information.
 
-:::{seealso}
-For more information on the `use_vllm` flag you can provide to the configs of these online methods, see:
-- [`trl.GRPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/grpo_trainer#trl.GRPOConfig.use_vllm)
-- [`trl.OnlineDPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/online_dpo_trainer#trl.OnlineDPOConfig.use_vllm)
-:::
+!!! info
+    For more information on the `use_vllm` flag you can provide to the configs of these online methods, see:
+
+    - [`trl.GRPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/grpo_trainer#trl.GRPOConfig.use_vllm)
+    - [`trl.OnlineDPOConfig.use_vllm`](https://huggingface.co/docs/trl/main/en/online_dpo_trainer#trl.OnlineDPOConfig.use_vllm)
diff --git a/docs/usage/README.md b/docs/usage/README.md
new file mode 100644
index 000000000000..681db57d8e0f
--- /dev/null
+++ b/docs/usage/README.md
@@ -0,0 +1,7 @@
+# Using vLLM
+
+vLLM supports the following usage patterns:
+
+- [Inference and Serving](../serving/offline_inference.md): Run a single instance of a model.
+- [Deployment](../deployment/docker.md): Scale up model instances for production.
+- [Training](../training/rlhf.md): Train or fine-tune a model.
diff --git a/docs/source/getting_started/faq.md b/docs/usage/faq.md
similarity index 91%
rename from docs/source/getting_started/faq.md
rename to docs/usage/faq.md
index c1bb28937c14..51977d4434f5 100644
--- a/docs/source/getting_started/faq.md
+++ b/docs/usage/faq.md
@@ -1,23 +1,24 @@
-(faq)=
-
-# Frequently Asked Questions
+---
+title: Frequently Asked Questions
+---
+[](){ #faq }
 
 > Q: How can I serve multiple models on a single port using the OpenAI API?
 
 A: Assuming that you're referring to using OpenAI compatible server to serve multiple models at once, that is not currently supported, you can run multiple instances of the server (each serving a different model) at the same time, and have another layer to route the incoming request to the correct server accordingly.
 
-______________________________________________________________________
+---
 
 > Q: Which model to use for offline inference embedding?
 
 A: You can try [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) and [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5);
-more are listed [here](#supported-models).
+more are listed [here][supported-models].
 
 By extracting hidden states, vLLM can automatically convert text generation models like [Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B),
 [Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) into embedding models,
 but they are expected to be inferior to models that are specifically trained on embedding tasks.
 
-______________________________________________________________________
+---
 
 > Q: Can the output of a prompt vary across runs in vLLM?
 
diff --git a/docs/source/serving/metrics.md b/docs/usage/metrics.md
similarity index 90%
rename from docs/source/serving/metrics.md
rename to docs/usage/metrics.md
index 647ece3f85f0..9ad7253184d9 100644
--- a/docs/source/serving/metrics.md
+++ b/docs/usage/metrics.md
@@ -4,7 +4,7 @@ vLLM exposes a number of metrics that can be used to monitor the health of the
 system. These metrics are exposed via the `/metrics` endpoint on the vLLM
 OpenAI compatible API server.
 
-You can start the server using Python, or using [Docker](#deployment-docker):
+You can start the server using Python, or using [Docker][deployment-docker]:
 
 ```console
 vllm serve unsloth/Llama-3.2-1B-Instruct
@@ -31,11 +31,9 @@ vllm:iteration_tokens_total_bucket{le="512.0",model_name="unsloth/Llama-3.2-1B-I
 
 The following metrics are exposed:
 
-:::{literalinclude} ../../../vllm/engine/metrics.py
-:end-before: end-metrics-definitions
-:language: python
-:start-after: begin-metrics-definitions
-:::
+```python
+--8<-- "vllm/engine/metrics.py:metrics-definitions"
+```
 
 The following metrics are deprecated and due to be removed in a future version:
 
diff --git a/docs/usage/reproducibility.md b/docs/usage/reproducibility.md
new file mode 100644
index 000000000000..a494dcf19191
--- /dev/null
+++ b/docs/usage/reproducibility.md
@@ -0,0 +1,52 @@
+# Reproducibility
+
+vLLM does not guarantee the reproducibility of the results by default, for the sake of performance. You need to do the following to achieve
+reproducible results:
+
+- For V1: Turn off multiprocessing to make the scheduling deterministic by setting `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
+- For V0: Set the global seed (see below).
+
+Example: <gh-file:examples/offline_inference/reproducibility.py>
+
+!!! warning
+
+    Applying the above settings [changes the random state in user code](#locality-of-random-state).
+
+!!! note
+
+    Even with the above settings, vLLM only provides reproducibility
+    when it runs on the same hardware and the same vLLM version.
+    Also, the online serving API (`vllm serve`) does not support reproducibility
+    because it is almost impossible to make the scheduling deterministic in the
+    online setting.
+
+## Setting the global seed
+
+The `seed` parameter in vLLM is used to control the random states for various random number generators.
+
+If a specific seed value is provided, the random states for `random`, `np.random`, and `torch.manual_seed` will be set accordingly.
+
+However, in some cases, setting the seed will also [change the random state in user code](#locality-of-random-state).
+
+### Default Behavior
+
+In V0, the `seed` parameter defaults to `None`. When the `seed` parameter is `None`, the random states for `random`, `np.random`, and `torch.manual_seed` are not set. This means that each run of vLLM will produce different results if `temperature > 0`, as expected.
+
+In V1, the `seed` parameter defaults to `0`, which sets the random state for each worker, so the results will remain consistent for each vLLM run even if `temperature > 0`.
+
+!!! note
+
+    It is impossible to un-specify a seed for V1 because different workers need to sample the same outputs
+    for workflows such as speculative decoding.
+    
+    For more information, see: <gh-pr:17929>
+
+### Locality of random state
+
+The random state in user code (i.e. the code that constructs the [LLM][vllm.LLM] class) is updated by vLLM under the following conditions:
+
+- For V0: The seed is specified.
+- For V1: The workers are run in the same process as user code, i.e.: `VLLM_ENABLE_V1_MULTIPROCESSING=0`.
+
+By default, these conditions are not active so you can use vLLM without having to worry about
+accidentally making subsequent operations that rely on random state deterministic.
diff --git a/docs/source/deployment/security.md b/docs/usage/security.md
similarity index 99%
rename from docs/source/deployment/security.md
rename to docs/usage/security.md
index 9c4d639c0b3d..f1661828d68a 100644
--- a/docs/source/deployment/security.md
+++ b/docs/usage/security.md
@@ -1,4 +1,4 @@
-# Security Guide
+# Security
 
 ## Inter-Node Communication
 
diff --git a/docs/source/getting_started/troubleshooting.md b/docs/usage/troubleshooting.md
similarity index 85%
rename from docs/source/getting_started/troubleshooting.md
rename to docs/usage/troubleshooting.md
index a4744827f226..889cfccdacac 100644
--- a/docs/source/getting_started/troubleshooting.md
+++ b/docs/usage/troubleshooting.md
@@ -1,12 +1,12 @@
-(troubleshooting)=
-
-# Troubleshooting
+---
+title: Troubleshooting
+---
+[](){ #troubleshooting }
 
 This document outlines some troubleshooting strategies you can consider. If you think you've discovered a bug, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible.
 
-:::{note}
-Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
-:::
+!!! note
+    Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
 
 ## Hangs downloading a model
 
@@ -18,13 +18,12 @@ It's recommended to download the model first using the [huggingface-cli](https:/
 If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow.
 It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory.
 
-:::{note}
-To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck.
-:::
+!!! note
+    To isolate the model downloading and loading issue, you can use the `--load-format dummy` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck.
 
 ## Out of memory
 
-If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider adopting [these options](#reducing-memory-usage) to reduce the memory consumption.
+If the model is too large to fit in a single GPU, you will get an out-of-memory (OOM) error. Consider adopting [these options](../configuration/conserving_memory.md) to reduce the memory consumption.
 
 ## Generation quality changed
 
@@ -53,9 +52,9 @@ You might also need to set `export NCCL_SOCKET_IFNAME=<your_network_interface>`
 ## Error near `self.graph.replay()`
 
 If vLLM crashes and the error trace captures it somewhere around `self.graph.replay()` in `vllm/worker/model_runner.py`, it is a CUDA error inside CUDAGraph.
-To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the {class}`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
+To identify the particular CUDA operation that causes the error, you can add `--enforce-eager` to the command line, or `enforce_eager=True` to the [LLM][vllm.LLM] class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
 
-(troubleshooting-incorrect-hardware-driver)=
+[](){ #troubleshooting-incorrect-hardware-driver }
 
 ## Incorrect hardware/driver
 
@@ -140,16 +139,15 @@ If the script runs successfully, you should see the message `sanity check is suc
 
 If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. As a common workaround, you can try to tune some NCCL environment variables, such as `export NCCL_P2P_DISABLE=1` to see if it helps. Please check [their documentation](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html) for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully.
 
-:::{note}
-A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments:
+!!! note
+    A multi-node environment is more complicated than a single-node one. If you see errors such as `torch.distributed.DistNetworkError`, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments:
 
-- In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`.
-- In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`.
+    - In the first node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py`.
+    - In the second node, run `NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py`.
 
-Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
-:::
+    Adjust `--nproc-per-node`, `--nnodes`, and `--node-rank` according to your setup, being sure to execute different commands (with different `--node-rank`) on different nodes.
 
-(troubleshooting-python-multiprocessing)=
+[](){ #troubleshooting-python-multiprocessing }
 
 ## Python multiprocessing
 
@@ -161,7 +159,7 @@ If you have seen a warning in your logs like this:
 WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
     initialized. We must use the `spawn` multiprocessing start method. Setting
     VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
-    https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#python-multiprocessing
+    https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing
     for more information.
 ```
 
@@ -260,7 +258,7 @@ or:
 ValueError: Model architectures ['<arch>'] are not supported for now. Supported architectures: [...]
 ```
 
-But you are sure that the model is in the [list of supported models](#supported-models), there may be some issue with vLLM's model resolution. In that case, please follow [these steps](#model-resolution) to explicitly specify the vLLM implementation for the model.
+But you are sure that the model is in the [list of supported models][supported-models], there may be some issue with vLLM's model resolution. In that case, please follow [these steps](../configuration/model_resolution.md) to explicitly specify the vLLM implementation for the model.
 
 ## Failed to infer device type
 
diff --git a/docs/source/serving/usage_stats.md b/docs/usage/usage_stats.md
similarity index 100%
rename from docs/source/serving/usage_stats.md
rename to docs/usage/usage_stats.md
diff --git a/docs/source/getting_started/v1_user_guide.md b/docs/usage/v1_guide.md
similarity index 99%
rename from docs/source/getting_started/v1_user_guide.md
rename to docs/usage/v1_guide.md
index de90b8a7851e..3d5d7ce45cce 100644
--- a/docs/source/getting_started/v1_user_guide.md
+++ b/docs/usage/v1_guide.md
@@ -1,4 +1,4 @@
-# vLLM V1 User Guide
+# vLLM V1
 
 V1 is now enabled by default for all supported use cases, and we will gradually enable it for every use case we plan to support. Please share any feedback on [GitHub](https://github.com/vllm-project/vllm) or in the [vLLM Slack](https://inviter.co/vllm-slack).
 
diff --git a/examples/offline_inference/audio_language.py b/examples/offline_inference/audio_language.py
index bab41c915c32..56cdd6861baa 100644
--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -1,11 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 """
-This example shows how to use vLLM for running offline inference 
+This example shows how to use vLLM for running offline inference
 with the correct prompt format on audio language models.
 
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
+
 import os
 from dataclasses import asdict
 from typing import NamedTuple, Optional
@@ -22,7 +23,7 @@
 question_per_audio_count = {
     0: "What is 1+1?",
     1: "What is recited in the audio?",
-    2: "What sport and what nursery rhyme are referenced?"
+    2: "What sport and what nursery rhyme are referenced?",
 }
 
 
@@ -72,8 +73,7 @@ def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
 # MiniCPM-O
 def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
     model_name = "openbmb/MiniCPM-o-2_6"
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
     engine_args = EngineArgs(
         model=model_name,
         trust_remote_code=True,
@@ -82,19 +82,18 @@ def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
         limit_mm_per_prompt={"audio": audio_count},
     )
 
-    stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    stop_tokens = ["<|im_end|>", "<|endoftext|>"]
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
 
     audio_placeholder = "(<audio>./</audio>)" * audio_count
     audio_chat_template = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<|spk_bos|><|spk|><|spk_eos|><|tts_bos|>' }}{% endif %}"  # noqa: E501
-    messages = [{
-        'role': 'user',
-        'content': f'{audio_placeholder}\n{question}'
-    }]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True,
-                                           chat_template=audio_chat_template)
+    messages = [{"role": "user", "content": f"{audio_placeholder}\n{question}"}]
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+        chat_template=audio_chat_template,
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -113,7 +112,7 @@ def run_phi4mm(question: str, audio_count: int) -> ModelRequestData:
     # Since the vision-lora and speech-lora co-exist with the base model,
     # we have to manually specify the path of the lora weights.
     speech_lora_path = os.path.join(model_path, "speech-lora")
-    placeholders = "".join([f"<|audio_{i+1}|>" for i in range(audio_count)])
+    placeholders = "".join([f"<|audio_{i + 1}|>" for i in range(audio_count)])
 
     prompts = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
 
@@ -145,15 +144,19 @@ def run_qwen2_audio(question: str, audio_count: int) -> ModelRequestData:
         limit_mm_per_prompt={"audio": audio_count},
     )
 
-    audio_in_prompt = "".join([
-        f"Audio {idx+1}: "
-        f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
-    ])
+    audio_in_prompt = "".join(
+        [
+            f"Audio {idx + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n"
+            for idx in range(audio_count)
+        ]
+    )
 
-    prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-              "<|im_start|>user\n"
-              f"{audio_in_prompt}{question}<|im_end|>\n"
-              "<|im_start|>assistant\n")
+    prompt = (
+        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+        "<|im_start|>user\n"
+        f"{audio_in_prompt}{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -172,19 +175,22 @@ def run_qwen2_5_omni(question: str, audio_count: int):
         limit_mm_per_prompt={"audio": audio_count},
     )
 
-    audio_in_prompt = "".join([
-        "<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)
-    ])
+    audio_in_prompt = "".join(
+        ["<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count)]
+    )
 
     default_system = (
         "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
         "Group, capable of perceiving auditory and visual inputs, as well as "
-        "generating text and speech.")
+        "generating text and speech."
+    )
 
-    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
-              "<|im_start|>user\n"
-              f"{audio_in_prompt}{question}<|im_end|>\n"
-              "<|im_start|>assistant\n")
+    prompt = (
+        f"<|im_start|>system\n{default_system}<|im_end|>\n"
+        "<|im_start|>user\n"
+        f"{audio_in_prompt}{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
@@ -196,13 +202,10 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
     model_name = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    messages = [{
-        'role': 'user',
-        'content': "<|audio|>\n" * audio_count + question
-    }]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    messages = [{"role": "user", "content": "<|audio|>\n" * audio_count + question}]
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     engine_args = EngineArgs(
         model=model_name,
@@ -220,8 +223,7 @@ def run_ultravox(question: str, audio_count: int) -> ModelRequestData:
 
 # Whisper
 def run_whisper(question: str, audio_count: int) -> ModelRequestData:
-    assert audio_count == 1, (
-        "Whisper only support single audio input per prompt")
+    assert audio_count == 1, "Whisper only support single audio input per prompt"
     model_name = "openai/whisper-large-v3-turbo"
 
     prompt = "<|startoftranscript|>"
@@ -252,27 +254,33 @@ def run_whisper(question: str, audio_count: int) -> ModelRequestData:
 
 def parse_args():
     parser = FlexibleArgumentParser(
-        description='Demo on using vLLM for offline inference with '
-        'audio language models')
-    parser.add_argument('--model-type',
-                        '-m',
-                        type=str,
-                        default="ultravox",
-                        choices=model_example_map.keys(),
-                        help='Huggingface "model_type".')
-    parser.add_argument('--num-prompts',
-                        type=int,
-                        default=1,
-                        help='Number of prompts to run.')
-    parser.add_argument("--num-audios",
-                        type=int,
-                        default=1,
-                        choices=[0, 1, 2],
-                        help="Number of audio items per prompt.")
-    parser.add_argument("--seed",
-                        type=int,
-                        default=None,
-                        help="Set the seed when initializing `vllm.LLM`.")
+        description="Demo on using vLLM for offline inference with "
+        "audio language models"
+    )
+    parser.add_argument(
+        "--model-type",
+        "-m",
+        type=str,
+        default="ultravox",
+        choices=model_example_map.keys(),
+        help='Huggingface "model_type".',
+    )
+    parser.add_argument(
+        "--num-prompts", type=int, default=1, help="Number of prompts to run."
+    )
+    parser.add_argument(
+        "--num-audios",
+        type=int,
+        default=1,
+        choices=[0, 1, 2],
+        help="Number of audio items per prompt.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="Set the seed when initializing `vllm.LLM`.",
+    )
 
     return parser.parse_args()
 
@@ -283,29 +291,30 @@ def main(args):
         raise ValueError(f"Model type {model} is not supported.")
 
     audio_count = args.num_audios
-    req_data = model_example_map[model](question_per_audio_count[audio_count],
-                                        audio_count)
+    req_data = model_example_map[model](
+        question_per_audio_count[audio_count], audio_count
+    )
 
     # Disable other modalities to save memory
     default_limits = {"image": 0, "video": 0, "audio": 0}
     req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
-        req_data.engine_args.limit_mm_per_prompt or {})
+        req_data.engine_args.limit_mm_per_prompt or {}
+    )
 
     engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
     llm = LLM(**engine_args)
 
     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
-    sampling_params = SamplingParams(temperature=0.2,
-                                     max_tokens=64,
-                                     stop_token_ids=req_data.stop_token_ids)
+    sampling_params = SamplingParams(
+        temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
+    )
 
     mm_data = {}
     if audio_count > 0:
         mm_data = {
             "audio": [
-                asset.audio_and_sample_rate
-                for asset in audio_assets[:audio_count]
+                asset.audio_and_sample_rate for asset in audio_assets[:audio_count]
             ]
         }
 
@@ -315,8 +324,9 @@ def main(args):
         # Batch inference
         inputs = [inputs] * args.num_prompts
     # Add LoRA request if applicable
-    lora_request = (req_data.lora_requests *
-                    args.num_prompts if req_data.lora_requests else None)
+    lora_request = (
+        req_data.lora_requests * args.num_prompts if req_data.lora_requests else None
+    )
 
     outputs = llm.generate(
         inputs,
diff --git a/docs/source/features/automatic_prefix_caching.md b/examples/offline_inference/automatic_prefix_caching.py
similarity index 63%
rename from docs/source/features/automatic_prefix_caching.md
rename to examples/offline_inference/automatic_prefix_caching.py
index 59016d7fcf6b..0d8c73304237 100644
--- a/docs/source/features/automatic_prefix_caching.md
+++ b/examples/offline_inference/automatic_prefix_caching.py
@@ -1,26 +1,31 @@
-(automatic-prefix-caching)=
-
-# Automatic Prefix Caching
-
-## Introduction
+# SPDX-License-Identifier: Apache-2.0
+"""
+Demonstration script for Automatic Prefix Caching (APC) in vLLM.
 
-Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.
+Automatic Prefix Caching (APC) lets the vLLM engine reuse the KV (key-value)
+cache of an earlier query when a new query shares a prefix with it, so the
+shared part is not recomputed during prefill (decoding itself is not faster).
 
-:::{note}
-Technical details on how vLLM implements APC can be found [here](#design-automatic-prefix-caching).
-:::
+To enable APC, set `enable_prefix_caching=True` when initializing the
+vLLM engine.
 
-## Enabling APC in vLLM
+This script uses a long Markdown table as the shared prompt prefix and
+compares the generation time for two queries that share the same prefix
+but ask different questions.
 
-Set `enable_prefix_caching=True` in vLLM engine to enable APC. Here is an example:
+Run:
+python examples/offline_inference/automatic_prefix_caching.py
+"""
 
-```python
 import time
-from vllm import LLM, SamplingParams
 
+from vllm import LLM, SamplingParams
 
+# ruff: noqa: E501
 # A prompt containing a large markdown table. The table is randomly generated by GPT-4.
-LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
+LONG_PROMPT = (
+    "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n"
+    + """
 | ID  | Name          | Age | Occupation    | Country       | Email                  | Phone Number   | Address                       |
 |-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
 | 1   | John Doe      | 29  | Engineer      | USA           | john.doe@example.com   | 555-1234       | 123 Elm St, Springfield, IL  |
@@ -54,6 +59,7 @@
 | 29  | Amy White     | 33  | Musician      | New Zealand   | amy.w@example.com      | 555-5658       | 159 Maple St, Wellington, NZ |
 | 30  | Ben Black     | 38  | Chef          | Ireland       | ben.b@example.com      | 555-7870       | 246 Fir St, Waterford, IE    |
 """
+)
 
 
 def get_generation_time(llm, sampling_params, prompts):
@@ -62,41 +68,35 @@ def get_generation_time(llm, sampling_params, prompts):
     output = llm.generate(prompts, sampling_params=sampling_params)
     end_time = time.time()
     # print the output and generation time
+    print("-" * 30)
     print(f"Output: {output[0].outputs[0].text}")
     print(f"Generation time: {end_time - start_time} seconds.")
+    print("-" * 30)
 
 
-# set enable_prefix_caching=True to enable APC
-llm = LLM(
-    model='lmsys/longchat-13b-16k',
-    enable_prefix_caching=True
-)
-
-sampling_params = SamplingParams(temperature=0, max_tokens=100)
-
-# Querying the age of John Doe
-get_generation_time(
-    llm,
-    sampling_params,
-    LONG_PROMPT + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
-)
-
-# Querying the age of Zack Blue
-# This query will be faster since vllm avoids computing the KV cache of LONG_PROMPT again.
-get_generation_time(
-    llm,
-    sampling_params,
-    LONG_PROMPT + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
-)
-```
+def main():
+    # set enable_prefix_caching=True to enable APC
+    llm = LLM(model="lmsys/longchat-13b-16k", enable_prefix_caching=True)
 
-## Example workloads
+    sampling_params = SamplingParams(temperature=0, max_tokens=100)
 
-We describe two example workloads, where APC can provide huge performance benefit:
+    # Querying the age of John Doe
+    get_generation_time(
+        llm,
+        sampling_params,
+        LONG_PROMPT
+        + "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
+    )
 
-- Long document query, where the user repeatedly queries the same long document (e.g. software manual or annual report) with different queries. In this case, instead of processing the long document again and again, APC allows vLLM to process this long document *only once*, and all future requests can avoid recomputing this long document by reusing its KV cache. This allows vLLM to serve future requests with much higher throughput and much lower latency.
-- Multi-round conversation, where the user may chat with the application multiple times in the same chatting session. In this case, instead of processing the whole chatting history again and again, APC allows vLLM to reuse the processing results of the chat history across all future rounds of conversation, allowing vLLM to serve future requests with much higher throughput and much lower latency.
+    # Querying the age of Zack Blue
+    # This query will be faster since vLLM reuses the cached KV of LONG_PROMPT instead of recomputing it.
+    get_generation_time(
+        llm,
+        sampling_params,
+        LONG_PROMPT
+        + "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is ",
+    )
 
-## Limits
 
-APC in general does not reduce the performance of vLLM. With that being said, APC only reduces the time of processing the queries (the prefilling phase) and does not reduce the time of generating new tokens (the decoding phase). So APC does not bring performance gain when vLLM spends most of the time generating answers to the queries (e.g. when the length of the answer is long), or new queries do not share the same prefix with any of existing queries (so that the computation cannot be reused).
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/basic/chat.py b/examples/offline_inference/basic/chat.py
index 8e6f78ed7de2..b0bb5aa71b8a 100644
--- a/examples/offline_inference/basic/chat.py
+++ b/examples/offline_inference/basic/chat.py
@@ -56,22 +56,12 @@ def print_outputs(outputs):
 
     # In this script, we demonstrate how to pass input to the chat method:
     conversation = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant"
-        },
-        {
-            "role": "user",
-            "content": "Hello"
-        },
-        {
-            "role": "assistant",
-            "content": "Hello! How can I assist you today?"
-        },
+        {"role": "system", "content": "You are a helpful assistant"},
+        {"role": "user", "content": "Hello"},
+        {"role": "assistant", "content": "Hello! How can I assist you today?"},
         {
             "role": "user",
-            "content":
-            "Write an essay about the importance of higher education.",
+            "content": "Write an essay about the importance of higher education.",
         },
     ]
     outputs = llm.chat(conversation, sampling_params, use_tqdm=False)
diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py
index 5b6dcb41eee1..40ccb1294e42 100644
--- a/examples/offline_inference/basic/classify.py
+++ b/examples/offline_inference/basic/classify.py
@@ -10,9 +10,9 @@ def parse_args():
     parser = FlexibleArgumentParser()
     parser = EngineArgs.add_cli_args(parser)
     # Set example specific arguments
-    parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach",
-                        task="classify",
-                        enforce_eager=True)
+    parser.set_defaults(
+        model="jason9693/Qwen2.5-1.5B-apeach", task="classify", enforce_eager=True
+    )
     return parser.parse_args()
 
 
@@ -36,10 +36,11 @@ def main(args: Namespace):
     print("\nGenerated Outputs:\n" + "-" * 60)
     for prompt, output in zip(prompts, outputs):
         probs = output.outputs.probs
-        probs_trimmed = ((str(probs[:16])[:-1] +
-                          ", ...]") if len(probs) > 16 else probs)
-        print(f"Prompt: {prompt!r} \n"
-              f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
+        probs_trimmed = (str(probs[:16])[:-1] + ", ...]") if len(probs) > 16 else probs
+        print(
+            f"Prompt: {prompt!r} \n"
+            f"Class Probabilities: {probs_trimmed} (size={len(probs)})"
+        )
         print("-" * 60)
 
 
diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py
index cb5f923ffb69..38a73ccca251 100644
--- a/examples/offline_inference/basic/embed.py
+++ b/examples/offline_inference/basic/embed.py
@@ -10,9 +10,9 @@ def parse_args():
     parser = FlexibleArgumentParser()
     parser = EngineArgs.add_cli_args(parser)
     # Set example specific arguments
-    parser.set_defaults(model="intfloat/e5-mistral-7b-instruct",
-                        task="embed",
-                        enforce_eager=True)
+    parser.set_defaults(
+        model="intfloat/e5-mistral-7b-instruct", task="embed", enforce_eager=True
+    )
     return parser.parse_args()
 
 
@@ -36,10 +36,10 @@ def main(args: Namespace):
     print("\nGenerated Outputs:\n" + "-" * 60)
     for prompt, output in zip(prompts, outputs):
         embeds = output.outputs.embedding
-        embeds_trimmed = ((str(embeds[:16])[:-1] +
-                           ", ...]") if len(embeds) > 16 else embeds)
-        print(f"Prompt: {prompt!r} \n"
-              f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
+        embeds_trimmed = (
+            (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
+        )
+        print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
         print("-" * 60)
 
 
diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py
index d2bda8b3180c..3da73c6c407d 100644
--- a/examples/offline_inference/basic/score.py
+++ b/examples/offline_inference/basic/score.py
@@ -10,9 +10,9 @@ def parse_args():
     parser = FlexibleArgumentParser()
     parser = EngineArgs.add_cli_args(parser)
     # Set example specific arguments
-    parser.set_defaults(model="BAAI/bge-reranker-v2-m3",
-                        task="score",
-                        enforce_eager=True)
+    parser.set_defaults(
+        model="BAAI/bge-reranker-v2-m3", task="score", enforce_eager=True
+    )
     return parser.parse_args()
 
 
diff --git a/examples/offline_inference/batch_llm_inference.py b/examples/offline_inference/batch_llm_inference.py
index 6548857b6d11..c1edfb52ff70 100644
--- a/examples/offline_inference/batch_llm_inference.py
+++ b/examples/offline_inference/batch_llm_inference.py
@@ -17,12 +17,14 @@
 Learn more about Ray Data's LLM integration:
 https://docs.ray.io/en/latest/data/working-with-llms.html
 """
+
 import ray
 from packaging.version import Version
 from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig
 
-assert Version(ray.__version__) >= Version(
-    "2.44.1"), "Ray version must be at least 2.44.1"
+assert Version(ray.__version__) >= Version("2.44.1"), (
+    "Ray version must be at least 2.44.1"
+)
 
 # Uncomment to reduce clutter in stdout
 # ray.init(log_to_driver=False)
@@ -53,20 +55,18 @@
 vllm_processor = build_llm_processor(
     config,
     preprocess=lambda row: dict(
-        messages=[{
-            "role": "system",
-            "content": "You are a bot that responds with haikus."
-        }, {
-            "role": "user",
-            "content": row["text"]
-        }],
+        messages=[
+            {"role": "system", "content": "You are a bot that responds with haikus."},
+            {"role": "user", "content": row["text"]},
+        ],
         sampling_params=dict(
             temperature=0.3,
             max_tokens=250,
-        )),
+        ),
+    ),
     postprocess=lambda row: dict(
         answer=row["generated_text"],
-        **row  # This will return all the original columns in the dataset.
+        **row,  # This will return all the original columns in the dataset.
     ),
 )
 
diff --git a/examples/offline_inference/chat_with_tools.py b/examples/offline_inference/chat_with_tools.py
index b532bf42adfb..61230d895584 100644
--- a/examples/offline_inference/chat_with_tools.py
+++ b/examples/offline_inference/chat_with_tools.py
@@ -50,87 +50,93 @@
 # or any other mistral model with function calling ability
 
 sampling_params = SamplingParams(max_tokens=8192, temperature=0.0)
-llm = LLM(model=model_name,
-          tokenizer_mode="mistral",
-          config_format="mistral",
-          load_format="mistral")
+llm = LLM(
+    model=model_name,
+    tokenizer_mode="mistral",
+    config_format="mistral",
+    load_format="mistral",
+)
 
 
 def generate_random_id(length=9):
     characters = string.ascii_letters + string.digits
-    random_id = ''.join(random.choice(characters) for _ in range(length))
+    random_id = "".join(random.choice(characters) for _ in range(length))
     return random_id
 
 
 # simulate an API that can be called
-def get_current_weather(city: str, state: str, unit: 'str'):
-    return (f"The weather in {city}, {state} is 85 degrees {unit}. It is "
-            "partly cloudly, with highs in the 90's.")
+def get_current_weather(city: str, state: str, unit: "str"):
+    return (
+        f"The weather in {city}, {state} is 85 degrees {unit}. It is "
+        "partly cloudly, with highs in the 90's."
+    )
 
 
 tool_functions = {"get_current_weather": get_current_weather}
 
-tools = [{
-    "type": "function",
-    "function": {
-        "name": "get_current_weather",
-        "description": "Get the current weather in a given location",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "city": {
-                    "type":
-                    "string",
-                    "description":
-                    "The city to find the weather for, e.g. 'San Francisco'"
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {
+                        "type": "string",
+                        "description": "The city to find the weather for, e.g. 'San Francisco'",
+                    },
+                    "state": {
+                        "type": "string",
+                        "description": "the two-letter abbreviation for the state that the city is"
+                        " in, e.g. 'CA' which would mean 'California'",
+                    },
+                    "unit": {
+                        "type": "string",
+                        "description": "The unit to fetch the temperature in",
+                        "enum": ["celsius", "fahrenheit"],
+                    },
                 },
-                "state": {
-                    "type":
-                    "string",
-                    "description":
-                    "the two-letter abbreviation for the state that the city is"
-                    " in, e.g. 'CA' which would mean 'California'"
-                },
-                "unit": {
-                    "type": "string",
-                    "description": "The unit to fetch the temperature in",
-                    "enum": ["celsius", "fahrenheit"]
-                }
+                "required": ["city", "state", "unit"],
             },
-            "required": ["city", "state", "unit"]
-        }
+        },
     }
-}]
+]
 
-messages = [{
-    "role":
-    "user",
-    "content":
-    "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
-}]
+messages = [
+    {
+        "role": "user",
+        "content": "Can you tell me what the temperate will be in Dallas, in fahrenheit?",
+    }
+]
 
 outputs = llm.chat(messages, sampling_params=sampling_params, tools=tools)
 output = outputs[0].outputs[0].text.strip()
 
 # append the assistant message
-messages.append({
-    "role": "assistant",
-    "content": output,
-})
+messages.append(
+    {
+        "role": "assistant",
+        "content": output,
+    }
+)
 
 # let's now actually parse and execute the model's output simulating an API call by using the
 # above defined function
 tool_calls = json.loads(output)
 tool_answers = [
-    tool_functions[call['name']](**call['arguments']) for call in tool_calls
+    tool_functions[call["name"]](**call["arguments"]) for call in tool_calls
 ]
 
 # append the answer as a tool message and let the LLM give you an answer
-messages.append({
-    "role": "tool",
-    "content": "\n\n".join(tool_answers),
-    "tool_call_id": generate_random_id(),
-})
+messages.append(
+    {
+        "role": "tool",
+        "content": "\n\n".join(tool_answers),
+        "tool_call_id": generate_random_id(),
+    }
+)
 
 outputs = llm.chat(messages, sampling_params, tools=tools)
 
diff --git a/examples/offline_inference/data_parallel.py b/examples/offline_inference/data_parallel.py
index f636a08c0b09..bf60d883c410 100644
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -27,6 +27,7 @@
                     --master-addr=10.99.48.128 \
                     --master-port=13345
 """
+
 import os
 from time import sleep
 
@@ -36,46 +37,46 @@
 
 def parse_args():
     import argparse
+
     parser = argparse.ArgumentParser(description="Data Parallel Inference")
-    parser.add_argument("--model",
-                        type=str,
-                        default="ibm-research/PowerMoE-3b",
-                        help="Model name or path")
-    parser.add_argument("--dp-size",
-                        type=int,
-                        default=2,
-                        help="Data parallel size")
-    parser.add_argument("--tp-size",
-                        type=int,
-                        default=2,
-                        help="Tensor parallel size")
-    parser.add_argument("--node-size",
-                        type=int,
-                        default=1,
-                        help="Total number of nodes")
-    parser.add_argument("--node-rank",
-                        type=int,
-                        default=0,
-                        help="Rank of the current node")
-    parser.add_argument("--master-addr",
-                        type=str,
-                        default="",
-                        help="Master node IP address")
-    parser.add_argument("--master-port",
-                        type=int,
-                        default=0,
-                        help="Master node port")
-    parser.add_argument("--enforce-eager",
-                        action='store_true',
-                        help="Enforce eager mode execution.")
-    parser.add_argument("--trust-remote-code",
-                        action='store_true',
-                        help="Trust remote code.")
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="ibm-research/PowerMoE-3b",
+        help="Model name or path",
+    )
+    parser.add_argument("--dp-size", type=int, default=2, help="Data parallel size")
+    parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size")
+    parser.add_argument(
+        "--node-size", type=int, default=1, help="Total number of nodes"
+    )
+    parser.add_argument(
+        "--node-rank", type=int, default=0, help="Rank of the current node"
+    )
+    parser.add_argument(
+        "--master-addr", type=str, default="", help="Master node IP address"
+    )
+    parser.add_argument("--master-port", type=int, default=0, help="Master node port")
+    parser.add_argument(
+        "--enforce-eager", action="store_true", help="Enforce eager mode execution."
+    )
+    parser.add_argument(
+        "--trust-remote-code", action="store_true", help="Trust remote code."
+    )
     return parser.parse_args()
 
 
-def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
-         dp_master_port, GPUs_per_dp_rank, enforce_eager, trust_remote_code):
+def main(
+    model,
+    dp_size,
+    local_dp_rank,
+    global_dp_rank,
+    dp_master_ip,
+    dp_master_port,
+    GPUs_per_dp_rank,
+    enforce_eager,
+    trust_remote_code,
+):
     os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
     os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
     os.environ["VLLM_DP_SIZE"] = str(dp_size)
@@ -110,9 +111,9 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
     # since we are doing data parallel, every rank can have different
     # sampling params. here we set different max_tokens for different
     # ranks for demonstration.
-    sampling_params = SamplingParams(temperature=0.8,
-                                     top_p=0.95,
-                                     max_tokens=[16, 20][global_dp_rank % 2])
+    sampling_params = SamplingParams(
+        temperature=0.8, top_p=0.95, max_tokens=[16, 20][global_dp_rank % 2]
+    )
 
     # Create an LLM.
     llm = LLM(
@@ -130,15 +131,16 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
             break
         prompt = output.prompt
         generated_text = output.outputs[0].text
-        print(f"DP rank {global_dp_rank}, Prompt: {prompt!r}, "
-              f"Generated text: {generated_text!r}")
+        print(
+            f"DP rank {global_dp_rank}, Prompt: {prompt!r}, "
+            f"Generated text: {generated_text!r}"
+        )
 
     # Give engines time to pause their processing loops before exiting.
     sleep(1)
 
 
 if __name__ == "__main__":
-
     args = parse_args()
 
     dp_size = args.dp_size
@@ -160,20 +162,29 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
 
     procs = []
     for local_dp_rank, global_dp_rank in enumerate(
-            range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)):
-        proc = Process(target=main,
-                       args=(args.model, dp_size, local_dp_rank,
-                             global_dp_rank, dp_master_ip, dp_master_port,
-                             tp_size, args.enforce_eager,
-                             args.trust_remote_code))
+        range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)
+    ):
+        proc = Process(
+            target=main,
+            args=(
+                args.model,
+                dp_size,
+                local_dp_rank,
+                global_dp_rank,
+                dp_master_ip,
+                dp_master_port,
+                tp_size,
+                args.enforce_eager,
+                args.trust_remote_code,
+            ),
+        )
         proc.start()
         procs.append(proc)
     exit_code = 0
     for proc in procs:
         proc.join(timeout=300)
         if proc.exitcode is None:
-            print(f"Killing process {proc.pid} that "
-                  f"didn't stop within 5 minutes.")
+            print(f"Killing process {proc.pid} that didn't stop within 5 minutes.")
             proc.kill()
             exit_code = 1
         elif proc.exitcode:
diff --git a/examples/offline_inference/disaggregated-prefill-v1/README.md b/examples/offline_inference/disaggregated-prefill-v1/README.md
index f708eb253838..9cbdb19820f5 100644
--- a/examples/offline_inference/disaggregated-prefill-v1/README.md
+++ b/examples/offline_inference/disaggregated-prefill-v1/README.md
@@ -5,5 +5,6 @@ This example contains scripts that demonstrate disaggregated prefill in the offl
 ## Files
 
 - `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially.
+  - Run `run.sh` from the `examples/offline_inference/disaggregated-prefill-v1` directory; the example scripts read and write `output.txt` and `local_storage` relative to the current working directory.
 - `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`.
 - `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`.
diff --git a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
index 11918f72feec..4ae5d3310e0b 100644
--- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
@@ -3,35 +3,48 @@
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
 
-# Read prompts from output.txt
-prompts = []
-try:
-    with open("output.txt") as f:
-        for line in f:
-            prompts.append(line.strip())
-    print(f"Loaded {len(prompts)} prompts from output.txt")
-except FileNotFoundError:
-    print("Error: output.txt file not found")
-    exit(-1)
-
-sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
-
-llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
-          enforce_eager=True,
-          gpu_memory_utilization=0.8,
-          max_num_batched_tokens=64,
-          max_num_seqs=16,
-          kv_transfer_config=KVTransferConfig(
-              kv_connector="SharedStorageConnector",
-              kv_role="kv_both",
-              kv_connector_extra_config={
-                  "shared_storage_path": "local_storage"
-              }))  #, max_model_len=2048, max_num_batched_tokens=2048)
-
-# 1ST generation (prefill instance)
-outputs = llm.generate(prompts, sampling_params)
-
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+def read_prompts():
+    """Read prompts from output.txt"""
+    prompts = []
+    try:
+        with open("output.txt") as f:
+            for line in f:
+                prompts.append(line.strip())
+        print(f"Loaded {len(prompts)} prompts from output.txt")
+        return prompts
+    except FileNotFoundError:
+        print("Error: output.txt file not found")
+        exit(-1)
+
+
+def main():
+    prompts = read_prompts()
+    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
+
+    llm = LLM(
+        model="meta-llama/Llama-3.2-1B-Instruct",
+        enforce_eager=True,
+        gpu_memory_utilization=0.8,
+        max_num_batched_tokens=64,
+        max_num_seqs=16,
+        kv_transfer_config=KVTransferConfig(
+            kv_connector="SharedStorageConnector",
+            kv_role="kv_both",
+            kv_connector_extra_config={"shared_storage_path": "local_storage"},
+        ),
+    )  # , max_model_len=2048, max_num_batched_tokens=2048)
+
+    # 2ND generation (decode instance), loading the KV state from local_storage
+    outputs = llm.generate(prompts, sampling_params)
+
+    print("-" * 30)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 30)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
index 798128301e0f..5757a8a84b86 100644
--- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
@@ -3,42 +3,55 @@
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
 
-context = "Hi " * 1000
-context2 = "Hey " * 500
-prompts = [
-    context + "Hello, my name is",
-    context + "The capital of France is",
-    context2 + "Your name is",
-    context2 + "The capital of China is",
-]
-
-sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
-
-llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
-          enforce_eager=True,
-          gpu_memory_utilization=0.8,
-          kv_transfer_config=KVTransferConfig(
-              kv_connector="SharedStorageConnector",
-              kv_role="kv_both",
-              kv_connector_extra_config={
-                  "shared_storage_path": "local_storage"
-              }))  #, max_model_len=2048, max_num_batched_tokens=2048)
-
-# 1ST generation (prefill instance)
-outputs = llm.generate(
-    prompts,
-    sampling_params,
-)
-
-new_prompts = []
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    new_prompts.append(prompt + generated_text)
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-# Write new_prompts to output.txt
-with open("output.txt", "w") as f:
-    for prompt in new_prompts:
-        f.write(prompt + "\n")
-print(f"Saved {len(new_prompts)} prompts to output.txt")
+
+def read_prompts():
+    context = "Hi " * 1000
+    context2 = "Hey " * 500
+    return [
+        context + "Hello, my name is",
+        context + "The capital of France is",
+        context2 + "Your name is",
+        context2 + "The capital of China is",
+    ]
+
+
+def main():
+    prompts = read_prompts()
+
+    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
+
+    llm = LLM(
+        model="meta-llama/Llama-3.2-1B-Instruct",
+        enforce_eager=True,
+        gpu_memory_utilization=0.8,
+        kv_transfer_config=KVTransferConfig(
+            kv_connector="SharedStorageConnector",
+            kv_role="kv_both",
+            kv_connector_extra_config={"shared_storage_path": "local_storage"},
+        ),
+    )  # , max_model_len=2048, max_num_batched_tokens=2048)
+
+    # 1ST generation (prefill instance)
+    outputs = llm.generate(
+        prompts,
+        sampling_params,
+    )
+
+    new_prompts = []
+    print("-" * 30)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        new_prompts.append(prompt + generated_text)
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 30)
+
+    # Write new_prompts to output.txt
+    with open("output.txt", "w") as f:
+        for prompt in new_prompts:
+            f.write(prompt + "\n")
+    print(f"Saved {len(new_prompts)} prompts to output.txt")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/disaggregated_prefill.py b/examples/offline_inference/disaggregated_prefill.py
index bb6fdd48f79e..3ccab0dcd6d3 100644
--- a/examples/offline_inference/disaggregated_prefill.py
+++ b/examples/offline_inference/disaggregated_prefill.py
@@ -4,6 +4,7 @@
 We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
 and then transfer the KV cache between them.
 """
+
 import os
 import time
 from multiprocessing import Event, Process
@@ -32,17 +33,21 @@ def run_prefill(prefill_done):
     # This instance is the prefill node (kv_producer, rank 0).
     # The number of parallel instances for KV cache transfer is set to 2,
     # as required for PyNcclConnector.
-    ktc = KVTransferConfig(kv_connector="PyNcclConnector",
-                           kv_role="kv_producer",
-                           kv_rank=0,
-                           kv_parallel_size=2)
+    ktc = KVTransferConfig(
+        kv_connector="PyNcclConnector",
+        kv_role="kv_producer",
+        kv_rank=0,
+        kv_parallel_size=2,
+    )
 
     # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
     # memory. You may need to adjust the value to fit your GPU.
-    llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
-              kv_transfer_config=ktc,
-              max_model_len=2000,
-              gpu_memory_utilization=0.8)
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+        kv_transfer_config=ktc,
+        max_model_len=2000,
+        gpu_memory_utilization=0.8,
+    )
 
     llm.generate(prompts, sampling_params)
     print("Prefill node is finished.")
@@ -72,17 +77,21 @@ def run_decode(prefill_done):
     # This instance is the decode node (kv_consumer, rank 1).
     # The number of parallel instances for KV cache transfer is set to 2,
     # as required for PyNcclConnector.
-    ktc = KVTransferConfig(kv_connector="PyNcclConnector",
-                           kv_role="kv_consumer",
-                           kv_rank=1,
-                           kv_parallel_size=2)
+    ktc = KVTransferConfig(
+        kv_connector="PyNcclConnector",
+        kv_role="kv_consumer",
+        kv_rank=1,
+        kv_parallel_size=2,
+    )
 
     # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
     # memory. You may need to adjust the value to fit your GPU.
-    llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
-              kv_transfer_config=ktc,
-              max_model_len=2000,
-              gpu_memory_utilization=0.8)
+    llm = LLM(
+        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
+        kv_transfer_config=ktc,
+        max_model_len=2000,
+        gpu_memory_utilization=0.8,
+    )
 
     # Wait for the producer to start the pipe
     print("Waiting for prefill node to finish...")
@@ -99,8 +108,8 @@ def run_decode(prefill_done):
 
 def main():
     prefill_done = Event()
-    prefill_process = Process(target=run_prefill, args=(prefill_done, ))
-    decode_process = Process(target=run_decode, args=(prefill_done, ))
+    prefill_process = Process(target=run_prefill, args=(prefill_done,))
+    decode_process = Process(target=run_decode, args=(prefill_done,))
 
     # Start prefill node
     prefill_process.start()
diff --git a/examples/offline_inference/eagle.py b/examples/offline_inference/eagle.py
index 615f67e9f8d8..606ce7799a88 100644
--- a/examples/offline_inference/eagle.py
+++ b/examples/offline_inference/eagle.py
@@ -6,6 +6,7 @@
 from transformers import AutoTokenizer
 
 from vllm import LLM, SamplingParams
+from vllm.v1.metrics.reader import Counter, Vector
 
 
 def load_prompts(dataset_path, num_prompts):
@@ -20,9 +21,7 @@ def load_prompts(dataset_path, num_prompts):
             print(f"Error reading dataset: {e}")
             return []
     else:
-        prompts = [
-            "The future of AI is", "The president of the United States is"
-        ]
+        prompts = ["The future of AI is", "The president of the United States is"]
 
     return prompts[:num_prompts]
 
@@ -33,34 +32,32 @@ def parse_args():
         "--dataset",
         type=str,
         default="./examples/data/gsm8k.jsonl",
-        help="downloaded from the eagle repo " \
-        "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
+        help="downloaded from the eagle repo "
+        "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/",
+    )
+    parser.add_argument(
+        "--method", type=str, default="eagle", choices=["eagle", "eagle3"]
     )
-    parser.add_argument("--method",
-                        type=str,
-                        default='eagle',
-                        choices=['eagle', 'eagle3'])
     parser.add_argument("--max_num_seqs", type=int, default=8)
     parser.add_argument("--num_prompts", type=int, default=80)
     parser.add_argument("--num_spec_tokens", type=int, default=2)
     parser.add_argument("--tp", type=int, default=1)
     parser.add_argument("--draft_tp", type=int, default=1)
-    parser.add_argument("--enforce_eager", action='store_true')
-    parser.add_argument("--enable_chunked_prefill", action='store_true')
+    parser.add_argument("--enforce_eager", action="store_true")
+    parser.add_argument("--enable_chunked_prefill", action="store_true")
     parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
     parser.add_argument("--temp", type=float, default=0)
     return parser.parse_args()
 
 
 def main():
-
     args = parse_args()
 
     model_dir = "meta-llama/Llama-3.1-8B-Instruct"
 
-    if args.method == 'eagle':
+    if args.method == "eagle":
         eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
-    elif args.method == 'eagle3':
+    elif args.method == "eagle3":
         eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
     else:
         raise ValueError(f"unknown method: {args.method}")
@@ -72,11 +69,9 @@ def main():
     prompts = load_prompts(args.dataset, args.num_prompts)
 
     prompt_ids = [
-        tokenizer.apply_chat_template([{
-            "role": "user",
-            "content": prompt
-        }],
-                                      add_generation_prompt=True)
+        tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}], add_generation_prompt=True
+        )
         for prompt in prompts
     ]
 
@@ -102,8 +97,7 @@ def main():
 
     sampling_params = SamplingParams(temperature=args.temp, max_tokens=256)
 
-    outputs = llm.generate(prompt_token_ids=prompt_ids,
-                           sampling_params=sampling_params)
+    outputs = llm.generate(prompt_token_ids=prompt_ids, sampling_params=sampling_params)
 
     # print the generated text
     for output in outputs:
@@ -112,27 +106,33 @@ def main():
         print(f"generated text: {output.outputs[0].text}")
         print("-" * 50)
 
-    if not hasattr(outputs, "metrics") or outputs.metrics is None:
+    try:
+        metrics = llm.get_metrics()
+    except AssertionError:
+        print("Metrics are not supported in the V0 engine.")
         return
 
-    # calculate the average number of accepted tokens per forward pass, +1 is
-    # to account for the token from the target model that's always going to be
-    # accepted
-    acceptance_counts = [0] * (args.num_spec_tokens + 1)
-    for output in outputs:
-        for step, count in enumerate(
-                output.metrics.spec_token_acceptance_counts):
-            acceptance_counts[step] += count
+    num_drafts = num_accepted = 0
+    acceptance_counts = [0] * args.num_spec_tokens
+    for metric in metrics:
+        if metric.name == "vllm:spec_decode_num_drafts":
+            assert isinstance(metric, Counter)
+            num_drafts += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens":
+            assert isinstance(metric, Counter)
+            num_accepted += metric.value
+        elif metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
+            assert isinstance(metric, Vector)
+            for pos in range(len(metric.values)):
+                acceptance_counts[pos] += metric.values[pos]
 
     print("-" * 50)
-    print(f"mean acceptance length (including bonus tokens): \
-        {1 + (sum(acceptance_counts) / acceptance_counts[0]):.2f}")
+    print(f"mean acceptance length: {1 + (num_accepted / num_drafts):.2f}")
     print("-" * 50)
 
     # print acceptance at each token position
     for i in range(len(acceptance_counts)):
-        print(f"acceptance at token {i}:"
-              f"{acceptance_counts[i] / (acceptance_counts[0]):.2f}")
+        print(f"acceptance at token {i}:{acceptance_counts[i] / num_drafts:.2f}")
 
 
 if __name__ == "__main__":
diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/embed_jina_embeddings_v3.py
index b347ddbf3197..23f60c431fc2 100644
--- a/examples/offline_inference/embed_jina_embeddings_v3.py
+++ b/examples/offline_inference/embed_jina_embeddings_v3.py
@@ -10,9 +10,9 @@ def parse_args():
     parser = FlexibleArgumentParser()
     parser = EngineArgs.add_cli_args(parser)
     # Set example specific arguments
-    parser.set_defaults(model="jinaai/jina-embeddings-v3",
-                        task="embed",
-                        trust_remote_code=True)
+    parser.set_defaults(
+        model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
+    )
     return parser.parse_args()
 
 
@@ -41,11 +41,14 @@ def main(args: Namespace):
     print("-" * 60)
     for prompt, output in zip(prompts, outputs):
         embeds = output.outputs.embedding
-        embeds_trimmed = ((str(embeds[:16])[:-1] +
-                           ", ...]") if len(embeds) > 16 else embeds)
-        print(f"Prompt: {prompt!r} \n"
-              f"Embeddings for text matching: {embeds_trimmed} "
-              f"(size={len(embeds)})")
+        embeds_trimmed = (
+            (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
+        )
+        print(
+            f"Prompt: {prompt!r} \n"
+            f"Embeddings for text matching: {embeds_trimmed} "
+            f"(size={len(embeds)})"
+        )
         print("-" * 60)
 
 
diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/embed_matryoshka_fy.py
index 7a6cb02556d9..59c0592ae9e2 100644
--- a/examples/offline_inference/embed_matryoshka_fy.py
+++ b/examples/offline_inference/embed_matryoshka_fy.py
@@ -10,9 +10,9 @@ def parse_args():
     parser = FlexibleArgumentParser()
     parser = EngineArgs.add_cli_args(parser)
     # Set example specific arguments
-    parser.set_defaults(model="jinaai/jina-embeddings-v3",
-                        task="embed",
-                        trust_remote_code=True)
+    parser.set_defaults(
+        model="jinaai/jina-embeddings-v3", task="embed", trust_remote_code=True
+    )
     return parser.parse_args()
 
 
@@ -39,11 +39,10 @@ def main(args: Namespace):
     print("-" * 60)
     for prompt, output in zip(prompts, outputs):
         embeds = output.outputs.embedding
-        embeds_trimmed = ((str(embeds[:16])[:-1] +
-                           ", ...]") if len(embeds) > 16 else embeds)
-        print(f"Prompt: {prompt!r} \n"
-              f"Embeddings: {embeds_trimmed} "
-              f"(size={len(embeds)})")
+        embeds_trimmed = (
+            (str(embeds[:16])[:-1] + ", ...]") if len(embeds) > 16 else embeds
+        )
+        print(f"Prompt: {prompt!r} \nEmbeddings: {embeds_trimmed} (size={len(embeds)})")
         print("-" * 60)
 
 
diff --git a/examples/offline_inference/encoder_decoder.py b/examples/offline_inference/encoder_decoder.py
index c4916e00f473..83dd1f667eb5 100644
--- a/examples/offline_inference/encoder_decoder.py
+++ b/examples/offline_inference/encoder_decoder.py
@@ -1,12 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
-'''
+"""
 Demonstrate prompting of text-to-text
 encoder/decoder models, specifically BART
-'''
+"""
 
 from vllm import LLM, SamplingParams
-from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
-                         TokensPrompt, zip_enc_dec_prompts)
+from vllm.inputs import (
+    ExplicitEncoderDecoderPrompt,
+    TextPrompt,
+    TokensPrompt,
+    zip_enc_dec_prompts,
+)
 
 
 def create_prompts(tokenizer):
@@ -18,8 +22,9 @@ def create_prompts(tokenizer):
     # - Helpers for building prompts
     text_prompt_raw = "Hello, my name is"
     text_prompt = TextPrompt(prompt="The president of the United States is")
-    tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode(
-        prompt="The capital of France is"))
+    tokens_prompt = TokensPrompt(
+        prompt_token_ids=tokenizer.encode(prompt="The capital of France is")
+    )
     # - Pass a single prompt to encoder/decoder model
     #   (implicitly encoder input prompt);
     #   decoder input prompt is assumed to be None
@@ -57,14 +62,19 @@ def create_prompts(tokenizer):
     #   decoder prompts together into a list of ExplicitEncoderDecoderPrompt
     #   instances
     zipped_prompt_list = zip_enc_dec_prompts(
-        ['An encoder prompt', 'Another encoder prompt'],
-        ['A decoder prompt', 'Another decoder prompt'])
+        ["An encoder prompt", "Another encoder prompt"],
+        ["A decoder prompt", "Another decoder prompt"],
+    )
 
     # - Let's put all of the above example prompts together into one list
     #   which we will pass to the encoder/decoder LLM.
     return [
-        single_text_prompt_raw, single_text_prompt, single_tokens_prompt,
-        enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3
+        single_text_prompt_raw,
+        single_text_prompt,
+        single_tokens_prompt,
+        enc_dec_prompt1,
+        enc_dec_prompt2,
+        enc_dec_prompt3,
     ] + zipped_prompt_list
 
 
@@ -85,10 +95,12 @@ def print_outputs(outputs):
         prompt = output.prompt
         encoder_prompt = output.encoder_prompt
         generated_text = output.outputs[0].text
-        print(f"Output {i+1}:")
-        print(f"Encoder prompt: {encoder_prompt!r}\n"
-              f"Decoder prompt: {prompt!r}\n"
-              f"Generated text: {generated_text!r}")
+        print(f"Output {i + 1}:")
+        print(
+            f"Encoder prompt: {encoder_prompt!r}\n"
+            f"Decoder prompt: {prompt!r}\n"
+            f"Generated text: {generated_text!r}"
+        )
         print("-" * 50)
 
 
diff --git a/examples/offline_inference/encoder_decoder_multimodal.py b/examples/offline_inference/encoder_decoder_multimodal.py
index 2883c37ca236..ae3737e37594 100644
--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -3,6 +3,7 @@
 This example shows how to use vLLM for running offline inference with
 the explicit/implicit prompt format on enc-dec LMMs for text generation.
 """
+
 import time
 from collections.abc import Sequence
 from dataclasses import asdict
@@ -30,18 +31,14 @@ def run_florence2():
     )
 
     prompts = [
-        {   # implicit prompt with task token
+        {  # implicit prompt with task token
             "prompt": "<DETAILED_CAPTION>",
-            "multi_modal_data": {
-                "image": ImageAsset("stop_sign").pil_image
-            },
+            "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
         },
-        {   # explicit encoder/decoder prompt
+        {  # explicit encoder/decoder prompt
             "encoder_prompt": {
                 "prompt": "Describe in detail what is shown in the image.",
-                "multi_modal_data": {
-                    "image": ImageAsset("cherry_blossom").pil_image
-                },
+                "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
             },
             "decoder_prompt": "",
         },
@@ -63,20 +60,20 @@ def run_mllama():
     )
 
     prompts = [
-        {   # Implicit prompt
-            "prompt": "<|image|><|begin_of_text|>What is the content of this image?",   # noqa: E501
+        {  # Implicit prompt
+            "prompt": "<|image|><|begin_of_text|>What is the content of this image?",  # noqa: E501
             "multi_modal_data": {
                 "image": ImageAsset("stop_sign").pil_image,
             },
         },
-        {   # Explicit prompt
+        {  # Explicit prompt
             "encoder_prompt": {
                 "prompt": "<|image|>",
                 "multi_modal_data": {
                     "image": ImageAsset("stop_sign").pil_image,
                 },
             },
-            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",   # noqa: E501
+            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
         },
     ]
 
@@ -96,13 +93,13 @@ def run_whisper():
     )
 
     prompts = [
-        {   # Test implicit prompt
+        {  # Test implicit prompt
             "prompt": "<|startoftranscript|>",
             "multi_modal_data": {
                 "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
             },
         },
-        {   # Test explicit encoder/decoder prompt
+        {  # Test explicit encoder/decoder prompt
             "encoder_prompt": {
                 "prompt": "",
                 "multi_modal_data": {
@@ -110,7 +107,7 @@ def run_whisper():
                 },
             },
             "decoder_prompt": "<|startoftranscript|>",
-        }
+        },
     ]
 
     return ModelRequestData(
@@ -128,18 +125,23 @@ def run_whisper():
 
 def parse_args():
     parser = FlexibleArgumentParser(
-        description='Demo on using vLLM for offline inference with '
-        'vision language models for text generation')
-    parser.add_argument('--model-type',
-                        '-m',
-                        type=str,
-                        default="mllama",
-                        choices=model_example_map.keys(),
-                        help='Huggingface "model_type".')
-    parser.add_argument("--seed",
-                        type=int,
-                        default=None,
-                        help="Set the seed when initializing `vllm.LLM`.")
+        description="Demo on using vLLM for offline inference with "
+        "vision language models for text generation"
+    )
+    parser.add_argument(
+        "--model-type",
+        "-m",
+        type=str,
+        default="mllama",
+        choices=model_example_map.keys(),
+        help='Huggingface "model_type".',
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="Set the seed when initializing `vllm.LLM`.",
+    )
     return parser.parse_args()
 
 
@@ -153,7 +155,8 @@ def main(args):
     # Disable other modalities to save memory
     default_limits = {"image": 0, "video": 0, "audio": 0}
     req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
-        req_data.engine_args.limit_mm_per_prompt or {})
+        req_data.engine_args.limit_mm_per_prompt or {}
+    )
 
     engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
     llm = LLM(**engine_args)
@@ -179,8 +182,7 @@ def main(args):
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text
-        print(f"Decoder prompt: {prompt!r}, "
-              f"Generated text: {generated_text!r}")
+        print(f"Decoder prompt: {prompt!r}, Generated text: {generated_text!r}")
 
     duration = time.time() - start
 
diff --git a/examples/offline_inference/llm_engine_example.py b/examples/offline_inference/llm_engine_example.py
index d84cd9ee9f52..5d5e55a83d22 100644
--- a/examples/offline_inference/llm_engine_example.py
+++ b/examples/offline_inference/llm_engine_example.py
@@ -3,6 +3,7 @@
 This file demonstrates using the `LLMEngine`
 for processing prompts with various sampling parameters.
 """
+
 import argparse
 
 from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
@@ -12,24 +13,26 @@
 def create_test_prompts() -> list[tuple[str, SamplingParams]]:
     """Create a list of test prompts with their sampling parameters."""
     return [
-        ("A robot may not injure a human being",
-         SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)),
-        ("To be or not to be,",
-         SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
-        ("What is the meaning of life?",
-         SamplingParams(n=2,
-                        temperature=0.8,
-                        top_p=0.95,
-                        frequency_penalty=0.1)),
+        (
+            "A robot may not injure a human being",
+            SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1),
+        ),
+        (
+            "To be or not to be,",
+            SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2),
+        ),
+        (
+            "What is the meaning of life?",
+            SamplingParams(n=2, temperature=0.8, top_p=0.95, frequency_penalty=0.1),
+        ),
     ]
 
 
-def process_requests(engine: LLMEngine,
-                     test_prompts: list[tuple[str, SamplingParams]]):
+def process_requests(engine: LLMEngine, test_prompts: list[tuple[str, SamplingParams]]):
     """Continuously process a list of prompts and handle the outputs."""
     request_id = 0
 
-    print('-' * 50)
+    print("-" * 50)
     while test_prompts or engine.has_unfinished_requests():
         if test_prompts:
             prompt, sampling_params = test_prompts.pop(0)
@@ -41,7 +44,7 @@ def process_requests(engine: LLMEngine,
         for request_output in request_outputs:
             if request_output.finished:
                 print(request_output)
-                print('-' * 50)
+                print("-" * 50)
 
 
 def initialize_engine(args: argparse.Namespace) -> LLMEngine:
@@ -52,7 +55,8 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine:
 
 def parse_args():
     parser = FlexibleArgumentParser(
-        description='Demo on using the LLMEngine class directly')
+        description="Demo on using the LLMEngine class directly"
+    )
     parser = EngineArgs.add_cli_args(parser)
     return parser.parse_args()
 
@@ -64,6 +68,6 @@ def main(args: argparse.Namespace):
     process_requests(engine, test_prompts)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     args = parse_args()
     main(args)
diff --git a/examples/offline_inference/load_sharded_state.py b/examples/offline_inference/load_sharded_state.py
index 7e90d5d25e29..5bb2327a3f83 100644
--- a/examples/offline_inference/load_sharded_state.py
+++ b/examples/offline_inference/load_sharded_state.py
@@ -36,22 +36,21 @@ def parse_args():
     parser.set_defaults(load_format="sharded_state")
 
     # Add validation arguments
-    parser.add_argument("--prompt",
-                        type=str,
-                        default="Hello, world!",
-                        help="Prompt for validation")
-    parser.add_argument("--max-tokens",
-                        type=int,
-                        default=100,
-                        help="Maximum number of tokens to generate")
-    parser.add_argument("--temperature",
-                        type=float,
-                        default=0.7,
-                        help="Sampling temperature")
-    parser.add_argument("--top-p",
-                        type=float,
-                        default=1.0,
-                        help="Top-p sampling parameter")
+    parser.add_argument(
+        "--prompt", type=str, default="Hello, world!", help="Prompt for validation"
+    )
+    parser.add_argument(
+        "--max-tokens",
+        type=int,
+        default=100,
+        help="Maximum number of tokens to generate",
+    )
+    parser.add_argument(
+        "--temperature", type=float, default=0.7, help="Sampling temperature"
+    )
+    parser.add_argument(
+        "--top-p", type=float, default=1.0, help="Top-p sampling parameter"
+    )
 
     return parser.parse_args()
 
@@ -60,8 +59,9 @@ def main():
     args = parse_args()
     engine_args = EngineArgs.from_cli_args(args)
 
-    print(f"Loading model from {engine_args.model} "
-          f"using format {engine_args.load_format}")
+    print(
+        f"Loading model from {engine_args.model} using format {engine_args.load_format}"
+    )
     print(f"Tensor parallel size: {engine_args.tensor_parallel_size}")
 
     # Load the model using engine args
@@ -90,4 +90,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/examples/offline_inference/lora_with_quantization_inference.py b/examples/offline_inference/lora_with_quantization_inference.py
index b6608ec6e958..33c660015ba7 100644
--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -17,50 +17,55 @@
 
 
 def create_test_prompts(
-        lora_path: str
+    lora_path: str,
 ) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
     return [
         # this is an example of using quantization without LoRA
-        ("My name is",
-         SamplingParams(temperature=0.0,
-                        logprobs=1,
-                        prompt_logprobs=1,
-                        max_tokens=128), None),
+        (
+            "My name is",
+            SamplingParams(
+                temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
+            ),
+            None,
+        ),
         # the next three examples use quantization with LoRA
-        ("my name is",
-         SamplingParams(temperature=0.0,
-                        logprobs=1,
-                        prompt_logprobs=1,
-                        max_tokens=128),
-         LoRARequest("lora-test-1", 1, lora_path)),
-        ("The capital of USA is",
-         SamplingParams(temperature=0.0,
-                        logprobs=1,
-                        prompt_logprobs=1,
-                        max_tokens=128),
-         LoRARequest("lora-test-2", 1, lora_path)),
-        ("The capital of France is",
-         SamplingParams(temperature=0.0,
-                        logprobs=1,
-                        prompt_logprobs=1,
-                        max_tokens=128),
-         LoRARequest("lora-test-3", 1, lora_path)),
+        (
+            "my name is",
+            SamplingParams(
+                temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
+            ),
+            LoRARequest("lora-test-1", 1, lora_path),
+        ),
+        (
+            "The capital of USA is",
+            SamplingParams(
+                temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
+            ),
+            LoRARequest("lora-test-2", 1, lora_path),
+        ),
+        (
+            "The capital of France is",
+            SamplingParams(
+                temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
+            ),
+            LoRARequest("lora-test-3", 1, lora_path),
+        ),
     ]
 
 
-def process_requests(engine: LLMEngine,
-                     test_prompts: list[tuple[str, SamplingParams,
-                                              Optional[LoRARequest]]]):
+def process_requests(
+    engine: LLMEngine,
+    test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
+):
     """Continuously process a list of prompts and handle the outputs."""
     request_id = 0
 
     while test_prompts or engine.has_unfinished_requests():
         if test_prompts:
             prompt, sampling_params, lora_request = test_prompts.pop(0)
-            engine.add_request(str(request_id),
-                               prompt,
-                               sampling_params,
-                               lora_request=lora_request)
+            engine.add_request(
+                str(request_id), prompt, sampling_params, lora_request=lora_request
+            )
             request_id += 1
 
         request_outputs: list[RequestOutput] = engine.step()
@@ -71,15 +76,18 @@ def process_requests(engine: LLMEngine,
                 print(f"Output: {request_output.outputs[0].text}")
 
 
-def initialize_engine(model: str, quantization: str,
-                      lora_repo: Optional[str]) -> LLMEngine:
+def initialize_engine(
+    model: str, quantization: str, lora_repo: Optional[str]
+) -> LLMEngine:
     """Initialize the LLMEngine."""
 
-    engine_args = EngineArgs(model=model,
-                             quantization=quantization,
-                             enable_lora=True,
-                             max_lora_rank=64,
-                             max_loras=4)
+    engine_args = EngineArgs(
+        model=model,
+        quantization=quantization,
+        enable_lora=True,
+        max_lora_rank=64,
+        max_loras=4,
+    )
     return LLMEngine.from_engine_args(engine_args)
 
 
@@ -90,32 +98,30 @@ def main():
         # QLoRA (https://arxiv.org/abs/2305.14314)
         {
             "name": "qlora_inference_example",
-            'model': "huggyllama/llama-7b",
-            'quantization': "bitsandbytes",
-            'lora_repo': 'timdettmers/qlora-flan-7b'
+            "model": "huggyllama/llama-7b",
+            "quantization": "bitsandbytes",
+            "lora_repo": "timdettmers/qlora-flan-7b",
         },
         {
             "name": "AWQ_inference_with_lora_example",
-            'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ',
-            'quantization': "awq",
-            'lora_repo': 'jashing/tinyllama-colorist-lora'
+            "model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
+            "quantization": "awq",
+            "lora_repo": "jashing/tinyllama-colorist-lora",
         },
         {
             "name": "GPTQ_inference_with_lora_example",
-            'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ',
-            'quantization': "gptq",
-            'lora_repo': 'jashing/tinyllama-colorist-lora'
-        }
+            "model": "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
+            "quantization": "gptq",
+            "lora_repo": "jashing/tinyllama-colorist-lora",
+        },
     ]
 
     for test_config in test_configs:
-        print(
-            f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~"
+        print(f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~")
+        engine = initialize_engine(
+            test_config["model"], test_config["quantization"], test_config["lora_repo"]
         )
-        engine = initialize_engine(test_config['model'],
-                                   test_config['quantization'],
-                                   test_config['lora_repo'])
-        lora_path = snapshot_download(repo_id=test_config['lora_repo'])
+        lora_path = snapshot_download(repo_id=test_config["lora_repo"])
         test_prompts = create_test_prompts(lora_path)
         process_requests(engine, test_prompts)
 
@@ -125,5 +131,5 @@ def main():
         torch.cuda.empty_cache()
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/examples/offline_inference/metrics.py b/examples/offline_inference/metrics.py
new file mode 100644
index 000000000000..7927f758cb57
--- /dev/null
+++ b/examples/offline_inference/metrics.py
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm import LLM, SamplingParams
+from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Vector
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+
+def main():
+    # Create an LLM.
+    llm = LLM(model="facebook/opt-125m", disable_log_stats=False)
+
+    # Generate texts from the prompts.
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
+    print("-" * 50)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
+        print("-" * 50)
+
+    # Dump all metrics
+    for metric in llm.get_metrics():
+        if isinstance(metric, Gauge):
+            print(f"{metric.name} (gauge) = {metric.value}")
+        elif isinstance(metric, Counter):
+            print(f"{metric.name} (counter) = {metric.value}")
+        elif isinstance(metric, Vector):
+            print(f"{metric.name} (vector) = {metric.values}")
+        elif isinstance(metric, Histogram):
+            print(f"{metric.name} (histogram)")
+            print(f"    sum = {metric.sum}")
+            print(f"    count = {metric.count}")
+            for bucket_le, value in metric.buckets.items():
+                print(f"    {bucket_le} = {value}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/mistral-small.py b/examples/offline_inference/mistral-small.py
index 37c3181dc5fa..98fef2648f6b 100644
--- a/examples/offline_inference/mistral-small.py
+++ b/examples/offline_inference/mistral-small.py
@@ -74,19 +74,10 @@ def run_simple_demo(args: argparse.Namespace):
 
     messages = [
         {
-            "role":
-            "user",
+            "role": "user",
             "content": [
-                {
-                    "type": "text",
-                    "text": prompt
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
-                    }
-                },
+                {"type": "text", "text": prompt},
+                {"type": "image_url", "image_url": {"url": image_url}},
             ],
         },
     ]
@@ -121,25 +112,11 @@ def run_advanced_demo(args: argparse.Namespace):
 
     messages = [
         {
-            "role":
-            "user",
+            "role": "user",
             "content": [
-                {
-                    "type": "text",
-                    "text": prompt
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": url_1
-                    }
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": url_2
-                    }
-                },
+                {"type": "text", "text": prompt},
+                {"type": "image_url", "image_url": {"url": url_1}},
+                {"type": "image_url", "image_url": {"url": url_2}},
             ],
         },
         {
@@ -153,12 +130,7 @@ def run_advanced_demo(args: argparse.Namespace):
         {
             "role": "user",
             "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": url_3
-                    }
-                },
+                {"type": "image_url", "image_url": {"url": url_3}},
             ],
         },
     ]
@@ -171,7 +143,8 @@ def run_advanced_demo(args: argparse.Namespace):
 
 def parse_args():
     parser = argparse.ArgumentParser(
-        description="Run a demo in simple or advanced mode.")
+        description="Run a demo in simple or advanced mode."
+    )
 
     parser.add_argument(
         "mode",
@@ -179,15 +152,18 @@ def parse_args():
         help="Specify the demo mode: 'simple' or 'advanced'",
     )
 
-    parser.add_argument('--format',
-                        choices=["mistral", "hf"],
-                        default="mistral",
-                        help='Specify the format of the model to load.')
+    parser.add_argument(
+        "--format",
+        choices=["mistral", "hf"],
+        default="mistral",
+        help="Specify the format of the model to load.",
+    )
 
     parser.add_argument(
-        '--disable-mm-preprocessor-cache',
-        action='store_true',
-        help='If True, disables caching of multi-modal preprocessor/mapper.')
+        "--disable-mm-preprocessor-cache",
+        action="store_true",
+        help="If True, disables caching of multi-modal preprocessor/mapper.",
+    )
     return parser.parse_args()
 
 
diff --git a/examples/offline_inference/mlpspeculator.py b/examples/offline_inference/mlpspeculator.py
index 53c58a76d9dc..b750397f45b8 100644
--- a/examples/offline_inference/mlpspeculator.py
+++ b/examples/offline_inference/mlpspeculator.py
@@ -13,8 +13,9 @@
 from vllm import LLM, SamplingParams
 
 
-def time_generation(llm: LLM, prompts: list[str],
-                    sampling_params: SamplingParams, title: str):
+def time_generation(
+    llm: LLM, prompts: list[str], sampling_params: SamplingParams, title: str
+):
     # Generate texts from the prompts. The output is a list of RequestOutput
     # objects that contain the prompt, generated text, and other information.
     # Warmup first
@@ -25,8 +26,7 @@ def time_generation(llm: LLM, prompts: list[str],
     end = time.time()
     print("-" * 50)
     print(title)
-    print("time: ",
-          (end - start) / sum(len(o.outputs[0].token_ids) for o in outputs))
+    print("time: ", (end - start) / sum(len(o.outputs[0].token_ids) for o in outputs))
     # Print the outputs.
     for output in outputs:
         generated_text = output.outputs[0].text
@@ -38,7 +38,8 @@ def main():
     template = (
         "Below is an instruction that describes a task. Write a response "
         "that appropriately completes the request.\n\n### Instruction:\n{}"
-        "\n\n### Response:\n")
+        "\n\n### Response:\n"
+    )
 
     # Sample prompts.
     prompts = [
diff --git a/examples/offline_inference/multilora_inference.py b/examples/offline_inference/multilora_inference.py
index de409740292a..1fa2f16f82a8 100644
--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@@ -15,7 +15,7 @@
 
 
 def create_test_prompts(
-        lora_path: str
+    lora_path: str,
 ) -> list[tuple[str, SamplingParams, Optional[LoRARequest]]]:
     """Create a list of test prompts with their sampling parameters.
 
@@ -26,38 +26,49 @@ def create_test_prompts(
     first adapter have finished.
     """
     return [
-        ("A robot may not injure a human being",
-         SamplingParams(temperature=0.0,
-                        logprobs=1,
-                        prompt_logprobs=1,
-                        max_tokens=128), None),
-        ("To be or not to be,",
-         SamplingParams(temperature=0.8,
-                        top_k=5,
-                        presence_penalty=0.2,
-                        max_tokens=128), None),
+        (
+            "A robot may not injure a human being",
+            SamplingParams(
+                temperature=0.0, logprobs=1, prompt_logprobs=1, max_tokens=128
+            ),
+            None,
+        ),
+        (
+            "To be or not to be,",
+            SamplingParams(
+                temperature=0.8, top_k=5, presence_penalty=0.2, max_tokens=128
+            ),
+            None,
+        ),
         (
             "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
-            SamplingParams(temperature=0.0,
-                           logprobs=1,
-                           prompt_logprobs=1,
-                           max_tokens=128,
-                           stop_token_ids=[32003]),
-            LoRARequest("sql-lora", 1, lora_path)),
+            SamplingParams(
+                temperature=0.0,
+                logprobs=1,
+                prompt_logprobs=1,
+                max_tokens=128,
+                stop_token_ids=[32003],
+            ),
+            LoRARequest("sql-lora", 1, lora_path),
+        ),
         (
             "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
-            SamplingParams(temperature=0.0,
-                           logprobs=1,
-                           prompt_logprobs=1,
-                           max_tokens=128,
-                           stop_token_ids=[32003]),
-            LoRARequest("sql-lora2", 2, lora_path)),
+            SamplingParams(
+                temperature=0.0,
+                logprobs=1,
+                prompt_logprobs=1,
+                max_tokens=128,
+                stop_token_ids=[32003],
+            ),
+            LoRARequest("sql-lora2", 2, lora_path),
+        ),
     ]
 
 
-def process_requests(engine: LLMEngine,
-                     test_prompts: list[tuple[str, SamplingParams,
-                                              Optional[LoRARequest]]]):
+def process_requests(
+    engine: LLMEngine,
+    test_prompts: list[tuple[str, SamplingParams, Optional[LoRARequest]]],
+):
     """Continuously process a list of prompts and handle the outputs."""
     request_id = 0
 
@@ -65,10 +76,9 @@ def process_requests(engine: LLMEngine,
     while test_prompts or engine.has_unfinished_requests():
         if test_prompts:
             prompt, sampling_params, lora_request = test_prompts.pop(0)
-            engine.add_request(str(request_id),
-                               prompt,
-                               sampling_params,
-                               lora_request=lora_request)
+            engine.add_request(
+                str(request_id), prompt, sampling_params, lora_request=lora_request
+            )
             request_id += 1
 
         request_outputs: list[RequestOutput] = engine.step()
@@ -88,12 +98,14 @@ def initialize_engine() -> LLMEngine:
     #   numbers will cause higher memory usage. If you know that all LoRAs will
     #   use the same rank, it is recommended to set this as low as possible.
     # max_cpu_loras: controls the size of the CPU LoRA cache.
-    engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf",
-                             enable_lora=True,
-                             max_loras=1,
-                             max_lora_rank=8,
-                             max_cpu_loras=2,
-                             max_num_seqs=256)
+    engine_args = EngineArgs(
+        model="meta-llama/Llama-2-7b-hf",
+        enable_lora=True,
+        max_loras=1,
+        max_lora_rank=8,
+        max_cpu_loras=2,
+        max_num_seqs=256,
+    )
     return LLMEngine.from_engine_args(engine_args)
 
 
@@ -105,5 +117,5 @@ def main():
     process_requests(engine, test_prompts)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/examples/offline_inference/neuron.py b/examples/offline_inference/neuron.py
index 5906c7b2c6b3..f2d7698f22d7 100644
--- a/examples/offline_inference/neuron.py
+++ b/examples/offline_inference/neuron.py
@@ -30,7 +30,8 @@ def main():
         # The device argument can be either unspecified for automated detection,
         # or explicitly assigned.
         device="neuron",
-        tensor_parallel_size=2)
+        tensor_parallel_size=2,
+    )
     # Generate texts from the prompts. The output is a list of RequestOutput objects
     # that contain the prompt, generated text, and other information.
     outputs = llm.generate(prompts, sampling_params)
diff --git a/examples/offline_inference/neuron_eagle.py b/examples/offline_inference/neuron_eagle.py
index 4f63f1a2fb3c..5d7fb819d347 100644
--- a/examples/offline_inference/neuron_eagle.py
+++ b/examples/offline_inference/neuron_eagle.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """
-This example shows how to run offline inference with an EAGLE speculative 
+This example shows how to run offline inference with an EAGLE speculative
 decoding model on neuron. To use EAGLE speculative decoding, you must use
 a draft model that is specifically fine-tuned for EAGLE speculation.
 Additionally, to use EAGLE with NxD Inference, the draft model must include
@@ -15,40 +15,46 @@
     "What is annapurna labs?",
 ]
 
-# Create a sampling params object.
-sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)
-
-# Create an LLM.
-llm = LLM(
-    model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
-    speculative_config={
-        "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
-        "num_speculative_tokens": 5,
-        "max_model_len": 2048
-    },
-    max_num_seqs=4,
-    # The max_model_len and block_size arguments are required to be same as
-    # max sequence length when targeting neuron device.
-    # Currently, this is a known limitation in continuous batching support
-    # in neuronx-distributed-inference.
-    max_model_len=2048,
-    block_size=2048,
-    # The device can be automatically detected when AWS Neuron SDK is installed.
-    # The device argument can be either unspecified for automated detection,
-    # or explicitly assigned.
-    device="neuron",
-    tensor_parallel_size=32,
-    override_neuron_config={
-        "enable_eagle_speculation": True,
-        "enable_fused_speculation": True
-    },
-)
-
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}")
+
+def main():
+    # Create a sampling params object.
+    sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)
+
+    # Create an LLM.
+    llm = LLM(
+        model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
+        speculative_config={
+            "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
+            "num_speculative_tokens": 5,
+            "max_model_len": 2048,
+        },
+        max_num_seqs=4,
+        # The max_model_len and block_size arguments are required to be same as
+        # max sequence length when targeting neuron device.
+        # Currently, this is a known limitation in continuous batching support
+        # in neuronx-distributed-inference.
+        max_model_len=2048,
+        block_size=2048,
+        # The device can be automatically detected when AWS Neuron SDK is installed.
+        # The device argument can be either unspecified for automated detection,
+        # or explicitly assigned.
+        device="neuron",
+        tensor_parallel_size=32,
+        override_neuron_config={
+            "enable_eagle_speculation": True,
+            "enable_fused_speculation": True,
+        },
+    )
+
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/neuron_int8_quantization.py b/examples/offline_inference/neuron_int8_quantization.py
index af21274a3a5b..ec38525b9daf 100644
--- a/examples/offline_inference/neuron_int8_quantization.py
+++ b/examples/offline_inference/neuron_int8_quantization.py
@@ -5,12 +5,12 @@
 from vllm import LLM, SamplingParams
 
 # creates XLA hlo graphs for all the context length buckets.
-os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
+os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
 # creates XLA hlo graphs for all the token gen buckets.
-os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"
+os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
 # Quantizes neuron model weight to int8 ,
 # The default config for quantization is int8 dtype.
-os.environ['NEURON_QUANT_DTYPE'] = "s8"
+os.environ["NEURON_QUANT_DTYPE"] = "s8"
 
 # Sample prompts.
 prompts = [
@@ -44,7 +44,8 @@ def main():
         override_neuron_config={
             "cast_logits_dtype": "bfloat16",
         },
-        tensor_parallel_size=2)
+        tensor_parallel_size=2,
+    )
     # Generate texts from the prompts. The output is a list of RequestOutput objects
     # that contain the prompt, generated text, and other information.
     outputs = llm.generate(prompts, sampling_params)
diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py
index bef434bae5ba..ecacbab771c2 100644
--- a/examples/offline_inference/neuron_speculation.py
+++ b/examples/offline_inference/neuron_speculation.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """
-This example shows how to run offline inference with a speculative 
+This example shows how to run offline inference with a speculative
 decoding model on neuron.
 """
 
@@ -19,9 +19,9 @@
 def config_buckets():
     """Configure context length and token gen buckets."""
     # creates XLA hlo graphs for all the context length buckets.
-    os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
+    os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
     # creates XLA hlo graphs for all the token gen buckets.
-    os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"
+    os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
 
 
 def initialize_model():
@@ -31,7 +31,7 @@ def initialize_model():
         speculative_config={
             "model": "openlm-research/open_llama_3b",
             "num_speculative_tokens": 4,
-            "max_model_len": 2048
+            "max_model_len": 2048,
         },
         max_num_seqs=4,
         max_model_len=2048,
@@ -60,5 +60,5 @@ def main():
     process_requests(model, sampling_params)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/examples/offline_inference/prefix_caching.py b/examples/offline_inference/prefix_caching.py
index f0bec387d3a9..d3dad24956a6 100644
--- a/examples/offline_inference/prefix_caching.py
+++ b/examples/offline_inference/prefix_caching.py
@@ -16,7 +16,8 @@
     "teaching role. They have 5 years of previous teaching experience "
     "as an assistant teacher at a co-ed, public school with experience "
     "in middle school math teaching. Based on these information, fulfill "
-    "the following paragraph: ")
+    "the following paragraph: "
+)
 
 # Sample prompts.
 prompts = [
@@ -58,9 +59,11 @@ def main():
     cleanup_dist_env_and_memory()
 
     # Create an LLM with prefix caching enabled.
-    prefix_cached_llm = LLM(model="facebook/opt-125m",
-                            enable_prefix_caching=True,
-                            gpu_memory_utilization=0.4)
+    prefix_cached_llm = LLM(
+        model="facebook/opt-125m",
+        enable_prefix_caching=True,
+        gpu_memory_utilization=0.4,
+    )
 
     # Warmup so that the shared prompt's KV cache is computed.
     prefix_cached_llm.generate(generating_prompts[0], sampling_params)
@@ -81,10 +84,12 @@ def main():
         print("-" * 50)
 
     # Compare the results and display the speedup
-    generated_same = all([
-        regular_generated_texts[i] == cached_generated_texts[i]
-        for i in range(len(prompts))
-    ])
+    generated_same = all(
+        [
+            regular_generated_texts[i] == cached_generated_texts[i]
+            for i in range(len(prompts))
+        ]
+    )
     print(f"Generated answers are the same: {generated_same}")
 
 
diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py
index f97a1f32e621..21f7668adc86 100644
--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@@ -16,16 +16,17 @@
 Run the example:
 python prithvi_geospatial_mae.py
 
-""" # noqa: E501
+"""  # noqa: E501
+
 import argparse
 import datetime
 import os
-import re
 from typing import Union
 
 import albumentations
 import numpy as np
 import rasterio
+import regex as re
 import torch
 from einops import rearrange
 from terratorch.datamodules import Sen1Floods11NonGeoDataModule
@@ -110,77 +111,67 @@
 
 # Temporarily creating the "config.json" for the model.
 # This is going to disappear once the correct config.json is available on HF
-with open(os.path.join(os.path.dirname(__file__), "./model/config.json"),
-          'w') as config_file:
+with open(
+    os.path.join(os.path.dirname(__file__), "./model/config.json"), "w"
+) as config_file:
     config_file.write(model_config)
 
 datamodule_config = {
-    'bands': ['BLUE', 'GREEN', 'RED', 'NIR_NARROW', 'SWIR_1', 'SWIR_2'],
-    'batch_size':
-    16,
-    'constant_scale':
-    0.0001,
-    'data_root':
-    '/dccstor/geofm-finetuning/datasets/sen1floods11',
-    'drop_last':
-    True,
-    'no_data_replace':
-    0.0,
-    'no_label_replace':
-    -1,
-    'num_workers':
-    8,
-    'test_transform': [
-        albumentations.Resize(always_apply=False,
-                              height=448,
-                              interpolation=1,
-                              p=1,
-                              width=448),
-        albumentations.pytorch.ToTensorV2(transpose_mask=False,
-                                          always_apply=True,
-                                          p=1.0)
+    "bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"],
+    "batch_size": 16,
+    "constant_scale": 0.0001,
+    "data_root": "/dccstor/geofm-finetuning/datasets/sen1floods11",
+    "drop_last": True,
+    "no_data_replace": 0.0,
+    "no_label_replace": -1,
+    "num_workers": 8,
+    "test_transform": [
+        albumentations.Resize(
+            always_apply=False, height=448, interpolation=1, p=1, width=448
+        ),
+        albumentations.pytorch.ToTensorV2(
+            transpose_mask=False, always_apply=True, p=1.0
+        ),
     ],
 }
 
 
 class PrithviMAE:
-
     def __init__(self):
         print("Initializing PrithviMAE model")
-        self.model = LLM(model=os.path.join(os.path.dirname(__file__),
-                                            "./model"),
-                         skip_tokenizer_init=True,
-                         dtype="float32")
+        self.model = LLM(
+            model=os.path.join(os.path.dirname(__file__), "./model"),
+            skip_tokenizer_init=True,
+            dtype="float32",
+        )
 
     def run(self, input_data, location_coords):
         print("################ Running inference on vLLM ##############")
         # merge the inputs into one data structure
         mm_data = {
-            "pixel_values":
-            torch.empty(0) if input_data is None else input_data,
-            "location_coords":
-            torch.empty(0) if location_coords is None else location_coords
+            "pixel_values": torch.empty(0) if input_data is None else input_data,
+            "location_coords": torch.empty(0)
+            if location_coords is None
+            else location_coords,
         }
 
         prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
 
         outputs = self.model.encode(prompt, use_tqdm=False)
-        print(
-            "################ Inference done (it took seconds)  ##############"
-        )
+        print("################ Inference done (it took seconds)  ##############")
 
         return outputs[0].outputs.data
 
 
 def generate_datamodule():
     datamodule = Sen1Floods11NonGeoDataModule(
-        data_root=datamodule_config['data_root'],
+        data_root=datamodule_config["data_root"],
         batch_size=datamodule_config["batch_size"],
         num_workers=datamodule_config["num_workers"],
         bands=datamodule_config["bands"],
         drop_last=datamodule_config["drop_last"],
-        test_transform=datamodule_config["test_transform"
-                                         ""])
+        test_transform=datamodule_config["test_transform"],
+    )
 
     return datamodule
 
@@ -204,8 +195,7 @@ def process_channel_group(orig_img, channels):
     max_value = max(3000, np.percentile(orig_img[valid_mask], PERCENTILE))
     min_value = OFFSET
 
-    orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0,
-                           1)
+    orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0, 1)
 
     # No data as zeros
     orig_img[~valid_mask] = 0
@@ -300,18 +290,21 @@ def load_example(
             location_coords.append(coords)
 
         try:
-            match = re.search(r'(\d{7,8}T\d{6})', file)
+            match = re.search(r"(\d{7,8}T\d{6})", file)
             if match:
                 year = int(match.group(1)[:4])
-                julian_day = match.group(1).split('T')[0][4:]
+                julian_day = match.group(1).split("T")[0][4:]
                 if len(julian_day) == 3:
                     julian_day = int(julian_day)
                 else:
-                    julian_day = datetime.datetime.strptime(
-                        julian_day, '%m%d').timetuple().tm_yday
+                    julian_day = (
+                        datetime.datetime.strptime(julian_day, "%m%d")
+                        .timetuple()
+                        .tm_yday
+                    )
                 temporal_coords.append([year, julian_day])
         except Exception as e:
-            print(f'Could not extract timestamp for {file} ({e})')
+            print(f"Could not extract timestamp for {file} ({e})")
 
     imgs = np.stack(imgs, axis=0)  # num_frames, H, W, C
     imgs = np.moveaxis(imgs, -1, 0).astype("float32")
@@ -320,50 +313,44 @@ def load_example(
     return imgs, temporal_coords, location_coords, metas
 
 
-def run_model(input_data,
-              temporal_coords,
-              location_coords,
-              model,
-              datamodule,
-              img_size,
-              lightning_model=None):
+def run_model(
+    input_data,
+    temporal_coords,
+    location_coords,
+    model,
+    datamodule,
+    img_size,
+    lightning_model=None,
+):
     # Reflect pad if not divisible by img_size
     original_h, original_w = input_data.shape[-2:]
     pad_h = (img_size - (original_h % img_size)) % img_size
     pad_w = (img_size - (original_w % img_size)) % img_size
-    input_data = np.pad(input_data,
-                        ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)),
-                        mode="reflect")
+    input_data = np.pad(
+        input_data, ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)), mode="reflect"
+    )
 
     # Build sliding window
     batch_size = 1
     batch = torch.tensor(input_data, device="cpu")
-    windows = (batch.unfold(3, img_size,
-                            img_size).unfold(4, img_size, img_size))
+    windows = batch.unfold(3, img_size, img_size).unfold(4, img_size, img_size)
     h1, w1 = windows.shape[3:5]
-    windows = rearrange(windows,
-                        "b c t h1 w1 h w -> (b h1 w1) c t h w",
-                        h=img_size,
-                        w=img_size)
+    windows = rearrange(
+        windows, "b c t h1 w1 h w -> (b h1 w1) c t h w", h=img_size, w=img_size
+    )
 
     # Split into batches if number of windows > batch_size
-    num_batches = windows.shape[0] // batch_size if windows.shape[
-        0] > batch_size else 1
+    num_batches = windows.shape[0] // batch_size if windows.shape[0] > batch_size else 1
     windows = torch.tensor_split(windows, num_batches, dim=0)
 
-    if torch.cuda.is_available():
-        device = torch.device('cuda')
-    else:
-        device = torch.device('cpu')
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 
     if temporal_coords:
-        temporal_coords = torch.tensor(temporal_coords,
-                                       device=device).unsqueeze(0)
+        temporal_coords = torch.tensor(temporal_coords, device=device).unsqueeze(0)
     else:
         temporal_coords = None
     if location_coords:
-        location_coords = torch.tensor(location_coords[0],
-                                       device=device).unsqueeze(0)
+        location_coords = torch.tensor(location_coords[0], device=device).unsqueeze(0)
     else:
         location_coords = None
 
@@ -371,26 +358,24 @@ def run_model(input_data,
     pred_imgs = []
     for x in windows:
         # Apply standardization
-        x = datamodule.test_transform(
-            image=x.squeeze().numpy().transpose(1, 2, 0))
-        x = datamodule.aug(x)['image']
+        x = datamodule.test_transform(image=x.squeeze().numpy().transpose(1, 2, 0))
+        x = datamodule.aug(x)["image"]
 
         with torch.no_grad():
             x = x.to(device)
             pred = model.run(x, location_coords=location_coords)
             if lightning_model:
                 pred_lightning = lightning_model(
-                    x,
-                    temporal_coords=temporal_coords,
-                    location_coords=location_coords)
+                    x, temporal_coords=temporal_coords, location_coords=location_coords
+                )
                 pred_lightning = pred_lightning.output.detach().cpu()
                 if not torch.equal(pred, pred_lightning):
                     print("Inference output is not equal")
         y_hat = pred.argmax(dim=1)
 
-        y_hat = torch.nn.functional.interpolate(y_hat.unsqueeze(1).float(),
-                                                size=img_size,
-                                                mode="nearest")
+        y_hat = torch.nn.functional.interpolate(
+            y_hat.unsqueeze(1).float(), size=img_size, mode="nearest"
+        )
 
         pred_imgs.append(y_hat)
 
@@ -437,8 +422,7 @@ def parse_args():
         default=[1, 2, 3, 8, 11, 12],
         type=int,
         nargs="+",
-        help=
-        "0-based indices of the six Prithvi channels to be selected from the  "
+        help="0-based indices of the six Prithvi channels to be selected from the  "
         "input. By default selects [1,2,3,8,11,12] for S2L1C data.",
     )
     parser.add_argument(
@@ -478,17 +462,18 @@ def main(
     # Running model ------------------------------------------------------------
 
     channels = [
-        datamodule_config['bands'].index(b) for b in ["RED", "GREEN", "BLUE"]
+        datamodule_config["bands"].index(b) for b in ["RED", "GREEN", "BLUE"]
     ]  # BGR -> RGB
 
-    pred = run_model(input_data, temporal_coords, location_coords, model_obj,
-                     datamodule, img_size)
+    pred = run_model(
+        input_data, temporal_coords, location_coords, model_obj, datamodule, img_size
+    )
 
     # Save pred
     meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
     pred_file = os.path.join(
-        output_dir,
-        f"pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff")
+        output_dir, f"pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff"
+    )
     save_geotiff(_convert_np_uint8(pred), pred_file, meta_data)
 
     # Save image + pred
@@ -502,13 +487,13 @@ def main(
         channels=channels,
     )
 
-    pred[pred == 0.] = np.nan
+    pred[pred == 0.0] = np.nan
     img_pred = rgb_orig * 0.7 + pred * 0.3
     img_pred[img_pred.isnan()] = rgb_orig[img_pred.isnan()]
 
     img_pred_file = os.path.join(
-        output_dir,
-        f"rgb_pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff")
+        output_dir, f"rgb_pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff"
+    )
     save_geotiff(
         image=_convert_np_uint8(img_pred),
         output_path=img_pred_file,
@@ -518,8 +503,9 @@ def main(
     # Save image rgb
     if rgb_outputs:
         rgb_file = os.path.join(
-            output_dir, "original_rgb_"
-            f"{os.path.splitext(os.path.basename(data_file))[0]}.tiff")
+            output_dir,
+            f"original_rgb_{os.path.splitext(os.path.basename(data_file))[0]}.tiff",
+        )
         save_geotiff(
             image=_convert_np_uint8(rgb_orig),
             output_path=rgb_file,
@@ -528,7 +514,6 @@ def main(
 
 
 if __name__ == "__main__":
-
     args = parse_args()
 
     main(**vars(args))
diff --git a/examples/offline_inference/profiling.py b/examples/offline_inference/profiling.py
index 3cf0c340d670..244a64b891c9 100644
--- a/examples/offline_inference/profiling.py
+++ b/examples/offline_inference/profiling.py
@@ -44,14 +44,17 @@ def get_dtype(dtype: str):
 
 
 OutputLen_NumReqs_Map: TypeAlias = dict[int, int]
-def compute_request_output_lengths(batch_size: int, step_requests: list[int]) \
-      -> OutputLen_NumReqs_Map:
+
+
+def compute_request_output_lengths(
+    batch_size: int, step_requests: list[int]
+) -> OutputLen_NumReqs_Map:
     """
     Given the number of requests, batch_size, and the number of requests
     that each engine-step should process, step_requests, determine the
     output lengths of the requests such that step_request is honoured.
 
-    Example: 
+    Example:
     if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1]
     then return,
     {2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning,
@@ -100,17 +103,19 @@ def compute_request_output_lengths(batch_size: int, step_requests: list[int]) \
         output_length -= 1
 
     # sanity checks.
-    assert sum(ol_nr.values()) == batch_size, \
-            ("Number of requests in output-length assignment does not match "
-             f"batch-size.\n batch size {batch_size} - "
-             f"step requests {step_requests} - assignments {ol_nr}")
+    assert sum(ol_nr.values()) == batch_size, (
+        "Number of requests in output-length assignment does not match "
+        f"batch-size.\n batch size {batch_size} - "
+        f"step requests {step_requests} - assignments {ol_nr}"
+    )
 
     # Check that the output-length is in [1, num-steps]. Output length must be
     # at least 1 as all requests must participate in the prefill-step.
-    assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), \
-            ("Output lengths of requests should be in range "
-             f"[1, num-engine-steps].\n batch size {batch_size} - "
-             f"step requests {step_requests} - assignments {ol_nr}")
+    assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), (
+        "Output lengths of requests should be in range "
+        f"[1, num-engine-steps].\n batch size {batch_size} - "
+        f"step requests {step_requests} - assignments {ol_nr}"
+    )
 
     return ol_nr
 
@@ -131,7 +136,7 @@ def determine_requests_per_step(context: ProfileContext) -> list[int]:
         context: ProfileContext object.
 
     Returns:
-        list[int]: Number of requests to process for all engine-steps. 
+        list[int]: Number of requests to process for all engine-steps.
          output[i], contains the number of requests that the ith step
          should process.
     """
@@ -140,10 +145,13 @@ def determine_requests_per_step(context: ProfileContext) -> list[int]:
         # that their output lengths must be equal to num_engine_steps.
         return [context.batch_size] * context.num_steps
 
-    assert context.complete_num_requests_per_step and \
-                context.complete_num_requests_per_step > 0, \
-        (f"Expected a positive complete_num_requests_per_step argument."
-         f"Instead got {context.complete_num_requests_per_step}")
+    assert (
+        context.complete_num_requests_per_step
+        and context.complete_num_requests_per_step > 0
+    ), (
+        f"Expected a positive complete_num_requests_per_step argument. "
+        f"Instead got {context.complete_num_requests_per_step}"
+    )
 
     # We start dropping after the first decode step.
     step_requests = [
@@ -165,8 +173,9 @@ def determine_requests_per_step(context: ProfileContext) -> list[int]:
     return step_requests
 
 
-def run_profile(context: ProfileContext, csv_output: Optional[str],
-                json_output: Optional[str]):
+def run_profile(
+    context: ProfileContext, csv_output: Optional[str], json_output: Optional[str]
+):
     print("Run profile with:")
     for key, value in asdict(context).items():
         print(f"  {key} = {value}")
@@ -174,7 +183,8 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
     requests_per_step: list[int] = determine_requests_per_step(context)
 
     ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
-        context.batch_size, requests_per_step)
+        context.batch_size, requests_per_step
+    )
 
     num_steps_to_profile: int = len(requests_per_step)
     max_output_len: int = max(ol_nr.keys())
@@ -186,7 +196,8 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
         top_p=0.95,
         # max_tokens is set on a per-request basis.
         max_tokens=None,
-        ignore_eos=True)
+        ignore_eos=True,
+    )
 
     # Create LLM
     llm = LLM(**asdict(context.engine_args))
@@ -199,31 +210,37 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
     max_num_seqs = scheduler_config.max_num_seqs
 
     if batch_size * prompt_len > max_num_batched_tokens:
-        print(f"ERROR: chosen batch_size * prompt_len "
-              f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is  "
-              f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
-              f"and therefore cannot be run in a single profile step, please "
-              f"choose a smaller batch size or prompt length, or increase "
-              f"--max-num-batched-tokens")
+        print(
+            f"ERROR: chosen batch_size * prompt_len "
+            f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is  "
+            f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
+            f"and therefore cannot be run in a single profile step, please "
+            f"choose a smaller batch size or prompt length, or increase "
+            f"--max-num-batched-tokens"
+        )
         sys.exit(-1)
     if batch_size > max_num_seqs:
         print(
             f"ERROR: chosen batch_size ({batch_size}) is larger than "
             f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
-            f"single profile step, please choose a smaller batch size")
+            f"single profile step, please choose a smaller batch size"
+        )
         sys.exit(-1)
-    print("llm.llm_engine.model_config.max_model_len: ",
-          llm.llm_engine.model_config.max_model_len)
+    print(
+        "llm.llm_engine.model_config.max_model_len: ",
+        llm.llm_engine.model_config.max_model_len,
+    )
     if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len:
-        print(f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + "
-              f"{max_output_len} = {prompt_len + max_output_len}) is larger "
-              f"than the model's max_model_len ({max_model_len}), please "
-              f"choose a smaller prompt_len or max_output_len, or increase "
-              f"--max-model-len")
+        print(
+            f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + "
+            f"{max_output_len} = {prompt_len + max_output_len}) is larger "
+            f"than the model's max_model_len ({max_model_len}), please "
+            f"choose a smaller prompt_len or max_output_len, or increase "
+            f"--max-model-len"
+        )
         sys.exit(-1)
 
     def add_requests():
-
         def get_output_len_generator() -> Generator[int, Any, Any]:
             for output_len, num_reqs in ol_nr.items():
                 for _ in range(num_reqs):
@@ -234,13 +251,15 @@ def get_output_len_generator() -> Generator[int, Any, Any]:
             sampling_params.max_tokens = next(output_len_generator)
             assert isinstance(sampling_params.max_tokens, int)
 
-            prompt_token_ids = torch.randint(llm.get_tokenizer().vocab_size,
-                                             size=(prompt_len, )).tolist()
+            prompt_token_ids = torch.randint(
+                llm.get_tokenizer().vocab_size, size=(prompt_len,)
+            ).tolist()
 
             llm.llm_engine.add_request(
                 request_id=f"seq{i}",
-                prompt={'prompt_token_ids': prompt_token_ids},
-                params=sampling_params)
+                prompt={"prompt_token_ids": prompt_token_ids},
+                params=sampling_params,
+            )
 
     def abort_requests():
         for i in range(batch_size):
@@ -261,10 +280,8 @@ def abort_requests():
 
     decode_profs = []
     for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
-        num_running_seqs = llm.llm_engine.scheduler[
-            0].get_num_unfinished_seq_groups()
-        with layerwise_profile(
-                num_running_seqs=num_running_seqs) as decode_prof:
+        num_running_seqs = llm.llm_engine.scheduler[0].get_num_unfinished_seq_groups()
+        with layerwise_profile(num_running_seqs=num_running_seqs) as decode_prof:
             llm.llm_engine.step()
         decode_profs.append(decode_prof)
 
@@ -274,8 +291,7 @@ def abort_requests():
 
     LINE_WIDTH = 80
     print("=" * LINE_WIDTH)
-    print(f"= Prefill Model Table "
-          f"(prompt_len={prompt_len}, batch_size={batch_size})")
+    print(f"= Prefill Model Table (prompt_len={prompt_len}, batch_size={batch_size})")
     print("=" * LINE_WIDTH)
     print()
     prefill_results.print_model_table()
@@ -283,16 +299,17 @@ def abort_requests():
     if has_decode:
         print()
         print("=" * LINE_WIDTH)
-        print(f"= First Decode Step Model Table "
-              f"(prompt_len={prompt_len}, batch_size={batch_size})")
+        print(
+            f"= First Decode Step Model Table "
+            f"(prompt_len={prompt_len}, batch_size={batch_size})"
+        )
         print("=" * LINE_WIDTH)
         print()
         decode_results_list[0].print_model_table()
 
     print()
     print("=" * LINE_WIDTH)
-    print(f"= Prefill Summary Table "
-          f"(prompt_len={prompt_len}, batch_size={batch_size})")
+    print(f"= Prefill Summary Table (prompt_len={prompt_len}, batch_size={batch_size})")
     print("=" * LINE_WIDTH)
     print()
     prefill_results.print_summary_table()
@@ -300,25 +317,32 @@ def abort_requests():
     if has_decode:
         print()
         print("=" * LINE_WIDTH)
-        print(f"= First Decode Step Summary Table "
-              f"(prompt_len={prompt_len}, batch_size={batch_size})")
+        print(
+            f"= First Decode Step Summary Table "
+            f"(prompt_len={prompt_len}, batch_size={batch_size})"
+        )
         print("=" * LINE_WIDTH)
         print()
         decode_results_list[0].print_summary_table()
 
     if csv_output:
-        csv_filename_base = csv_output[:-4] \
-                if csv_output.endswith('.csv') else csv_output
+        csv_filename_base = (
+            csv_output[:-4] if csv_output.endswith(".csv") else csv_output
+        )
         prefill_results.export_model_stats_table_csv(
-            csv_filename_base + "_prefill_model_table.csv")
+            csv_filename_base + "_prefill_model_table.csv"
+        )
         prefill_results.export_summary_stats_table_csv(
-            csv_filename_base + "_prefill_summary_table.csv")
+            csv_filename_base + "_prefill_summary_table.csv"
+        )
 
         if has_decode:
-            decode_results_list[0].export_model_stats_table_csv(\
-                csv_filename_base + "_decode_model_table.csv")
+            decode_results_list[0].export_model_stats_table_csv(
+                csv_filename_base + "_decode_model_table.csv"
+            )
             decode_results_list[0].export_summary_stats_table_csv(
-                csv_filename_base + "_decode_summary_table.csv")
+                csv_filename_base + "_decode_summary_table.csv"
+            )
 
     if json_output:
         cuda_devices = [
@@ -332,7 +356,7 @@ def abort_requests():
                 "torch_version": f"{torch.__version__}",
                 "torch_cuda_version": f"{torch.version.cuda}",
                 "cuda_devices": f"{cuda_devices}",
-                **asdict(context)
+                **asdict(context),
             },
             "prefill": prefill_results.convert_stats_to_dict(),
         }
@@ -342,8 +366,9 @@ def abort_requests():
                 json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()
 
         # Add .json to json_output filename if it doesn't exist already.
-        json_output_file = json_output if json_output.endswith(
-            '.json') else json_output + '.json'
+        json_output_file = (
+            json_output if json_output.endswith(".json") else json_output + ".json"
+        )
         with open(json_output_file, "w+") as f:
             json.dump(json_dict, f, indent=2)
         pass
@@ -351,16 +376,21 @@ def abort_requests():
     if context.save_chrome_traces_folder is not None:
         os.makedirs(context.save_chrome_traces_folder, exist_ok=True)
         prefill_prof.profiler.export_chrome_trace(
-            context.save_chrome_traces_folder + "/prefill.json")
+            context.save_chrome_traces_folder + "/prefill.json"
+        )
         for idx, decode_prof in enumerate(decode_profs):
             decode_prof.profiler.export_chrome_trace(
-                context.save_chrome_traces_folder + f"/decode_{idx + 1}.json")
-        print("Traces saved as prefill.json and decode_1.json, etc."
-              f" in folder {context.save_chrome_traces_folder}")
+                context.save_chrome_traces_folder + f"/decode_{idx + 1}.json"
+            )
+        print(
+            "Traces saved as prefill.json and decode_1.json, etc."
+            f" in folder {context.save_chrome_traces_folder}"
+        )
 
 
 def parse_args():
-    parser = FlexibleArgumentParser(description="""
+    parser = FlexibleArgumentParser(
+        description="""
 Profile a model
 
     example:
@@ -384,7 +414,8 @@ def parse_args():
             --output-directory profile_breakdown --plot-metric pct_cuda_time
         ```
 """,
-                                    formatter_class=RawTextHelpFormatter)
+        formatter_class=RawTextHelpFormatter,
+    )
     parser.add_argument(
         "--csv",
         type=str,
@@ -393,59 +424,68 @@ def parse_args():
         "filename, will create <filename>_prefill_model_table.csv, "
         "<filename>_prefill_summary_table.csv, "
         "<filename>_decode_model_table.csv, and "
-        "<filename>_decode_summary_table.csv")
+        "<filename>_decode_summary_table.csv",
+    )
     parser.add_argument(
         "--json",
         type=str,
         default=None,
-        help="Export the results as a json file. This should be the filename")
-    parser.add_argument("--save-chrome-traces-folder",
-                        type=str,
-                        help="Save chrome traces for the prefill and decode "
-                        "will save traces as prefill.json and decode_1.json, "
-                        "etc. inside this folder")
+        help="Export the results as a json file. This should be the filename",
+    )
+    parser.add_argument(
+        "--save-chrome-traces-folder",
+        type=str,
+        help="Save chrome traces for the prefill and decode steps; "
+        "will save traces as prefill.json and decode_1.json, "
+        "etc. inside this folder",
+    )
     parser.add_argument(
         "--prompt-len",
         type=int,
         default=PROMPT_LEN_DEFAULT,
         help=f"Length of the random prompt to use when profiling, all batched "
-        f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}")
-    parser.add_argument("--batch-size",
-                        type=int,
-                        default=BATCH_SIZE_DEFAULT,
-                        help=f"Number of requests to run as a single batch, "
-                        f"default={BATCH_SIZE_DEFAULT}")
+        f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=BATCH_SIZE_DEFAULT,
+        help=f"Number of requests to run as a single batch, "
+        f"default={BATCH_SIZE_DEFAULT}",
+    )
 
     subparsers = parser.add_subparsers(dest="cmd")
 
     run_num_steps_parser = subparsers.add_parser(
-        "run_num_steps",
-        help="This variation profiles n engine.step() invocations.")
+        "run_num_steps", help="This variation profiles n engine.step() invocations."
+    )
     run_num_steps_parser.add_argument(
-        '-n',
-        '--num-steps',
+        "-n",
+        "--num-steps",
         type=int,
         help="Number of engine steps to profile.\n"
         "Setting it to 1, profiles only the prefill step.\n"
         "Setting it to 2, profiles the prefill and first decode step\n"
         "Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n"
-        "and so on ...")
+        "and so on ...",
+    )
 
     run_to_completion_parser = subparsers.add_parser(
         "run_to_completion",
         help="This variation profiles all the engine.step() invocations"
-        "until the engine exhausts all submitted requests.")
+        " until the engine exhausts all submitted requests.",
+    )
     run_to_completion_parser.add_argument(
-        '-n',
-        '--complete-num-requests-per-step',
+        "-n",
+        "--complete-num-requests-per-step",
         type=int,
-        help=
-        "Complete complete_num_requests_per_step requests every decode step."
+        help="Complete complete_num_requests_per_step requests every decode step. "
         "For e.g., with batch_size 128 and complete_num_requests_per_step 32,"
         "the profiler is run for 6 engine steps, with the steps processing, "
         "128, 128, 96, 64, 32, 1 requests respectively.\n"
         "Note that we tack-on a one-request step at the end as it is often "
-        "useful.")
+        "useful.",
+    )
 
     EngineArgs.add_cli_args(parser)
 
@@ -459,7 +499,8 @@ def main(args):
             k: v
             for k, v in vars(args).items()
             if k in inspect.signature(ProfileContext).parameters
-        })
+        },
+    )
     run_profile(context, csv_output=args.csv, json_output=args.json)
 
 
diff --git a/examples/offline_inference/profiling_tpu/profiling.py b/examples/offline_inference/profiling_tpu/profiling.py
index 61da4705e18e..82737d538df4 100644
--- a/examples/offline_inference/profiling_tpu/profiling.py
+++ b/examples/offline_inference/profiling_tpu/profiling.py
@@ -31,18 +31,16 @@ def main(args: argparse.Namespace):
         max_tokens=args.output_len,
     )
     print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(10000,
-                                               size=(args.batch_size,
-                                                     args.input_len))
-    dummy_prompts: list[PromptType] = [{
-        "prompt_token_ids": batch
-    } for batch in dummy_prompt_token_ids.tolist()]
+    dummy_prompt_token_ids = np.random.randint(
+        10000, size=(args.batch_size, args.input_len)
+    )
+    dummy_prompts: list[PromptType] = [
+        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
+    ]
 
     def run_to_completion():
         start_time = time.perf_counter()
-        llm.generate(dummy_prompts,
-                     sampling_params=sampling_params,
-                     use_tqdm=False)
+        llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
         end_time = time.perf_counter()
         latency = end_time - start_time
         return latency
@@ -58,10 +56,9 @@ def run_to_completion():
     profile_dir = args.profile_result_dir
     print(f"Profiling (results will be saved to '{profile_dir}')...")
     # Enable tracing on server
-    xp.trace_detached("localhost:9012",
-                      profile_dir,
-                      delay_ms=DELAY_MS,
-                      duration_ms=DURATION_MS)
+    xp.trace_detached(
+        "localhost:9012", profile_dir, delay_ms=DELAY_MS, duration_ms=DURATION_MS
+    )
     if DELAY_MS == 0:
         time.sleep(1.0)
     profile_latencies = []
@@ -72,30 +69,36 @@ def run_to_completion():
     return
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description='Benchmark the latency of processing a single batch of '
-        'requests till completion.')
-    parser.add_argument('--input-len', type=int, default=32)
-    parser.add_argument('--output-len', type=int, default=128)
-    parser.add_argument('--batch-size', type=int, default=8)
-    parser.add_argument('--num-iters-warmup',
-                        type=int,
-                        default=5,
-                        help='Number of iterations to run for warmup.')
-    parser.add_argument('--num-iters',
-                        type=int,
-                        default=1,
-                        help='Number of iterations to run for profiling.')
+        description="Benchmark the latency of processing a single batch of "
+        "requests till completion."
+    )
+    parser.add_argument("--input-len", type=int, default=32)
+    parser.add_argument("--output-len", type=int, default=128)
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument(
+        "--num-iters-warmup",
+        type=int,
+        default=5,
+        help="Number of iterations to run for warmup.",
+    )
+    parser.add_argument(
+        "--num-iters",
+        type=int,
+        default=1,
+        help="Number of iterations to run for profiling.",
+    )
     parser.add_argument(
-        '--profile-result-dir',
+        "--profile-result-dir",
         type=str,
         default="profiles",
-        help=
-        ('path to save the pytorch profiler output. Can be visualized '
-         'with ui.perfetto.dev or Tensorboard '
-         '(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm).'
-         ))
+        help=(
+            "path to save the pytorch profiler output. Can be visualized "
+            "with ui.perfetto.dev or Tensorboard "
+            "(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm)."
+        ),
+    )
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
diff --git a/examples/offline_inference/prompt_embed_inference.py b/examples/offline_inference/prompt_embed_inference.py
new file mode 100644
index 000000000000..9f6a602233f8
--- /dev/null
+++ b/examples/offline_inference/prompt_embed_inference.py
@@ -0,0 +1,96 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+Demonstrates how to generate prompt embeddings using
+Hugging Face Transformers and use them as input to vLLM
+for both single and batch inference.
+
+Model: meta-llama/Llama-3.2-1B-Instruct
+Note: This model is gated on Hugging Face Hub.
+      You must request access to use it:
+      https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
+
+Requirements:
+- vLLM
+- transformers
+
+Run:
+    python examples/offline_inference/prompt_embed_inference.py
+"""
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer
+
+from vllm import LLM
+
+
+def init_tokenizer_and_llm(model_name: str):
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
+    embedding_layer = transformers_model.get_input_embeddings()
+    llm = LLM(model=model_name, enable_prompt_embeds=True)
+    return tokenizer, embedding_layer, llm
+
+
+def get_prompt_embeds(
+    chat: list[dict[str, str]],
+    tokenizer: PreTrainedTokenizer,
+    embedding_layer: torch.nn.Module,
+):
+    token_ids = tokenizer.apply_chat_template(
+        chat, add_generation_prompt=True, return_tensors="pt"
+    )
+    prompt_embeds = embedding_layer(token_ids).squeeze(0)
+    return prompt_embeds
+
+
+def single_prompt_inference(
+    llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module
+):
+    chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
+    prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
+
+    outputs = llm.generate(
+        {
+            "prompt_embeds": prompt_embeds,
+        }
+    )
+
+    print("\n[Single Inference Output]")
+    print("-" * 30)
+    for o in outputs:
+        print(o.outputs[0].text)
+    print("-" * 30)
+
+
+def batch_prompt_inference(
+    llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module
+):
+    chats = [
+        [{"role": "user", "content": "Please tell me about the capital of France."}],
+        [{"role": "user", "content": "When is the day longest during the year?"}],
+        [{"role": "user", "content": "Where is bigger, the moon or the sun?"}],
+    ]
+
+    prompt_embeds_list = [
+        get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
+    ]
+
+    outputs = llm.generate([{"prompt_embeds": embeds} for embeds in prompt_embeds_list])
+
+    print("\n[Batch Inference Outputs]")
+    print("-" * 30)
+    for i, o in enumerate(outputs):
+        print(f"Q{i + 1}: {chats[i][0]['content']}")
+        print(f"A{i + 1}: {o.outputs[0].text}\n")
+    print("-" * 30)
+
+
+def main():
+    model_name = "meta-llama/Llama-3.2-1B-Instruct"
+    tokenizer, embedding_layer, llm = init_tokenizer_and_llm(model_name)
+    single_prompt_inference(llm, tokenizer, embedding_layer)
+    batch_prompt_inference(llm, tokenizer, embedding_layer)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md
index c30541a598ce..16d44cbadbc9 100644
--- a/examples/offline_inference/qwen2_5_omni/README.md
+++ b/examples/offline_inference/qwen2_5_omni/README.md
@@ -6,14 +6,19 @@ This folder provides several example scripts on how to inference Qwen2.5-Omni of
 
 ```bash
 # Audio + image + video
-python examples/offline_inference/qwen2_5_omni/only_thinker.py -q mixed_modalities
+python examples/offline_inference/qwen2_5_omni/only_thinker.py \
+    -q mixed_modalities
 
 # Read vision and audio inputs from a single video file
 # NOTE: V1 engine does not support interleaved modalities yet.
-VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q use_audio_in_video
+VLLM_USE_V1=0 \
+python examples/offline_inference/qwen2_5_omni/only_thinker.py \
+    -q use_audio_in_video
 
 # Multiple audios
-VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q multi_audios
+VLLM_USE_V1=0 \
+python examples/offline_inference/qwen2_5_omni/only_thinker.py \
+    -q multi_audios
 ```
 
 This script will run the thinker part of Qwen2.5-Omni, and generate text response.
@@ -22,11 +27,16 @@ You can also test Qwen2.5-Omni on a single modality:
 
 ```bash
 # Process audio inputs
-python examples/offline_inference/audio_language.py --model-type qwen2_5_omni
+python examples/offline_inference/audio_language.py \
+    --model-type qwen2_5_omni
 
 # Process image inputs
-python examples/offline_inference/vision_language.py --modality image --model-type qwen2_5_omni
+python examples/offline_inference/vision_language.py \
+    --modality image \
+    --model-type qwen2_5_omni
 
 # Process video inputs
-python examples/offline_inference/vision_language.py --modality video --model-type qwen2_5_omni
+python examples/offline_inference/vision_language.py \
+    --modality video \
+    --model-type qwen2_5_omni
 ```
diff --git a/examples/offline_inference/qwen2_5_omni/only_thinker.py b/examples/offline_inference/qwen2_5_omni/only_thinker.py
index 52b6e977eaa2..6482490d1a93 100644
--- a/examples/offline_inference/qwen2_5_omni/only_thinker.py
+++ b/examples/offline_inference/qwen2_5_omni/only_thinker.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 """
-This example shows how to use vLLM for running offline inference 
+This example shows how to use vLLM for running offline inference
 with the correct prompt format on Qwen2.5-Omni (thinker only).
 """
 
@@ -11,6 +11,7 @@
 from vllm.assets.audio import AudioAsset
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
+from vllm.multimodal.image import convert_image_mode
 from vllm.utils import FlexibleArgumentParser
 
 
@@ -26,50 +27,55 @@ class QueryResult(NamedTuple):
 default_system = (
     "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
     "Group, capable of perceiving auditory and visual inputs, as well as "
-    "generating text and speech.")
+    "generating text and speech."
+)
 
 
 def get_mixed_modalities_query() -> QueryResult:
-    question = ("What is recited in the audio? "
-                "What is the content of this image? Why is this video funny?")
-    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
-              "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>"
-              "<|vision_bos|><|IMAGE|><|vision_eos|>"
-              "<|vision_bos|><|VIDEO|><|vision_eos|>"
-              f"{question}<|im_end|>\n"
-              f"<|im_start|>assistant\n")
+    question = (
+        "What is recited in the audio? "
+        "What is the content of this image? Why is this video funny?"
+    )
+    prompt = (
+        f"<|im_start|>system\n{default_system}<|im_end|>\n"
+        "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>"
+        "<|vision_bos|><|IMAGE|><|vision_eos|>"
+        "<|vision_bos|><|VIDEO|><|vision_eos|>"
+        f"{question}<|im_end|>\n"
+        f"<|im_start|>assistant\n"
+    )
     return QueryResult(
         inputs={
             "prompt": prompt,
             "multi_modal_data": {
-                "audio":
-                AudioAsset("mary_had_lamb").audio_and_sample_rate,
-                "image":
-                ImageAsset("cherry_blossom").pil_image.convert("RGB"),
-                "video":
-                VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
+                "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
+                "image": convert_image_mode(
+                    ImageAsset("cherry_blossom").pil_image, "RGB"
+                ),
+                "video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
             },
         },
-        limit_mm_per_prompt={
-            "audio": 1,
-            "image": 1,
-            "video": 1
-        },
+        limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1},
     )
 
 
 def get_use_audio_in_video_query() -> QueryResult:
-    question = ("Describe the content of the video, "
-                "then convert what the baby say into text.")
-    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
-              "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
-              f"{question}<|im_end|>\n"
-              f"<|im_start|>assistant\n")
+    question = (
+        "Describe the content of the video, then convert what the baby say into text."
+    )
+    prompt = (
+        f"<|im_start|>system\n{default_system}<|im_end|>\n"
+        "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
+        f"{question}<|im_end|>\n"
+        f"<|im_start|>assistant\n"
+    )
     asset = VideoAsset(name="baby_reading", num_frames=16)
     audio = asset.get_audio(sampling_rate=16000)
-    assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. "
-                                  "Please launch this example with "
-                                  "`VLLM_USE_V1=0`.")
+    assert not envs.VLLM_USE_V1, (
+        "V1 does not support use_audio_in_video. "
+        "Please launch this example with "
+        "`VLLM_USE_V1=0`."
+    )
     return QueryResult(
         inputs={
             "prompt": prompt,
@@ -81,20 +87,19 @@ def get_use_audio_in_video_query() -> QueryResult:
                 "use_audio_in_video": True,
             },
         },
-        limit_mm_per_prompt={
-            "audio": 1,
-            "video": 1
-        },
+        limit_mm_per_prompt={"audio": 1, "video": 1},
     )
 
 
 def get_multi_audios_query() -> QueryResult:
     question = "Are these two audio clips the same?"
-    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
-              "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>"
-              "<|audio_bos|><|AUDIO|><|audio_eos|>"
-              f"{question}<|im_end|>\n"
-              f"<|im_start|>assistant\n")
+    prompt = (
+        f"<|im_start|>system\n{default_system}<|im_end|>\n"
+        "<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>"
+        "<|audio_bos|><|AUDIO|><|audio_eos|>"
+        f"{question}<|im_end|>\n"
+        f"<|im_start|>assistant\n"
+    )
     return QueryResult(
         inputs={
             "prompt": prompt,
@@ -122,18 +127,19 @@ def main(args):
     model_name = "Qwen/Qwen2.5-Omni-7B"
     query_result = query_map[args.query_type]()
 
-    llm = LLM(model=model_name,
-              max_model_len=5632,
-              max_num_seqs=5,
-              limit_mm_per_prompt=query_result.limit_mm_per_prompt,
-              seed=args.seed)
+    llm = LLM(
+        model=model_name,
+        max_model_len=5632,
+        max_num_seqs=5,
+        limit_mm_per_prompt=query_result.limit_mm_per_prompt,
+        seed=args.seed,
+    )
 
     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
     sampling_params = SamplingParams(temperature=0.2, max_tokens=64)
 
-    outputs = llm.generate(query_result.inputs,
-                           sampling_params=sampling_params)
+    outputs = llm.generate(query_result.inputs, sampling_params=sampling_params)
 
     for o in outputs:
         generated_text = o.outputs[0].text
@@ -142,18 +148,23 @@ def main(args):
 
 def parse_args():
     parser = FlexibleArgumentParser(
-        description='Demo on using vLLM for offline inference with '
-        'audio language models')
-    parser.add_argument('--query-type',
-                        '-q',
-                        type=str,
-                        default="mixed_modalities",
-                        choices=query_map.keys(),
-                        help='Query type.')
-    parser.add_argument("--seed",
-                        type=int,
-                        default=None,
-                        help="Set the seed when initializing `vllm.LLM`.")
+        description="Demo on using vLLM for offline inference with "
+        "audio language models"
+    )
+    parser.add_argument(
+        "--query-type",
+        "-q",
+        type=str,
+        default="mixed_modalities",
+        choices=query_map.keys(),
+        help="Query type.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="Set the seed when initializing `vllm.LLM`.",
+    )
 
     return parser.parse_args()
 
diff --git a/examples/offline_inference/qwen_1m.py b/examples/offline_inference/qwen_1m.py
index 64a1f4c54b67..856a35b0e59b 100644
--- a/examples/offline_inference/qwen_1m.py
+++ b/examples/offline_inference/qwen_1m.py
@@ -17,10 +17,10 @@ def load_prompt() -> str:
     # https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/1m.txt
 
     with urlopen(
-            "https://qianwen-res.oss-cn-beijing.aliyuncs.com"
-            "/Qwen2.5-1M/test-data/600k.txt",
-            timeout=5) as response:
-        prompt = response.read().decode('utf-8')
+        "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt",
+        timeout=5,
+    ) as response:
+        prompt = response.read().decode("utf-8")
     return prompt
 
 
@@ -41,18 +41,22 @@ def process_requests(llm: LLM, prompts: list[str]) -> None:
     for output in outputs:
         prompt_token_ids = output.prompt_token_ids
         generated_text = output.outputs[0].text
-        print(f"Prompt length: {len(prompt_token_ids)}, "
-              f"Generated text: {generated_text!r}")
+        print(
+            f"Prompt length: {len(prompt_token_ids)}, "
+            f"Generated text: {generated_text!r}"
+        )
 
 
 # Create an LLM.
 def initialize_engine() -> LLM:
-    llm = LLM(model="Qwen/Qwen2.5-7B-Instruct-1M",
-              max_model_len=1048576,
-              tensor_parallel_size=4,
-              enforce_eager=True,
-              enable_chunked_prefill=True,
-              max_num_batched_tokens=131072)
+    llm = LLM(
+        model="Qwen/Qwen2.5-7B-Instruct-1M",
+        max_model_len=1048576,
+        tensor_parallel_size=4,
+        enforce_eager=True,
+        enable_chunked_prefill=True,
+        max_num_batched_tokens=131072,
+    )
     return llm
 
 
@@ -62,5 +66,5 @@ def main():
     process_requests(llm, [prompt])
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/examples/offline_inference/reproducibility.py b/examples/offline_inference/reproducibility.py
index b2be117d1a0a..6d048986e710 100644
--- a/examples/offline_inference/reproducibility.py
+++ b/examples/offline_inference/reproducibility.py
@@ -1,24 +1,22 @@
 # SPDX-License-Identifier: Apache-2.0
+"""
+Demonstrates how to achieve reproducibility in vLLM.
+
+Main article: https://docs.vllm.ai/en/latest/usage/reproducibility.html
+"""
+
 import os
+import random
 
 from vllm import LLM, SamplingParams
 
-# vLLM does not guarantee the reproducibility of the results by default,
-# for the sake of performance. You need to do the following to achieve
-# reproducible results:
-# 1. Turn off multiprocessing to make the scheduling deterministic.
-#    NOTE(woosuk): This is not needed and will be ignored for V0.
+# V1 only: Turn off multiprocessing to make the scheduling deterministic.
 os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
-# 2. Fix the global seed for reproducibility. The default seed is None, which is
+
+# V0 only: Set the global seed. The default seed is None, which is
 # not reproducible.
 SEED = 42
 
-# NOTE(woosuk): Even with the above two settings, vLLM only provides
-# reproducibility when it runs on the same hardware and the same vLLM version.
-# Also, the online serving API (`vllm serve`) does not support reproducibility
-# because it is almost impossible to make the scheduling deterministic in the
-# online serving setting.
-
 prompts = [
     "Hello, my name is",
     "The president of the United States is",
@@ -38,6 +36,11 @@ def main():
         print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
         print("-" * 50)
 
+    # Try generating random numbers outside vLLM
+    # The same number is output across runs, meaning that the random state
+    # in the user code has been updated by vLLM
+    print(random.randint(0, 100))
+
 
 if __name__ == "__main__":
     main()
diff --git a/examples/offline_inference/rlhf.py b/examples/offline_inference/rlhf.py
index e0ed0ac49754..a8f6977e29a4 100644
--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
@@ -12,6 +12,7 @@
 and multiple inference instances. For the full implementation, please refer
 to the OpenRLHF framework.
 """
+
 import os
 
 import ray
@@ -26,7 +27,6 @@
 
 
 class MyLLM(LLM):
-
     def __init__(self, *args, **kwargs):
         # a hack to make the script work.
         # stop ray from manipulating CUDA_VISIBLE_DEVICES
@@ -89,8 +89,7 @@ def __init__(self, *args, **kwargs):
 for output in outputs:
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}\n"
-          f"Generated text: {generated_text!r}")
+    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
     print("-" * 50)
 
 # set up the communication between the training process
@@ -98,11 +97,13 @@ def __init__(self, *args, **kwargs):
 master_address = get_ip()
 master_port = get_open_port()
 
-handle = llm.collective_rpc.remote("init_weight_update_group",
-                                   args=(master_address, master_port, 1, 3))
+handle = llm.collective_rpc.remote(
+    "init_weight_update_group", args=(master_address, master_port, 1, 3)
+)
 
-model_update_group = stateless_init_process_group(master_address, master_port,
-                                                  0, 3, torch.device("cuda:0"))
+model_update_group = stateless_init_process_group(
+    master_address, master_port, 0, 3, torch.device("cuda:0")
+)
 ray.get(handle)
 
 # simulate training, modify the weights of the model.
@@ -111,8 +112,7 @@ def __init__(self, *args, **kwargs):
 
 # sync weight from the training process to the inference engine.
 for name, p in train_model.named_parameters():
-    handle = llm.collective_rpc.remote("update_weight",
-                                       args=(name, p.dtype, p.shape))
+    handle = llm.collective_rpc.remote("update_weight", args=(name, p.dtype, p.shape))
     model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream())
     ray.get(handle)
 
@@ -126,6 +126,5 @@ def __init__(self, *args, **kwargs):
 for output in outputs_updated:
     prompt = output.prompt
     generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}\n"
-          f"Generated text: {generated_text!r}")
+    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
     print("-" * 50)
diff --git a/examples/offline_inference/rlhf_colocate.py b/examples/offline_inference/rlhf_colocate.py
index 3ceac0fa2e20..76eafdca1f6c 100644
--- a/examples/offline_inference/rlhf_colocate.py
+++ b/examples/offline_inference/rlhf_colocate.py
@@ -9,6 +9,7 @@
 - Use cuda-ipc to pass tensors, since NCCL does not work when we have
     multiple processes on the same GPU.
 """
+
 import os
 
 import ray
@@ -20,7 +21,6 @@
 
 
 class MyLLM(LLM):
-
     def __init__(self, *args, bundle_indices: list, **kwargs):
         # a hack to make the script work.
         # stop ray from manipulating CUDA_VISIBLE_DEVICES
@@ -29,17 +29,16 @@ def __init__(self, *args, bundle_indices: list, **kwargs):
         # every worker will use 0.4 GPU, so that we can schedule
         # 2 instances on the same GPUs.
         os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
-        os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(
-            map(str, bundle_indices))
+        os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(map(str, bundle_indices))
         print(f"creating LLM with bundle_indices={bundle_indices}")
         super().__init__(*args, **kwargs)
 
 
 class RayTrainingActor:
-
     def __init__(self):
         # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs
         from transformers import AutoModelForCausalLM
+
         self.model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
         self.model.to("cuda:0")
         for name, p in self.model.named_parameters():
@@ -48,6 +47,7 @@ def __init__(self):
         # the argument for get_device_uuid is the index
         # of the GPU in the visible devices.
         from vllm.platforms import current_platform
+
         self.device_uuid = current_platform.get_device_uuid(0)
 
     def report_device_id(self) -> str:
@@ -55,6 +55,7 @@ def report_device_id(self) -> str:
 
     def get_weight_ipc_handles(self):
         from torch.multiprocessing.reductions import reduce_tensor
+
         data = {}
         for name, p in self.model.named_parameters():
             # the training actor might only have a subset of the weights
@@ -101,7 +102,7 @@ def get_weight_ipc_handles(self):
     print(f"training actor {bundle_index} is on {device_id}")
     training_actor_device_ids.append(device_id)
 
-for (i, bundle_indices) in enumerate([[0, 1], [2, 3]]):
+for i, bundle_indices in enumerate([[0, 1], [2, 3]]):
     # IMPORTANT: when creating vLLM instances, we need to
     # make sure there are no GPU activities on the target GPUs,
     # otherwise, they will interfere with the vLLM memory profiling,
@@ -128,7 +129,8 @@ def get_weight_ipc_handles(self):
 
 for i, llm in enumerate(inference_engines):
     inference_engine_device_ids.append(
-        ray.get(llm.collective_rpc.remote("report_device_id", args=tuple())))
+        ray.get(llm.collective_rpc.remote("report_device_id", args=tuple()))
+    )
     print(f"inference engine {i} is on {inference_engine_device_ids[-1]}")
 
 # check the placement
@@ -147,9 +149,10 @@ def get_weight_ipc_handles(self):
 print("update the weights of the inference engines")
 for llm in inference_engines:
     ray.get(
-        llm.collective_rpc.remote("update_weights_from_ipc_handles",
-                                  args=(ipc_handles, )))
+        llm.collective_rpc.remote(
+            "update_weights_from_ipc_handles", args=(ipc_handles,)
+        )
+    )
 print("check if the weights are updated")
 for llm in inference_engines:
-    assert ray.get(
-        llm.collective_rpc.remote("check_weights_changed", args=tuple()))
+    assert ray.get(llm.collective_rpc.remote("check_weights_changed", args=tuple()))
diff --git a/examples/offline_inference/rlhf_utils.py b/examples/offline_inference/rlhf_utils.py
index 11b73b7c4a0a..3461af707eba 100644
--- a/examples/offline_inference/rlhf_utils.py
+++ b/examples/offline_inference/rlhf_utils.py
@@ -2,21 +2,20 @@
 import torch
 
 
-def stateless_init_process_group(master_address, master_port, rank, world_size,
-                                 device):
+def stateless_init_process_group(master_address, master_port, rank, world_size, device):
     """
     vLLM provides `StatelessProcessGroup` to create a process group
     without considering the global process group in torch.distributed.
     It is recommended to create `StatelessProcessGroup`, and then initialize
-    the data-plane communication (NCCL) between external (train processes) 
+    the data-plane communication (NCCL) between external (train processes)
     and vLLM workers.
     """
     from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
     from vllm.distributed.utils import StatelessProcessGroup
-    pg = StatelessProcessGroup.create(host=master_address,
-                                      port=master_port,
-                                      rank=rank,
-                                      world_size=world_size)
+
+    pg = StatelessProcessGroup.create(
+        host=master_address, port=master_port, rank=rank, world_size=world_size
+    )
     pynccl = PyNcclCommunicator(pg, device=device)
     return pynccl
 
@@ -31,9 +30,11 @@ class WorkerExtension:
     should pass the full qualified name as `worker_extension_cls` argument.
     """
 
-    def init_weight_update_group(self, master_address, master_port,
-                                 rank_offset, world_size):
+    def init_weight_update_group(
+        self, master_address, master_port, rank_offset, world_size
+    ):
         from vllm.distributed.parallel_state import get_world_group
+
         rank = get_world_group().rank + rank_offset
         self.model_update_group = stateless_init_process_group(
             master_address,
@@ -45,9 +46,9 @@ def init_weight_update_group(self, master_address, master_port,
 
     def update_weight(self, name, dtype, shape):
         weight = torch.empty(shape, dtype=dtype, device="cuda")
-        self.model_update_group.broadcast(weight,
-                                          src=0,
-                                          stream=torch.cuda.current_stream())
+        self.model_update_group.broadcast(
+            weight, src=0, stream=torch.cuda.current_stream()
+        )
 
         self.model_runner.model.load_weights(weights=[(name, weight)])
 
@@ -59,8 +60,7 @@ def check_weights_changed(self):
         """
         weights_updated = True
         for name, p in self.model_runner.model.named_parameters():
-            weights_updated = weights_updated and torch.allclose(
-                p, torch.zeros_like(p))
+            weights_updated = weights_updated and torch.allclose(p, torch.zeros_like(p))
         return weights_updated
 
 
@@ -76,6 +76,7 @@ class ColocateWorkerExtension:
 
     def report_device_id(self) -> str:
         from vllm.platforms import current_platform
+
         self.device_uuid = current_platform.get_device_uuid(self.device.index)
         return self.device_uuid
 
@@ -100,6 +101,5 @@ def check_weights_changed(self):
         """
         weights_updated = True
         for name, p in self.model_runner.model.named_parameters():
-            weights_updated = weights_updated and torch.allclose(
-                p, torch.zeros_like(p))
+            weights_updated = weights_updated and torch.allclose(p, torch.zeros_like(p))
         return weights_updated
diff --git a/examples/offline_inference/save_sharded_state.py b/examples/offline_inference/save_sharded_state.py
index 338380cc9684..860fe2b5fe06 100644
--- a/examples/offline_inference/save_sharded_state.py
+++ b/examples/offline_inference/save_sharded_state.py
@@ -21,6 +21,7 @@
     tensor_parallel_size=8,
 )
 """
+
 import dataclasses
 import os
 import shutil
@@ -33,18 +34,18 @@
 def parse_args():
     parser = FlexibleArgumentParser()
     EngineArgs.add_cli_args(parser)
-    parser.add_argument("--output",
-                        "-o",
-                        required=True,
-                        type=str,
-                        help="path to output checkpoint")
-    parser.add_argument("--file-pattern",
-                        type=str,
-                        help="string pattern of saved filenames")
-    parser.add_argument("--max-file-size",
-                        type=str,
-                        default=5 * 1024**3,
-                        help="max size (in bytes) of each safetensors file")
+    parser.add_argument(
+        "--output", "-o", required=True, type=str, help="path to output checkpoint"
+    )
+    parser.add_argument(
+        "--file-pattern", type=str, help="string pattern of saved filenames"
+    )
+    parser.add_argument(
+        "--max-file-size",
+        type=str,
+        default=5 * 1024**3,
+        help="max size (in bytes) of each safetensors file",
+    )
     return parser.parse_args()
 
 
@@ -68,23 +69,23 @@ def main(args):
         # For V1 engine, we need to use engine_core.save_sharded_state
         print("Using V1 engine save path")
         llm.llm_engine.engine_core.save_sharded_state(
-            path=args.output,
-            pattern=args.file_pattern,
-            max_size=args.max_file_size)
+            path=args.output, pattern=args.file_pattern, max_size=args.max_file_size
+        )
     else:
         # For V0 engine
         print("Using V0 engine save path")
         model_executor = llm.llm_engine.model_executor
-        model_executor.save_sharded_state(path=args.output,
-                                          pattern=args.file_pattern,
-                                          max_size=args.max_file_size)
+        model_executor.save_sharded_state(
+            path=args.output, pattern=args.file_pattern, max_size=args.max_file_size
+        )
 
     # Copy metadata files to output directory
     for file in os.listdir(model_path):
         if os.path.splitext(file)[1] not in (".bin", ".pt", ".safetensors"):
             if os.path.isdir(os.path.join(model_path, file)):
-                shutil.copytree(os.path.join(model_path, file),
-                                os.path.join(args.output, file))
+                shutil.copytree(
+                    os.path.join(model_path, file), os.path.join(args.output, file)
+                )
             else:
                 shutil.copy(os.path.join(model_path, file), args.output)
 
diff --git a/examples/offline_inference/structured_outputs.py b/examples/offline_inference/structured_outputs.py
index 363b500e0adf..9ed7299606b7 100644
--- a/examples/offline_inference/structured_outputs.py
+++ b/examples/offline_inference/structured_outputs.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 """
-This file demonstrates the example usage of guided decoding 
-to generate structured outputs using vLLM. It shows how to apply 
-different guided decoding techniques such as Choice, Regex, JSON schema, 
-and Grammar to produce structured and formatted results 
+This file demonstrates the example usage of guided decoding
+to generate structured outputs using vLLM. It shows how to apply
+different guided decoding techniques such as Choice, Regex, JSON schema,
+and Grammar to produce structured and formatted results
 based on specific prompts.
 """
 
@@ -15,20 +15,20 @@
 from vllm.sampling_params import GuidedDecodingParams
 
 # Guided decoding by Choice (list of possible options)
-guided_decoding_params_choice = GuidedDecodingParams(
-    choice=["Positive", "Negative"])
-sampling_params_choice = SamplingParams(
-    guided_decoding=guided_decoding_params_choice)
+guided_decoding_params_choice = GuidedDecodingParams(choice=["Positive", "Negative"])
+sampling_params_choice = SamplingParams(guided_decoding=guided_decoding_params_choice)
 prompt_choice = "Classify this sentiment: vLLM is wonderful!"
 
 # Guided decoding by Regex
 guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
 sampling_params_regex = SamplingParams(
-    guided_decoding=guided_decoding_params_regex, stop=["\n"])
+    guided_decoding=guided_decoding_params_regex, stop=["\n"]
+)
 prompt_regex = (
     "Generate an email address for Alan Turing, who works in Enigma."
     "End in .com and new line. Example result:"
-    "alan.turing@enigma.com\n")
+    "alan.turing@enigma.com\n"
+)
 
 
 # Guided decoding by JSON using Pydantic schema
@@ -47,10 +47,11 @@ class CarDescription(BaseModel):
 
 json_schema = CarDescription.model_json_schema()
 guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
-sampling_params_json = SamplingParams(
-    guided_decoding=guided_decoding_params_json)
-prompt_json = ("Generate a JSON with the brand, model and car_type of"
-               "the most iconic car from the 90's")
+sampling_params_json = SamplingParams(guided_decoding=guided_decoding_params_json)
+prompt_json = (
+    "Generate a JSON with the brand, model and car_type of "
+    "the most iconic car from the 90's"
+)
 
 # Guided decoding by Grammar
 simplified_sql_grammar = """
@@ -61,12 +62,11 @@ class CarDescription(BaseModel):
 condition ::= column "= " number
 number ::= "1 " | "2 "
 """
-guided_decoding_params_grammar = GuidedDecodingParams(
-    grammar=simplified_sql_grammar)
-sampling_params_grammar = SamplingParams(
-    guided_decoding=guided_decoding_params_grammar)
-prompt_grammar = ("Generate an SQL query to show the 'username' and 'email'"
-                  "from the 'users' table.")
+guided_decoding_params_grammar = GuidedDecodingParams(grammar=simplified_sql_grammar)
+sampling_params_grammar = SamplingParams(guided_decoding=guided_decoding_params_grammar)
+prompt_grammar = (
+    "Generate an SQL query to show the 'username' and 'email' from the 'users' table."
+)
 
 
 def format_output(title: str, output: str):
@@ -90,8 +90,7 @@ def main():
     json_output = generate_output(prompt_json, sampling_params_json, llm)
     format_output("Guided decoding by JSON", json_output)
 
-    grammar_output = generate_output(prompt_grammar, sampling_params_grammar,
-                                     llm)
+    grammar_output = generate_output(prompt_grammar, sampling_params_grammar, llm)
     format_output("Guided decoding by Grammar", grammar_output)
 
 
diff --git a/examples/offline_inference/torchrun_example.py b/examples/offline_inference/torchrun_example.py
index bb61a0a29e32..2fa49c0835e3 100644
--- a/examples/offline_inference/torchrun_example.py
+++ b/examples/offline_inference/torchrun_example.py
@@ -45,8 +45,7 @@
     for output in outputs:
         prompt = output.prompt
         generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}\n"
-              f"Generated text: {generated_text!r}\n")
+        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n")
         print("-" * 50)
     """
 Further tips:
diff --git a/examples/offline_inference/tpu.py b/examples/offline_inference/tpu.py
index 71cd88f2788a..e4a75b3f9380 100644
--- a/examples/offline_inference/tpu.py
+++ b/examples/offline_inference/tpu.py
@@ -20,10 +20,12 @@
 def main():
     # Set `enforce_eager=True` to avoid ahead-of-time compilation.
     # In real workloads, `enforace_eager` should be `False`.
-    llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
-              max_num_batched_tokens=64,
-              max_num_seqs=4,
-              max_model_len=128)
+    llm = LLM(
+        model="Qwen/Qwen2-1.5B-Instruct",
+        max_num_batched_tokens=64,
+        max_num_seqs=4,
+        max_model_len=128,
+    )
     outputs = llm.generate(prompts, sampling_params)
     print("-" * 50)
     for output, answer in zip(outputs, answers):
diff --git a/examples/offline_inference/vision_language.py b/examples/offline_inference/vision_language.py
index c54f328c7a38..f0504501639d 100644
--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -6,6 +6,7 @@
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
+
 import os
 import random
 from contextlib import contextmanager
@@ -19,6 +20,7 @@
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
 from vllm.lora.request import LoRARequest
+from vllm.multimodal.image import convert_image_mode
 from vllm.utils import FlexibleArgumentParser
 
 
@@ -48,9 +50,13 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
         limit_mm_per_prompt={modality: 1},
     )
 
-    prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
-                "<|im_end|>\n<|im_start|>assistant\n")
-               for question in questions]
+    prompts = [
+        (
+            f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
+            "<|im_end|>\n<|im_start|>assistant\n"
+        )
+        for question in questions
+    ]
 
     stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
 
@@ -134,8 +140,7 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
     )
 
     prompts = [
-        f"<|User|>: <image>\n{question}\n\n<|Assistant|>:"
-        for question in questions
+        f"<|User|>: <image>\n{question}\n\n<|Assistant|>:" for question in questions
     ]
 
     return ModelRequestData(
@@ -197,9 +202,14 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
         limit_mm_per_prompt={modality: 1},
     )
 
-    prompts = [("<bos><start_of_turn>user\n"
-                f"<start_of_image>{question}<end_of_turn>\n"
-                "<start_of_turn>model\n") for question in questions]
+    prompts = [
+        (
+            "<bos><start_of_turn>user\n"
+            f"<start_of_image>{question}<end_of_turn>\n"
+            "<start_of_turn>model\n"
+        )
+        for question in questions
+    ]
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -224,7 +234,8 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
 
     prompts = [
         f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
-        {question}<|assistant|>" for question in questions
+        {question}<|assistant|>"
+        for question in questions
     ]
 
     stop_token_ids = [151329, 151336, 151338]
@@ -249,15 +260,13 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
         limit_mm_per_prompt={modality: 1},
     )
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
-    messages = [[{
-        'role': 'user',
-        'content': f"<image>\n{question}"
-    }] for question in questions]
-    prompts = tokenizer.apply_chat_template(messages,
-                                            tokenize=False,
-                                            add_generation_prompt=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    messages = [
+        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     # Stop tokens for H2OVL-Mississippi
     # https://huggingface.co/h2oai/h2ovl-mississippi-800m
@@ -283,15 +292,14 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
         # if you are running out of memory, you can reduce the "longest_edge".
         # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
         mm_processor_kwargs={
-            "size": {
-                "longest_edge": 3 * 364
-            },
+            "size": {"longest_edge": 3 * 364},
         },
         limit_mm_per_prompt={modality: 1},
     )
-    prompts = [(
-        f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
-    ) for question in questions]
+    prompts = [
+        (f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:")
+        for question in questions
+    ]
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -310,9 +318,7 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
         max_num_seqs=2,
         enforce_eager=True,
         mm_processor_kwargs={
-            "max_image_size": {
-                "longest_edge": 384
-            },
+            "max_image_size": {"longest_edge": 384},
         },
         limit_mm_per_prompt={modality: 1},
     )
@@ -329,26 +335,28 @@ def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
 
 # InternVL
 def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
-    assert modality == "image"
-
-    model_name = "OpenGVLab/InternVL2-2B"
+    model_name = "OpenGVLab/InternVL3-2B"
 
     engine_args = EngineArgs(
         model=model_name,
         trust_remote_code=True,
-        max_model_len=4096,
+        max_model_len=8192,
         limit_mm_per_prompt={modality: 1},
     )
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
-    messages = [[{
-        'role': 'user',
-        'content': f"<image>\n{question}"
-    }] for question in questions]
-    prompts = tokenizer.apply_chat_template(messages,
-                                            tokenize=False,
-                                            add_generation_prompt=True)
+    if modality == "image":
+        placeholder = "<image>"
+    elif modality == "video":
+        placeholder = "<video>"
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    messages = [
+        [{"role": "user", "content": f"{placeholder}\n{question}"}]
+        for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     # Stop tokens for InternVL
     # models variants may have different stop tokens
@@ -356,6 +364,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
     # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
     stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+    stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -371,7 +380,8 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
     prompts = [
         "<|im_user|>user<|im_middle|><|media_start|>image<|media_content|>"
         f"<|media_pad|><|media_end|>{question}<|im_end|>"
-        "<|im_assistant|>assistant<|im_middle|>" for question in questions
+        "<|im_assistant|>assistant<|im_middle|>"
+        for question in questions
     ]
 
     engine_args = EngineArgs(
@@ -391,9 +401,7 @@ def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
 def run_llava(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
 
-    prompts = [
-        f"USER: <image>\n{question}\nASSISTANT:" for question in questions
-    ]
+    prompts = [f"USER: <image>\n{question}\nASSISTANT:" for question in questions]
 
     engine_args = EngineArgs(
         model="llava-hf/llava-1.5-7b-hf",
@@ -426,13 +434,10 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
 
 # LlaVA-NeXT-Video
 # Currently only support for video input
-def run_llava_next_video(questions: list[str],
-                         modality: str) -> ModelRequestData:
+def run_llava_next_video(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "video"
 
-    prompts = [
-        f"USER: <video>\n{question} ASSISTANT:" for question in questions
-    ]
+    prompts = [f"USER: <video>\n{question} ASSISTANT:" for question in questions]
     engine_args = EngineArgs(
         model="llava-hf/LLaVA-NeXT-Video-7B-hf",
         max_model_len=8192,
@@ -447,19 +452,19 @@ def run_llava_next_video(questions: list[str],
 
 
 # LLaVA-OneVision
-def run_llava_onevision(questions: list[str],
-                        modality: str) -> ModelRequestData:
-
+def run_llava_onevision(questions: list[str], modality: str) -> ModelRequestData:
     if modality == "video":
         prompts = [
             f"<|im_start|>user <video>\n{question}<|im_end|> \
-        <|im_start|>assistant\n" for question in questions
+        <|im_start|>assistant\n"
+            for question in questions
         ]
 
     elif modality == "image":
         prompts = [
             f"<|im_start|>user <image>\n{question}<|im_end|> \
-        <|im_start|>assistant\n" for question in questions
+        <|im_start|>assistant\n"
+            for question in questions
         ]
 
     engine_args = EngineArgs(
@@ -478,11 +483,8 @@ def run_llava_onevision(questions: list[str],
 def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
     assert modality == "image"
 
-    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'  # noqa: E501
-    prompts = [
-        llama3_template.format(f"{question}\n<image>")
-        for question in questions
-    ]
+    llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"  # noqa: E501
+    prompts = [llama3_template.format(f"{question}\n<image>") for question in questions]
 
     engine_args = EngineArgs(
         model="TIGER-Lab/Mantis-8B-siglip-llama3",
@@ -522,8 +524,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
     # 2.6: image, video
     # o2.6: image, video, audio
     # model_name = "openbmb/MiniCPM-o-2_6"
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
     engine_args = EngineArgs(
         model=model_name,
         max_model_len=4096,
@@ -539,7 +540,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
     # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
 
     # 2.6 / o2.6
-    stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    stop_tokens = ["<|im_end|>", "<|endoftext|>"]
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
 
     modality_placeholder = {
@@ -549,12 +550,16 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
 
     prompts = [
         tokenizer.apply_chat_template(
-            [{
-                'role': 'user',
-                'content': f"{modality_placeholder[modality]}\n{question}"
-            }],
+            [
+                {
+                    "role": "user",
+                    "content": f"{modality_placeholder[modality]}\n{question}",
+                }
+            ],
             tokenize=False,
-            add_generation_prompt=True) for question in questions
+            add_generation_prompt=True,
+        )
+        for question in questions
     ]
 
     return ModelRequestData(
@@ -614,19 +619,18 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    messages = [[{
-        "role":
-        "user",
-        "content": [{
-            "type": "image"
-        }, {
-            "type": "text",
-            "text": question
-        }]
-    }] for question in questions]
-    prompts = tokenizer.apply_chat_template(messages,
-                                            add_generation_prompt=True,
-                                            tokenize=False)
+    messages = [
+        [
+            {
+                "role": "user",
+                "content": [{"type": "image"}, {"type": "text", "text": question}],
+            }
+        ]
+        for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, add_generation_prompt=True, tokenize=False
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -649,19 +653,18 @@ def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
-    messages = [[{
-        "role":
-        "user",
-        "content": [{
-            "type": "image"
-        }, {
-            "type": "text",
-            "text": f"{question}"
-        }]
-    }] for question in questions]
-    prompts = tokenizer.apply_chat_template(messages,
-                                            add_generation_prompt=True,
-                                            tokenize=False)
+    messages = [
+        [
+            {
+                "role": "user",
+                "content": [{"type": "image"}, {"type": "text", "text": f"{question}"}],
+            }
+        ]
+        for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, add_generation_prompt=True, tokenize=False
+    )
     stop_token_ids = None
     return ModelRequestData(
         engine_args=engine_args,
@@ -685,7 +688,8 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
 
     prompts = [
         f"<|im_start|>user <image>\n{question}<|im_end|> \
-        <|im_start|>assistant\n" for question in questions
+        <|im_start|>assistant\n"
+        for question in questions
     ]
 
     return ModelRequestData(
@@ -709,15 +713,13 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
         limit_mm_per_prompt={modality: 1},
     )
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
-    messages = [[{
-        'role': 'user',
-        'content': f"<image>\n{question}"
-    }] for question in questions]
-    prompts = tokenizer.apply_chat_template(messages,
-                                            tokenize=False,
-                                            add_generation_prompt=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    messages = [
+        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -740,15 +742,13 @@ def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
         limit_mm_per_prompt={modality: 1},
     )
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
-    messages = [[{
-        'role': 'user',
-        'content': f"<image>\n{question}"
-    }] for question in questions]
-    prompts = tokenizer.apply_chat_template(messages,
-                                            tokenize=False,
-                                            add_generation_prompt=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    messages = [
+        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -839,8 +839,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
     # we have to manually specify the path of the lora weights.
     vision_lora_path = os.path.join(model_path, "vision-lora")
     prompts = [
-        f"<|user|><|image_1|>{question}<|end|><|assistant|>"
-        for question in questions
+        f"<|user|><|image_1|>{question}<|end|><|assistant|>" for question in questions
     ]
     engine_args = EngineArgs(
         model=model_path,
@@ -907,7 +906,6 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
 
 # Qwen2-VL
 def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
-
     model_name = "Qwen/Qwen2-VL-7B-Instruct"
 
     engine_args = EngineArgs(
@@ -928,10 +926,13 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
         placeholder = "<|video_pad|>"
 
     prompts = [
-        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
-         f"{question}<|im_end|>\n"
-         "<|im_start|>assistant\n") for question in questions
+        (
+            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"{question}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+        for question in questions
     ]
 
     return ModelRequestData(
@@ -942,7 +943,6 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
 
 # Qwen2.5-VL
 def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
-
     model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
 
     engine_args = EngineArgs(
@@ -963,10 +963,13 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
         placeholder = "<|video_pad|>"
 
     prompts = [
-        ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-         f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
-         f"{question}<|im_end|>\n"
-         "<|im_start|>assistant\n") for question in questions
+        (
+            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"{question}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+        for question in questions
     ]
 
     return ModelRequestData(
@@ -999,12 +1002,18 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
     default_system = (
         "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
         "Group, capable of perceiving auditory and visual inputs, as well as "
-        "generating text and speech.")
+        "generating text and speech."
+    )
 
-    prompts = [(f"<|im_start|>system\n{default_system}<|im_end|>\n"
-                f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>"
-                f"{question}<|im_end|>\n"
-                "<|im_start|>assistant\n") for question in questions]
+    prompts = [
+        (
+            f"<|im_start|>system\n{default_system}<|im_end|>\n"
+            f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>"
+            f"{question}<|im_end|>\n"
+            "<|im_start|>assistant\n"
+        )
+        for question in questions
+    ]
     return ModelRequestData(
         engine_args=engine_args,
         prompts=prompts,
@@ -1024,15 +1033,13 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
         limit_mm_per_prompt={modality: 1},
     )
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
-    messages = [[{
-        'role': 'user',
-        'content': f"<image>\n{question}"
-    }] for question in questions]
-    prompts = tokenizer.apply_chat_template(messages,
-                                            tokenize=False,
-                                            add_generation_prompt=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    messages = [
+        [{"role": "user", "content": f"<image>\n{question}"}] for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     # Stop tokens for SkyworkR1V
     # https://huggingface.co/Skywork/Skywork-R1V-38B/blob/main/conversation.py
@@ -1096,8 +1103,7 @@ def get_multi_modal_input(args):
     """
     if args.modality == "image":
         # Input image and question
-        image = ImageAsset("cherry_blossom") \
-            .pil_image.convert("RGB")
+        image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
         img_questions = [
             "What is the content of this image?",
             "Describe the content of this image in detail.",
@@ -1112,8 +1118,7 @@ def get_multi_modal_input(args):
 
     if args.modality == "video":
         # Input video and question
-        video = VideoAsset(name="baby_reading",
-                           num_frames=args.num_frames).np_ndarrays
+        video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays
         vid_questions = ["Why is this video funny?"]
 
         return {
@@ -1125,12 +1130,13 @@ def get_multi_modal_input(args):
     raise ValueError(msg)
 
 
-def apply_image_repeat(image_repeat_prob, num_prompts, data,
-                       prompts: list[str], modality):
-    """Repeats images with provided probability of "image_repeat_prob". 
+def apply_image_repeat(
+    image_repeat_prob, num_prompts, data, prompts: list[str], modality
+):
+    """Repeats images with provided probability of "image_repeat_prob".
     Used to simulate hit/miss for the MM preprocessor cache.
     """
-    assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0)
+    assert image_repeat_prob <= 1.0 and image_repeat_prob >= 0
     no_yes = [0, 1]
     probs = [1.0 - image_repeat_prob, image_repeat_prob]
 
@@ -1145,12 +1151,12 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data,
                 new_val = (i // 256 // 256, i // 256, i % 256)
                 cur_image.putpixel((0, 0), new_val)
 
-        inputs.append({
-            "prompt": prompts[i % len(prompts)],
-            "multi_modal_data": {
-                modality: cur_image
+        inputs.append(
+            {
+                "prompt": prompts[i % len(prompts)],
+                "multi_modal_data": {modality: cur_image},
             }
-        })
+        )
 
     return inputs
 
@@ -1159,6 +1165,7 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data,
 def time_counter(enable: bool):
     if enable:
         import time
+
         start_time = time.time()
         yield
         elapsed_time = time.time() - start_time
@@ -1171,54 +1178,65 @@ def time_counter(enable: bool):
 
 def parse_args():
     parser = FlexibleArgumentParser(
-        description='Demo on using vLLM for offline inference with '
-        'vision language models for text generation')
-    parser.add_argument('--model-type',
-                        '-m',
-                        type=str,
-                        default="llava",
-                        choices=model_example_map.keys(),
-                        help='Huggingface "model_type".')
-    parser.add_argument('--num-prompts',
-                        type=int,
-                        default=4,
-                        help='Number of prompts to run.')
-    parser.add_argument('--modality',
-                        type=str,
-                        default="image",
-                        choices=['image', 'video'],
-                        help='Modality of the input.')
-    parser.add_argument('--num-frames',
-                        type=int,
-                        default=16,
-                        help='Number of frames to extract from the video.')
-    parser.add_argument("--seed",
-                        type=int,
-                        default=None,
-                        help="Set the seed when initializing `vllm.LLM`.")
+        description="Demo on using vLLM for offline inference with "
+        "vision language models for text generation"
+    )
+    parser.add_argument(
+        "--model-type",
+        "-m",
+        type=str,
+        default="llava",
+        choices=model_example_map.keys(),
+        help='Huggingface "model_type".',
+    )
+    parser.add_argument(
+        "--num-prompts", type=int, default=4, help="Number of prompts to run."
+    )
+    parser.add_argument(
+        "--modality",
+        type=str,
+        default="image",
+        choices=["image", "video"],
+        help="Modality of the input.",
+    )
+    parser.add_argument(
+        "--num-frames",
+        type=int,
+        default=16,
+        help="Number of frames to extract from the video.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="Set the seed when initializing `vllm.LLM`.",
+    )
 
     parser.add_argument(
-        '--image-repeat-prob',
+        "--image-repeat-prob",
         type=float,
         default=None,
-        help='Simulates the hit-ratio for multi-modal preprocessor cache'
-        ' (if enabled)')
+        help="Simulates the hit-ratio for multi-modal preprocessor cache (if enabled)",
+    )
 
     parser.add_argument(
-        '--disable-mm-preprocessor-cache',
-        action='store_true',
-        help='If True, disables caching of multi-modal preprocessor/mapper.')
+        "--disable-mm-preprocessor-cache",
+        action="store_true",
+        help="If True, disables caching of multi-modal preprocessor/mapper.",
+    )
 
     parser.add_argument(
-        '--time-generate',
-        action='store_true',
-        help='If True, then print the total generate() call time')
+        "--time-generate",
+        action="store_true",
+        help="If True, then print the total generate() call time",
+    )
 
     parser.add_argument(
-        '--use-different-prompt-per-request',
-        action='store_true',
-        help='If True, then use different prompt (with the same multi-modal '
-        'data) for each request.')
+        "--use-different-prompt-per-request",
+        action="store_true",
+        help="If True, then use different prompt (with the same multi-modal "
+        "data) for each request.",
+    )
     return parser.parse_args()
 
 
@@ -1237,7 +1255,8 @@ def main(args):
     # Disable other modalities to save memory
     default_limits = {"image": 0, "video": 0, "audio": 0}
     req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
-        req_data.engine_args.limit_mm_per_prompt or {})
+        req_data.engine_args.limit_mm_per_prompt or {}
+    )
 
     engine_args = asdict(req_data.engine_args) | {
         "seed": args.seed,
@@ -1246,44 +1265,46 @@ def main(args):
     llm = LLM(**engine_args)
 
     # Don't want to check the flag multiple times, so just hijack `prompts`.
-    prompts = req_data.prompts if args.use_different_prompt_per_request else [
-        req_data.prompts[0]
-    ]
+    prompts = (
+        req_data.prompts
+        if args.use_different_prompt_per_request
+        else [req_data.prompts[0]]
+    )
 
     # We set temperature to 0.2 so that outputs can be different
     # even when all prompts are identical when running batch inference.
-    sampling_params = SamplingParams(temperature=0.2,
-                                     max_tokens=64,
-                                     stop_token_ids=req_data.stop_token_ids)
+    sampling_params = SamplingParams(
+        temperature=0.2, max_tokens=64, stop_token_ids=req_data.stop_token_ids
+    )
 
     assert args.num_prompts > 0
     if args.num_prompts == 1:
         # Single inference
         inputs = {
             "prompt": prompts[0],
-            "multi_modal_data": {
-                modality: data
-            },
+            "multi_modal_data": {modality: data},
         }
     else:
         # Batch inference
         if args.image_repeat_prob is not None:
             # Repeat images with specified probability of "image_repeat_prob"
-            inputs = apply_image_repeat(args.image_repeat_prob,
-                                        args.num_prompts, data, prompts,
-                                        modality)
+            inputs = apply_image_repeat(
+                args.image_repeat_prob, args.num_prompts, data, prompts, modality
+            )
         else:
             # Use the same image for all prompts
-            inputs = [{
-                "prompt": prompts[i % len(prompts)],
-                "multi_modal_data": {
-                    modality: data
-                },
-            } for i in range(args.num_prompts)]
+            inputs = [
+                {
+                    "prompt": prompts[i % len(prompts)],
+                    "multi_modal_data": {modality: data},
+                }
+                for i in range(args.num_prompts)
+            ]
 
     # Add LoRA request if applicable
-    lora_request = (req_data.lora_requests *
-                    args.num_prompts if req_data.lora_requests else None)
+    lora_request = (
+        req_data.lora_requests * args.num_prompts if req_data.lora_requests else None
+    )
 
     with time_counter(args.time_generate):
         outputs = llm.generate(
diff --git a/examples/offline_inference/vision_language_embedding.py b/examples/offline_inference/vision_language_embedding.py
index 2637949551a1..cee02d06c607 100644
--- a/examples/offline_inference/vision_language_embedding.py
+++ b/examples/offline_inference/vision_language_embedding.py
@@ -6,6 +6,7 @@
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
 """
+
 from argparse import Namespace
 from dataclasses import asdict
 from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
@@ -44,19 +45,17 @@ class ModelRequestData(NamedTuple):
 
 
 def run_e5_v(query: Query) -> ModelRequestData:
-    llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501
+    llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n"  # noqa: E501
 
     if query["modality"] == "text":
         text = query["text"]
-        prompt = llama3_template.format(
-            f"{text}\nSummary above sentence in one word: ")
+        prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ")
         image = None
     elif query["modality"] == "image":
-        prompt = llama3_template.format(
-            "<image>\nSummary above image in one word: ")
+        prompt = llama3_template.format("<image>\nSummary above image in one word: ")
         image = query["image"]
     else:
-        modality = query['modality']
+        modality = query["modality"]
         raise ValueError(f"Unsupported query modality: '{modality}'")
 
     engine_args = EngineArgs(
@@ -83,10 +82,12 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
         image = query["image"]
     elif query["modality"] == "text+image":
         text = query["text"]
-        prompt = f"<|image_1|> Represent the given image with the following question: {text}"  # noqa: E501
+        prompt = (
+            f"<|image_1|> Represent the given image with the following question: {text}"  # noqa: E501
+        )
         image = query["image"]
     else:
-        modality = query['modality']
+        modality = query["modality"]
         raise ValueError(f"Unsupported query modality: '{modality}'")
 
     engine_args = EngineArgs(
@@ -136,7 +137,8 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
     # Disable other modalities to save memory
     default_limits = {"image": 0, "video": 0, "audio": 0}
     req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
-        req_data.engine_args.limit_mm_per_prompt or {})
+        req_data.engine_args.limit_mm_per_prompt or {}
+    )
 
     engine_args = asdict(req_data.engine_args) | {"seed": seed}
     llm = LLM(**engine_args)
@@ -145,10 +147,12 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
     if req_data.image is not None:
         mm_data["image"] = req_data.image
 
-    outputs = llm.embed({
-        "prompt": req_data.prompt,
-        "multi_modal_data": mm_data,
-    })
+    outputs = llm.embed(
+        {
+            "prompt": req_data.prompt,
+            "multi_modal_data": mm_data,
+        }
+    )
 
     print("-" * 50)
     for output in outputs:
@@ -164,23 +168,30 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
 
 def parse_args():
     parser = FlexibleArgumentParser(
-        description='Demo on using vLLM for offline inference with '
-        'vision language models for multimodal embedding')
-    parser.add_argument('--model-name',
-                        '-m',
-                        type=str,
-                        default="vlm2vec",
-                        choices=model_example_map.keys(),
-                        help='The name of the embedding model.')
-    parser.add_argument('--modality',
-                        type=str,
-                        default="image",
-                        choices=get_args(QueryModality),
-                        help='Modality of the input.')
-    parser.add_argument("--seed",
-                        type=int,
-                        default=None,
-                        help="Set the seed when initializing `vllm.LLM`.")
+        description="Demo on using vLLM for offline inference with "
+        "vision language models for multimodal embedding"
+    )
+    parser.add_argument(
+        "--model-name",
+        "-m",
+        type=str,
+        default="vlm2vec",
+        choices=model_example_map.keys(),
+        help="The name of the embedding model.",
+    )
+    parser.add_argument(
+        "--modality",
+        type=str,
+        default="image",
+        choices=get_args(QueryModality),
+        help="Modality of the input.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="Set the seed when initializing `vllm.LLM`.",
+    )
     return parser.parse_args()
 
 
diff --git a/examples/offline_inference/vision_language_multi_image.py b/examples/offline_inference/vision_language_multi_image.py
index 20a8e635e322..e776ff7fe6ae 100644
--- a/examples/offline_inference/vision_language_multi_image.py
+++ b/examples/offline_inference/vision_language_multi_image.py
@@ -4,6 +4,7 @@
 multi-image input on vision language models for text generation,
 using the chat template defined by the model.
 """
+
 import os
 from argparse import Namespace
 from dataclasses import asdict
@@ -59,8 +60,9 @@ def load_aria(question: str, image_urls: list[str]) -> ModelRequestData:
         limit_mm_per_prompt={"image": len(image_urls)},
     )
     placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
-    prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n"
-              "<|im_start|>assistant\n")
+    prompt = (
+        f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n<|im_start|>assistant\n"
+    )
     stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
 
     return ModelRequestData(
@@ -81,23 +83,21 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
     placeholders = [{"type": "image", "image": url} for url in image_urls]
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *placeholders,
-            {
-                "type": "text",
-                "text": question
-            },
-        ],
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
 
     processor = AutoProcessor.from_pretrained(model_name)
 
-    prompt = processor.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -106,8 +106,7 @@ def load_aya_vision(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
-def load_deepseek_vl2(question: str,
-                      image_urls: list[str]) -> ModelRequestData:
+def load_deepseek_vl2(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "deepseek-ai/deepseek-vl2-tiny"
 
     engine_args = EngineArgs(
@@ -118,8 +117,9 @@ def load_deepseek_vl2(question: str,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
-    placeholder = "".join(f"image_{i}:<image>\n"
-                          for i, _ in enumerate(image_urls, start=1))
+    placeholder = "".join(
+        f"image_{i}:<image>\n" for i, _ in enumerate(image_urls, start=1)
+    )
     prompt = f"<|User|>: {placeholder}{question}\n\n<|Assistant|>:"
 
     return ModelRequestData(
@@ -140,23 +140,21 @@ def load_gemma3(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
     placeholders = [{"type": "image", "image": url} for url in image_urls]
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *placeholders,
-            {
-                "type": "text",
-                "text": question
-            },
-        ],
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
 
     processor = AutoProcessor.from_pretrained(model_name)
 
-    prompt = processor.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -176,15 +174,15 @@ def load_h2ovl(question: str, image_urls: list[str]) -> ModelRequestData:
         mm_processor_kwargs={"max_dynamic_patch": 4},
     )
 
-    placeholders = "\n".join(f"Image-{i}: <image>\n"
-                             for i, _ in enumerate(image_urls, start=1))
-    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+    placeholders = "\n".join(
+        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
+    )
+    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     # Stop tokens for H2OVL-Mississippi
     # https://huggingface.co/h2oai/h2ovl-mississippi-800m
@@ -211,14 +209,13 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
         # if you are running out of memory, you can reduce the "longest_edge".
         # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
         mm_processor_kwargs={
-            "size": {
-                "longest_edge": 2 * 364
-            },
+            "size": {"longest_edge": 2 * 364},
         },
     )
 
-    placeholders = "\n".join(f"Image-{i}: <image>\n"
-                             for i, _ in enumerate(image_urls, start=1))
+    placeholders = "\n".join(
+        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
+    )
     prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
     return ModelRequestData(
         engine_args=engine_args,
@@ -238,15 +235,16 @@ def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
         enforce_eager=True,
         limit_mm_per_prompt={"image": len(image_urls)},
         mm_processor_kwargs={
-            "max_image_size": {
-                "longest_edge": 384
-            },
+            "max_image_size": {"longest_edge": 384},
         },
     )
 
-    placeholders = "\n".join(f"Image-{i}: <image>\n"
-                             for i, _ in enumerate(image_urls, start=1))
-    prompt = f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
+    placeholders = "\n".join(
+        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
+    )
+    prompt = (
+        f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501
+    )
     return ModelRequestData(
         engine_args=engine_args,
         prompt=prompt,
@@ -265,15 +263,15 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
         mm_processor_kwargs={"max_dynamic_patch": 4},
     )
 
-    placeholders = "\n".join(f"Image-{i}: <image>\n"
-                             for i, _ in enumerate(image_urls, start=1))
-    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+    placeholders = "\n".join(
+        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
+    )
+    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     # Stop tokens for InternVL
     # models variants may have different stop tokens
@@ -301,23 +299,21 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
     placeholders = [{"type": "image", "image": url} for url in image_urls]
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *placeholders,
-            {
-                "type": "text",
-                "text": question
-            },
-        ],
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
 
     processor = AutoProcessor.from_pretrained(model_name)
 
-    prompt = processor.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -338,24 +334,21 @@ def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
     placeholders = [{"type": "image", "image": url} for url in image_urls]
-    messages = [{
-        "role":
-        "user",
-        "content": [
-            *placeholders,
-            {
-                "type": "text",
-                "text": question
-            },
-        ],
-    }]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
 
-    processor = AutoProcessor.from_pretrained(model_name,
-                                              trust_remote_code=True)
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
 
-    prompt = processor.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -419,15 +412,15 @@ def load_nvlm_d(question: str, image_urls: list[str]) -> ModelRequestData:
         mm_processor_kwargs={"max_dynamic_patch": 4},
     )
 
-    placeholders = "\n".join(f"Image-{i}: <image>\n"
-                             for i, _ in enumerate(image_urls, start=1))
-    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+    placeholders = "\n".join(
+        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
+    )
+    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -449,15 +442,15 @@ def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
         limit_mm_per_prompt={"image": len(image_urls)},
     )
 
-    placeholders = "\n".join(f"Image-{i}: <image>\n"
-                             for i, _ in enumerate(image_urls, start=1))
-    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+    placeholders = "\n".join(
+        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
+    )
+    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
 
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -509,8 +502,9 @@ def load_phi3v(question: str, image_urls: list[str]) -> ModelRequestData:
         limit_mm_per_prompt={"image": len(image_urls)},
         mm_processor_kwargs={"num_crops": 4},
     )
-    placeholders = "\n".join(f"<|image_{i}|>"
-                             for i, _ in enumerate(image_urls, start=1))
+    placeholders = "\n".join(
+        f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1)
+    )
     prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
 
     return ModelRequestData(
@@ -542,8 +536,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
         mm_processor_kwargs={"dynamic_hd": 4},
     )
 
-    placeholders = "".join(f"<|image_{i}|>"
-                           for i, _ in enumerate(image_urls, start=1))
+    placeholders = "".join(f"<|image_{i}|>" for i, _ in enumerate(image_urls, start=1))
     prompt = f"<|user|>{placeholders}{question}<|end|><|assistant|>"
 
     return ModelRequestData(
@@ -554,8 +547,7 @@ def load_phi4mm(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
-def load_qwen_vl_chat(question: str,
-                      image_urls: list[str]) -> ModelRequestData:
+def load_qwen_vl_chat(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "Qwen/Qwen-VL-Chat"
     engine_args = EngineArgs(
         model=model_name,
@@ -565,24 +557,26 @@ def load_qwen_vl_chat(question: str,
         hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
         limit_mm_per_prompt={"image": len(image_urls)},
     )
-    placeholders = "".join(f"Picture {i}: <img></img>\n"
-                           for i, _ in enumerate(image_urls, start=1))
+    placeholders = "".join(
+        f"Picture {i}: <img></img>\n" for i, _ in enumerate(image_urls, start=1)
+    )
 
     # This model does not have a chat_template attribute on its tokenizer,
     # so we need to explicitly pass it. We use ChatML since it's used in the
     # generation utils of the model:
     # https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
-    tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                              trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
     # Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
     chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"  # noqa: E501
 
-    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
-    prompt = tokenizer.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True,
-                                           chat_template=chat_template)
+    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True,
+        chat_template=chat_template,
+    )
 
     stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
     stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
@@ -600,9 +594,11 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     try:
         from qwen_vl_utils import process_vision_info
     except ModuleNotFoundError:
-        print('WARNING: `qwen-vl-utils` not installed, input images will not '
-              'be automatically resized. You can enable this functionality by '
-              '`pip install qwen-vl-utils`.')
+        print(
+            "WARNING: `qwen-vl-utils` not installed, input images will not "
+            "be automatically resized. You can enable this functionality by "
+            "`pip install qwen-vl-utils`."
+        )
         process_vision_info = None
 
     model_name = "Qwen/Qwen2-VL-7B-Instruct"
@@ -616,26 +612,22 @@ def load_qwen2_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
     placeholders = [{"type": "image", "image": url} for url in image_urls]
-    messages = [{
-        "role": "system",
-        "content": "You are a helpful assistant."
-    }, {
-        "role":
-        "user",
-        "content": [
-            *placeholders,
-            {
-                "type": "text",
-                "text": question
-            },
-        ],
-    }]
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        },
+    ]
 
     processor = AutoProcessor.from_pretrained(model_name)
 
-    prompt = processor.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     if process_vision_info is None:
         image_data = [fetch_image(url) for url in image_urls]
@@ -653,9 +645,11 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     try:
         from qwen_vl_utils import process_vision_info
     except ModuleNotFoundError:
-        print('WARNING: `qwen-vl-utils` not installed, input images will not '
-              'be automatically resized. You can enable this functionality by '
-              '`pip install qwen-vl-utils`.')
+        print(
+            "WARNING: `qwen-vl-utils` not installed, input images will not "
+            "be automatically resized. You can enable this functionality by "
+            "`pip install qwen-vl-utils`."
+        )
         process_vision_info = None
 
     model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
@@ -668,32 +662,27 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
     placeholders = [{"type": "image", "image": url} for url in image_urls]
-    messages = [{
-        "role": "system",
-        "content": "You are a helpful assistant."
-    }, {
-        "role":
-        "user",
-        "content": [
-            *placeholders,
-            {
-                "type": "text",
-                "text": question
-            },
-        ],
-    }]
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        },
+    ]
 
     processor = AutoProcessor.from_pretrained(model_name)
 
-    prompt = processor.apply_chat_template(messages,
-                                           tokenize=False,
-                                           add_generation_prompt=True)
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
 
     if process_vision_info is None:
         image_data = [fetch_image(url) for url in image_urls]
     else:
-        image_data, _ = process_vision_info(messages,
-                                            return_video_kwargs=False)
+        image_data, _ = process_vision_info(messages, return_video_kwargs=False)
 
     return ModelRequestData(
         engine_args=engine_args,
@@ -726,23 +715,20 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
 }
 
 
-def run_generate(model, question: str, image_urls: list[str],
-                 seed: Optional[int]):
+def run_generate(model, question: str, image_urls: list[str], seed: Optional[int]):
     req_data = model_example_map[model](question, image_urls)
 
     engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
     llm = LLM(**engine_args)
 
-    sampling_params = SamplingParams(temperature=0.0,
-                                     max_tokens=256,
-                                     stop_token_ids=req_data.stop_token_ids)
+    sampling_params = SamplingParams(
+        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
+    )
 
     outputs = llm.generate(
         {
             "prompt": req_data.prompt,
-            "multi_modal_data": {
-                "image": req_data.image_data
-            },
+            "multi_modal_data": {"image": req_data.image_data},
         },
         sampling_params=sampling_params,
         lora_request=req_data.lora_requests,
@@ -755,38 +741,40 @@ def run_generate(model, question: str, image_urls: list[str],
         print("-" * 50)
 
 
-def run_chat(model: str, question: str, image_urls: list[str],
-             seed: Optional[int]):
+def run_chat(model: str, question: str, image_urls: list[str], seed: Optional[int]):
     req_data = model_example_map[model](question, image_urls)
 
     # Disable other modalities to save memory
     default_limits = {"image": 0, "video": 0, "audio": 0}
     req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
-        req_data.engine_args.limit_mm_per_prompt or {})
+        req_data.engine_args.limit_mm_per_prompt or {}
+    )
 
     engine_args = asdict(req_data.engine_args) | {"seed": seed}
     llm = LLM(**engine_args)
 
-    sampling_params = SamplingParams(temperature=0.0,
-                                     max_tokens=256,
-                                     stop_token_ids=req_data.stop_token_ids)
+    sampling_params = SamplingParams(
+        temperature=0.0, max_tokens=256, stop_token_ids=req_data.stop_token_ids
+    )
     outputs = llm.chat(
-        [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": question,
-                },
-                *({
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
+        [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": question,
                     },
-                } for image_url in image_urls),
-            ],
-        }],
+                    *(
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": image_url},
+                        }
+                        for image_url in image_urls
+                    ),
+                ],
+            }
+        ],
         sampling_params=sampling_params,
         chat_template=req_data.chat_template,
         lora_request=req_data.lora_requests,
@@ -801,32 +789,39 @@ def run_chat(model: str, question: str, image_urls: list[str],
 
 def parse_args():
     parser = FlexibleArgumentParser(
-        description='Demo on using vLLM for offline inference with '
-        'vision language models that support multi-image input for text '
-        'generation')
-    parser.add_argument('--model-type',
-                        '-m',
-                        type=str,
-                        default="phi3_v",
-                        choices=model_example_map.keys(),
-                        help='Huggingface "model_type".')
-    parser.add_argument("--method",
-                        type=str,
-                        default="generate",
-                        choices=["generate", "chat"],
-                        help="The method to run in `vllm.LLM`.")
-    parser.add_argument("--seed",
-                        type=int,
-                        default=None,
-                        help="Set the seed when initializing `vllm.LLM`.")
+        description="Demo on using vLLM for offline inference with "
+        "vision language models that support multi-image input for text "
+        "generation"
+    )
+    parser.add_argument(
+        "--model-type",
+        "-m",
+        type=str,
+        default="phi3_v",
+        choices=model_example_map.keys(),
+        help='Huggingface "model_type".',
+    )
+    parser.add_argument(
+        "--method",
+        type=str,
+        default="generate",
+        choices=["generate", "chat"],
+        help="The method to run in `vllm.LLM`.",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=None,
+        help="Set the seed when initializing `vllm.LLM`.",
+    )
     parser.add_argument(
         "--num-images",
         "-n",
         type=int,
-        choices=list(range(1,
-                           len(IMAGE_URLS) + 1)),  # the max number of images
+        choices=list(range(1, len(IMAGE_URLS) + 1)),  # the max number of images
         default=2,
-        help="Number of images to use for the demo.")
+        help="Number of images to use for the demo.",
+    )
     return parser.parse_args()
 
 
@@ -835,7 +830,7 @@ def main(args: Namespace):
     method = args.method
     seed = args.seed
 
-    image_urls = IMAGE_URLS[:args.num_images]
+    image_urls = IMAGE_URLS[: args.num_images]
 
     if method == "generate":
         run_generate(model, QUESTION, image_urls, seed)
diff --git a/examples/online_serving/api_client.py b/examples/online_serving/api_client.py
index 36079ff11d07..cc190e91c141 100644
--- a/examples/online_serving/api_client.py
+++ b/examples/online_serving/api_client.py
@@ -17,16 +17,15 @@
 
 
 def clear_line(n: int = 1) -> None:
-    LINE_UP = '\033[1A'
-    LINE_CLEAR = '\x1b[2K'
+    LINE_UP = "\033[1A"
+    LINE_CLEAR = "\x1b[2K"
     for _ in range(n):
         print(LINE_UP, end=LINE_CLEAR, flush=True)
 
 
-def post_http_request(prompt: str,
-                      api_url: str,
-                      n: int = 1,
-                      stream: bool = False) -> requests.Response:
+def post_http_request(
+    prompt: str, api_url: str, n: int = 1, stream: bool = False
+) -> requests.Response:
     headers = {"User-Agent": "Test Client"}
     pload = {
         "prompt": prompt,
@@ -35,17 +34,14 @@ def post_http_request(prompt: str,
         "max_tokens": 16,
         "stream": stream,
     }
-    response = requests.post(api_url,
-                             headers=headers,
-                             json=pload,
-                             stream=stream)
+    response = requests.post(api_url, headers=headers, json=pload, stream=stream)
     return response
 
 
 def get_streaming_response(response: requests.Response) -> Iterable[list[str]]:
-    for chunk in response.iter_lines(chunk_size=8192,
-                                     decode_unicode=False,
-                                     delimiter=b"\n"):
+    for chunk in response.iter_lines(
+        chunk_size=8192, decode_unicode=False, delimiter=b"\n"
+    ):
         if chunk:
             data = json.loads(chunk.decode("utf-8"))
             output = data["text"]
diff --git a/examples/online_serving/cohere_rerank_client.py b/examples/online_serving/cohere_rerank_client.py
index c2d4ef08ddbb..e57b94e8805f 100644
--- a/examples/online_serving/cohere_rerank_client.py
+++ b/examples/online_serving/cohere_rerank_client.py
@@ -6,6 +6,7 @@
 
 run: vllm serve BAAI/bge-reranker-base
 """
+
 from typing import Union
 
 import cohere
@@ -16,28 +17,28 @@
 query = "What is the capital of France?"
 
 documents = [
-    "The capital of France is Paris", "Reranking is fun!",
-    "vLLM is an open-source framework for fast AI serving"
+    "The capital of France is Paris",
+    "Reranking is fun!",
+    "vLLM is an open-source framework for fast AI serving",
 ]
 
 
-def cohere_rerank(client: Union[Client, ClientV2], model: str, query: str,
-                  documents: list[str]) -> dict:
+def cohere_rerank(
+    client: Union[Client, ClientV2], model: str, query: str, documents: list[str]
+) -> dict:
     return client.rerank(model=model, query=query, documents=documents)
 
 
 def main():
     # cohere v1 client
-    cohere_v1 = cohere.Client(base_url="http://localhost:8000",
-                              api_key="sk-fake-key")
+    cohere_v1 = cohere.Client(base_url="http://localhost:8000", api_key="sk-fake-key")
     rerank_v1_result = cohere_rerank(cohere_v1, model, query, documents)
     print("-" * 50)
     print("rerank_v1_result:\n", rerank_v1_result)
     print("-" * 50)
 
     # or the v2
-    cohere_v2 = cohere.ClientV2("sk-fake-key",
-                                base_url="http://localhost:8000")
+    cohere_v2 = cohere.ClientV2("sk-fake-key", base_url="http://localhost:8000")
     rerank_v2_result = cohere_rerank(cohere_v2, model, query, documents)
     print("rerank_v2_result:\n", rerank_v2_result)
     print("-" * 50)
diff --git a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
index c6d26778ee49..2ffba4a7ed3f 100644
--- a/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
+++ b/examples/online_serving/disaggregated_serving/disagg_proxy_demo.py
@@ -13,6 +13,7 @@
 Note: This demo will be removed once the PDController implemented in PR 15343
 (https://github.com/vllm-project/vllm/pull/15343) supports XpYd.
 """
+
 import argparse
 import ipaddress
 import itertools
@@ -26,8 +27,7 @@
 import aiohttp
 import requests
 import uvicorn
-from fastapi import (APIRouter, Depends, FastAPI, Header, HTTPException,
-                     Request, status)
+from fastapi import APIRouter, Depends, FastAPI, Header, HTTPException, Request, status
 from fastapi.responses import JSONResponse, StreamingResponse
 
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@@ -36,24 +36,24 @@
 
 
 class SchedulingPolicy(ABC):
-
     @abstractmethod
     def schedule(self, cycler: itertools.cycle):
         raise NotImplementedError("Scheduling Proxy is not set.")
 
 
 class Proxy:
-
     def __init__(
         self,
         prefill_instances: list[str],
         decode_instances: list[str],
         model: str,
         scheduling_policy: SchedulingPolicy,
-        custom_create_completion: Optional[Callable[[Request],
-                                                    StreamingResponse]] = None,
-        custom_create_chat_completion: Optional[Callable[
-            [Request], StreamingResponse]] = None,
+        custom_create_completion: Optional[
+            Callable[[Request], StreamingResponse]
+        ] = None,
+        custom_create_chat_completion: Optional[
+            Callable[[Request], StreamingResponse]
+        ] = None,
     ):
         self.prefill_instances = prefill_instances
         self.decode_instances = decode_instances
@@ -68,30 +68,30 @@ def __init__(
 
     def setup_routes(self):
         self.router.post(
-            "/v1/completions",
-            dependencies=[
-                Depends(self.validate_json_request)
-            ])(self.custom_create_completion if self.
-               custom_create_completion else self.create_completion)
+            "/v1/completions", dependencies=[Depends(self.validate_json_request)]
+        )(
+            self.custom_create_completion
+            if self.custom_create_completion
+            else self.create_completion
+        )
         self.router.post(
-            "/v1/chat/completions",
-            dependencies=[
-                Depends(self.validate_json_request)
-            ])(self.custom_create_chat_completion if self.
-               custom_create_chat_completion else self.create_chat_completion)
-        self.router.get("/status",
-                        response_class=JSONResponse)(self.get_status)
-        self.router.post("/instances/add",
-                         dependencies=[Depends(self.api_key_authenticate)
-                                       ])(self.add_instance_endpoint)
+            "/v1/chat/completions", dependencies=[Depends(self.validate_json_request)]
+        )(
+            self.custom_create_chat_completion
+            if self.custom_create_chat_completion
+            else self.create_chat_completion
+        )
+        self.router.get("/status", response_class=JSONResponse)(self.get_status)
+        self.router.post(
+            "/instances/add", dependencies=[Depends(self.api_key_authenticate)]
+        )(self.add_instance_endpoint)
 
     async def validate_json_request(self, raw_request: Request):
         content_type = raw_request.headers.get("content-type", "").lower()
         if content_type != "application/json":
             raise HTTPException(
                 status_code=415,
-                detail=
-                "Unsupported Media Type: Only 'application/json' is allowed",
+                detail="Unsupported Media Type: Only 'application/json' is allowed",
             )
 
     def api_key_authenticate(self, x_api_key: str = Header(...)):
@@ -103,8 +103,7 @@ def api_key_authenticate(self, x_api_key: str = Header(...)):
                 detail="Server configuration error.",
             )
         if x_api_key != expected_api_key:
-            logger.warning("Unauthorized access attempt with API Key: %s",
-                           x_api_key)
+            logger.warning("Unauthorized access attempt with API Key: %s", x_api_key)
             raise HTTPException(
                 status_code=status.HTTP_403_FORBIDDEN,
                 detail="Forbidden: Invalid API Key.",
@@ -113,8 +112,7 @@ def api_key_authenticate(self, x_api_key: str = Header(...)):
     async def validate_instance(self, instance: str) -> bool:
         url = f"http://{instance}/v1/models"
         try:
-            async with aiohttp.ClientSession(
-                    timeout=AIOHTTP_TIMEOUT) as client:
+            async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as client:
                 logger.info("Verifying %s ...", instance)
                 async with client.get(url) as response:
                     if response.status == 200:
@@ -122,12 +120,15 @@ async def validate_instance(self, instance: str) -> bool:
                         if "data" in data and len(data["data"]) > 0:
                             model_cur = data["data"][0].get("id", "")
                             if model_cur == self.model:
-                                logger.info("Instance: %s could be added.",
-                                            instance)
+                                logger.info("Instance: %s could be added.", instance)
                                 return True
                             else:
-                                logger.warning("Mismatch model %s : %s != %s",
-                                               instance, model_cur, self.model)
+                                logger.warning(
+                                    "Mismatch model %s : %s != %s",
+                                    instance,
+                                    model_cur,
+                                    self.model,
+                                )
                                 return False
                         else:
                             return False
@@ -147,48 +148,47 @@ async def add_instance_endpoint(self, request: Request):
             instance_type = data.get("type")
             instance = data.get("instance")
             if instance_type not in ["prefill", "decode"]:
-                raise HTTPException(status_code=400,
-                                    detail="Invalid instance type.")
+                raise HTTPException(status_code=400, detail="Invalid instance type.")
             if not instance or ":" not in instance:
-                raise HTTPException(status_code=400,
-                                    detail="Invalid instance format.")
+                raise HTTPException(status_code=400, detail="Invalid instance format.")
             host, port_str = instance.split(":")
             try:
                 if host != "localhost":
                     ipaddress.ip_address(host)
                 port = int(port_str)
                 if not (0 < port < 65536):
-                    raise HTTPException(status_code=400,
-                                        detail="Invalid port number.")
+                    raise HTTPException(status_code=400, detail="Invalid port number.")
             except Exception as e:
-                raise HTTPException(status_code=400,
-                                    detail="Invalid instance address.") from e
+                raise HTTPException(
+                    status_code=400, detail="Invalid instance address."
+                ) from e
 
             is_valid = await self.validate_instance(instance)
             if not is_valid:
-                raise HTTPException(status_code=400,
-                                    detail="Instance validation failed.")
+                raise HTTPException(
+                    status_code=400, detail="Instance validation failed."
+                )
 
             if instance_type == "prefill":
                 if instance not in self.prefill_instances:
                     self.prefill_instances.append(instance)
-                    self.prefill_cycler = itertools.cycle(
-                        self.prefill_instances)
+                    self.prefill_cycler = itertools.cycle(self.prefill_instances)
                 else:
-                    raise HTTPException(status_code=400,
-                                        detail="Instance already exists.")
+                    raise HTTPException(
+                        status_code=400, detail="Instance already exists."
+                    )
             else:
                 if instance not in self.decode_instances:
                     self.decode_instances.append(instance)
                     self.decode_cycler = itertools.cycle(self.decode_instances)
                 else:
-                    raise HTTPException(status_code=400,
-                                        detail="Instance already exists.")
+                    raise HTTPException(
+                        status_code=400, detail="Instance already exists."
+                    )
 
-            return JSONResponse(content={
-                "message":
-                f"Added {instance} to {instance_type}_instances."
-            })
+            return JSONResponse(
+                content={"message": f"Added {instance} to {instance_type}_instances."}
+            )
         except HTTPException as http_exc:
             raise http_exc
         except Exception as e:
@@ -197,16 +197,16 @@ async def add_instance_endpoint(self, request: Request):
 
     async def forward_request(self, url, data, use_chunked=True):
         async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
-            headers = {
-                "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
-            }
+            headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
             try:
-                async with session.post(url=url, json=data,
-                                        headers=headers) as response:
+                async with session.post(
+                    url=url, json=data, headers=headers
+                ) as response:
                     if 200 <= response.status < 300 or 400 <= response.status < 500:  # noqa: E501
                         if use_chunked:
                             async for chunk_bytes in response.content.iter_chunked(  # noqa: E501
-                                    1024):
+                                1024
+                            ):
                                 yield chunk_bytes
                         else:
                             content = await response.read()
@@ -217,20 +217,21 @@ async def forward_request(self, url, data, use_chunked=True):
                             error_content = json.loads(error_content)
                         except json.JSONDecodeError:
                             error_content = error_content
-                        logger.error("Request failed with status %s: %s",
-                                     response.status, error_content)
+                        logger.error(
+                            "Request failed with status %s: %s",
+                            response.status,
+                            error_content,
+                        )
                         raise HTTPException(
                             status_code=response.status,
-                            detail=
-                            f"Request failed with status {response.status}: "
+                            detail=f"Request failed with status {response.status}: "
                             f"{error_content}",
                         )
             except aiohttp.ClientError as e:
                 logger.error("ClientError occurred: %s", str(e))
                 raise HTTPException(
                     status_code=502,
-                    detail=
-                    "Bad Gateway: Error communicating with upstream server.",
+                    detail="Bad Gateway: Error communicating with upstream server.",
                 ) from e
             except Exception as e:
                 logger.error("Unexpected error: %s", str(e))
@@ -258,8 +259,8 @@ async def create_completion(self, raw_request: Request):
             prefill_instance = self.schedule(self.prefill_cycler)
             try:
                 async for _ in self.forward_request(
-                        f"http://{prefill_instance}/v1/completions",
-                        kv_prepare_request):
+                    f"http://{prefill_instance}/v1/completions", kv_prepare_request
+                ):
                     continue
             except HTTPException as http_exc:
                 self.remove_instance_endpoint("prefill", prefill_instance)
@@ -270,7 +271,8 @@ async def create_completion(self, raw_request: Request):
 
             try:
                 generator = self.forward_request(
-                    f"http://{decode_instance}/v1/completions", request)
+                    f"http://{decode_instance}/v1/completions", request
+                )
             except HTTPException as http_exc:
                 self.remove_instance_endpoint("decode", decode_instance)
                 raise http_exc
@@ -295,8 +297,8 @@ async def create_chat_completion(self, raw_request: Request):
             prefill_instance = self.schedule(self.prefill_cycler)
             try:
                 async for _ in self.forward_request(
-                        f"http://{prefill_instance}/v1/chat/completions",
-                        kv_prepare_request):
+                    f"http://{prefill_instance}/v1/chat/completions", kv_prepare_request
+                ):
                     continue
             except HTTPException as http_exc:
                 self.remove_instance_endpoint("prefill", prefill_instance)
@@ -306,8 +308,8 @@ async def create_chat_completion(self, raw_request: Request):
 
             try:
                 generator = self.forward_request(
-                    "http://" + decode_instance + "/v1/chat/completions",
-                    request)
+                    "http://" + decode_instance + "/v1/chat/completions", request
+                )
             except HTTPException as http_exc:
                 self.remove_instance_endpoint("decode", decode_instance)
                 raise http_exc
@@ -318,20 +320,20 @@ async def create_chat_completion(self, raw_request: Request):
             error_messages = [str(e) for e in exc_info if e]
             print("Error occurred in disagg proxy server")
             print(error_messages)
-            return StreamingResponse(content=iter(error_messages),
-                                     media_type="text/event-stream")
+            return StreamingResponse(
+                content=iter(error_messages), media_type="text/event-stream"
+            )
 
     def remove_instance_endpoint(self, instance_type, instance):
-        if (instance_type == "decode" and instance in self.decode_instances):
+        if instance_type == "decode" and instance in self.decode_instances:
             self.decode_instances.remove(instance)
             self.decode_cycler = itertools.cycle(self.decode_instances)
-        if (instance_type == "prefill" and instance in self.decode_instances):
+        if instance_type == "prefill" and instance in self.prefill_instances:
             self.prefill_instances.remove(instance)
-            self.prefill_cycler = itertools.cycle(self.decode_instances)
+            self.prefill_cycler = itertools.cycle(self.prefill_instances)
 
 
 class RoundRobinSchedulingPolicy(SchedulingPolicy):
-
     def __init__(self):
         super().__init__()
 
@@ -340,15 +342,12 @@ def schedule(self, cycler: itertools.cycle) -> str:
 
 
 class ProxyServer:
-
     def __init__(
         self,
         args: argparse.Namespace,
         scheduling_policy: Optional[SchedulingPolicy] = None,
-        create_completion: Optional[Callable[[Request],
-                                             StreamingResponse]] = None,
-        create_chat_completion: Optional[Callable[[Request],
-                                                  StreamingResponse]] = None,
+        create_completion: Optional[Callable[[Request], StreamingResponse]] = None,
+        create_chat_completion: Optional[Callable[[Request], StreamingResponse]] = None,
     ):
         self.validate_parsed_serve_args(args)
         self.port = args.port
@@ -356,8 +355,11 @@ def __init__(
             prefill_instances=[] if args.prefill is None else args.prefill,
             decode_instances=[] if args.decode is None else args.decode,
             model=args.model,
-            scheduling_policy=(scheduling_policy if scheduling_policy
-                               is not None else RoundRobinSchedulingPolicy()),
+            scheduling_policy=(
+                scheduling_policy
+                if scheduling_policy is not None
+                else RoundRobinSchedulingPolicy()
+            ),
             custom_create_completion=create_completion,
             custom_create_chat_completion=create_chat_completion,
         )
@@ -382,11 +384,9 @@ def validate_instances(self, instances: list):
                     ipaddress.ip_address(host)
                 port = int(port)
                 if not (0 < port < 65536):
-                    raise ValueError(
-                        f"Invalid port number in instance: {instance}")
+                    raise ValueError(f"Invalid port number in instance: {instance}")
             except Exception as e:
-                raise ValueError(
-                    f"Invalid instance {instance}: {str(e)}") from e
+                raise ValueError(f"Invalid instance {instance}: {str(e)}") from e
 
     def verify_model_config(self, instances: list, model: str) -> None:
         model_suffix = model.split("/")[-1]
@@ -399,12 +399,14 @@ def verify_model_config(self, instances: list, model: str) -> None:
                     if model_cur_suffix != model_suffix:
                         raise ValueError(
                             f"{instance} serves a different model: "
-                            f"{model_cur} != {model}")
+                            f"{model_cur} != {model}"
+                        )
                 else:
                     raise ValueError(f"Cannot get model id from {instance}!")
             except requests.RequestException as e:
                 raise ValueError(
-                    f"Error communicating with {instance}: {str(e)}") from e
+                    f"Error communicating with {instance}: {str(e)}"
+                ) from e
 
     def run_server(self):
         app = FastAPI()
@@ -417,11 +419,7 @@ def run_server(self):
 def parse_args():
     # Todo: allow more config
     parser = argparse.ArgumentParser("vLLM disaggregated proxy server.")
-    parser.add_argument("--model",
-                        "-m",
-                        type=str,
-                        required=True,
-                        help="Model name")
+    parser.add_argument("--model", "-m", type=str, required=True, help="Model name")
 
     parser.add_argument(
         "--prefill",
diff --git a/examples/online_serving/gradio_openai_chatbot_webserver.py b/examples/online_serving/gradio_openai_chatbot_webserver.py
index 314f1c5b7395..3f2a3d01b456 100644
--- a/examples/online_serving/gradio_openai_chatbot_webserver.py
+++ b/examples/online_serving/gradio_openai_chatbot_webserver.py
@@ -17,6 +17,7 @@
 2. Rename the downloaded file to: frpc_linux_amd64_v0.3
 3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
 """
+
 import argparse
 
 import gradio as gr
@@ -24,16 +25,12 @@
 
 
 def format_history_to_openai(history):
-    history_openai_format = [{
-        "role": "system",
-        "content": "You are a great AI assistant."
-    }]
+    history_openai_format = [
+        {"role": "system", "content": "You are a great AI assistant."}
+    ]
     for human, assistant in history:
         history_openai_format.append({"role": "user", "content": human})
-        history_openai_format.append({
-            "role": "assistant",
-            "content": assistant
-        })
+        history_openai_format.append({"role": "assistant", "content": assistant})
     return history_openai_format
 
 
@@ -49,17 +46,17 @@ def predict(message, history, client, model_name, temp, stop_token_ids):
         temperature=temp,
         stream=True,
         extra_body={
-            'repetition_penalty':
-            1,
-            'stop_token_ids':
-            [int(id.strip())
-             for id in stop_token_ids.split(',')] if stop_token_ids else []
-        })
+            "repetition_penalty": 1,
+            "stop_token_ids": [int(id.strip()) for id in stop_token_ids.split(",")]
+            if stop_token_ids
+            else [],
+        },
+    )
 
     # Collect all chunks and concatenate them into a full message
     full_message = ""
     for chunk in stream:
-        full_message += (chunk.choices[0].delta.content or "")
+        full_message += chunk.choices[0].delta.content or ""
 
     # Return the full message as a single response
     return full_message
@@ -67,38 +64,34 @@ def predict(message, history, client, model_name, temp, stop_token_ids):
 
 def parse_args():
     parser = argparse.ArgumentParser(
-        description='Chatbot Interface with Customizable Parameters')
-    parser.add_argument('--model-url',
-                        type=str,
-                        default='http://localhost:8000/v1',
-                        help='Model URL')
-    parser.add_argument('-m',
-                        '--model',
-                        type=str,
-                        required=True,
-                        help='Model name for the chatbot')
-    parser.add_argument('--temp',
-                        type=float,
-                        default=0.8,
-                        help='Temperature for text generation')
-    parser.add_argument('--stop-token-ids',
-                        type=str,
-                        default='',
-                        help='Comma-separated stop token IDs')
+        description="Chatbot Interface with Customizable Parameters"
+    )
+    parser.add_argument(
+        "--model-url", type=str, default="http://localhost:8000/v1", help="Model URL"
+    )
+    parser.add_argument(
+        "-m", "--model", type=str, required=True, help="Model name for the chatbot"
+    )
+    parser.add_argument(
+        "--temp", type=float, default=0.8, help="Temperature for text generation"
+    )
+    parser.add_argument(
+        "--stop-token-ids", type=str, default="", help="Comma-separated stop token IDs"
+    )
     parser.add_argument("--host", type=str, default=None)
     parser.add_argument("--port", type=int, default=8001)
     return parser.parse_args()
 
 
 def build_gradio_interface(client, model_name, temp, stop_token_ids):
-
     def chat_predict(message, history):
-        return predict(message, history, client, model_name, temp,
-                       stop_token_ids)
+        return predict(message, history, client, model_name, temp, stop_token_ids)
 
-    return gr.ChatInterface(fn=chat_predict,
-                            title="Chatbot Interface",
-                            description="A simple chatbot powered by vLLM")
+    return gr.ChatInterface(
+        fn=chat_predict,
+        title="Chatbot Interface",
+        description="A simple chatbot powered by vLLM",
+    )
 
 
 def main():
@@ -113,12 +106,13 @@ def main():
     client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
 
     # Define the Gradio chatbot interface using the predict function
-    gradio_interface = build_gradio_interface(client, args.model, args.temp,
-                                              args.stop_token_ids)
+    gradio_interface = build_gradio_interface(
+        client, args.model, args.temp, args.stop_token_ids
+    )
 
-    gradio_interface.queue().launch(server_name=args.host,
-                                    server_port=args.port,
-                                    share=True)
+    gradio_interface.queue().launch(
+        server_name=args.host, server_port=args.port, share=True
+    )
 
 
 if __name__ == "__main__":
diff --git a/examples/online_serving/gradio_webserver.py b/examples/online_serving/gradio_webserver.py
index 2e7c2a0c5838..fd341ff493b5 100644
--- a/examples/online_serving/gradio_webserver.py
+++ b/examples/online_serving/gradio_webserver.py
@@ -17,6 +17,7 @@
 2. Rename the downloaded file to: frpc_linux_amd64_v0.3
 3. Move the file to this location: /home/user/.cache/huggingface/gradio/frpc
 """
+
 import argparse
 import json
 
@@ -31,14 +32,11 @@ def http_bot(prompt):
         "stream": True,
         "max_tokens": 128,
     }
-    response = requests.post(args.model_url,
-                             headers=headers,
-                             json=pload,
-                             stream=True)
-
-    for chunk in response.iter_lines(chunk_size=8192,
-                                     decode_unicode=False,
-                                     delimiter=b"\n"):
+    response = requests.post(args.model_url, headers=headers, json=pload, stream=True)
+
+    for chunk in response.iter_lines(
+        chunk_size=8192, decode_unicode=False, delimiter=b"\n"
+    ):
         if chunk:
             data = json.loads(chunk.decode("utf-8"))
             output = data["text"][0]
@@ -48,10 +46,10 @@ def http_bot(prompt):
 def build_demo():
     with gr.Blocks() as demo:
         gr.Markdown("# vLLM text completion demo\n")
-        inputbox = gr.Textbox(label="Input",
-                              placeholder="Enter text and press ENTER")
-        outputbox = gr.Textbox(label="Output",
-                               placeholder="Generated result from the model")
+        inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")
+        outputbox = gr.Textbox(
+            label="Output", placeholder="Generated result from the model"
+        )
         inputbox.submit(http_bot, [inputbox], [outputbox])
     return demo
 
@@ -60,17 +58,15 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default=None)
     parser.add_argument("--port", type=int, default=8001)
-    parser.add_argument("--model-url",
-                        type=str,
-                        default="http://localhost:8000/generate")
+    parser.add_argument(
+        "--model-url", type=str, default="http://localhost:8000/generate"
+    )
     return parser.parse_args()
 
 
 def main(args):
     demo = build_demo()
-    demo.queue().launch(server_name=args.host,
-                        server_port=args.port,
-                        share=True)
+    demo.queue().launch(server_name=args.host, server_port=args.port, share=True)
 
 
 if __name__ == "__main__":
diff --git a/examples/online_serving/jinaai_rerank_client.py b/examples/online_serving/jinaai_rerank_client.py
index 3076bba765ce..7eb3d2193f41 100644
--- a/examples/online_serving/jinaai_rerank_client.py
+++ b/examples/online_serving/jinaai_rerank_client.py
@@ -5,6 +5,7 @@
 
 run: vllm serve BAAI/bge-reranker-base
 """
+
 import json
 
 import requests
@@ -14,14 +15,13 @@
 headers = {"accept": "application/json", "Content-Type": "application/json"}
 
 data = {
-    "model":
-    "BAAI/bge-reranker-base",
-    "query":
-    "What is the capital of France?",
+    "model": "BAAI/bge-reranker-base",
+    "query": "What is the capital of France?",
     "documents": [
         "The capital of Brazil is Brasilia.",
-        "The capital of France is Paris.", "Horses and cows are both animals"
-    ]
+        "The capital of France is Paris.",
+        "Horses and cows are both animals",
+    ],
 }
 
 
diff --git a/examples/online_serving/kv_events_subscriber.py b/examples/online_serving/kv_events_subscriber.py
index 88bbbebd7478..65d74dccab80 100644
--- a/examples/online_serving/kv_events_subscriber.py
+++ b/examples/online_serving/kv_events_subscriber.py
@@ -9,17 +9,14 @@
 #
 # Types copied from vllm.distributed.kv_events
 #
-class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True,
-                 gc=False):
+class EventBatch(msgspec.Struct, array_like=True, omit_defaults=True, gc=False):
     ts: float
     events: list[Any]
 
 
-class KVCacheEvent(msgspec.Struct,
-                   array_like=True,
-                   omit_defaults=True,
-                   gc=False,
-                   tag=True):
+class KVCacheEvent(
+    msgspec.Struct, array_like=True, omit_defaults=True, gc=False, tag=True
+):
     """Base class for all KV cache-related events"""
 
 
@@ -77,8 +74,9 @@ def main():
 
                 if last_seq >= 0 and seq > last_seq + 1:
                     missed = seq - last_seq - 1
-                    print(f"Missed {missed} messages"
-                          f" (last: {last_seq}, current: {seq})")
+                    print(
+                        f"Missed {missed} messages (last: {last_seq}, current: {seq})"
+                    )
 
                     replay.send((last_seq + 1).to_bytes(8, "big"))
 
diff --git a/examples/online_serving/openai_chat_completion_client.py b/examples/online_serving/openai_chat_completion_client.py
index 74e0c045d621..2856e3be3e2d 100644
--- a/examples/online_serving/openai_chat_completion_client.py
+++ b/examples/online_serving/openai_chat_completion_client.py
@@ -3,28 +3,35 @@
 NOTE: start a supported chat completion model server with `vllm serve`, e.g.
     vllm serve meta-llama/Llama-2-7b-chat-hf
 """
+
+import argparse
+
 from openai import OpenAI
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-messages = [{
-    "role": "system",
-    "content": "You are a helpful assistant."
-}, {
-    "role": "user",
-    "content": "Who won the world series in 2020?"
-}, {
-    "role": "assistant",
-    "content": "The Los Angeles Dodgers won the World Series in 2020."
-}, {
-    "role": "user",
-    "content": "Where was it played?"
-}]
-
-
-def main():
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "Who won the world series in 2020?"},
+    {
+        "role": "assistant",
+        "content": "The Los Angeles Dodgers won the World Series in 2020.",
+    },
+    {"role": "user", "content": "Where was it played?"},
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Client for vLLM API server")
+    parser.add_argument(
+        "--stream", action="store_true", help="Enable streaming response"
+    )
+    return parser.parse_args()
+
+
+def main(args):
     client = OpenAI(
         # defaults to os.environ.get("OPENAI_API_KEY")
         api_key=openai_api_key,
@@ -34,16 +41,23 @@ def main():
     models = client.models.list()
     model = models.data[0].id
 
+    # Chat Completion API
     chat_completion = client.chat.completions.create(
         messages=messages,
         model=model,
+        stream=args.stream,
     )
 
     print("-" * 50)
     print("Chat completion results:")
-    print(chat_completion)
+    if args.stream:
+        for c in chat_completion:
+            print(c)
+    else:
+        print(chat_completion)
     print("-" * 50)
 
 
 if __name__ == "__main__":
-    main()
+    args = parse_args()
+    main(args)
diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
index 2707d46f46e2..8c3c6ecdd4b0 100644
--- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py
@@ -43,7 +43,7 @@ def encode_base64_content_from_url(content_url: str) -> str:
 
     with requests.get(content_url) as response:
         response.raise_for_status()
-        result = base64.b64encode(response.content).decode('utf-8')
+        result = base64.b64encode(response.content).decode("utf-8")
 
     return result
 
@@ -51,10 +51,7 @@ def encode_base64_content_from_url(content_url: str) -> str:
 # Text-only inference
 def run_text_only(model: str) -> None:
     chat_completion = client.chat.completions.create(
-        messages=[{
-            "role": "user",
-            "content": "What's the capital of France?"
-        }],
+        messages=[{"role": "user", "content": "What's the capital of France?"}],
         model=model,
         max_completion_tokens=64,
     )
@@ -65,26 +62,21 @@ def run_text_only(model: str) -> None:
 
 # Single-image input inference
 def run_single_image(model: str) -> None:
-
     ## Use image url in the payload
     image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
     chat_completion_from_url = client.chat.completions.create(
-        messages=[{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this image?"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this image?"},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url},
                     },
-                },
-            ],
-        }],
+                ],
+            }
+        ],
         model=model,
         max_completion_tokens=64,
     )
@@ -95,22 +87,18 @@ def run_single_image(model: str) -> None:
     ## Use base64 encoded image in the payload
     image_base64 = encode_base64_content_from_url(image_url)
     chat_completion_from_base64 = client.chat.completions.create(
-        messages=[{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this image?"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/jpeg;base64,{image_base64}"
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this image?"},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
                     },
-                },
-            ],
-        }],
+                ],
+            }
+        ],
         model=model,
         max_completion_tokens=64,
     )
@@ -124,28 +112,22 @@ def run_multi_image(model: str) -> None:
     image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
     image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
     chat_completion_from_url = client.chat.completions.create(
-        messages=[{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What are the animals in these images?"
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url_duck
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What are the animals in these images?"},
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url_duck},
                     },
-                },
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": image_url_lion
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url_lion},
                     },
-                },
-            ],
-        }],
+                ],
+            }
+        ],
         model=model,
         max_completion_tokens=64,
     )
@@ -161,22 +143,18 @@ def run_video(model: str) -> None:
 
     ## Use video url in the payload
     chat_completion_from_url = client.chat.completions.create(
-        messages=[{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this video?"
-                },
-                {
-                    "type": "video_url",
-                    "video_url": {
-                        "url": video_url
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this video?"},
+                    {
+                        "type": "video_url",
+                        "video_url": {"url": video_url},
                     },
-                },
-            ],
-        }],
+                ],
+            }
+        ],
         model=model,
         max_completion_tokens=64,
     )
@@ -186,22 +164,18 @@ def run_video(model: str) -> None:
 
     ## Use base64 encoded video in the payload
     chat_completion_from_base64 = client.chat.completions.create(
-        messages=[{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this video?"
-                },
-                {
-                    "type": "video_url",
-                    "video_url": {
-                        "url": f"data:video/mp4;base64,{video_base64}"
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this video?"},
+                    {
+                        "type": "video_url",
+                        "video_url": {"url": f"data:video/mp4;base64,{video_base64}"},
                     },
-                },
-            ],
-        }],
+                ],
+            }
+        ],
         model=model,
         max_completion_tokens=64,
     )
@@ -219,24 +193,22 @@ def run_audio(model: str) -> None:
 
     # OpenAI-compatible schema (`input_audio`)
     chat_completion_from_base64 = client.chat.completions.create(
-        messages=[{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this audio?"
-                },
-                {
-                    "type": "input_audio",
-                    "input_audio": {
-                        # Any format supported by librosa is supported
-                        "data": audio_base64,
-                        "format": "wav"
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this audio?"},
+                    {
+                        "type": "input_audio",
+                        "input_audio": {
+                            # Any format supported by librosa is supported
+                            "data": audio_base64,
+                            "format": "wav",
+                        },
                     },
-                },
-            ],
-        }],
+                ],
+            }
+        ],
         model=model,
         max_completion_tokens=64,
     )
@@ -246,23 +218,21 @@ def run_audio(model: str) -> None:
 
     # HTTP URL
     chat_completion_from_url = client.chat.completions.create(
-        messages=[{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this audio?"
-                },
-                {
-                    "type": "audio_url",
-                    "audio_url": {
-                        # Any format supported by librosa is supported
-                        "url": audio_url
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this audio?"},
+                    {
+                        "type": "audio_url",
+                        "audio_url": {
+                            # Any format supported by librosa is supported
+                            "url": audio_url
+                        },
                     },
-                },
-            ],
-        }],
+                ],
+            }
+        ],
         model=model,
         max_completion_tokens=64,
     )
@@ -272,23 +242,21 @@ def run_audio(model: str) -> None:
 
     # base64 URL
     chat_completion_from_base64 = client.chat.completions.create(
-        messages=[{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "text",
-                    "text": "What's in this audio?"
-                },
-                {
-                    "type": "audio_url",
-                    "audio_url": {
-                        # Any format supported by librosa is supported
-                        "url": f"data:audio/ogg;base64,{audio_base64}"
+        messages=[
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this audio?"},
+                    {
+                        "type": "audio_url",
+                        "audio_url": {
+                            # Any format supported by librosa is supported
+                            "url": f"data:audio/ogg;base64,{audio_base64}"
+                        },
                     },
-                },
-            ],
-        }],
+                ],
+            }
+        ],
         model=model,
         max_completion_tokens=64,
     )
@@ -308,14 +276,17 @@ def run_audio(model: str) -> None:
 
 def parse_args():
     parser = FlexibleArgumentParser(
-        description='Demo on using OpenAI client for online serving with '
-        'multimodal language models served with vLLM.')
-    parser.add_argument('--chat-type',
-                        '-c',
-                        type=str,
-                        default="single-image",
-                        choices=list(example_function_map.keys()),
-                        help='Conversation type with multimodal data.')
+        description="Demo on using OpenAI client for online serving with "
+        "multimodal language models served with vLLM."
+    )
+    parser.add_argument(
+        "--chat-type",
+        "-c",
+        type=str,
+        default="single-image",
+        choices=list(example_function_map.keys()),
+        help="Conversation type with multimodal data.",
+    )
     return parser.parse_args()
 
 
diff --git a/examples/online_serving/openai_chat_completion_client_with_tools.py b/examples/online_serving/openai_chat_completion_client_with_tools.py
index 94f9c1570586..a0d7841f644f 100644
--- a/examples/online_serving/openai_chat_completion_client_with_tools.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools.py
@@ -16,6 +16,7 @@
             --chat-template examples/tool_chat_template_hermes.jinja \
             --enable-auto-tool-choice --tool-call-parser hermes
 """
+
 import json
 from typing import Any
 
@@ -25,55 +26,55 @@
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-tools = [{
-    "type": "function",
-    "function": {
-        "name": "get_current_weather",
-        "description": "Get the current weather in a given location",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "city": {
-                    "type":
-                    "string",
-                    "description":
-                    "The city to find the weather for, e.g. 'San Francisco'"
-                },
-                "state": {
-                    "type":
-                    "string",
-                    "description":
-                    "the two-letter abbreviation for the state that the city is"
-                    " in, e.g. 'CA' which would mean 'California'"
-                },
-                "unit": {
-                    "type": "string",
-                    "description": "The unit to fetch the temperature in",
-                    "enum": ["celsius", "fahrenheit"]
-                }
+properties = {
+    "city": {
+        "type": "string",
+        "description": "The city to find the weather for, e.g. 'San Francisco'",
+    },
+    "state": {
+        "type": "string",
+        "description": "the two-letter abbreviation for the state that the city is"
+        " in, e.g. 'CA' which would mean 'California'",
+    },
+    "unit": {
+        "type": "string",
+        "description": "The unit to fetch the temperature in",
+        "enum": ["celsius", "fahrenheit"],
+    },
+}
+
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": properties,
+                "required": ["city", "state", "unit"],
             },
-            "required": ["city", "state", "unit"]
-        }
+        },
     }
-}]
-
-messages = [{
-    "role": "user",
-    "content": "Hi! How are you doing today?"
-}, {
-    "role": "assistant",
-    "content": "I'm doing well! How can I help you?"
-}, {
-    "role":
-    "user",
-    "content":
-    "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
-}]
-
-
-def get_current_weather(city: str, state: str, unit: 'str'):
-    return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
-            "partly cloudly, with highs in the 90's.")
+]
+
+messages = [
+    {"role": "user", "content": "Hi! How are you doing today?"},
+    {"role": "assistant", "content": "I'm doing well! How can I help you?"},
+    {
+        "role": "user",
+        "content": (
+            "Can you tell me what the temperature will be in Dallas, in fahrenheit?"
+        ),
+    },
+]
+
+
+def get_current_weather(city: str, state: str, unit: "str"):
+    return (
+        "The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
+        "partly cloudy, with highs in the 90's."
+    )
 
 
 def handle_tool_calls_stream(
@@ -82,10 +83,9 @@ def handle_tool_calls_stream(
     model: str,
     tools: list[dict[str, Any]],
 ) -> list[Any]:
-    tool_calls_stream = client.chat.completions.create(messages=messages,
-                                                       model=model,
-                                                       tools=tools,
-                                                       stream=True)
+    tool_calls_stream = client.chat.completions.create(
+        messages=messages, model=model, tools=tools, stream=True
+    )
     chunks = []
     print("chunks: ")
     for chunk in tool_calls_stream:
@@ -106,8 +106,7 @@ def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]:
             tool_call = chunk.choices[0].delta.tool_calls[0]
             if tool_call.index != tool_call_idx:
                 if tool_call_idx >= 0:
-                    print(f"streamed tool call arguments: "
-                          f"{arguments[tool_call_idx]}")
+                    print(f"streamed tool call arguments: {arguments[tool_call_idx]}")
                 tool_call_idx = chunk.choices[0].delta.tool_calls[0].index
                 arguments.append("")
             if tool_call.id:
@@ -115,8 +114,7 @@ def handle_tool_calls_arguments(chunks: list[Any]) -> list[str]:
 
             if tool_call.function:
                 if tool_call.function.name:
-                    print(
-                        f"streamed tool call name: {tool_call.function.name}")
+                    print(f"streamed tool call name: {tool_call.function.name}")
 
                 if tool_call.function.arguments:
                     arguments[tool_call_idx] += tool_call.function.arguments
@@ -136,9 +134,9 @@ def main():
     models = client.models.list()
     model = models.data[0].id
 
-    chat_completion = client.chat.completions.create(messages=messages,
-                                                     model=model,
-                                                     tools=tools)
+    chat_completion = client.chat.completions.create(
+        messages=messages, model=model, tools=tools
+    )
 
     print("-" * 70)
     print("Chat completion results:")
@@ -158,10 +156,12 @@ def main():
     print("-" * 70)
 
     # Add tool call results to the conversation
-    messages.append({
-        "role": "assistant",
-        "tool_calls": chat_completion.choices[0].message.tool_calls
-    })
+    messages.append(
+        {
+            "role": "assistant",
+            "tool_calls": chat_completion.choices[0].message.tool_calls,
+        }
+    )
 
     # Now, simulate a tool call
     available_tools = {"get_current_weather": get_current_weather}
@@ -172,17 +172,18 @@ def main():
         args = json.loads(call.function.arguments)
         result = tool_to_call(**args)
         print("tool_to_call result: ", result)
-        messages.append({
-            "role": "tool",
-            "content": result,
-            "tool_call_id": call.id,
-            "name": call.function.name
-        })
-
-    chat_completion_2 = client.chat.completions.create(messages=messages,
-                                                       model=model,
-                                                       tools=tools,
-                                                       stream=False)
+        messages.append(
+            {
+                "role": "tool",
+                "content": result,
+                "tool_call_id": call.id,
+                "name": call.function.name,
+            }
+        )
+
+    chat_completion_2 = client.chat.completions.create(
+        messages=messages, model=model, tools=tools, stream=False
+    )
     print("Chat completion2 results:")
     print(chat_completion_2)
     print("-" * 70)
diff --git a/examples/online_serving/openai_chat_completion_client_with_tools_required.py b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
index 97d900bb75f1..45c4232fe1de 100644
--- a/examples/online_serving/openai_chat_completion_client_with_tools_required.py
+++ b/examples/online_serving/openai_chat_completion_client_with_tools_required.py
@@ -28,18 +28,16 @@
                 "type": "object",
                 "properties": {
                     "city": {
-                        "type":
-                        "string",
-                        "description":
-                        "The city to find the weather for"
+                        "type": "string",
+                        "description": "The city to find the weather for"
                         ", e.g. 'San Francisco'",
                     },
                     "state": {
-                        "type":
-                        "string",
-                        "description":
-                        "the two-letter abbreviation for the state that the "
-                        "city is in, e.g. 'CA' which would mean 'California'",
+                        "type": "string",
+                        "description": (
+                            "the two-letter abbreviation for the state that the "
+                            "city is in, e.g. 'CA' which would mean 'California'"
+                        ),
                     },
                     "unit": {
                         "type": "string",
@@ -60,22 +58,20 @@
                 "type": "object",
                 "properties": {
                     "city": {
-                        "type":
-                        "string",
-                        "description":
-                        "The city to get the forecast for, e.g. 'New York'",
+                        "type": "string",
+                        "description": (
+                            "The city to get the forecast for, e.g. 'New York'"
+                        ),
                     },
                     "state": {
-                        "type":
-                        "string",
-                        "description":
-                        "The two-letter abbreviation for the state, e.g. 'NY'",
+                        "type": "string",
+                        "description": (
+                            "The two-letter abbreviation for the state, e.g. 'NY'"
+                        ),
                     },
                     "days": {
-                        "type":
-                        "integer",
-                        "description":
-                        "Number of days to get the forecast for (1-7)",
+                        "type": "integer",
+                        "description": "Number of days to get the forecast for (1-7)",
                     },
                     "unit": {
                         "type": "string",
@@ -90,19 +86,11 @@
 ]
 
 messages = [
+    {"role": "user", "content": "Hi! How are you doing today?"},
+    {"role": "assistant", "content": "I'm doing well! How can I help you?"},
     {
         "role": "user",
-        "content": "Hi! How are you doing today?"
-    },
-    {
-        "role": "assistant",
-        "content": "I'm doing well! How can I help you?"
-    },
-    {
-        "role":
-        "user",
-        "content":
-        "Can you tell me what the current weather is in Dallas \
+        "content": "Can you tell me what the current weather is in Dallas \
             and the forecast for the next 5 days, in fahrenheit?",
     },
 ]
@@ -123,17 +111,16 @@ def main():
         model=model,
         tools=tools,
         tool_choice="required",
-        stream=True  # Enable streaming response
+        stream=True,  # Enable streaming response
     )
 
     for chunk in chat_completion:
         if chunk.choices and chunk.choices[0].delta.tool_calls:
             print(chunk.choices[0].delta.tool_calls)
 
-    chat_completion = client.chat.completions.create(messages=messages,
-                                                     model=model,
-                                                     tools=tools,
-                                                     tool_choice="required")
+    chat_completion = client.chat.completions.create(
+        messages=messages, model=model, tools=tools, tool_choice="required"
+    )
 
     print(chat_completion.choices[0].message.tool_calls)
 
diff --git a/examples/online_serving/openai_chat_completion_structured_outputs.py b/examples/online_serving/openai_chat_completion_structured_outputs.py
index 660369e55d40..a4134ea43c4b 100644
--- a/examples/online_serving/openai_chat_completion_structured_outputs.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs.py
@@ -12,15 +12,17 @@
 from openai import BadRequestError, OpenAI
 from pydantic import BaseModel
 
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
 
 # Guided decoding by Choice (list of possible options)
 def guided_choice_completion(client: OpenAI, model: str):
     completion = client.chat.completions.create(
         model=model,
-        messages=[{
-            "role": "user",
-            "content": "Classify this sentiment: vLLM is wonderful!"
-        }],
+        messages=[
+            {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+        ],
         extra_body={"guided_choice": ["positive", "negative"]},
     )
     return completion.choices[0].message.content
@@ -28,20 +30,21 @@ def guided_choice_completion(client: OpenAI, model: str):
 
 # Guided decoding by Regex
 def guided_regex_completion(client: OpenAI, model: str):
-    prompt = ("Generate an email address for Alan Turing, who works in Enigma."
-              "End in .com and new line. Example result:"
-              "alan.turing@enigma.com\n")
+    prompt = (
+        "Generate an email address for Alan Turing, who works in Enigma. "
+        "End in .com and new line. Example result: "
+        "alan.turing@enigma.com\n"
+    )
 
     completion = client.chat.completions.create(
         model=model,
-        messages=[{
-            "role": "user",
-            "content": prompt,
-        }],
-        extra_body={
-            "guided_regex": r"\w+@\w+\.com\n",
-            "stop": ["\n"]
-        },
+        messages=[
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
+        extra_body={"guided_regex": r"\w+@\w+\.com\n", "stop": ["\n"]},
     )
     return completion.choices[0].message.content
 
@@ -63,14 +66,18 @@ class CarDescription(BaseModel):
 def guided_json_completion(client: OpenAI, model: str):
     json_schema = CarDescription.model_json_schema()
 
-    prompt = ("Generate a JSON with the brand, model and car_type of"
-              "the most iconic car from the 90's")
+    prompt = (
+        "Generate a JSON with the brand, model and car_type of "
+        "the most iconic car from the 90's"
+    )
     completion = client.chat.completions.create(
         model=model,
-        messages=[{
-            "role": "user",
-            "content": prompt,
-        }],
+        messages=[
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
         extra_body={"guided_json": json_schema},
     )
     return completion.choices[0].message.content
@@ -92,14 +99,18 @@ def guided_grammar_completion(client: OpenAI, model: str):
         number ::= "1 " | "2 "
     """
 
-    prompt = ("Generate an SQL query to show the 'username' and 'email'"
-              "from the 'users' table.")
+    prompt = (
+        "Generate an SQL query to show the 'username' and 'email' "
+        "from the 'users' table."
+    )
     completion = client.chat.completions.create(
         model=model,
-        messages=[{
-            "role": "user",
-            "content": prompt,
-        }],
+        messages=[
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
         extra_body={"guided_grammar": simplified_sql_grammar},
     )
     return completion.choices[0].message.content
@@ -107,19 +118,23 @@ def guided_grammar_completion(client: OpenAI, model: str):
 
 # Extra backend options
 def extra_backend_options_completion(client: OpenAI, model: str):
-    prompt = ("Generate an email address for Alan Turing, who works in Enigma."
-              "End in .com and new line. Example result:"
-              "alan.turing@enigma.com\n")
+    prompt = (
+        "Generate an email address for Alan Turing, who works in Enigma. "
+        "End in .com and new line. Example result: "
+        "alan.turing@enigma.com\n"
+    )
 
     try:
         # The guided_decoding_disable_fallback option forces vLLM to use
         # xgrammar, so when it fails you get a 400 with the reason why
         completion = client.chat.completions.create(
             model=model,
-            messages=[{
-                "role": "user",
-                "content": prompt,
-            }],
+            messages=[
+                {
+                    "role": "user",
+                    "content": prompt,
+                }
+            ],
             extra_body={
                 "guided_regex": r"\w+@\w+\.com\n",
                 "stop": ["\n"],
@@ -134,8 +149,8 @@ def extra_backend_options_completion(client: OpenAI, model: str):
 
 def main():
     client: OpenAI = OpenAI(
-        base_url="http://localhost:8000/v1",
-        api_key="-",
+        base_url=openai_api_base,
+        api_key=openai_api_key,
     )
 
     model = client.models.list().data[0].id
diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
index 42aa12c451c0..c73208abe600 100644
--- a/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_structural_tag.py
@@ -7,18 +7,20 @@
 # to enforce the format of a tool call response, but it could be used for
 # any structured output within a subset of the response.
 
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
 
 def main():
     client = OpenAI(
-        base_url="http://localhost:8000/v1",
-        api_key="-",
+        base_url=openai_api_base,
+        api_key=openai_api_key,
     )
 
-    messages = [{
-        "role":
-        "user",
-        "content":
-        """
+    messages = [
+        {
+            "role": "user",
+            "content": """
 You have access to the following function to retrieve the weather in a city:
 
     {
@@ -55,29 +57,28 @@ def main():
 
 Given the previous instructions, what is the weather in New York City, Boston,
 and San Francisco?
-"""
-    }]
+""",
+        }
+    ]
 
     response = client.chat.completions.create(
         model=client.models.list().data[0].id,
         messages=messages,
         response_format={
-            "type":
-            "structural_tag",
-            "structures": [{
-                "begin": "<function=get_weather>",
-                "schema": {
-                    "type": "object",
-                    "properties": {
-                        "city": {
-                            "type": "string"
-                        }
-                    }
-                },
-                "end": "</function>"
-            }],
-            "triggers": ["<function="]
-        })
+            "type": "structural_tag",
+            "structures": [
+                {
+                    "begin": "<function=get_weather>",
+                    "schema": {
+                        "type": "object",
+                        "properties": {"city": {"type": "string"}},
+                    },
+                    "end": "</function>",
+                }
+            ],
+            "triggers": ["<function="],
+        },
+    )
     print(response)
 
 
diff --git a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
index a04f0cdf12f7..1ca61a8d5895 100644
--- a/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
@@ -27,21 +27,22 @@
 
 
 def print_completion_details(completion):
-    print("reasoning_content: ",
-          completion.choices[0].message.reasoning_content)
+    print("reasoning_content: ", completion.choices[0].message.reasoning_content)
     print("content: ", completion.choices[0].message.content)
 
 
 # Guided decoding by Regex
 def guided_regex_completion(client: OpenAI, model: str):
-    prompt = ("What is the capital of France?")
+    prompt = "What is the capital of France?"
 
     completion = client.chat.completions.create(
         model=model,
-        messages=[{
-            "role": "user",
-            "content": prompt,
-        }],
+        messages=[
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
         extra_body={
             "guided_regex": "(Paris|London)",
         },
@@ -57,13 +58,15 @@ class People(BaseModel):
 def guided_json_completion(client: OpenAI, model: str):
     json_schema = People.model_json_schema()
 
-    prompt = ("Generate a JSON with the name and age of one random person.")
+    prompt = "Generate a JSON with the name and age of one random person."
     completion = client.chat.completions.create(
         model=model,
-        messages=[{
-            "role": "user",
-            "content": prompt,
-        }],
+        messages=[
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
         extra_body={"guided_json": json_schema},
     )
     print_completion_details(completion)
@@ -86,14 +89,18 @@ class CarDescription(BaseModel):
 def guided_car_json_completion(client: OpenAI, model: str):
     json_schema = CarDescription.model_json_schema()
 
-    prompt = ("Generate a JSON with the brand, model and car_type of"
-              "the most iconic car from the 90's")
+    prompt = (
+        "Generate a JSON with the brand, model and car_type of "
+        "the most iconic car from the 90's"
+    )
     completion = client.chat.completions.create(
         model=model,
-        messages=[{
-            "role": "user",
-            "content": prompt,
-        }],
+        messages=[
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
         extra_body={"guided_json": json_schema},
     )
     print_completion_details(completion)
@@ -116,14 +123,18 @@ def guided_grammar_completion(client: OpenAI, model: str):
     """
 
     # This may be very slow https://github.com/vllm-project/vllm/issues/12122
-    prompt = ("Generate an SQL query to show the 'username' and 'email'"
-              "from the 'users' table.")
+    prompt = (
+        "Generate an SQL query to show the 'username' and 'email' "
+        "from the 'users' table."
+    )
     completion = client.chat.completions.create(
         model=model,
-        messages=[{
-            "role": "user",
-            "content": prompt,
-        }],
+        messages=[
+            {
+                "role": "user",
+                "content": prompt,
+            }
+        ],
         extra_body={"guided_grammar": simplified_sql_grammar},
     )
     print_completion_details(completion)
diff --git a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
index 9417abd3989a..a5febad45863 100644
--- a/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
@@ -20,9 +20,11 @@
 
 
 # Now, simulate a tool call
-def get_current_weather(city: str, state: str, unit: 'str'):
-    return ("The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
-            "partly cloudly, with highs in the 90's.")
+def get_current_weather(city: str, state: str, unit: "str"):
+    return (
+        "The weather in Dallas, Texas is 85 degrees fahrenheit. It is "
+        "partly cloudy, with highs in the 90's."
+    )
 
 
 available_tools = {"get_current_weather": get_current_weather}
@@ -31,49 +33,47 @@ def get_current_weather(city: str, state: str, unit: 'str'):
 openai_api_key = "EMPTY"
 openai_api_base = "http://localhost:8000/v1"
 
-tools = [{
-    "type": "function",
-    "function": {
-        "name": "get_current_weather",
-        "description": "Get the current weather in a given location",
-        "parameters": {
-            "type": "object",
-            "properties": {
-                "city": {
-                    "type":
-                    "string",
-                    "description":
-                    "The city to find the weather for, e.g. 'San Francisco'"
-                },
-                "state": {
-                    "type":
-                    "string",
-                    "description":
-                    "the two-letter abbreviation for the state that the city is"
-                    " in, e.g. 'CA' which would mean 'California'"
-                },
-                "unit": {
-                    "type": "string",
-                    "description": "The unit to fetch the temperature in",
-                    "enum": ["celsius", "fahrenheit"]
-                }
+properties = {
+    "city": {
+        "type": "string",
+        "description": "The city to find the weather for, e.g. 'San Francisco'",
+    },
+    "state": {
+        "type": "string",
+        "description": "the two-letter abbreviation for the state that the city is"
+        " in, e.g. 'CA' which would mean 'California'",
+    },
+    "unit": {
+        "type": "string",
+        "description": "The unit to fetch the temperature in",
+        "enum": ["celsius", "fahrenheit"],
+    },
+}
+
+tools = [
+    {
+        "type": "function",
+        "function": {
+            "name": "get_current_weather",
+            "description": "Get the current weather in a given location",
+            "parameters": {
+                "type": "object",
+                "properties": properties,
+                "required": ["city", "state", "unit"],
             },
-            "required": ["city", "state", "unit"]
-        }
+        },
     }
-}]
-messages = [{
-    "role": "user",
-    "content": "Hi! How are you doing today?"
-}, {
-    "role": "assistant",
-    "content": "I'm doing well! How can I help you?"
-}, {
-    "role":
-    "user",
-    "content":
-    "Can you tell me what the temperate will be in Dallas, in fahrenheit?"
-}]
+]
+messages = [
+    {"role": "user", "content": "Hi! How are you doing today?"},
+    {"role": "assistant", "content": "I'm doing well! How can I help you?"},
+    {
+        "role": "user",
+        "content": (
+            "Can you tell me what the temperature will be in Dallas, in fahrenheit?"
+        ),
+    },
+]
 
 
 def extract_reasoning_and_calls(chunks: list):
@@ -110,73 +110,55 @@ def main():
     models = client.models.list()
     model = models.data[0].id
 
+    print("---------Full Generate With Automatic Function Calling-------------")
+    tool_calls = client.chat.completions.create(
+        messages=messages, model=model, tools=tools
+    )
+    print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
+    print(f"function name: {tool_calls.choices[0].message.tool_calls[0].function.name}")
     print(
-        "---------Full Generate With Automatic Function Calling-------------")
-    tool_calls = client.chat.completions.create(messages=messages,
-                                                model=model,
-                                                tools=tools)
-    print(
-        f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}"
+        f"function arguments: "
+        f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}"
     )
-    print(f"function name: "
-          f"{tool_calls.choices[0].message.tool_calls[0].function.name}")
-    print(f"function arguments: "
-          f"{tool_calls.choices[0].message.tool_calls[0].function.arguments}")
 
-    print(
-        "----------Stream Generate With Automatic Function Calling-----------")
-    tool_calls_stream = client.chat.completions.create(messages=messages,
-                                                       model=model,
-                                                       tools=tools,
-                                                       stream=True)
+    print("----------Stream Generate With Automatic Function Calling-----------")
+    tool_calls_stream = client.chat.completions.create(
+        messages=messages, model=model, tools=tools, stream=True
+    )
 
     chunks = list(tool_calls_stream)
 
-    reasoning_content, arguments, function_names = extract_reasoning_and_calls(
-        chunks)
+    reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
 
     print(f"reasoning_content: {reasoning_content}")
     print(f"function name: {function_names[0]}")
     print(f"function arguments: {arguments[0]}")
 
-    print(
-        "----------Full Generate With Named Function Calling-----------------")
-    tool_calls = client.chat.completions.create(messages=messages,
-                                                model=model,
-                                                tools=tools,
-                                                tool_choice={
-                                                    "type": "function",
-                                                    "function": {
-                                                        "name":
-                                                        "get_current_weather"
-                                                    }
-                                                })
+    print("----------Full Generate With Named Function Calling-----------------")
+    tool_calls = client.chat.completions.create(
+        messages=messages,
+        model=model,
+        tools=tools,
+        tool_choice={"type": "function", "function": {"name": "get_current_weather"}},
+    )
 
     tool_call = tool_calls.choices[0].message.tool_calls[0].function
-    print(
-        f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}"
-    )
+    print(f"reasoning_content: {tool_calls.choices[0].message.reasoning_content}")
     print(f"function name: {tool_call.name}")
     print(f"function arguments: {tool_call.arguments}")
-    print(
-        "----------Stream Generate With Named Function Calling--------------")
+    print("----------Stream Generate With Named Function Calling--------------")
 
     tool_calls_stream = client.chat.completions.create(
         messages=messages,
         model=model,
         tools=tools,
-        tool_choice={
-            "type": "function",
-            "function": {
-                "name": "get_current_weather"
-            }
-        },
-        stream=True)
+        tool_choice={"type": "function", "function": {"name": "get_current_weather"}},
+        stream=True,
+    )
 
     chunks = list(tool_calls_stream)
 
-    reasoning_content, arguments, function_names = extract_reasoning_and_calls(
-        chunks)
+    reasoning_content, arguments, function_names = extract_reasoning_and_calls(chunks)
     print(f"reasoning_content: {reasoning_content}")
     print(f"function name: {function_names[0]}")
     print(f"function arguments: {arguments[0]}")
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning.py b/examples/online_serving/openai_chat_completion_with_reasoning.py
index 4bf7731cb41e..f6b8082115f1 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning.py
@@ -45,12 +45,12 @@ def main():
 
     # Round 2
     messages.append({"role": "assistant", "content": content})
-    messages.append({
-        "role":
-        "user",
-        "content":
-        "How many Rs are there in the word 'strawberry'?",
-    })
+    messages.append(
+        {
+            "role": "user",
+            "content": "How many Rs are there in the word 'strawberry'?",
+        }
+    )
     response = client.chat.completions.create(model=model, messages=messages)
 
     reasoning_content = response.choices[0].message.reasoning_content
diff --git a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
index 9cc0a5f2476b..f984fbabf24f 100644
--- a/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
+++ b/examples/online_serving/openai_chat_completion_with_reasoning_streaming.py
@@ -43,9 +43,7 @@ def main():
 
     # ruff: noqa: E501
     # For granite: add: `extra_body={"chat_template_kwargs": {"thinking": True}}`
-    stream = client.chat.completions.create(model=model,
-                                            messages=messages,
-                                            stream=True)
+    stream = client.chat.completions.create(model=model, messages=messages, stream=True)
 
     print("client: Start streaming chat completions...")
     printed_reasoning_content = False
diff --git a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
index c850b5aa2f80..ee519e555ff7 100644
--- a/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
+++ b/examples/online_serving/openai_chat_embedding_client_for_multimodal.py
@@ -14,26 +14,17 @@ def vlm2vec():
     response = requests.post(
         "http://localhost:8000/v1/embeddings",
         json={
-            "model":
-            "TIGER-Lab/VLM2Vec-Full",
-            "messages": [{
-                "role":
-                "user",
-                "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": image_url
-                        }
-                    },
-                    {
-                        "type": "text",
-                        "text": "Represent the given image."
-                    },
-                ],
-            }],
-            "encoding_format":
-            "float",
+            "model": "TIGER-Lab/VLM2Vec-Full",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": image_url}},
+                        {"type": "text", "text": "Represent the given image."},
+                    ],
+                }
+            ],
+            "encoding_format": "float",
         },
     )
     response.raise_for_status()
@@ -45,19 +36,20 @@ def vlm2vec():
 def dse_qwen2_vl(inp: dict):
     # Embedding an Image
     if inp["type"] == "image":
-        messages = [{
-            "role":
-            "user",
-            "content": [{
-                "type": "image_url",
-                "image_url": {
-                    "url": inp["image_url"],
-                }
-            }, {
-                "type": "text",
-                "text": "What is shown in this image?"
-            }]
-        }]
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": inp["image_url"],
+                        },
+                    },
+                    {"type": "text", "text": "What is shown in this image?"},
+                ],
+            }
+        ]
     # Embedding a Text Query
     else:
         # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image
@@ -66,23 +58,21 @@ def dse_qwen2_vl(inp: dict):
         image_placeholder = Image.new("RGB", (56, 56))
         image_placeholder.save(buffer, "png")
         buffer.seek(0)
-        image_placeholder = base64.b64encode(buffer.read()).decode('utf-8')
-        messages = [{
-            "role":
-            "user",
-            "content": [
-                {
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/jpeg;base64,{image_placeholder}",
-                    }
-                },
-                {
-                    "type": "text",
-                    "text": f"Query: {inp['content']}"
-                },
-            ]
-        }]
+        image_placeholder = base64.b64encode(buffer.read()).decode("utf-8")
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": f"data:image/jpeg;base64,{image_placeholder}",
+                        },
+                    },
+                    {"type": "text", "text": f"Query: {inp['content']}"},
+                ],
+            }
+        ]
 
     response = requests.post(
         "http://localhost:8000/v1/embeddings",
@@ -101,12 +91,15 @@ def dse_qwen2_vl(inp: dict):
 def parse_args():
     parser = argparse.ArgumentParser(
         "Script to call a specified VLM through the API. Make sure to serve "
-        "the model with --task embed before running this.")
-    parser.add_argument("--model",
-                        type=str,
-                        choices=["vlm2vec", "dse_qwen2_vl"],
-                        required=True,
-                        help="Which model to call.")
+        "the model with --task embed before running this."
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        choices=["vlm2vec", "dse_qwen2_vl"],
+        required=True,
+        help="Which model to call.",
+    )
     return parser.parse_args()
 
 
@@ -114,16 +107,20 @@ def main(args):
     if args.model == "vlm2vec":
         vlm2vec()
     elif args.model == "dse_qwen2_vl":
-        dse_qwen2_vl({
-            "type": "image",
-            "image_url": image_url,
-        })
-        dse_qwen2_vl({
-            "type": "text",
-            "content": "What is the weather like today?",
-        })
+        dse_qwen2_vl(
+            {
+                "type": "image",
+                "image_url": image_url,
+            }
+        )
+        dse_qwen2_vl(
+            {
+                "type": "text",
+                "content": "What is the weather like today?",
+            }
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     args = parse_args()
     main(args)
diff --git a/examples/online_serving/openai_classification_client.py b/examples/online_serving/openai_classification_client.py
index 99241346373e..649cfa5d6686 100644
--- a/examples/online_serving/openai_classification_client.py
+++ b/examples/online_serving/openai_classification_client.py
@@ -16,9 +16,7 @@ def parse_args():
     parse = argparse.ArgumentParser()
     parse.add_argument("--host", type=str, default="localhost")
     parse.add_argument("--port", type=int, default=8000)
-    parse.add_argument("--model",
-                       type=str,
-                       default="jason9693/Qwen2.5-1.5B-apeach")
+    parse.add_argument("--model", type=str, default="jason9693/Qwen2.5-1.5B-apeach")
     return parse.parse_args()
 
 
diff --git a/examples/online_serving/openai_completion_client.py b/examples/online_serving/openai_completion_client.py
index 6ab7619bff19..b1d21b5e4b9f 100644
--- a/examples/online_serving/openai_completion_client.py
+++ b/examples/online_serving/openai_completion_client.py
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
+import argparse
+
 from openai import OpenAI
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
@@ -7,7 +9,15 @@
 openai_api_base = "http://localhost:8000/v1"
 
 
-def main():
+def parse_args():
+    parser = argparse.ArgumentParser(description="Client for vLLM API server")
+    parser.add_argument(
+        "--stream", action="store_true", help="Enable streaming response"
+    )
+    return parser.parse_args()
+
+
+def main(args):
     client = OpenAI(
         # defaults to os.environ.get("OPENAI_API_KEY")
         api_key=openai_api_key,
@@ -18,18 +28,18 @@ def main():
     model = models.data[0].id
 
     # Completion API
-    stream = False
     completion = client.completions.create(
         model=model,
         prompt="A robot may not injure a human being",
         echo=False,
         n=2,
-        stream=stream,
-        logprobs=3)
+        stream=args.stream,
+        logprobs=3,
+    )
 
     print("-" * 50)
     print("Completion results:")
-    if stream:
+    if args.stream:
         for c in completion:
             print(c)
     else:
@@ -38,4 +48,5 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    args = parse_args()
+    main(args)
diff --git a/examples/online_serving/openai_cross_encoder_score.py b/examples/online_serving/openai_cross_encoder_score.py
index 20a64ddb2141..7891e14cb71e 100644
--- a/examples/online_serving/openai_cross_encoder_score.py
+++ b/examples/online_serving/openai_cross_encoder_score.py
@@ -4,6 +4,7 @@
 
 Run `vllm serve <model> --task score` to start up the server in vLLM.
 """
+
 import argparse
 import pprint
 
@@ -38,9 +39,7 @@ def main(args):
     pprint.pprint(score_response.json())
 
     text_1 = "What is the capital of France?"
-    text_2 = [
-        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
-    ]
+    text_2 = ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]
     prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
     score_response = post_http_request(prompt=prompt, api_url=api_url)
     print("\nPrompt when text_1 is string and text_2 is a list:")
@@ -48,12 +47,8 @@ def main(args):
     print("\nScore Response:")
     pprint.pprint(score_response.json())
 
-    text_1 = [
-        "What is the capital of Brazil?", "What is the capital of France?"
-    ]
-    text_2 = [
-        "The capital of Brazil is Brasilia.", "The capital of France is Paris."
-    ]
+    text_1 = ["What is the capital of Brazil?", "What is the capital of France?"]
+    text_2 = ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]
     prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
     score_response = post_http_request(prompt=prompt, api_url=api_url)
     print("\nPrompt when text_1 and text_2 are both lists:")
diff --git a/examples/online_serving/openai_embedding_client.py b/examples/online_serving/openai_embedding_client.py
index bc217f7ca7a0..a055654e9133 100644
--- a/examples/online_serving/openai_embedding_client.py
+++ b/examples/online_serving/openai_embedding_client.py
@@ -21,7 +21,7 @@ def main():
         # ruff: noqa: E501
         input=[
             "Hello my name is",
-            "The best thing about vLLM is that it supports many different models"
+            "The best thing about vLLM is that it supports many different models",
         ],
         model=model,
     )
diff --git a/examples/online_serving/openai_pooling_client.py b/examples/online_serving/openai_pooling_client.py
index abcfe27c2769..2620a1232024 100644
--- a/examples/online_serving/openai_pooling_client.py
+++ b/examples/online_serving/openai_pooling_client.py
@@ -5,6 +5,7 @@
 Run `vllm serve <model> --task <embed|classify|reward|score>`
 to start up the server in vLLM.
 """
+
 import argparse
 import pprint
 
@@ -21,9 +22,7 @@ def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--model",
-                        type=str,
-                        default="jason9693/Qwen2.5-1.5B-apeach")
+    parser.add_argument("--model", type=str, default="jason9693/Qwen2.5-1.5B-apeach")
 
     return parser.parse_args()
 
@@ -42,15 +41,13 @@ def main(args):
 
     # Input like Chat API
     prompt = {
-        "model":
-        model_name,
-        "messages": [{
-            "role": "user",
-            "content": [{
-                "type": "text",
-                "text": "vLLM is great!"
-            }],
-        }]
+        "model": model_name,
+        "messages": [
+            {
+                "role": "user",
+                "content": [{"type": "text", "text": "vLLM is great!"}],
+            }
+        ],
     }
     pooling_response = post_http_request(prompt=prompt, api_url=api_url)
     print("Pooling Response:")
diff --git a/examples/online_serving/openai_transcription_client.py b/examples/online_serving/openai_transcription_client.py
index 66e622672ef2..eb501ae72aa9 100644
--- a/examples/online_serving/openai_transcription_client.py
+++ b/examples/online_serving/openai_transcription_client.py
@@ -7,8 +7,8 @@
 
 from vllm.assets.audio import AudioAsset
 
-mary_had_lamb = AudioAsset('mary_had_lamb').get_local_path()
-winning_call = AudioAsset('winning_call').get_local_path()
+mary_had_lamb = AudioAsset("mary_had_lamb").get_local_path()
+winning_call = AudioAsset("winning_call").get_local_path()
 
 # Modify OpenAI's API key and API base to use vLLM's API server.
 openai_api_key = "EMPTY"
@@ -31,7 +31,8 @@ def sync_openai():
             extra_body=dict(
                 seed=4419,
                 repetition_penalty=1.3,
-            ))
+            ),
+        )
         print("transcription result:", transcription.text)
 
 
@@ -42,33 +43,30 @@ def sync_openai():
 async def stream_openai_response():
     data = {
         "language": "en",
-        'stream': True,
+        "stream": True,
         "model": "openai/whisper-large-v3",
     }
     url = openai_api_base + "/audio/transcriptions"
     headers = {"Authorization": f"Bearer {openai_api_key}"}
-    print("transcription result:", end=' ')
+    print("transcription result:", end=" ")
     async with httpx.AsyncClient() as client:
         with open(str(winning_call), "rb") as f:
-            async with client.stream('POST',
-                                     url,
-                                     files={'file': f},
-                                     data=data,
-                                     headers=headers) as response:
+            async with client.stream(
+                "POST", url, files={"file": f}, data=data, headers=headers
+            ) as response:
                 async for line in response.aiter_lines():
                     # Each line is a JSON object prefixed with 'data: '
                     if line:
-                        if line.startswith('data: '):
-                            line = line[len('data: '):]
+                        if line.startswith("data: "):
+                            line = line[len("data: ") :]
                         # Last chunk, stream ends
-                        if line.strip() == '[DONE]':
+                        if line.strip() == "[DONE]":
                             break
                         # Parse the JSON response
                         chunk = json.loads(line)
                         # Extract and print the content
-                        content = chunk['choices'][0].get('delta',
-                                                          {}).get('content')
-                        print(content, end='')
+                        content = chunk["choices"][0].get("delta", {}).get("content")
+                        print(content, end="")
 
 
 # Run the asynchronous function
diff --git a/examples/online_serving/opentelemetry/dummy_client.py b/examples/online_serving/opentelemetry/dummy_client.py
index a8b353090d79..33d365f0caa5 100644
--- a/examples/online_serving/opentelemetry/dummy_client.py
+++ b/examples/online_serving/opentelemetry/dummy_client.py
@@ -1,14 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import requests
-from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
-    OTLPSpanExporter)
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
 from opentelemetry.sdk.trace import TracerProvider
-from opentelemetry.sdk.trace.export import (BatchSpanProcessor,
-                                            ConsoleSpanExporter)
+from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
 from opentelemetry.trace import SpanKind, set_tracer_provider
-from opentelemetry.trace.propagation.tracecontext import (
-    TraceContextTextMapPropagator)
+from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
 
 trace_provider = TracerProvider()
 set_tracer_provider(trace_provider)
diff --git a/examples/online_serving/prompt_embed_inference_with_openai_client.py b/examples/online_serving/prompt_embed_inference_with_openai_client.py
new file mode 100644
index 000000000000..85ea2340736e
--- /dev/null
+++ b/examples/online_serving/prompt_embed_inference_with_openai_client.py
@@ -0,0 +1,84 @@
+# SPDX-License-Identifier: Apache-2.0
+"""
+vLLM OpenAI-Compatible Client with Prompt Embeddings
+
+This script demonstrates how to:
+1. Generate prompt embeddings using Hugging Face Transformers
+2. Encode them in base64 format
+3. Send them to a vLLM server via the OpenAI-compatible Completions API
+
+Run the vLLM server first:
+vllm serve meta-llama/Llama-3.2-1B-Instruct \
+  --task generate \
+  --max-model-len 4096 \
+  --enable-prompt-embeds
+
+Run the client:
+python examples/online_serving/prompt_embed_inference_with_openai_client.py
+
+Model: meta-llama/Llama-3.2-1B-Instruct
+Note: This model is gated on Hugging Face Hub.
+      You must request access to use it:
+      https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
+
+Dependencies:
+- transformers
+- torch
+- openai
+"""
+
+import base64
+import io
+
+import torch
+import transformers
+from openai import OpenAI
+
+
+def main():
+    client = OpenAI(
+        api_key="EMPTY",
+        base_url="http://localhost:8000/v1",
+    )
+
+    model_name = "meta-llama/Llama-3.2-1B-Instruct"
+
+    # Transformers
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
+    transformers_model = transformers.AutoModelForCausalLM.from_pretrained(model_name)
+
+    # Refer to the HuggingFace repo for the correct format to use
+    chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
+    token_ids = tokenizer.apply_chat_template(
+        chat, add_generation_prompt=True, return_tensors="pt"
+    )
+
+    embedding_layer = transformers_model.get_input_embeddings()
+    prompt_embeds = embedding_layer(token_ids).squeeze(0)
+
+    # Prompt embeddings
+    buffer = io.BytesIO()
+    torch.save(prompt_embeds, buffer)
+    buffer.seek(0)
+    binary_data = buffer.read()
+    encoded_embeds = base64.b64encode(binary_data).decode("utf-8")
+
+    completion = client.completions.create(
+        model=model_name,
+        # NOTE: The OpenAI client does not allow `None` as an input to
+        # `prompt`. Use an empty string if you have no text prompts.
+        prompt="",
+        max_tokens=5,
+        temperature=0.0,
+        # NOTE: The OpenAI client allows passing in extra JSON body via the
+        # `extra_body` argument.
+        extra_body={"prompt_embeds": encoded_embeds},
+    )
+
+    print("-" * 30)
+    print(completion.choices[0].text)
+    print("-" * 30)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/online_serving/ray_serve_deepseek.py b/examples/online_serving/ray_serve_deepseek.py
index e2dce107e78a..a76020130c3a 100644
--- a/examples/online_serving/ray_serve_deepseek.py
+++ b/examples/online_serving/ray_serve_deepseek.py
@@ -28,9 +28,7 @@
     },
     # Change to the accelerator type of the node
     accelerator_type="H100",
-    runtime_env={"env_vars": {
-        "VLLM_USE_V1": "1"
-    }},
+    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
     # Customize engine arguments as needed (e.g. vLLM engine kwargs)
     engine_kwargs={
         "tensor_parallel_size": 8,
diff --git a/examples/online_serving/retrieval_augmented_generation_with_langchain.py b/examples/online_serving/retrieval_augmented_generation_with_langchain.py
index 73063065cb36..37af3b3887f5 100644
--- a/examples/online_serving/retrieval_augmented_generation_with_langchain.py
+++ b/examples/online_serving/retrieval_augmented_generation_with_langchain.py
@@ -55,7 +55,7 @@ def load_and_split_documents(config: dict[str, Any]):
     Load and split documents from web URL
     """
     try:
-        loader = WebBaseLoader(web_paths=(config["url"], ))
+        loader = WebBaseLoader(web_paths=(config["url"],))
         docs = loader.load()
 
         text_splitter = RecursiveCharacterTextSplitter(
@@ -121,64 +121,71 @@ def create_qa_chain(retriever: Any, llm: ChatOpenAI, prompt: PromptTemplate):
     """
     Set up question answering chain
     """
-    return ({
-        "context": retriever | format_docs,
-        "question": RunnablePassthrough(),
-    }
-            | prompt
-            | llm
-            | StrOutputParser())
+    return (
+        {
+            "context": retriever | format_docs,
+            "question": RunnablePassthrough(),
+        }
+        | prompt
+        | llm
+        | StrOutputParser()
+    )
 
 
 def get_parser() -> argparse.ArgumentParser:
     """
     Parse command line arguments
     """
-    parser = argparse.ArgumentParser(description='RAG with vLLM and langchain')
+    parser = argparse.ArgumentParser(description="RAG with vLLM and langchain")
 
     # Add command line arguments
-    parser.add_argument('--vllm-api-key',
-                        default="EMPTY",
-                        help='API key for vLLM compatible services')
-    parser.add_argument('--vllm-embedding-endpoint',
-                        default="http://localhost:8000/v1",
-                        help='Base URL for embedding service')
-    parser.add_argument('--vllm-chat-endpoint',
-                        default="http://localhost:8001/v1",
-                        help='Base URL for chat service')
-    parser.add_argument('--uri',
-                        default="./milvus.db",
-                        help='URI for Milvus database')
     parser.add_argument(
-        '--url',
-        default=("https://docs.vllm.ai/en/latest/getting_started/"
-                 "quickstart.html"),
-        help='URL of the document to process')
-    parser.add_argument('--embedding-model',
-                        default="ssmits/Qwen2-7B-Instruct-embed-base",
-                        help='Model name for embeddings')
-    parser.add_argument('--chat-model',
-                        default="qwen/Qwen1.5-0.5B-Chat",
-                        help='Model name for chat')
-    parser.add_argument('-i',
-                        '--interactive',
-                        action='store_true',
-                        help='Enable interactive Q&A mode')
-    parser.add_argument('-k',
-                        '--top-k',
-                        type=int,
-                        default=3,
-                        help='Number of top results to retrieve')
-    parser.add_argument('-c',
-                        '--chunk-size',
-                        type=int,
-                        default=1000,
-                        help='Chunk size for document splitting')
-    parser.add_argument('-o',
-                        '--chunk-overlap',
-                        type=int,
-                        default=200,
-                        help='Chunk overlap for document splitting')
+        "--vllm-api-key", default="EMPTY", help="API key for vLLM compatible services"
+    )
+    parser.add_argument(
+        "--vllm-embedding-endpoint",
+        default="http://localhost:8000/v1",
+        help="Base URL for embedding service",
+    )
+    parser.add_argument(
+        "--vllm-chat-endpoint",
+        default="http://localhost:8001/v1",
+        help="Base URL for chat service",
+    )
+    parser.add_argument("--uri", default="./milvus.db", help="URI for Milvus database")
+    parser.add_argument(
+        "--url",
+        default=("https://docs.vllm.ai/en/latest/getting_started/quickstart.html"),
+        help="URL of the document to process",
+    )
+    parser.add_argument(
+        "--embedding-model",
+        default="ssmits/Qwen2-7B-Instruct-embed-base",
+        help="Model name for embeddings",
+    )
+    parser.add_argument(
+        "--chat-model", default="qwen/Qwen1.5-0.5B-Chat", help="Model name for chat"
+    )
+    parser.add_argument(
+        "-i", "--interactive", action="store_true", help="Enable interactive Q&A mode"
+    )
+    parser.add_argument(
+        "-k", "--top-k", type=int, default=3, help="Number of top results to retrieve"
+    )
+    parser.add_argument(
+        "-c",
+        "--chunk-size",
+        type=int,
+        default=1000,
+        help="Chunk size for document splitting",
+    )
+    parser.add_argument(
+        "-o",
+        "--chunk-overlap",
+        type=int,
+        default=200,
+        help="Chunk overlap for document splitting",
+    )
 
     return parser
 
@@ -198,7 +205,7 @@ def init_config(args: Namespace):
         "url": args.url,
         "chunk_size": args.chunk_size,
         "chunk_overlap": args.chunk_overlap,
-        "top_k": args.top_k
+        "top_k": args.top_k,
     }
 
 
@@ -230,7 +237,7 @@ def main():
 
         while True:
             question = input("\nPlease enter your question: ")
-            if question.lower() in ['q', 'quit']:
+            if question.lower() in ["q", "quit"]:
                 print("\nThank you for using! Goodbye!")
                 break
 
@@ -238,7 +245,7 @@ def main():
             print(output)
     else:
         # Default single question mode
-        question = ("How to install vLLM?")
+        question = "How to install vLLM?"
         output = qa_chain.invoke(question)
         print("-" * 50)
         print(output)
diff --git a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py
index a8f76dfe4c69..08796b1b3a54 100644
--- a/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py
+++ b/examples/online_serving/retrieval_augmented_generation_with_llamaindex.py
@@ -35,6 +35,7 @@
     - Default ports: 8000 (embedding), 8001 (chat)
     - First run may take time to download models
 """
+
 import argparse
 from argparse import Namespace
 from typing import Any
@@ -59,7 +60,7 @@ def init_config(args: Namespace):
         "db_path": args.db_path,
         "chunk_size": args.chunk_size,
         "chunk_overlap": args.chunk_overlap,
-        "top_k": args.top_k
+        "top_k": args.top_k,
     }
 
 
@@ -117,52 +118,58 @@ def query_document(index: VectorStoreIndex, question: str, top_k: int):
 
 def get_parser() -> argparse.ArgumentParser:
     """Parse command line arguments"""
-    parser = argparse.ArgumentParser(
-        description='RAG with vLLM and LlamaIndex')
+    parser = argparse.ArgumentParser(description="RAG with vLLM and LlamaIndex")
 
     # Add command line arguments
     parser.add_argument(
-        '--url',
-        default=("https://docs.vllm.ai/en/latest/getting_started/"
-                 "quickstart.html"),
-        help='URL of the document to process')
-    parser.add_argument('--embedding-model',
-                        default="ssmits/Qwen2-7B-Instruct-embed-base",
-                        help='Model name for embeddings')
-    parser.add_argument('--chat-model',
-                        default="qwen/Qwen1.5-0.5B-Chat",
-                        help='Model name for chat')
-    parser.add_argument('--vllm-api-key',
-                        default="EMPTY",
-                        help='API key for vLLM compatible services')
-    parser.add_argument('--embedding-endpoint',
-                        default="http://localhost:8000/v1",
-                        help='Base URL for embedding service')
-    parser.add_argument('--chat-endpoint',
-                        default="http://localhost:8001/v1",
-                        help='Base URL for chat service')
-    parser.add_argument('--db-path',
-                        default="./milvus_demo.db",
-                        help='Path to Milvus database')
-    parser.add_argument('-i',
-                        '--interactive',
-                        action='store_true',
-                        help='Enable interactive Q&A mode')
-    parser.add_argument('-c',
-                        '--chunk-size',
-                        type=int,
-                        default=1000,
-                        help='Chunk size for document splitting')
-    parser.add_argument('-o',
-                        '--chunk-overlap',
-                        type=int,
-                        default=200,
-                        help='Chunk overlap for document splitting')
-    parser.add_argument('-k',
-                        '--top-k',
-                        type=int,
-                        default=3,
-                        help='Number of top results to retrieve')
+        "--url",
+        default=("https://docs.vllm.ai/en/latest/getting_started/quickstart.html"),
+        help="URL of the document to process",
+    )
+    parser.add_argument(
+        "--embedding-model",
+        default="ssmits/Qwen2-7B-Instruct-embed-base",
+        help="Model name for embeddings",
+    )
+    parser.add_argument(
+        "--chat-model", default="qwen/Qwen1.5-0.5B-Chat", help="Model name for chat"
+    )
+    parser.add_argument(
+        "--vllm-api-key", default="EMPTY", help="API key for vLLM compatible services"
+    )
+    parser.add_argument(
+        "--embedding-endpoint",
+        default="http://localhost:8000/v1",
+        help="Base URL for embedding service",
+    )
+    parser.add_argument(
+        "--chat-endpoint",
+        default="http://localhost:8001/v1",
+        help="Base URL for chat service",
+    )
+    parser.add_argument(
+        "--db-path", default="./milvus_demo.db", help="Path to Milvus database"
+    )
+    parser.add_argument(
+        "-i", "--interactive", action="store_true", help="Enable interactive Q&A mode"
+    )
+    parser.add_argument(
+        "-c",
+        "--chunk-size",
+        type=int,
+        default=1000,
+        help="Chunk size for document splitting",
+    )
+    parser.add_argument(
+        "-o",
+        "--chunk-overlap",
+        type=int,
+        default=200,
+        help="Chunk overlap for document splitting",
+    )
+    parser.add_argument(
+        "-k", "--top-k", type=int, default=3, help="Number of top results to retrieve"
+    )
 
     return parser
 
@@ -193,7 +200,7 @@ def main():
             question = input("\nEnter your question: ")
 
             # Check for exit command
-            if question.lower() in ['quit', 'exit', 'q']:
+            if question.lower() in ["quit", "exit", "q"]:
                 print("Exiting interactive mode...")
                 break
 
diff --git a/examples/online_serving/streamlit_openai_chatbot_webserver.py b/examples/online_serving/streamlit_openai_chatbot_webserver.py
index d8a0f211d44d..0722aa671f66 100644
--- a/examples/online_serving/streamlit_openai_chatbot_webserver.py
+++ b/examples/online_serving/streamlit_openai_chatbot_webserver.py
@@ -26,6 +26,7 @@
     streamlit run streamlit_openai_chatbot_webserver.py \
         --logger.level=debug
 """
+
 import os
 from datetime import datetime
 
@@ -33,8 +34,8 @@
 from openai import OpenAI
 
 # Get command line arguments from environment variables
-openai_api_key = os.getenv('VLLM_API_KEY', "EMPTY")
-openai_api_base = os.getenv('VLLM_API_BASE', "http://localhost:8000/v1")
+openai_api_key = os.getenv("VLLM_API_KEY", "EMPTY")
+openai_api_base = os.getenv("VLLM_API_BASE", "http://localhost:8000/v1")
 
 # Initialize session states for managing chat sessions
 if "sessions" not in st.session_state:
@@ -81,9 +82,9 @@ def get_llm_response(messages, model):
         Streaming response object or error message string
     """
     try:
-        response = client.chat.completions.create(model=model,
-                                                  messages=messages,
-                                                  stream=True)
+        response = client.chat.completions.create(
+            model=model, messages=messages, stream=True
+        )
         return response
     except Exception as e:
         st.error(f"Error details: {str(e)}")
@@ -92,8 +93,9 @@ def get_llm_response(messages, model):
 
 # Sidebar - API Settings first
 st.sidebar.title("API Settings")
-new_api_base = st.sidebar.text_input("API Base URL:",
-                                     value=st.session_state.api_base_url)
+new_api_base = st.sidebar.text_input(
+    "API Base URL:", value=st.session_state.api_base_url
+)
 if new_api_base != st.session_state.api_base_url:
     st.session_state.api_base_url = new_api_base
     st.rerun()
@@ -109,16 +111,20 @@ def get_llm_response(messages, model):
 for session_id in sorted(st.session_state.sessions.keys(), reverse=True):
     # Mark the active session with a pinned button
     if session_id == st.session_state.active_session:
-        st.sidebar.button(f"📍 {session_id}",
-                          key=session_id,
-                          type="primary",
-                          on_click=switch_to_chat_session,
-                          args=(session_id, ))
+        st.sidebar.button(
+            f"📍 {session_id}",
+            key=session_id,
+            type="primary",
+            on_click=switch_to_chat_session,
+            args=(session_id,),
+        )
     else:
-        st.sidebar.button(f"Session {session_id}",
-                          key=session_id,
-                          on_click=switch_to_chat_session,
-                          args=(session_id, ))
+        st.sidebar.button(
+            f"Session {session_id}",
+            key=session_id,
+            on_click=switch_to_chat_session,
+            args=(session_id,),
+        )
 
 # Main interface
 st.title("vLLM Chat Assistant")
@@ -145,18 +151,18 @@ def get_llm_response(messages, model):
 if prompt := st.chat_input("Type your message here..."):
     # Save user message to session
     st.session_state.messages.append({"role": "user", "content": prompt})
-    st.session_state.sessions[
-        st.session_state.current_session] = st.session_state.messages
+    st.session_state.sessions[st.session_state.current_session] = (
+        st.session_state.messages
+    )
 
     # Display user message
     with st.chat_message("user"):
         st.write(prompt)
 
     # Prepare messages for llm
-    messages_for_llm = [{
-        "role": m["role"],
-        "content": m["content"]
-    } for m in st.session_state.messages]
+    messages_for_llm = [
+        {"role": m["role"], "content": m["content"]} for m in st.session_state.messages
+    ]
 
     # Generate and display llm response
     with st.chat_message("assistant"):
@@ -179,7 +185,4 @@ def get_llm_response(messages, model):
             message_placeholder.markdown(full_response)
 
     # Save llm response to session history
-    st.session_state.messages.append({
-        "role": "assistant",
-        "content": full_response
-    })
+    st.session_state.messages.append({"role": "assistant", "content": full_response})
diff --git a/examples/online_serving/utils.py b/examples/online_serving/utils.py
index 4826e8e20528..0781a27f19c5 100644
--- a/examples/online_serving/utils.py
+++ b/examples/online_serving/utils.py
@@ -16,10 +16,10 @@ def get_first_model(client: OpenAI) -> str:
             f"{client.base_url} with API key {client.api_key}. Check\n"
             "1. the server is running\n"
             "2. the server URL is correct\n"
-            "3. the API key is correct") from e
+            "3. the API key is correct"
+        ) from e
 
     if len(models.data) == 0:
-        raise RuntimeError(
-            f"No models found on the vLLM server at {client.base_url}")
+        raise RuntimeError(f"No models found on the vLLM server at {client.base_url}")
 
     return models.data[0].id
diff --git a/examples/lmcache/README.md b/examples/others/lmcache/README.md
similarity index 100%
rename from examples/lmcache/README.md
rename to examples/others/lmcache/README.md
diff --git a/examples/lmcache/cpu_offload_lmcache.py b/examples/others/lmcache/cpu_offload_lmcache.py
similarity index 87%
rename from examples/lmcache/cpu_offload_lmcache.py
rename to examples/others/lmcache/cpu_offload_lmcache.py
index eedb47dfc12e..98eafb31ed4f 100644
--- a/examples/lmcache/cpu_offload_lmcache.py
+++ b/examples/others/lmcache/cpu_offload_lmcache.py
@@ -20,6 +20,7 @@
 Learn more about LMCache environment setup, please refer to:
 https://docs.lmcache.ai/getting_started/installation.html
 """
+
 import argparse
 import contextlib
 import os
@@ -49,8 +50,7 @@ def setup_environment_variables(vllm_version: str):
 
 
 @contextlib.contextmanager
-def build_llm_with_lmcache(lmcache_connector: str, model: str,
-                           vllm_version: str):
+def build_llm_with_lmcache(lmcache_connector: str, model: str, vllm_version: str):
     ktc = KVTransferConfig(
         kv_connector=lmcache_connector,
         kv_role="kv_both",
@@ -97,18 +97,19 @@ def print_output(
     for output in outputs:
         generated_text = output.outputs[0].text
         print(f"Generated text: {generated_text!r}")
-    print(f"Generation took {time.time() - start:.2f} seconds, "
-          f"{req_str} request done.")
+    print(f"Generation took {time.time() - start:.2f} seconds, {req_str} request done.")
     print("-" * 50)
 
 
 def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("-v",
-                        "--version",
-                        choices=["v0", "v1"],
-                        default="v1",
-                        help="Specify vLLM version (default: v1)")
+    parser.add_argument(
+        "-v",
+        "--version",
+        choices=["v0", "v1"],
+        default="v1",
+        help="Specify vLLM version (default: v1)",
+    )
     return parser.parse_args()
 
 
@@ -125,7 +126,6 @@ def main():
     setup_environment_variables(args.version)
 
     with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
-
         # This example script runs two requests with a shared prefix.
         # Define the shared prompt and specific prompts
         shared_prompt = "Hello, how are you?" * 1000
@@ -136,9 +136,7 @@ def main():
             shared_prompt + "Tell me a very long story",
         ]
 
-        sampling_params = SamplingParams(temperature=0,
-                                         top_p=0.95,
-                                         max_tokens=10)
+        sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
 
         # Print the first output
         print_output(llm, first_prompt, sampling_params, "first")
diff --git a/examples/lmcache/disagg_prefill_lmcache_v0.py b/examples/others/lmcache/disagg_prefill_lmcache_v0.py
similarity index 77%
rename from examples/lmcache/disagg_prefill_lmcache_v0.py
rename to examples/others/lmcache/disagg_prefill_lmcache_v0.py
index 66cc94185230..b2b7b3b2c1f9 100644
--- a/examples/lmcache/disagg_prefill_lmcache_v0.py
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v0.py
@@ -4,12 +4,13 @@
 with LMCache.
 We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
 and launch an additional LMCache server.
-KV cache is transferred in the following manner: 
+KV cache is transferred in the following manner:
 vLLM prefill node -> LMCache server -> vLLM decode node.
 
 Note that `pip install lmcache` is needed to run this example.
 Learn more about LMCache in https://github.com/LMCache/LMCache.
 """
+
 import os
 import subprocess
 import time
@@ -49,19 +50,23 @@ def run_prefill(prefill_done, prompts):
 
     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
 
-    ktc = KVTransferConfig(kv_connector="LMCacheConnector",
-                           kv_role="kv_producer",
-                           kv_rank=0,
-                           kv_parallel_size=2)
+    ktc = KVTransferConfig(
+        kv_connector="LMCacheConnector",
+        kv_role="kv_producer",
+        kv_rank=0,
+        kv_parallel_size=2,
+    )
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # memory. Reduce the value if your GPU has less memory.
-    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
-              kv_transfer_config=ktc,
-              max_model_len=8000,
-              gpu_memory_utilization=0.8,
-              enforce_eager=True)
-
-    #llm.generate(prompts, sampling_params)
+    llm = LLM(
+        model="mistralai/Mistral-7B-Instruct-v0.2",
+        kv_transfer_config=ktc,
+        max_model_len=8000,
+        gpu_memory_utilization=0.8,
+        enforce_eager=True,
+    )
+
+    # llm.generate(prompts, sampling_params)
     outputs = llm.generate(prompts, sampling_params)
     for output in outputs:
         generated_text = output.outputs[0].text
@@ -79,17 +84,21 @@ def run_decode(prefill_done, prompts, timeout=1):
 
     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
 
-    ktc = KVTransferConfig(kv_connector="LMCacheConnector",
-                           kv_role="kv_consumer",
-                           kv_rank=1,
-                           kv_parallel_size=2)
+    ktc = KVTransferConfig(
+        kv_connector="LMCacheConnector",
+        kv_role="kv_consumer",
+        kv_rank=1,
+        kv_parallel_size=2,
+    )
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # of memory. Reduce the value if your GPU has less memory.
-    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
-              kv_transfer_config=ktc,
-              max_model_len=8000,
-              gpu_memory_utilization=0.8,
-              enforce_eager=True)
+    llm = LLM(
+        model="mistralai/Mistral-7B-Instruct-v0.2",
+        kv_transfer_config=ktc,
+        max_model_len=8000,
+        gpu_memory_utilization=0.8,
+        enforce_eager=True,
+    )
 
     print("Waiting for prefill node to finish...")
     prefill_done.wait()
@@ -105,10 +114,9 @@ def run_decode(prefill_done, prompts, timeout=1):
 
 
 def run_lmcache_server(port):
-    server_proc = subprocess.Popen([
-        "python", "-m", "lmcache.experimental.server", "localhost",
-        str(port)
-    ])
+    server_proc = subprocess.Popen(
+        ["python", "-m", "lmcache.experimental.server", "localhost", str(port)]
+    )
     return server_proc
 
 
diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml b/examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml
similarity index 100%
rename from examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml
rename to examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-decoder-config.yaml
diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml b/examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml
similarity index 100%
rename from examples/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml
rename to examples/others/lmcache/disagg_prefill_lmcache_v1/configs/lmcache-prefiller-config.yaml
diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
similarity index 100%
rename from examples/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
rename to examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_example_nixl.sh
diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
similarity index 59%
rename from examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
rename to examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
index 8db93bc8931b..20155c203658 100644
--- a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_proxy_server.py
@@ -17,13 +17,17 @@ async def lifespan(app: FastAPI):
     Lifespan context manager to handle startup and shutdown events.
     """
     # Startup: Initialize clients
-    prefiller_base_url = f'http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1'
-    decoder_base_url = f'http://{global_args.decoder_host}:{global_args.decoder_port}/v1'
-
-    app.state.prefill_client = httpx.AsyncClient(timeout=None,
-                                                 base_url=prefiller_base_url)
-    app.state.decode_client = httpx.AsyncClient(timeout=None,
-                                                base_url=decoder_base_url)
+    prefiller_base_url = (
+        f"http://{global_args.prefiller_host}:{global_args.prefiller_port}/v1"
+    )
+    decoder_base_url = (
+        f"http://{global_args.decoder_host}:{global_args.decoder_port}/v1"
+    )
+
+    app.state.prefill_client = httpx.AsyncClient(
+        timeout=None, base_url=prefiller_base_url
+    )
+    app.state.decode_client = httpx.AsyncClient(timeout=None, base_url=decoder_base_url)
 
     yield
 
@@ -37,7 +41,6 @@ async def lifespan(app: FastAPI):
 
 
 class StatsCalculator:
-
     def __init__(self):
         self._stats = []
         self._last_log_time = time.time()
@@ -51,13 +54,18 @@ def add(self, value):
     def _log_stats(self):
         # Print average, median, and 99th percentile
         np_arr = np.array(self._stats)
-        output_str = f"\nNum requests: {len(self._stats)}" + \
-                "\nPrefill node TTFT stats:" + \
-                f"\n - Average (ms): {np.mean(np_arr)}" + \
-                f"\n - Median (ms): {np.median(np_arr)}" + \
-                f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n"
-        print("===============================", output_str,
-              "===============================")
+        output_str = (
+            f"\nNum requests: {len(self._stats)}"
+            + "\nPrefill node TTFT stats:"
+            + f"\n - Average (ms): {np.mean(np_arr)}"
+            + f"\n - Median (ms): {np.median(np_arr)}"
+            + f"\n - 99th Percentile (ms): {np.percentile(np_arr, 99)}\n"
+        )
+        print(
+            "===============================",
+            output_str,
+            "===============================",
+        )
 
 
 stats_calculator = StatsCalculator()
@@ -82,15 +90,16 @@ def parse_args():
 app.state.decode_client = None
 
 
-async def send_request_to_service(client: httpx.AsyncClient, endpoint: str,
-                                  req_data: dict):
+async def send_request_to_service(
+    client: httpx.AsyncClient, endpoint: str, req_data: dict
+):
     """
     Send a request to a service using a persistent client.
     """
     req_data = req_data.copy()
-    req_data['max_tokens'] = 1
-    if 'max_completion_tokens' in req_data:
-        req_data['max_completion_tokens'] = 1
+    req_data["max_tokens"] = 1
+    if "max_completion_tokens" in req_data:
+        req_data["max_completion_tokens"] = 1
 
     headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
     response = await client.post(endpoint, json=req_data, headers=headers)
@@ -98,14 +107,16 @@ async def send_request_to_service(client: httpx.AsyncClient, endpoint: str,
     return response
 
 
-async def stream_service_response(client: httpx.AsyncClient, endpoint: str,
-                                  req_data: dict):
+async def stream_service_response(
+    client: httpx.AsyncClient, endpoint: str, req_data: dict
+):
     """
     Asynchronously stream the response from a service using a persistent client.
     """
     headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
-    async with client.stream("POST", endpoint, json=req_data,
-                             headers=headers) as response:
+    async with client.stream(
+        "POST", endpoint, json=req_data, headers=headers
+    ) as response:
         response.raise_for_status()
         async for chunk in response.aiter_bytes():
             yield chunk
@@ -121,28 +132,28 @@ async def handle_completions(request: Request):
         req_data = await request.json()
 
         # Send request to prefill service, ignore the response
-        await send_request_to_service(app.state.prefill_client, "/completions",
-                                      req_data)
+        await send_request_to_service(
+            app.state.prefill_client, "/completions", req_data
+        )
 
         et = time.time()
         stats_calculator.add(et - st)
 
         # Stream response from decode service
         async def generate_stream():
-            async for chunk in stream_service_response(app.state.decode_client,
-                                                       "/completions",
-                                                       req_data):
+            async for chunk in stream_service_response(
+                app.state.decode_client, "/completions", req_data
+            ):
                 yield chunk
 
-        return StreamingResponse(generate_stream(),
-                                 media_type="application/json")
+        return StreamingResponse(generate_stream(), media_type="text/event-stream")
 
     except Exception as e:
         import sys
         import traceback
+
         exc_info = sys.exc_info()
-        print("Error occurred in disagg prefill proxy server"
-              " - completions endpoint")
+        print("Error occurred in disagg prefill proxy server - completions endpoint")
         print(e)
         print("".join(traceback.format_exception(*exc_info)))
         raise
@@ -158,36 +169,39 @@ async def handle_chat_completions(request: Request):
         req_data = await request.json()
 
         # Send request to prefill service, ignore the response
-        await send_request_to_service(app.state.prefill_client,
-                                      "/chat/completions", req_data)
+        await send_request_to_service(
+            app.state.prefill_client, "/chat/completions", req_data
+        )
 
         et = time.time()
         stats_calculator.add(et - st)
 
         # Stream response from decode service
         async def generate_stream():
-            async for chunk in stream_service_response(app.state.decode_client,
-                                                       "/chat/completions",
-                                                       req_data):
+            async for chunk in stream_service_response(
+                app.state.decode_client, "/chat/completions", req_data
+            ):
                 yield chunk
 
-        return StreamingResponse(generate_stream(),
-                                 media_type="application/json")
+        return StreamingResponse(generate_stream(), media_type="text/event-stream")
 
     except Exception as e:
         import sys
         import traceback
+
         exc_info = sys.exc_info()
-        print("Error occurred in disagg prefill proxy server "
-              " - chat completions endpoint")
+        print(
+            "Error occurred in disagg prefill proxy server - chat completions endpoint"
+        )
         print(e)
         print("".join(traceback.format_exception(*exc_info)))
         raise
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     global global_args
     global_args = parse_args()
 
     import uvicorn
+
     uvicorn.run(app, host=global_args.host, port=global_args.port)
diff --git a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
similarity index 97%
rename from examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
rename to examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
index 831ef0bb574b..5719fa821292 100644
--- a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
+++ b/examples/others/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
@@ -54,6 +54,6 @@ elif [[ $1 == "decoder" ]]; then
 
 else
     echo "Invalid role: $1"
-    echo "Should be either prefill, decode"
+    echo "Should be either prefiller, decoder"
     exit 1
 fi
diff --git a/examples/lmcache/kv_cache_sharing_lmcache_v1.py b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
similarity index 81%
rename from examples/lmcache/kv_cache_sharing_lmcache_v1.py
rename to examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
index 7748f8ca6133..89945d67a6f3 100644
--- a/examples/lmcache/kv_cache_sharing_lmcache_v1.py
+++ b/examples/others/lmcache/kv_cache_sharing_lmcache_v1.py
@@ -3,13 +3,14 @@
 This file demonstrates the example usage of remote KV cache sharing
 with LMCache.
 We will launch 2 vllm instances, and launch an additional LMCache server.
-KV cache is transferred in the following manner: 
+KV cache is transferred in the following manner:
 (1) vLLM instance 1 -> LMCache server (KV cache store).
 (2) LMCache server -> vLLM instance 2 (KV cache reuse/retrieve).
 
 Note that lmcache needs to be installed to run this example.
 Learn more about LMCache in https://github.com/LMCache/LMCache.
 """
+
 import os
 import subprocess
 import time
@@ -49,15 +50,16 @@ def run_store(store_done, prompts):
 
     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
 
-    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
-                           kv_role="kv_both")
+    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # memory. Reduce the value if your GPU has less memory.
-    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
-              kv_transfer_config=ktc,
-              max_model_len=8000,
-              gpu_memory_utilization=0.8,
-              enforce_eager=True)
+    llm = LLM(
+        model="mistralai/Mistral-7B-Instruct-v0.2",
+        kv_transfer_config=ktc,
+        max_model_len=8000,
+        gpu_memory_utilization=0.8,
+        enforce_eager=True,
+    )
 
     outputs = llm.generate(prompts, sampling_params)
     for output in outputs:
@@ -76,15 +78,16 @@ def run_retrieve(store_done, prompts, timeout=1):
 
     sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
 
-    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
-                           kv_role="kv_both")
+    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1", kv_role="kv_both")
     # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
     # of memory. Reduce the value if your GPU has less memory.
-    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
-              kv_transfer_config=ktc,
-              max_model_len=8000,
-              gpu_memory_utilization=0.8,
-              enforce_eager=True)
+    llm = LLM(
+        model="mistralai/Mistral-7B-Instruct-v0.2",
+        kv_transfer_config=ktc,
+        max_model_len=8000,
+        gpu_memory_utilization=0.8,
+        enforce_eager=True,
+    )
 
     print("Waiting for KV cache store to finish...")
     store_done.wait()
@@ -100,10 +103,9 @@ def run_retrieve(store_done, prompts, timeout=1):
 
 
 def run_lmcache_server(port):
-    server_proc = subprocess.Popen([
-        "python", "-m", "lmcache.experimental.server", "localhost",
-        str(port)
-    ])
+    server_proc = subprocess.Popen(
+        ["python", "-m", "lmcache.experimental.server", "localhost", str(port)]
+    )
     return server_proc
 
 
diff --git a/examples/other/logging_configuration.md b/examples/others/logging_configuration.md
similarity index 100%
rename from examples/other/logging_configuration.md
rename to examples/others/logging_configuration.md
diff --git a/examples/other/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py
similarity index 72%
rename from examples/other/tensorize_vllm_model.py
rename to examples/others/tensorize_vllm_model.py
index 7d11ba51a094..175777630833 100644
--- a/examples/other/tensorize_vllm_model.py
+++ b/examples/others/tensorize_vllm_model.py
@@ -6,11 +6,15 @@
 import os
 import uuid
 
-from vllm import LLM
+from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
-                                                         TensorizerConfig,
-                                                         tensorize_vllm_model)
+from vllm.lora.request import LoRARequest
+from vllm.model_executor.model_loader.tensorizer import (
+    TensorizerArgs,
+    TensorizerConfig,
+    tensorize_lora_adapter,
+    tensorize_vllm_model,
+)
 from vllm.utils import FlexibleArgumentParser
 
 # yapf conflicts with isort for this docstring
@@ -27,7 +31,7 @@
 To serialize a model, install vLLM from source, then run something 
 like this from the root level of this repository:
 
-python -m examples.other.tensorize_vllm_model \
+python examples/others/tensorize_vllm_model.py \
    --model facebook/opt-125m \
    serialize \
    --serialized-directory s3://my-bucket \
@@ -47,7 +51,7 @@
 To deserialize a model, you can run something like this from the root 
 level of this repository:
 
-python -m examples.other.tensorize_vllm_model \
+python examples/others/tensorize_vllm_model.py \
    --model EleutherAI/gpt-j-6B \
    --dtype float16 \
    deserialize \
@@ -65,11 +69,11 @@
 model-rank-%03d.tensors
 
 For more information on the available arguments for serializing, run 
-`python -m examples.other.tensorize_vllm_model serialize --help`.
+`python examples/others/tensorize_vllm_model.py serialize --help`.
 
 Or for deserializing:
 
-`python -m examples.other.tensorize_vllm_model deserialize --help`.
+`python examples/others/tensorize_vllm_model.py deserialize --help`.
 
 Once a model is serialized, tensorizer can be invoked with the `LLM` class 
 directly to load models:
@@ -90,11 +94,27 @@
 In order to see all of the available arguments usable to configure 
 loading with tensorizer that are given to `TensorizerConfig`, run:
 
-`python -m examples.other.tensorize_vllm_model deserialize --help`
+`python examples/others/tensorize_vllm_model.py deserialize --help`
 
 under the `tensorizer options` section. These can also be used for
 deserialization in this example script, although `--tensorizer-uri` and
 `--path-to-tensors` are functionally the same in this case.
+
+Tensorizer can also be used to save and load LoRA adapters. A LoRA adapter
+can be serialized directly with the path to the LoRA adapter on HF Hub and
+a TensorizerConfig object. In this script, passing the HF id of a LoRA adapter
+via `--lora-path` will serialize the LoRA adapter artifacts to `--serialized-directory`.
+
+You can then use the LoRA adapter with `vllm serve` by ensuring
+the LoRA artifacts are in your model artifacts directory and specifying
+`--enable-lora`. For instance:
+
+```
+vllm serve <model_path> \
+    --load-format tensorizer \
+    --model-loader-extra-config '{"tensorizer_uri": "<model_path>.tensors"}' \
+    --enable-lora
+```
 """
 
 
@@ -107,6 +127,19 @@ def parse_args():
         "also supported, although libsodium must be installed to "
         "use it.")
     parser = EngineArgs.add_cli_args(parser)
+
+    parser.add_argument(
+        "--lora-path",
+        type=str,
+        required=False,
+        help="Path to a LoRA adapter to "
+        "serialize along with model tensors. This can then be deserialized "
+        "along with the model by passing a tensorizer_config kwarg to "
+        "LoRARequest with type TensorizerConfig. See the docstring for this "
+        "for a usage example."
+
+    )
+
     subparsers = parser.add_subparsers(dest='command')
 
     serialize_parser = subparsers.add_parser(
@@ -169,11 +202,42 @@ def parse_args():
 
 
 def deserialize():
-    llm = LLM(model=args.model,
-              load_format="tensorizer",
-              tensor_parallel_size=args.tensor_parallel_size,
-              model_loader_extra_config=tensorizer_config
-    )
+    if args.lora_path:
+        tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
+        llm = LLM(model=args.model,
+                  load_format="tensorizer",
+                  tensor_parallel_size=args.tensor_parallel_size,
+                  model_loader_extra_config=tensorizer_config,
+                  enable_lora=True,
+        )
+        sampling_params = SamplingParams(
+            temperature=0,
+            max_tokens=256,
+            stop=["[/assistant]"]
+        )
+
+        # Truncating this as the extra text isn't necessary
+        prompts = [
+            "[user] Write a SQL query to answer the question based on ..."
+        ]
+
+        # Test LoRA load
+        print(
+            llm.generate(
+            prompts,
+            sampling_params,
+            lora_request=LoRARequest("sql-lora",
+                                     1,
+                                     args.lora_path,
+                                     tensorizer_config = tensorizer_config)
+            )
+        )
+    else:
+        llm = LLM(model=args.model,
+                  load_format="tensorizer",
+                  tensor_parallel_size=args.tensor_parallel_size,
+                  model_loader_extra_config=tensorizer_config
+        )
     return llm
 
 
@@ -197,7 +261,10 @@ def deserialize():
 
     model_name = model_ref.split("/")[1]
 
-    keyfile = args.keyfile if args.keyfile else None
+    if args.command == "serialize" or args.command == "deserialize":
+        keyfile = args.keyfile
+    else:
+        keyfile = None
 
     if args.model_loader_extra_config:
         config = json.loads(args.model_loader_extra_config)
@@ -228,6 +295,10 @@ def deserialize():
             encryption_keyfile=keyfile,
             **credentials)
 
+        if args.lora_path:
+            tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
+            tensorize_lora_adapter(args.lora_path, tensorizer_config)
+
         tensorize_vllm_model(engine_args, tensorizer_config)
 
     elif args.command == "deserialize":
diff --git a/examples/pyproject.toml b/examples/pyproject.toml
new file mode 100644
index 000000000000..f825cb203269
--- /dev/null
+++ b/examples/pyproject.toml
@@ -0,0 +1,54 @@
+# This local pyproject file is part of the migration from yapf to ruff format.
+# It uses the same core rules as the main pyproject.toml file, but with the
+# following differences:
+# - ruff line length is overridden to 88
+# - deprecated typing ignores (UP006, UP035) have been removed
+
+[tool.ruff]
+line-length = 88
+exclude = [
+    # External file, leaving license intact
+    "examples/other/fp8/quantizer/quantize.py",
+    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
+]
+
+[tool.ruff.lint.per-file-ignores]
+"vllm/third_party/**" = ["ALL"]
+"vllm/version.py" = ["F401"]
+"vllm/_version.py" = ["ALL"]
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # flake8-logging-format
+    "G",
+]
+ignore = [
+    # star imports
+    "F405", "F403",
+    # lambda expression assignment
+    "E731",
+    # Loop control variable not used within loop body
+    "B007",
+    # f-string format
+    "UP032",
+    # Can remove once 3.10+ is the minimum Python version
+    "UP007",
+]
+
+[tool.ruff.lint.isort]
+known-first-party = ["vllm"]
+
+[tool.ruff.format]
+docstring-code-format = true
\ No newline at end of file
diff --git a/examples/tool_chat_template_llama4_pythonic.jinja b/examples/tool_chat_template_llama4_pythonic.jinja
index bd18a35bdda9..bbed3d8205e0 100644
--- a/examples/tool_chat_template_llama4_pythonic.jinja
+++ b/examples/tool_chat_template_llama4_pythonic.jinja
@@ -1,16 +1,17 @@
 {{- bos_token }}
-{%- if custom_tools is defined %}
+{%- if custom_tools is defined and custom_tools%}
     {%- set tools = custom_tools %}
 {%- endif %}
-{%- if not tools_in_user_message is defined %}
-    {%- set tools_in_user_message = false %}
-{%- endif %}
-{%- if not tools is defined %}
+{%- if tools is defined and tools %}
+    {%- set tool_definition = tool_definition ~ (tools | tojson(indent=4)) %}
+{%- else %}
     {%- set tools = none %}
 {%- endif %}
 
+
 {#- This block extracts the system message, so we can slot it into the right place. #}
 {%- if messages[0]['role'] == 'system' %}
+    {%- set user_provided_system_message = true %}
     {%- if messages[0]['content'] is string %}
         {%- set system_message = messages[0]['content']|trim %}
     {%- else %}
@@ -18,68 +19,33 @@
     {%- endif %}
     {%- set messages = messages[1:] %}
 {%- else %}
-    {%- if tools is not none %}
-        {#- Add default tool system message when tools are provided #}
-        {%- set system_message = "You are a helpful assistant with tool calling "
-            "capabilities. Only reply with a tool call if the function exists in the "
-            "library provided by the user. If it doesn't exist, just reply directly in "
-            "natural language. When you receive a tool call response, use the output to "
-            "format an answer to the original user question." %}
+    {%- if tools is not none  %}
+        {#- Since no system_message was provided by the user, if tools are provided, system_message defaults to the tool system message #}
+        {#- This system message is from llama website:https://www.llama.com/docs/model-cards-and-prompt-formats/llama4/  #}
+        {%- set system_message = "You are a helpful assistant and an expert in function composition. You can answer general questions using your internal knowledge OR invoke functions when necessary. Follow these strict guidelines:\n\n1. FUNCTION CALLS:\n- ONLY use functions that are EXPLICITLY listed in the function list below\n- If NO functions are listed (empty function list []), respond ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\"\n- If a function is not in the list, respond ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\"\n- If ALL required parameters are present AND the query EXACTLY matches a listed function's purpose: output ONLY the function call(s)\n- Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]\nExamples:\nCORRECT: [get_weather(location=\"Vancouver\"), calculate_route(start=\"Boston\", end=\"New York\")] <- Only if get_weather and calculate_route are in function list\nINCORRECT: get_weather(location=\"New York\")\nINCORRECT: Let me check the weather: [get_weather(location=\"New York\")]\nINCORRECT: [get_events(location=\"Singapore\")] <- If function not in list\n\n2. RESPONSE RULES:\n- For pure function requests matching a listed function: ONLY output the function call(s)\n- For knowledge questions: ONLY output text\n- For missing parameters: ONLY request the specific missing parameters\n- For unavailable services (not in function list): output ONLY with internal knowledge or \"I don't have access to [Unavailable service] information\". Do NOT execute a function call.\n- If the query asks for information beyond what a listed function provides: output ONLY with internal knowledge about your limitations\n- NEVER combine text and function calls in the same response\n- NEVER suggest alternative functions when the requested service is unavailable\n- NEVER create or invent new functions not listed below\n\n3. 
STRICT BOUNDARIES:\n- ONLY use functions from the list below - no exceptions\n- NEVER use a function as an alternative to unavailable information\n- NEVER call functions not present in the function list\n- NEVER add explanatory text to function calls\n- NEVER respond with empty brackets\n- Use proper Python/JSON syntax for function calls\n- Check the function list carefully before responding\n\n4. TOOL RESPONSE HANDLING:\n- When receiving tool responses: provide concise, natural language responses\n- Don't repeat tool response verbatim\n- Don't add supplementary information\n\nHere is a list of functions in JSON format that you can invoke:\n" %}
     {%- else %}
         {%- set system_message = "" %}
     {%- endif %}
 {%- endif %}
-
-{#- System message if the user supplied one, or if tools are used (default tool system message) #}
+{#- Now write the system message: use the user-provided system message if user_provided_system_message is set; otherwise use the default tool system message when tools are present #}
 {%- if system_message %}
     {#- always use user provided system message to override default tool system message #}
     {{- "<|header_start|>system<|header_end|>\n\n" }}
     {{- system_message }}
-    {%- if tools is not none and not tools_in_user_message %}
-        {{- "Tools: You have access to the following tools. You might need to use one "
-            "or more function/tool calls to fulfill the task. \n"
-            "If none are needed, then proceed to the response.\n\n"
-            "Tool Call Syntax: You can call tools using the following syntax:\n"
-            "[func_name1(params_name1=params_value1, params_name2=params_value2, ...), ...]\n"
-            "Do not include anything else when calling the tools with the syntax above.\n\n"
-            "Here is a list of functions in JSON format that you can invoke.\n " }}
-        {%- for t in tools %}
-            {{- t | tojson(indent=4) }}
-            {{- "\n\n" }}
-        {%- endfor %}
+    {%- if user_provided_system_message and tools %}
+        {{- "\nHere is a list of functions in JSON format that you can invoke. Use exact format: [func_name1(param1=value1, param2=value2), func_name2(...)]\n" }}
+        {{- tool_definition -}}
+        {%- elif tool_definition %}
+        {{- tool_definition -}}
     {%- endif %}
     {{- "<|eot|>" }}
 {%- endif %}
 
-{#- Custom tools are passed in a user message with some extra guidance #}
-{%- if tools_in_user_message and tools is not none %}
-    {#- Extract the first user message so we can plug it in here #}
-    {%- if messages | length != 0 %}
-        {%- if messages[0]['content'] is string %}
-            {%- set first_user_message = messages[0]['content']|trim %}
-        {%- else %}
-            {%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %}
-        {%- endif %}
-        {%- set messages = messages[1:] %}
-    {%- else %}
-        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
-    {%- endif %}
-    {{- '<|header_start|>user<|header_end|>\n\n' -}}
-    {{- first_user_message}}
-    {{- "\nHere is a list of functions in JSON format that you can invoke:"}}
-    {%- for t in tools %}
-        {{- t | tojson(indent=4) }}
-        {{- "\n\n" }}
-    {%- endfor %}
-    {{- "Should you decide to return the function call(s), put them in the format "
-        "of [func_name1(params_name1=params_value1, params_name2=params_value2, "
-        "...), ...]\nDo not include anything else when calling the tools with the "
-        "syntax above." }}
-{%- endif %}
-
+{#- Now deal with all other messages #}
 {%- for message in messages %}
-    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
-    {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
+    {#- Base case: messages that are not from the tool role and have no (or an empty) tool_calls list #}
+    {%- if not (message.role == 'ipython' or message.role == 'tool' or ('tool_calls' in message and  message.tool_calls|length != 0 )) %}
+        {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
         {%- if message['content'] is string %}
             {{- message['content'] }}
         {%- else %}
@@ -91,10 +57,12 @@
                 {%- endif %}
             {%- endfor %}
         {%- endif %}
-        {{- "<|eot|>" }}
-    {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}
-        {%- set tool_call = message.tool_calls[0].function %}
-        {{- '<|header_start|>assistant<|header_end|>\n\n' -}}
+    {{- "<|eot|>" }}
+    {#- Tool case: the message has a non-empty tool_calls list; it must come from the assistant #}
+    {%- elif 'tool_calls' in message %}
+        {#- assume tool_calls are always coming from assistant #}
+        {%- if message.role == 'assistant' %}
+            {{- '<|header_start|>assistant<|header_end|>\n\n' -}}
         {%- if message['content'] is string %}
             {{- message['content'] }}
         {%- else %}
@@ -106,32 +74,36 @@
                 {%- endif %}
             {%- endfor %}
         {%- endif %}
+            {{- "[" }}
         {%- for tool_call in message.tool_calls %}
             {%- if tool_call.function is defined %}
                 {%- set tool_call = tool_call.function %}
             {%- endif %}
-            {{- tool_call.name + '(' -}}
+                {{-  tool_call.name + '(' -}}
             {%- for param in tool_call.arguments %}
-                {{- param + '=' -}}
+                {{- param + '="' -}}
                 {{- "%s" | format(tool_call.arguments[param]) -}}
+                {{- '"' -}}
                 {% if not loop.last %}, {% endif %}
             {%- endfor %}
             {{- ')' -}}
             {% if not loop.last %}, {% endif %}
         {%- endfor %}
-        {{- "<|eom|>" }}
+        {{- "]<|eot|>" }}
+{%- endif %}
+{#- Tool_response case: messages are from tool_response  #}
     {%- elif message.role == "tool" or message.role == "ipython" %}
         {{- "<|header_start|>ipython<|header_end|>\n\n" }}
         {%- if message.content is string %}
-            {{- message.content | tojson }}
+            {{-  message.content  | tojson }}
         {%- else %}
             {%- for content in message['content']  %}
                 {%- if content['type']  == 'text' %}
-                    {{- content['text'] | tojson }}
+                    {{-  content['text'] | tojson }}
                 {%- endif %}
             {%- endfor %}
         {%- endif %}
-        {{- "<|eom|>" }}
+        {{- "<|eot|>" }}
     {%- endif %}
 {%- endfor %}
 {%- if add_generation_prompt %}
diff --git a/mkdocs.yaml b/mkdocs.yaml
new file mode 100644
index 000000000000..52de643f5e2b
--- /dev/null
+++ b/mkdocs.yaml
@@ -0,0 +1,130 @@
+site_name: vLLM
+site_url: https://docs.vllm.ai
+repo_url: https://github.com/vllm-project/vllm
+exclude_docs: |
+  *.inc.md
+  *.template.md
+theme:
+  name: material
+  logo: assets/logos/vllm-logo-only-light.ico
+  favicon: assets/logos/vllm-logo-only-light.ico
+  palette:
+    # Palette toggle for automatic mode
+    - media: "(prefers-color-scheme)"
+      toggle:
+        icon: material/brightness-auto
+        name: Switch to light mode
+    # Palette toggle for light mode
+    - media: "(prefers-color-scheme: light)"
+      scheme: default 
+      primary: white
+      toggle:
+        icon: material/brightness-7
+        name: Switch to dark mode
+    # Palette toggle for dark mode
+    - media: "(prefers-color-scheme: dark)"
+      scheme: slate
+      primary: black
+      toggle:
+        icon: material/brightness-2
+        name: Switch to system preference
+  features:
+    - content.code.copy
+    - content.tabs.link
+    - navigation.tracking
+    - navigation.tabs
+    - navigation.sections
+    - navigation.prune
+    - navigation.top
+    - search.highlight
+    - search.share
+    - toc.follow
+  custom_dir: docs/mkdocs/overrides
+
+hooks:
+  - docs/mkdocs/hooks/remove_announcement.py
+  - docs/mkdocs/hooks/generate_examples.py
+  - docs/mkdocs/hooks/url_schemes.py
+
+# Required to stop api-autonav from raising an error
+# https://github.com/tlambert03/mkdocs-api-autonav/issues/16
+nav:
+  - api
+
+plugins:
+  - meta
+  - search
+  - autorefs
+  - awesome-nav
+  # For API reference generation
+  - api-autonav:
+      modules: ["vllm"]
+      api_root_uri: "api"
+      exclude:
+        - "re:vllm\\._.*"  # Internal modules
+        - "vllm.third_party"
+        - "vllm.vllm_flash_attn"
+  - mkdocstrings:
+      handlers:
+        python:
+          options:
+            show_symbol_type_heading: true
+            show_symbol_type_toc: true
+            filters: []
+            summary:
+              modules: true
+            show_if_no_docstring: true
+            show_signature_annotations: true
+            separate_signature: true
+            show_overloads: true
+            signature_crossrefs: true
+          inventories:
+          - https://docs.python.org/3/objects.inv
+          - https://typing-extensions.readthedocs.io/en/latest/objects.inv
+          - https://docs.aiohttp.org/en/stable/objects.inv
+          - https://pillow.readthedocs.io/en/stable/objects.inv
+          - https://numpy.org/doc/stable/objects.inv
+          - https://pytorch.org/docs/stable/objects.inv
+          - https://psutil.readthedocs.io/en/stable/objects.inv
+
+markdown_extensions:
+  - attr_list
+  - md_in_html
+  - admonition
+  - pymdownx.details
+  # For content tabs
+  - pymdownx.superfences
+  - pymdownx.tabbed:
+      slugify: !!python/object/apply:pymdownx.slugs.slugify
+        kwds:
+          case: lower
+      alternate_style: true
+  # For code highlighting
+  - pymdownx.highlight:
+      anchor_linenums: true
+      line_spans: __span
+      pygments_lang_class: true
+  - pymdownx.inlinehilite
+  - pymdownx.snippets
+  # For emoji and icons
+  - pymdownx.emoji:
+      emoji_index: !!python/name:material.extensions.emoji.twemoji
+      emoji_generator: !!python/name:material.extensions.emoji.to_svg
+  # For in page [TOC] (not sidebar)
+  - toc:
+      permalink: true
+  # For math rendering
+  - mdx_math:
+      enable_dollar_delimiter: true
+
+extra_css:
+  - mkdocs/stylesheets/extra.css
+
+extra_javascript:
+  - mkdocs/javascript/run_llm_widget.js
+  - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML
+
+# Makes the url format end in .html rather than act as a dir
+# So index.md generates as index.html and is available under URL /index.html
+# https://www.mkdocs.org/user-guide/configuration/#use_directory_urls
+use_directory_urls: false
diff --git a/pyproject.toml b/pyproject.toml
index 881e0fa73eda..aca4c4cf7f61 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,6 +8,7 @@ requires = [
     "setuptools-scm>=8.0",
     "torch == 2.7.0",
     "wheel",
+    "regex",
     "jinja2",
 ]
 build-backend = "setuptools.build_meta"
@@ -35,8 +36,8 @@ dynamic = [ "version", "dependencies", "optional-dependencies"]
 
 [project.urls]
 Homepage="https://github.com/vllm-project/vllm"
-Documentation="https://vllm.readthedocs.io/en/latest/"
-Slack="http://slack.vllm.ai/"
+Documentation="https://docs.vllm.ai/en/latest/"
+Slack="https://slack.vllm.ai/"
 
 [project.scripts]
 vllm = "vllm.entrypoints.cli.main:main"
@@ -56,16 +57,12 @@ ignore_patterns = [
     ".buildkite/**",
     "benchmarks/**",
     "build/**",
+    "examples/**",
 ]
 
 [tool.ruff]
 # Allow lines to be as long as 80.
 line-length = 80
-exclude = [
-    # External file, leaving license intact
-    "examples/other/fp8/quantizer/quantize.py",
-    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
-]
 
 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]
@@ -148,6 +145,7 @@ skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora
 skip_glob = [
     ".buildkite/*",
     "benchmarks/*",
+    "examples/*",
 ]
 use_parentheses = true
 skip_gitignore = true
@@ -165,9 +163,12 @@ markers = [
 
 [tool.pymarkdown]
 plugins.md004.style = "sublist" # ul-style
+plugins.md007.indent = 4 # ul-indent
+plugins.md007.start_indented = true # ul-indent
 plugins.md013.enabled = false # line-length
 plugins.md041.enabled = false # first-line-h1
 plugins.md033.enabled = false # inline-html
+plugins.md046.enabled = false # code-block-style
 plugins.md024.allow_different_nesting = true # no-duplicate-headers
 
 [tool.ty]
diff --git a/requirements/build.txt b/requirements/build.txt
index 5edc593b9270..320e5b892584 100644
--- a/requirements/build.txt
+++ b/requirements/build.txt
@@ -7,3 +7,4 @@ setuptools-scm>=8
 torch==2.7.0
 wheel
 jinja2>=3.1.6
+regex
diff --git a/requirements/common.txt b/requirements/common.txt
index 80f90e60007e..625efc3366f4 100644
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -1,3 +1,4 @@
+regex # Replace re for higher-performance regex matching
 cachetools
 psutil
 sentencepiece  # Required for LLaMA tokenizer.
@@ -7,7 +8,7 @@ tqdm
 blake3
 py-cpuinfo
 transformers >= 4.51.1
-huggingface-hub[hf_xet] >= 0.30.0  # Required for Xet downloads.
+huggingface-hub[hf_xet] >= 0.32.0  # Required for Xet downloads.
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf # Required by LlamaTokenizer.
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
@@ -40,7 +41,7 @@ compressed-tensors == 0.9.4 # required for compressed-tensors
 depyf==0.18.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files
-python-json-logger # Used by logging as per examples/other/logging_configuration.md
+python-json-logger # Used by logging as per examples/others/logging_configuration.md
 scipy # Required for phi-4-multimodal-instruct
 ninja # Required for xgrammar, rocm, tpu, xpu
 opentelemetry-sdk>=1.26.0  # vllm.tracing
diff --git a/requirements/cpu.txt b/requirements/cpu.txt
index 752931158a05..1213301584ce 100644
--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -2,11 +2,12 @@
 -r common.txt
 
 # Dependencies for CPUs
+packaging>=24.2
+setuptools>=77.0.3,<80.0.0
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch==2.7.0+cpu; platform_machine == "x86_64"
 torch==2.7.0; platform_system == "Darwin"
 torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
-torch==2.7.0.dev20250304; platform_machine == "s390x"
 
 # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
 torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
@@ -19,3 +20,7 @@ datasets # for benchmark scripts
 
 # cpu cannot use triton 3.3.0
 triton==3.2.0; platform_machine == "x86_64"
+
+# Intel Extension for PyTorch, only for x86_64 CPUs
+intel-openmp==2024.2.1; platform_machine == "x86_64"
+intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64"
diff --git a/requirements/docs.txt b/requirements/docs.txt
index 9c267edaceaf..64c70cb65c55 100644
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
@@ -1,19 +1,9 @@
-sphinx==7.4.7
-sphinx-argparse==0.5.2
-sphinx-book-theme==1.1.4
-sphinx-copybutton==0.5.2
-sphinx-design==0.6.1
-sphinx-togglebutton==0.3.2
-myst-parser==3.0.1  # `myst-parser==4.0.1` breaks inline code in titles
-msgspec
-snowballstemmer<3  # https://github.com/snowballstem/snowball/issues/229
-commonmark # Required by sphinx-argparse when using :markdownhelp:
-
-# Custom autodoc2 is necessary for faster docstring processing
-# see: https://github.com/sphinx-extensions2/sphinx-autodoc2/issues/33#issuecomment-2856386035
-git+https://github.com/hmellor/sphinx-autodoc2.git # sphinx-autodoc2==0.5.0
-
-# packages to install to build the documentation
-cachetools
--f https://download.pytorch.org/whl/cpu
-torch
\ No newline at end of file
+mkdocs
+mkdocs-api-autonav
+mkdocs-material
+mkdocstrings-python
+mkdocs-gen-files
+mkdocs-awesome-nav
+python-markdown-math
+regex
+ruff
diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt
index 3aebcaa623c0..e9b466d3a82d 100644
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -38,4 +38,4 @@ matplotlib # required for qwen-vl test
 # required for  Multi-Modal Models Test (Standard)
 num2words # required for smolvlm test
 pqdm
-timm # required for internvl test
+timm # required for internvl test
\ No newline at end of file
diff --git a/requirements/test.in b/requirements/test.in
index cdc7c563f087..87af61769038 100644
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -33,6 +33,7 @@ num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
+mteb>=1.38.11, <2 # required for mteb test
 transformers==4.51.3
 tokenizers==0.21.1
 huggingface-hub[hf_xet]>=0.30.0  # Required for Xet downloads.
diff --git a/requirements/test.txt b/requirements/test.txt
index 9a15d9a0d824..89d477017342 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -99,6 +99,7 @@ datasets==3.0.2
     # via
     #   evaluate
     #   lm-eval
+    #   mteb
 decorator==5.1.1
     # via librosa
 dill==0.3.8
@@ -124,6 +125,8 @@ email-validator==2.2.0
     # via pydantic
 encodec==0.1.1
     # via vocos
+eval-type-backport==0.2.2
+    # via mteb
 evaluate==0.4.3
     # via lm-eval
 fastparquet==2024.11.0
@@ -291,6 +294,8 @@ msgpack==1.1.0
     # via
     #   librosa
     #   ray
+mteb==1.38.11
+    # via -r requirements/test.in
 multidict==6.1.0
     # via
     #   aiohttp
@@ -331,6 +336,7 @@ numpy==1.26.4
     #   librosa
     #   matplotlib
     #   mistral-common
+    #   mteb
     #   numba
     #   numexpr
     #   opencv-python-headless
@@ -443,6 +449,8 @@ plotly==5.24.1
     # via genai-perf
 pluggy==1.5.0
     # via pytest
+polars==1.29.0
+    # via mteb
 pooch==1.8.2
     # via librosa
 portalocker==2.10.1
@@ -476,6 +484,7 @@ pydantic==2.9.2
     # via
     #   datamodel-code-generator
     #   mistral-common
+    #   mteb
 pydantic-core==2.23.4
     # via pydantic
 pygments==2.18.0
@@ -522,6 +531,8 @@ python-dateutil==2.9.0.post0
     #   typepy
 python-rapidjson==1.20
     # via tritonclient
+pytrec-eval-terrier==0.5.7
+    # via mteb
 pytz==2024.2
     # via
     #   pandas
@@ -564,6 +575,7 @@ requests==2.32.3
     #   huggingface-hub
     #   lm-eval
     #   mistral-common
+    #   mteb
     #   pooch
     #   ray
     #   responses
@@ -580,6 +592,7 @@ rfc3987==1.3.8
 rich==13.9.4
     # via
     #   genai-perf
+    #   mteb
     #   typer
 rouge-score==0.1.2
     # via lm-eval
@@ -607,16 +620,20 @@ scikit-learn==1.5.2
     # via
     #   librosa
     #   lm-eval
+    #   mteb
     #   sentence-transformers
 scipy==1.13.1
     # via
     #   librosa
+    #   mteb
     #   scikit-learn
     #   sentence-transformers
     #   statsmodels
     #   vocos
 sentence-transformers==3.2.1
-    # via -r requirements/test.in
+    # via
+    #   -r requirements/test.in
+    #   mteb
 sentencepiece==0.2.0
     # via mistral-common
 setuptools==77.0.3
@@ -696,6 +713,7 @@ torch==2.7.0+cu128
     #   fastsafetensors
     #   lm-eval
     #   mamba-ssm
+    #   mteb
     #   peft
     #   runai-model-streamer
     #   sentence-transformers
@@ -720,6 +738,7 @@ tqdm==4.66.6
     #   evaluate
     #   huggingface-hub
     #   lm-eval
+    #   mteb
     #   nltk
     #   peft
     #   pqdm
@@ -759,6 +778,7 @@ typing-extensions==4.12.2
     #   huggingface-hub
     #   librosa
     #   mistral-common
+    #   mteb
     #   pqdm
     #   pydantic
     #   pydantic-core
diff --git a/requirements/tpu.txt b/requirements/tpu.txt
index 11501bc5d92f..3b204a8f9905 100644
--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -18,9 +18,9 @@ setuptools==78.1.0
 --find-links https://storage.googleapis.com/libtpu-releases/index.html
 --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
 --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-torch==2.8.0.dev20250430
-torchvision==0.22.0.dev20250430
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
-torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250430-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
+torch==2.8.0.dev20250518
+torchvision==0.22.0.dev20250518
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
+torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250518-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
 
diff --git a/setup.py b/setup.py
old mode 100755
new mode 100644
index 7675fbdf3efe..180f2f978501
--- a/setup.py
+++ b/setup.py
@@ -5,12 +5,12 @@
 import json
 import logging
 import os
-import re
 import subprocess
 import sys
 from pathlib import Path
 from shutil import which
 
+import regex as re
 import torch
 from packaging.version import Version, parse
 from setuptools import Extension, setup
@@ -389,7 +389,6 @@ def run(self) -> None:
             # vllm_flash_attn python code:
             # Regex from
             #  `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
-            import re
             compiled_regex = re.compile(
                 r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
             file_members += list(
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 9f3b0e8ae079..86b5e1e0ab7c 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -8,12 +8,13 @@
 from unittest.mock import Mock
 
 import pytest
+import torch
 
-from vllm import LLM
+from vllm import LLM, envs
 from vllm.platforms import current_platform
 from vllm.v1.engine.llm_engine import LLMEngine as LLMEngineV1
 
-from ..conftest import VllmRunner
+from ..conftest import HfRunner, VllmRunner
 from ..models.utils import check_outputs_equal
 from ..utils import multi_gpu_test
 
@@ -43,11 +44,26 @@ def test_vllm_gc_ed():
     assert weak_llm() is None
 
 
+def _fix_prompt_embed_outputs(
+        vllm_outputs: list[tuple[list[int], str]], hf_model: HfRunner,
+        example_prompts: list[str]) -> list[tuple[list[int], str]]:
+    fixed_vllm_outputs = []
+    for vllm_output, hf_input, prompt in zip(
+            vllm_outputs, hf_model.get_inputs(example_prompts),
+            example_prompts):
+        hf_input_ids = hf_input["input_ids"].tolist()[0]
+        fixed_vllm_outputs.append(
+            (hf_input_ids + vllm_output[0][len(hf_input_ids):],
+             prompt + vllm_output[1]))
+    return fixed_vllm_outputs
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("backend", ["FLASH_ATTN"])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("enforce_eager", [False])
+@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 def test_models(
     monkeypatch: pytest.MonkeyPatch,
     hf_runner,
@@ -56,8 +72,13 @@ def test_models(
     dtype: str,
     max_tokens: int,
     enforce_eager: bool,
+    enable_prompt_embeds: bool,
 ) -> None:
 
+    if enable_prompt_embeds and envs.is_set(
+            "VLLM_USE_V1") and envs.VLLM_USE_V1:
+        pytest.skip("enable_prompt_embeds is not supported in v1.")
+
     if backend == "FLASHINFER" and current_platform.is_rocm():
         pytest.skip("Flashinfer does not support ROCm/HIP.")
 
@@ -78,14 +99,25 @@ def test_models(
 
         with hf_runner(model, dtype=dtype) as hf_model:
             hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+            if enable_prompt_embeds:
+                with torch.no_grad():
+                    prompt_embeds = hf_model.get_prompt_embeddings(
+                        example_prompts)
 
         with VllmRunner(model,
                         max_model_len=8192,
                         dtype=dtype,
                         enforce_eager=enforce_eager,
+                        enable_prompt_embeds=enable_prompt_embeds,
                         gpu_memory_utilization=0.7) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
+            if enable_prompt_embeds:
+                vllm_outputs = vllm_model.generate_greedy(
+                    prompt_embeds, max_tokens)
+                vllm_outputs = _fix_prompt_embed_outputs(
+                    vllm_outputs, hf_model, example_prompts)
+            else:
+                vllm_outputs = vllm_model.generate_greedy(
+                    example_prompts, max_tokens)
 
         check_outputs_equal(
             outputs_0_lst=hf_outputs,
@@ -108,6 +140,7 @@ def test_models(
         ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
         ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
     ])
+@pytest.mark.parametrize("enable_prompt_embeds", [True, False])
 def test_models_distributed(
     monkeypatch: pytest.MonkeyPatch,
     hf_runner,
@@ -117,14 +150,22 @@ def test_models_distributed(
     distributed_executor_backend: str,
     attention_backend: str,
     test_suite: str,
+    enable_prompt_embeds: bool,
 ) -> None:
 
+    if enable_prompt_embeds and envs.is_set(
+            "VLLM_USE_V1") and envs.VLLM_USE_V1:
+        pytest.skip("enable_prompt_embeds is not supported in v1.")
+
     if test_suite != TARGET_TEST_SUITE:
         pytest.skip(f"Skip test for {test_suite}")
 
     with monkeypatch.context() as monkeypatch_context:
         if model == "meta-llama/Llama-3.2-1B-Instruct" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
-            # test Ray Compiled Graph
+            if enable_prompt_embeds:
+                pytest.skip(
+                    "enable_prompt_embeds does not work with ray compiled dag."
+                )
             monkeypatch_context.setenv("VLLM_USE_RAY_SPMD_WORKER", "1")
             monkeypatch_context.setenv("VLLM_USE_RAY_COMPILED_DAG", "1")
 
@@ -147,12 +188,26 @@ def test_models_distributed(
                 dtype=dtype,
                 tensor_parallel_size=2,
                 distributed_executor_backend=distributed_executor_backend,
+                enable_prompt_embeds=enable_prompt_embeds,
+                gpu_memory_utilization=0.7,
         ) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(example_prompts,
-                                                      max_tokens)
-
-        with hf_runner(model, dtype=dtype) as hf_model:
-            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+            if enable_prompt_embeds:
+                with hf_runner(model, dtype=dtype) as hf_model:
+                    with torch.no_grad():
+                        prompt_embeds = hf_model.get_prompt_embeddings(
+                            example_prompts)
+                    vllm_outputs = vllm_model.generate_greedy(
+                        prompt_embeds, max_tokens)
+                    vllm_outputs = _fix_prompt_embed_outputs(
+                        vllm_outputs, hf_model, example_prompts)
+                    hf_outputs = hf_model.generate_greedy(
+                        example_prompts, max_tokens)
+            else:
+                vllm_outputs = vllm_model.generate_greedy(
+                    example_prompts, max_tokens)
+                with hf_runner(model, dtype=dtype) as hf_model:
+                    hf_outputs = hf_model.generate_greedy(
+                        example_prompts, max_tokens)
 
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
diff --git a/tests/compile/backend.py b/tests/compile/backend.py
index a21e8eca3a6e..5a02c4e2b378 100644
--- a/tests/compile/backend.py
+++ b/tests/compile/backend.py
@@ -5,6 +5,8 @@
 
 from torch import fx
 
+from vllm.compilation.fx_utils import (find_specified_fn,
+                                       find_specified_fn_maybe)
 from vllm.compilation.inductor_pass import InductorPass
 from vllm.config import get_current_vllm_config
 
@@ -44,3 +46,19 @@ def post_pass(self, graph: fx.Graph):
         self.graph_post_pass = deepcopy(graph)
         # assign by reference, will reflect the final state of the graph
         self.final_graph = graph
+
+    def check_before_ops(self, ops,
+                         find_fn=find_specified_fn, \
+                         find_fn_maybe=find_specified_fn_maybe, \
+                        ops_fully_replaced=True):
+        for op in ops:
+            find_fn(self.graph_pre_pass.nodes, op)
+            if ops_fully_replaced:
+                assert find_fn_maybe(self.graph_post_pass.nodes, op) is None
+
+    def check_after_ops(self, ops,
+                        find_fn=find_specified_fn, \
+                        find_fn_maybe=find_specified_fn_maybe):
+        for op in ops:
+            find_fn(self.graph_post_pass.nodes, op)
+            assert find_fn_maybe(self.graph_pre_pass.nodes, op) is None
diff --git a/tests/compile/test_async_tp.py b/tests/compile/test_async_tp.py
new file mode 100644
index 000000000000..8e4e0ba83579
--- /dev/null
+++ b/tests/compile/test_async_tp.py
@@ -0,0 +1,248 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+
+import pytest
+import torch
+
+import vllm.envs as envs
+from vllm.compilation.collective_fusion import AsyncTPPass
+from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
+                         PassConfig, VllmConfig)
+from vllm.distributed import (tensor_model_parallel_all_gather,
+                              tensor_model_parallel_reduce_scatter)
+from vllm.distributed.parallel_state import (init_distributed_environment,
+                                             initialize_model_parallel)
+from vllm.platforms import current_platform
+from vllm.utils import update_environment_variables
+
+from ..models.registry import HF_EXAMPLE_MODELS
+from ..utils import (compare_two_settings, create_new_process_for_each_test,
+                     multi_gpu_test)
+from .backend import TestBackend
+
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+
+class TestMMRSModel(torch.nn.Module):
+
+    def __init__(self, hidden_size=16):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.gate_proj = torch.nn.Parameter(torch.empty(
+            (self.hidden_size * 2, hidden_size)),
+                                            requires_grad=False)
+        # Initialize weights
+        torch.nn.init.normal_(self.gate_proj, std=0.02)
+
+    def forward(self, hidden_states):
+        """
+        Forward pass implementing the mm + reduce scatter in the FX graph
+
+        """
+        # Reshape input
+        view = hidden_states.reshape(-1, self.hidden_size)
+
+        # matrix multiplication
+        permute = self.gate_proj.permute(1, 0)
+        mm = torch.mm(view, permute)
+        reduce_scatter = tensor_model_parallel_reduce_scatter(mm, dim=0)
+        return reduce_scatter
+
+    def ops_in_model_before(self):
+        return [torch.ops.vllm.reduce_scatter.default]
+
+    def ops_in_model_after(self):
+        return [torch.ops.symm_mem.fused_matmul_reduce_scatter.default]
+
+
+class TestAGMMModel(torch.nn.Module):
+
+    def __init__(self, hidden_size=16):
+        super().__init__()
+        self.hidden_size = hidden_size
+        self.weight = torch.nn.Parameter(torch.empty(
+            (hidden_size, hidden_size)),
+                                         requires_grad=False)
+        # Initialize weights
+        torch.nn.init.normal_(self.weight, std=0.02)
+
+    def forward(self, hidden_states):
+        """
+        Forward pass implementing the mm + all gather in the FX graph
+        """
+        # Reshape input
+        view = hidden_states.reshape(-1, self.hidden_size)
+        all_gather = tensor_model_parallel_all_gather(view, dim=0)
+        permute = self.weight.permute(1, 0)
+        mm = torch.mm(all_gather, permute)
+        return mm
+
+    def ops_in_model_before(self):
+        return [torch.ops.vllm.all_gather.default]
+
+    def ops_in_model_after(self):
+        return [torch.ops.symm_mem.fused_all_gather_matmul.default]
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("test_model", [TestMMRSModel, TestAGMMModel])
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize("seq_len", [16])
+@pytest.mark.parametrize("hidden_size", [16])
+@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
+@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE not in ["cuda"],
+                    reason="Only test on CUDA")
+def test_async_tp_pass_replace(test_model: str, batch_size: int, seq_len: int,
+                               hidden_size: int, dtype: torch.dtype):
+    num_processes = 2
+
+    def run_torch_spawn(fn, nprocs):
+        # torch.multiprocessing.spawn is required here; otherwise we run into
+        # problems with torch.distributed and CUDA
+        torch.multiprocessing.spawn(fn,
+                                    args=(num_processes, test_model,
+                                          batch_size, seq_len, hidden_size,
+                                          dtype),
+                                    nprocs=nprocs)
+
+    run_torch_spawn(async_tp_pass_on_test_model, num_processes)
+
+
+def async_tp_pass_on_test_model(local_rank: int, world_size: int,
+                                test_model_cls: torch.nn.Module,
+                                batch_size: int, seq_len: int,
+                                hidden_size: int, dtype: torch.dtype):
+    current_platform.seed_everything(0)
+
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+    torch.set_default_device(device)
+    torch.set_default_dtype(dtype)
+
+    update_environment_variables({
+        'RANK': str(local_rank),
+        'LOCAL_RANK': str(local_rank),
+        'WORLD_SIZE': str(world_size),
+        'MASTER_ADDR': 'localhost',
+        'MASTER_PORT': '12345',
+    })
+
+    # initialize distributed
+    init_distributed_environment()
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+
+    # configure vllm config for AsyncTPPass
+    vllm_config = VllmConfig()
+    vllm_config.compilation_config = CompilationConfig(pass_config=PassConfig(
+        enable_async_tp=True, ), )
+    vllm_config.device_config = DeviceConfig(device=torch.device("cuda"))
+
+    # this is a fake model name to construct the model config
+    # in the vllm_config, it's not really used.
+    model_name = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP8-e2e"
+    vllm_config.model_config = ModelConfig(model=model_name,
+                                           task="auto",
+                                           tokenizer=model_name,
+                                           tokenizer_mode="auto",
+                                           trust_remote_code=True,
+                                           dtype=dtype,
+                                           seed=42)
+
+    async_tp_pass = AsyncTPPass(vllm_config)
+    backend = TestBackend(async_tp_pass)
+
+    model = test_model_cls(hidden_size)
+
+    hidden_states = torch.randn((batch_size * seq_len, hidden_size),
+                                dtype=dtype,
+                                requires_grad=False)
+
+    compiled_model = torch.compile(model, backend=backend)
+    compiled_model(hidden_states)
+
+    # In pre-nodes, all gather or reduce scatter should exist,
+    # fused_matmul_reduce_scatter or fused_all_gather_matmul should not
+    backend.check_before_ops(model.ops_in_model_before(),
+                             ops_fully_replaced=False)
+
+    # In post-nodes, fused_matmul_reduce_scatter or
+    # fused_all_gather_matmul should exist
+    backend.check_after_ops(model.ops_in_model_after())
+
+
+@create_new_process_for_each_test()
+@pytest.mark.parametrize("model_id", ["meta-llama/Llama-3.2-1B-Instruct"])
+@pytest.mark.parametrize("tp_size", [2])
+@pytest.mark.parametrize("async_tp_enabled", [True])
+@pytest.mark.parametrize("distributed_backend", ["mp"])
+@pytest.mark.parametrize("eager_mode", [False, True])
+def test_async_tp_pass_correctness(
+    model_id: str,
+    tp_size: int,
+    async_tp_enabled: bool,
+    distributed_backend: str,
+    eager_mode: bool,
+    num_gpus_available: int,
+):
+    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
+    model_info.check_transformers_version(on_fail="skip")
+    model_info.check_available_online(on_fail="skip")
+
+    pp_size = 1
+    if num_gpus_available < tp_size:
+        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
+
+    common_args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "2048",
+        "--max-num-seqs",
+        "8",
+    ]
+    if eager_mode:
+        common_args.append("--enforce-eager")
+
+    compilation_config = {
+        'level': 3,
+        'compile_sizes': [2, 4, 8],
+        'splitting_ops': [],
+        'pass_config': {
+            'enable_async_tp': async_tp_enabled
+        },
+    }
+
+    async_tp_env = tp_env = {
+        "VLLM_USE_V1": "1",
+    }
+
+    aysnc_tp_args = [
+        *common_args,
+        "--tensor-parallel-size",
+        str(tp_size),
+        "--distributed-executor-backend",
+        distributed_backend,
+        "--compilation_config",
+        json.dumps(compilation_config),
+    ]
+
+    tp_args = [
+        *common_args,
+        "--tensor-parallel-size",
+        str(tp_size),
+        "--distributed-executor-backend",
+        "mp",
+    ]
+
+    compare_two_settings(model_id,
+                         aysnc_tp_args,
+                         tp_args,
+                         async_tp_env,
+                         tp_env,
+                         method="generate")
diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py
index 4d56b34bdecf..509593e7328d 100644
--- a/tests/compile/test_fusion.py
+++ b/tests/compile/test_fusion.py
@@ -29,6 +29,10 @@ def __init__(self, hidden_size: int, eps: float, static: bool,
         self.cutlass_fp8_enabled = cutlass_fp8_enabled
         self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
         self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
+        self.key = QuantKey(dtype=FP8_DTYPE,
+                            static=static,
+                            per_tensor=static,
+                            symmetric=True)
         if static:
             self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
         else:
@@ -59,6 +63,15 @@ def forward(self, x):
         y3, resid = self.norm[2](x3, resid)  # use resid here
         return y3
 
+    def ops_in_model_before(self):
+        return [QUANT_OPS[self.key]]
+
+    def ops_in_model_after(self):
+        return [
+            FUSED_OPS[FusedRMSQuantKey(self.key, False)],
+            FUSED_OPS[FusedRMSQuantKey(self.key, True)]
+        ]
+
 
 @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("hidden_size", [64, 3392, 4096])
@@ -107,25 +120,10 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
 
         torch.testing.assert_close(result, result2, atol=ATOL, rtol=RTOL)
 
-        # Check substitution worked
-        pre_nodes = backend.graph_pre_pass.nodes
-        post_nodes = backend.graph_post_pass.nodes
-
-        # static is per-tensor, dynamic is per-token
-        key = QuantKey(dtype=FP8_DTYPE,
-                       static=static,
-                       per_tensor=static,
-                       symmetric=True)
-        rms_quant = FUSED_OPS[FusedRMSQuantKey(key, False)]
-        add_rms_quant = FUSED_OPS[FusedRMSQuantKey(key, True)]
-        fp8_quant = QUANT_OPS[key]
-
         # In pre-nodes, fp8 quant should be there and fused kernels should not
-        assert find_auto_fn_maybe(pre_nodes, rms_quant) is None
-        assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None
-        find_auto_fn(pre_nodes, fp8_quant)
+        backend.check_before_ops(model.ops_in_model_before(), find_auto_fn,
+                                 find_auto_fn_maybe)
 
         # In post-nodes, fused kernels should be there and fp8 quant should not
-        find_auto_fn(post_nodes, rms_quant)
-        find_auto_fn(post_nodes, add_rms_quant)
-        assert find_auto_fn_maybe(post_nodes, fp8_quant) is None
+        backend.check_after_ops(model.ops_in_model_after(), find_auto_fn,
+                                find_auto_fn_maybe)
diff --git a/tests/compile/test_sequence_parallelism.py b/tests/compile/test_sequence_parallelism.py
index 6152f171705b..2cd7ebaacec0 100644
--- a/tests/compile/test_sequence_parallelism.py
+++ b/tests/compile/test_sequence_parallelism.py
@@ -5,9 +5,7 @@
 
 import vllm.envs as envs
 from vllm.compilation.fix_functionalization import FixFunctionalizationPass
-from vllm.compilation.fx_utils import (find_auto_fn, find_auto_fn_maybe,
-                                       find_specified_fn,
-                                       find_specified_fn_maybe, is_func)
+from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
 from vllm.compilation.sequence_parallelism import SequenceParallelismPass
 from vllm.config import (CompilationConfig, DeviceConfig, ModelConfig,
                          PassConfig, VllmConfig)
@@ -21,17 +19,6 @@
 from ..utils import multi_gpu_test
 from .backend import TestBackend
 
-OPS_IN_MODEL_BEFORE = [
-    torch.ops.vllm.all_reduce.default,
-]
-
-OPS_IN_MODEL_AFTER = [
-    torch.ops.vllm.reduce_scatter.default,
-    torch.ops.vllm.all_gather.default,
-]
-
-OPS_IN_MODEL = [torch.ops._C.fused_add_rms_norm.default]
-
 prompts = [
     "Hello, my name is",
     "The president of the United States is",
@@ -78,6 +65,18 @@ def forward(self, hidden_states, residual):
 
         return norm_output, residual_output
 
+    def ops_in_model_before(self):
+        return [torch.ops.vllm.all_reduce.default]
+
+    def ops_in_model_after(self):
+        return [
+            torch.ops.vllm.reduce_scatter.default,
+            torch.ops.vllm.all_gather.default
+        ]
+
+    def ops_in_model(self):
+        return [torch.ops._C.fused_add_rms_norm.default]
+
 
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("batch_size", [8])
@@ -156,26 +155,16 @@ def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int,
     compiled_model_func = torch.compile(model, backend=backend_func)
     compiled_model_func(hidden_states, residual)
 
-    # Check substitution worked
-    pre_nodes = backend_no_func.graph_pre_pass.nodes
-    post_nodes = backend_no_func.graph_post_pass.nodes
-
     # In pre-nodes, all reduce should be there,
     # reduce scatter and all gather should not
-    for op in OPS_IN_MODEL_BEFORE:
-        find_specified_fn(pre_nodes, op)
-    for op in OPS_IN_MODEL_AFTER:
-        assert find_specified_fn_maybe(pre_nodes, op) is None
+    backend_no_func.check_before_ops(model.ops_in_model_before())
 
     # In post-nodes, reduce scatter and all gather should be there,
     # all reduce should not
-    for op in OPS_IN_MODEL_AFTER:
-        find_specified_fn(post_nodes, op)
-    for op in OPS_IN_MODEL_BEFORE:
-        assert find_specified_fn_maybe(post_nodes, op) is None
+    backend_no_func.check_after_ops(model.ops_in_model_after())
 
     # check if the functionalization pass is applied
-    for op in OPS_IN_MODEL:
+    for op in model.ops_in_model():
         find_auto_fn(backend_no_func.graph_post_pass.nodes, op)
         assert find_auto_fn_maybe(backend_func.graph_post_pass.nodes,
                                   op) is None  # noqa: E501
@@ -183,7 +172,7 @@ def sequence_parallelism_pass_on_test_model(local_rank: int, world_size: int,
     # make sure the ops were all de-functionalized
     found = dict()
     for node in backend_func.graph_post_pass.nodes:
-        for op in OPS_IN_MODEL:
+        for op in model.ops_in_model():
             if is_func(node, op):
                 found[op] = True
-    assert all(found[op] for op in OPS_IN_MODEL)
+    assert all(found[op] for op in model.ops_in_model())
diff --git a/tests/conftest.py b/tests/conftest.py
index c5700179c228..19c2c6247129 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -430,6 +430,15 @@ def get_inputs(
 
         return all_inputs
 
+    def get_prompt_embeddings(self, prompts: list[str]) -> list[torch.Tensor]:
+        all_inputs = self.get_inputs(prompts)
+        embeddings = []
+        for inputs in all_inputs:
+            input_ids = self.wrap_device(inputs)["input_ids"]
+            embedding = self.model.get_input_embeddings()(input_ids).squeeze(0)
+            embeddings.append(embedding)
+        return embeddings
+
     def classify(self, prompts: list[str]) -> list[str]:
         # output is final logits
         all_inputs = self.get_inputs(prompts)
diff --git a/tests/distributed/test_events.py b/tests/distributed/test_events.py
index 15bcfdb8555f..8de1aa20eabd 100644
--- a/tests/distributed/test_events.py
+++ b/tests/distributed/test_events.py
@@ -119,13 +119,12 @@ def test_topic_filtering(publisher_config):
     """
     publisher_config.replay_endpoint = None
 
-    cfg = publisher_config.model_copy()
-    cfg.topic = "foo"
-    pub = EventPublisherFactory.create(cfg)
+    publisher_config.topic = "foo"
+    pub = EventPublisherFactory.create(publisher_config)
 
     from .conftest import MockSubscriber
-    sub_foo = MockSubscriber(cfg.endpoint, None, "foo")
-    sub_bar = MockSubscriber(cfg.endpoint, None, "bar")
+    sub_foo = MockSubscriber(publisher_config.endpoint, None, "foo")
+    sub_bar = MockSubscriber(publisher_config.endpoint, None, "bar")
 
     try:
         time.sleep(0.1)
diff --git a/tests/distributed/test_shm_broadcast.py b/tests/distributed/test_shm_broadcast.py
index 711c2441f34b..f9eacc11d75f 100644
--- a/tests/distributed/test_shm_broadcast.py
+++ b/tests/distributed/test_shm_broadcast.py
@@ -9,7 +9,7 @@
 
 from vllm.distributed.device_communicators.shm_broadcast import MessageQueue
 from vllm.distributed.utils import StatelessProcessGroup
-from vllm.utils import get_ip, get_open_port, update_environment_variables
+from vllm.utils import get_open_port, update_environment_variables
 
 
 def get_arrays(n: int, seed: int = 0) -> list[np.ndarray]:
@@ -60,12 +60,12 @@ def worker_fn():
     rank = dist.get_rank()
     if rank == 0:
         port = get_open_port()
-        ip = get_ip()
+        ip = '127.0.0.1'
         dist.broadcast_object_list([ip, port], src=0)
     else:
         recv = [None, None]
         dist.broadcast_object_list(recv, src=0)
-        ip, port = recv
+        ip, port = recv  # type: ignore
 
     stateless_pg = StatelessProcessGroup.create(ip, port, rank,
                                                 dist.get_world_size())
@@ -107,10 +107,10 @@ def worker_fn():
 
         if pg == dist.group.WORLD:
             dist.barrier()
-            print("torch distributed passed the test!")
+            print(f"torch distributed passed the test! Rank {rank}")
         else:
             pg.barrier()
-            print("StatelessProcessGroup passed the test!")
+            print(f"StatelessProcessGroup passed the test! Rank {rank}")
 
 
 def test_shm_broadcast():
diff --git a/tests/entrypoints/llm/test_guided_generate.py b/tests/entrypoints/llm/test_guided_generate.py
index fdbdccd4654c..dd5d17885eb9 100644
--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -1,12 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
-import re
 import weakref
 from enum import Enum
 
 import jsonschema
 import pytest
+import regex as re
 from pydantic import BaseModel
 
 from vllm.distributed import cleanup_dist_env_and_memory
diff --git a/tests/entrypoints/openai/correctness/test_mteb.py b/tests/entrypoints/openai/correctness/test_mteb.py
new file mode 100644
index 000000000000..ebf2f829b583
--- /dev/null
+++ b/tests/entrypoints/openai/correctness/test_mteb.py
@@ -0,0 +1,41 @@
+# SPDX-License-Identifier: Apache-2.0
+import os
+
+import pytest
+
+from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
+                                                      OpenAIClientMtebEncoder,
+                                                      run_mteb_embed_task,
+                                                      run_mteb_embed_task_st)
+from tests.utils import RemoteOpenAIServer
+
+os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"
+
+MODEL_NAME = "BAAI/bge-m3"
+DTYPE = "float16"
+MAIN_SCORE = 0.7873427091972599
+
+
+@pytest.fixture(scope="module")
+def server():
+    args = [
+        "--task", "embed", "--dtype", DTYPE, "--enforce-eager",
+        "--max-model-len", "512"
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+def test_mteb(server):
+    client = server.get_client()
+    encoder = OpenAIClientMtebEncoder(MODEL_NAME, client)
+    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
+    st_main_score = MAIN_SCORE or run_mteb_embed_task_st(
+        MODEL_NAME, MTEB_EMBED_TASKS)
+
+    print("VLLM main score: ", vllm_main_score)
+    print("SentenceTransformer main score: ", st_main_score)
+    print("Difference: ", st_main_score - vllm_main_score)
+
+    assert st_main_score == pytest.approx(vllm_main_score, rel=1e-4)
diff --git a/tests/entrypoints/openai/test_chat.py b/tests/entrypoints/openai/test_chat.py
index a10b42ea3a4b..2509ef0d280a 100644
--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -2,13 +2,13 @@
 
 # imports for guided decoding tests
 import json
-import re
 from typing import Optional
 
 import jsonschema
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
+import regex as re
 import requests
 import torch
 from openai import BadRequestError, OpenAI
diff --git a/tests/entrypoints/openai/test_completion.py b/tests/entrypoints/openai/test_completion.py
index 1d9aa4972b70..9d12f27a2b87 100644
--- a/tests/entrypoints/openai/test_completion.py
+++ b/tests/entrypoints/openai/test_completion.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
-
 # imports for guided decoding tests
 import json
-import re
 import shutil
 from tempfile import TemporaryDirectory
 from typing import Optional
@@ -11,6 +9,7 @@
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
+import regex as re
 # downloading lora to test lora requests
 from huggingface_hub import snapshot_download
 from openai import BadRequestError
diff --git a/tests/entrypoints/openai/test_openai_schema.py b/tests/entrypoints/openai/test_openai_schema.py
index 5c585d54c429..cae2a3b59553 100644
--- a/tests/entrypoints/openai/test_openai_schema.py
+++ b/tests/entrypoints/openai/test_openai_schema.py
@@ -1,6 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
+from typing import Final
+
 import pytest
 import schemathesis
+from hypothesis import settings
 from schemathesis import GenerationConfig
 
 from ...utils import RemoteOpenAIServer
@@ -9,6 +12,8 @@
 
 MODEL_NAME = "HuggingFaceTB/SmolVLM-256M-Instruct"
 MAXIMUM_IMAGES = 2
+DEFAULT_TIMEOUT_SECONDS: Final[int] = 10
+LONG_TIMEOUT_SECONDS: Final[int] = 60
 
 
 @pytest.fixture(scope="module")
@@ -42,8 +47,58 @@ def get_schema(server):
 schema = schemathesis.from_pytest_fixture("get_schema")
 
 
+@schemathesis.hook
+def before_generate_case(context: schemathesis.hooks.HookContext, strategy):
+    op = context.operation
+    assert op is not None
+
+    def no_file_type(case: schemathesis.models.Case):
+        """
+        This filter skips test cases for the `POST /tokenize` endpoint where the
+        HTTP request body uses `"type": "file"` in any message's content.
+        We expect these cases to fail because that type isn't implemented here
+        https://github.com/vllm-project/vllm/blob/0b34593017953051b3225b1483ce0f4670e3eb0e/vllm/entrypoints/chat_utils.py#L1038-L1095
+
+        Example test cases that are skipped:
+        curl -X POST -H 'Content-Type: application/json' \
+            -d '{"messages": [{"role": "assistant"}, {"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
+            http://localhost:8000/tokenize
+
+        curl -X POST -H 'Content-Type: application/json' \
+            -d '{"messages": [{"content": [{"file": {}, "type": "file"}], "role": "user"}]}' \
+            http://localhost:8000/tokenize
+        """  # noqa: E501
+        if (op.method.lower() == "post" and op.path == "/tokenize"
+                and hasattr(case, "body") and isinstance(case.body, dict)
+                and "messages" in case.body
+                and isinstance(case.body["messages"], list)
+                and len(case.body["messages"]) > 0):
+            for message in case.body["messages"]:
+                if not isinstance(message, dict):
+                    continue
+                content = message.get("content", [])
+                if not isinstance(content, list) or len(content) == 0:
+                    continue
+                if any(item.get("type") == "file" for item in content):
+                    return False
+        return True
+
+    return strategy.filter(no_file_type)
+
+
 @schema.parametrize()
 @schema.override(headers={"Content-Type": "application/json"})
+@settings(deadline=LONG_TIMEOUT_SECONDS * 1000)
 def test_openapi_stateless(case: schemathesis.Case):
+    key = (
+        case.operation.method.upper(),
+        case.operation.path,
+    )
+    timeout = {
+        # requires a longer timeout
+        ("POST", "/v1/chat/completions"):
+        LONG_TIMEOUT_SECONDS,
+    }.get(key, DEFAULT_TIMEOUT_SECONDS)
+
     #No need to verify SSL certificate for localhost
-    case.call_and_validate(verify=False)
+    case.call_and_validate(verify=False, timeout=timeout)
diff --git a/tests/entrypoints/openai/test_prompt_validation.py b/tests/entrypoints/openai/test_prompt_validation.py
index f889189a9968..e384915899d3 100644
--- a/tests/entrypoints/openai/test_prompt_validation.py
+++ b/tests/entrypoints/openai/test_prompt_validation.py
@@ -1,10 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # imports for guided decoding tests
-import re
-
 import openai
 import pytest
+import regex as re
 
 from ...utils import RemoteOpenAIServer
 
@@ -32,7 +31,7 @@ async def test_out_of_vocab_token_ids():
         client = remote_server.get_async_client()
 
         with pytest.raises(openai.BadRequestError,
-                           match=re.compile('.*out of vocabulary.*')):
+                           match=re.compile('.*out of vocabulary.*').pattern):
             await client.completions.create(model=model_name,
                                             prompt=[999999],
                                             max_tokens=5,
@@ -46,9 +45,10 @@ async def test_reject_multistep_with_guided_decoding():
     with RemoteOpenAIServer(model_name, server_args) as remote_server:
         client = remote_server.get_async_client()
 
-        with pytest.raises(openai.BadRequestError,
-                           match=re.compile(
-                               '.*Guided decoding .* multi-step decoding.*')):
+        with pytest.raises(
+                openai.BadRequestError,
+                match=re.compile(
+                    '.*Guided decoding .* multi-step decoding.*').pattern):
             await client.completions.create(
                 model=model_name,
                 prompt="Hello",
diff --git a/tests/entrypoints/openai/test_score.py b/tests/entrypoints/openai/test_score.py
index b756680ea9f2..b373f2912752 100644
--- a/tests/entrypoints/openai/test_score.py
+++ b/tests/entrypoints/openai/test_score.py
@@ -1,6 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-
-import math
 from typing import Any
 
 import pytest
@@ -92,7 +90,7 @@ def test_text_1_str_text_2_list(self, server: RemoteOpenAIServer,
         hf_outputs = run_transformers(runner, model, text_pairs)
 
         for i in range(len(vllm_outputs)):
-            assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
+            assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
 
     def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer,
                                      model: dict[str, Any], runner):
@@ -124,7 +122,7 @@ def test_text_1_list_text_2_list(self, server: RemoteOpenAIServer,
         hf_outputs = run_transformers(runner, model, text_pairs)
 
         for i in range(len(vllm_outputs)):
-            assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
+            assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
 
     def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer,
                                    model: dict[str, Any], runner):
@@ -150,7 +148,7 @@ def test_text_1_str_text_2_str(self, server: RemoteOpenAIServer,
         hf_outputs = run_transformers(runner, model, text_pairs)
 
         for i in range(len(vllm_outputs)):
-            assert math.isclose(hf_outputs[i], vllm_outputs[i], rel_tol=0.01)
+            assert hf_outputs[i] == pytest.approx(vllm_outputs[i], rel=0.01)
 
     def test_score_max_model_len(self, server: RemoteOpenAIServer,
                                  model: dict[str, Any]):
diff --git a/tests/entrypoints/openai/test_tensorizer_entrypoint.py b/tests/entrypoints/openai/test_tensorizer_entrypoint.py
new file mode 100644
index 000000000000..f1ab7223048d
--- /dev/null
+++ b/tests/entrypoints/openai/test_tensorizer_entrypoint.py
@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+import gc
+import json
+import tempfile
+
+import openai
+import pytest
+import pytest_asyncio
+import torch.cuda
+
+from vllm.engine.arg_utils import EngineArgs
+from vllm.model_executor.model_loader.tensorizer import (
+    TensorizerConfig, tensorize_lora_adapter, tensorize_vllm_model)
+
+from ...utils import RemoteOpenAIServer
+
+MODEL_NAME = "unsloth/llama-3.2-1b-Instruct"
+LORA_PATH = "davzoku/finqa_adapter_1b"
+
+
+def _cleanup():
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+@pytest.fixture(autouse=True)
+def cleanup():
+    _cleanup()
+
+
+@pytest.fixture(scope='module')
+def tmp_dir():
+    with tempfile.TemporaryDirectory() as path:
+        yield path
+
+
+@pytest.fixture(scope='module')
+def model_uri(tmp_dir):
+    yield f"{tmp_dir}/model.tensors"
+
+
+@pytest.fixture(scope="module")
+def tensorize_model_and_lora(tmp_dir, model_uri):
+    tensorizer_config = TensorizerConfig(tensorizer_uri=model_uri,
+                                         lora_dir=tmp_dir)
+    args = EngineArgs(model=MODEL_NAME, device="cuda")
+
+    tensorize_lora_adapter(LORA_PATH, tensorizer_config)
+    tensorize_vllm_model(args, tensorizer_config)
+
+    # Invoke _cleanup() manually here, because the autouse cleanup()
+    # fixture is not guaranteed to run after this fixture
+    # when a test uses it
+    _cleanup()
+    yield
+
+
+@pytest.fixture(scope="module")
+def server(model_uri, tensorize_model_and_lora):
+    model_loader_extra_config = {
+        "tensorizer_uri": model_uri,
+    }
+
+    # Start the OpenAI API server
+    args = [
+        "--load-format", "tensorizer", "--device", "cuda",
+        "--model-loader-extra-config",
+        json.dumps(model_loader_extra_config), "--enable-lora"
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_completion(client: openai.AsyncOpenAI, model_name: str):
+    _cleanup()
+    completion = await client.completions.create(model=model_name,
+                                                 prompt="Hello, my name is",
+                                                 max_tokens=5,
+                                                 temperature=0.0)
+
+    assert completion.id is not None
+    assert completion.choices is not None and len(completion.choices) == 1
+    assert completion.model == MODEL_NAME
+    assert len(completion.choices) == 1
+    assert len(completion.choices[0].text) >= 5
+    assert completion.choices[0].finish_reason == "length"
+    assert completion.usage == openai.types.CompletionUsage(
+        completion_tokens=5, prompt_tokens=6, total_tokens=11)
diff --git a/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
new file mode 100644
index 000000000000..92ba1376e200
--- /dev/null
+++ b/tests/entrypoints/openai/tool_parsers/test_llama4_pythonic_tool_parser.py
@@ -0,0 +1,193 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from tests.entrypoints.openai.tool_parsers.utils import (
+    run_tool_extraction, run_tool_extraction_streaming)
+from vllm.entrypoints.openai.protocol import FunctionCall
+from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager
+
+# Test cases mirror the pythonic parser's, using the Llama4-specific format
+SIMPLE_FUNCTION_OUTPUT = "[get_weather(city='LA', metric='C')]"
+SIMPLE_FUNCTION_CALL = FunctionCall(
+    name="get_weather",
+    arguments='{"city": "LA", "metric": "C"}',
+)
+MORE_TYPES_FUNCTION_OUTPUT = ("[register_user(name='Doe', "
+                              "age=9, "
+                              "address={'city': 'LA', 'state': 'CA'}, "
+                              "role=None, "
+                              "passed_test=True, "
+                              "aliases=['John', 'Johnny'])]")
+MORE_TYPES_FUNCTION_CALL = FunctionCall(
+    name="register_user",
+    arguments='{"name": "Doe", '
+    '"age": 9, '
+    '"address": {"city": "LA", "state": "CA"}, '
+    '"role": null, '
+    '"passed_test": true, '
+    '"aliases": ["John", "Johnny"]}',
+)
+PARAMETERLESS_FUNCTION_OUTPUT = "[get_weather()]"
+PARAMETERLESS_FUNCTION_CALL = FunctionCall(
+    name="get_weather",
+    arguments='{}',
+)
+EMPTY_DICT_FUNCTION_OUTPUT = "[do_something_cool(additional_data={})]"
+EMPTY_DICT_FUNCTION_CALL = FunctionCall(
+    name="do_something_cool",
+    arguments='{"additional_data": {}}',
+)
+EMPTY_LIST_FUNCTION_OUTPUT = "[do_something_cool(steps=[])]"
+EMPTY_LIST_FUNCTION_CALL = FunctionCall(
+    name="do_something_cool",
+    arguments='{"steps": []}',
+)
+ESCAPED_STRING_FUNCTION_OUTPUT = (
+    r"[get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')]")
+ESCAPED_STRING_FUNCTION_CALL = FunctionCall(
+    name="get_weather",
+    arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}',
+)
+PYTHON_TAG_FUNCTION_OUTPUT = (
+    "<|python_start|>[get_weather(city='LA', metric='C')]<|python_end|>")
+
+
+@pytest.mark.parametrize("streaming", [True, False])
+def test_no_tool_call(streaming: bool):
+    mock_tokenizer = MagicMock()
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
+        "llama4_pythonic")(mock_tokenizer)
+    model_output = "How can I help you today?"
+
+    content, tool_calls = run_tool_extraction(tool_parser,
+                                              model_output,
+                                              streaming=streaming)
+
+    assert content == model_output
+    assert len(tool_calls) == 0
+
+
+test_str = "<|python_start|>"
+test_str += "[get_weather(city='LA', metric='C'),"
+test_str += "register_user(name='Doe', age=9)]"
+TEST_CASES = [
+    pytest.param(True,
+                 ESCAPED_STRING_FUNCTION_OUTPUT,
+                 [ESCAPED_STRING_FUNCTION_CALL],
+                 id="simple_streaming"),
+    pytest.param(False,
+                 SIMPLE_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL],
+                 id="simple_nonstreaming"),
+    pytest.param(True,
+                 MORE_TYPES_FUNCTION_OUTPUT, [MORE_TYPES_FUNCTION_CALL],
+                 id="more_types_streaming"),
+    pytest.param(False,
+                 MORE_TYPES_FUNCTION_OUTPUT, [MORE_TYPES_FUNCTION_CALL],
+                 id="more_types_nonstreaming"),
+    pytest.param(True,
+                 PARAMETERLESS_FUNCTION_OUTPUT, [PARAMETERLESS_FUNCTION_CALL],
+                 id="parameterless_streaming"),
+    pytest.param(False,
+                 PARAMETERLESS_FUNCTION_OUTPUT, [PARAMETERLESS_FUNCTION_CALL],
+                 id="parameterless_nonstreaming"),
+    pytest.param(True,
+                 EMPTY_DICT_FUNCTION_OUTPUT, [EMPTY_DICT_FUNCTION_CALL],
+                 id="empty_dict_streaming"),
+    pytest.param(False,
+                 EMPTY_DICT_FUNCTION_OUTPUT, [EMPTY_DICT_FUNCTION_CALL],
+                 id="empty_dict_nonstreaming"),
+    pytest.param(True,
+                 EMPTY_LIST_FUNCTION_OUTPUT, [EMPTY_LIST_FUNCTION_CALL],
+                 id="empty_list_streaming"),
+    pytest.param(False,
+                 EMPTY_LIST_FUNCTION_OUTPUT, [EMPTY_LIST_FUNCTION_CALL],
+                 id="empty_list_nonstreaming"),
+    pytest.param(True,
+                 ESCAPED_STRING_FUNCTION_OUTPUT,
+                 [ESCAPED_STRING_FUNCTION_CALL],
+                 id="escaped_string_streaming"),
+    pytest.param(False,
+                 ESCAPED_STRING_FUNCTION_OUTPUT,
+                 [ESCAPED_STRING_FUNCTION_CALL],
+                 id="escaped_string_nonstreaming"),
+    pytest.param(
+        True,
+        "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]",
+        [
+            SIMPLE_FUNCTION_CALL,
+            FunctionCall(name="register_user",
+                         arguments='{"name": "Doe", "age": 9}')
+        ],
+        id="parallel_calls_streaming"),
+    pytest.param(
+        False,
+        "[get_weather(city='LA',metric='C'),register_user(name='Doe',age=9)]",
+        [
+            SIMPLE_FUNCTION_CALL,
+            FunctionCall(name="register_user",
+                         arguments='{"name": "Doe", "age": 9}')
+        ],
+        id="parallel_calls_nonstreaming"),
+    pytest.param(True,
+                 PYTHON_TAG_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL],
+                 id="python_tag_streaming"),
+    pytest.param(False,
+                 PYTHON_TAG_FUNCTION_OUTPUT, [SIMPLE_FUNCTION_CALL],
+                 id="python_tag_nonstreaming"),
+    pytest.param(True,
+                 test_str, [
+                     SIMPLE_FUNCTION_CALL,
+                     FunctionCall(name="register_user",
+                                  arguments='{"name": "Doe", "age": 9}')
+                 ],
+                 id="parallel_calls_streaming"),
+    pytest.param(False,
+                 "<|python_start|>[get_weather(city='LA', metric='C'), " +
+                 "register_user(name='Doe', age=9)]", [
+                     SIMPLE_FUNCTION_CALL,
+                     FunctionCall(name="register_user",
+                                  arguments='{"name": "Doe", "age": 9}')
+                 ],
+                 id="parallel_calls_nonstreaming"),
+]
+
+
+@pytest.mark.parametrize("streaming, model_output, expected_tool_calls",
+                         TEST_CASES)
+def test_tool_call(streaming: bool, model_output: str,
+                   expected_tool_calls: list[FunctionCall]):
+    mock_tokenizer = MagicMock()
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
+        "llama4_pythonic")(mock_tokenizer)
+
+    content, tool_calls = run_tool_extraction(tool_parser,
+                                              model_output,
+                                              streaming=streaming)
+
+    assert len(tool_calls) == len(expected_tool_calls)
+    for actual, expected in zip(tool_calls, expected_tool_calls):
+        assert actual.type == "function"
+        assert actual.function == expected
+
+
+def test_streaming_tool_call_with_large_steps():
+    mock_tokenizer = MagicMock()
+    tool_parser: ToolParser = ToolParserManager.get_tool_parser(
+        "llama4_pythonic")(mock_tokenizer)
+    model_output_deltas = [
+        "<|python_start|>[get_weather(city='LA', metric='C'), "
+        "get_weather(), "
+        "do_something_cool(steps=[])]<|python_end|>",
+    ]
+
+    reconstructor = run_tool_extraction_streaming(
+        tool_parser, model_output_deltas, assert_one_tool_per_delta=False)
+
+    assert reconstructor.other_content == ""
+    assert len(reconstructor.tool_calls) == 3
+    assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL
+    assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL
+    assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL
diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py
index 8e8f9b82b34d..a8a1ceaeb675 100644
--- a/tests/kernels/attention/test_attention.py
+++ b/tests/kernels/attention/test_attention.py
@@ -148,6 +148,11 @@ def test_paged_attention(
             or (version == "rocm" and head_size not in (64, 128))):
         pytest.skip()
 
+    if (version == "rocm" and current_platform.is_navi()
+            and (kv_cache_dtype == "fp8" or head_size != 128
+                 or block_size != 16 or use_alibi)):
+        pytest.skip()
+
     global PARTITION_SIZE
 
     current_platform.seed_everything(seed)
@@ -280,6 +285,7 @@ def test_paged_attention(
                 scale,
                 block_tables,
                 seq_lens,
+                None,
                 block_size,
                 max_seq_len,
                 alibi_slopes,
@@ -291,7 +297,7 @@ def test_paged_attention(
             opcheck(torch.ops._rocm_C.paged_attention,
                     (output, exp_sums, max_logits, tmp_output, query,
                      key_cache, value_cache, num_kv_heads, scale, block_tables,
-                     seq_lens, block_size, max_seq_len, alibi_slopes,
+                     seq_lens, None, block_size, max_seq_len, alibi_slopes,
                      kv_cache_dtype, k_scale, v_scale),
                     cond=(head_size == HEAD_SIZES[0]
                           and block_size == BLOCK_SIZES[0]))
diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py
index 43ddc79fcb81..299279390fe0 100644
--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -575,3 +575,21 @@ def test_moe_align_block_size_opcheck():
     opcheck(torch.ops._moe_C.moe_align_block_size,
             (topk_ids, num_experts, block_size, sorted_ids, expert_ids,
              num_tokens_post_pad))
+
+
+@pytest.mark.parametrize("m", [1, 33, 64, 222])
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("k", [128, 511, 1024])
+@pytest.mark.parametrize("dtype",
+                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.skipif(current_platform.is_rocm(), reason="Skip for rocm")
+def test_moe_sum(m: int, topk: int, k: int, dtype: torch.dtype):
+    input = torch.randn((m, topk, k), device="cuda", dtype=dtype)
+    actual = torch.empty((m, k), device="cuda", dtype=dtype)
+
+    expected = input.sum(dim=1)
+    torch.ops._moe_C.moe_sum(input, actual)
+
+    torch.testing.assert_close(actual, expected, atol=2e-2, rtol=0)
+
+    opcheck(torch.ops._moe_C.moe_sum, (input, actual))
diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py
index dfcd61f77587..10e6ac64df87 100644
--- a/tests/kernels/moe/test_moe_permute_unpermute.py
+++ b/tests/kernels/moe/test_moe_permute_unpermute.py
@@ -13,7 +13,7 @@
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.model_executor.layers.fused_moe.layer import determine_expert_map
 from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
-    moe_permute, moe_unpermute)
+    moe_permute, moe_permute_unpermute_supported, moe_unpermute)
 from vllm.platforms import current_platform
 
 NUM_EXPERTS = [16, 64]
@@ -167,6 +167,8 @@ def torch_unpermute(permuted_hidden_states: torch.Tensor,
 def test_moe_permute_unpermute(n_token: int, n_hidden: int, topk: int,
                                n_expert: int, ep_size: int, dtype: torch.dtype,
                                align_block_size: Optional[int]):
+    if not moe_permute_unpermute_supported():
+        pytest.skip("moe_permute_unpermute is not supported on this platform.")
     fill_invalid_expert = 0
     ep_rank = np.random.randint(0, ep_size)
     expert_map = None
diff --git a/tests/kernels/quantization/test_block_fp8.py b/tests/kernels/quantization/test_block_fp8.py
index ef1d7e47ef81..ae05d61173f3 100644
--- a/tests/kernels/quantization/test_block_fp8.py
+++ b/tests/kernels/quantization/test_block_fp8.py
@@ -36,16 +36,16 @@
 
 # Test configurations
 DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
-NUM_TOKENS = [7, 83, 2048]
+NUM_TOKENS = [7, 2050]
 D = [512, 4096, 5120, 13824]
-GROUP_SIZE = [64, 128, 256, 512]
-M = [1, 7, 8, 83, 84, 512, 2048, 4096]
-N = [128, 512, 1024, 4096, 7168, 7748, 13824]
-K = [256, 4096, 5120, 3884, 13824, 16384]
+GROUP_SIZE = [64, 128, 512]
+M = [1, 7, 8, 83, 84, 4096]
+N = [128, 512, 7168, 7748, 13824]
+K = [256, 3884, 4096, 13824, 16384]
 # Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
 # and its hidden size is 7168.
-M_moe = [1, 2, 7, 83, 128, 512, 2048]
-M_moe_dg = [128, 192, 512, 1335, 2048]
+M_moe = [1, 2, 7, 83, 128, 2048]
+M_moe_dg = [128, 192, 1335, 2048]
 N_moe = [128, 256, 1024, 4608]  # [13824]
 K_moe = [256, 512, 7168]  # [13824]
 BLOCK_SIZE = [[128, 128]]
diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py
index 6cf88604ec65..ad755fe7f7a0 100644
--- a/tests/kernels/quantization/test_gguf.py
+++ b/tests/kernels/quantization/test_gguf.py
@@ -8,7 +8,6 @@
 from huggingface_hub import snapshot_download
 
 import vllm._custom_ops as ops
-from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_experts
 from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf
 from vllm.platforms import current_platform
@@ -35,11 +34,11 @@ def get_gguf_MoE_tensors(
     return GGUFReader(sample_file).tensors
 
 
-DTYPES = [torch.half, torch.bfloat16, torch.float32]
+DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
 # Hidden_size for testing, must match the sample file in HF repo,
 # we have `hidden_size = 256, 1024` for test in HF repo currently.
 HIDDEN_SIZES = [256, 1024]
-NUM_TOKENS = [7, 83, 128, 2048]  # Arbitrary values for testing
+NUM_TOKENS = [7, 2050]  # Arbitrary values for testing
 SEEDS = [0]
 QUANT_TYPES = [
     # i-matrix
@@ -176,12 +175,11 @@ def test_moe(num_tokens: int, hidden_size: int, dtype: torch.dtype,
 
     w2_dequant = torch.tensor(dequantize(w2.data, quant_type),
                               device="cuda").to(dtype)
-    act = SiluAndMul()
 
     output = _fused_moe_gguf(x, torch.tensor(w13.data, device="cuda"),
                              torch.tensor(w2.data,
                                           device="cuda"), topk_weights,
-                             topk_ids, quant_type, quant_type, act)
+                             topk_ids, quant_type, quant_type, "silu")
 
     ref_output = fused_experts(x, w13_dequant, w2_dequant, topk_weights,
                                topk_ids).reshape(output.shape)
diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py
index 45f10b0eb1d5..30e6eeb8d566 100644
--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
@@ -13,8 +13,13 @@
 
 device = "cuda"
 
+triton_scaled_mm_module = importlib.import_module(
+    "vllm.model_executor.layers.quantization.compressed_tensors."
+    "triton_scaled_mm")
+triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
 
-def scaled_mm_torch(a: torch.Tensor,
+
+def torch_scaled_mm(a: torch.Tensor,
                     b: torch.Tensor,
                     scale_a: torch.Tensor,
                     scale_b: torch.Tensor,
@@ -101,21 +106,8 @@ def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a,
     if use_bias:
         bias = torch.rand((N, ), device=device, dtype=out_dtype)
 
-    triton_scaled_mm_module = importlib.import_module(
-        "vllm.model_executor.layers.quantization.compressed_tensors."
-        "triton_scaled_mm")
-    triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm
-
     c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
 
-    a_cpu = a.cpu()
-    b_cpu = b.cpu()
-    scale_a_cpu = scale_a.cpu()
-    scale_b_cpu = scale_b.cpu()
-    bias_cpu = None if bias is None else bias.cpu()
-
-    c_actual = scaled_mm_torch(a_cpu, b_cpu, scale_a_cpu, scale_b_cpu,
-                               out_dtype, bias_cpu)
+    c_actual = torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias)
 
-    c_check_cpu = c_check.cpu()
-    torch.testing.assert_close(c_check_cpu, c_actual, rtol=1e-1, atol=1e-1)
+    torch.testing.assert_close(c_check, c_actual, rtol=1e-1, atol=1e-1)
diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py
index e3a054bd6206..580992dea53d 100644
--- a/tests/lora/test_llama_tp.py
+++ b/tests/lora/test_llama_tp.py
@@ -1,12 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
+import subprocess
+import sys
+from typing import Union
 
 import pytest
 import ray
 
 import vllm
+from vllm import LLM
 from vllm.lora.request import LoRARequest
+from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 
-from ..utils import create_new_process_for_each_test, multi_gpu_test
+from ..utils import VLLM_PATH, create_new_process_for_each_test, multi_gpu_test
 
 MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 
@@ -36,7 +41,10 @@ def v1(run_with_both_engines_lora):
     pass
 
 
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
+def do_sample(llm: vllm.LLM,
+              lora_path: str,
+              lora_id: int,
+              tensorizer_config_dict: Union[dict, None] = None) -> list[str]:
     prompts = [
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
@@ -45,15 +53,28 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
         "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
     ]
+
     sampling_params = vllm.SamplingParams(temperature=0,
                                           max_tokens=256,
                                           skip_special_tokens=False,
                                           stop=["[/assistant]"])
-    outputs = llm.generate(
-        prompts,
-        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
+
+    if tensorizer_config_dict is not None:
+        outputs = llm.generate(
+            prompts,
+            sampling_params,
+            lora_request=LoRARequest(
+                str(lora_id),
+                lora_id,
+                lora_path,
+                tensorizer_config_dict=tensorizer_config_dict)
+            if lora_id else None)
+    else:
+        outputs = llm.generate(
+            prompts,
+            sampling_params,
+            lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
+            if lora_id else None)
     # Print the outputs.
     generated_texts: list[str] = []
     for output in outputs:
@@ -64,18 +85,32 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
     return generated_texts
 
 
-def generate_and_test(llm, sql_lora_files):
+def generate_and_test(llm,
+                      sql_lora_files,
+                      tensorizer_config_dict: Union[dict, None] = None):
     print("lora adapter created")
-    assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
+    assert do_sample(llm,
+                     sql_lora_files,
+                     tensorizer_config_dict=tensorizer_config_dict,
+                     lora_id=0) == EXPECTED_NO_LORA_OUTPUT
 
     print("lora 1")
-    assert do_sample(llm, sql_lora_files, lora_id=1) == EXPECTED_LORA_OUTPUT
+    assert do_sample(llm,
+                     sql_lora_files,
+                     tensorizer_config_dict=tensorizer_config_dict,
+                     lora_id=1) == EXPECTED_LORA_OUTPUT
 
     print("no lora")
-    assert do_sample(llm, sql_lora_files, lora_id=0) == EXPECTED_NO_LORA_OUTPUT
+    assert do_sample(llm,
+                     sql_lora_files,
+                     tensorizer_config_dict=tensorizer_config_dict,
+                     lora_id=0) == EXPECTED_NO_LORA_OUTPUT
 
     print("lora 2")
-    assert do_sample(llm, sql_lora_files, lora_id=2) == EXPECTED_LORA_OUTPUT
+    assert do_sample(llm,
+                     sql_lora_files,
+                     tensorizer_config_dict=tensorizer_config_dict,
+                     lora_id=2) == EXPECTED_LORA_OUTPUT
 
     print("removing lora")
 
@@ -153,3 +188,64 @@ def test_llama_lora_tp4_fully_sharded_loras(sql_lora_files):
         enable_chunked_prefill=True,
     )
     generate_and_test(llm, sql_lora_files)
+
+
+@multi_gpu_test(num_gpus=2)
+@create_new_process_for_each_test()
+def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files,
+                                            sql_lora_huggingface_id):
+    """Round-trip a TP=2 model and its LoRA adapter through tensorizer."""
+    # Run the tensorizing of the LoRA adapter and the model in a subprocess
+    # to guarantee cleanup
+
+    tp_size = 2
+    model_name = "model-rank-%03d.tensors"
+
+    model_ref = MODEL_PATH
+    lora_path = sql_lora_huggingface_id
+    suffix = "test"
+    try:
+        result = subprocess.run([
+            sys.executable,
+            f"{VLLM_PATH}/examples/others/tensorize_vllm_model.py", "--model",
+            model_ref, "--lora-path", lora_path, "--tensor-parallel-size",
+            str(tp_size), "serialize", "--serialized-directory",
+            str(tmp_path), "--suffix", suffix
+        ],
+                                check=True,
+                                capture_output=True,
+                                text=True)
+    except subprocess.CalledProcessError as e:
+        print("Tensorizing failed.")
+        print("STDOUT:\n", e.stdout)
+        print("STDERR:\n", e.stderr)
+        raise
+
+    print("STDOUT:\n", result.stdout)
+
+    model_uri = tmp_path / "vllm" / model_ref / suffix / model_name
+    tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri))
+    tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
+
+    loaded_vllm_model = LLM(model=model_ref,
+                            load_format="tensorizer",
+                            enable_lora=True,
+                            enforce_eager=True,
+                            model_loader_extra_config=tensorizer_config,
+                            max_num_seqs=13,
+                            tensor_parallel_size=tp_size,
+                            max_loras=2)
+
+    tensorizer_config_dict = tensorizer_config.to_dict()
+
+    print("lora adapter created")
+    assert do_sample(loaded_vllm_model,
+                     sql_lora_files,
+                     tensorizer_config_dict=tensorizer_config_dict,
+                     lora_id=0) == EXPECTED_NO_LORA_OUTPUT
+
+    print("lora 1")
+    assert do_sample(loaded_vllm_model,
+                     sql_lora_files,
+                     tensorizer_config_dict=tensorizer_config_dict,
+                     lora_id=1) == EXPECTED_LORA_OUTPUT
diff --git a/tests/lora/test_lora_functions.py b/tests/lora/test_lora_functions.py
index 204624a0540a..7ae33a848a0a 100644
--- a/tests/lora/test_lora_functions.py
+++ b/tests/lora/test_lora_functions.py
@@ -69,7 +69,7 @@ def run_check(fn, args, expected: list):
     run_check(llm.add_lora, make_lora_request(12), [12, 9, 10, 11])
     run_check(llm.add_lora, make_lora_request(13), [12, 13, 10, 11])
 
-    # Remove all LoRAs
+    # Remove all LoRAs.
     run_check(llm.remove_lora, 13, [12, 10, 11])
     run_check(llm.remove_lora, 12, [10, 11])
     run_check(llm.remove_lora, 11, [10])
diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py
index 9b7a42acece5..604cb854b32f 100644
--- a/tests/models/language/generation/test_hybrid.py
+++ b/tests/models/language/generation/test_hybrid.py
@@ -31,7 +31,7 @@
     # not compatible with pip-compile.
     "pfnet/plamo-2-1b",
     "Zyphra/Zamba2-1.2B-instruct",
-    "hmellor/bamba-tiny-random",
+    "hmellor/tiny-random-BambaForCausalLM",
 ]
 
 # Avoid OOM
diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py
index 7de2a9af2f2e..f83c9940d524 100644
--- a/tests/models/language/pooling/mteb_utils.py
+++ b/tests/models/language/pooling/mteb_utils.py
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-import math
 from collections.abc import Sequence
 
 import mteb
@@ -115,4 +114,4 @@ def mteb_test_embed_models(hf_runner,
     print("SentenceTransformer:", model_dtype, st_main_score)
     print("Difference:", st_main_score - vllm_main_score)
 
-    assert math.isclose(st_main_score, vllm_main_score, rel_tol=MTEB_EMBED_TOL)
+    assert st_main_score == pytest.approx(vllm_main_score, rel=MTEB_EMBED_TOL)
diff --git a/tests/models/language/pooling/test_embedding.py b/tests/models/language/pooling/test_embedding.py
index 9db385e77bdb..a44b2154b137 100644
--- a/tests/models/language/pooling/test_embedding.py
+++ b/tests/models/language/pooling/test_embedding.py
@@ -15,13 +15,12 @@
                      marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
         pytest.param("sentence-transformers/all-MiniLM-L12-v2"),
         pytest.param("intfloat/multilingual-e5-small"),
-        pytest.param("Alibaba-NLP/gte-Qwen2-7B-instruct"),
+        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
         # [Decoder-only]
         pytest.param("BAAI/bge-multilingual-gemma2",
                      marks=[pytest.mark.core_model]),
         pytest.param("intfloat/e5-mistral-7b-instruct",
                      marks=[pytest.mark.core_model, pytest.mark.cpu_model]),
-        pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"),
         pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"),
         # [Cross-Encoder]
         pytest.param("sentence-transformers/stsb-roberta-base-v2"),
@@ -47,9 +46,6 @@ def test_models(
         vllm_extra_kwargs["override_pooler_config"] = \
             PoolerConfig(pooling_type="MEAN")
 
-    if model == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        vllm_extra_kwargs["hf_overrides"] = {"is_causal": True}
-
     # The example_prompts has ending "\n", for example:
     # "Write a short story about a robot that dreams for the first time.\n"
     # sentence_transformers will strip the input texts, see:
diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py
index 7dd3c8a4e79e..f450edd82162 100644
--- a/tests/models/language/pooling/test_gritlm.py
+++ b/tests/models/language/pooling/test_gritlm.py
@@ -2,7 +2,6 @@
 from __future__ import annotations
 
 import importlib.util
-import math
 from array import array
 
 import openai
@@ -104,16 +103,16 @@ def get_test_data():
 
 def validate_embed_output(q_rep: list[list[float]], d_rep: list[list[float]]):
     cosine_sim_q0_d0 = 1 - cosine(q_rep[0], d_rep[0])
-    assert math.isclose(cosine_sim_q0_d0, 0.609, abs_tol=0.001)
+    assert cosine_sim_q0_d0 == pytest.approx(0.609, abs=0.001)
 
     cosine_sim_q0_d1 = 1 - cosine(q_rep[0], d_rep[1])
-    assert math.isclose(cosine_sim_q0_d1, 0.101, abs_tol=0.001)
+    assert cosine_sim_q0_d1 == pytest.approx(0.101, abs=0.001)
 
     cosine_sim_q1_d0 = 1 - cosine(q_rep[1], d_rep[0])
-    assert math.isclose(cosine_sim_q1_d0, 0.120, abs_tol=0.001)
+    assert cosine_sim_q1_d0 == pytest.approx(0.120, abs=0.001)
 
     cosine_sim_q1_d1 = 1 - cosine(q_rep[1], d_rep[1])
-    assert math.isclose(cosine_sim_q1_d1, 0.534, abs_tol=0.001)
+    assert cosine_sim_q1_d1 == pytest.approx(0.534, abs=0.001)
 
 
 def test_gritlm_offline_embedding(vllm_runner):
diff --git a/tests/models/language/pooling/test_gte.py b/tests/models/language/pooling/test_gte.py
index 3ccf2999664c..91d10f529cd6 100644
--- a/tests/models/language/pooling/test_gte.py
+++ b/tests/models/language/pooling/test_gte.py
@@ -45,9 +45,6 @@
     EmbedModelInfo("Alibaba-NLP/gte-Qwen2-1.5B-instruct",
                    architecture="Qwen2ForCausalLM",
                    enable_test=True),
-    EmbedModelInfo("Alibaba-NLP/gte-Qwen2-7B-instruct",
-                   architecture="Qwen2ForCausalLM",
-                   enable_test=False),
     ########## ModernBertModel
     EmbedModelInfo("Alibaba-NLP/gte-modernbert-base",
                    architecture="ModernBertModel",
@@ -58,14 +55,9 @@
 @pytest.mark.parametrize("model_info", MODELS)
 def test_models_mteb(hf_runner, vllm_runner,
                      model_info: EmbedModelInfo) -> None:
-    pytest.skip("Skipping mteb test.")
-
     from .mteb_utils import mteb_test_embed_models
 
     vllm_extra_kwargs: dict[str, Any] = {}
-    if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        vllm_extra_kwargs["hf_overrides"] = {"is_causal": True}
-
     if model_info.architecture == "GteNewModel":
         vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
 
@@ -83,9 +75,6 @@ def test_models_correctness(hf_runner, vllm_runner, model_info: EmbedModelInfo,
     example_prompts = [str(s).strip() for s in example_prompts]
 
     vllm_extra_kwargs: dict[str, Any] = {}
-    if model_info.name == "Alibaba-NLP/gte-Qwen2-1.5B-instruct":
-        vllm_extra_kwargs["hf_overrides"] = {"is_causal": True}
-
     if model_info.architecture == "GteNewModel":
         vllm_extra_kwargs["hf_overrides"] = {"architectures": ["GteNewModel"]}
 
diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py
index 5287ca37c0fb..0ddff2146caa 100644
--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling/test_jina.py
@@ -1,6 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-import math
-
 import pytest
 
 from vllm import PoolingParams
@@ -60,7 +58,7 @@ def test_llm_1_to_1(vllm_runner, hf_runner, model_name, dtype: str):
     assert len(vllm_outputs) == 1
     assert len(hf_outputs) == 1
 
-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
 
 
 @pytest.mark.parametrize("dtype", ["half"])
@@ -78,8 +76,8 @@ def test_llm_1_to_N(vllm_runner, hf_runner, model_name, dtype: str):
     assert len(vllm_outputs) == 10
     assert len(hf_outputs) == 10
 
-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
-    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
+    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
 
 
 @pytest.fixture(scope="module", params=EMBEDDING_MODELS)
diff --git a/tests/models/language/pooling/test_nomic.py b/tests/models/language/pooling/test_nomic.py
index 6e9de30f977d..28df32e0c230 100644
--- a/tests/models/language/pooling/test_nomic.py
+++ b/tests/models/language/pooling/test_nomic.py
@@ -23,7 +23,6 @@
 @pytest.mark.parametrize("model_info", MODELS)
 def test_models_mteb(hf_runner, vllm_runner,
                      model_info: EmbedModelInfo) -> None:
-    pytest.skip("Skipping mteb test.")
     from .mteb_utils import mteb_test_embed_models
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
diff --git a/tests/models/language/pooling/test_scoring.py b/tests/models/language/pooling/test_scoring.py
index e9527700c3ca..6b10aeffc4b7 100644
--- a/tests/models/language/pooling/test_scoring.py
+++ b/tests/models/language/pooling/test_scoring.py
@@ -1,6 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-import math
-
 import pytest
 import torch
 import torch.nn.functional as F
@@ -45,7 +43,7 @@ def test_cross_encoder_1_to_1(vllm_runner, hf_runner, model_name):
     assert len(vllm_outputs) == 1
     assert len(hf_outputs) == 1
 
-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
 
 
 def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
@@ -64,8 +62,8 @@ def test_cross_encoder_1_to_N(vllm_runner, hf_runner, model_name):
     assert len(vllm_outputs) == 2
     assert len(hf_outputs) == 2
 
-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
-    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
+    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
 
 
 def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
@@ -84,8 +82,8 @@ def test_cross_encoder_N_to_N(vllm_runner, hf_runner, model_name):
     assert len(vllm_outputs) == 2
     assert len(hf_outputs) == 2
 
-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
-    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
+    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
 
 
 @pytest.fixture(scope="module", params=EMBEDDING_MODELS)
@@ -112,7 +110,7 @@ def test_embedding_1_to_1(vllm_runner, hf_runner, emb_model_name):
     assert len(vllm_outputs) == 1
     assert len(hf_outputs) == 1
 
-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
 
 
 def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
@@ -140,8 +138,8 @@ def test_embedding_1_to_N(vllm_runner, hf_runner, emb_model_name):
     assert len(vllm_outputs) == 2
     assert len(hf_outputs) == 2
 
-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
-    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
+    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
 
 
 def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
@@ -169,5 +167,5 @@ def test_embedding_N_to_N(vllm_runner, hf_runner, emb_model_name):
     assert len(vllm_outputs) == 2
     assert len(hf_outputs) == 2
 
-    assert math.isclose(hf_outputs[0], vllm_outputs[0], rel_tol=0.01)
-    assert math.isclose(hf_outputs[1], vllm_outputs[1], rel_tol=0.01)
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.01)
+    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.01)
diff --git a/tests/models/language/pooling/test_snowflake_arctic_embed.py b/tests/models/language/pooling/test_snowflake_arctic_embed.py
index 7d9c3c73d852..5679e0e1ce00 100644
--- a/tests/models/language/pooling/test_snowflake_arctic_embed.py
+++ b/tests/models/language/pooling/test_snowflake_arctic_embed.py
@@ -46,7 +46,6 @@ def test_models_mteb(
     vllm_runner,
     model_info: EmbedModelInfo,
 ) -> None:
-    pytest.skip("Skipping mteb test.")
     from .mteb_utils import mteb_test_embed_models
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)
 
diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py
index d51a03dfea7e..e4e48f9951cf 100644
--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -349,6 +349,17 @@
         use_tokenizer_eos=True,
         patch_hf_runner=model_utils.internvl_patch_hf_runner,
     ),
+    "intern_vl-video": VLMTestInfo(
+        models=[
+            "OpenGVLab/InternVL3-1B",
+        ],
+        test_type=VLMTestType.VIDEO,
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
+        video_idx_to_prompt=lambda idx: "<video>",
+        max_model_len=8192,
+        use_tokenizer_eos=True,
+        patch_hf_runner=model_utils.internvl_patch_hf_runner,
+    ),
     "kimi_vl": VLMTestInfo(
         models=["moonshotai/Kimi-VL-A3B-Instruct"],
         test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
diff --git a/tests/models/multimodal/generation/test_interleaved.py b/tests/models/multimodal/generation/test_interleaved.py
index eec84751e450..972db40e8bd6 100644
--- a/tests/models/multimodal/generation/test_interleaved.py
+++ b/tests/models/multimodal/generation/test_interleaved.py
@@ -4,6 +4,7 @@
 
 from vllm.assets.image import ImageAsset
 from vllm.assets.video import VideoAsset
+from vllm.multimodal.image import convert_image_mode
 
 models = ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"]
 
@@ -26,8 +27,9 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
     give the same result.
     """
 
-    image_cherry = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-    image_stop = ImageAsset("stop_sign").pil_image.convert("RGB")
+    image_cherry = convert_image_mode(
+        ImageAsset("cherry_blossom").pil_image, "RGB")
+    image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
     images = [image_cherry, image_stop]
     video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
 
diff --git a/tests/models/multimodal/generation/test_phi4mm.py b/tests/models/multimodal/generation/test_phi4mm.py
index 11460a1a8d2b..e51dbee479c5 100644
--- a/tests/models/multimodal/generation/test_phi4mm.py
+++ b/tests/models/multimodal/generation/test_phi4mm.py
@@ -1,18 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
-import re
 from collections.abc import Sequence
 from typing import Optional
 
 import librosa
 import pytest
+import regex as re
 from huggingface_hub import snapshot_download
 from transformers import AutoTokenizer
 
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
-from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.image import convert_image_mode, rescale_image_size
 from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs
 
@@ -267,7 +267,7 @@ def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
 
     # use the example speech question so that the model outputs are reasonable
     audio = librosa.load(speech_question, sr=None)
-    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+    image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
 
     inputs_vision_speech = [
         (
diff --git a/tests/models/multimodal/generation/vlm_utils/model_utils.py b/tests/models/multimodal/generation/vlm_utils/model_utils.py
index b71400fc8312..dc1ea5208240 100644
--- a/tests/models/multimodal/generation/vlm_utils/model_utils.py
+++ b/tests/models/multimodal/generation/vlm_utils/model_utils.py
@@ -3,11 +3,13 @@
 for manipulating the input / output of HF & vLLM test runners, which are
 typically specific to a small subset of models.
 """
-import re
 import types
 from pathlib import PosixPath
 from typing import Optional, Union
 
+import numpy as np
+import numpy.typing as npt
+import regex as re
 import torch
 from PIL.Image import Image
 from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
@@ -495,30 +497,74 @@ def __init__(self, hf_runner: HfRunner):
             self.max_num = self.config.max_dynamic_patch
             self.image_size = self.vision_config.image_size
 
-        def __call__(self, text: str, images: Union[Image, list[Image]],
-                     **kwargs):
+        def __call__(
+            self,
+            text: str,
+            images: Union[Image, list[Image]] = None,
+            videos: Union[npt.NDArray, list[npt.NDArray]] = None,
+            **kwargs,
+        ):
             from vllm.model_executor.models.internvl import (
                 IMG_CONTEXT, IMG_END, IMG_START,
-                image_to_pixel_values_internvl)
+                image_to_pixel_values_internvl, video_to_pixel_values_internvl)
             images = [images] if isinstance(images, Image) else images
-            pixel_values = [
-                image_to_pixel_values_internvl(
-                    image,
-                    input_size=self.image_size,
-                    min_num=self.min_num,
-                    max_num=self.max_num,
-                    use_thumbnail=self.use_thumbnail,
-                ) for image in images
-            ]
-            num_patches_list = [
-                pixel_value.shape[0] for pixel_value in pixel_values
-            ]
+            videos = [videos] if isinstance(videos, np.ndarray) else videos
+            if images is not None:
+                pixel_values_images = [
+                    image_to_pixel_values_internvl(
+                        image,
+                        input_size=self.image_size,
+                        min_num=self.min_num,
+                        max_num=self.max_num,
+                        use_thumbnail=self.use_thumbnail,
+                    ) for image in images
+                ]
+                num_patches_images = [
+                    pixel_value.shape[0] for pixel_value in pixel_values_images
+                ]
+            else:
+                pixel_values_images, num_patches_images = [], []
+
+            if videos is not None:
+                pixel_values_videos = [
+                    video_to_pixel_values_internvl(
+                        video,
+                        input_size=self.image_size,
+                        min_num=1,
+                        max_num=1,
+                        use_thumbnail=False,
+                    ) for video in videos
+                ]
+                num_patches_videos = [
+                    pixel_value.shape[0] for pixel_value in pixel_values_videos
+                ]
+            else:
+                pixel_values_videos, num_patches_videos = [], []
+
+            pixel_values = []
+            while ("<image>" in text) or ("<video>" in text):
+                image_index = text.find("<image>")
+                video_index = text.find("<video>")
+                if image_index == -1 or (video_index > -1
+                                         and video_index < image_index):
+                    num_patches = num_patches_videos.pop(0)
+                    pixel_values.append(pixel_values_videos.pop(0))
+                    context_tokens = IMG_START + \
+                        IMG_CONTEXT * self.num_image_token + IMG_END
+                    video_tokens = ''.join([
+                        f'Frame{i+1}: {context_tokens}'
+                        for i in range(num_patches)
+                    ])
+                    text = text.replace('<video>', video_tokens, 1)
+                else:
+                    num_patches = num_patches_images.pop(0)
+                    pixel_values.append(pixel_values_images.pop(0))
+                    context_tokens = IMG_CONTEXT * self.num_image_token \
+                        * num_patches
+                    image_tokens = IMG_START + context_tokens + IMG_END
+                    text = text.replace('<image>', image_tokens, 1)
             pixel_values = torch.cat(pixel_values, dim=0)
-            for num_patches in num_patches_list:
-                context_tokens = IMG_CONTEXT * self.num_image_token \
-                    * num_patches
-                image_tokens = IMG_START + context_tokens + IMG_END
-                text = text.replace('<image>', image_tokens, 1)
+
             prompt = self.tokenizer(text, return_tensors="pt")
             prompt.update({"pixel_values": pixel_values})
             return prompt
diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py
index e6b70a4438e9..572fa366d332 100644
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -9,15 +9,15 @@
                                                        UserMessage)
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from PIL import Image
-from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
 from vllm.config import ModelConfig
 from vllm.inputs import InputProcessingContext
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.inputs import MultiModalInputs
 from vllm.multimodal.processing import BaseMultiModalProcessor, ProcessingCache
-from vllm.transformers_utils.tokenizer import (MistralTokenizer,
-                                               cached_tokenizer_from_config)
+from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
+                                               cached_tokenizer_from_config,
+                                               encode_tokens)
 
 from ....multimodal.utils import random_audio, random_image, random_video
 from ...registry import HF_EXAMPLE_MODELS
@@ -28,7 +28,6 @@ def _test_processing_correctness(
     hit_rate: float,
     num_batches: int,
     simplify_rate: float,
-    ignore_mm_keys: Optional[set[str]] = None,
 ):
     model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
     model_info.check_available_online(on_fail="skip")
@@ -99,10 +98,23 @@ def _test_processing_correctness(
         }
 
         mm_counts = {k: len(vs) for k, vs in mm_data.items()}
-        prompt = dummy_inputs.get_dummy_processor_inputs(
-            model_config.max_model_len,
-            mm_counts,
-        ).prompt_text
+
+        # Mistral chat outputs tokens directly, rather than text prompts
+        if isinstance(tokenizer, MistralTokenizer):
+            images = mm_data.get("image", [])
+            request = ChatCompletionRequest(messages=[
+                UserMessage(content=[
+                    TextChunk(text=""),
+                    *(ImageChunk(image=image) for image in images),
+                ]),
+            ])
+            res = tokenizer.mistral.encode_chat_completion(request)
+            prompt = res.tokens
+        else:
+            prompt = dummy_inputs.get_dummy_processor_inputs(
+                model_config.max_model_len,
+                mm_counts,
+            ).prompt
 
         # Drop unnecessary keys and test single -> multi conversion
         if rng.rand() < simplify_rate:
@@ -112,67 +124,59 @@ def _test_processing_correctness(
                 elif len(mm_data[k]) == 1:
                     mm_data[k] = mm_data[k][0]
 
-        if isinstance(tokenizer, MistralTokenizer):
-            _test_processing_correctness_mistral(
-                model_config,
-                tokenizer,
-                prompt,
-                mm_data,
-                baseline_processor,
-                cached_processor,
-                batch_idx,
-                ignore_mm_keys=ignore_mm_keys,
-            )
-        else:
-            _test_processing_correctness_hf(
-                model_config,
-                tokenizer,
-                prompt,
-                mm_data,
-                baseline_processor,
-                cached_processor,
-                batch_idx,
-                ignore_mm_keys=ignore_mm_keys,
-            )
-
-
-def _test_processing_correctness_hf(
+        _test_processing_correctness_one(
+            model_config,
+            tokenizer,
+            prompt,
+            mm_data,
+            baseline_processor,
+            cached_processor,
+            batch_idx,
+        )
+
+
+# For some multimodal models, the tokenizer always adds bos_token at the
+# beginning of the prompt by default, causing hf_processor to output
+# incorrect token ids. So we need to use `add_special_tokens=False` here
+# to leave bos_token to be added by the processor.
+_ADD_SPECIAL_TOKENS_OVERRIDES = {
+    "mllama": False,
+    "ovis": False,
+    "ultravox": False,
+    "whisper": False,
+}
+
+_IGNORE_MM_KEYS = {
+    # In Ultravox, the audio_features can be different depending on padding.
+    # The slight difference should not be a problem though, since
+    # attention_mask lets us ignore the difference.
+    "ultravox": {"audio_features"},
+}
+
+
+def _test_processing_correctness_one(
     model_config: ModelConfig,
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
-    prompt: str,
+    tokenizer: AnyTokenizer,
+    prompt: Union[str, list[int]],
     mm_data: MultiModalDataDict,
     baseline_processor: BaseMultiModalProcessor,
     cached_processor: BaseMultiModalProcessor,
     batch_idx: int,
-    ignore_mm_keys: Optional[set[str]] = None,
 ):
-    if model_config.hf_config.model_type in ("mllama", "ovis", "ultravox",
-                                             "whisper"):
-        # For some multimodal models, tokenizer will always add bos_token
-        # at the beginning of prompt by default, causing hf_processor outputs
-        # incorrect token ids. So we need use `add_special_tokens=False` here
-        # to leave bos_token to be added by the processor.
-        token_prompt = tokenizer.encode(prompt, add_special_tokens=False)
+    model_type = model_config.hf_config.model_type
+    ignore_mm_keys = _IGNORE_MM_KEYS.get(model_type, set[str]())
+
+    if isinstance(prompt, str):
+        text_prompt = prompt
+        token_prompt = encode_tokens(
+            tokenizer,
+            prompt,
+            add_special_tokens=_ADD_SPECIAL_TOKENS_OVERRIDES.get(model_type),
+        )
     else:
-        token_prompt = tokenizer.encode(prompt)
-
-    baseline_result = baseline_processor.apply(
-        prompt,
-        mm_data=mm_data,
-        hf_processor_mm_kwargs={},
-    )
-    cached_result = cached_processor.apply(
-        prompt,
-        mm_data=mm_data,
-        hf_processor_mm_kwargs={},
-    )
-
-    _assert_inputs_equal(
-        baseline_result,
-        cached_result,
-        ignore_mm_keys=ignore_mm_keys,
-        msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
-    )
+        # Mistral does not support decode_tokens with skip_special_tokens=False
+        text_prompt = None
+        token_prompt = prompt
 
     baseline_tokenized_result = baseline_processor.apply(
         token_prompt,
@@ -180,56 +184,6 @@ def _test_processing_correctness_hf(
         hf_processor_mm_kwargs={},
     )
 
-    _assert_inputs_equal(
-        baseline_result,
-        baseline_tokenized_result,
-        ignore_mm_keys=ignore_mm_keys,
-        msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
-    )
-
-    cached_tokenized_result = cached_processor.apply(
-        token_prompt,
-        mm_data=mm_data,
-        hf_processor_mm_kwargs={},
-    )
-
-    _assert_inputs_equal(
-        cached_result,
-        cached_tokenized_result,
-        ignore_mm_keys=ignore_mm_keys,
-        msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
-    )
-
-
-def _test_processing_correctness_mistral(
-    model_config: ModelConfig,
-    tokenizer: MistralTokenizer,
-    prompt: str,
-    mm_data: MultiModalDataDict,
-    baseline_processor: BaseMultiModalProcessor,
-    cached_processor: BaseMultiModalProcessor,
-    batch_idx: int,
-    ignore_mm_keys: Optional[set[str]] = None,
-):
-    images = mm_data.get("image", [])
-    if not isinstance(images, list):
-        images = [images]
-
-    request = ChatCompletionRequest(messages=[
-        UserMessage(content=[
-            TextChunk(text=prompt),
-            *(ImageChunk(image=image) for image in images),
-        ]),
-    ])
-    res = tokenizer.mistral.encode_chat_completion(request)
-    token_prompt = res.tokens
-
-    # Mistral chat outputs tokens directly, rather than text prompts
-    baseline_tokenized_result = baseline_processor.apply(
-        token_prompt,
-        mm_data=mm_data,
-        hf_processor_mm_kwargs={},
-    )
     cached_tokenized_result = cached_processor.apply(
         token_prompt,
         mm_data=mm_data,
@@ -240,9 +194,44 @@ def _test_processing_correctness_mistral(
         baseline_tokenized_result,
         cached_tokenized_result,
         ignore_mm_keys=ignore_mm_keys,
-        msg=f"Failed ({batch_idx=}, {prompt=}, {mm_data=})",
+        msg=f"Failed ({batch_idx=}, {token_prompt=}, {mm_data=})",
     )
 
+    if text_prompt is not None:
+        baseline_text_result = baseline_processor.apply(
+            text_prompt,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+        cached_text_result = cached_processor.apply(
+            text_prompt,
+            mm_data=mm_data,
+            hf_processor_mm_kwargs={},
+        )
+
+        _assert_inputs_equal(
+            baseline_text_result,
+            cached_text_result,
+            ignore_mm_keys=ignore_mm_keys,
+            msg=f"Failed ({batch_idx=}, {text_prompt=}, {mm_data=})",
+        )
+
+        _assert_inputs_equal(
+            baseline_text_result,
+            baseline_tokenized_result,
+            ignore_mm_keys=ignore_mm_keys,
+            msg=f"Failed ({batch_idx=}, {text_prompt=}, "
+            f"{token_prompt=}, {mm_data=})",
+        )
+
+        _assert_inputs_equal(
+            cached_text_result,
+            cached_tokenized_result,
+            ignore_mm_keys=ignore_mm_keys,
+            msg=f"Failed ({batch_idx=}, {text_prompt=}, "
+            f"{token_prompt=}, {mm_data=})",
+        )
+
 
 # yapf: disable
 @pytest.mark.parametrize("model_id", [
@@ -258,6 +247,7 @@ def _test_processing_correctness_mistral(
     "ibm-granite/granite-speech-3.3-8b",
     "h2oai/h2ovl-mississippi-800m",
     "OpenGVLab/InternVL2-1B",
+    "OpenGVLab/InternVL3-1B",
     "HuggingFaceM4/Idefics3-8B-Llama3",
     "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
     "moonshotai/Kimi-VL-A3B-Instruct",
@@ -280,6 +270,7 @@ def _test_processing_correctness_mistral(
     "AIDC-AI/Ovis2-1B",
     "google/paligemma-3b-mix-224",
     "google/paligemma2-3b-ft-docci-448",
+    "microsoft/Phi-3.5-vision-instruct",
     "microsoft/Phi-4-multimodal-instruct",
     "mistralai/Pixtral-12B-2409",
     "mistral-community/pixtral-12b",
@@ -302,41 +293,6 @@ def test_processing_correctness(
     num_batches: int,
     simplify_rate: float,
 ):
-    ignore_mm_keys = None
-    if 'ultravox' in model_id:
-        # In Ultravox, the audio_features can be different depending on padding
-        # The slight difference should not be a problem though, since
-        # attention_mask lets us ignore the difference.
-        ignore_mm_keys = {"audio_features"}
-
-    _test_processing_correctness(
-        model_id,
-        hit_rate=hit_rate,
-        num_batches=num_batches,
-        simplify_rate=simplify_rate,
-        ignore_mm_keys=ignore_mm_keys,
-    )
-
-
-# yapf: disable
-@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
-@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
-@pytest.mark.parametrize("num_batches", [32])
-@pytest.mark.parametrize("simplify_rate", [1.0])
-# yapf: enable
-def test_processing_correctness_phi3v(
-    model_id: str,
-    hit_rate: float,
-    num_batches: int,
-    simplify_rate: float,
-):
-    # HACK - this is an attempted workaround for the following bug
-    # https://github.com/huggingface/transformers/issues/34307
-    from transformers import AutoImageProcessor  # noqa: F401
-    from transformers import AutoProcessor  # noqa: F401
-
-    AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
-
     _test_processing_correctness(
         model_id,
         hit_rate=hit_rate,
@@ -355,16 +311,10 @@ def _assert_inputs_equal(
     if ignore_mm_keys is None:
         ignore_mm_keys = set()
 
-    if msg is None:
-        assert "mm_kwargs" in a and "mm_kwargs" in b
-    else:
-        assert "mm_kwargs" in a and "mm_kwargs" in b, msg
+    assert "mm_kwargs" in a and "mm_kwargs" in b, msg
 
     for key in ignore_mm_keys:
         a["mm_kwargs"].pop(key, None)
         b["mm_kwargs"].pop(key, None)
 
-    if msg is None:
-        assert a == b
-    else:
-        assert a == b, msg
+    assert a == b, msg
diff --git a/tests/models/multimodal/processing/test_mllama.py b/tests/models/multimodal/processing/test_mllama.py
index b89376cf1722..d4794396f6d2 100644
--- a/tests/models/multimodal/processing/test_mllama.py
+++ b/tests/models/multimodal/processing/test_mllama.py
@@ -49,7 +49,7 @@ def test_profiling(
                         ] * max_num_seqs
 
     mm_kwargs = processor.apply(
-        prompt=dummy_mm_data.prompt_text,
+        prompt=dummy_mm_data.prompt,
         mm_data=dummy_mm_data.mm_data,
         hf_processor_mm_kwargs=dict(),
     )["mm_kwargs"]
diff --git a/tests/models/quantization/test_gguf.py b/tests/models/quantization/test_gguf.py
index 3ff36502df57..5f17d12284a0 100644
--- a/tests/models/quantization/test_gguf.py
+++ b/tests/models/quantization/test_gguf.py
@@ -78,8 +78,12 @@ def gguf_model(self):
 )
 
 MODELS = [
-    LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG,
-    DOLPHIN_CONFIG
+    LLAMA_CONFIG,
+    QWEN2_CONFIG,
+    PHI3_CONFIG,
+    GPT2_CONFIG,
+    # STABLELM_CONFIG,  # enable this when v1 supports head_size=80
+    DOLPHIN_CONFIG,
     # STARCODER_CONFIG, # broken
 ]
 
diff --git a/tests/models/quantization/test_nvfp4.py b/tests/models/quantization/test_nvfp4.py
index f94f3457c377..510858c2d7ef 100644
--- a/tests/models/quantization/test_nvfp4.py
+++ b/tests/models/quantization/test_nvfp4.py
@@ -41,8 +41,8 @@
     reason=
     "Prevent unstable test based on golden strings from breaking the build "
     " and test input model being too large and hanging the system.")
-@pytest.mark.skipif(not is_quant_method_supported("nvfp4"),
-                    reason="nvfp4 is not supported on this GPU type.")
+@pytest.mark.skipif(not is_quant_method_supported("modelopt_fp4"),
+                    reason="modelopt_fp4 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
 def test_models(example_prompts, model_name) -> None:
     model = LLM(
@@ -50,7 +50,7 @@ def test_models(example_prompts, model_name) -> None:
         max_model_len=MAX_MODEL_LEN,
         trust_remote_code=True,
         enforce_eager=True,
-        quantization="nvfp4",
+        quantization="modelopt_fp4",
     )
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 84abd42e9231..a49e3ad6b20e 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -8,6 +8,8 @@
 from packaging.version import Version
 from transformers import __version__ as TRANSFORMERS_VERSION
 
+from vllm.config import TokenizerMode
+
 
 @dataclass(frozen=True)
 class _HfExamplesInfo:
@@ -20,7 +22,7 @@ class _HfExamplesInfo:
     tokenizer: Optional[str] = None
     """Set the tokenizer to load for this architecture."""
 
-    tokenizer_mode: str = "auto"
+    tokenizer_mode: TokenizerMode = "auto"
     """Set the tokenizer type for this architecture."""
 
     speculative_model: Optional[str] = None
@@ -55,9 +57,18 @@ class _HfExamplesInfo:
     trust_remote_code: bool = False
     """The ``trust_remote_code`` level required to load the model."""
 
+    v0_only: bool = False
+    """The model is only available with the vLLM V0 engine."""
+
     hf_overrides: dict[str, Any] = field(default_factory=dict)
     """The ``hf_overrides`` required to load the model."""
 
+    max_model_len: Optional[int] = None
+    """
+    The maximum model length to use for this model. Some models default to a
+    length that is too large to fit into memory in CI.
+    """
+
     def check_transformers_version(
         self,
         *,
@@ -124,7 +135,7 @@ def check_available_online(
     "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat",
                                          trust_remote_code=True),
     "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B",
-                                        extras={"tiny": "hmellor/bamba-tiny-random"}),  # noqa: E501
+                                        extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}),  # noqa: E501
     "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m",
                                         {"1b": "bigscience/bloomz-1b1"}),
     "ChatGLMModel": _HfExamplesInfo("THUDM/chatglm3-6b",
@@ -147,6 +158,9 @@ def check_available_online(
     "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"),  # noqa: E501
     "Fairseq2LlamaForCausalLM": _HfExamplesInfo("mgleize/fairseq2-dummy-Llama-3.2-1B"),  # noqa: E501
     "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
+    "FalconH1ForCausalLM":_HfExamplesInfo("tiiuae/Falcon-H1-1.5B-Instruct",
+                                          is_available_online=False,
+                                          min_transformers_version="4.52.2"),
     "GemmaForCausalLM": _HfExamplesInfo("google/gemma-1.1-2b-it"),
     "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
     "Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it"),
@@ -212,10 +226,11 @@ def check_available_online(
     "OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat",
                                         trust_remote_code=True),
     "PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"),
-    "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"),
+    "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2", v0_only=True),
     "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"),
     "Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct",
-                                            trust_remote_code=True),
+                                            trust_remote_code=True,
+                                            v0_only=True),
     "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
                                          trust_remote_code=True),
     "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",
@@ -231,7 +246,8 @@ def check_available_online(
                                      is_available_online=False),
     "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b",  # noqa: E501
                                                 is_available_online=False),
-    "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
+    "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t",
+                                           v0_only=True),
     "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
     "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
     "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
@@ -300,7 +316,8 @@ def check_available_online(
     "AriaForConditionalGeneration": _HfExamplesInfo("rhymes-ai/Aria"),
     "AyaVisionForConditionalGeneration": _HfExamplesInfo("CohereForAI/aya-vision-8b"), # noqa: E501
     "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b",  # noqa: E501
-                                                     extras={"6b": "Salesforce/blip2-opt-6.7b"}),  # noqa: E501
+                                                     extras={"6b": "Salesforce/blip2-opt-6.7b"},  # noqa: E501
+                                                     v0_only=True),
     "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"),  # noqa: E501
     "DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny",  # noqa: E501
                                                 extras={"fork": "Isotr0py/deepseek-vl2-tiny"},  # noqa: E501
@@ -319,15 +336,18 @@ def check_available_online(
                                       max_transformers_version="4.48",  # noqa: E501
                                       transformers_version_reason="HF model is not compatible."),  # noqa: E501
     "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
-                                         extras={"2B": "OpenGVLab/InternVL2-2B"},  # noqa: E501
+                                         extras={"2B": "OpenGVLab/InternVL2-2B",
+                                                 "3.0": "OpenGVLab/InternVL3-1B"},  # noqa: E501
                                          trust_remote_code=True),
     "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3",  # noqa: E501
                                                         {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}),  # noqa: E501
     "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct",  # noqa: E501
                                                       extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},  # noqa: E501
-                                                      trust_remote_code=True),
+                                                      trust_remote_code=True,
+                                                      v0_only=True),
     "Llama4ForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct",   # noqa: E501
-                                                      min_transformers_version="4.51"),
+                                                      min_transformers_version="4.51",
+                                                      max_model_len=10240),
     "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
                                                      extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501
                                                              "mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}),  # noqa: E501
@@ -346,7 +366,8 @@ def check_available_online(
                                 extras={"2.6": "openbmb/MiniCPM-V-2_6"},  # noqa: E501
                                 trust_remote_code=True),
     "MiniMaxVL01ForConditionalGeneration": _HfExamplesInfo("MiniMaxAI/MiniMax-VL-01", # noqa: E501
-                                              trust_remote_code=True),
+                                              trust_remote_code=True,
+                                              v0_only=True),
     "Mistral3ForConditionalGeneration": _HfExamplesInfo("mistralai/Mistral-Small-3.1-24B-Instruct-2503",  # noqa: E501
                                                         extras={"fp8": "nm-testing/Mistral-Small-3.1-24B-Instruct-2503-FP8-dynamic"}),  # noqa: E501
     "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924",
@@ -379,6 +400,8 @@ def check_available_online(
     "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct"),  # noqa: E501
     "Qwen2_5OmniModel": _HfExamplesInfo("Qwen/Qwen2.5-Omni-3B",
                                         min_transformers_version="4.52"),
+    "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-Omni-7B-AWQ",  # noqa: E501
+                                                           min_transformers_version="4.52"),
     "SkyworkR1VChatModel": _HfExamplesInfo("Skywork/Skywork-R1V-38B"),
     "SmolVLMForConditionalGeneration": _HfExamplesInfo("HuggingFaceTB/SmolVLM2-2.2B-Instruct"),  # noqa: E501
     "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501
diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py
index 446c4efbf6af..d403cb392fe0 100644
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -15,12 +15,12 @@
 
 
 @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
-def test_can_initialize(model_arch):
+def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
     model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
     model_info.check_available_online(on_fail="skip")
     model_info.check_transformers_version(on_fail="skip")
 
-    # Avoid OOM
+    # Avoid OOM and reduce initialization time by only using 1 layer
     def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
         hf_config.update(model_info.hf_overrides)
 
@@ -34,6 +34,12 @@ def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
             "num_local_experts": 2,
         })
 
+        if hasattr(hf_config, "vision_config"):
+            hf_config.vision_config.update({
+                "num_layers": 1,
+                "num_hidden_layers": 1,
+            })
+
         return hf_config
 
     # Avoid calling model.forward()
@@ -46,7 +52,7 @@ def _initialize_kv_caches_v1(self, vllm_config):
         scheduler_kv_cache_config = get_kv_cache_config(
             vllm_config,
             kv_cache_specs[0],
-            20 * GiB_bytes,
+            10 * GiB_bytes,
         )
 
         # gpu_blocks (> 0), cpu_blocks, scheduler_kv_cache_config
@@ -55,7 +61,9 @@ def _initialize_kv_caches_v1(self, vllm_config):
     with (patch.object(V0LLMEngine, "_initialize_kv_caches",
                        _initialize_kv_caches_v0),
           patch.object(V1EngineCore, "_initialize_kv_caches",
-                       _initialize_kv_caches_v1)):
+                       _initialize_kv_caches_v1), monkeypatch.context() as m):
+        if model_info.v0_only:
+            m.setenv("VLLM_USE_V1", "0")
         LLM(
             model_info.default,
             tokenizer=model_info.tokenizer,
@@ -65,6 +73,7 @@ def _initialize_kv_caches_v1(self, vllm_config):
                 "num_speculative_tokens": 1,
             } if model_info.speculative_model else None,
             trust_remote_code=model_info.trust_remote_code,
+            max_model_len=model_info.max_model_len,
             load_format="dummy",
             hf_overrides=hf_overrides,
         )
diff --git a/tests/models/test_oot_registration.py b/tests/models/test_oot_registration.py
index b45a87d94b86..b62720caa9cb 100644
--- a/tests/models/test_oot_registration.py
+++ b/tests/models/test_oot_registration.py
@@ -4,6 +4,7 @@
 
 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
+from vllm.multimodal.image import convert_image_mode
 
 from ..utils import create_new_process_for_each_test
 
@@ -58,7 +59,7 @@ def test_oot_registration_embedding(
             assert all(v == 0 for v in output.outputs.embedding)
 
 
-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
 
 
 @create_new_process_for_each_test()
diff --git a/tests/models/test_transformers.py b/tests/models/test_transformers.py
index 6e38c4c7cadb..1a51b4aeab04 100644
--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -1,37 +1,50 @@
 # SPDX-License-Identifier: Apache-2.0
 """Test the functionality of the Transformers backend."""
+from typing import Any, Optional, Union
+
 import pytest
 
 from vllm.platforms import current_platform
 
 from ..conftest import HfRunner, VllmRunner
+from ..core.block.e2e.test_correctness_sliding_window import prep_prompts
 from ..utils import multi_gpu_test
 from .utils import check_logprobs_close
 
 
 def check_implementation(
-    hf_runner: type[HfRunner],
-    vllm_runner: type[VllmRunner],
+    runner_ref: type[Union[HfRunner, VllmRunner]],
+    runner_test: type[VllmRunner],
     example_prompts: list[str],
     model: str,
+    kwargs_ref: Optional[dict[str, Any]] = None,
+    kwargs_test: Optional[dict[str, Any]] = None,
     **kwargs,
 ):
+    if kwargs_ref is None:
+        kwargs_ref = {}
+    if kwargs_test is None:
+        kwargs_test = {}
+
     max_tokens = 32
     num_logprobs = 5
 
-    with vllm_runner(model, **kwargs) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy_logprobs(
-            example_prompts, max_tokens, num_logprobs)
+    args = (example_prompts, max_tokens, num_logprobs)
+
+    with runner_test(model, **kwargs_test, **kwargs) as model_test:
+        outputs_test = model_test.generate_greedy_logprobs(*args)
 
-    with hf_runner(model) as hf_model:
-        hf_outputs = hf_model.generate_greedy_logprobs_limit(
-            example_prompts, max_tokens, num_logprobs)
+    with runner_ref(model, **kwargs_ref) as model_ref:
+        if isinstance(model_ref, VllmRunner):
+            outputs_ref = model_ref.generate_greedy_logprobs(*args)
+        else:
+            outputs_ref = model_ref.generate_greedy_logprobs_limit(*args)
 
     check_logprobs_close(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
+        outputs_0_lst=outputs_ref,
+        outputs_1_lst=outputs_test,
+        name_0="ref",
+        name_1="test",
     )
 
 
@@ -58,6 +71,18 @@ def test_models(
                          model_impl=model_impl)
 
 
+def test_hybrid_attention(vllm_runner: type[VllmRunner]) -> None:
+    prompts, _, _ = prep_prompts(4, (800, 801))
+    kwargs_ref = {"max_model_len": 8192, "enforce_eager": True}
+    kwargs_test = {"model_impl": "transformers", **kwargs_ref}
+    check_implementation(vllm_runner,
+                         vllm_runner,
+                         prompts,
+                         model="hmellor/tiny-random-Gemma2ForCausalLM",
+                         kwargs_ref=kwargs_ref,
+                         kwargs_test=kwargs_test)
+
+
 @multi_gpu_test(num_gpus=2)
 def test_distributed(
     hf_runner: type[HfRunner],
@@ -65,8 +90,11 @@ def test_distributed(
     example_prompts,
 ):
     kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
-    check_implementation(hf_runner, vllm_runner, example_prompts,
-                         "meta-llama/Llama-3.2-1B-Instruct", **kwargs)
+    check_implementation(hf_runner,
+                         vllm_runner,
+                         example_prompts,
+                         "meta-llama/Llama-3.2-1B-Instruct",
+                         kwargs_test=kwargs)
 
 
 @pytest.mark.skipif(
diff --git a/tests/models/test_utils.py b/tests/models/test_utils.py
index d61c7d2d5000..a16384efe195 100644
--- a/tests/models/test_utils.py
+++ b/tests/models/test_utils.py
@@ -77,3 +77,73 @@ def weight_generator():
     assert torch.all(
         new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
     assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
+
+
+def test_module_skip_prefix():
+    """Ensure the auto weight loader can skip prefix."""
+    mod = ModuleWithNestedBatchNorm()
+    # Run some data through the module with batchnorm
+    mod(torch.Tensor([[1, 2], [3, 4]]))
+
+    # Try to load the weights to a new instance
+    def weight_generator():
+        # weights that need to be filtered out
+        redundant_weights = {
+            "prefix.bn.weight": torch.Tensor([1, 2]),
+            "prefix.bn.bias": torch.Tensor([3, 4]),
+        }
+        yield from (mod.state_dict() | redundant_weights).items()
+
+    new_mod = ModuleWithNestedBatchNorm()
+
+    assert not torch.all(
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
+    assert not torch.all(
+        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
+
+    loader = AutoWeightsLoader(new_mod, skip_prefixes=["prefix."])
+    loader.load_weights(weight_generator())
+
+    # Ensure the stats are updated
+    assert torch.all(
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
+    assert torch.all(
+        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
+
+
+def test_module_skip_substr():
+    """Ensure the auto weight loader can skip substrings."""
+    mod = ModuleWithNestedBatchNorm()
+    # Run some data through the module with batchnorm
+    mod(torch.Tensor([[1, 2], [3, 4]]))
+
+    # Try to load the weights to a new instance
+    def weight_generator():
+        # weights that need to be filtered out
+        redundant_weights = {
+            "nested_mod.0.substr.weight": torch.Tensor([1, 2]),
+            "nested_mod.0.substr.bias": torch.Tensor([3, 4]),
+            "nested_mod.substr.weight": torch.Tensor([1, 2]),
+            "nested_mod.substr.bias": torch.Tensor([3, 4]),
+        }
+        yield from (mod.state_dict() | redundant_weights).items()
+
+    new_mod = ModuleWithNestedBatchNorm()
+
+    assert not torch.all(
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
+    assert not torch.all(
+        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
+
+    loader = AutoWeightsLoader(new_mod, skip_substrs=["substr."])
+    loader.load_weights(weight_generator())
+
+    # Ensure the stats are updated
+    assert torch.all(
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
+    assert torch.all(
+        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
diff --git a/tests/multimodal/assets/rgba.png b/tests/multimodal/assets/rgba.png
new file mode 100644
index 000000000000..11eb81857a65
Binary files /dev/null and b/tests/multimodal/assets/rgba.png differ
diff --git a/tests/multimodal/test_image.py b/tests/multimodal/test_image.py
new file mode 100644
index 000000000000..56b5475c9ca0
--- /dev/null
+++ b/tests/multimodal/test_image.py
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: Apache-2.0
+from pathlib import Path
+
+import numpy as np
+from PIL import Image, ImageChops
+
+from vllm.multimodal.image import convert_image_mode
+
+ASSETS_DIR = Path(__file__).parent / "assets"
+assert ASSETS_DIR.exists()
+
+
+def test_rgb_to_rgb():
+    # Start with an RGB image.
+    original_image = Image.open(ASSETS_DIR / "image1.png").convert("RGB")
+    converted_image = convert_image_mode(original_image, "RGB")
+
+    # RGB to RGB should be a no-op.
+    diff = ImageChops.difference(original_image, converted_image)
+    assert diff.getbbox() is None
+
+
+def test_rgba_to_rgb():
+    original_image = Image.open(ASSETS_DIR / "rgba.png")
+    original_image_numpy = np.array(original_image)
+
+    converted_image = convert_image_mode(original_image, "RGB")
+    converted_image_numpy = np.array(converted_image)
+
+    for i in range(original_image_numpy.shape[0]):
+        for j in range(original_image_numpy.shape[1]):
+            # Verify that all transparent pixels are converted to white.
+            if original_image_numpy[i][j][3] == 0:
+                assert converted_image_numpy[i][j][0] == 255
+                assert converted_image_numpy[i][j][1] == 255
+                assert converted_image_numpy[i][j][2] == 255
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 478184c34b91..f1e45da30eda 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -10,6 +10,7 @@
 import pytest
 from PIL import Image, ImageChops
 
+from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import PlaceholderRange
 from vllm.multimodal.utils import (MediaConnector,
                                    merge_and_sort_multimodal_metadata)
@@ -53,7 +54,7 @@ def get_supported_suffixes() -> tuple[str, ...]:
 
 
 def _image_equals(a: Image.Image, b: Image.Image) -> bool:
-    return (np.asarray(a) == np.asarray(b.convert(a.mode))).all()
+    return (np.asarray(a) == np.asarray(convert_image_mode(b, a.mode))).all()
 
 
 @pytest.mark.asyncio
diff --git a/tests/neuron/2_core/test_eagle.py b/tests/neuron/2_core/test_eagle.py
new file mode 100644
index 000000000000..d71c88689a99
--- /dev/null
+++ b/tests/neuron/2_core/test_eagle.py
@@ -0,0 +1,82 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import json
+import os
+import shutil
+import tempfile
+
+import torch
+from huggingface_hub import snapshot_download
+from safetensors import safe_open
+
+from vllm import LLM, SamplingParams
+
+
+def patch_eagle_draft_with_lm_head(target_model_id: str,
+                                   draft_model_id: str) -> str:
+    # In NxDI, draft model checkpoint must include lm_head weights from target
+    # model. For more details see https://awsdocs-neuron.readthedocs-hosted.com
+    # /en/latest/libraries/nxd-inference/developer_guides/feature-guide.html
+    # #eagle-checkpoint-compatibility
+    final_draft_dir = "/tmp/patched_eagle_draft"
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        target_dir = snapshot_download(repo_id=target_model_id,
+                                       local_dir=os.path.join(
+                                           tmp_dir, "target"))
+        draft_dir = snapshot_download(repo_id=draft_model_id,
+                                      local_dir=os.path.join(tmp_dir, "draft"))
+
+        lm_head_key = "lm_head.weight"
+        index_path = os.path.join(target_dir, "model.safetensors.index.json")
+        with open(index_path) as f:
+            index = json.load(f)
+        shard_name = index["weight_map"][lm_head_key]
+        target_safetensor_path = os.path.join(target_dir, shard_name)
+
+        with safe_open(target_safetensor_path, framework="pt") as f:
+            target_lm_head = f.get_tensor(lm_head_key)
+
+        draft_path = os.path.join(draft_dir, "pytorch_model.bin")
+        draft_state_dict = torch.load(draft_path, map_location="cpu")
+        draft_state_dict[lm_head_key] = target_lm_head.to(torch.float16)
+        torch.save(draft_state_dict, draft_path)
+
+        shutil.copytree(draft_dir, final_draft_dir, dirs_exist_ok=True)
+
+    return final_draft_dir
+
+
+def test_eagle():
+    patched_draft_path = patch_eagle_draft_with_lm_head(
+        target_model_id="meta-llama/Llama-2-7b-hf",
+        draft_model_id="yuhuili/EAGLE-llama2-chat-7B")
+    llm = LLM(
+        model="meta-llama/Llama-2-7b-hf",
+        speculative_config={
+            "model": patched_draft_path,
+            "num_speculative_tokens": 5,
+            "max_model_len": 128
+        },
+        max_num_seqs=1,
+        max_model_len=128,
+        tensor_parallel_size=2,
+        override_neuron_config={
+            "enable_eagle_speculation": True,
+            "enable_fused_speculation": True,
+            "fused_qkv": True
+        },
+    )
+    prompts = [
+        "The president of the United States is",
+    ]
+    outputs = llm.generate(prompts, SamplingParams(top_k=1))
+    expected_output = " the head of state and head of government of " \
+    "the United States. The president direct"
+
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
+        assert (expected_output == generated_text)
+
+    print("Neuron Eagle speculation test passed.")
diff --git a/tests/neuron/2_core/test_mistral.py b/tests/neuron/2_core/test_mistral.py
new file mode 100644
index 000000000000..3e651502d1e2
--- /dev/null
+++ b/tests/neuron/2_core/test_mistral.py
@@ -0,0 +1,64 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm import LLM, SamplingParams
+
+
+def test_mistral():
+    llm = LLM(model="mistralai/Mistral-7B-v0.1",
+              tensor_parallel_size=2,
+              max_num_seqs=4,
+              max_model_len=128,
+              use_v2_block_manager=True,
+              override_neuron_config={
+                  "sequence_parallel_enabled": False,
+                  "skip_warmup": True
+              })
+
+    # Send more prompts than the compiled batch size (4) and request
+    # varying generation lengths to test accuracy related to Neuron
+    # specific sequence id sorting.
+    prompts = [
+        "The president of the United States is",
+        "The capital of France is",
+        "What is Annapurna labs?",
+        "I believe the meaning of life is",
+        "Tell me a story about a brave knight",
+        "Hello, my name is Llama",
+    ]
+
+    sampling_params = [
+        SamplingParams(top_k=1, max_tokens=10),
+        SamplingParams(top_k=1, max_tokens=20),
+        SamplingParams(top_k=1, max_tokens=30),
+        SamplingParams(top_k=1, max_tokens=40),
+        SamplingParams(top_k=1, max_tokens=50),
+        SamplingParams(top_k=1, max_tokens=60)
+    ]
+
+    outputs = llm.generate(prompts, sampling_params)
+
+    expected_outputs = [
+        " the most powerful person in the world. He is",
+        " a city of many faces. It is a city of history, culture, art, "
+        "fashion, and",
+        "\n\nAnnapurna Labs is a semiconductor company that was founded "
+        "in 2013 by Amazon. The company is",
+        " to be happy.\n\nI believe that happiness is a choice.\n\nI "
+        "believe that happiness is a state of mind.\n\nI believe that "
+        "happiness is a journey.\n\nI believe",
+        " who rescued a princess from a dragon.\n\nTell me a story about"
+        " a princess who rescued herself from a dragon.\n\nTell me a "
+        "story about a princess who rescued herself from a dragon and "
+        "then rescued a knight from",
+        " and I am a 10 year old male. I am a very friendly and "
+        "affectionate boy who loves to be around people. I am a very "
+        "active boy who loves to play and run around. I am a very smart "
+        "boy who loves to learn new things. I am a very loyal boy"
+    ]
+
+    for expected_output, output in zip(expected_outputs, outputs):
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}")
+        assert (expected_output == generated_text)
+
+    print("Neuron Mistral test passed.")
diff --git a/tests/plugins_tests/test_platform_plugins.py b/tests/plugins_tests/test_platform_plugins.py
index 9d6872e0e077..207de53abd8d 100644
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -29,5 +29,5 @@ def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
     # ignore the backend env variable if it is set
     with monkeypatch.context() as m:
         m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
-        backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
+        backend = get_attn_backend(16, torch.float16, "auto", 16, False)
         assert backend.get_name() == "Dummy_Backend"
diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index 0f20f42d8650..e8ddfd7fc779 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -37,12 +37,6 @@
     ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
 ]
 
-models_pre_quant_8bit_to_test = [
-    ('meta-llama/Llama-Guard-3-8B-INT8',
-     'read pre-quantized llama 8-bit model'),
-    ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
-]
-
 
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                     reason='bitsandbytes is not supported on this GPU type.')
diff --git a/tests/runai_model_streamer_test/test_weight_utils.py b/tests/runai_model_streamer_test/test_weight_utils.py
index 4afa76c51693..06e506c35761 100644
--- a/tests/runai_model_streamer_test/test_weight_utils.py
+++ b/tests/runai_model_streamer_test/test_weight_utils.py
@@ -23,10 +23,11 @@ def test_runai_model_loader():
         runai_model_streamer_tensors = {}
         hf_safetensors_tensors = {}
 
-        for name, tensor in runai_safetensors_weights_iterator(safetensors):
+        for name, tensor in runai_safetensors_weights_iterator(
+                safetensors, True):
             runai_model_streamer_tensors[name] = tensor
 
-        for name, tensor in safetensors_weights_iterator(safetensors):
+        for name, tensor in safetensors_weights_iterator(safetensors, True):
             hf_safetensors_tensors[name] = tensor
 
         assert len(runai_model_streamer_tensors) == len(hf_safetensors_tensors)
diff --git a/tests/tensorizer_loader/conftest.py b/tests/tensorizer_loader/conftest.py
index 7efef163d2b9..ce8689f5b89c 100644
--- a/tests/tensorizer_loader/conftest.py
+++ b/tests/tensorizer_loader/conftest.py
@@ -5,14 +5,6 @@
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 
 
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    """
-    Tensorizer only tested on V0 so far.
-    """
-    monkeypatch.setenv('VLLM_USE_V1', '0')
-
-
 @pytest.fixture(autouse=True)
 def cleanup():
     cleanup_dist_env_and_memory(shutdown_ray=True)
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index 7136dd44de03..b6286e148397 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -1,17 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import gc
-import json
 import os
 import pathlib
 import subprocess
-from functools import partial
 from unittest.mock import MagicMock, patch
 
-import openai
 import pytest
 import torch
-from huggingface_hub import snapshot_download
 
 from vllm import SamplingParams
 from vllm.engine.arg_utils import EngineArgs
@@ -22,12 +18,11 @@
                                                          is_vllm_tensorized,
                                                          load_with_tensorizer,
                                                          open_stream,
-                                                         serialize_vllm_model,
                                                          tensorize_vllm_model)
 # yapf: enable
-from vllm.utils import PlaceholderModule, import_from_path
+from vllm.utils import PlaceholderModule
 
-from ..utils import VLLM_PATH, RemoteOpenAIServer
+from ..utils import VLLM_PATH
 
 try:
     from tensorizer import EncryptionParams
@@ -103,6 +98,7 @@ def test_can_deserialize_s3(vllm_runner):
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_deserialized_encrypted_vllm_model_has_same_outputs(
         vllm_runner, tmp_path):
+    args = EngineArgs(model=model_ref)
     with vllm_runner(model_ref) as vllm_model:
         model_path = tmp_path / (model_ref + ".tensors")
         key_path = tmp_path / (model_ref + ".key")
@@ -110,15 +106,13 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
 
         outputs = vllm_model.generate(prompts, sampling_params)
 
-        config_for_serializing = TensorizerConfig(tensorizer_uri=model_path,
-                                                  encryption_keyfile=key_path)
+    config_for_serializing = TensorizerConfig(tensorizer_uri=str(model_path),
+                                              encryption_keyfile=str(key_path))
 
-        vllm_model.apply_model(
-            partial(serialize_vllm_model,
-                    tensorizer_config=config_for_serializing))
+    tensorize_vllm_model(args, config_for_serializing)
 
-    config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
-                                                encryption_keyfile=key_path)
+    config_for_deserializing = TensorizerConfig(
+        tensorizer_uri=str(model_path), encryption_keyfile=str(key_path))
 
     with vllm_runner(model_ref,
                      load_format="tensorizer",
@@ -154,113 +148,46 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
         assert outputs == deserialized_outputs
 
 
-def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
-    multilora_inference = import_from_path(
-        "examples.offline_inference.multilora_inference",
-        EXAMPLES_PATH / "offline_inference/multilora_inference.py",
-    )
-
-    model_ref = "meta-llama/Llama-2-7b-hf"
-    lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
-    test_prompts = multilora_inference.create_test_prompts(lora_path)
-
-    # Serialize model before deserializing and binding LoRA adapters
-    with vllm_runner(model_ref) as vllm_model:
-        model_path = tmp_path / (model_ref + ".tensors")
-
-        vllm_model.apply_model(
-            partial(
-                serialize_vllm_model,
-                tensorizer_config=TensorizerConfig(tensorizer_uri=model_path)))
-
-    with vllm_runner(
-            model_ref,
-            load_format="tensorizer",
-            model_loader_extra_config=TensorizerConfig(
-                tensorizer_uri=model_path,
-                num_readers=1,
-            ),
-            enable_lora=True,
-            max_loras=1,
-            max_lora_rank=8,
-            max_cpu_loras=2,
-            max_num_seqs=50,
-            max_model_len=1000,
-    ) as loaded_vllm_model:
-        multilora_inference.process_requests(
-            loaded_vllm_model.model.llm_engine, test_prompts)
-
-        assert loaded_vllm_model
-
-
-def test_load_without_tensorizer_load_format(vllm_runner):
+def test_load_without_tensorizer_load_format(vllm_runner, capfd):
     model = None
-    with pytest.raises(ValueError):
+    try:
         model = vllm_runner(
             model_ref,
             model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
-    del model
-    gc.collect()
-    torch.cuda.empty_cache()
-
-
-@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
-def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
-    ## Serialize model
-    with vllm_runner(model_ref) as vllm_model:
-        model_path = tmp_path / (model_ref + ".tensors")
-
-        vllm_model.apply_model(
-            partial(
-                serialize_vllm_model,
-                tensorizer_config=TensorizerConfig(tensorizer_uri=model_path)))
-
-        model_loader_extra_config = {
-            "tensorizer_uri": str(model_path),
-        }
-
-    ## Start OpenAI API server
-    openai_args = [
-        "--dtype",
-        "float16",
-        "--load-format",
-        "tensorizer",
-        "--model-loader-extra-config",
-        json.dumps(model_loader_extra_config),
-    ]
-
-    with RemoteOpenAIServer(model_ref, openai_args) as server:
-        print("Server ready.")
-
-        client = server.get_client()
-        completion = client.completions.create(model=model_ref,
-                                               prompt="Hello, my name is",
-                                               max_tokens=5,
-                                               temperature=0.0)
-
-        assert completion.id is not None
-        assert len(completion.choices) == 1
-        assert len(completion.choices[0].text) >= 5
-        assert completion.choices[0].finish_reason == "length"
-        assert completion.usage == openai.types.CompletionUsage(
-            completion_tokens=5, prompt_tokens=6, total_tokens=11)
-
-
-def test_raise_value_error_on_invalid_load_format(vllm_runner):
+    except RuntimeError:
+        out, err = capfd.readouterr()
+        combined_output = out + err
+        assert ("ValueError: Model loader extra config "
+                "is not supported for load "
+                "format LoadFormat.AUTO") in combined_output
+    finally:
+        del model
+        gc.collect()
+        torch.cuda.empty_cache()
+
+
+def test_raise_value_error_on_invalid_load_format(vllm_runner, capfd):
     model = None
-    with pytest.raises(ValueError):
+    try:
         model = vllm_runner(
             model_ref,
             load_format="safetensors",
             model_loader_extra_config=TensorizerConfig(tensorizer_uri="test"))
-    del model
-    gc.collect()
-    torch.cuda.empty_cache()
+    except RuntimeError:
+        out, err = capfd.readouterr()
+
+        combined_output = out + err
+        assert ("ValueError: Model loader extra config is not supported "
+                "for load format LoadFormat.SAFETENSORS") in combined_output
+    finally:
+        del model
+        gc.collect()
+        torch.cuda.empty_cache()
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
-def test_tensorizer_with_tp_path_without_template(vllm_runner):
-    with pytest.raises(ValueError):
+def test_tensorizer_with_tp_path_without_template(vllm_runner, capfd):
+    try:
         model_ref = "EleutherAI/pythia-1.4b"
         tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
 
@@ -275,6 +202,13 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner):
             tensor_parallel_size=2,
             disable_custom_all_reduce=True,
         )
+    except RuntimeError:
+        out, err = capfd.readouterr()
+        combined_output = out + err
+        assert ("ValueError: For a sharded model, tensorizer_uri "
+                "should include a string format template like '%04d' "
+                "to be formatted with the rank "
+                "of the shard") in combined_output
 
 
 @pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs")
@@ -288,7 +222,6 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(
             enforce_eager=True,
     ) as base_model:
         outputs = base_model.generate(prompts, sampling_params)
-        base_model.model.llm_engine.model_executor.shutdown()
 
     # load model with two shards and serialize with encryption
     model_path = str(tmp_path / (model_ref + "-%02d.tensors"))
@@ -296,7 +229,7 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(
 
     tensorizer_config = TensorizerConfig(
         tensorizer_uri=model_path,
-        encryption_keyfile=key_path,
+        encryption_keyfile=str(key_path),
     )
 
     tensorize_vllm_model(
@@ -331,14 +264,13 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
     model_ref = "facebook/opt-125m"
     model_path = tmp_path / (model_ref + ".tensors")
     config = TensorizerConfig(tensorizer_uri=str(model_path))
+    args = EngineArgs(model=model_ref, device="cuda")
 
     with vllm_runner(model_ref) as vllm_model:
         outputs = vllm_model.generate(prompts, sampling_params)
 
-        vllm_model.apply_model(
-            partial(serialize_vllm_model, tensorizer_config=config))
-
-        assert is_vllm_tensorized(config)
+    tensorize_vllm_model(args, config)
+    assert is_vllm_tensorized(config)
 
     with vllm_runner(model_ref,
                      load_format="tensorizer",
diff --git a/tests/test_logger.py b/tests/test_logger.py
index 11deae309ac8..046f70504c89 100644
--- a/tests/test_logger.py
+++ b/tests/test_logger.py
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
-
+import enum
 import json
 import logging
 import os
 import sys
 import tempfile
+from dataclasses import dataclass
 from json.decoder import JSONDecodeError
 from tempfile import NamedTemporaryFile
 from typing import Any
@@ -16,6 +17,7 @@
 from vllm.logger import (_DATE_FORMAT, _FORMAT, _configure_vllm_root_logger,
                          enable_trace_function_call, init_logger)
 from vllm.logging_utils import NewLineFormatter
+from vllm.logging_utils.dump_input import prepare_object_to_dump
 
 
 def f1(x):
@@ -216,3 +218,37 @@ def test_custom_logging_config_causes_an_error_if_configure_logging_is_off():
         assert other_logger.handlers != root_logger.handlers
         assert other_logger.level != root_logger.level
         assert other_logger.propagate
+
+
+def test_prepare_object_to_dump():
+    str_obj = 'str'
+    assert prepare_object_to_dump(str_obj) == "'str'"
+
+    list_obj = [1, 2, 3]
+    assert prepare_object_to_dump(list_obj) == '[1, 2, 3]'
+
+    dict_obj = {'a': 1, 'b': 'b'}
+    assert prepare_object_to_dump(dict_obj) in [
+        "{a: 1, b: 'b'}", "{b: 'b', a: 1}"
+    ]
+
+    set_obj = {1, 2, 3}
+    assert prepare_object_to_dump(set_obj) == '[1, 2, 3]'
+
+    tuple_obj = ('a', 'b', 'c')
+    assert prepare_object_to_dump(tuple_obj) == "['a', 'b', 'c']"
+
+    class CustomEnum(enum.Enum):
+        A = enum.auto()
+        B = enum.auto()
+        C = enum.auto()
+
+    assert prepare_object_to_dump(CustomEnum.A) == repr(CustomEnum.A)
+
+    @dataclass
+    class CustomClass:
+        a: int
+        b: str
+
+    assert (prepare_object_to_dump(CustomClass(
+        1, 'b')) == "CustomClass(a=1, b='b')")
diff --git a/tests/test_outputs.py b/tests/test_outputs.py
new file mode 100644
index 000000000000..c41bd6723ba1
--- /dev/null
+++ b/tests/test_outputs.py
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm.outputs import RequestOutput
+
+
+def test_request_output_forward_compatible():
+    output = RequestOutput(request_id="test_request_id",
+                           prompt="test prompt",
+                           prompt_token_ids=[1, 2, 3],
+                           prompt_logprobs=None,
+                           outputs=[],
+                           finished=False,
+                           example_arg_added_in_new_version="some_value")
+    assert output is not None
diff --git a/tests/test_regression.py b/tests/test_regression.py
index 8c9d4a91c73b..e092945422ed 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -60,6 +60,9 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
     # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_MODELSCOPE", "True")
+        # Don't use HF_TOKEN for ModelScope repos, otherwise it will fail
+        # with 400 Client Error: Bad Request.
+        m.setenv("HF_TOKEN", "")
         llm = LLM(model="qwen/Qwen1.5-0.5B-Chat")
 
         prompts = [
diff --git a/tests/tokenization/test_mistral_tokenizer.py b/tests/tokenization/test_mistral_tokenizer.py
index f1c880286951..b16d9af35be9 100644
--- a/tests/tokenization/test_mistral_tokenizer.py
+++ b/tests/tokenization/test_mistral_tokenizer.py
@@ -1,15 +1,18 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import pytest
-from mistral_common.protocol.instruct.messages import UserMessage
+from mistral_common.protocol.instruct.messages import (AssistantMessage,
+                                                       ToolMessage,
+                                                       UserMessage)
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
-from mistral_common.protocol.instruct.tool_calls import Function, Tool
+from mistral_common.protocol.instruct.tool_calls import (Function,
+                                                         FunctionCall, Tool,
+                                                         ToolCall)
 
 from vllm.transformers_utils.tokenizers.mistral import (
     make_mistral_chat_completion_request)
 
 
-# yapf: enable
 @pytest.mark.parametrize(
     "openai_request,expected_mistral_request",
     [(
@@ -78,6 +81,107 @@
 )
 def test_make_mistral_chat_completion_request(openai_request,
                                               expected_mistral_request):
-    assert (make_mistral_chat_completion_request(
-        openai_request["messages"],
-        openai_request["tools"]) == expected_mistral_request)
+    actual_request = make_mistral_chat_completion_request(
+        openai_request["messages"], openai_request["tools"])
+    assert actual_request == expected_mistral_request
+
+
+# Tool use with list content and reasoning_content
+@pytest.mark.parametrize("openai_request,expected_mistral_request", [(
+    {
+        "messages": [
+            {
+                "role": "user",
+                "content": "What's the weather in Paris?",
+            },
+            {
+                "role":
+                "assistant",
+                "reasoning_content":
+                None,
+                "content":
+                None,
+                "tool_calls": [{
+                    "id": "call123",
+                    "type": "function",
+                    "function": {
+                        "name": "get_weather",
+                        "arguments": '{"city": "Paris"}',
+                    },
+                }],
+            },
+            {
+                "role": "tool",
+                "content": [{
+                    "type": "text",
+                    "text": "Rainy"
+                }],
+                "name": "get_weather",
+                "tool_call_id": "call123",
+            },
+        ],
+        "tools": [{
+            "type": "function",
+            "function": {
+                "name": "get_weather",
+                "description": "Gets the current weather in a city.",
+                "parameters": {
+                    "type": "object",
+                    "properties": {
+                        "city": {
+                            "type": "string",
+                            "description": "The city name"
+                        }
+                    },
+                    "required": ["city"],
+                },
+            },
+        }],
+    },
+    ChatCompletionRequest(
+        messages=[
+            UserMessage(content="What's the weather in Paris?"),
+            AssistantMessage(
+                content=None,
+                tool_calls=[
+                    ToolCall(
+                        id="call123",
+                        function=FunctionCall(
+                            name="get_weather",
+                            arguments='{"city": "Paris"}',
+                        ),
+                    )
+                ],
+            ),
+            ToolMessage(
+                content="Rainy",
+                tool_call_id="call123",
+                name="get_weather",
+            ),
+        ],
+        tools=[
+            Tool(
+                type="function",
+                function=Function(
+                    name="get_weather",
+                    description="Gets the current weather in a city.",
+                    parameters={
+                        "type": "object",
+                        "properties": {
+                            "city": {
+                                "type": "string",
+                                "description": "The city name"
+                            }
+                        },
+                        "required": ["city"],
+                    },
+                ),
+            )
+        ],
+    ),
+)])
+def test_make_mistral_chat_completion_request_list_content(
+        openai_request, expected_mistral_request):
+    actual_request = make_mistral_chat_completion_request(
+        openai_request["messages"], openai_request["tools"])
+    assert actual_request == expected_mistral_request
diff --git a/tests/tool_use/test_tool_choice_required.py b/tests/tool_use/test_tool_choice_required.py
index 2ab87a0ef41f..291769848145 100644
--- a/tests/tool_use/test_tool_choice_required.py
+++ b/tests/tool_use/test_tool_choice_required.py
@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 import json
-import re
 from copy import deepcopy
 from unittest.mock import MagicMock
 
 import pytest
+import regex as re
 from pydantic import TypeAdapter
 
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
@@ -333,4 +333,4 @@ def test_streaming_output_valid(output, empty_params, delta_len):
             combined_messages += message.tool_calls[0].function.arguments
     combined_messages += "}]"
     assert json.loads(combined_messages) == output
-    assert json.dumps(json.loads(combined_messages)) == output_json
+    assert json.dumps(json.loads(combined_messages)) == output_json
\ No newline at end of file
diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py
index c14eaf71e978..efa6455c41df 100644
--- a/tests/tool_use/utils.py
+++ b/tests/tool_use/utils.py
@@ -88,7 +88,7 @@ def ensure_system_prompt(messages: list[dict[str, Any]],
         "meta-llama/Llama-4-Scout-17B-16E-Instruct",
         "arguments": [
             "--enforce-eager", "--no-enable-prefix-caching",
-            "--tool-call-parser", "pythonic", "--chat-template",
+            "--tool-call-parser", "llama4_pythonic", "--chat-template",
             str(VLLM_PATH /
                 "examples/tool_chat_template_llama4_pythonic.jinja"), "-tp",
             "4"
diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py
index 0a79424a30b7..511d57d405ba 100644
--- a/tests/v1/core/test_scheduler_e2e.py
+++ b/tests/v1/core/test_scheduler_e2e.py
@@ -19,7 +19,8 @@ def model() -> LLM:
                enable_prefix_caching=True,
                long_prefill_token_threshold=2,
                max_num_batched_tokens=6,
-               max_num_seqs=3)
+               max_num_seqs=3,
+               block_size=16)
 
 
 def test_concurrent_partial_prefill(model):
@@ -27,3 +28,11 @@ def test_concurrent_partial_prefill(model):
     assert len(outputs) == 3
     for output in outputs:
         assert len(output.outputs) == 1
+
+
+def test_prefix_cache_stats_is_recorded(model):
+    # 17 tokens will make sure first 16 tokens are cached in a block
+    input_tokens = {"prompt_token_ids": [101] * 17}
+    _ = model.generate([input_tokens])
+    outputs = model.generate([input_tokens])
+    assert outputs[0].num_cached_tokens == 16
diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py
index cefb89eb652b..e77916f95823 100644
--- a/tests/v1/engine/test_llm_engine.py
+++ b/tests/v1/engine/test_llm_engine.py
@@ -6,6 +6,7 @@
 import pytest
 
 from vllm import LLM, SamplingParams
+from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector
 
 MODEL = "facebook/opt-125m"
 DTYPE = "half"
@@ -97,3 +98,67 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None:
             raise AssertionError(
                 f"{len(completion_counts)} unique completions; expected"
                 f" {n}. Repeats: {repeats}")
+
+
+def test_engine_metrics(vllm_runner, monkeypatch, example_prompts):
+    max_tokens = 100
+    # Use spec decoding to test num_accepted_tokens_per_pos
+    speculative_config = {
+        "method": "ngram",
+        "prompt_lookup_max": 5,
+        "prompt_lookup_min": 3,
+        "num_speculative_tokens": 5,
+    }
+    monkeypatch.setenv("VLLM_USE_V1", "1")
+    with vllm_runner(
+            MODEL,
+            speculative_config=speculative_config,
+            disable_log_stats=False,
+    ) as vllm_model:
+        model: LLM = vllm_model.model
+        sampling_params = SamplingParams(temperature=0.0,
+                                         max_tokens=max_tokens)
+        outputs = model.generate(example_prompts, sampling_params)
+
+        n_prompts = len(example_prompts)
+        assert len(outputs) == n_prompts
+
+        total_tokens = 0
+        for out in outputs:
+            assert len(out.outputs) == 1
+            total_tokens += len(out.outputs[0].token_ids)
+        assert total_tokens == max_tokens * n_prompts
+
+        metrics = model.get_metrics()
+
+        def find_metric(name) -> list[Metric]:
+            found = []
+            for metric in metrics:
+                if metric.name == name:
+                    found.append(metric)
+            return found
+
+        num_requests_running = find_metric("vllm:num_requests_running")
+        assert len(num_requests_running) == 1
+        assert isinstance(num_requests_running[0], Gauge)
+        assert num_requests_running[0].value == .0
+
+        generation_tokens = find_metric("vllm:generation_tokens")
+        assert len(generation_tokens) == 1
+        assert isinstance(generation_tokens[0], Counter)
+        assert generation_tokens[0].value == total_tokens
+
+        request_generation_tokens = find_metric(
+            "vllm:request_generation_tokens")
+        assert len(request_generation_tokens) == 1
+        assert isinstance(request_generation_tokens[0], Histogram)
+        assert "+Inf" in request_generation_tokens[0].buckets
+        assert request_generation_tokens[0].buckets["+Inf"] == n_prompts
+        assert request_generation_tokens[0].count == n_prompts
+        assert request_generation_tokens[0].sum == total_tokens
+
+        num_accepted_tokens_per_pos = find_metric(
+            "vllm:spec_decode_num_accepted_tokens_per_pos")
+        assert len(num_accepted_tokens_per_pos) == 1
+        assert isinstance(num_accepted_tokens_per_pos[0], Vector)
+        assert len(num_accepted_tokens_per_pos[0].values) == 5
diff --git a/tests/v1/entrypoints/llm/test_struct_output_generate.py b/tests/v1/entrypoints/llm/test_struct_output_generate.py
index 25bbcd901d6a..5f1fff200de3 100644
--- a/tests/v1/entrypoints/llm/test_struct_output_generate.py
+++ b/tests/v1/entrypoints/llm/test_struct_output_generate.py
@@ -4,12 +4,12 @@
 from __future__ import annotations
 
 import json
-import re
 from enum import Enum
 from typing import TYPE_CHECKING, Any
 
 import jsonschema
 import pytest
+import regex as re
 from pydantic import BaseModel
 
 from tests.reasoning.utils import run_reasoning_extraction
diff --git a/tests/v1/entrypoints/openai/test_completion.py b/tests/v1/entrypoints/openai/test_completion.py
index 3ffc54f520b4..333ad23795f3 100644
--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import re
 from typing import Optional
 
 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
+import regex as re
 from openai import BadRequestError
 
 from tests.utils import RemoteOpenAIServer
diff --git a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
index e90b72a7cf24..c17784e0a263 100755
--- a/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+++ b/tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
@@ -13,6 +13,8 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-2}   # Default to 2
 # Find the git repository root directory
 GIT_ROOT=$(git rev-parse --show-toplevel)
 
+SMI_BIN=$(which nvidia-smi || which rocm-smi)
+
 # Trap the SIGINT signal (triggered by Ctrl+C)
 trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
 
@@ -44,6 +46,13 @@ get_model_args() {
   echo "$extra_args"
 }
 
+get_num_gpus() {
+  if [[ "$SMI_BIN" == *"nvidia"* ]]; then
+    echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)"
+  else
+    echo "$($SMI_BIN -l | grep GPU | wc -l)"
+  fi
+}
 
 # Function to run tests for a specific model
 run_tests_for_model() {
@@ -64,7 +73,7 @@ run_tests_for_model() {
   # Start prefill instances
   for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
     # Calculate GPU ID - we'll distribute across available GPUs
-    GPU_ID=$((i % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
+    GPU_ID=$((i % $(get_num_gpus)))
     # Calculate port number (base port + instance number)
     PORT=$((8100 + i))
     # Calculate side channel port
@@ -96,7 +105,7 @@ run_tests_for_model() {
   # Start decode instances
   for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
     # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs
-    GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)))
+    GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
     # Calculate port number (base port + instance number)
     PORT=$((8200 + i))
     # Calculate side channel port
diff --git a/tests/v1/kv_connector/unit/test_multi_connector.py b/tests/v1/kv_connector/unit/test_multi_connector.py
index 64da0d79bf33..a21d92c52244 100644
--- a/tests/v1/kv_connector/unit/test_multi_connector.py
+++ b/tests/v1/kv_connector/unit/test_multi_connector.py
@@ -239,3 +239,11 @@ def get_connector_events() -> dict[str, list[str]]:
             print(f"[ERROR] Could not read connector events for {name}: {e}")
 
     return connector_events
+
+
+def test_engine_id_conflict():  # default engine_ids must not collide
+    configs = [KVTransferConfig() for _ in range(2)]  # no explicit engine_id
+    ids = [config.engine_id for config in configs]
+    assert ids[0] != ids[1], (
+        "Engine IDs should be different for different configs. "
+        f"Got {ids}")
diff --git a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
index fc4928f9ebd1..6fcff0d62045 100644
--- a/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_remote_prefill_lifecycle.py
@@ -340,3 +340,84 @@ def test_full_block_prompt():
     output = outputs[0]
     assert output.finish_reason == FinishReason.STOP
     assert_scheduler_empty(scheduler)
+
+
+def test_cannot_schedule_after_recv():
+    """
+    Test that a remote-prefill request that has finished recving stays in
+    the waiting queue until enough KV blocks are freed to schedule it.
+    """
+
+    # NOTE: the KVCacheManager will use 1 null block.
+    # So there are 5 total working blocks.
+    TOTAL_NUM_BLOCKS = 6
+    vllm_config = create_vllm_config()
+    scheduler = create_scheduler(vllm_config, num_blocks=TOTAL_NUM_BLOCKS)
+
+    # Prime the KVCache.
+    NUM_PROMPT_BLOCKS = 2
+    BLOCK_SIZE = vllm_config.cache_config.block_size
+    # Local prompt: 2 blocks (+1 once scheduled); remote prompt: 2.5 blocks.
+    NUM_TOKENS_LOCAL = int(BLOCK_SIZE * NUM_PROMPT_BLOCKS)
+    NUM_TOKENS_REMOTE = int(BLOCK_SIZE * (NUM_PROMPT_BLOCKS + 0.5))
+
+    request_normal = create_request(request_id=1, num_tokens=NUM_TOKENS_LOCAL)
+    request_remote = create_request(request_id=2,
+                                    num_tokens=NUM_TOKENS_REMOTE,
+                                    do_remote_prefill=True)
+
+    # Step 1: 3 blocks are in use (2 for prompt, 1 for decode).
+    scheduler.add_request(request_normal)
+    scheduler_output = scheduler.schedule()
+    model_runner_output = create_model_runner_output(reqs=[request_normal])
+    scheduler.update_from_output(scheduler_output, model_runner_output)
+    assert len(scheduler.running) == 1
+    assert len(scheduler.waiting) == 0
+
+    # Step 2: 5 blocks are in use (2 new for remote blocks).
+    scheduler.add_request(request_remote)
+    scheduler_output = scheduler.schedule()
+    model_runner_output = create_model_runner_output(reqs=[request_normal])
+    scheduler.update_from_output(scheduler_output, model_runner_output)
+    assert len(scheduler.running) == 1
+    assert len(scheduler.waiting) == 1
+
+    # Step 3: finish recving (5 blocks in use)
+    scheduler_output = scheduler.schedule()
+    model_runner_output = create_model_runner_output(
+        reqs=[request_normal], finished_recving=[request_remote.request_id])
+    scheduler.update_from_output(scheduler_output, model_runner_output)
+    assert len(scheduler.running) == 1
+    assert len(scheduler.waiting) == 1
+
+    # Step 4: try to schedule; not enough blocks, so the remote request waits.
+    scheduler_output = scheduler.schedule()
+    model_runner_output = create_model_runner_output(reqs=[request_normal])
+    scheduler.update_from_output(scheduler_output, model_runner_output)
+    assert len(scheduler.running) == 1
+    assert len(scheduler.waiting) == 1
+
+    # Step 5: finish the local request (EOS), which frees it.
+    scheduler_output = scheduler.schedule()
+    model_runner_output = create_model_runner_output(reqs=[request_normal],
+                                                     use_eos=True)
+    scheduler.update_from_output(scheduler_output, model_runner_output)
+    assert len(scheduler.running) == 0
+    assert len(scheduler.waiting) == 1
+
+    # Step 6: now we can schedule (with 2 blocks computed).
+    scheduler_output = scheduler.schedule()
+    model_runner_output = create_model_runner_output(reqs=[request_remote])
+    assert (scheduler_output.scheduled_new_reqs[0].num_computed_tokens ==
+            NUM_PROMPT_BLOCKS * BLOCK_SIZE)
+    scheduler.update_from_output(scheduler_output, model_runner_output)
+    assert len(scheduler.running) == 1
+    assert len(scheduler.waiting) == 0
+
+    # Step 7: free everything.
+    scheduler_output = scheduler.schedule()
+    model_runner_output = create_model_runner_output(reqs=[request_remote],
+                                                     use_eos=True)
+    scheduler.update_from_output(scheduler_output, model_runner_output)
+    _ = scheduler.schedule()
+    assert_scheduler_empty(scheduler)
diff --git a/tests/v1/sample/test_topk_topp_sampler.py b/tests/v1/sample/test_topk_topp_sampler.py
index a8a713d446b7..220f05c7ff1c 100644
--- a/tests/v1/sample/test_topk_topp_sampler.py
+++ b/tests/v1/sample/test_topk_topp_sampler.py
@@ -16,31 +16,40 @@
 FLASHINFER_ENABLED = current_platform.is_cuda() and is_flashinfer_available
 
 
+@pytest.fixture(autouse=True)
+def reset_default_device():
+    """
+    Tests in this module call torch.set_default_device(), which would leak
+    into subsequent tests; restore the original default device afterwards.
+    """
+    original_device = torch.get_default_device()
+    yield
+    torch.set_default_device(original_device)
+
+
 def test_topk_impl_equivalance():
 
-    with torch.device(DEVICE):
-        generator = Generator(device=DEVICE).manual_seed(33)
+    torch.set_default_device(DEVICE)
+    generator = Generator(device=DEVICE).manual_seed(33)
 
-        logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator)
+    logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator)
 
-        # Random top-k values between 1 and 9.
-        k = torch.randint(1, 10, (BATCH_SIZE, ), generator=generator)
+    # Random top-k values between 1 and 9.
+    k = torch.randint(1, 10, (BATCH_SIZE, ), generator=generator)
 
-        # Set k=vocab_size for ~50% of requests in the batch (top-k disabled).
-        k.masked_fill_(
-            torch.randint(0,
-                          2, (BATCH_SIZE, ),
-                          generator=generator,
-                          dtype=bool), VOCAB_SIZE)
+    # Set k=vocab_size for ~50% of requests in the batch (top-k disabled).
+    k.masked_fill_(
+        torch.randint(0, 2, (BATCH_SIZE, ), generator=generator, dtype=bool),
+        VOCAB_SIZE)
 
-        # Top-k only implementation
-        result1 = apply_top_k_top_p(logits=logits.clone(), k=k, p=None)
+    # Top-k only implementation
+    result1 = apply_top_k_top_p(logits=logits.clone(), k=k, p=None)
 
-        # Top-p + top-k
-        no_op_top_p = torch.tensor([1.0])
-        result2 = apply_top_k_top_p(logits=logits.clone(), k=k, p=no_op_top_p)
+    # Top-p + top-k
+    no_op_top_p = torch.tensor([1.0])
+    result2 = apply_top_k_top_p(logits=logits.clone(), k=k, p=no_op_top_p)
 
-        assert torch.allclose(result1, result2)
+    assert torch.allclose(result1, result2)
 
 
 def test_flashinfer_sampler():
@@ -58,50 +67,49 @@ def test_flashinfer_sampler():
         pytest.skip(
             "FlashInfer not installed or not available on this platform.")
 
-    with torch.device(DEVICE):
-        generator = Generator(device=DEVICE).manual_seed(42)
-
-        # Generate random logits
-        logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator)
-
-        # Generate various top-k and top-p values
-        k_values = torch.randint(1, 1000, (BATCH_SIZE, ), generator=generator)
-        p_values = torch.rand(
-            (BATCH_SIZE, ),
-            generator=generator) * 0.5 + 0.5  # range in [0.5, 1.0]
-
-        # Sometimes disable top-k (k=vocab_size)
-        k_values.masked_fill_(
-            torch.randint(0,
-                          2, (BATCH_SIZE, ),
-                          generator=generator,
-                          dtype=torch.bool), VOCAB_SIZE)
-
-        # Sometimes disable top-p (p=1.0)
-        p_values.masked_fill_(
-            torch.randint(0,
-                          2, (BATCH_SIZE, ),
-                          generator=generator,
-                          dtype=torch.bool), 1.0)
-
-        python_logits = apply_top_k_top_p(
-            logits=logits.clone(),
-            k=k_values,
-            p=p_values,
-        )
-        python_probs = torch.softmax(python_logits, dim=-1)
-
-        # FlashInfer only exposed renorm interfaces for probs so convert first
-        flashinfer_probs = torch.softmax(logits.clone(), dim=-1)
-        flashinfer_probs = top_k_renorm_probs(
-            probs=flashinfer_probs,
-            top_k=k_values,
-        )
-        flashinfer_probs = top_p_renorm_probs(
-            probs=flashinfer_probs,
-            top_p=p_values,
-        )
-
-        # Compare the results
-        assert torch.allclose(python_probs, flashinfer_probs, atol=2e-2), \
-            "FlashInfer and Python sampling implementations do not match!"
+    torch.set_default_device(DEVICE)
+    generator = Generator(device=DEVICE).manual_seed(42)
+
+    # Generate random logits
+    logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator)
+
+    # Generate various top-k and top-p values
+    k_values = torch.randint(1, 1000, (BATCH_SIZE, ), generator=generator)
+    p_values = torch.rand(
+        (BATCH_SIZE, ), generator=generator) * 0.5 + 0.5  # range in [0.5, 1.0]
+
+    # Sometimes disable top-k (k=vocab_size)
+    k_values.masked_fill_(
+        torch.randint(0,
+                      2, (BATCH_SIZE, ),
+                      generator=generator,
+                      dtype=torch.bool), VOCAB_SIZE)
+
+    # Sometimes disable top-p (p=1.0)
+    p_values.masked_fill_(
+        torch.randint(0,
+                      2, (BATCH_SIZE, ),
+                      generator=generator,
+                      dtype=torch.bool), 1.0)
+
+    python_logits = apply_top_k_top_p(
+        logits=logits.clone(),
+        k=k_values,
+        p=p_values,
+    )
+    python_probs = torch.softmax(python_logits, dim=-1)
+
+    # FlashInfer only exposed renorm interfaces for probs so convert first
+    flashinfer_probs = torch.softmax(logits.clone(), dim=-1)
+    flashinfer_probs = top_k_renorm_probs(
+        probs=flashinfer_probs,
+        top_k=k_values,
+    )
+    flashinfer_probs = top_p_renorm_probs(
+        probs=flashinfer_probs,
+        top_p=p_values,
+    )
+
+    # Compare the results
+    assert torch.allclose(python_probs, flashinfer_probs, atol=2e-2), \
+        "FlashInfer and Python sampling implementations do not match!"
diff --git a/tests/v1/sample/utils.py b/tests/v1/sample/utils.py
index f540895bbf14..932b652aea32 100644
--- a/tests/v1/sample/utils.py
+++ b/tests/v1/sample/utils.py
@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import re
 from enum import Enum
 from typing import Optional
 
+import regex as re
+
 from vllm import CompletionOutput
 
 
diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py
index 7d93a44c5059..b49ac45f3129 100644
--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -100,8 +100,12 @@ def test_prepare_inputs():
         dtype=torch.int32,
         device=device)
 
+    # n1 + n2 + n3 - a - b - c
+    num_tokens = cu_target_query_lens[-1].item() - num_rejected_tokens.sum(
+    ).item()
+
     cu_num_tokens, token_indices = EagleProposer.prepare_inputs(
-        cu_target_query_lens, num_rejected_tokens)
+        cu_target_query_lens, num_rejected_tokens, num_tokens)
 
     assert torch.equal(cu_num_tokens, expected_cu_num_tokens)
     assert token_indices.shape[0] == expected_cu_num_tokens[-1].item()
@@ -117,34 +121,13 @@ def test_prepare_inputs():
     ])
 @mock.patch('vllm.v1.spec_decode.eagle.get_pp_group')
 @mock.patch('vllm.v1.spec_decode.eagle.get_layers_from_vllm_config')
-@mock.patch('vllm.v1.spec_decode.eagle.ModelRegistry')
-@mock.patch('vllm.v1.spec_decode.eagle.get_model_loader')
-@mock.patch('vllm.v1.spec_decode.eagle.set_default_torch_dtype')
-@mock.patch('vllm.v1.spec_decode.eagle.set_current_vllm_config')
-def test_load_model(mock_set_config, mock_set_dtype, mock_get_loader,
-                    mock_registry, mock_get_layers, mock_get_pp_group, method,
+@mock.patch('vllm.v1.spec_decode.eagle.get_model')
+def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
                     proposer_helper, draft_model_dir, target_attribute_path):
 
-    # Setup mock for model class
-    mock_model_cls = mock.MagicMock()
-    mock_registry.resolve_model_cls.return_value = (mock_model_cls,
-                                                    "test_arch")
-
-    # Create a real context manager for mocks
-    class MockContextManager:
-
-        def __init__(self):
-            pass
-
-        def __enter__(self):
-            return None
-
-        def __exit__(self, exc_type, exc_val, exc_tb):
-            return False
-
-    # Make the mocks return actual context manager objects
-    mock_set_dtype.return_value = MockContextManager()
-    mock_set_config.return_value = MockContextManager()
+    # Setup model mock
+    mock_model = mock.MagicMock()
+    mock_get_model.return_value = mock_model
 
     # Setup mocks for attention layers
     target_attn_layers = {
@@ -164,25 +147,6 @@ def __exit__(self, exc_type, exc_val, exc_tb):
     mock_pp_group.world_size = 2 if method == "eagle" else 1
     mock_get_pp_group.return_value = mock_pp_group
 
-    # Setup model loader mock
-    mock_loader = mock.MagicMock()
-    mock_get_loader.return_value = mock_loader
-
-    # Setup model mock
-    mock_model = mock.MagicMock()
-    mock_model_cls.return_value = mock_model
-    mock_model.to.return_value = mock_model
-
-    # Configure mock to test the attribute sharing path
-    if method == "eagle":
-        # For eagle, test the lm_head path
-        mock_model.load_weights.return_value = {
-            "model.embed_tokens.weight": torch.zeros(1)
-        }
-    else:
-        # For eagle3, test the embed_tokens path
-        mock_model.load_weights.return_value = {}
-
     # Setup target model with the appropriate attributes
     target_model = mock.MagicMock()
 
@@ -204,13 +168,7 @@ def __exit__(self, exc_type, exc_val, exc_tb):
     proposer.load_model(target_model)
 
     # Verify common interactions
-    mock_get_loader.assert_called_once()
-    mock_model_cls.assert_called_once()
-    mock_model.to.assert_called_once()
-    mock_model.load_weights.assert_called_once()
-
-    # Verify the loader was called with the right config
-    mock_get_loader.assert_called_once_with(proposer.vllm_config.load_config)
+    mock_get_model.assert_called_once()
 
     # Verify the specific attribute sharing based on the method
     if method == "eagle":
@@ -288,6 +246,9 @@ def create_deterministic_logits(token_ids):
     # Assign the mock to the proposer
     proposer.model = model_mock
 
+    # Assign draft attn_layer_names since load_model is not invoked
+    proposer.attn_layer_names = ["layer.0"]
+
     # Create input tensors
     cu_num_tokens = torch.tensor([0, seq_len_1, total_tokens],
                                  dtype=torch.int32,
diff --git a/tests/v1/test_metrics_reader.py b/tests/v1/test_metrics_reader.py
new file mode 100644
index 000000000000..68539c80b59c
--- /dev/null
+++ b/tests/v1/test_metrics_reader.py
@@ -0,0 +1,112 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import prometheus_client
+import pytest
+
+from vllm.v1.metrics.reader import (Counter, Gauge, Histogram, Vector,
+                                    get_metrics_snapshot)
+
+
+@pytest.fixture(autouse=True)
+def test_registry(monkeypatch):
+    # Fresh registry per test, patched over vllm.v1.metrics.reader.REGISTRY
+    test_registry = prometheus_client.CollectorRegistry(auto_describe=True)
+    monkeypatch.setattr("vllm.v1.metrics.reader.REGISTRY", test_registry)
+    return test_registry
+
+
+@pytest.mark.parametrize("num_engines", [1, 4])
+def test_gauge_metric(test_registry, num_engines):
+    g = prometheus_client.Gauge("vllm:test_gauge",
+                                "Test gauge metric",
+                                labelnames=["model", "engine_index"],
+                                registry=test_registry)
+    for i in range(num_engines):
+        g.labels(model="foo", engine_index=str(i)).set(98.5)
+    # The snapshot should hold one Gauge per engine_index label value.
+    metrics = get_metrics_snapshot()
+    assert len(metrics) == num_engines
+    engine_labels = [str(i) for i in range(num_engines)]  # not yet seen
+    for m in metrics:
+        assert isinstance(m, Gauge)
+        assert m.name == "vllm:test_gauge"
+        assert m.value == 98.5
+        assert m.labels["model"] == "foo"
+        assert m.labels["engine_index"] in engine_labels
+        engine_labels.remove(m.labels["engine_index"])  # seen only once
+
+
+@pytest.mark.parametrize("num_engines", [1, 4])
+def test_counter_metric(test_registry, num_engines):
+    c = prometheus_client.Counter("vllm:test_counter",
+                                  "Test counter metric",
+                                  labelnames=["model", "engine_index"],
+                                  registry=test_registry)
+    for i in range(num_engines):
+        c.labels(model="bar", engine_index=str(i)).inc(19)
+    # The snapshot should hold one Counter per engine_index label value.
+    metrics = get_metrics_snapshot()
+    assert len(metrics) == num_engines
+    engine_labels = [str(i) for i in range(num_engines)]  # not yet seen
+    for m in metrics:
+        assert isinstance(m, Counter)
+        assert m.name == "vllm:test_counter"
+        assert m.value == 19
+        assert m.labels["model"] == "bar"
+        assert m.labels["engine_index"] in engine_labels
+        engine_labels.remove(m.labels["engine_index"])  # seen only once
+
+
+@pytest.mark.parametrize("num_engines", [1, 4])
+def test_histogram_metric(test_registry, num_engines):
+    h = prometheus_client.Histogram("vllm:test_histogram",
+                                    "Test histogram metric",
+                                    labelnames=["model", "engine_index"],
+                                    buckets=[10, 20, 30, 40, 50],
+                                    registry=test_registry)
+    for i in range(num_engines):
+        hist = h.labels(model="blaa", engine_index=str(i))
+        hist.observe(42)
+        hist.observe(21)
+        hist.observe(7)
+    # One Histogram per engine; the expected bucket counts are cumulative.
+    metrics = get_metrics_snapshot()
+    assert len(metrics) == num_engines
+    engine_labels = [str(i) for i in range(num_engines)]  # not yet seen
+    for m in metrics:
+        assert isinstance(m, Histogram)
+        assert m.name == "vllm:test_histogram"
+        assert m.count == 3
+        assert m.sum == 70  # 42 + 21 + 7
+        assert m.buckets["10.0"] == 1  # 7
+        assert m.buckets["20.0"] == 1
+        assert m.buckets["30.0"] == 2  # 7, 21
+        assert m.buckets["40.0"] == 2
+        assert m.buckets["50.0"] == 3  # 7, 21, 42
+        assert m.labels["model"] == "blaa"
+        assert m.labels["engine_index"] in engine_labels
+        engine_labels.remove(m.labels["engine_index"])  # seen only once
+
+
+@pytest.mark.parametrize("num_engines", [1, 4])
+def test_vector_metric(test_registry, num_engines):
+    c = prometheus_client.Counter(
+        "vllm:spec_decode_num_accepted_tokens_per_pos",
+        "Vector-like counter metric",
+        labelnames=["position", "model", "engine_index"],
+        registry=test_registry)
+    for i in range(num_engines):
+        c.labels(position="0", model="llama", engine_index=str(i)).inc(10)
+        c.labels(position="1", model="llama", engine_index=str(i)).inc(5)
+        c.labels(position="2", model="llama", engine_index=str(i)).inc(1)
+    # The three per-position samples should come back as one Vector per engine.
+    metrics = get_metrics_snapshot()
+    assert len(metrics) == num_engines
+    engine_labels = [str(i) for i in range(num_engines)]  # not yet seen
+    for m in metrics:
+        assert isinstance(m, Vector)
+        assert m.name == "vllm:spec_decode_num_accepted_tokens_per_pos"
+        assert m.values == [10, 5, 1]  # values for positions "0", "1", "2"
+        assert m.labels["model"] == "llama"
+        assert m.labels["engine_index"] in engine_labels
+        engine_labels.remove(m.labels["engine_index"])  # seen only once
diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py
index c34c673e985e..1b77417a1bd3 100644
--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -12,7 +12,7 @@
     "openai/whisper-large-v3",  # transcription
     "facebook/bart-large-cnn",  # encoder decoder
     "mistralai/Mamba-Codestral-7B-v0.1",  # mamba
-    "hmellor/bamba-tiny-random",  # hybrid
+    "hmellor/tiny-random-BambaForCausalLM",  # hybrid
     "BAAI/bge-m3",  # embedding
 ]
 
diff --git a/tests/v1/worker/test_gpu_input_batch.py b/tests/v1/worker/test_gpu_input_batch.py
index 638f5bedcfca..27741bd156be 100644
--- a/tests/v1/worker/test_gpu_input_batch.py
+++ b/tests/v1/worker/test_gpu_input_batch.py
@@ -251,7 +251,7 @@ def test_sampling_metadata_in_input_batch(device: str, batch_size: int):
         device=torch.device(device),
         pin_memory=is_pin_memory_available(),
         vocab_size=1024,
-        kv_cache_config=get_kv_cache_config(),
+        block_size=1,
     )
     reqs: list[CachedRequestState] = []
     req_id_reqs = {}
@@ -341,7 +341,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int,
         device=torch.device(device),
         pin_memory=is_pin_memory_available(),
         vocab_size=1024,
-        kv_cache_config=get_kv_cache_config(),
+        block_size=1,
     )
     ref_input_batch: InputBatch = InputBatch(
         max_num_reqs=batch_size,
@@ -350,7 +350,7 @@ def test_swap_states_in_input_batch(device: str, batch_size: int,
         device=torch.device(device),
         pin_memory=is_pin_memory_available(),
         vocab_size=1024,
-        kv_cache_config=get_kv_cache_config(),
+        block_size=1,
     )
 
     reqs: list[CachedRequestState] = []
diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py
index e44660525763..b8c3d88617d0 100644
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -43,7 +43,7 @@ def initialize_kv_cache(runner: GPUModelRunner):
         device=runner.device,
         pin_memory=runner.pin_memory,
         vocab_size=runner.model_config.get_vocab_size(),
-        kv_cache_config=kv_cache_config,
+        block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec.block_size,
     )
     runner.initialize_attn_backend(kv_cache_config)
 
diff --git a/tools/check_triton_import.py b/tools/check_triton_import.py
new file mode 100644
index 000000000000..18c9726a11ac
--- /dev/null
+++ b/tools/check_triton_import.py
@@ -0,0 +1,76 @@
+# SPDX-License-Identifier: Apache-2.0
+import subprocess
+import sys
+
+import regex as re
+
+FORBIDDEN_IMPORT_RE = re.compile(r"^(from|import)\s+triton(\s|\.|$)")
+# ^ matches "import triton", "import triton.x" and "from triton[.x] import ..."
+# Exact (stripped) lines that are the allowed ways to import triton.
+ALLOWED_LINES = {
+    "from vllm.triton_utils import triton",
+    "from vllm.triton_utils import tl",
+    "from vllm.triton_utils import tl, triton",
+}
+
+
+def is_forbidden_import(line: str) -> bool:  # True for a direct triton import
+    stripped = line.strip()  # match regardless of indentation
+    return bool(
+        FORBIDDEN_IMPORT_RE.match(stripped)) and stripped not in ALLOWED_LINES
+
+
+def parse_diff(diff: str) -> list[str]:  # -> ["file:lineno: import", ...]
+    violations = []
+    current_file = None
+    current_lineno = None
+    # NOTE(review): added lines whose text starts with "+" are skipped too.
+    for line in diff.splitlines():
+        if line.startswith("+++ b/"):
+            current_file = line[6:]  # drop the "+++ b/" prefix
+        elif line.startswith("@@"):
+            match = re.search(r"\+(\d+)", line)  # new-file start of the hunk
+            if match:
+                current_lineno = int(
+                    match.group(1)) - 1  # next "+ line" is here
+        elif line.startswith("+") and not line.startswith("++"):
+            current_lineno += 1
+            code_line = line[1:]  # drop the diff "+" marker
+            if is_forbidden_import(code_line):
+                violations.append(
+                    f"{current_file}:{current_lineno}: {code_line.strip()}")
+    return violations
+
+
+def get_diff(diff_type: str) -> str:  # diff_type: "staged" or "unstaged"
+    if diff_type == "staged":  # --cached, and --unified=0 for no context lines
+        return subprocess.check_output(
+            ["git", "diff", "--cached", "--unified=0"], text=True)
+    elif diff_type == "unstaged":
+        return subprocess.check_output(["git", "diff", "--unified=0"],
+                                       text=True)
+    else:
+        raise ValueError(f"Unknown diff_type: {diff_type}")
+
+
+def main():  # returns the exit status: 1 if any violation was found, else 0
+    all_violations = []
+    for diff_type in ["staged", "unstaged"]:
+        try:
+            diff_output = get_diff(diff_type)
+            violations = parse_diff(diff_output)
+            all_violations.extend(violations)
+        except subprocess.CalledProcessError as e:  # report, keep going
+            print(f"[{diff_type}] Git diff failed: {e}", file=sys.stderr)
+
+    if all_violations:
+        print("❌ Forbidden direct `import triton` detected."
+              " ➤ Use `from vllm.triton_utils import triton` instead.\n")
+        for v in all_violations:
+            print(f"❌ {v}")
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tools/enforce_regex_import.py b/tools/enforce_regex_import.py
new file mode 100644
index 000000000000..b55c4a94eac8
--- /dev/null
+++ b/tools/enforce_regex_import.py
@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import subprocess
+from pathlib import Path
+
+import regex as re
+
+FORBIDDEN_PATTERNS = re.compile(  # "import re ..." / "from re import ..."
+    r'^\s*(?:import\s+re(?:$|\s|,)|from\s+re\s+import)')
+ALLOWED_PATTERNS = [  # "import regex as re" and plain "import regex"
+    re.compile(r'^\s*import\s+regex\s+as\s+re\s*$'),
+    re.compile(r'^\s*import\s+regex\s*$'),
+]
+
+
+def get_staged_python_files() -> list[str]:  # staged added/modified .py
+    try:
+        result = subprocess.run(
+            ['git', 'diff', '--cached', '--name-only', '--diff-filter=AM'],
+            capture_output=True,
+            text=True,
+            check=True)
+        files = result.stdout.strip().split(
+            '\n') if result.stdout.strip() else []  # no output -> no files
+        return [f for f in files if f.endswith('.py')]
+    except subprocess.CalledProcessError:  # git failed -> nothing to check
+        return []
+
+
+def is_forbidden_import(line: str) -> bool:  # True for an import of `re`
+    line = line.strip()
+    return bool(
+        FORBIDDEN_PATTERNS.match(line)
+        and not any(pattern.match(line) for pattern in ALLOWED_PATTERNS))
+
+
+def check_file(filepath: str) -> list[tuple[int, str]]:  # (line no., text)
+    violations = []
+    try:
+        with open(filepath, encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):  # 1-based line numbers
+                if is_forbidden_import(line):
+                    violations.append((line_num, line.strip()))
+    except (OSError, UnicodeDecodeError):  # unreadable: keep what was found
+        pass
+    return violations
+
+
+def main() -> int:  # exit status: 1 if any violation was found, else 0
+    files = get_staged_python_files()
+    if not files:
+        return 0
+
+    total_violations = 0
+
+    for filepath in files:
+        if not Path(filepath).exists():  # staged path not present on disk
+            continue
+
+        violations = check_file(filepath)  # reads the file from disk
+        if violations:
+            print(f"\n❌ {filepath}:")
+            for line_num, line in violations:
+                print(f"  Line {line_num}: {line}")
+                total_violations += 1
+
+    if total_violations > 0:
+        print(f"\n💡 Found {total_violations} violation(s).")
+        print("❌ Please replace 'import re' with 'import regex as re'")
+        print(
+            "   Also replace 'from re import ...' with 'from regex import ...'"
+        )  # noqa: E501
+        print("✅ Allowed imports:")
+        print("   - import regex as re")
+        print("   - import regex")  # noqa: E501
+        return 1
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tools/install_nixl.sh b/tools/install_nixl.sh
new file mode 100644
index 000000000000..56717cfb77f7
--- /dev/null
+++ b/tools/install_nixl.sh
@@ -0,0 +1,109 @@
+#!/bin/bash
+# Usage: ./install_nixl.sh [--force]   (--force: install even if detected)
+
+FORCE=false  # set to true by --force as the first argument
+if [ "$1" == "--force" ]; then
+    FORCE=true
+fi
+
+SUDO=false  # true only if sudo exists and `sudo -n true` succeeds
+if command -v sudo >/dev/null 2>&1 && sudo -n true 2>/dev/null; then
+    SUDO=true
+fi
+
+ARCH=$(uname -m)  # used below in the lib/$ARCH-linux-gnu library path
+
+ROOT_DIR="/usr/local"  # install prefix root for gdrcopy, UCX and NIXL
+mkdir -p "$ROOT_DIR"
+GDR_HOME="$ROOT_DIR/gdrcopy"
+UCX_HOME="$ROOT_DIR/ucx"
+NIXL_HOME="$ROOT_DIR/nixl"
+CUDA_HOME=/usr/local/cuda
+
+export PATH="$GDR_HOME/bin:$UCX_HOME/bin:$NIXL_HOME/bin:$PATH"
+export LD_LIBRARY_PATH="$GDR_HOME/lib:$UCX_HOME/lib:$NIXL_HOME/lib/$ARCH-linux-gnu:$LD_LIBRARY_PATH"
+
+TEMP_DIR="nixl_installer"  # download/build dir, relative to the current dir
+mkdir -p "$TEMP_DIR"
+cd "$TEMP_DIR"
+
+pip install meson ninja pybind11  # meson and ninja are used for NIXL below
+
+if [ ! -e "/dev/gdrdrv" ] || [ "$FORCE" = true ]; then  # no device node, or --force
+    echo "Installing gdrcopy"
+    wget https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v2.5.tar.gz
+    tar xzf v2.5.tar.gz; rm v2.5.tar.gz
+    cd gdrcopy-2.5
+    make prefix=$GDR_HOME CUDA=$CUDA_HOME all install
+    # insmod.sh is only run via sudo; otherwise print how to run it by hand.
+    if $SUDO; then
+        echo "Running insmod.sh with sudo"
+        sudo ./insmod.sh
+    else
+        echo "Skipping insmod.sh - sudo not available"
+        echo "Please run 'cd $PWD && sudo ./insmod.sh' manually if needed"
+    fi
+
+    cd ..
+else
+    echo "Found /dev/gdrdrv. Skipping gdrcopy installation"
+fi
+
+if ! command -v ucx_info &> /dev/null || [ "$FORCE" = true ]; then  # ucx_info not on PATH, or --force
+    echo "Installing UCX"
+    wget https://github.com/openucx/ucx/releases/download/v1.18.0/ucx-1.18.0.tar.gz
+    tar xzf ucx-1.18.0.tar.gz; rm ucx-1.18.0.tar.gz
+    cd ucx-1.18.0
+    
+    # Add Mellanox options if lspci lists a Mellanox device or ibstat exists
+    MLX_OPTS=""
+    if lspci | grep -i mellanox > /dev/null || command -v ibstat > /dev/null; then
+        echo "Mellanox NIC detected, adding Mellanox-specific options"
+        MLX_OPTS="--with-rdmacm \
+                  --with-mlx5-dv \
+                  --with-ib-hw-tm"
+    fi
+    
+    ./configure  --prefix=$UCX_HOME                \
+                --enable-shared                    \
+                --disable-static                   \
+                --disable-doxygen-doc              \
+                --enable-optimizations             \
+                --enable-cma                       \
+                --enable-devel-headers             \
+                --with-cuda=$CUDA_HOME             \
+                --with-dm                          \
+                --with-gdrcopy=$GDR_HOME           \
+                --with-verbs                       \
+                --enable-mt                        \
+                $MLX_OPTS  # unquoted: splits into separate flags; empty if unset
+    make -j  # NOTE(review): no job limit is given; consider -j"$(nproc)"
+    make -j install-strip
+    
+    if $SUDO; then
+        echo "Running ldconfig with sudo"
+        sudo ldconfig
+    else
+        echo "Skipping ldconfig - sudo not available"
+        echo "Please run 'sudo ldconfig' manually if needed"
+    fi
+
+    cd ..
+else
+    echo "Found existing UCX. Skipping UCX installation"  
+fi
+
+if ! command -v nixl_test &> /dev/null || [ "$FORCE" = true ]; then  # nixl_test not on PATH, or --force
+    echo "Installing NIXL"
+    wget https://github.com/ai-dynamo/nixl/archive/refs/tags/0.2.0.tar.gz
+    tar xzf 0.2.0.tar.gz; rm 0.2.0.tar.gz
+    cd nixl-0.2.0
+    meson setup build --prefix=$NIXL_HOME -Ducx_path=$UCX_HOME  # build against $UCX_HOME
+    cd build
+    ninja
+    ninja install
+
+    cd ../..  # leave nixl-0.2.0/build and return to $TEMP_DIR
+else
+    echo "Found existing NIXL. Skipping NIXL installation"  
+fi
diff --git a/tools/update-dockerfile-graph.sh b/tools/update-dockerfile-graph.sh
index a1e22a69cdc7..88189e8ab208 100755
--- a/tools/update-dockerfile-graph.sh
+++ b/tools/update-dockerfile-graph.sh
@@ -24,7 +24,7 @@ if printf '%s\n' "${FILES[@]}" | grep -q "^docker/Dockerfile$"; then
   fi
 
   # Define the target file path
-  TARGET_GRAPH_FILE="docs/source/assets/contributing/dockerfile-stages-dependency.png"
+  TARGET_GRAPH_FILE="docs/assets/contributing/dockerfile-stages-dependency.png"
 
   # Ensure target directory exists
   mkdir -p "$(dirname "$TARGET_GRAPH_FILE")"
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 3ce4944e54cc..a422f5e5d5c1 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1103,7 +1103,6 @@ def scaled_fp4_experts_quant(
     blockscale_offsets: torch.Tensor,
     topk: int,
     expert_map: Optional[torch.Tensor] = None,
-    MAX_TOKENS_PER_EXPERT: int = 163840,
 ) -> tuple[torch.Tensor, torch.Tensor]:
     """
     Quantize input tensor to FP4 and return quantized tensor and scale, for
@@ -1125,9 +1124,16 @@ def scaled_fp4_experts_quant(
     input_tensor = input_tensor[
         expert_map] if expert_map is not None else input_tensor
     m_numtopk, k = input_tensor.shape
+    # Control the maximum number of tokens per expert supported by the
+    # NVFP4 MoE Expert Quantization. This is used to prevent the kernel
+    # from running out of memory. This value can also be increased to support
+    # larger models.
+    MAX_TOKENS_PER_EXPERT = envs.VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE
     assert (m_numtopk <= MAX_TOKENS_PER_EXPERT * topk), (
-        f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT * topk for"
-        f" scaled_fp4_experts_quant kernel, observed m_numtopk = {m_numtopk}")
+        f"m_numtopk must be less than MAX_TOKENS_PER_EXPERT("
+        f"{MAX_TOKENS_PER_EXPERT})"
+        f" for cutlass_moe_fp4, observed m_numtopk = {m_numtopk}. Use"
+        f" VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE to set this value.")
     scales_k = k // 16
     padded_k = (scales_k + (4 - 1)) // 4
 
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index f0608485b5f8..baaa14efafe6 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -863,7 +863,8 @@ def forward(
             gqa_ratio = num_heads // self.num_kv_heads
             use_custom = use_rocm_custom_paged_attention(
                 decode_query.dtype, head_size, block_size, gqa_ratio,
-                decode_meta.max_decode_seq_len, self.sliding_window)
+                decode_meta.max_decode_seq_len, self.sliding_window,
+                self.kv_cache_dtype, self.alibi_slopes)
             if use_custom:
                 max_seq_len = (decode_meta.max_decode_seq_len if self.attn_type
                                != AttentionType.ENCODER_DECODER else
diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py
index 26cec3752c96..7e25fa96518f 100644
--- a/vllm/attention/ops/chunked_prefill_paged_decode.py
+++ b/vllm/attention/ops/chunked_prefill_paged_decode.py
@@ -288,7 +288,8 @@ def chunked_prefill_paged_decode(
     use_custom = use_rocm_custom_paged_attention(query.dtype, head_size,
                                                  block_size,
                                                  num_queries_per_kv,
-                                                 max_seq_len, sliding_window)
+                                                 max_seq_len, sliding_window,
+                                                 kv_cache_dtype, alibi_slopes)
     if use_custom:
         _PARTITION_SIZE_ROCM = 256
         max_num_partitions = ((max_seq_len + _PARTITION_SIZE_ROCM - 1) //
diff --git a/vllm/attention/ops/triton_unified_attention.py b/vllm/attention/ops/triton_unified_attention.py
index 241e84ca669d..4bced779785a 100644
--- a/vllm/attention/ops/triton_unified_attention.py
+++ b/vllm/attention/ops/triton_unified_attention.py
@@ -31,8 +31,8 @@ def apply_softcap(S, x):
 def kernel_unified_attention_2d(
     output_ptr,  # [num_tokens, num_query_heads, head_size]
     query_ptr,  # [num_tokens, num_query_heads, head_size]
-    key_cache_ptr,  # [num_blks, num_kv_heads, head_size // x, blk_size, x]
-    value_cache_ptr,  # [num_blks, num_kv_heads, head_size, blk_size]
+    key_cache_ptr,  # [num_blks, blk_size, num_kv_heads, head_size]
+    value_cache_ptr,  # [num_blks, blk_size, num_kv_heads, head_size]
     block_tables_ptr,  # [num_seqs, max_num_blocks_per_seq]
     seq_lens_ptr,  # [num_seqs]
     alibi_slopes_ptr,  # [num_query_heads]
diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py
index fab44fb6062d..74a9b2b03391 100644
--- a/vllm/benchmarks/datasets.py
+++ b/vllm/benchmarks/datasets.py
@@ -13,7 +13,6 @@
 TODO: Implement CustomDataset to parse a JSON file and convert its contents into
 SampleRequest instances, similar to the approach used in ShareGPT.
 """
-
 import base64
 import io
 import json
@@ -33,6 +32,7 @@
 from vllm.lora.request import LoRARequest
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
+from vllm.multimodal.image import convert_image_mode
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 
 logger = logging.getLogger(__name__)
@@ -129,16 +129,17 @@ def get_random_lora_request(
 
         Args:
             tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
-            LoRA is selected.  max_loras (Optional[int]): The maximum number of
-            LoRAs available. If None, LoRA is not used.  lora_path
-            (Optional[str]): Path to the LoRA parameters on disk. If None, LoRA
-            is not used.
+                LoRA is selected.
+            max_loras (Optional[int]): The maximum number of LoRAs available.
+                If `None`, LoRA is not used.
+            lora_path (Optional[str]): Path to the LoRA parameters on disk.
+                If `None`, LoRA is not used.
 
         Returns:
-            tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
-            element is a LoRARequest (or None if not applicable) and the second
-            element is the tokenizer associated with the LoRA request (or the
-            base tokenizer).
+            A tuple with the following elements:
+                - A new [LoRARequest][] (or `None` if not applicable).
+                - The tokenizer associated with the LoRA request
+                  (or the base tokenizer).
         """
         if max_loras is None or lora_path is None:
             return None, tokenizer
@@ -167,7 +168,7 @@ def sample(self, tokenizer: PreTrainedTokenizerBase,
 
         Args:
             tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
-             for processing the dataset's text.
+                for processing the dataset's text.
             num_requests (int): The number of sample requests to generate.
 
         Returns:
@@ -184,7 +185,8 @@ def maybe_oversample_requests(self, requests: list[SampleRequest],
 
         Args:
             requests (List[SampleRequest]): The current list of sampled
-            requests.  num_requests (int): The target number of requests.
+                requests.
+            num_requests (int): The target number of requests.
         """
         if len(requests) < num_requests:
             random.seed(self.random_seed)
@@ -259,7 +261,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
     if isinstance(image, dict) and 'bytes' in image:
         image = Image.open(BytesIO(image['bytes']))
     if isinstance(image, Image.Image):
-        image = image.convert("RGB")
+        image = convert_image_mode(image, "RGB")
         with io.BytesIO() as image_data:
             image.save(image_data, format="JPEG")
             image_base64 = base64.b64encode(
diff --git a/vllm/benchmarks/latency.py b/vllm/benchmarks/latency.py
index 06f6848f50cb..2c992727b139 100644
--- a/vllm/benchmarks/latency.py
+++ b/vllm/benchmarks/latency.py
@@ -80,6 +80,9 @@ def add_cli_args(parser: argparse.ArgumentParser):
     )
 
     parser = EngineArgs.add_cli_args(parser)
+    # V1 enables prefix caching by default which skews the latency
+    # numbers. We need to disable prefix caching by default.
+    parser.set_defaults(enable_prefix_caching=False)
 
 
 def main(args: argparse.Namespace):
diff --git a/vllm/collect_env.py b/vllm/collect_env.py
index 85746b7ef606..86eb465b8f65 100644
--- a/vllm/collect_env.py
+++ b/vllm/collect_env.py
@@ -815,4 +815,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
+    main()
\ No newline at end of file
diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py
index 0c1381a565c1..8114cddcd9fa 100644
--- a/vllm/compilation/backends.py
+++ b/vllm/compilation/backends.py
@@ -6,9 +6,7 @@
 import pprint
 import time
 from collections.abc import Sequence
-from contextlib import ExitStack
 from typing import Any, Callable, Optional
-from unittest.mock import patch
 
 import torch
 import torch.fx as fx
@@ -16,13 +14,13 @@
 import vllm.envs as envs
 from vllm.config import CompilationConfig, VllmConfig
 from vllm.logger import init_logger
-from vllm.utils import weak_ref_tensors
+from vllm.platforms import current_platform
+from vllm.utils import resolve_obj_by_qualname
 
 from .compiler_interface import (CompilerInterface, EagerAdaptor,
                                  InductorAdaptor, InductorStandaloneAdaptor)
 from .counter import compilation_counter
 from .inductor_pass import InductorPass
-from .monitor import end_monitoring_torch_compile
 from .pass_manager import PostGradPassManager
 
 logger = init_logger(__name__)
@@ -297,7 +295,9 @@ def call_module(self, target: torch.fx.node.Target,
                 num_graphs=len(self.compile_submod_names),
                 runtime_shape=None)
 
-            self.module.__dict__[target] = PiecewiseBackend(
+            piecewise_backend = resolve_obj_by_qualname(
+                current_platform.get_piecewise_backend_cls())
+            self.module.__dict__[target] = piecewise_backend(
                 submod, self.vllm_config, self.graph_pool, index,
                 len(self.compile_submod_names), sym_shape_indices,
                 compiled_graph_for_general_shape, self.vllm_backend)
@@ -341,7 +341,7 @@ def __init__(
     ):
         global global_graph_pool
         if global_graph_pool is None:
-            global_graph_pool = torch.cuda.graph_pool_handle()
+            global_graph_pool = current_platform.graph_pool_handle()
 
         # TODO: in the future, if we want to use multiple
         # streams, it might not be safe to share a global pool.
@@ -558,197 +558,3 @@ def copy_and_call(*args):
             return self.split_gm(*list_args)
 
         return copy_and_call
-
-
-@dataclasses.dataclass
-class ConcreteSizeEntry:
-    runtime_shape: int
-    need_to_compile: bool  # the size is in compile_sizes
-    use_cudagraph: bool  # the size is in cudagraph_capture_sizes
-
-    compiled: bool = False
-    runnable: Callable = None  # type: ignore
-    num_finished_warmup: int = 0
-    cudagraph: Optional[torch.cuda.CUDAGraph] = None
-    output: Optional[Any] = None
-
-    # for cudagraph debugging, track the input addresses
-    # during capture, and check if they are the same during replay
-    input_addresses: Optional[list[int]] = None
-
-
-class PiecewiseBackend:
-
-    def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
-                 graph_pool: Any, piecewise_compile_index: int,
-                 total_piecewise_compiles: int, sym_shape_indices: list[int],
-                 compiled_graph_for_general_shape: Callable,
-                 vllm_backend: VllmBackend):
-        """
-        The backend for piecewise compilation.
-        It mainly handles the compilation and cudagraph capturing.
-
-        We will compile `self.graph` once for the general shape,
-        and then compile for different shapes specified in
-        `compilation_config.compile_sizes`.
-
-        Independently, we will capture cudagraph for different shapes.
-
-        If a shape needs both compilation and cudagraph, we will
-        compile it first, and then capture cudagraph.
-        """
-        self.graph = graph
-        self.vllm_config = vllm_config
-        self.compilation_config = vllm_config.compilation_config
-        self.graph_pool = graph_pool
-        self.piecewise_compile_index = piecewise_compile_index
-        self.total_piecewise_compiles = total_piecewise_compiles
-        self.vllm_backend = vllm_backend
-
-        self.is_first_graph = piecewise_compile_index == 0
-        self.is_last_graph = (
-            piecewise_compile_index == total_piecewise_compiles - 1)
-
-        self.compile_sizes: set[int] = set(
-            self.compilation_config.compile_sizes)
-        self.cudagraph_capture_sizes: set[int] = set(
-            self.compilation_config.cudagraph_capture_sizes
-        ) if self.compilation_config.use_cudagraph else set()
-
-        self.first_run_finished = False
-
-        self.compiled_graph_for_general_shape = compiled_graph_for_general_shape  # noqa
-
-        self.sym_shape_indices = sym_shape_indices
-
-        self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"
-
-        # the entries for different shapes that we need to either
-        # compile or capture cudagraph
-        self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {}
-
-        # to_be_compiled_sizes tracks the remaining sizes to compile,
-        # and updates during the compilation process, so we need to copy it
-        self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy()
-        for shape in self.compile_sizes.union(self.cudagraph_capture_sizes):
-            self.concrete_size_entries[shape] = ConcreteSizeEntry(
-                runtime_shape=shape,
-                need_to_compile=shape in self.compile_sizes,
-                use_cudagraph=shape in self.cudagraph_capture_sizes,
-            )
-
-    def check_for_ending_compilation(self):
-        if self.is_last_graph and not self.to_be_compiled_sizes:
-            # no specific sizes to compile
-            # save the hash of the inductor graph for the next run
-            self.vllm_backend.compiler_manager.save_to_file()
-            end_monitoring_torch_compile(self.vllm_config)
-
-    def __call__(self, *args) -> Any:
-        if not self.first_run_finished:
-            self.first_run_finished = True
-            self.check_for_ending_compilation()
-            return self.compiled_graph_for_general_shape(*args)
-
-        runtime_shape = args[self.sym_shape_indices[0]]
-        if runtime_shape not in self.concrete_size_entries:
-            # we don't need to do anything for this shape
-            return self.compiled_graph_for_general_shape(*args)
-
-        entry = self.concrete_size_entries[runtime_shape]
-
-        if entry.runnable is None:
-            entry.runnable = self.compiled_graph_for_general_shape
-
-        if entry.need_to_compile and not entry.compiled:
-            entry.compiled = True
-            self.to_be_compiled_sizes.remove(runtime_shape)
-            # args are real arguments
-            entry.runnable = self.vllm_backend.compiler_manager.compile(
-                self.graph,
-                args,
-                self.compilation_config.inductor_compile_config,
-                self.compilation_config,
-                graph_index=self.piecewise_compile_index,
-                num_graphs=self.total_piecewise_compiles,
-                runtime_shape=runtime_shape)
-
-            # finished compilations for all required shapes
-            if self.is_last_graph and not self.to_be_compiled_sizes:
-                self.check_for_ending_compilation()
-
-        if not entry.use_cudagraph:
-            return entry.runnable(*args)
-
-        if entry.cudagraph is None:
-            if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups:  # noqa
-                entry.num_finished_warmup += 1
-                if self.is_first_graph:
-                    logger.debug(
-                        "Warming up %s/%s for shape %s",
-                        entry.num_finished_warmup,
-                        self.compilation_config.cudagraph_num_of_warmups,
-                        runtime_shape)
-                return entry.runnable(*args)
-
-            if self.is_first_graph:
-                # Since we capture cudagraph for many different shapes and
-                # capturing is fast, we don't need to log it for every shape.
-                # We only log it in the debug mode.
-                logger.debug("Capturing a cudagraph for shape %s",
-                             runtime_shape)
-
-            input_addresses = [
-                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
-            ]
-            entry.input_addresses = input_addresses
-            cudagraph = torch.cuda.CUDAGraph()
-
-            with ExitStack() as stack:
-                if not self.is_first_graph:
-                    # during every model forward, we will capture
-                    # many pieces of cudagraphs (roughly one per layer).
-                    # running gc again and again across layers will
-                    # make the cudagraph capture very slow.
-                    # therefore, we only run gc for the first graph,
-                    # and disable gc for the rest of the graphs.
-                    stack.enter_context(patch("gc.collect", lambda: None))
-                    stack.enter_context(
-                        patch("torch.cuda.empty_cache", lambda: None))
-
-                # mind-exploding: carefully manage the reference and memory.
-                with torch.cuda.graph(cudagraph, pool=self.graph_pool):
-                    # `output` is managed by pytorch's cudagraph pool
-                    output = entry.runnable(*args)
-                    if self.is_last_graph:
-                        # by converting it to weak ref,
-                        # the original `output` will immediately be released
-                        # to save memory. It is only safe to do this for
-                        # the last graph, because the output of the last graph
-                        # will not be used by any other cuda graph.
-                        output = weak_ref_tensors(output)
-
-            # here we always use weak ref for the output
-            # to save memory
-            entry.output = weak_ref_tensors(output)
-            entry.cudagraph = cudagraph
-
-            compilation_counter.num_cudagraph_caputured += 1
-
-            # important: we need to return the output, rather than
-            # the weak ref of the output, so that pytorch can correctly
-            # manage the memory during cuda graph capture
-            return output
-
-        if self.is_debugging_mode:
-            # check if the input addresses are the same
-            new_input_addresses = [
-                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
-            ]
-            assert new_input_addresses == entry.input_addresses, (
-                "Input addresses for cudagraphs are different during replay."
-                f" Expected {entry.input_addresses}, got {new_input_addresses}"
-            )
-
-        entry.cudagraph.replay()
-        return entry.output
diff --git a/vllm/compilation/base_piecewise_backend.py b/vllm/compilation/base_piecewise_backend.py
new file mode 100644
index 000000000000..84d1e1f77739
--- /dev/null
+++ b/vllm/compilation/base_piecewise_backend.py
@@ -0,0 +1,71 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Callable, Protocol
+
+import torch.fx as fx
+
+from vllm.compilation.backends import VllmBackend
+from vllm.config import VllmConfig
+
+
+class AbstractPiecewiseBackend(Protocol):
+    """
+    PiecewiseBackend interface that allows platforms to extend 
+    piecewise static graph.
+    """
+
+    def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
+                 graph_pool: Any, piecewise_compile_index: int,
+                 total_piecewise_compiles: int, sym_shape_indices: list[int],
+                 compiled_graph_for_general_shape: Callable,
+                 vllm_backend: VllmBackend, **kwargs):
+        """
+        Initializes the PiecewiseBackend class with compilation and 
+        execution-related configurations.
+
+        This class handles piecewise compilation, graph capturing, 
+        and dispatching for specific input shapes.
+
+        Args:
+            graph (fx.GraphModule): The graph represented in fx.
+            vllm_config (VllmConfig): Global configuration for vLLM.
+            graph_pool (Any): 
+                Graph memory pool handle, e.g., 
+                    `torch.cuda.graph_pool_handle()`.
+            piecewise_compile_index (int): 
+                Index of the current piecewise subgraph.
+            total_piecewise_compiles (int): 
+                Total number of piecewise-compiled graphs.
+            sym_shape_indices (list[int]): 
+                Indices of symbolic shape.
+            compiled_graph_for_general_shape (Callable): 
+                Callable that executes the graph compiled for general shapes.
+            vllm_backend (VllmBackend): 
+                Backend compiler that manages compilation and graph runtime 
+                for vLLM.
+
+        Keyword Args:
+            kwargs: Additional keyword arguments reserved for future 
+                extensions or custom platforms.
+        """
+        raise NotImplementedError
+
+    def __call__(self, *args) -> Any:
+        """Executes the compiled graph for given input args.
+
+        If this is the first invocation, executes the general compiled graph
+        and initiates the compilation process tracking. For subsequent calls,
+        dynamically dispatches execution to either a compiled graph or a static
+        graph based on the input shape.
+
+        Args:
+            *args: Variable length input arguments to be passed into the 
+                graph. The symbolic shape is expected to be in position 
+                `sym_shape_indices[0]`.
+
+        Returns:
+            Any: Output of the executed graph. This can be from the general
+            compiled graph, a specialized compiled version for the given shape,
+            or a replayed static graph.
+        """
+        raise NotImplementedError
diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py
new file mode 100644
index 000000000000..f651ee6912ab
--- /dev/null
+++ b/vllm/compilation/collective_fusion.py
@@ -0,0 +1,126 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional
+
+import torch
+import torch._inductor.pattern_matcher as pm
+import torch.fx as fx
+from torch._inductor.pattern_matcher import PatternMatcherPass
+from torch.distributed._symmetric_memory import enable_symm_mem_for_group
+
+from vllm.config import VllmConfig
+from vllm.distributed import get_tp_group
+from vllm.distributed.parallel_state import (
+    get_tensor_model_parallel_world_size)
+from vllm.logger import init_logger
+
+from .vllm_inductor_pass import VllmInductorPass
+
+logger = init_logger(__name__)
+
+
+class BasePattern:
+
+    def __init__(self, dtype: torch.dtype, device: str):
+        self.dtype = dtype
+        self.device = device
+        self.tp = get_tp_group()
+        self.tp_size = get_tensor_model_parallel_world_size()
+
+
+class GEMMReduceScatterPattern(BasePattern):
+
+    def get_inputs(self):
+        mul = torch.empty([16, 4], device=self.device, dtype=self.dtype)
+        mm_weight = torch.empty([4, 4], device=self.device, dtype=self.dtype)
+        return [mul, mm_weight]
+
+    def register(self, pm_pass: PatternMatcherPass):
+
+        def pattern(mul: torch.Tensor, mm_weight: torch.Tensor):
+            mm = torch.ops.aten.mm.default(mul, mm_weight)
+            reduce_scatter = torch.ops.vllm.reduce_scatter.default(
+                mm,
+                dim=0,
+                world_size=self.tp_size,
+                group_name=self.tp.unique_name)
+            return reduce_scatter
+
+        def replacement(mul: torch.Tensor, mm_weight: torch.Tensor):
+            gemm_rs = torch.ops.symm_mem.fused_matmul_reduce_scatter(
+                mul,
+                mm_weight,
+                "avg",
+                scatter_dim=0,
+                group_name=self.tp.device_group.group_name,
+            )
+
+            return gemm_rs
+
+        pm.register_replacement(pattern, replacement, self.get_inputs(),
+                                pm.fwd_only, pm_pass)
+
+
+class AllGatherGEMMPattern(BasePattern):
+
+    def get_inputs(self):
+        x = torch.empty([4, 4], device=self.device, dtype=self.dtype)
+        weight = torch.empty([4, 4], device=self.device, dtype=self.dtype)
+
+        return [x, weight]
+
+    def register(self, pm_pass: PatternMatcherPass):
+        # Rewrite all_gather followed by mm into the fused symm_mem op.
+        def pattern(
+            x: torch.Tensor,
+            weight: torch.Tensor,
+        ) -> torch.Tensor:
+            all_gather = torch.ops.vllm.all_gather.default(
+                x,
+                dim=0,
+                world_size=self.tp_size,
+                group_name=self.tp.unique_name)
+
+            return torch.ops.aten.mm.default(all_gather, weight)
+
+        def replacement(
+                x: torch.Tensor,
+                weight: torch.Tensor) -> list[torch.Tensor]:
+            ag_output, mm_outputs = torch.ops.symm_mem.fused_all_gather_matmul(
+                x,
+                [weight],
+                gather_dim=0,
+                group_name=self.tp.device_group.group_name,
+            )
+            return mm_outputs
+
+        pm.register_replacement(pattern, replacement, self.get_inputs(),
+                                pm.fwd_only, pm_pass)
+
+
+class AsyncTPPass(VllmInductorPass):
+
+    def __init__(self, config: VllmConfig):
+        super().__init__(config)
+
+        # Enable symmetric memory for the TP process group
+        enable_symm_mem_for_group(get_tp_group().device_group.group_name)
+        self.patterns: PatternMatcherPass = PatternMatcherPass(
+            pass_name="async_tp_pass")
+        GEMMReduceScatterPattern(self.model_dtype,
+                                 self.device).register(self.patterns)
+
+        AllGatherGEMMPattern(self.model_dtype,
+                             self.device).register(self.patterns)
+
+    def is_applicable_for_shape(self, shape: Optional[int]) -> bool:
+        # only do replace for specific shapes
+        tp_size = get_tensor_model_parallel_world_size()
+        return shape is not None and shape % tp_size == 0
+
+    def __call__(self, graph: fx.Graph):
+        self.begin()
+        self.dump_graph(graph, "before_async_tp_pass")
+        count = self.patterns.apply(graph)
+        logger.debug("Replaced %s patterns", count)
+        self.dump_graph(graph, "after_async_tp_pass")
+        self.end_and_log()
diff --git a/vllm/compilation/compiler_interface.py b/vllm/compilation/compiler_interface.py
index 89a131e8ea24..21af5eb76ee8 100644
--- a/vllm/compilation/compiler_interface.py
+++ b/vllm/compilation/compiler_interface.py
@@ -39,7 +39,8 @@ def compute_hash(self, vllm_config: VllmConfig) -> str:
         Gather all the relevant information from the vLLM config,
         to compute a hash so that we can cache the compiled model.
 
-        See {meth}`VllmConfig.compute_hash` to check what information
+        See [`VllmConfig.compute_hash`][vllm.config.VllmConfig.compute_hash]
+        to check what information
         is already considered by default. This function should only
         consider the information that is specific to the compiler.
         """
diff --git a/vllm/compilation/cuda_piecewise_backend.py b/vllm/compilation/cuda_piecewise_backend.py
new file mode 100644
index 000000000000..0ad480e28cd7
--- /dev/null
+++ b/vllm/compilation/cuda_piecewise_backend.py
@@ -0,0 +1,213 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import dataclasses
+from contextlib import ExitStack
+from typing import Any, Callable, Optional
+from unittest.mock import patch
+
+import torch
+import torch.fx as fx
+
+import vllm.envs as envs
+from vllm.compilation.backends import VllmBackend
+from vllm.compilation.counter import compilation_counter
+from vllm.compilation.monitor import end_monitoring_torch_compile
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.utils import weak_ref_tensors
+
+logger = init_logger(__name__)
+
+
+@dataclasses.dataclass
+class ConcreteSizeEntry:
+    runtime_shape: int
+    need_to_compile: bool  # the size is in compile_sizes
+    use_cudagraph: bool  # the size is in cudagraph_capture_sizes
+
+    compiled: bool = False
+    runnable: Callable = None  # type: ignore
+    num_finished_warmup: int = 0
+    cudagraph: Optional[torch.cuda.CUDAGraph] = None
+    output: Optional[Any] = None
+
+    # for cudagraph debugging, track the input addresses
+    # during capture, and check if they are the same during replay
+    input_addresses: Optional[list[int]] = None
+
+
+class CUDAPiecewiseBackend:
+
+    def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,
+                 graph_pool: Any, piecewise_compile_index: int,
+                 total_piecewise_compiles: int, sym_shape_indices: list[int],
+                 compiled_graph_for_general_shape: Callable,
+                 vllm_backend: VllmBackend):
+        """
+        The backend for piecewise compilation.
+        It mainly handles the compilation and cudagraph capturing.
+
+        We will compile `self.graph` once for the general shape,
+        and then compile for different shapes specified in
+        `compilation_config.compile_sizes`.
+
+        Independently, we will capture cudagraph for different shapes.
+
+        If a shape needs both compilation and cudagraph, we will
+        compile it first, and then capture cudagraph.
+        """
+        self.graph = graph
+        self.vllm_config = vllm_config
+        self.compilation_config = vllm_config.compilation_config
+        self.graph_pool = graph_pool
+        self.piecewise_compile_index = piecewise_compile_index
+        self.total_piecewise_compiles = total_piecewise_compiles
+        self.vllm_backend = vllm_backend
+
+        self.is_first_graph = piecewise_compile_index == 0
+        self.is_last_graph = (
+            piecewise_compile_index == total_piecewise_compiles - 1)
+
+        self.compile_sizes: set[int] = set(
+            self.compilation_config.compile_sizes)
+        self.cudagraph_capture_sizes: set[int] = set(
+            self.compilation_config.cudagraph_capture_sizes
+        ) if self.compilation_config.use_cudagraph else set()
+
+        self.first_run_finished = False
+
+        self.compiled_graph_for_general_shape = compiled_graph_for_general_shape  # noqa
+
+        self.sym_shape_indices = sym_shape_indices
+
+        self.is_debugging_mode = envs.VLLM_LOGGING_LEVEL == "DEBUG"
+
+        # the entries for different shapes that we need to either
+        # compile or capture cudagraph
+        self.concrete_size_entries: dict[int, ConcreteSizeEntry] = {}
+
+        # to_be_compiled_sizes tracks the remaining sizes to compile,
+        # and updates during the compilation process, so we need to copy it
+        self.to_be_compiled_sizes: set[int] = self.compile_sizes.copy()
+        for shape in self.compile_sizes.union(self.cudagraph_capture_sizes):
+            self.concrete_size_entries[shape] = ConcreteSizeEntry(
+                runtime_shape=shape,
+                need_to_compile=shape in self.compile_sizes,
+                use_cudagraph=shape in self.cudagraph_capture_sizes,
+            )
+
+    def check_for_ending_compilation(self):
+        if self.is_last_graph and not self.to_be_compiled_sizes:
+            # no sizes left to compile
+            # save the hash of the inductor graph for the next run
+            self.vllm_backend.compiler_manager.save_to_file()
+            end_monitoring_torch_compile(self.vllm_config)
+
+    def __call__(self, *args) -> Any:
+        if not self.first_run_finished:
+            self.first_run_finished = True
+            self.check_for_ending_compilation()
+            return self.compiled_graph_for_general_shape(*args)
+
+        runtime_shape = args[self.sym_shape_indices[0]]
+        if runtime_shape not in self.concrete_size_entries:
+            # we don't need to do anything for this shape
+            return self.compiled_graph_for_general_shape(*args)
+
+        entry = self.concrete_size_entries[runtime_shape]
+
+        if entry.runnable is None:
+            entry.runnable = self.compiled_graph_for_general_shape
+
+        if entry.need_to_compile and not entry.compiled:
+            entry.compiled = True
+            self.to_be_compiled_sizes.remove(runtime_shape)
+            # args are real arguments
+            entry.runnable = self.vllm_backend.compiler_manager.compile(
+                self.graph,
+                args,
+                self.compilation_config.inductor_compile_config,
+                self.compilation_config,
+                graph_index=self.piecewise_compile_index,
+                num_graphs=self.total_piecewise_compiles,
+                runtime_shape=runtime_shape)
+
+            # finished compilations for all required shapes
+            if self.is_last_graph and not self.to_be_compiled_sizes:
+                self.check_for_ending_compilation()
+
+        if not entry.use_cudagraph:
+            return entry.runnable(*args)
+
+        if entry.cudagraph is None:
+            if entry.num_finished_warmup < self.compilation_config.cudagraph_num_of_warmups:  # noqa
+                entry.num_finished_warmup += 1
+                if self.is_first_graph:
+                    logger.debug(
+                        "Warming up %s/%s for shape %s",
+                        entry.num_finished_warmup,
+                        self.compilation_config.cudagraph_num_of_warmups,
+                        runtime_shape)
+                return entry.runnable(*args)
+
+            if self.is_first_graph:
+                # Since we capture cudagraph for many different shapes and
+                # capturing is fast, we don't need to log it for every shape.
+                # We only log it in the debug mode.
+                logger.debug("Capturing a cudagraph for shape %s",
+                             runtime_shape)
+
+            input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            entry.input_addresses = input_addresses
+            cudagraph = torch.cuda.CUDAGraph()
+
+            with ExitStack() as stack:
+                if not self.is_first_graph:
+                    # during every model forward, we will capture
+                    # many pieces of cudagraphs (roughly one per layer).
+                    # running gc again and again across layers will
+                    # make the cudagraph capture very slow.
+                    # therefore, we only run gc for the first graph,
+                    # and disable gc for the rest of the graphs.
+                    stack.enter_context(patch("gc.collect", lambda: None))
+                    stack.enter_context(
+                        patch("torch.cuda.empty_cache", lambda: None))
+
+                # NOTE: references and memory must be managed carefully here.
+                with torch.cuda.graph(cudagraph, pool=self.graph_pool):
+                    # `output` is managed by pytorch's cudagraph pool
+                    output = entry.runnable(*args)
+                    if self.is_last_graph:
+                        # by converting it to weak ref,
+                        # the original `output` will immediately be released
+                        # to save memory. It is only safe to do this for
+                        # the last graph, because the output of the last graph
+                        # will not be used by any other cuda graph.
+                        output = weak_ref_tensors(output)
+
+            # here we always use weak ref for the output
+            # to save memory
+            entry.output = weak_ref_tensors(output)
+            entry.cudagraph = cudagraph
+
+            compilation_counter.num_cudagraph_caputured += 1
+
+            # important: we need to return the output, rather than
+            # the weak ref of the output, so that pytorch can correctly
+            # manage the memory during cuda graph capture
+            return output
+
+        if self.is_debugging_mode:
+            # check if the input addresses are the same
+            new_input_addresses = [
+                x.data_ptr() for x in args if isinstance(x, torch.Tensor)
+            ]
+            assert new_input_addresses == entry.input_addresses, (
+                "Input addresses for cudagraphs are different during replay."
+                f" Expected {entry.input_addresses}, got {new_input_addresses}"
+            )
+
+        entry.cudagraph.replay()
+        return entry.output
diff --git a/vllm/compilation/pass_manager.py b/vllm/compilation/pass_manager.py
index f4d3fd9b457f..07ebd3e1b7dd 100644
--- a/vllm/compilation/pass_manager.py
+++ b/vllm/compilation/pass_manager.py
@@ -6,6 +6,7 @@
 from vllm.logger import init_logger
 
 from .activation_quant_fusion import ActivationQuantFusionPass
+from .collective_fusion import AsyncTPPass
 from .fix_functionalization import FixFunctionalizationPass
 from .fusion import FusionPass
 from .inductor_pass import CustomGraphPass, InductorPass, get_pass_context
@@ -54,6 +55,8 @@ def configure(self, config: VllmConfig):
 
         if self.pass_config.enable_sequence_parallelism:
             self.passes += [SequenceParallelismPass(config)]
+            if self.pass_config.enable_async_tp:
+                self.passes += [AsyncTPPass(config)]
 
         self.fix_functionalization = FixFunctionalizationPass(config)
 
diff --git a/vllm/compilation/sequence_parallelism.py b/vllm/compilation/sequence_parallelism.py
index f0476bfcb65a..17dded87fe8d 100644
--- a/vllm/compilation/sequence_parallelism.py
+++ b/vllm/compilation/sequence_parallelism.py
@@ -243,24 +243,25 @@ def __init__(self, config: VllmConfig):
             pass_name="sequence_parallelism_pass")
         for epsilon in [1e-5, 1e-6]:
             EmbeddingAllReduceRMSNormPattern(
-                epsilon, self.dtype, self.device).register(self.patterns)
+                epsilon, self.model_dtype, self.device).register(self.patterns)
 
-            MiddleAllReduceRMSNormPattern(epsilon, self.dtype,
+            MiddleAllReduceRMSNormPattern(epsilon, self.model_dtype,
                                           self.device).register(self.patterns)
 
-            LastAllReduceRMSNormPattern(epsilon, self.dtype,
+            LastAllReduceRMSNormPattern(epsilon, self.model_dtype,
                                         self.device).register(self.patterns)
             # WARNING: This is a hack to clear the pattern matcher cache
             # and allow multiple values of epsilon.
             torch._inductor.pattern_matcher._seen_patterns.clear()
 
     def is_applicable_for_shape(self, shape: Optional[int]) -> bool:
-        # only do replace for specific shapes
         tp_size = get_tensor_model_parallel_world_size()
         return shape is not None and shape % tp_size == 0
 
     def __call__(self, graph: fx.Graph):
+        self.begin()
         self.dump_graph(graph, "before_sequence_parallelism_pass")
         count = self.patterns.apply(graph)
         logger.debug("Replaced %s patterns", count)
         self.dump_graph(graph, "after_sequence_parallelism_pass")
+        self.end_and_log()
diff --git a/vllm/compilation/vllm_inductor_pass.py b/vllm/compilation/vllm_inductor_pass.py
index c95e0bce5f2e..0fe73b72b1de 100644
--- a/vllm/compilation/vllm_inductor_pass.py
+++ b/vllm/compilation/vllm_inductor_pass.py
@@ -26,7 +26,8 @@ class VllmInductorPass(InductorPass):
 
     def __init__(self, config: VllmConfig):
         self.pass_config = config.compilation_config.pass_config
-        self.dtype = config.model_config.dtype if config.model_config else None
+        self.model_dtype = config.model_config.dtype if config.model_config \
+            else None
         self.device = config.device_config.device if config.device_config \
             else None
         self.pass_name = self.__class__.__name__
diff --git a/vllm/config.py b/vllm/config.py
index 07706bbff459..5296b753afa2 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -6,7 +6,6 @@
 import hashlib
 import inspect
 import json
-import re
 import textwrap
 import uuid
 import warnings
@@ -20,6 +19,7 @@
 from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Literal, Optional,
                     Protocol, TypeVar, Union, cast, get_args, get_origin)
 
+import regex as re
 import torch
 from torch.distributed import ProcessGroup, ReduceOp
 from transformers import PretrainedConfig
@@ -42,7 +42,10 @@
     try_get_generation_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
-from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
+from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
+                        MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
+                        POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes,
+                        LayerBlockType, cuda_device_count_stateless,
                         get_cpu_memory, get_open_port, is_mi250, is_navi,
                         is_torch_equal_or_newer, random_uuid,
                         resolve_obj_by_qualname)
@@ -65,12 +68,6 @@
 
 ConfigT = TypeVar("ConfigT", bound=ConfigType)
 
-# This value is chosen to have a balance between ITL and TTFT. Note it is
-# not optimized for throughput.
-_DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
-_POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
-_MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120
-
 TaskOption = Literal["auto", "generate", "embedding", "embed", "classify",
                      "score", "reward", "transcription"]
 
@@ -537,13 +534,19 @@ def __post_init__(self) -> None:
             self.model, hf_token=self.hf_token, revision=self.revision)
         self.dtype = _get_and_verify_dtype(self.hf_config, self.dtype)
 
-        interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2"]
+        # Workaround for Gemma 2 which uses interleaved sliding window
+        # attention, but it's not specified in its config. TODO: remove this
+        # when Gemma 2 is fixed in Transformers.
+        if self.hf_text_config.model_type == "gemma2":
+            self.hf_text_config.sliding_window_pattern = 2
+
         sliding_window = getattr(self.hf_text_config, "sliding_window", None)
-        has_interleaved_attention = (sliding_window is not None) and (
-            isinstance(sliding_window, list) or
-            (self.hf_text_config.model_type in interleaved_attn_models))
+        sliding_window_pattern = getattr(self.hf_text_config,
+                                         "sliding_window_pattern", None)
+        has_interleaved_attention = sliding_window_pattern is not None or (
+            isinstance(sliding_window, list))
 
-        if (not self.disable_sliding_window and has_interleaved_attention):
+        if not self.disable_sliding_window and has_interleaved_attention:
             if (backend :=
                     envs.VLLM_ATTENTION_BACKEND) in ("XFORMERS", "FLASHINFER"):
                 sliding_window_len_min = get_min_sliding_window(
@@ -563,7 +566,10 @@ def __post_init__(self) -> None:
                 # only the attention layer itself is aware of the sliding
                 # window, and use the window size to compute the attention.
                 self.hf_text_config.interleaved_sliding_window = sliding_window
-                delattr(self.hf_text_config, "sliding_window")
+
+                if hasattr(self.hf_text_config, "sliding_window"):
+                    delattr(self.hf_text_config, "sliding_window")
+
                 sliding_window = None
 
         self.max_model_len = _get_and_verify_max_len(
@@ -825,7 +831,7 @@ def _verify_quantization(self) -> None:
         optimized_quantization_methods = [
             "fp8", "marlin", "modelopt", "gptq_marlin_24", "gptq_marlin",
             "awq_marlin", "fbgemm_fp8", "compressed-tensors", "experts_int8",
-            "quark", "nvfp4", "bitblas", "gptq_bitblas"
+            "quark", "modelopt_fp4", "bitblas", "gptq_bitblas"
         ]
         if self.quantization is not None:
             self.quantization = cast(QuantizationMethods,
@@ -988,7 +994,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
             self.use_async_output_proc = False
             return
 
-        # Reminder: Please update docs/source/features/compatibility_matrix.md
+        # Reminder: Please update docs/features/compatibility_matrix.md
         # If the feature combo become valid
         from vllm.platforms import current_platform
         if not current_platform.is_async_output_supported(self.enforce_eager):
@@ -1004,7 +1010,7 @@ def verify_async_output_proc(self, parallel_config, speculative_config,
         if self.runner_type == "pooling":
             self.use_async_output_proc = False
 
-        # Reminder: Please update docs/source/features/compatibility_matrix.md
+        # Reminder: Please update docs/features/compatibility_matrix.md
         # If the feature combo become valid
         if speculative_config:
             self.use_async_output_proc = False
@@ -2086,28 +2092,28 @@ def __post_init__(self) -> None:
                     # so we don't reject sequences on account of a short
                     # max_num_batched_tokens.
                     self.max_num_batched_tokens = max(
-                        self.max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS)
+                        self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
                 else:
                     self.max_num_batched_tokens = (
-                        _DEFAULT_MAX_NUM_BATCHED_TOKENS)
+                        DEFAULT_MAX_NUM_BATCHED_TOKENS)
             else:
                 # If max_model_len is too short, use
-                # _DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
+                # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
                 # for higher throughput.
                 self.max_num_batched_tokens = max(
-                    self.max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS)
+                    self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
 
             if self.runner_type == "pooling":
                 # Choose specific value for higher throughput
                 self.max_num_batched_tokens = max(
                     self.max_num_batched_tokens,
-                    _POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
+                    POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
                 )
             if self.is_multimodal_model:
                 # The value needs to be at least the number of multimodal tokens
                 self.max_num_batched_tokens = max(
                     self.max_num_batched_tokens,
-                    _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
+                    MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
                 )
 
             # When using default settings,
@@ -2213,7 +2219,11 @@ class DeviceConfig:
     """Configuration for the device to use for vLLM execution."""
 
     device: Union[Device, torch.device] = "auto"
-    """Device type for vLLM execution."""
+    """Device type for vLLM execution.
+    This parameter is deprecated and will be 
+    removed in a future release. 
+    It will now be set automatically based 
+    on the current platform."""
     device_type: str = field(init=False)
     """Device type from the current platform. This is set in
     `__post_init__`."""
@@ -2263,7 +2273,7 @@ def __post_init__(self):
 
 
 SpeculativeMethod = Literal["ngram", "eagle", "medusa", "mlp_speculator",
-                            "draft_model"]
+                            "draft_model", "deepseek_mtp"]
 SpeculativeAcceptanceMethod = Literal["rejection_sampler",
                                       "typical_acceptance_sampler"]
 
@@ -2527,6 +2537,15 @@ def __post_init__(self):
                 elif (self.draft_model_config.hf_config.model_type ==
                       "mlp_speculator"):
                     self.method = "mlp_speculator"
+                elif (self.draft_model_config.hf_config.model_type ==
+                      "deepseek_mtp"):
+                    self.method = "deepseek_mtp"
+                    if self.num_speculative_tokens > 1:
+                        logger.warning(
+                                "All Deepseek MTP models only have " \
+                                "one layer. Might need some code changes " \
+                                "to support multiple layers."
+                            )
                 else:
                     self.method = "draft_model"
 
@@ -2537,11 +2556,10 @@ def __post_init__(self):
                             "Chunked prefill and EAGLE are not compatible "
                             "when using V0.")
 
-                    from vllm.platforms import current_platform
                     from vllm.transformers_utils.configs.eagle import (
                         EAGLEConfig)
                     if isinstance(self.draft_model_config.hf_config,
-                                  EAGLEConfig) or current_platform.is_neuron():
+                                  EAGLEConfig):
                         pass
                     else:
                         eagle_config = EAGLEConfig(
@@ -2747,7 +2765,7 @@ def num_lookahead_slots(self) -> int:
         return self.num_speculative_tokens
 
     def use_eagle(self) -> bool:
-        return self.method in ("eagle", "eagle3")
+        return self.method in ("eagle", "eagle3", "deepseek_mtp")
 
     def __repr__(self) -> str:
         method = self.method
@@ -2980,7 +2998,7 @@ class PoolerConfig:
     pooling_type: Optional[str] = None
     """
     The pooling method of the pooling model. This should be a key in
-    {class}`vllm.model_executor.layers.pooler.PoolingType`.
+    [`vllm.model_executor.layers.pooler.PoolingType`][].
     """
 
     normalize: Optional[bool] = None
@@ -3503,7 +3521,7 @@ class KVTransferConfig:
     """The KV connector for vLLM to transmit KV caches between vLLM instances.
     """
 
-    engine_id: str = str(uuid.uuid4())
+    engine_id: Optional[str] = None
     """The engine id for KV transfers."""
 
     kv_buffer_device: Optional[str] = "cuda"
@@ -3560,6 +3578,9 @@ def compute_hash(self) -> str:
         return hash_str
 
     def __post_init__(self) -> None:
+        if self.engine_id is None:
+            self.engine_id = str(uuid.uuid4())
+
         if self.kv_role is not None and self.kv_role not in get_args(KVRole):
             raise ValueError(f"Unsupported kv_role: {self.kv_role}. "
                              f"Supported roles are {get_args(KVRole)}")
@@ -3658,6 +3679,8 @@ class PassConfig:
     """Whether to enable the custom no-op elimination pass."""
     enable_sequence_parallelism: bool = False
     """Whether to enable sequence parallelism."""
+    enable_async_tp: bool = False
+    """Whether to enable async TP."""
 
     def uuid(self):
         """
@@ -3667,7 +3690,8 @@ def uuid(self):
         compilation.
         """
         include = {
-            "enable_fusion", "enable_noop", "enable_sequence_parallelism"
+            "enable_fusion", "enable_noop", "enable_sequence_parallelism",
+            "enable_async_tp"
         }
         dict_ = {k: v for k, v in asdict(self).items() if k in include}
         return InductorPass.hash_dict(dict_)
@@ -3685,23 +3709,27 @@ class CompilationConfig:
     """Configuration for compilation. It has three parts:
 
     - Top-level Compilation control:
-        - {attr}`level`
-        - {attr}`debug_dump_path`
-        - {attr}`cache_dir`
-        - {attr}`backend`
-        - {attr}`custom_ops`
-        - {attr}`splitting_ops`
+        - [`level`][vllm.config.CompilationConfig.level]
+        - [`debug_dump_path`][vllm.config.CompilationConfig.debug_dump_path]
+        - [`cache_dir`][vllm.config.CompilationConfig.cache_dir]
+        - [`backend`][vllm.config.CompilationConfig.backend]
+        - [`custom_ops`][vllm.config.CompilationConfig.custom_ops]
+        - [`splitting_ops`][vllm.config.CompilationConfig.splitting_ops]
     - CudaGraph capture:
-        - {attr}`use_cudagraph`
-        - {attr}`cudagraph_capture_sizes`
-        - {attr}`cudagraph_num_of_warmups`
-        - {attr}`cudagraph_copy_inputs`
-        - {attr}`full_cuda_graph`
+        - [`use_cudagraph`][vllm.config.CompilationConfig.use_cudagraph]
+        - [`cudagraph_capture_sizes`]
+        [vllm.config.CompilationConfig.cudagraph_capture_sizes]
+        - [`cudagraph_num_of_warmups`]
+        [vllm.config.CompilationConfig.cudagraph_num_of_warmups]
+        - [`cudagraph_copy_inputs`]
+        [vllm.config.CompilationConfig.cudagraph_copy_inputs]
+        - [`full_cuda_graph`][vllm.config.CompilationConfig.full_cuda_graph]
     - Inductor compilation:
-        - {attr}`use_inductor`
-        - {attr}`compile_sizes`
-        - {attr}`inductor_compile_config`
-        - {attr}`inductor_passes`
+        - [`use_inductor`][vllm.config.CompilationConfig.use_inductor]
+        - [`compile_sizes`][vllm.config.CompilationConfig.compile_sizes]
+        - [`inductor_compile_config`]
+        [vllm.config.CompilationConfig.inductor_compile_config]
+        - [`inductor_passes`][vllm.config.CompilationConfig.inductor_passes]
         - custom inductor passes
 
     Why we have different sizes for cudagraph and inductor:
@@ -4280,6 +4308,12 @@ def __post_init__(self):
 
         if self.compilation_config is None:
             self.compilation_config = CompilationConfig()
+
+        # async tp is built on top of sequence parallelism
+        # and requires it to be enabled.
+        if self.compilation_config.pass_config.enable_async_tp:
+            self.compilation_config.pass_config.enable_sequence_parallelism = \
+                True
         if self.compilation_config.pass_config.enable_sequence_parallelism:
             self.compilation_config.custom_ops.append("+rms_norm")
         if envs.VLLM_USE_V1 and self.model_config is not None and \
@@ -4329,18 +4363,6 @@ def __post_init__(self):
                 "full_cuda_graph is not supported with "
                 "cascade attention. Disabling cascade attention.")
             self.model_config.disable_cascade_attn = True
-
-        if self.model_config and self.model_config.use_mla and \
-            not (current_platform.is_cuda() or current_platform.is_rocm()):
-            logger.info(
-                "MLA is enabled on a non-GPU platform; forcing chunked "
-                "prefill and prefix caching to be disabled.")
-            self.scheduler_config.enable_chunked_prefill = False
-            self.scheduler_config.chunked_prefill_enabled = False
-            self.scheduler_config.max_num_batched_tokens = max(
-                self.scheduler_config.max_model_len,
-                _DEFAULT_MAX_NUM_BATCHED_TOKENS)
-
             if self.cache_config is not None:
                 self.cache_config.enable_prefix_caching = False
 
@@ -4566,7 +4588,7 @@ def contains_object_print(text):
         text (str): The text to check
 
     Returns:
-        bool: True if a match is found, False otherwise
+        result (bool): `True` if a match is found, `False` otherwise.
     """
     pattern = r'at 0x[a-fA-F0-9]{2,16}>'
     match = re.search(pattern, text)
diff --git a/vllm/connections.py b/vllm/connections.py
index 9abc66050e18..84e32a4d5ca9 100644
--- a/vllm/connections.py
+++ b/vllm/connections.py
@@ -167,4 +167,7 @@ async def async_download_file(
 
 
 global_http_connection = HTTPConnection()
-"""The global {class}`HTTPConnection` instance used by vLLM."""
+"""
+The global [`HTTPConnection`][vllm.connections.HTTPConnection] instance used
+by vLLM.
+"""
diff --git a/vllm/distributed/device_communicators/all2all.py b/vllm/distributed/device_communicators/all2all.py
index b69647b00586..a250ec89cd5b 100644
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -1,44 +1,24 @@
 # SPDX-License-Identifier: Apache-2.0
+import importlib.util
+from typing import TYPE_CHECKING
+
 import torch
+import torch.distributed as dist
 
 from vllm.forward_context import get_forward_context
+from vllm.logger import init_logger
 
+from .base_device_communicator import All2AllManagerBase, Cache
 
-class All2AllBase:
-
-    def __init__(self, cpu_group, model):
-        self.cpu_group = cpu_group
-
-        # compute some common properties
-        from vllm.distributed.parallel_state import (get_dp_group,
-                                                     get_ep_group,
-                                                     get_tp_group,
-                                                     in_the_same_node_as)
-
-        # all2all lives in ep group, which is merged from dp and tp group
-        self.dp_group = get_dp_group()
-        self.tp_group = get_tp_group()
-        self.ep_group = get_ep_group()
-        self.dp_rank = self.dp_group.rank_in_group
-        self.dp_world_size = self.dp_group.world_size
-
-        # all2all communication often has separate implementations for
-        # intra-node and inter-node communication
-        self.intranode = in_the_same_node_as(cpu_group, source_rank=0)
-        self.internode = not self.intranode
-
-    def dispatch(self, hidden_states: torch.Tensor,
-                 router_logits: torch.Tensor):
-        raise NotImplementedError
-
-    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        raise NotImplementedError
+logger = init_logger(__name__)
 
-    def destroy(self):
-        pass
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.fused_moe.layer import FusedMoE
+else:
+    FusedMoE = None
 
 
-class NaiveAll2All(All2AllBase):
+class NaiveAll2AllManager(All2AllManagerBase):
     """
     A naive implementation of all2all communication.
     It uses all-reduce under the hood, which is not
@@ -46,8 +26,8 @@ class NaiveAll2All(All2AllBase):
     debugging.
     """
 
-    def __init__(self, cpu_group, model):
-        super().__init__(cpu_group, model)
+    def __init__(self, cpu_group):
+        super().__init__(cpu_group)
 
     def naive_multicast(self, x: torch.Tensor,
                         cu_tokens_across_dp_cpu: torch.Tensor):
@@ -91,3 +71,56 @@ def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
     def destroy(self):
         pass
+
+
+class PPLXAll2AllManager(All2AllManagerBase):
+    """
+    All2All communication based on PPLX kernels.
+    """
+
+    def __init__(self, cpu_group):
+        has_pplx = importlib.util.find_spec("pplx_kernels") is not None
+        assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels."  # noqa
+        super().__init__(cpu_group)
+
+        if self.internode:
+            # inter-node communication needs nvshmem,
+            # intra-node communication uses p2p mapping directly
+            from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id,
+                                              nvshmem_get_unique_id,
+                                              nvshmem_init)
+            logger.debug(
+                "Initialize NVSHMEM for pplx_kernels: "
+                "rank=%d, world size=%d", self.rank, self.world_size)
+            uid = nvshmem_get_unique_id(
+            ) if self.rank == 0 else nvshmem_alloc_empty_unique_id()
+            dist.broadcast(uid,
+                           src=dist.get_process_group_ranks(self.cpu_group)[0],
+                           group=self.cpu_group)
+            logger.debug("PPLX NVSHMEM UID = %s", uid)
+            nvshmem_init(uid, self.rank, self.world_size)
+
+        self.handle_cache = Cache()
+
+    def get_handle(self, kwargs):
+        import pplx_kernels as pplx
+        return self.handle_cache.get_or_create(
+            kwargs, pplx.AllToAll.internode
+            if self.internode else pplx.AllToAll.intranode)
+
+    def dispatch(self, hidden_states: torch.Tensor,
+                 router_logits: torch.Tensor):
+        raise NotImplementedError
+
+    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        raise NotImplementedError
+
+    def destroy(self):
+        with self.handle_cache._lock:
+            for _, handle in self.handle_cache._cache.items():
+                handle.destroy()
+
+        if self.internode:
+            from pplx_kernels.nvshmem import nvshmem_finalize
+            logger.debug("PPLX NVSHMEM finalize")
+            nvshmem_finalize()
diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py
index ead79872bd49..52b970949144 100644
--- a/vllm/distributed/device_communicators/base_device_communicator.py
+++ b/vllm/distributed/device_communicators/base_device_communicator.py
@@ -1,11 +1,76 @@
 # SPDX-License-Identifier: Apache-2.0
+import threading
 from typing import Optional
+from weakref import WeakValueDictionary
 
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
 
 
+class Cache:
+
+    def __init__(self):
+        self._cache: WeakValueDictionary = WeakValueDictionary()
+        self._lock = threading.RLock()  # Reentrant lock for thread safety
+
+    def get_or_create(self, kwargs, func):
+        # Create a hashable key from the kwargs
+        key = tuple(sorted((k, v) for k, v in kwargs.items()))
+
+        with self._lock:
+            instance = self._cache.get(key)
+            if instance is None:
+                instance = func(**kwargs)
+                self._cache[key] = instance
+            return instance
+
+
+class All2AllManagerBase:
+
+    def __init__(self, cpu_group):
+        self.cpu_group = cpu_group
+
+        # compute some common properties
+        from vllm.distributed.parallel_state import (get_dp_group,
+                                                     get_tp_group,
+                                                     in_the_same_node_as)
+
+        # all2all lives in ep group, which is merged from dp and tp group
+        self.dp_group = get_dp_group()
+        self.tp_group = get_tp_group()
+        # no self.ep_group since self.ep_group is still in construction
+        # when we create this object
+        self.dp_rank = self.dp_group.rank_in_group
+        self.dp_world_size = self.dp_group.world_size
+        self.rank = dist.get_rank(cpu_group)
+        self.world_size = dist.get_world_size(cpu_group)
+
+        # all2all communication often has separate implementations for
+        # intra-node and inter-node communication
+        self.intranode = in_the_same_node_as(cpu_group, source_rank=0)
+        self.internode = not self.intranode
+
+    def get_handle(self, kwargs):
+        # get a handle for the all2all communication,
+        # based on the kwargs.
+        # different layers can have different configs,
+        # e.g. one layer has hidden size 1024, another has 2048.
+        # usually the underlying implementation caches the handle
+        # and reuses it for the same config.
+        raise NotImplementedError
+
+    def dispatch(self, hidden_states: torch.Tensor,
+                 router_logits: torch.Tensor):
+        raise NotImplementedError
+
+    def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        raise NotImplementedError
+
+    def destroy(self):
+        pass
+
+
 class DeviceCommunicatorBase:
     """
     Base class for device-specific communicator.
@@ -31,6 +96,18 @@ def __init__(self,
         self.rank_in_group = dist.get_group_rank(self.cpu_group,
                                                  self.global_rank)
 
+        use_ep = False
+        from vllm.config import get_current_vllm_config
+        config = get_current_vllm_config()
+        if config is not None:
+            # as long as we use data parallel (coupled data parallel
+            # where all data parallel ranks execute forward together),
+            # we initialize the all2all manager used in expert parallel.
+            use_ep = config.parallel_config.data_parallel_size > 1
+
+        self.use_all2all = "ep" in unique_name and use_ep
+        self.all2all_manager: Optional[All2AllManagerBase] = None
+
     def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
         dist.all_reduce(input_, group=self.device_group)
         return input_
@@ -154,9 +231,17 @@ def prepare_communication_buffer_for_model(self,
                                                model: torch.nn.Module) -> None:
         """
         Prepare the communication buffer for the model.
-        This is a no-op in the base class.
         """
-        pass
+        if not self.use_all2all:
+            return
+
+        moe_modules = [
+            module for module in model.modules()
+            if module.__class__.__name__ == "FusedMoE"
+        ]
+        for module in moe_modules:
+            module.quant_method.init_prepare_finalize(module.moe_config,
+                                                      module.quant_config)
 
     def dispatch(
             self, hidden_states: torch.Tensor,
diff --git a/vllm/distributed/device_communicators/cpu_communicator.py b/vllm/distributed/device_communicators/cpu_communicator.py
index d4b34900b951..c04218cb9f39 100644
--- a/vllm/distributed/device_communicators/cpu_communicator.py
+++ b/vllm/distributed/device_communicators/cpu_communicator.py
@@ -22,8 +22,10 @@ def __init__(self,
         super().__init__(cpu_group, device, device_group, unique_name)
         self.dist_module = torch.distributed
 
-        if (current_platform.get_cpu_architecture() == CpuArchEnum.X86) \
-            and hasattr(torch.ops._C, "init_shm_manager"):
+        if (current_platform.get_cpu_architecture()
+                == CpuArchEnum.X86) and hasattr(
+                    torch.ops._C,
+                    "init_shm_manager") and unique_name.startswith("tp"):
             self.dist_module = _CPUSHMDistributed(self)
 
     def all_reduce(self, input_):
@@ -96,6 +98,8 @@ class _CPUSHMDistributed:
 
     def __init__(self, communicator: CpuCommunicator):
         instance_identifier = os.environ["VLLM_DIST_IDENT"]
+        unique_name = communicator.unique_name
+        instance_identifier = f"{instance_identifier}-{unique_name}"
         self.communicator = communicator
 
         group_ranks = [str(rank) for rank in self.communicator.ranks]
diff --git a/vllm/distributed/device_communicators/cuda_communicator.py b/vllm/distributed/device_communicators/cuda_communicator.py
index 13303f94b8ea..a05a13f51d4b 100644
--- a/vllm/distributed/device_communicators/cuda_communicator.py
+++ b/vllm/distributed/device_communicators/cuda_communicator.py
@@ -6,10 +6,12 @@
 from torch.distributed import ProcessGroup
 
 import vllm.envs as envs
+from vllm.logger import init_logger
 
-from .all2all import All2AllBase
 from .base_device_communicator import DeviceCommunicatorBase
 
+logger = init_logger(__name__)
+
 
 class CudaCommunicator(DeviceCommunicatorBase):
 
@@ -31,8 +33,6 @@ def __init__(self,
         use_pynccl = "ep" not in unique_name
 
         self.use_pynccl = use_pynccl
-        self.use_all2all = "ep" in unique_name
-        self.all2all_impl: Optional[All2AllBase] = None
         self.use_custom_allreduce = use_custom_allreduce
 
         # lazy import to avoid documentation build error
@@ -56,6 +56,19 @@ def __init__(self,
                 device=self.device,
             )
 
+        if self.use_all2all:
+            all2all_backend = envs.VLLM_ALL2ALL_BACKEND
+            if all2all_backend == "naive":
+                from .all2all import NaiveAll2AllManager
+                self.all2all_manager = NaiveAll2AllManager(self.cpu_group)
+                logger.info("Using naive all2all manager.")
+            elif all2all_backend == "pplx":
+                from .all2all import PPLXAll2AllManager
+                self.all2all_manager = PPLXAll2AllManager(self.cpu_group)
+                logger.info("Using PPLX all2all manager.")
+            else:
+                raise ValueError(f"Unknown all2all backend: {all2all_backend}")
+
     def all_reduce(self, input_):
         # always try custom allreduce first,
         # and then pynccl.
@@ -136,31 +149,19 @@ def destroy(self):
             self.pynccl_comm = None
         if self.ca_comm is not None:
             self.ca_comm = None
-        if self.all2all_impl is not None:
-            self.all2all_impl.destroy()
-            self.all2all_impl = None
-
-    def prepare_communication_buffer_for_model(self,
-                                               model: torch.nn.Module) -> None:
-        """
-        Prepare the communication buffer for the model.
-        """
-        if not self.use_all2all:
-            return
-        all2all_backend = envs.VLLM_ALL2ALL_BACKEND
-        if all2all_backend == "naive":
-            from .all2all import NaiveAll2All
-            self.all2all_impl = NaiveAll2All(self.cpu_group, model)
+        if self.all2all_manager is not None:
+            self.all2all_manager.destroy()
+            self.all2all_manager = None
 
     def dispatch(
             self, hidden_states: torch.Tensor,
             router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
-        assert self.all2all_impl is not None
-        hidden_states, router_logits = self.all2all_impl.dispatch(
+        assert self.all2all_manager is not None
+        hidden_states, router_logits = self.all2all_manager.dispatch(
             hidden_states, router_logits)
         return hidden_states, router_logits
 
     def combine(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        assert self.all2all_impl is not None
-        hidden_states = self.all2all_impl.combine(hidden_states)
+        assert self.all2all_manager is not None
+        hidden_states = self.all2all_manager.combine(hidden_states)
         return hidden_states
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index fa944407a703..40e57e6624d1 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import os
 import pickle
-import sys
 import time
 from contextlib import contextmanager
 from dataclasses import dataclass, field
@@ -19,7 +17,7 @@
 from zmq import SUB, SUBSCRIBE, XPUB, XPUB_VERBOSE, Context  # type: ignore
 
 import vllm.envs as envs
-from vllm.distributed.utils import StatelessProcessGroup
+from vllm.distributed.utils import StatelessProcessGroup, sched_yield
 from vllm.logger import init_logger
 from vllm.utils import (get_ip, get_open_port, get_open_zmq_ipc_path,
                         is_valid_ipv6_address)
@@ -28,20 +26,6 @@
 
 logger = init_logger(__name__)
 
-# We prefer to use os.sched_yield as it results in tighter polling loops,
-# measured to be around 3e-7 seconds. However on earlier versions of Python
-# os.sched_yield() does not release the GIL, so we fall back to time.sleep(0)
-USE_SCHED_YIELD = ((sys.version_info[:3] >= (3, 11, 1))
-                   or (sys.version_info[:2] == (3, 10)
-                       and sys.version_info[2] >= 8))
-
-
-def sched_yield():
-    if USE_SCHED_YIELD:
-        os.sched_yield()
-    else:
-        time.sleep(0)
-
 
 class ShmRingBuffer:
 
diff --git a/vllm/distributed/kv_transfer/__init__.py b/vllm/distributed/kv_transfer/__init__.py
index a9f26607de49..8b6abf5a80dd 100644
--- a/vllm/distributed/kv_transfer/__init__.py
+++ b/vllm/distributed/kv_transfer/__init__.py
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from vllm.distributed.kv_transfer.kv_connector.base import KVConnectorBaseType
 from vllm.distributed.kv_transfer.kv_transfer_state import (
-    ensure_kv_transfer_initialized, get_kv_transfer_group,
+    KVConnectorBaseType, ensure_kv_transfer_initialized, get_kv_transfer_group,
     has_kv_transfer_group, is_v1_kv_transfer_group)
 
 __all__ = [
diff --git a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
index 56b55c2bb59d..58eabd0a37eb 100644
--- a/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/mooncake_store_connector.py
@@ -31,12 +31,12 @@ def __init__(
         local_rank: int,
         config: VllmConfig,
     ):
-        self.config = config.kv_transfer_config
+        self.kv_transfer_config = config.kv_transfer_config
         self.kv_helper = kv_helper(config)
         self.local_tp_rank = local_rank
 
         # Init kv_store
-        if self.config.kv_connector == "MooncakeStoreConnector":
+        if self.kv_transfer_config.kv_connector == "MooncakeStoreConnector":
             # Check if MOONCAKE_CONFIG_PATH is set
             import os
             use_mooncake_store = os.getenv('MOONCAKE_CONFIG_PATH') is not None
@@ -50,10 +50,11 @@ def __init__(
                     MooncakeStore)
                 logger.info(
                     "Initializing KVStoreConnector under kv_transfer_config %s",
-                    self.config)
+                    self.kv_transfer_config)
                 self.kv_store = MooncakeStore(config)
         else:
-            logger.error("Can not find %s", self.config.kv_connector)
+            logger.error("Can not find %s",
+                         self.kv_transfer_config.kv_connector)
 
         assert self.kv_store is not None
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
index 2e4bd20740e2..ed8fe38161e9 100644
--- a/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/simple_connector.py
@@ -106,7 +106,7 @@ def __init__(
         else:
 
             # the current vLLM instance is KV consumer, so it needs to connect
-            # its recv pipe to the send pipe of KV producder
+            # its recv pipe to the send pipe of KV producer
             if self.config.kv_connector == "PyNcclConnector":
                 self.consumer_data_pipe = PyNcclPipe(
                     local_rank=local_rank,
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/base.py b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
index ef4460a592bd..bc9258e9d07b 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/base.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/base.py
@@ -210,10 +210,11 @@ def get_num_new_matched_tokens(
                 computed tokens for this request
 
         Returns:
-            * the number of tokens that can be loaded from the 
-              external KV cache beyond what is already computed.
-            * true if external KV cache tokens will be loaded
-              asynchronously (between scheduler steps).
+            A tuple with the following elements:
+                - The number of tokens that can be loaded from the 
+                  external KV cache beyond what is already computed.
+                - `True` if external KV cache tokens will be loaded
+                  asynchronously (between scheduler steps).
         """
         pass
 
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
index cea454a0b597..0aabb260fd3d 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/multi_connector.py
@@ -40,7 +40,7 @@ class MultiConnector(KVConnectorBase_V1):
 
     def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
         super().__init__(vllm_config=vllm_config, role=role)
-        self._connectors = []
+        self._connectors: list[KVConnectorBase_V1] = []
         ktcs = vllm_config.kv_transfer_config.kv_connector_extra_config.get(
             "connectors")
         assert ktcs is not None
diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
index 9c2e82b29c76..6303d77ad305 100644
--- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
@@ -259,6 +259,15 @@ def build_connector_meta(
         # Loop through scheduled reqs and convert to ReqMeta.
         for req_id, (req, block_ids) in self._reqs_need_recv.items():
             assert req.kv_transfer_params is not None
+            # For the case where there are no remote blocks to pull
+            # (block_ids is empty), we don't need to schedule
+            # an async read on the worker side.
+            if not block_ids:
+                logger.debug(
+                    "Skipping adding request %s to NixlConnectorMetadata, "
+                    "as there are no remote blocks to pull", req_id)
+                continue
+
             meta.add_new_req(
                 request_id=req_id,
                 local_block_ids=block_ids,
@@ -528,6 +537,7 @@ def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
 
     def add_remote_agent(self, nixl_agent_meta: NixlAgentMetadata):
         engine_id = nixl_agent_meta.engine_id
+        assert engine_id != self.engine_id, "Conflict engine id found!"
         if engine_id in self._remote_agents:
             return
 
diff --git a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
index fcc38d7fbd12..761c56f7e41f 100644
--- a/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
+++ b/vllm/distributed/kv_transfer/kv_pipe/pynccl_pipe.py
@@ -118,11 +118,11 @@ def _make_metadata(self, tensor: Optional[torch.Tensor]) -> Metadata:
         """
         Create the metadata as a dictionary based on the input tensor.
 
-        Parameters:
-            - tensor: The input tensor or None if no tensor is provided.
+        Args:
+            tensor: The input tensor or None if no tensor is provided.
 
         Returns:
-            - metadata: A dictionary with the following keys:
+            metadata: A dictionary with the following keys:
                 - "dtype": The data type of the tensor or None.
                 - "shape": The shape of the tensor or None.
         """
@@ -135,13 +135,13 @@ def _prepare_recv_buffer(self, metadata: Metadata) -> torch.Tensor:
         """
         Create a buffer to receive the tensor based on the provided metadata.
 
-        Parameters:
-            - metadata: A dictionary with keys "dtype" and "shape", describing
-              the tensor's data type and shape.
+        Args:
+            metadata: A dictionary with keys "dtype" and "shape",
+                describing the tensor's data type and shape.
 
         Returns:
-            - buffer: A tensor of the specified type and shape, allocated on
-              self.device.
+            buffer: A tensor of the specified type and shape,
+                allocated on `self.device`.
         """
         return torch.empty(metadata["shape"],
                            dtype=metadata["dtype"],
@@ -151,8 +151,8 @@ def _send_metadata(self, metadata: Metadata):
         """
         Send the metadata dictionary to the target rank.
 
-        Parameters:
-            - metadata: A dictionary with keys "dtype" and "shape".
+        Args:
+            metadata: A dictionary with keys "dtype" and "shape".
         """
         self.group.send_obj(metadata, self.target_rank_for_send)
 
@@ -161,8 +161,8 @@ def _recv_metadata(self) -> Metadata:
         Receive the metadata dictionary from the target rank.
 
         Returns:
-            - metadata: A dictionary with keys "dtype" and "shape" describing
-              the tensor.
+            metadata: A dictionary with keys "dtype" and "shape"
+                describing the tensor.
         """
         return self.group.recv_obj(self.target_rank_for_recv)
 
@@ -171,9 +171,9 @@ def _send_impl(self, tensor: Optional[torch.Tensor]) -> None:
         The actual implementation of sending the tensor and its metadata to the
         target rank.
 
-        Parameters:
-            - tensor: The input tensor to be sent, or None if no tensor is
-              being sent.
+        Args:
+            tensor: The input tensor to be sent, or `None` if no tensor is
+                being sent.
         """
         metadata = self._make_metadata(tensor)
         self._send_metadata(metadata)
@@ -187,7 +187,7 @@ def _recv_impl(self) -> Optional[torch.Tensor]:
         the target rank.
 
         Returns:
-            - buffer: The received tensor, or None if no tensor is received.
+            buffer: The received tensor, or `None` if no tensor is received.
         """
         metadata = self._recv_metadata()
         if metadata["dtype"] is None:
@@ -227,8 +227,8 @@ def send_tensor(self, tensor: Optional[torch.Tensor]) -> None:
         Sends a tensor and its metadata to the destination rank in a
         non-blocking way.
 
-        Parameters:
-            - tensor: The tensor to send, or None if no tensor is being sent.
+        Args:
+            tensor: The tensor to send, or `None` if no tensor is being sent.
         """
         if self.transport_thread is None:
             self.transport_thread = ThreadPoolExecutor(max_workers=1)
@@ -250,8 +250,8 @@ def recv_tensor(self) -> Optional[torch.Tensor]:
         """
         Receives a tensor and its metadata from the source rank. Blocking call.
 
-        Returns:
-            - tensor: The received tensor, or None if no tensor is received.
+        Returns:
+            tensor: The received tensor, or `None` if no tensor is received.
         """
         if self.transport_thread is None:
             self.transport_thread = ThreadPoolExecutor(max_workers=1)
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 51c519d8f862..b674d05a7771 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -23,7 +23,6 @@
 """
 import contextlib
 import gc
-import importlib.util
 import pickle
 import weakref
 from collections import namedtuple
@@ -43,7 +42,7 @@
 from vllm.distributed.utils import StatelessProcessGroup
 from vllm.logger import init_logger
 from vllm.utils import (direct_register_custom_op, resolve_obj_by_qualname,
-                        run_once, supports_custom_op)
+                        supports_custom_op)
 
 
 @dataclass
@@ -120,7 +119,7 @@ def reduce_scatter(tensor: torch.Tensor, dim: int, world_size: int,
     group = _groups[group_name]()
     if group is None:
         raise ValueError(f"Group {group_name} is destroyed.")
-    return group.reduce_scatter(tensor, dim)
+    return group._reduce_scatter_out_place(tensor, dim)
 
 
 def reduce_scatter_fake(tensor: torch.Tensor, dim: int, world_size: int,
@@ -136,7 +135,7 @@ def all_gather(tensor: torch.Tensor, dim: int, world_size: int,
     group = _groups[group_name]()
     if group is None:
         raise ValueError(f"Group {group_name} is destroyed.")
-    return group.all_gather(tensor, dim)
+    return group._all_gather_out_place(tensor, dim)
 
 
 def all_gather_fake(tensor: torch.Tensor, dim: int, world_size: int,
@@ -161,6 +160,7 @@ def all_gather_fake(tensor: torch.Tensor, dim: int, world_size: int,
         op_func=reduce_scatter,
         mutates_args=[],
         fake_impl=reduce_scatter_fake,
+        dispatch_key=current_platform.dispatch_key,
     )
 
     direct_register_custom_op(
@@ -168,6 +168,7 @@ def all_gather_fake(tensor: torch.Tensor, dim: int, world_size: int,
         op_func=all_gather,
         mutates_args=[],
         fake_impl=all_gather_fake,
+        dispatch_key=current_platform.dispatch_key,
     )
 
 
@@ -367,6 +368,16 @@ def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
         assert -input_.dim() <= dim < input_.dim(), (
             f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
 
+        if self.use_custom_op_call:
+            return torch.ops.vllm.all_gather(input_,
+                                             dim,
+                                             world_size,
+                                             group_name=self.unique_name)
+        else:
+            return self._all_gather_out_place(input_, dim)
+
+    def _all_gather_out_place(self, input_: torch.Tensor,
+                              dim: int) -> torch.Tensor:
         return self.device_communicator.all_gather(input_, dim)
 
     def reduce_scatter(self,
@@ -379,6 +390,16 @@ def reduce_scatter(self,
         assert -input_.dim() <= dim < input_.dim(), (
             f"Invalid dim ({dim}) for input tensor with shape {input_.size()}")
 
+        if self.use_custom_op_call:
+            return torch.ops.vllm.reduce_scatter(input_,
+                                                 dim,
+                                                 world_size,
+                                                 group_name=self.unique_name)
+        else:
+            return self._reduce_scatter_out_place(input_, dim)
+
+    def _reduce_scatter_out_place(self, input_: torch.Tensor,
+                                  dim: int) -> torch.Tensor:
         return self.device_communicator.reduce_scatter(input_, dim)
 
     def gather(self,
@@ -769,10 +790,14 @@ def dispatch(
         if self.device_communicator is not None:
             return self.device_communicator.dispatch(hidden_states,
                                                      router_logits)
+        else:
+            return hidden_states, router_logits
 
     def combine(self, hidden_states) -> torch.Tensor:
         if self.device_communicator is not None:
             return self.device_communicator.combine(hidden_states)
+        else:
+            return hidden_states
 
 
 _WORLD: Optional[GroupCoordinator] = None
@@ -937,49 +962,9 @@ def init_distributed_environment(
             "world group already initialized with a different world size")
 
 
-PPLX_DID_INIT: bool = False
-
-
-@run_once
-def pplx_init(rank, world_size):
-    has_pplx = importlib.util.find_spec("pplx_kernels") is not None
-
-    if has_pplx and world_size > 1:
-        from pplx_kernels.nvshmem import (nvshmem_alloc_empty_unique_id,
-                                          nvshmem_get_unique_id, nvshmem_init)
-        try:
-            global PPLX_DID_INIT
-            logger.debug(
-                "Initialize NVSHMEM for PPLX kernels: rank=%d, "
-                "world size=%d", rank, world_size)
-            uid = nvshmem_get_unique_id(
-            ) if rank == 0 else nvshmem_alloc_empty_unique_id()
-            uid_gpu = uid.cuda()
-            get_world_group().broadcast(uid_gpu, src=0)
-            uid = uid_gpu.to(device='cpu')
-            logger.debug("PPLX NVSHMEM UID = %s", uid)
-            nvshmem_init(uid, rank, world_size)
-            PPLX_DID_INIT = True
-        except Exception as ex:
-            logger.error("Failed to initialize NVSHMEM for PPLX: %s", ex)
-
-
-@run_once
-def pplx_finalize():
-    global PPLX_DID_INIT
-    if PPLX_DID_INIT:
-        from pplx_kernels.nvshmem import nvshmem_finalize
-        logger.debug("PPLX NVSHMEM finalize")
-        from vllm.model_executor.layers.fused_moe.layer import (
-            _all_to_all_cache)
-        _all_to_all_cache.destroy()
-        nvshmem_finalize()
-
-
 def initialize_model_parallel(
     tensor_model_parallel_size: int = 1,
     pipeline_model_parallel_size: int = 1,
-    enable_expert_parallel: bool = False,
     backend: Optional[str] = None,
 ) -> None:
     """
@@ -1082,14 +1067,10 @@ def initialize_model_parallel(
         _DP.rank_in_group, _PP.rank_in_group, _TP.rank_in_group,
         _EP.rank_in_group)
 
-    if enable_expert_parallel:
-        pplx_init(rank, world_size)
-
 
 def ensure_model_parallel_initialized(
     tensor_model_parallel_size: int,
     pipeline_model_parallel_size: int,
-    enable_expert_parallel: bool = False,
     backend: Optional[str] = None,
 ) -> None:
     """Helper to initialize model parallel groups if they are not initialized,
@@ -1100,8 +1081,7 @@ def ensure_model_parallel_initialized(
         get_world_group().device_group)
     if not model_parallel_is_initialized():
         initialize_model_parallel(tensor_model_parallel_size,
-                                  pipeline_model_parallel_size,
-                                  enable_expert_parallel, backend)
+                                  pipeline_model_parallel_size, backend)
         return
 
     assert (
@@ -1180,8 +1160,6 @@ def destroy_model_parallel():
     """Set the groups to none and destroy them."""
     global _TP
 
-    pplx_finalize()
-
     if _TP:
         _TP.destroy()
     _TP = None
@@ -1221,8 +1199,9 @@ def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
         ray.shutdown()
     gc.collect()
     from vllm.platforms import current_platform
-    if not current_platform.is_cpu():
-        torch.cuda.empty_cache()
+    empty_cache = current_platform.empty_cache
+    if empty_cache is not None:
+        empty_cache()
     try:
         torch._C._host_emptyCache()
     except AttributeError:
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 6bb323d79d64..93a069d36c4b 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -6,9 +6,12 @@
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 import dataclasses
 import datetime
+import os
 import pickle
 import socket
+import sys
 import time
+import uuid
 from collections import deque
 from collections.abc import Sequence
 from typing import Any, Optional
@@ -27,6 +30,20 @@
 
 logger = init_logger(__name__)
 
+# os.sched_yield gives the tightest polling loop (measured at roughly
+# 3e-7 seconds), so it is preferred. On Python older than 3.10.8 / 3.11.1
+# os.sched_yield() does not release the GIL, so time.sleep(0) is used instead.
+USE_SCHED_YIELD = (sys.version_info[:3] >= (3, 11, 1)
+                   or (3, 10, 8) <= sys.version_info[:3] < (3, 11))
+
+
+def sched_yield():
+    """Relinquish the CPU once; intended for use inside polling loops."""
+    if not USE_SCHED_YIELD:
+        time.sleep(0)
+    else:
+        os.sched_yield()
+
 
 def ensure_divisibility(numerator, denominator):
     """Ensure that numerator is divisible by the denominator."""
@@ -212,10 +229,141 @@ def all_gather_obj(self, obj: Any) -> list[Any]:
                 gathered_objs.append(recv_obj)
         return gathered_objs
 
-    def barrier(self):
-        """A barrier to synchronize all ranks."""
+    def barrier(self, timeout: float = 30.0):
+        """A robust barrier to synchronize all ranks.
+
+
+        Uses a multi-phase approach to ensure all processes reach the barrier
+        before proceeding:
+
+        1. Each process signals it has reached the barrier
+
+        2. Each process signals that it has confirmed the arrival of all other
+        ranks.
+
+        3. Rank 0 waits for all other ranks to signal their departure to ensure
+        that all ranks have departed the barrier first.
+
+        Args:
+            timeout: Maximum time to wait for each phase (in seconds)
+
+
+        Raises:
+            RuntimeError: If coordination fails or times out
+        """
+        # Generate a barrier ID that is globally unique
+        try:
+            if self.rank == 0:
+                barrier_id = f"barrier_{uuid.uuid4()}"
+                self.broadcast_obj(barrier_id, src=0)
+            else:
+                barrier_id = self.broadcast_obj(None, src=0)
+        except Exception as e:
+            raise RuntimeError("Failed to broadcast barrier_id") from e
+
+        # Phase 1: Signal arrival at barrier
+        # Wait for all processes to arrive
+        # We need all ranks to confirm the arrival of all other ranks.
+        # This is the key synchronization point.
+        arrival_key = f"arrival_{barrier_id}_{self.rank}"
+        try:
+            self.store.set(arrival_key, b"1")
+        except Exception as e:
+            raise RuntimeError("Failed to signal barrier arrival") from e
+
+        start_time = time.monotonic()
+        processes_arrived: set[int] = set()
+
+        while len(processes_arrived) < self.world_size:
+            # Check for timeout
+            cur_time = time.monotonic()
+            if cur_time - start_time > timeout:
+                raise RuntimeError(
+                    f"Barrier timed out after {timeout:f} seconds")
+
+            # Check for each process
+            for i in range(self.world_size):
+                if i in processes_arrived:
+                    continue
+
+                key = f"arrival_{barrier_id}_{i}"
+                try:
+                    # Try to get the key - if it exists, we'll get a value
+                    # If it doesn't exist, it will throw an exception
+                    self.store.get(key)
+                    processes_arrived.add(i)
+                except KeyError:
+                    # Key doesn't exist yet
+                    pass
+                except Exception as check_e:
+                    logger.debug("Error checking key existence: %s", check_e)
+                    sched_yield()
+
+            # Yield the CPU between polls while ranks are still missing
+            if len(processes_arrived) < self.world_size:
+                sched_yield()
+
+        # Phase 2: Signal departure from barrier
+        # We only care to block at this stage in rank 0, which runs the
+        # server side of the TCPStore. We want to make sure that all
+        # clients have departed the barrier before rank 0 in case the
+        # next thing after the barrier is a shutdown, including tearing
+        # down the TCPStore. Other ranks can exit the barrier immediately
+        # after signaling their departure.
+        departure_key = f"departure_{barrier_id}_{self.rank}"
+        try:
+            self.store.set(departure_key, b"1")
+        except Exception as e:
+            raise RuntimeError("Failed to signal barrier departure") from e
+
+        if self.rank != 0:
+            return
+
+        # Make rank 0 wait for all processes to signal departure
+        start_time = time.monotonic()
+        processes_departed: set[int] = set()
+
+        while len(processes_departed) < self.world_size:
+            # Check for timeout
+            if time.monotonic() - start_time > timeout:
+                raise RuntimeError(
+                    f"Barrier departure timed out after {timeout:f} s")
+
+            # Check for each process
+            for i in range(self.world_size):
+                if i in processes_departed:
+                    continue
+
+                key = f"departure_{barrier_id}_{i}"
+                try:
+                    # Try to get the key - if it exists, we'll get a value
+                    # If it doesn't exist, it will throw an exception
+                    self.store.get(key)
+                    processes_departed.add(i)
+                except KeyError:
+                    # Key doesn't exist yet
+                    pass
+                except Exception as check_e:
+                    logger.debug("Error checking key existence: %s", check_e)
+                    sched_yield()
+
+            # Yield the CPU between polls while ranks are still missing
+            if len(processes_departed) < self.world_size:
+                sched_yield()
+
+        # Clean up keys to avoid leaking memory in the store
         for i in range(self.world_size):
-            self.broadcast_obj(None, src=i)
+            try:
+                self.store.delete_key(f"arrival_{barrier_id}_{i}")
+            except Exception:
+                logger.debug("Error deleting key: %s",
+                             f'arrival_{barrier_id}_{i}')
+
+            try:
+                self.store.delete_key(f"departure_{barrier_id}_{i}")
+            except Exception:
+                logger.debug("Error deleting key: %s",
+                             f'departure_{barrier_id}_{i}')
 
     @staticmethod
     def create(
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index f0c6b15b79da..442e4100fea1 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -4,7 +4,6 @@
 import argparse
 import dataclasses
 import json
-import re
 import sys
 import threading
 import warnings
@@ -13,6 +12,7 @@
 from typing import (Annotated, Any, Callable, Dict, List, Literal, Optional,
                     Type, TypeVar, Union, cast, get_args, get_origin)
 
+import regex as re
 import torch
 from typing_extensions import TypeIs, deprecated
 
@@ -577,7 +577,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             action=argparse.BooleanOptionalAction,
             deprecated=True,
             help="[DEPRECATED] The `--enable-reasoning` flag is deprecated as "
-            "of v0.8.6. Use `--reasoning-parser` to specify the reasoning "
+            "of v0.9.0. Use `--reasoning-parser` to specify the reasoning "
             "parser backend instead. This flag (`--enable-reasoning`) will be "
             "removed in v0.10.0. When `--reasoning-parser` is specified, "
             "reasoning mode is automatically enabled.")
@@ -737,7 +737,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             title="DeviceConfig",
             description=DeviceConfig.__doc__,
         )
-        device_group.add_argument("--device", **device_kwargs["device"])
+        device_group.add_argument("--device",
+                                  **device_kwargs["device"],
+                                  deprecated=True)
 
         # Speculative arguments
         speculative_group = parser.add_argument_group(
@@ -977,7 +979,7 @@ def create_engine_config(
         from vllm.platforms import current_platform
         current_platform.pre_register_and_update()
 
-        device_config = DeviceConfig(device=self.device)
+        device_config = DeviceConfig(device=current_platform.device_type)
         model_config = self.create_model_config()
 
         # * If VLLM_USE_V1 is unset, we enable V1 for "supported features"
@@ -1082,7 +1084,7 @@ def create_engine_config(
             disable_log_stats=self.disable_log_stats,
         )
 
-        # Reminder: Please update docs/source/features/compatibility_matrix.md
+        # Reminder: Please update docs/features/compatibility_matrix.md
         # If the feature combo become valid
         if self.num_scheduler_steps > 1:
             if speculative_config is not None:
@@ -1193,8 +1195,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
         #############################################################
         # Unsupported Feature Flags on V1.
 
-        if (self.load_format == LoadFormat.TENSORIZER.value
-                or self.load_format == LoadFormat.SHARDED_STATE.value):
+        if self.load_format == LoadFormat.SHARDED_STATE.value:
             _raise_or_fallback(
                 feature_name=f"--load_format {self.load_format}",
                 recommend_to_remove=False)
@@ -1290,14 +1291,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
                                recommend_to_remove=False)
             return False
 
-        # Some quantization is not compatible with torch.compile.
-        V1_UNSUPPORTED_QUANT = ["gguf"]
-        if model_config.quantization in V1_UNSUPPORTED_QUANT:
-            _raise_or_fallback(
-                feature_name=f"--quantization {model_config.quantization}",
-                recommend_to_remove=False)
-            return False
-
         # No Embedding Models so far.
         if model_config.task not in ["generate"]:
             _raise_or_fallback(feature_name=f"--task {model_config.task}",
@@ -1337,7 +1330,7 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool:
                     is_ngram_enabled = True
                 elif speculative_method == "medusa":
                     is_medusa_enabled = True
-                elif speculative_method in ("eagle", "eagle3"):
+                elif speculative_method in ("eagle", "eagle3", "deepseek_mtp"):
                     is_eagle_enabled = True
             else:
                 speculative_model = self.speculative_config.get("model")
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 56b9e49d24d9..19b219b674f3 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -475,7 +475,8 @@ async def add_request_async(
             *,
             inputs: Optional[PromptType] = None,  # DEPRECATED
     ) -> None:
-        """Async version of {meth}`add_request`."""
+        """Async version of
+        [`add_request`][vllm.engine.llm_engine.LLMEngine.add_request]."""
         if inputs is not None:
             prompt = inputs
         assert prompt is not None and params is not None
@@ -582,20 +583,21 @@ async def build_guided_decoding_logits_processor_async(
 
 
 class AsyncLLMEngine(EngineClient):
-    """An asynchronous wrapper for {class}`LLMEngine`.
+    """An asynchronous wrapper for [`LLMEngine`][vllm.LLMEngine].
 
-    This class is used to wrap the {class}`LLMEngine` class to make it
-    asynchronous. It uses asyncio to create a background loop that keeps
-    processing incoming requests. The {class}`LLMEngine` is kicked by the
-    generate method when there are requests in the waiting queue. The generate
-    method yields the outputs from the {class}`LLMEngine` to the caller.
+    This class is used to wrap the [`LLMEngine`][vllm.LLMEngine] class to
+    make it asynchronous. It uses asyncio to create a background loop that keeps
+    processing incoming requests. The [`LLMEngine`][vllm.LLMEngine] is kicked
+    by the generate method when there are requests in the waiting queue. The
+    generate method yields the outputs from the [`LLMEngine`][vllm.LLMEngine]
+    to the caller.
 
     Args:
         log_requests: Whether to log the requests.
         start_engine_loop: If True, the background task to run the engine
             will be automatically started in the generate call.
-        *args: Arguments for {class}`LLMEngine`.
-        **kwargs: Arguments for {class}`LLMEngine`.
+        *args: Arguments for [`LLMEngine`][vllm.LLMEngine].
+        **kwargs: Arguments for [`LLMEngine`][vllm.LLMEngine].
     """
 
     _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
@@ -985,8 +987,9 @@ async def generate(
         from the LLMEngine to the caller.
 
         Args:
-            prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
-                for more details about the format of each input.
+            prompt: The prompt to the LLM. See
+                [`PromptType`][vllm.inputs.PromptType] for more details about
+                the format of each input.
             sampling_params: The sampling parameters of the request.
             request_id: The unique id of the request.
             lora_request: LoRA request to use for generation, if any.
@@ -1003,7 +1006,7 @@ async def generate(
         Details:
             - If the engine is not running, start the background loop,
               which iteratively invokes
-              {meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
+              [`engine_step`][vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step]
               to process the waiting requests.
             - Add the request to the engine's `RequestTracker`.
               On the next background loop, this request will be sent to
@@ -1075,8 +1078,9 @@ async def encode(
         from the LLMEngine to the caller.
 
         Args:
-            prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
-                for more details about the format of each input.
+            prompt: The prompt to the LLM. See
+                [`PromptType`][vllm.inputs.PromptType] for more details about
+                the format of each input.
             pooling_params: The pooling parameters of the request.
             request_id: The unique id of the request.
             lora_request: LoRA request to use for generation, if any.
@@ -1089,15 +1093,15 @@ async def encode(
             for the request.
 
         Details:
-        - If the engine is not running, start the background loop,
-            which iteratively invokes
-            {meth}`~vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`
-            to process the waiting requests.
-        - Add the request to the engine's `RequestTracker`.
-            On the next background loop, this request will be sent to
-            the underlying engine.
-            Also, a corresponding `AsyncStream` will be created.
-        - Wait for the request outputs from `AsyncStream` and yield them.
+            - If the engine is not running, start the background loop,
+                which iteratively invokes
+                [`vllm.engine.async_llm_engine.AsyncLLMEngine.engine_step`][]
+                to process the waiting requests.
+            - Add the request to the engine's `RequestTracker`.
+                On the next background loop, this request will be sent to
+                the underlying engine.
+                Also, a corresponding `AsyncStream` will be created.
+            - Wait for the request outputs from `AsyncStream` and yield them.
 
         Example:
         ```
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 2a27afe9757e..ff33d566ab68 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -130,26 +130,16 @@ class LLMEngine:
     iteration-level scheduling and efficient memory management to maximize the
     serving throughput.
 
-    The {class}`~vllm.LLM` class wraps this class for offline batched inference
-    and the {class}`AsyncLLMEngine` class wraps this class for online serving.
+    The [`LLM`][vllm.LLM] class wraps this class for offline batched inference
+    and the [`AsyncLLMEngine`][vllm.engine.async_llm_engine.AsyncLLMEngine]
+    class wraps this class for online serving.
 
-    The config arguments are derived from {class}`~vllm.EngineArgs`. (See
-    {ref}`engine-args`)
+    The config arguments are derived from [`EngineArgs`][vllm.EngineArgs].
 
     Args:
-        model_config: The configuration related to the LLM model.
-        cache_config: The configuration related to the KV cache memory
-            management.
-        parallel_config: The configuration related to distributed execution.
-        scheduler_config: The configuration related to the request scheduler.
-        device_config: The configuration related to the device.
-        lora_config (Optional): The configuration related to serving multi-LoRA.
-        speculative_config (Optional): The configuration related to speculative
-            decoding.
+        vllm_config: The configuration for initializing and running vLLM.
         executor_class: The model executor class for managing distributed
             execution.
-        prompt_adapter_config (Optional): The configuration related to serving
-            prompt adapters.
         log_stats: Whether to log statistics.
         usage_context: Specified entry point, used for usage info collection.
     """
@@ -695,11 +685,12 @@ def add_request(
 
         Args:
             request_id: The unique ID of the request.
-            prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
+            prompt: The prompt to the LLM. See
+                [PromptType][vllm.inputs.PromptType]
                 for more details about the format of each input.
             params: Parameters for sampling or pooling.
-                {class}`~vllm.SamplingParams` for text generation.
-                {class}`~vllm.PoolingParams` for pooling.
+                [SamplingParams][vllm.SamplingParams] for text generation.
+                [PoolingParams][vllm.PoolingParams] for pooling.
             arrival_time: The arrival time of the request. If None, we use
                 the current monotonic time.
             lora_request: The LoRA request to add.
@@ -711,10 +702,11 @@ def add_request(
         Details:
             - Set arrival_time to the current time if it is None.
             - Set prompt_token_ids to the encoded prompt if it is None.
-            - Create `n` number of {class}`~vllm.Sequence` objects.
-            - Create a {class}`~vllm.SequenceGroup` object
-              from the list of {class}`~vllm.Sequence`.
-            - Add the {class}`~vllm.SequenceGroup` object to the scheduler.
+            - Create `n` number of [Sequence][vllm.Sequence] objects.
+            - Create a [SequenceGroup][vllm.SequenceGroup] object
+              from the list of [Sequence][vllm.Sequence].
+            - Add the [SequenceGroup][vllm.SequenceGroup] object to the
+              scheduler.
 
         Example:
             >>> # initialize engine
@@ -861,9 +853,7 @@ def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
             request_id: The ID(s) of the request to abort.
 
         Details:
-            - Refer to the
-              {meth}`~vllm.core.scheduler.Scheduler.abort_seq_group`
-              from class {class}`~vllm.core.scheduler.Scheduler`.
+            - Refer to [vllm.core.scheduler.Scheduler.abort_seq_group][].
 
         Example:
             >>> # initialize engine and add a request with request_id
@@ -1263,12 +1253,10 @@ def _advance_to_next_step(
     def step(self) -> List[Union[RequestOutput, PoolingRequestOutput]]:
         """Performs one decoding iteration and returns newly generated results.
 
-        :::{figure} https://i.imgur.com/sv2HssD.png
-        :alt: Overview of the step function
-        :align: center
-
-        Overview of the step function.
-        :::
+        <figure markdown="span">
+        ![Overview of the step function](https://i.imgur.com/sv2HssD.png)
+        <figcaption>Overview of the step function</figcaption>
+        </figure>
 
         Details:
         - Step 1: Schedules the sequences to be executed in the next
@@ -1662,6 +1650,20 @@ def _get_stats(self,
         gpu_prefix_cache_hit_rate = self.scheduler[
             0].get_prefix_cache_hit_rate(Device.GPU)
 
+        # Exchange the usage and cache hit stats between gpu and cpu when
+        # running on cpu because the cpu_worker.py intentionally reports the
+        # number of cpu blocks as gpu blocks in favor of cache management.
+        if self.device_config.device_type == "cpu":
+            num_total_gpu, num_total_cpu = num_total_cpu, num_total_gpu
+            gpu_cache_usage_sys, cpu_cache_usage_sys = (
+                cpu_cache_usage_sys,
+                gpu_cache_usage_sys,
+            )
+            gpu_prefix_cache_hit_rate, cpu_prefix_cache_hit_rate = (
+                cpu_prefix_cache_hit_rate,
+                gpu_prefix_cache_hit_rate,
+            )
+
         # Iteration stats
         num_prompt_tokens_iter = 0
         num_generation_tokens_iter = 0
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 033551d07c39..34b48f83b643 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -29,7 +29,7 @@
 # to extract the metrics definitions.
 
 
-# begin-metrics-definitions
+# --8<-- [start:metrics-definitions]
 class Metrics:
     """
     vLLM uses a multiprocessing-based frontend for the OpenAI server.
@@ -293,7 +293,7 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
             labelnames=labelnames))
 
 
-# end-metrics-definitions
+# --8<-- [end:metrics-definitions]
 
     def _unregister_vllm_metrics(self) -> None:
         for collector in list(prometheus_client.REGISTRY._collector_to_names):
diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py
index eea89a9a055f..18b7c187bdff 100644
--- a/vllm/engine/multiprocessing/client.py
+++ b/vllm/engine/multiprocessing/client.py
@@ -492,8 +492,9 @@ def generate(
         from the LLMEngine to the caller.
 
         Args:
-            prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
-                for more details about the format of each input.
+            prompt: The prompt to the LLM. See
+                [`PromptType`][vllm.inputs.PromptType] for more details about
+                the format of each input.
             sampling_params: The sampling parameters of the request.
             request_id: The unique id of the request.
             lora_request: LoRA request to use for generation, if any.
@@ -561,8 +562,9 @@ def encode(
         from the LLMEngine to the caller.
 
         Args:
-            prompt: The prompt to the LLM. See {class}`~vllm.inputs.PromptType`
-                for more details about the format of each input.
+            prompt: The prompt to the LLM. See
+                [`PromptType`][vllm.inputs.PromptType] for more details about
+                the format of each input.
             pooling_params: The pooling parameters of the request.
             request_id: The unique id of the request.
             lora_request: LoRA request to use for generation, if any.
diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py
index ac234d25373d..434cb4985562 100644
--- a/vllm/engine/multiprocessing/engine.py
+++ b/vllm/engine/multiprocessing/engine.py
@@ -42,19 +42,22 @@
 
 
 class MQLLMEngine:
-    """A multiprocessing wrapper for {class}`LLMEngine`.
+    """A multiprocessing wrapper for
+    [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
 
-    This class is used to wrap the {class}`LLMEngine` class to enable use
+    This class is used to wrap the
+    [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] class to enable use
     in concurrnet manner. It runs a background loop and uses zeromq to
     receive new requests and stream outputs incrementally via ipc.
 
-    The {class}`LLMEngine` generate or encode process is kicked off when a new
-    RPCProcessRequest is received by the input_socket.
+    The [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] generate or encode
+    process is kicked off when a new RPCProcessRequest is received by the
+    input_socket.
 
     The self.engine_loop checks the input_socket for new requests,
     adds them to the LLMEngine if there are any, calls the internal
-    {class}`LLMEngine.step()`, and sends the RequestOutputs back over
-    the output_socket.
+    [`LLMEngine.step()`][vllm.engine.llm_engine.LLMEngine.step], and sends
+    the RequestOutputs back over the output_socket.
 
     If use_async_sockets is set, the logic associated with reading new
     requests from the socket and sending data to the socket is passed
@@ -65,8 +68,8 @@ class MQLLMEngine:
         ipc_path: Base path for zeromq interprocess messaging
         use_async_sockets: Whether to make send/recv async with GPU
         log_requests: Whether to log the requests.
-        *args: Arguments for {class}`LLMEngine`.
-        **kwargs: Arguments for {class}`LLMEngine`.
+        *args: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
+        **kwargs: Arguments for [`LLMEngine`][vllm.engine.llm_engine.LLMEngine].
     """
 
     def __init__(self,
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
index 4cfb22c5a750..110f84a65efc 100644
--- a/vllm/engine/output_processor/multi_step.py
+++ b/vllm/engine/output_processor/multi_step.py
@@ -56,8 +56,11 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
         scheduled computation.
 
         Args:
-          seq_group: the outputs are associated with this {class}`SequenceGroup`
-          outputs: the {class}`SequenceGroupOutput`s for all scheduler steps
+          seq_group: the outputs are associated with this
+              [`SequenceGroup`][vllm.sequence.SequenceGroup]
+          outputs: the
+              [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]s
+              for all scheduler steps
         """
         for output in outputs:
             # Concatenate single-step prompt logprob processing results.
@@ -67,7 +70,7 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
     @staticmethod
     @functools.lru_cache
     def _log_prompt_logprob_unsupported_warning_once():
-        # Reminder: Please update docs/source/features/compatibility_matrix.md
+        # Reminder: Please update docs/features/compatibility_matrix.md
         # If the feature combo become valid
         logger.warning(
             "Prompt logprob is not supported by multi step workers. "
diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index ea4b71a5b9cd..e88f119c8742 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -19,17 +19,21 @@
 def single_step_process_prompt_logprob(
         sg_output_proc: SequenceGroupOutputProcessor, seq_group: SequenceGroup,
         output: CompletionSequenceGroupOutput) -> None:
-    """Process prompt logprobs associated with the {class}`SequenceGroupOutput`
-    for a given step.
+    """Process prompt logprobs associated with the
+    [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput] for a given step.
 
     Do nothing if the output has no prompt logprobs.
 
     Account for the fact that transformers do not compute first-token logprobs.
     
     Args:
-      sg_output_proc: {class}`SequenceGroupOutputProcessor` instance
-      seq_group: the output is associated with this {class}`SequenceGroup`
-      output: the {class}`SequenceGroupOutput` for a single scheduler step
+      sg_output_proc:
+          [`SequenceGroupOutputProcessor`][vllm.engine.output_processor.interfaces.SequenceGroupOutputProcessor]
+          instance
+      seq_group: the output is associated with this
+          [`SequenceGroup`][vllm.sequence.SequenceGroup]
+      output: the [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
+          for a single scheduler step
     """
     prompt_logprobs = output.prompt_logprobs
 
@@ -103,8 +107,11 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
         scheduled computation.
         
         Args:
-          seq_group: the output is associated with this {class}`SequenceGroup`
-          outputs: the {class}`SequenceGroupOutput` for a single scheduler step
+          seq_group: the output is associated with this
+              [`SequenceGroup`][vllm.sequence.SequenceGroup]
+          outputs: the
+              [`SequenceGroupOutput`][vllm.sequence.SequenceGroupOutput]
+              for a single scheduler step
         """
         assert len(outputs) == 1, "Single step should only have 1 output."
         output = outputs[0]
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index e8d10017a1e9..ec1b327da905 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -556,6 +556,8 @@ def _placeholder_str(self, modality: ModalityStr,
                 return "(<audio>./</audio>)"
             raise TypeError(f"Unknown model type: {model_type}")
         elif modality == "video":
+            if model_type == "internvl_chat":
+                return "<video>"
             if model_type in ("qwen2_vl", "qwen2_5_vl"):
                 return "<|vision_start|><|video_pad|><|vision_end|>"
             if model_type == "qwen2_5_omni":
diff --git a/vllm/entrypoints/cli/main.py b/vllm/entrypoints/cli/main.py
index b7c1afce7118..6676c294c81c 100644
--- a/vllm/entrypoints/cli/main.py
+++ b/vllm/entrypoints/cli/main.py
@@ -9,7 +9,7 @@
 import vllm.entrypoints.cli.openai
 import vllm.entrypoints.cli.serve
 import vllm.version
-from vllm.entrypoints.utils import cli_env_setup
+from vllm.entrypoints.utils import VLLM_SERVE_PARSER_EPILOG, cli_env_setup
 from vllm.utils import FlexibleArgumentParser
 
 CMD_MODULES = [
@@ -32,7 +32,10 @@ def signal_handler(sig, frame):
 def main():
     cli_env_setup()
 
-    parser = FlexibleArgumentParser(description="vLLM CLI")
+    parser = FlexibleArgumentParser(
+        description="vLLM CLI",
+        epilog=VLLM_SERVE_PARSER_EPILOG,
+    )
     parser.add_argument('-v',
                         '--version',
                         action='version',
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index 04be7c033998..957fec290bf2 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -11,6 +11,8 @@
 from vllm.entrypoints.openai.api_server import run_server
 from vllm.entrypoints.openai.cli_args import (make_arg_parser,
                                               validate_parsed_serve_args)
+from vllm.entrypoints.utils import (VLLM_SERVE_PARSER_EPILOG,
+                                    show_filtered_argument_or_group_from_help)
 from vllm.logger import init_logger
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import FlexibleArgumentParser, get_tcp_uri
@@ -77,7 +79,10 @@ def subparser_init(
             "https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#cli-reference"
         )
 
-        return make_arg_parser(serve_parser)
+        serve_parser = make_arg_parser(serve_parser)
+        show_filtered_argument_or_group_from_help(serve_parser)
+        serve_parser.epilog = VLLM_SERVE_PARSER_EPILOG
+        return serve_parser
 
 
 def cmd_init() -> list[CLISubcommand]:
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 053ee55bb6a8..59cc44eb0e18 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -4,7 +4,8 @@
 import warnings
 from collections.abc import Sequence
 from contextlib import contextmanager
-from typing import Any, Callable, ClassVar, Optional, Union, cast, overload
+from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union,
+                    cast, overload)
 
 import cloudpickle
 import torch.nn as nn
@@ -47,6 +48,9 @@
 from vllm.utils import (Counter, Device, deprecate_args, deprecate_kwargs,
                         is_list_of)
 
+if TYPE_CHECKING:
+    from vllm.v1.metrics.reader import Metric
+
 logger = init_logger(__name__)
 
 _R = TypeVar("_R", default=Any)
@@ -116,7 +120,8 @@ class LLM:
             to eager mode. Additionally for encoder-decoder models, if the
             sequence length of the encoder input is larger than this, we fall
             back to the eager mode.
-        disable_custom_all_reduce: See {class}`~vllm.config.ParallelConfig`
+        disable_custom_all_reduce: See
+            [ParallelConfig][vllm.config.ParallelConfig].
         disable_async_output_proc: Disable async output processing.
             This may result in lower performance.
         hf_token: The token to use as HTTP bearer authorization for remote files
@@ -128,13 +133,11 @@ class LLM:
         compilation_config: Either an integer or a dictionary. If it is an
             integer, it is used as the level of compilation optimization. If it
             is a dictionary, it can specify the full compilation configuration.
-        **kwargs: Arguments for {class}`~vllm.EngineArgs`. (See
-            {ref}`engine-args`)
+        **kwargs: Arguments for [`EngineArgs`][vllm.EngineArgs].
 
-    :::{note}
-    This class is intended to be used for offline inference. For online
-    serving, use the {class}`~vllm.AsyncLLMEngine` class instead.
-    :::
+    Note:
+        This class is intended to be used for offline inference. For online
+        serving, use the [AsyncLLMEngine][vllm.AsyncLLMEngine] class instead.
     """
 
     DEPRECATE_LEGACY: ClassVar[bool] = True
@@ -143,7 +146,7 @@ class LLM:
     DEPRECATE_INIT_POSARGS: ClassVar[bool] = True
     """
     A flag to toggle whether to deprecate positional arguments in
-    {meth}`LLM.__init__`.
+    [LLM.__init__][vllm.LLM.__init__].
     """
 
     @classmethod
@@ -404,7 +407,7 @@ def generate(
 
         Args:
             prompts: The prompts to the LLM. You may pass a sequence of prompts
-                for batch inference. See {class}`~vllm.inputs.PromptType`
+                for batch inference. See [PromptType][vllm.inputs.PromptType]
                 for more details about the format of each prompts.
             sampling_params: The sampling parameters for text generation. If
                 None, we use the default sampling parameters.
@@ -422,11 +425,10 @@ def generate(
             A list of `RequestOutput` objects containing the
             generated completions in the same order as the input prompts.
 
-        :::{note}
-        Using `prompts` and `prompt_token_ids` as keyword parameters is
-        considered legacy and may be deprecated in the future. You should
-        instead pass them via the `inputs` parameter.
-        :::
+        Note:
+            Using `prompts` and `prompt_token_ids` as keyword parameters is
+            considered legacy and may be deprecated in the future. You should
+            instead pass them via the `inputs` parameter.
         """
         runner_type = self.llm_engine.model_config.runner_type
         if runner_type not in ["generate", "transcription"]:
@@ -495,17 +497,16 @@ def collective_rpc(self,
                 `self` argument, in addition to the arguments passed in `args`
                 and `kwargs`. The `self` argument will be the worker object.
             timeout: Maximum time in seconds to wait for execution. Raises a
-                {exc}`TimeoutError` on timeout. `None` means wait indefinitely.
+                [`TimeoutError`][] on timeout. `None` means wait indefinitely.
             args: Positional arguments to pass to the worker method.
             kwargs: Keyword arguments to pass to the worker method.
 
         Returns:
             A list containing the results from each worker.
 
-        :::{note}
-        It is recommended to use this API to only pass control messages,
-        and set up data-plane communication to pass data.
-        :::
+        Note:
+            It is recommended to use this API to only pass control messages,
+            and set up data-plane communication to pass data.
         """
 
         return self.llm_engine.collective_rpc(method, timeout, args, kwargs)
@@ -672,7 +673,7 @@ def chat(
         Generate responses for a chat conversation.
 
         The chat conversation is converted into a text prompt using the
-        tokenizer and calls the {meth}`generate` method to generate the
+        tokenizer and calls [generate][vllm.LLM.generate] to generate the
         responses.
 
         Multi-modal inputs can be passed in the same way you would pass them
@@ -681,8 +682,8 @@ def chat(
         Args:
             messages: A list of conversations or a single conversation.
 
-              - Each conversation is represented as a list of messages.
-              - Each message is a dictionary with 'role' and 'content' keys.
+                - Each conversation is represented as a list of messages.
+                - Each message is a dictionary with 'role' and 'content' keys.
 
             sampling_params: The sampling parameters for text generation.
                 If None, we use the default sampling parameters. When it
@@ -692,27 +693,27 @@ def chat(
             use_tqdm: Whether to use tqdm to display the progress bar.
             lora_request: LoRA request to use for generation, if any.
             chat_template: The template to use for structuring the chat.
-              If not provided, the model's default chat template will be used.
+                If not provided, the model's default chat template will be used.
             chat_template_content_format: The format to render message content.
 
-              - "string" will render the content as a string.
-                Example: ``"Who are you?"``
-              - "openai" will render the content as a list of dictionaries,
-                similar to OpenAI schema.
-                Example: ``[{"type": "text", "text": "Who are you?"}]``
+                - "string" will render the content as a string.
+                  Example: `"Who are you?"`
+                - "openai" will render the content as a list of dictionaries,
+                  similar to OpenAI schema.
+                  Example: `[{"type": "text", "text": "Who are you?"}]`
 
             add_generation_prompt: If True, adds a generation template
                 to each message.
             continue_final_message: If True, continues the final message in
                 the conversation instead of starting a new one. Cannot be
-                ``True`` if ``add_generation_prompt`` is also ``True``.
+                `True` if `add_generation_prompt` is also `True`.
             chat_template_kwargs: Additional kwargs to pass to the chat
                 template.
             mm_processor_kwargs: Multimodal processor kwarg overrides for this
                 chat request. Only used for offline requests.
 
         Returns:
-            A list of ``RequestOutput`` objects containing the generated
+            A list of `RequestOutput` objects containing the generated
             responses in the same order as the input messages.
         """
         list_of_messages: list[list[ChatCompletionMessageParam]]
@@ -911,7 +912,7 @@ def encode(
 
         Args:
             prompts: The prompts to the LLM. You may pass a sequence of prompts
-                for batch inference. See {class}`~vllm.inputs.PromptType`
+                for batch inference. See [PromptType][vllm.inputs.PromptType]
                 for more details about the format of each prompts.
             pooling_params: The pooling parameters for pooling. If None, we
                 use the default pooling parameters.
@@ -924,11 +925,10 @@ def encode(
             A list of `PoolingRequestOutput` objects containing the
             pooled hidden states in the same order as the input prompts.
 
-        :::{note}
-        Using `prompts` and `prompt_token_ids` as keyword parameters is
-        considered legacy and may be deprecated in the future. You should
-        instead pass them via the `inputs` parameter.
-        :::
+        Note:
+            Using `prompts` and `prompt_token_ids` as keyword parameters is
+            considered legacy and may be deprecated in the future. You should
+            instead pass them via the `inputs` parameter.
         """
         runner_type = self.llm_engine.model_config.runner_type
         if runner_type != "pooling":
@@ -1001,7 +1001,7 @@ def embed(
 
         Args:
             prompts: The prompts to the LLM. You may pass a sequence of prompts
-                for batch inference. See {class}`~vllm.inputs.PromptType`
+                for batch inference. See [PromptType][vllm.inputs.PromptType]
                 for more details about the format of each prompts.
             pooling_params: The pooling parameters for pooling. If None, we
                 use the default pooling parameters.
@@ -1011,7 +1011,7 @@ def embed(
                 generation, if any.
 
         Returns:
-            A list of ``EmbeddingRequestOutput`` objects containing the
+            A list of `EmbeddingRequestOutput` objects containing the
             embedding vectors in the same order as the input prompts.
         """
         if self.llm_engine.model_config.task != "embed":
@@ -1045,7 +1045,7 @@ def classify(
 
         Args:
             prompts: The prompts to the LLM. You may pass a sequence of prompts
-                for batch inference. See {class}`~vllm.inputs.PromptType`
+                for batch inference. See [PromptType][vllm.inputs.PromptType]
                 for more details about the format of each prompts.
             use_tqdm: Whether to use tqdm to display the progress bar.
             lora_request: LoRA request to use for generation, if any.
@@ -1053,7 +1053,7 @@ def classify(
                 generation, if any.
 
         Returns:
-            A list of ``ClassificationRequestOutput`` objects containing the
+            A list of `ClassificationRequestOutput` objects containing the
             embedding vectors in the same order as the input prompts.
         """
         if self.llm_engine.model_config.task != "classify":
@@ -1163,11 +1163,11 @@ def score(
         lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> list[ScoringRequestOutput]:
-        """Generate similarity scores for all pairs ``<text,text_pair>``.
+        """Generate similarity scores for all pairs `<text,text_pair>`.
 
-        The inputs can be ``1 -> 1``, ``1 -> N`` or ``N -> N``.
-        In the ``1 - N`` case the ``text_1`` sentence will be replicated ``N``
-        times to pair with the ``text_2`` sentences.
+        The inputs can be `1 -> 1`, `1 -> N` or `N -> N`.
+        In the `1 -> N` case the `text_1` sentence will be replicated `N`
+        times to pair with the `text_2` sentences.
         The input pairs are used to build a list of prompts for the
         cross encoder model. This class automatically batches the prompts,
         considering the memory constraint. For the best performance, put all
@@ -1175,9 +1175,9 @@ def score(
 
         Args:
             text_1: can be a single prompt or a list of prompts, in which
-                case it has to have the same length as the ``text_2`` list
+                case it has to have the same length as the `text_2` list
             text_2: The texts to pair with the query to form the input
-                to the LLM. See {class}`~vllm.inputs.PromptType` for
+                to the LLM. See [PromptType][vllm.inputs.PromptType] for
                 more details about the format of each prompts.
             use_tqdm: Whether to use tqdm to display the progress bar.
             lora_request: LoRA request to use for generation, if any.
@@ -1185,7 +1185,7 @@ def score(
                 generation, if any.
 
         Returns:
-            A list of ``ScoringRequestOutput`` objects containing the
+            A list of `ScoringRequestOutput` objects containing the
             generated scores in the same order as the input prompts.
         """
         runner_type = self.llm_engine.model_config.runner_type
@@ -1286,18 +1286,32 @@ def sleep(self, level: int = 1):
 
     def wake_up(self, tags: Optional[list[str]] = None):
         """
-        Wake up the engine from sleep mode. See the {meth}`sleep` method
+        Wake up the engine from sleep mode. See [sleep][vllm.LLM.sleep]
         for more details.
         
         Args:
             tags: An optional list of tags to reallocate the engine memory 
                 for specific memory allocations. Values must be in 
-                ("weights", "kv_cache",). If None, all memory is reallocated.
+                `("weights", "kv_cache")`. If None, all memory is reallocated.
                 wake_up should be called with all tags (or None) before the 
                 engine is used again.
         """
         self.llm_engine.wake_up(tags)
 
+    def get_metrics(self) -> list["Metric"]:
+        """Return a snapshot of aggregated metrics from Prometheus.
+
+        Returns:
+            A list of `Metric` objects capturing the current state
+            of all aggregated metrics from Prometheus.
+
+        Note:
+            This method is only available with the V1 LLM engine.
+        """
+        from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+        assert isinstance(self.llm_engine, V1LLMEngine)
+        return self.llm_engine.get_metrics()
+
     # LEGACY
     def _convert_v1_inputs(
         self,
@@ -1306,27 +1320,25 @@ def _convert_v1_inputs(
     ):
         # skip_tokenizer_init is now checked in engine
 
+        if prompts is None and prompt_token_ids is None:
+            raise ValueError(
+                "Either prompts or prompt_token_ids must be provided.")
+        if prompts is not None and prompt_token_ids is not None \
+                and len(prompts) != len(prompt_token_ids):
+            raise ValueError(
+                "The lengths of prompts and prompt_token_ids must be the same."
+            )
+
         if prompts is not None:
             prompts = [p["content"] for p in parse_and_batch_prompt(prompts)]
         if prompt_token_ids is not None:
             prompt_token_ids = [
                 p["content"] for p in parse_and_batch_prompt(prompt_token_ids)
             ]
-
-        num_requests = None
         if prompts is not None:
             num_requests = len(prompts)
-        if prompt_token_ids is not None:
-            if (num_requests is not None
-                    and num_requests != len(prompt_token_ids)):
-                raise ValueError("The lengths of prompts and prompt_token_ids "
-                                 "must be the same.")
-
+        elif prompt_token_ids is not None:
             num_requests = len(prompt_token_ids)
-        if num_requests is None:
-            raise ValueError("Either prompts or prompt_token_ids must be "
-                             "provided.")
-
         parsed_prompts: list[PromptType] = []
         for i in range(num_requests):
             item: PromptType
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 0ab6fcdca1a4..2da89b4f5944 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -7,7 +7,6 @@
 import inspect
 import multiprocessing
 import os
-import re
 import signal
 import socket
 import tempfile
@@ -21,6 +20,7 @@
 from typing import Annotated, Optional, Union
 
 import prometheus_client
+import regex as re
 import uvloop
 from fastapi import APIRouter, Depends, FastAPI, Form, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 5ab2356a0898..393cf381b16b 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -3,11 +3,11 @@
 # Adapted from
 # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
 import json
-import re
 import time
 from http import HTTPStatus
 from typing import Annotated, Any, ClassVar, Literal, Optional, Union
 
+import regex as re
 import torch
 from fastapi import HTTPException, UploadFile
 from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
@@ -251,7 +251,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     parallel_tool_calls: Optional[bool] = False
     user: Optional[str] = None
 
-    # doc: begin-chat-completion-sampling-params
+    # --8<-- [start:chat-completion-sampling-params]
     best_of: Optional[int] = None
     use_beam_search: bool = False
     top_k: Optional[int] = None
@@ -266,9 +266,9 @@ class ChatCompletionRequest(OpenAIBaseModel):
     spaces_between_special_tokens: bool = True
     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
     prompt_logprobs: Optional[int] = None
-    # doc: end-chat-completion-sampling-params
+    # --8<-- [end:chat-completion-sampling-params]
 
-    # doc: begin-chat-completion-extra-params
+    # --8<-- [start:chat-completion-extra-params]
     echo: bool = Field(
         default=False,
         description=(
@@ -407,7 +407,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
         default=None,
         description="KVTransfer parameters used for disaggregated serving.")
 
-    # doc: end-chat-completion-extra-params
+    # --8<-- [end:chat-completion-extra-params]
 
     # Default sampling parameters for chat completion requests
     _DEFAULT_SAMPLING_PARAMS: dict = {
@@ -764,7 +764,7 @@ class CompletionRequest(OpenAIBaseModel):
     top_p: Optional[float] = None
     user: Optional[str] = None
 
-    # doc: begin-completion-sampling-params
+    # --8<-- [start:completion-sampling-params]
     use_beam_search: bool = False
     top_k: Optional[int] = None
     min_p: Optional[float] = None
@@ -779,9 +779,9 @@ class CompletionRequest(OpenAIBaseModel):
     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
     allowed_token_ids: Optional[list[int]] = None
     prompt_logprobs: Optional[int] = None
-    # doc: end-completion-sampling-params
+    # --8<-- [end:completion-sampling-params]
 
-    # doc: begin-completion-extra-params
+    # --8<-- [start:completion-extra-params]
     add_special_tokens: bool = Field(
         default=True,
         description=(
@@ -858,7 +858,7 @@ class CompletionRequest(OpenAIBaseModel):
         default=None,
         description="KVTransfer parameters used for disaggregated serving.")
 
-    # doc: end-completion-extra-params
+    # --8<-- [end:completion-extra-params]
 
     # Default sampling parameters for completion requests
     _DEFAULT_SAMPLING_PARAMS: dict = {
@@ -1045,11 +1045,11 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
     user: Optional[str] = None
     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
 
-    # doc: begin-embedding-pooling-params
+    # --8<-- [start:embedding-pooling-params]
     additional_data: Optional[Any] = None
-    # doc: end-embedding-pooling-params
+    # --8<-- [end:embedding-pooling-params]
 
-    # doc: begin-embedding-extra-params
+    # --8<-- [start:embedding-extra-params]
     add_special_tokens: bool = Field(
         default=True,
         description=(
@@ -1064,7 +1064,7 @@ class EmbeddingCompletionRequest(OpenAIBaseModel):
             "if the served model does not use priority scheduling."),
     )
 
-    # doc: end-embedding-extra-params
+    # --8<-- [end:embedding-extra-params]
 
     def to_pooling_params(self):
         return PoolingParams(dimensions=self.dimensions,
@@ -1080,11 +1080,11 @@ class EmbeddingChatRequest(OpenAIBaseModel):
     user: Optional[str] = None
     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
 
-    # doc: begin-chat-embedding-pooling-params
+    # --8<-- [start:chat-embedding-pooling-params]
     additional_data: Optional[Any] = None
-    # doc: end-chat-embedding-pooling-params
+    # --8<-- [end:chat-embedding-pooling-params]
 
-    # doc: begin-chat-embedding-extra-params
+    # --8<-- [start:chat-embedding-extra-params]
     add_special_tokens: bool = Field(
         default=False,
         description=(
@@ -1118,7 +1118,7 @@ class EmbeddingChatRequest(OpenAIBaseModel):
             "default: 0). Any priority other than 0 will raise an error "
             "if the served model does not use priority scheduling."),
     )
-    # doc: end-chat-embedding-extra-params
+    # --8<-- [end:chat-embedding-extra-params]
 
     @model_validator(mode="before")
     @classmethod
@@ -1147,11 +1147,11 @@ class ScoreRequest(OpenAIBaseModel):
     text_2: Union[list[str], str]
     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
 
-    # doc: begin-score-pooling-params
+    # --8<-- [start:score-pooling-params]
     additional_data: Optional[Any] = None
-    # doc: end-score-pooling-params
+    # --8<-- [end:score-pooling-params]
 
-    # doc: begin-score-extra-params
+    # --8<-- [start:score-extra-params]
     priority: int = Field(
         default=0,
         description=(
@@ -1160,7 +1160,7 @@ class ScoreRequest(OpenAIBaseModel):
             "if the served model does not use priority scheduling."),
     )
 
-    # doc: end-score-extra-params
+    # --8<-- [end:score-extra-params]
 
     def to_pooling_params(self):
         return PoolingParams(additional_data=self.additional_data)
@@ -1173,11 +1173,11 @@ class RerankRequest(OpenAIBaseModel):
     top_n: int = Field(default_factory=lambda: 0)
     truncate_prompt_tokens: Optional[Annotated[int, Field(ge=-1)]] = None
 
-    # doc: begin-rerank-pooling-params
+    # --8<-- [start:rerank-pooling-params]
     additional_data: Optional[Any] = None
-    # doc: end-rerank-pooling-params
+    # --8<-- [end:rerank-pooling-params]
 
-    # doc: begin-rerank-extra-params
+    # --8<-- [start:rerank-extra-params]
     priority: int = Field(
         default=0,
         description=(
@@ -1186,7 +1186,7 @@ class RerankRequest(OpenAIBaseModel):
             "if the served model does not use priority scheduling."),
     )
 
-    # doc: end-rerank-extra-params
+    # --8<-- [end:rerank-extra-params]
 
     def to_pooling_params(self):
         return PoolingParams(additional_data=self.additional_data)
@@ -1321,11 +1321,11 @@ class ClassificationRequest(OpenAIBaseModel):
     truncate_prompt_tokens: Optional[int] = None
     user: Optional[str] = None
 
-    # doc: begin-classification-pooling-params
+    # --8<-- [start:classification-pooling-params]
     additional_data: Optional[Any] = None
-    # doc: end-classification-pooling-params
+    # --8<-- [end:classification-pooling-params]
 
-    # doc: begin-classification-extra-params
+    # --8<-- [start:classification-extra-params]
     priority: int = Field(
         default=0,
         description=(
@@ -1334,7 +1334,7 @@ class ClassificationRequest(OpenAIBaseModel):
             "if the served model does not use priority scheduling."),
     )
 
-    # doc: end-classification-extra-params
+    # --8<-- [end:classification-extra-params]
 
     def to_pooling_params(self):
         return PoolingParams(additional_data=self.additional_data)
@@ -1698,7 +1698,7 @@ class TranscriptionRequest(OpenAIBaseModel):
     timestamps incurs additional latency.
     """
 
-    # doc: begin-transcription-extra-params
+    # --8<-- [start:transcription-extra-params]
     stream: Optional[bool] = False
     """Custom field not present in the original OpenAI definition. When set,
     it will enable output to be streamed in a similar fashion as the Chat
@@ -1707,9 +1707,9 @@ class TranscriptionRequest(OpenAIBaseModel):
     # Flattened stream option to simplify form data.
     stream_include_usage: Optional[bool] = False
     stream_continuous_usage_stats: Optional[bool] = False
-    # doc: end-transcription-extra-params
+    # --8<-- [end:transcription-extra-params]
 
-    # doc: begin-transcription-sampling-params
+    # --8<-- [start:transcription-sampling-params]
     temperature: float = Field(default=0.0)
     """The sampling temperature, between 0 and 1.
 
@@ -1743,7 +1743,7 @@ class TranscriptionRequest(OpenAIBaseModel):
 
     presence_penalty: Optional[float] = 0.0
     """The presence penalty to use for sampling."""
-    # doc: end-transcription-sampling-params
+    # --8<-- [end:transcription-sampling-params]
 
     # Default sampling parameters for transcription requests.
     _DEFAULT_SAMPLING_PARAMS: dict = {
diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py
index fccf459f17dc..eae83c9a494a 100644
--- a/vllm/entrypoints/openai/run_batch.py
+++ b/vllm/entrypoints/openai/run_batch.py
@@ -365,8 +365,8 @@ async def main(args):
 
         # Determine the type of request and run it.
         if request.url == "/v1/chat/completions":
-            chat_handler_fn = (None if openai_serving_chat is None else
-                               openai_serving_chat.create_chat_completion)
+            chat_handler_fn = openai_serving_chat.create_chat_completion if \
+                openai_serving_chat is not None else None
             if chat_handler_fn is None:
                 response_futures.append(
                     make_async_error_request_output(
@@ -380,8 +380,8 @@ async def main(args):
                 run_request(chat_handler_fn, request, tracker))
             tracker.submitted()
         elif request.url == "/v1/embeddings":
-            embed_handler_fn = (None if openai_serving_embedding is None else
-                                openai_serving_embedding.create_embedding)
+            embed_handler_fn = openai_serving_embedding.create_embedding if \
+                openai_serving_embedding is not None else None
             if embed_handler_fn is None:
                 response_futures.append(
                     make_async_error_request_output(
@@ -394,8 +394,8 @@ async def main(args):
                 run_request(embed_handler_fn, request, tracker))
             tracker.submitted()
         elif request.url == "/v1/score":
-            score_handler_fn = (None if openai_serving_scores is None else
-                                openai_serving_scores.create_score)
+            score_handler_fn = openai_serving_scores.create_score if \
+                openai_serving_scores is not None else None
             if score_handler_fn is None:
                 response_futures.append(
                     make_async_error_request_output(
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index ee18e0b0a454..bc11686d7be8 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -2,7 +2,6 @@
 
 import asyncio
 import json
-import re
 import time
 from collections.abc import AsyncGenerator, AsyncIterator
 from collections.abc import Sequence as GenericSequence
@@ -10,6 +9,7 @@
 
 import jinja2
 import partial_json_parser
+import regex as re
 from fastapi import Request
 from pydantic import TypeAdapter
 
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 93de9f3a5c05..c73575b48d9c 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -582,7 +582,8 @@ def _tokenize_prompt_input(
         add_special_tokens: bool = True,
     ) -> TextTokensPrompt:
         """
-        A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
+        A simpler implementation of
+        [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
         that assumes single input.
         """
         return next(
@@ -603,7 +604,8 @@ def _tokenize_prompt_inputs(
         add_special_tokens: bool = True,
     ) -> Iterator[TextTokensPrompt]:
         """
-        A simpler implementation of {meth}`_tokenize_prompt_input_or_inputs`
+        A simpler implementation of
+        [`_tokenize_prompt_input_or_inputs`][vllm.entrypoints.openai.serving_engine.OpenAIServing._tokenize_prompt_input_or_inputs]
         that assumes multiple inputs.
         """
         for text in prompt_inputs:
diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py
index f7c7112b124f..054c0b006b2f 100644
--- a/vllm/entrypoints/openai/tool_parsers/__init__.py
+++ b/vllm/entrypoints/openai/tool_parsers/__init__.py
@@ -7,6 +7,7 @@
 from .hermes_tool_parser import Hermes2ProToolParser
 from .internlm2_tool_parser import Internlm2ToolParser
 from .jamba_tool_parser import JambaToolParser
+from .llama4_pythonic_tool_parser import Llama4PythonicToolParser
 from .llama_tool_parser import Llama3JsonToolParser
 from .mistral_tool_parser import MistralToolParser
 from .phi4mini_tool_parser import Phi4MiniJsonToolParser
@@ -16,5 +17,6 @@
     "ToolParser", "ToolParserManager", "Granite20bFCToolParser",
     "GraniteToolParser", "Hermes2ProToolParser", "MistralToolParser",
     "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser",
-    "PythonicToolParser", "Phi4MiniJsonToolParser", "DeepSeekV3ToolParser"
+    "Llama4PythonicToolParser", "PythonicToolParser", "Phi4MiniJsonToolParser",
+    "DeepSeekV3ToolParser"
 ]
diff --git a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
index bd8e87e4cee8..14e743e13a72 100644
--- a/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/deepseekv3_tool_parser.py
@@ -1,9 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import re
 from collections.abc import Sequence
 from typing import Union
 
+import regex as re
+
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                               DeltaFunctionCall, DeltaMessage,
                                               DeltaToolCall,
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
index b93de6b41817..383e0d44de99 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py
@@ -1,12 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
-import re
 from collections.abc import Sequence
 from json import JSONDecoder
 from typing import Union
 
 import partial_json_parser
+import regex as re
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.chat_utils import random_tool_call_id
@@ -80,7 +80,8 @@ def extract_tool_calls(
                     function=FunctionCall(
                         name=function_call["name"],
                         # function call args are JSON but as a string
-                        arguments=json.dumps(function_call["arguments"]),
+                        arguments=json.dumps(function_call["arguments"],
+                                             ensure_ascii=False),
                     ),
                 ) for function_call in raw_function_calls
             ]
@@ -166,7 +167,8 @@ def extract_tool_calls_streaming(
                 if self.current_tool_id >= 0:
                     cur_arguments = current_tool_call.get("arguments")
                     if cur_arguments:
-                        cur_args_json = json.dumps(cur_arguments)
+                        cur_args_json = json.dumps(cur_arguments,
+                                                   ensure_ascii=False)
                         sent = len(
                             self.streamed_args_for_tool[self.current_tool_id])
                         argument_diff = cur_args_json[sent:]
@@ -218,7 +220,8 @@ def extract_tool_calls_streaming(
                 if cur_arguments:
                     sent = len(
                         self.streamed_args_for_tool[self.current_tool_id])
-                    cur_args_json = json.dumps(cur_arguments)
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
                     prev_arguments = self.prev_tool_call_arr[
                         self.current_tool_id].get("arguments")
 
@@ -226,7 +229,8 @@ def extract_tool_calls_streaming(
                     if is_complete[self.current_tool_id]:
                         argument_diff = cur_args_json[sent:]
                     elif prev_arguments:
-                        prev_args_json = json.dumps(prev_arguments)
+                        prev_args_json = json.dumps(prev_arguments,
+                                                    ensure_ascii=False)
                         if cur_args_json != prev_args_json:
 
                             prefix = find_common_prefix(
diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
index 6710e7938c43..b8bf142530ee 100644
--- a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py
@@ -67,7 +67,8 @@ def extract_tool_calls(
                     function=FunctionCall(
                         name=function_call["name"],
                         # function call args are JSON but as a string
-                        arguments=json.dumps(function_call["arguments"]),
+                        arguments=json.dumps(function_call["arguments"],
+                                             ensure_ascii=False),
                     ),
                 ) for function_call in raw_function_calls
             ]
@@ -151,7 +152,8 @@ def extract_tool_calls_streaming(
                 if self.current_tool_id >= 0:
                     cur_arguments = current_tool_call.get("arguments")
                     if cur_arguments:
-                        cur_args_json = json.dumps(cur_arguments)
+                        cur_args_json = json.dumps(cur_arguments,
+                                                   ensure_ascii=False)
                         sent = len(
                             self.streamed_args_for_tool[self.current_tool_id])
                         argument_diff = cur_args_json[sent:]
@@ -197,7 +199,8 @@ def extract_tool_calls_streaming(
                 if cur_arguments:
                     sent = len(
                         self.streamed_args_for_tool[self.current_tool_id])
-                    cur_args_json = json.dumps(cur_arguments)
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
                     prev_arguments = self.prev_tool_call_arr[
                         self.current_tool_id].get("arguments")
 
@@ -205,7 +208,8 @@ def extract_tool_calls_streaming(
                     if is_complete[self.current_tool_id]:
                         argument_diff = cur_args_json[sent:]
                     elif prev_arguments:
-                        prev_args_json = json.dumps(prev_arguments)
+                        prev_args_json = json.dumps(prev_arguments,
+                                                    ensure_ascii=False)
                         if cur_args_json != prev_args_json:
                             prefix = find_common_prefix(
                                 prev_args_json, cur_args_json)
diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
index e56a8ef7193c..2b9f9852bcb3 100644
--- a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py
@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
-import re
 from collections.abc import Sequence
 from typing import Union
 
 import partial_json_parser
+import regex as re
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.chat_utils import random_tool_call_id
diff --git a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
index 5abd553d884d..3f2799f8010a 100644
--- a/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/internlm2_tool_parser.py
@@ -133,7 +133,8 @@ def extract_tool_calls_streaming(
                     delta = None
                 # first time to get parameters
                 elif cur_arguments and not prev_arguments:
-                    cur_arguments_json = json.dumps(cur_arguments)
+                    cur_arguments_json = json.dumps(cur_arguments,
+                                                    ensure_ascii=False)
 
                     arguments_delta = cur_arguments_json[:cur_arguments_json.
                                                          index(delta_text) +
@@ -148,8 +149,10 @@ def extract_tool_calls_streaming(
                         self.current_tool_id] += arguments_delta
                 # both prev and cur parameters, send the increase parameters
                 elif cur_arguments and prev_arguments:
-                    cur_args_json = json.dumps(cur_arguments)
-                    prev_args_json = json.dumps(prev_arguments)
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
+                    prev_args_json = json.dumps(prev_arguments,
+                                                ensure_ascii=False)
 
                     argument_diff = extract_intermediate_diff(
                         cur_args_json, prev_args_json)
@@ -190,7 +193,8 @@ def extract_tool_calls(
             action_dict = json.loads(action)
             name, parameters = action_dict['name'], json.dumps(
                 action_dict.get('parameters', action_dict.get('arguments',
-                                                              {})))
+                                                              {})),
+                ensure_ascii=False)
 
             if not tools or name not in [t.function.name for t in tools]:
                 ExtractedToolCallInformation(tools_called=False,
diff --git a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
index 6cac6f8163bf..2714a545f997 100644
--- a/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/jamba_tool_parser.py
@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
-import re
 from collections.abc import Sequence
 from typing import Union
 
 import partial_json_parser
+import regex as re
 from partial_json_parser.core.options import Allow
 
 from vllm.entrypoints.chat_utils import random_tool_call_id
@@ -96,8 +96,9 @@ def extract_tool_calls(
                         function=FunctionCall(
                             name=function_call["name"],
                             # function call args are JSON but as a string
-                            arguments=json.dumps(function_call["arguments"])))
-                    for function_call in raw_function_calls
+                            arguments=json.dumps(function_call["arguments"],
+                                                 ensure_ascii=False),
+                        )) for function_call in raw_function_calls
                 ]
 
                 content = model_output[:model_output.
@@ -187,7 +188,7 @@ def extract_tool_calls_streaming(
                     diff: Union[str, None] = current_tool_call.get("arguments")
 
                     if diff:
-                        diff = json.dumps(diff).replace(
+                        diff = json.dumps(diff, ensure_ascii=False).replace(
                             self.streamed_args_for_tool[self.current_tool_id],
                             "")
                         delta = DeltaMessage(tool_calls=[
@@ -248,7 +249,8 @@ def extract_tool_calls_streaming(
                         "mid-arguments")
                     delta = None
                 elif cur_arguments and not prev_arguments:
-                    cur_arguments_json = json.dumps(cur_arguments)
+                    cur_arguments_json = json.dumps(cur_arguments,
+                                                    ensure_ascii=False)
                     logger.debug("finding %s in %s", new_text,
                                  cur_arguments_json)
 
@@ -267,8 +269,10 @@ def extract_tool_calls_streaming(
                         self.current_tool_id] += arguments_delta
 
                 elif cur_arguments and prev_arguments:
-                    cur_args_json = json.dumps(cur_arguments)
-                    prev_args_json = json.dumps(prev_arguments)
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
+                    prev_args_json = json.dumps(prev_arguments,
+                                                ensure_ascii=False)
                     logger.debug("Searching for diff between \n%s\n%s",
                                  cur_args_json, prev_args_json)
 
diff --git a/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
new file mode 100644
index 000000000000..858c8db99fd2
--- /dev/null
+++ b/vllm/entrypoints/openai/tool_parsers/llama4_pythonic_tool_parser.py
@@ -0,0 +1,302 @@
+# SPDX-License-Identifier: Apache-2.0
+import ast
+import json
+from collections.abc import Sequence
+from typing import Any, Union
+
+import regex as re
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+                                              DeltaFunctionCall, DeltaMessage,
+                                              DeltaToolCall,
+                                              ExtractedToolCallInformation,
+                                              FunctionCall, ToolCall)
+from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
+    ToolParser, ToolParserManager)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class _UnexpectedAstError(Exception):
+    """Raised when model output cannot be parsed as pythonic tool calls."""
+
+
+@ToolParserManager.register_module("llama4_pythonic")
+class Llama4PythonicToolParser(ToolParser):
+    """
+    Tool call parser for Llama 4 models that produce pythonic-style tool calls.
+    Use --enable-auto-tool-choice --tool-call-parser llama4_pythonic
+    """
+    # TODO(mdepinet): Possible future improvements:
+    #   1. Support text + tools separated by either <|python_tag|> or \n\n
+    #   2. Support tools outside of a list (or separated by a semicolon).
+    #      This depends on item 1 for consistent streaming.
+    # Neither of these is necessary for e.g. ToolACE, but both would help make
+    # Llama3.2 models more reliable.
+
+    TOOL_CALL_REGEX = re.compile(
+        r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]",
+        re.DOTALL)
+
+    def __init__(self, tokenizer: PreTrainedTokenizerBase):
+        super().__init__(tokenizer)
+
+    # Readable alias for current_tool_id; it is a list index, NOT a tool id.
+    @property
+    def current_tool_index(self) -> int:
+        return self.current_tool_id
+
+    @current_tool_index.setter
+    def current_tool_index(self, value: int) -> None:
+        self.current_tool_id = value
+
+    def extract_tool_calls(
+            self, model_output: str,
+            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
+        """
+        Extract the tool calls from a complete model response. Output that is
+        not a list of function calls is returned as plain content instead.
+        """
+        # Strip <|python_start|> and <|python_end|>, which Llama 4 models
+        # sometimes output.
+        if model_output.startswith("<|python_start|>"):
+            model_output = model_output[len("<|python_start|>"):]
+            model_output = model_output.replace("<|python_end|>", "")
+        if not (self.TOOL_CALL_REGEX.match(model_output)):
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+        try:
+            module = ast.parse(model_output)
+            parsed = getattr(module.body[0], "value", None)
+            if isinstance(parsed, ast.List) and all(
+                    isinstance(e, ast.Call) for e in parsed.elts):
+                return ExtractedToolCallInformation(
+                    tools_called=True,
+                    tool_calls=[
+                        _handle_single_tool(e)  # type: ignore
+                        for e in parsed.elts
+                    ],
+                    content=None)
+            else:
+                raise _UnexpectedAstError(
+                    "Tool output must be a list of function calls")
+        except Exception:
+            logger.exception("Error in extracting tool call from response.")
+            # Parsing failed, so treat the output as regular text.
+            return ExtractedToolCallInformation(tools_called=False,
+                                                tool_calls=[],
+                                                content=model_output)
+
+    def extract_tool_calls_streaming(
+        self,
+        previous_text: str,
+        current_text: str,
+        delta_text: str,
+        previous_token_ids: Sequence[int],
+        current_token_ids: Sequence[int],
+        delta_token_ids: Sequence[int],
+        request: ChatCompletionRequest,
+    ) -> Union[DeltaMessage, None]:
+        """Extract tool call deltas from a partially streamed response."""
+        if not current_text.startswith("[") and not current_text.startswith(
+                "<|python_start|>"):
+            return DeltaMessage(content=delta_text)
+
+        try:
+            # Strip a leading <|python_start|> and a trailing <|python_end|>.
+            if current_text.startswith("<|python_start|>"):
+                current_text = current_text[len("<|python_start|>"):]
+            if current_text.endswith("<|python_end|>"):
+                current_text = current_text[:current_text.
+                                            rfind("<|python_end|>")]
+            valid_and_added_text = _make_valid_python(current_text)
+            if valid_and_added_text is None:
+                return None
+            valid_text, added_text = valid_and_added_text
+
+            module = ast.parse(valid_text)
+            parsed = getattr(module.body[0], "value", None)
+            if not isinstance(parsed, ast.List) or not all(
+                    isinstance(e, ast.Call) for e in parsed.elts):
+                raise _UnexpectedAstError(
+                    "Tool output must be a list of function calls")
+            tool_calls = [
+                _handle_single_tool(e)  # type: ignore
+                for e in parsed.elts
+            ]
+
+            tool_deltas = []
+            for index, new_call in enumerate(tool_calls):
+                if index < self.current_tool_index:
+                    continue
+
+                self.current_tool_index = index
+                if len(self.streamed_args_for_tool) == index:
+                    self.streamed_args_for_tool.append("")
+
+                new_call_complete = index < len(
+                    tool_calls) - 1 or ")]" not in added_text
+                if new_call_complete:
+                    self.current_tool_index += 1
+
+                withheld_suffix = (added_text[:-2]
+                                   if not new_call_complete else "")
+                if not new_call_complete and added_text[-2] == ")":
+                    # Call is still open: also withhold the JSON closing "}".
+                    withheld_suffix = withheld_suffix + "}"
+                # Strings get single quotes in the model-produced string.
+                # JSON requires double quotes.
+                withheld_suffix = withheld_suffix.replace("'", '"')
+                delta = _compute_tool_delta(self.streamed_args_for_tool[index],
+                                            new_call, index, withheld_suffix)
+
+                if delta is not None:
+                    tool_deltas.append(delta)
+                    if (delta.function is not None
+                            and delta.function.arguments is not None):
+                        self.streamed_args_for_tool[
+                            index] += delta.function.arguments
+
+            # HACK: serving_chat.py inspects the internal state of tool parsers
+            # when determining its final streaming delta, automatically
+            # adding autocompleted JSON.
+            # These two lines avoid that nonsense while ensuring finish_reason
+            # is set to tool_calls when at least one tool is called.
+            if tool_deltas and not self.prev_tool_call_arr:
+                self.prev_tool_call_arr = [{"arguments": {}}]
+
+            if tool_deltas:
+                return DeltaMessage(tool_calls=tool_deltas)
+            elif not added_text and self.current_tool_id > 0:
+                # Return an empty DeltaMessage once the tool calls are all done
+                # so that finish_reason gets set.
+                return DeltaMessage(content='')
+            else:
+                return None
+        except Exception:
+            logger.exception("Error trying to handle streaming tool call.")
+            logger.debug(
+                "Skipping chunk as a result of tool streaming extraction "
+                "error")
+            return None
+
+
+def _get_parameter_value(val: ast.expr) -> Any:
+    """Convert a literal, dict, or list AST node into a plain Python value."""
+    if isinstance(val, ast.Constant):
+        return val.value
+    if isinstance(val, ast.List):
+        return [_get_parameter_value(item) for item in val.elts]
+    if not isinstance(val, ast.Dict):
+        raise _UnexpectedAstError("Tool call arguments must be literals")
+    if any(not isinstance(key, ast.Constant) for key in val.keys):
+        raise _UnexpectedAstError(
+            "Dict tool call arguments must have literal keys")
+    return {
+        key.value: _get_parameter_value(item)  # type: ignore
+        for key, item in zip(val.keys, val.values)
+    }
+
+
+def _handle_single_tool(call: ast.Call) -> ToolCall:
+    """Convert one `name(key=value, ...)` AST call node into a ToolCall."""
+    if not isinstance(call.func, ast.Name):
+        raise _UnexpectedAstError("Invalid tool call name")
+    arguments = {k.arg: _get_parameter_value(k.value) for k in call.keywords}
+    # ensure_ascii=False keeps non-ASCII argument text unescaped.
+    args_json = json.dumps(arguments, ensure_ascii=False)
+    return ToolCall(type="function",
+                    function=FunctionCall(name=call.func.id,
+                                          arguments=args_json))
+
+
+def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
+    """Complete partial model output so that it can be parsed as Python.
+
+    Args:
+        text: The (possibly incomplete) tool call text seen so far.
+
+    Returns:
+        ``(completed_text, added_text)`` where ``added_text`` is the closing
+        suffix that was appended, or ``None`` when the text stops at a point
+        that cannot be completed yet.
+
+    Raises:
+        _UnexpectedAstError: If a closing bracket does not match the most
+            recent unclosed bracket or quote.
+    """
+    closers = {"[": "]", "(": ")", "{": "}", "'": "'", '"': '"'}
+    expected_opener = {
+        "]": ("[", "Mismatched square brackets"),
+        ")": ("(", "Mismatched parentheses"),
+        "}": ("{", "Mismatched curly braces"),
+    }
+    # Unclosed brackets and quotes, innermost last.
+    open_stack: list[str] = []
+    for pos, ch in enumerate(text):
+        if ch in ("[", "(", "{"):
+            open_stack.append(ch)
+        elif ch in expected_opener:
+            opener, message = expected_opener[ch]
+            if not open_stack or open_stack.pop() != opener:
+                raise _UnexpectedAstError(message)
+        elif ch in ("'", '"'):
+            top = open_stack[-1] if open_stack else None
+            if top == ch:
+                # A quote preceded by a backslash does not end the string.
+                if not (pos > 0 and text[pos - 1] == "\\"):
+                    open_stack.pop()
+            elif top not in ("'", '"'):
+                # Not inside a string opened by the other quote: open one.
+                open_stack.append(ch)
+
+    text = text.rstrip()
+    if text.endswith(("=", ":")):
+        # Since we have no type information for this property/parameter value,
+        # we can't fill in a valid value.
+        return None
+    top = open_stack[-1] if open_stack else None
+    # Compare ":" and "," counts in the text before the last "{".
+    if top == "{":
+        before_brace = text[:text.rfind("{")]
+        if before_brace.count(":") <= before_brace.count(","):
+            return None  # Incomplete property name within parameter value
+    # Compare "=" and "," counts in the text before the last "(".
+    if top == "(":
+        before_paren = text[:text.rfind("(")]
+        if before_paren.count("=") <= before_paren.count(","):
+            return None  # Incomplete parameter name
+    if text.endswith(","):
+        text = text[:-1]
+    if top == "[" and not text.endswith(("[", ")")):
+        return None  # Incomplete function name
+
+    # Close everything still open, innermost first.
+    added_text = "".join(closers[ch] for ch in reversed(open_stack))
+    return text + added_text, added_text
+
+
+def _compute_tool_delta(previously_sent_args: str, new_call: ToolCall,
+                        index: int,
+                        withheld_suffix: str) -> Union[DeltaToolCall, None]:
+    """Return the not-yet-sent part of ``new_call`` as a delta, if any."""
+    pending_args = new_call.function.arguments
+    if withheld_suffix:
+        # Drop the withheld suffix before working out what to send.
+        assert pending_args.endswith(withheld_suffix)
+        pending_args = pending_args[:-len(withheld_suffix)]
+    if not previously_sent_args:
+        # Nothing sent yet: this delta carries the id, name and initial args.
+        first = DeltaFunctionCall(name=new_call.function.name,
+                                  arguments=pending_args)
+        return DeltaToolCall(id=new_call.id, type="function", index=index,
+                             function=first)
+    arg_diff = pending_args[len(previously_sent_args):]
+    if not arg_diff:
+        return None
+    return DeltaToolCall(id=None, index=index,
+                         function=DeltaFunctionCall(arguments=arg_diff))
diff --git a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
index 9307034f40d6..4eda7044cbba 100644
--- a/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/llama_tool_parser.py
@@ -1,12 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
-import re
 from collections.abc import Sequence
 from json import JSONDecoder
 from typing import Union
 
 import partial_json_parser
+import regex as re
 from partial_json_parser.core.options import Allow
 from transformers import PreTrainedTokenizerBase
 
@@ -88,7 +88,8 @@ def extract_tool_calls(
                         # function call args are JSON but as a string
                         arguments=json.dumps(raw_function_call["arguments"] \
                                 if "arguments" in raw_function_call \
-                                else raw_function_call["parameters"])))
+                                else raw_function_call["parameters"],
+                                ensure_ascii=False)))
                 for raw_function_call in function_call_arr
             ]
 
@@ -174,7 +175,8 @@ def extract_tool_calls_streaming(
                 if self.current_tool_id >= 0:
                     cur_arguments = current_tool_call.get("arguments")
                     if cur_arguments:
-                        cur_args_json = json.dumps(cur_arguments)
+                        cur_args_json = json.dumps(cur_arguments,
+                                                   ensure_ascii=False)
                         sent = len(
                             self.streamed_args_for_tool[self.current_tool_id])
                         argument_diff = cur_args_json[sent:]
@@ -226,7 +228,8 @@ def extract_tool_calls_streaming(
                 if cur_arguments:
                     sent = len(
                         self.streamed_args_for_tool[self.current_tool_id])
-                    cur_args_json = json.dumps(cur_arguments)
+                    cur_args_json = json.dumps(cur_arguments,
+                                               ensure_ascii=False)
                     prev_arguments = self.prev_tool_call_arr[
                         self.current_tool_id].get("arguments")
 
@@ -234,7 +237,8 @@ def extract_tool_calls_streaming(
                     if is_complete[self.current_tool_id]:
                         argument_diff = cur_args_json[sent:]
                     elif prev_arguments:
-                        prev_args_json = json.dumps(prev_arguments)
+                        prev_args_json = json.dumps(prev_arguments,
+                                                    ensure_ascii=False)
                         if cur_args_json != prev_args_json:
 
                             prefix = find_common_prefix(
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
index 9dbfe85ecc68..fecad7e653ab 100644
--- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py
@@ -1,13 +1,13 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
-import re
 from collections.abc import Sequence
 from random import choices
 from string import ascii_letters, digits
 from typing import Union
 
 import partial_json_parser
+import regex as re
 from partial_json_parser.core.options import Allow
 from pydantic import Field
 
diff --git a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
index abf70a5e85c4..b403a146716d 100644
--- a/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/phi4mini_tool_parser.py
@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
-import re
 from collections.abc import Sequence
 from typing import Any, Optional
 
+import regex as re
 from transformers import PreTrainedTokenizerBase
 
 from vllm.entrypoints.chat_utils import random_tool_call_id
@@ -79,10 +79,11 @@ def extract_tool_calls(
                         name=raw_function_call["name"],
                         # function call args are JSON but as a string
                         arguments=json.dumps(
-                            raw_function_call["arguments"] if "arguments" in
-                            raw_function_call else
-                            raw_function_call["parameters"])))
-                for raw_function_call in function_call_arr
+                            raw_function_call["arguments"]
+                            if "arguments" in raw_function_call else
+                            raw_function_call["parameters"],
+                            ensure_ascii=False),
+                    )) for raw_function_call in function_call_arr
             ]
 
             # get any content before the tool call
diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
index bb91a35af3be..548ff39d1ca4 100644
--- a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
+++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py
@@ -2,10 +2,10 @@
 
 import ast
 import json
-import re
 from collections.abc import Sequence
 from typing import Any, Union
 
+import regex as re
 from transformers import PreTrainedTokenizerBase
 
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
@@ -200,9 +200,12 @@ def _handle_single_tool(call: ast.Call) -> ToolCall:
     arguments = {}
     for keyword in call.keywords:
         arguments[keyword.arg] = _get_parameter_value(keyword.value)
-    return ToolCall(type="function",
-                    function=FunctionCall(name=function_name,
-                                          arguments=json.dumps(arguments)))
+    return ToolCall(
+        type="function",
+        function=FunctionCall(name=function_name,
+                              arguments=json.dumps(arguments,
+                                                   ensure_ascii=False)),
+    )
 
 
 def _make_valid_python(text: str) -> Union[tuple[str, str], None]:
diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py
index 2fe6e1a9e9c4..cc651a172b40 100644
--- a/vllm/entrypoints/utils.py
+++ b/vllm/entrypoints/utils.py
@@ -13,6 +13,13 @@
 
 logger = init_logger(__name__)
 
+VLLM_SERVE_PARSER_EPILOG = (  # tip text for `--help=<keyword>` filtering
+    "Tip: Use `vllm serve --help=<keyword>` to explore arguments from help.\n"
+    "   - To view an argument group:    --help=ModelConfig\n"
+    "   - To view a single argument:    --help=max-num-seqs\n"
+    "   - To search by keyword:         --help=max\n"
+    "   - To list all groups:           --help=listgroup")
+
 
 async def listen_for_disconnect(request: Request) -> None:
     """Returns if a disconnect message is received"""
@@ -158,3 +165,55 @@ def _validate_truncation_size(
             tokenization_kwargs["max_length"] = truncate_prompt_tokens
 
     return truncate_prompt_tokens
+
+
+def show_filtered_argument_or_group_from_help(parser):
+    """Print help filtered by a `--help=<keyword>` CLI argument, then exit."""
+    import sys
+    for cli_arg in sys.argv:
+        if not cli_arg.startswith('--help='):
+            continue
+        search_keyword = cli_arg.split('=', 1)[1]
+        # Group and option matching below is case-insensitive.
+        needle = search_keyword.lower()
+
+        # List available groups
+        if search_keyword == 'listgroup':
+            print("\nAvailable argument groups:")
+            for group in parser._action_groups:
+                title = group.title
+                if not title or title.startswith("positional arguments"):
+                    continue
+                print(f"  - {title}")
+                if group.description:
+                    print("    " + group.description.strip())
+                print()
+            sys.exit(0)
+
+        # For group search
+        formatter = parser._get_formatter()
+        for group in parser._action_groups:
+            if group.title and group.title.lower() == needle:
+                formatter.start_section(group.title)
+                formatter.add_text(group.description)
+                formatter.add_arguments(group._group_actions)
+                formatter.end_section()
+                print(formatter.format_help())
+                sys.exit(0)
+
+        # For single arg
+        matched_actions = [
+            action for group in parser._action_groups
+            for action in group._group_actions
+            if any(needle in opt.lower() for opt in action.option_strings)
+        ]
+        if matched_actions:
+            print(f"\nParameters matching '{search_keyword}':\n")
+            formatter = parser._get_formatter()
+            formatter.add_arguments(matched_actions)
+            print(formatter.format_help())
+            sys.exit(0)
+
+        print(f"\nNo group or parameter matching '{search_keyword}'")
+        print("Tip: use `--help=listgroup` to view all groups.")
+        sys.exit(1)
diff --git a/vllm/envs.py b/vllm/envs.py
index a60749015283..e24fdf712acc 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -122,6 +122,7 @@
     VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
     VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5557
     VLLM_ALL2ALL_BACKEND: str = "naive"
+    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
 
 
 def get_default_cache_root():
@@ -168,7 +169,7 @@ def get_vllm_port() -> Optional[int]:
                 raise ValueError(
                     f"VLLM_PORT '{port}' appears to be a URI. "
                     "This may be caused by a Kubernetes service discovery issue"
-                    "check the warning in: https://docs.vllm.ai/en/stable/serving/env_vars.html"
+                    "check the warning in: https://docs.vllm.ai/en/stable/usage/env_vars.html"
                 )
         except Exception:
             pass
@@ -180,7 +181,7 @@ def get_vllm_port() -> Optional[int]:
 # The begin-* and end* here are used by the documentation generator
 # to extract the used env vars.
 
-# begin-env-vars-definition
+# --8<-- [start:env-vars-definition]
 
 environment_variables: dict[str, Callable[[], Any]] = {
 
@@ -843,11 +844,21 @@ def get_vllm_port() -> Optional[int]:
     lambda: int(os.getenv("VLLM_NIXL_SIDE_CHANNEL_PORT", "5557")),
 
     # all2all backend for vllm's expert parallel communication
+    # Available options:
+    # - "naive": naive all2all implementation using all-reduce
+    # - "pplx": use pplx kernels
     "VLLM_ALL2ALL_BACKEND":
     lambda: os.getenv("VLLM_ALL2ALL_BACKEND", "naive"),
+
+    # Controls the maximum number of tokens per expert supported by the
+    # NVFP4 MoE CUTLASS kernel. This value sizes the buffer that holds the
+    # blockscale tensor produced by NVFP4 quantization of activations.
+    # The limit prevents the kernel from running out of memory.
+    "VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE":
+    lambda: int(os.getenv("VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE", "163840")),
 }
 
-# end-env-vars-definition
+# --8<-- [end:env-vars-definition]
 
 
 def __getattr__(name: str):
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index 522bd940211f..40ca1d29939a 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -74,7 +74,7 @@ def collective_rpc(self,
                 `self` argument, in addition to the arguments passed in `args`
                 and `kwargs`. The `self` argument will be the worker object.
             timeout: Maximum time in seconds to wait for execution. Raises a
-                {exc}`TimeoutError` on timeout. `None` means wait indefinitely.
+                [`TimeoutError`][] on timeout. `None` means wait indefinitely.
             args: Positional arguments to pass to the worker method.
             kwargs: Keyword arguments to pass to the worker method.
 
diff --git a/vllm/executor/ray_distributed_executor.py b/vllm/executor/ray_distributed_executor.py
index 9b0b98731e03..8e67c7a41bb1 100644
--- a/vllm/executor/ray_distributed_executor.py
+++ b/vllm/executor/ray_distributed_executor.py
@@ -528,12 +528,12 @@ def _wait_for_tasks_completion(self, parallel_worker_tasks: Any) -> None:
         ray.get(parallel_worker_tasks)
 
     def _check_ray_cgraph_installation(self):
-        import pkg_resources
+        import importlib.metadata
+
         from packaging import version
 
         required_version = version.parse("2.43.0")
-        current_version = version.parse(
-            pkg_resources.get_distribution("ray").version)
+        current_version = version.parse(importlib.metadata.version("ray"))
         if current_version < required_version:
             raise ValueError(f"Ray version {required_version} is "
                              f"required, but found {current_version}")
diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py
index 37cc07bfbb36..7bc98a16f041 100644
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -87,9 +87,8 @@ def execute_model_spmd(
             # TODO(swang): This is needed right now because Ray Compiled Graph
             # executes on a background thread, so we need to reset torch's
             # current device.
-            import torch
             if not self.compiled_dag_cuda_device_set:
-                torch.cuda.set_device(self.worker.device)
+                current_platform.set_device(self.worker.device)
                 self.compiled_dag_cuda_device_set = True
 
             output = self.worker._execute_model_spmd(execute_model_req,
@@ -113,8 +112,7 @@ def setup_device_if_necessary(self):
                     # Not needed
                     pass
                 else:
-                    import torch
-                    torch.cuda.set_device(self.worker.device)
+                    current_platform.set_device(self.worker.device)
 
                 self.compiled_dag_cuda_device_set = True
 
diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 5d2d95f18d2f..3c8083e3dd0d 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -120,7 +120,10 @@ def set_forward_context(attn_metadata: Any,
             # we use synchronous scheduling right now,
             # adding a sync point here should not affect
             # scheduling of the next batch
-            torch.cuda.synchronize()
+            from vllm.platforms import current_platform
+            synchronize = current_platform.synchronize
+            if synchronize is not None:
+                synchronize()
             now = time.perf_counter()
             # time measurement is in milliseconds
             batchsize_forward_time[batchsize].append(
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index 0673aece9108..df4f844cd815 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -10,8 +10,9 @@
 
 INPUT_REGISTRY = InputRegistry()
 """
-The global {class}`~InputRegistry` which is used by {class}`~vllm.LLMEngine`
-to dispatch data processing according to the target model.
+The global [`InputRegistry`][vllm.inputs.registry.InputRegistry] which is used
+by [`LLMEngine`][vllm.LLMEngine] to dispatch data processing according to the
+target model.
 """
 
 __all__ = [
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 3b58ec47d5bf..843c45bd6163 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -80,22 +80,24 @@ class EmbedsPrompt(TypedDict):
 """
 Set of possible schemas for a single prompt:
 
-- A text prompt ({class}`str` or {class}`TextPrompt`)
-- A tokenized prompt ({class}`TokensPrompt`)
-- An embeddings prompt ({class}`EmbedsPrompt`)
+- A text prompt ([`str`][] or [`TextPrompt`][vllm.inputs.data.TextPrompt])
+- A tokenized prompt ([`TokensPrompt`][vllm.inputs.data.TokensPrompt])
+- An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt])
 
 Note that "singleton" is as opposed to a data structure
 which encapsulates multiple prompts, i.e. of the sort
 which may be utilized for encoder/decoder models when
 the user desires to express both the encoder & decoder
-prompts explicitly, i.e. {class}`ExplicitEncoderDecoderPrompt`
+prompts explicitly, i.e. 
+[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
 
-A prompt of type {class}`SingletonPrompt` may be employed
-as (1) input to a decoder-only model, (2) input to
+A prompt of type [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] may be 
+employed as (1) input to a decoder-only model, (2) input to
 the encoder of an encoder/decoder model, in the scenario
 where the decoder-prompt is not specified explicitly, or
 (3) as a member of a larger data structure encapsulating
-more than one prompt, i.e. {class}`ExplicitEncoderDecoderPrompt`
+more than one prompt, i.e. 
+[`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
 """
 
 
@@ -126,18 +128,20 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]):
     comprising an explicit encoder prompt and a decoder prompt.
 
     The encoder and decoder prompts, respectively, may be formatted
-    according to any of the {class}`SingletonPrompt` schemas,
+    according to any of the
+    [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] schemas,
     and are not required to have the same schema.
 
     Only the encoder prompt may have multi-modal data. mm_processor_kwargs
     should be at the top-level, and should not be set in the encoder/decoder
     prompts, since they are agnostic to the encoder/decoder.
 
-    Note that an {class}`ExplicitEncoderDecoderPrompt` may not
-    be used as an input to a decoder-only model,
+    Note that an
+    [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
+    may not be used as an input to a decoder-only model,
     and that the `encoder_prompt` and `decoder_prompt`
     fields of this data structure themselves must be
-    {class}`SingletonPrompt` instances.
+    [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] instances.
     """
 
     encoder_prompt: _T1_co
@@ -152,11 +156,11 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]):
 Set of possible schemas for an LLM input, including
 both decoder-only and encoder/decoder input types:
 
-- A text prompt ({class}`str` or {class}`TextPrompt`)
-- A tokenized prompt ({class}`TokensPrompt`)
-- An embeddings prompt ({class}`EmbedsPrompt`)
+- A text prompt ([`str`][] or [`TextPrompt`][vllm.inputs.data.TextPrompt])
+- A tokenized prompt ([`TokensPrompt`][vllm.inputs.data.TokensPrompt])
+- An embeddings prompt ([`EmbedsPrompt`][vllm.inputs.data.EmbedsPrompt])
 - A single data structure containing both an encoder and a decoder prompt
-  ({class}`ExplicitEncoderDecoderPrompt`)
+  ([`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt])
 """
 
 
@@ -189,7 +193,8 @@ def token_inputs(
     prompt: Optional[str] = None,
     cache_salt: Optional[str] = None,
 ) -> TokenInputs:
-    """Construct {class}`TokenInputs` from optional values."""
+    """Construct [`TokenInputs`][vllm.inputs.data.TokenInputs] from optional
+    values."""
     inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids)
 
     if prompt is not None:
@@ -221,7 +226,8 @@ def embeds_inputs(
     prompt_embeds: torch.Tensor,
     cache_salt: Optional[str] = None,
 ) -> EmbedsInputs:
-    """Construct :class:`EmbedsInputs` from optional values."""
+    """Construct [`EmbedsInputs`][vllm.inputs.data.EmbedsInputs] from optional
+    values."""
     inputs = EmbedsInputs(type="embeds", prompt_embeds=prompt_embeds)
 
     if cache_salt is not None:
@@ -232,7 +238,7 @@ def embeds_inputs(
 
 DecoderOnlyInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
 """
-The inputs in {class}`~vllm.LLMEngine` before they are
+The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they are
 passed to the model executor.
 This specifies the data required for decoder-only models.
 """
@@ -240,11 +246,12 @@ def embeds_inputs(
 
 class EncoderDecoderInputs(TypedDict):
     """
-    The inputs in {class}`~vllm.LLMEngine` before they are
-    passed to the model executor.
+    The inputs in [`LLMEngine`][vllm.engine.llm_engine.LLMEngine] before they
+    are passed to the model executor.
 
     This specifies the required data for encoder-decoder models.
     """
+
     encoder: Union[TokenInputs, "MultiModalInputs"]
     """The inputs for the encoder portion."""
 
@@ -254,13 +261,13 @@ class EncoderDecoderInputs(TypedDict):
 
 SingletonInputs = Union[TokenInputs, EmbedsInputs, "MultiModalInputs"]
 """
-A processed {class}`SingletonPrompt` which can be passed to
-{class}`vllm.sequence.Sequence`.
+A processed [`SingletonPrompt`][vllm.inputs.data.SingletonPrompt] which can be 
+passed to [`vllm.sequence.Sequence`][].
 """
 
 ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs]
 """
-The inputs to {data}`vllm.inputs.InputProcessor`.
+The outputs from [`vllm.inputs.preprocess.InputPreprocessor`][].
 """
 
 _T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt)
@@ -277,7 +284,8 @@ def build_explicit_enc_dec_prompt(
     return ExplicitEncoderDecoderPrompt(
         encoder_prompt=encoder_prompt,
         decoder_prompt=decoder_prompt,
-        mm_processor_kwargs=mm_processor_kwargs)
+        mm_processor_kwargs=mm_processor_kwargs,
+    )
 
 
 def zip_enc_dec_prompts(
@@ -288,7 +296,8 @@ def zip_enc_dec_prompts(
 ) -> list[ExplicitEncoderDecoderPrompt[_T1, _T2]]:
     """
     Zip encoder and decoder prompts together into a list of
-    {class}`ExplicitEncoderDecoderPrompt` instances.
+    [`ExplicitEncoderDecoderPrompt`][vllm.inputs.data.ExplicitEncoderDecoderPrompt]
+    instances.
 
     ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same
     dictionary will be used for every encoder/decoder prompt. If an iterable is
@@ -299,10 +308,11 @@ def zip_enc_dec_prompts(
     if isinstance(mm_processor_kwargs, dict):
         return [
             build_explicit_enc_dec_prompt(
-                encoder_prompt, decoder_prompt,
-                cast(dict[str, Any], mm_processor_kwargs))
-            for (encoder_prompt,
-                 decoder_prompt) in zip(enc_prompts, dec_prompts)
+                encoder_prompt,
+                decoder_prompt,
+                cast(dict[str, Any], mm_processor_kwargs),
+            ) for (encoder_prompt,
+                   decoder_prompt) in zip(enc_prompts, dec_prompts)
         ]
     return [
         build_explicit_enc_dec_prompt(encoder_prompt, decoder_prompt,
diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py
index d17122b48344..4c64a41ace31 100644
--- a/vllm/inputs/parse.py
+++ b/vllm/inputs/parse.py
@@ -23,13 +23,13 @@ class ParsedTokens(TypedDict):
 
 @overload
 def parse_and_batch_prompt(
-        prompt: Union[str, list[str]]) -> Sequence[ParsedText]:
+    prompt: Union[str, list[str]], ) -> Sequence[ParsedText]:
     ...
 
 
 @overload
 def parse_and_batch_prompt(
-        prompt: Union[list[int], list[list[int]]]) -> Sequence[ParsedTokens]:
+    prompt: Union[list[int], list[list[int]]], ) -> Sequence[ParsedTokens]:
     ...
 
 
@@ -86,7 +86,7 @@ class ParsedTokensPrompt(TypedDict):
 
 
 class ParsedEmbedsPrompt(TypedDict):
-    type: Literal['embeds']
+    type: Literal["embeds"]
     content: EmbedsPrompt
 
 
@@ -133,7 +133,7 @@ def parse_singleton_prompt(prompt: SingletonPrompt) -> ParsedSingletonPrompt:
 
 
 def is_explicit_encoder_decoder_prompt(
-        prompt: PromptType) -> TypeIs[ExplicitEncoderDecoderPrompt]:
+    prompt: PromptType, ) -> TypeIs[ExplicitEncoderDecoderPrompt]:
     return isinstance(prompt, dict) and "encoder_prompt" in prompt
 
 
diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 6e8effd60274..b9acabeabd8d 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -67,11 +67,11 @@ def get_eos_token_id(self,
         return self.tokenizer.get_lora_tokenizer(lora_request).eos_token_id
 
     def get_decoder_start_token_id(self) -> Optional[int]:
-        '''
+        """
         Obtain the decoder start token id employed by an encoder/decoder
         model. Returns None for non-encoder/decoder models or if the
         model config is unavailable.
-        '''
+        """
 
         if not self.model_config.is_encoder_decoder:
             logger.warning_once(
@@ -79,14 +79,14 @@ def get_decoder_start_token_id(self) -> Optional[int]:
                 "this is not an encoder/decoder model.")
             return None
 
-        if (self.model_config is None or self.model_config.hf_config is None):
+        if self.model_config is None or self.model_config.hf_config is None:
             logger.warning_once(
                 "Using None for decoder start token id because "
                 "model config is not available.")
             return None
 
         dec_start_token_id = getattr(self.model_config.hf_config,
-                                     'decoder_start_token_id', None)
+                                     "decoder_start_token_id", None)
         if dec_start_token_id is None:
             logger.warning_once(
                 "Falling back on <BOS> for decoder start token "
@@ -97,7 +97,7 @@ def get_decoder_start_token_id(self) -> Optional[int]:
         return dec_start_token_id
 
     def _get_default_enc_dec_decoder_prompt(self) -> list[int]:
-        '''
+        """
         Specifically for encoder/decoder models:
         generate a default decoder prompt for when
         the user specifies only the encoder prompt.
@@ -126,7 +126,7 @@ def _get_default_enc_dec_decoder_prompt(self) -> list[int]:
         Returns:
 
         * prompt_token_ids
-        '''
+        """
 
         bos_token_id = self.get_bos_token_id()
         assert bos_token_id is not None
@@ -224,7 +224,10 @@ async def _tokenize_prompt_async(
         lora_request: Optional[LoRARequest],
         tokenization_kwargs: Optional[dict[str, Any]] = None,
     ) -> list[int]:
-        """Async version of {meth}`_tokenize_prompt`."""
+        """
+        Async version of
+        [`_tokenize_prompt`][vllm.inputs.preprocess.InputPreprocessor._tokenize_prompt].
+        """
         tokenizer = self.get_tokenizer_group()
         tokenization_kwargs = self._get_tokenization_kw(tokenization_kwargs)
 
@@ -287,7 +290,10 @@ async def _process_multimodal_async(
         lora_request: Optional[LoRARequest],
         return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
-        """Async version of {meth}`_process_multimodal`."""
+        """
+        Async version of
+        [`_process_multimodal`][vllm.inputs.preprocess.InputPreprocessor._process_multimodal].
+        """
         tokenizer = await self._get_mm_tokenizer_async(lora_request)
 
         mm_processor = self.mm_registry.create_processor(self.model_config,
@@ -472,7 +478,7 @@ def _prompt_to_llm_inputs(
 
         Returns:
 
-        * {class}`SingletonInputs` instance
+        * [`SingletonInputs`][vllm.inputs.data.SingletonInputs] instance
         """
         parsed = parse_singleton_prompt(prompt)
 
@@ -508,7 +514,10 @@ async def _prompt_to_llm_inputs_async(
         lora_request: Optional[LoRARequest] = None,
         return_mm_hashes: bool = False,
     ) -> SingletonInputs:
-        """Async version of {meth}`_prompt_to_llm_inputs`."""
+        """
+        Async version of
+        [`_prompt_to_llm_inputs`][vllm.inputs.preprocess.InputPreprocessor._prompt_to_llm_inputs].
+        """
         parsed = parse_singleton_prompt(prompt)
 
         if parsed["type"] == "embeds":
@@ -644,7 +653,9 @@ def _process_encoder_decoder_prompt(
     ) -> EncoderDecoderInputs:
         """
         For encoder/decoder models only:
-        Process an input prompt into an {class}`EncoderDecoderInputs` instance.
+        Process an input prompt into an
+        [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
+        instance.
 
         There are two types of input prompts:
         singleton prompts which carry only the
@@ -670,7 +681,8 @@ def _process_encoder_decoder_prompt(
 
         Returns:
 
-        * {class}`EncoderDecoderInputs` instance
+        * [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
+          instance
         """
         encoder_inputs: SingletonInputs
         decoder_inputs: Optional[SingletonInputs]
@@ -710,7 +722,10 @@ async def _process_encoder_decoder_prompt_async(
         prompt: PromptType,
         tokenization_kwargs: Optional[dict[str, Any]] = None,
     ) -> EncoderDecoderInputs:
-        """Async version of {meth}`_process_encoder_decoder_prompt`."""
+        """
+        Async version of
+        [`_process_encoder_decoder_prompt`][vllm.inputs.preprocess.InputPreprocessor._process_encoder_decoder_prompt].
+        """
         encoder_inputs: SingletonInputs
         decoder_inputs: Optional[SingletonInputs]
 
@@ -778,7 +793,8 @@ def _process_decoder_only_prompt(
     ) -> DecoderOnlyInputs:
         """
         For decoder-only models:
-        Process an input prompt into an {class}`DecoderOnlyInputs` instance.
+        Process an input prompt into a
+        [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance.
 
         Arguments:
 
@@ -789,7 +805,7 @@ def _process_decoder_only_prompt(
 
         Returns:
 
-        * {class}`DecoderOnlyInputs` instance
+        * [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] instance
         """
 
         prompt_comps = self._prompt_to_llm_inputs(
@@ -812,7 +828,10 @@ async def _process_decoder_only_prompt_async(
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         return_mm_hashes: bool = False,
     ) -> DecoderOnlyInputs:
-        """Async version of {meth}`_process_decoder_only_prompt`."""
+        """
+        Async version of
+        [`_process_decoder_only_prompt`][vllm.inputs.preprocess.InputPreprocessor._process_decoder_only_prompt].
+        """
         prompt_comps = await self._prompt_to_llm_inputs_async(
             prompt,
             tokenization_kwargs=tokenization_kwargs,
@@ -863,7 +882,10 @@ async def preprocess_async(
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
         return_mm_hashes: bool = False,
     ) -> ProcessorInputs:
-        """Async version of {meth}`preprocess`."""
+        """
+        Async version of
+        [`preprocess`][vllm.inputs.preprocess.InputPreprocessor.preprocess].
+        """
         if self.model_config.is_encoder_decoder:
             assert not return_mm_hashes, (
                 "Multimodal hashes for encoder-decoder models should not be ",
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 148b3558c15e..f424a8f613ab 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -38,7 +38,7 @@ def get_hf_config(
     ) -> _C:
         """
         Get the HuggingFace configuration
-        ({class}`transformers.PretrainedConfig`) of the model,
+        (`transformers.PretrainedConfig`) of the model,
         additionally checking its type.
 
         Raises:
@@ -79,7 +79,7 @@ def get_hf_processor(
     ) -> _P:
         """
         Get the HuggingFace processor
-        ({class}`transformers.ProcessorMixin`) of the model,
+        (`transformers.ProcessorMixin`) of the model,
         additionally checking its type.
 
         Raises:
diff --git a/vllm/logger.py b/vllm/logger.py
index cf32041c5b70..fd16dd95bb1b 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -68,22 +68,22 @@ class _VllmLogger(Logger):
     """
     Note:
         This class is just to provide type information.
-        We actually patch the methods directly on the {class}`logging.Logger`
+        We actually patch the methods directly on the [`logging.Logger`][]
         instance to avoid conflicting with other libraries such as
         `intel_extension_for_pytorch.utils._logger`.
     """
 
     def info_once(self, msg: str, *args: Hashable) -> None:
         """
-        As {meth}`info`, but subsequent calls with the same message
-        are silently dropped.
+        As [`info`][logging.Logger.info], but subsequent calls with
+        the same message are silently dropped.
         """
         _print_info_once(self, msg, *args)
 
     def warning_once(self, msg: str, *args: Hashable) -> None:
         """
-        As {meth}`warning`, but subsequent calls with the same message
-        are silently dropped.
+        As [`warning`][logging.Logger.warning], but subsequent calls with
+        the same message are silently dropped.
         """
         _print_warning_once(self, msg, *args)
 
diff --git a/vllm/logging_utils/dump_input.py b/vllm/logging_utils/dump_input.py
index 169e24794095..47ce0ab188bd 100644
--- a/vllm/logging_utils/dump_input.py
+++ b/vllm/logging_utils/dump_input.py
@@ -18,7 +18,7 @@
 
 def prepare_object_to_dump(obj) -> str:
     if isinstance(obj, str):
-        return "'{obj}'"  # Double quotes
+        return f"'{obj}'"  # Wrap the string in single quotes
     elif isinstance(obj, dict):
         dict_str = ', '.join({f'{str(k)}: {prepare_object_to_dump(v)}' \
             for k, v in obj.items()})
@@ -42,9 +42,9 @@ def prepare_object_to_dump(obj) -> str:
         return obj.anon_repr()
     elif hasattr(obj, '__dict__'):
         items = obj.__dict__.items()
-        dict_str = ','.join([f'{str(k)}={prepare_object_to_dump(v)}' \
+        dict_str = ', '.join([f'{str(k)}={prepare_object_to_dump(v)}' \
             for k, v in items])
-        return (f"{type(obj).__name__}({dict_str})")
+        return f"{type(obj).__name__}({dict_str})"
     else:
         # Hacky way to make sure we can serialize the object in JSON format
         try:
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 959fe4a672a6..af5cebdf2a8b 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -3,11 +3,11 @@
 import copy
 import math
 import os
-import re
 from collections.abc import Sequence
 from dataclasses import dataclass, field
 from typing import Any, Callable, Optional, Union
 
+import regex as re
 import safetensors.torch
 import torch
 from torch import nn
@@ -29,6 +29,7 @@
                              get_supported_lora_modules,
                              is_regex_target_modules,
                              parse_fine_tuned_lora_name, replace_submodule)
+from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 from vllm.model_executor.models import SupportsLoRA, supports_multimodal
 from vllm.model_executor.models.interfaces import is_pooling_model
 from vllm.model_executor.models.module_mapping import MultiModelKeys
@@ -185,19 +186,19 @@ def from_lora_tensors(
 
     @classmethod
     def from_local_checkpoint(
-        cls,
-        lora_dir: str,
-        expected_lora_modules: list[str],
-        peft_helper: PEFTHelper,
-        *,
-        lora_model_id: Optional[int] = None,
-        device: str = "cuda",
-        dtype: Optional[torch.dtype] = None,
-        target_embedding_padding: Optional[int] = None,
-        embedding_modules: Optional[dict[str, str]] = None,
-        embedding_padding_modules: Optional[list[str]] = None,
-        weights_mapper: Optional[WeightsMapper] = None,
-    ) -> "LoRAModel":
+            cls,
+            lora_dir: str,
+            expected_lora_modules: list[str],
+            peft_helper: PEFTHelper,
+            *,
+            lora_model_id: Optional[int] = None,
+            device: str = "cuda",
+            dtype: Optional[torch.dtype] = None,
+            target_embedding_padding: Optional[int] = None,
+            embedding_modules: Optional[dict[str, str]] = None,
+            embedding_padding_modules: Optional[list[str]] = None,
+            weights_mapper: Optional[WeightsMapper] = None,
+            tensorizer_config_dict: Optional[dict] = None) -> "LoRAModel":
         """Create a LoRAModel from a local checkpoint.
         
         Args:
@@ -219,10 +220,36 @@ def from_local_checkpoint(
             lora_dir, "new_embeddings.safetensors")
         new_embeddings_bin_file_path = os.path.join(lora_dir,
                                                     "new_embeddings.bin")
+        tensors: dict[str, torch.Tensor] = {}
+        unexpected_modules: list[Union[list[str], str]] = []
+
+        def check_unexpected_modules(modules: dict):
+            for lora_module in modules.keys():  # noqa
+                module_name, _, _ = parse_fine_tuned_lora_name(
+                    lora_module, weights_mapper)
+                part_name = module_name.split(".")[-1]
+                if part_name not in expected_lora_modules:
+                    unexpected_modules.append(module_name)
+            if unexpected_modules:
+                raise ValueError(
+                    f"While loading {lora_dir}, expected"
+                    f" target modules in {expected_lora_modules}"
+                    f" but received {unexpected_modules}."
+                    f" Please verify that the loaded LoRA module is correct")
 
-        unexpected_modules: list[Union[list[str], str]]
-        if os.path.isfile(lora_tensor_path):
-            tensors: dict[str, torch.Tensor] = {}
+        if tensorizer_config_dict:
+            from tensorizer import TensorDeserializer
+
+            tensorizer_config = TensorizerConfig(**tensorizer_config_dict)
+            lora_tensor_path = os.path.join(tensorizer_config.tensorizer_dir,
+                                            "adapter_model.tensors")
+            tensorizer_args = tensorizer_config._construct_tensorizer_args()
+            tensors = TensorDeserializer(lora_tensor_path,
+                                         dtype=tensorizer_config.dtype,
+                                         **tensorizer_args.deserializer_params)
+            check_unexpected_modules(tensors)
+
+        elif os.path.isfile(lora_tensor_path):
             # Find unexpected modules.
             # Use safetensor key as a source of truth to find expected modules.
             # in peft if you have target_modules A, B, C and C does not exist
@@ -232,20 +259,8 @@ def from_local_checkpoint(
             unexpected_modules = []
             with safetensors.safe_open(lora_tensor_path,
                                        framework="pt") as f:  # type: ignore
-                for lora_module in f.keys():  # noqa
-                    module_name, _, _ = parse_fine_tuned_lora_name(
-                        lora_module, weights_mapper)
-                    part_name = module_name.split(".")[-1]
-                    if part_name not in expected_lora_modules:
-                        unexpected_modules.append(module_name)
-                if unexpected_modules:
-                    raise ValueError(
-                        f"While loading {lora_dir}, expected"
-                        f" target modules in {expected_lora_modules}"
-                        f" but received {unexpected_modules}."
-                        f" Please verify that the loaded LoRA module is correct"
-                    )
                 # Load tensors if there are only expected modules.
+                check_unexpected_modules(f)
                 for module in f.keys():  # noqa
                     tensors[module] = f.get_tensor(module)
         elif os.path.isfile(lora_bin_file_path):
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index d5de63f5baad..7d335e5f7fab 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -10,6 +10,7 @@
 
 from vllm.config import LoRAConfig
 from vllm.logger import init_logger
+from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 
 logger = init_logger(__name__)
 
@@ -89,12 +90,31 @@ def from_dict(cls, config_dict: dict) -> "PEFTHelper":
         return cls(**filtered_dict)
 
     @classmethod
-    def from_local_dir(cls, lora_path: str,
-                       max_position_embeddings: Optional[int]) -> "PEFTHelper":
+    def from_local_dir(
+            cls,
+            lora_path: str,
+            max_position_embeddings: Optional[int],
+            tensorizer_config_dict: Optional[dict] = None) -> "PEFTHelper":
         lora_config_path = os.path.join(lora_path, "adapter_config.json")
 
-        with open(lora_config_path) as f:
-            config = json.load(f)
+        if tensorizer_config_dict:
+            tensorizer_config = TensorizerConfig(**tensorizer_config_dict)
+            tensorizer_args = tensorizer_config._construct_tensorizer_args()
+            from tensorizer.stream_io import open_stream
+            lora_config_path = os.path.join(tensorizer_config.lora_dir,
+                                            "adapter_config.json")
+            with open_stream(lora_config_path,
+                             mode="rb",
+                             **tensorizer_args.stream_params) as f:
+                config = json.load(f)
+
+            logger.info("Successfully deserialized LoRA config from %s",
+                        tensorizer_config.lora_dir)
+
+        else:
+            with open(lora_config_path) as f:
+                config = json.load(f)
+
         config["vllm_max_position_embeddings"] = max_position_embeddings
         return cls.from_dict(config)
 
diff --git a/vllm/lora/request.py b/vllm/lora/request.py
index badfaa419377..616e94f8d678 100644
--- a/vllm/lora/request.py
+++ b/vllm/lora/request.py
@@ -31,6 +31,7 @@ class LoRARequest(
     lora_local_path: Optional[str] = msgspec.field(default=None)
     long_lora_max_len: Optional[int] = None
     base_model_name: Optional[str] = msgspec.field(default=None)
+    tensorizer_config_dict: Optional[dict] = None
 
     def __post_init__(self):
         if self.lora_local_path:
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index b66850d4304f..619dd3bdc40a 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
-import re
 from typing import Optional, Union
 
 import huggingface_hub
+import regex as re
 from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
                                    HFValidationError, RepositoryNotFoundError)
 from torch import nn
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 8e5bc6106659..afc8a8dc3b26 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -100,7 +100,8 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
             lora_path = get_adapter_absolute_path(lora_request.lora_path)
 
             peft_helper = PEFTHelper.from_local_dir(
-                lora_path, self.max_position_embeddings)
+                lora_path, self.max_position_embeddings,
+                lora_request.tensorizer_config_dict)
 
             # Validates the LoRA configuration against requirements before
             # loading weights, throwing an exception if validation fails.
@@ -125,6 +126,7 @@ def _load_adapter(self, lora_request: LoRARequest) -> LoRAModel:
                 self.lora_config.lora_extra_vocab_size,
                 embedding_modules=self.embedding_modules,
                 embedding_padding_modules=self.embedding_padding_modules,
+                tensorizer_config_dict=lora_request.tensorizer_config_dict,
                 weights_mapper=hf_to_vllm_mapper)
 
         except FileNotFoundError as e:
diff --git a/vllm/model_executor/guided_decoding/guidance_decoding.py b/vllm/model_executor/guided_decoding/guidance_decoding.py
index 0b1f4762bc73..58adcc3caff9 100644
--- a/vllm/model_executor/guided_decoding/guidance_decoding.py
+++ b/vllm/model_executor/guided_decoding/guidance_decoding.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 import json
-from re import escape as regex_escape
 
 import llguidance
+from regex import escape as regex_escape
 from transformers import PreTrainedTokenizerBase
 
 from vllm.model_executor.guided_decoding.guidance_logits_processors import (
diff --git a/vllm/model_executor/guided_decoding/guidance_logits_processors.py b/vllm/model_executor/guided_decoding/guidance_logits_processors.py
index 4b45c272adc5..e17df68b4b4d 100644
--- a/vllm/model_executor/guided_decoding/guidance_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/guidance_logits_processors.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+import copy
 import os
 from typing import Any
 
@@ -34,9 +35,24 @@ def __init__(
         self.grammar = grammar
         self.tokenizer = tokenizer
         self.tokenizer_name = tokenizer.name_or_path
+        self.ll_tokenizer = None
+        self.ll_matcher = None
+        self.bitmask = None
         self.new_sampling = False
         self.initialized = False
 
+    def clone(self) -> "GuidanceLogitsProcessor":
+        """Copy self; an initialized copy gets a fresh matcher and bitmask."""
+        cloned = copy.copy(self)
+        if self.initialized:
+            cloned.ll_matcher = llguidance.LLMatcher(
+                self.ll_tokenizer,  # type: ignore[assignment]
+                self.grammar,
+                log_level=int(os.environ.get("LLGUIDANCE_LOG_LEVEL", "1")))
+            cloned.bitmask = llguidance.torch.allocate_token_bitmask(
+                1, self.ll_tokenizer.vocab_size)  # type: ignore[attr-defined]
+        return cloned
+
     def _initialize(self):
         if self.initialized:
             return
@@ -56,7 +72,7 @@ def _initialize(self):
 
         # create reusable bitmask
         self.bitmask = llguidance.torch.allocate_token_bitmask(
-            1, self.ll_tokenizer.vocab_size)
+            1, self.ll_tokenizer.vocab_size)  # type: ignore[attr-defined]
 
         self.initialized = True
 
@@ -70,15 +86,17 @@ def __call__(
         self._initialize()
 
         if self.new_sampling and len(input_ids) > 0:
-            self.ll_matcher.consume_token(input_ids[-1])
-            err = self.ll_matcher.get_error()
+            self.ll_matcher.consume_token(  # type: ignore[attr-defined]
+                input_ids[-1])
+            err = self.ll_matcher.get_error()  # type: ignore[attr-defined]
             if err:
                 logger.warning("Error in LLMatcher: %s", err)
 
         llguidance.torch.fill_next_token_bitmask(self.ll_matcher, self.bitmask,
                                                  0)
         llguidance.torch.apply_token_bitmask_inplace(
-            scores, self.bitmask.to(scores.device))
+            scores,
+            self.bitmask.to(scores.device))  # type: ignore[attr-defined]
 
         self.new_sampling = True
 
diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py
index bcd7494e6cec..e41af4b360e4 100644
--- a/vllm/model_executor/guided_decoding/outlines_decoding.py
+++ b/vllm/model_executor/guided_decoding/outlines_decoding.py
@@ -5,9 +5,9 @@
 import os
 from enum import Enum
 from json import dumps as json_dumps
-from re import escape as regex_escape
 from typing import Optional, Union
 
+from regex import escape as regex_escape
 from transformers import PreTrainedTokenizerBase
 
 from vllm.model_executor.guided_decoding.outlines_logits_processors import (
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index 8ae7c7b6b2c7..6986b6554c23 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -56,6 +56,12 @@ def __init__(self, guide: Guide, reasoner: Optional[ReasoningParser]):
         self._fsm_state: defaultdict[int, Union[int,
                                                 CFGState]] = defaultdict(int)
 
+    def clone(self) -> "BaseLogitsProcessor":
+        cloned = copy.copy(self)  # shallow copy: attributes start out shared
+        cloned._guide = self._guide.copy()  # swap in the guide's own copy()
+        cloned._fsm_state = copy.deepcopy(self._fsm_state)  # independent map
+        return cloned
+
     def __call__(self, input_ids: list[int],
                  scores: torch.Tensor) -> torch.Tensor:
         """Use the FSM to bias the logits before sampling the next token."""
@@ -218,6 +224,12 @@ def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase,
                          reasoner)
         self._guide = self._guide.copy()
 
+    def clone(self) -> "CFGLogitsProcessor":
+        cloned = copy.copy(self)  # shallow copy: attributes start out shared
+        cloned._fsm_state = copy.deepcopy(self._fsm_state)  # independent map
+        cloned._guide = self._guide.copy()  # swap in the guide's own copy()
+        return cloned
+
 
 @lru_cache(maxsize=32)
 def _adapt_tokenizer(tokenizer: PreTrainedTokenizerBase):
diff --git a/vllm/model_executor/guided_decoding/utils.py b/vllm/model_executor/guided_decoding/utils.py
index 1ad1ef8fbf16..3f77cf394d9a 100644
--- a/vllm/model_executor/guided_decoding/utils.py
+++ b/vllm/model_executor/guided_decoding/utils.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import re
+import regex as re
 
 
 def has_xgrammar_unsupported_json_features(schema: dict) -> bool:
diff --git a/vllm/model_executor/guided_decoding/xgrammar_decoding.py b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
index 8e40da4b3aa9..d2e568609945 100644
--- a/vllm/model_executor/guided_decoding/xgrammar_decoding.py
+++ b/vllm/model_executor/guided_decoding/xgrammar_decoding.py
@@ -4,10 +4,10 @@
 from __future__ import annotations
 
 import json
-import re
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any
 
+import regex as re
 import torch
 
 import vllm.envs
@@ -302,8 +302,9 @@ class XGrammarLogitsProcessor:
     prefilled: bool = field(default=False)
 
     def __post_init__(self):
-        self.tokenizer_info = self.config.tokenizer_info(
-            self.config.tokenizer_data)
+        if self.tokenizer_info is None:
+            self.tokenizer_info = self.config.tokenizer_info(
+                self.config.tokenizer_data)
 
     def __getstate__(self) -> dict[str, Any]:
         return {'config': self.config, 'reasoner': self.reasoner}
@@ -400,7 +401,8 @@ def __call__(self, input_ids: list[int],
     def clone(self) -> XGrammarLogitsProcessor:
         """Create a new instance with shared compiled grammar
           but separate state"""
-        new_processor = XGrammarLogitsProcessor(self.config, self.reasoner)
+        new_processor = XGrammarLogitsProcessor(self.config, self.reasoner,
+                                                None, self.tokenizer_info)
 
         # Share the compiled grammar context (immutable after compilation)
         new_processor.ctx = self.ctx
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
new file mode 100644
index 000000000000..3e0ad0d5a989
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128,128].json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "8": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "64": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "256": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    }
+}
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
index aff108112b61..26a433da2189 100644
--- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py
+++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 """ CUTLASS based Fused MoE kernels."""
-import os
 from typing import Optional
 
 import torch
@@ -271,8 +270,6 @@ def cutlass_moe_fp8(
 
 FLOAT4_E2M1_MAX = scalar_types.float4_e2m1f.max()
 FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
-MAX_TOKENS_PER_EXPERT = int(
-    os.environ.get('VLLM_MODELOPT_MAX_TOKENS_PER_EXPERT', '65536'))
 
 
 def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor,
@@ -330,10 +327,7 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor,
     assert a.dtype in [torch.half, torch.bfloat16], "Invalid input dtype"
     assert (topk_weights.shape[0] == m and topk_ids.shape[0]
             == m), ("topk must be provided for each row of a")
-    assert (m <= MAX_TOKENS_PER_EXPERT), (
-        f"m must be less than MAX_TOKENS_PER_EXPERT({MAX_TOKENS_PER_EXPERT})"
-        f" for cutlass_moe_fp4, observed m = {m}. Use"
-        f" VLLM_MODELOPT_MAX_TOKENS_PER_EXPERT to set this value.")
+
     out_dtype = a.dtype
     num_topk = topk_ids.shape[1]
 
@@ -362,8 +356,7 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor,
         expert_offsets,
         blockscale_offsets,
         num_topk,
-        expert_map=a_map,
-        MAX_TOKENS_PER_EXPERT=MAX_TOKENS_PER_EXPERT)
+        expert_map=a_map)
 
     c1 = ops.cutlass_fp4_moe_mm(rep_a_fp4, w1_fp4, rep_a_blockscale,
                                 w1_blockscale, w1_alphas, problem_sizes1,
@@ -378,12 +371,7 @@ def cutlass_moe_fp4(a: torch.Tensor, a1_gscale: torch.Tensor,
     torch.ops._C.silu_and_mul(intermediate, c1)
 
     int_fp4, int_blockscale = ops.scaled_fp4_experts_quant(
-        intermediate,
-        a2_gscale,
-        expert_offsets,
-        blockscale_offsets,
-        num_topk,
-        MAX_TOKENS_PER_EXPERT=MAX_TOKENS_PER_EXPERT)
+        intermediate, a2_gscale, expert_offsets, blockscale_offsets, num_topk)
 
     c2 = ops.cutlass_fp4_moe_mm(int_fp4, w2_fp4, int_blockscale, w2_blockscale,
                                 w2_alphas, problem_sizes2, expert_offsets[:-1],
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index f1cb77f64eae..29b41e720852 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1,12 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import importlib
-import threading
 from abc import abstractmethod
 from dataclasses import dataclass
 from enum import Enum
 from typing import Callable, Optional
-from weakref import WeakValueDictionary
 
 import torch
 import torch.nn.functional as F
@@ -43,6 +41,7 @@
         from .pplx_prepare_finalize import PplxPrepareAndFinalize
 else:
     fused_experts = None  # type: ignore
+    FusedMoEPermuteExpertsUnpermute = None  # type: ignore
     FusedMoEPrepareAndFinalize = None  # type: ignore
 if is_rocm_aiter_moe_enabled():
     from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
@@ -50,8 +49,7 @@
 else:
     from vllm.model_executor.layers.fused_moe.fused_moe import grouped_topk
 if current_platform.is_tpu():
-    # the iterative moe implementation is used until the moe_pallas is fixed
-    from .moe_torch_iterative import fused_moe as fused_moe_pallas
+    from .moe_pallas import fused_moe as fused_moe_pallas
 else:
     fused_moe_pallas = None  # type: ignore
 logger = init_logger(__name__)
@@ -74,7 +72,8 @@ class FusedMoEParallelConfig:
 
     @property
     def use_pplx_kernels(self):
-        return self.dp_size > 1 and self.use_ep and has_pplx
+        return self.dp_size > 1 and self.use_ep and \
+             envs.VLLM_ALL2ALL_BACKEND == "pplx"
 
     @staticmethod
     def make(tp_size_: int, dp_size_: int,
@@ -197,6 +196,8 @@ class MoEConfig:
     # TODO: add more quantization params, blocked, per-token, etc.
     block_size: int = 128
 
+    max_num_tokens: int = MOE_DP_CHUNK_SIZE
+
     @property
     def tp_size(self):
         return self.moe_parallel_config.tp_size
@@ -245,13 +246,59 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
         raise NotImplementedError
 
-    def set_prepare_finalize(
-        self,
-        dp_size: int,
-        world_size: int,
-        prepare_finalize: FusedMoEPrepareAndFinalize,
-    ) -> bool:
-        return False
+    def init_prepare_finalize(self, moe: MoEConfig,
+                              quant_config: Optional[QuantizationConfig]):
+        all2all_manager = get_ep_group().device_communicator.all2all_manager
+        assert all2all_manager is not None
+        # NOTE: quant_config is not read here; only the pplx path is handled.
+        prepare_finalize = None
+        if moe.use_pplx_kernels:
+            all_to_all_args = dict(
+                max_num_tokens=moe.max_num_tokens,
+                num_experts=moe.num_experts,
+                experts_per_token=moe.experts_per_token,  # topk
+                rank=all2all_manager.rank,
+                world_size=all2all_manager.world_size,
+                # dp_size actually means tp_size, bug in pplx kernels
+                dp_size=all2all_manager.tp_group.world_size,
+                hidden_dim=moe.hidden_dim,
+                hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize,
+                # For blocked per token: set to
+                #   ceil_div(hidden_dim, block_size) * sizeof(float32)
+                # For per-token: set to sizeof(float32)
+                hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else (
+                    (moe.hidden_dim + moe.block_size - 1) // moe.block_size *
+                    torch.float32.itemsize)),
+                group_name=all2all_manager.cpu_group.group_name,
+            )
+            # Obtain the all-to-all handle for these args from the manager.
+            handle = all2all_manager.get_handle(all_to_all_args)
+            # Must mirror max_num_tokens/world_size/dp_size passed above.
+            prepare_finalize = PplxPrepareAndFinalize(
+                handle,
+                max_num_tokens=moe.max_num_tokens,
+                world_size=all2all_manager.world_size,
+                rank=all2all_manager.rank,
+                # dp_size actually means tp_size, bug in pplx kernels
+                dp_size=all2all_manager.tp_group.world_size,
+                quant_dtype=moe.in_dtype,
+            )
+        # No prepare/finalize object means self.fused_experts is left as is.
+        if prepare_finalize is not None:
+            experts = self.select_gemm_impl(prepare_finalize)
+            self.fused_experts = FusedMoEModularKernel(
+                prepare_finalize,
+                experts,
+            )
+
+    def select_gemm_impl(
+        self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize]
+    ) -> FusedMoEPermuteExpertsUnpermute:
+        # Hook for subclasses: choose the experts (gemm) implementation that
+        # goes with the given prepare_finalize (all2all) implementation.
+        raise NotImplementedError(
+            "Subclass must select appropriate gemm implementation"
+            " based on the prepare_finalize")
 
     @abstractmethod
     def apply(
@@ -275,53 +322,13 @@ def apply(
         raise NotImplementedError
 
 
-class AllToAllCache:
-
-    def __init__(self):
-        self._cache: WeakValueDictionary = WeakValueDictionary()
-        self._lock = threading.RLock()  # Reentrant lock for thread safety
-
-    def destroy(self):
-        with self._lock:
-            # TODO: can we do del self._cache?
-            for _, a2a in self._cache.items():
-                a2a.destroy()
-
-    def get_or_create(self, **kwargs):
-        assert has_pplx
-        import pplx_kernels as pplx
-
-        # Create a hashable key from the kwargs
-        key = tuple(sorted((k, v) for k, v in kwargs.items()))
-
-        with self._lock:
-            instance = self._cache.get(key)
-            if instance is None:
-                # TODO (varun): Add support to switch to intranode
-                # when all communications are within the same
-                # node.
-                logger.debug("Create AllToAll %s", kwargs)
-                instance = pplx.AllToAll.internode(**kwargs)
-                self._cache[key] = instance
-            return instance
-
-
-# Global singleton
-_all_to_all_cache = AllToAllCache()
-
-
-# Factory function as a cleaner interface
-def get_all_to_all(**kwargs):
-    return _all_to_all_cache.get_or_create(**kwargs)
-
-
 @CustomOp.register("unquantized_fused_moe")
 class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
     """MoE method without quantization."""
 
     def __init__(self, moe: MoEConfig):
         super().__init__()
-        self.fused_experts = fused_experts
+        self.fused_experts = fused_experts  # type: ignore
         self.moe = moe
 
         self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
@@ -331,6 +338,42 @@ def __init__(self, moe: MoEConfig):
         else:
             self.rocm_aiter_fused_experts = None  # type: ignore
 
+    def select_gemm_impl(
+        self, prepare_finalize: Optional[FusedMoEPrepareAndFinalize]
+    ) -> FusedMoEPermuteExpertsUnpermute:
+        """Pick the unquantized experts implementation for prepare_finalize."""
+        assert self.fused_experts == fused_experts
+        all2all_manager = get_ep_group().device_communicator.all2all_manager
+        assert all2all_manager is not None
+
+        experts: Optional[FusedMoEPermuteExpertsUnpermute] = None
+        if isinstance(prepare_finalize,
+                      (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)):
+            logger.debug("BatchedTritonExperts %s", self.moe)
+            experts = BatchedTritonExperts(
+                # must match the value used to build prepare_finalize
+                max_num_tokens=self.moe.max_num_tokens,
+                world_size=all2all_manager.world_size,
+                # dp_size actually means tp_size, bug in pplx kernels
+                dp_size=all2all_manager.tp_group.world_size,
+                use_fp8_w8a8=False,
+                use_int8_w8a8=False,
+                use_int8_w8a16=False,
+                use_int4_w4a16=False,
+                block_shape=None,
+            )
+        else:
+            logger.debug("TritonExperts %s", self.moe)
+            experts = TritonExperts(
+                use_fp8_w8a8=False,
+                use_int8_w8a8=False,
+                use_int8_w8a16=False,
+                use_int4_w4a16=False,
+                block_shape=None,
+                per_channel_quant=False,
+            )
+        return experts
+
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
                        params_dtype: torch.dtype, **extra_weight_attrs):
@@ -376,10 +419,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
             shuffle_weights)
 
         if self.rocm_aiter_moe_enabled:
-            # use 2stage ck moe layout
-            shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data,
-                                                        layer.w2_weight.data,
-                                                        layout=(32, 32))
+            shuffled_w13, shuffled_w2 = shuffle_weights(
+                layer.w13_weight.data, layer.w2_weight.data)
 
             layer.w13_weight.data = shuffled_w13
             layer.w2_weight.data = shuffled_w2
@@ -430,47 +471,6 @@ def apply(
             activation=activation,
             apply_router_weight_on_input=apply_router_weight_on_input)
 
-    def set_prepare_finalize(
-        self,
-        dp_size: int,
-        world_size: int,
-        prepare_finalize: FusedMoEPrepareAndFinalize,
-    ) -> bool:
-        assert self.fused_experts == fused_experts
-
-        experts: Optional[FusedMoEPermuteExpertsUnpermute] = None
-
-        if isinstance(prepare_finalize,
-                      (BatchedPrepareAndFinalize, PplxPrepareAndFinalize)):
-            logger.debug("BatchedTritonExperts %s", self.moe)
-            experts = BatchedTritonExperts(
-                max_num_tokens=MOE_DP_CHUNK_SIZE,
-                world_size=world_size,
-                dp_size=dp_size,
-                use_fp8_w8a8=False,
-                use_int8_w8a8=False,
-                use_int8_w8a16=False,
-                use_int4_w4a16=False,
-                block_shape=None,
-            )
-        else:
-            logger.debug("TritonExperts %s", self.moe)
-            experts = TritonExperts(
-                use_fp8_w8a8=False,
-                use_int8_w8a8=False,
-                use_int8_w8a16=False,
-                use_int4_w4a16=False,
-                block_shape=None,
-                per_channel_quant=False,
-            )
-
-        self.fused_experts = FusedMoEModularKernel(
-            prepare_finalize,
-            experts,
-        )
-
-        return True
-
     def forward_cuda(
         self,
         layer: torch.nn.Module,
@@ -680,45 +680,6 @@ def determine_expert_map(
     return (local_num_experts, expert_map)
 
 
-def _construct_prepare_finalize(
-    moe: MoEConfig, quant_config: Optional[QuantizationConfig]
-) -> Optional[FusedMoEPrepareAndFinalize]:
-    max_num_tokens = MOE_DP_CHUNK_SIZE
-    world_size = moe.ep_size
-    dp_size = moe.ep_size // moe.dp_size  # dp_size actually means TP.
-    rank = moe.ep_rank
-
-    if moe.use_pplx_kernels:
-        logger.debug("using PplxPrepareAndFinalize")
-
-        all_to_all = get_all_to_all(
-            max_num_tokens=max_num_tokens,
-            num_experts=moe.num_experts,
-            experts_per_token=moe.experts_per_token,  # topk
-            rank=rank,
-            world_size=world_size,
-            dp_size=dp_size,
-            hidden_dim=moe.hidden_dim,
-            hidden_dim_bytes=moe.hidden_dim * moe.in_dtype.itemsize,
-            # For blocked per token: set to
-            #   ceil_div(hidden_dim, block_size) * sizeof(float32)
-            # For per-token: set to sizeof(float32)
-            hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else
-                                    ((moe.hidden_dim + moe.block_size - 1) //
-                                     moe.block_size * torch.float32.itemsize)))
-
-        return PplxPrepareAndFinalize(
-            all_to_all,
-            max_num_tokens=max_num_tokens,
-            world_size=world_size,
-            rank=rank,
-            dp_size=dp_size,
-            quant_dtype=moe.in_dtype,
-        )
-
-    return None
-
-
 class FusedMoE(torch.nn.Module):
     """FusedMoE layer for MoE models.
 
@@ -832,7 +793,10 @@ def __init__(
             moe_parallel_config=self.moe_parallel_config,
             # TODO (bnell): this needs to be fixed for quantized types.
             in_dtype=params_dtype,
+            max_num_tokens=MOE_DP_CHUNK_SIZE,
         )
+        self.moe_config = moe
+        self.quant_config = quant_config
 
         # Note: get_quant_method will look at the layer's local_num_experts
         # for heuristic purposes, so it must be initialized first.
@@ -840,25 +804,13 @@ def __init__(
 
         if quant_config is None:
             quant_method = UnquantizedFusedMoEMethod(moe)
-            prepare_finalize = _construct_prepare_finalize(moe, quant_config)
         else:
             quant_method = quant_config.get_quant_method(self, prefix)
-            # No pplx for quantized types yet.
-            prepare_finalize = None
 
         assert quant_method is not None
         assert isinstance(quant_method, FusedMoEMethodBase)
         self.quant_method = quant_method
 
-        if prepare_finalize is not None:
-            world_size = moe.ep_size
-            dp_size = int(moe.ep_size // moe.dp_size)
-            success = self.quant_method.set_prepare_finalize(
-                dp_size, world_size, prepare_finalize)
-            if not success:
-                logger.warning("DP+EP not supported for %s.",
-                               type(self.quant_method))
-
         moe_quant_params = {
             "num_experts": self.local_num_experts,
             "hidden_size": hidden_size,
diff --git a/vllm/model_executor/layers/fused_moe/moe_pallas.py b/vllm/model_executor/layers/fused_moe/moe_pallas.py
index 8f28b64ed487..babeb97308a9 100644
--- a/vllm/model_executor/layers/fused_moe/moe_pallas.py
+++ b/vllm/model_executor/layers/fused_moe/moe_pallas.py
@@ -2,7 +2,23 @@
 
 import torch
 import torch.nn.functional as F
-from torch_xla.experimental.custom_kernel import _histogram
+
+
+def _histogram(input: torch.Tensor, min: int, max: int) -> torch.Tensor:
+    """
+    Count occurrences of each integer in [min, max] (bin step = 1) within an
+    int32 tensor; `input` is presumably 1-D (verify against the caller).
+    """
+    assert input.dtype == torch.int32, "input must be of torch.int32 dtype."
+    assert min <= max, "min must be less than or equal to max."
+
+    def searchsorted(sorted_sequence: torch.Tensor,
+                     values_to_search: torch.Tensor) -> torch.Tensor:
+        return (sorted_sequence.unsqueeze(1) == values_to_search).sum(dim=1)
+
+    bin_edges = torch.linspace(min, max, max - min + 1,
+                               dtype=input.dtype).to(input.device)
+    return searchsorted(bin_edges, input).to(torch.int32)
 
 
 def fused_moe(
@@ -61,7 +77,7 @@ def fused_moe(
     x = torch.ops.xla.gmm(x, w2, group_sizes)
     x = x[topk_argsort_revert_indices].reshape(-1, topk, hidden_size)
 
-    x = x * topk_weights.unsqueeze_(dim=-1)
+    x = x * topk_weights.unsqueeze(dim=-1)
     x = x.sum(dim=-2)
     x = x.reshape(orig_shape)
     return x
diff --git a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
index 270e7cf1298a..cb396f26c96e 100644
--- a/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
+++ b/vllm/model_executor/layers/fused_moe/moe_permute_unpermute.py
@@ -182,3 +182,7 @@ def moe_unpermute(
                                    expert_first_token_offset, n_expert,
                                    n_local_expert, topk, hidden_states)
     return hidden_states
+
+
+def moe_permute_unpermute_supported():  # forwards to the _moe_C op
+    return torch.ops._moe_C.moe_permute_unpermute_supported()
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index b1126b94e45a..783ebebbfec9 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -9,7 +9,6 @@
     moe_kernel_quantize_input)
 
 
-# Note use: layer.get_all_to_all() to get an AllToAll instance
 # The max_num_tokens, world_size and dp_size must be the same
 # as the ones used to create the AllToAll.
 class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
index a92081862bfa..10b61fcda176 100644
--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+from enum import IntEnum
 from functools import cache
 from typing import Optional
 
@@ -9,6 +10,28 @@
 from vllm.utils import direct_register_custom_op
 
 
+class QuantMethod(IntEnum):
+    # This allows interfacing with AITER QuantType Enum
+    # without importing the QuantType from AITER globally.
+
+    # Note that these quantization methods are
+    # supported in AITER package. However,
+    # not all are used in this module.
+
+    NO = 0  # a16w16
+    PER_TENSOR = 1  # w8a8 (per_Tensor)
+    PER_TOKEN = 2  # w8a8/w8a4 (per_Token)
+    BLOCK_1X128 = 3  # block quantized w8a8 (per_1x128)
+    BLOCK_128x128 = 4  # block quantized w8a8 (per_128x128)
+
+
+class ActivationMethod(IntEnum):
+    # This allows interfacing with AITER ActivationType enum
+    # without importing the ActivationType enum from AITER globally.
+    SILU = 0
+    GELU = 1
+
+
 @cache
 def is_rocm_aiter_moe_enabled() -> bool:
     return current_platform.is_rocm() \
@@ -29,13 +52,12 @@ def rocm_aiter_asm_moe_tkw1_impl(
         a16: bool = False,
         per_tensor_quant_scale: Optional[torch.Tensor] = None,
         expert_mask: Optional[torch.Tensor] = None,
-        activation_str: str = "silu") -> torch.Tensor:
+        activation_method: int = ActivationMethod.SILU.value) -> torch.Tensor:
 
     from aiter import ActivationType
     from aiter.fused_moe_bf16_asm import asm_moe_tkw1
 
-    activation = \
-        ActivationType.Gelu if activation_str == "gelu" else ActivationType.Silu
+    activation = ActivationType(activation_method)
 
     return asm_moe_tkw1(hidden_states,
                         w1,
@@ -65,163 +87,7 @@ def rocm_aiter_asm_moe_tkw1_fake(
         a16: bool = False,
         per_tensor_quant_scale: Optional[torch.Tensor] = None,
         expert_mask: Optional[torch.Tensor] = None,
-        activation_str: str = "silu") -> torch.Tensor:
-    return torch.empty_like(hidden_states)
-
-
-def rocm_aiter_fmoe_fp8_blockscale_g1u1_impl(
-        topk_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        hidden_states_dtype: torch.dtype,
-        expert_mask: torch.Tensor,
-        a1: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        w1_scale: torch.Tensor,
-        w2_scale: torch.Tensor,
-        a1_scale: torch.Tensor,
-        block_shape: list[int],
-        smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
-    from aiter import fmoe_fp8_blockscale_g1u1
-    from aiter.fused_moe_bf16_asm import moe_sorting_ck
-
-    topk = topk_ids.shape[1]
-    model_dim = w1.shape[-1]
-    local_E = E = w1.shape[0]
-    if expert_mask is not None:
-        E = expert_mask.numel()
-
-    (
-        sorted_token_ids,
-        sorted_weight_buf,
-        sorted_expert_ids,
-        num_valid_ids,
-        out_asm,
-    ) = moe_sorting_ck(topk_ids,
-                       topk_weights,
-                       E,
-                       model_dim,
-                       hidden_states_dtype,
-                       expert_mask=expert_mask)
-
-    fmoe_fp8_blockscale_g1u1(out_asm, a1, w1, w2, sorted_token_ids,
-                             sorted_weight_buf, sorted_expert_ids,
-                             num_valid_ids, topk,
-                             a1_scale.t().contiguous(),
-                             w1_scale.view(local_E, -1),
-                             w2_scale.view(local_E,
-                                           -1), *block_shape, smooth_scale)
-
-    return out_asm
-
-
-def rocm_aiter_fmoe_fp8_blockscale_g1u1_fake(
-        topk_ids: torch.Tensor,
-        topk_weights: torch.Tensor,
-        hidden_states_dtype: torch.dtype,
-        expert_mask: torch.Tensor,
-        a1: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        w1_scale: torch.Tensor,
-        w2_scale: torch.Tensor,
-        a1_scale: torch.Tensor,
-        block_shape: list[int],
-        smooth_scale: Optional[torch.Tensor] = None) -> torch.Tensor:
-
-    return torch.empty_like(a1, dtype=hidden_states_dtype)
-
-
-def rocm_aiter_asm_moe_impl(hidden_states: torch.Tensor,
-                            w1: torch.Tensor,
-                            w2: torch.Tensor,
-                            topk_weights: torch.Tensor,
-                            topk_ids: torch.Tensor,
-                            fc1_scale: Optional[torch.Tensor] = None,
-                            fc2_scale: Optional[torch.Tensor] = None,
-                            fc1_smooth_scale: Optional[torch.Tensor] = None,
-                            fc2_smooth_scale: Optional[torch.Tensor] = None,
-                            a16: bool = False,
-                            activation: str = "silu") -> torch.Tensor:
-    import aiter.fused_moe_bf16_asm as rocm_aiter_asm_fmoe
-    from aiter import ActivationType
-
-    assert activation in ["silu", "gelu"], "The given activation:" \
-                                          f" {activation}"         \
-                                           " is not supported in" \
-                                           " AITER."
-    if activation == "silu":
-        aiter_activation = ActivationType.Silu
-    else:
-        aiter_activation = ActivationType.Gelu
-
-    return rocm_aiter_asm_fmoe.asm_moe(hidden_states=hidden_states,
-                                       w1=w1,
-                                       w2=w2,
-                                       topk_weight=topk_weights,
-                                       topk_ids=topk_ids,
-                                       fc1_scale=fc1_scale,
-                                       fc2_scale=fc2_scale,
-                                       fc1_smooth_scale=fc1_smooth_scale,
-                                       fc2_smooth_scale=fc2_smooth_scale,
-                                       a16=a16,
-                                       activation=aiter_activation)
-
-
-def rocm_aiter_asm_moe_fake(hidden_states: torch.Tensor,
-                            w1: torch.Tensor,
-                            w2: torch.Tensor,
-                            topk_weights: torch.Tensor,
-                            topk_ids: torch.Tensor,
-                            fc1_scale: Optional[torch.Tensor] = None,
-                            fc2_scale: Optional[torch.Tensor] = None,
-                            fc1_smooth_scale: Optional[torch.Tensor] = None,
-                            fc2_smooth_scale: Optional[torch.Tensor] = None,
-                            a16: bool = False,
-                            activation: str = "silu") -> torch.Tensor:
-    return torch.empty_like(hidden_states)
-
-
-def rocm_aiter_ck_moe_2stages_impl(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    fc1_scale: Optional[torch.Tensor] = None,
-    fc2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    block_size: Optional[list[int]] = None,
-    expert_mask: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
-    from aiter.fused_moe_bf16_asm import ck_moe_2stages
-    return ck_moe_2stages(a1=hidden_states,
-                          w1=w1,
-                          w2=w2,
-                          topk_weight=topk_weights,
-                          topk_ids=topk_ids,
-                          fc1_scale=fc1_scale,
-                          fc2_scale=fc2_scale,
-                          a1_scale=a1_scale,
-                          a2_scale=a2_scale,
-                          block_size=block_size,
-                          expert_mask=expert_mask)
-
-
-def rocm_aiter_ck_moe_2stages_fake(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    fc1_scale: Optional[torch.Tensor] = None,
-    fc2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    block_size: Optional[list[int]] = None,
-    expert_mask: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
+        activation_method: int = ActivationMethod.SILU.value) -> torch.Tensor:
     return torch.empty_like(hidden_states)
 
 
@@ -274,6 +140,50 @@ def rocm_aiter_biased_grouped_topk_fake(
     pass
 
 
+def rocm_aiter_fused_moe_impl(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weight: torch.Tensor,
+    topk_ids: torch.Tensor,
+    expert_mask: Optional[torch.Tensor] = None,
+    activation_method: int = ActivationMethod.SILU.value,
+    quant_method: int = QuantMethod.NO.value,
+    doweight_stage1: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    from aiter import ActivationType, QuantType
+    from aiter.fused_moe import fused_moe
+
+    activation = ActivationType(activation_method)
+    quant_type = QuantType(quant_method)
+
+    return fused_moe(hidden_states, w1, w2, topk_weight, topk_ids, expert_mask,
+                     activation, quant_type, doweight_stage1, w1_scale,
+                     w2_scale, a1_scale, a2_scale)
+
+
+def rocm_aiter_fused_moe_fake(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weight: torch.Tensor,
+    topk_ids: torch.Tensor,
+    expert_mask: Optional[torch.Tensor] = None,
+    activation_method: int = ActivationMethod.SILU.value,
+    quant_method: int = QuantMethod.NO.value,
+    doweight_stage1: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
+
+
 if current_platform.is_rocm():
 
     direct_register_custom_op(
@@ -285,26 +195,10 @@ def rocm_aiter_biased_grouped_topk_fake(
     )
 
     direct_register_custom_op(
-        op_name="rocm_aiter_fmoe_fp8_blockscale_g1u1",
-        op_func=rocm_aiter_fmoe_fp8_blockscale_g1u1_impl,
-        mutates_args=[],
-        fake_impl=rocm_aiter_fmoe_fp8_blockscale_g1u1_fake,
-        dispatch_key=current_platform.dispatch_key,
-    )
-
-    direct_register_custom_op(
-        op_name="rocm_aiter_asm_moe",
-        op_func=rocm_aiter_asm_moe_impl,
-        mutates_args=[],
-        fake_impl=rocm_aiter_asm_moe_fake,
-        dispatch_key=current_platform.dispatch_key,
-    )
-
-    direct_register_custom_op(
-        op_name="rocm_aiter_ck_moe_2stages",
-        op_func=rocm_aiter_ck_moe_2stages_impl,
+        op_name="rocm_aiter_fused_moe",
+        op_func=rocm_aiter_fused_moe_impl,
         mutates_args=[],
-        fake_impl=rocm_aiter_ck_moe_2stages_fake,
+        fake_impl=rocm_aiter_fused_moe_fake,
         dispatch_key=current_platform.dispatch_key,
     )
 
@@ -373,32 +267,14 @@ def rocm_aiter_fused_experts(
         a2_scale: Optional[torch.Tensor] = None,
         block_shape: Optional[list[int]] = None) -> torch.Tensor:
 
-    from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-        per_token_group_quant_fp8)
-
+    activation_method = (ActivationMethod.SILU
+                         if activation == "silu" else ActivationMethod.GELU)
     # All AITER Fused MoE kernels are expecting the following datatypes
     topk_weights = topk_weights.to(torch.float32)
     topk_ids = topk_ids.to(torch.int32)
 
-    # w8a8 block-scaled
-    if block_shape is not None and use_fp8_w8a8:
-        assert not apply_router_weight_on_input, (
-            "apply_router_weight_on_input is not supported for block scaled moe"
-        )
-        assert w1_scale is not None
-        assert w2_scale is not None
-
-        # The default block sizes are 128 in AITER.
-        block_shape = [128, 128] if block_shape is None else block_shape
-
-        a1, a1_scale = per_token_group_quant_fp8(hidden_states, block_shape[1])
-
-        return torch.ops.vllm.rocm_aiter_fmoe_fp8_blockscale_g1u1(
-            topk_ids, topk_weights, hidden_states.dtype, None, a1, w1, w2,
-            w1_scale, w2_scale, a1_scale, block_shape, None)
-
     # w8a8 per-channel quantization
-    elif per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8:
+    if per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8:
         # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input`
         # This applies topk_weights on the GEMM output of the first FC layer
         #  rather than the second FC.
@@ -421,60 +297,44 @@ def rocm_aiter_fused_experts(
             a16=False,
             per_tensor_quant_scale=None,
             expert_mask=None,
-            activation_str=activation)
-
-    # w8a8 per-tensor activation per-tensor weight
-    elif use_fp8_w8a8:
-        assert not apply_router_weight_on_input, (
-            "apply_router_weight_on_input is not supported for fp8_w8a8")
-
-        # - faster static per-tensor-activation static per-tensor-weight
-        #   fp8 quantization w8a8
-        if a1_scale is not None and a2_scale is not None:
-            return torch.ops.vllm.rocm_aiter_ck_moe_2stages(
-                hidden_states=hidden_states,
-                w1=w1,
-                w2=w2,
-                topk_weights=topk_weights,
-                topk_ids=topk_ids,
-                fc1_scale=w1_scale,
-                fc2_scale=w2_scale,
-                a1_scale=a1_scale,
-                a2_scale=a2_scale)
-
-        # - fallback static per-tensor-activation static per-tensor-weight
-        #   fp8 quantization w8a8
-        # - dynamic per-tensor activation static per-tensor-weight
-        #   fp8 quantization w8a8
-        return torch.ops.vllm.rocm_aiter_asm_moe(hidden_states=hidden_states,
-                                                 w1=w1,
-                                                 w2=w2,
-                                                 topk_weights=topk_weights,
-                                                 topk_ids=topk_ids,
-                                                 fc1_scale=w1_scale,
-                                                 fc2_scale=w2_scale,
-                                                 fc1_smooth_scale=None,
-                                                 fc2_smooth_scale=None,
-                                                 a16=False,
-                                                 activation=activation)
-    if apply_router_weight_on_input:
-        assert (topk_weights.dim() == 2
-                ), "`topk_weights` should be in shape (num_tokens, topk)"
-        _, topk = topk_weights.shape
-        assert (
-            topk == 1
-        ), "Only support topk=1 when `apply_router_weight_on_input` is True"
-
-        hidden_states = hidden_states * topk_weights.to(hidden_states.dtype)
-        topk_ids = topk_ids.to(torch.int32)
-        topk_weights = torch.ones_like(topk_weights, dtype=torch.float32)
+            activation_method=activation_method)
 
-    return torch.ops.vllm.rocm_aiter_ck_moe_2stages(
-        hidden_states=hidden_states,
-        w1=w1,
-        w2=w2,
-        topk_weights=topk_weights,
-        topk_ids=topk_ids)
+    else:
+        quant_method = QuantMethod.NO.value
+
+        # w8a8 block-scaled
+        if block_shape is not None and use_fp8_w8a8:
+            assert not apply_router_weight_on_input, (
+                "apply_router_weight_on_input is "
+                "not supported for block scaled moe")
+            assert w1_scale is not None
+            assert w2_scale is not None
+            quant_method = QuantMethod.BLOCK_128x128.value
+        elif use_fp8_w8a8:
+            # Currently only per tensor quantization method is enabled.
+            quant_method = QuantMethod.PER_TENSOR.value
+
+        if apply_router_weight_on_input:
+            assert (topk_weights.dim() == 2
+                    ), "`topk_weights` should be in shape (num_tokens, topk)"
+            _, topk = topk_weights.shape
+            assert (
+                topk == 1
+            ), "Only support topk=1 when `apply_router_weight_on_input` is True"
+
+        return torch.ops.vllm.rocm_aiter_fused_moe(
+            hidden_states,
+            w1,
+            w2,
+            topk_weights,
+            topk_ids,
+            quant_method=quant_method,
+            activation_method=activation_method,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            doweight_stage1=apply_router_weight_on_input)
 
 
 def rocm_aiter_topk_softmax(topk_weights: torch.Tensor,
@@ -488,14 +348,21 @@ def rocm_aiter_topk_softmax(topk_weights: torch.Tensor,
     return topk_weights, topk_indices
 
 
-def shuffle_weights(*tensors: torch.Tensor,
-                    layout: tuple[int, int]) -> tuple[torch.Tensor, ...]:
+def shuffle_weights(
+    *tensors: torch.Tensor, layout: tuple[int, int] = (16, 16)
+) -> tuple[torch.Tensor, ...]:
     """
     Applies shuffle_weight function from AITER to each 
     input tensor and returns them.
+    
+    Rearranges (shuffles) the input tensor/s
+    into a specified block layout for optimized computation.
 
     Args:
-    *tensors: Variable number of torch.Tensor objects.
+        *tensors: Variable number of torch.Tensor objects.
+        layout: A pair of integers specifying the
+            block sizes used to divide the tensors during shuffling.
+            Default is (16, 16).
 
     Returns:
     A Tuple of shuffled tensors.
@@ -503,25 +370,3 @@ def shuffle_weights(*tensors: torch.Tensor,
     from aiter.ops.shuffle import shuffle_weight
 
     return tuple(shuffle_weight(tensor, layout=layout) for tensor in tensors)
-
-
-def expand_weights(*tensors: torch.Tensor,
-                   expansion_dims: list[int]) -> tuple[torch.Tensor, ...]:
-    """
-    Expands the dimensions of input tensors.
-
-    Args:
-        *tensors: A variable number of torch.Tensor objects.
-        expansion_dims: A list of expansion dimensions 
-        corresponding to each tensor.
-
-    Returns:
-        A Tuple of tensors with expanded dimensions.
-    """
-
-    assert len(tensors) == len(expansion_dims), \
-    "Number of tensors must match the number of expansion dimensions."
-
-    return tuple(
-        tensor.unsqueeze(-1).unsqueeze(-1).expand((-1, dim, -1))
-        for tensor, dim in zip(tensors, expansion_dims))
\ No newline at end of file
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 54dd1251e59f..269ac043d26c 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -261,6 +261,7 @@ class ReplicatedLinear(LinearBase):
         quant_config: Quantization configure.
         prefix: The name of the layer in the state dict, including all parents
                         (e.g. model.layers.0.qkv_proj)
+        return_bias: If true, return bias together with outputs in forward pass.
     """
 
     def __init__(
@@ -523,6 +524,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         quant_config: Quantization configure.
         prefix: The name of the layer in the state dict, including all parents
                         (e.g. model.layers.0.qkv_proj)
+        return_bias: If true, return bias together with outputs in forward pass.
     """
 
     def __init__(
@@ -585,8 +587,6 @@ def weight_loader(self,
                 param.shard_id.append(loaded_shard_id)
                 param.shard_id_map[loaded_shard_id] = len(param.data_container)
                 param.data_container.append(loaded_weight)
-                if len(param.data_container) == 2:
-                    self.qweight = param.materialize_nested()
                 return
 
         param_data = param.data
@@ -805,6 +805,7 @@ class QKVParallelLinear(ColumnParallelLinear):
         quant_config: Quantization configure.
         prefix: The name of the layer in the state dict, including all parents
                         (e.g. model.layers.0.qkv_proj)
+        return_bias: If true, return bias together with outputs in forward pass.
     """
 
     def __init__(
@@ -979,8 +980,6 @@ def weight_loader(self,
                 param.shard_id.append(loaded_shard_id)
                 param.shard_id_map[loaded_shard_id] = len(param.data_container)
                 param.data_container.append(loaded_weight)
-                if len(param.data_container) == 3:
-                    self.qweight = param.materialize_nested()
                 return
 
         param_data = param.data
@@ -1155,7 +1154,13 @@ class RowParallelLinear(LinearBase):
                        bias can be fused with other element-wise operations.
                        We skip adding bias but instead return it.
         params_dtype: Data type for the parameters.
+        reduce_results: If true, call all-reduce on output and make Y available
+                       to all GPUs; otherwise, every GPU will have its own
+                       output, which is Y = X_iA_i.
         quant_config: Quantization configure.
+        prefix: The name of the layer in the state dict, including all parents
+                        (e.g. model.layers.0.down_proj)
+        return_bias: If true, return bias together with outputs in forward pass.
     """
 
     def __init__(
diff --git a/vllm/model_executor/layers/mamba/mamba2_metadata.py b/vllm/model_executor/layers/mamba/mamba2_metadata.py
index e5b88de2fcc8..019f634a9ef4 100644
--- a/vllm/model_executor/layers/mamba/mamba2_metadata.py
+++ b/vllm/model_executor/layers/mamba/mamba2_metadata.py
@@ -5,10 +5,9 @@
 import torch
 
 from vllm.attention.backends.abstract import AttentionMetadata
-from vllm.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.attention.backends.placeholder_attn import (
     PlaceholderAttentionMetadata)
-from vllm.attention.backends.xformers import XFormersMetadata
+from vllm.platforms import current_platform
 
 
 @dataclass
@@ -23,6 +22,21 @@ class Mamba2Metadata:
     chunk_offsets: torch.Tensor
 
 
+def get_platform_metadata_classes() -> tuple[type[AttentionMetadata], ...]:
+    """Returns the appropriate metadata classes for the current platform."""
+    if current_platform.is_rocm():
+        from vllm.attention.backends.rocm_flash_attn import (
+            ROCmFlashAttentionMetadata)
+        return (ROCmFlashAttentionMetadata, PlaceholderAttentionMetadata)
+    elif current_platform.is_cuda():
+        from vllm.attention.backends.flash_attn import FlashAttentionMetadata
+        from vllm.attention.backends.xformers import XFormersMetadata
+        return (FlashAttentionMetadata, XFormersMetadata,
+                PlaceholderAttentionMetadata)
+    raise ValueError(
+        f"Unsupported platform for Mamba2: {current_platform.device_type}")
+
+
 def _query_start_loc_to_chunk_indices_offsets(query_start_loc: torch.Tensor,
                                               chunk_size: int,
                                               total_seqlens: int):
@@ -78,9 +92,8 @@ def prepare_mamba2_metadata(
 
     # Compute seq_idx, chunk_indices and chunk_offsets for prefill only
     if num_prefills > 0:
-        if (isinstance(attn_metadata,
-                       (FlashAttentionMetadata, XFormersMetadata,
-                        PlaceholderAttentionMetadata))
+        attn_metadata_instances = get_platform_metadata_classes()
+        if (isinstance(attn_metadata, attn_metadata_instances)
                 and attn_metadata.context_lens_tensor is not None):
             has_initial_states = \
                 attn_metadata.context_lens_tensor[:num_prefills] > 0  #[batch,]
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer2.py b/vllm/model_executor/layers/mamba/mamba_mixer2.py
index bc6e6fcdd0a2..f94ab75f9a4f 100644
--- a/vllm/model_executor/layers/mamba/mamba_mixer2.py
+++ b/vllm/model_executor/layers/mamba/mamba_mixer2.py
@@ -34,7 +34,11 @@
 @CustomOp.register("mixer2_gated_rms_norm")
 class Mixer2RMSNormGated(CustomOp):
 
-    def __init__(self, full_hidden_size, full_n_groups, eps=1e-6):
+    def __init__(self,
+                 full_hidden_size: int,
+                 full_n_groups: int,
+                 use_rms_norm: bool = True,
+                 eps: float = 1e-6):
         super().__init__()
         self.tp_size = get_tensor_model_parallel_world_size()
         self.tp_rank = get_tensor_model_parallel_rank()
@@ -44,11 +48,17 @@ def __init__(self, full_hidden_size, full_n_groups, eps=1e-6):
         self.n_groups = full_hidden_size // self.group_size
 
         self.variance_epsilon = eps
-        self.weight = nn.Parameter(torch.ones(self.per_rank_hidden_size))
-        set_weight_attrs(self.weight,
-                         {"weight_loader": sharded_weight_loader(0)})
-        assert self.full_hidden_size % self.tp_size== 0,\
-            "Tensor parallel world size must divide hidden size."
+        self.use_rms_norm = use_rms_norm
+        if self.use_rms_norm:
+            # Register norm weight only if we're actually applying RMSNorm
+            self.weight = nn.Parameter(torch.ones(self.per_rank_hidden_size))
+            set_weight_attrs(self.weight,
+                             {"weight_loader": sharded_weight_loader(0)})
+        else:
+            # Avoid checkpoint mismatch by skipping unused parameter
+            self.register_parameter("weight", None)
+        assert (self.full_hidden_size % self.tp_size == 0
+                ), "Tensor parallel world size must divide hidden size."
 
     def forward_native(
         self,
@@ -66,6 +76,8 @@ def forward_native(
         #      the input and then redundantly compute the RMSNorm.
         input_dtype = x.dtype
         x = x * nn.functional.silu(gate.to(torch.float32))
+        if not self.use_rms_norm:
+            return x.to(input_dtype)
 
         if self.n_groups == 1:
             if self.tp_size > 1:
@@ -74,7 +86,7 @@ def forward_native(
                 global_sums = tensor_model_parallel_all_reduce(local_sums)
                 # Calculate the variance
                 count = self.tp_size * x.shape[-1]
-                variance = (global_sums / count)
+                variance = global_sums / count
 
             else:
                 variance = x.pow(2).mean(-1, keepdim=True)
@@ -105,6 +117,11 @@ def forward_cuda(
         x: torch.Tensor,
         gate: torch.Tensor,
     ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        input_dtype = x.dtype
+        if not self.use_rms_norm:
+            # Keep gate in float32 for numerical stability during silu
+            return x * nn.functional.silu(gate.to(
+                torch.float32)).to(input_dtype)
 
         if self.tp_size > 1 or self.n_groups != 1:
             return self.forward_native(x, gate)
@@ -124,7 +141,7 @@ def forward_cuda(
 
 
 def extra_groups_for_head_shards(ngroups: int, tp_size: int):
-    """Compute the increase in group numbers to account for 
+    """Compute the increase in group numbers to account for
     replication in order to accompany the head shards."""
 
     # in the case ngoups % tp_size == 0, this will be zero
@@ -182,13 +199,15 @@ def loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
             #   seem to handle slices well.
             # https://github.com/python/mypy/issues/2410
             param.data[
-                boundary:(boundary + take),  # type: ignore[misc]
-                ...] = loaded_weight[loaded_start_idx:(  # type: ignore[misc]
-                    loaded_start_idx + take)]  # type: ignore[misc]
+                boundary:(boundary + take),
+                ...  # type: ignore[misc]
+            ] = loaded_weight[loaded_start_idx:(loaded_start_idx +
+                                                take)  # type: ignore[misc]
+                              ]  # type: ignore[misc]
 
             # move indexing boundaries
             boundary += shard_size
-            loaded_boundary += (full_dim - extra)
+            loaded_boundary += full_dim - extra
 
     return loader
 
@@ -206,19 +225,22 @@ class MambaMixer2(CustomOp):
     **selective** state spaces)
     """
 
-    def __init__(self,
-                 hidden_size: int,
-                 ssm_state_size: int,
-                 conv_kernel_size: int,
-                 intermediate_size: int,
-                 use_conv_bias: bool,
-                 use_bias: bool,
-                 n_groups: int = 1,
-                 num_heads: int = 128,
-                 head_dim: int = 64,
-                 rms_norm_eps: float = 1e-5,
-                 activation="silu",
-                 quant_config: Optional[QuantizationConfig] = None):
+    def __init__(
+        self,
+        hidden_size: int,
+        ssm_state_size: int,
+        conv_kernel_size: int,
+        intermediate_size: int,
+        use_conv_bias: bool,
+        use_bias: bool,
+        n_groups: int = 1,
+        num_heads: int = 128,
+        head_dim: int = 64,
+        rms_norm_eps: float = 1e-5,
+        activation: str = "silu",
+        use_rms_norm: bool = True,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
         super().__init__()
 
         # For TP, the sharding plan is as follows:
@@ -238,17 +260,16 @@ def __init__(self,
         self.tp_size = get_tensor_model_parallel_world_size()
         tp_rank = get_tensor_model_parallel_rank()
 
-        assert num_heads % self.tp_size == 0, \
-            "Tensor parallel world size must divide num heads."
+        assert (num_heads % self.tp_size == 0
+                ), "Tensor parallel world size must divide num heads."
 
-        assert (n_groups % self.tp_size) == 0 or n_groups == 1, \
-            (
-                "If tensor parallel world size does not divide num_heads, "
-                "then num_groups must equal 1."
-            )
+        assert (n_groups % self.tp_size) == 0 or n_groups == 1, (
+            "If tensor parallel world size does not divide num_heads, "
+            "then num_groups must equal 1.")
 
-        assert self.tp_size == 1 or quant_config is None, \
-            "Tensor parallel currently not supported for quantized models."
+        assert (
+            self.tp_size == 1 or quant_config is None
+        ), "Tensor parallel currently not supported for quantized models."
 
         self.ssm_state_size = ssm_state_size
         self.activation = activation
@@ -265,8 +286,7 @@ def __init__(self,
             self.n_groups = n_groups + extra_groups_for_head_shards(
                 n_groups, self.tp_size)
 
-        self.conv_dim = (intermediate_size +
-                         2 * self.n_groups * ssm_state_size)
+        self.conv_dim = intermediate_size + 2 * self.n_groups * ssm_state_size
         self.conv1d = ColumnParallelLinear(
             input_size=conv_kernel_size,
             output_size=self.conv_dim,
@@ -279,11 +299,12 @@ def __init__(self,
         # doesn't allow to override it
         self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1)
 
-        self.in_proj = ColumnParallelLinear(input_size=hidden_size,
-                                            output_size=intermediate_size +
-                                            self.conv_dim + self.num_heads,
-                                            bias=use_bias,
-                                            quant_config=quant_config)
+        self.in_proj = ColumnParallelLinear(
+            input_size=hidden_size,
+            output_size=intermediate_size + self.conv_dim + self.num_heads,
+            bias=use_bias,
+            quant_config=quant_config,
+        )
 
         # - because in_proj is a concatenation of 3 weights, we
         #   need to interleave them before sharding
@@ -305,7 +326,8 @@ def __init__(self,
         # - ditto for the otther two weights below
         delattr(self.conv1d.bias, "weight_loader")
         set_weight_attrs(
-            self.conv1d.bias, {
+            self.conv1d.bias,
+            {
                 "weight_loader":
                 mamba_v2_sharded_weight_loader(
                     [
@@ -316,18 +338,25 @@ def __init__(self,
                     self.tp_size,
                     tp_rank,
                 )
-            })
+            },
+        )
 
         delattr(self.conv1d.weight, "weight_loader")
         set_weight_attrs(
-            self.conv1d.weight, {
+            self.conv1d.weight,
+            {
                 "weight_loader":
-                mamba_v2_sharded_weight_loader([
-                    intermediate_settings,
-                    group_shard_settings,
-                    group_shard_settings,
-                ], self.tp_size, tp_rank)
-            })
+                mamba_v2_sharded_weight_loader(
+                    [
+                        intermediate_settings,
+                        group_shard_settings,
+                        group_shard_settings,
+                    ],
+                    self.tp_size,
+                    tp_rank,
+                )
+            },
+        )
 
         if quant_config is None:
             # - quant layers do not have a weight loader
@@ -345,8 +374,10 @@ def __init__(self,
                             head_setings,  # for dt
                         ],
                         self.tp_size,
-                        tp_rank)
-                })
+                        tp_rank,
+                    )
+                },
+            )
 
         # - these are TPed by heads to reduce the size of the
         #   temporal shape
@@ -357,6 +388,7 @@ def __init__(self,
             ))
         self.D = nn.Parameter(torch.ones(num_heads // self.tp_size))
         self.dt_bias = nn.Parameter(torch.ones(num_heads // self.tp_size))
+        self.use_rms_norm = use_rms_norm
 
         set_weight_attrs(self.D, {"weight_loader": sharded_weight_loader(0)})
         a_weight_loader = composed_weight_loader(
@@ -365,18 +397,25 @@ def __init__(self,
         set_weight_attrs(self.dt_bias,
                          {"weight_loader": sharded_weight_loader(0)})
 
-        self.out_proj = RowParallelLinear(intermediate_size,
-                                          hidden_size,
-                                          bias=use_bias,
-                                          input_is_parallel=True,
-                                          quant_config=quant_config)
+        self.out_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=use_bias,
+            input_is_parallel=True,
+            quant_config=quant_config,
+        )
 
         self.norm = Mixer2RMSNormGated(intermediate_size,
                                        n_groups,
+                                       self.use_rms_norm,
                                        eps=rms_norm_eps)
 
-    def forward_native(self, hidden_states: torch.Tensor,
-                       conv_state: torch.Tensor, ssm_state: torch.Tensor):
+    def forward_native(
+        self,
+        hidden_states: torch.Tensor,
+        conv_state: torch.Tensor,
+        ssm_state: torch.Tensor,
+    ):
         pass
 
     def forward_cuda(
@@ -384,6 +423,7 @@ def forward_cuda(
         hidden_states: torch.Tensor,
         mamba_cache_params: MambaCacheParams,
         mamba2_metadata: Mamba2Metadata,
+        mup_vector: Optional[torch.Tensor] = None,
     ):
         # mamba2_metadata contains metadata necessary for the mamba2 triton
         # kernels to operate in continuous batching and in chunked prefill
@@ -401,6 +441,10 @@ def forward_cuda(
 
         # 1. Gated MLP's linear projection
         projected_states, _ = self.in_proj(hidden_states)
+
+        if mup_vector is not None:
+            projected_states = projected_states * mup_vector
+
         gate, hidden_states_B_C, dt = torch.split(
             projected_states,
             [
@@ -561,6 +605,9 @@ def forward_cuda(
         hidden_states = torch.vstack(ssd_output_list)
 
         # 4. gated MLP
+        # The gated RMSNorm applies SiLU to the gate internally.
+        # Note that SiLU is applied before normalization, unlike standard
+        # norm usage.
         hidden_states = self.norm(hidden_states, gate)
 
         # 5. Final linear projection
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index a22f8103e8fd..407b9c72f41d 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -14,7 +14,7 @@
     "ptpc_fp8",
     "fbgemm_fp8",
     "modelopt",
-    "nvfp4",
+    "modelopt_fp4",
     "marlin",
     "bitblas",
     "gguf",
@@ -120,7 +120,7 @@ def get_quantization_config(quantization: str) -> type[QuantizationConfig]:
         "fp8": Fp8Config,
         "fbgemm_fp8": FBGEMMFp8Config,
         "modelopt": ModelOptFp8Config,
-        "nvfp4": ModelOptNvFp4Config,
+        "modelopt_fp4": ModelOptNvFp4Config,
         "marlin": MarlinConfig,
         "bitblas": BitBLASConfig,
         "gguf": GGUFConfig,
diff --git a/vllm/model_executor/layers/quantization/auto_round.py b/vllm/model_executor/layers/quantization/auto_round.py
index a5e63843cf62..2d9f5e52bd65 100644
--- a/vllm/model_executor/layers/quantization/auto_round.py
+++ b/vllm/model_executor/layers/quantization/auto_round.py
@@ -8,6 +8,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.linear import (LinearBase,
                                                UnquantizedLinearMethod)
+from vllm.model_executor.layers.quantization import QuantizationMethods
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
@@ -74,7 +75,7 @@ def __repr__(self) -> str:
                 f"group_size={self.group_size}, sym={self.sym})")
 
     @classmethod
-    def get_name(cls):  ## use str will trigger preci issue
+    def get_name(cls) -> QuantizationMethods:
         return "auto-round"
 
     @classmethod
@@ -142,18 +143,18 @@ def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"):
                      prefix, layer.__class__.__name__, weight_bits, group_size,
                      sym)
         if backend == "auto" or "marlin" in backend:
+            AWQ_TYPE_MAP = {
+                4: scalar_types.uint4,
+                8: scalar_types.uint8,
+            }
+            use_marlin = (weight_bits
+                          in AWQ_TYPE_MAP) and check_marlin_supported(
+                              AWQ_TYPE_MAP[weight_bits], group_size, not sym)
+
             if isinstance(layer, FusedMoE):
-                use_marlin = check_moe_marlin_supports_layer(layer, group_size)
-            else:
+                use_marlin = use_marlin and check_moe_marlin_supports_layer(
+                    layer, group_size)
 
-                AWQ_TYPE_MAP = {
-                    4: scalar_types.uint4,
-                    8: scalar_types.uint8,
-                }
-                use_marlin = ((weight_bits, sym) in AWQ_TYPE_MAP
-                              and check_marlin_supported(
-                                  AWQ_TYPE_MAP[(weight_bits)], group_size,
-                                  not sym))
         else:
             use_marlin = False
         if use_marlin:
@@ -180,10 +181,11 @@ def apply_awq_quant_layer(self, layer, prefix: str, backend: str = "auto"):
             from vllm.model_executor.layers.quantization.moe_wna16 import (
                 MoeWNA16Config)
             config = {
-                "linear_quant_method": "awq",
-                "weight_bits": weight_bits,
+                "quant_method": "awq",
+                "bits": weight_bits,
                 "group_size": group_size,
                 "zero_point": not sym,
+                "lm_head": False,
             }
             return MoeWNA16Config.from_config(config).get_quant_method(
                 layer, prefix)
@@ -213,18 +215,18 @@ def apply_gptq_quant_layer(self,
                      prefix, layer.__class__.__name__, weight_bits, group_size,
                      sym)
         if backend == "auto" or "marlin" in backend:
+            GPTQ_TYPE_MAP = {
+                (4, True): scalar_types.uint4b8,
+                (8, True): scalar_types.uint8b128,
+            }
+            use_marlin = ((weight_bits, sym) in GPTQ_TYPE_MAP
+                          and check_marlin_supported(
+                              GPTQ_TYPE_MAP[(weight_bits, sym)],
+                              group_size,
+                              has_zp=not sym))
             if isinstance(layer, FusedMoE):
-                use_marlin = check_moe_marlin_supports_layer(layer, group_size)
-            else:
-                GPTQ_TYPE_MAP = {
-                    (4, True): scalar_types.uint4b8,
-                    (8, True): scalar_types.uint8b128,
-                }
-                use_marlin = ((weight_bits, sym) in GPTQ_TYPE_MAP
-                              and check_marlin_supported(
-                                  GPTQ_TYPE_MAP[(weight_bits, sym)],
-                                  group_size,
-                                  has_zp=not sym))
+                use_marlin = use_marlin and check_moe_marlin_supports_layer(
+                    layer, group_size)
         else:
             use_marlin = False
         if use_marlin:
@@ -251,11 +253,11 @@ def apply_gptq_quant_layer(self,
                 from vllm.model_executor.layers.quantization.moe_wna16 import (
                     MoeWNA16Config)
                 config = {
-                    "linear_quant_method": "gptq",
-                    "weight_bits": weight_bits,
+                    "quant_method": "gptq",
+                    "bits": weight_bits,
                     "group_size": group_size,
                     "sym": sym,
-                    "lm_head_quantized": False,
+                    "lm_head": False,
                 }
                 return MoeWNA16Config.from_config(config).get_quant_method(
                     layer, prefix)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
index fa0067c44802..9241ceeb4db2 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -286,9 +286,8 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
                 rocm_aiter_fused_experts, shuffle_weights)
 
             # reshaping weights is required for aiter moe kernel.
-            shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight.data,
-                                                        layer.w2_weight.data,
-                                                        layout=(16, 16))
+            shuffled_w13, shuffled_w2 = shuffle_weights(
+                layer.w13_weight.data, layer.w2_weight.data)
 
             layer.w13_weight = torch.nn.Parameter(shuffled_w13,
                                                   requires_grad=False)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
index ccd54281ceb7..75e81c4dd49d 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -1,10 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import re
 from collections.abc import Iterable, Mapping
 from types import MappingProxyType
 from typing import Optional
 
+import regex as re
 from compressed_tensors import CompressionFormat
 from torch.nn import Module
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index bfb84ec8ed69..c3082d8341b9 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -10,7 +10,6 @@
 from torch.nn.parameter import Parameter
 
 import vllm.envs as envs
-import vllm.model_executor.layers.fused_moe.modular_kernel as mk
 from vllm import _custom_ops as ops
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.logger import init_logger
@@ -63,10 +62,9 @@ def __init__(
         weight_block_size: Optional[list[int]] = None,
     ) -> None:
         super().__init__()
+
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
-        if is_checkpoint_fp8_serialized:
-            logger.warning("Detected fp8 checkpoint. Please note that the "
-                           "format is experimental and subject to change.")
+
         if activation_scheme not in ACTIVATION_SCHEMES:
             raise ValueError(
                 f"Unsupported activation scheme {activation_scheme}")
@@ -465,7 +463,7 @@ def __init__(self, quant_config: Fp8Config):
                 logger.warning_once(
                     "DeepGemm not supported on the current platform.")
 
-        self.fused_experts = functools.partial(
+        self.fused_experts = functools.partial(  # type: ignore
             fused_experts,
             block_shape=self.quant_config.weight_block_size,
             allow_deep_gemm=self.allow_deep_gemm)
@@ -601,7 +599,7 @@ def create_weights(self, layer: Module, num_experts: int, hidden_size: int,
     def process_weights_after_loading(self, layer: Module) -> None:
         # Lazy import to avoid importing triton too early.
         from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
-            expand_weights, is_rocm_aiter_moe_enabled, shuffle_weights)
+            is_rocm_aiter_moe_enabled, shuffle_weights)
 
         self.rocm_aiter_moe_enabled = is_rocm_aiter_moe_enabled()
 
@@ -633,9 +631,7 @@ def process_weights_after_loading(self, layer: Module) -> None:
             if self.rocm_aiter_moe_enabled:
                 # reshaping weights is required for aiter moe kernel.
                 shuffled_w13, shuffled_w2 = shuffle_weights(
-                    layer.w13_weight.data,
-                    layer.w2_weight.data,
-                    layout=(16, 16))
+                    layer.w13_weight.data, layer.w2_weight.data)
 
                 layer.w13_weight = torch.nn.Parameter(shuffled_w13,
                                                       requires_grad=False)
@@ -681,20 +677,8 @@ def process_weights_after_loading(self, layer: Module) -> None:
                                                  requires_grad=False)
             if self.rocm_aiter_moe_enabled:
                 # reshaping weights is required for aiter moe kernel.
-                w13_scales, w2_scales = expand_weights(
-                    layer.w13_weight_scale.data,
-                    layer.w2_weight_scale.data,
-                    expansion_dims=[
-                        layer.w13_weight.shape[1], layer.w2_weight.shape[1]
-                    ])
-                layer.w13_weight_scale = torch.nn.Parameter(
-                    w13_scales.contiguous(), requires_grad=False)
-                layer.w2_weight_scale = torch.nn.Parameter(
-                    w2_scales.contiguous(), requires_grad=False)
-
-                shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight,
-                                                            layer.w2_weight,
-                                                            layout=(16, 16))
+                shuffled_w13, shuffled_w2 = shuffle_weights(
+                    layer.w13_weight, layer.w2_weight)
 
                 layer.w13_weight = torch.nn.Parameter(shuffled_w13,
                                                       requires_grad=False)
@@ -766,20 +750,8 @@ def process_weights_after_loading(self, layer: Module) -> None:
                     start += shard_size
 
             if self.rocm_aiter_moe_enabled:
-                # reshaping weights is required for aiter moe kernel.
-                expansion_dims = [
-                    layer.w13_weight.shape[1], layer.w2_weight.shape[1]
-                ]
-                max_w13_scales, w2_scales = expand_weights(
-                    max_w13_scales,
-                    layer.w2_weight_scale.data,
-                    expansion_dims=expansion_dims)
-                layer.w2_weight_scale = torch.nn.Parameter(
-                    w2_scales.contiguous(), requires_grad=False)
-
-                shuffled_w13, shuffled_w2 = shuffle_weights(layer.w13_weight,
-                                                            layer.w2_weight,
-                                                            layout=(32, 32))
+                shuffled_w13, shuffled_w2 = shuffle_weights(
+                    layer.w13_weight, layer.w2_weight)
 
                 layer.w13_weight = torch.nn.Parameter(shuffled_w13,
                                                       requires_grad=False)
@@ -795,17 +767,12 @@ def process_weights_after_loading(self, layer: Module) -> None:
             del layer.w13_input_scale
             del layer.w2_input_scale
 
-    def set_prepare_finalize(
-        self,
-        dp_size: int,
-        world_size: int,
-        prepare_finalize: mk.FusedMoEPrepareAndFinalize,
-    ) -> bool:
+    def select_gemm_impl(self, prepare_finalize):
         from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
             TritonOrDeepGemmExperts)
 
-        if self.use_marlin or self.rocm_aiter_moe_enabled:
-            return False
+        assert not self.use_marlin and not self.rocm_aiter_moe_enabled, (
+            "Marlin and ROCm AITER are not supported with all2all yet.")
 
         experts = TritonOrDeepGemmExperts(
             use_fp8_w8a8=True,
@@ -813,12 +780,7 @@ def set_prepare_finalize(
             allow_deep_gemm=self.allow_deep_gemm,
         )
 
-        self.fused_experts = mk.FusedMoEModularKernel(
-            prepare_finalize,
-            experts,
-        )
-
-        return True
+        return experts
 
     def apply(
         self,
diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py
index d7d4a5d6acdb..1fcb6d7afc9b 100644
--- a/vllm/model_executor/layers/quantization/gguf.py
+++ b/vllm/model_executor/layers/quantization/gguf.py
@@ -9,7 +9,6 @@
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe.layer import (FusedMoE,
                                                         FusedMoEMethodBase)
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
@@ -19,6 +18,7 @@
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.model_executor.utils import set_weight_attrs
+from vllm.utils import direct_register_custom_op
 
 logger = init_logger(__name__)
 
@@ -96,8 +96,8 @@ def get_quant_method(self, layer: torch.nn.Module,
 MMQ_QUANT_TYPES = STANDARD_QUANT_TYPES | KQUANT_TYPES
 
 
-def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor,
-                  qweight_type: int) -> torch.Tensor:
+def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor,
+                        qweight_type: int) -> torch.Tensor:
     # HACK: when doing chunked prefill we don't generate output tokens
     # so input to logits generator is empty which causes invalid parameter
     if x.shape[0] == 0:
@@ -130,6 +130,30 @@ def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor,
     return y
 
 
+def _fused_mul_mat_gguf_fake(
+    x: torch.Tensor,
+    qweight: torch.Tensor,
+    qweight_type: int,
+) -> torch.Tensor:
+    return torch.empty(x.shape[0],
+                       qweight.shape[0],
+                       dtype=x.dtype,
+                       device=x.device)
+
+
+try:
+    direct_register_custom_op(
+        op_name="_fused_mul_mat_gguf",
+        op_func=_fused_mul_mat_gguf,
+        mutates_args=[],
+        fake_impl=_fused_mul_mat_gguf_fake,
+    )
+    fused_mul_mat_gguf = torch.ops.vllm._fused_mul_mat_gguf
+
+except AttributeError as error:
+    raise error
+
+
 def _fused_moe_gguf(
     x: torch.Tensor,
     w1: torch.Tensor,
@@ -138,8 +162,21 @@ def _fused_moe_gguf(
     topk_ids: torch.Tensor,
     qweight_type: int,
     qweight_type2: int,
-    act,
+    activation: str,
 ) -> torch.Tensor:
+
+    def act(x: torch.Tensor):
+        d = x.shape[-1] // 2
+        output_shape = (x.shape[:-1] + (d, ))
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        if activation == "silu":
+            torch.ops._C.silu_and_mul(out, x)
+        elif activation == "gelu":
+            torch.ops._C.gelu_and_mul(out, x)
+        else:
+            raise ValueError(f"Unsupported activation: {activation}")
+        return out
+
     # lazy import to avoid triggering triton import in CPU backend
     from vllm.model_executor.layers.fused_moe.fused_moe import (
         moe_align_block_size)
@@ -189,12 +226,12 @@ def _fused_moe_gguf(
             for ww, ii in zip(w, idx):
                 expert_up = w1[ii]
 
-                out = _fuse_mul_mat(inp, expert_up, qweight_type)
+                out = fused_mul_mat_gguf(inp, expert_up, qweight_type)
                 out = act(out)
 
                 expert_down = w2[ii]
-                current_state = _fuse_mul_mat(out, expert_down,
-                                              qweight_type2).mul_(ww)
+                current_state = fused_mul_mat_gguf(out, expert_down,
+                                                   qweight_type2).mul_(ww)
                 if current_hidden_state is None:
                     current_hidden_state = current_state
                 else:
@@ -203,6 +240,78 @@ def _fused_moe_gguf(
     return out_hidden_states
 
 
+def _fused_moe_gguf_fake(
+    x: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    qweight_type: int,
+    qweight_type2: int,
+    activation: str,
+) -> torch.Tensor:
+    return torch.empty_like(x)
+
+
+try:
+    direct_register_custom_op(
+        op_name="_fused_moe_gguf",
+        op_func=_fused_moe_gguf,
+        mutates_args=[],
+        fake_impl=_fused_moe_gguf_fake,
+    )
+    fused_moe_gguf = torch.ops.vllm._fused_moe_gguf
+
+except AttributeError as error:
+    raise error
+
+
+def _apply_gguf_embedding(
+    x: torch.Tensor,
+    qweight: torch.Tensor,
+    qweight_type: int,
+    hidden_size: int,
+    dtype: Optional[torch.dtype] = None,
+) -> torch.Tensor:
+    if qweight_type in UNQUANTIZED_TYPES:
+        return torch.embedding(qweight, x)
+    elif qweight_type in DEQUANT_TYPES:
+        block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
+        x_flat = x.flatten()
+        assert (hidden_size == qweight.shape[1] // type_size * block_size)
+        quant = torch.index_select(qweight, dim=0, index=x_flat)
+        dequant = ops.ggml_dequantize(quant, qweight_type, hidden_size,
+                                      x_flat.shape[0], dtype)
+        return dequant.view(*x.shape, hidden_size)
+    else:
+        qweight_type = WeightType(qweight_type)
+        raise NotImplementedError(
+            f"Unsupported GGUF quantization type: {qweight_type}")
+
+
+def _apply_gguf_embedding_fake(
+    x: torch.Tensor,
+    qweight: torch.Tensor,
+    qweight_type: int,
+    hidden_size: int,
+    dtype: Optional[torch.dtype] = None,
+) -> torch.Tensor:
+    return torch.empty(*x.shape, hidden_size, dtype=dtype, device=x.device)
+
+
+try:
+    direct_register_custom_op(
+        op_name="_apply_gguf_embedding",
+        op_func=_apply_gguf_embedding,
+        mutates_args=[],
+        fake_impl=_apply_gguf_embedding_fake,
+    )
+    apply_gguf_embedding = torch.ops.vllm._apply_gguf_embedding
+
+except AttributeError as error:
+    raise error
+
+
 class GGUFLinearMethod(LinearMethodBase):
     """Linear method for GGUF.
 
@@ -249,26 +358,76 @@ def create_weights(self, layer: torch.nn.Module,
         set_weight_attrs(qweight_type, extra_weight_attrs)
         layer.register_parameter("qweight_type", qweight_type)
 
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        qweight_type = layer.qweight_type.weight_type
+        if not (qweight_type in UNQUANTIZED_TYPES
+                or qweight_type in DEQUANT_TYPES):
+            qweight_type = WeightType(qweight_type)
+            raise ValueError(
+                f"Unsupported GGUF quantization type {qweight_type} in "
+                f"layer {layer}.")
+        # For MergedColumnParallelLinear and QKVParallelLinear, we need to
+        # materialize the padded weight parameter for CUDA Graph compatibility.
+        self._create_padded_weight_param(layer)
+
+    def _create_padded_weight_param(self, layer: torch.nn.Module):
+        """Create padded weight parameter for GGUF MergedLinear layer."""
+        qweight = layer.qweight
+        shard_id_map = qweight.shard_id_map
+        shard_id = qweight.shard_id
+        if len(data_container := qweight.data_container) > 1:
+            dtypes = {data.dtype for data in data_container}
+            if len(dtypes) != 1:
+                raise ValueError(f"Data container has mixed dtypes: {dtypes}")
+            dtype = next(iter(dtypes))
+            # shards are concatenated along dim0 and zero-padded along dim1
+            padded_side = max(x.size(1) for x in data_container)
+            concat_side = sum(x.size(0) for x in data_container)
+            # Pad the quantized weights to dense tensor, and create a map
+            # with the location of each shard in the padded tensor.
+            padded_data = torch.zeros((concat_side, padded_side),
+                                      dtype=dtype,
+                                      device=qweight.device)
+            # (dim0_start, dim0_end, dim1_size)
+            shard_offset_map = dict[str, tuple[int, int, int]]()
+            for idx in shard_id:
+                id_in_container = shard_id_map[idx]
+                start = sum(
+                    x.size(0) for x in data_container[:id_in_container])
+                end = start + data_container[id_in_container].size(0)
+                size = data_container[id_in_container].size(1)
+                padded_data[start:end, :size] = data_container[id_in_container]
+                shard_offset_map[idx] = (start, end, size)
+            qweight.data_container.clear()
+            padded_param = Parameter(padded_data, requires_grad=False)
+            set_weight_attrs(padded_param, vars(qweight))
+            set_weight_attrs(padded_param,
+                             {"shard_offset_map": shard_offset_map})
+            layer.register_parameter("qweight", padded_param)
+
     def apply(self,
               layer: torch.nn.Module,
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        shard_id = getattr(layer.qweight, "shard_id", None)
+        shard_id = layer.qweight.shard_id
 
         if shard_id:
             # dequantize shard weights respectively
             shard_id = ["q", "k", "v"] if "q" in shard_id else shard_id
-            qweight = layer.qweight.unbind(0)
+            qweight = layer.qweight
             result = []
             for idx in shard_id:
-                q_idx = layer.qweight.shard_id_map[idx]
+                start, end, offset = layer.qweight.shard_offset_map[idx]
                 qweight_type = layer.qweight_type.shard_weight_type[idx]
-                result.append(_fuse_mul_mat(x, qweight[q_idx], qweight_type))
+                result.append(
+                    fused_mul_mat_gguf(
+                        x, qweight[start:end, :offset].contiguous(),
+                        qweight_type))
             out = torch.cat(result, axis=1)
         else:
             qweight = layer.qweight
             qweight_type = layer.qweight_type.weight_type
-            out = _fuse_mul_mat(x, qweight, qweight_type)
+            out = fused_mul_mat_gguf(x, qweight, qweight_type)
         if bias is not None:
             out.add_(bias)
         return out
@@ -338,7 +497,6 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
 
         set_weight_attrs(w2_qweight_type, extra_weight_attrs)
         layer.register_parameter("w2_qweight_type", w2_qweight_type)
-        self.act = SiluAndMul()
 
     def apply(
         self,
@@ -375,10 +533,10 @@ def apply(
             custom_routing_function=custom_routing_function,
             scoring_func=scoring_func,
             e_score_correction_bias=e_score_correction_bias)
-        return _fused_moe_gguf(x, layer.w13_qweight, layer.w2_qweight,
-                               topk_weights, topk_ids,
-                               layer.w13_qweight_type.weight_type,
-                               layer.w2_qweight_type.weight_type, self.act)
+        return fused_moe_gguf(x, layer.w13_qweight, layer.w2_qweight,
+                              topk_weights, topk_ids,
+                              layer.w13_qweight_type.weight_type,
+                              layer.w2_qweight_type.weight_type, activation)
 
 
 class GGUFEmbeddingMethod(GGUFLinearMethod):
@@ -392,34 +550,15 @@ def embedding(self, layer: torch.nn.Module,
                   x: torch.Tensor) -> torch.Tensor:
         qweight = layer.qweight
         qweight_type = layer.qweight_type.weight_type
+        hidden_size = qweight.tensor_shape[1]
 
-        block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
-        hidden_size = qweight.shape[1] // type_size * block_size
-        if qweight_type < 2:
-            return torch.embedding(qweight, x)
-        x_flat = x.flatten()
-        quant = torch.index_select(qweight, dim=0, index=x_flat)
-        dequant = ops.ggml_dequantize(quant, qweight_type, hidden_size,
-                                      x_flat.shape[0], self.params_dtype)
-        return dequant.view(*x.shape, hidden_size)
+        return apply_gguf_embedding(x,
+                                    qweight,
+                                    qweight_type,
+                                    hidden_size,
+                                    dtype=self.params_dtype)
 
 
 class GGUFUninitializedParameter(UninitializedParameter):
     cls_to_become = Parameter
     data_container: list[torch.Tensor]
-
-    def materialize_nested(self) -> Parameter:
-        dtype = {data.dtype for data in self.data_container}
-        assert len(dtype) == 1, ValueError(
-            f"Data container has mixed dtypes: {dtype}")
-        dtype = next(iter(dtype))
-        nested_data = torch.nested.nested_tensor(self.data_container,
-                                                 device=self.device,
-                                                 dtype=dtype)
-        self.data_container.clear()
-        param = torch.Tensor._make_subclass(self.cls_to_become,
-                                            nested_data,
-                                            require_grad=False)
-        for k, v in self.__dict__.items():
-            setattr(param, k, v)
-        return param
diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py
index 8bce6bba460a..8108c797637d 100644
--- a/vllm/model_executor/layers/quantization/ipex_quant.py
+++ b/vllm/model_executor/layers/quantization/ipex_quant.py
@@ -14,7 +14,7 @@
 from vllm.model_executor.layers.quantization.gptq import GPTQLinearMethod
 from vllm.platforms import current_platform
 
-MIN_IPEX_VERSION = "2.5.0"
+MIN_IPEX_VERSION = "2.7.0"
 
 
 class IPEXConfig(QuantizationConfig):
@@ -181,8 +181,6 @@ def apply(self,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
         reshaped_x = x.reshape(-1, x.shape[-1])
         out = layer.ipex_qlinear(reshaped_x)
-        if bias is not None:
-            out.add_(bias)
         return out.reshape(x.shape[:-1] + (layer.ipex_output_size, ))
 
 
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py
index 13957a96deca..1c5680f952ab 100644
--- a/vllm/model_executor/layers/quantization/modelopt.py
+++ b/vllm/model_executor/layers/quantization/modelopt.py
@@ -192,7 +192,7 @@ def __init__(
 
     @classmethod
     def get_name(cls) -> QuantizationMethods:
-        return "nvfp4"
+        return "modelopt_fp4"
 
     @classmethod
     def get_supported_act_dtypes(cls) -> list[torch.dtype]:
@@ -228,7 +228,7 @@ def from_config(cls, config: dict[str, Any]) -> "ModelOptNvFp4Config":
                    exclude_modules, group_size)
 
     def is_layer_excluded(self, prefix: str, exclude_modules: list):
-        import re
+        import regex as re
         for pattern in exclude_modules:
             regex_str = pattern.replace('.', r'\.').replace('*', r'.*')
             if re.fullmatch(regex_str, prefix):
diff --git a/vllm/model_executor/layers/quantization/quark/utils.py b/vllm/model_executor/layers/quantization/quark/utils.py
index d1d293b01791..5e56bcb7564c 100644
--- a/vllm/model_executor/layers/quantization/quark/utils.py
+++ b/vllm/model_executor/layers/quantization/quark/utils.py
@@ -1,10 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import re
 from collections.abc import Iterable, Mapping
 from types import MappingProxyType
 from typing import Any, Optional
 
+import regex as re
+
 
 def deep_compare(dict1: Any, dict2: Any) -> bool:
     if type(dict1) is not type(dict2):
diff --git a/vllm/model_executor/layers/quantization/utils/gptq_utils.py b/vllm/model_executor/layers/quantization/utils/gptq_utils.py
index ff7a8169e6fb..36161d13b24f 100644
--- a/vllm/model_executor/layers/quantization/utils/gptq_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/gptq_utils.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
-import re
 from copy import deepcopy
 from typing import Optional, Union
 
+import regex as re
 import torch
 
 from vllm.config import QuantizationConfig
diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index af82b9dc93b7..3db73495827c 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -262,16 +262,16 @@ def _get_accepted(
         True, then a token can be accepted, else it should be
         rejected.
 
-        Given {math}`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of
-        {math}`\hat{x}_{n+1}` given context {math}`x_1, \dots, x_n` according
-        to the target model, and {math}`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the
+        Given $q(\hat{x}_{n+1}|x_1, \dots, x_n)$, the probability of
+        $\hat{x}_{n+1}$ given context $x_1, \dots, x_n$ according
+        to the target model, and $p(\hat{x}_{n+1}|x_1, \dots, x_n)$, the
         same conditional probability according to the draft model, the token
         is accepted with probability:
 
-        :::{math}
+        $$
         \min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)}
                         {p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right)
-        :::
+        $$
 
         This implementation does not apply causality. When using the output,
         if a token is rejected, subsequent tokens should not be used.
@@ -314,30 +314,31 @@ def _get_recovered_probs(
         target model is recovered (within hardware numerics).
 
         The probability distribution used in this rejection case is constructed
-        as follows. Given {math}`q(x|x_1, \dots, x_n)`, the probability of
-        {math}`x` given context {math}`x_1, \dots, x_n` according to the target
-        model and {math}`p(x|x_1, \dots, x_n)`, the same conditional probability
+        as follows. Given $q(x|x_1, \dots, x_n)$, the probability of
+        $x$ given context $x_1, \dots, x_n$ according to the target
+        model and $p(x|x_1, \dots, x_n)$, the same conditional probability
         according to the draft model:
 
-        :::{math}
+        $$
         x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+
-        :::
+        $$
 
-        where {math}`(f(x))_+` is defined as:
+        where $(f(x))_+$ is defined as:
 
-        :::{math}
+        $$
         (f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))}
-        :::
+        $$
 
         See https://github.com/vllm-project/vllm/pull/2336 for a visualization
         of the draft, target, and recovered probability distributions.
 
         Returns a tensor of shape [batch_size, k, vocab_size].
 
-        Note: This batches operations on GPU and thus constructs the recovered
-        distribution for all tokens, even if they are accepted. This causes
-        division-by-zero errors, so we use self._smallest_positive_value to
-        avoid that. This introduces some drift to the distribution.
+        Note:
+            This batches operations on GPU and thus constructs the recovered
+            distribution for all tokens, even if they are accepted. This causes
+            division-by-zero errors, so we use self._smallest_positive_value to
+            avoid that. This introduces some drift to the distribution.
         """
         _, k, _ = draft_probs.shape
 
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index c92e1802e410..1744b35c5aec 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -231,17 +231,19 @@ def forward(
     ) -> Optional[SamplerOutput]:
         """
         Single-step scheduling:
-        * Perform GPU-side sampling computation & compute
-          GPU-side logprobs tensor
-        * Pythonize sampling result & logprobs tensor
+            * Perform GPU-side sampling computation & compute
+            GPU-side logprobs tensor
+            * Pythonize sampling result & logprobs tensor
 
         Multi-step scheduling:
-        * Perform GPU-side sampling computation & compute
-          GPU-side logprobs tensor
-        * Defer Pythonization of sampling result & logprobs
-          tensor
-        * Encapsulate arguments required for deferred Pythonization
-          in the {class}`SamplerOutput` structure
+            * Perform GPU-side sampling computation & compute
+            GPU-side logprobs tensor
+            * Defer Pythonization of sampling result & logprobs
+            tensor
+            * Encapsulate arguments required for deferred Pythonization
+            in the
+            [`SamplerOutput`][vllm.model_executor.layers.sampler.SamplerOutput]
+            structure
 
         Args:
             logits: (num_tokens, vocab_size).
diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py
index 527a301cd8e2..a14c86148e73 100644
--- a/vllm/model_executor/layers/typical_acceptance_sampler.py
+++ b/vllm/model_executor/layers/typical_acceptance_sampler.py
@@ -93,29 +93,27 @@ def _evaluate_accepted_tokens(self, target_probs, draft_token_ids):
         Evaluates and returns a mask of accepted tokens based on the
         posterior probabilities.
 
-        Parameters:
-        ----------
-        target_probs : torch.Tensor
-            A tensor of shape (batch_size, k, vocab_size) representing 
-            the probabilities of each token in the vocabulary for each
-            position in the proposed sequence. This is the distribution
-            generated by the target model.
-        draft_token_ids : torch.Tensor
-            A tensor of shape (batch_size, k) representing the proposed
-            token ids.
+        Args:
+            target_probs (torch.Tensor): A tensor of shape
+                (batch_size, k, vocab_size) representing the probabilities of
+                each token in the vocabulary for each position in the proposed
+                sequence. This is the distribution generated by the target
+                model.
+            draft_token_ids (torch.Tensor): A tensor of shape (batch_size, k)
+                representing the proposed token ids.
 
         A draft token_id x_{n+k} is accepted if it satisfies the
         following condition
     
-        :::{math}
+        $$
         p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) > 
         \min \left( \epsilon, \delta * \exp \left(
             -H(p_{\text{original}}(
                 \cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right)
-        :::
+        $$
         
-        where {math}`p_{\text{original}}` corresponds to target_probs 
-        and {math}`\epsilon` and {math}`\delta` correspond to hyperparameters
+        where $p_{\text{original}}$ corresponds to target_probs 
+        and $\epsilon$ and $\delta$ correspond to hyperparameters
         specified using self._posterior_threshold and self._posterior_alpha
 
         This method computes the posterior probabilities for the given
@@ -126,13 +124,10 @@ def _evaluate_accepted_tokens(self, target_probs, draft_token_ids):
         returns a boolean mask indicating which tokens can be accepted.
 
         Returns:
-        -------
-        torch.Tensor
-            A boolean tensor of shape (batch_size, k) where each element
-            indicates whether the corresponding draft token has been accepted
-            or rejected. True indicates acceptance and false indicates
-            rejection.
-            
+            torch.Tensor: A boolean tensor of shape (batch_size, k) where each
+                element indicates whether the corresponding draft token has
+                been accepted or rejected. True indicates acceptance and false
+                indicates rejection.
         """
         device = target_probs.device
         candidates_prob = torch.gather(
@@ -156,17 +151,14 @@ def _get_recovered_token_ids(self, target_probs):
         The recovered token ids will fill the first unmatched token
         by the target token.
 
-        Parameters
-        ----------
-        target_probs : torch.Tensor
-            A tensor of shape (batch_size, k, vocab_size) containing 
-            the target probability distribution
-
-        Returns
-        -------
-        torch.Tensor
-            A tensor of shape (batch_size, k) with the recovered token
-            ids which are selected from target probs.
+        Args:
+            target_probs (torch.Tensor): A tensor of shape
+                (batch_size, k, vocab_size) containing the target probability
+                distribution.
+
+        Returns:
+            torch.Tensor: A tensor of shape (batch_size, k) with the recovered
+                token ids which are selected from target probs.
         """
         max_indices = torch.argmax(target_probs, dim=-1)
 
diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py
index 92a0b0923b6e..a443a652d8a3 100644
--- a/vllm/model_executor/model_loader/__init__.py
+++ b/vllm/model_executor/model_loader/__init__.py
@@ -1,8 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
+from typing import Optional
+
 from torch import nn
 
-from vllm.config import LoadConfig, LoadFormat, VllmConfig
+from vllm.config import LoadConfig, LoadFormat, ModelConfig, VllmConfig
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.bitsandbytes_loader import (
     BitsAndBytesModelLoader)
@@ -47,9 +49,14 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
     return DefaultModelLoader(load_config)
 
 
-def get_model(*, vllm_config: VllmConfig) -> nn.Module:
+def get_model(*,
+              vllm_config: VllmConfig,
+              model_config: Optional[ModelConfig] = None) -> nn.Module:
     loader = get_model_loader(vllm_config.load_config)
-    return loader.load_model(vllm_config=vllm_config)
+    if model_config is None:
+        model_config = vllm_config.model_config
+    return loader.load_model(vllm_config=vllm_config,
+                             model_config=model_config)
 
 
 __all__ = [
diff --git a/vllm/model_executor/model_loader/base_loader.py b/vllm/model_executor/model_loader/base_loader.py
index f17cab05c25d..010dd515784a 100644
--- a/vllm/model_executor/model_loader/base_loader.py
+++ b/vllm/model_executor/model_loader/base_loader.py
@@ -18,6 +18,7 @@ def download_model(self, model_config: ModelConfig) -> None:
         raise NotImplementedError
 
     @abstractmethod
-    def load_model(self, *, vllm_config: VllmConfig) -> nn.Module:
+    def load_model(self, *, vllm_config: VllmConfig,
+                   model_config: ModelConfig) -> nn.Module:
         """Load a model with the given configurations."""
         raise NotImplementedError
diff --git a/vllm/model_executor/model_loader/bitsandbytes_loader.py b/vllm/model_executor/model_loader/bitsandbytes_loader.py
index 6771c128c5a1..0d83c8d53419 100644
--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -569,10 +569,9 @@ def _load_weights(self, model_config: ModelConfig,
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model, model_config.revision)
 
-    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
+    def load_model(self, vllm_config: VllmConfig,
+                   model_config: ModelConfig) -> nn.Module:
         device_config = vllm_config.device_config
-        model_config = vllm_config.model_config
-
         with set_default_torch_dtype(model_config.dtype):
             with torch.device(device_config.device):
 
diff --git a/vllm/model_executor/model_loader/default_loader.py b/vllm/model_executor/model_loader/default_loader.py
index 21eb7d8a75fb..29a6e0af4bc6 100644
--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -11,8 +11,8 @@
 from torch import nn
 from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
 
+from vllm import envs
 from vllm.config import LoadConfig, LoadFormat, ModelConfig, VllmConfig
-from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader
 from vllm.model_executor.model_loader.utils import (
@@ -64,7 +64,7 @@ def _maybe_download_from_modelscope(
 
         Returns the path to the downloaded model, or None if the model is not
         downloaded from ModelScope."""
-        if VLLM_USE_MODELSCOPE:
+        if envs.VLLM_USE_MODELSCOPE:
             # download model from ModelScope hub,
             # lazy import so that modelscope is not required for normal use.
             # pylint: disable=C.
@@ -264,13 +264,14 @@ def download_model(self, model_config: ModelConfig) -> None:
                               fall_back_to_pt=True,
                               allow_patterns_overrides=None)
 
-    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
+    def load_model(self, vllm_config: VllmConfig,
+                   model_config: ModelConfig) -> nn.Module:
         device_config = vllm_config.device_config
-        model_config = vllm_config.model_config
         target_device = torch.device(device_config.device)
         with set_default_torch_dtype(model_config.dtype):
             with target_device:
-                model = initialize_model(vllm_config=vllm_config)
+                model = initialize_model(vllm_config=vllm_config,
+                                         model_config=model_config)
 
             weights_to_load = {name for name, _ in model.named_parameters()}
             loaded_weights = model.load_weights(
diff --git a/vllm/model_executor/model_loader/dummy_loader.py b/vllm/model_executor/model_loader/dummy_loader.py
index 5047a161f3f9..0e2f0be1ec26 100644
--- a/vllm/model_executor/model_loader/dummy_loader.py
+++ b/vllm/model_executor/model_loader/dummy_loader.py
@@ -22,9 +22,9 @@ def __init__(self, load_config: LoadConfig):
     def download_model(self, model_config: ModelConfig) -> None:
         pass  # Nothing to download
 
-    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
+    def load_model(self, vllm_config: VllmConfig,
+                   model_config: ModelConfig) -> nn.Module:
         device_config = vllm_config.device_config
-        model_config = vllm_config.model_config
         target_device = torch.device(device_config.device)
         with set_default_torch_dtype(model_config.dtype):
             with target_device:
diff --git a/vllm/model_executor/model_loader/gguf_loader.py b/vllm/model_executor/model_loader/gguf_loader.py
index 2766c9787b83..806004bf9604 100644
--- a/vllm/model_executor/model_loader/gguf_loader.py
+++ b/vllm/model_executor/model_loader/gguf_loader.py
@@ -92,9 +92,9 @@ def _get_weights_iterator(
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model)
 
-    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
+    def load_model(self, vllm_config: VllmConfig,
+                   model_config: ModelConfig) -> nn.Module:
         device_config = vllm_config.device_config
-        model_config = vllm_config.model_config
         local_model_path = self._prepare_weights(model_config.model)
         gguf_weights_map = self._get_gguf_weights_map(model_config)
         # we can only know if tie word embeddings after mapping weights
diff --git a/vllm/model_executor/model_loader/neuronx_distributed.py b/vllm/model_executor/model_loader/neuronx_distributed.py
index fee8c10b6c2f..557feea46a90 100644
--- a/vllm/model_executor/model_loader/neuronx_distributed.py
+++ b/vllm/model_executor/model_loader/neuronx_distributed.py
@@ -48,6 +48,9 @@
 # Models supported by Neuronx distributed for inference.
 _NEURON_SUPPORTED_MODELS: dict[str, tuple[str, str]] = {
     "LlamaForCausalLM":
+    ("neuronx_distributed_inference.models.llama.modeling_llama",
+     "NeuronLlamaForCausalLM"),
+    "MistralForCausalLM":
     ("neuronx_distributed_inference.models.llama.modeling_llama",
      "NeuronLlamaForCausalLM"),
     "DbrxForCausalLM":
@@ -84,16 +87,29 @@ def forward(
         input_block_ids: torch.Tensor,
         sampling_params: torch.Tensor,
     ) -> torch.Tensor:
+        # sort block ids sequentially for perf/neuron support reasons
+        sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids)
+        input_ids = torch.index_select(input_ids, 0, sorted_indices)
+        positions = torch.index_select(positions, 0, sorted_indices)
+        sampling_params = torch.index_select(sampling_params, 0,
+                                             sorted_indices)
+
         output = self.model(input_ids,
                             attention_mask=None,
                             position_ids=positions,
-                            seq_ids=input_block_ids,
+                            seq_ids=sorted_input_block_ids,
                             sampling_params=sampling_params)
         # on-device sampling
         if self.config.neuron_config.on_device_sampling_config:
-            return output.hidden_states
+            output = output.hidden_states
         else:
-            return output.logits[:, -1, :]
+            output = output.logits[:, -1, :]
+
+        restored_indices = torch.argsort(sorted_indices)
+        if input_block_ids.shape[0] != 1:
+            output = torch.index_select(output, 0, restored_indices)
+
+        return output
 
     def compute_logits(self, hidden_states: torch.Tensor,
                        sampling_metadata: SamplingMetadata) -> torch.Tensor:
@@ -337,14 +353,26 @@ def forward(
         input_block_ids: torch.Tensor,
         sampling_params: torch.Tensor,
     ) -> torch.Tensor:
+        # sort block ids sequentially for perf/neuron support reasons
+        sorted_input_block_ids, sorted_indices = torch.sort(input_block_ids)
+        input_ids = torch.index_select(input_ids, 0, sorted_indices)
+        positions = torch.index_select(positions, 0, sorted_indices)
+        sampling_params = torch.index_select(sampling_params, 0,
+                                             sorted_indices)
+
         output = self.model(input_ids,
                             attention_mask=None,
                             position_ids=positions,
-                            seq_ids=input_block_ids,
+                            seq_ids=sorted_input_block_ids,
                             sampling_params=sampling_params)
+        restored_indices = torch.argsort(sorted_indices)
+
         # CTX encoding
         if (positions[:, 0]).sum().item() == 0:
-            return output.fused_outputs[0][:, 0:1]
+            output = output.fused_outputs[0][:, 0:1]
+            if input_block_ids.shape[0] != 1:
+                output = torch.index_select(output, 0, restored_indices)
+            return output
 
         # Fused Spec (Generation)
         accepted_tokens_with_padding = output.fused_outputs[0]
@@ -359,6 +387,10 @@ def forward(
                                           -1) >= generated_token_counts
         accepted_tokens_with_padding[mask] = -1
 
+        if input_block_ids.shape[0] != 1:
+            accepted_tokens_with_padding = torch.index_select(
+                accepted_tokens_with_padding, 0, restored_indices)
+
         return accepted_tokens_with_padding
 
     def sample(
@@ -413,6 +445,10 @@ def load_weights(self, model_name_or_path: str,
             draft_neuron_config.speculation_length = 0
         draft_neuron_config.trace_tokengen_model = True
         draft_neuron_config.enable_fused_speculation = False
+        if getattr(config.neuron_config, "draft_model_modules_to_not_convert",
+                   None):
+            draft_neuron_config.modules_to_not_convert = (
+                draft_neuron_config.draft_model_modules_to_not_convert)
         if config.neuron_config.enable_eagle_speculation:
             draft_neuron_config.is_eagle_draft = True
             draft_neuron_config.sequence_parallel_enabled = False
@@ -499,7 +535,7 @@ def _get_default_neuron_config(model_config: ModelConfig,
         max_context_length=scheduler_config.max_model_len,
         seq_len=scheduler_config.max_model_len,
         enable_bucketing=True,
-        is_continuous_batching=(batch_size > 1),
+        is_continuous_batching=True,
         quantized=False,
         torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
         padding_side="right",
@@ -517,6 +553,7 @@ def _get_default_speculation_config(model_config: ModelConfig,
     args."""
     neuron_config = dict(
         tp_degree=parallel_config.tensor_parallel_size,
+        ctx_batch_size=1,
         batch_size=scheduler_config.max_num_seqs,
         max_context_length=scheduler_config.max_model_len,
         seq_len=scheduler_config.max_model_len,
@@ -524,6 +561,7 @@ def _get_default_speculation_config(model_config: ModelConfig,
         trace_tokengen_model=False,
         enable_fused_speculation=True,
         enable_bucketing=True,
+        is_continuous_batching=True,
         quantized=False,
         torch_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype],
         on_device_sampling_config=dict(
diff --git a/vllm/model_executor/model_loader/runai_streamer_loader.py b/vllm/model_executor/model_loader/runai_streamer_loader.py
index a695ba03bd1d..9f1022c25925 100644
--- a/vllm/model_executor/model_loader/runai_streamer_loader.py
+++ b/vllm/model_executor/model_loader/runai_streamer_loader.py
@@ -100,11 +100,10 @@ def download_model(self, model_config: ModelConfig) -> None:
         """Download model if necessary"""
         self._prepare_weights(model_config.model, model_config.revision)
 
-    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
+    def load_model(self, vllm_config: VllmConfig,
+                   model_config: ModelConfig) -> nn.Module:
         """Perform streaming of the model to destination"""
         device_config = vllm_config.device_config
-        model_config = vllm_config.model_config
-
         target_device = torch.device(device_config.device)
         with set_default_torch_dtype(model_config.dtype):
             with target_device:
diff --git a/vllm/model_executor/model_loader/sharded_state_loader.py b/vllm/model_executor/model_loader/sharded_state_loader.py
index 913bda7e007a..78bca89f0015 100644
--- a/vllm/model_executor/model_loader/sharded_state_loader.py
+++ b/vllm/model_executor/model_loader/sharded_state_loader.py
@@ -100,9 +100,9 @@ def _prepare_weights(self, model_name_or_path: str,
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model, model_config.revision)
 
-    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
+    def load_model(self, vllm_config: VllmConfig,
+                   model_config: ModelConfig) -> nn.Module:
         device_config = vllm_config.device_config
-        model_config = vllm_config.model_config
         target_device = torch.device(device_config.device)
 
         from vllm.distributed import get_tensor_model_parallel_rank
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
index 459c4b4392e3..4c4502284a6a 100644
--- a/vllm/model_executor/model_loader/tensorizer.py
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -1,24 +1,28 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import argparse
+import contextlib
+import contextvars
 import dataclasses
 import io
+import json
 import os
-import re
+import threading
 import time
 from collections.abc import Generator
 from dataclasses import dataclass
 from functools import partial
-from typing import BinaryIO, Optional, Union
+from typing import Any, BinaryIO, Optional, Union
 
+import regex as re
 import torch
 from torch import nn
+from torch.utils._python_dispatch import TorchDispatchMode
 from transformers import PretrainedConfig
 
 import vllm.envs as envs
 from vllm.config import ModelConfig, ParallelConfig, set_current_vllm_config
 from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.llm_engine import LLMEngine
 from vllm.logger import init_logger
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
@@ -58,9 +62,79 @@
 logger = init_logger(__name__)
 
 
+class MetaTensorMode(TorchDispatchMode):
+
+    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
+        kwargs = kwargs or {}
+
+        if func._schema.name == "aten::empty" and "device" not in kwargs:
+            kwargs["device"] = "meta"
+
+        return func(*args, **kwargs)
+
+
+def meta_tensor_mode(loading_code=None, ):
+
+    if loading_code is None:
+        return _NoInitOrTensorImpl.context_manager()
+    elif callable(loading_code):
+        with _NoInitOrTensorImpl.context_manager():
+            return loading_code()
+    else:
+        raise TypeError(
+            "expected a callable to evaluate,"
+            " or None if being used as a context manager;"
+            f' got an object of type "{type(loading_code).__name__}" instead.')
+
+
+class _NoInitOrTensorImpl:
+    _MODULES = (torch.nn.Linear, torch.nn.Embedding, torch.nn.LayerNorm)
+    _MODULE_ORIGINALS = tuple((m, m.reset_parameters) for m in _MODULES)
+
+    is_active = contextvars.ContextVar("_NoInitOrTensorImpl.is_active",
+                                       default=False)
+    _count_active: int = 0
+    _count_active_lock = threading.Lock()
+
+    @classmethod
+    @contextlib.contextmanager
+    def context_manager(cls):
+        if cls.is_active.get():
+            yield
+            return
+
+        with cls._count_active_lock:
+            cls._count_active += 1
+            if cls._count_active == 1:
+                for mod in cls._MODULES:
+                    mod.reset_parameters = cls._disable(mod.reset_parameters)
+
+        reset_token = cls.is_active.set(True)
+
+        try:
+            with MetaTensorMode():
+                yield
+        finally:
+            cls.is_active.reset(reset_token)
+            with cls._count_active_lock:
+                cls._count_active -= 1
+                if cls._count_active == 0:
+                    for mod, original in cls._MODULE_ORIGINALS:
+                        mod.reset_parameters = original
+
+    @staticmethod
+    def _disable(func):
+
+        def wrapper(*args, **kwargs):
+            if not _NoInitOrTensorImpl.is_active.get():
+                return func(*args, **kwargs)
+
+        return wrapper
+
+
 @dataclass
 class TensorizerConfig:
-    tensorizer_uri: str
+    tensorizer_uri: Union[str, None] = None
     vllm_tensorized: Optional[bool] = False
     verify_hash: Optional[bool] = False
     num_readers: Optional[int] = None
@@ -71,12 +145,29 @@ class TensorizerConfig:
     model_class: Optional[type[torch.nn.Module]] = None
     hf_config: Optional[PretrainedConfig] = None
     dtype: Optional[Union[str, torch.dtype]] = None
+    lora_dir: Optional[str] = None
     _is_sharded: bool = False
 
     def __post_init__(self):
         # check if the configuration is for a sharded vLLM model
         self._is_sharded = isinstance(self.tensorizer_uri, str) \
             and re.search(r'%0\dd', self.tensorizer_uri) is not None
+        if not self.tensorizer_uri and not self.lora_dir:
+            raise ValueError("tensorizer_uri must be provided.")
+        if not self.tensorizer_uri and self.lora_dir:
+            self.tensorizer_uri = f"{self.lora_dir}/adapter_model.tensors"
+        assert self.tensorizer_uri is not None, ("tensorizer_uri must be "
+                                                 "provided.")
+        self.tensorizer_dir = os.path.dirname(self.tensorizer_uri)
+        self.lora_dir = self.tensorizer_dir
+
+    @classmethod
+    def as_dict(cls, *args, **kwargs) -> dict[str, Any]:
+        cfg = TensorizerConfig(*args, **kwargs)
+        return dataclasses.asdict(cfg)
+
+    def to_dict(self) -> dict[str, Any]:
+        return dataclasses.asdict(self)
 
     def _construct_tensorizer_args(self) -> "TensorizerArgs":
         tensorizer_args = {
@@ -140,7 +231,9 @@ class TensorizerArgs:
   
   Args:
       tensorizer_uri: Path to serialized model tensors. Can be a local file 
-          path or a S3 URI.
+          path or an S3 URI. This is a required field unless lora_dir is
+          provided and the config is meant to be used for the
+          `tensorize_lora_adapter` function.
       vllm_tensorized: If True, indicates that the serialized model is a 
           vLLM model. This is used to determine the behavior of the 
           TensorDeserializer when loading tensors from a serialized model.
@@ -158,7 +251,7 @@ class TensorizerArgs:
       encryption_keyfile: File path to a binary file containing a  
           binary key to use for decryption. `None` (the default) means 
           no decryption. See the example script in 
-          examples/other/tensorize_vllm_model.py. 
+          examples/others/tensorize_vllm_model.py. 
       s3_access_key_id: The access key for the S3 bucket. Can also be set via
           the S3_ACCESS_KEY_ID environment variable.
       s3_secret_access_key: The secret access key for the S3 bucket. Can also
@@ -296,10 +389,10 @@ def _init_model(self):
         model_args.torch_dtype = self.tensorizer_config.dtype
         assert self.tensorizer_config.model_class is not None
         # TODO: Do we need to consider old-style model class?
-        with no_init_or_tensor(), set_current_vllm_config(self.vllm_config,
-                                                          check_compile=True):
+        with meta_tensor_mode(), set_current_vllm_config(self.vllm_config,
+                                                         check_compile=True):
             return self.tensorizer_config.model_class(
-                vllm_config=self.vllm_config, )
+                vllm_config=self.vllm_config)
 
     def _resize_lora_embeddings(self):
         """Modify LoRA embedding layers to use bigger tensors
@@ -376,7 +469,7 @@ def tensorizer_weights_iterator(
                    "loading on vLLM, as tensorizer is forced to load to CPU. "
                    "Consider deserializing a vLLM model instead for faster "
                    "load times. See the "
-                   "examples/other/tensorize_vllm_model.py example script "
+                   "examples/others/tensorize_vllm_model.py example script "
                    "for serializing vLLM models.")
 
     deserializer_args = tensorizer_args.deserializer_params
@@ -467,8 +560,73 @@ def tensorize_vllm_model(engine_args: EngineArgs,
         ) as stream:
             stream.write(encryption_params.key)
 
-    engine = LLMEngine.from_engine_args(engine_args)
-    engine.model_executor.collective_rpc(
-        "save_tensorized_model",
-        kwargs=dict(tensorizer_config=tensorizer_config),
-    )
+    from vllm import LLMEngine
+    from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine
+
+    if not envs.VLLM_USE_V1:
+        engine = LLMEngine.from_engine_args(engine_args)
+        engine.model_executor.collective_rpc(
+            "save_tensorized_model",
+            kwargs=dict(tensorizer_config=tensorizer_config),
+        )
+    else:
+        engine = V1LLMEngine.from_vllm_config(engine_config)
+        engine.collective_rpc(
+            "save_tensorized_model",
+            kwargs=dict(tensorizer_config=tensorizer_config),
+        )
+
+
+def tensorize_lora_adapter(lora_path: str,
+                           tensorizer_config: TensorizerConfig):
+    """
+    Uses tensorizer to serialize a LoRA adapter. Assumes that the files
+    needed to load a LoRA adapter are a weights file called
+    adapter_model.safetensors (or adapter_model.bin) and a json config file
+    called adapter_config.json, both located in the directory lora_path
+    resolves to. Serializes them into tensorizer_config.lora_dir.
+
+    Raises ValueError if the selected weights file is not .safetensors/.bin.
+    """
+    # `import safetensors` alone does not guarantee the torch submodule.
+    import safetensors.torch
+
+    from vllm.lora.utils import get_adapter_absolute_path
+
+    lora_dir = get_adapter_absolute_path(lora_path)
+    tensor_path = config_path = ""
+    for file in os.listdir(lora_dir):
+        if file.startswith("adapter_model"):
+            tensor_path = os.path.join(lora_dir, file)
+        if file.startswith("adapter_config"):
+            config_path = os.path.join(lora_dir, file)
+        if tensor_path and config_path:
+            break
+
+    if tensor_path.endswith(".safetensors"):
+        tensors = safetensors.torch.load_file(tensor_path)
+    elif tensor_path.endswith(".bin"):
+        tensors = torch.load(tensor_path)  # may unpickle: trusted files only
+    else:
+        # Exceptions do not %-format their args; build the message here.
+        raise ValueError(f"Unsupported file: {tensor_path}")
+
+    with open(config_path) as f:
+        config = json.load(f)
+
+    tensorizer_args = tensorizer_config._construct_tensorizer_args()
+    with open_stream(f"{tensorizer_config.lora_dir}/adapter_config.json",
+                     mode="wb+",
+                     **tensorizer_args.stream_params) as f:
+        f.write(json.dumps(config).encode("utf-8"))
+
+    lora_uri = (f"{tensorizer_config.lora_dir}"
+                f"/adapter_model.tensors")
+    with open_stream(lora_uri, mode="wb+",
+                     **tensorizer_args.stream_params) as f:
+        serializer = TensorSerializer(f)
+        serializer.write_state_dict(tensors)
+        serializer.close()
+
+    logger.info("Successfully serialized LoRA files to %s",
+                str(tensorizer_config.lora_dir))
diff --git a/vllm/model_executor/model_loader/tensorizer_loader.py b/vllm/model_executor/model_loader/tensorizer_loader.py
index 4107e741fd8f..2afe2b59e2f9 100644
--- a/vllm/model_executor/model_loader/tensorizer_loader.py
+++ b/vllm/model_executor/model_loader/tensorizer_loader.py
@@ -2,6 +2,7 @@
 # ruff: noqa: SIM117
 import copy
 from collections.abc import Generator
+from typing import Union
 
 import torch
 from torch import nn
@@ -47,7 +48,7 @@ def _load_model_serialized_cpu(
         """Load a serialized model with tensorizer to the CPU.
 
         This is only necessary when the model isn't vLLM-tensorized (see
-        examples/other/tensorize_vllm_model.py) This should still
+        examples/others/tensorize_vllm_model.py) This should still
         be faster than default HuggingFace loading, but will be slower than
         loading a vLLM-tensorized model.
         """
@@ -67,7 +68,7 @@ def _load_model_serialized(
         """Load a serialized model with tensorizer.
 
         Expects a vLLM-tensorized model. See the
-        examples/other/tensorize_vllm_model.py example script
+        examples/others/tensorize_vllm_model.py example script
         for serializing vLLM models."""
 
         device_config = vllm_config.device_config
@@ -92,8 +93,8 @@ def download_model(self, model_config: ModelConfig) -> None:
         with self.tensorizer_config.open_stream():
             pass
 
-    def load_model(self, vllm_config: VllmConfig) -> nn.Module:
-        model_config = vllm_config.model_config
+    def load_model(self, vllm_config: VllmConfig,
+                   model_config: ModelConfig) -> nn.Module:
         parallel_config = vllm_config.parallel_config
         self._verify_config(model_config, parallel_config)
 
@@ -111,8 +112,10 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module:
     @staticmethod
     def save_model(
         model: torch.nn.Module,
-        tensorizer_config: TensorizerConfig,
+        tensorizer_config: Union[TensorizerConfig, dict],
     ) -> None:
+        if isinstance(tensorizer_config, dict):
+            tensorizer_config = TensorizerConfig(**tensorizer_config)
         serialize_vllm_model(
             model=model,
             tensorizer_config=tensorizer_config,
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
index 967c30277172..85c232a18623 100644
--- a/vllm/model_executor/model_loader/utils.py
+++ b/vllm/model_executor/model_loader/utils.py
@@ -42,9 +42,11 @@ def initialize_model(
     *,
     prefix: str = "",
     model_class: Optional[type[nn.Module]] = None,
+    model_config: Optional[ModelConfig] = None,
 ) -> nn.Module:
     """Initialize a model with the given configurations."""
-    model_config = vllm_config.model_config
+    if model_config is None:
+        model_config = vllm_config.model_config
     if model_class is None:
         model_class, _ = get_model_architecture(model_config)
 
@@ -227,17 +229,16 @@ def get_model_architecture(
         "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin", "quark"
     ]
 
-    if (model_config.quantization is not None
-            and model_config.quantization not in mixtral_supported
-            and "MixtralForCausalLM" in architectures):
-        architectures = ["QuantMixtralForCausalLM"]
-
     vllm_supported_archs = ModelRegistry.get_supported_archs()
     vllm_not_supported = not any(arch in vllm_supported_archs
                                  for arch in architectures)
     if (model_config.model_impl == ModelImpl.TRANSFORMERS or
             model_config.model_impl != ModelImpl.VLLM and vllm_not_supported):
         architectures = resolve_transformers_arch(model_config, architectures)
+    elif (model_config.quantization is not None
+          and model_config.quantization not in mixtral_supported
+          and "MixtralForCausalLM" in architectures):
+        architectures = ["QuantMixtralForCausalLM"]
 
     model_cls, arch = ModelRegistry.resolve_model_cls(architectures)
     if model_config.task == "embed":
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index a1cf43328bab..f61956f4e8e0 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -319,6 +319,7 @@ def download_safetensors_index_file_from_hf(
 
     Args:
         model_name_or_path (str): The model name or path.
+        index_file (str): The safetensors index file name
         cache_dir (Optional[str]): The cache directory to store the model
             weights. If None, will use HF defaults.
         revision (Optional[str]): The revision of the model.
@@ -337,10 +338,10 @@ def download_safetensors_index_file_from_hf(
             )
         # If file not found on remote or locally, we should not fail since
         # only some models will have index_file.
-        except huggingface_hub.utils.EntryNotFoundError:
-            logger.info("No %s found in remote.", index_file)
         except huggingface_hub.utils.LocalEntryNotFoundError:
             logger.info("No %s found in local cache.", index_file)
+        except huggingface_hub.utils.EntryNotFoundError:
+            logger.info("No %s found in remote.", index_file)
 
 
 # For models like Mistral-7B-v0.3, there are both sharded
@@ -634,7 +635,7 @@ def row_parallel_weight_loader(param: torch.Tensor,
     return default_weight_loader(param, loaded_weight)
 
 
-LoaderFunction = Callable[[torch.Tensor, torch.Tensor], torch.Tensor]
+LoaderFunction = Callable[[torch.Tensor, torch.Tensor], None]
 
 
 def sharded_weight_loader(shard_axis: int) -> LoaderFunction:
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index 077e36176430..bcff6eb3fd31 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -42,7 +42,8 @@
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead, VocabParallelEmbedding)
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.model_loader.weight_utils import (
+    default_weight_loader, row_parallel_weight_loader)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
@@ -384,7 +385,7 @@ def __init__(
         lora_config = vllm_config.lora_config
         self.config = config
         self.lora_config = lora_config
-
+        self.tp_size = get_tensor_model_parallel_world_size()
         self.quant_config = quant_config
         self.model = BaiChuanModel(vllm_config=vllm_config,
                                    prefix=prefix,
@@ -438,8 +439,10 @@ def lm_head_weight_loader(self, param: nn.Parameter,
         is_baichuan2 = self.config.vocab_size == 125696
         if is_baichuan2:
             loaded_weight = torch.nn.functional.normalize(loaded_weight)
-
-        default_weight_loader(param, loaded_weight)
+        if self.tp_size > 1:
+            row_parallel_weight_loader(param, loaded_weight)
+        else:
+            default_weight_loader(param, loaded_weight)
 
 
 class BaichuanForCausalLM(BaiChuanBaseForCausalLM):
diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py
index 2ff7e394a416..db0dd2051d52 100644
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -681,9 +681,8 @@ def forward(
                 batch.
             pixel_values: The pixels in each input image.
         
-        :::{seealso}
-        {class}`Blip2ImageInputs`
-        :::
+        Info:
+            [Blip2ImageInputs][]
         """
 
         if intermediate_tensors is not None:
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index eb1085d6b40d..10424e218fbc 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -43,7 +43,7 @@
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP, SupportsQuant, SupportsV0Only
-from .utils import (is_pp_missing_parameter,
+from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
@@ -229,6 +229,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         config = vllm_config.model_config.hf_config
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
+        self.config = config
 
         self.embed_dim = config.hidden_size
 
@@ -278,6 +279,38 @@ def forward(
         hidden_states = self.ln_f(hidden_states)
         return hidden_states
 
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()  # names loaded so far; returned
+        for name, loaded_weight in weights:
+            if is_pp_missing_parameter(name, self):
+                continue
+            param = params_dict[name]  # KeyError if the name is unknown
+
+            if "query_key_value" in name:
+                # NOTE: along output_dim, BLOOM's fused QKV weight is laid
+                # out as (num_heads * 3 * head_size), while the required
+                # layout is (3 * num_heads * head_size). The view/transpose/
+                # reshape below converts it and keeps the overall shape.
+                output_dim = getattr(param, "output_dim", None)
+                num_heads = self.config.num_attention_heads
+                if output_dim is not None:
+                    loaded_weight_shape = loaded_weight.shape
+                    loaded_weight = loaded_weight.view(
+                        loaded_weight_shape[:output_dim] + (num_heads, 3, -1) +
+                        loaded_weight_shape[output_dim + 1:])
+                    loaded_weight = loaded_weight.transpose(
+                        output_dim, output_dim + 1)
+                    loaded_weight = loaded_weight.reshape(loaded_weight_shape)
+            # Prefer the parameter's own weight_loader over the default one.
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+        return loaded_params
+
 
 class BloomForCausalLM(nn.Module, SupportsPP, SupportsV0Only, SupportsQuant):
 
@@ -325,35 +358,15 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if name == "lm_head.weight":
-                continue
-            if not name.startswith("transformer."):
-                name = "transformer." + name
-            if is_pp_missing_parameter(name, self):
-                continue
-            param = params_dict[name]
-
-            if "query_key_value" in name:
-                # NOTE: BLOOM's fused QKV's output_dim has the shape of
-                # (num_heads * 3 * head_size), while the
-                # required shape is (3 * num_heads * head_size).
-                # Thus, we need weight conversion.
-                output_dim = getattr(param, "output_dim", None)
-                num_heads = self.config.num_attention_heads
-                if output_dim is not None:
-                    loaded_weight_shape = loaded_weight.shape
-                    loaded_weight = loaded_weight.view(
-                        loaded_weight_shape[:output_dim] + (num_heads, 3, -1) +
-                        loaded_weight_shape[output_dim + 1:])
-                    loaded_weight = loaded_weight.transpose(
-                        output_dim, output_dim + 1)
-                    loaded_weight = loaded_weight.reshape(loaded_weight_shape)
-
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
+        loader = AutoWeightsLoader(self, skip_prefixes=["lm_head.weight"])
+        weights = _add_transformer_prefix(weights)  # lazy generator
+        return loader.load_weights(weights)
+
+
+def _add_transformer_prefix(
+    weights: Iterable[tuple[str, torch.Tensor]]
+) -> Iterable[tuple[str, torch.Tensor]]:
+    """Prefix bare names with 'transformer.'; keep 'lm_head.*' skippable."""
+    for name, tensor in weights:
+        bare = not name.startswith(('transformer.', 'lm_head.'))
+        yield ('transformer.' + name if bare else name), tensor
diff --git a/vllm/model_executor/models/deepseek_mtp.py b/vllm/model_executor/models/deepseek_mtp.py
index 6d7b52aba5f9..03ef7bed0edc 100644
--- a/vllm/model_executor/models/deepseek_mtp.py
+++ b/vllm/model_executor/models/deepseek_mtp.py
@@ -19,6 +19,7 @@
 
 from .deepseek_v2 import (DeepseekV2DecoderLayer,
                           get_spec_layer_idx_from_weight_name)
+from .interfaces import SupportsPP
 from .utils import maybe_prefix
 
 
@@ -145,7 +146,7 @@ def compute_logits(
         return logits
 
 
-class DeepSeekMTP(nn.Module):
+class DeepSeekMTP(nn.Module, SupportsPP):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py
index 164fa40ffebe..5c8793f59ffb 100644
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -210,9 +210,7 @@ def _call_hf_processor(
                 dict(prompt=prompt, **mm_data),
                 mm_kwargs,
             )
-            target_dtype = self.info.ctx.model_config.dtype
-            pixel_values = processed_outputs.pop("pixel_values").to(
-                target_dtype)
+            pixel_values = processed_outputs["pixel_values"]
             # split pixel values into patches corresponding to each image
             images_spatial_crop = processed_outputs["images_spatial_crop"]
             patches_per_image = [
diff --git a/vllm/model_executor/models/falcon_h1.py b/vllm/model_executor/models/falcon_h1.py
new file mode 100644
index 000000000000..1c0e3911fcce
--- /dev/null
+++ b/vllm/model_executor/models/falcon_h1.py
@@ -0,0 +1,684 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Inference-only FalconH1 model."""
+from collections.abc import Iterable
+from typing import Optional
+
+import torch
+from torch import nn
+from transformers import FalconH1Config
+
+from vllm.attention.layer import Attention
+from vllm.config import CacheConfig, VllmConfig
+from vllm.distributed import divide, get_tensor_model_parallel_world_size
+from vllm.distributed.parallel_state import get_pp_group
+from vllm.forward_context import get_forward_context
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+                                               QKVParallelLinear,
+                                               RowParallelLinear)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.mamba.mamba2_metadata import (
+    Mamba2Metadata, prepare_mamba2_metadata)
+from vllm.model_executor.layers.mamba.mamba_mixer2 import (
+    MambaMixer2, extra_groups_for_head_shards)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.mamba_cache import (MambaCacheManager,
+                                                    MambaCacheParams)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors
+
+from .interfaces import (HasInnerState, IsHybrid, SupportsLoRA, SupportsPP,
+                         SupportsV0Only)
+from .utils import (PPMissingLayer, is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, make_layers,
+                    maybe_prefix)
+
+
+class FalconH1MLP(nn.Module):
+    """SiLU-gated MLP with FalconH1 gate-slice and output multipliers."""
+
+    def __init__(
+        self,
+        config: FalconH1Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+    ) -> None:
+        super().__init__()
+        hidden, inner = config.hidden_size, config.intermediate_size
+        self.gate_up_proj = MergedColumnParallelLinear(
+            input_size=hidden,
+            output_sizes=[inner, inner],
+            bias=bias,
+            quant_config=quant_config,
+        )
+        self.down_proj = RowParallelLinear(
+            input_size=inner,
+            output_size=hidden,
+            bias=bias,
+            quant_config=quant_config,
+        )
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.intermediate_size = inner
+        self.gate_multiplier, self.down_multiplier = config.mlp_multipliers
+        if config.hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {config.hidden_act}. "
+                             "Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        h, _ = self.gate_up_proj(x)
+        h[:, :self.intermediate_size // self.tp_size] *= self.gate_multiplier
+        out, _ = self.down_proj(self.act_fn(h))
+        return out * self.down_multiplier
+
+
+class FalconH1SSMDecoderLayer(nn.Module):
+    """FalconH1 SSM branch: a MambaMixer2 plus a fixed per-block mup_vector."""
+    def __init__(
+        self,
+        config: FalconH1Config,
+        cache_config: Optional[CacheConfig] = None,  # unused in this class
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        # d_ssm falls back to mamba_expand * hidden_size when not configured.
+        self.d_ssm = (int(config.mamba_expand * config.hidden_size)
+                      if config.mamba_d_ssm is None else config.mamba_d_ssm)
+
+        self.mamba = MambaMixer2(
+            hidden_size=config.hidden_size,
+            ssm_state_size=config.mamba_d_state,
+            conv_kernel_size=config.mamba_d_conv,
+            intermediate_size=self.d_ssm,
+            use_conv_bias=config.mamba_conv_bias,
+            use_bias=config.mamba_proj_bias,
+            n_groups=config.mamba_n_groups,
+            num_heads=config.mamba_n_heads,
+            head_dim=config.mamba_d_head,
+            rms_norm_eps=config.rms_norm_eps,
+            activation=config.hidden_act,
+            quant_config=quant_config,
+            use_rms_norm=config.mamba_rms_norm,
+        )
+        # Read n_groups back from the mixer: `MambaMixer2` may override it.
+        self.groups_time_state_size = self.mamba.n_groups * config.mamba_d_state
+        self.zxbcdt_multipliers = config.ssm_multipliers
+        self._init_mup_vector()
+
+    def _init_mup_vector(self):
+        """
+        Build the non-learnable per-block scaling vector: element-wise
+        multipliers applied to each separate contiguous block of the output
+        of the linear projection (in_proj) before further processing
+        (gating, convolution, SSM):
+
+            - Z block:  [0 : d_ssm]                      → zxbcdt_multipliers[0]
+            - X block:  [d_ssm : 2 * d_ssm]              → zxbcdt_multipliers[1]
+            - B block:  [2 * d_ssm : 2 * d_ssm + G * S]  → zxbcdt_multipliers[2]
+            - C block:  [2 * d_ssm + G * S : 2 * d_ssm + 2 * G * S]
+                        → zxbcdt_multipliers[3]
+            - dt block: [2 * d_ssm + 2 * G * S : end]    → zxbcdt_multipliers[4]
+
+        where:
+            - d_ssm:     Dimension of state-space model latent
+            - G:         Number of groups (n_groups)
+            - S:         SSM state size per group
+            - All indices are divided by tp_size to support tensor parallelism
+        """
+        vector_shape = (2 * self.d_ssm + 2 * self.groups_time_state_size +
+                        self.config.mamba_n_heads) // self.tp_size
+        mup_vector = torch.ones(1, vector_shape)
+        # Z vector 0 -> d_ssm
+        mup_vector[:, :self.d_ssm //
+                   self.tp_size] *= self.zxbcdt_multipliers[0]
+        # X vector d_ssm -> 2 * d_ssm
+        mup_vector[:,
+                   (self.d_ssm //
+                    self.tp_size):(2 * self.d_ssm //
+                                   self.tp_size)] *= self.zxbcdt_multipliers[1]
+        # B vector 2 * d_ssm -> 2 * d_ssm + (n_group * d_state)
+        mup_vector[
+            :,
+            (2 * self.d_ssm) //
+            self.tp_size:(2 * self.d_ssm + self.groups_time_state_size) //
+            self.tp_size,
+        ] *= self.zxbcdt_multipliers[2]
+        # C vector 2 * d_ssm + (n_group * d_state)
+        # -> 2 * d_ssm + 2 * (n_group * d_state)
+        mup_vector[
+            :,
+            (2 * self.d_ssm + self.groups_time_state_size) //
+            self.tp_size:(2 * self.d_ssm + 2 * self.groups_time_state_size) //
+            self.tp_size,
+        ] *= self.zxbcdt_multipliers[3]
+        # dt vector 2 * d_ssm + 2 * (n_group * d_state)
+        # -> 2 * d_ssm + 2 * (n_group * d_state) + n_heads
+        mup_vector[
+            :,
+            (2 * self.d_ssm + 2 * self.groups_time_state_size) //
+            self.tp_size:,
+        ] *= self.zxbcdt_multipliers[4]
+        # Non-persistent: built here from config, kept out of the state dict.
+        self.register_buffer("mup_vector", mup_vector, persistent=False)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        mamba_cache_params: MambaCacheParams,
+        mamba2_metadata: Mamba2Metadata,
+        **kwargs,
+    ):
+        hidden_states = self.mamba(
+            hidden_states,
+            mamba_cache_params,
+            mamba2_metadata=mamba2_metadata,
+            mup_vector=self.mup_vector,
+        )
+        return hidden_states, residual  # residual is passed through as-is
+
+
+class FalconH1AttentionDecoderLayer(nn.Module):
+    """Attention branch of a FalconH1 layer: QKV, scaled keys, RoPE, o_proj."""
+    def __init__(
+        self,
+        config: FalconH1Config,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        rope_theta = getattr(config, "rope_theta", 1e11)
+        rope_scaling = getattr(config, "rope_scaling", None)
+        max_position_embeddings = getattr(config, "max_position_embeddings",
+                                          8192)
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = config.num_key_value_heads
+        if self.total_num_kv_heads >= tp_size:
+            # Number of KV heads is at least the TP size, so we partition
+            # the KV heads across multiple tensor parallel GPUs.
+            assert self.total_num_kv_heads % tp_size == 0
+        else:
+            # Number of KV heads is less than TP size, so we replicate
+            # the KV heads across multiple tensor parallel GPUs.
+            assert tp_size % self.total_num_kv_heads == 0
+        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+        self.head_dim = (config.hidden_size // self.total_num_heads if getattr(
+            config, "head_dim", None) is None else config.head_dim)
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+        self.rope_theta = rope_theta
+        self.max_position_embeddings = max_position_embeddings
+        # rotary_dim must be an int, but the configured factor may be a float.
+        if hasattr(config, "partial_rotary_factor"):
+            rotary_dim = int(self.head_dim * config.partial_rotary_factor)
+        elif hasattr(config, "attn_rotary_emb"):
+            rotary_dim = config.attn_rotary_emb  # for backward compatibility
+        else:
+            rotary_dim = self.head_dim  # default
+
+        self.rotary_emb = get_rope(
+            head_size=self.head_dim,
+            rotary_dim=rotary_dim,
+            max_position=max_position_embeddings,
+            rope_scaling=rope_scaling,
+            base=rope_theta,
+            is_neox_style=True,
+            dtype=None,  # see impl of get_rope
+        )
+
+        self.qkv_proj = QKVParallelLinear(
+            config.hidden_size,
+            self.head_dim,
+            self.total_num_heads,
+            self.total_num_kv_heads,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            config.hidden_size,
+            bias=False,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        self.attn = Attention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            cache_config=cache_config,
+            prefix=f"{prefix}.attn",
+        )
+        self.key_multiplier = config.key_multiplier
+
+    def self_attention(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        **kwargs,
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        k = k * self.key_multiplier  # scale keys before RoPE
+
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        residual: Optional[torch.Tensor],
+        **kwargs,
+    ):
+        hidden_states = self.self_attention(
+            positions=positions,
+            hidden_states=hidden_states,
+        )
+        return hidden_states, residual  # residual is passed through as-is
+
+
+class FalconH1ParallelHybrid(nn.Module):
+    """
+    A hybrid decoder layer for FalconH1 where the input is processed
+    in parallel through both the self-attention branch and the SSM (Mamba)
+    branch. Their outputs are then summed to produce the final hidden state.
+
+    This layer uses:
+      - FalconH1AttentionDecoderLayer for the multi-head self-attention branch.
+      - FalconH1SSMDecoderLayer for the state-space (Mamba) branch.
+    """
+
+    def __init__(
+        self,
+        config: FalconH1Config,
+        layer_idx: int,
+        cache_config: Optional[CacheConfig] = None,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        # Instantiate the attention branch
+        self.self_attn = FalconH1AttentionDecoderLayer(
+            config=config,
+            cache_config=cache_config,
+            quant_config=quant_config,
+            prefix=prefix,
+        )
+        # Instantiate the SSM branch
+        self.mamba = FalconH1SSMDecoderLayer(
+            config=config,
+            cache_config=cache_config,
+            quant_config=quant_config,
+        )
+        self.ssm_out_multiplier = config.ssm_out_multiplier
+        self.ssm_in_multiplier = config.ssm_in_multiplier
+
+        self.attention_in_multiplier = config.attention_in_multiplier
+        self.attn_out_multiplier = config.attention_out_multiplier
+
+        self.feed_forward = FalconH1MLP(config)
+
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = RMSNorm(config.hidden_size,
+                                        eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        mamba_cache_params: MambaCacheParams,
+        mamba2_metadata: Mamba2Metadata,
+        **kwargs,
+    ):
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Process input through the attention branch.
+        # The attention branch receives positions, the scaled
+        # hidden_states, the residual and any extra kwargs.
+        attn_hidden, _ = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states * self.attention_in_multiplier,
+            residual=residual,
+            **kwargs,
+        )
+
+        # Process input through the SSM branch.
+        # The SSM branch receives the scaled hidden_states, the residual,
+        # mamba_cache_params, mamba2_metadata and any extra kwargs.
+        ssm_hidden, _ = self.mamba(
+            hidden_states=hidden_states * self.ssm_in_multiplier,
+            residual=residual,
+            mamba_cache_params=mamba_cache_params,
+            mamba2_metadata=mamba2_metadata,
+            **kwargs,
+        )
+        # Sum the outputs from both branches.
+        # We assume both branches produce outputs of the same
+        # dimensionality (config.hidden_size).
+        hidden_states = (attn_hidden * self.attn_out_multiplier) + (
+            ssm_hidden * self.ssm_out_multiplier)
+        hidden_states = hidden_states + residual
+
+        # feed-forward
+        residual = hidden_states
+        hidden_states = self.pre_ff_layernorm(hidden_states)
+        hidden_states = self.feed_forward(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+
+class FalconH1Model(nn.Module):
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        config: FalconH1Config = vllm_config.model_config.hf_config
+        cache_config = vllm_config.cache_config
+        quant_config = vllm_config.quant_config
+        lora_config = vllm_config.lora_config
+
+        self.config = config
+        lora_vocab = ((lora_config.lora_extra_vocab_size *
+                       (lora_config.max_loras or 1)) if lora_config else 0)
+        self.vocab_size = config.vocab_size + lora_vocab
+        self.org_vocab_size = config.vocab_size
+        if get_pp_group().is_first_rank:
+
+            self.embed_tokens = VocabParallelEmbedding(
+                self.vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+            )
+            self.embedding_multiplier = config.embedding_multiplier
+        else:
+            self.embed_tokens = PPMissingLayer()
+            self.embedding_multiplier = 1.0
+
+        def get_layer(prefix: str):
+            layer_idx = int(prefix.rsplit(".", 1)[1])
+            layer_class = FalconH1ParallelHybrid
+            return layer_class(
+                config,
+                layer_idx,
+                cache_config,
+                quant_config=quant_config,
+                prefix=prefix,
+            )
+
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers")
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], config.hidden_size))
+        if get_pp_group().is_last_rank:
+            self.final_layernorm = RMSNorm(config.hidden_size,
+                                           eps=config.rms_norm_eps)
+        else:
+            self.final_layernorm = PPMissingLayer()
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        mamba_cache_params: MambaCacheParams,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+
+        # pass a sequence index tensor, that is required for
+        # proper continuous batching computation including
+        # chunked prefill
+        attn_metadata = get_forward_context().attn_metadata
+        mamba2_metadata = prepare_mamba2_metadata(
+            chunk_size=self.config.mamba_chunk_size,
+            attn_metadata=attn_metadata,
+        )
+        if get_pp_group().is_first_rank:
+            if inputs_embeds is not None:
+                hidden_states = inputs_embeds * self.embedding_multiplier
+            else:
+                hidden_states = (self.get_input_embeddings(input_ids) *
+                                 self.embedding_multiplier)
+        else:
+            assert intermediate_tensors is not None
+            hidden_states = intermediate_tensors["hidden_states"]
+
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            layer_mamba_cache_params = mamba_cache_params.at_layer_idx(i)
+            hidden_states = layer(
+                positions=positions,
+                hidden_states=hidden_states,
+                mamba_cache_params=layer_mamba_cache_params,
+                mamba2_metadata=mamba2_metadata,
+            )
+        if not get_pp_group().is_last_rank:
+            return IntermediateTensors({
+                "hidden_states": hidden_states,
+            })
+        hidden_states = self.final_layernorm(hidden_states)
+        return hidden_states
+
+
+class FalconH1ForCausalLM(nn.Module, HasInnerState, SupportsLoRA, SupportsPP,
+                          IsHybrid, SupportsV0Only):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        "gate_up_proj": ["gate_proj", "up_proj"],
+    }
+
+    embedding_modules = {
+        "embed_tokens": "input_embeddings",
+        "lm_head": "output_embeddings",
+    }
+    embedding_padding_modules = ["lm_head"]
+
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        config = vllm_config.model_config.hf_config
+        self.vllm_config = vllm_config
+        self.model_config = vllm_config.model_config
+        cache_config = vllm_config.cache_config
+        lora_config = vllm_config.lora_config
+        scheduler_config = vllm_config.scheduler_config
+        assert (not cache_config.enable_prefix_caching
+                ), "FalconH1 currently does not support prefix caching"
+
+        self.quant_config = vllm_config.quant_config
+
+        super().__init__()
+        self.config = config
+        self.scheduler_config = scheduler_config
+        self.model = FalconH1Model(vllm_config=vllm_config,
+                                   prefix=maybe_prefix(prefix, "model"))
+        self.tie_word_embeddings = config.tie_word_embeddings
+        self.unpadded_vocab_size = config.vocab_size
+        self.mamba_cache: Optional[MambaCacheManager] = None
+        if lora_config:
+            self.unpadded_vocab_size += lora_config.lora_extra_vocab_size
+        if get_pp_group().is_last_rank:
+            self.lm_head = ParallelLMHead(
+                self.unpadded_vocab_size,
+                config.hidden_size,
+                org_num_embeddings=config.vocab_size,
+                padding_size=(
+                    DEFAULT_VOCAB_PADDING_SIZE
+                    # We need bigger padding if using lora for kernel
+                    # compatibility
+                    if not lora_config else
+                    lora_config.lora_vocab_padding_size),
+            )
+            self.lm_head_multiplier = config.lm_head_multiplier
+            if self.tie_word_embeddings:
+                self.lm_head = self.lm_head.tie_weights(
+                    self.model.embed_tokens)
+            # self.mamba_cache (set above) tracks the Mamba cache between steps.
+
+            self.logits_processor = LogitsProcessor(
+                self.unpadded_vocab_size,
+                config.vocab_size,
+                scale=config.lm_head_multiplier,
+            )
+        else:
+            self.lm_head = PPMissingLayer()
+
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        **kwargs,
+    ):
+        if self.mamba_cache is None:
+            self.mamba_cache = MambaCacheManager(
+                self.vllm_config,
+                self.lm_head.weight.dtype
+                if hasattr(self.lm_head, 'weight') else torch.bfloat16,
+                self.config.num_hidden_layers,
+                *self._get_mamba_cache_shape(),
+            )
+        mamba_cache_params = self.mamba_cache.current_run_tensors(**kwargs)
+        hidden_states = self.model(
+            input_ids,
+            positions,
+            mamba_cache_params,
+            intermediate_tensors,
+            inputs_embeds,
+        )
+
+        return hidden_states
+
+    def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs):
+        return self.mamba_cache.copy_inputs_before_cuda_graphs(
+            input_buffers, **kwargs)
+
+    def get_seqlen_agnostic_capture_inputs(self, batch_size: int):
+        return self.mamba_cache.get_seqlen_agnostic_capture_inputs(batch_size)
+
+    def _get_mamba_cache_shape(
+            self) -> tuple[tuple[int, int], tuple[int, int]]:
+        world_size = get_tensor_model_parallel_world_size()
+        hidden_size = self.config.hidden_size
+
+        conv_state_shape, temporal_state_shape = None, None
+
+        intermediate_size = (int(self.config.mamba_expand *
+                                 hidden_size) if self.config.mamba_d_ssm
+                             is None else self.config.mamba_d_ssm)
+
+        # if n_groups is not divisible by world_size, we need to extend the
+        # shards so that all groups needed by a head are sharded along with it
+        n_groups = self.config.mamba_n_groups + extra_groups_for_head_shards(
+            self.config.mamba_n_groups, world_size)
+
+        # - heads and n_groups are TP-ed
+        conv_dim = intermediate_size + 2 * n_groups * self.config.mamba_d_state
+        conv_state_shape = (
+            divide(conv_dim, world_size),
+            self.config.mamba_d_conv - 1,
+        )
+
+        # These are not TP-ed as they depend on A, dt_bias, D
+        # - they are typically small
+        #   e.g., (h_heads, d_head, d_state) = (128, 64, 128)
+        temporal_state_shape = (
+            divide(self.config.mamba_n_heads, world_size),
+            self.config.mamba_d_head,
+            self.config.mamba_d_state,
+        )
+        return conv_state_shape, temporal_state_shape
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+
+        return logits
+
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if "A_log" in name:
+                name = name.replace("A_log", "A")
+
+            if "mamba" in name:
+                name = name.replace("mamba", "mamba.mamba")
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # Skip layers on other devices.
+                if is_pp_missing_parameter(name, self):
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if is_pp_missing_parameter(name, self):
+                    continue
+                if self.tie_word_embeddings and "lm_head" in name:
+                    continue
+
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+
+        if self.tie_word_embeddings:
+            loaded_params.add("lm_head.weight")
+        return loaded_params
diff --git a/vllm/model_executor/models/gemma3_mm.py b/vllm/model_executor/models/gemma3_mm.py
index 743542ec8dfa..182cc86d3ca8 100644
--- a/vllm/model_executor/models/gemma3_mm.py
+++ b/vllm/model_executor/models/gemma3_mm.py
@@ -504,18 +504,12 @@ def dtype(self):
         return next(self.parameters()).dtype
 
     def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
-        h = w = self.config.vision_config.image_size
-        expected_dims = (3, h, w)
-
-        def _validate_shape(d: torch.Tensor):
-            if d.shape != expected_dims:
-                raise ValueError(
-                    "The expected shape of pixel values per image per batch "
-                    f"is {expected_dims}. You supplied {tuple(d.shape)}.")
-
-        for d in data:
-            _validate_shape(d)
-
+        image_size = self.config.vision_config.image_size
+        expected_dims = (3, image_size, image_size)
+        if data.shape[1:] != expected_dims:
+            raise ValueError(
+                "The expected shape of pixel values per image per batch is "
+                f"{expected_dims}. You supplied {tuple(data.shape)}.")
         return data
 
     def _parse_and_validate_image_input(
@@ -549,9 +543,7 @@ def _image_pixels_to_features(
         vision_tower: SiglipVisionModel,
         pixel_values: torch.Tensor,
     ) -> torch.Tensor:
-        target_dtype = vision_tower.get_input_embeddings().weight.dtype
-        image_features = vision_tower(pixel_values.to(dtype=target_dtype))
-        return image_features
+        return vision_tower(pixel_values)
 
     def _process_image_input(
         self,
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 470a7053e1b6..c2c310fca4d9 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -43,7 +43,7 @@
 from vllm.sequence import IntermediateTensors
 
 from .interfaces import SupportsPP
-from .utils import (is_pp_missing_parameter,
+from .utils import (AutoWeightsLoader, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
@@ -235,6 +235,35 @@ def forward(
         hidden_states = self.ln_f(hidden_states)
         return hidden_states
 
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if ".attn.bias" in name or ".attn.masked_bias" in name:
+                # Skip attention mask.
+                # NOTE: "c_attn.bias" should not be skipped.
+                continue
+
+            if is_pp_missing_parameter(name, self):
+                continue
+
+            param = params_dict[name]
+            # The HF's GPT-2 implementation uses Conv1D instead of Linear.
+            # Because of this, we need to transpose the weights.
+            # Note(zhuohan): the logic below might break quantized models.
+            for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
+                if conv1d_weight_name not in name:
+                    continue
+                if not name.endswith(".weight"):
+                    continue
+                loaded_weight = loaded_weight.t()
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
+
 
 class GPT2LMHeadModel(nn.Module, SupportsPP):
 
@@ -283,32 +312,16 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        params_dict = dict(self.named_parameters(remove_duplicate=False))
-        loaded_params: set[str] = set()
-        for name, loaded_weight in weights:
-            if ".attn.bias" in name or ".attn.masked_bias" in name:
-                # Skip attention mask.
-                # NOTE: "c_attn.bias" should not be skipped.
-                continue
-            if not name.startswith("transformer.") and not name.startswith(
-                    "lm_head"):
-                name = "transformer." + name
-
-            if is_pp_missing_parameter(name, self):
-                continue
-
-            param = params_dict[name]
-            # The HF's GPT-2 implementation uses Conv1D instead of Linear.
-            # Because of this, we need to transpose the weights.
-            # Note(zhuohan): the logic below might break quantized models.
-            for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
-                if conv1d_weight_name not in name:
-                    continue
-                if not name.endswith(".weight"):
-                    continue
-                loaded_weight = loaded_weight.t()
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(param, loaded_weight)
-            loaded_params.add(name)
-        return loaded_params
+        loader = AutoWeightsLoader(self)
+        weights = _add_transformer_prefix(weights)
+        return loader.load_weights(weights)
+
+
+def _add_transformer_prefix(
+    weights: Iterable[tuple[str, torch.Tensor]]
+) -> Iterable[tuple[str, torch.Tensor]]:
+    for name, tensor in weights:
+        if not name.startswith('transformer.') and not name.startswith(
+                "lm_head"):
+            name = 'transformer.' + name
+        yield name, tensor
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index 6a1d97bd7b69..c4ae4fc3c006 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -272,12 +272,6 @@ def load_weights(self, weights: Iterable[tuple[str,
 class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP):
     packed_modules_mapping = {"c_attn": ["c_attn"]}
 
-    # LoRA specific attributes
-    embedding_modules = {
-        "wte": "input_embeddings",
-        "lm_head": "output_embeddings",
-    }
-
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         super().__init__()
         config = vllm_config.model_config.hf_config
@@ -330,8 +324,11 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
+        skip_prefixes = None
+        if self.config.tie_word_embeddings:
+            skip_prefixes = ["lm_head."]
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=(["lm_head."]),
+            skip_prefixes=skip_prefixes,
         )
-        return loader.load_weights(weights)
\ No newline at end of file
+        return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py
index 0f2e90df7363..3524d036db22 100644
--- a/vllm/model_executor/models/granite.py
+++ b/vllm/model_executor/models/granite.py
@@ -479,18 +479,14 @@ def make_empty_intermediate_tensors(
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        skip_prefixes = [
-            "rotary_emb.inv_freq",
-            # Models trained using ColossalAI may include these tensors in
-            # the checkpoint. Skip them.
-            "rotary_emb.cos_cached",
-            "rotary_emb.sin_cached",
-        ]
         # With tie_word_embeddings, we can skip lm_head.weight
         # The weight might appear unnecessarily in the files if the model is
         # processed with quantization, LoRA, fine-tuning, etc.
-        if self.config.tie_word_embeddings:
-            skip_prefixes.append("lm_head.weight")
+        skip_prefixes = (["lm_head."]
+                         if self.config.tie_word_embeddings else None)
 
-        loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=skip_prefixes,
+        )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py
index 6d2d16d098d4..bc9e9a3c0206 100644
--- a/vllm/model_executor/models/grok1.py
+++ b/vllm/model_executor/models/grok1.py
@@ -28,7 +28,7 @@
 import torch.nn.functional as F
 from torch import nn
 
-from vllm.attention import Attention, AttentionMetadata
+from vllm.attention import Attention
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
@@ -182,25 +182,20 @@ def __init__(
                               quant_config=quant_config,
                               logits_soft_cap=attn_logits_soft_cap,
                               prefix=f"{prefix}.attn")
+        self.attn_multiplier = getattr(self.config, "attn_output_multiplier",
+                                       1.0) if self.config else 1.0
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
         q, k = self.rotary_emb(positions, q, k)
-        attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
+        attn_output = self.attn(q, k, v)
         output, _ = self.o_proj(attn_output)
-
-        # Apply attention output multiplier if specified in config
-        attn_multiplier = getattr(self.config, "attn_output_multiplier",
-                                  None) if self.config else None
-        if attn_multiplier is not None:
-            output = output * attn_multiplier
+        output *= self.attn_multiplier
         return output
 
 
@@ -261,8 +256,6 @@ def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: AttentionMetadata,
         residual: Optional[torch.Tensor],
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # Self Attention
@@ -276,8 +269,6 @@ def forward(
         hidden_states = self.attn(
             positions=positions,
             hidden_states=hidden_states,
-            kv_cache=kv_cache,
-            attn_metadata=attn_metadata,
         )
 
         # Post attention normalization
@@ -341,8 +332,6 @@ def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        kv_caches: list[torch.Tensor],
-        attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors],
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
@@ -359,9 +348,7 @@ def forward(
 
         for i in range(self.start_layer, self.end_layer):
             layer = self.layers[i]
-            hidden_states, residual = layer(positions, hidden_states,
-                                            kv_caches[i - self.start_layer],
-                                            attn_metadata, residual)
+            hidden_states, residual = layer(positions, hidden_states, residual)
 
         if not get_pp_group().is_last_rank:
             return IntermediateTensors({
@@ -529,13 +516,10 @@ def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        kv_caches: list[torch.Tensor],
-        attn_metadata: AttentionMetadata,
         intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
-        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   attn_metadata, intermediate_tensors,
+        hidden_states = self.model(input_ids, positions, intermediate_tensors,
                                    inputs_embeds)
         return hidden_states
 
@@ -550,10 +534,12 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        skip_prefixes = ["rotary_emb.inv_freq"]
         # Skip lm_head when tie_word_embeddings is True
-        if self.config.tie_word_embeddings:
-            skip_prefixes.append("lm_head")
+        skip_prefixes = (["lm_head"]
+                         if self.config.tie_word_embeddings else None)
 
-        loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes)
+        loader = AutoWeightsLoader(
+            self,
+            skip_prefixes=skip_prefixes,
+        )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
index 99c226439ecb..904f5330c653 100644
--- a/vllm/model_executor/models/h2ovl.py
+++ b/vllm/model_executor/models/h2ovl.py
@@ -25,9 +25,10 @@
 
 from .intern_vit import InternVisionModel
 from .internvl import (IMG_CONTEXT, IMG_END, IMG_START,
+                       BaseInternVLDummyInputsBuilder,
+                       BaseInternVLMultiModalProcessor,
                        BaseInternVLProcessingInfo, BaseInternVLProcessor,
-                       InternVLChatModel, InternVLDummyInputsBuilder,
-                       InternVLMultiModalProcessor, build_transform,
+                       InternVLChatModel, build_transform,
                        find_closest_aspect_ratio, get_internvl_target_ratios)
 
 
@@ -430,8 +431,8 @@ def get_num_image_tokens(
         )
 
 
-class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo]
-                               ):
+class H2OVLMultiModalProcessor(
+        BaseInternVLMultiModalProcessor[H2OVLProcessingInfo]):
 
     def _get_prompt_updates(
         self,
@@ -514,7 +515,7 @@ def _cached_apply_hf_processor(
 @MULTIMODAL_REGISTRY.register_processor(
     H2OVLMultiModalProcessor,
     info=H2OVLProcessingInfo,
-    dummy_inputs=InternVLDummyInputsBuilder)
+    dummy_inputs=BaseInternVLDummyInputsBuilder)
 class H2OVLChatModel(InternVLChatModel):
 
     def _init_vision_model(
diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py
index 8f33a3e29c60..8be8841c1f6c 100644
--- a/vllm/model_executor/models/interfaces.py
+++ b/vllm/model_executor/models/interfaces.py
@@ -226,9 +226,11 @@ def forward(
         intermediate_tensors: Optional["IntermediateTensors"],
     ) -> Union[Tensor, "IntermediateTensors"]:
         """
-        Accept {class}`IntermediateTensors` when PP rank > 0.
+        Accept [`IntermediateTensors`][vllm.sequence.IntermediateTensors] when
+        PP rank > 0.
 
-        Return {class}`IntermediateTensors` only for the last PP rank.
+        Return [`IntermediateTensors`][vllm.sequence.IntermediateTensors] for
+        every PP rank except the last.
         """
         ...
 
diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py
index 66e78fcc4e80..4612fc438741 100644
--- a/vllm/model_executor/models/internvl.py
+++ b/vllm/model_executor/models/internvl.py
@@ -8,8 +8,9 @@
 # --------------------------------------------------------
 from abc import ABC, abstractmethod
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Literal, Optional, TypedDict, TypeVar, Union
+from typing import Any, Literal, Optional, TypedDict, TypeVar, Union
 
+import numpy.typing as npt
 import torch
 import torch.nn as nn
 import torchvision.transforms as T
@@ -23,6 +24,7 @@
                                                    InternVisionPatchModel)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalKwargs, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
@@ -73,11 +75,38 @@ class InternVLImageEmbeddingInputs(TypedDict):
                             InternVLImageEmbeddingInputs]
 
 
+class InternVLVideoPixelInputs(TypedDict):
+    type: Literal["pixel_values_videos"]
+    pixel_values_flat: torch.Tensor
+    """
+    Shape:
+    `(batch_size * num_video * num_frames, num_channels, height, width)`
+    """
+
+    num_patches: torch.Tensor
+    """Shape: `(batch_size * num_images)`"""
+
+
+class InternVLVideoEmbeddingInputs(TypedDict):
+    type: Literal["video_embeds"]
+    data: Union[torch.Tensor, list[torch.Tensor]]
+    """ 
+    A tensor of shape `(num_videos, total_video_feature_size, hidden_size)`
+    or a list of tensors of shape `(total_video_feature_size, hidden_size)`
+
+    `hidden_size` must match the hidden size of language model backbone.
+    """
+
+
+InternVLVideoInputs = Union[InternVLVideoPixelInputs,
+                            InternVLVideoEmbeddingInputs]
+
+
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
 def build_transform(input_size: int):
     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
     return T.Compose([
-        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+        T.Lambda(lambda img: convert_image_mode(img, 'RGB')),
         T.Resize((input_size, input_size),
                  interpolation=T.InterpolationMode.BICUBIC),
         T.ToTensor(),
@@ -230,6 +259,33 @@ def image_to_pixel_values_internvl(
     return pixel_values
 
 
+# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B
+def video_to_pixel_values_internvl(
+    video: npt.NDArray,
+    *,
+    input_size: int,
+    min_num: int,
+    max_num: int,
+    use_thumbnail: bool,
+) -> torch.Tensor:
+    """Preprocess every frame of `video` and stack the results in order."""
+    target_ratios = get_internvl_target_ratios(min_num, max_num)
+    transform = build_transform(input_size=input_size)
+    frames_list = list[Image.Image]()
+    for frame in video:
+        pil_frame = dynamic_preprocess_internvl(
+            Image.fromarray(frame, mode="RGB"),
+            target_ratios=target_ratios,
+            image_size=input_size,
+            use_thumbnail=use_thumbnail,
+        )
+        assert len(pil_frame) == 1  # exactly one image expected per frame
+        frames_list.extend(pil_frame)
+
+    pixel_values = torch.stack([transform(image) for image in frames_list])
+    return pixel_values
+
+
 class BaseInternVLProcessor(ABC):
     """
     This model doesn't define its own HF processor,
@@ -374,24 +430,14 @@ def _images_to_pixel_values_lst(
             ) for image in images
         ]
 
-    def __call__(
+    def _preprocess_image(
         self,
-        text: Optional[Union[str, list[str]]] = None,
-        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
+        text: list[str],
+        images: list[Image.Image],
         min_dynamic_patch: Optional[int] = None,
         max_dynamic_patch: Optional[int] = None,
         dynamic_image_size: Optional[bool] = None,
-        return_tensors: Optional[Union[str, TensorType]] = None,
-    ) -> Mapping[str, NestedTensors]:
-        if text is None:
-            text = []
-        if not isinstance(text, list):
-            text = [text]
-        if images is None:
-            images = []
-        if not isinstance(images, list):
-            images = [images]
-
+    ) -> tuple[list[str], dict[str, torch.Tensor]]:
         if len(images) == 0:
             image_inputs = {}
         else:
@@ -414,6 +460,34 @@ def __call__(
 
                 image_repl = self.get_image_repl(feature_size, num_patches)
                 text = [t.replace('<image>', image_repl.full, 1) for t in text]
+        return text, image_inputs
+
+    def _make_batch_input(self,
+                          input_item: Optional[Union[Any, list[Any]]] = None):
+        """Normalize `input_item` to a list (None -> [], scalar -> [x])."""
+        if input_item is None:
+            return []
+        is_batched = isinstance(input_item, list)
+        return input_item if is_batched else [input_item]
+
+    def __call__(
+        self,
+        text: Optional[Union[str, list[str]]] = None,
+        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
+        min_dynamic_patch: Optional[int] = None,
+        max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+    ) -> Mapping[str, NestedTensors]:
+        text, images = [self._make_batch_input(x) for x in (text, images)]
+
+        text, image_inputs = self._preprocess_image(
+            text=text,
+            images=images,
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
 
         text_inputs = self.tokenizer(text)
 
@@ -424,11 +498,133 @@ def __call__(
 
 
 class InternVLProcessor(BaseInternVLProcessor):
+    """
+    HF Processor for InternVLChatModel with extended video processing logic.
+
+    Code for video processing is adapted from video example:
+    https://huggingface.co/OpenGVLab/InternVL3-1B#inference-with-transformers
+    """
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        tokenizer: AnyTokenizer,
+        *,
+        min_dynamic_patch: Optional[int] = None,
+        max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+        video_token: Optional[str] = None,
+    ) -> None:
+        super().__init__(
+            config=config,
+            tokenizer=tokenizer,
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
+        # add extra video token for video processing
+        self.video_token = video_token
 
     @property
     def image_token_id(self) -> int:
         return self.tokenizer.get_vocab()[IMG_CONTEXT]
 
+    @property
+    def video_token_id(self) -> Optional[int]:
+        if self.video_token is None:
+            return None
+        return self.tokenizer.get_vocab().get(self.video_token, None)
+
+    @property
+    def supports_video(self) -> bool:
+        return self.video_token_id is not None
+
+    def _videos_to_pixel_values_lst(
+        self,
+        videos: list[npt.NDArray],
+        dynamic_image_size: Optional[bool] = None,
+    ) -> list[torch.Tensor]:
+        """Return one stacked per-frame pixel tensor for each input video."""
+        min_num, max_num = self.resolve_min_max_num(
+            min_dynamic_patch=1,
+            max_dynamic_patch=1,
+            dynamic_image_size=dynamic_image_size,
+            use_thumbnail=False,  # no thumbnail is requested for video frames
+        )
+        return [
+            video_to_pixel_values_internvl(
+                video,
+                input_size=self.image_size,
+                min_num=min_num,
+                max_num=max_num,
+                use_thumbnail=False,
+            ) for video in videos
+        ]
+
+    def _preprocess_video(
+        self,
+        text: list[str],
+        videos: list[npt.NDArray],
+        dynamic_image_size: Optional[bool] = None,
+    ) -> tuple[list[str], dict[str, NestedTensors]]:
+        """Expand `<video>` placeholders in `text` and build video tensors."""
+        if len(videos) == 0 or not self.supports_video:
+            video_inputs = {}
+        else:
+            pixel_values_lst_video = self._videos_to_pixel_values_lst(
+                videos,
+                dynamic_image_size=dynamic_image_size,
+            )
+            video_inputs: dict[str, NestedTensors] = {
+                "pixel_values_flat_video":
+                torch.cat(pixel_values_lst_video),
+                "video_num_patches":
+                torch.tensor([len(item) for item in pixel_values_lst_video]),
+            }
+
+            for pixel_values in pixel_values_lst_video:  # one tensor per video
+                num_patches = pixel_values.shape[0]  # leading dim of the stack
+                video_repl = self.get_video_repl(self.num_image_token,
+                                                 num_patches, self.video_token)
+                text = [t.replace('<video>', video_repl.full, 1) for t in text]
+        return text, video_inputs
+
+    def __call__(
+        self,
+        text: Optional[Union[str, list[str]]] = None,
+        images: Optional[Union[Image.Image, list[Image.Image]]] = None,
+        videos: Optional[Union[npt.NDArray, list[npt.NDArray]]] = None,
+        min_dynamic_patch: Optional[int] = None,
+        max_dynamic_patch: Optional[int] = None,
+        dynamic_image_size: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+    ) -> Mapping[str, NestedTensors]:
+        text, images, videos = [
+            self._make_batch_input(x) for x in (text, images, videos)
+        ]
+
+        text, image_inputs = self._preprocess_image(
+            text=text,
+            images=images,
+            min_dynamic_patch=min_dynamic_patch,
+            max_dynamic_patch=max_dynamic_patch,
+            dynamic_image_size=dynamic_image_size,
+        )
+
+        text, video_inputs = self._preprocess_video(
+            text=text,
+            videos=videos,
+            dynamic_image_size=dynamic_image_size,
+        )
+
+        text_inputs = self.tokenizer(text)
+
+        return {
+            **BatchEncoding(text_inputs, tensor_type=return_tensors),
+            **image_inputs,
+            **video_inputs,
+        }
+
     def get_image_repl(
         self,
         feature_size: int,
@@ -439,8 +635,24 @@ def get_image_repl(
 
         return PromptUpdateDetails.select_text(repl_full, IMG_CONTEXT)
 
+    def get_video_repl(
+        self,
+        feature_size: int,
+        num_patches: Optional[int] = None,
+        video_context_token: str = IMG_CONTEXT,
+    ) -> PromptUpdateDetails[str]:
+        """Build the replacement text for one video, one segment per frame."""
+        # Use the caller-supplied per-frame token count, not a fixed attribute.
+        repl_features = video_context_token * feature_size
+        repl_features_with_sep = IMG_START + repl_features + IMG_END
+        # num_patches equals num_frames; None is not usable (range() raises).
+        repl_full = ''.join(f'Frame{i+1}: {repl_features_with_sep}'
+                            for i in range(num_patches))
+        return PromptUpdateDetails.select_text(repl_full, video_context_token)
+
 
 class BaseInternVLProcessingInfo(BaseProcessingInfo):
+    """Basic image-only ProcessingInfo for InternVL-style models."""
 
     @abstractmethod
     def get_hf_processor(
@@ -496,11 +708,22 @@ def get_image_size_with_most_features(self) -> ImageSize:
 
         return largest_feature_pinpoint
 
+    def get_max_image_tokens(self) -> int:
+        processor = self.get_hf_processor()
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        return self.get_num_image_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            processor=processor,
+        )
+
 
 _I = TypeVar("_I", bound=BaseInternVLProcessingInfo)
 
 
-class InternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
+class BaseInternVLDummyInputsBuilder(BaseDummyInputsBuilder[_I]):
+    """Basic image-only DummyInputsBuilder for InternVL-style models."""
 
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
@@ -524,7 +747,8 @@ def get_dummy_mm_data(
         }
 
 
-class InternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
+class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
+    """ Basic image-only MultiModalProcessor for InternVL-style models."""
 
     def _call_hf_processor(
         self,
@@ -613,6 +837,38 @@ def get_replacement_internvl(item_idx: int):
 
 
 class InternVLProcessingInfo(BaseInternVLProcessingInfo):
+    """InternVL ProcessingInfo extended for video processing"""
+
+    @property
+    def supports_video(self):
+        return self.get_hf_processor().supports_video
+
+    def get_supported_mm_limits(self):
+        video_limit = {"video": None} if self.supports_video else {}
+        return {**super().get_supported_mm_limits(), **video_limit}
+
+    def get_video_token(self) -> Optional[str]:
+        text_model_type = self.get_hf_config().get_text_config().model_type
+        if text_model_type == "qwen2":
+            return "<|video_pad|>"
+        return None
+
+    def get_num_frames_with_most_features(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> int:
+        max_images = mm_counts.get("image", 0)
+        max_videos = mm_counts.get("video", 0)
+
+        processor = self.get_hf_processor()
+
+        max_image_tokens = self.get_max_image_tokens() * max_images
+        max_total_frames = (seq_len -
+                            max_image_tokens) // processor.num_image_token
+        max_frames_per_video = max_total_frames // max(max_videos, 1)
+
+        return max(max_frames_per_video, 1)
 
     def get_hf_processor(
         self,
@@ -629,6 +885,8 @@ def get_hf_processor(
         if dynamic_image_size is not None:
             kwargs["dynamic_image_size"] = dynamic_image_size
 
+        kwargs["video_token"] = self.get_video_token()
+
         return self.ctx.init_processor(
             InternVLProcessor,
             config=self.get_hf_config(),
@@ -637,6 +895,121 @@ def get_hf_processor(
         )
 
 
+class InternVLDummyInputsBuilder(
+        BaseInternVLDummyInputsBuilder[InternVLProcessingInfo]):
+    """InternVL DummyInputsBuilder extended for video support"""
+
+    def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+        num_videos = mm_counts.get("video", 0)
+
+        return super().get_dummy_text(mm_counts) + "<video>" * num_videos
+
+    def get_dummy_mm_data(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> MultiModalDataDict:
+        dummy_image = super().get_dummy_mm_data(seq_len=seq_len,
+                                                mm_counts=mm_counts)
+        if self.info.supports_video:
+            config = self.info.get_hf_config()
+            image_size: int = config.vision_config.image_size
+            target_num_frames = \
+                self.info.get_num_frames_with_most_features(seq_len, mm_counts)
+            num_videos = mm_counts.get("video", 0)
+            dummy_video = {
+                "video":
+                self._get_dummy_videos(width=image_size,
+                                       height=image_size,
+                                       num_frames=target_num_frames,
+                                       num_videos=num_videos)
+            }
+        else:
+            dummy_video = {}
+        return {**dummy_image, **dummy_video}
+
+
+class InternVLMultiModalProcessor(
+        BaseInternVLMultiModalProcessor[InternVLProcessingInfo]):
+    """InternVL MultiModalProcessor extended for video support"""
+
+    def _call_hf_processor(
+        self,
+        prompt: str,
+        mm_data: Mapping[str, object],
+        mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, NestedTensors]:
+        processed_outputs = super()._call_hf_processor(prompt, mm_data,
+                                                       mm_kwargs)
+
+        hf_processor = self.info.get_hf_processor(**mm_kwargs)
+        if self.info.supports_video and (
+                video_token_id := hf_processor.video_token_id) is not None:
+            processed_outputs["video_token_id"] = torch.tensor(video_token_id)
+        return processed_outputs
+
+    def _get_mm_fields_config(
+        self,
+        hf_inputs: Mapping[str, NestedTensors],
+        hf_processor_mm_kwargs: Mapping[str, object],
+    ) -> Mapping[str, MultiModalFieldConfig]:
+        image_fields = super()._get_mm_fields_config(hf_inputs,
+                                                     hf_processor_mm_kwargs)
+        if self.info.supports_video:
+            video_num_patches = hf_inputs.get("video_num_patches",
+                                              torch.empty(0))
+            num_videos = len(video_num_patches)
+            video_fields = dict(
+                pixel_values_flat_video=MultiModalFieldConfig.flat_from_sizes(
+                    "video", video_num_patches),
+                video_num_patches=MultiModalFieldConfig.batched("video"),
+                video_token_id=MultiModalFieldConfig.shared(
+                    "video", num_videos),
+            )
+        else:
+            video_fields = {}
+
+        return image_fields | video_fields
+
+    def _get_prompt_updates(
+        self,
+        mm_items: MultiModalDataItems,
+        hf_processor_mm_kwargs: Mapping[str, object],
+        out_mm_kwargs: MultiModalKwargs,
+    ) -> Sequence[PromptUpdate]:
+        prompt_repl: list[PromptUpdate] = super()._get_prompt_updates(
+            mm_items, hf_processor_mm_kwargs, out_mm_kwargs)
+
+        hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs)
+
+        if "video_num_patches" in out_mm_kwargs:
+            video_num_patches = out_mm_kwargs["video_num_patches"]
+            assert isinstance(video_num_patches, torch.Tensor)
+            video_num_patches = video_num_patches.tolist()
+        else:
+            video_num_patches = []
+
+        def get_video_replacement_internvl(item_idx: int):
+            feature_size = hf_processor.num_image_token
+            num_patches = video_num_patches[item_idx]
+            if num_patches is not None:
+                assert isinstance(num_patches, int)
+
+            return hf_processor.get_video_repl(
+                feature_size,
+                num_patches,
+                video_context_token=hf_processor.video_token)
+
+        if self.info.supports_video:
+            prompt_repl.append(
+                PromptReplacement(
+                    modality="video",
+                    target="<video>",
+                    replacement=get_video_replacement_internvl,
+                ))
+        return prompt_repl
+
+
 @MULTIMODAL_REGISTRY.register_processor(
     InternVLMultiModalProcessor,
     info=InternVLProcessingInfo,
@@ -680,6 +1053,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
         self.mlp1 = self._init_mlp1(config)
 
         self.img_context_token_id = None
+        self.video_context_token_id = None
+
         self.visual_token_mask = None
         self.make_empty_intermediate_tensors = (
             self.language_model.make_empty_intermediate_tensors)
@@ -824,10 +1199,55 @@ def _parse_and_validate_image_input(
 
         raise AssertionError("This line should be unreachable.")
 
+    def _parse_and_validate_video_input(
+            self, **kwargs: object) -> Optional[InternVLVideoInputs]:
+        """Extract video inputs from `kwargs`; return None if there are none."""
+        pixel_values_flat_video = kwargs.pop("pixel_values_flat_video", None)
+        video_num_patches = kwargs.pop("video_num_patches", None)
+        video_embeds = kwargs.pop("video_embeds", None)
+        if pixel_values_flat_video is None and video_embeds is None:
+            return None
+
+        if video_embeds is not None:
+            if not isinstance(video_embeds, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of video embeddings. "
+                                 f"Got type: {type(video_embeds)}")
+            return InternVLVideoEmbeddingInputs(
+                type="video_embeds",
+                data=flatten_bn(video_embeds),
+            )
+
+        # Record the video placeholder id; get_input_embeddings merges on it.
+        video_token_id = kwargs["video_token_id"]
+        assert isinstance(video_token_id, torch.Tensor)
+        self.video_context_token_id = video_token_id.flatten().unique().item()
+
+        if pixel_values_flat_video is not None:
+            if not isinstance(pixel_values_flat_video, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values_flat_video)}")
+
+            if not isinstance(video_num_patches, (torch.Tensor, list)):
+                raise ValueError("Incorrect type of video_num_patches. "
+                                 f"Got type: {type(video_num_patches)}")
+
+            pixel_values_flat_video = flatten_bn(pixel_values_flat_video,
+                                                 concat=True)
+            video_num_patches = flatten_bn(video_num_patches, concat=True)
+
+            return InternVLVideoPixelInputs(
+                type="pixel_values_videos",
+                pixel_values_flat=self._validate_pixel_values(
+                    pixel_values_flat_video),
+                num_patches=video_num_patches,
+            )
+
+        raise AssertionError("This line should be unreachable.")
+
     def _process_image_input(
         self,
-        image_input: InternVLImageInputs,
-    ) -> Union[torch.Tensor, list[torch.Tensor], tuple[torch.Tensor, ...]]:
+        image_input: Union[InternVLImageInputs, InternVLVideoPixelInputs],
+    ) -> tuple[torch.Tensor, ...]:
         if image_input["type"] == "image_embeds":
             return image_input["data"]
 
@@ -839,8 +1259,8 @@ def _process_image_input(
 
         # Only one image in the current batch
         if len(num_patches) == 1:
-            return image_embeds.view(
-                -1, self.config.text_config.hidden_size).unsqueeze(0)
+            return (image_embeds.view(-1,
+                                      self.config.text_config.hidden_size), )
 
         # NOTE: Image embeddings are split into separate tensors for each image
         # by the size of each embedding.
@@ -852,8 +1272,26 @@ def _process_image_input(
         ]
         return image_embeds.split(image_feature_sizes)
 
+    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
+        """Map "images"/"videos" to parsed inputs, in kwargs key order."""
+        modalities = {}
+        # Preserve the order of modalities if there are multiple of them
+        # from the order of kwargs.
+        for input_key in kwargs:
+            if input_key in ("pixel_values_flat",
+                             "image_embeds") and "images" not in modalities:
+                modalities["images"] = self._parse_and_validate_image_input(
+                    **kwargs)
+            if input_key in ("pixel_values_flat_video",
+                             ) and "videos" not in modalities:
+                modalities["videos"] = self._parse_and_validate_video_input(
+                    **kwargs)
+
+        return modalities
+
     def _set_visual_token_mask(self, input_ids: torch.Tensor) -> None:
         if self.is_mono:
+            assert self.img_context_token_id is not None
             self.visual_token_mask = (
                 input_ids == self.img_context_token_id).reshape(-1, 1)
         else:
@@ -864,11 +1302,28 @@ def get_language_model(self) -> torch.nn.Module:
 
     def get_multimodal_embeddings(
             self, **kwargs: object) -> Optional[MultiModalEmbeddings]:
-        image_input = self._parse_and_validate_image_input(**kwargs)
-        if image_input is None:
+
+        modalities = self._parse_and_validate_multimodal_inputs(**kwargs)
+        if not modalities:
             return None
 
-        return self._process_image_input(image_input)
+        # The result multimodal_embeddings is tuple of tensors, with each
+        # tensor corresponding to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
+
+        # NOTE: It is important to iterate over the keys in this dictionary
+        # to preserve the order of the modalities.
+        for modality in modalities:
+            if modality == "images":
+                image_input = modalities["images"]
+                vision_embeddings = self._process_image_input(image_input)
+                multimodal_embeddings += vision_embeddings
+            if modality == "videos":
+                video_input = modalities["videos"]
+                video_embeddings = self._process_image_input(video_input)
+                multimodal_embeddings += video_embeddings
+
+        return multimodal_embeddings
 
     def get_input_embeddings(
         self,
@@ -877,13 +1332,18 @@ def get_input_embeddings(
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
-            assert self.img_context_token_id is not None
+            context_token_ids = [
+                token_id for token_id in (self.img_context_token_id,
+                                          self.video_context_token_id)
+                if token_id is not None
+            ]
+            assert len(context_token_ids) >= 1
             self._set_visual_token_mask(input_ids)
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids,
                 inputs_embeds,
                 multimodal_embeddings,
-                self.img_context_token_id,
+                context_token_ids,
             )
         return inputs_embeds
 
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 8c4eafaf3d1c..ff10fa42df67 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -181,20 +181,9 @@ def __init__(
             prefix=f"{prefix}.o_proj",
         )
 
-        is_neox_style = True
-        is_gguf = quant_config and quant_config.get_name() == "gguf"
-        if is_gguf and config.model_type == "llama":
-            is_neox_style = False
-
-        self.rotary_emb = get_rope(
-            self.head_dim,
-            rotary_dim=self.head_dim,
-            max_position=max_position_embeddings,
-            base=rope_theta,
-            rope_scaling=rope_scaling,
-            is_neox_style=is_neox_style,
-            partial_rotary_factor=self.partial_rotary_factor,
-        )
+        self._init_rotary_emb(config,
+                              rope_scaling=rope_scaling,
+                              quant_config=quant_config)
 
         if hasattr(config, "interleaved_sliding_window"):
             interleaved_sliding_window = config.interleaved_sliding_window
@@ -243,6 +232,24 @@ def forward(
         output, _ = self.o_proj(attn_output)
         return output
 
+    def _init_rotary_emb(self, config: LlamaConfig,
+                         rope_scaling: Optional[dict[str, Any]],
+                         quant_config: Optional[QuantizationConfig]) -> None:
+        """Set `self.rotary_emb`; NeoX style is off only for GGUF llama."""
+        is_neox_style = True
+        is_gguf = quant_config and quant_config.get_name() == "gguf"
+        if is_gguf and config.model_type == "llama":
+            is_neox_style = False
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=is_neox_style,
+            partial_rotary_factor=self.partial_rotary_factor,
+        )
+
 
 class LlamaDecoderLayer(nn.Module):
 
diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py
index 018ecc2a8c0f..172dc8b5ec06 100644
--- a/vllm/model_executor/models/llama_eagle.py
+++ b/vllm/model_executor/models/llama_eagle.py
@@ -130,13 +130,15 @@ def load_weights(self, weights: Iterable[tuple[str,
 
 class EagleLlamaForCausalLM(LlamaForCausalLM):
 
-    def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         nn.Module.__init__(self)
         self.config = vllm_config. \
             speculative_config.draft_model_config.hf_config
+        target_layer_num = vllm_config.model_config.get_num_layers(
+            vllm_config.parallel_config)
         self.model = LlamaModel(vllm_config=vllm_config,
                                 prefix="model",
-                                start_layer_id=start_layer_id)
+                                start_layer_id=target_layer_num)
 
         logit_scale = getattr(self.config, "logit_scale", 1.0)
         self.logits_processor = LogitsProcessor(self.config.vocab_size,
diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py
index 2302d1352de6..f211bfe54a7d 100644
--- a/vllm/model_executor/models/llama_eagle3.py
+++ b/vllm/model_executor/models/llama_eagle3.py
@@ -175,13 +175,15 @@ def load_weights(self, weights: Iterable[tuple[str,
 
 class Eagle3LlamaForCausalLM(LlamaForCausalLM):
 
-    def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         nn.Module.__init__(self)
         self.config = vllm_config. \
             speculative_config.draft_model_config.hf_config
+        target_layer_num = vllm_config.model_config.get_num_layers(
+            vllm_config.parallel_config)
         self.model = LlamaModel(vllm_config=vllm_config,
-                                start_layer_id=start_layer_id,
-                                prefix="model")
+                                prefix="model",
+                                start_layer_id=target_layer_num)
 
         logit_scale = getattr(self.config, "logit_scale", 1.0)
         self.lm_head = ParallelLMHead(
@@ -193,8 +195,7 @@ def __init__(self, *, vllm_config: VllmConfig, start_layer_id: int = 0):
         self.logits_processor = LogitsProcessor(self.config.draft_vocab_size,
                                                 scale=logit_scale)
         self.draft_id_to_target_id = nn.Parameter(
-            torch.zeros((self.config.draft_vocab_size),
-                        dtype=torch.long).type(torch.LongTensor),
+            torch.zeros(self.config.draft_vocab_size, dtype=torch.long),
             requires_grad=False,
         )
 
@@ -213,6 +214,9 @@ def compute_logits(
     ) -> Optional[torch.Tensor]:
         logits = self.logits_processor(self.lm_head, hidden_states,
                                        sampling_metadata)
+        if self.draft_id_to_target_id is None:
+            return logits
+
         base = torch.arange(self.config.draft_vocab_size, device=logits.device)
         targets = base + self.draft_id_to_target_id
         logits_new = logits.new_full((
@@ -245,4 +249,9 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
                 name = "model." + name
             model_weights[name] = loaded_weight
 
-        return loader.load_weights(model_weights.items())
+        loaded_weights = loader.load_weights(model_weights.items())
+
+        if 'd2t' not in loaded_weights:
+            self.draft_id_to_target_id = None
+
+        return loaded_weights
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 95c1a0ca0b98..ced71b6dcdeb 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -721,9 +721,8 @@ def forward(
                 batch.
             pixel_values: The pixels in each input image.
 
-        :::{seealso}
-        {class}`LlavaImageInputs`
-        :::
+        Info:
+            [LlavaImageInputs][]
         """
         if intermediate_tensors is not None:
             inputs_embeds = None
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index e731f1bfdb9a..2fb79f57a67f 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -135,11 +135,13 @@ def _get_num_unpadded_features(
         current_aspect_ratio = current_width / current_height
 
         if aspect_ratio > current_aspect_ratio:
-            new_height = (original_height * current_width) // original_width
+            new_height = int(
+                round(original_height * (current_width / original_width), 7))
             padding = (current_height - new_height) // 2
             current_height = current_height - (2 * padding)
         else:
-            new_width = (original_width * current_height) // original_height
+            new_width = int(
+                round(original_width * (current_height / original_height), 7))
             padding = (current_width - new_width) // 2
             current_width = current_width - (2 * padding)
 
@@ -538,7 +540,7 @@ def forward(
         Unlike in LLaVA-1.5, the number of image tokens inputted to the language
         model depends on the original size of the input image. Including the
         original image token in the input, the required number of image tokens
-        is given by {func}`get_llava_next_image_feature_size`.
+        is given by [get_llava_next_image_feature_size][].
 
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
@@ -549,9 +551,8 @@ def forward(
             pixel_values: The pixels in each grid patch for each input image.
             image_sizes: The original `(height, width)` for each input image.
 
-        :::{seealso}
-        {class}`LlavaNextImageInputs`
-        :::
+        Info:
+            [LlavaNextImageInputs][]
         """
         if intermediate_tensors is not None:
             inputs_embeds = None
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 49f1ecb4be89..7ea759fd59b8 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -116,11 +116,13 @@ def _get_num_unpadded_features(
         current_aspect_ratio = current_width / current_height
 
         if aspect_ratio > current_aspect_ratio:
-            new_height = (original_height * current_width) // original_width
+            new_height = int(
+                round(original_height * (current_width / original_width), 7))
             padding = (current_height - new_height) // 2
             current_height = current_height - (2 * padding)
         else:
-            new_width = (original_width * current_height) // original_height
+            new_width = int(
+                round(original_width * (current_height / original_height), 7))
             padding = (current_width - new_width) // 2
             current_width = current_width - (2 * padding)
 
diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py
index 588bcb628f8c..95ef1134b1bf 100644
--- a/vllm/model_executor/models/medusa.py
+++ b/vllm/model_executor/models/medusa.py
@@ -51,10 +51,7 @@ class Medusa(nn.Module):
        needs to have truncated_vocab_size (=k) as an attribute."""
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None:
-        if hasattr(vllm_config, 'draft_model_config'):
-            config = vllm_config.draft_model_config.hf_config
-        else:
-            config = vllm_config.model_config.hf_config
+        config = vllm_config.speculative_config.draft_model_config.hf_config
         super().__init__()
         self.config = config
         self.blocks = nn.ModuleList([
diff --git a/vllm/model_executor/models/mimo_mtp.py b/vllm/model_executor/models/mimo_mtp.py
index adcfcaa6b1e6..cbca6a4c8f9d 100644
--- a/vllm/model_executor/models/mimo_mtp.py
+++ b/vllm/model_executor/models/mimo_mtp.py
@@ -250,7 +250,7 @@ def load_weights(self, weights: Iterable[tuple[str,
         return loaded_params
 
     def map_model_name_to_mtp_param_name(self, name: str) -> str:
-        import re
+        import regex as re
         name_without_prefix = [
             "token_layernorm", "hidden_layernorm", "input_proj",
             "final_layernorm"
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index d99ae81468a9..0397b552ce9f 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -242,9 +242,6 @@ def __init__(
             base=rope_theta,
             rope_scaling=rope_scaling,
         )
-        # set rope as fp32 instead of bf16
-        self.rotary_emb.cos_sin_cache = self.rotary_emb._compute_cos_sin_cache(
-        )
         self.attn = Attention(self.num_heads,
                               self.head_dim,
                               self.scaling,
diff --git a/vllm/model_executor/models/minimax_text_01.py b/vllm/model_executor/models/minimax_text_01.py
index 7724e52c1ce1..36bab9ee13b1 100644
--- a/vllm/model_executor/models/minimax_text_01.py
+++ b/vllm/model_executor/models/minimax_text_01.py
@@ -2,10 +2,10 @@
 """Inference-only MiniMaxText01 model."""
 import copy
 import math
-import re
 from collections.abc import Iterable
 from typing import Optional, Union
 
+import regex as re
 import torch
 import torch.distributed
 import torch.nn.functional as F
diff --git a/vllm/model_executor/models/mistral3.py b/vllm/model_executor/models/mistral3.py
index 2b9cbf10440a..051a73120838 100644
--- a/vllm/model_executor/models/mistral3.py
+++ b/vllm/model_executor/models/mistral3.py
@@ -559,9 +559,8 @@ def forward(
                 batch.
             pixel_values: The pixels in each input image.
 
-        :::{seealso}
-        {class}`Mistral3ImagePixelInputs`
-        :::
+        Info:
+            [Mistral3ImagePixelInputs][]
         """
         if intermediate_tensors is not None:
             inputs_embeds = None
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index ca8136776019..9bc7a16153e1 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -483,5 +483,5 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(self, skip_prefixes=["rotary_emb.inv_freq"])
+        loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py
index af774ea312a9..8220200d270c 100644
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@@ -448,8 +448,5 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(
-            self,
-            skip_prefixes=(["rotary_emb.inv_freq"]),
-        )
+        loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index e215582a37ac..640a2049a629 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -965,7 +965,7 @@ def select_tiling(
 
 class MolmoProcessorWrapper:
     """
-    Wraps {class}`MolmoProcessor` so that it can be called directly.
+    Wraps `MolmoProcessor` so that it can be called directly.
 
     The original definition can be found here:
     https://huggingface.co/allenai/Molmo-7B-D-0924/blob/main/preprocessing_molmo.py
diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py
index ffd21def0699..d0999e30e1ba 100644
--- a/vllm/model_executor/models/nemotron.py
+++ b/vllm/model_executor/models/nemotron.py
@@ -503,14 +503,5 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(
-            self,
-            skip_prefixes=([
-                "rotary_emb.inv_freq",
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                "rotary_emb.cos_cached",
-                "rotary_emb.sin_cached"
-            ]),
-        )
+        loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py
index f4d5a77f2086..9808fe05558e 100644
--- a/vllm/model_executor/models/nemotron_nas.py
+++ b/vllm/model_executor/models/nemotron_nas.py
@@ -23,18 +23,20 @@
 # limitations under the License.
 """Inference-only deci model compatible with HuggingFace weights."""
 from collections.abc import Iterable
-from typing import Optional, Union
+from typing import Any, Optional, Union
 
 import torch
 from torch import nn
 from transformers import LlamaConfig
 
+from vllm.attention import AttentionType
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
 from vllm.model_executor.model_loader.weight_utils import (
@@ -62,6 +64,48 @@ def _find_multiple(n: int, k: int) -> int:
     return n + k - (n % k)
 
 
+class DeciLMAttention(LlamaAttention):
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+        rope_theta: float = 10000,
+        rope_scaling: Optional[dict[str, Any]] = None,
+        max_position_embeddings: int = 8192,
+        quant_config: Optional[QuantizationConfig] = None,
+        bias: bool = False,
+        bias_o_proj: bool = False,
+        cache_config: Optional[CacheConfig] = None,
+        prefix: str = "",
+        attn_type: str = AttentionType.DECODER,
+    ) -> None:
+        super().__init__(config, hidden_size, num_heads, num_kv_heads,
+                         rope_theta, rope_scaling, max_position_embeddings,
+                         quant_config, bias, bias_o_proj, cache_config, prefix,
+                         attn_type)
+
+    def _init_rotary_emb(self, config, rope_scaling: Optional[dict[str, Any]],
+                         quant_config: Optional[QuantizationConfig]) -> None:
+        # Mistral-YARN / Llama-4 derivatives need non-NeoX style RoPE.
+        is_neox_style = True
+        if hasattr(config, "position_embedding_type"):
+            is_neox_style = config.position_embedding_type not in [
+                "mistral_yarn", "rope_llama4"
+            ]
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=rope_scaling,
+            is_neox_style=is_neox_style,
+            partial_rotary_factor=self.partial_rotary_factor)
+
+
+
 class DeciLMDecoderLayer(nn.Module):
 
     def __init__(
@@ -98,7 +142,7 @@ def __init__(
         if not self._is_no_op_attention:
             num_kv_heads = (config.num_attention_heads //
                             block_config.attention.n_heads_in_group)
-            self.self_attn = LlamaAttention(
+            self.self_attn = DeciLMAttention(
                 config=config,
                 hidden_size=self.hidden_size,
                 num_heads=config.num_attention_heads,
diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py
index 62a7deab6a10..172434e66ae2 100644
--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -22,9 +22,10 @@
                                         PromptUpdateDetails)
 
 from .intern_vit import InternVisionModel
-from .internvl import (BaseInternVLProcessingInfo, BaseInternVLProcessor,
-                       InternVLChatModel, InternVLDummyInputsBuilder,
-                       InternVLMultiModalProcessor)
+from .internvl import (BaseInternVLDummyInputsBuilder,
+                       BaseInternVLMultiModalProcessor,
+                       BaseInternVLProcessingInfo, BaseInternVLProcessor,
+                       InternVLChatModel)
 
 IMG_PAD = "<|vision_pad|>"
 
@@ -84,7 +85,8 @@ def get_hf_processor(
         )
 
 
-class NVLMDummyInputsBuilder(InternVLDummyInputsBuilder[NVLMProcessingInfo]):
+class NVLMDummyInputsBuilder(BaseInternVLDummyInputsBuilder[NVLMProcessingInfo]
+                             ):
 
     def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
         num_images = mm_counts.get("image", 0)
@@ -110,7 +112,8 @@ def get_dummy_mm_data(
         }
 
 
-class NVLMMultiModalProcessor(InternVLMultiModalProcessor[NVLMProcessingInfo]):
+class NVLMMultiModalProcessor(
+        BaseInternVLMultiModalProcessor[NVLMProcessingInfo]):
 
     def _get_prompt_updates(
         self,
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 26ca770d8493..fcb7c619a102 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -382,19 +382,7 @@ def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=([
-                "rotary_emb.inv_freq",
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                "rotary_emb.cos_cached",
-                "rotary_emb.sin_cached",
-                "lm_head.weight"
-            ] if self.config.tie_word_embeddings else [
-                "rotary_emb.inv_freq",
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                "rotary_emb.cos_cached",
-                "rotary_emb.sin_cached"
-            ]),
+            skip_prefixes=(["lm_head.weight"]
+                           if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index e4dc0e0cc411..33adacdae5f5 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -314,7 +314,8 @@ def forward(
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -325,6 +326,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         ]
 
         params_dict = dict(self.named_parameters(remove_duplicate=False))
+        loaded_params: set[str] = set()
         for name, loaded_weight in weights:
             if is_pp_missing_parameter(name, self):
                 continue
@@ -347,6 +349,8 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
                 weight_loader = getattr(param, "weight_loader",
                                         default_weight_loader)
                 weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        return loaded_params
 
 
 class Olmo2ForCausalLM(nn.Module, SupportsPP):
@@ -403,19 +407,7 @@ def compute_logits(
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
         loader = AutoWeightsLoader(
             self,
-            skip_prefixes=([
-                "rotary_emb.inv_freq",
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                "rotary_emb.cos_cached",
-                "rotary_emb.sin_cached",
-                "lm_head.weight"
-            ] if self.config.tie_word_embeddings else [
-                "rotary_emb.inv_freq",
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                "rotary_emb.cos_cached",
-                "rotary_emb.sin_cached"
-            ]),
+            skip_prefixes=(["lm_head.weight"]
+                           if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py
index 9a07f57fd999..6364b89fb837 100644
--- a/vllm/model_executor/models/olmoe.py
+++ b/vllm/model_executor/models/olmoe.py
@@ -442,8 +442,5 @@ def compute_logits(self, hidden_states: torch.Tensor,
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(
-            self,
-            skip_prefixes=["rotary_emb.inv_freq"],
-        )
+        loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index 1ccd1fe1f741..da2a194e6bdf 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -344,14 +344,5 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(
-            self,
-            skip_prefixes=([
-                "rotary_emb.inv_freq",
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                "rotary_emb.cos_cached",
-                "rotary_emb.sin_cached"
-            ]),
-        )
+        loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index bb4d46be3f99..b757e661d771 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -14,10 +14,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import re
 from collections.abc import Iterable, Mapping, Sequence
 from typing import Any, Literal, Optional, TypedDict, Union
 
+import regex as re
 import torch
 import torch.nn as nn
 from transformers import (BatchFeature, CLIPVisionConfig, PretrainedConfig,
diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py
index b7bb3c45c633..418ff900ffd5 100644
--- a/vllm/model_executor/models/phi4mm.py
+++ b/vllm/model_executor/models/phi4mm.py
@@ -1228,9 +1228,7 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> None:
-        weights = ((name, data) for name, data in weights
-                   if "lora" not in name)
-        loader = AutoWeightsLoader(self)
+        loader = AutoWeightsLoader(self, skip_substrs=["lora"])
         return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
 
     def get_mm_mapping(self) -> MultiModelKeys:
diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py
index 7f2e9fdf7c4e..d9917c26d1b1 100644
--- a/vllm/model_executor/models/phimoe.py
+++ b/vllm/model_executor/models/phimoe.py
@@ -660,8 +660,5 @@ def compute_logits(self, hidden_states: torch.Tensor,
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(
-            self,
-            skip_prefixes=(["rotary_emb.inv_freq"]),
-        )
+        loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py
index c664d2371e27..9f28d4cef425 100644
--- a/vllm/model_executor/models/pixtral.py
+++ b/vllm/model_executor/models/pixtral.py
@@ -9,7 +9,9 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from mistral_common.protocol.instruct.messages import ImageChunk
+from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
+                                                       UserMessage)
+from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from mistral_common.tokens.tokenizers.multimodal import ImageEncoder
 from PIL import Image
 from transformers import PixtralVisionConfig, TensorType
@@ -39,7 +41,7 @@
                                         BaseProcessingInfo, MultiModalHashes,
                                         PromptReplacement, PromptUpdate,
                                         PromptUpdateDetails)
-from vllm.multimodal.profiling import BaseDummyInputsBuilder
+from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs
 from vllm.sequence import IntermediateTensors
 from vllm.transformers_utils.tokenizer import (MistralTokenizer,
                                                cached_tokenizer_from_config)
@@ -65,14 +67,14 @@ class PixtralImagePixelInputs(TypedDict):
     """
     Shape: `(batch_size * num_images, num_channels, image_width, image_height)`
 
-    The result of stacking {attr}`ImageEncoding.tokens` from each prompt.
+    The result of stacking `ImageEncoding.tokens` from each prompt.
     """
 
 
 class PixtralProcessorAdapter:
     """
     Provide a HF-compatible interface for
-    {class}`mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
+    `mistral_common.tokens.tokenizers.multimodal.ImageEncoder`.
     """
 
     def __init__(self, tokenizer: MistralTokenizer) -> None:
@@ -224,6 +226,28 @@ def get_dummy_mm_data(
                                    num_images=num_images)
         }
 
+    def get_dummy_processor_inputs(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> ProcessorInputs:
+        """Build dummy inputs tokenized by the Mistral chat encoder."""
+        tokenizer = self.info.get_tokenizer()
+        dummy_text = self.get_dummy_text(mm_counts)
+        dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts)
+        dummy_images = dummy_mm_data.get("image", [])
+
+        request = ChatCompletionRequest(messages=[
+            UserMessage(content=[
+                TextChunk(text=dummy_text),
+                *(ImageChunk(image=image) for image in dummy_images),
+            ]),
+        ])
+        res = tokenizer.mistral.encode_chat_completion(request)
+        dummy_tokens = res.tokens
+
+        return ProcessorInputs(prompt=dummy_tokens, mm_data=dummy_mm_data)
+
 
 class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
                                  ):
@@ -275,8 +299,12 @@ def _cached_apply_hf_processor(
         *,
         return_mm_hashes: bool,
     ) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
-        prompt_ids, mm_kwargs, mm_hashes, _ = super(
-        )._cached_apply_hf_processor(
+        (
+            prompt_ids,
+            mm_kwargs,
+            mm_hashes,
+            _,
+        ) = super()._cached_apply_hf_processor(
             prompt=prompt,
             mm_data_items=mm_data_items,
             hf_processor_mm_kwargs=hf_processor_mm_kwargs,
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index 7cf98dc7a4ea..143b9f98b029 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -535,8 +535,5 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(
-            self,
-            skip_prefixes=(["rotary_emb.inv_freq"]),
-        )
+        loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py
index aae5401721df..8a4c2850dda3 100644
--- a/vllm/model_executor/models/qwen3_moe.py
+++ b/vllm/model_executor/models/qwen3_moe.py
@@ -530,8 +530,5 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(
-            self,
-            skip_prefixes=(["rotary_emb.inv_freq"]),
-        )
+        loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py
index 3701153bace5..f5d242fdf1c2 100644
--- a/vllm/model_executor/models/qwen_vl.py
+++ b/vllm/model_executor/models/qwen_vl.py
@@ -7,12 +7,12 @@
 
 import copy
 import math
-import re
 import unicodedata
 from collections.abc import Collection, Mapping, Sequence, Set
 from functools import lru_cache, partial
 from typing import Callable, Literal, Optional, TypedDict, Union
 
+import regex as re
 import torch
 from torch import nn
 from torchvision import transforms
@@ -382,7 +382,8 @@ def _get_tokenizer_without_image_pad(
         tokenizer: PreTrainedTokenizer) -> PreTrainedTokenizer:
     """
     The logic of adding image pad tokens should only be applied in
-    {class}`QwenVLProcessor`, so they are patched out here.
+    [`QwenVLProcessor`][vllm.model_executor.models.qwen_vl.QwenVLProcessor],
+    so they are patched out here.
 
     The definition of the wrapped tokenizer can be found here:
     https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py
diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py
index c55f7ccd344f..97ea12de6537 100644
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -79,6 +79,7 @@
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "MambaForCausalLM": ("mamba", "MambaForCausalLM"),
     "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"),
+    "FalconH1ForCausalLM":("falcon_h1", "FalconH1ForCausalLM"),
     "Mamba2ForCausalLM": ("mamba2", "Mamba2ForCausalLM"),
     "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
     "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"),
@@ -207,6 +208,7 @@
     "Qwen2_5_VLForConditionalGeneration": ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"),  # noqa: E501
     "Qwen2AudioForConditionalGeneration": ("qwen2_audio", "Qwen2AudioForConditionalGeneration"),  # noqa: E501
     "Qwen2_5OmniModel": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
+    "Qwen2_5OmniForConditionalGeneration": ("qwen2_5_omni_thinker", "Qwen2_5OmniThinkerForConditionalGeneration"),  # noqa: E501
     "UltravoxModel": ("ultravox", "UltravoxModel"),
     "Phi4MMForCausalLM": ("phi4mm", "Phi4MMForCausalLM"),
     # [Encoder-decoder]
@@ -381,7 +383,7 @@ def register_model(
 
         `model_cls` can be either:
 
-        - A {class}`torch.nn.Module` class directly referencing the model.
+        - A [`torch.nn.Module`][] class directly referencing the model.
         - A string in the format `<module>:<class>` which can be used to
           lazily import the model. This is useful to avoid initializing CUDA
           when importing the model and thus the related error
diff --git a/vllm/model_executor/models/skyworkr1v.py b/vllm/model_executor/models/skyworkr1v.py
index 91f6c7753c68..eefadda918f6 100644
--- a/vllm/model_executor/models/skyworkr1v.py
+++ b/vllm/model_executor/models/skyworkr1v.py
@@ -24,6 +24,7 @@
                                                    InternVisionPatchModel)
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
                                     MultiModalKwargs, NestedTensors)
 from vllm.multimodal.parse import (ImageEmbeddingItems, ImageProcessorItems,
@@ -78,7 +79,7 @@ class SkyworkR1VImageEmbeddingInputs(TypedDict):
 def build_transform(input_size: int):
     MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
     return T.Compose([
-        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+        T.Lambda(lambda img: convert_image_mode(img, 'RGB')),
         T.Resize((input_size, input_size),
                  interpolation=T.InterpolationMode.BICUBIC),
         T.ToTensor(),
diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py
index 8240c3fa3557..fcd17cc1c2ba 100644
--- a/vllm/model_executor/models/solar.py
+++ b/vllm/model_executor/models/solar.py
@@ -501,14 +501,5 @@ def compute_logits(self, hidden_states: torch.Tensor,
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(
-            self,
-            skip_prefixes=([
-                "rotary_emb.inv_freq",
-                # Models trained using ColossalAI may include these tensors in
-                # the checkpoint. Skip them.
-                "rotary_emb.cos_cached",
-                "rotary_emb.sin_cached"
-            ]),
-        )
+        loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 8c2ad6f19251..86ce813ddf3d 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -338,13 +338,5 @@ def compute_logits(
 
     def load_weights(self, weights: Iterable[tuple[str,
                                                    torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(
-            self,
-            # Models trained using ColossalAI may include these tensors in
-            # the checkpoint. Skip them.
-            skip_prefixes=[
-                "rotary_emb.inv_freq", "rotary_emb.cos_cached",
-                "rotary_emb.sin_cached"
-            ],
-        )
+        loader = AutoWeightsLoader(self)
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 5927afa91f49..f4ba5a8030e5 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -349,8 +349,7 @@ def load_weights(self, weights: Iterable[tuple[str,
             self,
             # Models trained using ColossalAI may include these tensors in
             # the checkpoint. Skip them.
-            skip_prefixes=([
-                "rotary_emb.inv_freq", "lm_head.weight"
-            ] if self.config.tie_word_embeddings else ["rotary_emb.inv_freq"]),
+            skip_prefixes=(["lm_head.weight"]
+                           if self.config.tie_word_embeddings else None),
         )
         return loader.load_weights(weights)
diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py
index a8f30b2f27bf..b87a2ebf211a 100644
--- a/vllm/model_executor/models/transformers.py
+++ b/vllm/model_executor/models/transformers.py
@@ -14,10 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Wrapper around `transformers` models"""
-import re
 from collections.abc import Iterable
+from contextlib import nullcontext
 from typing import Literal, Optional, Union
 
+import regex as re
 import torch
 from torch import nn
 from transformers import AutoModel, PretrainedConfig, PreTrainedModel
@@ -110,6 +111,33 @@ def replace_linear_class(
     )
 
 
+class ConfigOverride:
+    """Temporarily override config attributes, restoring them on exit."""
+
+    def __init__(self, config: PretrainedConfig, **kwargs) -> None:
+        self.config = config
+        self.kwargs = kwargs
+        self.kwargs_original: dict[str, object] = {}  # pre-override values
+        self.kwargs_delete: set[str] = set()  # keys absent before __enter__
+
+    def __enter__(self) -> PretrainedConfig:
+        """Apply the overrides, saving each original value first."""
+        for key, value in self.kwargs.items():
+            if not hasattr(self.config, key):
+                self.kwargs_delete.add(key)
+            self.kwargs_original[key] = getattr(self.config, key, None)
+            setattr(self.config, key, value)
+        return self.config
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        """Restore saved values and delete attributes that were newly added."""
+        for key, value in self.kwargs_original.items():
+            if key in self.kwargs_delete:
+                delattr(self.config, key)
+            else:
+                setattr(self.config, key, value)
+
+
 class TransformersModel(nn.Module):
 
     def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
@@ -135,8 +163,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.pp_rank = self.pp_group.rank_in_group
         self.tp_size = get_tensor_model_parallel_world_size()
 
+        # vLLM handles interleaved sliding window attention by creating a new
+        # interleaved_sliding_window attribute and deleting the sliding_window
+        # attribute. This breaks the constructors in Transformers so we
+        # temporarily add the attribute back to construct the model.
+        config_override = nullcontext()
+        if hasattr(config, "interleaved_sliding_window"):
+            config_override = ConfigOverride(
+                config, sliding_window=config.interleaved_sliding_window)
+
         # Use meta device to delay allocating GPU tensors
-        with torch.device("meta"):
+        with torch.device("meta"), config_override:
             # FIXME(Isotr0py): We need to refactor this part in the future to
             # avoid registering an extra model layer, otherwise we will need a
             # weights mapper to rename weights.
@@ -262,9 +299,17 @@ def create_attention_instances(self) -> dict[int, Attention]:
         num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config)
         start, end = get_pp_indices(self.config.num_hidden_layers,
                                     self.pp_rank, self.pp_size)
-        return {
-            i:
-            Attention(
+
+        attention_instances = {}
+        for i in range(start, end):
+            # Handle interleaved sliding window attention
+            sliding_window = None
+            if (hasattr(self.config, "interleaved_sliding_window")
+                    and hasattr(self.config, "sliding_window_pattern")
+                    and ((i + 1) % self.config.sliding_window_pattern > 0)):
+                sliding_window = self.config.interleaved_sliding_window
+
+            attention_instances[i] = Attention(
                 num_heads=num_heads,
                 head_size=head_size,
                 # NOTE: We use Llama scale as default, if it's set by
@@ -273,9 +318,9 @@ def create_attention_instances(self) -> dict[int, Attention]:
                 num_kv_heads=num_kv_heads,
                 cache_config=self.cache_config,
                 quant_config=self.quant_config,
+                per_layer_sliding_window=sliding_window,
                 prefix=f"{i}.attn")
-            for i in range(start, end)
-        }
+        return attention_instances
 
     def init_buffers(self, module: nn.Module):
         """
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 5cc501622891..3d821d3dc6b5 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -66,7 +66,7 @@ def apply(
 
 class AutoWeightsLoader:
     """
-    Helper class to load weights into a {class}`torch.nn.Module`. It is able
+    Helper class to load weights into a [`torch.nn.Module`][]. It is able
     to automatically detect child modules and parameters while iterating over
     the weights only once.
 
@@ -80,18 +80,30 @@ class AutoWeightsLoader:
     environment variable ``VLLM_LOGGING_LEVEL=DEBUG``.
     """
 
+    # Models trained with early versions of ColossalAI may include
+    # these tensors in their checkpoints. Skip them.
+    ROTARY_EMBEDS_UNUSED_WEIGHTS = [
+        "rotary_emb.inv_freq",
+        "rotary_emb.cos_cached",
+        "rotary_emb.sin_cached",
+    ]
+
     def __init__(
         self,
         module: nn.Module,
         *,
         skip_prefixes: Optional[list[str]] = None,
+        skip_substrs: Optional[list[str]] = None,
         ignore_unexpected_prefixes: Optional[list[str]] = None,
     ) -> None:
         super().__init__()
 
         self.module = module
         self.skip_prefixes = skip_prefixes or []
+        self.skip_substrs = list(skip_substrs or [])
         self.ignore_unexpected_prefixes = ignore_unexpected_prefixes or []
+        # Add the default substrs to our copy, never to the caller's list.
+        self.skip_substrs += self.ROTARY_EMBEDS_UNUSED_WEIGHTS
 
     def _groupby_prefix(
         self,
@@ -119,7 +131,8 @@ def _get_qualname(self, prefix: str, rest: str) -> str:
         return ".".join((prefix, rest))
 
     def _can_skip(self, qualname: str) -> bool:
-        return any(qualname.startswith(p) for p in self.skip_prefixes)
+        return (any(qualname.startswith(p) for p in self.skip_prefixes)
+                or any(substr in qualname for substr in self.skip_substrs))
 
     def _can_ignore_unexpected(self, qualname: str) -> bool:
         return any(
@@ -257,6 +270,9 @@ def load_weights(
     ) -> set[str]:
         if mapper is not None:
             weights = mapper.apply(weights)
+        # Drop weights whose name matches a skip prefix or skip substring.
+        weights = ((name, weight) for name, weight in weights
+                   if not self._can_skip(name))
 
         autoloaded_weights = set(self._load_module("", self.module, weights))
         return autoloaded_weights
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 756ea11311da..815e34d5ac5d 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -8,12 +8,12 @@
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
 """
-The global {class}`~MultiModalRegistry` is used by model runners to
-dispatch data processing according to the target model.
+The global [`MultiModalRegistry`][vllm.multimodal.registry.MultiModalRegistry]
+is used by model runners to dispatch data processing according to the target
+model.
 
-:::{seealso}
-{ref}`mm-processing`
-:::
+Info:
+    [mm_processing](../../../design/mm_processing.html)
 """
 
 __all__ = [
diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py
index 53e289370a9f..b4cd6a90834c 100644
--- a/vllm/multimodal/hasher.py
+++ b/vllm/multimodal/hasher.py
@@ -10,6 +10,7 @@
 from PIL import Image
 
 from vllm.logger import init_logger
+from vllm.multimodal.image import convert_image_mode
 
 if TYPE_CHECKING:
     from vllm.inputs import TokensPrompt
@@ -35,7 +36,8 @@ def serialize_item(cls, obj: object) -> bytes:
             return np.array(obj).tobytes()
 
         if isinstance(obj, Image.Image):
-            return cls.item_to_bytes("image", np.array(obj.convert("RGBA")))
+            return cls.item_to_bytes(
+                "image", np.asarray(convert_image_mode(obj, "RGBA")))
         if isinstance(obj, torch.Tensor):
             return cls.item_to_bytes("tensor", obj.numpy())
         if isinstance(obj, np.ndarray):
@@ -43,7 +45,7 @@ def serialize_item(cls, obj: object) -> bytes:
                 "ndarray", {
                     "dtype": obj.dtype.str,
                     "shape": obj.shape,
-                    "data": obj.data.tobytes(),
+                    "data": obj.tobytes(),
                 })
 
         logger.warning(
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 939928bbf108..a63ec0bd8ada 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -22,6 +22,25 @@ def rescale_image_size(image: Image.Image,
     return image
 
 
+# TODO: Support customizable background color to fill in.
+def rgba_to_rgb(
+    image: Image.Image, background_color=(255, 255, 255)) -> Image.Image:
+    """Convert an RGBA image to RGB with filled background color."""
+    assert image.mode == "RGBA"
+    converted = Image.new("RGB", image.size, background_color)
+    converted.paste(image, mask=image.split()[3])  # 3 is the alpha channel
+    return converted
+
+
+def convert_image_mode(image: Image.Image, to_mode: str) -> Image.Image:
+    if image.mode == to_mode:
+        return image
+    elif image.mode == "RGBA" and to_mode == "RGB":
+        return rgba_to_rgb(image)
+    else:
+        return image.convert(to_mode)
+
+
 class ImageMediaIO(MediaIO[Image.Image]):
 
     def __init__(self, *, image_mode: str = "RGB") -> None:
@@ -32,7 +51,7 @@ def __init__(self, *, image_mode: str = "RGB") -> None:
     def load_bytes(self, data: bytes) -> Image.Image:
         image = Image.open(BytesIO(data))
         image.load()
-        return image.convert(self.image_mode)
+        return convert_image_mode(image, self.image_mode)
 
     def load_base64(self, media_type: str, data: str) -> Image.Image:
         return self.load_bytes(base64.b64decode(data))
@@ -40,7 +59,7 @@ def load_base64(self, media_type: str, data: str) -> Image.Image:
     def load_file(self, filepath: Path) -> Image.Image:
         image = Image.open(filepath)
         image.load()
-        return image.convert(self.image_mode)
+        return convert_image_mode(image, self.image_mode)
 
     def encode_base64(
         self,
@@ -51,7 +70,7 @@ def encode_base64(
         image = media
 
         with BytesIO() as buffer:
-            image = image.convert(self.image_mode)
+            image = convert_image_mode(image, self.image_mode)
             image.save(buffer, image_format)
             data = buffer.getvalue()
 
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 2335af843ed5..600a34d39ef6 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -29,14 +29,14 @@
 
 HfImageItem: TypeAlias = Union["Image", np.ndarray, "torch.Tensor"]
 """
-A {class}`transformers.image_utils.ImageInput` representing a single image
+A `transformers.image_utils.ImageInput` representing a single image
 item, which can be passed to a HuggingFace `ImageProcessor`.
 """
 
 HfVideoItem: TypeAlias = Union[list["Image"], np.ndarray, "torch.Tensor",
                                list[np.ndarray], list["torch.Tensor"]]
 """
-A {class}`transformers.image_utils.VideoInput` representing a single video
+A `transformers.image_utils.VideoInput` representing a single video
 item, which can be passed to a HuggingFace `VideoProcessor`.
 """
 
@@ -48,7 +48,7 @@
 
 ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor"]
 """
-A {class}`transformers.image_utils.ImageInput` representing a single image
+A `transformers.image_utils.ImageInput` representing a single image
 item, which can be passed to a HuggingFace `ImageProcessor`.
 
 Alternatively, a 3-D tensor or batch of 2-D tensors,
@@ -58,7 +58,7 @@
 
 VideoItem: TypeAlias = Union[HfVideoItem, "torch.Tensor"]
 """
-A {class}`transformers.image_utils.VideoInput` representing a single video
+A `transformers.image_utils.VideoInput` representing a single video
 item, which can be passed to a HuggingFace `VideoProcessor`.
 
 Alternatively, a 3-D tensor or batch of 2-D tensors,
@@ -108,7 +108,8 @@ class MultiModalDataBuiltins(TypedDict, total=False):
 """
 A dictionary containing an entry for each modality type to input.
 
-The built-in modalities are defined by {class}`MultiModalDataBuiltins`.
+The built-in modalities are defined by
+[`MultiModalDataBuiltins`][vllm.multimodal.inputs.MultiModalDataBuiltins].
 """
 
 
@@ -169,7 +170,8 @@ def __eq__(self, other: object) -> bool:
 
 
 def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
-    """Equality check between {data}`NestedTensors` objects."""
+    """Equality check between
+    [`NestedTensors`][vllm.multimodal.inputs.NestedTensors] objects."""
     if isinstance(a, torch.Tensor):
         return isinstance(b, torch.Tensor) and torch.equal(a, b)
     elif isinstance(b, torch.Tensor):
@@ -189,7 +191,7 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
 BatchedTensorInputs: TypeAlias = Mapping[str, NestedTensors]
 """
 A dictionary containing nested tensors which have been batched via
-{meth}`MultiModalKwargs.batch`.
+[`MultiModalKwargs.batch`][vllm.multimodal.inputs.MultiModalKwargs.batch].
 """
 
 
@@ -197,7 +199,7 @@ def nested_tensors_equal(a: NestedTensors, b: NestedTensors) -> bool:
 class MultiModalFieldElem:
     """
     Represents a keyword argument corresponding to a multi-modal item
-    in {class}`MultiModalKwargs`.
+    in [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs].
     """
 
     modality: str
@@ -208,13 +210,15 @@ class MultiModalFieldElem:
 
     key: str
     """
-    The key of this field in {class}`MultiModalKwargs`,
+    The key of this field in
+    [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs],
     i.e. the name of the keyword argument to be passed to the model.
     """
 
     data: NestedTensors
     """
-    The tensor data of this field in {class}`MultiModalKwargs`,
+    The tensor data of this field in
+    [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs],
     i.e. the value of the keyword argument to be passed to the model.
     """
 
@@ -237,7 +241,8 @@ def __eq__(self, other: object) -> bool:
 class BaseMultiModalField(ABC):
     """
     Defines how to interpret tensor data belonging to a keyword argument in
-    {class}`MultiModalKwargs` for multiple multi-modal items, and vice versa.
+    [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs] for multiple
+    multi-modal items, and vice versa.
     """
 
     def _field_factory(self, *, modality: str, key: str):
@@ -262,10 +267,12 @@ def build_elems(
         data: NestedTensors,
     ) -> Sequence[MultiModalFieldElem]:
         """
-        Construct {class}`MultiModalFieldElem` instances to represent
-        the provided data.
+        Construct
+        [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem]
+        instances to represent the provided data.
 
-        This is the inverse of {meth}`reduce_data`.
+        This is the inverse of
+        [`reduce_data`][vllm.multimodal.inputs.BaseMultiModalField.reduce_data].
         """
         raise NotImplementedError
 
@@ -275,9 +282,11 @@ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
 
     def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors:
         """
-        Merge the data from multiple instances of {class}`MultiModalFieldElem`.
+        Merge the data from multiple instances of
+        [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem].
 
-        This is the inverse of {meth}`build_elems`.
+        This is the inverse of
+        [`build_elems`][vllm.multimodal.inputs.BaseMultiModalField.build_elems].
         """
         field_types = [type(item.field) for item in elems]
         if len(set(field_types)) > 1:
@@ -289,9 +298,8 @@ def reduce_data(self, elems: list[MultiModalFieldElem]) -> NestedTensors:
 @dataclass(frozen=True)
 class MultiModalBatchedField(BaseMultiModalField):
     """
-    :::{seealso}
-    {func}`MultiModalFieldConfig.batched`
-    :::
+    Info:
+        [`MultiModalFieldConfig.batched`][vllm.multimodal.inputs.MultiModalFieldConfig.batched]
     """
 
     def build_elems(
@@ -320,10 +328,9 @@ def _reduce_data(self, batch: list[NestedTensors]) -> NestedTensors:
 @dataclass(frozen=True)
 class MultiModalFlatField(BaseMultiModalField):
     """
-    :::{seealso}
-    {func}`MultiModalFieldConfig.flat`
-    {func}`MultiModalFieldConfig.flat_from_sizes`
-    :::
+    Info:
+        [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat]
+        [`MultiModalFieldConfig.flat_from_sizes`][vllm.multimodal.inputs.MultiModalFieldConfig.flat_from_sizes]
     """
     slices: Union[Sequence[slice], Sequence[Sequence[slice]]]
     dim: int = 0
@@ -363,9 +370,8 @@ def _expect_same_shape(tensor: torch.Tensor):
 @dataclass(frozen=True)
 class MultiModalSharedField(BaseMultiModalField):
     """
-    :::{seealso}
-    {func}`MultiModalFieldConfig.shared`
-    :::
+    Info:
+        [`MultiModalFieldConfig.shared`][vllm.multimodal.inputs.MultiModalFieldConfig.shared]
     """
     batch_size: int
 
@@ -510,9 +516,8 @@ def flat_from_sizes(modality: str,
             Element 3: [[C],[C]]
         ```
 
-        :::{seealso}
-        {func}`MultiModalFieldConfig.flat`
-        :::
+        Info:
+            [`MultiModalFieldConfig.flat`][vllm.multimodal.inputs.MultiModalFieldConfig.flat]
         """
 
         if size_per_item.ndim != 1:
@@ -576,8 +581,10 @@ def build_elems(
 
 class MultiModalKwargsItem(UserDict[str, MultiModalFieldElem]):
     """
-    A collection of {class}`MultiModalFieldElem`
-    corresponding to a data item in {class}`MultiModalDataItems`.
+    A collection of
+    [`MultiModalFieldElem`][vllm.multimodal.inputs.MultiModalFieldElem]
+    corresponding to a data item in
+    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
     """
 
     @staticmethod
@@ -596,11 +603,13 @@ def modality(self) -> str:
 class MultiModalKwargs(UserDict[str, NestedTensors]):
     """
     A dictionary that represents the keyword arguments to
-    {meth}`~torch.nn.Module.forward`.
+    [`torch.nn.Module.forward`][].
 
     The metadata `items` enables us to obtain the keyword arguments
-    corresponding to each data item in {class}`MultiModalDataItems`, via
-    {meth}`get_item` and {meth}`get_items`.
+    corresponding to each data item in
+    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems], via
+    [`get_item`][vllm.multimodal.inputs.MultiModalKwargs.get_item] and
+    [`get_items`][vllm.multimodal.inputs.MultiModalKwargs.get_items].
     """
 
     @staticmethod
@@ -639,7 +648,9 @@ def from_hf_inputs(
 
     @staticmethod
     def from_items(items: Sequence[MultiModalKwargsItem]):
-        """Construct a new {class}`MultiModalKwargs` from multiple items."""
+        """Construct a new
+        [`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs]
+        from multiple items."""
         elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
         for item in items:
             for key, elem in item.items():
@@ -735,11 +746,17 @@ def as_kwargs(
         batched_inputs: BatchedTensorInputs,
         *,
         device: torch.types.Device,
+        dtype: Optional[torch.dtype] = None,
     ) -> BatchedTensorInputs:
         json_inputs = cast(JSONTree[torch.Tensor], batched_inputs)
 
+        def maybe_cast_dtype(x: torch.Tensor):
+            # This mimics the behavior of transformers.BatchFeature
+            return x.to(dtype=dtype) if x.is_floating_point() else x
+
         json_mapped = json_map_leaves(
-            lambda x: x.to(device, non_blocking=True),
+            # NOTE: Cast the dtype before sending it to device
+            lambda x: maybe_cast_dtype(x).to(device=device, non_blocking=True),
             json_inputs,
         )
 
@@ -804,7 +821,7 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]:
 class MultiModalInputs(TypedDict):
     """
     Represents the outputs of
-    {class}`vllm.multimodal.processing.BaseMultiModalProcessor`,
+    [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor],
     ready to be passed to vLLM internals.
     """
 
@@ -840,7 +857,8 @@ class MultiModalInputs(TypedDict):
 
 class MultiModalEncDecInputs(MultiModalInputs):
     """
-    Represents the outputs of {class}`vllm.multimodal.EncDecMultiModalProcessor`
+    Represents the outputs of
+    [`EncDecMultiModalProcessor`][vllm.multimodal.processing.EncDecMultiModalProcessor]
     ready to be passed to vLLM internals.
     """
 
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 6e9ec9555802..63af842747a5 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -28,7 +28,8 @@
 
 class ModalityDataItems(ABC, Generic[_T, _I]):
     """
-    Represents data items for a modality in {class}`MultiModalDataItems`.
+    Represents data items for a modality in
+    [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
     """
 
     def __init__(self, data: _T, modality: str) -> None:
@@ -251,15 +252,15 @@ def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
 
 class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
     """
-    As {data}`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
-    such that each entry corresponds to a list.
+    As [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict], but
+    normalized such that each entry corresponds to a list.
     """
 
     def get_count(self, modality: str, *, strict: bool = True) -> int:
         """
         Get the number of data items belonging to a modality.
 
-        If `strict=False`, return `0` instead of raising {exc}`KeyError`
+        If `strict=False`, return `0` instead of raising [`KeyError`][]
         even if the modality is not found.
         """
         if modality not in self:
@@ -305,8 +306,8 @@ def get_items(
 
 class MultiModalDataParser:
     """
-    Parses {data}`~vllm.multimodal.inputs.MultiModalDataDict` into
-    {class}`MultiModalDataItems`.
+    Parses [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict]
+    into [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems].
 
     Args:
         target_sr (float, optional): Enables automatic resampling of audio
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 320a26f37555..aa7914e40cbf 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 import json
-import re
 import sys
 from abc import ABC, abstractmethod
 from collections import defaultdict
@@ -12,6 +11,7 @@
 from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol,
                     TypeVar, Union, cast)
 
+import regex as re
 import torch
 from typing_extensions import assert_never
 
@@ -114,13 +114,14 @@ class PromptUpdateDetails(Generic[_S]):
 
     is_embed: Optional[Callable[["_BoundPromptSequence"], torch.Tensor]] = None
     """
-    Given {attr}`full`, return a boolean mask of shape `(len(full),)`
-    indicating which positions of `full` to assign embeddings to.
+    Given [`full`][vllm.multimodal.processing.PromptUpdateDetails.full],
+    return a boolean mask of shape `(len(full),)` indicating which positions
+    of `full` to assign embeddings to.
 
     `None` (default) means to assign embeddings to all positions of `full`.
 
     The embeddings are obtained by calling
-    {class}`SupportsMultiModal.get_multimodal_embeddings`.
+    [`SupportsMultiModal.get_multimodal_embeddings`][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings].
     """
 
     @staticmethod
@@ -159,13 +160,15 @@ def select_token_id(
 The token sequence or text that are part of the update.
 
 If only part of the content corresponds to feature placeholders, you can
-use {class}`PromptUpdateDetails` to specify which part.
+use [`PromptUpdateDetails`][vllm.multimodal.processing.PromptUpdateDetails] to
+specify which part.
 """
 
 PromptUpdateContent = Union[Callable[[int], PromptUpdateInfo],
                             PromptUpdateInfo]
 """
-Given the index of the processed item within {attr}`modality`,
+Given the index of the processed item within
+[`modality`][vllm.multimodal.processing.PromptUpdate.modality],
 output the corresponding token sequence (or text).
 
 For convenience, you can directly pass in the token sequence (or text)
@@ -260,8 +263,10 @@ class PromptInsertion(PromptUpdate):
 
     insertion: PromptUpdateContent = field(repr=False)
     """
-    Given the index of the processed item within {attr}`modality`,
-    output the token sequence (or text) to insert right after {attr}`target`.
+    Given the index of the processed item within
+    [`modality`][vllm.multimodal.processing.PromptUpdate.modality],
+    output the token sequence (or text) to insert right after
+    [`target`][vllm.multimodal.processing.PromptUpdate.target].
 
     For convenience, you can directly pass in the token sequence (or text)
     instead of a function if it does not depend on the input.
@@ -332,8 +337,10 @@ class PromptReplacement(PromptUpdate):
 
     replacement: PromptUpdateContent = field(repr=False)
     """
-    Given the index of the processed item within {attr}`modality`,
-    output the token sequence (or text) to replace {attr}`target`.
+    Given the index of the processed item within
+    [`modality`][vllm.multimodal.processing.PromptUpdate.modality],
+    output the token sequence (or text) to replace
+    [`target`][vllm.multimodal.processing.PromptUpdate.target].
 
     For convenience, you can directly pass in the token sequence (or text)
     instead of a function if it does not depend on the input.
@@ -387,14 +394,16 @@ def modality(self) -> str:
 
 
 def full_groupby_modality(values: Iterable[_M]) -> ItemsView[str, list[_M]]:
-    """Convenience function to apply {func}`full_groupby` based on modality."""
+    """Convenience function to apply [`full_groupby`][vllm.utils.full_groupby]
+    based on modality."""
     return full_groupby(values, key=lambda x: x.modality)
 
 
 @dataclass
 class _BoundPromptSequence:
     """
-    A {data}`_PromptSeq` bound to a tokenizer to automatically
+    A [`PromptSeq`][vllm.multimodal.processing.PromptSeq] bound
+    to a tokenizer to automatically
     convert between token sequence and text representations.
     """
     tokenizer: AnyTokenizer = field(repr=False)
@@ -446,9 +455,11 @@ class _BoundPromptContent:
 @dataclass
 class BoundPromptUpdate:
     """
-    A {class}`PromptUpdate` bound to a tokenizer to automatically convert
-    {attr}`target` and the result of {meth}`get_content` between
-    token sequence and text representations.
+    A [`PromptUpdate`][vllm.multimodal.processing.PromptUpdate] bound
+    to a tokenizer to automatically convert
+    [`target`][vllm.multimodal.processing.PromptUpdate.target] and the result of
+    [`get_content`][vllm.multimodal.processing.BoundPromptUpdate.get_content]
+    between token sequence and text representations.
     """
     _origin: PromptUpdate
     tokenizer: AnyTokenizer = field(repr=False)
@@ -482,7 +493,8 @@ def mode(self) -> UpdateMode:
 
     def get_content(self, item_idx: int) -> _BoundPromptContent:
         """
-        Given the index of the processed item within {attr}`modality`,
+        Given the index of the processed item within
+        [`modality`][vllm.multimodal.processing.PromptUpdate.modality],
         output the token sequence (or text) to update.
         """
         content = self.content
@@ -1019,7 +1031,8 @@ def put(
     ) -> None:
         """
         Put a processed multi-modal item into the cache
-        according to its dependencies (see {meth}`get`).
+        according to its dependencies
+        (see [`get`][vllm.multimodal.processing.ProcessingCache.get]).
         """
         cache_key = MultiModalHasher.hash_kwargs(model_id=model_id,
                                                  **{modality: input_item},
@@ -1091,7 +1104,8 @@ def get_allowed_mm_limits(self) -> Mapping[str, int]:
 
 MultiModalHashes = dict[str, list[str]]
 """
-A collection of hashes with a similar structure as {class}`MultiModalKwargs`.
+A collection of hashes with a similar structure as
+[`MultiModalKwargs`][vllm.multimodal.inputs.MultiModalKwargs].
 """
 
 
@@ -1099,7 +1113,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
     """
     Abstract base class to process multi-modal inputs to be used in vLLM.
 
-    Not to be confused with {class}`transformers.ProcessorMixin`.
+    Not to be confused with `transformers.ProcessorMixin`.
     """
 
     def __init__(self,
@@ -1126,10 +1140,12 @@ def __call__(
     def _get_data_parser(self) -> MultiModalDataParser:
         """
         Construct a parser to preprocess multi-modal data items
-        before passing them to {meth}`_get_hf_mm_data`.
+        before passing them to
+        [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
 
         You can support additional modalities by creating a subclass
-        of {class}`MultiModalDataParser` that has additional subparsers.
+        of [`MultiModalDataParser`][vllm.multimodal.parse.MultiModalDataParser]
+        that has additional subparsers.
         """
         return MultiModalDataParser()
 
@@ -1138,8 +1154,11 @@ def _to_mm_items(
         mm_data: MultiModalDataDict,
     ) -> MultiModalDataItems:
         """
-        Normalize {class}`MultiModalDataDict` to {class}`MultiModalDataItems`
-        before passing them to {meth}`_get_hf_mm_data`.
+        Normalize
+        [`MultiModalDataDict`][vllm.multimodal.inputs.MultiModalDataDict]
+        to [`MultiModalDataItems`][vllm.multimodal.parse.MultiModalDataItems]
+        before passing them to
+        [`_get_hf_mm_data`][vllm.multimodal.processing.BaseMultiModalProcessor._get_hf_mm_data].
         """
         mm_items = self.data_parser.parse_mm_data(mm_data)
         supported_mm_limits = self.info.get_supported_mm_limits()
@@ -1191,7 +1210,8 @@ def _get_prompt_updates(
         inputs.
 
         Moreover, this information is critical to determine the token positions
-        in order to construct  {class}`~vllm-multimodal.input.PlaceholderRange`
+        in order to construct
+        [`PlaceholderRange`][vllm.multimodal.inputs.PlaceholderRange]
         for each multi-modal item.
         """
         raise NotImplementedError
@@ -1315,7 +1335,9 @@ def _apply_hf_processor_tokens_only(
         Most HF processors accept prompt text but not prompt tokens.
         If the HF processor adds or removes tokens that are not related to
         multi-modal data, you should override this method so it is consistent
-        with the output of {meth}`_apply_hf_processor_text_only` on the
+        with the output of
+        [`_apply_hf_processor_text_only`][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_text_only]
+        on the
         corresponding text.
         """
         return prompt_tokens
@@ -1330,7 +1352,8 @@ def _apply_hf_processor_mm_only(
 
         Since HF processor requires that text and multi-modal items
         correspond to each other, we generate dummy text using
-        {class}`DummyInputsBuilder` to go along with the multi-modal data.
+        [`BaseDummyInputsBuilder`][vllm.multimodal.profiling.BaseDummyInputsBuilder]
+        to go along with the multi-modal data.
         """
         mm_counts = mm_items.get_all_counts()
 
diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py
index b5875124c126..a85b13fb2387 100644
--- a/vllm/multimodal/profiling.py
+++ b/vllm/multimodal/profiling.py
@@ -3,7 +3,7 @@
 from abc import ABC
 from collections.abc import Mapping
 from dataclasses import dataclass, field
-from typing import Generic, NamedTuple, Optional, TypeVar, cast
+from typing import Generic, NamedTuple, Optional, TypeVar, Union, cast
 
 import numpy as np
 import numpy.typing as npt
@@ -25,9 +25,9 @@
 class ProcessorInputs:
     """
     Represents the keyword arguments to
-    {meth}`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
+    [`vllm.multimodal.processing.BaseMultiModalProcessor.apply`][].
     """
-    prompt_text: str
+    prompt: Union[str, list[int]]
     mm_data: MultiModalDataDict
     hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
 
@@ -75,7 +75,12 @@ def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
                             "in an upcoming release.")
 
         seq_len = self.info.ctx.model_config.max_model_len
-        return self.get_dummy_processor_inputs(seq_len, mm_counts).prompt_text
+
+        prompt = self.get_dummy_processor_inputs(seq_len, mm_counts).prompt
+        if not isinstance(prompt, str):
+            prompt = self.info.get_tokenizer().decode(prompt)
+
+        return prompt
 
     # TODO: @abstractmethod after transition
     def get_dummy_mm_data(
@@ -101,7 +106,7 @@ def get_dummy_processor_inputs(
         dummy_text = self.get_dummy_text(mm_counts)
         dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts)
 
-        return ProcessorInputs(prompt_text=dummy_text, mm_data=dummy_mm_data)
+        return ProcessorInputs(prompt=dummy_text, mm_data=dummy_mm_data)
 
     def _get_dummy_audios(
         self,
@@ -177,7 +182,7 @@ def _get_dummy_mm_inputs(
             seq_len, mm_counts)
 
         return self.processor.apply(
-            prompt=processor_inputs.prompt_text,
+            prompt=processor_inputs.prompt,
             mm_data=processor_inputs.mm_data,
             hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
         )
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 67d0d7fc1183..b9f5cee922a7 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -29,7 +29,11 @@
 
 
 class ProcessingInfoFactory(Protocol[_I_co]):
-    """Constructs a {class}`MultiModalProcessor` instance from the context."""
+    """
+    Constructs a
+    [`BaseProcessingInfo`][vllm.multimodal.processing.BaseProcessingInfo]
+    instance from the context.
+    """
 
     def __call__(
         self,
@@ -40,7 +44,9 @@ def __call__(
 
 class DummyInputsBuilderFactory(Protocol[_I]):
     """
-    Constructs a {class}`BaseDummyInputsBuilder` instance from the context.
+    Constructs a
+    [`BaseDummyInputsBuilder`][vllm.multimodal.profiling.BaseDummyInputsBuilder]
+    instance from the context.
     """
 
     def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
@@ -48,7 +54,11 @@ def __call__(self, info: _I) -> BaseDummyInputsBuilder[_I]:
 
 
 class MultiModalProcessorFactory(Protocol[_I]):
-    """Constructs a {class}`MultiModalProcessor` instance from the context."""
+    """
+    Constructs a
+    [`BaseMultiModalProcessor`][vllm.multimodal.processing.BaseMultiModalProcessor]
+    instance from the context.
+    """
 
     def __call__(
         self,
@@ -155,8 +165,6 @@ def get_max_tokens_by_modality(
         """
         Get the maximum number of tokens from each modality
         for profiling the memory usage of a model.
-
-        See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
         """
         mm_limits = self.get_mm_limits_per_prompt(model_config)
 
@@ -170,8 +178,6 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int:
         """
         Get the maximum number of multi-modal tokens
         for profiling the memory usage of a model.
-
-        See {meth}`MultiModalPlugin.get_max_multimodal_tokens` for more details.
         """
         return sum(self.get_max_tokens_by_modality(model_config).values())
 
@@ -213,10 +219,6 @@ def register_processor(
 
         When the model receives multi-modal data, the provided function is
         invoked to transform the data into a dictionary of model inputs.
-
-        :::{seealso}
-        {ref}`mm-processing`
-        :::
         """
 
         def wrapper(model_cls: N) -> N:
@@ -259,10 +261,6 @@ def create_processor(
     ) -> BaseMultiModalProcessor[BaseProcessingInfo]:
         """
         Create a multi-modal processor for a specific model and tokenizer.
-
-        :::{seealso}
-        {ref}`mm-processing`
-        :::
         """
         if not model_config.is_multimodal_model:
             raise ValueError(f"{model_config.model} is not a multimodal model")
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index aef5f669ac68..9ddba67bff70 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -259,7 +259,8 @@ def fetch_image_embedding(
 
 
 global_media_connector = MediaConnector()
-"""The global {class}`MediaConnector` instance used by vLLM."""
+"""The global [`MediaConnector`][vllm.multimodal.utils.MediaConnector]
+instance used by vLLM."""
 
 fetch_audio = global_media_connector.fetch_audio
 fetch_image = global_media_connector.fetch_image
diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py
index 3685fd4c3458..261d56abad9c 100644
--- a/vllm/multimodal/video.py
+++ b/vllm/multimodal/video.py
@@ -164,7 +164,7 @@ def load_base64(self, media_type: str, data: str) -> npt.NDArray:
             )
 
             return np.stack([
-                np.array(load_frame(frame_data))
+                np.asarray(load_frame(frame_data))
                 for frame_data in data.split(",")
             ])
 
diff --git a/vllm/outputs.py b/vllm/outputs.py
index 6cd60575b00d..33cc50c872b6 100644
--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@@ -9,12 +9,15 @@
 import torch
 from typing_extensions import TypeVar, deprecated
 
+from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.multimodal.inputs import MultiModalPlaceholderDict
 from vllm.sampling_params import RequestOutputKind
 from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs,
                            SequenceGroup, SequenceGroupBase, SequenceStatus)
 
+logger = init_logger(__name__)
+
 
 @dataclass
 class CompletionOutput:
@@ -122,7 +125,13 @@ def __init__(
         *,
         multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None,
         kv_transfer_params: Optional[dict[str, Any]] = None,
+        # Forward compatibility: code passing args added in a newer release
+        # can still run with older versions of vLLM without breaking.
+        **kwargs: Any,
     ) -> None:
+        if kwargs:
+            logger.warning_once("RequestOutput: Ignoring extra arguments: %s",
+                                str(kwargs))
         self.request_id = request_id
         self.prompt = prompt
         self.prompt_token_ids = prompt_token_ids
@@ -382,15 +391,6 @@ def from_seq_group(seq_group: SequenceGroup) -> "PoolingRequestOutput":
                                     prompt_token_ids, finished)
 
     def __repr__(self):
-        """
-        Returns a string representation of an PoolingRequestOutput instance.
-
-        The representation includes the request_id and the number of outputs,
-        providing a quick overview of the pooling request's results.
-
-        Returns:
-            str: A string representation of the PoolingRequestOutput instance.
-        """
         return (f"{type(self).__name__}(request_id={self.request_id!r}, "
                 f"outputs={self.outputs!r}, "
                 f"prompt_token_ids={self.prompt_token_ids}, "
diff --git a/vllm/platforms/__init__.py b/vllm/platforms/__init__.py
index b1df4fd1339b..00d00d05f47a 100644
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -42,7 +42,6 @@ def tpu_platform_plugin() -> Optional[str]:
         logger.debug("Confirmed TPU platform is available.")
     except Exception as e:
         logger.debug("TPU platform is not available because: %s", str(e))
-        pass
 
     return "vllm.platforms.tpu.TpuPlatform" if is_tpu else None
 
@@ -112,7 +111,6 @@ def rocm_platform_plugin() -> Optional[str]:
             amdsmi.amdsmi_shut_down()
     except Exception as e:
         logger.debug("ROCm platform is not available because: %s", str(e))
-        pass
 
     return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None
 
@@ -130,7 +128,6 @@ def hpu_platform_plugin() -> Optional[str]:
                          "habana_frameworks is not found.")
     except Exception as e:
         logger.debug("HPU platform is not available because: %s", str(e))
-        pass
 
     return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None
 
@@ -148,7 +145,6 @@ def xpu_platform_plugin() -> Optional[str]:
             logger.debug("Confirmed XPU platform is available.")
     except Exception as e:
         logger.debug("XPU platform is not available because: %s", str(e))
-        pass
 
     return "vllm.platforms.xpu.XPUPlatform" if is_xpu else None
 
@@ -170,7 +166,6 @@ def cpu_platform_plugin() -> Optional[str]:
 
     except Exception as e:
         logger.debug("CPU platform is not available because: %s", str(e))
-        pass
 
     return "vllm.platforms.cpu.CpuPlatform" if is_cpu else None
 
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index 2d48af397636..c79c603c02eb 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -9,6 +9,7 @@
 import torch
 
 from vllm.logger import init_logger
+from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
 
 from .interface import CpuArchEnum, Platform, PlatformEnum, _Backend
 
@@ -74,7 +75,7 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         import vllm.envs as envs
         from vllm.utils import GiB_bytes
         model_config = vllm_config.model_config
-        # Reminder: Please update docs/source/features/compatibility_matrix.md
+        # Reminder: Please update docs/features/compatibility_matrix.md
         # If the feature combo become valid
         if not model_config.enforce_eager:
             model_config.enforce_eager = True
@@ -177,6 +178,16 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                     " set VLLM_WORKER_MULTIPROC_METHOD to fork explicitly.")
                 os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
 
+        if vllm_config.model_config and vllm_config.model_config.use_mla:
+            logger.info(
+                "MLA is enabled on a non-GPU platform; forcing chunked "
+                "prefill and prefix caching to be disabled.")
+            vllm_config.scheduler_config.enable_chunked_prefill = False
+            vllm_config.scheduler_config.chunked_prefill_enabled = False
+            vllm_config.scheduler_config.max_num_batched_tokens = max(
+                vllm_config.scheduler_config.max_model_len,
+                DEFAULT_MAX_NUM_BATCHED_TOKENS)
+
     @classmethod
     def is_pin_memory_available(cls) -> bool:
         logger.warning("Pin memory is not supported on CPU.")
diff --git a/vllm/platforms/cuda.py b/vllm/platforms/cuda.py
index bdee8b2f821d..8bb3dfe7457a 100644
--- a/vllm/platforms/cuda.py
+++ b/vllm/platforms/cuda.py
@@ -158,6 +158,7 @@ def check_and_update_config(cls, vllm_config: "VllmConfig") -> None:
                 "currently not supported with CUDA Graphs.")
             vllm_config.model_config.enforce_eager = True
             compilation_config.use_cudagraph = False
+            # FIXME: inductor breaks cudagraph (from @bnell)
             compilation_config.use_inductor = False
 
     @classmethod
@@ -311,6 +312,10 @@ def supports_v1(cls, model_config: "ModelConfig") -> bool:
     def use_custom_allreduce(cls) -> bool:
         return True
 
+    @classmethod
+    def get_piecewise_backend_cls(cls) -> str:
+        return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa
+
 
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 456b054b2b43..a8dd7df9f2e3 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -7,6 +7,7 @@
 
 from vllm import envs
 from vllm.logger import init_logger
+from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
 
 from .interface import Platform, PlatformEnum, _Backend
 
@@ -38,8 +39,8 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
     def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
         return True
 
-    @staticmethod
-    def inference_mode():
+    @classmethod
+    def inference_mode(cls):
         return torch.no_grad()
 
     @classmethod
@@ -80,6 +81,16 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                     "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
                 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
+        if vllm_config.model_config and vllm_config.model_config.use_mla:
+            logger.info(
+                "MLA is enabled on a non-GPU platform; forcing chunked "
+                "prefill and prefix caching to be disabled.")
+            vllm_config.scheduler_config.enable_chunked_prefill = False
+            vllm_config.scheduler_config.chunked_prefill_enabled = False
+            vllm_config.scheduler_config.max_num_batched_tokens = max(
+                vllm_config.scheduler_config.max_model_len,
+                DEFAULT_MAX_NUM_BATCHED_TOKENS)
+
     @classmethod
     def is_pin_memory_available(cls):
         logger.warning("Pin memory is not supported on HPU.")
diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py
index b09e31e9ed46..504c3b42a75d 100644
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -84,7 +84,7 @@ def as_version_str(self) -> str:
 
     def to_int(self) -> int:
         """
-        Express device capability as an integer ``<major><minor>``.
+        Express device capability as an integer `<major><minor>`.
 
         It is assumed that the minor version is always a single digit.
         """
@@ -157,7 +157,7 @@ def is_out_of_tree(self) -> bool:
         return self._enum == PlatformEnum.OOT
 
     def is_cuda_alike(self) -> bool:
-        """Stateless version of {func}`torch.cuda.is_available`."""
+        """Stateless version of [torch.cuda.is_available][]."""
         return self._enum in (PlatformEnum.CUDA, PlatformEnum.ROCM)
 
     def is_sleep_mode_available(self) -> bool:
@@ -194,7 +194,7 @@ def get_device_capability(
         cls,
         device_id: int = 0,
     ) -> Optional[DeviceCapability]:
-        """Stateless version of {func}`torch.cuda.get_device_capability`."""
+        """Stateless version of [torch.cuda.get_device_capability][]."""
         return None
 
     @classmethod
@@ -206,10 +206,11 @@ def has_device_capability(
         """
         Test whether this platform is compatible with a device capability.
 
-        The ``capability`` argument can either be:
+        The `capability` argument can either be:
 
-        - A tuple ``(major, minor)``.
-        - An integer ``<major><minor>``. (See {meth}`DeviceCapability.to_int`)
+        - A tuple `(major, minor)`.
+        - An integer `<major><minor>`. (See
+        [`DeviceCapability.to_int`][vllm.platforms.interface.DeviceCapability.to_int])
         """
         current_capability = cls.get_device_capability(device_id=device_id)
         if current_capability is None:
@@ -478,6 +479,13 @@ def get_cu_count(cls, device_id: int = 0) -> int:
         """
         raise NotImplementedError
 
+    @classmethod
+    def get_piecewise_backend_cls(cls) -> str:
+        """
+        Get the import path of this platform's piecewise backend class.
+        """
+        return "vllm.compilation.base_piecewise_backend.AbstractPiecewiseBackend"  # noqa
+
 
 class UnspecifiedPlatform(Platform):
     _enum = PlatformEnum.UNSPECIFIED
diff --git a/vllm/platforms/neuron.py b/vllm/platforms/neuron.py
index 71f7c718cdf9..9cd49fd34804 100644
--- a/vllm/platforms/neuron.py
+++ b/vllm/platforms/neuron.py
@@ -6,6 +6,7 @@
 
 from vllm import envs
 from vllm.logger import init_logger
+from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
 
 from .interface import Platform, PlatformEnum
 
@@ -51,12 +52,21 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         assert (vllm_config.lora_config
                 is None), "LoRA is not supported for Neuron backend."
 
-        cache_config = vllm_config.cache_config
-        if cache_config:
+        if vllm_config.cache_config and vllm_config.model_config:
             # neuron needs block_size = max_model_len
             vllm_config.cache_config.block_size = \
                 vllm_config.model_config.max_model_len  # type: ignore
 
+        if vllm_config.model_config and vllm_config.model_config.use_mla:
+            logger.info(
+                "MLA is enabled on a non-GPU platform; forcing chunked "
+                "prefill and prefix caching to be disabled.")
+            vllm_config.scheduler_config.enable_chunked_prefill = False
+            vllm_config.scheduler_config.chunked_prefill_enabled = False
+            vllm_config.scheduler_config.max_num_batched_tokens = max(
+                vllm_config.scheduler_config.max_model_len,
+                DEFAULT_MAX_NUM_BATCHED_TOKENS)
+
     @classmethod
     def is_pin_memory_available(cls) -> bool:
         logger.warning("Pin memory is not supported on Neuron.")
diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py
index 18cf14ae2569..b19aad5d07d8 100644
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
@@ -102,26 +102,43 @@ def on_mi250_mi300() -> bool:
 
 
 @cache
-def use_rocm_custom_paged_attention(qtype: torch.dtype, head_size: int,
-                                    block_size: int, gqa_ratio: int,
-                                    max_seq_len: int,
-                                    sliding_window: int) -> bool:
+def use_rocm_custom_paged_attention(
+        qtype: torch.dtype,
+        head_size: int,
+        block_size: int,
+        gqa_ratio: int,
+        max_seq_len: int,
+        sliding_window: int,
+        kv_cache_dtype: str,
+        alibi_slopes: Optional[torch.Tensor] = None) -> bool:
 
     GPU_ARCH = torch.cuda.get_device_properties("cuda").gcnArchName
     ON_GFX9 = any(arch in GPU_ARCH for arch in ["gfx90a", "gfx942", "gfx950"])
+    ON_GFX11_GFX12 = any(arch in GPU_ARCH for arch in ["gfx11", "gfx12"])
 
-    # rocm custom page attention not support on gfx1*
     # custom paged attn always supported on V0. On V1, requires sliding window
     # disabled due to observed numerical discrepancy.
-    return (ON_GFX9 and (not envs.VLLM_USE_V1 or sliding_window == 0
-                         or sliding_window == (-1, -1))
-            and (qtype == torch.half or qtype == torch.bfloat16)
-            and (head_size == 64 or head_size == 128)
-            and (block_size == 16 or block_size == 32)
-            and (gqa_ratio >= 1 and gqa_ratio <= 16)
-            and max_seq_len <= 128 * 1024 and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN
-            and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN
-                     and envs.VLLM_ROCM_USE_AITER))
+    if ON_GFX9:
+        return ((not envs.VLLM_USE_V1 or sliding_window == 0
+                 or sliding_window == (-1, -1))
+                and (qtype == torch.half or qtype == torch.bfloat16)
+                and (head_size == 64 or head_size == 128)
+                and (block_size == 16 or block_size == 32)
+                and (gqa_ratio >= 1 and gqa_ratio <= 16)
+                and max_seq_len <= 128 * 1024
+                and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
+                and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN
+                         and envs.VLLM_ROCM_USE_AITER))
+
+    else:
+        return (ON_GFX11_GFX12 and (not envs.VLLM_USE_V1 or sliding_window == 0
+                                    or sliding_window == (-1, -1))
+                and (qtype == torch.half or qtype == torch.bfloat16)
+                and head_size == 128 and block_size == 16
+                and (gqa_ratio >= 3 and gqa_ratio <= 16)
+                and max_seq_len <= 32768 and alibi_slopes is None
+                and kv_cache_dtype == "auto"
+                and envs.VLLM_ROCM_CUSTOM_PAGED_ATTN)
 
 
 class RocmPlatform(Platform):
@@ -201,9 +218,9 @@ def get_device_capability(cls,
         major, minor = torch.cuda.get_device_capability(device_id)
         return DeviceCapability(major=major, minor=minor)
 
-    @staticmethod
+    @classmethod
     @with_amdsmi_context
-    def is_fully_connected(physical_device_ids: list[int]) -> bool:
+    def is_fully_connected(cls, physical_device_ids: list[int]) -> bool:
         """
         Query if the set of gpus are fully connected by xgmi (1 hop)
         """
@@ -362,3 +379,11 @@ def use_custom_allreduce(cls) -> bool:
     def get_cu_count(cls, device_id: int = 0) -> int:
         return torch.cuda.get_device_properties(
             device_id).multi_processor_count
+
+    @classmethod
+    def is_navi(cls) -> bool:
+        return 'gfx1' in torch.cuda.get_device_properties(0).gcnArchName
+
+    @classmethod
+    def get_piecewise_backend_cls(cls) -> str:
+        return "vllm.compilation.cuda_piecewise_backend.CUDAPiecewiseBackend"  # noqa
diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py
index 6c573c1b3635..0173b15697cf 100644
--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
@@ -9,6 +9,7 @@
 from vllm.inputs import ProcessorInputs, PromptType
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams, SamplingType
+from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
 
 from .interface import Platform, PlatformEnum, _Backend
 
@@ -161,6 +162,16 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             "Forcing --disable_chunked_mm_input.")
             scheduler_config.disable_chunked_mm_input = True
 
+        if vllm_config.model_config and vllm_config.model_config.use_mla:
+            logger.info(
+                "MLA is enabled on a non-GPU platform; forcing chunked "
+                "prefill and prefix caching to be disabled.")
+            vllm_config.scheduler_config.enable_chunked_prefill = False
+            vllm_config.scheduler_config.chunked_prefill_enabled = False
+            vllm_config.scheduler_config.max_num_batched_tokens = max(
+                vllm_config.scheduler_config.max_model_len,
+                DEFAULT_MAX_NUM_BATCHED_TOKENS)
+
     @classmethod
     def is_pin_memory_available(cls):
         logger.warning("Pin memory is not supported on TPU.")
diff --git a/vllm/platforms/xpu.py b/vllm/platforms/xpu.py
index 225e756cd7ce..b2a6ad5d77db 100644
--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
@@ -5,6 +5,7 @@
 import torch
 
 from vllm.logger import init_logger
+from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
 
 from .interface import DeviceCapability, Platform, PlatformEnum, _Backend
 
@@ -36,15 +37,17 @@ def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
         logger.info("Using IPEX attention backend.")
         return "vllm.attention.backends.ipex_attn.IpexAttnBackend"
 
-    @staticmethod
+    @classmethod
     def get_device_capability(
-            device_id: int = 0) -> Optional[DeviceCapability]:
+        cls,
+        device_id: int = 0,
+    ) -> Optional[DeviceCapability]:
         # capacity format differs from cuda's and will cause unexpected
         # failure, so use None directly
         return None
 
-    @staticmethod
-    def get_device_name(device_id: int = 0) -> str:
+    @classmethod
+    def get_device_name(cls, device_id: int = 0) -> str:
         return torch.xpu.get_device_name(device_id)
 
     @classmethod
@@ -56,8 +59,8 @@ def get_device_total_memory(cls, device_id: int = 0) -> int:
     def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
         return True
 
-    @staticmethod
-    def inference_mode():
+    @classmethod
+    def inference_mode(cls):
         return torch.no_grad()
 
     @classmethod
@@ -113,6 +116,16 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
                 parallel_config.distributed_executor_backend)
             parallel_config.distributed_executor_backend = "ray"
 
+        if vllm_config.model_config and vllm_config.model_config.use_mla:
+            logger.info(
+                "MLA is enabled on a non-GPU platform; forcing chunked "
+                "prefill and prefix caching to be disabled.")
+            vllm_config.scheduler_config.enable_chunked_prefill = False
+            vllm_config.scheduler_config.chunked_prefill_enabled = False
+            vllm_config.scheduler_config.max_num_batched_tokens = max(
+                vllm_config.scheduler_config.max_model_len,
+                DEFAULT_MAX_NUM_BATCHED_TOKENS)
+
     @classmethod
     def is_pin_memory_available(cls):
         logger.warning("Pin memory is not supported on XPU.")
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py
index d72ab2bd088c..2884cb46fecd 100644
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -2,7 +2,7 @@
 
 import logging
 import os
-from typing import Callable
+from typing import Any, Callable
 
 import torch
 
@@ -14,7 +14,7 @@
 plugins_loaded = False
 
 
-def load_plugins_by_group(group: str) -> dict[str, Callable]:
+def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]:
     import sys
     if sys.version_info < (3, 10):
         from importlib_metadata import entry_points
@@ -27,23 +27,27 @@ def load_plugins_by_group(group: str) -> dict[str, Callable]:
     if len(discovered_plugins) == 0:
         logger.debug("No plugins for group %s found.", group)
         return {}
+
     logger.info("Available plugins for group %s:", group)
     for plugin in discovered_plugins:
-        logger.info("name=%s, value=%s", plugin.name, plugin.value)
+        logger.info("- %s -> %s", plugin.name, plugin.value)
+
     if allowed_plugins is None:
-        logger.info("all available plugins for group %s will be loaded.",
-                    group)
-        logger.info("set environment variable VLLM_PLUGINS to control"
-                    " which plugins to load.")
-    plugins = {}
+        logger.info("All plugins in this group will be loaded. "
+                    "Set `VLLM_PLUGINS` to control which plugins to load.")
+
+    plugins = dict[str, Callable[[], Any]]()
     for plugin in discovered_plugins:
         if allowed_plugins is None or plugin.name in allowed_plugins:
+            if allowed_plugins is not None:
+                logger.info("Loading plugin %s", plugin.name)
+
             try:
                 func = plugin.load()
                 plugins[plugin.name] = func
-                logger.info("plugin %s loaded.", plugin.name)
             except Exception:
                 logger.exception("Failed to load plugin %s", plugin.name)
+
     return plugins
 
 
diff --git a/vllm/reasoning/granite_reasoning_parser.py b/vllm/reasoning/granite_reasoning_parser.py
index 0dae02d33fec..07a63e294df4 100644
--- a/vllm/reasoning/granite_reasoning_parser.py
+++ b/vllm/reasoning/granite_reasoning_parser.py
@@ -1,9 +1,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
-import re
 from collections.abc import Sequence
 from typing import Optional, Union
 
+import regex as re
 from transformers import PreTrainedTokenizerBase
 
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 5aa9ae62f542..d359f897da25 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -27,7 +27,7 @@
 
 
 def array_full(token_id: int, count: int):
-    """{class}`array` equivalent of {func}`numpy.full`."""
+    """[`array`][] equivalent of [numpy.full][]."""
     return array(VLLM_TOKEN_ID_ARRAY_TYPE, [token_id]) * count
 
 
@@ -112,12 +112,12 @@ class RequestMetrics:
                             will include model forward, block/sync across
                             workers, cpu-gpu sync time and sampling time.
         spec_token_acceptance_counts: number of accepted speculative tokens at
-                                      each position; the first token is from 
+                                      each position; the first token is from
                                       the target model and is always accepted;
-                                      e.g., when it's [10, 8, 4, 2] for a req, 
+                                      e.g., when it's [10, 8, 4, 2] for a req,
                                       it means there were 10 forward passes in
-                                      total, and there were 8, 4, 2 accepted 
-                                      tokens at 1st, 2nd, 3rd speculation step. 
+                                      total, and there were 8, 4, 2 accepted
+                                      tokens at 1st, 2nd, 3rd speculation step.
     """
     arrival_time: float
     last_token_time: float
@@ -192,8 +192,8 @@ class SequenceData(msgspec.Struct,
     def from_prompt_token_counts(
             *token_counts: tuple[int, int]) -> "SequenceData":
         """
-        Construct a {class}`SequenceData` instance by concatenating
-        prompt token sequences.
+        Construct a [`SequenceData`][vllm.sequence.SequenceData] instance
+        by concatenating prompt token sequences.
 
         Each tuple represents one token sequence, expressed in the form
         `(token_id, count)`.
@@ -216,8 +216,8 @@ def from_seqs(
         prompt_embeds: Optional[torch.Tensor] = None,
     ) -> "SequenceData":
         """
-        Construct a {class}`SequenceData` instance from prompt and output
-        token sequences.
+        Construct a [`SequenceData`][vllm.sequence.SequenceData] instance
+        from prompt and output token sequences.
         """
         prompt_token_ids_arr = array(VLLM_TOKEN_ID_ARRAY_TYPE,
                                      prompt_token_ids)
@@ -452,9 +452,11 @@ def __repr__(self) -> str:
 class Sequence:
     """Stores the data, status, and block information of a sequence.
 
-    The sequence is constructed from the {data}`DecoderOnlyInputs`
-    (for decoder-only) or {data}`EncoderDecoderInputs` (for encoder-decoder)
-    instance passed in through the `inputs` constructor argument.
+    The sequence is constructed from the
+    [`DecoderOnlyInputs`][vllm.inputs.data.DecoderOnlyInputs] (for decoder-only)
+    or [`EncoderDecoderInputs`][vllm.inputs.data.EncoderDecoderInputs]
+    (for encoder-decoder) instance passed in through the `inputs`
+    constructor argument.
 
     Args:
         seq_id: The ID of the sequence.
@@ -714,9 +716,9 @@ class SequenceGroup:
         trace_headers: OpenTelemetry trace headers.
         prompt_adapter_request: Prompt Adapter request.
         priority: User-defined priority of the request.
-        draft_size: The number of speculative tokens plus one from the target 
+        draft_size: The number of speculative tokens plus one from the target
                     model; equal to max number of tokens a step can generate
-                    for single-draft speculative decoding but larger than 
+                    for single-draft speculative decoding but larger than
                     that for multi-draft SD (currently not supported).
     """
 
@@ -1123,7 +1125,7 @@ def __repr__(self) -> str:
             self.output_embed.shape if self.output_embed is not None else None
         return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, "
                 f"output_token={self.output_token}, "
-                f"output_embed.shape={output_embed_shape}"
+                f"output_embed.shape={output_embed_shape}, "
                 f"logprobs={self.logprobs})")
 
     def __eq__(self, other: object) -> bool:
@@ -1494,7 +1496,7 @@ def add_request(request_id: str, engine, params, **kwargs):
         for i in range(original_params.n):
             request_id_i = f"{request_id}_parallel_sample_{i}"
             group.seq_id_to_index[request_id_i] = i
-            params = copy.deepcopy(original_params)
+            params = original_params.clone()
             params.n = 1
             if params.seed is not None:
                 params.seed += i
diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py
index a6276c563394..991d2040a878 100644
--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
@@ -294,8 +294,11 @@ def execute_model(
                     inputs_embeds=None,
                     positions=model_input.input_positions,
                     intermediate_tensors=intermediate_tensors,
-                    **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
-                                                 device=self.device),
+                    **MultiModalKwargs.as_kwargs(
+                        multi_modal_kwargs,
+                        dtype=self.model_runner.model_config.dtype,
+                        device=self.device,
+                    ),
                     **model_execute_kwargs,
                 )
 
diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py
index 0bb8d602ec8f..4430da26c049 100644
--- a/vllm/spec_decode/metrics.py
+++ b/vllm/spec_decode/metrics.py
@@ -126,12 +126,12 @@ def _copy_rejsample_metrics_async(self) -> torch.cuda.Event:
         """Copy rejection/typical-acceptance sampling metrics
         (number of accepted tokens, etc) to CPU asynchronously.
 
-        Returns a CUDA event recording when the copy is complete.
+        Returns a device event recording when the copy is complete.
         """
         assert self._copy_stream is not None
-        self._copy_stream.wait_stream(torch.cuda.current_stream())
+        self._copy_stream.wait_stream(current_platform.current_stream())
 
-        with torch.cuda.stream(self._copy_stream):
+        with current_platform.stream(self._copy_stream):
             self._aggregate_num_accepted_tokens.copy_(
                 self.spec_decode_sampler.num_accepted_tokens,
                 non_blocking=True)
@@ -142,7 +142,7 @@ def _copy_rejsample_metrics_async(self) -> torch.cuda.Event:
             self._aggregate_num_draft_tokens = (
                 self.spec_decode_sampler.num_draft_tokens)
 
-        aggregate_metrics_ready = torch.cuda.Event()
+        aggregate_metrics_ready = current_platform.Event()
         aggregate_metrics_ready.record(self._copy_stream)
 
         return aggregate_metrics_ready
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 6ba5a51007b4..252c80957305 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -114,7 +114,7 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
     return spec_decode_worker
 
 
-# Reminder: Please update docs/source/features/compatibility_matrix.md
+# Reminder: Please update docs/features/compatibility_matrix.md
 # If the feature combo become valid
 class SpecDecodeWorker(LoRANotSupportedWorkerBase):
     """Worker which implements speculative decoding.
diff --git a/vllm/transformers_utils/__init__.py b/vllm/transformers_utils/__init__.py
index b556976a51ba..84bd7a747656 100644
--- a/vllm/transformers_utils/__init__.py
+++ b/vllm/transformers_utils/__init__.py
@@ -1,8 +1,8 @@
 # SPDX-License-Identifier: Apache-2.0
 
-from vllm.envs import VLLM_USE_MODELSCOPE
+from vllm import envs
 
-if VLLM_USE_MODELSCOPE:
+if envs.VLLM_USE_MODELSCOPE:
     try:
         # Patch here, before each import happens
         import modelscope
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 5db0d59771c1..e58496616f22 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -24,7 +24,7 @@
     MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)
 from transformers.utils import CONFIG_NAME as HF_CONFIG_NAME
 
-from vllm.envs import VLLM_USE_MODELSCOPE
+from vllm import envs
 from vllm.logger import init_logger
 # yapf conflicts with isort for this block
 # yapf: disable
@@ -45,13 +45,12 @@
 from vllm.transformers_utils.utils import check_gguf_file
 from vllm.utils import resolve_obj_by_qualname
 
-if VLLM_USE_MODELSCOPE:
+if envs.VLLM_USE_MODELSCOPE:
     from modelscope import AutoConfig
 else:
     from transformers import AutoConfig
 
 MISTRAL_CONFIG_NAME = "params.json"
-HF_TOKEN = os.getenv('HF_TOKEN', None)
 
 logger = init_logger(__name__)
 
@@ -130,7 +129,7 @@ def lookup_files() -> list[str]:
             ]
         # if model is remote, use hf_hub api to list files
         try:
-            if VLLM_USE_MODELSCOPE:
+            if envs.VLLM_USE_MODELSCOPE:
                 from vllm.transformers_utils.utils import (
                     modelscope_list_repo_files)
                 return modelscope_list_repo_files(repo_id,
@@ -185,7 +184,7 @@ def file_or_path_exists(model: Union[str, Path], config_name: str,
     return file_exists(str(model),
                        config_name,
                        revision=revision,
-                       token=HF_TOKEN)
+                       token=os.getenv('HF_TOKEN', None))
 
 
 def patch_rope_scaling(config: PretrainedConfig) -> None:
@@ -300,7 +299,10 @@ def get_config(
                 "   - For Hugging Face models: ensure the presence of a "
                 "'config.json'.\n"
                 "   - For Mistral models: ensure the presence of a "
-                "'params.json'.\n").format(model=model)
+                "'params.json'.\n"
+                "3. For GGUF: pass the local path of the GGUF checkpoint.\n"
+                "   Loading GGUF from a remote repo directly is not yet "
+                "supported.\n").format(model=model)
 
             raise ValueError(error_message) from e
 
@@ -309,7 +311,7 @@ def get_config(
             model,
             revision=revision,
             code_revision=code_revision,
-            token=HF_TOKEN,
+            token=os.getenv('HF_TOKEN', None),
             **kwargs,
         )
 
@@ -321,7 +323,7 @@ def get_config(
                 model,
                 revision=revision,
                 code_revision=code_revision,
-                token=HF_TOKEN,
+                token=os.getenv('HF_TOKEN', None),
                 **kwargs,
             )
         else:
@@ -331,7 +333,7 @@ def get_config(
                     trust_remote_code=trust_remote_code,
                     revision=revision,
                     code_revision=code_revision,
-                    token=HF_TOKEN,
+                    token=os.getenv('HF_TOKEN', None),
                     **kwargs,
                 )
             except ValueError as e:
@@ -353,7 +355,7 @@ def get_config(
                     model, revision=revision, code_revision=code_revision)
 
     elif config_format == ConfigFormat.MISTRAL:
-        config = load_params_config(model, revision, token=HF_TOKEN, **kwargs)
+        config = load_params_config(model, revision, **kwargs)
     else:
         supported_formats = [
             fmt.value for fmt in ConfigFormat if fmt != ConfigFormat.AUTO
@@ -562,7 +564,7 @@ def get_sentence_transformer_tokenizer_config(model: str,
             # If model is on HuggingfaceHub, get the repo files
             repo_files = list_repo_files(model,
                                          revision=revision,
-                                         token=HF_TOKEN)
+                                         token=os.getenv('HF_TOKEN', None))
         except Exception:
             repo_files = []
 
@@ -769,7 +771,7 @@ def get_hf_image_processor_config(
     **kwargs,
 ) -> dict[str, Any]:
     # ModelScope does not provide an interface for image_processor
-    if VLLM_USE_MODELSCOPE:
+    if envs.VLLM_USE_MODELSCOPE:
         return dict()
     # Separate model folder from file path for GGUF models
     if check_gguf_file(model):
diff --git a/vllm/transformers_utils/configs/eagle.py b/vllm/transformers_utils/configs/eagle.py
index 586d5c7f5e54..377523efefc3 100644
--- a/vllm/transformers_utils/configs/eagle.py
+++ b/vllm/transformers_utils/configs/eagle.py
@@ -52,13 +52,15 @@ def __init__(self,
                 assert self.model is not None, \
                     "model should not be None when method is eagle"
                 kwargs["architectures"] = [
-                    f"Eagle{arch}" for arch in self.model.architectures
+                    f"Eagle{arch}" if not arch.startswith("Eagle") \
+                        else arch for arch in self.model.architectures
                 ]
             elif method == "eagle3":
                 assert self.model is not None, \
                     "model should not be None when method is eagle3"
                 kwargs["architectures"] = [
-                    f"Eagle3{arch}" for arch in self.model.architectures
+                    f"Eagle3{arch}" if not arch.startswith("Eagle3") \
+                        else arch for arch in self.model.architectures
                 ]
             else:
                 raise ValueError(f"Invalid method {method}. \
diff --git a/vllm/transformers_utils/processors/ovis.py b/vllm/transformers_utils/processors/ovis.py
index a35d32999991..f1c6407e1f3a 100644
--- a/vllm/transformers_utils/processors/ovis.py
+++ b/vllm/transformers_utils/processors/ovis.py
@@ -33,6 +33,8 @@
                                            Unpack)
 from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 
+from vllm.multimodal.image import convert_image_mode
+
 __all__ = ['OvisProcessor']
 IGNORE_ID = -100
 
@@ -361,8 +363,8 @@ def _get_best_grid(img, side):
                 # pick the partition with maximum covering_ratio and break the tie using #sub_images
                 return sorted(all_grids, key=lambda x: (-x[1], x[0][0] * x[0][1]))[0][0]
 
-        if convert_to_rgb and image.mode != 'RGB':
-            image = image.convert('RGB')
+        if convert_to_rgb:
+            image = convert_image_mode(image, 'RGB')
 
 
         sides = self.get_image_size()
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index e31580ede57b..fa7a208c48ed 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -13,7 +13,7 @@
 from transformers import (AutoTokenizer, PreTrainedTokenizer,
                           PreTrainedTokenizerFast)
 
-from vllm.envs import VLLM_USE_MODELSCOPE
+from vllm import envs
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.tokenizer_base import (TokenizerBase,
@@ -168,7 +168,7 @@ def get_tokenizer(
 ) -> AnyTokenizer:
     """Gets a tokenizer for the given model name via HuggingFace or ModelScope.
     """
-    if VLLM_USE_MODELSCOPE:
+    if envs.VLLM_USE_MODELSCOPE:
         # download model from ModelScope hub,
         # lazy import so that modelscope is not required for normal use.
         # pylint: disable=C.
diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py
index 551c2d55b4fc..23b6f67f09df 100644
--- a/vllm/transformers_utils/tokenizers/mistral.py
+++ b/vllm/transformers_utils/tokenizers/mistral.py
@@ -1,12 +1,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
-import re
 from dataclasses import dataclass
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Optional, Union, cast
 
 import huggingface_hub
+import regex as re
 from huggingface_hub import HfApi, hf_hub_download
 
 from vllm.logger import init_logger
@@ -156,7 +156,11 @@ def make_mistral_chat_completion_request(
     #
     # [1]: https://github.com/mistralai/mistral-common/blob/f4a06998b75ed78bbf5aaf569590b772ea26c9f6/src/mistral_common/protocol/instruct/messages.py#L80
     for message in messages:
-        if message.get("role") == "assistant":
+        # Remove reasoning_content as unsupported by Mistral
+        _ = message.pop("reasoning_content", None)  # type: ignore
+
+        # Convert list text content to string
+        if message.get("role") in ("assistant", "tool"):
             content = message.get("content")
             if isinstance(content, list):
                 content = "\n".join(chunk.get("text") for chunk in content)
diff --git a/vllm/utils.py b/vllm/utils.py
index 53c0e4cd5ab9..0057d9cf6ac8 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -19,7 +19,6 @@
 import multiprocessing
 import os
 import pickle
-import re
 import signal
 import socket
 import subprocess
@@ -34,7 +33,8 @@
 import warnings
 import weakref
 from argparse import (Action, ArgumentDefaultsHelpFormatter, ArgumentParser,
-                      ArgumentTypeError, _ArgumentGroup)
+                      ArgumentTypeError, RawDescriptionHelpFormatter,
+                      _ArgumentGroup)
 from asyncio import FIRST_COMPLETED, AbstractEventLoop, Task
 from collections import UserDict, defaultdict
 from collections.abc import (AsyncGenerator, Awaitable, Generator, Hashable,
@@ -54,6 +54,7 @@
 import numpy as np
 import numpy.typing as npt
 import psutil
+import regex as re
 import torch
 import torch.types
 import yaml
@@ -77,9 +78,15 @@
 
 logger = init_logger(__name__)
 
+# This value is chosen to have a balance between ITL and TTFT. Note it is
+# not optimized for throughput.
+DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
+POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
+MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120
+
 # Exception strings for non-implemented encoder/decoder scenarios
 
-# Reminder: Please update docs/source/features/compatibility_matrix.md
+# Reminder: Please update docs/features/compatibility_matrix.md
 # If the feature combo become valid
 
 STR_NOT_IMPL_ENC_DEC_SWA = \
@@ -901,16 +908,15 @@ def get_kv_cache_torch_dtype(
         model_dtype: Optional[Union[str, torch.dtype]] = None) -> torch.dtype:
     if isinstance(cache_dtype, str):
         if cache_dtype == "auto":
-            if isinstance(model_dtype, str):
+            if isinstance(model_dtype,
+                          str) and model_dtype in STR_DTYPE_TO_TORCH_DTYPE:
                 torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[model_dtype]
             elif isinstance(model_dtype, torch.dtype):
                 torch_dtype = model_dtype
             else:
                 raise ValueError(f"Invalid model dtype: {model_dtype}")
-        elif cache_dtype in ["half", "bfloat16", "float"]:
+        elif cache_dtype in STR_DTYPE_TO_TORCH_DTYPE:
             torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_dtype]
-        elif cache_dtype == "fp8":
-            torch_dtype = torch.uint8
         else:
             raise ValueError(f"Invalid kv cache dtype: {cache_dtype}")
     elif isinstance(cache_dtype, torch.dtype):
@@ -1147,7 +1153,7 @@ def flatten_2d_lists(lists: Iterable[Iterable[T]]) -> list[T]:
 
 def full_groupby(values: Iterable[_V], *, key: Callable[[_V], _K]):
     """
-    Unlike {class}`itertools.groupby`, groups are not broken by
+    Unlike [`itertools.groupby`][], groups are not broken by
     non-contiguous data.
     """
     groups = defaultdict[_K, list[_V]](list)
@@ -1466,7 +1472,8 @@ def __call__(self, parser, namespace, values, option_string=None):
                              "Expected 'true' or 'false'.")
 
 
-class SortedHelpFormatter(ArgumentDefaultsHelpFormatter):
+class SortedHelpFormatter(ArgumentDefaultsHelpFormatter,
+                          RawDescriptionHelpFormatter):
     """SortedHelpFormatter that sorts arguments by their option strings."""
 
     def _split_lines(self, text, width):
@@ -2089,11 +2096,11 @@ class _PlaceholderBase:
     Disallows downstream usage of placeholder modules.
 
     We need to explicitly override each dunder method because
-    {meth}`__getattr__` is not called when they are accessed.
+    [`__getattr__`][vllm.utils._PlaceholderBase.__getattr__]
+    is not called when they are accessed.
 
-    :::{seealso}
-    [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
-    :::
+    Info:
+        [Special method lookup](https://docs.python.org/3/reference/datamodel.html#special-lookup)
     """
 
     def __getattr__(self, key: str) -> Never:
@@ -2695,7 +2702,7 @@ def _maybe_force_spawn():
         logger.warning(
             "We must use the `spawn` multiprocessing start method. "
             "Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
-            "See https://docs.vllm.ai/en/latest/getting_started/"
+            "See https://docs.vllm.ai/en/latest/usage/"
             "troubleshooting.html#python-multiprocessing "
             "for more information. Reason: %s", reason)
         os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -2960,14 +2967,17 @@ def wrapper(*args, **kwargs):
 
 # Only relevant for models using ALiBi (e.g, MPT)
 def check_use_alibi(model_config: ModelConfig) -> bool:
-    return (getattr(model_config.hf_text_config, "alibi", False)  # Falcon
+    cfg = model_config.hf_text_config
+    return (getattr(cfg, "alibi", False)  # Falcon
             or ("BloomForCausalLM" in getattr(model_config.hf_config,
                                               "architectures", []))  # Bloom
-            or getattr(model_config.hf_text_config, "position_encoding_type",
-                       "") == "alibi"  # codellm_1b_alibi
-            or
-            (hasattr(model_config.hf_text_config, "attn_config")  # MPT
-             and model_config.hf_text_config.attn_config.get("alibi", False)))
+            or getattr(cfg, "position_encoding_type", "") ==
+            "alibi"  # codellm_1b_alibi
+            or (hasattr(cfg, "attn_config")  # MPT
+                and ((isinstance(cfg.attn_config, dict)
+                      and cfg.attn_config.get("alibi", False)) or
+                     (not isinstance(cfg.attn_config, dict)
+                      and getattr(cfg.attn_config, "alibi", False)))))
 
 
 def sha256(input) -> int:
diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
index 7ce39110ac01..31980e94a037 100644
--- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
+++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py
@@ -53,6 +53,8 @@ class AiterMLADecodeMetadata(MLACommonDecodeMetadata):
     # The number of entries in the last page of each request in
     # the paged kv cache, shape: [batch_size]
     paged_kv_last_page_len: Optional[torch.Tensor] = None
+    # The query indptr, shape: [num_decodes + 1]
+    qo_indptr: Optional[torch.Tensor] = None
 
 
 class AiterMLAMetadata(MLACommonMetadata[AiterMLADecodeMetadata]):
@@ -75,27 +77,33 @@ def _get_paged_kv_tensors(
             seq_lens: torch.Tensor) -> tuple[torch.Tensor, ...]:
         page_size = self.kv_cache_spec.block_size
         block_table_bounds = (seq_lens + page_size - 1) // page_size
+        device = self.runner.device
 
         mask = (torch.arange(block_table.size(1),
                              dtype=block_table.dtype,
-                             device=block_table.device).unsqueeze(0)
+                             device=device).unsqueeze(0)
                 < block_table_bounds.unsqueeze(1))
         paged_kv_indices = block_table[mask]
 
         paged_kv_indptr = torch.cat([
-            torch.zeros(1,
-                        dtype=block_table_bounds.dtype,
-                        device=block_table_bounds.device),
+            torch.zeros(1, dtype=block_table_bounds.dtype, device=device),
             block_table_bounds.cumsum(dim=0, dtype=torch.int32)
         ])
 
         paged_kv_last_page_len = seq_lens % page_size
         paged_kv_last_page_len = torch.where(paged_kv_last_page_len == 0,
                                              page_size, paged_kv_last_page_len)
+        qo_indptr = torch.arange(0,
+                                 self._num_decodes + 1,
+                                 step=1,
+                                 dtype=torch.int32,
+                                 device=device)
+
         return (
             paged_kv_indices,
             paged_kv_indptr,
             paged_kv_last_page_len,
+            qo_indptr,
         )
 
     def _build_decode(self, block_table_tensor: torch.Tensor,
@@ -105,6 +113,7 @@ def _build_decode(self, block_table_tensor: torch.Tensor,
             paged_kv_indices,
             paged_kv_indptr,
             paged_last_page_len,
+            qo_indptr,
         ) = self._get_paged_kv_tensors(block_table_tensor, seq_lens)
 
         attn_metadata = AiterMLADecodeMetadata(
@@ -112,7 +121,8 @@ def _build_decode(self, block_table_tensor: torch.Tensor,
             seq_lens=seq_lens,
             paged_kv_indptr=paged_kv_indptr,
             paged_kv_indices=paged_kv_indices,
-            paged_kv_last_page_len=paged_last_page_len)
+            paged_kv_last_page_len=paged_last_page_len,
+            qo_indptr=qo_indptr)
 
         return attn_metadata
 
@@ -137,7 +147,10 @@ def __init__(
                          alibi_slopes, sliding_window, kv_cache_dtype,
                          blocksparse_params, logits_soft_cap, attn_type,
                          **mla_args)
-
+        assert (num_heads == 16 or num_heads == 128), (
+            f"Aiter MLA only supports 16 or 128 number of heads.\n"
+            f"Provided {num_heads} number of heads.\n"
+            "Try adjusting tensor_parallel_size value.")
         unsupported_features = [
             alibi_slopes, sliding_window, blocksparse_params, logits_soft_cap
         ]
@@ -189,7 +202,18 @@ def _forward_decode(
 
         kv_buffer = kv_c_and_k_pe_cache.unsqueeze(2)
 
+        if self.num_heads == 16:
+            # AITER MLA decode kernel only supports
+            # max_seqlen_q=1 when using 16 heads.
+            max_seqlen_qo = 1
+        else:
+            # AITER MLA decode kernel handles arbitrary
+            # max_seqlen_q values when using 128 heads.
+            assert attn_metadata.prefill is not None
+            max_seqlen_qo = attn_metadata.prefill.max_query_len
+
         aiter_mla_decode_fwd(q, kv_buffer, o, self.scale,
+                             attn_metadata.decode.qo_indptr, max_seqlen_qo,
                              attn_metadata.decode.paged_kv_indptr,
                              attn_metadata.decode.paged_kv_indices,
                              attn_metadata.decode.paged_kv_last_page_len)
diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py
index da18ece7555a..0f6098d2b400 100644
--- a/vllm/v1/core/kv_cache_manager.py
+++ b/vllm/v1/core/kv_cache_manager.py
@@ -174,6 +174,7 @@ def allocate_slots(
         num_new_tokens: int,
         num_new_computed_tokens: int = 0,
         new_computed_blocks: Optional[KVCacheBlocks] = None,
+        num_draft_tokens: int = 0,
         num_lookahead_tokens: int = 0,
         delay_cache_blocks: bool = False,
     ) -> Optional[KVCacheBlocks]:
@@ -273,7 +274,7 @@ def allocate_slots(
         # generated (accepted) tokens.
         self.single_type_manager.cache_blocks(
             request, self.req_to_block_hashes[request.request_id],
-            num_computed_tokens + num_new_tokens - len(request.spec_token_ids))
+            num_computed_tokens + num_new_tokens - num_draft_tokens)
 
         return KVCacheBlocks(new_blocks)
 
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
index d8fd67e232cb..4c6b3eea0cb7 100644
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -227,10 +227,15 @@ def schedule(self) -> SchedulerOutput:
                 req_index += 1
                 continue
 
+            num_draft_tokens = max(
+                num_new_tokens + request.num_computed_tokens -
+                request.num_tokens, 0)
+
             while True:
                 new_blocks = self.kv_cache_manager.allocate_slots(
                     request,
                     num_new_tokens,
+                    num_draft_tokens=num_draft_tokens,
                     num_lookahead_tokens=self.num_lookahead_tokens)
                 if new_blocks is None:
                     # The request cannot be scheduled.
@@ -310,15 +315,16 @@ def schedule(self) -> SchedulerOutput:
                     break
 
                 request = self.waiting[0]
-                num_prealloc_computed_tokens = 0
-                # P/D: skip request if still waiting for remote kvs.
+
+                # KVTransfer: skip request if still waiting for remote kvs.
                 if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
                     is_ready = self._update_waiting_for_remote_kv(request)
                     if is_ready:
                         request.status = RequestStatus.WAITING
-                        num_prealloc_computed_tokens = (
-                            request.num_computed_tokens)
                     else:
+                        logger.debug(
+                            "%s is still in WAITING_FOR_REMOTE_KVS state.",
+                            request.request_id)
                         self.waiting.popleft()
                         skipped_waiting_requests.appendleft(request)
                         continue
@@ -349,8 +355,9 @@ def schedule(self) -> SchedulerOutput:
                 load_kv_async = False
 
                 # Get already-cached tokens.
-                if num_prealloc_computed_tokens == 0:
-                    new_computed_blocks, num_native_computed_tokens = \
+                if request.num_computed_tokens == 0:
+                    # Get locally-cached tokens.
+                    new_computed_blocks, num_new_local_computed_tokens = \
                         self.kv_cache_manager.get_computed_blocks(
                             request)
 
@@ -358,23 +365,22 @@ def schedule(self) -> SchedulerOutput:
                     if self.connector is not None:
                         num_external_computed_tokens, load_kv_async = (
                             self.connector.get_num_new_matched_tokens(
-                                request, num_native_computed_tokens))
+                                request, num_new_local_computed_tokens))
 
                     # Total computed tokens (local + external).
-                    num_computed_tokens = (num_native_computed_tokens +
+                    num_computed_tokens = (num_new_local_computed_tokens +
                                            num_external_computed_tokens)
+                # KVTransfer: WAITING reqs have num_computed_tokens > 0
+                # after async KV recvs are completed.
                 else:
-                    # P/D: skip checking prefix cache if loaded from remote kvs.
                     new_computed_blocks = KVCacheBlocks.create_empty()
-                    num_native_computed_tokens = 0
-
-                    # Total computed tokens (allocated in prior step).
-                    num_computed_tokens = num_prealloc_computed_tokens
+                    num_new_local_computed_tokens = 0
+                    num_computed_tokens = request.num_computed_tokens
 
                 encoder_inputs_to_schedule = None
                 new_encoder_budget = encoder_budget
 
-                # P/D: loading remote KV, do not allocate for new work.
+                # KVTransfer: loading remote KV, do not allocate for new work.
                 if load_kv_async:
                     assert num_external_computed_tokens > 0
                     num_new_tokens = 0
@@ -405,7 +411,7 @@ def schedule(self) -> SchedulerOutput:
                 new_blocks = self.kv_cache_manager.allocate_slots(
                     request,
                     num_new_tokens + num_external_computed_tokens,
-                    num_native_computed_tokens,
+                    num_new_local_computed_tokens,
                     new_computed_blocks,
                     num_lookahead_tokens=self.num_lookahead_tokens,
                     delay_cache_blocks=load_kv_async,
@@ -457,7 +463,9 @@ def schedule(self) -> SchedulerOutput:
                 token_budget -= num_new_tokens
                 request.status = RequestStatus.RUNNING
                 request.num_computed_tokens = num_computed_tokens
-
+                # Count the number of prefix cached tokens.
+                if request.num_cached_tokens < 0:
+                    request.num_cached_tokens = num_computed_tokens
                 # Encoder-related.
                 if encoder_inputs_to_schedule:
                     scheduled_encoder_inputs[request.request_id] = (
@@ -799,6 +807,7 @@ def update_from_output(
                         stop_reason=request.stop_reason,
                         events=request.take_events(),
                         kv_transfer_params=kv_transfer_params,
+                        num_cached_tokens=request.num_cached_tokens,
                     ))
 
             else:
diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py
index 122a5a72cc36..41db99beaad5 100644
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@@ -107,6 +107,9 @@ class EngineCoreOutput(
     events: Optional[list[EngineCoreEvent]] = None
     kv_transfer_params: Optional[dict[str, Any]] = None
 
+    # The number of tokens with prefix cache hits.
+    num_cached_tokens: int = 0
+
     @property
     def finished(self) -> bool:
         return self.finish_reason is not None
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 0d646d8dd575..74c2251c7521 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -20,6 +20,8 @@
 from vllm.pooling_params import PoolingParams
 from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sampling_params import SamplingParams
+from vllm.transformers_utils.config import (
+    maybe_register_config_serialize_by_value)
 from vllm.transformers_utils.tokenizer import AnyTokenizer
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
 from vllm.usage.usage_lib import UsageContext
@@ -80,6 +82,9 @@ def __init__(
                 "AsyncLLMEngine.from_vllm_config(...) or explicitly set "
                 "VLLM_USE_V1=0 or 1 and report this issue on Github.")
 
+        # Ensure we can serialize custom transformer configs
+        maybe_register_config_serialize_by_value()
+
         self.model_config = vllm_config.model_config
         self.vllm_config = vllm_config
         self.log_requests = log_requests
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 0cf2383af1c9..740ba60fe231 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -57,6 +57,10 @@ def __init__(self,
                  executor_fail_callback: Optional[Callable] = None):
         assert vllm_config.model_config.runner_type != "pooling"
 
+        # plugins need to be loaded at the engine/scheduler level too
+        from vllm.plugins import load_general_plugins
+        load_general_plugins()
+
         self.vllm_config = vllm_config
         logger.info("Initializing a V1 LLM engine (v%s) with config: %s",
                     VLLM_VERSION, vllm_config)
@@ -336,6 +340,13 @@ def collective_rpc(self,
         return self.model_executor.collective_rpc(method, timeout, args,
                                                   kwargs)
 
+    def save_tensorized_model(
+        self,
+        tensorizer_config,
+    ) -> None:
+        self.model_executor.save_tensorized_model(
+            tensorizer_config=tensorizer_config, )
+
 
 class EngineCoreProc(EngineCore):
     """ZMQ-wrapper for running EngineCore in background process."""
@@ -697,7 +708,7 @@ def _init_data_parallel(self, vllm_config: VllmConfig):
             for i in range(local_dp_rank * world_size, (local_dp_rank + 1) *
                            world_size))
 
-        self.local_dp_rank = local_dp_rank
+        self.dp_rank = dp_rank
         self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
         self.current_wave = 0
 
@@ -770,7 +781,7 @@ def run_busy_loop(self):
                 local_unfinished_reqs)
 
             if not self.engines_running:
-                if self.local_dp_rank == 0:
+                if self.dp_rank == 0:
                     # Notify client that we are pausing the loop.
                     logger.debug("Wave %d finished, pausing engine loop.",
                                  self.current_wave)
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py
index 112896d6c767..c856e2645a2c 100644
--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
@@ -27,7 +27,10 @@
 from vllm.v1.engine.parallel_sampling import ParentRequest
 from vllm.v1.engine.processor import Processor
 from vllm.v1.executor.abstract import Executor
-from vllm.v1.metrics.loggers import StatLoggerFactory
+from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase,
+                                     StatLoggerFactory)
+from vllm.v1.metrics.reader import Metric, get_metrics_snapshot
+from vllm.v1.metrics.stats import IterationStats
 
 logger = init_logger(__name__)
 
@@ -64,6 +67,11 @@ def __init__(
         self.model_config = vllm_config.model_config
         self.cache_config = vllm_config.cache_config
 
+        self.log_stats = log_stats
+        self.stat_logger: Optional[StatLoggerBase] = None
+        if self.log_stats:
+            self.stat_logger = PrometheusStatLogger(vllm_config)
+
         # important: init dp group before init the engine_core
         # In the decoupled engine case this is handled in EngineCoreProc.
         parallel_config = vllm_config.parallel_config
@@ -86,7 +94,7 @@ def __init__(
 
         # OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
         self.output_processor = OutputProcessor(self.tokenizer,
-                                                log_stats=False)
+                                                log_stats=self.log_stats)
 
         # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs)
         self.engine_core = EngineCoreClient.make_client(
@@ -94,7 +102,7 @@ def __init__(
             asyncio_mode=False,
             vllm_config=vllm_config,
             executor_class=executor_class,
-            log_stats=False,  # FIXME: implement
+            log_stats=self.log_stats,
         )
 
         if not multiprocess_mode:
@@ -223,12 +231,21 @@ def step(self) -> list[RequestOutput]:
         outputs = self.engine_core.get_output()
 
         # 2) Process EngineCoreOutputs.
+        iteration_stats = IterationStats() if self.log_stats else None
         processed_outputs = self.output_processor.process_outputs(
-            outputs.outputs)
+            outputs.outputs,
+            engine_core_timestamp=outputs.timestamp,
+            iteration_stats=iteration_stats)
 
         # 3) Abort any reqs that finished due to stop strings.
         self.engine_core.abort_requests(processed_outputs.reqs_to_abort)
 
+        # 4) Record stats
+        if self.stat_logger is not None:
+            assert outputs.scheduler_stats is not None
+            self.stat_logger.record(scheduler_stats=outputs.scheduler_stats,
+                                    iteration_stats=iteration_stats)
+
         return processed_outputs.request_outputs
 
     def get_vllm_config(self):
@@ -260,6 +277,10 @@ def wake_up(self, tags: Optional[list[str]] = None):
     def is_sleeping(self) -> bool:
         return self.engine_core.is_sleeping()
 
+    def get_metrics(self) -> list[Metric]:
+        assert self.log_stats, "Stat logging disabled"
+        return get_metrics_snapshot()
+
     def get_tokenizer_group(self) -> TokenizerGroup:
         if self.tokenizer is None:
             raise ValueError("Unable to get tokenizer because "
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
index a7a9b0e4a161..293c291b4341 100644
--- a/vllm/v1/engine/output_processor.py
+++ b/vllm/v1/engine/output_processor.py
@@ -147,6 +147,7 @@ def make_request_output(
         finish_reason: Optional[FinishReason],
         stop_reason: Union[int, str, None],
         kv_transfer_params: Optional[dict[str, Any]] = None,
+        num_cached_tokens: int = 0,
     ) -> Optional[RequestOutput]:
 
         finished = finish_reason is not None
@@ -169,7 +170,7 @@ def make_request_output(
                 return None
 
         return self._new_request_output(request_id, outputs, finished,
-                                        kv_transfer_params)
+                                        kv_transfer_params, num_cached_tokens)
 
     def _new_request_output(
         self,
@@ -177,6 +178,7 @@ def _new_request_output(
         outputs: list[CompletionOutput],
         finished: bool,
         kv_transfer_params: Optional[dict[str, Any]] = None,
+        num_cached_tokens: int = 0,
     ) -> RequestOutput:
 
         if self.output_kind == RequestOutputKind.DELTA:
@@ -193,6 +195,7 @@ def _new_request_output(
             outputs=outputs,
             finished=finished,
             kv_transfer_params=kv_transfer_params,
+            num_cached_tokens=num_cached_tokens,
         )
 
     def _new_completion_output(
@@ -340,7 +343,7 @@ def process_outputs(
             finish_reason = engine_core_output.finish_reason
             stop_reason = engine_core_output.stop_reason
             kv_transfer_params = engine_core_output.kv_transfer_params
-
+            num_cached_tokens = engine_core_output.num_cached_tokens
             req_state.is_prefilling = False
 
             # 2) Detokenize the token ids into text and perform stop checks.
@@ -356,7 +359,7 @@ def process_outputs(
             # 4) Create and handle RequestOutput objects.
             if request_output := req_state.make_request_output(
                     new_token_ids, finish_reason, stop_reason,
-                    kv_transfer_params):
+                    kv_transfer_params, num_cached_tokens):
                 if req_state.queue is not None:
                     # AsyncLLM: put into queue for handling by generate().
                     req_state.queue.put(request_output)
diff --git a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py
index 74b226b45424..eb5f9d4bfe00 100644
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -38,7 +38,7 @@
 POLLING_TIMEOUT_MS = 5000
 POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000
 
-EXECUTE_MODEL_TIMEOUT_S = 40
+EXECUTE_MODEL_TIMEOUT_S = 300
 
 
 class MultiprocExecutor(Executor):
@@ -50,6 +50,7 @@ def _init_executor(self) -> None:
         self.is_failed = False
         self.shutdown_event = threading.Event()
         self.failure_callback: Optional[FailureCallback] = None
+        self.io_thread_pool: Optional[ThreadPoolExecutor] = None
 
         self.world_size = self.parallel_config.world_size
         tensor_parallel_size = self.parallel_config.tensor_parallel_size
@@ -107,7 +108,6 @@ def _init_executor(self) -> None:
 
         # For pipeline parallel, we use a thread pool for asynchronous
         # execute_model.
-        self.io_thread_pool: Optional[ThreadPoolExecutor] = None
         if self.max_concurrent_batches > 1:
             # Note: must use only 1 IO thread to keep dequeue sequence
             # from the response queue
diff --git a/vllm/v1/metrics/loggers.py b/vllm/v1/metrics/loggers.py
index 2b75a3a2ecbd..3dc2f77444f6 100644
--- a/vllm/v1/metrics/loggers.py
+++ b/vllm/v1/metrics/loggers.py
@@ -200,24 +200,24 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
         # Counters
         #
         self.counter_num_preempted_reqs = self._counter_cls(
-            name="vllm:num_preemptions_total",
+            name="vllm:num_preemptions",
             documentation="Cumulative number of preemption from the engine.",
             labelnames=labelnames).labels(*labelvalues)
 
         self.counter_prompt_tokens = self._counter_cls(
-            name="vllm:prompt_tokens_total",
+            name="vllm:prompt_tokens",
             documentation="Number of prefill tokens processed.",
             labelnames=labelnames).labels(*labelvalues)
 
         self.counter_generation_tokens = self._counter_cls(
-            name="vllm:generation_tokens_total",
+            name="vllm:generation_tokens",
             documentation="Number of generation tokens processed.",
             labelnames=labelnames).labels(*labelvalues)
 
         self.counter_request_success: dict[FinishReason,
                                            prometheus_client.Counter] = {}
         counter_request_success_base = self._counter_cls(
-            name="vllm:request_success_total",
+            name="vllm:request_success",
             documentation="Count of successfully processed requests.",
             labelnames=labelnames + ["finished_reason"])
         for reason in FinishReason:
diff --git a/vllm/v1/metrics/reader.py b/vllm/v1/metrics/reader.py
new file mode 100644
index 000000000000..5ab78129a009
--- /dev/null
+++ b/vllm/v1/metrics/reader.py
@@ -0,0 +1,245 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from dataclasses import dataclass
+from typing import Optional
+
+from prometheus_client import REGISTRY
+from prometheus_client import Metric as PromMetric
+from prometheus_client.samples import Sample
+
+
+@dataclass
+class Metric:
+    """A base class for prometheus metrics.
+
+    Each metric may be associated with key=value labels, and
+    in some cases a single vLLM instance may have multiple
+    metrics with the same name but different sets of labels.
+    """
+    name: str
+    labels: dict[str, str]
+
+
+@dataclass
+class Counter(Metric):
+    """A monotonically increasing integer counter."""
+    value: int
+
+
+@dataclass
+class Vector(Metric):
+    """An ordered array of integer counters.
+
+    This type - which doesn't exist in Prometheus - models one very
+    specific metric, vllm:spec_decode_num_accepted_tokens_per_pos.
+    """
+    values: list[int]
+
+
+@dataclass
+class Gauge(Metric):
+    """A numerical value that can go up or down."""
+    value: float
+
+
+@dataclass
+class Histogram(Metric):
+    """Observations recorded in configurable buckets.
+
+    Buckets are represented by a dictionary. The key is
+    the upper limit of the bucket, and the value is the
+    observed count in that bucket. A '+Inf' key always
+    exists.
+
+    The count property is the total count across all
+    buckets, identical to the count of the '+Inf' bucket.
+
+    The sum property is the total sum of all observed
+    values.
+    """
+    count: int
+    sum: float
+    buckets: dict[str, int]
+
+
+def get_metrics_snapshot() -> list[Metric]:
+    """An API for accessing in-memory Prometheus metrics.
+
+    Example:
+        >>> for metric in llm.get_metrics():
+        ...     if isinstance(metric, Counter):
+        ...         print(f"{metric} = {metric.value}")
+        ...     elif isinstance(metric, Gauge):
+        ...         print(f"{metric} = {metric.value}")
+        ...     elif isinstance(metric, Histogram):
+        ...         print(f"{metric}")
+        ...         print(f"    sum = {metric.sum}")
+        ...         print(f"    count = {metric.count}")
+        ...         for bucket_le, value in metric.buckets.items():
+        ...             print(f"    {bucket_le} = {value}")
+    """
+    collected: list[Metric] = []
+    for metric in REGISTRY.collect():
+        if not metric.name.startswith("vllm:"):
+            continue
+        if metric.type == "gauge":
+            samples = _get_samples(metric)
+            for s in samples:
+                collected.append(
+                    Gauge(name=metric.name, labels=s.labels, value=s.value))
+        elif metric.type == "counter":
+            samples = _get_samples(metric, "_total")
+            if metric.name == "vllm:spec_decode_num_accepted_tokens_per_pos":
+                #
+                # Special case: vllm:spec_decode_num_accepted_tokens_per_pos.
+                #
+                # This metric is a vector of counters - for each spec
+                # decoding token position, we observe the number of
+                # accepted tokens using a Counter labeled with 'position'.
+                # We convert these into a vector of integer values.
+                #
+                for labels, values in _digest_num_accepted_by_pos_samples(
+                        samples):
+                    collected.append(
+                        Vector(name=metric.name, labels=labels, values=values))
+            else:
+                for s in samples:
+                    collected.append(
+                        Counter(name=metric.name,
+                                labels=s.labels,
+                                value=int(s.value)))
+
+        elif metric.type == "histogram":
+            #
+            # A histogram has a number of '_bucket' samples where
+            # the 'le' label represents the upper limit of the bucket.
+            # We convert these bucketized values into a dict of values
+            # indexed by the value of the 'le' label. The 'le=+Inf'
+            # label is a special case, catching all values observed.
+            #
+            bucket_samples = _get_samples(metric, "_bucket")
+            count_samples = _get_samples(metric, "_count")
+            sum_samples = _get_samples(metric, "_sum")
+            for labels, buckets, count_value, sum_value in _digest_histogram(
+                    bucket_samples, count_samples, sum_samples):
+                collected.append(
+                    Histogram(name=metric.name,
+                              labels=labels,
+                              buckets=buckets,
+                              count=count_value,
+                              sum=sum_value))
+        else:
+            raise AssertionError(f"Unknown metric type {metric.type}")
+
+    return collected
+
+
+def _get_samples(metric: PromMetric,
+                 suffix: Optional[str] = None) -> list[Sample]:
+    name = (metric.name + suffix) if suffix is not None else metric.name
+    return [s for s in metric.samples if s.name == name]
+
+
+def _strip_label(labels: dict[str, str], key_to_remove: str) -> dict[str, str]:
+    labels_copy = labels.copy()
+    labels_copy.pop(key_to_remove)
+    return labels_copy
+
+
+def _digest_histogram(
+    bucket_samples: list[Sample], count_samples: list[Sample],
+    sum_samples: list[Sample]
+) -> list[tuple[dict[str, str], dict[str, int], int, float]]:
+    #
+    # In the case of DP, we have an indigestible
+    # per-bucket-per-engine count as a list of labelled
+    # samples, along with count and sum samples
+    #
+    # bucket_samples (in):
+    #   labels = {bucket: 100, idx: 0}, value = 2
+    #   labels = {bucket: 200, idx: 0}, value = 4
+    #   labels = {bucket: Inf, idx: 0}, value = 10
+    #   labels = {bucket: 100, idx: 1}, value = 1
+    #   labels = {bucket: 200, idx: 1}, value = 5
+    #   labels = {bucket: Inf, idx: 1}, value = 7
+    # count_samples (in):
+    #   labels = {idx: 0}, value = 10
+    #   labels = {idx: 1}, value = 7
+    # sum_samples (in):
+    #   labels = {idx: 0}, value = 2000
+    #   labels = {idx: 1}, value = 1200
+    #
+    # output: [
+    #   {idx: 0}, {"100": 2, "200": 4, "Inf": 10}, 10, 2000
+    #   {idx: 1}, {"100": 1, "200": 5, "Inf": 7},   7, 1200
+    # ]
+    buckets_by_labels: dict[frozenset[tuple[str, str]], dict[str, int]] = {}
+    for s in bucket_samples:
+        bucket = s.labels["le"]
+        labels_key = frozenset(_strip_label(s.labels, "le").items())
+        if labels_key not in buckets_by_labels:
+            buckets_by_labels[labels_key] = {}
+        buckets_by_labels[labels_key][bucket] = int(s.value)
+
+    counts_by_labels: dict[frozenset[tuple[str, str]], int] = {}
+    for s in count_samples:
+        labels_key = frozenset(s.labels.items())
+        counts_by_labels[labels_key] = int(s.value)
+
+    sums_by_labels: dict[frozenset[tuple[str, str]], float] = {}
+    for s in sum_samples:
+        labels_key = frozenset(s.labels.items())
+        sums_by_labels[labels_key] = s.value
+
+    assert set(buckets_by_labels.keys()) == set(
+        counts_by_labels.keys()) == set(sums_by_labels.keys())
+
+    output = []
+    label_keys = list(buckets_by_labels.keys())
+    for k in label_keys:
+        labels = dict(k)
+        output.append((labels, buckets_by_labels[k], counts_by_labels[k],
+                       sums_by_labels[k]))
+    return output
+
+
+def _digest_num_accepted_by_pos_samples(
+        samples: list[Sample]) -> list[tuple[dict[str, str], list[int]]]:
+    #
+    # In the case of DP, we have an indigestible
+    # per-position-per-engine count as a list of
+    # labelled samples
+    #
+    # samples (in):
+    #   labels = {pos: 0, idx: 0}, value = 10
+    #   labels = {pos: 1, idx: 0}, value = 7
+    #   labels = {pos: 2, idx: 0}, value = 2
+    #   labels = {pos: 0, idx: 1}, value = 5
+    #   labels = {pos: 1, idx: 1}, value = 3
+    #   labels = {pos: 2, idx: 1}, value = 1
+    #
+    # output: [
+    #   {idx: 0}, [10, 7, 2]
+    #   {idx: 1}, [5, 3, 1]
+    # ]
+    #
+    max_pos = 0
+    values_by_labels: dict[frozenset[tuple[str, str]], dict[int, int]] = {}
+
+    for s in samples:
+        position = int(s.labels["position"])
+        max_pos = max(max_pos, position)
+
+        labels_key = frozenset(_strip_label(s.labels, "position").items())
+        if labels_key not in values_by_labels:
+            values_by_labels[labels_key] = {}
+        values_by_labels[labels_key][position] = int(s.value)
+
+    output = []
+    for labels_key, values_by_position in values_by_labels.items():
+        labels = dict(labels_key)
+        values = [0] * (max_pos + 1)
+        for pos, val in values_by_position.items():
+            values[pos] = val
+        output.append((labels, values))
+    return output
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index d1cdd2c52750..b4c84507532a 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -77,6 +77,10 @@ def __init__(
         self.output_token_ids = ConstantList(self._output_token_ids)
         self.all_token_ids = ConstantList(self._all_token_ids)
 
+        # State
+        # The number of tokens with prefix cache hits.
+        self.num_cached_tokens = -1
+
     @classmethod
     def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request":
         if request.mm_inputs is not None:
diff --git a/vllm/v1/sample/ops/topk_topp_sampler.py b/vllm/v1/sample/ops/topk_topp_sampler.py
index 5d8b3f423b02..4a5fbb10d408 100644
--- a/vllm/v1/sample/ops/topk_topp_sampler.py
+++ b/vllm/v1/sample/ops/topk_topp_sampler.py
@@ -89,18 +89,18 @@ def forward_cuda(
         p: Optional[torch.Tensor],
     ) -> torch.Tensor:
         """More optimized implementation for top-k and top-p sampling."""
-        probs = logits.softmax(dim=-1, dtype=torch.float32)
         if k is None and p is None:
             # We prefer `random_sample` over `flashinfer_sample` when sorting is
             # not needed. This is because `random_sample` does not require
             # CPU-GPU synchronization while `flashinfer_sample` does.
+            probs = logits.softmax(dim=-1, dtype=torch.float32)
             return random_sample(probs, generators)
         if generators:
             logger.warning("FlashInfer 0.2.3+ does not support "
                            "per-request generators. Falling back to "
                            "PyTorch-native implementation.")
             return self.forward_native(logits, generators, k, p)
-        return flashinfer_sample(probs, k, p, generators)
+        return flashinfer_sample(logits, k, p, generators)
 
     def forward_tpu(
         self,
@@ -254,17 +254,17 @@ def random_sample(
 
 
 def flashinfer_sample(
-    probs: torch.Tensor,
+    logits: torch.Tensor,
     k: Optional[torch.Tensor],
     p: Optional[torch.Tensor],
     generators: dict[int, torch.Generator],
 ) -> torch.Tensor:
-    """Sample from the probabilities using FlashInfer.
+    """Sample from the logits using FlashInfer.
 
     Statistically, this function is equivalent to the `random_sample` function.
     However, this function is faster because it avoids sorting the logits tensor
     via rejection sampling.
-    
+
     NOTE: The outputs of this function do not necessarily match the outputs of
     the `random_sample` function. It only guarantees that the outputs are
     statistically equivalent.
@@ -274,18 +274,19 @@ def flashinfer_sample(
     the synchronization overhead.
     """
     assert not (k is None and p is None)
-
     if k is None:
         # Top-p only.
+        probs = logits.softmax(dim=-1, dtype=torch.float32)
         next_token_ids = flashinfer.sampling.top_p_sampling_from_probs(
             probs, p, deterministic=True)
     elif p is None:
         # Top-k only.
+        probs = logits.softmax(dim=-1, dtype=torch.float32)
         next_token_ids = flashinfer.sampling.top_k_sampling_from_probs(
             probs, k, deterministic=True)
     else:
         # Both top-k and top-p.
-        next_token_ids = (flashinfer.sampling.top_k_top_p_sampling_from_probs(
-            probs, k, p, deterministic=True))
+        next_token_ids = flashinfer.sampling.top_k_top_p_sampling_from_logits(
+            logits, k, p, deterministic=True)
 
     return next_token_ids.view(-1)
diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py
index 5b84bc1f5ec3..971b06758c21 100644
--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -4,17 +4,17 @@
 
 from vllm.attention.layer import Attention
 from vllm.config import (CompilationLevel, VllmConfig,
-                         get_layers_from_vllm_config, set_current_vllm_config)
+                         get_layers_from_vllm_config)
 from vllm.distributed.parallel_state import get_pp_group
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
-from vllm.model_executor.model_loader import get_model_loader
-from vllm.model_executor.model_loader.utils import set_default_torch_dtype
-from vllm.model_executor.models import ModelRegistry
+from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
-from vllm.triton_utils import tl, triton
-from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
+from vllm.v1.attention.backends.flash_attn import (CommonAttentionMetadata,
+                                                   FlashAttentionMetadata)
+from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.spec_decode.utils import prepare_eagle_input_kernel
 
 logger = init_logger(__name__)
 
@@ -27,12 +27,15 @@ def __init__(
         self,
         vllm_config: VllmConfig,
         device: torch.device,
+        runner=None,
     ):
         self.vllm_config = vllm_config
         self.speculative_config = vllm_config.speculative_config
         self.draft_model_config = self.speculative_config.draft_model_config
         self.method = self.speculative_config.method
 
+        self.runner = runner
+
         self.dtype = vllm_config.model_config.dtype
         self.max_model_len = vllm_config.model_config.max_model_len
         self.block_size = vllm_config.cache_config.block_size
@@ -108,24 +111,51 @@ def propose(
         # FA requires seq_len to have dtype int32.
         seq_lens = (target_positions[last_token_indices] + 1).int()
 
-        # FIXME(woosuk): The below two ops cause synchronization. Optimize.
-        max_seq_len = seq_lens.max().item()
-        max_num_tokens = (cu_num_tokens[1:] - cu_num_tokens[:-1]).max().item()
-        attn_metadata = FlashAttentionMetadata(
-            num_actual_tokens=num_tokens,
-            max_query_len=max_num_tokens,
-            query_start_loc=cu_num_tokens,
-            max_seq_len=max_seq_len,
-            seq_lens=seq_lens,
-            block_table=block_table,
-            slot_mapping=target_slot_mapping,
-            # TODO(woosuk): Support cascade attention.
-            use_cascade=False,
-            common_prefix_len=0,
-            cu_prefix_query_lens=None,
-            prefix_kv_lens=None,
-            suffix_kv_lens=None,
-        )
+        if self.method in ["eagle", "eagle3"]:
+            # FIXME(woosuk): The below two ops cause synchronization. Optimize.
+            max_seq_len = seq_lens.max().item()
+            max_num_tokens = (cu_num_tokens[1:] -
+                              cu_num_tokens[:-1]).max().item()
+            attn_metadata = FlashAttentionMetadata(
+                num_actual_tokens=num_tokens,
+                max_query_len=max_num_tokens,
+                query_start_loc=cu_num_tokens,
+                max_seq_len=max_seq_len,
+                seq_lens=seq_lens,
+                block_table=block_table,
+                slot_mapping=target_slot_mapping,
+                # TODO(woosuk): Support cascade attention.
+                use_cascade=False,
+                common_prefix_len=0,
+                cu_prefix_query_lens=None,
+                prefix_kv_lens=None,
+                suffix_kv_lens=None,
+            )
+        elif self.method == "deepseek_mtp":
+            query_lens = cu_num_tokens[1:] - cu_num_tokens[:-1]
+            max_query_len = query_lens.max().item()
+
+            common_attn_metadata = CommonAttentionMetadata(
+                query_start_loc=cu_num_tokens, seq_lens=seq_lens)
+
+            assert self.runner is not None
+
+            # FIXME: need to consider multiple kv_cache_groups
+            attn_metadata = self.runner.attn_metadata_builder.build(
+                num_reqs=batch_size,
+                num_actual_tokens=num_tokens,
+                max_query_len=max_query_len,
+                common_prefix_len=0,
+                common_attn_metadata=common_attn_metadata,
+            )
+        else:
+            raise ValueError(f"Unsupported method: {self.method}")
+
+        # At this moment, we assume all eagle layers belong to the same KV
+        # cache group, thus using the same attention metadata.
+        per_layer_attn_metadata = {}
+        for layer_name in self.attn_layer_names:
+            per_layer_attn_metadata[layer_name] = attn_metadata
         if self.use_cuda_graph and \
             num_tokens <= self.cudagraph_batch_sizes[-1]:
             num_input_tokens = self.vllm_config.pad_for_cudagraph(num_tokens)
@@ -135,14 +165,18 @@ def propose(
         self.positions[:num_tokens] = target_positions
         self.hidden_states[:num_tokens] = target_hidden_states
 
-        with set_forward_context(attn_metadata,
+        with set_forward_context(per_layer_attn_metadata,
                                  self.vllm_config,
                                  num_tokens=num_input_tokens):
-            last_hidden_states, hidden_states = self.model(
-                input_ids=self.input_ids[:num_input_tokens],
-                positions=self.positions[:num_input_tokens],
-                hidden_states=self.hidden_states[:num_input_tokens],
+            ret_hidden_states = self.model(
+                self.input_ids[:num_input_tokens],
+                self.positions[:num_input_tokens],
+                self.hidden_states[:num_input_tokens],
             )
+            if self.method == "deepseek_mtp":
+                last_hidden_states = ret_hidden_states
+            else:
+                last_hidden_states, hidden_states = ret_hidden_states
         sample_hidden_states = last_hidden_states[last_token_indices]
         logits = self.model.compute_logits(sample_hidden_states, None)
         draft_token_ids = logits.argmax(dim=-1)
@@ -152,6 +186,10 @@ def propose(
             # [batch_size, 1]
             return draft_token_ids.view(-1, 1)
 
+        # TODO: Currently, MTP module released by deepseek only has
+        # one layer. Adapt this code to support multiple layers once
+        # there's a multi-layer MTP module.
+
         # Generate the remaining draft tokens.
         draft_token_ids_list = [draft_token_ids]
 
@@ -213,13 +251,13 @@ def propose(
             self.hidden_states[:batch_size] = hidden_states
 
             # Run the model.
-            with set_forward_context(attn_metadata,
+            with set_forward_context(per_layer_attn_metadata,
                                      self.vllm_config,
                                      num_tokens=input_batch_size):
                 last_hidden_states, hidden_states = self.model(
-                    input_ids=self.input_ids[:input_batch_size],
-                    positions=self.positions[:input_batch_size],
-                    hidden_states=self.hidden_states[:input_batch_size],
+                    self.input_ids[:input_batch_size],
+                    self.positions[:input_batch_size],
+                    self.hidden_states[:input_batch_size],
                 )
             hidden_states = hidden_states[:batch_size]
             logits = self.model.compute_logits(last_hidden_states[:batch_size],
@@ -239,6 +277,7 @@ def prepare_inputs(
         cu_target_query_lens: torch.Tensor,
         # [batch_size]
         num_rejected_tokens: torch.Tensor,
+        num_tokens: int,
     ) -> tuple[torch.Tensor, torch.Tensor]:
         # cu_target_query_lens: [0, a, a + b, a + b + c]
         # num_rejected_tokens: [n1, n2, n3]
@@ -256,21 +295,16 @@ def prepare_inputs(
 
         # [a - n1, b - n2, c - n3] ->
         # [0, a - n1, a + b - n1 - n2, a + b + c - n1 - n2 - n3]
-        cu_num_tokens = torch.empty_like(cu_target_query_lens)
+        cu_num_tokens = torch.zeros_like(cu_target_query_lens)
         torch.cumsum(num_tokens_per_req, dim=0, out=cu_num_tokens[1:])
-        cu_num_tokens[0] = 0
-
-        # FIXME(woosuk): Avoid synchronization.
-        num_tokens = cu_num_tokens[-1].item()
         token_indices = torch.empty(
             num_tokens,
             dtype=torch.int32,
-            device=cu_num_tokens.device,
+            device=cu_target_query_lens.device,
         )
-
         batch_size = num_rejected_tokens.shape[0]
         BLOCK_SIZE = 1024
-        prepare_input_kernel[(batch_size, )](
+        prepare_eagle_input_kernel[(batch_size, )](
             token_indices,
             cu_target_query_lens,
             cu_num_tokens,
@@ -279,48 +313,28 @@ def prepare_inputs(
         return cu_num_tokens, token_indices
 
     def load_model(self, target_model: nn.Module) -> None:
-        loader = get_model_loader(self.vllm_config.load_config)
-        target_layer_num = self.vllm_config.model_config.get_num_layers(
-            self.vllm_config.parallel_config)
+        draft_model_config = \
+            self.vllm_config.speculative_config.draft_model_config
         target_attn_layer_names = set(
             get_layers_from_vllm_config(self.vllm_config, Attention).keys())
 
-        draft_model_config = \
-            self.vllm_config.speculative_config.draft_model_config
-        # FIXME(lily): This does not handle with distributed inference.
-        target_device = self.vllm_config.device_config.device
-        # We need to set the vllm_config here to register attention
-        # layers in the forward context.
-        with set_default_torch_dtype(
-                draft_model_config.dtype), set_current_vllm_config(
-                    self.vllm_config):
-            draft_model_cls, arch = ModelRegistry.resolve_model_cls(
-                draft_model_config.architectures)
-            self.model = draft_model_cls(
-                vllm_config=self.vllm_config,
-                start_layer_id=target_layer_num).to(target_device)
+        self.model = get_model(vllm_config=self.vllm_config,
+                               model_config=draft_model_config)
 
         draft_attn_layer_names = (
             get_layers_from_vllm_config(self.vllm_config, Attention).keys() -
             target_attn_layer_names)
-        assert len(draft_attn_layer_names) == 1
-        self.attn_layer_name = next(iter(draft_attn_layer_names))
-        loaded_weights = self.model.load_weights(
-            loader.get_all_weights(draft_model_config, self.model))
+
+        self.attn_layer_names = list(draft_attn_layer_names)
 
         # share embed_tokens with the target model if needed
         if get_pp_group().world_size == 1:
-            assert "model.embed_tokens.weight" not in loaded_weights, \
-            "For PP = 1, Eagle draft should share embed with target model"
             logger.info(
                 "The EAGLE head shares the same vocab embedding" \
                 " with the target model."
             )
             self.model.model.embed_tokens = target_model.model.embed_tokens
         else:
-            assert "model.embed_tokens.weight" in loaded_weights, \
-            "For PP > 1, Eagle draft checkpoint should its own copy of "
-            " the model.embed_tokens.weight"
             logger.info(
                 "Since PP > 1, the EAGLE head loaded its own vocab embedding" \
                 " weights instead of sharing them with the target model."
@@ -342,11 +356,30 @@ def dummy_run(
         with set_forward_context(None, self.vllm_config,
                                  num_tokens=num_tokens):
             self.model(
-                input_ids=self.input_ids[:num_tokens],
-                positions=self.positions[:num_tokens],
-                hidden_states=self.hidden_states[:num_tokens],
+                self.input_ids[:num_tokens],
+                self.positions[:num_tokens],
+                self.hidden_states[:num_tokens],
             )
 
+    def validate_same_kv_cache_group(self,
+                                     kv_cache_config: KVCacheConfig) -> None:
+        """
+        Validate that all eagle layers belong to the same KVCacheGroup.
+
+        This is required because all eagle layers currently share one
+        AttentionMetadata; multiple AttentionMetadata may be supported
+        in the future.
+        """
+        # Map every layer name to the index of its KV cache group.
+        layer_to_group: dict[str, int] = {}
+        for group_id, group in enumerate(kv_cache_config.kv_cache_groups):
+            for layer_name in group.layer_names:
+                layer_to_group[layer_name] = group_id
+        # Distinct groups that the eagle attention layers fall into.
+        eagle_groups = {layer_to_group[n] for n in self.attn_layer_names}
+        assert len(eagle_groups) == 1, (
+            "All eagle layers should belong to the same kv cache group")
+
 
 # NOTE(woosuk): Currently, the below code is not used and we always use argmax
 # to sample the draft tokens. We will use this after we find a way to manage
@@ -389,29 +422,3 @@ def compute_probs_and_sample_next_token(
             next_token_ids,
         )
     return next_token_ids, probs
-
-
-@triton.jit
-def prepare_input_kernel(
-    out_ptr,
-    cu_query_lens_ptr,
-    cu_num_tokens_ptr,
-    BLOCK_SIZE: tl.constexpr,
-):
-    pid = tl.program_id(0)
-
-    # [start_pos, end_pos)
-    start_pos = tl.load(cu_num_tokens_ptr + pid)
-    end_pos = tl.load(cu_num_tokens_ptr + pid + 1)
-    num_tokens = end_pos - start_pos
-
-    index_start = tl.load(cu_query_lens_ptr + pid)
-
-    num_blocks = tl.cdiv(num_tokens, BLOCK_SIZE)
-    for i in tl.range(num_blocks):
-        offset = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
-        tl.store(
-            out_ptr + start_pos + offset,
-            index_start + offset,
-            mask=offset < num_tokens,
-        )
diff --git a/vllm/v1/spec_decode/medusa.py b/vllm/v1/spec_decode/medusa.py
index 14bc9c9e0d1a..fdac2ef64c3f 100644
--- a/vllm/v1/spec_decode/medusa.py
+++ b/vllm/v1/spec_decode/medusa.py
@@ -3,12 +3,10 @@
 import torch
 import torch.nn as nn
 
-from vllm.config import VllmConfig, set_current_vllm_config
+from vllm.config import VllmConfig
 from vllm.forward_context import set_forward_context
 from vllm.logger import init_logger
-from vllm.model_executor.model_loader import get_model_loader
-from vllm.model_executor.model_loader.utils import set_default_torch_dtype
-from vllm.model_executor.models.medusa import Medusa
+from vllm.model_executor.model_loader import get_model
 from vllm.v1.sample.metadata import SamplingMetadata
 
 # Initialize logger
@@ -49,20 +47,9 @@ def propose(
         return [list(row) for row in zip(*draft_tokens)]
 
     def load_model(self, target_model: nn.Module) -> None:
-        # Get model loader and config
-        loader = get_model_loader(self.vllm_config.load_config)
-        draft_config = self.vllm_config.speculative_config.draft_model_config
-
-        # Load model with proper dtype and config
-        with set_default_torch_dtype(draft_config.dtype), \
-                set_current_vllm_config(self.vllm_config):
-            self.model = Medusa(
-                vllm_config=self.vllm_config.speculative_config).to(
-                    self.device)
-
-        # Load model weights
-        weights = loader.get_all_weights(draft_config, self.model)
-        self.model.load_weights(weights)
+        draft_config = self.vllm_config.speculative_config.draft_model_config
+        self.model = get_model(vllm_config=self.vllm_config,
+                               model_config=draft_config)
 
     @torch.inference_mode()
     def dummy_run(self, num_tokens: int) -> None:
diff --git a/vllm/v1/spec_decode/metrics.py b/vllm/v1/spec_decode/metrics.py
index 899aa9200e85..36091bef2895 100644
--- a/vllm/v1/spec_decode/metrics.py
+++ b/vllm/v1/spec_decode/metrics.py
@@ -134,17 +134,17 @@ def __init__(
 
         self.counter_spec_decode_num_drafts = \
             self._counter_cls(
-                name="vllm:spec_decode_num_drafts_total",
+                name="vllm:spec_decode_num_drafts",
                 documentation="Number of spec decoding drafts.",
                 labelnames=labelnames).labels(*labelvalues)
         self.counter_spec_decode_num_draft_tokens = \
             self._counter_cls(
-                name="vllm:spec_decode_num_draft_tokens_total",
+                name="vllm:spec_decode_num_draft_tokens",
                 documentation="Number of draft tokens.",
                 labelnames=labelnames,).labels(*labelvalues)
         self.counter_spec_decode_num_accepted_tokens = \
             self._counter_cls(
-                name="vllm:spec_decode_num_accepted_tokens_total",
+                name="vllm:spec_decode_num_accepted_tokens",
                 documentation="Number of accepted tokens.",
                 labelnames=labelnames).labels(*labelvalues)
 
diff --git a/vllm/v1/spec_decode/utils.py b/vllm/v1/spec_decode/utils.py
index ce81a40ee3ae..334258e7f87a 100644
--- a/vllm/v1/spec_decode/utils.py
+++ b/vllm/v1/spec_decode/utils.py
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+from vllm.triton_utils import tl, triton
 from vllm.v1.worker.gpu_input_batch import InputBatch
 
 
@@ -16,3 +17,29 @@ def is_spec_decode_supported(req_id: str, input_batch: InputBatch) -> bool:
         return False
 
     return True
+
+
+@triton.jit
+def prepare_eagle_input_kernel(
+    out_ptr,
+    cu_query_lens_ptr,
+    cu_num_tokens_ptr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+
+    # [start_pos, end_pos)
+    start_pos = tl.load(cu_num_tokens_ptr + pid)
+    end_pos = tl.load(cu_num_tokens_ptr + pid + 1)
+    num_tokens = end_pos - start_pos
+
+    index_start = tl.load(cu_query_lens_ptr + pid)
+
+    num_blocks = tl.cdiv(num_tokens, BLOCK_SIZE)
+    for i in tl.range(num_blocks):
+        offset = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+        tl.store(
+            out_ptr + start_pos + offset,
+            index_start + offset,
+            mask=offset < num_tokens,
+        )
diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py
index f33f4972e103..111e92dc0990 100644
--- a/vllm/v1/structured_output/utils.py
+++ b/vllm/v1/structured_output/utils.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-import re
+import regex as re
 
 
 def grammar_is_likely_lark(grammar_str: str) -> bool:
diff --git a/vllm/v1/worker/block_table.py b/vllm/v1/worker/block_table.py
index 0c3341691509..576086ebeb7f 100644
--- a/vllm/v1/worker/block_table.py
+++ b/vllm/v1/worker/block_table.py
@@ -5,7 +5,6 @@
 
 from vllm.logger import init_logger
 from vllm.utils import cdiv
-from vllm.v1.kv_cache_interface import KVCacheConfig
 
 logger = init_logger(__name__)
 
@@ -105,15 +104,10 @@ class MultiGroupBlockTable:
 
     def __init__(self, max_num_reqs: int, max_model_len: int,
                  max_num_batched_tokens: int, pin_memory: bool,
-                 device: torch.device, kv_cache_config: KVCacheConfig) -> None:
-        max_num_blocks_per_req = [
-            cdiv(max_model_len, g.kv_cache_spec.block_size)
-            for g in kv_cache_config.kv_cache_groups
-        ]
+                 device: torch.device, block_size: int) -> None:
         self.block_tables = [
-            BlockTable(max_num_reqs, max_num_blocks_per_req[i],
+            BlockTable(max_num_reqs, cdiv(max_model_len, block_size),
                        max_num_batched_tokens, pin_memory, device)
-            for i in range(len(kv_cache_config.kv_cache_groups))
         ]
 
     def append_row(self, block_ids: list[list[int]], row_idx: int) -> None:
diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index 570de9bddd29..b3e65917d3cc 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -11,7 +11,6 @@
 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.utils import swap_dict_values
-from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.outputs import LogprobsTensors
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.utils import copy_slice
@@ -63,7 +62,7 @@ def __init__(
         device: torch.device,
         pin_memory: bool,
         vocab_size: int,
-        kv_cache_config: KVCacheConfig,
+        block_size: int,
     ):
         self.max_num_reqs = max_num_reqs
         self.max_model_len = max_model_len
@@ -105,7 +104,7 @@ def __init__(
             max_num_batched_tokens=max_num_batched_tokens,
             pin_memory=pin_memory,
             device=device,
-            kv_cache_config=kv_cache_config,
+            block_size=block_size,
         )
 
         # Sampling-related.
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 201796c96ee5..910c0e80bb31 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -27,15 +27,15 @@
 from vllm.forward_context import get_forward_context, set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
-from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.model_loader import TensorizerLoader, get_model
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
 from vllm.multimodal.utils import group_mm_inputs_by_modality
 from vllm.sampling_params import SamplingType
 from vllm.sequence import IntermediateTensors
 from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler,
-                        GiB_bytes, LazyLoader, cdiv, check_use_alibi,
-                        is_pin_memory_available)
+                        GiB_bytes, LazyLoader, async_tensor_h2d, cdiv,
+                        check_use_alibi, is_pin_memory_available)
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.attention.backends.utils import CommonAttentionMetadata
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
@@ -63,6 +63,7 @@
 if TYPE_CHECKING:
     import xgrammar as xgr
 
+    from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
     from vllm.v1.core.sched.output import SchedulerOutput
 else:
     xgr = LazyLoader("xgr", globals(), "xgrammar")
@@ -150,12 +151,16 @@ def __init__(
         self.use_aux_hidden_state_outputs = False
         if self.speculative_config:
             self.use_spec_decode = True
+
+            # NOTE(Jiayi): currently we put the entire draft model on
+            # the last PP rank. This is not ideal if there are many
+            # layers in the draft model.
             if get_pp_group().is_last_rank:
                 if self.speculative_config.method == "ngram":
                     self.drafter = NgramProposer(self.vllm_config)
                 elif self.speculative_config.use_eagle():
-                    self.drafter = EagleProposer(self.vllm_config,
-                                                 self.device)  # type: ignore
+                    self.drafter = EagleProposer(self.vllm_config, self.device,
+                                                 self)  # type: ignore
                     if self.speculative_config.method == "eagle3":
                         self.use_aux_hidden_state_outputs = True
                 elif self.speculative_config.method == "medusa":
@@ -170,6 +175,16 @@ def __init__(
         # Request states.
         self.requests: dict[str, CachedRequestState] = {}
 
+        self.input_batch = InputBatch(
+            max_num_reqs=self.max_num_reqs,
+            max_model_len=self.max_model_len,
+            max_num_batched_tokens=self.max_num_tokens,
+            device=self.device,
+            pin_memory=self.pin_memory,
+            vocab_size=self.model_config.get_vocab_size(),
+            block_size=self.cache_config.block_size,
+        )
+
         self.use_cuda_graph = (self.vllm_config.compilation_config.level
                                == CompilationLevel.PIECEWISE
                                and not self.model_config.enforce_eager)
@@ -266,7 +281,7 @@ def __init__(
     def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> bool:
         """
         Update the order of requests in the batch based on the attention
-        backend's needs. For example, some attention backends (namely MLA) may 
+        backend's needs. For example, some attention backends (namely MLA) may
         want to separate requests based on if the attention computation will be
         compute-bound or memory-bound.
 
@@ -914,8 +929,11 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
         encoder_outputs = []
         for grouped_mm_inputs in grouped_mm_inputs_list:
             batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
-            batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs,
-                                                           device=self.device)
+            batched_mm_inputs = MultiModalKwargs.as_kwargs(
+                batched_mm_inputs,
+                dtype=self.model_config.dtype,
+                device=self.device,
+            )
 
             # Run the encoder.
             # `curr_group_outputs` is either of the following:
@@ -1348,7 +1366,16 @@ def execute_model(
             next_token_ids = torch.tensor(next_token_ids,
                                           dtype=torch.int32,
                                           device=self.device)
-            eagle_attn_metadata = attn_metadata[self.drafter.attn_layer_name]
+            # At this moment, we assume all eagle layers belong to the same KV
+            # cache group, thus using the same attention metadata.
+            eagle_attn_metadata = attn_metadata[
+                self.drafter.attn_layer_names[0]]
+
+            # NOTE: deepseek_mtp uses MLA which does not have `block_table`
+            if hasattr(eagle_attn_metadata, "block_table"):
+                block_table = eagle_attn_metadata.block_table
+            else:
+                block_table = None
 
             if spec_decode_metadata is None:
                 # input_ids can be None for multimodal models.
@@ -1369,14 +1396,16 @@ def execute_model(
                     n + 1 - len(valid_sampled_token_ids[i]) if n > 0 else 0
                     for i, n in enumerate(num_draft_tokens)
                 ]
-                num_rejected_tokens = torch.tensor(
+                num_rejected_tokens_tensor = async_tensor_h2d(
                     num_rejected_tokens,
                     dtype=torch.int32,
-                    device=self.device,
-                )
+                    target_device=self.device,
+                    pin_memory=True)
+                num_tokens = num_scheduled_tokens - sum(num_rejected_tokens)
                 cu_num_tokens, token_indices = self.drafter.prepare_inputs(
                     eagle_attn_metadata.query_start_loc,
-                    num_rejected_tokens,
+                    num_rejected_tokens_tensor,
+                    num_tokens,
                 )
                 target_token_ids = self.input_ids[token_indices]
                 target_positions = positions[token_indices]
@@ -1387,7 +1416,6 @@ def execute_model(
                     target_hidden_states = hidden_states[token_indices]
                 target_slot_mapping = eagle_attn_metadata.slot_mapping[
                     token_indices]
-
             draft_token_ids = self.drafter.propose(
                 target_token_ids=target_token_ids,
                 target_positions=target_positions,
@@ -1395,7 +1423,7 @@ def execute_model(
                 target_slot_mapping=target_slot_mapping,
                 next_token_ids=next_token_ids,
                 cu_num_tokens=cu_num_tokens,
-                block_table=eagle_attn_metadata.block_table,
+                block_table=block_table,
                 sampling_metadata=sampling_metadata,
             )
             spec_token_ids = draft_token_ids.tolist()
@@ -1523,6 +1551,15 @@ def load_model(self) -> None:
                     time_after_load - time_before_load)
         prepare_communication_buffer_for_model(self.model)
 
+    def save_tensorized_model(
+        self,
+        tensorizer_config: "TensorizerConfig",
+    ) -> None:
+        """Save `self.model` through `TensorizerLoader.save_model`."""
+        TensorizerLoader.save_model(
+            self.model,
+            tensorizer_config=tensorizer_config)
+
     def _get_prompt_logprobs_dict(
         self,
         hidden_states: torch.Tensor,
@@ -1703,8 +1740,7 @@ def _dummy_run(
             else:
                 hidden_states = outputs
 
-            if self.use_spec_decode and \
-                self.speculative_config.method in ('eagle', 'eagle3'):
+            if self.use_spec_decode and self.speculative_config.use_eagle():
                 assert isinstance(self.drafter, EagleProposer)
                 self.drafter.dummy_run(num_tokens)
 
@@ -1716,6 +1752,10 @@ def _dummy_sampler_run(
         self,
         hidden_states: torch.Tensor,
     ) -> torch.Tensor:
+        # The dummy hidden states may contain special values,
+        # like `inf` or `nan`.
+        # To avoid breaking the sampler, we use a random tensor here instead.
+        hidden_states = torch.rand_like(hidden_states)
 
         logits = self.model.compute_logits(hidden_states, None)
         num_reqs = logits.size(0)
@@ -1837,7 +1877,10 @@ def profile_run(self) -> None:
             batched_dummy_mm_inputs = MultiModalKwargs.batch(
                 [dummy_mm_kwargs] * max_num_mm_items)
             batched_dummy_mm_inputs = MultiModalKwargs.as_kwargs(
-                batched_dummy_mm_inputs, device=self.device)
+                batched_dummy_mm_inputs,
+                dtype=self.model_config.dtype,
+                device=self.device,
+            )
 
             # Run multimodal encoder.
             dummy_encoder_outputs = self.model.get_multimodal_embeddings(
@@ -1947,16 +1990,11 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
             kv_cache_config: Configuration for the KV cache, including the KV
             cache size of each layer
         """
+        if len(kv_cache_config.kv_cache_groups) > 1:
+            raise NotImplementedError(
+                "Hybrid models with more than one KV cache type are not "
+                "supported yet.")
         self.kv_cache_config = kv_cache_config
-        self.input_batch = InputBatch(
-            max_num_reqs=self.max_num_reqs,
-            max_model_len=self.max_model_len,
-            max_num_batched_tokens=self.max_num_tokens,
-            device=self.device,
-            pin_memory=self.pin_memory,
-            vocab_size=self.model_config.get_vocab_size(),
-            kv_cache_config=kv_cache_config,
-        )
         self.initialize_attn_backend(kv_cache_config)
 
         kv_caches: dict[str, torch.Tensor] = {}
@@ -1988,6 +2026,12 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
                     # KV cache specs.
                     raise ValueError("Unknown KV cache spec type.")
 
+        if self.speculative_config and self.speculative_config.use_eagle():
+            assert isinstance(self.drafter, EagleProposer)
+            # validate all draft model layers belong to the same kv cache
+            # group
+            self.drafter.validate_same_kv_cache_group(kv_cache_config)
+
         bind_kv_cache(
             kv_caches,
             self.vllm_config.compilation_config.static_forward_context,
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 93129d987940..bce5cbb5f9d0 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -31,6 +31,7 @@
 logger = init_logger(__name__)
 
 if TYPE_CHECKING:
+    from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
     from vllm.v1.core.sched.output import SchedulerOutput
 
 
@@ -171,10 +172,9 @@ def determine_available_memory(self) -> int:
         Then, it calculate the free memory that can be used for KV cache in
         bytes.
 
-        :::{tip}
-        You may limit the usage of GPU memory
-        by adjusting the `gpu_memory_utilization` parameter.
-        :::
+        Tip:
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
         """
         torch.cuda.empty_cache()
         torch.cuda.reset_peak_memory_stats()
@@ -326,6 +326,13 @@ def save_sharded_state(
             max_size=max_size,
         )
 
+    def save_tensorized_model(self,
+                              tensorizer_config: "TensorizerConfig") -> None:
+        """Forward the tensorizer save request to the model runner."""
+        runner = self.model_runner
+        runner.save_tensorized_model(
+            tensorizer_config=tensorizer_config)
+
 
 def init_worker_distributed_environment(
     vllm_config: VllmConfig,
@@ -341,8 +348,7 @@ def init_worker_distributed_environment(
                                  distributed_init_method, local_rank)
 
     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
-                                      parallel_config.pipeline_parallel_size,
-                                      parallel_config.enable_expert_parallel)
+                                      parallel_config.pipeline_parallel_size)
 
     ensure_kv_transfer_initialized(vllm_config)
 
diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py
index 2da99696445e..46bcf64ed0c3 100644
--- a/vllm/v1/worker/tpu_model_runner.py
+++ b/vllm/v1/worker/tpu_model_runner.py
@@ -652,8 +652,11 @@ def _execute_mm_encoder(self, scheduler_output: "SchedulerOutput"):
         encoder_outputs = []
         for grouped_mm_inputs in grouped_mm_inputs_list:
             batched_mm_inputs = MultiModalKwargs.batch(grouped_mm_inputs)
-            batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs,
-                                                           device=self.device)
+            batched_mm_inputs = MultiModalKwargs.as_kwargs(
+                batched_mm_inputs,
+                dtype=self.model_config.dtype,
+                device=self.device,
+            )
 
             # Run the encoder.
             # `curr_group_outputs` is either of the following:
@@ -1261,7 +1264,8 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
             device=self.device,
             pin_memory=self.pin_memory,
             vocab_size=self.model_config.get_vocab_size(),
-            kv_cache_config=kv_cache_config,
+            block_size=kv_cache_config.kv_cache_groups[0].kv_cache_spec.
+            block_size,
         )
         assert self.block_table_cpu.dtype == self.input_batch.block_table[
             0].get_cpu_tensor().dtype
@@ -1434,8 +1438,11 @@ def _get_mm_dummy_batch(self, modality: str,
 
         batched_dummy_mm_inputs = MultiModalKwargs.batch([dummy_mm_kwargs] *
                                                          batch_size)
-        return MultiModalKwargs.as_kwargs(batched_dummy_mm_inputs,
-                                          device=self.device)
+        return MultiModalKwargs.as_kwargs(
+            batched_dummy_mm_inputs,
+            dtype=self.model_config.dtype,
+            device=self.device,
+        )
 
 
 def _get_req_paddings(min_req_size: int, max_req_size: int) -> list[int]:
diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py
index ae3735ab0255..fa4eb30ccd9a 100644
--- a/vllm/v1/worker/tpu_worker.py
+++ b/vllm/v1/worker/tpu_worker.py
@@ -265,8 +265,7 @@ def init_tpu_worker_distributed_environment(
         backend="gloo",
     )
     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
-                                      parallel_config.pipeline_parallel_size,
-                                      parallel_config.enable_expert_parallel)
+                                      parallel_config.pipeline_parallel_size)
 
 
 try:
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 267754036b31..91548a52cfc7 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -10,7 +10,7 @@ def sanity_check_mm_encoder_outputs(
 ) -> None:
     """
     Perform sanity checks for the result of
-    {meth}`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`.
+    [`vllm.model_executor.models.SupportsMultiModal.get_multimodal_embeddings`][].
     """
     assert isinstance(mm_embeddings, (list, tuple, torch.Tensor)), (
         "Expected multimodal embeddings to be a list/tuple of 2D tensors, "
@@ -39,7 +39,7 @@ def scatter_mm_placeholders(
     Scatter the multimodal embeddings into a contiguous tensor that represents
     the placeholder tokens.
 
-    {class}`vllm.multimodal.processing.PromptUpdateDetails.is_embed`.
+    [`vllm.multimodal.processing.PromptUpdateDetails.is_embed`][].
 
     Args:
         embeds: The multimodal embeddings.
@@ -66,7 +66,7 @@ def gather_mm_placeholders(
     """
     Reconstructs the embeddings from the placeholder tokens.
 
-    This is the operation of {func}`scatter_mm_placeholders`.
+    This is the inverse operation of [scatter_mm_placeholders][].
     """
     if is_embed is None:
         return placeholders
diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py
index c2120c035175..82eeeb570d22 100644
--- a/vllm/worker/cpu_enc_dec_model_runner.py
+++ b/vllm/worker/cpu_enc_dec_model_runner.py
@@ -297,8 +297,11 @@ def execute_model(
             model_input.encoder_input_tokens,
             "encoder_positions":
             model_input.encoder_input_positions,
-            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
-                                         device=self.device),
+            **MultiModalKwargs.as_kwargs(
+                model_input.multi_modal_kwargs or {},
+                dtype=self.model_config.dtype,
+                device=self.device,
+            ),
             "intermediate_tensors":
             intermediate_tensors,
         }
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 710ca1a13b0c..fb436a079f87 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -628,7 +628,10 @@ def execute_model(
         multimodal_kwargs = {}
         if model_input.multi_modal_kwargs is not None:
             multimodal_kwargs = MultiModalKwargs.as_kwargs(
-                model_input.multi_modal_kwargs, device=self.device)
+                model_input.multi_modal_kwargs,
+                dtype=self.model_config.dtype,
+                device=self.device,
+            )
         execute_model_kwargs = {}
         if previous_hidden_states is not None:
             execute_model_kwargs.update(
diff --git a/vllm/worker/cpu_pooling_model_runner.py b/vllm/worker/cpu_pooling_model_runner.py
index 1ceb2557c6b3..2a60e51261ad 100644
--- a/vllm/worker/cpu_pooling_model_runner.py
+++ b/vllm/worker/cpu_pooling_model_runner.py
@@ -50,8 +50,11 @@ def execute_model(
             model_input.input_tokens,
             "positions":
             model_input.input_positions,
-            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
-                                         device=self.device),
+            **MultiModalKwargs.as_kwargs(
+                model_input.multi_modal_kwargs or {},
+                dtype=self.model_config.dtype,
+                device=self.device,
+            ),
             **cross_enc_kwargs,
             "intermediate_tensors":
             intermediate_tensors,
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index a92cf1e5a3b3..1436a404335a 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -390,8 +390,7 @@ def init_distributed_environment(self) -> None:
 
         ensure_model_parallel_initialized(
             parallel_config.tensor_parallel_size,
-            parallel_config.pipeline_parallel_size,
-            parallel_config.enable_expert_parallel)
+            parallel_config.pipeline_parallel_size)
 
     def get_cache_block_size_bytes(self) -> int:
         """Return the size in bytes of a single KV cache block.
diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py
index 4864163b0de2..3957e5608524 100644
--- a/vllm/worker/enc_dec_model_runner.py
+++ b/vllm/worker/enc_dec_model_runner.py
@@ -202,9 +202,13 @@ def execute_model(
                 encoder_input_ids=model_input.encoder_input_tokens,
                 encoder_positions=model_input.encoder_input_positions,
                 intermediate_tensors=intermediate_tensors,
-                **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
-                                             device=self.device),
-                **seqlen_agnostic_kwargs)
+                **MultiModalKwargs.as_kwargs(
+                    multi_modal_kwargs,
+                    dtype=self.model_config.dtype,
+                    device=self.device,
+                ),
+                **seqlen_agnostic_kwargs,
+            )
 
         logits = self.model.compute_logits(hidden_or_intermediate_states,
                                            model_input.sampling_metadata)
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 42882992f2da..533fead0e669 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -201,10 +201,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         Then, it calculate the maximum possible number of GPU and CPU blocks
         that can be allocated with the remaining free memory.
 
-        :::{tip}
-        You may limit the usage of GPU memory
-        by adjusting the `gpu_memory_utilization` parameter.
-        :::
+        Tip:
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
         """
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
@@ -416,8 +415,7 @@ def init_worker_distributed_environment(
                                  backend='hccl')
 
     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
-                                      parallel_config.pipeline_parallel_size,
-                                      parallel_config.enable_expert_parallel)
+                                      parallel_config.pipeline_parallel_size)
 
     if torch.distributed.is_initialized():
         torch_world_size = torch.distributed.get_world_size()
@@ -443,8 +441,7 @@ def init_worker_distributed_environment(
     torch.distributed.all_reduce(dummy_tensor_hpu)
     assert dummy_tensor_hpu.item() == parallel_config.world_size
     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
-                                      parallel_config.pipeline_parallel_size,
-                                      parallel_config.enable_expert_parallel)
+                                      parallel_config.pipeline_parallel_size)
 
 
 def raise_if_cache_size_invalid(num_gpu_blocks, block_size, max_model_len,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 8a294de45c81..8c968faa7810 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -23,7 +23,7 @@
 from vllm.attention.backends.utils import CommonAttentionState
 from vllm.config import CompilationLevel, VllmConfig
 from vllm.core.scheduler import SchedulerOutputs
-from vllm.distributed import get_pp_group
+from vllm.distributed import broadcast_tensor_dict, get_pp_group
 from vllm.distributed.kv_transfer import get_kv_transfer_group
 from vllm.distributed.parallel_state import (get_tensor_model_parallel_rank,
                                              graph_capture)
@@ -729,7 +729,10 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
         mm_kwargs, placeholder_maps = MultiModalPlaceholderMap.from_seq_group(
             seq_group_metadata,
             range(positions[0], positions[0] + len(positions)))
-        if not mm_kwargs:
+
+        # M-RoPE needs mrope_positions even for text-only prompts, so skip
+        # the rest only when mm_kwargs is empty AND this is not a prompt.
+        if not mm_kwargs and not inter_data.is_prompt:
             return
 
         inter_data.multi_modal_kwargs = mm_kwargs
@@ -741,12 +744,6 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup,
             video_grid_thw = mm_kwargs.get("video_grid_thw", None)
             audio_feature_lengths = mm_kwargs.get("audio_feature_lengths",
                                                   None)
-            assert (
-                image_grid_thw is not None or video_grid_thw is not None
-                or audio_feature_lengths is not None), (
-                    "mrope embedding type requires multi-modal input mapper "
-                    "returns 'image_grid_thw' or 'video_grid_thw' or "
-                    "'audio_feature_lengths'.")
 
             second_per_grid_ts = mm_kwargs.get("second_per_grid_ts", None)
             use_audio_in_video = mm_kwargs.get("use_audio_in_video", False)
@@ -872,7 +869,7 @@ def build(self) -> ModelInputForGPU:
         """
         # Combine and flatten intermediate data.
         input_tokens = list[int]()
-        inputs_embeds_lst = list[torch.Tensor]()
+        inputs_embeds_list = list[torch.Tensor]()
         token_types = list[int]()
         for inter_data in self.inter_data_list:
             for cur_input_tokens in inter_data.input_tokens:
@@ -880,15 +877,15 @@ def build(self) -> ModelInputForGPU:
             for cur_token_types in inter_data.token_types:
                 token_types.extend(cur_token_types)
             if inter_data.inputs_embeds is not None:
-                inputs_embeds_lst.append(
+                inputs_embeds_list.append(
                     inter_data.inputs_embeds.to(
                         dtype=self.runner.model_config.dtype,
                         device=self.runner.device))
         inputs_embeds: Optional[torch.Tensor]
-        if len(inputs_embeds_lst) == 0:
+        if len(inputs_embeds_list) == 0:
             inputs_embeds = None
         else:
-            inputs_embeds = torch.cat(inputs_embeds_lst, dim=0).to(
+            inputs_embeds = torch.cat(inputs_embeds_list, dim=0).to(
                 dtype=self.runner.model_config.dtype,
                 device=self.runner.device)
             assert len(inputs_embeds) == len(input_tokens)
@@ -1848,8 +1845,11 @@ def execute_model(
                     inputs_embeds=model_input.inputs_embeds,
                     positions=model_input.input_positions,
                     intermediate_tensors=intermediate_tensors,
-                    **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
-                                                 device=self.device),
+                    **MultiModalKwargs.as_kwargs(
+                        multi_modal_kwargs,
+                        dtype=self.model_config.dtype,
+                        device=self.device,
+                    ),
                     **seqlen_agnostic_kwargs,
                     **model_kwargs,
                 )
@@ -1893,50 +1893,60 @@ def execute_model(
         logits = self.model.compute_logits(hidden_or_intermediate_states,
                                            model_input.sampling_metadata)
 
-        if not self.is_driver_worker:
-            return []
+        if self.is_driver_worker:
+            if model_input.async_callback is not None:
+                model_input.async_callback()
 
-        if model_input.async_callback is not None:
-            model_input.async_callback()
-
-        # Sample the next token.
-        assert isinstance(self.sampler, Sampler)
-        orig_include_gpu_probs_tensor = self.sampler.include_gpu_probs_tensor
-        if model_input.inputs_embeds is not None:
-            self.sampler.include_gpu_probs_tensor = True
+            # Sample the next token.
+            assert isinstance(self.sampler, Sampler)
+            orig_include_gpu_probs = self.sampler.include_gpu_probs_tensor
+            if model_input.inputs_embeds is not None:
+                self.sampler.include_gpu_probs_tensor = True
 
-        output: SamplerOutput = self.sampler(
-            logits=logits,
-            sampling_metadata=model_input.sampling_metadata,
-        )
-        if (self.observability_config is not None
-                and self.observability_config.collect_model_forward_time
-                and output is not None):
-            model_forward_end.synchronize()
-            model_forward_time = model_forward_start.elapsed_time(
-                model_forward_end)
-            orig_model_forward_time = 0.0
-            if intermediate_tensors is not None:
-                orig_model_forward_time = intermediate_tensors.tensors.get(
-                    "model_forward_time", torch.tensor(0.0)).item()
-            # If there are multiple workers, we are still tracking the latency
-            # from the start time of the driver worker to the end time of the
-            # driver worker. The model forward time will then end up covering
-            # the communication time as well.
-            output.model_forward_time = (orig_model_forward_time +
-                                         model_forward_time)
+            output: SamplerOutput = self.sampler(
+                logits=logits,
+                sampling_metadata=model_input.sampling_metadata,
+            )
+            if (self.observability_config is not None
+                    and self.observability_config.collect_model_forward_time
+                    and output is not None):
+                model_forward_end.synchronize()
+                model_forward_time = model_forward_start.elapsed_time(
+                    model_forward_end)
+                orig_model_forward_time = 0.0
+                if intermediate_tensors is not None:
+                    orig_model_forward_time = intermediate_tensors.tensors.get(
+                        "model_forward_time", torch.tensor(0.0)).item()
+                # If there are multiple workers, we are still tracking the
+                # latency from the start time of the driver worker to the end
+                # time of the driver worker. The model forward time will then
+                # end up covering the communication time as well.
+                output.model_forward_time = (orig_model_forward_time +
+                                             model_forward_time)
 
         if model_input.inputs_embeds is not None:
-            self.sampler.include_gpu_probs_tensor = \
-                orig_include_gpu_probs_tensor
-            if output.sampled_token_ids is not None:
-                output.sampled_token_embeds = self.model.get_input_embeddings(
-                    output.sampled_token_ids.squeeze(1))
-
-                for token_embed, sequence_group_output in zip(
-                        output.sampled_token_embeds, output.outputs):
-                    assert len(sequence_group_output.samples) == 1
-                    sequence_group_output.samples[0].output_embed = token_embed
+            if self.is_driver_worker:
+                sampled = broadcast_tensor_dict(
+                    {"token_ids": output.sampled_token_ids})
+            else:
+                sampled = broadcast_tensor_dict()
+            if sampled["token_ids"] is not None:
+                sampled_token_embeds = self.model.get_input_embeddings(
+                    sampled["token_ids"].squeeze(1))
+                if self.is_driver_worker:
+                    self.sampler.include_gpu_probs_tensor = \
+                        orig_include_gpu_probs
+
+                    output.sampled_token_embeds = sampled_token_embeds
+
+                    for token_embed, sequence_group_output in zip(
+                            output.sampled_token_embeds, output.outputs):
+                        assert len(sequence_group_output.samples) == 1
+                        sequence_group_output.samples[
+                            0].output_embed = token_embed
+
+        if not self.is_driver_worker:
+            return []
 
         if self.return_hidden_states:
             # we only need to pass hidden states of most recent token
diff --git a/vllm/worker/multi_step_model_runner.py b/vllm/worker/multi_step_model_runner.py
index 0825abbed143..f8d5acf586c5 100644
--- a/vllm/worker/multi_step_model_runner.py
+++ b/vllm/worker/multi_step_model_runner.py
@@ -733,12 +733,13 @@ def _pythonize_sampler_output(
     logprobs_tensor: Optional[torch.Tensor],
     cache: Optional[PythonizationCache],
 ) -> None:
-    """ This function is only called when the output tensors are ready. 
-    See {class}`ModelOutput`. 
-    
-    Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place, 
+    """ This function is only called when the output tensors are ready.
+    See [`ModelOutput`][vllm.worker.multi_step_model_runner.ModelOutput].
+
+    Modifies `output.outputs` and `pinned_sampled_token_buffer` in-place,
     adding a Pythonized output data structure
-    ({class}`CompletionSequenceGroupOutput`) for each {class}`SequenceGroup`.
+    ([`CompletionSequenceGroupOutput`][vllm.sequence.CompletionSequenceGroupOutput])
+    for each [`SequenceGroup`][vllm.sequence.SequenceGroup].
 
     Args:
       model_input
@@ -824,7 +825,7 @@ def _pythonize_sampler_output(
 
     for sgdx, (seq_group,
                sample_result) in enumerate(zip(seq_groups, samples_list)):
-        # Reminder: Please update docs/source/features/compatibility_matrix.md
+        # Reminder: Please update docs/features/compatibility_matrix.md
         # If the feature combo become valid
         # (Check for Guided Decoding)
         if seq_group.sampling_params.logits_processors:
diff --git a/vllm/worker/multi_step_neuron_model_runner.py b/vllm/worker/multi_step_neuron_model_runner.py
index 9618a4b49ff8..aafb7ab7cfb8 100644
--- a/vllm/worker/multi_step_neuron_model_runner.py
+++ b/vllm/worker/multi_step_neuron_model_runner.py
@@ -70,8 +70,11 @@ def execute_model(
             input_ids=model_input.input_tokens,
             positions=model_input.input_positions,
             input_block_ids=model_input.input_block_ids,
-            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
-                                         device=self.device),
+            **MultiModalKwargs.as_kwargs(
+                model_input.multi_modal_kwargs or {},
+                dtype=self.model_config.dtype,
+                device=self.device,
+            ),
         )
 
         output = self.model.sample(
diff --git a/vllm/worker/multi_step_neuronx_distributed_model_runner.py b/vllm/worker/multi_step_neuronx_distributed_model_runner.py
index b6a3492a493b..3a9c0993e004 100644
--- a/vllm/worker/multi_step_neuronx_distributed_model_runner.py
+++ b/vllm/worker/multi_step_neuronx_distributed_model_runner.py
@@ -49,8 +49,11 @@ def execute_model(
             positions=model_input.input_positions,
             input_block_ids=model_input.input_block_ids,
             sampling_params=sampling_params,
-            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
-                                         device=self.device),
+            **MultiModalKwargs.as_kwargs(
+                model_input.multi_modal_kwargs or {},
+                dtype=self.model_config.dtype,
+                device=self.device,
+            ),
         )
 
         output = self.model.sample(
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index e97adf757cc1..968596471a26 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -378,9 +378,11 @@ def execute_model(
                 positions=model_input.input_positions,
                 input_block_ids=model_input.input_block_ids,
                 sampling_params=sampling_params,
-                **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs
-                                             or {},
-                                             device=self.device),
+                **MultiModalKwargs.as_kwargs(
+                    model_input.multi_modal_kwargs or {},
+                    dtype=self.model_config.dtype,
+                    device=self.device,
+                ),
             )
         elif current_platform.use_transformers_neuronx():
             # [TODO] validate on-device sampling
@@ -389,9 +391,11 @@ def execute_model(
                 input_ids=model_input.input_tokens,
                 positions=model_input.input_positions,
                 input_block_ids=model_input.input_block_ids,
-                **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs
-                                             or {},
-                                             device=self.device),
+                **MultiModalKwargs.as_kwargs(
+                    model_input.multi_modal_kwargs or {},
+                    dtype=self.model_config.dtype,
+                    device=self.device,
+                ),
             )
 
         # Compute the logits only if the on-device sampling is turned off as
diff --git a/vllm/worker/pooling_model_runner.py b/vllm/worker/pooling_model_runner.py
index fdb7353f2f9c..912e04c435f5 100644
--- a/vllm/worker/pooling_model_runner.py
+++ b/vllm/worker/pooling_model_runner.py
@@ -119,10 +119,14 @@ def execute_model(
                 input_ids=model_input.input_tokens,
                 positions=model_input.input_positions,
                 intermediate_tensors=intermediate_tensors,
-                **MultiModalKwargs.as_kwargs(multi_modal_kwargs,
-                                             device=self.device),
+                **MultiModalKwargs.as_kwargs(
+                    multi_modal_kwargs,
+                    dtype=self.model_config.dtype,
+                    device=self.device,
+                ),
                 **cross_enc_kwargs,
-                **seqlen_agnostic_kwargs)
+                **seqlen_agnostic_kwargs,
+            )
 
         if (self.observability_config is not None
                 and self.observability_config.collect_model_forward_time):
diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index 891ed66599dc..4bb9bea022f9 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -76,8 +76,7 @@ def init_device(self) -> None:
         )
         ensure_model_parallel_initialized(
             self.parallel_config.tensor_parallel_size,
-            self.parallel_config.pipeline_parallel_size,
-            self.parallel_config.enable_expert_parallel)
+            self.parallel_config.pipeline_parallel_size)
 
         # Device initialization should happen after initializing the distributed
         # runtime.
diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py
index d925f088357b..e2854bcb37ce 100644
--- a/vllm/worker/utils.py
+++ b/vllm/worker/utils.py
@@ -14,7 +14,7 @@ def assert_enc_dec_mr_supported_scenario(
     a supported scenario.
     '''
 
-    # Reminder: Please update docs/source/features/compatibility_matrix.md
+    # Reminder: Please update docs/features/compatibility_matrix.md
     # If the feature combo become valid
 
     if enc_dec_mr.cache_config.enable_prefix_caching:
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index bb148c398a77..f258678fef70 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -262,10 +262,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         Then, it calculate the maximum possible number of GPU and CPU blocks
         that can be allocated with the remaining free memory.
 
-        :::{tip}
-        You may limit the usage of GPU memory
-        by adjusting the `gpu_memory_utilization` parameter.
-        :::
+        Tip:
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
         """
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
@@ -558,8 +557,7 @@ def init_worker_distributed_environment(
     init_distributed_environment(parallel_config.world_size, rank,
                                  distributed_init_method, local_rank)
     ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
-                                      parallel_config.pipeline_parallel_size,
-                                      parallel_config.enable_expert_parallel)
+                                      parallel_config.pipeline_parallel_size)
 
     ensure_kv_transfer_initialized(vllm_config)
 
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 7042b575aa78..79fa7d2c73e8 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -562,9 +562,12 @@ def execute_model(
                 input_ids=model_input.input_tokens,
                 positions=model_input.input_positions,
                 intermediate_tensors=intermediate_tensors,
-                **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs
-                                             or {},
-                                             device=self.device))
+                **MultiModalKwargs.as_kwargs(
+                    model_input.multi_modal_kwargs or {},
+                    dtype=self.model_config.dtype,
+                    device=self.device,
+                ),
+            )
         # Compute the logits in the last pipeline stage.
         if not get_pp_group().is_last_rank:
             return hidden_or_intermediate_states
diff --git a/vllm/worker/xpu_worker.py b/vllm/worker/xpu_worker.py
index 65085f80f97a..a5109a982cbf 100644
--- a/vllm/worker/xpu_worker.py
+++ b/vllm/worker/xpu_worker.py
@@ -93,10 +93,9 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         Then, it calculate the maximum possible number of GPU and CPU blocks
         that can be allocated with the remaining free memory.
 
-        :::{tip}
-        You may limit the usage of GPU memory
-        by adjusting the `gpu_memory_utilization` parameter.
-        :::
+        Tip:
+            You may limit the usage of GPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
         """
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
@@ -176,8 +175,7 @@ def init_worker_distributed_environment(self) -> None:
 
         ensure_model_parallel_initialized(
             parallel_config.tensor_parallel_size,
-            parallel_config.pipeline_parallel_size,
-            parallel_config.enable_expert_parallel)
+            parallel_config.pipeline_parallel_size)
         # global all_reduce needed for overall oneccl warm up
         torch.distributed.all_reduce(torch.zeros(1).xpu())