
Commit 88616b5

reidliu41 authored and amitm02 committed

[doc] improve readability (vllm-project#18675)

Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
Signed-off-by: amit <amit.man@gmail.com>
1 parent 4e2c5c0 commit 88616b5

20 files changed: +206 -59 lines changed

docs/contributing/dockerfile/dockerfile.md

Lines changed: 6 additions & 1 deletion
@@ -26,7 +26,12 @@ The edges of the build graph represent:
 > Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the dockerfile is present):
 >
 > ```bash
-> dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename docker/Dockerfile
+> dockerfilegraph \
+>   -o png \
+>   --legend \
+>   --dpi 200 \
+>   --max-label-length 50 \
+>   --filename docker/Dockerfile
 > ```
 >
 > or in case you want to run it directly with the docker image:

docs/contributing/model/registration.md

Lines changed: 4 additions & 1 deletion
@@ -41,7 +41,10 @@ If your model imports modules that initialize CUDA, consider lazy-importing it t
 ```python
 from vllm import ModelRegistry
 
-ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM")
+ModelRegistry.register_model(
+    "YourModelForCausalLM",
+    "your_code:YourModelForCausalLM"
+)
 ```
 
 !!! warning
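
For context, a minimal sketch of how the lazily registered architecture above might be exercised end to end; the checkpoint path and the generate call are assumptions for illustration, not part of this diff:

```python
from vllm import LLM, ModelRegistry

# Register by string path so `your_code` (which may initialize CUDA on import)
# is only imported when the model is actually instantiated.
ModelRegistry.register_model(
    "YourModelForCausalLM",
    "your_code:YourModelForCausalLM"
)

# Hypothetical usage: the checkpoint's config must list "YourModelForCausalLM"
# in its `architectures` field for the registration to take effect.
llm = LLM(model="/path/to/your-model-checkpoint")
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```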

docs/deployment/docker.md

Lines changed: 9 additions & 6 deletions
@@ -11,7 +11,7 @@ vLLM offers an official Docker image for deployment.
 The image can be used to run OpenAI compatible server and is available on Docker Hub as [vllm/vllm-openai](https://hub.docker.com/r/vllm/vllm-openai/tags).
 
 ```console
-$ docker run --runtime nvidia --gpus all \
+docker run --runtime nvidia --gpus all \
     -v ~/.cache/huggingface:/root/.cache/huggingface \
     --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
     -p 8000:8000 \
@@ -23,7 +23,7 @@ $ docker run --runtime nvidia --gpus all \
 This image can also be used with other container engines such as [Podman](https://podman.io/).
 
 ```console
-$ podman run --gpus all \
+podman run --gpus all \
     -v ~/.cache/huggingface:/root/.cache/huggingface \
     --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
     -p 8000:8000 \
@@ -73,7 +73,10 @@ You can build and run vLLM from source via the provided <gh-file:docker/Dockerfi
 
 ```console
 # optionally specifies: --build-arg max_jobs=8 --build-arg nvcc_threads=2
-DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --file docker/Dockerfile
+DOCKER_BUILDKIT=1 docker build . \
+    --target vllm-openai \
+    --tag vllm/vllm-openai \
+    --file docker/Dockerfile
 ```
 
 !!! note
@@ -96,8 +99,8 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
 
 ```console
 # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)
-$ python3 use_existing_torch.py
-$ DOCKER_BUILDKIT=1 docker build . \
+python3 use_existing_torch.py
+DOCKER_BUILDKIT=1 docker build . \
     --file docker/Dockerfile \
     --target vllm-openai \
     --platform "linux/arm64" \
@@ -113,7 +116,7 @@ $ DOCKER_BUILDKIT=1 docker build . \
 To run vLLM with the custom-built Docker image:
 
 ```console
-$ docker run --runtime nvidia --gpus all \
+docker run --runtime nvidia --gpus all \
     -v ~/.cache/huggingface:/root/.cache/huggingface \
     -p 8000:8000 \
     --env "HUGGING_FACE_HUB_TOKEN=<secret>" \

docs/deployment/frameworks/skypilot.md

Lines changed: 11 additions & 3 deletions
@@ -82,7 +82,11 @@ Check the output of the command. There will be a shareable gradio link (like the
 **Optional**: Serve the 70B model instead of the default 8B and use more GPU:
 
 ```console
-HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
+HF_TOKEN="your-huggingface-token" \
+    sky launch serving.yaml \
+    --gpus A100:8 \
+    --env HF_TOKEN \
+    --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
 ```
 
 ## Scale up to multiple replicas
@@ -155,7 +159,9 @@ run: |
 Start the serving the Llama-3 8B model on multiple replicas:
 
 ```console
-HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN
+HF_TOKEN="your-huggingface-token" \
+    sky serve up -n vllm serving.yaml \
+    --env HF_TOKEN
 ```
 
 Wait until the service is ready:
@@ -318,7 +324,9 @@ run: |
 1. Start the chat web UI:
 
 ```console
-sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
+sky launch \
+    -c gui ./gui.yaml \
+    --env ENDPOINT=$(sky serve status --endpoint vllm)
 ```
 
 2. Then, we can access the GUI at the returned gradio link:

docs/deployment/frameworks/streamlit.md

Lines changed: 2 additions & 1 deletion
@@ -33,7 +33,8 @@ pip install streamlit openai
 streamlit run streamlit_openai_chatbot_webserver.py
 
 # or specify the VLLM_API_BASE or VLLM_API_KEY
-VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" streamlit run streamlit_openai_chatbot_webserver.py
+VLLM_API_BASE="http://vllm-server-host:vllm-server-port/v1" \
+    streamlit run streamlit_openai_chatbot_webserver.py
 
 # start with debug mode to view more details
 streamlit run streamlit_openai_chatbot_webserver.py --logger.level=debug

docs/deployment/nginx.md

Lines changed: 31 additions & 4 deletions
@@ -77,7 +77,11 @@ If you are behind proxy, you can pass the proxy settings to the docker build com
 
 ```console
 cd $vllm_root
-docker build -f docker/Dockerfile . --tag vllm --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy
+docker build \
+    -f docker/Dockerfile . \
+    --tag vllm \
+    --build-arg http_proxy=$http_proxy \
+    --build-arg https_proxy=$https_proxy
 ```
 
 [](){ #nginxloadbalancer-nginx-docker-network }
@@ -102,8 +106,26 @@ Notes:
 ```console
 mkdir -p ~/.cache/huggingface/hub/
 hf_cache_dir=~/.cache/huggingface/
-docker run -itd --ipc host --network vllm_nginx --gpus device=0 --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8081:8000 --name vllm0 vllm --model meta-llama/Llama-2-7b-chat-hf
-docker run -itd --ipc host --network vllm_nginx --gpus device=1 --shm-size=10.24gb -v $hf_cache_dir:/root/.cache/huggingface/ -p 8082:8000 --name vllm1 vllm --model meta-llama/Llama-2-7b-chat-hf
+docker run \
+    -itd \
+    --ipc host \
+    --network vllm_nginx \
+    --gpus device=0 \
+    --shm-size=10.24gb \
+    -v $hf_cache_dir:/root/.cache/huggingface/ \
+    -p 8081:8000 \
+    --name vllm0 vllm \
+    --model meta-llama/Llama-2-7b-chat-hf
+docker run \
+    -itd \
+    --ipc host \
+    --network vllm_nginx \
+    --gpus device=1 \
+    --shm-size=10.24gb \
+    -v $hf_cache_dir:/root/.cache/huggingface/ \
+    -p 8082:8000 \
+    --name vllm1 vllm \
+    --model meta-llama/Llama-2-7b-chat-hf
 ```
 
 !!! note
@@ -114,7 +136,12 @@ docker run -itd --ipc host --network vllm_nginx --gpus device=1 --shm-size=10.24
 ## Launch Nginx
 
 ```console
-docker run -itd -p 8000:80 --network vllm_nginx -v ./nginx_conf/:/etc/nginx/conf.d/ --name nginx-lb nginx-lb:latest
+docker run \
+    -itd \
+    -p 8000:80 \
+    --network vllm_nginx \
+    -v ./nginx_conf/:/etc/nginx/conf.d/ \
+    --name nginx-lb nginx-lb:latest
 ```
 
 [](){ #nginxloadbalancer-nginx-verify-nginx }
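
For context, a minimal sketch of exercising the load balancer launched above; it assumes the `requests` package and that nginx is listening on localhost:8000 in front of the two vLLM containers:

```python
import requests

# Model name matches what both vllm0 and vllm1 serve in the commands above.
payload = {
    "model": "meta-llama/Llama-2-7b-chat-hf",
    "prompt": "San Francisco is a",
    "max_tokens": 16,
}

# Repeated requests to the nginx port should be spread across the two backends.
for _ in range(4):
    resp = requests.post("http://localhost:8000/v1/completions", json=payload, timeout=60)
    resp.raise_for_status()
    print(resp.json()["choices"][0]["text"])
```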

docs/features/quantization/auto_awq.md

Lines changed: 3 additions & 1 deletion
@@ -42,7 +42,9 @@ print(f'Model is quantized and saved at "{quant_path}"')
 To run an AWQ model with vLLM, you can use [TheBloke/Llama-2-7b-Chat-AWQ](https://huggingface.co/TheBloke/Llama-2-7b-Chat-AWQ) with the following command:
 
 ```console
-python examples/offline_inference/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq
+python examples/offline_inference/llm_engine_example.py \
+    --model TheBloke/Llama-2-7b-Chat-AWQ \
+    --quantization awq
 ```
 
 AWQ models are also supported directly through the LLM entrypoint:
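
A minimal sketch of that LLM-entrypoint usage with the same AWQ checkpoint; the prompt and sampling settings are illustrative assumptions:

```python
from vllm import LLM, SamplingParams

# Load the pre-quantized AWQ checkpoint referenced in the command above.
llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="awq")

sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)
outputs = llm.generate(["The capital of France is"], sampling_params)
print(outputs[0].outputs[0].text)
```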

docs/features/quantization/bitblas.md

Lines changed: 13 additions & 2 deletions
@@ -33,7 +33,12 @@ import torch
 
 # "hxbgsyxh/llama-13b-4bit-g-1-bitblas" is a pre-quantized checkpoint.
 model_id = "hxbgsyxh/llama-13b-4bit-g-1-bitblas"
-llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, quantization="bitblas")
+llm = LLM(
+    model=model_id,
+    dtype=torch.bfloat16,
+    trust_remote_code=True,
+    quantization="bitblas"
+)
 ```
 
 ## Read gptq format checkpoint
@@ -44,5 +49,11 @@ import torch
 
 # "hxbgsyxh/llama-13b-4bit-g-1" is a pre-quantized checkpoint.
 model_id = "hxbgsyxh/llama-13b-4bit-g-1"
-llm = LLM(model=model_id, dtype=torch.float16, trust_remote_code=True, quantization="bitblas", max_model_len=1024)
+llm = LLM(
+    model=model_id,
+    dtype=torch.float16,
+    trust_remote_code=True,
+    quantization="bitblas",
+    max_model_len=1024
+)
 ```
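
For context, a minimal sketch of generating text with the BitBLAS-quantized `LLM` constructed above; the prompt and sampling settings are illustrative assumptions:

```python
import torch
from vllm import LLM, SamplingParams

# Same construction as the reformatted call above (pre-quantized BitBLAS checkpoint).
llm = LLM(
    model="hxbgsyxh/llama-13b-4bit-g-1-bitblas",
    dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization="bitblas"
)

sampling_params = SamplingParams(temperature=0.7, max_tokens=32)
outputs = llm.generate(["Large language models are"], sampling_params)
print(outputs[0].outputs[0].text)
```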

docs/features/quantization/bnb.md

Lines changed: 11 additions & 3 deletions
@@ -27,7 +27,11 @@ from vllm import LLM
 import torch
 # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
 model_id = "unsloth/tinyllama-bnb-4bit"
-llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True)
+llm = LLM(
+    model=model_id,
+    dtype=torch.bfloat16,
+    trust_remote_code=True
+)
 ```
 
 ## Inflight quantization: load as 4bit quantization
@@ -38,8 +42,12 @@ For inflight 4bit quantization with BitsAndBytes, you need to explicitly specify
 from vllm import LLM
 import torch
 model_id = "huggyllama/llama-7b"
-llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
-quantization="bitsandbytes")
+llm = LLM(
+    model=model_id,
+    dtype=torch.bfloat16,
+    trust_remote_code=True,
+    quantization="bitsandbytes"
+)
 ```

docs/features/quantization/gguf.md

Lines changed: 8 additions & 3 deletions
@@ -14,14 +14,17 @@ To run a GGUF model with vLLM, you can download and use the local GGUF model fro
 ```console
 wget https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
 # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
+vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
+    --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0
 ```
 
 You can also add `--tensor-parallel-size 2` to enable tensor parallelism inference with 2 GPUs:
 
 ```console
 # We recommend using the tokenizer from base model to avoid long-time and buggy tokenizer conversion.
-vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tensor-parallel-size 2
+vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
+    --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+    --tensor-parallel-size 2
 ```
 
 !!! warning
@@ -31,7 +34,9 @@ GGUF assumes that huggingface can convert the metadata to a config file. In case
 
 ```console
 # If you model is not supported by huggingface you can manually provide a huggingface compatible config path
-vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 --hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0
+vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \
+    --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
+    --hf-config-path Tinyllama/TInyLlama-1.1B-Chat-v1.0
 ```
 
 You can also use the GGUF model directly through the LLM entrypoint:
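
A minimal sketch of that LLM-entrypoint usage with the GGUF file downloaded above; reusing the base model's tokenizer mirrors the serve commands, and the prompt is an illustrative assumption:

```python
from vllm import LLM, SamplingParams

# Point vLLM at the local GGUF file and reuse the base model's tokenizer,
# as recommended in the comments above.
llm = LLM(
    model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
)

sampling_params = SamplingParams(temperature=0.8, max_tokens=32)
outputs = llm.generate(["Explain GGUF in one sentence:"], sampling_params)
print(outputs[0].outputs[0].text)
```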
