10 changes: 6 additions & 4 deletions docs/features/quantization/auto_awq.md
@@ -18,13 +18,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
quant_path = 'mistral-instruct-v0.2-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
model_path = "mistralai/Mistral-7B-Instruct-v0.2"
quant_path = "mistral-instruct-v0.2-awq"
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Load model
model = AutoAWQForCausalLM.from_pretrained(
model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
model_path,
low_cpu_mem_usage=True,
use_cache=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

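For context beyond this hunk, here is a minimal sketch of how the example usually continues: the `model.quantize`/`save_quantized` calls follow the standard AutoAWQ API, and loading the result with `quantization="awq"` in vLLM is an assumption about downstream use rather than part of this diff.

```python
# Minimal sketch, continuing the snippet above: `model`, `tokenizer`,
# `quant_path`, and `quant_config` are assumed to be defined as shown there.
model.quantize(tokenizer, quant_config=quant_config)  # run AWQ calibration and quantization
model.save_quantized(quant_path)                      # write the quantized weights
tokenizer.save_pretrained(quant_path)                 # keep the tokenizer next to them

# The resulting checkpoint can then be served with vLLM.
from vllm import LLM

llm = LLM(model=quant_path, quantization="awq")
```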
4 changes: 2 additions & 2 deletions docs/features/quantization/bitblas.md
@@ -34,7 +34,7 @@ llm = LLM(
model=model_id,
dtype=torch.bfloat16,
trust_remote_code=True,
quantization="bitblas"
quantization="bitblas",
)
```

@@ -53,6 +53,6 @@ llm = LLM(
dtype=torch.float16,
trust_remote_code=True,
quantization="bitblas",
max_model_len=1024
max_model_len=1024,
)
```
4 changes: 2 additions & 2 deletions docs/features/quantization/bnb.md
@@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM(
model=model_id,
dtype=torch.bfloat16,
trust_remote_code=True
trust_remote_code=True,
)
```

@@ -43,7 +43,7 @@ llm = LLM(
model=model_id,
dtype=torch.bfloat16,
trust_remote_code=True,
quantization="bitsandbytes"
quantization="bitsandbytes",
)
```

9 changes: 7 additions & 2 deletions docs/features/quantization/fp8.md
@@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto",
MODEL_ID,
device_map="auto",
torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
@@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio

# Configure the simple PTQ quantization
recipe = QuantizationModifier(
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
targets="Linear",
scheme="FP8_DYNAMIC",
ignore=["lm_head"],
)

# Apply the quantization algorithm.
oneshot(model=model, recipe=recipe)
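As a hedged sketch of the step that normally follows this recipe (not shown in the hunk), the compressed model is saved and then loaded by vLLM; `save_compressed=True` is the llm-compressor saving convention, and the directory name is illustrative.

```python
# Minimal sketch, continuing the snippet above: `model` and `tokenizer` are
# assumed from there; the save directory name is an example only.
SAVE_DIR = "Meta-Llama-3-8B-Instruct-FP8-Dynamic"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

# vLLM reads the quantization config embedded in the checkpoint.
from vllm import LLM

llm = LLM(model=SAVE_DIR)
```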
12 changes: 7 additions & 5 deletions docs/features/quantization/gguf.md
@@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
"content": "You are a helpful assistant",
},
{
"role": "user",
"content": "Hello"
"content": "Hello",
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
"content": "Hello! How can I assist you today?",
},
{
"role": "user",
@@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
llm = LLM(
model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.chat(conversation, sampling_params)
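Because the comment in the hunk notes that the result is a list of `RequestOutput` objects, here is a short sketch of how those outputs are typically unpacked (field names follow vLLM's `RequestOutput`).

```python
# Sketch: iterate over the RequestOutput objects returned by llm.chat() above
# and print each prompt alongside its generated continuation.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```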
2 changes: 1 addition & 1 deletion docs/features/quantization/gptqmodel.md
@@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
calibration_dataset = load_dataset(
"allenai/c4",
data_files="en/c4-train.00001-of-01024.json.gz",
split="train"
split="train",
).select(range(1024))["text"]

quant_config = QuantizeConfig(bits=4, group_size=128)
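To round out the hunk, a minimal sketch of the quantization step that follows, assuming the GPTQModel `load`/`quantize`/`save` API; the output directory name is illustrative.

```python
# Sketch: `calibration_dataset` and `quant_config` are assumed from the
# snippet above; `quant_path` is an illustrative output directory.
from gptqmodel import GPTQModel

quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"

model = GPTQModel.load("meta-llama/Llama-3.2-1B-Instruct", quant_config)
model.quantize(calibration_dataset, batch_size=2)  # GPTQ calibration pass
model.save(quant_path)                             # write the quantized checkpoint
```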
6 changes: 4 additions & 2 deletions docs/features/quantization/int4.md
@@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto",
MODEL_ID,
device_map="auto",
torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
@@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
},
ignore=["lm_head"],
update_size=NUM_CALIBRATION_SAMPLES,
dampening_frac=0.01
dampening_frac=0.01,
)
```

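For reference, applying a recipe like the one above is typically a single `oneshot` call; this sketch assumes llm-compressor's `oneshot` entrypoint plus a prepared calibration dataset `ds` and the usual sample-count constants from the surrounding document.

```python
# Sketch: `model` and `recipe` are assumed from the surrounding document,
# along with a tokenized calibration dataset `ds` and the usual constants.
# In older llm-compressor releases the import path is llmcompressor.transformers.
from llmcompressor import oneshot

oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
```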
4 changes: 3 additions & 1 deletion docs/features/quantization/int8.md
@@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto",
MODEL_ID,
device_map="auto",
torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
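Unlike the FP8 dynamic flow, INT8 activation quantization needs calibration data; here is a hedged sketch of the usual preparation step, where the dataset id, split, and sample count are illustrative choices.

```python
# Sketch: illustrative calibration-set preparation; the dataset id, split,
# and sample count are example choices. `tokenizer` comes from the snippet above.
from datasets import load_dataset

NUM_CALIBRATION_SAMPLES = 512

ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

def preprocess(example):
    # Render the chat messages into plain text with the model's chat template.
    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}

ds = ds.map(preprocess)
```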
4 changes: 2 additions & 2 deletions docs/features/quantization/modelopt.md
@@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
from vllm import LLM, SamplingParams

def main():

model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint

# Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)

sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
8 changes: 5 additions & 3 deletions docs/features/quantization/quantized_kvcache.md
@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
kv_cache_dtype="fp8",
calculate_kv_scales=True)
llm = LLM(
model="meta-llama/Llama-2-7b-chat-hf",
kv_cache_dtype="fp8",
calculate_kv_scales=True,
)
prompt = "London is the capital of"
out = llm.generate(prompt, sampling_params)[0].outputs[0].text
print(out)
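The hunk above enables online scale calculation; for comparison, a sketch of the simpler configuration that enables the FP8 KV cache without calibrated scales, accepting default scaling factors at some possible cost in accuracy.

```python
# Sketch: FP8 KV cache without online scale calculation; scaling factors then
# fall back to defaults, which is simpler but may reduce accuracy.
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", kv_cache_dtype="fp8")
out = llm.generate("London is the capital of", sampling_params)[0].outputs[0].text
print(out)
```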
65 changes: 44 additions & 21 deletions docs/features/quantization/quark.md
@@ -48,7 +48,9 @@ to fetch model and tokenizer.
MAX_SEQ_LEN = 512

model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto",
MODEL_ID,
device_map="auto",
torch_dtype="auto",
)
model.eval()

@@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
text_data = dataset["text"][:NUM_CALIBRATION_DATA]

tokenized_outputs = tokenizer(text_data, return_tensors="pt",
padding=True, truncation=True, max_length=MAX_SEQ_LEN)
calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE, drop_last=True)
tokenized_outputs = tokenizer(
text_data,
return_tensors="pt",
padding=True,
truncation=True,
max_length=MAX_SEQ_LEN,
)
calib_dataloader = DataLoader(
tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE,
drop_last=True,
)
```

### 3. Set the Quantization Configuration
@@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
load_quant_algo_config_from_file)

# Define fp8/per-tensor/static spec.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
is_dynamic=False).to_quantization_spec()
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
observer_method="min_max",
is_dynamic=False,
).to_quantization_spec()

# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
weight=FP8_PER_TENSOR_SPEC)
global_quant_config = QuantizationConfig(
input_tensors=FP8_PER_TENSOR_SPEC,
weight=FP8_PER_TENSOR_SPEC,
)

# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
kv_cache_quant_config = {name :
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
weight=global_quant_config.weight,
output_tensors=KV_CACHE_SPEC)
for name in kv_cache_layer_names_for_llama}
kv_cache_quant_config = {
name: QuantizationConfig(
input_tensors=global_quant_config.input_tensors,
weight=global_quant_config.weight,
output_tensors=KV_CACHE_SPEC,
)
for name in kv_cache_layer_names_for_llama
}
layer_quant_config = kv_cache_quant_config.copy()

# Define algorithm config by config file.
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)

EXCLUDE_LAYERS = ["lm_head"]
Expand All @@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
layer_quant_config=layer_quant_config,
kv_cache_quant_config=kv_cache_quant_config,
exclude=EXCLUDE_LAYERS,
algo_config=algo_config)
algo_config=algo_config,
)
```

### 4. Quantize the Model and Export
@@ -165,8 +182,11 @@ for more exporting format details.
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad():
exporter.export_safetensors_model(freezed_model,
quant_config=quant_config, tokenizer=tokenizer)
exporter.export_safetensors_model(
freezed_model,
quant_config=quant_config,
tokenizer=tokenizer,
)
```

### 5. Evaluation in vLLM
@@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
kv_cache_dtype='fp8',quantization='quark')
llm = LLM(
model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
kv_cache_dtype="fp8",
quantization="quark",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
8 changes: 5 additions & 3 deletions docs/getting_started/quickstart.md
@@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
api_key=openai_api_key,
base_url=openai_api_base,
)
completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
prompt="San Francisco is a")
completion = client.completions.create(
model="Qwen/Qwen2.5-1.5B-Instruct",
prompt="San Francisco is a",
)
print("Completion result:", completion)
```

@@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package:
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Tell me a joke."},
]
],
)
print("Chat response:", chat_response)
```
4 changes: 2 additions & 2 deletions docs/models/extensions/tensorizer.md
@@ -60,7 +60,7 @@ from vllm import LLM
llm = LLM(
"s3://my-bucket/vllm/facebook/opt-125m/v1",
load_format="tensorizer",
enable_lora=True
enable_lora=True,
)
```

@@ -97,6 +97,6 @@ llm = LLM(
"s3://my-bucket/vllm/facebook/opt-125m/v1",
load_format="tensorizer",
enable_lora=True,
model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}
model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}},
)
```
6 changes: 3 additions & 3 deletions docs/models/generative_models.md
@@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
"content": "You are a helpful assistant",
},
{
"role": "user",
"content": "Hello"
"content": "Hello",
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
"content": "Hello! How can I assist you today?",
},
{
"role": "user",