
Commit c5f7be2

DarkLight1337 and albertoperdomo2 authored and committed
[Doc] ruff format remaining Python examples (vllm-project#26795)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Alberto Perdomo <aperdomo@redhat.com>
1 parent 31f027c commit c5f7be2

21 files changed: +166 −105 lines changed

docs/features/quantization/auto_awq.md

Lines changed: 6 additions & 4 deletions
@@ -22,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
 from awq import AutoAWQForCausalLM
 from transformers import AutoTokenizer

-model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
-quant_path = 'mistral-instruct-v0.2-awq'
-quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
+model_path = "mistralai/Mistral-7B-Instruct-v0.2"
+quant_path = "mistral-instruct-v0.2-awq"
+quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

 # Load model
 model = AutoAWQForCausalLM.from_pretrained(
-    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    model_path,
+    low_cpu_mem_usage=True,
+    use_cache=False,
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
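For readability, here is how the reformatted AutoAWQ setup reads after this change. Only the lines visible in the hunk above are shown; the quantize and save steps that follow in the original document are omitted, so this is a sketch rather than the full example.

```python
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "mistralai/Mistral-7B-Instruct-v0.2"
quant_path = "mistral-instruct-v0.2-awq"
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Load model; plain keyword arguments replace the former **{...} dict unpacking
model = AutoAWQForCausalLM.from_pretrained(
    model_path,
    low_cpu_mem_usage=True,
    use_cache=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
```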

docs/features/quantization/bitblas.md

Lines changed: 2 additions & 2 deletions
@@ -34,7 +34,7 @@ llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
     trust_remote_code=True,
-    quantization="bitblas"
+    quantization="bitblas",
 )
 ```
@@ -53,6 +53,6 @@ llm = LLM(
     dtype=torch.float16,
     trust_remote_code=True,
     quantization="bitblas",
-    max_model_len=1024
+    max_model_len=1024,
 )
 ```
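Assembled from the first hunk, the BitBLAS example now reads roughly as follows. The `import torch`, `from vllm import LLM`, and `model_id` lines are not part of this hunk and are assumed from the surrounding document.

```python
import torch  # assumed from the surrounding example
from vllm import LLM  # assumed from the surrounding example

# model_id is defined earlier in the original example (not shown in this hunk)
llm = LLM(
    model=model_id,
    dtype=torch.bfloat16,
    trust_remote_code=True,
    quantization="bitblas",
)
```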

docs/features/quantization/bnb.md

Lines changed: 2 additions & 2 deletions
@@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
 llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
-    trust_remote_code=True
+    trust_remote_code=True,
 )
 ```
@@ -43,7 +43,7 @@ llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
     trust_remote_code=True,
-    quantization="bitsandbytes"
+    quantization="bitsandbytes",
 )
 ```
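Read together with its header context, the first bitsandbytes hunk yields the snippet below; the two imports are assumptions, since the diff does not show them.

```python
import torch  # assumed import
from vllm import LLM  # assumed import

model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM(
    model=model_id,
    dtype=torch.bfloat16,
    trust_remote_code=True,
)
```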

docs/features/quantization/fp8.md

Lines changed: 7 additions & 2 deletions
@@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM

 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
@@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio

 # Configure the simple PTQ quantization
 recipe = QuantizationModifier(
-    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+    targets="Linear",
+    scheme="FP8_DYNAMIC",
+    ignore=["lm_head"],
+)

 # Apply the quantization algorithm.
 oneshot(model=model, recipe=recipe)
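Combining both hunks, the reformatted FP8 quantization example now reads as follows. The two llmcompressor imports are assumptions (they live in parts of the document this diff does not touch); everything else is taken from the hunks above.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed imports, not shown in this diff
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Configure the simple PTQ quantization
recipe = QuantizationModifier(
    targets="Linear",
    scheme="FP8_DYNAMIC",
    ignore=["lm_head"],
)

# Apply the quantization algorithm.
oneshot(model=model, recipe=recipe)
```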

docs/features/quantization/gguf.md

Lines changed: 7 additions & 5 deletions
@@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
 conversation = [
     {
         "role": "system",
-        "content": "You are a helpful assistant"
+        "content": "You are a helpful assistant",
     },
     {
         "role": "user",
-        "content": "Hello"
+        "content": "Hello",
     },
     {
         "role": "assistant",
-        "content": "Hello! How can I assist you today?"
+        "content": "Hello! How can I assist you today?",
     },
     {
         "role": "user",
@@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

 # Create an LLM.
-llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-          tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+llm = LLM(
+    model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.chat(conversation, sampling_params)
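After the change, the generation half of the GGUF example reads as below. The `conversation` list is the chat history built in the first hunk (its final user message is not visible in this diff), and the vLLM import is assumed from earlier in the document.

```python
from vllm import LLM, SamplingParams  # assumed from earlier in the example

# `conversation` is the chat history defined in the first hunk above
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(
    model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.chat(conversation, sampling_params)
```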

docs/features/quantization/gptqmodel.md

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
 calibration_dataset = load_dataset(
     "allenai/c4",
     data_files="en/c4-train.00001-of-01024.json.gz",
-    split="train"
+    split="train",
 ).select(range(1024))["text"]

 quant_config = QuantizeConfig(bits=4, group_size=128)
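For context, the reformatted calibration-data setup now reads as follows. The `load_dataset` and `QuantizeConfig` imports are assumptions based on the libraries the example uses (datasets and GPTQModel); the diff does not show them.

```python
from datasets import load_dataset  # assumed import
from gptqmodel import QuantizeConfig  # assumed import

calibration_dataset = load_dataset(
    "allenai/c4",
    data_files="en/c4-train.00001-of-01024.json.gz",
    split="train",
).select(range(1024))["text"]

quant_config = QuantizeConfig(bits=4, group_size=128)
```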

docs/features/quantization/int4.md

Lines changed: 4 additions & 2 deletions
@@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM

 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
@@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
     },
     ignore=["lm_head"],
     update_size=NUM_CALIBRATION_SAMPLES,
-    dampening_frac=0.01
+    dampening_frac=0.01,
 )
 ```

docs/features/quantization/int8.md

Lines changed: 3 additions & 1 deletion
@@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM

 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```

docs/features/quantization/modelopt.md

Lines changed: 2 additions & 2 deletions
@@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
 from vllm import LLM, SamplingParams

 def main():
-
     model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
-    # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
+
+    # Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
     llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)

     sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
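With the blank line and comment repositioned, the top of the ModelOpt deployment example now reads as follows; the remainder of `main()` is unchanged by this hunk and elided here.

```python
from vllm import LLM, SamplingParams


def main():
    model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"

    # Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
    llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)

    sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
    # ... rest of main() unchanged by this hunk
```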

docs/features/quantization/quantized_kvcache.md

Lines changed: 5 additions & 3 deletions
@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
 from vllm import LLM, SamplingParams

 sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
-llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
-          kv_cache_dtype="fp8",
-          calculate_kv_scales=True)
+llm = LLM(
+    model="meta-llama/Llama-2-7b-chat-hf",
+    kv_cache_dtype="fp8",
+    calculate_kv_scales=True,
+)
 prompt = "London is the capital of"
 out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 print(out)
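This hunk covers the whole example, so the reformatted FP8 KV-cache snippet can be shown in full; every line below appears in the diff above.

```python
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(
    model="meta-llama/Llama-2-7b-chat-hf",
    kv_cache_dtype="fp8",
    calculate_kv_scales=True,
)
prompt = "London is the capital of"
out = llm.generate(prompt, sampling_params)[0].outputs[0].text
print(out)
```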
