10 changes: 6 additions & 4 deletions docs/features/quantization/auto_awq.md
@@ -18,13 +18,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
quant_path = 'mistral-instruct-v0.2-awq'
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
model_path = "mistralai/Mistral-7B-Instruct-v0.2"
quant_path = "mistral-instruct-v0.2-awq"
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

# Load model
model = AutoAWQForCausalLM.from_pretrained(
model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
model_path,
low_cpu_mem_usage=True,
use_cache=False,
)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

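For context beyond this hunk, here is a minimal sketch of how the example usually continues: the `model.quantize`/`save_quantized` calls follow the standard AutoAWQ API, and loading the result with `quantization="awq"` in vLLM is an assumption about downstream use rather than part of this diff.

```python
# Minimal sketch, continuing the snippet above: `model`, `tokenizer`,
# `quant_path`, and `quant_config` are assumed to be defined as shown there.
model.quantize(tokenizer, quant_config=quant_config)  # run AWQ calibration and quantization
model.save_quantized(quant_path)                      # write the quantized weights
tokenizer.save_pretrained(quant_path)                 # keep the tokenizer next to them

# The resulting checkpoint can then be served with vLLM.
from vllm import LLM

llm = LLM(model=quant_path, quantization="awq")
```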
4 changes: 2 additions & 2 deletions docs/features/quantization/bitblas.md
@@ -34,7 +34,7 @@ llm = LLM(
model=model_id,
dtype=torch.bfloat16,
trust_remote_code=True,
quantization="bitblas"
quantization="bitblas",
)
```

@@ -53,6 +53,6 @@ llm = LLM(
dtype=torch.float16,
trust_remote_code=True,
quantization="bitblas",
max_model_len=1024
max_model_len=1024,
)
```
4 changes: 2 additions & 2 deletions docs/features/quantization/bnb.md
@@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM(
model=model_id,
dtype=torch.bfloat16,
trust_remote_code=True
trust_remote_code=True,
)
```

@@ -43,7 +43,7 @@ llm = LLM(
model=model_id,
dtype=torch.bfloat16,
trust_remote_code=True,
quantization="bitsandbytes"
quantization="bitsandbytes",
)
```

9 changes: 7 additions & 2 deletions docs/features/quantization/fp8.md
@@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto",
MODEL_ID,
device_map="auto",
torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
@@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio

# Configure the simple PTQ quantization
recipe = QuantizationModifier(
targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
targets="Linear",
scheme="FP8_DYNAMIC",
ignore=["lm_head"],
)

# Apply the quantization algorithm.
oneshot(model=model, recipe=recipe)
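As a hedged sketch of the step that normally follows this recipe (not shown in the hunk), the compressed model is saved and then loaded by vLLM; `save_compressed=True` is the llm-compressor saving convention, and the directory name is illustrative.

```python
# Minimal sketch, continuing the snippet above: `model` and `tokenizer` are
# assumed from there; the save directory name is an example only.
SAVE_DIR = "Meta-Llama-3-8B-Instruct-FP8-Dynamic"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)

# vLLM reads the quantization config embedded in the checkpoint.
from vllm import LLM

llm = LLM(model=SAVE_DIR)
```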
12 changes: 7 additions & 5 deletions docs/features/quantization/gguf.md
@@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
"content": "You are a helpful assistant",
},
{
"role": "user",
"content": "Hello"
"content": "Hello",
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
"content": "Hello! How can I assist you today?",
},
{
"role": "user",
@@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
llm = LLM(
model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.chat(conversation, sampling_params)
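Because the comment in the hunk notes that the result is a list of `RequestOutput` objects, here is a short sketch of how those outputs are typically unpacked (field names follow vLLM's `RequestOutput`).

```python
# Sketch: iterate over the RequestOutput objects returned by llm.chat() above
# and print each prompt alongside its generated continuation.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```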
2 changes: 1 addition & 1 deletion docs/features/quantization/gptqmodel.md
@@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
calibration_dataset = load_dataset(
"allenai/c4",
data_files="en/c4-train.00001-of-01024.json.gz",
split="train"
split="train",
).select(range(1024))["text"]

quant_config = QuantizeConfig(bits=4, group_size=128)
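To round out the hunk, a minimal sketch of the quantization step that follows, assuming the GPTQModel `load`/`quantize`/`save` API; the output directory name is illustrative.

```python
# Sketch: `calibration_dataset` and `quant_config` are assumed from the
# snippet above; `quant_path` is an illustrative output directory.
from gptqmodel import GPTQModel

quant_path = "Llama-3.2-1B-Instruct-gptqmodel-4bit"

model = GPTQModel.load("meta-llama/Llama-3.2-1B-Instruct", quant_config)
model.quantize(calibration_dataset, batch_size=2)  # GPTQ calibration pass
model.save(quant_path)                             # write the quantized checkpoint
```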
6 changes: 4 additions & 2 deletions docs/features/quantization/int4.md
@@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto",
MODEL_ID,
device_map="auto",
torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
@@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
},
ignore=["lm_head"],
update_size=NUM_CALIBRATION_SAMPLES,
dampening_frac=0.01
dampening_frac=0.01,
)
```

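For reference, applying a recipe like the one above is typically a single `oneshot` call; this sketch assumes llm-compressor's `oneshot` entrypoint plus a prepared calibration dataset `ds` and the usual sample-count constants from the surrounding document.

```python
# Sketch: `model` and `recipe` are assumed from the surrounding document,
# along with a tokenized calibration dataset `ds` and the usual constants.
# In older llm-compressor releases the import path is llmcompressor.transformers.
from llmcompressor import oneshot

oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
```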
4 changes: 3 additions & 1 deletion docs/features/quantization/int8.md
@@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto",
MODEL_ID,
device_map="auto",
torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
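Unlike the FP8 dynamic flow, INT8 activation quantization needs calibration data; here is a hedged sketch of the usual preparation step, where the dataset id, split, and sample count are illustrative choices.

```python
# Sketch: illustrative calibration-set preparation; the dataset id, split,
# and sample count are example choices. `tokenizer` comes from the snippet above.
from datasets import load_dataset

NUM_CALIBRATION_SAMPLES = 512

ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))

def preprocess(example):
    # Render the chat messages into plain text with the model's chat template.
    return {"text": tokenizer.apply_chat_template(example["messages"], tokenize=False)}

ds = ds.map(preprocess)
```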
4 changes: 2 additions & 2 deletions docs/features/quantization/modelopt.md
@@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
from vllm import LLM, SamplingParams

def main():

model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
# Ensure you specify quantization='modelopt' when loading the modelopt checkpoint

# Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)

sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
8 changes: 5 additions & 3 deletions docs/features/quantization/quantized_kvcache.md
@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
kv_cache_dtype="fp8",
calculate_kv_scales=True)
llm = LLM(
model="meta-llama/Llama-2-7b-chat-hf",
kv_cache_dtype="fp8",
calculate_kv_scales=True,
)
prompt = "London is the capital of"
out = llm.generate(prompt, sampling_params)[0].outputs[0].text
print(out)
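The hunk above enables online scale calculation; for comparison, a sketch of the simpler configuration that enables the FP8 KV cache without calibrated scales, accepting default scaling factors at some possible cost in accuracy.

```python
# Sketch: FP8 KV cache without online scale calculation; scaling factors then
# fall back to defaults, which is simpler but may reduce accuracy.
from vllm import LLM, SamplingParams

sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
llm = LLM(model="meta-llama/Llama-2-7b-chat-hf", kv_cache_dtype="fp8")
out = llm.generate("London is the capital of", sampling_params)[0].outputs[0].text
print(out)
```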
65 changes: 44 additions & 21 deletions docs/features/quantization/quark.md
@@ -48,7 +48,9 @@ to fetch model and tokenizer.
MAX_SEQ_LEN = 512

model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto",
MODEL_ID,
device_map="auto",
torch_dtype="auto",
)
model.eval()

@@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
text_data = dataset["text"][:NUM_CALIBRATION_DATA]

tokenized_outputs = tokenizer(text_data, return_tensors="pt",
padding=True, truncation=True, max_length=MAX_SEQ_LEN)
calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE, drop_last=True)
tokenized_outputs = tokenizer(
text_data,
return_tensors="pt",
padding=True,
truncation=True,
max_length=MAX_SEQ_LEN,
)
calib_dataloader = DataLoader(
tokenized_outputs['input_ids'],
batch_size=BATCH_SIZE,
drop_last=True,
)
```

### 3. Set the Quantization Configuration
@@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
load_quant_algo_config_from_file)

# Define fp8/per-tensor/static spec.
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
is_dynamic=False).to_quantization_spec()
FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
observer_method="min_max",
is_dynamic=False,
).to_quantization_spec()

# Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
weight=FP8_PER_TENSOR_SPEC)
global_quant_config = QuantizationConfig(
input_tensors=FP8_PER_TENSOR_SPEC,
weight=FP8_PER_TENSOR_SPEC,
)

# Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
kv_cache_quant_config = {name :
QuantizationConfig(input_tensors=global_quant_config.input_tensors,
weight=global_quant_config.weight,
output_tensors=KV_CACHE_SPEC)
for name in kv_cache_layer_names_for_llama}
kv_cache_quant_config = {
name: QuantizationConfig(
input_tensors=global_quant_config.input_tensors,
weight=global_quant_config.weight,
output_tensors=KV_CACHE_SPEC,
)
for name in kv_cache_layer_names_for_llama
}
layer_quant_config = kv_cache_quant_config.copy()

# Define algorithm config by config file.
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)

EXCLUDE_LAYERS = ["lm_head"]
Expand All @@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
layer_quant_config=layer_quant_config,
kv_cache_quant_config=kv_cache_quant_config,
exclude=EXCLUDE_LAYERS,
algo_config=algo_config)
algo_config=algo_config,
)
```

### 4. Quantize the Model and Export
@@ -165,8 +182,11 @@ for more exporting format details.
EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
with torch.no_grad():
exporter.export_safetensors_model(freezed_model,
quant_config=quant_config, tokenizer=tokenizer)
exporter.export_safetensors_model(
freezed_model,
quant_config=quant_config,
tokenizer=tokenizer,
)
```

### 5. Evaluation in vLLM
@@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
kv_cache_dtype='fp8',quantization='quark')
llm = LLM(
model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
kv_cache_dtype="fp8",
quantization="quark",
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
8 changes: 5 additions & 3 deletions docs/getting_started/quickstart.md
@@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
api_key=openai_api_key,
base_url=openai_api_base,
)
completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
prompt="San Francisco is a")
completion = client.completions.create(
model="Qwen/Qwen2.5-1.5B-Instruct",
prompt="San Francisco is a",
)
print("Completion result:", completion)
```

@@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package:
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Tell me a joke."},
]
],
)
print("Chat response:", chat_response)
```
4 changes: 2 additions & 2 deletions docs/models/extensions/tensorizer.md
@@ -60,7 +60,7 @@ from vllm import LLM
llm = LLM(
"s3://my-bucket/vllm/facebook/opt-125m/v1",
load_format="tensorizer",
enable_lora=True
enable_lora=True,
)
```

@@ -97,6 +97,6 @@ llm = LLM(
"s3://my-bucket/vllm/facebook/opt-125m/v1",
load_format="tensorizer",
enable_lora=True,
model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}
model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}},
)
```
6 changes: 3 additions & 3 deletions docs/models/generative_models.md
@@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
conversation = [
{
"role": "system",
"content": "You are a helpful assistant"
"content": "You are a helpful assistant",
},
{
"role": "user",
"content": "Hello"
"content": "Hello",
},
{
"role": "assistant",
"content": "Hello! How can I assist you today?"
"content": "Hello! How can I assist you today?",
},
{
"role": "user",