Merge remote-tracking branch 'origin' into kylesayrs/move-sparsegptq
kylesayrs committed Dec 19, 2024
2 parents 8513782 + 7366a2d commit 1ad71db
Showing 130 changed files with 1,150 additions and 1,287 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -57,6 +57,7 @@ Quantization is applied by selecting an algorithm and calling the `oneshot` API.
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot
from transformers import AutoModelForCausalLM

# Select quantization algorithm. In this case, we:
# * apply SmoothQuant to make the activations easier to quantize
16 changes: 0 additions & 16 deletions examples/automodelforcausallm/README.md
@@ -11,19 +11,3 @@ MODEL_ID = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
```

These models are still supported through the `SparseAutoModelForCausalLM` pathway:

```python
from llmcompressor.transformers import SparseAutoModelForCausalLM

MODEL_ID = "nm-testing/tinyllama-w8a8-compressed-hf-quantizer"
model = SparseAutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
)
```

Models quantized through `llm-compressor` using `compressed-tensors<=v0.6.0` are not
supported through `AutoModelForCausalLM` and will still need the
`SparseAutoModelForCausalLM` pathway to run.
8 changes: 4 additions & 4 deletions examples/big_models_with_accelerate/README.md
@@ -14,13 +14,13 @@
To enable `accelerate` features with `llmcompressor`, simply insert `device_map` in `from_pretrained` during model load.

```python
from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"

# device_map="auto" triggers usage of accelerate
# if > 1 GPU, the model will be sharded across the GPUs
# if not enough GPU memory to fit the model, parameters are offloaded to the CPU
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto")
```

@@ -34,12 +34,12 @@ potentially going out-of-memory.

```python
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map
from llmcompressor.transformers import SparseAutoModelForCausalLM,
from transformers import AutoModelForCausalLM
MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"

# Load model, reserving memory in the device map for sequential GPTQ (adjust num_gpus as needed)
device_map = calculate_offload_device_map(MODEL_ID, reserve_for_hessians=True, num_gpus=1)
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map=device_map,
torch_dtype="auto",
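
Both snippets above rely on `accelerate` to place weights across GPUs and CPU. A minimal sketch of how to verify the resulting placement after load, assuming the same `MODEL_ID` and enough combined GPU/CPU memory for the checkpoint:

```python
from transformers import AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)

# hf_device_map is populated by accelerate whenever a device_map is used;
# it maps each top-level module to the device ("cuda:0", "cpu", "disk", ...) it landed on.
print(model.hf_device_map)
```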
6 changes: 3 additions & 3 deletions examples/big_models_with_accelerate/cpu_offloading_fp8.py
@@ -1,14 +1,14 @@
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"

# Load model
# Note: device_map="auto" will offload to CPU if not enough space on GPU.
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto", trust_remote_code=True
)

10 changes: 5 additions & 5 deletions examples/big_models_with_accelerate/mult_gpus_int8_device_map.py
@@ -1,22 +1,22 @@
import torch
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.compression.helpers import calculate_offload_device_map

MODEL_ID = "mistralai/Mistral-Nemo-Instruct-2407"
MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"

# adjust based off number of desired GPUs
# reserve_for_hessians=True reserves memory which is required by
# GPTQModifier and SparseGPTModifier
device_map = calculate_offload_device_map(
MODEL_ID, num_gpus=2, reserve_for_hessians=True, torch_dtype=torch.bfloat16
MODEL_ID, num_gpus=1, reserve_for_hessians=True, torch_dtype=torch.bfloat16
)

model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map=device_map, torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
6 changes: 3 additions & 3 deletions examples/big_models_with_accelerate/multi_gpu_int8.py
@@ -1,14 +1,14 @@
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"
SAVE_DIR = MODEL_ID.split("/")[1] + "-W8A8-Dynamic"

# 1) Load model (device_map="auto" will shard the model over multiple GPUs!).
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
torch_dtype="auto",
17 changes: 6 additions & 11 deletions examples/compressed_inference/fp8_compressed_inference.py
@@ -1,13 +1,7 @@
from transformers import AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer

"""
This example covers how to load a quantized model in compressed mode. By default,
SparseAutoModelForCausalLM will decompress the whole model on load resulting in no
memory savings from quantization. By setting the `run_compressed` kwarg to True, the
model will remain compressed in memory on load, saving memory during inference at the
cost of increased runtime
This example covers how to load a quantized model using AutoModelForCausalLM.
During inference, each layer will be decompressed as needed before the forward pass.
This saves memory as only a single layer is ever uncompressed at a time, but increases
@@ -25,9 +19,10 @@
"def fibonacci(n):",
]

# set run_compressed=True to enable running in compressed mode
compressed_model = SparseAutoModelForCausalLM.from_pretrained(
MODEL_STUB, torch_dtype="auto", device_map="cuda:0", run_compressed=True
compressed_model = AutoModelForCausalLM.from_pretrained(
MODEL_STUB,
torch_dtype="auto",
device_map="cuda:0",
)

# tokenize the sample data
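
A minimal generation sketch against a compressed checkpoint, assuming the stub below (hypothetical, standing in for the example's `MODEL_STUB`) resolves to a compressed-tensors FP8 model:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# hypothetical stub; substitute the MODEL_STUB used in the example
MODEL_STUB = "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Compressed"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_STUB, torch_dtype="auto", device_map="cuda:0"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_STUB)

# tokenize a prompt and generate; layers are decompressed as needed during the forward pass
inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0]))
```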
9 changes: 5 additions & 4 deletions examples/quantization_2of4_sparse_w4a16/README.md
@@ -46,10 +46,10 @@ and quantize to 4 bits in one shot using GPTQ.

```python
import torch
from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoModelForCausalLM

model_stub = "neuralmagic/Llama-2-7b-ultrachat200k"
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
model_stub, torch_dtype=torch.bfloat16, device_map="auto"
)

@@ -86,6 +86,7 @@ apply(
lr_scheduler_type="cosine",
warmup_ratio=0.1,
)

```


@@ -96,10 +97,10 @@ run the following:

```python
import torch
from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoModelForCausalLM

compressed_output_dir = "output_llama7b_2of4_w4a16_channel_compressed"
model = SparseAutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.bfloat16)
model.save_pretrained(compressed_output_dir, save_compressed=True)
```

10 changes: 8 additions & 2 deletions examples/quantization_2of4_sparse_w4a16/llama7b_sparse_w4a16.py
@@ -1,13 +1,15 @@
import torch
from loguru import logger
from transformers import AutoModelForCausalLM

from llmcompressor.transformers import SparseAutoModelForCausalLM, apply
from llmcompressor.transformers import apply

# define a recipe to handle sparsity, finetuning and quantization
recipe = "2of4_w4a16_recipe.yaml"

# load the model in as bfloat16 to save on memory and compute
model_stub = "neuralmagic/Llama-2-7b-ultrachat200k"
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
model_stub, torch_dtype=torch.bfloat16, device_map="auto"
)

@@ -51,3 +53,7 @@
lr_scheduler_type=lr_scheduler_type,
warmup_ratio=warmup_ratio,
)
logger.info(
    "Note: vLLM requires the dtype=torch.float16 when running the "
    "compressed marlin-24 model"
)
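
A minimal sketch of the vLLM usage the logged note refers to, assuming vLLM is installed; the checkpoint path below is hypothetical and should point at the directory the script above saved to:

```python
from vllm import LLM, SamplingParams

# hypothetical path; use the output_dir produced by llama7b_sparse_w4a16.py
llm = LLM(model="./output_llama7b_2of4_w4a16_channel", dtype="float16")

params = SamplingParams(max_tokens=64)
outputs = llm.generate(["def fibonacci(n):"], params)
print(outputs[0].outputs[0].text)
```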
7 changes: 3 additions & 4 deletions examples/quantization_kv_cache/README.md
@@ -33,14 +33,13 @@ Let's walk through the main steps of the quantization process:

### 1. Load Model

Load the model using `SparseAutoModelForCausalLM`:
Load the model using `AutoModelForCausalLM`:

```python
from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
torch_dtype="auto",
6 changes: 6 additions & 0 deletions examples/quantization_kv_cache/llama3_fp8_kv_example.py
@@ -1,4 +1,5 @@
from datasets import load_dataset
from loguru import logger
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot
@@ -81,6 +82,11 @@ def process_and_tokenize(example):
num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

logger.info(
    "Running sample generation. "
    "Note: Inference with the quantized kv_cache is not supported. "
    "Please use vLLM for inference with the quantized kv_cache."
)
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
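
A minimal sketch of the suggested vLLM inference path for the FP8 kv_cache model, assuming vLLM is installed; the checkpoint path below is hypothetical and should point at the directory the example saved to:

```python
from vllm import LLM, SamplingParams

# hypothetical path; point this at the saved FP8 kv_cache checkpoint
llm = LLM(model="./Meta-Llama-3-8B-Instruct-FP8-KV", kv_cache_dtype="fp8")

outputs = llm.generate(["Hello my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```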
7 changes: 3 additions & 4 deletions examples/quantization_w4a16/README.md
@@ -34,14 +34,13 @@ Now, we will step through the code in the example. There are four steps:

### 1) Load Model

Load the model using `SparseAutoModelForCausalLM`, which is a wrapper around `AutoModel` for handling quantized saving and loading. Note that `SparseAutoModel` is compatible with `accelerate` so you can load your model onto multiple GPUs if needed.
Load the model using `AutoModelForCausalLM`, which handles quantized saving and loading.

```python
from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
6 changes: 3 additions & 3 deletions examples/quantization_w4a16/llama3_example.py
@@ -1,13 +1,13 @@
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers import oneshot

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
torch_dtype="auto",
7 changes: 3 additions & 4 deletions examples/quantization_w8a8_fp8/README.md
@@ -31,15 +31,14 @@ Now, we will step through the code in the example. There are three steps:

### 1) Load Model

Load the model using `SparseAutoModelForCausalLM`, which wraps `AutoModelForCausalLM` for saving and loading quantized models.
Load the model using `AutoModelForCausalLM`:

```python
from llmcompressor.transformers import SparseAutoModelForCausalLM
from transformers import AutoTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```
13 changes: 9 additions & 4 deletions examples/quantization_w8a8_fp8/gemma2_example.py
@@ -1,12 +1,12 @@
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers import oneshot

MODEL_ID = "google/gemma-2-27b-it"

# 1) Load model.
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
@@ -21,7 +21,12 @@

# 3) Apply quantization and save in compressed-tensors format.
OUTPUT_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
oneshot(model=model, recipe=recipe, output_dir=OUTPUT_DIR, tokenizer=tokenizer)
oneshot(
model=model,
recipe=recipe,
tokenizer=tokenizer,
output_dir=OUTPUT_DIR,
)

# Confirm generations of the quantized model look sane.
print("========== SAMPLE GENERATION ==============")
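
A minimal sketch of re-loading the saved checkpoint through the plain `transformers` pathway, assuming the `OUTPUT_DIR` naming derived above and enough memory for the model:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# matches MODEL_ID.split("/")[1] + "-FP8-Dynamic" from the example
OUTPUT_DIR = "gemma-2-27b-it-FP8-Dynamic"

model = AutoModelForCausalLM.from_pretrained(
    OUTPUT_DIR, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(OUTPUT_DIR)

inputs = tokenizer("Hello my name is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0]))
```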
13 changes: 9 additions & 4 deletions examples/quantization_w8a8_fp8/llama3.2_vision_example.py
@@ -1,13 +1,14 @@
from transformers import AutoProcessor, MllamaForConditionalGeneration

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot, wrap_hf_model_class
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Load model.
model_class = wrap_hf_model_class(MllamaForConditionalGeneration)
model = model_class.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
model = MllamaForConditionalGeneration.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.
@@ -22,7 +23,11 @@

# Apply quantization and save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
oneshot(model=model, recipe=recipe, output_dir=SAVE_DIR)
oneshot(
model=model,
recipe=recipe,
output_dir=SAVE_DIR,
)
processor.save_pretrained(SAVE_DIR)

# Confirm generations of the quantized model look sane.
6 changes: 3 additions & 3 deletions examples/quantization_w8a8_fp8/llama3_example.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load model.
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
7 changes: 4 additions & 3 deletions examples/quantization_w8a8_fp8/llava1.5_example.py
@@ -1,13 +1,14 @@
from transformers import AutoProcessor, LlavaForConditionalGeneration

from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot, wrap_hf_model_class
from llmcompressor.transformers import oneshot

MODEL_ID = "llava-hf/llava-1.5-7b-hf"

# Load model.
model_class = wrap_hf_model_class(LlavaForConditionalGeneration)
model = model_class.from_pretrained(MODEL_ID, device_map="auto", torch_dtype="auto")
model = LlavaForConditionalGeneration.from_pretrained(
MODEL_ID, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Configure the quantization algorithm and scheme.