Int4 Packed Compressor (#47)
* group size

* add logic in base observer

* Compressed lifecycle implementation (INT8 only)

* group size full lifecycle run

* Apply suggestions from code review

* before vectorize the for loop

* comments, todo add channelwise

* chan wise impl

* comments

* fix channel wise

* comments, validators

* fix typo

* small fixes for runtime

* add classes

* tensor return error fix

* WIP

* moving around classes

* fix sparseml-side of code and add per channel

* pydantic defaults

* token wise quant

* Update src/compressed_tensors/quantization/quant_args.py

Co-authored-by: Benjamin Fineran <bfineran@users.noreply.github.com>

* comments

* code complete

* tests passing

* unit test bugs

* fill out int decompression

* docstrings

* allow repeat frozens

* update dim

* int compressor unit tests

* move helper

* shape consistency

* initial commit

* first unit test passing

* Update src/compressed_tensors/quantization/lifecycle/forward.py

Co-authored-by: Benjamin Fineran <bfineran@users.noreply.github.com>

* comments

* tests passing

* one more test

* cleanup

* pass test_quant_args

* Quantization Compressor Support (#45)

* add classes

* WIP

* moving around classes

* code complete

* tests passing

* unit test bugs

* fill out int decompression

* docstrings

* allow repeat frozens

* int compressor unit tests

* PR comments

* fix device issue

* fixing leaf checker

* updating tests

* docstrings

* updating examples

* update examples

* fix channelwise

* new tests, some fail

* WIP

* new helper fn

* actually just a warning

* group size speedups + fixes

* group compression

* fix output type on decompress

* fix channelwise

* revert

* more tests

* move tests

* example notebook

* add example notebook

* update README

* cleanup

---------

Co-authored-by: George Ohashi <george@neuralmagic.com>
Co-authored-by: Benjamin <ben@neuralmagic.com>
Co-authored-by: Benjamin Fineran <bfineran@users.noreply.github.com>
4 people authored May 15, 2024
1 parent d914b73 commit b35ecda
Showing 15 changed files with 882 additions and 35 deletions.
41 changes: 41 additions & 0 deletions README.md
@@ -82,3 +82,44 @@ state_dict = dict(load_compressed("compressed_model.safetensors", compression_co
```

For a more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).


## Saving a Compressed Model with PTQ

We can use compressed-tensors to run basic post-training quantization (PTQ) and save the quantized model to disk in its compressed form:

```python
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda:0")

config = QuantizationConfig.parse_file("./examples/bit_packing/int4_config.json")
config.quantization_status = QuantizationStatus.CALIBRATION
apply_quantization_config(model, config)

dataset = load_dataset("ptb_text_only")["train"]
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
return tokenizer(examples["sentence"], padding=False, truncation=True, max_length=1024)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
data_loader = DataLoader(tokenized_dataset, batch_size=1, collate_fn=DefaultDataCollator())

with torch.no_grad():
for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
sample = {key: value.to(device) for key,value in sample.items()}
_ = model(**sample)

if idx >= 512:
break

model.apply(freeze_module_quantization)
model.apply(compress_quantized_weights)

output_dir = "./ex_llama1.1b_w4a16_packed_quantize"
compressor = ModelCompressor(quantization_config=config)
compressed_state_dict = compressor.compress(model)
model.save_pretrained(output_dir, state_dict=compressed_state_dict)
```

For a more in-depth tutorial on quantization compression, refer to the [notebook](./examples/quantize_and_pack_int4.ipynb).
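
As a quick sanity check after saving, you can open the serialized checkpoint and list the stored tensors. This is a minimal sketch only: the single-shard filename `model.safetensors` and key substrings such as `packed`, `scale`, and `zero_point` are assumptions about the pack-quantized naming scheme, not guarantees from the snippet above.

```python
# Hedged sketch: inspect the compressed checkpoint written by save_pretrained.
# The shard name and tensor key names below are assumptions, not confirmed API.
import os
from safetensors import safe_open

ckpt = os.path.join(output_dir, "model.safetensors")  # assumes a single shard
with safe_open(ckpt, framework="pt") as f:
    for name in f.keys():
        if "packed" in name or "scale" in name or "zero_point" in name:
            tensor = f.get_tensor(name)
            print(name, tuple(tensor.shape), tensor.dtype)
```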
87 changes: 87 additions & 0 deletions examples/bit_packing/ex_quantize_and_pack.py
@@ -0,0 +1,87 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tqdm import tqdm
from torch.utils.data import RandomSampler
from compressed_tensors.quantization import (
apply_quantization_config,
freeze_module_quantization,
QuantizationConfig,
QuantizationStatus,
)
from sparseml.transformers.finetune.data.data_args import DataTrainingArguments
from sparseml.transformers.finetune.data.base import TextGenerationDataset
from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator
from torch.utils.data import DataLoader
from sparseml.pytorch.utils import tensors_to_device
import torch
from compressed_tensors.compressors import ModelCompressor

config_file = "int4_config.json"
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
dataset_name = "open_platypus"
split = "train"
num_calibration_samples = 128
max_seq_length = 512
pad_to_max_length = False
output_dir = "./llama1.1b_new_quant_out_test_packing"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
model.eval() # no grad or updates needed for base model
config = QuantizationConfig.parse_file(config_file)

# set status to calibration
config.quantization_status = QuantizationStatus.CALIBRATION

# initialize quantization
apply_quantization_config(model, config)

# create dataset
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_args = DataTrainingArguments(
    dataset=dataset_name,
    max_seq_length=max_seq_length,
    pad_to_max_length=pad_to_max_length,
)
dataset_manager = TextGenerationDataset.load_from_registry(
    data_args.dataset,
    data_args=data_args,
    split=split,
    tokenizer=tokenizer,
)
calib_dataset = dataset_manager.tokenize_and_process(
    dataset_manager.get_raw_dataset()
)
data_loader = DataLoader(
    calib_dataset,
    batch_size=1,
    collate_fn=DefaultDataCollator(),
    sampler=RandomSampler(calib_dataset),
)

# run calibration
with torch.no_grad():
    for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
        sample = tensors_to_device(sample, device)
        _ = model(**sample)

        if idx >= num_calibration_samples:
            break

# freeze params after calibration
model.apply(freeze_module_quantization)

# apply compression
compressor = ModelCompressor(quantization_config=config)
compressed_state_dict = compressor.compress(model)
model.save_pretrained(output_dir, state_dict=compressed_state_dict)
compressor.update_config(output_dir)
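
The core idea behind the `pack-quantized` format is to store each 4-bit quantized weight in a fraction of a byte rather than a full int8 or fp16 value. The snippet below is only a toy illustration of packing and unpacking two unsigned 4-bit values per uint8 byte; the library's actual packing layout, dtypes, and tensor names may differ.

```python
# Toy illustration of 4-bit packing: two unsigned 4-bit values per uint8 byte.
# This is NOT the library's exact layout, just the underlying idea.
import torch

def pack_int4(x: torch.Tensor) -> torch.Tensor:
    """Pack values in [0, 15] (even element count) into half as many uint8 bytes."""
    x = x.to(torch.uint8).reshape(-1, 2)
    # low nibble holds the first value, high nibble holds the second
    return x[:, 0] | (x[:, 1] << 4)

def unpack_int4(packed: torch.Tensor) -> torch.Tensor:
    low = packed & 0x0F
    high = (packed >> 4) & 0x0F
    return torch.stack([low, high], dim=1).reshape(-1)

vals = torch.randint(0, 16, (8,))
packed = pack_int4(vals)
assert torch.equal(unpack_int4(packed).to(vals.dtype), vals)
print(f"{vals.numel()} int4 values stored in {packed.numel()} bytes")
```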
17 changes: 17 additions & 0 deletions examples/bit_packing/int4_config.json
@@ -0,0 +1,17 @@
{
    "quant_method": "sparseml",
    "format": "pack-quantized",
    "global_compression_ratio": null,
    "config_groups": {
        "group_1": {
            "weights": {
                "num_bits": 4,
                "type": "int",
                "symmetric": false,
                "strategy": "tensor"
            },
            "targets": ["Linear"]
        }
    },
    "ignore": ["lm_head"]
}
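
The same configuration can also be built programmatically instead of parsed from JSON. The sketch below assumes that `QuantizationArgs` and `QuantizationScheme` (with the field names shown) are exported from `compressed_tensors.quantization`; only `QuantizationConfig.parse_file` appears directly in the examples above, so treat the rest as an assumption about the API rather than a confirmed usage.

```python
# Hedged sketch: programmatic equivalent of int4_config.json.
# QuantizationArgs / QuantizationScheme and their exact field names are assumed
# from the package layout; only QuantizationConfig is used directly above.
from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationConfig,
    QuantizationScheme,
)

config = QuantizationConfig(
    quant_method="sparseml",
    format="pack-quantized",
    config_groups={
        "group_1": QuantizationScheme(
            targets=["Linear"],
            weights=QuantizationArgs(num_bits=4, type="int", symmetric=False, strategy="tensor"),
        )
    },
    ignore=["lm_head"],
)
print(config.json(indent=2))  # pydantic v1-style serialization, matching parse_file above
```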
20 changes: 11 additions & 9 deletions examples/llama_1.1b/ex_config_quantization.py
@@ -35,7 +35,7 @@
max_seq_length = 1024
pad_to_max_length = False
output_dir = "./llama1.1b_new_quant_out"
device = "cuda:0" if torch.cuda_is_available() else "cpu"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
model.eval() # no grad or updates needed for base model
@@ -68,18 +68,20 @@
)

# run calibration
-for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
-    sample = tensors_to_device(sample, "cuda:0")
-    _ = model(**sample)
+with torch.no_grad():
+    for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
+        sample = tensors_to_device(sample, "cuda:0")
+        _ = model(**sample)

-    if idx >= num_calibration_samples:
-        break
+        if idx >= num_calibration_samples:
+            break

# freeze params after calibration
model.apply(freeze_module_quantization)

-# this functionality will move but for now we need to get the save override from
-# SparseML in order to save the config
-from sparseml.transformers.compression import modify_save_pretrained
+# save quantized model
+from sparseml.transformers.sparsification.compressed_tensors_utils import (
+    modify_save_pretrained,
+)
modify_save_pretrained(model)
model.save_pretrained(output_dir)