Int4 Packed Compressor (#47)
* group size

* add logic in base observer

* Compressed lifecycle implementation (INT8 only)

* group size full lifecycle run

* Apply suggestions from code review

* before vectorize the for loop

* comments, todo add channelwise

* chan wise impl

* comments

* fix channel wise

* comments, validators

* fix typo

* small fixes for runtime

* add classes

* tensor return error fix

* WIP

* moving around classes

* fix sparseml-side of code and add per channel

* pydantic defaults

* token wise quant

* Update src/compressed_tensors/quantization/quant_args.py

Co-authored-by: Benjamin Fineran <bfineran@users.noreply.github.com>

* comments

* code complete

* tests passing

* unit test bugs

* fill out int decompression

* docstrings

* allow repeat frozens

* update dim

* int compressor unit tests

* move helper

* shape consistency

* initial commit

* first unit test passing

* Update src/compressed_tensors/quantization/lifecycle/forward.py

Co-authored-by: Benjamin Fineran <bfineran@users.noreply.github.com>

* comments

* tests passing

* one more test

* cleanup

* pass test_quant_args

* Quantization Compressor Support (#45)

* add classes

* WIP

* moving around classes

* code complete

* tests passing

* unit test bugs

* fill out int decompression

* docstrings

* allow repeat frozens

* int compressor unit tests

* PR comments

* fix device issue

* fixing leaf checker

* updating tests

* docstrings

* updating examples

* update examples

* fix channelwise

* new tests, some fail

* WIP

* new helper fn

* actually just a warning

* group size speedups + fixes

* group compression

* fix output type on decompress

* fix channelwise

* revert

* more tests

* move tests

* example notebook

* add example notebook

* update README

* cleanup

---------

Co-authored-by: George Ohashi <george@neuralmagic.com>
Co-authored-by: Benjamin <ben@neuralmagic.com>
Co-authored-by: Benjamin Fineran <bfineran@users.noreply.github.com>
4 people authored May 15, 2024
1 parent d914b73 commit b35ecda
Showing 15 changed files with 882 additions and 35 deletions.
41 changes: 41 additions & 0 deletions README.md
@@ -82,3 +82,44 @@ state_dict = dict(load_compressed("compressed_model.safetensors", compression_co
```

For a more in-depth tutorial on bitmask compression, refer to the [notebook](https://github.com/neuralmagic/compressed-tensors/blob/d707c5b84bc3fef164aebdcd97cb6eaa571982f8/examples/bitmask_compression.ipynb).


## Saving a Compressed Model with PTQ

We can use compressed-tensors to run basic post-training quantization (PTQ) and save the quantized model to disk in its compressed form:

```python
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda:0")

config = QuantizationConfig.parse_file("./examples/bit_packing/int4_config.json")
config.quantization_status = QuantizationStatus.CALIBRATION
apply_quantization_config(model, config)

dataset = load_dataset("ptb_text_only")["train"]
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
return tokenizer(examples["sentence"], padding=False, truncation=True, max_length=1024)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
data_loader = DataLoader(tokenized_dataset, batch_size=1, collate_fn=DefaultDataCollator())

with torch.no_grad():
for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
sample = {key: value.to(device) for key,value in sample.items()}
_ = model(**sample)

if idx >= 512:
break

model.apply(freeze_module_quantization)
model.apply(compress_quantized_weights)

output_dir = "./ex_llama1.1b_w4a16_packed_quantize"
compressor = ModelCompressor(quantization_config=config)
compressed_state_dict = compressor.compress(model)
model.save_pretrained(output_dir, state_dict=compressed_state_dict)
```

For a more in-depth tutorial on quantization compression, refer to the [notebook](./examples/quantize_and_pack_int4.ipynb).
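
As a quick sanity check after saving, you can open the serialized checkpoint and list the stored tensors. This is a minimal sketch only: the single-shard filename `model.safetensors` and key substrings such as `packed`, `scale`, and `zero_point` are assumptions about the pack-quantized naming scheme, not guarantees from the snippet above.

```python
# Hedged sketch: inspect the compressed checkpoint written by save_pretrained.
# The shard name and tensor key names below are assumptions, not confirmed API.
import os
from safetensors import safe_open

ckpt = os.path.join(output_dir, "model.safetensors")  # assumes a single shard
with safe_open(ckpt, framework="pt") as f:
    for name in f.keys():
        if "packed" in name or "scale" in name or "zero_point" in name:
            tensor = f.get_tensor(name)
            print(name, tuple(tensor.shape), tensor.dtype)
```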
87 changes: 87 additions & 0 deletions examples/bit_packing/ex_quantize_and_pack.py
@@ -0,0 +1,87 @@
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from tqdm import tqdm
from torch.utils.data import RandomSampler
from compressed_tensors.quantization import (
apply_quantization_config,
freeze_module_quantization,
QuantizationConfig,
QuantizationStatus,
)
from sparseml.transformers.finetune.data.data_args import DataTrainingArguments
from sparseml.transformers.finetune.data.base import TextGenerationDataset
from transformers import AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator
from torch.utils.data import DataLoader
from sparseml.pytorch.utils import tensors_to_device
import torch
from compressed_tensors.compressors import ModelCompressor

config_file = "int4_config.json"
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
dataset_name = "open_platypus"
split = "train"
num_calibration_samples = 128
max_seq_length = 512
pad_to_max_length = False
output_dir = "./llama1.1b_new_quant_out_test_packing"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
model.eval() # no grad or updates needed for base model
config = QuantizationConfig.parse_file(config_file)

# set status to calibration
config.quantization_status = QuantizationStatus.CALIBRATION

# initialize quantization
apply_quantization_config(model, config)

# create dataset
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_args = DataTrainingArguments(
    dataset=dataset_name,
    max_seq_length=max_seq_length,
    pad_to_max_length=pad_to_max_length,
)
dataset_manager = TextGenerationDataset.load_from_registry(
    data_args.dataset,
    data_args=data_args,
    split=split,
    tokenizer=tokenizer,
)
calib_dataset = dataset_manager.tokenize_and_process(
    dataset_manager.get_raw_dataset()
)
data_loader = DataLoader(
    calib_dataset,
    batch_size=1,
    collate_fn=DefaultDataCollator(),
    sampler=RandomSampler(calib_dataset),
)

# run calibration
with torch.no_grad():
    for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
        sample = tensors_to_device(sample, device)
        _ = model(**sample)

        if idx >= num_calibration_samples:
            break

# freeze params after calibration
model.apply(freeze_module_quantization)

# apply compression
compressor = ModelCompressor(quantization_config=config)
compressed_state_dict = compressor.compress(model)
model.save_pretrained(output_dir, state_dict=compressed_state_dict)
compressor.update_config(output_dir)
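
The core idea behind the `pack-quantized` format is to store each 4-bit quantized weight in a fraction of a byte rather than a full int8 or fp16 value. The snippet below is only a toy illustration of packing and unpacking two unsigned 4-bit values per uint8 byte; the library's actual packing layout, dtypes, and tensor names may differ.

```python
# Toy illustration of 4-bit packing: two unsigned 4-bit values per uint8 byte.
# This is NOT the library's exact layout, just the underlying idea.
import torch

def pack_int4(x: torch.Tensor) -> torch.Tensor:
    """Pack values in [0, 15] (even element count) into half as many uint8 bytes."""
    x = x.to(torch.uint8).reshape(-1, 2)
    # low nibble holds the first value, high nibble holds the second
    return x[:, 0] | (x[:, 1] << 4)

def unpack_int4(packed: torch.Tensor) -> torch.Tensor:
    low = packed & 0x0F
    high = (packed >> 4) & 0x0F
    return torch.stack([low, high], dim=1).reshape(-1)

vals = torch.randint(0, 16, (8,))
packed = pack_int4(vals)
assert torch.equal(unpack_int4(packed).to(vals.dtype), vals)
print(f"{vals.numel()} int4 values stored in {packed.numel()} bytes")
```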
17 changes: 17 additions & 0 deletions examples/bit_packing/int4_config.json
@@ -0,0 +1,17 @@
{
    "quant_method": "sparseml",
    "format": "pack-quantized",
    "global_compression_ratio": null,
    "config_groups": {
        "group_1": {
            "weights": {
                "num_bits": 4,
                "type": "int",
                "symmetric": false,
                "strategy": "tensor"
            },
            "targets": ["Linear"]
        }
    },
    "ignore": ["lm_head"]
}
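
The same configuration can also be built programmatically instead of parsed from JSON. The sketch below assumes that `QuantizationArgs` and `QuantizationScheme` (with the field names shown) are exported from `compressed_tensors.quantization`; only `QuantizationConfig.parse_file` appears directly in the examples above, so treat the rest as an assumption about the API rather than a confirmed usage.

```python
# Hedged sketch: programmatic equivalent of int4_config.json.
# QuantizationArgs / QuantizationScheme and their exact field names are assumed
# from the package layout; only QuantizationConfig is used directly above.
from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationConfig,
    QuantizationScheme,
)

config = QuantizationConfig(
    quant_method="sparseml",
    format="pack-quantized",
    config_groups={
        "group_1": QuantizationScheme(
            targets=["Linear"],
            weights=QuantizationArgs(num_bits=4, type="int", symmetric=False, strategy="tensor"),
        )
    },
    ignore=["lm_head"],
)
print(config.json(indent=2))  # pydantic v1-style serialization, matching parse_file above
```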
20 changes: 11 additions & 9 deletions examples/llama_1.1b/ex_config_quantization.py
@@ -35,7 +35,7 @@
max_seq_length = 1024
pad_to_max_length = False
output_dir = "./llama1.1b_new_quant_out"
device = "cuda:0" if torch.cuda_is_available() else "cpu"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(model_name, device_map=device)
model.eval() # no grad or updates needed for base model
@@ -68,18 +68,20 @@
)

# run calibration
-for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
-    sample = tensors_to_device(sample, "cuda:0")
-    _ = model(**sample)
+with torch.no_grad():
+    for idx, sample in tqdm(enumerate(data_loader), desc="Running calibration"):
+        sample = tensors_to_device(sample, "cuda:0")
+        _ = model(**sample)

-    if idx >= num_calibration_samples:
-        break
+        if idx >= num_calibration_samples:
+            break

# freeze params after calibration
model.apply(freeze_module_quantization)

-# this functionality will move but for now we need to get the save override from
-# SparseML in order to save the config
-from sparseml.transformers.compression import modify_save_pretrained
+# save quantized model
+from sparseml.transformers.sparsification.compressed_tensors_utils import (
+    modify_save_pretrained,
+)
modify_save_pretrained(model)
model.save_pretrained(output_dir)