Add Support for Efficient Inference #47

Merged · 171 commits · Nov 20, 2024

Commits
9069b58
End-to-end integration
elvircrn Aug 29, 2024
a12e91f
Adds initial support for CUDA inference
elvircrn Sep 6, 2024
0e195b4
Bugfixes
elvircrn Sep 8, 2024
6ade617
Demo ready
elvircrn Sep 11, 2024
5ad861d
Demo ready
elvircrn Sep 11, 2024
dc5ad18
Remove absolute path
elvircrn Sep 11, 2024
737f0b3
Rename spqr -> inference in benchmarks
elvircrn Sep 12, 2024
9c587b4
Update benchmark code
elvircrn Sep 12, 2024
ff55c85
Dead code removal
elvircrn Sep 12, 2024
dbb7164
Trying out a new fused kernel implementation
elvircrn Sep 14, 2024
9f9a3ec
Fused kernel getting there...
elvircrn Sep 15, 2024
6e33f72
Major fused kernel improvements
elvircrn Sep 16, 2024
e91263f
Make the fused kernel more async
elvircrn Sep 16, 2024
2474292
Fused kernel WIP
elvircrn Sep 16, 2024
2916175
Fused kernel mean speed-up over 3X
elvircrn Sep 17, 2024
9533f71
Done developing the fused kernel
elvircrn Sep 17, 2024
3d49eb3
Get rid of absolute paths
elvircrn Sep 18, 2024
a9d586a
Start work on reorder optimization
elvircrn Sep 19, 2024
e95d292
Get ready for end-to-end benchmarks
elvircrn Sep 19, 2024
12f1b7d
WIDTH=16
elvircrn Sep 19, 2024
8bc942f
WIDTH=16
elvircrn Sep 19, 2024
6b0ecbf
Increase pipeline depth
elvircrn Sep 19, 2024
7059d1c
GPU specialization
elvircrn Sep 19, 2024
192a356
Reenable pipelining
elvircrn Sep 19, 2024
eddac7d
Try setting block size to 32
elvircrn Sep 19, 2024
99347d9
Try setting block size to 64
elvircrn Sep 19, 2024
7bda888
Try disabling the LUT
elvircrn Sep 19, 2024
15a1e2c
Try returning early
elvircrn Sep 19, 2024
e2c7337
Try returning earlier
elvircrn Sep 19, 2024
95fa2f8
Streamline loops
elvircrn Sep 20, 2024
ba23dd4
Bring back actual computing
elvircrn Sep 20, 2024
96a8163
Work on making everything async
elvircrn Sep 22, 2024
d60ae9a
Work on memory compression
elvircrn Sep 22, 2024
e5156c9
update the slow version
elvircrn Sep 23, 2024
49cd194
Finalize optimizations
elvircrn Sep 23, 2024
d231bc9
Apply it to A100
elvircrn Sep 23, 2024
d873c71
Remove torchvision
elvircrn Sep 23, 2024
e75b32c
Tweak A100 settings
elvircrn Sep 23, 2024
f38e588
Get rid of other inefficiencies
elvircrn Sep 23, 2024
c467255
Resolve failing test issues
elvircrn Sep 24, 2024
8d1e845
Uncomment __synchthread
elvircrn Sep 24, 2024
e47907d
Minor perf wins
elvircrn Sep 24, 2024
4260af4
Add cache policy
elvircrn Sep 24, 2024
d343225
why is ldcs broken?
elvircrn Sep 24, 2024
9023f21
Tweak A100 settings
elvircrn Sep 24, 2024
e01bbc1
Go back to ldg
elvircrn Sep 24, 2024
2ac5f59
resolve typo
elvircrn Sep 24, 2024
2badfb0
Minor tweaks
elvircrn Sep 24, 2024
a735768
A100 tweaks
elvircrn Sep 24, 2024
9156dcd
A100 tweaks
elvircrn Sep 24, 2024
a9037b9
80
elvircrn Sep 24, 2024
de2afb3
A100 tweaks
elvircrn Sep 24, 2024
ea86c59
Apply it to A100
elvircrn Sep 24, 2024
c93d282
Resolve failing tests
elvircrn Sep 25, 2024
a1ed722
Run configuration tweaks
elvircrn Sep 25, 2024
3a26daf
Dead code removal
elvircrn Sep 25, 2024
e2d449d
Dead code removal
elvircrn Sep 25, 2024
19a481d
Fix benchmak
elvircrn Sep 25, 2024
37f5efe
Move test utils into a separate file
elvircrn Sep 25, 2024
83fef7c
Remove duplicate function
elvircrn Sep 25, 2024
022bcc5
Sparsity analysis refactor
elvircrn Sep 25, 2024
5f5f841
Dead code removal
elvircrn Sep 25, 2024
730283d
Remove old benchmark method
elvircrn Sep 25, 2024
6840990
Compression refactor
elvircrn Sep 25, 2024
a8efd8a
Update inference code
elvircrn Sep 25, 2024
bec333e
Remove hardcoded .so load
elvircrn Sep 25, 2024
d95ee80
Remove torch jit script
elvircrn Sep 25, 2024
7406739
Remove source activate from build.sh
elvircrn Sep 25, 2024
ce195e3
Use a saner default
elvircrn Sep 25, 2024
14e2474
Prepare for endtoend benchmark
elvircrn Sep 25, 2024
b5c64a0
Prepare for endtoend benchmark
elvircrn Sep 25, 2024
b90e64d
Optimize inference
elvircrn Sep 25, 2024
0b185cc
A100 optimizations
elvircrn Sep 25, 2024
6bf1934
Further optimization for inference
elvircrn Sep 25, 2024
4aa0a49
remove perm temporarely
elvircrn Sep 25, 2024
218a191
remove device query
elvircrn Sep 25, 2024
a5722b6
Tweak a100 config
elvircrn Sep 25, 2024
3bb4aa0
Async
elvircrn Sep 25, 2024
4779a0c
Optimize inference
elvircrn Sep 25, 2024
1ebff7e
remove reordering
elvircrn Sep 25, 2024
36c42d5
remove device lookup again
elvircrn Sep 25, 2024
e144c92
no cheating
elvircrn Sep 25, 2024
ae22edc
torch empty
elvircrn Sep 25, 2024
09c6cbe
Bring back reordering
elvircrn Sep 25, 2024
8c89098
Proper reordering
elvircrn Sep 25, 2024
817c6b0
Streamline sparsity
elvircrn Sep 25, 2024
bf84de8
Special case density
elvircrn Sep 27, 2024
e93d970
multidim
elvircrn Sep 28, 2024
d700e4f
Some minor tweaks
elvircrn Sep 29, 2024
f0e9fbd
Add support for multidim threadlaunch; Get rid of all atomics
elvircrn Sep 30, 2024
b2143f2
Merge branch 'multidim' into inference
elvircrn Sep 30, 2024
d307b0d
Get rid of daed tests
elvircrn Sep 30, 2024
f94fff3
Bring back fast config
elvircrn Sep 30, 2024
8e32659
Also bring back dense-only config
elvircrn Sep 30, 2024
9572eea
Revert to old thread config
elvircrn Oct 1, 2024
a8dd177
Experimental
elvircrn Oct 12, 2024
b8a264e
Resolve the issues with the new format
elvircrn Oct 14, 2024
382c081
Sparse v2
elvircrn Oct 15, 2024
1c918b2
Resolve some maladies
elvircrn Oct 16, 2024
a5ed000
Resolve all issues with the new csr format
elvircrn Oct 16, 2024
e03a65a
Visualization work
elvircrn Oct 21, 2024
c2cf2fc
Speeding this up even more
elvircrn Oct 22, 2024
9281be4
Also measure modified CSR
elvircrn Oct 22, 2024
784e2ef
Optimize LUT loads/Put pipelined loads behind a flag
elvircrn Oct 24, 2024
5586150
Bench visualization updates
elvircrn Oct 27, 2024
441fdf4
Visualization updates
elvircrn Oct 28, 2024
b35c7c8
Significant speed-ups made for inference
elvircrn Nov 1, 2024
7e8ece3
Resolved most PR comments; Minor speed-ups; Refactor; Started work on…
elvircrn Nov 5, 2024
4503f92
Simplify setup.py
elvircrn Nov 5, 2024
8ce70af
Refactor torch mul
elvircrn Nov 5, 2024
2fdec88
Significantly simply the inference demo
elvircrn Nov 5, 2024
87525d9
Inference simplifications; Documentation updates
elvircrn Nov 5, 2024
dca2136
Remove inference license
elvircrn Nov 5, 2024
3dfcac9
Remove environment hardcore from ncu.sh script
elvircrn Nov 5, 2024
243a94b
Move ncu.sh out of repo
elvircrn Nov 5, 2024
3cb64c4
Move benchmark visualization
elvircrn Nov 5, 2024
1d6e0f9
make tests sligthly less descriptiive
elvircrn Nov 5, 2024
9c051a3
Make benchmark nicer
elvircrn Nov 5, 2024
38a5d5c
Make benchmarks up-to-date
elvircrn Nov 5, 2024
d6be547
Resolve issue with PTCSR
elvircrn Nov 5, 2024
274c1dd
Kernel simplification
elvircrn Nov 5, 2024
2c28527
Remove all mentions of elvircrn/
elvircrn Nov 5, 2024
0a0a9de
HF integration
elvircrn Nov 6, 2024
447f887
HF integration
elvircrn Nov 6, 2024
875b787
Update readme for benchmark py
elvircrn Nov 8, 2024
49c9211
Further README updates
elvircrn Nov 8, 2024
cbdbc69
Make PTCSR optional
elvircrn Nov 8, 2024
bbd456d
Remove the use of the mold linker
elvircrn Nov 8, 2024
2348671
Actually resolve the PTCSR path in bench.py
elvircrn Nov 8, 2024
f4bc3a1
Actually resolve the PTCSR path in bench.py
elvircrn Nov 8, 2024
1b4f32a
Actually resolve the PTCSR path in bench.py
elvircrn Nov 8, 2024
8c981c0
Make benchmark script nicer to work with
elvircrn Nov 8, 2024
10b3e65
Docs update
elvircrn Nov 9, 2024
4bed26c
Tune threads for A100
elvircrn Nov 9, 2024
69dd1c5
End-to-end bugfix
elvircrn Nov 9, 2024
dd610ab
Bring back Sdpa for end-to-end inference
elvircrn Nov 9, 2024
5a66c6b
Bring back inductor
elvircrn Nov 9, 2024
cb37502
Temporarely disable reordering
elvircrn Nov 9, 2024
6324564
Torchscript=True
elvircrn Nov 9, 2024
86570fd
Get rid of flattening
elvircrn Nov 9, 2024
09fdc04
Temp disable indexing
elvircrn Nov 9, 2024
208c633
Get rid of all indices
elvircrn Nov 9, 2024
50d0e40
Split forward() into two parts
elvircrn Nov 9, 2024
4f65a7d
Get rid of torch no grad
elvircrn Nov 9, 2024
3eafdd6
stop compiling twice
elvircrn Nov 10, 2024
46b0fb7
Disable sdpa optimization
elvircrn Nov 10, 2024
6cb83c1
Try with default
elvircrn Nov 10, 2024
aec7c2b
Temporarely disable reordering
elvircrn Nov 10, 2024
32af5b0
Simplify flattening
elvircrn Nov 10, 2024
05a9154
Get rid of flatten
elvircrn Nov 10, 2024
35aa8d8
Get rid of flatten
elvircrn Nov 10, 2024
bdcfcdb
Revert "Get rid of flatten"
elvircrn Nov 10, 2024
7fa9431
Increase prediction length
elvircrn Nov 10, 2024
2161b13
Even more optimization work, somehow
elvircrn Nov 11, 2024
9ed5217
Try ramping up thread count
elvircrn Nov 11, 2024
25d4f80
Ramp down thread count
elvircrn Nov 11, 2024
77b5392
Resolve critical bug
elvircrn Nov 11, 2024
fc930ba
Ramp up the thread count
elvircrn Nov 13, 2024
53cbccc
Ramp up the thread count
elvircrn Nov 13, 2024
9d6a157
Fix benchmarking bug
elvircrn Nov 13, 2024
4cbd7b5
Reduce thread count
elvircrn Nov 13, 2024
8daecb1
Change thread count
elvircrn Nov 13, 2024
5206238
Finalize thread count
elvircrn Nov 13, 2024
63a525c
Resolve all ptcsr bugs
elvircrn Nov 14, 2024
f007655
Finalize benchmark updates
elvircrn Nov 15, 2024
eb46401
Finalize benchmark updates
elvircrn Nov 15, 2024
2529106
Finalize HF support
elvircrn Nov 18, 2024
9817e09
Remove added gitiginore lines
elvircrn Nov 20, 2024
c7f6cb8
Address PR comments
elvircrn Nov 20, 2024
53d096c
Rename LLama -> InferenceDemo
elvircrn Nov 20, 2024
1477ee0
Apply black and isort
elvircrn Nov 20, 2024
1 change: 1 addition & 0 deletions .env
@@ -0,0 +1 @@
PYTHONPATH=.
8 changes: 5 additions & 3 deletions .gitignore
@@ -1,8 +1,10 @@
.vscode/*
.idea/*
.ipynb_checkpoints/*
__pycache__
__pycache__/*
SpQR/.ipynb_checkpoints/*
SpQR/__pycache__/*
spqr/.ipynb_checkpoints/*
spqr/__pycache__/*
outliers/*
outliers_stru*
wandb/*
@@ -17,4 +19,4 @@ lm-evaluation-harness/*.sh
lm-evaluation-harness/lm_eval/datasets/*/__pycache__/
lm-evaluation-harness/lm_eval/*/__pycache__/
lm-evaluation-harness/lm_eval/__pycache__/
lm-evaluation-harness/lm_cache
lm-evaluation-harness/lm_cache
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,4 +1,4 @@
Apache License
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/

108 changes: 107 additions & 1 deletion README.md
@@ -133,6 +133,112 @@ Performance and runtime notes:
* With enough spare GPU memory, one can raise batch size to accelerate evaluation process.


## Inference

This repository also contains an efficient CUDA kernel implementation of the
SpQR matvec. The file `inference_demo.py` demonstrates this functionality by
running end-to-end model inference. Its command-line interface is shown below.

```bash
usage: inference_demo.py [-h] [--pretrained_model_path PRETRAINED_MODEL_PATH] [--compressed_model_path COMPRESSED_MODEL_PATH] --execution_mode {0,1}

options:
-h, --help show this help message and exit
--pretrained_model_path PRETRAINED_MODEL_PATH
Path to the pretrained model
--compressed_model_path COMPRESSED_MODEL_PATH
Path to the compressed .pt model
--execution_mode {0,1}
If set to 0, will evaluate the dense pretrained model. If set to 1, will evaluate the SpQR-quantized model
```

This script also reports the mean and median time of the forward() passes and the total inference execution time.
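
For example, a run of the SpQR-quantized model might look like the following. This is only a sketch: the model name and checkpoint paths are placeholders, not files shipped with this PR.

```bash
# Hypothetical example - substitute your own model name and checkpoint paths.
python3 inference_demo.py \
  --pretrained_model_path meta-llama/Llama-2-7b-hf \
  --compressed_model_path ./llama-2-7b-spqr.pt \
  --execution_mode 1
```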

## Prerequisites for Running the Conversion Scripts, Tests and Benchmarks

In order to run the benchmark and test suite, you need to build the sources used by these scripts.
You can do so by running the following command:

```bash
/bin/bash scripts/build.sh
```

which simply runs the `setup.py` script.
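
For reference, the build script added in this PR amounts to a single command; `MAX_JOBS` caps the number of parallel compile jobs used by the torch extension build:

```bash
MAX_JOBS=16 python3 setup.py install
```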

## Conversion From Legacy to Optimized SpQR Storage

Running SpQR produces tensors whose weights are stored as int8 (the legacy format). In order to run the efficient
inference kernels, one must first convert these legacy tensors into the optimized storage format used by the CUDA
kernel. To do so, run the following script:

```bash
usage: convert_legacy_model_format.py [-h] --base_model BASE_MODEL --legacy_model_path LEGACY_MODEL_PATH [--sparse_strategy {csr,ptcsr,optimize_latency}] [--save_pt SAVE_PT] [--save_per_layer SAVE_PER_LAYER]

options:
-h, --help show this help message and exit
--base_model BASE_MODEL
path or name of the unquantized model
--legacy_model_path LEGACY_MODEL_PATH
path to legacy model
--sparse_strategy {csr,ptcsr,optimize_latency}
Sparse storage strategy. Options: csr, ptcsr, optimize_latency. CSR - Compressed Sparse Rows; PTCSR - alternative storage format; optimize_latency - use the current GPU to determine the
optimal storage format to reduce kernel latency
--save_pt SAVE_PT Save the converted quantized .pt model here
--save_per_layer SAVE_PER_LAYER
Save the converted quantized model per layer here - useful for benchmarking individual layers
```
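
For instance, a conversion run might look like this. The model name and paths below are placeholders only:

```bash
# Hypothetical example - replace the model name and paths with your own locations.
python3 convert_legacy_model_format.py \
  --base_model meta-llama/Llama-2-7b-hf \
  --legacy_model_path ./llama-2-7b-spqr-legacy \
  --sparse_strategy csr \
  --save_pt ./llama-2-7b-spqr.pt
```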

## Hugging Face Conversion

To convert a model into a Hugging Face-compatible format, use the `convert_to_hf.py` script:

```bash
usage: convert_to_hf.py [-h] [--model MODEL] [--config_path CONFIG_PATH] [--in_path_pt IN_PATH_PT] [--out_path OUT_PATH] [--save_safetensors] [--trust_remote_code] [--load_model] [--save_tokenizer]

options:
-h, --help show this help message and exit
--model MODEL Path to the model to base config on, as in AutoConfig.from_pretrained()
--config_path CONFIG_PATH
Path to the model to base config on, as in AutoConfig.from_pretrained()
--in_path_pt IN_PATH_PT
Path of the checkpoint to convert
--out_path OUT_PATH Path to save HF compatible checkpoint to
--save_safetensors Whether to save in safetensors format
--trust_remote_code Whether to trust remote code
--load_model Whether to load model
--save_tokenizer Whether to save tokenizer
```
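
For example, a conversion run could be launched as follows; all paths here are placeholders:

```bash
# Hypothetical example - replace the model name and paths with your own checkpoints.
python3 convert_to_hf.py \
  --model meta-llama/Llama-2-7b-hf \
  --in_path_pt ./llama-2-7b-spqr.pt \
  --out_path ./llama-2-7b-spqr-hf \
  --save_safetensors \
  --save_tokenizer
```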

## Benchmarks (matvec kernel)

In order to run the matvec benchmark suite, one should run:

```bash
usage: bench_spqr.py [-h] --tensor_path TENSOR_PATH [--ptcsr_path PTCSR_PATH] [--output_path OUTPUT_PATH]

options:
-h, --help show this help message and exit
--tensor_path TENSOR_PATH
Path to folder containing the tensors, laid out as model_path/<layer_id>/<tensor_name>
--ptcsr_path PTCSR_PATH
Path to folder containing the PTCSR variant of the tensors, laid out in the same way
--output_path OUTPUT_PATH
Path to results *.csv file.

```

Make sure that `<tensor_path>` and the optional `<ptcsr_path>` point to folders containing quantized matrices produced by the `convert_legacy_model_format.py` script.
Use `<cuda_device_id>` to select the CUDA device used during the benchmark. The script writes its results to the `.csv` file specified by `--output_path`.
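
For example, assuming the per-layer matrices were exported via `convert_legacy_model_format.py --save_per_layer`, a benchmark run could look like this; the folder names are illustrative:

```bash
# Hypothetical example - point the paths at folders produced by convert_legacy_model_format.py.
python3 bench_spqr.py \
  --tensor_path ./llama-2-7b-spqr-layers \
  --ptcsr_path ./llama-2-7b-spqr-layers-ptcsr \
  --output_path ./matvec_bench.csv
```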

## Tests

In order to run the unit tests, simply execute:

```bash
python3 tests/test.py
```


## Citation
```
@misc{dettmers2023spqr,
@@ -143,4 +249,4 @@
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
```
3 changes: 3 additions & 0 deletions build.sh
@@ -0,0 +1,3 @@
#!/bin/bash
MAX_JOBS=16 python3 setup.py install

175 changes: 175 additions & 0 deletions convert_legacy_model_format.py
@@ -0,0 +1,175 @@
import argparse
import os

import torch
from transformers import AutoConfig, AutoModelForCausalLM

from spqr import ModelArgs, QuantizedLinear, SPQRLegacy, flatten_tensor


def load_legacy_tensor(p: str, model_args: ModelArgs) -> SPQRLegacy:
    """
    Load a legacy tensor given tensor path @p and model args @model_args.
    Background:
    spqr_engine.py produces tensors whose 3-bit weights are stored as int8.
    We refer to this storage scheme as legacy, since the 3-bit inference kernel
    only accepts the compressed storage format.
    @param p: Legacy tensor path.
    @param model_args: Model arguments - we obtain the beta1, beta2, bits and the sparse compression format from here.
    @return: SPQRLegacy object holding the unpacked legacy data, ready to be converted into the compressed format
    used by the efficient inference kernel.
    """
    bits = model_args.bits
    beta1 = model_args.beta1
    beta2 = model_args.beta2

    legacy_tensor = torch.load(p, map_location="cpu")

    W = legacy_tensor["quant_weights"]
    m = W.shape[0]
    n = W.shape[1]
    W = flatten_tensor(W)
    W_s = flatten_tensor(legacy_tensor["quant_layer_scale"])
    W_z = flatten_tensor(legacy_tensor["quant_layer_zeros"])

    perm = legacy_tensor["perm"]

    outliers_matrix = legacy_tensor["outliers_matrix"].to_sparse_csr()

    col_ids = outliers_matrix.col_indices().short()
    values = outliers_matrix.values().half()

    return SPQRLegacy(
        m=m,
        n=n,
        bits=bits,
        W=flatten_tensor(W),
        beta1=beta1,
        beta2=beta2,
        W_s=W_s,
        W_z=W_z,
        W_s_s=flatten_tensor(legacy_tensor["quant_layer_scale_qq_scale"]),
        W_s_z=flatten_tensor(legacy_tensor["quant_layer_scale_qq_zero"]),
        W_z_s=flatten_tensor(legacy_tensor["quant_layer_zero_qq_scale"]),
        W_z_z=flatten_tensor(legacy_tensor["quant_layer_zero_qq_zero"]),
        row_offsets=outliers_matrix.crow_indices().int(),
        col_ids=col_ids,
        values=values,
        in_perm=perm.long(),
    )


def replace_and_save_quantized_layers(
    model_args: ModelArgs,
    model_to_be_quantized,
    legacy_model_path,
    current_model=None,
    layer_id: int = -1,
    parent_tensor_name="",
    output_per_layer_path=None,
):
    """
    This function goes through the @model_to_be_quantized recursively and
    replaces all the dense layers with their quantized counterpart where
    applicable. The legacy quantized layers are stored in @legacy_model_path.

    As we go through the model, we construct the tensor name using layer_id and parent tensor name.
    We then use these values to check if the current dense tensor is a valid candidate for substitution
    with its quantized counterpart.

    @param model_args: Global model args.
    @param model_to_be_quantized: Model to be quantized.
    @param legacy_model_path: Location of the quantized tensors stored in the legacy format as output by SpQR.
    @param output_per_layer_path: Optionally, one may wish to store the compressed SpQR layers separately in a folder
    specified by this parameter (for example, this may or may not be useful during benchmarking or data analysis).
    @param layer_id: Internally used to keep track of the current layer as we descend the model.
    @param parent_tensor_name: Name of the previous layer in the recursion chain.
    """
    if current_model is None:
        current_model = model_to_be_quantized
    for tensor_name, m in current_model.named_children():
        if tensor_name.isnumeric():
            layer_id = int(tensor_name)
            if output_per_layer_path is not None:
                os.makedirs(os.path.join(output_per_layer_path, str(layer_id)), exist_ok=True)

        if isinstance(m, torch.nn.Linear):
            assert m.bias is None
            legacy_tensor_path = os.path.join(legacy_model_path, f"{layer_id}", f"{parent_tensor_name}.{tensor_name}")
            if os.path.exists(legacy_tensor_path):
                spqr_uncompressed = load_legacy_tensor(legacy_tensor_path, model_args)
                spqr_module = QuantizedLinear.from_legacy(spqr_uncompressed, model_args, "cpu")
                if output_per_layer_path is not None:
                    per_layer_tensor_path = os.path.join(
                        output_per_layer_path, f"{layer_id}", f"{parent_tensor_name}.{tensor_name}"
                    )
                    torch.save(spqr_module, per_layer_tensor_path)
                setattr(current_model, tensor_name, spqr_module)
        else:
            replace_and_save_quantized_layers(
                model_args, model_to_be_quantized, legacy_model_path, m, layer_id, tensor_name, output_per_layer_path
            )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(add_help=True)
    parser.add_argument(
        "--base_model",
        type=str,
        required=True,
        help="path or name of the unquantized model",
    )
    parser.add_argument(
        "--legacy_model_path",
        type=str,
        required=True,
        help="path to legacy model",
    )
    parser.add_argument(
        "--sparse_strategy",
        type=str,
        default="csr",
        choices=["csr", "ptcsr", "optimize_latency"],
        help="Sparse storage strategy. Options: csr, ptcsr, optimize_latency.\nCSR - Compressed Sparse Rows\nPTCSR - Alternative storage format\noptimize_latency - Use the current GPU to determine the optimal storage format to reduce kernel latency",
    )
    parser.add_argument("--save_pt", type=str, required=False, help="Save the converted quantized .pt model here")
    parser.add_argument(
        "--save_per_layer",
        type=str,
        required=False,
        help="Save the converted quantized model per layer here - useful for benchmarking individual layers",
    )

    args, leftovers = parser.parse_known_args()

    if args.save_per_layer is not None:
        os.makedirs(args.save_per_layer, exist_ok=True)

    layers = os.listdir(args.legacy_model_path)

    args_path = os.path.join(args.legacy_model_path, "args.pt")
    model_args = ModelArgs.from_file(args.legacy_model_path, args.sparse_strategy)

    config = AutoConfig.from_pretrained(args.base_model, return_dict=True)

    config.max_position_embeddings = 4096

    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=args.base_model, trust_remote_code=True, torch_dtype=torch.half, config=config
    )

    if args.save_per_layer is not None:
        not_quantized_weights_path = os.path.join(args.legacy_model_path, "not_quantized_weights.pt")
        not_quantized_weights = torch.load(not_quantized_weights_path)
        for w in not_quantized_weights.values():
            w.requires_grad = False
        model.load_state_dict(not_quantized_weights, strict=False)
        for f in ["args.pt", "not_quantized_weights.pt"]:
            os.system(f"cp {os.path.join(args.legacy_model_path, f)} {os.path.join(args.save_per_layer, f)}")

    replace_and_save_quantized_layers(
        model_args, model, args.legacy_model_path, output_per_layer_path=args.save_per_layer
    )

    if args.save_pt is not None:
        torch.save(model, args.save_pt)