fix format #96

Merged · 3 commits · Oct 28, 2024
1 change: 1 addition & 0 deletions README.md
@@ -33,6 +33,7 @@ VPTQ can compress 70B, even the 405B model, to 1-2 bits without retraining and m


## News
- [2024-10-28] ✨ VPTQ algorithm early-released on the [algorithm branch](https://github.com/microsoft/VPTQ/tree/algorithm); check out the [tutorial](https://github.com/microsoft/VPTQ/blob/algorithm/algorithm.md).
- [2024-10-22] 🌐 Open source community contributes [**Meta Llama 3.1 Nemotron 70B** models](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-nemotron-70b-instruct-hf-without-finetune-671730b96f16208d0b3fe942), check [how VPTQ counts 'r' on local GPU](documents/example_count_r.md). We are continuing to work on quantizing the 4-6 bit versions. Please stay tuned!
- [2024-10-21] 🌐 Open source community contributes [**Meta Llama 3.1 405B @ 3/4 bits** models](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-405b-instruct-without-finetune-66f4413f9ba55e1a9e52cfb0)
- [2024-10-18] 🌐 Open source community contributes [**Mistral Large Instruct 2407 (123B)** models](https://huggingface.co/collections/VPTQ-community/vptq-mistral-large-instruct-2407-without-finetune-6711ebfb7faf85eed9cceb16)
16 changes: 15 additions & 1 deletion format.sh
@@ -1,5 +1,19 @@
yapf --recursive . --style='{based_on_style: google, column_limit: 120, indent_width: 4}' -i
#!/bin/bash

# Format Python files using yapf
echo "Running yapf..."
find . -type f -name "*.py" \
! -path "./build/*" \
! -path "./.git/*" \
! -path "*.egg-info/*" \
-print0 | xargs -0 yapf --in-place

# Format Python imports using isort
echo "Running isort..."
isort .

# Format C++ files using clang-format
echo "Formatting C++ files..."
find csrc/ \( -name '*.h' -o -name '*.cc' -o -name '*.cu' -o -name '*.cuh' \) -print | xargs clang-format -i

echo "Formatting complete!"
9 changes: 8 additions & 1 deletion pyproject.toml
@@ -49,7 +49,8 @@ column_limit = 120
indent_width = 4
based_on_style = "google"
split_before_logical_operator = false

dedent_closing_brackets = true
coalesce_brackets = true

[tool.codespell]
ignore-words-list = "ist"
@@ -58,3 +59,9 @@ skip = "./VPTQ_arxiv.pdf,./build"
[tool.isort]
use_parentheses = true
skip_gitignore = true
line_length = 120
multi_line_output = 3
include_trailing_comma = true
force_grid_wrap = 0
combine_as_imports = true
ensure_newline_before_comments = true
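
For reference, a sketch of how the isort settings above behave; the long module and symbol names below are hypothetical, chosen only to force wrapping. With line_length = 120 and force_grid_wrap = 0, an import that fits on one line is left alone; only an over-long import is wrapped, and multi_line_output = 3 together with include_trailing_comma then produces the vertical-hanging-indent style:

```python
# Illustrative only; the second import uses hypothetical names.
# Fits within 120 characters, so isort keeps it on one line:
from vptq.layers.model_base import AutoModelForCausalLM as VQAutoModelQuantization

# Would exceed 120 characters on one line, so it is wrapped with a trailing comma:
from hypothetical_quantization_toolkit.extremely_long_submodule_name import (
    first_exported_symbol_with_a_descriptive_name,
    second_exported_symbol_with_a_descriptive_name,
    third_exported_symbol_with_a_descriptive_name,
)
```

The yapf additions (coalesce_brackets and dedent_closing_brackets) are what produce the call style visible in the setup.py and model_base.py hunks below, where an opening `([` is kept together and the closing `])` is dedented back to the indentation of the line that opened it.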
5 changes: 3 additions & 2 deletions setup.py
@@ -65,8 +65,9 @@ def build_cuda_extensions():
if torch.cuda.is_available() and torch.version.hip is not None:
extra_compile_args["nvcc"].extend(["-fbracket-depth=1024"])
else:
extra_compile_args["nvcc"].extend(
["--expt-relaxed-constexpr", "--expt-extended-lambda", "--use_fast_math", "-lineinfo"])
extra_compile_args["nvcc"].extend([
"--expt-relaxed-constexpr", "--expt-extended-lambda", "--use_fast_math", "-lineinfo"
])

extensions = CUDAExtension(
"vptq.ops",
2 changes: 1 addition & 1 deletion vptq/__init__.py
@@ -4,4 +4,4 @@
# --------------------------------------------------------------------------

__version__ = "0.0.2.post1"
from .layers import AutoModelForCausalLM as AutoModelForCausalLM
from vptq.layers import AutoModelForCausalLM as AutoModelForCausalLM
2 changes: 1 addition & 1 deletion vptq/__main__.py
@@ -3,6 +3,6 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------

from .app_utils import main
from vptq.app_utils import main

main()
13 changes: 6 additions & 7 deletions vptq/app.py
@@ -9,8 +9,7 @@
import gradio as gr
from huggingface_hub import snapshot_download

from vptq.app_gpu import disable_gpu_info, enable_gpu_info
from vptq.app_gpu import update_charts as _update_charts
from vptq.app_gpu import disable_gpu_info, enable_gpu_info, update_charts as _update_charts
from vptq.app_utils import get_chat_loop_generator

models = [
@@ -114,11 +113,11 @@ def respond(
response = ""

for message in chat_completion(
messages,
max_tokens=max_tokens,
stream=True,
temperature=temperature,
top_p=top_p,
messages,
max_tokens=max_tokens,
stream=True,
temperature=temperature,
top_p=top_p,
):
token = message

16 changes: 9 additions & 7 deletions vptq/app_gpu.py
@@ -82,13 +82,15 @@ def update_charts(chart_height: int = 200) -> go.Figure:
titlefont=dict(color='blue'),
tickfont=dict(color='blue'),
),
yaxis2=dict(title='Memory Usage (GiB)',
range=[0, max(24,
max(mem_usage_history) + 1)],
titlefont=dict(color='red'),
tickfont=dict(color='red'),
overlaying='y',
side='right'),
yaxis2=dict(
title='Memory Usage (GiB)',
range=[0, max(24,
max(mem_usage_history) + 1)],
titlefont=dict(color='red'),
tickfont=dict(color='red'),
overlaying='y',
side='right'
),
height=chart_height, # set the height of the chart
margin=dict(l=10, r=10, t=0, b=0), # set the margin of the chart
showlegend=False # disable the legend
41 changes: 19 additions & 22 deletions vptq/app_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import transformers

from .layers.model_base import AutoModelForCausalLM as VQAutoModelQuantization
from vptq.layers.model_base import AutoModelForCausalLM as VQAutoModelQuantization


def define_basic_args():
Expand All @@ -21,10 +21,9 @@ def define_basic_args():
""",
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument("--model",
type=str,
required=True,
help="float/float16 model to load, such as [mosaicml/mpt-7b]")
parser.add_argument(
"--model", type=str, required=True, help="float/float16 model to load, such as [mosaicml/mpt-7b]"
)
parser.add_argument("--tokenizer", type=str, default="", help="default same as [model]")
parser.add_argument("--prompt", type=str, default="once upon a time, there ", help="prompt to start generation")
parser.add_argument("--chat", action="store_true", help="chat with the model")
@@ -62,11 +61,9 @@ def chat_loop(model, tokenizer, args):
encodeds = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
model_inputs = encodeds.to(model.device)
print("assistant: ", end='')
generated_ids = model.generate(model_inputs,
streamer=streamer,
pad_token_id=2,
max_new_tokens=500,
do_sample=True)
generated_ids = model.generate(
model_inputs, streamer=streamer, pad_token_id=2, max_new_tokens=500, do_sample=True
)
decoded = tokenizer.batch_decode(generated_ids[:, model_inputs.shape[-1]:], skip_special_tokens=True)
messages.append({"role": "assistant", "content": decoded[0]})

@@ -83,11 +80,9 @@ def get_chat_loop_generator(model_id):
if getattr(tokenizer, "chat_template", None) is None:
raise Exception("warning: this tokenizer didn't provide chat_template.!!!")

def chat_loop_generator(messages,
max_tokens: int,
stream: bool = True,
temperature: float = 1.0,
top_p: float = 1.0):
def chat_loop_generator(
messages, max_tokens: int, stream: bool = True, temperature: float = 1.0, top_p: float = 1.0
):
print("============================chat with the model============================")
print("Press 'exit' to quit")

@@ -99,13 +94,15 @@ def chat_loop_generator(messages,
return_dict=True,
)
model_inputs = encodeds.to(model.device)
generation_kwargs = dict(model_inputs,
streamer=streamer,
max_new_tokens=max_tokens,
pad_token_id=2,
do_sample=True,
temperature=temperature,
top_p=top_p)
generation_kwargs = dict(
model_inputs,
streamer=streamer,
max_new_tokens=max_tokens,
pad_token_id=2,
do_sample=True,
temperature=temperature,
top_p=top_p
)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for new_text in streamer:
2 changes: 1 addition & 1 deletion vptq/layers/__init__.py
@@ -3,4 +3,4 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------

from .model_base import AutoModelForCausalLM as AutoModelForCausalLM
from vptq.layers.model_base import AutoModelForCausalLM as AutoModelForCausalLM
30 changes: 16 additions & 14 deletions vptq/layers/model_base.py
@@ -14,7 +14,7 @@
import transformers
from tqdm import tqdm

from .vqlinear import VQuantLinear
from vptq.layers.vqlinear import VQuantLinear


def set_op_by_name(layer, name, new_module):
@@ -32,9 +32,9 @@


def make_quant_linear(module, quant_conf, name="", target_layer=None):
for module_name, sub_module in tqdm(module.named_modules(),
total=len(list(module.named_modules())),
desc="Replacing linear layers..."):
for module_name, sub_module in tqdm(
module.named_modules(), total=len(list(module.named_modules())), desc="Replacing linear layers..."
):
if module_name in quant_conf:
layer_conf = quant_conf[module_name]
new_module = target_layer(**layer_conf, enable_proxy_error=False, dtype=sub_module.weight.dtype)
@@ -124,9 +124,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
checkpoint = pretrained_model_name_or_path
else: # remote
token_arg = {"token": kwargs.get("token", None)}
checkpoint = huggingface_hub.snapshot_download(repo_id=pretrained_model_name_or_path,
ignore_patterns=["*.bin"],
**token_arg)
checkpoint = huggingface_hub.snapshot_download(
repo_id=pretrained_model_name_or_path, ignore_patterns=["*.bin"], **token_arg
)
weight_bins = glob.glob(str(Path(checkpoint).absolute() / "*.safetensors"))
index_json = glob.glob(str(Path(checkpoint).absolute() / "*.index.json"))
pytorch_model_bin = glob.glob(str(Path(checkpoint).absolute() / "pytorch_model.bin"))
@@ -148,13 +148,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
max_memory = local_max_memory

accelerate.hooks.attach_execution_device_hook = attach_execution_device_hook
model = accelerate.load_checkpoint_and_dispatch(model,
checkpoint=checkpoint,
device_map=device_map,
max_memory=max_memory,
no_split_module_classes=no_split_module_classes[0],
dtype=torch_dtype,
preload_module_classes=["VQuantLinear"])
model = accelerate.load_checkpoint_and_dispatch(
model,
checkpoint=checkpoint,
device_map=device_map,
max_memory=max_memory,
no_split_module_classes=no_split_module_classes[0],
dtype=torch_dtype,
preload_module_classes=["VQuantLinear"]
)

# check cuda kernel exist
if importlib.util.find_spec("vptq.ops") is not None: