Skip to content

Commit

Permalink
Merge branch 'main' into fix/add-absolute-imports
Browse files Browse the repository at this point in the history
  • Loading branch information
YangWang92 authored Oct 28, 2024
2 parents f8686dd + 1c6b87e commit d02bc17
Show file tree
Hide file tree
Showing 10 changed files with 136 additions and 88 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ VPTQ can compress 70B, even the 405B model, to 1-2 bits without retraining and m


## News
- [2024-10-28] ✨ VPTQ algorithm early-released on the [algorithm branch](https://github.com/microsoft/VPTQ/tree/algorithm); check out the [tutorial](https://github.com/microsoft/VPTQ/blob/algorithm/algorithm.md).
- [2024-10-22] 🌐 Open source community contributes [**Meta Llama 3.1 Nemotron 70B** models](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-nemotron-70b-instruct-hf-without-finetune-671730b96f16208d0b3fe942), check [how VPTQ counts 'r' on local GPU](documents/example_count_r.md). We are continuing to work on quantizing the 4-6 bit versions. Please stay tuned!
- [2024-10-21] 🌐 Open source community contributes [**Meta Llama 3.1 405B @ 3/4 bits** models](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-405b-instruct-without-finetune-66f4413f9ba55e1a9e52cfb0)
- [2024-10-18] 🌐 Open source community contributes [**Mistral Large Instruct 2407 (123B)** models](https://huggingface.co/collections/VPTQ-community/vptq-mistral-large-instruct-2407-without-finetune-6711ebfb7faf85eed9cceb16)
Expand Down
16 changes: 15 additions & 1 deletion format.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
yapf --recursive . --style='{based_on_style: google, column_limit: 120, indent_width: 4}' -i
#!/bin/bash
# Format the project's Python and C++ sources in place.
# All paths are relative to the current directory; the script invokes
# yapf, isort and clang-format, so they must be available on PATH.

# Format Python files using yapf
# File names are passed NUL-delimited so names containing spaces are not split.
echo "Running yapf..."
find . -type f -name "*.py" \
    ! -path "./build/*" \
    ! -path "./.git/*" \
    ! -path "*.egg-info/*" \
    -print0 | xargs -0 yapf --in-place

# Format Python imports using isort
echo "Running isort..."
isort .

# Format C++ files using clang-format
# Uses -print0 / xargs -0 like the yapf step: plain -print | xargs would split
# any path containing whitespace or quotes into several bogus arguments.
echo "Formatting C++ files..."
find csrc/ \( -name '*.h' -o -name '*.cc' -o -name '*.cu' -o -name '*.cuh' \) -print0 | xargs -0 clang-format -i

echo "Formatting complete!"
9 changes: 8 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ column_limit = 120
indent_width = 4
based_on_style = "google"
split_before_logical_operator = false

dedent_closing_brackets = true
coalesce_brackets = true

[tool.codespell]
ignore-words-list = "ist"
Expand All @@ -58,3 +59,9 @@ skip = "./VPTQ_arxiv.pdf,./build"
[tool.isort]
use_parentheses = true
skip_gitignore = true
line_length = 120
multi_line_output = 3
include_trailing_comma = true
force_grid_wrap = 0
combine_as_imports = true
ensure_newline_before_comments = true
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,9 @@ def build_cuda_extensions():
if torch.cuda.is_available() and torch.version.hip is not None:
extra_compile_args["nvcc"].extend(["-fbracket-depth=1024"])
else:
extra_compile_args["nvcc"].extend(
["--expt-relaxed-constexpr", "--expt-extended-lambda", "--use_fast_math", "-lineinfo"])
extra_compile_args["nvcc"].extend([
"--expt-relaxed-constexpr", "--expt-extended-lambda", "--use_fast_math", "-lineinfo"
])

extensions = CUDAExtension(
"vptq.ops",
Expand Down
13 changes: 6 additions & 7 deletions vptq/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
import gradio as gr
from huggingface_hub import snapshot_download

from vptq.app_gpu import disable_gpu_info, enable_gpu_info
from vptq.app_gpu import update_charts as _update_charts
from vptq.app_gpu import disable_gpu_info, enable_gpu_info, update_charts as _update_charts
from vptq.app_utils import get_chat_loop_generator

models = [
Expand Down Expand Up @@ -114,11 +113,11 @@ def respond(
response = ""

for message in chat_completion(
messages,
max_tokens=max_tokens,
stream=True,
temperature=temperature,
top_p=top_p,
messages,
max_tokens=max_tokens,
stream=True,
temperature=temperature,
top_p=top_p,
):
token = message

Expand Down
16 changes: 9 additions & 7 deletions vptq/app_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,15 @@ def update_charts(chart_height: int = 200) -> go.Figure:
titlefont=dict(color='blue'),
tickfont=dict(color='blue'),
),
yaxis2=dict(title='Memory Usage (GiB)',
range=[0, max(24,
max(mem_usage_history) + 1)],
titlefont=dict(color='red'),
tickfont=dict(color='red'),
overlaying='y',
side='right'),
yaxis2=dict(
title='Memory Usage (GiB)',
range=[0, max(24,
max(mem_usage_history) + 1)],
titlefont=dict(color='red'),
tickfont=dict(color='red'),
overlaying='y',
side='right'
),
height=chart_height, # set the height of the chart
margin=dict(l=10, r=10, t=0, b=0), # set the margin of the chart
showlegend=False # disable the legend
Expand Down
43 changes: 20 additions & 23 deletions vptq/app_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@

import transformers

from vptq.layers.model_base import (
AutoModelForCausalLM as VQAutoModelQuantization)

from vptq.layers.model_base import AutoModelForCausalLM as VQAutoModelQuantization


def define_basic_args():
Expand All @@ -22,10 +22,9 @@ def define_basic_args():
""",
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument("--model",
type=str,
required=True,
help="float/float16 model to load, such as [mosaicml/mpt-7b]")
parser.add_argument(
"--model", type=str, required=True, help="float/float16 model to load, such as [mosaicml/mpt-7b]"
)
parser.add_argument("--tokenizer", type=str, default="", help="default same as [model]")
parser.add_argument("--prompt", type=str, default="once upon a time, there ", help="prompt to start generation")
parser.add_argument("--chat", action="store_true", help="chat with the model")
Expand Down Expand Up @@ -63,11 +62,9 @@ def chat_loop(model, tokenizer, args):
encodeds = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
model_inputs = encodeds.to(model.device)
print("assistant: ", end='')
generated_ids = model.generate(model_inputs,
streamer=streamer,
pad_token_id=2,
max_new_tokens=500,
do_sample=True)
generated_ids = model.generate(
model_inputs, streamer=streamer, pad_token_id=2, max_new_tokens=500, do_sample=True
)
decoded = tokenizer.batch_decode(generated_ids[:, model_inputs.shape[-1]:], skip_special_tokens=True)
messages.append({"role": "assistant", "content": decoded[0]})

Expand All @@ -84,11 +81,9 @@ def get_chat_loop_generator(model_id):
if getattr(tokenizer, "chat_template", None) is None:
raise Exception("warning: this tokenizer didn't provide chat_template.!!!")

def chat_loop_generator(messages,
max_tokens: int,
stream: bool = True,
temperature: float = 1.0,
top_p: float = 1.0):
def chat_loop_generator(
messages, max_tokens: int, stream: bool = True, temperature: float = 1.0, top_p: float = 1.0
):
print("============================chat with the model============================")
print("Press 'exit' to quit")

Expand All @@ -100,13 +95,15 @@ def chat_loop_generator(messages,
return_dict=True,
)
model_inputs = encodeds.to(model.device)
generation_kwargs = dict(model_inputs,
streamer=streamer,
max_new_tokens=max_tokens,
pad_token_id=2,
do_sample=True,
temperature=temperature,
top_p=top_p)
generation_kwargs = dict(
model_inputs,
streamer=streamer,
max_new_tokens=max_tokens,
pad_token_id=2,
do_sample=True,
temperature=temperature,
top_p=top_p
)
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for new_text in streamer:
Expand Down
2 changes: 1 addition & 1 deletion vptq/layers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------

from vptq.layers.model_base import AutoModelForCausalLM as AutoModelForCausalLM
from vptq.layers.model_base import AutoModelForCausalLM as AutoModelForCausalLM
29 changes: 16 additions & 13 deletions vptq/layers/model_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import transformers
from tqdm import tqdm


from vptq.layers.vqlinear import VQuantLinear


Expand All @@ -32,9 +33,9 @@ def set_op_by_name(layer, name, new_module):


def make_quant_linear(module, quant_conf, name="", target_layer=None):
for module_name, sub_module in tqdm(module.named_modules(),
total=len(list(module.named_modules())),
desc="Replacing linear layers..."):
for module_name, sub_module in tqdm(
module.named_modules(), total=len(list(module.named_modules())), desc="Replacing linear layers..."
):
if module_name in quant_conf:
layer_conf = quant_conf[module_name]
new_module = target_layer(**layer_conf, enable_proxy_error=False, dtype=sub_module.weight.dtype)
Expand Down Expand Up @@ -124,9 +125,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
checkpoint = pretrained_model_name_or_path
else: # remote
token_arg = {"token": kwargs.get("token", None)}
checkpoint = huggingface_hub.snapshot_download(repo_id=pretrained_model_name_or_path,
ignore_patterns=["*.bin"],
**token_arg)
checkpoint = huggingface_hub.snapshot_download(
repo_id=pretrained_model_name_or_path, ignore_patterns=["*.bin"], **token_arg
)
weight_bins = glob.glob(str(Path(checkpoint).absolute() / "*.safetensors"))
index_json = glob.glob(str(Path(checkpoint).absolute() / "*.index.json"))
pytorch_model_bin = glob.glob(str(Path(checkpoint).absolute() / "pytorch_model.bin"))
Expand All @@ -148,13 +149,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
max_memory = local_max_memory

accelerate.hooks.attach_execution_device_hook = attach_execution_device_hook
model = accelerate.load_checkpoint_and_dispatch(model,
checkpoint=checkpoint,
device_map=device_map,
max_memory=max_memory,
no_split_module_classes=no_split_module_classes[0],
dtype=torch_dtype,
preload_module_classes=["VQuantLinear"])
model = accelerate.load_checkpoint_and_dispatch(
model,
checkpoint=checkpoint,
device_map=device_map,
max_memory=max_memory,
no_split_module_classes=no_split_module_classes[0],
dtype=torch_dtype,
preload_module_classes=["VQuantLinear"]
)

# check cuda kernel exist
if importlib.util.find_spec("vptq.ops") is not None:
Expand Down
Loading

0 comments on commit d02bc17

Please sign in to comment.