
Commit d90e33b

Merge branch 'main' into openai_api_chat_correctness
2 parents: 76b8a5a + 147c292

File tree

7 files changed: +29, -75 lines

build/builder.py
build/model_aoti.py
export.py
generate.py
install_requirements.sh
quantization/quantize.py
torchchat/README.md

build/builder.py (+8, -1)

@@ -440,6 +440,7 @@ def _initialize_model(
     quantize,
     tokenizer=None,
     max_seq_length=None,
+    support_tensor_subclass: bool = True,
 ):
     print("Loading model...")

@@ -510,7 +511,13 @@ def _initialize_model(
     if quantize:
         print(f"Quantizing the model with: {quantize}")
         with measure_time("Time to quantize model: {time:.02f} seconds"):
-            quantize_model(model, builder_args.device, quantize, tokenizer)
+            quantize_model(
+                model,
+                builder_args.device,
+                quantize,
+                tokenizer,
+                support_tensor_subclass,
+            )
         device_sync(device=builder_args.device)

     if builder_args.setup_caches:
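
For orientation, here is a condensed, hypothetical view of the call chain this change creates: _initialize_model threads the new support_tensor_subclass flag straight through to quantize_model, and the eager/chat callers rely on the True default. Everything outside the two stubs below (checkpoint loading, the real argument types) is elided and assumed.

# Hypothetical, condensed stubs; not the real torchchat code.
def quantize_model(model, device, quantize_options,
                   tokenizer=None, support_tensor_subclass=True):
    action = "keep subclass weights" if support_tensor_subclass else "unwrap"
    print(f"quantizing on {device} with {quantize_options}: {action}")

def _initialize_model(device, quantize, tokenizer=None,
                      max_seq_length=None, support_tensor_subclass=True):
    model = object()  # checkpoint loading elided
    if quantize:
        quantize_model(model, device, quantize, tokenizer, support_tensor_subclass)
    return model

_initialize_model("cuda", {"linear:int4": {"groupsize": 128}})  # chat path, default True
_initialize_model("cuda", {"linear:int4": {"groupsize": 128}},
                  support_tensor_subclass=False)  # export path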

build/model_aoti.py (-65)

This file was deleted.

export.py (+2)

@@ -126,6 +126,7 @@ def main(args):
         quantize,
         tokenizer,
         max_seq_length=builder_args.max_seq_length,
+        support_tensor_subclass=output_dso_path is None,
     )
     model_to_pte = model
     model_to_dso = model

@@ -143,6 +144,7 @@ def main(args):
         model_to_dso = _initialize_model(
             builder_args,
             quantize,
+            support_tensor_subclass=False,
         )
         _unset_gguf_kwargs(builder_args)

generate.py (+5, -5)

@@ -199,9 +199,9 @@ def __init__(
         # global print
         # from tp import maybe_init_dist
         # rank = maybe_init_dist()
-        # use_tp = False
+        # use_distributed = False
         self.rank: Optional[int] = None
-        # if use_tp:
+        # if use_distributed:
         #     if rank != 0:
         #         # only print on rank 0
         #         print = lambda *args, **kwargs: None

@@ -655,7 +655,7 @@ def chat(
         )
         if generator_args.compile:
             if (
-                self.is_speculative and self.builder_args.use_tp
+                self.is_speculative and self.builder_args.use_distributed
             ):  # and ("cuda" in builder_args.device):
                 torch._inductor.config.triton.cudagraph_trees = (
                     False  # Bug with cudagraph trees in this case

@@ -783,7 +783,7 @@ def callback(x, *, done_generating=False):
            )

            if (i != generator_args.num_samples - 1 or not self.profile) or (
-                self.builder_args.use_tp and self.rank != 0
+                self.builder_args.use_distributed and self.rank != 0
            ):
                import contextlib

@@ -820,7 +820,7 @@ def callback(x, *, done_generating=False):
            )
            compilation_time = time.perf_counter() - t0
            if hasattr(prof, "export_chrome_trace"):
-                if self.builder_args.use_tp:
+                if self.builder_args.use_distributed:
                    prof.export_chrome_trace(f"{self.profile}_rank_{self.rank}.json")
                else:
                    prof.export_chrome_trace(f"{self.profile}.json")
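
The rename itself is mechanical (use_tp becomes use_distributed), but the profiling branch it gates is worth noting: in distributed runs each rank writes its own Chrome trace so the files do not clobber one another. A standalone sketch of that naming scheme, with prof, profile_path, and rank standing in for the attributes used above:

def export_trace(prof, profile_path, use_distributed, rank):
    # Tag each rank's trace in distributed runs; keep the plain file
    # name for single-process runs, mirroring the branch in generate.py.
    if use_distributed:
        prof.export_chrome_trace(f"{profile_path}_rank_{rank}.json")
    else:
        prof.export_chrome_trace(f"{profile_path}.json")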

install_requirements.sh (+2, -2)

@@ -47,7 +47,7 @@ fi
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-NIGHTLY_VERSION=dev20240728
+NIGHTLY_VERSION=dev20240814

 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same
 (

@@ -82,7 +82,7 @@ REQUIREMENTS_TO_INSTALL=(
 # TODO: Remove this and install nightly build, once it supports macos
 (
   set -x
-  $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@d477c0e59b458b5617dcb3e999290a87df3070d8
+  $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@e11201a62669f582d81cdb33e031a07fb8dfc4f3
 )
 if [[ -x "$(command -v nvidia-smi)" ]]; then
 (

quantization/quantize.py (+9, -2)

@@ -50,7 +50,13 @@
 ### torchchat quantization API ###


-def quantize_model(model: nn.Module, device, quantize_options, tokenizer=None):
+def quantize_model(
+    model: nn.Module,
+    device,
+    quantize_options,
+    tokenizer=None,
+    support_tensor_subclass: bool = True,
+):
     """
     Quantize the specified model using the quantizers described by
     a quantization dict of the form:

@@ -74,7 +80,8 @@ def quantize_model(model: nn.Module, device, quantize_options, tokenizer=None):
         # Use tensor subclass API for int4 weight only.
         if device == "cuda" and quantizer == "linear:int4":
             quantize_(model, int4_weight_only(q_kwargs["groupsize"]))
-            unwrap_tensor_subclass(model)
+            if not support_tensor_subclass:
+                unwrap_tensor_subclass(model)
             continue
         # Use dtype precision specified in user config, else fallback on global precision.
         if "precision" in quantize_options:

torchchat/README.md (+3)

@@ -0,0 +1,3 @@
+# Chat with LLMs Everywhere
+
+This directory is a WIP path that will host most of the files currently living in root
