
Commit d90e33b

Merge branch 'main' into openai_api_chat_correctness
2 parents: 76b8a5a + 147c292

File tree

7 files changed: +29, -75 lines

build/builder.py
build/model_aoti.py
export.py
generate.py
install_requirements.sh
quantization/quantize.py
torchchat/README.md

build/builder.py (+8, -1)

@@ -440,6 +440,7 @@ def _initialize_model(
     quantize,
     tokenizer=None,
     max_seq_length=None,
+    support_tensor_subclass: bool = True,
 ):
     print("Loading model...")

@@ -510,7 +511,13 @@ def _initialize_model(
     if quantize:
         print(f"Quantizing the model with: {quantize}")
         with measure_time("Time to quantize model: {time:.02f} seconds"):
-            quantize_model(model, builder_args.device, quantize, tokenizer)
+            quantize_model(
+                model,
+                builder_args.device,
+                quantize,
+                tokenizer,
+                support_tensor_subclass,
+            )
         device_sync(device=builder_args.device)

     if builder_args.setup_caches:
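
For orientation, here is a condensed, hypothetical view of the call chain this change creates: _initialize_model threads the new support_tensor_subclass flag straight through to quantize_model, and the eager/chat callers rely on the True default. Everything outside the two stubs below (checkpoint loading, the real argument types) is elided and assumed.

# Hypothetical, condensed stubs; not the real torchchat code.
def quantize_model(model, device, quantize_options,
                   tokenizer=None, support_tensor_subclass=True):
    action = "keep subclass weights" if support_tensor_subclass else "unwrap"
    print(f"quantizing on {device} with {quantize_options}: {action}")

def _initialize_model(device, quantize, tokenizer=None,
                      max_seq_length=None, support_tensor_subclass=True):
    model = object()  # checkpoint loading elided
    if quantize:
        quantize_model(model, device, quantize, tokenizer, support_tensor_subclass)
    return model

_initialize_model("cuda", {"linear:int4": {"groupsize": 128}})  # chat path, default True
_initialize_model("cuda", {"linear:int4": {"groupsize": 128}},
                  support_tensor_subclass=False)  # export path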

build/model_aoti.py (-65)

This file was deleted.

export.py (+2)

@@ -126,6 +126,7 @@ def main(args):
         quantize,
         tokenizer,
         max_seq_length=builder_args.max_seq_length,
+        support_tensor_subclass=output_dso_path is None,
     )
     model_to_pte = model
     model_to_dso = model

@@ -143,6 +144,7 @@ def main(args):
         model_to_dso = _initialize_model(
             builder_args,
             quantize,
+            support_tensor_subclass=False,
         )
         _unset_gguf_kwargs(builder_args)

generate.py (+5, -5)

@@ -199,9 +199,9 @@ def __init__(
         # global print
         # from tp import maybe_init_dist
         # rank = maybe_init_dist()
-        # use_tp = False
+        # use_distributed = False
         self.rank: Optional[int] = None
-        # if use_tp:
+        # if use_distributed:
         #     if rank != 0:
         #         # only print on rank 0
         #         print = lambda *args, **kwargs: None

@@ -655,7 +655,7 @@ def chat(
         )
         if generator_args.compile:
             if (
-                self.is_speculative and self.builder_args.use_tp
+                self.is_speculative and self.builder_args.use_distributed
             ):  # and ("cuda" in builder_args.device):
                 torch._inductor.config.triton.cudagraph_trees = (
                     False  # Bug with cudagraph trees in this case

@@ -783,7 +783,7 @@ def callback(x, *, done_generating=False):
            )

            if (i != generator_args.num_samples - 1 or not self.profile) or (
-                self.builder_args.use_tp and self.rank != 0
+                self.builder_args.use_distributed and self.rank != 0
            ):
                import contextlib

@@ -820,7 +820,7 @@ def callback(x, *, done_generating=False):
            )
            compilation_time = time.perf_counter() - t0
            if hasattr(prof, "export_chrome_trace"):
-                if self.builder_args.use_tp:
+                if self.builder_args.use_distributed:
                    prof.export_chrome_trace(f"{self.profile}_rank_{self.rank}.json")
                else:
                    prof.export_chrome_trace(f"{self.profile}.json")
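
The rename itself is mechanical (use_tp becomes use_distributed), but the profiling branch it gates is worth noting: in distributed runs each rank writes its own Chrome trace so the files do not clobber one another. A standalone sketch of that naming scheme, with prof, profile_path, and rank standing in for the attributes used above:

def export_trace(prof, profile_path, use_distributed, rank):
    # Tag each rank's trace in distributed runs; keep the plain file
    # name for single-process runs, mirroring the branch in generate.py.
    if use_distributed:
        prof.export_chrome_trace(f"{profile_path}_rank_{rank}.json")
    else:
        prof.export_chrome_trace(f"{profile_path}.json")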

install_requirements.sh (+2, -2)

@@ -47,7 +47,7 @@ fi
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-NIGHTLY_VERSION=dev20240728
+NIGHTLY_VERSION=dev20240814

 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same
 (

@@ -82,7 +82,7 @@ REQUIREMENTS_TO_INSTALL=(
 # TODO: Remove this and install nightly build, once it supports macos
 (
   set -x
-  $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@d477c0e59b458b5617dcb3e999290a87df3070d8
+  $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@e11201a62669f582d81cdb33e031a07fb8dfc4f3
 )
 if [[ -x "$(command -v nvidia-smi)" ]]; then
 (

quantization/quantize.py (+9, -2)

@@ -50,7 +50,13 @@
 ### torchchat quantization API ###


-def quantize_model(model: nn.Module, device, quantize_options, tokenizer=None):
+def quantize_model(
+    model: nn.Module,
+    device,
+    quantize_options,
+    tokenizer=None,
+    support_tensor_subclass: bool = True,
+):
     """
     Quantize the specified model using the quantizers described by
     a quantization dict of the form:

@@ -74,7 +80,8 @@ def quantize_model(model: nn.Module, device, quantize_options, tokenizer=None):
         # Use tensor subclass API for int4 weight only.
         if device == "cuda" and quantizer == "linear:int4":
             quantize_(model, int4_weight_only(q_kwargs["groupsize"]))
-            unwrap_tensor_subclass(model)
+            if not support_tensor_subclass:
+                unwrap_tensor_subclass(model)
             continue
         # Use dtype precision specified in user config, else fallback on global precision.
         if "precision" in quantize_options:

torchchat/README.md (+3)

@@ -0,0 +1,3 @@
+# Chat with LLMs Everywhere
+
+This directory is a WIP path that will host most of the files currently living in root
