pytorch · vayuda · Jun 18, 2024 · Jun 18, 2024
diff --git a/scripts/convert_hf_checkpoint.py b/scripts/convert_hf_checkpoint.py
@@ -8,17 +8,12 @@
 import json
 import re
 import shutil
-import sys
 from pathlib import Path
 from typing import Optional
 
 import torch
 
-# support running without installing as a package
-wd = Path(__file__).parent.parent.resolve()
-sys.path.append(str(wd))
-
-from model import ModelArgs
+from torchao._models.llama.model import ModelArgs
 
 
 @torch.inference_mode()

diff --git a/torchao/_models/llama/eval.py b/torchao/_models/llama/eval.py
@@ -13,7 +13,7 @@
 
 )
 from torchao.quantization.quant_api import (
-    quantize, int4wo, int8wo, int8da_int8w
+    quantize, int4wo, int8wo, int8da_int8w, unwrap_tensor_subclass
 
 )
 from torchao._models._eval import TransformerEvalWrapper, InputRecorder
@@ -70,7 +70,7 @@ def run_evaluation(
         if "int4wo" in quantization and "gptq" in quantization:
             groupsize=int(quantization.split("-")[-2])
             assert groupsize in [32,64,128,256], f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}"
-
+            assert precision==torch.bfloat16, f"{quantization} requires precision of bfloat16 but got {precision}"
             inputs = InputRecorder(
                 tokenizer,
                 calibration_seq_length,
@@ -83,9 +83,11 @@ def run_evaluation(
                 calibration_limit,
             ).get_inputs()
 
-            quantizer = Int4WeightOnlyGPTQQuantizer(groupsize=groupsize, precision=precision)
+            quantizer = Int4WeightOnlyGPTQQuantizer(groupsize=groupsize)
             model.setup_caches(max_batch_size=1, max_seq_length=calibration_seq_length)
             model = quantizer.quantize(model, inputs).to(device)
+        else:
+            unwrap_tensor_subclass(model)
 
     if compile:
         model = torch.compile(model, mode="max-autotune", fullgraph=True)

diff --git a/torchao/_models/llama/generate.py b/torchao/_models/llama/generate.py
@@ -189,20 +189,22 @@ def main(
 
     if quantization:
         from torchao.quantization.quant_api import (
-            change_linear_weights_to_int4_woqtensors,
-            change_linear_weights_to_int8_woqtensors,
-            change_linear_weights_to_int8_dqtensors,
+            quantize,
+            int8wo,
+            int8da_int8w,
+            int4wo,
             autoquant,
+            unwrap_tensor_subclass
     )
 
         if "int8wo" in quantization:
-            change_linear_weights_to_int8_woqtensors(model)
+            quantize(model, int8wo())
         if "int8dq" in quantization:
-            change_linear_weights_to_int8_dqtensors(model)
+            quantize(model, int8da_int8w())
         if "int4wo" in quantization:
             groupsize=int(quantization.split("-")[-1])
             assert groupsize in [32,64,128,256], f"int4wo groupsize needs to be one of [32,64,128,256] but got {groupsize}"
-            change_linear_weights_to_int4_woqtensors(model, groupsize=groupsize)
+            quantize(model, int4wo(groupsize=groupsize))
         if "autoquant" == quantization:
             model = autoquant(model)
             generate(
@@ -211,6 +213,9 @@ def main(
                 2,
                 interactive=False
             )
+        else:
+            unwrap_tensor_subclass(model)
+
 
     model_size = get_model_size_in_bytes(model, ignore_embeddings=True) / 1e9