diff --git a/neural_compressor/torch/algorithms/weight_only/autoround.py b/neural_compressor/torch/algorithms/weight_only/autoround.py
index 81f2ab6df98..7d8a0e069ea 100644
--- a/neural_compressor/torch/algorithms/weight_only/autoround.py
+++ b/neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
 import json
 import time
-import copy
 from typing import Union
 
 import torch
@@ -135,6 +135,7 @@ def pack_model(
 
         set_module(compressed_model, k, new_module)
     return compressed_model
 
+
 class InputCaptureModule(torch.nn.Module):
     def __init__(self, model) -> None:
@@ -154,6 +155,7 @@ def forward(self, *args, **kwargs):
             logger.error("Handle cases where input data is neither a Tensor nor a dict")
             return self.orig_model.forward(*args, **kwargs)
 
+
 class AutoRoundQuantizer(Quantizer):
     def __init__(
         self,
@@ -280,7 +282,7 @@ def prepare(self, model: torch.nn.Module, *args, **kwargs):
             data_type=self.data_type,
             scale_dtype=self.scale_dtype,
         )
-        
+
         self.rounder.prepare()
         prepare_model = InputCaptureModule(model)
         return prepare_model
@@ -384,7 +386,7 @@ def get_autoround_default_run_fn(
 
 
 class AutoRoundProcessor(AutoRound):
-    
+
     @torch.no_grad()
     def cache_inter_data(self, block_names, n_samples, layer_names=[], last_cache_name=None):
         """Save the inputs of block_name for calibration. For layers, we cache both of inputs and output.
@@ -431,8 +433,7 @@ def cache_inter_data(self, block_names, n_samples, layer_names=[], last_cache_na
 
             self.model = self.model.to(tmp_dtype)
         return res
-
-    
+
     @torch.no_grad()
     def prepare(self):
         """Prepares a given model for quantization."""
@@ -512,7 +513,6 @@ def convert(self):
 
             self.model = self.model.to("cpu")
         all_inputs = res
-
         del self.inputs
         inputs = all_inputs[self.block_names[0]]
diff --git a/test/3x/torch/quantization/weight_only/test_autoround.py b/test/3x/torch/quantization/weight_only/test_autoround.py
index 157e69b8fc9..d61e2a29000 100644
--- a/test/3x/torch/quantization/weight_only/test_autoround.py
+++ b/test/3x/torch/quantization/weight_only/test_autoround.py
@@ -63,7 +63,7 @@ def test_autoround(self, quant_lm_head):
 
         # prepare + convert API
         model = prepare(model=fp32_model, quant_config=quant_config)
-        
+
         run_fn(model, *run_args)
         q_model = convert(model)
         out = q_model(self.inp)[0]