GPTQ Activation Ordering #94

Merged on Aug 28, 2024 (64 commits)
Commits
012138a
actorder
horheynm Jul 2, 2024
f88c84e
g_idx fix
horheynm Jul 10, 2024
3211fe1
fix
horheynm Jul 10, 2024
bbbf564
lint
horheynm Jul 10, 2024
8d29f0d
propagagte g_idx with perm
horheynm Jul 11, 2024
89224e9
scratch
horheynm Jul 12, 2024
cb8446d
GPTQ - move calibration of quantiztion params to after hessian calibr…
Jul 18, 2024
d7029a0
no recompute
horheynm Jul 22, 2024
eeff533
clean up
horheynm Jul 22, 2024
842b150
remvoe unwanted code
horheynm Jul 22, 2024
240c39d
draft
horheynm Jul 27, 2024
820d08a
draft
horheynm Jul 31, 2024
564845e
draft
horheynm Aug 1, 2024
6f54737
mimic gptq
horheynm Aug 6, 2024
2cc99bb
permutation seems to be working
kylesayrs Aug 9, 2024
6fe537d
WIP: fails on non-square weights
kylesayrs Aug 9, 2024
6611073
pass perm into quant params calculation
kylesayrs Aug 9, 2024
9077969
works on vllm and loading with identity permutation
kylesayrs Aug 12, 2024
6a1565e
WIP: working pytorch with actorder
kylesayrs Aug 12, 2024
1940df4
able to inference with script and reload, needed to set
kylesayrs Aug 12, 2024
11beac1
remove testing comments
kylesayrs Aug 13, 2024
9456698
remove scripts
kylesayrs Aug 13, 2024
0c773e6
remove dregs
kylesayrs Aug 13, 2024
b6bebc2
merge actorder and group cases
kylesayrs Aug 13, 2024
3bde194
code structuring and cleanup
kylesayrs Aug 13, 2024
758c495
use `refresh_layer_weight_quant_params`
kylesayrs Aug 13, 2024
85fb1ff
update_layer_weight_quant_params reuse
kylesayrs Aug 14, 2024
5b52e9d
deep copy H to allow for future reuse
kylesayrs Aug 14, 2024
9e2cef9
hoist group_size
kylesayrs Aug 16, 2024
e725cc7
remove footer note
kylesayrs Aug 16, 2024
2392b83
apply style
kylesayrs Aug 16, 2024
a5a30e1
fix rebase dreggs
kylesayrs Aug 16, 2024
ca6fc6e
remove extra line
kylesayrs Aug 16, 2024
6f99634
move lines for better grouping
kylesayrs Aug 16, 2024
b726bd6
move for better diff
kylesayrs Aug 16, 2024
2002761
remove extra lines
kylesayrs Aug 16, 2024
0ef0c5b
use getattr to avoid pr dep
kylesayrs Aug 17, 2024
476aed0
Revert "use getattr to avoid pr dep"
kylesayrs Aug 17, 2024
ffb809c
add actorder to docstring
kylesayrs Aug 21, 2024
edc02d4
Merge remote-tracking branch 'origin' into kylesayrs/activation-ordering
kylesayrs Aug 22, 2024
bc49946
do not clone hessian
kylesayrs Aug 22, 2024
99f2286
apply style
kylesayrs Aug 22, 2024
48b36c2
avoid unset g_idx parameter by observing directly
kylesayrs Aug 22, 2024
9550f14
use update_layer_weight_quant_params
kylesayrs Aug 22, 2024
d22ff2e
Merge remote-tracking branch 'origin/main' into kylesayrs/activation-…
kylesayrs Aug 23, 2024
72d919f
Merge branch 'main' into kylesayrs/activation-ordering
kylesayrs Aug 25, 2024
e4d37a6
indent for when quantization_scheme is missing
kylesayrs Aug 25, 2024
cdc8bcd
add actorder e2e test
kylesayrs Aug 25, 2024
1fe188b
do not freeze if initialized from gptq
kylesayrs Aug 27, 2024
b06a103
add get_attr_chain helper function
kylesayrs Aug 27, 2024
f293efd
cleanup and clarify logic
kylesayrs Aug 27, 2024
a99e0da
apply style
kylesayrs Aug 27, 2024
bf915d4
rename to getattr_chain, handle no default case
kylesayrs Aug 27, 2024
66ef96b
out of place type conversion
kylesayrs Aug 27, 2024
98aaf88
Merge remote-tracking branch 'origin/gptq-cleanup' into kylesayrs/act…
kylesayrs Aug 27, 2024
91c877a
account for extra case
kylesayrs Aug 27, 2024
b711e14
remove freeze_quantization argument
kylesayrs Aug 28, 2024
974dbc7
remove fake_quantization case, update debug message
kylesayrs Aug 28, 2024
094e429
remove todo
kylesayrs Aug 28, 2024
582c179
Merge remote-tracking branch 'origin/gptq-cleanup' into kylesayrs/act…
kylesayrs Aug 28, 2024
febb741
correct name
kylesayrs Aug 28, 2024
83a1d93
Merge remote-tracking branch 'origin/main' into kylesayrs/activation-…
kylesayrs Aug 28, 2024
a1646e5
Merge remote-tracking branch 'origin/main' into kylesayrs/activation-…
kylesayrs Aug 28, 2024
eef6bab
change to false in docstring
kylesayrs Aug 28, 2024
1 change: 1 addition & 0 deletions src/llmcompressor/modifiers/quantization/gptq/base.py
@@ -59,6 +59,7 @@ class GPTQModifier(Modifier):
     |                symmetric: true
     |                strategy: "tensor"
     |                group_size: 128
+    |                actorder: False


     :param sequential_update: Whether or not to update weights sequentially by layer,
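The `actorder: False` entry added to the docstring example corresponds to activation ordering: when enabled, weight columns are quantized in decreasing order of their Hessian diagonal, so the columns that contribute most to the layer output are quantized first and accumulate the least rounding error. A minimal sketch of the ordering itself, independent of the GPTQModifier API (the helper name is illustrative, not from this PR):

import torch

def actorder_permutation(H: torch.Tensor) -> torch.Tensor:
    # Columns with the largest Hessian diagonal carry the most activation
    # energy and are quantized first
    return torch.argsort(torch.diag(H), descending=True)

H = torch.diag(torch.tensor([0.1, 3.0, 0.5, 2.0]))
perm = actorder_permutation(H)       # tensor([1, 3, 2, 0])
invperm = torch.argsort(perm)        # used later to undo the permutation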
@@ -122,11 +122,23 @@ def compress(
 
         tick = time.time()
 
-        # update quantization parameters for activation ordering
-        observer = MemorylessObserver(weight_quant_args)
-        scale, zero_point = observer(W)
-        update_parameter_data(self.layer, scale, "weight_scale")
-        update_parameter_data(self.layer, zero_point, "weight_zero_point")
+        # consider activation ordering
+        if weight_quant_args.actorder:
+            # use hessian to create a permutation of weights
+            perm = torch.argsort(torch.diag(self.H), descending=True)
+
+            # permute weight and hessian
+            W = W[:, perm]
+            self.H = self.H[perm][:, perm]
+
+            # update quantization parameters for activation ordering
+            observer = MemorylessObserver(weight_quant_args)
+            _scale, _zero_point = observer(W)
+            update_parameter_data(self.layer, _scale, "weight_scale")
+            update_parameter_data(self.layer, _zero_point, "weight_zero_point")
+
+        scale = self.layer.weight_scale
+        zero_point = self.layer.weight_zero_point
 
         # mask dead hessian values
         dead = torch.diag(self.H) == 0
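When `actorder` is enabled, the scale and zero point are recomputed from the already permuted weight so that each quantization group covers columns that are adjacent in the new ordering. A rough sketch of per-group symmetric scale computation under that assumption (the MemorylessObserver internals are not reproduced; the helper, shapes, and bit width below are illustrative):

import torch

def group_scales(W: torch.Tensor, group_size: int, num_bits: int = 4) -> torch.Tensor:
    # Symmetric per-group scales: one scale per (row, group) pair
    qmax = 2 ** (num_bits - 1) - 1
    groups = W.reshape(W.shape[0], -1, group_size)   # [rows, n_groups, group_size]
    return groups.abs().amax(dim=-1) / qmax          # [rows, n_groups]

W = torch.randn(8, 16)
perm = torch.argsort(torch.rand(16), descending=True)   # stand-in for the Hessian ordering
scales = group_scales(W[:, perm], group_size=4)          # groups follow the permuted order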
@@ -135,6 +147,7 @@
 
         Losses = torch.zeros(self.rows, device=self.dev)
 
+        # compute inverse hessian in place to save memory
         damp = percdamp * torch.mean(torch.diag(self.H))
         diag = torch.arange(self.columns, device=self.dev)
         self.H[diag, diag] += damp
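The added comment marks the standard GPTQ pattern: the Hessian diagonal is damped by a fraction of its mean before inversion, and the inverse is then consumed column by column during quantization. A rough sketch of that pattern under the same assumptions (the percdamp value and matrix are illustrative; the exact in-place steps in this file are not shown in the hunk above):

import torch

percdamp = 0.01
H = torch.diag(torch.tensor([0.1, 3.0, 0.5, 2.0]))
columns = H.shape[0]

damp = percdamp * torch.mean(torch.diag(H))
diag = torch.arange(columns)
H[diag, diag] += damp                             # stabilize an ill-conditioned Hessian

Hinv = torch.cholesky_inverse(torch.linalg.cholesky(H))
Hinv = torch.linalg.cholesky(Hinv, upper=True)    # upper factor used during column updates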
@@ -224,12 +237,26 @@
         if "METRIC" in logger._core.levels.keys():
             self.log_metrics(tick, Losses)
 
+        if weight_quant_args.actorder:
+            # restore original permutation
+            invperm = torch.argsort(perm)
+            W = W[:, invperm]
+
+            # g_idx describes the group index of the permuted weight
+            g_idx = torch.tensor(
+                [i // weight_quant_args.group_size for i in range(self.columns)],
+                dtype=torch.int,
+            ).to(device=invperm.device)
+
+            # invert to get the group index of the unpermuted weight
+            update_parameter_data(self.layer, g_idx[invperm], "weight_g_idx")
+
         if isinstance(self.layer, transformers.Conv1D):
             W.transpose_(0, 1)
         W = W.reshape(final_shape).to(final_dtype)
 
-        # This is a bit hacky, but FSDP updates only work if we change the weight in
-        # place, clone() or direct assignment won't work
+        # This is a bit hacky, but FSDP updates only work if we change
+        # the weight in place, clone() or direct assignment won't work
         self.layer.weight -= self.layer.weight
         self.layer.weight += W
 
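The g_idx bookkeeping is the subtle part of this hunk: g_idx is first built for the permuted column order (contiguous groups of group_size columns), then indexed with invperm so that the stored weight_g_idx gives, for each original column, the group it was quantized into. A small worked example with an 8-column weight and group_size 2 (the permutation is made up for illustration):

import torch

columns, group_size = 8, 2
perm = torch.tensor([3, 0, 7, 1, 6, 2, 5, 4])   # example activation ordering
invperm = torch.argsort(perm)

# group index of each column in the *permuted* order
g_idx = torch.arange(columns, dtype=torch.int) // group_size   # [0, 0, 1, 1, 2, 2, 3, 3]

# group index of each column in the *original* order, as stored in "weight_g_idx"
g_idx_original = g_idx[invperm]

# original column 3 was moved to permuted position 0, so it belongs to group 0
assert g_idx_original[3] == 0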
@@ -0,0 +1,5 @@
cadence: "nightly"
test_type: "regression"
model_stub: "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
new_recipe: "tests/llmcompressor/transformers/compression/recipes/new_quant_actorder.yaml"
ppl_threshold: 20
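For reference, ppl_threshold: 20 means the nightly regression fails if the quantized model's perplexity rises above 20. A minimal sketch of such a gate, assuming the harness already produced an average negative log-likelihood per token (the helper name is hypothetical):

import math

def assert_ppl_within_threshold(nll_per_token: float, ppl_threshold: float = 20.0) -> None:
    # Perplexity is the exponential of the average negative log-likelihood per token
    ppl = math.exp(nll_per_token)
    assert ppl <= ppl_threshold, f"perplexity {ppl:.2f} exceeds threshold {ppl_threshold}"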
@@ -0,0 +1,19 @@
test_stage:
  quant_modifiers:
    QuantizationModifier:
      ignore: ["lm_head", "model.layers.0.mlp.down_proj"]
      config_groups:
        group_0:
          weights:
            num_bits: 4
            type: "int"
            symmetric: False
            strategy: "group"
            group_size: 128
            actorder: True
          input_activations: null
          output_activations: null
          targets: ["Linear"]
    GPTQModifier:
      block_size: 128
      sequential_update: False
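The recipe above can be applied like any other llm-compressor recipe. A hedged sketch of a oneshot run using it; the entrypoint and argument names follow the repository's examples around the time of this PR and may differ in later releases, and the dataset and sample counts below are illustrative rather than taken from this PR:

from llmcompressor.transformers import oneshot

# Compress TinyLlama with the activation-ordering recipe added in this PR
oneshot(
    model="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
    dataset="open_platypus",
    recipe="tests/llmcompressor/transformers/compression/recipes/new_quant_actorder.yaml",
    output_dir="TinyLlama-1.1B-W4A16-actorder",
    max_seq_length=512,
    num_calibration_samples=512,
)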