From 263737359414f110a750446f056586dc4fb76cfa Mon Sep 17 00:00:00 2001
From: mgoin
Date: Wed, 14 Aug 2024 20:07:35 +0000
Subject: [PATCH 1/2] Move kv cache scales from k/v_proj.output_scale to
 self_attn.k/v_scale

---
 .../compressors/model_compressor.py | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/compressed_tensors/compressors/model_compressor.py b/src/compressed_tensors/compressors/model_compressor.py
index b8bf0001..d6365d47 100644
--- a/src/compressed_tensors/compressors/model_compressor.py
+++ b/src/compressed_tensors/compressors/model_compressor.py
@@ -240,6 +240,26 @@ def compress(
             compressed_state_dict
         )
 
+        # HACK: Post-process step for kv cache scales to take the k/v_proj
+        # module `output_scale` parameters, and store them in the parent
+        # attention module as `k_scale` and `v_scale`
+        #
+        # Example:
+        # Replace `model.layers.0.self_attn.k_proj.output_scale`
+        # with `model.layers.0.self_attn.k_scale`
+        if self.quantization_config.kv_cache_scheme is not None:
+            working_state_dict = {}
+            for key in compressed_state_dict.keys():
+                if key.endswith(".k_proj.output_scale"):
+                    new_key = key.replace(".k_proj.output_scale", ".k_scale")
+                    working_state_dict[new_key] = compressed_state_dict[key]
+                elif key.endswith(".v_proj.output_scale"):
+                    new_key = key.replace(".v_proj.output_scale", ".v_scale")
+                    working_state_dict[new_key] = compressed_state_dict[key]
+                else:
+                    working_state_dict[key] = compressed_state_dict[key]
+            compressed_state_dict = working_state_dict
+
         # HACK: Override the dtype_byte_size function in transformers to
         # support float8 types. Fix is posted upstream
         # https://github.com/huggingface/transformers/pull/30488

From e391ddb1641efc27fbabeeb4e8d2a10ae913ae04 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Thu, 15 Aug 2024 18:08:40 +0000
Subject: [PATCH 2/2] Add better checking that we hit our special case

---
 .../compressors/model_compressor.py | 36 +++++++++++++++++--
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/src/compressed_tensors/compressors/model_compressor.py b/src/compressed_tensors/compressors/model_compressor.py
index d6365d47..9807cbec 100644
--- a/src/compressed_tensors/compressors/model_compressor.py
+++ b/src/compressed_tensors/compressors/model_compressor.py
@@ -240,14 +240,44 @@ def compress(
             compressed_state_dict
         )
 
-        # HACK: Post-process step for kv cache scales to take the k/v_proj
-        # module `output_scale` parameters, and store them in the parent
-        # attention module as `k_scale` and `v_scale`
+        # HACK (mgoin): Post-process step for kv cache scales to take the
+        # k/v_proj module `output_scale` parameters, and store them in the
+        # parent attention module as `k_scale` and `v_scale`
         #
         # Example:
         # Replace `model.layers.0.self_attn.k_proj.output_scale`
         # with `model.layers.0.self_attn.k_scale`
         if self.quantization_config.kv_cache_scheme is not None:
+            # HACK (mgoin): We assume the quantized modules in question
+            # will be k_proj and v_proj since those are the default targets.
+            # We check that both of these modules have output activation
+            # quantization, and additionally check that q_proj doesn't.
+            q_proj_has_no_quant_output = 0
+            k_proj_has_quant_output = 0
+            v_proj_has_quant_output = 0
+            for name, module in model.named_modules():
+                if not hasattr(module, "quantization_scheme"):
+                    continue
+                out_act = module.quantization_scheme.output_activations
+                if name.endswith(".q_proj") and out_act is None:
+                    q_proj_has_no_quant_output += 1
+                elif name.endswith(".k_proj") and out_act is not None:
+                    k_proj_has_quant_output += 1
+                elif name.endswith(".v_proj") and out_act is not None:
+                    v_proj_has_quant_output += 1
+
+            assert (
+                q_proj_has_no_quant_output > 0
+                and k_proj_has_quant_output > 0
+                and v_proj_has_quant_output > 0
+            )
+            assert (
+                q_proj_has_no_quant_output
+                == k_proj_has_quant_output
+                == v_proj_has_quant_output
+            )
+
+            # Move all .k/v_proj.output_scale parameters to .k/v_scale
             working_state_dict = {}
             for key in compressed_state_dict.keys():
                 if key.endswith(".k_proj.output_scale"):
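Note (not part of the patch): the post-process step in PATCH 1 amounts to a pure key
rename over the compressed state dict. A minimal standalone sketch of that rename is
shown below; the keys and scalar values are hypothetical placeholders for illustration,
not the compressed-tensors API or real tensors.

# Illustrative sketch only: hypothetical keys and placeholder float values.
compressed_state_dict = {
    "model.layers.0.self_attn.k_proj.weight": 0.0,
    "model.layers.0.self_attn.k_proj.output_scale": 0.02,
    "model.layers.0.self_attn.v_proj.output_scale": 0.03,
}

working_state_dict = {}
for key, value in compressed_state_dict.items():
    if key.endswith(".k_proj.output_scale"):
        # "model.layers.0.self_attn.k_proj.output_scale" becomes
        # "model.layers.0.self_attn.k_scale"
        working_state_dict[key.replace(".k_proj.output_scale", ".k_scale")] = value
    elif key.endswith(".v_proj.output_scale"):
        working_state_dict[key.replace(".v_proj.output_scale", ".v_scale")] = value
    else:
        # All other entries are carried over unchanged.
        working_state_dict[key] = value

assert "model.layers.0.self_attn.k_scale" in working_state_dict
assert "model.layers.0.self_attn.v_scale" in working_state_dict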