From 263737359414f110a750446f056586dc4fb76cfa Mon Sep 17 00:00:00 2001
From: mgoin
Date: Wed, 14 Aug 2024 20:07:35 +0000
Subject: [PATCH 1/2] Move kv cache scales from k/v_proj.output_scale to
 self_attn.k/v_scale

---
 .../compressors/model_compressor.py | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/compressed_tensors/compressors/model_compressor.py b/src/compressed_tensors/compressors/model_compressor.py
index b8bf0001..d6365d47 100644
--- a/src/compressed_tensors/compressors/model_compressor.py
+++ b/src/compressed_tensors/compressors/model_compressor.py
@@ -240,6 +240,26 @@ def compress(
             compressed_state_dict
         )
 
+        # HACK: Post-process step for kv cache scales to take the k/v_proj
+        # module `output_scale` parameters, and store them in the parent
+        # attention module as `k_scale` and `v_scale`
+        #
+        # Example:
+        # Replace `model.layers.0.self_attn.k_proj.output_scale`
+        # with `model.layers.0.self_attn.k_scale`
+        if self.quantization_config.kv_cache_scheme is not None:
+            working_state_dict = {}
+            for key in compressed_state_dict.keys():
+                if key.endswith(".k_proj.output_scale"):
+                    new_key = key.replace(".k_proj.output_scale", ".k_scale")
+                    working_state_dict[new_key] = compressed_state_dict[key]
+                elif key.endswith(".v_proj.output_scale"):
+                    new_key = key.replace(".v_proj.output_scale", ".v_scale")
+                    working_state_dict[new_key] = compressed_state_dict[key]
+                else:
+                    working_state_dict[key] = compressed_state_dict[key]
+            compressed_state_dict = working_state_dict
+
         # HACK: Override the dtype_byte_size function in transformers to
         # support float8 types. Fix is posted upstream
         # https://github.com/huggingface/transformers/pull/30488

From e391ddb1641efc27fbabeeb4e8d2a10ae913ae04 Mon Sep 17 00:00:00 2001
From: mgoin
Date: Thu, 15 Aug 2024 18:08:40 +0000
Subject: [PATCH 2/2] Add better checking that we hit our special case

---
 .../compressors/model_compressor.py | 36 +++++++++++++++++--
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/src/compressed_tensors/compressors/model_compressor.py b/src/compressed_tensors/compressors/model_compressor.py
index d6365d47..9807cbec 100644
--- a/src/compressed_tensors/compressors/model_compressor.py
+++ b/src/compressed_tensors/compressors/model_compressor.py
@@ -240,14 +240,44 @@ def compress(
             compressed_state_dict
         )
 
-        # HACK: Post-process step for kv cache scales to take the k/v_proj
-        # module `output_scale` parameters, and store them in the parent
-        # attention module as `k_scale` and `v_scale`
+        # HACK (mgoin): Post-process step for kv cache scales to take the
+        # k/v_proj module `output_scale` parameters, and store them in the
+        # parent attention module as `k_scale` and `v_scale`
         #
         # Example:
         # Replace `model.layers.0.self_attn.k_proj.output_scale`
         # with `model.layers.0.self_attn.k_scale`
         if self.quantization_config.kv_cache_scheme is not None:
+            # HACK (mgoin): We assume the quantized modules in question
+            # will be k_proj and v_proj since those are the default targets.
+            # We check that both of these modules have output activation
+            # quantization, and additionally check that q_proj doesn't.
+            q_proj_has_no_quant_output = 0
+            k_proj_has_quant_output = 0
+            v_proj_has_quant_output = 0
+            for name, module in model.named_modules():
+                if not hasattr(module, "quantization_scheme"):
+                    continue
+                out_act = module.quantization_scheme.output_activations
+                if name.endswith(".q_proj") and out_act is None:
+                    q_proj_has_no_quant_output += 1
+                elif name.endswith(".k_proj") and out_act is not None:
+                    k_proj_has_quant_output += 1
+                elif name.endswith(".v_proj") and out_act is not None:
+                    v_proj_has_quant_output += 1
+
+            assert (
+                q_proj_has_no_quant_output > 0
+                and k_proj_has_quant_output > 0
+                and v_proj_has_quant_output > 0
+            )
+            assert (
+                q_proj_has_no_quant_output
+                == k_proj_has_quant_output
+                == v_proj_has_quant_output
+            )
+
+            # Move all .k/v_proj.output_scale parameters to .k/v_scale
             working_state_dict = {}
             for key in compressed_state_dict.keys():
                 if key.endswith(".k_proj.output_scale"):
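Note (not part of the patch): the post-process step in PATCH 1 amounts to a pure key
rename over the compressed state dict. A minimal standalone sketch of that rename is
shown below; the keys and scalar values are hypothetical placeholders for illustration,
not the compressed-tensors API or real tensors.

# Illustrative sketch only: hypothetical keys and placeholder float values.
compressed_state_dict = {
    "model.layers.0.self_attn.k_proj.weight": 0.0,
    "model.layers.0.self_attn.k_proj.output_scale": 0.02,
    "model.layers.0.self_attn.v_proj.output_scale": 0.03,
}

working_state_dict = {}
for key, value in compressed_state_dict.items():
    if key.endswith(".k_proj.output_scale"):
        # "model.layers.0.self_attn.k_proj.output_scale" becomes
        # "model.layers.0.self_attn.k_scale"
        working_state_dict[key.replace(".k_proj.output_scale", ".k_scale")] = value
    elif key.endswith(".v_proj.output_scale"):
        working_state_dict[key.replace(".v_proj.output_scale", ".v_scale")] = value
    else:
        # All other entries are carried over unchanged.
        working_state_dict[key] = value

assert "model.layers.0.self_attn.k_scale" in working_state_dict
assert "model.layers.0.self_attn.v_scale" in working_state_dict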