From dbc07a006604e9418876c537a00a90274fc59e02 Mon Sep 17 00:00:00 2001
From: Jerry Zhang
Date: Tue, 30 Sep 2025 17:34:51 -0700
Subject: [PATCH 1/3] Add regex support for ModuleFqnToConfig

Summary:
Similar to https://github.com/pytorch/ao/pull/3084, we added regex support in
transformers so people can use regex patterns to select which modules to quantize.
See https://github.com/pytorch/ao/pull/3084 for docs and the precedence of the
different configurations.

Uploaded model: https://huggingface.co/torchao-testing/opt-125m-ModuleFqnToConfig-v1-regex-0.14.0.dev

Test Plan:
pytest tests/quantization/torchao_integration/test_torchao.py -k test_module_fqn_to_config_regex

Reviewers:

Subscribers:

Tasks:

Tags:
---
 .../quantizers/quantizer_torchao.py      | 13 ++-
 .../torchao_integration/test_torchao.py  | 85 +++++++++++++++++++
 2 files changed, 97 insertions(+), 1 deletion(-)

diff --git a/src/transformers/quantizers/quantizer_torchao.py b/src/transformers/quantizers/quantizer_torchao.py
index 344c9e3534ed..bb8b6673ac01 100644
--- a/src/transformers/quantizers/quantizer_torchao.py
+++ b/src/transformers/quantizers/quantizer_torchao.py
@@ -297,6 +297,8 @@ def create_quantized_param(
 
         # handle ModuleFqnToConfig, introduced in torchao 0.12.0+
         if self.quantization_config._get_ao_version() >= version.Version("0.12.0"):
+            import re
+
             from torchao.quantization import ModuleFqnToConfig
 
             config = self.quantization_config.get_apply_tensor_subclass()
@@ -306,7 +308,16 @@ def create_quantized_param(
             if module_fqn in config.module_fqn_to_config:
                 c = config.module_fqn_to_config[module_fqn]
             else:
-                c = config.module_fqn_to_config.get("_default", None)
+                for maybe_module_fqn_pattern in config.module_fqn_to_config:
+                    if not maybe_module_fqn_pattern.startswith("re:"):
+                        continue
+                    elif re.fullmatch(maybe_module_fqn_pattern[3:], module_fqn):
+                        # we'll apply the config for the first fully matched pattern
+                        c = config.module_fqn_to_config[maybe_module_fqn_pattern]
+                        break
+                else:
+                    c = config.module_fqn_to_config.get("_default", None)
+
             if c is not None:
                 # filter_fn: not filtering out any modules
                 quantize_(module, c, filter_fn=lambda x, fqn: True)
diff --git a/tests/quantization/torchao_integration/test_torchao.py b/tests/quantization/torchao_integration/test_torchao.py
index 1ddc2de0801f..0d1ace7ede2a 100644
--- a/tests/quantization/torchao_integration/test_torchao.py
+++ b/tests/quantization/torchao_integration/test_torchao.py
@@ -46,6 +46,8 @@
         TensorCoreTiledLayout,
     )
     from torchao.quantization import (
+        Float8Tensor,
+        Float8WeightOnlyConfig,
         Int8WeightOnlyConfig,
         IntxWeightOnlyConfig,
         MappingType,
@@ -278,6 +280,89 @@ def test_per_module_config_skip(self):
         self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)
 
 
+    @require_torchao_version_greater_or_equal("0.13.0")
+    def test_module_fqn_to_config_regex_basic(self):
+        linear_config = Int8WeightOnlyConfig()
+        config = ModuleFqnToConfig({"_default": linear_config, r"re:model\.layers\..+\.self_attn\.q_proj": None})
+        quant_config = TorchAoConfig(quant_type=config)
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            device_map=self.device,
+            quantization_config=quant_config,
+        )
+        # making sure `model.layers.0.self_attn.q_proj` is skipped
+        self.assertTrue(not isinstance(quantized_model.model.layers[0].self_attn.q_proj.weight, AffineQuantizedTensor))
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
+        EXPECTED_OUTPUT = [
+            "What are we having for dinner?\n\nJessica: (smiling)",
+            "What are we having for dinner?\n\nJess: (smiling) I",
+        ]
+        self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)
+
+    @require_torchao_version_greater_or_equal("0.13.0")
+    def test_module_fqn_to_config_regex_fullmatch(self):
+        """Testing that we will only match the fqns that fully
+        match the regex
+        """
+        linear1_config = Int8WeightOnlyConfig()
+        linear2_config = Float8WeightOnlyConfig()
+        # intentionally removing the `j` from `q_proj` so it's not a full match
+        config = ModuleFqnToConfig({r"re:model\.layers\.+\.self_attn\.q_pro": linear1_config, "model.layers.3.self_attn.q_proj": linear2_config})
+        quant_config = TorchAoConfig(quant_type=config)
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            device_map=self.device,
+            quantization_config=quant_config,
+        )
+        # highest precedence is the fully specified module fqn
+        self.assertTrue(isinstance(quantized_model.model.layers[3].self_attn.q_proj.weight, Float8Tensor))
+        # because the regex `model\.layers\.+\.self_attn\.q_pro` didn't fully match `model.layers.1.self_attn.q_proj` (missing the last `j`),
+        # this layer is not expected to be quantized to int8
+        self.assertTrue(not isinstance(quantized_model.model.layers[1].self_attn.q_proj.weight, AffineQuantizedTensor))
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
+        EXPECTED_OUTPUT = [
+            "What are we having for dinner?\n\nJessica: (smiling)",
+            "What are we having for dinner?\n\nJess: (smiling) I",
+        ]
+        self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)
+
+    @require_torchao_version_greater_or_equal("0.13.0")
+    def test_module_fqn_to_config_regex_precedence(self):
+        linear1_config = Int8WeightOnlyConfig()
+        linear2_config = Float8WeightOnlyConfig()
+        config = ModuleFqnToConfig({r"re:model\.layers\..+\.self_attn\.q_proj": None, "model.layers.3.self_attn.q_proj": linear2_config, "_default": linear1_config})
+        quant_config = TorchAoConfig(quant_type=config)
+        quantized_model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            device_map=self.device,
+            quantization_config=quant_config,
+        )
+        # highest precedence is the fully specified module fqn
+        self.assertTrue(isinstance(quantized_model.model.layers[3].self_attn.q_proj.weight, Float8Tensor))
+        # second precedence: regex
+        self.assertTrue(not isinstance(quantized_model.model.layers[1].self_attn.q_proj.weight, AffineQuantizedTensor))
+        # last precedence: _default
+        self.assertTrue(isinstance(quantized_model.model.layers[1].self_attn.k_proj.weight, AffineQuantizedTensor))
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+        input_ids = tokenizer(self.input_text, return_tensors="pt").to(self.device)
+
+        output = quantized_model.generate(**input_ids, max_new_tokens=self.max_new_tokens)
+        EXPECTED_OUTPUT = [
+            "What are we having for dinner?\n\nJessica: (smiling)",
+            "What are we having for dinner?\n\nJess: (smiling) I",
+        ]
+        self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)
+
+
 @require_torch_accelerator
 class TorchAoAcceleratorTest(TorchAoTest):
     device = torch_device

From d3027663ed8aab92862ab769d460942a3dcf39bc Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Tue, 7 Oct 2025 16:13:55 +0000
Subject: [PATCH 2/3] Apply style fixes
---
 .../torchao_integration/test_torchao.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/tests/quantization/torchao_integration/test_torchao.py b/tests/quantization/torchao_integration/test_torchao.py
index 0d1ace7ede2a..896e999d7666 100644
--- a/tests/quantization/torchao_integration/test_torchao.py
+++ b/tests/quantization/torchao_integration/test_torchao.py
@@ -279,7 +279,6 @@ def test_per_module_config_skip(self):
         ]
         self.assertTrue(tokenizer.decode(output[0], skip_special_tokens=True) in EXPECTED_OUTPUT)
 
-
     @require_torchao_version_greater_or_equal("0.13.0")
     def test_module_fqn_to_config_regex_basic(self):
         linear_config = Int8WeightOnlyConfig()
@@ -311,7 +310,12 @@ def test_module_fqn_to_config_regex_fullmatch(self):
         linear1_config = Int8WeightOnlyConfig()
         linear2_config = Float8WeightOnlyConfig()
         # intentionally removing the `j` from `q_proj` so it's not a full match
-        config = ModuleFqnToConfig({r"re:model\.layers\.+\.self_attn\.q_pro": linear1_config, "model.layers.3.self_attn.q_proj": linear2_config})
+        config = ModuleFqnToConfig(
+            {
+                r"re:model\.layers\.+\.self_attn\.q_pro": linear1_config,
+                "model.layers.3.self_attn.q_proj": linear2_config,
+            }
+        )
         quant_config = TorchAoConfig(quant_type=config)
         quantized_model = AutoModelForCausalLM.from_pretrained(
             self.model_name,
@@ -338,7 +342,13 @@ def test_module_fqn_to_config_regex_precedence(self):
         linear1_config = Int8WeightOnlyConfig()
         linear2_config = Float8WeightOnlyConfig()
-        config = ModuleFqnToConfig({r"re:model\.layers\..+\.self_attn\.q_proj": None, "model.layers.3.self_attn.q_proj": linear2_config, "_default": linear1_config})
+        config = ModuleFqnToConfig(
+            {
+                r"re:model\.layers\..+\.self_attn\.q_proj": None,
+                "model.layers.3.self_attn.q_proj": linear2_config,
+                "_default": linear1_config,
+            }
+        )
         quant_config = TorchAoConfig(quant_type=config)
         quantized_model = AutoModelForCausalLM.from_pretrained(
             self.model_name,

From 8111e5b7f9841bdfbebb52c2409e73a7dd1d4e36 Mon Sep 17 00:00:00 2001
From: Jerry Zhang
Date: Tue, 7 Oct 2025 14:36:45 -0700
Subject: [PATCH 3/3] add assert for

---
 docs/source/en/quantization/torchao.md   | 115 +++++++++++++++++-
 .../quantizers/quantizer_torchao.py      |   5 +-
 2 files changed, 117 insertions(+), 3 deletions(-)

diff --git a/docs/source/en/quantization/torchao.md b/docs/source/en/quantization/torchao.md
index 8778f9f3e5ea..dae7123999d6 100644
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@@ -445,7 +445,7 @@ output_text = tokenizer.batch_decode(
 print(output_text)
 ```
 
-#### 2. Quantizing different layers with different quantization configs
+#### 2. Quantizing different layers with different quantization configs (no regex)
 
 ```py
 import torch
@@ -484,6 +484,119 @@ output_text = tokenizer.batch_decode(
 print(output_text)
 ```
 
+#### 3. Quantizing different layers with different quantization configs (with regex)
+We can also use a regex to specify the config for all modules whose `module_fqn` matches
+the pattern. All regex keys should start with `re:`; for example, `re:layers\..*\.gate_proj`
+matches all layers like `layers.0.gate_proj`. See [here](https://github.com/pytorch/ao/blob/2fe0ca0899c730c528efdbec8886feaa38879f39/torchao/quantization/quant_api.py#L2392) for the docs.
+
+```py
+import logging
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
+
+# Configure logging to see warnings and debug information
+logging.basicConfig(
+    level=logging.INFO, format="%(name)s - %(levelname)s - %(message)s"
+)
+
+# Enable specific loggers that might contain the serialization warnings
+logging.getLogger("transformers").setLevel(logging.INFO)
+logging.getLogger("torchao").setLevel(logging.INFO)
+logging.getLogger("safetensors").setLevel(logging.INFO)
+logging.getLogger("huggingface_hub").setLevel(logging.INFO)
+
+model_id = "facebook/opt-125m"
+
+from torchao.quantization import (
+    Float8DynamicActivationFloat8WeightConfig,
+    Int4WeightOnlyConfig,
+    IntxWeightOnlyConfig,
+    PerRow,
+    PerAxis,
+    ModuleFqnToConfig,
+    Float8Tensor,
+    Int4TilePackedTo4dTensor,
+    IntxUnpackedToInt8Tensor,
+)
+
+float8dyn = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+int4wo = Int4WeightOnlyConfig(int4_packing_format="tile_packed_to_4d")
+intxwo = IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0))
+
+qconfig_dict = {
+    # highest priority
+    "model.decoder.layers.3.self_attn.q_proj": int4wo,
+    "model.decoder.layers.3.self_attn.k_proj": int4wo,
+    "model.decoder.layers.3.self_attn.v_proj": int4wo,
+    # vllm
+    "model.decoder.layers.3.self_attn.qkv_proj": int4wo,
+
+    r"re:model\.decoder\.layers\..+\.self_attn\.q_proj": float8dyn,
+    r"re:model\.decoder\.layers\..+\.self_attn\.k_proj": float8dyn,
+    r"re:model\.decoder\.layers\..+\.self_attn\.v_proj": float8dyn,
+    # this should not take effect and we'll fall back to _default
+    # since there is no full match (missing `j` at the end)
+    r"re:model\.decoder\.layers\..+\.self_attn\.out_pro": float8dyn,
+    # vllm
+    r"re:model\.decoder\.layers\..+\.self_attn\.qkv_proj": float8dyn,
+
+    "_default": intxwo,
+}
+quant_config = ModuleFqnToConfig(qconfig_dict)
+quantization_config = TorchAoConfig(quant_type=quant_config)
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    quantization_config=quantization_config,
+)
+print("quantized model:", quantized_model)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+for i in range(12):
+    if i == 3:
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.q_proj.weight, Int4TilePackedTo4dTensor)
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.k_proj.weight, Int4TilePackedTo4dTensor)
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.v_proj.weight, Int4TilePackedTo4dTensor)
+    else:
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.q_proj.weight, Float8Tensor)
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.k_proj.weight, Float8Tensor)
+        assert isinstance(quantized_model.model.decoder.layers[i].self_attn.v_proj.weight, Float8Tensor)
+    assert isinstance(quantized_model.model.decoder.layers[i].self_attn.out_proj.weight, IntxUnpackedToInt8Tensor)
+
+# Manual Testing
+prompt = "What are we having for dinner?"
+print("Prompt:", prompt)
+inputs = tokenizer(
+    prompt,
+    return_tensors="pt",
+).to("cuda")
+# setting temperature to 0 to make sure the result is deterministic
+generated_ids = quantized_model.generate(**inputs, max_new_tokens=128, temperature=0)
+
+correct_output_text = tokenizer.batch_decode(
+    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print("Response:", correct_output_text[0][len(prompt) :])
+
+
+# Load model from the saved checkpoint (assumes the quantized model was saved to `save_to` earlier with `save_pretrained`)
+reloaded_model = AutoModelForCausalLM.from_pretrained(
+    save_to,
+    device_map="cuda:0",
+    torch_dtype=torch.bfloat16,
+    # quantization_config=quantization_config,
+)
+
+generated_ids = reloaded_model.generate(**inputs, max_new_tokens=128, temperature=0)
+output_text = tokenizer.batch_decode(
+    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)
+print("Response:", output_text[0][len(prompt) :])
+
+assert correct_output_text == output_text
+```
+
 ### Autoquant
 
 If you want to automatically choose a quantization type for quantizable layers (`nn.Linear`) you can use the [autoquant](https://pytorch.org/ao/stable/generated/torchao.quantization.autoquant.html#torchao.quantization.autoquant) API.
diff --git a/src/transformers/quantizers/quantizer_torchao.py b/src/transformers/quantizers/quantizer_torchao.py
index bb8b6673ac01..3e18e98f5686 100644
--- a/src/transformers/quantizers/quantizer_torchao.py
+++ b/src/transformers/quantizers/quantizer_torchao.py
@@ -297,8 +297,6 @@ def create_quantized_param(
 
         # handle ModuleFqnToConfig, introduced in torchao 0.12.0+
         if self.quantization_config._get_ao_version() >= version.Version("0.12.0"):
-            import re
-
             from torchao.quantization import ModuleFqnToConfig
 
             config = self.quantization_config.get_apply_tensor_subclass()
@@ -306,6 +304,9 @@ def create_quantized_param(
             module_fqn, _ = param_name.rsplit(".", 1)
             c = None
             if module_fqn in config.module_fqn_to_config:
+                assert not module_fqn.startswith("re:"), (
+                    "module fqn should not start with `re:`, which is used for specifying a regex"
+                )
                 c = config.module_fqn_to_config[module_fqn]
             else:
                 for maybe_module_fqn_pattern in config.module_fqn_to_config:
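
Note (illustration only, not part of the patches above): the lookup order these patches implement in `create_quantized_param` is exact module fqn first, then the first `re:` pattern that fully matches, then `_default`. Below is a minimal standalone sketch of that precedence using plain strings in place of torchao config objects; the `resolve_config` helper and the example fqns are hypothetical, not API from torchao or transformers.

```py
import re
from typing import Optional


def resolve_config(module_fqn: str, module_fqn_to_config: dict) -> Optional[str]:
    # 1) an exact (fully specified) module fqn has the highest precedence
    if module_fqn in module_fqn_to_config:
        return module_fqn_to_config[module_fqn]
    # 2) otherwise, the first `re:` pattern that fully matches the fqn wins
    for key in module_fqn_to_config:
        if key.startswith("re:") and re.fullmatch(key[3:], module_fqn):
            return module_fqn_to_config[key]
    # 3) finally, fall back to `_default` (None means the module is skipped)
    return module_fqn_to_config.get("_default", None)


# illustrative configs as plain strings instead of torchao config objects
cfg = {
    "model.layers.3.self_attn.q_proj": "float8",
    r"re:model\.layers\..+\.self_attn\.q_proj": None,
    "_default": "int8",
}
assert resolve_config("model.layers.3.self_attn.q_proj", cfg) == "float8"  # exact match
assert resolve_config("model.layers.1.self_attn.q_proj", cfg) is None      # regex match -> skipped
assert resolve_config("model.layers.1.self_attn.k_proj", cfg) == "int8"    # _default
```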