From cff31a41732f3001452609d588291c876d5ddd36 Mon Sep 17 00:00:00 2001
From: "Kristjan Greenewald" <Kristjan.H.Greenewald@ibm.com>
Date: Wed, 18 Jun 2025 16:31:59 +0000
Subject: [PATCH 01/99] add alora dir

---
 src/peft/tuners/alora/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 src/peft/tuners/alora/__init__.py

diff --git a/src/peft/tuners/alora/__init__.py b/src/peft/tuners/alora/__init__.py
new file mode 100644
index 0000000000..e69de29bb2

From beb92e4591b830e65b5e2ae3115165ce8a361a7f Mon Sep 17 00:00:00 2001
From: Greenewald
Date: Wed, 18 Jun 2025 12:33:46 -0400
Subject: [PATCH 02/99] Add files via upload

---
 src/peft/tuners/alora/config.py           |  675 +++++++
 src/peft/tuners/alora/layer.py            | 1274 ++++++++++++
 src/peft/tuners/alora/model.py            |  954 +++++++++
 src/peft/tuners/alora/peft_model_alora.py | 2165 +++++++++++++++++++++
 4 files changed, 5068 insertions(+)
 create mode 100644 src/peft/tuners/alora/config.py
 create mode 100644 src/peft/tuners/alora/layer.py
 create mode 100644 src/peft/tuners/alora/model.py
 create mode 100644 src/peft/tuners/alora/peft_model_alora.py

diff --git a/src/peft/tuners/alora/config.py b/src/peft/tuners/alora/config.py
new file mode 100644
index 0000000000..ab0cff4337
--- /dev/null
+++ b/src/peft/tuners/alora/config.py
@@ -0,0 +1,675 @@
+from __future__ import annotations
+
+import inspect
+import warnings
+from dataclasses import dataclass, field
+from typing import Literal, Optional, Union
+
+from torch import nn
+
+from peft import LoraConfig
+from peft.config import PeftConfig
+from peft.utils import PeftType
+
+
+def _check_and_remove_unused_kwargs(cls, kwargs):
+    """Make PEFT configs forward-compatible by removing unused kwargs that were added in later PEFT versions.
+
+    This assumes that removing the unused kwargs will not affect the default behavior.
+
+    Returns the filtered kwargs and the set of removed keys.
+    """
+    signature_parameters = inspect.signature(cls.__init__).parameters
+    unexpected_kwargs = set(kwargs.keys()) - set(signature_parameters.keys())
+    for key in unexpected_kwargs:
+        del kwargs[key]
+    return kwargs, unexpected_kwargs
+
+
+@dataclass
+class aLoraConfig(LoraConfig):
+    """
+    This is the configuration class to store the configuration of an [`aLoraModel`].
+
+    It subclasses PEFT's [`LoraConfig`], changes the default rank `r` to 32 (often a good choice for aLoRA), and adds
+    one additional parameter:
+        r (`int`): aLoRA attention dimension (the "rank"). Typically needs to be higher than for standard LoRA.
+            Defaults to 32.
+        invocation_string (`str`): String intended to activate the aLoRA. The aLoRA adapted weights activate
+            1 token after the first token of this string. This string must be present in all input data.
+    """
+    r: int = field(default=32, metadata={"help": "aLoRA attention dimension. Typically needs to be higher than for standard LoRA. Defaults to 32."})
+    invocation_string: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "aLoRA invocation string. The aLoRA adapted weights will activate 1 token after the first token in "
+                "this string. This string must be present in all input data."
+ ) + } + ) + + def __post_init__(self): #, *args, invocation_string=None, r=32, **kwargs): + # Call the parent's __post_init__ to initialize all the fields + super().__post_init__() #*args, r=r, **kwargs) + # Validate the additional field + + if self.invocation_string is None: + warnings.warn("invocation_string cannot be None", UserWarning) + + + + + + + + def from_peft_type(**kwargs): + r""" + This method loads the configuration of your adapter model from a set of kwargs. + + The appropriate configuration type is determined by the `peft_type` argument. If `peft_type` is not provided, + the calling class type is instantiated. + + Args: + kwargs (configuration keyword arguments): + Keyword arguments passed along to the configuration initialization. + """ + # Avoid circular dependency .. TODO: fix this with a larger refactor + #from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING + + # TODO: this hack is needed to fix the following issue (on commit 702f937): + # if someone saves a default config and loads it back with `PeftConfig` class it yields to + # not loading the correct config class. + # + # from peft import AdaLoraConfig, PeftConfig + # peft_config = AdaLoraConfig() + # print(peft_config) + # >>> AdaLoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, + # revision=None, task_type=None, inference_mode=False, r=8, target_modules=None, lora_alpha=8, lora_dropout=0.0, ... + # + # peft_config.save_pretrained("./test_config") + # peft_config = PeftConfig.from_pretrained("./test_config") + # print(peft_config) + # >>> PeftConfig(peft_type='ADALORA', auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=None, inference_mode=False) + + #if "peft_type" in kwargs: + # peft_type = kwargs["peft_type"] + config_cls = aLoraConfig #PEFT_TYPE_TO_CONFIG_MAPPING[peft_type] + #else: + + try: + config = config_cls(**kwargs) + except TypeError as exc: + # Here we potentially handle forward compatibility. Sometimes new keywords are added to configs, which makes + # new configs incompatible with older PEFT versions. We catch these and remove them to allow the program to + # continue, but warn the user about it. + + # First check if the error is due to unexpected keyword arguments, we don't want to accidentally catch + # other TypeErrors. + if "got an unexpected keyword argument" not in str(exc): + raise exc + + filtered_kwargs, unexpected_kwargs = _check_and_remove_unused_kwargs(config_cls, kwargs) + MIN_EXPECTED_CONFIG_KEYS = {"peft_type"} + if not MIN_EXPECTED_CONFIG_KEYS.issubset(set(filtered_kwargs.keys())): + raise TypeError( + f"The {cls.__name__} config that is trying to be loaded is missing required keys: " + f"{MIN_EXPECTED_CONFIG_KEYS}." + ) + + warnings.warn( + f"Unexpected keyword arguments {sorted(unexpected_kwargs)} for class {config_cls.__name__}, these are " + "ignored. This probably means that you're loading a configuration file that was saved using a " + "higher version of the library and additional parameters have been introduced since. It is " + "highly recommended to upgrade the PEFT version before continuing (e.g. by running `pip install " + "-U peft`)." 
+ ) + config = config_cls.from_peft_type(**filtered_kwargs) + return config + + + +# from __future__ import annotations + +# import warnings +# from dataclasses import dataclass, field +# from typing import Literal, Optional, Union + +# from torch import nn +# import enum +# from peft.config import PeftConfig +# from peft.utils import PeftType +# #class PeftType(str, enum.Enum): +# # ALORA = "ALORA" + + + +# from peft import LoraConfig + +# @dataclass +# class aLoraConfig(LoraConfig): +# """ +# This is the configuration class to store the configuration of an [`aLoraModel`]. + +# It subclasses PEFT's LoraConfig, modifies the default rank r to 32 (often best), and adds an additional parameter: +# r (`int`): aLora attention dimension (the "rank"). Typically needs to be higher than used for standard Lora. Default=32. +# invocation_string (str): String intended to activate the aLoRA. The aLoRA adapted weights will activate +# 1 token after the first token in this string. This string must be present in all input data. +# """ +# r: int = field(default=32, metadata={"help": "aLora attention dimension. Typically needs to be higher than used for standard Lora. Default=32."}) +# invocation_string: str = field( +# default=None, +# metadata={ +# "help": ( +# "aLoRA invocation string. The aLoRA adapted weights will activate 1 token after the first token in " +# "this string. This string must be present in all input data." +# ) +# } +# ) + +# def __post_init__(self, *args, invocation_string=None, r=32, **kwargs): +# # Call the parent's __post_init__ to initialize all the fields +# super().__post_init__(*args, r=r, **kwargs) +# # Validate the additional field +# self.invocation_string = invocation_string +# if self.invocation_string is None: +# warnings.warn("invocation_string cannot be None", UserWarning) + + + + +# @dataclass +# class LoraRuntimeConfig: +# """ +# This is the sub-configuration class to store the runtime configurations for the model. + +# Args: +# ephemeral_gpu_offload (`bool`): +# Whether to use ephemeral GPU offloading for models partially kept in CPU memory. +# """ + +# ephemeral_gpu_offload: bool = field( +# default=False, +# metadata={ +# "help": ( +# "Whether to use ephemeral GPU offloading for models partially kept in CPU memory. Ephemeral GPU offloading result in " +# "the data involved in intense operations being momentarily copied over to the GPU, and the results copied " +# "back to CPU. There is a momentary VRAM overhead, but operations are generally orders of magnitude faster " +# "compared to performing them on the CPU. This is useful when parts of the model and/or components (such " +# "as adapters) are kept in CPU memory until they are needed. Rather than perform expensive operations on " +# "small data, the data is transferred to the GPU on-demand, the operation(s) performed, and the results " +# "moved back to CPU memory. Currently only affects DoRA initialization." +# ) +# }, +# ) + + +# @dataclass +# class LoftQConfig: +# """ +# This is the sub-configuration class to store the configuration of a [`LoraModel`]. + +# Args: +# bits_pattern (`dict`): The mapping from layer names or regexp expression to bits which are different from the +# default bits specified by `bits`. For example, `{model.decoder.layers.0.encoder_attn.k_proj: 2`}. +# bits (`int`): Quantization bits for LoftQ. +# iter (`int`): Alternating iterations for LoftQ. +# fake (`bool`): True: use fp16/fp32; used for first time to save weights. False: use bitsandbytes 4bit linear +# models. 
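
# --- Illustrative sketch (not from the patch): constructing an aLoraConfig. ---
# Assumptions: the module is importable as `peft.tuners.alora.config`, the base model uses
# attention projections named "q_proj"/"k_proj"/"v_proj", and the invocation marker below is
# a placeholder -- substitute whatever string actually appears in every one of your prompts.
from peft.tuners.alora.config import aLoraConfig

example_config = aLoraConfig(
    r=32,                                           # aLoRA default; typically higher than standard LoRA
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj"],  # aLoRA targets the attention weights (q, k, v)
    invocation_string="<|assistant|>",              # adapter activates 1 token after this string's first token
)
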
weights can't be saved. Recommend to set to True, save the weights and load the saved weights in 4 +# bits. +# """ + +# loftq_bits: int = field(default=4, metadata={"help": "Quantization bits for LoftQ"}) +# loftq_iter: int = field(default=1, metadata={"help": "Alternating iterations for LoftQ"}) + + +# @dataclass +# class EvaConfig: +# """ +# This is the sub-configuration class to store the configuration for a data-driven initialization via EVA. EVA was +# introduced in Explained Variance Adaptation. + +# Args: +# rho (`float`): +# Rho value for EVA redistribution (>= 1.0). The maximum rank for a layer is lora_r * rho. Default is 2.0, +# meaning the maximum rank allowed for a layer is 2r. Increasing rho will allow for a higher degree of +# redistribution of ranks across layers. Some pre-trained models might be more sensitive to a rank +# redistribution. It can therefore be beneficial to try rho=1.0 (no redistribution) if the performance is +# lower than expected. +# tau (`float`): +# Cosine similarity threshold for early stopping. Compares the cosine similarity of right-singular vectors +# between two consecutive SVD steps. If the cosine similarity is above this threshold, the SVD iteration is +# stopped. Default is 0.99. +# use_label_mask (`bool`): +# Use label mask for EVA initialization. This means that positions where labels=label_mask_value are ignored +# for the SVD computation. Setting use_label_mask=True is preferred in most cases and can be especially +# beneficial for multi-turn conversations. The default value is True. Filtering out items based on the label +# mask can sometimes lead to a small batch size and as a result instabilities in the SVD computation. For +# cases where a large share of batch items would be filtered out, set use_label_mask=False. +# label_mask_value (`int`): +# If use_label_mask=True the value to look for to mask out ignored tokens. Default is -100. +# whiten (`bool`): Apply whitening to singular vectors. Default is False. +# Whitening has been shown to be beneficial for EVA in the vision domain. +# adjust_scaling_factors (`bool`): +# Adjust LoRA scaling factors after the rank redistribution. Setting this to True means the scaling factors +# are adjusted so that all LoRA gradients have the same scale regardless of their rank. Default is True. +# """ + +# rho: float = field(default=2.0, metadata={"help": "Rho value for EVA redistribution"}) +# tau: float = field(default=0.99, metadata={"help": "Cosine similarity threshold for early stopping"}) +# use_label_mask: bool = field(default=True, metadata={"help": "Use label mask for EVA initialization"}) +# label_mask_value: int = field( +# default=-100, metadata={"help": "if use_label_mask=True the value to look for to mask out ignored tokens"} +# ) +# whiten: bool = field(default=False, metadata={"help": "Apply whitening to singular vectors"}) +# adjust_scaling_factors: bool = field( +# default=True, +# metadata={"help": "Adjust LoRA scaling factors after the rank redistribution"}, +# ) + +# def __post_init__(self): +# if self.rho < 1.0: +# raise ValueError("`rho` must be >= 1.0") +# if self.tau < 0.0 or self.tau > 1.0: +# raise ValueError("`tau` must be between 0.0 and 1.0.") + + +# @dataclass +# class aLoraConfig(PeftConfig): +# """ +# This is the configuration class to store the configuration of a [`aLoraModel`]. + +# Args: +# r (`int`): +# aLora attention dimension (the "rank"). +# target_modules (`Optional[Union[List[str], str]]`): +# The names of the modules to apply the adapter to. 
If this is specified, only the modules with the specified +# names will be replaced. When passing a string, a regex match will be performed. When passing a list of +# strings, either an exact match will be performed or it is checked if the name of the module ends with any +# of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen, +# excluding the output layer. If this is not specified, modules will be chosen according to the model +# architecture. If the architecture is not known, an error will be raised -- in this case, you should specify +# the target modules manually. +# exclude_modules (`Optional[Union[List[str], str]]`): +# The names of the modules to not apply the adapter. When passing a string, a regex match will be performed. +# When passing a list of strings, either an exact match will be performed or it is checked if the name of the +# module ends with any of the passed strings. +# invocation_string (`str`): +# String intended to activate the aLoRA. The aLoRA adapted weights will activate 1 token after the first token in this string. +# This string must be present in all input data. +# lora_alpha (`int`): +# The alpha parameter for Lora scaling. +# lora_dropout (`float`): +# The dropout probability for Lora layers. +# fan_in_fan_out (`bool`): +# Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses +# `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`. +# bias (`str`): +# Bias type for LoRA. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the corresponding biases +# will be updated during training. Be aware that this means that, even when disabling the adapters, the model +# will not produce the same output as the base model would have without adaptation. +# use_rslora (`bool`): +# When set to True, uses Rank-Stabilized LoRA which +# sets the adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it was proven to work better. +# Otherwise, it will use the original default value of `lora_alpha/r`. +# modules_to_save (`List[str]`): +# List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. +# init_lora_weights (`bool` | `Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "loftq"]`): +# How to initialize the weights of the adapter layers. Passing True (default) results in the default +# initialization from the reference implementation from Microsoft. Passing 'gaussian' results in Gaussian +# initialization scaled by the LoRA rank for linear and layers. Setting the initialization to False leads to +# completely random initialization and is discouraged. Pass `'loftq'` to use LoftQ initialization. Passing +# `'eva'` results in a data-driven initialization of Explained +# Variance Adaptation. EVA initalizes LoRA based on the SVD of layer input activations and achieves SOTA +# performance due to its ability to adapt to the finetuning data. Pass `'olora'` to use OLoRA initialization. +# Passing `'pissa'` results in the initialization of Principal +# Singular values and Singular vectors Adaptation (PiSSA), which converges more rapidly than LoRA and +# ultimately achieves superior performance. Moreover, PiSSA reduces the quantization error compared to QLoRA, +# leading to further enhancements. 
Passing `'pissa_niter_[number of iters]'` initiates Fast-SVD-based PiSSA +# initialization, where `[number of iters]` indicates the number of subspace iterations to perform FSVD, and +# must be a nonnegative integer. When `[number of iters]` is set to 16, it can complete the initialization of +# a 7B model within seconds, and the training effect is approximately equivalent to using SVD. +# layers_to_transform (`Union[List[int], int]`): +# The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices +# that are specified in this list. If a single integer is passed, it will apply the transformations on the +# layer at this index. +# layers_pattern (`Optional[Union[List[str], str]]`): +# The layer pattern name, used only if `layers_to_transform` is different from `None`. This should target the +# `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`. +# rank_pattern (`dict`): +# The mapping from layer names or regexp expression to ranks which are different from the default rank +# specified by `r`. +# alpha_pattern (`dict`): +# The mapping from layer names or regexp expression to alphas which are different from the default alpha +# specified by `lora_alpha`. +# megatron_config (`Optional[dict]`): +# The TransformerConfig arguments for Megatron. It is used to create LoRA's parallel linear layer. You can +# get it like this, `core_transformer_config_from_args(get_args())`, these two functions being from Megatron. +# The arguments will be used to initialize the TransformerConfig of Megatron. You need to specify this +# parameter when you want to apply LoRA to the ColumnParallelLinear and RowParallelLinear layers of megatron. +# megatron_core (`Optional[str]`): +# The core module from Megatron to use, defaults to `"megatron.core"`. +# loftq_config (`Optional[LoftQConfig]`): +# The configuration of LoftQ. If this is not None, then LoftQ will be used to quantize the backbone weights +# and initialize Lora layers. Also pass `init_lora_weights='loftq'`. Note that you should not pass a +# quantized model in this case, as LoftQ will quantize the model itself. +# eva_config (`Optional[EvaConfig]`): +# The configuration of EVA. At a minimum the dataset argument needs to be set (use the same dataset as for +# finetuning). +# use_dora (`bool`): +# Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). This technique decomposes the updates of the weights +# into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is +# handled by a separate learnable parameter. This can improve the performance of LoRA especially at low +# ranks. Right now, DoRA only supports linear and Conv2D layers. DoRA introduces a bigger overhead than pure +# LoRA, so it is recommended to merge weights for inference. For more information, see +# https://arxiv.org/abs/2402.09353. +# layer_replication (`List[Tuple[int, int]]`): +# Build a new stack of layers by stacking the original model layers according to the ranges specified. This +# allows expanding (or shrinking) the model without duplicating the base model weights. The new layers will +# all have separate LoRA adapters attached to them. +# runtime_config (`LoraRuntimeConfig`): +# Runtime configurations (which are not saved or restored). +# lora_bias (`bool`): +# Defaults to `False`. Whether to enable the bias term for the LoRA B parameter. Typically, this should be +# disabled. 
The main use case for this is when the LoRA weights were extracted from fully fine-tuned +# parameters so the bias of those parameters can be taken into account. +# """ + +# r: int = field(default=32, metadata={"help": "Lora attention dimension"}) +# target_modules: Optional[Union[list[str], str]] = field( +# default=None, +# metadata={ +# "help": ( +# "List of module names or regex expression of the module names to replace with LoRA." +# "NOTE: activated LoRA assumes only adapters in the attention weights, i.e. q,k,v." +# "For example, ['q', 'k', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|k|v)$'." +# "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." +# "If not specified, modules will be chosen according to the model architecture, If the architecture is " +# "not known, an error will be raised -- in this case, you should specify the target modules manually." +# ), +# }, +# ) +# exclude_modules: Optional[Union[list[str], str]] = field( +# default=None, +# metadata={"help": "List of module names or regex expression of the module names to exclude from Lora."}, +# ) +# invocation_string: str = field(default=None, metadata={"help": "aLoRA invocation string. The aLoRA adapted weights will activate 1 token after the first token in this string. This string must be present in all input data."}) +# lora_alpha: int = field(default=8, metadata={"help": "Lora alpha"}) +# lora_dropout: float = field(default=0.0, metadata={"help": "Lora dropout"}) +# fan_in_fan_out: bool = field( +# default=False, +# metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, +# ) +# bias: Literal["none", "all", "lora_only"] = field( +# default="none", metadata={"help": "Bias type for Lora. Can be 'none', 'all' or 'lora_only'"} +# ) +# use_rslora: bool = field( +# default=False, +# metadata={ +# "help": ( +# "When set to True, uses Rank-Stabilized LoRA" +# " which sets the adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it" +# " was proven to work better. Otherwise, it will use the original default" +# " value of `lora_alpha/r`." +# ) +# }, +# ) +# modules_to_save: Optional[list[str]] = field( +# default=None, +# metadata={ +# "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. " +# "For example, in Sequence Classification or Token Classification tasks, " +# "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." +# }, +# ) +# init_lora_weights: ( +# bool | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "loftq"] +# ) = field( +# default=True, +# metadata={ +# "help": ( +# "How to initialize the weights of the LoRA layers. Passing `'True'` (default) results in the default " +# "initialization from the reference implementation from Microsoft. Passing `'gaussian'` results " +# "in Gaussian initialization scaled by the LoRA rank for linear and layers. Setting the initialization " +# "to `'False'` leads to completely random initialization and *is discouraged.*" +# "Pass `'eva'` results in a data-driven initialization of Explained Variance Adaptation." +# "Passing `'olora'` results in OLoRA initialization." +# "Passing `'pissa'` results in PiSSA initialization." 
+# "Passing `'pissa_niter_[number of iters]'` initiates Fast-SVD-based PiSSA initialization, " +# "where [number of iters] indicates the number of subspace iterations to perform fsvd, and must be a nonnegative integer." +# "Pass `'loftq'` to use LoftQ initialization" +# ), +# }, +# ) +# layers_to_transform: Optional[Union[list[int], int]] = field( +# default=None, +# metadata={ +# "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index. " +# "This only works when target_modules is a list of str." +# }, +# ) +# layers_pattern: Optional[Union[list[str], str]] = field( +# default=None, +# metadata={ +# "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." +# "This only works when target_modules is a list of str. This should target the `nn.ModuleList` of the " +# "model, which is often called `'layers'` or `'h'`." +# }, +# ) +# rank_pattern: Optional[dict] = field( +# default_factory=dict, +# metadata={ +# "help": ( +# "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. " +# "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}" +# ) +# }, +# ) +# alpha_pattern: Optional[dict] = field( +# default_factory=dict, +# metadata={ +# "help": ( +# "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `lora_alpha`. " +# "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}" +# ) +# }, +# ) +# megatron_config: Optional[dict] = field( +# default=None, +# metadata={ +# "help": ( +# "The TransformerConfig from Megatron. It is used to create LoRA's parallel linear layer." +# "You can get it like this, `core_transformer_config_from_args(get_args())`, " +# "these two functions being from Megatron." +# "You need to specify this parameter when you want to apply LoRA to the ColumnParallelLinear and " +# "RowParallelLinear layers of megatron." +# "It should be noted that we may not be able to use the `save_pretrained` and `from_pretrained` " +# "functions, because TransformerConfig may not necessarily be serialized." +# "But when using megatron, we can use `get_peft_model_state_dict` function and " +# "megatron's framework, they can also save and load models and configurations." +# ) +# }, +# ) +# megatron_core: Optional[str] = field( +# default="megatron.core", +# metadata={ +# "help": ( +# "The core module from Megatron, it is used to create LoRA's parallel linear layer. " +# "It only needs to be passed in when you need to use your own modified megatron core module. " +# "Otherwise, it will use the default value `megatron.core`. " +# ) +# }, +# ) +# # dict type is used when loading config.json +# loftq_config: Union[LoftQConfig, dict] = field( +# default_factory=dict, +# metadata={ +# "help": ( +# "The configuration of LoftQ. If this is passed, then LoftQ will be used to quantize the backbone " +# "weights and initialize Lora layers. Also set `init_lora_weights='loftq'` in this case." +# ) +# }, +# ) +# eva_config: Optional[EvaConfig] = field( +# default=None, +# metadata={ +# "help": ( +# "The configuration of EVA. If this is passed, then EVA will be used to intialize the LoRA layers. " +# "Also set `init_lora_weights='eva'` in this case. 
" +# ) +# }, +# ) +# use_dora: bool = field( +# default=False, +# metadata={ +# "help": ( +# "Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). This technique decomposes the updates of the " +# "weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the " +# "magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, " +# "especially at low ranks. Right now, DoRA only supports linear and Conv2D layers. DoRA introduces a bigger" +# "overhead than pure LoRA, so it is recommended to merge weights for inference." +# ) +# }, +# ) +# # Enables replicating layers in a model to expand it to a larger model. +# layer_replication: Optional[list[tuple[int, int]]] = field( +# default=None, +# metadata={ +# "help": ( +# "This enables using LoRA to effectively expand a transformer model to a larger size by repeating some layers. " +# "The transformation handles models (currently Llama, Bert or Falcon compatible architectures) with " +# "a module list in the model which it modifies to expand the number of modules. " +# "Base weights are shared so the memory usage is close to the original model. The intended use is these base weights " +# "remain fixed during finetuning but each layer has a separate LoRA adapter so the layers can be specialed via " +# "the adapter layers fit during fine tuning." +# "The format is a list of [start, end) pairs which specify the layer ranges to stack. For example:\n" +# " Original model has 5 layers labelled by their position in the model: `[0, 1, 2, 3, 4]`\n" +# " layer_replication: `[[0, 4], [2, 5]]`\n" +# " Final model will have this arrangement of original layers: `[0, 1, 2, 3, 2, 3, 4]`\n" +# "This format is based on what is used for pass-through merges in mergekit. It makes it simple to select sequential " +# "ranges of a model and stack them while reusing layers at either end of each sequence." +# ) +# }, +# ) +# runtime_config: LoraRuntimeConfig = field( +# default_factory=LoraRuntimeConfig, metadata={"help": "Runtime configurations"} +# ) +# lora_bias: bool = field( +# default=False, +# metadata={ +# "help": ( +# "Whether to enable the bias term for the LoRA B parameter. Typically, this should be disabled. The " +# "main use case for this is when the LoRA weights were extracted from fully fine-tuned parameters so " +# "the bias of those parameters can be taken into account." +# ) +# }, +# ) + +# def to_dict(self): +# """ +# Returns the configuration for your adapter model as a dictionary. Removes runtime configurations. 
+# """ +# rv = super().to_dict() +# rv.pop("runtime_config") +# return rv + +# def __post_init__(self): +# super().__post_init__() +# self.peft_type = PeftType.LORA +# self.target_modules = ( +# set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules +# ) +# self.exclude_modules = ( +# set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules +# ) + +# # if target_modules is a regex expression, then layers_to_transform should be None +# if isinstance(self.target_modules, str) and self.layers_to_transform is not None: +# raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.") + +# # if target_modules is a regex expression, then layers_pattern should be None +# if isinstance(self.target_modules, str) and self.layers_pattern is not None: +# raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.") + +# # check for layers_to_transform and layers_pattern +# if self.layers_pattern and not self.layers_to_transform: +# raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ") + +# if self.use_dora and self.megatron_config: +# raise ValueError("DoRA does not support megatron_core, please set `use_dora=False`.") + +# # handle init_lora_weights and loftq_config +# if self.init_lora_weights == "loftq": +# import importlib + +# if not importlib.util.find_spec("scipy"): +# raise ImportError("The required package 'scipy' is not installed. Please install it to continue.") +# if not self.loftq_config: +# raise ValueError("`loftq_config` must be specified when `init_lora_weights` is 'loftq'.") +# if not isinstance(self.loftq_config, dict): +# # convert loftq_config to dict +# self.loftq_config = vars(self.loftq_config) +# elif self.loftq_config: +# self.loftq_config = {} +# warnings.warn("`loftq_config` specified but will be ignored when `init_lora_weights` is not 'loftq'.") + +# elif self.init_lora_weights == "eva" and self.eva_config is None: +# warnings.warn("`init_lora_weights` is 'eva' but `eva_config` is not specified. Using default EVA config.") +# self.eva_config = EvaConfig() +# elif self.init_lora_weights != "eva" and self.eva_config is not None: +# warnings.warn("`eva_config` specified but will be ignored when `init_lora_weights` is not 'eva'.") + +# if self.lora_bias: +# if self.init_lora_weights not in (True, False): +# raise ValueError( +# f"The argument lora_bias=True is only supported with init_lora_weights=True or False, got " +# f"init_lora_weights={self.init_lora_weights} instead." +# ) +# if self.use_dora: +# raise ValueError("The argument lora_bias=True is not supported for DoRA, please pass use_dora=False") +# if self.invocation_string is None: +# raise ValueError("invocation_string cannot be None") +# # Using post training conversion of modified base weights to restore their initial values (PiSSA, OLoRA) cannot +# # be correctly done when using rslora + rank_pattern/alpha_pattern. We can't really know if the user intends +# # this when they'll eventually call save_pretrained (i.e. if they'll pass +# # path_initial_model_for_weight_conversionl). Therefore, we only warn but don't raise an error here. 
+# if ( +# self.use_rslora +# and (self.rank_pattern or self.alpha_pattern) +# and ( +# (isinstance(self.init_lora_weights, str) and (self.init_lora_weights.startswith("pissa"))) +# or (self.init_lora_weights == "olora") +# ) +# ): +# msg = ( +# "Using Rank-Stabilized LoRA with rank_pattern/alpha_pattern and post-training conversion of modified " +# "base weights (PiSSA, OLoRA) means that you won't be able to pass " +# "`path_initial_model_for_weight_conversion` to `save_pretrained` to restore the initial values of the " +# "base weights; if you intend to do this, please ensure not to use rslora or rank_pattern/alpha_pattern." +# ) +# warnings.warn(msg) + +# self._custom_modules: Optional[dict[type[nn.Mmodule], type[nn.Module]]] = None + +# def _register_custom_module(self, mapping: dict[type[nn.Mmodule], type[nn.Module]]) -> None: +# """ +# Experimental API to support providing custom LoRA layers. + +# This API is subject to change, you should carefully read the docs before deciding to use it: + +# https://huggingface.co/docs/peft/developer_guides/custom_models + +# To register custom LoRA module types, call this method with a `mapping` argument that is a dict that maps from +# the target layer type to the custom LoRA layer type. The dict can contain multiple items if you wish to target +# multiple layer types. The target layer type can be any nn.Module that we currently don't support in PEFT, +# whether that is an official PyTorch layer type or a custom layer type. The custom LoRA module class has to be +# implemented by the user and follow the PEFT conventions for LoRA layers. + +# """ +# if self._custom_modules is None: +# self._custom_modules = {} +# self._custom_modules.update(mapping) + diff --git a/src/peft/tuners/alora/layer.py b/src/peft/tuners/alora/layer.py new file mode 100644 index 0000000000..fc97152659 --- /dev/null +++ b/src/peft/tuners/alora/layer.py @@ -0,0 +1,1274 @@ + +from __future__ import annotations + +import math +import warnings +from typing import Any, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from accelerate.utils.imports import is_xpu_available +from torch import svd_lowrank +from transformers.pytorch_utils import Conv1D + +from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge +from peft.utils.integrations import dequantize_module_weight, gather_params_ctx, get_bnb_param_type +from peft.utils.other import transpose + +from .config import aLoraConfig + +#Remove +#from peft import DoraConv2dLayer, DoraConv3dLayer, DoraEmbeddingLayer, DoraLinearLayer, _DoraConvNdLayer + + +class aLoraLayer(BaseTunerLayer): + # All names of layers that may contain (trainable) adapter weights + adapter_layer_names = ("lora_A", "lora_B", "lora_embedding_A", "lora_embedding_B") + # All names of other parameters that may contain adapter-related parameters + other_param_names = ("r", "lora_alpha", "scaling", "lora_dropout")#,"k") + + def __init__(self, base_layer: nn.Module, ephemeral_gpu_offload: bool = False, **kwargs) -> None: + self.base_layer = base_layer + self.r = {} + self.lora_alpha = {} + self.scaling = {} + self.lora_dropout = nn.ModuleDict({}) + self.lora_A = nn.ModuleDict({}) + self.lora_B = nn.ModuleDict({}) + # For Embedding layer + self.lora_embedding_A = nn.ParameterDict({}) + self.lora_embedding_B = nn.ParameterDict({}) + # Mark the weight as unmerged + self._disable_adapters = False + self.merged_adapters = [] + self.use_dora: dict[str, bool] = {} + self.lora_bias: dict[str, bool] = {} + 
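+        # Trainable aLoRA parameters are stored per adapter name, mirroring LoRA:
+        # lora_A[name] maps in_features -> r and lora_B[name] maps r -> out_features, with an
+        # effective scaling of lora_alpha / r (or lora_alpha / sqrt(r) when use_rslora=True).
+        # aLoRA differs from LoRA in the forward pass, where the low-rank update is applied
+        # only to the trailing `alora_offsets` positions of each sequence.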
self.lora_magnitude_vector = torch.nn.ModuleDict() # for DoRA + self._caches: dict[str, Any] = {} + self.ephemeral_gpu_offload: bool = ephemeral_gpu_offload + self.kwargs = kwargs + + base_layer = self.get_base_layer() + if isinstance(base_layer, nn.Linear): + in_features, out_features = base_layer.in_features, base_layer.out_features + #below not supported + # elif isinstance(base_layer, nn.Conv2d): + # in_features, out_features = base_layer.in_channels, base_layer.out_channels + # elif isinstance(base_layer, nn.Conv3d): + # in_features, out_features = base_layer.in_channels, base_layer.out_channels + # elif isinstance(base_layer, nn.Embedding): + # in_features, out_features = base_layer.num_embeddings, base_layer.embedding_dim + # elif isinstance(base_layer, Conv1D): + # in_features, out_features = ( + # base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape + # ) + elif hasattr(base_layer, "infeatures") and hasattr(base_layer, "outfeatures"): + # QuantLinear + in_features, out_features = base_layer.infeatures, base_layer.outfeatures + elif hasattr(base_layer, "input_size") and hasattr(base_layer, "output_size"): + # Megatron ColumnParallelLinear,RowParallelLinear + in_features, out_features = base_layer.input_size, base_layer.output_size + elif hasattr(base_layer, "codebooks") and base_layer.__class__.__name__ == "QuantizedLinear": + # AQLM QuantLinear + in_features, out_features = base_layer.in_features, base_layer.out_features + elif hasattr(base_layer, "w_bit") and base_layer.__class__.__name__ == "WQLinear_GEMM": + # Awq layers + in_features, out_features = base_layer.in_features, base_layer.out_features + elif base_layer.__class__.__name__ == "EetqLinear": + # Eetq layers + in_features, out_features = base_layer.in_features, base_layer.out_features + elif hasattr(base_layer, "W_q") and base_layer.__class__.__name__ == "HQQLinear": + # HQQ layers + in_features, out_features = base_layer.in_features, base_layer.out_features + else: + # possibly support user provided custom layer types using dynamic dispatch + if hasattr(base_layer, "in_features") and hasattr(base_layer, "out_features"): + in_features, out_features = base_layer.in_features, base_layer.out_features + else: + in_features, out_features = None, None + warnings.warn( + f"Unsupported layer type '{type(base_layer)}' encountered, proceed at your own risk.", UserWarning + ) + + self.in_features = in_features + self.out_features = out_features + + def update_layer( + self, + adapter_name, + r, + lora_alpha, + lora_dropout, + init_lora_weights, + use_rslora, +# k = 1,# added + use_dora: bool = False, + lora_bias: bool = False, + ): + # This code works for linear layers, override for other layer types + if r <= 0: + raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + + self.r[adapter_name] = r + # self.k[adapter_name] = k #added + self.lora_alpha[adapter_name] = lora_alpha + if lora_dropout > 0.0: + lora_dropout_layer = nn.Dropout(p=lora_dropout) + else: + lora_dropout_layer = nn.Identity() + + self.lora_dropout.update(nn.ModuleDict({adapter_name: lora_dropout_layer})) + # Actual trainable parameters + self.lora_A[adapter_name] = nn.Linear(self.in_features, r, bias=False) + self.lora_B[adapter_name] = nn.Linear(r, self.out_features, bias=lora_bias) + self.lora_bias[adapter_name] = lora_bias + + if use_rslora: + self.scaling[adapter_name] = lora_alpha / math.sqrt(r) + else: + self.scaling[adapter_name] = lora_alpha / r + + # for inits that require 
access to the base weight, use gather_param_ctx so that the weight is gathered when using DeepSpeed + if isinstance(init_lora_weights, str) and init_lora_weights.startswith("pissa"): + with gather_params_ctx(self.get_base_layer().weight): + self.pissa_init(adapter_name, init_lora_weights) + elif isinstance(init_lora_weights, str) and init_lora_weights.lower() == "olora": + with gather_params_ctx(self.get_base_layer().weight): + self.olora_init(adapter_name) + elif init_lora_weights == "loftq": + with gather_params_ctx(self.get_base_layer().weight): + self.loftq_init(adapter_name) + elif init_lora_weights == "eva": + nn.init.zeros_(self.lora_B[adapter_name].weight) + elif init_lora_weights: + self.reset_lora_parameters(adapter_name, init_lora_weights) + # call this before dora_init + self._move_adapter_to_device_of_base_layer(adapter_name) + + if use_dora: + self.dora_init(adapter_name) + self.use_dora[adapter_name] = True + else: + self.use_dora[adapter_name] = False + + self.set_adapter(self.active_adapters) + + def reset_lora_parameters(self, adapter_name, init_lora_weights): + if init_lora_weights is False: + return + + if adapter_name in self.lora_A.keys(): + if init_lora_weights is True: + # initialize A the same way as the default for nn.Linear and B to zero + # https://github.com/microsoft/LoRA/blob/a0a92e0f26c067cf94747bdbf1ce73793fa44d19/loralib/layers.py#L124 + nn.init.kaiming_uniform_(self.lora_A[adapter_name].weight, a=math.sqrt(5)) + elif init_lora_weights.lower() == "gaussian": + nn.init.normal_(self.lora_A[adapter_name].weight, std=1 / self.r[adapter_name]) + else: + raise ValueError(f"Unknown initialization {init_lora_weights=}") + nn.init.zeros_(self.lora_B[adapter_name].weight) + if self.lora_bias[adapter_name]: + nn.init.zeros_(self.lora_B[adapter_name].bias) + if adapter_name in self.lora_embedding_A.keys(): + # Initialize A to zeros and B the same way as the default for nn.Embedding, see: + # https://github.com/microsoft/LoRA/blob/4c0333854cb905966f8cc4e9a74068c1e507c7b7/loralib/layers.py#L59-L60 + nn.init.zeros_(self.lora_embedding_A[adapter_name]) + nn.init.normal_(self.lora_embedding_B[adapter_name]) + if self.lora_bias[adapter_name]: + # embeddings are not supported at the moment, but still adding this for consistency + nn.init.zeros_(self.lora_embedding_B[adapter_name].bias) + + def olora_init(self, adapter_name): + base_layer = self.get_base_layer() + orig_weight = base_layer.weight + bnb_param_type = get_bnb_param_type(orig_weight) + dtype = orig_weight.dtype + + if bnb_param_type: + # check without importing bitsandbytes and robust to bnb_4bit_quant_storage=float* + weight_tensor = dequantize_module_weight(base_layer) + elif dtype in [torch.float32, torch.float16, torch.bfloat16]: + weight_tensor = orig_weight + else: + raise TypeError(f"Unsupported data type for the base layer. 
Got {dtype}.") + + scale_factor = self.scaling[adapter_name] + r = self.r[adapter_name] + weight_tensor = weight_tensor.to(torch.float32) + Q, R = torch.linalg.qr(weight_tensor.data) + + Qr, Rr = Q[:, :r], R[:r] + + self.lora_A[adapter_name].weight.data = Rr.contiguous() + self.lora_B[adapter_name].weight.data = Qr.contiguous() + + weight_tensor.data -= scale_factor * self.lora_B[adapter_name].weight @ self.lora_A[adapter_name].weight + if bnb_param_type == "4bit": + weight_tensor = orig_weight.__class__( + weight_tensor, + quant_type=orig_weight.quant_type, + quant_storage=orig_weight.quant_storage, + compress_statistics=orig_weight.compress_statistics, + module=orig_weight.module, + ).to(orig_weight.device) + base_layer.weight = weight_tensor + elif bnb_param_type == "8bit": + weight_tensor = orig_weight.__class__( + weight_tensor, + requires_grad=orig_weight.requires_grad, + has_fp16_weights=orig_weight.has_fp16_weights, + ).to(orig_weight.device) + base_layer.weight = weight_tensor + else: + weight_tensor = weight_tensor.to(dtype) + base_layer.weight.data = weight_tensor + + def pissa_init(self, adapter_name, init_lora_weights): + weight = self.get_base_layer().weight + dtype = weight.dtype + if dtype not in [torch.float32, torch.float16, torch.bfloat16]: + raise TypeError( + "Please initialize PiSSA under float32, float16, or bfloat16. " + "Subsequently, re-quantize the residual model to help minimize quantization errors." + ) + weight = transpose(weight.to(torch.float32), self.fan_in_fan_out) + if init_lora_weights == "pissa": + # USV^T = W <-> VSU^T = W^T, where W^T = weight.data in R^{out_channel, in_channel}, + V, S, Uh = torch.linalg.svd(weight.data, full_matrices=False) + Vr = V[:, : self.r[adapter_name]] + Sr = S[: self.r[adapter_name]] + Sr /= self.scaling[adapter_name] + Uhr = Uh[: self.r[adapter_name]] + elif len(init_lora_weights.split("_niter_")) == 2: + Vr, Sr, Ur = svd_lowrank( + weight.data, self.r[adapter_name], niter=int(init_lora_weights.split("_niter_")[-1]) + ) + Sr /= self.scaling[adapter_name] + Uhr = Ur.t() + else: + raise ValueError( + f"init_lora_weights should be 'pissa' or 'pissa_niter_[number of iters]', got {init_lora_weights} instead." 
+ ) + + lora_A = torch.diag(torch.sqrt(Sr)) @ Uhr + lora_B = Vr @ torch.diag(torch.sqrt(Sr)) + self.lora_A[adapter_name].weight.data = lora_A + self.lora_B[adapter_name].weight.data = lora_B + weight = weight.data - self.scaling[adapter_name] * lora_B @ lora_A + weight = transpose(weight.to(dtype), self.fan_in_fan_out) + self.get_base_layer().weight.data = weight + + def loftq_init(self, adapter_name): + from peft.utils.loftq_utils import loftq_init + + weight = self.get_base_layer().weight + kwargs = { + "num_bits": self.kwargs.get("loftq_bits", 4), + "reduced_rank": self.r[adapter_name], + "num_iter": self.kwargs.get("loftq_iter", 1), + } + + qweight, lora_A, lora_B = loftq_init(weight, **kwargs) + if adapter_name in self.lora_A.keys(): + # initialize A the same way as the default for nn.Linear and B to zero + self.lora_A[adapter_name].weight.data = lora_A + self.lora_B[adapter_name].weight.data = lora_B + if adapter_name in self.lora_embedding_A.keys(): + # initialize a the same way as the default for nn.linear and b to zero + self.lora_embedding_A[adapter_name].weight.data = lora_A + self.lora_embedding_B[adapter_name].weight.data = lora_B + self.get_base_layer().weight.data = qweight + + + def _cache_store(self, key: str, value: Any) -> None: + self._caches[key] = value + + def _cache_pop(self, key: str) -> Any: + value = self._caches.pop(key) + return value + + def set_scale(self, adapter, scale): + if adapter not in self.scaling: + # Ignore the case where the adapter is not in the layer + return + self.scaling[adapter] = scale * self.lora_alpha[adapter] / self.r[adapter] + + def scale_layer(self, scale: float) -> None: + if scale == 1: + return + + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + + self.scaling[active_adapter] *= scale + + def unscale_layer(self, scale=None) -> None: + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + + if scale is None: + self.scaling[active_adapter] = self.lora_alpha[active_adapter] / self.r[active_adapter] + else: + self.scaling[active_adapter] /= scale + + def _check_forward_args(self, x, *args, **kwargs): + """Check if the arguments are compatible with the configs and state of the model""" + adapter_names = kwargs.get("adapter_names", None) + if adapter_names is None: + return + + if len(x) != len(adapter_names): + msg = ( + "Length of `adapter_names` should be the same as the number of inputs, but got " + f"{len(adapter_names)} and {len(x)} respectively." + ) + raise ValueError(msg) + + if self.merged: + # It is unclear what would be the right thing to do if users pass adapter_names and there are merged + # adapters. Therefore, it is better to raise an error in this case. + msg = "Cannot pass `adapter_names` when there are merged adapters, please call `unmerge_adapter` first." + raise ValueError(msg) + + # # DoRA is not supported (yet), check that it's not being used. Don't check "__base__", as this is the + # # placeholder for the base model. + # unique_adapters = {name for name in adapter_names if name != "__base__"} + # for adapter_name in unique_adapters: + # if self.use_dora.get(adapter_name, False): + # msg = "Cannot pass `adapter_names` when DoRA is enabled." + # raise ValueError(msg) + + def _mixed_batch_forward( + self, x: torch.Tensor, *args: Any, adapter_names: list[str],alora_offsets: list[int], **kwargs: Any + ) -> torch.Tensor: + # This is a special method that handles the case when users pass the argument `adapter_names`. 
This is an + # extra argument that allows mixing different adapters in the same batch at inference time. + result = self.base_layer(x, *args, **kwargs) + torch_result_dtype = result.dtype + ks =alora_offsets + unique_adapters = set(adapter_names) + sub_batch_indices_list = [] + for adapter in unique_adapters: + sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) + + for i, active_adapter in enumerate(unique_adapters): + if active_adapter == "__base__": + continue + if active_adapter not in self.lora_A.keys(): + continue + + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + + # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear + # layer output + sub_batch = x[sub_batch_indices_list[i]].to(lora_A.weight.dtype) + if len(ks) > 1: + ks_batch = ks[sub_batch_indices_list[i]] + for j in range(len(ks_batch)): + k = min(ks_batch[j], result.shape[1]) + lora_output = lora_B(lora_A(dropout(sub_batch[j,-k:,:]))) * scaling + result[sub_batch_indices_list[i][j],-k:,:] += lora_output.to(torch_result_dtype) + else: + ks_batch = ks + k = min(result.shape[1],ks_batch[0]) + lora_output = lora_B(lora_A(dropout(sub_batch[:,-k:,:]))) * scaling + result[sub_batch_indices_list[i],-k:,:] += lora_output.to(torch_result_dtype) + + return result + + +# Below code is based on https://github.com/microsoft/LoRA/blob/main/loralib/layers.py +# and modified to work with PyTorch FSDP + + +# ------------------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. +# ------------------------------------------------------------------------------------------ + + +class Linear(nn.Module, aLoraLayer): + # Lora implemented in a dense layer + def __init__( + self, + base_layer, + adapter_name: str, + r: int = 0, + lora_alpha: int = 1, + lora_dropout: float = 0.0, + fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) + is_target_conv_1d_layer: bool = False, + init_lora_weights: Union[bool, str] = True, + use_rslora: bool = False, + use_dora: bool = False, + lora_bias: bool = False, + **kwargs, + ) -> None: + super().__init__() + aLoraLayer.__init__(self, base_layer, **kwargs) + self.fan_in_fan_out = fan_in_fan_out + + self._active_adapter = adapter_name + self.update_layer( + adapter_name, + r, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + init_lora_weights=init_lora_weights, + use_rslora=use_rslora, + use_dora=use_dora, + lora_bias=lora_bias, + ) + self.is_target_conv_1d_layer = is_target_conv_1d_layer + + # def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: + # """ + # Merge the active adapter weights into the base weights + + # Args: + # safe_merge (`bool`, *optional*): + # If True, the merge operation will be performed in a copy of the original weights and check for NaNs + # before merging the weights. This is useful if you want to check if the merge operation will produce + # NaNs. Defaults to `False`. + # adapter_names (`list[str]`, *optional*): + # The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + # to `None`. 
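
# --- Illustrative sketch (not from the patch): the core activated-LoRA computation used in
# `_mixed_batch_forward` above and in `Linear.forward` below. Only the last k positions of the
# sequence (k taken from `alora_offsets`) receive the low-rank update; earlier positions keep
# the unmodified base-layer output. Shapes and values here are arbitrary placeholders.
import torch
import torch.nn as nn

batch, seq_len, d_model, rank, k = 2, 10, 16, 4, 3
x = torch.randn(batch, seq_len, d_model)
base_layer = nn.Linear(d_model, d_model, bias=False)
lora_A = nn.Linear(d_model, rank, bias=False)
lora_B = nn.Linear(rank, d_model, bias=False)
scaling = 32 / rank  # lora_alpha / r (or lora_alpha / sqrt(r) with use_rslora)

with torch.no_grad():
    result = base_layer(x)
    # The adapter output is added only on the trailing k tokens, matching
    # result[:, -k:, :] += lora_B(lora_A(dropout(x[:, -k:, :]))) * scaling in the layer code.
    result[:, -k:, :] += lora_B(lora_A(x[:, -k:, :])) * scaling
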
+ # """ + + # warnings.warn("NOT SUPPORTED FOR ASYMMETRIC LORA")# added + + # adapter_names = check_adapters_to_merge(self, adapter_names) + # if not adapter_names: + # # no adapter to merge + # return + + # for active_adapter in adapter_names: + # if active_adapter in self.lora_A.keys(): + # base_layer = self.get_base_layer() + # if safe_merge: + # # Note that safe_merge will be slower than the normal merge + # # because of the copy operation. + # orig_weights = base_layer.weight.data.clone() + # delta_weight = self.get_delta_weight(active_adapter) + # if not self.use_dora[active_adapter]: + # orig_weights += delta_weight + # else: + # # handle dora + # # since delta_weight already includes scaling, set it to 1 here + # weight_norm = ( + # self.lora_magnitude_vector[active_adapter] + # .get_weight_norm(orig_weights, transpose(delta_weight, self.fan_in_fan_out), scaling=1) + # .detach() + # ) + # # We need to cache weight_norm because it has to be based on the original weights. We + # # cannot calculate it on the fly based on the merged weights when unmerging because its a + # # different value + # self._cache_store(f"{active_adapter}-weight_norm", weight_norm) + # dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm + # dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out) + # orig_weights = dora_factor * (orig_weights + delta_weight) + + # if not torch.isfinite(orig_weights).all(): + # raise ValueError( + # f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + # ) + + # base_layer.weight.data = orig_weights + + # if self.lora_bias[active_adapter]: + # new_bias = base_layer.bias + self.lora_B[active_adapter].bias + # if not torch.isfinite(new_bias).all(): + # raise ValueError( + # f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" + # ) + # base_layer.bias.data = new_bias + + # else: + # delta_weight = self.get_delta_weight(active_adapter) + # if not self.use_dora[active_adapter]: + # base_layer.weight.data += delta_weight + # else: + # # handle dora + # # since delta_weight already includes scaling, set it to 1 here + # weight_norm = ( + # self.lora_magnitude_vector[active_adapter] + # .get_weight_norm( + # base_layer.weight, transpose(delta_weight, self.fan_in_fan_out), scaling=1 + # ) + # .detach() + # ) + # # We need to cache weight_norm because it has to be based on the original weights. We + # # cannot calculate it on the fly based on the merged weights when unmerging because its a + # # different value + # self._cache_store(f"{active_adapter}-weight_norm", weight_norm) + # dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm + # dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out) + # new_weight = dora_factor * (base_layer.weight.data + delta_weight) + # base_layer.weight.data = new_weight + + # if self.lora_bias[active_adapter]: + # base_layer.bias.data += self.lora_B[active_adapter].bias + + # self.merged_adapters.append(active_adapter) + + # def unmerge(self) -> None: + # """ + # This method unmerges all merged adapter layers from the base weights. + # """ + # if not self.merged: + # warnings.warn("Already unmerged. 
Nothing to do.") + # return + # while len(self.merged_adapters) > 0: + # active_adapter = self.merged_adapters.pop() + # if active_adapter in self.lora_A.keys(): + # weight = self.get_base_layer().weight + # delta_weight = self.get_delta_weight(active_adapter) + # if not self.use_dora[active_adapter]: + # weight.data -= delta_weight + # else: + # weight_norm = self._cache_pop(f"{active_adapter}-weight_norm") + # dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm + # weight_orig = weight.data / dora_factor.view(-1, 1) - delta_weight + # weight.data = weight_orig + + # if self.lora_bias[active_adapter]: + # self.get_base_layer().bias.data -= self.lora_B[active_adapter].bias + + def get_delta_weight(self, adapter) -> torch.Tensor: + """ + Compute the delta weight for the given adapter. + + Args: + adapter (str): + The name of the adapter for which the delta weight should be computed. + """ + device = self.lora_B[adapter].weight.device + dtype = self.lora_B[adapter].weight.dtype + + # In case users wants to merge the adapter weights that are in + # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to + # (b)float16 because some CPUs have slow bf16/fp16 matmuls. + cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16) + + weight_A = self.lora_A[adapter].weight + weight_B = self.lora_B[adapter].weight + + if cast_to_fp32: + weight_A = weight_A.float() + weight_B = weight_B.float() + + output_tensor = transpose(weight_B @ weight_A, self.fan_in_fan_out) * self.scaling[adapter] + + if cast_to_fp32: + output_tensor = output_tensor.to(dtype=dtype) + + # cast back the weights + self.lora_A[adapter].weight.data = weight_A.to(dtype) + self.lora_B[adapter].weight.data = weight_B.to(dtype) + + return output_tensor + + def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: + self._check_forward_args(x, *args, **kwargs) + adapter_names = kwargs.pop("adapter_names", None) + ks = kwargs.pop("alora_offsets", [1]) #added +# ks = [100000] +# print("layer forward") +# print(ks) + if self.disable_adapters or ks[0] <= 0: + if self.merged: + self.unmerge() + result = self.base_layer(x, *args, **kwargs) + elif adapter_names is not None: + result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, alora_offsets = ks, **kwargs) + elif self.merged: + result = self.base_layer(x, *args, **kwargs) + else: + #if len(ks) == 1: + + #k = ks # Maybe change + result = self.base_layer(x, *args, **kwargs) + torch_result_dtype = result.dtype + for active_adapter in self.active_adapters: + if active_adapter not in self.lora_A.keys(): + continue + lora_A = self.lora_A[active_adapter] + lora_B = self.lora_B[active_adapter] + dropout = self.lora_dropout[active_adapter] + scaling = self.scaling[active_adapter] + x = x.to(lora_A.weight.dtype) + + if not self.use_dora[active_adapter]: + # Only do the last k tokens +# print(f"{k} A {x[:,-k,:]}") + # k = 2 + if len(ks) == 1: + k = min(result.shape[1],ks[0]) +# print(k) + if k > 0: + result[:,-k:,:] = result[:,-k:,:] + lora_B(lora_A(dropout(x[:,-k:,:]))) * scaling#dropout + else: + + for i in range(result.shape[0]): + ks[i] = min(ks[i], result.shape[1]) + if ks[i] > 0: + result[i,-ks[i]:,:] = result[i,-ks[i]:,:] + lora_B(lora_A(dropout(x[i,-ks[i]:,:]))) + else: + warnings.warn("NOT SUPPORTED") + if isinstance(dropout, nn.Identity) or not self.training: + base_result = result + else: + x = dropout(x) + base_result = None + + result = 
result + self.lora_magnitude_vector[active_adapter]( + x, + lora_A=lora_A, + lora_B=lora_B, + scaling=scaling, + base_layer=self.get_base_layer(), + base_result=base_result, + ) + + result = result.to(torch_result_dtype) + + return result + + def __repr__(self) -> str: + rep = super().__repr__() + return "lora." + rep + +#below not supported +# class Embedding(nn.Module, aLoraLayer): +# # LoRA implemented in a Embedding layer +# def __init__( +# self, +# base_layer: nn.Module, +# adapter_name: str, +# r: int = 0, +# lora_alpha: int = 1, +# lora_dropout: float = 0.0, +# init_lora_weights: Union[bool, str] = True, +# use_rslora: bool = False, +# use_dora: bool = False, +# lora_bias: bool = False, +# **kwargs, +# ) -> None: +# if lora_bias: +# # lora_bias=True is not supported (yet) for embedding layers, as they use nn.Parameter +# raise ValueError(f"lora_bias={lora_bias} is not supported for {self.__class__.__name__}.") + +# super().__init__() +# aLoraLayer.__init__(self, base_layer) + +# self._active_adapter = adapter_name +# self.update_layer( +# adapter_name, +# r, +# lora_alpha=lora_alpha, +# lora_dropout=lora_dropout, +# init_lora_weights=init_lora_weights, +# use_rslora=use_rslora, +# use_dora=use_dora, +# lora_bias=lora_bias, +# ) + +# def update_layer( +# self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora, lora_bias +# ): +# if r <= 0: +# raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + +# self.r[adapter_name] = r +# self.lora_alpha[adapter_name] = lora_alpha +# if lora_dropout > 0.0: +# lora_dropout_layer = nn.Dropout(p=lora_dropout) +# else: +# lora_dropout_layer = nn.Identity() + +# self.lora_dropout[adapter_name] = lora_dropout_layer +# # Actual trainable parameters +# weight_A = torch.randn((r, self.in_features)) +# weight_B = torch.randn((self.out_features, r)) +# self.lora_embedding_A[adapter_name] = nn.Parameter(weight_A) +# self.lora_embedding_B[adapter_name] = nn.Parameter(weight_B) +# self.lora_bias[adapter_name] = lora_bias + +# if use_rslora: +# self.scaling[adapter_name] = lora_alpha / math.sqrt(r) +# else: +# self.scaling[adapter_name] = lora_alpha / r + +# if init_lora_weights == "loftq": +# self.loftq_init(adapter_name) +# elif init_lora_weights: +# self.reset_lora_parameters(adapter_name, init_lora_weights) + +# # call this before dora_init +# self._move_adapter_to_device_of_base_layer(adapter_name) + +# if use_dora: +# self.dora_init(adapter_name) +# self.use_dora[adapter_name] = True +# else: +# self.use_dora[adapter_name] = False + +# self.set_adapter(self.active_adapters) + +# def dora_init(self, adapter_name: str) -> None: +# if self.lora_magnitude_vector is None: +# # first dora layer being added, add lora_magnitude_vector to the list of learnable parameters +# self.adapter_layer_names = self.adapter_layer_names[:] + ("lora_magnitude_vector",) + +# # dora_layer = DoraEmbeddingLayer(fan_in_fan_out=True) +# # lora_embedding_A = self.lora_embedding_A[adapter_name] +# # lora_embedding_B = self.lora_embedding_B[adapter_name] +# # scaling = self.scaling[adapter_name] +# # dora_layer.update_layer( +# # base_layer=self.get_base_layer(), lora_A=lora_embedding_A, lora_B=lora_embedding_B, scaling=scaling +# # ) +# # self.lora_magnitude_vector[adapter_name] = dora_layer + +# def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: +# """ +# Merge the active adapter weights into the base weights + +# Args: +# safe_merge (`bool`, *optional*): +# If True, the 
merge operation will be performed in a copy of the original weights and check for NaNs +# before merging the weights. This is useful if you want to check if the merge operation will produce +# NaNs. Defaults to `False`. +# adapter_names (`list[str]`, *optional*): +# The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults +# to `None`. +# """ +# warnings.warn("NOT SUPPORTED") +# adapter_names = check_adapters_to_merge(self, adapter_names) +# if not adapter_names: +# # no adapter to merge +# return + +# for active_adapter in adapter_names: +# if active_adapter in self.lora_embedding_A.keys(): +# base_layer = self.get_base_layer() +# if safe_merge: +# # Note that safe_merge will be slower than the normal merge +# # because of the copy operation. +# orig_weights = base_layer.weight.data.clone() +# orig_weights += self.get_delta_weight(active_adapter) + +# if not torch.isfinite(orig_weights).all(): +# raise ValueError( +# f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" +# ) + +# base_layer.weight.data = orig_weights +# else: +# base_layer.weight.data += self.get_delta_weight(active_adapter) +# self.merged_adapters.append(active_adapter) + +# def unmerge(self) -> None: +# """ +# This method unmerges all merged adapter layers from the base weights. +# """ +# if not self.merged: +# warnings.warn("Already unmerged. Nothing to do.") +# return +# while len(self.merged_adapters) > 0: +# active_adapter = self.merged_adapters.pop() +# if active_adapter in self.lora_embedding_A.keys(): +# self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) + +# def get_delta_weight(self, adapter) -> torch.Tensor: +# """ +# Compute the delta weight for the given adapter. + +# Args: +# adapter (str): +# The name of the adapter for which the delta weight should be computed. +# """ +# device = self.lora_embedding_B[adapter].device +# dtype = self.lora_embedding_A[adapter].dtype + +# # In case users wants to merge the adapter weights that are in +# # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to +# # (b)float16 because some CPUs have slow bf16/fp16 matmuls. +# cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16) + +# weight_A = self.lora_embedding_A[adapter] +# weight_B = self.lora_embedding_B[adapter] + +# if cast_to_fp32: +# weight_A = weight_A.float() +# weight_B = weight_B.float() + +# output_tensor = transpose(weight_B @ weight_A, True) * self.scaling[adapter] + +# if cast_to_fp32: +# output_tensor = output_tensor.to(dtype=dtype) + +# # cast back the weights +# self.lora_embedding_A[adapter] = weight_A.to(dtype) +# self.lora_embedding_B[adapter] = weight_B.to(dtype) + +# return output_tensor + +# def _mixed_batch_forward( +# self, x: torch.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any +# ) -> torch.Tensor: +# # This is a special method that handles the case when users pass the argument `adapter_names`. This is an +# # extra argument that allows mixing different adapters in the same batch at inference time. 
+# result = self.base_layer(x, *args, **kwargs) + +# unique_adapters = set(adapter_names) +# sub_batch_indices_list = [] +# for adapter in unique_adapters: +# sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) + +# for i, active_adapter in enumerate(unique_adapters): +# if active_adapter == "__base__": +# continue +# if active_adapter not in self.lora_embedding_A.keys(): +# continue + +# embedding_A = self.lora_embedding_A[active_adapter].T +# embedding_B = self.lora_embedding_B[active_adapter].T +# scaling = self.scaling[active_adapter] + +# # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear +# # layer output +# sub_batch = x[sub_batch_indices_list[i]] +# after_A = self._embed(sub_batch, embedding_A) +# result[sub_batch_indices_list[i]] += (after_A @ embedding_B) * scaling + +# return result + +# def _embed(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: +# base_layer = self.get_base_layer() +# return F.embedding( +# input, +# weight, +# padding_idx=base_layer.padding_idx, +# max_norm=base_layer.max_norm, +# norm_type=base_layer.norm_type, +# scale_grad_by_freq=base_layer.scale_grad_by_freq, +# sparse=base_layer.sparse, +# ) + +# def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: +# # TODO: no dtype conversion here, unlike in Linear, is that correct? +# self._check_forward_args(x, *args, **kwargs) +# adapter_names = kwargs.pop("adapter_names", None) + +# if self.disable_adapters: +# if self.merged: +# self.unmerge() +# result = self.base_layer(x, *args, **kwargs) +# elif adapter_names is not None: +# result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs) +# elif self.merged: +# result = self.base_layer(x, *args, **kwargs) +# else: +# result = self.base_layer(x, *args, **kwargs) +# torch_result_dtype = result.dtype +# for active_adapter in self.active_adapters: +# if active_adapter not in self.lora_embedding_A: +# continue +# embedding_A = self.lora_embedding_A[active_adapter].T +# embedding_B = self.lora_embedding_B[active_adapter].T +# scaling = self.scaling[active_adapter] + +# if not self.use_dora[active_adapter]: +# after_A = self._embed(x, embedding_A) +# result = result + (after_A @ embedding_B) * scaling +# else: +# mag_norm_scale, dora_result = self.lora_magnitude_vector[active_adapter]( +# x, +# lora_A=embedding_A, +# lora_B=embedding_B, +# scaling=scaling, +# base_layer=self.get_base_layer(), +# embed_fn=self._embed, +# ) +# result = mag_norm_scale * result + dora_result +# result = result.to(torch_result_dtype) + +# return result + +# def __repr__(self) -> str: +# rep = super().__repr__() +# return "lora." 
+ rep + + +# class _ConvNd(nn.Module, aLoraLayer): +# # Lora implemented in a conv(2,3)d layer +# def __init__( +# self, +# base_layer: nn.Module, +# adapter_name: str, +# r: int = 0, +# lora_alpha: int = 1, +# lora_dropout: float = 0.0, +# init_lora_weights: Union[bool, str] = True, +# use_rslora: bool = False, +# use_dora: bool = False, +# lora_bias: bool = False, +# **kwargs, +# ) -> None: +# super().__init__() +# aLoraLayer.__init__(self, base_layer) + +# self._active_adapter = adapter_name +# self._kernel_dim = base_layer.weight.dim() + +# self.update_layer( +# adapter_name, +# r, +# lora_alpha=lora_alpha, +# lora_dropout=lora_dropout, +# init_lora_weights=init_lora_weights, +# use_rslora=use_rslora, +# use_dora=use_dora, +# lora_bias=lora_bias, +# ) + +# def update_layer( +# self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora, lora_bias +# ): +# if r <= 0: +# raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") + +# self.r[adapter_name] = r +# self.lora_alpha[adapter_name] = lora_alpha +# if lora_dropout > 0.0: +# lora_dropout_layer = nn.Dropout(p=lora_dropout) +# else: +# lora_dropout_layer = nn.Identity() + +# self.lora_dropout[adapter_name] = lora_dropout_layer +# # Actual trainable parameters +# base_layer = self.get_base_layer() +# kernel_size = base_layer.kernel_size +# stride = base_layer.stride +# padding = base_layer.padding +# conv_layer = type(base_layer) +# out_kernel = out_stride = (1,) * (self._kernel_dim - 2) +# self.lora_A[adapter_name] = conv_layer(self.in_features, r, kernel_size, stride, padding, bias=False) +# self.lora_B[adapter_name] = conv_layer(r, self.out_features, out_kernel, out_stride, bias=lora_bias) +# self.lora_bias[adapter_name] = lora_bias + +# if use_rslora: +# self.scaling[adapter_name] = lora_alpha / math.sqrt(r) +# else: +# self.scaling[adapter_name] = lora_alpha / r + +# if init_lora_weights == "loftq": +# self.loftq_init(adapter_name) +# elif init_lora_weights: +# self.reset_lora_parameters(adapter_name, init_lora_weights) + +# # call this before dora_init +# self._move_adapter_to_device_of_base_layer(adapter_name) + +# if use_dora: +# self.dora_init(adapter_name) +# self.use_dora[adapter_name] = True +# else: +# self.use_dora[adapter_name] = False + +# self.set_adapter(self.active_adapters) + +# def _get_dora_factor_view(self): +# return (-1,) + (1,) * (self._kernel_dim - 1) + +# def dora_init(self, adapter_name: str) -> None: +# if self.lora_magnitude_vector is None: +# # first dora layer being added, add lora_magnitude_vector to the list of learnable parameters +# self.adapter_layer_names = self.adapter_layer_names[:] + ("lora_magnitude_vector",) + +# dora_layer_class = self._get_dora_layer_class() +# dora_layer = dora_layer_class(fan_in_fan_out=False) +# lora_A = self.lora_A[adapter_name].weight +# lora_B = self.lora_B[adapter_name].weight +# scaling = self.scaling[adapter_name] +# dora_layer.update_layer(base_layer=self.get_base_layer(), lora_A=lora_A, lora_B=lora_B, scaling=scaling) +# self.lora_magnitude_vector[adapter_name] = dora_layer + +# # def _get_dora_layer_class(self) -> type[_DoraConvNdLayer]: +# # # Subclasses should override this method to return the appropriate DoraLayer class +# # raise NotImplementedError + +# def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: +# """ +# Merge the active adapter weights inside the base weights + +# Args: +# safe_merge (`bool`, *optional*): +# If True, the merge operation will be 
performed in a copy of the original weights and check for NaNs +# before merging the weights. This is useful if you want to check if the merge operation will produce +# NaNs. Defaults to `False`. +# adapter_names (`list[str]`, *optional*): +# The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults +# to `None`. +# """ +# adapter_names = check_adapters_to_merge(self, adapter_names) +# if not adapter_names: +# # no adapter to merge +# return + +# for active_adapter in adapter_names: +# if active_adapter in self.lora_A.keys(): +# base_layer = self.get_base_layer() +# if safe_merge: +# # Note that safe_merge will be slower than the normal merge +# # because of the copy operation. +# orig_weights = base_layer.weight.data.clone() +# delta_weight = self.get_delta_weight(active_adapter) + +# if not self.use_dora[active_adapter]: +# orig_weights += delta_weight +# else: +# # handle dora +# # since delta_weight already includes scaling, set it to 1 here +# weight_norm = ( +# self.lora_magnitude_vector[active_adapter] +# .get_weight_norm(orig_weights, delta_weight, scaling=1) +# .detach() +# ) +# # We need to cache weight_norm because it has to be based on the original weights. We +# # cannot calculate it on the fly based on the merged weights when unmerging because its a +# # different value +# self._cache_store(f"{active_adapter}-weight_norm", weight_norm) +# dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm +# orig_weights = dora_factor.view(*self._get_dora_factor_view()) * (orig_weights + delta_weight) + +# if not torch.isfinite(orig_weights).all(): +# raise ValueError( +# f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" +# ) +# base_layer.weight.data = orig_weights + +# if self.lora_bias[active_adapter]: +# new_bias = base_layer.bias + self.lora_B[active_adapter].bias +# if not torch.isfinite(new_bias).all(): +# raise ValueError( +# f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" +# ) +# base_layer.bias.data = new_bias + +# else: +# delta_weight = self.get_delta_weight(active_adapter) +# if not self.use_dora[active_adapter]: +# base_layer.weight.data += delta_weight +# else: +# # handle dora +# # since delta_weight already includes scaling, set it to 1 here +# weight_norm = ( +# self.lora_magnitude_vector[active_adapter] +# .get_weight_norm(base_layer.weight, delta_weight, scaling=1) +# .detach() +# ) +# # We need to cache weight_norm because it has to be based on the original weights. We +# # cannot calculate it on the fly based on the merged weights when unmerging because its a +# # different value +# self._cache_store(f"{active_adapter}-weight_norm", weight_norm) +# dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm +# new_weight = dora_factor.view(*self._get_dora_factor_view()) * ( +# base_layer.weight.data + delta_weight +# ) +# base_layer.weight.data = new_weight + +# if self.lora_bias[active_adapter]: +# base_layer.bias.data += self.lora_B[active_adapter].bias + +# self.merged_adapters.append(active_adapter) + +# def unmerge(self) -> None: +# """ +# This method unmerges all merged adapter layers from the base weights. +# """ +# if not self.merged: +# warnings.warn("Already unmerged. 
Nothing to do.") +# return +# while len(self.merged_adapters) > 0: +# active_adapter = self.merged_adapters.pop() +# if active_adapter in self.lora_A.keys(): +# weight = self.get_base_layer().weight +# delta_weight = self.get_delta_weight(active_adapter) +# if not self.use_dora[active_adapter]: +# weight.data -= delta_weight +# else: +# weight_norm = self._cache_pop(f"{active_adapter}-weight_norm") +# dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm +# weight_orig = weight.data / dora_factor.view(*self._get_dora_factor_view()) - delta_weight +# weight.data = weight_orig + +# if self.lora_bias[active_adapter]: +# self.get_base_layer().bias.data -= self.lora_B[active_adapter].bias + +# def get_delta_weight(self, adapter) -> torch.Tensor: +# """ +# Compute the delta weight for the given adapter. + +# Args: +# adapter (str): +# The name of the adapter for which the delta weight should be computed. +# """ +# device = self.lora_B[adapter].weight.device +# dtype = self.lora_A[adapter].weight.dtype + +# # In case users wants to merge the adapter weights that are in +# # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to +# # (b)float16 because some CPUs have slow bf16/fp16 matmuls. +# cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16) + +# weight_A = self.lora_A[adapter].weight +# weight_B = self.lora_B[adapter].weight + +# if cast_to_fp32: +# weight_A = weight_A.float() +# weight_B = weight_B.float() + +# # https://github.com/bmaltais/kohya_ss/blob/feb6728762a8f463d15ba936d189d4c3abfaa1ab/networks/lora.py#L117 +# if self.get_base_layer().weight.size()[2:4] == (1, 1): +# # conv2d 1x1 +# output_tensor = (weight_B.squeeze(3).squeeze(2) @ weight_A.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze( +# 3 +# ) * self.scaling[adapter] +# else: +# output_tensor = ( +# self.conv_fn( +# weight_A.transpose(0, 1), +# weight_B, +# ).transpose(0, 1) +# * self.scaling[adapter] +# ) + +# if cast_to_fp32: +# output_tensor = output_tensor.to(dtype=dtype) + +# # cast back the weights +# self.lora_A[adapter].weight.data = weight_A.to(dtype) +# self.lora_B[adapter].weight.data = weight_B.to(dtype) + +# return output_tensor + +# def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: +# self._check_forward_args(x, *args, **kwargs) +# adapter_names = kwargs.pop("adapter_names", None) + +# if self.disable_adapters: +# if self.merged: +# self.unmerge() +# result = self.base_layer(x, *args, **kwargs) +# elif adapter_names is not None: +# result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs) +# elif self.merged: +# result = self.base_layer(x, *args, **kwargs) +# else: +# result = self.base_layer(x, *args, **kwargs) +# torch_result_dtype = result.dtype + +# for active_adapter in self.active_adapters: +# if active_adapter not in self.lora_A.keys(): +# continue +# lora_A = self.lora_A[active_adapter] +# lora_B = self.lora_B[active_adapter] +# dropout = self.lora_dropout[active_adapter] +# scaling = self.scaling[active_adapter] +# x = x.to(lora_A.weight.dtype) + +# if not self.use_dora[active_adapter]: +# result = result + lora_B(lora_A(dropout(x))) * scaling +# else: +# x = dropout(x) +# result = result + self.lora_magnitude_vector[active_adapter]( +# x, +# lora_A=lora_A, +# lora_B=lora_B, +# scaling=scaling, +# base_layer=self.get_base_layer(), +# ) + +# result = result.to(torch_result_dtype) +# return result + +# def __repr__(self) -> str: +# rep = 
super().__repr__()
+#         return "lora." + rep
+
+
+# class Conv2d(_ConvNd):
+#     # Lora implemented in a conv2d layer
+#     def __init__(self, *args, **kwargs):
+#         super().__init__(*args, **kwargs)
+#         if not self._kernel_dim == 4:
+#             raise ValueError(f"Conv2d layer kernel must have 4 dimensions, not {self._kernel_dim}")
+#         self.conv_fn = F.conv2d
+
+#     def _get_dora_layer_class(self):
+#         return 0#DoraConv2dLayer
+
+
+# class Conv3d(_ConvNd):
+#     # Lora implemented in a conv3d layer
+#     def __init__(self, *args, **kwargs):
+#         super().__init__(*args, **kwargs)
+#         if not self._kernel_dim == 5:
+#             raise ValueError(f"Conv3d layer kernel must have 5 dimensions, not {self._kernel_dim}")
+#         self.conv_fn = F.conv3d
+
+#     def _get_dora_layer_class(self):
+#         return 0#DoraConv3dLayer
+
+
+def dispatch_default(
+    target: torch.nn.Module,
+    adapter_name: str,
+    lora_config: aLoraConfig,
+    **kwargs,
+) -> Optional[torch.nn.Module]:
+    new_module = None
+
+    if isinstance(target, BaseTunerLayer):
+        target_base_layer = target.get_base_layer()
+    else:
+        target_base_layer = target
+
+    if isinstance(target_base_layer, (torch.nn.Embedding, torch.nn.Conv2d, torch.nn.Conv3d)):
+        # The aLoRA Embedding and Conv layer classes above are intentionally disabled ("not supported"), so
+        # fail with a clear error here rather than referencing classes that are not defined in this module.
+        raise ValueError(
+            f"aLoRA does not support adapting {type(target_base_layer).__name__} layers. "
+            "Please restrict `target_modules` to `torch.nn.Linear` (or `Conv1D`) modules."
+        )
+    elif isinstance(target_base_layer, torch.nn.Linear):
+        if kwargs["fan_in_fan_out"]:
+            warnings.warn(
+                "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. "
+                "Setting fan_in_fan_out to False."
+            )
+            kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False
+        kwargs.update(lora_config.loftq_config)
+        new_module = Linear(target, adapter_name, **kwargs)
+    elif isinstance(target_base_layer, Conv1D):
+        if not kwargs["fan_in_fan_out"]:
+            warnings.warn(
+                "fan_in_fan_out is set to False but the target module is `Conv1D`. "
+                "Setting fan_in_fan_out to True."
+ ) + kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True + kwargs.update(lora_config.loftq_config) + new_module = Linear(target, adapter_name, is_target_conv_1d_layer=True, **kwargs) + + return new_module + diff --git a/src/peft/tuners/alora/model.py b/src/peft/tuners/alora/model.py new file mode 100644 index 0000000000..4795756bda --- /dev/null +++ b/src/peft/tuners/alora/model.py @@ -0,0 +1,954 @@ + +from __future__ import annotations + +import math +import operator +import warnings +from contextlib import contextmanager +from dataclasses import asdict, replace +from enum import Enum +from functools import partial, reduce +from typing import Literal, Optional + +import torch +from torch import nn +from tqdm import tqdm + +from peft.import_utils import is_bnb_4bit_available, is_bnb_available +from peft.tuners.tuners_utils import ( + BaseTuner, + BaseTunerLayer, + check_target_module_exists, + onload_layer, + replicate_layers, +) +from peft.utils import ( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, + ModulesToSaveWrapper, + _freeze_adapter, + _get_submodules, + get_peft_model_state_dict, + get_quantization_config, +) +from peft.utils.merge_utils import dare_linear, dare_ties, magnitude_prune, task_arithmetic, ties +from peft.utils.other import get_pattern_key + +#from peft.aqlm import dispatch_aqlm +#from peft.awq import dispatch_awq +from .config import aLoraConfig +#from peft.eetq import dispatch_eetq +#from peft.gptq import dispatch_gptq +#from peft.hqq import dispatch_hqq +from .layer import aLoraLayer, dispatch_default +#from peft.torchao import dispatch_torchao +#from peft.tp_layer import dispatch_megatron + + +def _adapter_names_pre_forward_hook(target, args, kwargs, adapter_names,alora_offsets): + # pre-forward hook to inject the adapter_names argument when using mixed adapter batches inference + kwargs["adapter_names"] = adapter_names + if alora_offsets is not None: + kwargs["alora_offsets"] = alora_offsets + return args, kwargs + +def _alora_offsets_pre_forward_hook(target, args, kwargs, alora_offsets): + kwargs["alora_offsets"] = alora_offsets + return args, kwargs + +class aLoraModel(BaseTuner): + """ + Creates Low Rank Adapter (LoRA) model from a pretrained transformers model. + + The method is described in detail in https://arxiv.org/abs/2106.09685. + + Args: + model ([`torch.nn.Module`]): The model to be adapted. + config ([`LoraConfig`]): The configuration of the Lora model. + adapter_name (`str`): The name of the adapter, defaults to `"default"`. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the loading process. + + Returns: + `torch.nn.Module`: The Lora model. + + Example: + + ```py + >>> from transformers import AutoModelForSeq2SeqLM + >>> from peft import LoraModel, LoraConfig + + >>> config = LoraConfig( + ... task_type="SEQ_2_SEQ_LM", + ... r=8, + ... lora_alpha=32, + ... target_modules=["q", "v"], + ... lora_dropout=0.01, + ... ) + + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> lora_model = LoraModel(model, config, "default") + ``` + + ```py + >>> import torch + >>> import transformers + >>> from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training + + >>> rank = ... + >>> target_modules = ["q_proj", "k_proj", "v_proj", "out_proj", "fc_in", "fc_out", "wte"] + >>> config = LoraConfig( + ... r=4, lora_alpha=16, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM" + ... 
) + >>> quantization_config = transformers.BitsAndBytesConfig(load_in_8bit=True) + + >>> tokenizer = transformers.AutoTokenizer.from_pretrained( + ... "kakaobrain/kogpt", + ... revision="KoGPT6B-ryan1.5b-float16", # or float32 version: revision=KoGPT6B-ryan1.5b + ... bos_token="[BOS]", + ... eos_token="[EOS]", + ... unk_token="[UNK]", + ... pad_token="[PAD]", + ... mask_token="[MASK]", + ... ) + >>> model = transformers.GPTJForCausalLM.from_pretrained( + ... "kakaobrain/kogpt", + ... revision="KoGPT6B-ryan1.5b-float16", # or float32 version: revision=KoGPT6B-ryan1.5b + ... pad_token_id=tokenizer.eos_token_id, + ... use_cache=False, + ... device_map={"": rank}, + ... torch_dtype=torch.float16, + ... quantization_config=quantization_config, + ... ) + >>> model = prepare_model_for_kbit_training(model) + >>> lora_model = get_peft_model(model, config) + ``` + + **Attributes**: + - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted. + - **peft_config** ([`LoraConfig`]): The configuration of the Lora model. + """ + + prefix: str = "lora_" + + def __init__(self, model, config, adapter_name, low_cpu_mem_usage: bool = False) -> None: + super().__init__(model, config, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage) + + def _check_new_adapter_config(self, config: aLoraConfig) -> None: + """ + A helper method to check the config when a new adapter is being added. + + Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. + + """ + # TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check + # does not fully correspond to the error message. + if (len(self.peft_config) > 1) and (config.bias != "none"): + raise ValueError( + f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, " + "set bias to 'none' for all adapters." + ) + + @staticmethod + def _check_target_module_exists(lora_config, key): + return check_target_module_exists(lora_config, key) + + def _prepare_model(self, peft_config: aLoraConfig, model: nn.Module): + r""" + A private method to modify the model structure before adapter is applied. + + Args: + peft_config (`PeftConfig`): + The prepared adapter config. + model (`nn.Module`): + The model that is going to be adapted. 
+ """ + if peft_config.layer_replication: + replicate_layers(model, peft_config.layer_replication) + + def _create_and_replace( + self, + lora_config, + adapter_name, + target, + target_name, + parent, + current_key, + ): + if current_key is None: + raise ValueError("Current Key shouldn't be `None`") + + # Regexp matching - Find key which matches current target_name in patterns provided + r_key = get_pattern_key(lora_config.rank_pattern.keys(), current_key) + alpha_key = get_pattern_key(lora_config.alpha_pattern.keys(), current_key) + r = lora_config.rank_pattern.get(r_key, lora_config.r) + alpha = lora_config.alpha_pattern.get(alpha_key, lora_config.lora_alpha) + + kwargs = { + "r": r, + "lora_alpha": alpha, + "lora_dropout": lora_config.lora_dropout, + "fan_in_fan_out": lora_config.fan_in_fan_out, + "init_lora_weights": lora_config.init_lora_weights, + "use_rslora": lora_config.use_rslora, + "use_dora": lora_config.use_dora, + "ephemeral_gpu_offload": lora_config.runtime_config.ephemeral_gpu_offload, + "lora_bias": lora_config.lora_bias, + "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), + "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), + } + # for torchao merging, we need the get_apply_tensor_subclass from the quantization config + try: + kwargs["get_apply_tensor_subclass"] = operator.attrgetter( + "hf_quantizer.quantization_config.get_apply_tensor_subclass" + )(self.model) + except AttributeError: + pass + + quant_methods = ["gptq", "aqlm", "awq"] + for quant_method in quant_methods: + quantization_config = get_quantization_config(self.model, method=quant_method) + if quantization_config is not None: + kwargs[f"{quant_method}_quantization_config"] = quantization_config + + # note: AdaLoraLayer is a subclass of LoraLayer, we need to exclude it + from peft.tuners.adalora import AdaLoraLayer + + if isinstance(target, aLoraLayer) and not isinstance(target, AdaLoraLayer): + target.update_layer( + adapter_name, + r, + lora_alpha=alpha, + lora_dropout=lora_config.lora_dropout, + init_lora_weights=lora_config.init_lora_weights, + use_rslora=lora_config.use_rslora, + use_dora=lora_config.use_dora, + lora_bias=lora_config.lora_bias, + ) + else: + new_module = self._create_new_module(lora_config, adapter_name, target, **kwargs) + if adapter_name not in self.active_adapters: + # adding an additional adapter: it is not automatically trainable + new_module.requires_grad_(False) + self._replace_module(parent, target_name, new_module, target) + + def _replace_module(self, parent, child_name, new_module, child): + setattr(parent, child_name, new_module) + # It's not necessary to set requires_grad here, as that is handled by + # _mark_only_adapters_as_trainable + + # child layer wraps the original module, unpack it + if hasattr(child, "base_layer"): + child = child.base_layer + + if not hasattr(new_module, "base_layer"): + if hasattr(new_module, "W_q"): # HQQ + new_module.W_q = child.W_q + else: + new_module.weight = child.weight + if hasattr(child, "bias"): + new_module.bias = child.bias + + if getattr(child, "state", None) is not None: + if hasattr(new_module, "base_layer"): + new_module.base_layer.state = child.state + else: + new_module.state = child.state + new_module.to(child.weight.device) + + meta = torch.device("meta") + # dispatch to correct device + for name, module in new_module.named_modules(): + if (self.prefix in name) or ("ranknum" in name): + weight = ( + child.qweight + if hasattr(child, "qweight") + else child.W_q + if hasattr(child, "W_q") + else 
child.weight + if hasattr(child, "weight") + else next(child.parameters()) + ) + if not any(p.device == meta for p in module.parameters()): + module.to(weight.device) + + def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: + for n, p in model.named_parameters(): + if self.prefix not in n: + p.requires_grad = False + + for active_adapter in self.active_adapters: + bias = self.peft_config[active_adapter].bias + if bias == "none": + continue + + if bias == "all": + for n, p in model.named_parameters(): + if "bias" in n: + p.requires_grad = True + elif bias == "lora_only": + for m in model.modules(): + if isinstance(m, aLoraLayer) and hasattr(m, "bias") and m.bias is not None: + m.bias.requires_grad = True + else: + raise NotImplementedError(f"Requested bias: {bias}, is not implemented.") + + @staticmethod + def _create_new_module(lora_config, adapter_name, target, **kwargs): + # Collect dispatcher functions to decide what backend to use for the replaced LoRA layer. The order matters, + # because the first match is always used. Therefore, the default layers should be checked last. + dispatchers = [] + + if lora_config._custom_modules: + # Experimental custom LoRA module support. Allows users to pass a custom mapping for unsupported layer + # types by impelementing their own LoRA layers. + def dynamic_dispatch_func(target, adapter_name, lora_config, **kwargs): + new_module = None + + if isinstance(target, BaseTunerLayer): + target_base_layer = target.get_base_layer() + else: + target_base_layer = target + + for key, custom_cls in lora_config._custom_modules.items(): + if isinstance(target_base_layer, key): + new_module = custom_cls(target, adapter_name, **kwargs) + break + + return new_module + + dispatchers.append(dynamic_dispatch_func) + + # avoid eager bnb import + if is_bnb_available(): + from peft.tuners.lora.bnb import dispatch_bnb_8bit + + dispatchers.append(dispatch_bnb_8bit) + + if is_bnb_4bit_available(): + from peft.tuners.lora.bnb import dispatch_bnb_4bit + + dispatchers.append(dispatch_bnb_4bit) + + dispatchers.extend( + [ + # dispatch_eetq, + # dispatch_aqlm, +# dispatch_awq, + # dispatch_gptq, + # dispatch_hqq, + # dispatch_torchao, + # dispatch_megatron, + dispatch_default, + ] + ) + + new_module = None + for dispatcher in dispatchers: + new_module = dispatcher(target, adapter_name, lora_config=lora_config, **kwargs) + if new_module is not None: # first match wins + break + + if new_module is None: + # no module could be matched + raise ValueError( + f"Target module {target} is not supported. 
Currently, only the following modules are supported: " + "`torch.nn.Linear`" + ) + + return new_module + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + if name == "model": # see #1892: prevent infinite recursion if class is not initialized + raise + return getattr(self.model, name) + + def get_peft_config_as_dict(self, inference: bool = False): + config_dict = {} + for key, value in self.peft_config.items(): + config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} + if inference: + config["inference_mode"] = True + config_dict[key] = config + return config + + def _set_adapter_layers(self, enabled: bool = True) -> None: + for module in self.model.modules(): + if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): + module.enable_adapters(enabled) + + def enable_adapter_layers(self) -> None: + """Enable all adapters. + + Call this if you have previously disabled all adapters and want to re-enable them. + """ + self._set_adapter_layers(enabled=True) + + def disable_adapter_layers(self) -> None: + """Disable all adapters. + + When disabling all adapters, the model output corresponds to the output of the base model. + """ + for active_adapter in self.active_adapters: + val = self.peft_config[active_adapter].bias + if val != "none": + msg = ( + f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same " + "output as the the base model would without adaption." + ) + warnings.warn(msg) + self._set_adapter_layers(enabled=False) + + def set_adapter(self, adapter_name: str | list[str]) -> None: + """Set the active adapter(s). + + Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str` or `list[str]`): Name of the adapter(s) to be activated. + """ + for module in self.model.modules(): + if isinstance(module, aLoraLayer): + if module.merged: + warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") + module.unmerge() + module.set_adapter(adapter_name) + self.active_adapter = adapter_name + + @contextmanager + def _enable_peft_forward_hooks(self, *args, **kwargs): + # If adapter_names is passed as an argument, we inject it into the forward arguments. + adapter_names = kwargs.pop("adapter_names", None) + alora_offsets = kwargs.pop("alora_offsets", None) + if adapter_names is None and alora_offsets is None: + # nothing to do + yield + return + if adapter_names is None: + hook_handles = [] + for module in self.modules(): + if isinstance(module, aLoraLayer): + pre_forward = partial(_alora_offsets_pre_forward_hook, alora_offsets = alora_offsets) + handle = module.register_forward_pre_hook(pre_forward, with_kwargs=True) + hook_handles.append(handle) + yield + + for handle in hook_handles: + handle.remove() + + return + + + + + + + ################################ + #if ks is not None: + # raise ValueError("Multiple adapters not supported with alora yet") + if self.training: + raise ValueError("Cannot pass `adapter_names` when the model is in training mode.") + + # Check that users only passed actually existing adapters. 
+ # Note: We cannot do this on the layer level, as each individual layer may not have each adapter. Still, we want + # to check that there is at least one layer with the given name, or else something like typos can easily slip. + expected_adapters = set() + for layer in self.modules(): + if isinstance(layer, aLoraLayer): + expected_adapters |= layer.lora_A.keys() + expected_adapters |= layer.lora_embedding_A.keys() + unique_adapters = {name for name in adapter_names if name != "__base__"} + unexpected_adapters = unique_adapters - expected_adapters + if unexpected_adapters: + raise ValueError(f"Trying to infer with non-existing adapter(s): {', '.join(sorted(unexpected_adapters))}") + + hook_handles = [] + for module in self.modules(): + if isinstance(module, aLoraLayer) or isinstance(module, ModulesToSaveWrapper): + pre_forward = partial(_adapter_names_pre_forward_hook, adapter_names=adapter_names, alora_offsets = alora_offsets) + handle = module.register_forward_pre_hook(pre_forward, with_kwargs=True) + hook_handles.append(handle) + + yield + + for handle in hook_handles: + handle.remove() + + def _check_merge_allowed(self): + """Verify that the configuration supports merging. + + Currently gptq quantization and replicated layers do not support merging. + """ + super()._check_merge_allowed() + if getattr(self.model, "quantization_method", None) == "gptq": + raise ValueError("Cannot merge LORA layers when the model is gptq quantized") + if self.peft_config.get("layer_replication"): + raise ValueError("Cannot merge LORA layers when base model layers are replicated") + + @staticmethod + def _prepare_adapter_config(peft_config, model_config): + if peft_config.target_modules is None: + if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING: + raise ValueError("Please specify `target_modules` in `peft_config`") + peft_config.target_modules = set( + TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]] + ) + return peft_config + + def _unload_and_optionally_merge( + self, + merge=True, + progressbar: bool = False, + safe_merge: bool = False, + adapter_names: Optional[list[str]] = None, + ): + if merge: + self._check_merge_allowed() + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + desc = "Unloading " + ("and merging " if merge else "") + "model" + for key in tqdm(key_list, disable=not progressbar, desc=desc): + try: + parent, target, target_name = _get_submodules(self.model, key) + except AttributeError: + continue + with onload_layer(target): + if hasattr(target, "base_layer"): + if merge: + target.merge(safe_merge=safe_merge, adapter_names=adapter_names) + self._replace_module(parent, target_name, target.get_base_layer(), target) + elif isinstance(target, ModulesToSaveWrapper): + # save any additional trainable modules part of `modules_to_save` + new_module = target.modules_to_save[target.active_adapter] + if hasattr(new_module, "base_layer"): + # check if the module is itself a tuner layer + if merge: + new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names) + new_module = new_module.get_base_layer() + setattr(parent, target_name, new_module) + + return self.model + + def _check_add_weighted_adapter( + self, adapters: list[str], combination_type: str, svd_rank: int | None + ) -> tuple[str, int, str]: + """ + Helper function to check if the arguments to add_weighted_adapter are valid and compatible with the underlying + model. 
+ """ + for adapter in adapters: + if adapter not in list(self.peft_config.keys()): + raise ValueError(f"Adapter {adapter} does not exist") + + # If more than one of the adapters targets the same module with modules_to_save, raise an error, as these + # modules cannot be merged. First, find the ModulesToSaveWrapper instances in the model, then check if they + # have modules for the adapters to be merged. + modules_to_save_wrappers = [module for module in self.modules() if isinstance(module, ModulesToSaveWrapper)] + problematic_wrappers = [ + wrapper + for wrapper in modules_to_save_wrappers + if sum(adapter in wrapper.modules_to_save for adapter in adapters) > 1 + ] + if problematic_wrappers: + raise ValueError( + "Cannot add weighted adapters if they target the same module with modules_to_save, but found " + f"{len(problematic_wrappers)} such instance(s)." + ) + + # if there is only one adapter, we can only use linear merging + combination_type = "linear" if len(adapters) == 1 else combination_type + + adapters_ranks = [self.peft_config[adapter].r for adapter in adapters] + if combination_type in ("linear", "ties", "dare_ties", "dare_linear", "magnitude_prune"): + # all adapters ranks should be same, new rank is just this value + if len(set(adapters_ranks)) != 1: + raise ValueError( + "All adapters must have the same r value when using combination_type linear, ties, dare_ties or " + "dare_linear." + ) + new_rank = adapters_ranks[0] + elif combination_type == "cat": + # adapters ranks may be different, new rank is sum of all ranks + # be careful, because output adapter rank may be really big if mixing a lot of adapters + new_rank = sum(adapters_ranks) + elif combination_type.endswith("svd"): + # new rank is the max of all ranks of the adapters if not provided + new_rank = svd_rank or max(adapters_ranks) + else: + raise ValueError(f"Invalid combination_type: {combination_type}") + + target_module_types = [type(self.peft_config[adapter].target_modules) for adapter in adapters] + if not target_module_types: + raise ValueError(f"Found no adapter matching the names in {adapters}") + if len(set(target_module_types)) > 1: + raise ValueError( + "all adapter configs should follow the same target modules type. " + "Combining adapters with `target_modules` type being a mix of list/set and string is not supported." + ) + + if target_module_types[0] is str: + new_target_modules = "|".join(f"({self.peft_config[adapter].target_modules})" for adapter in adapters) + elif target_module_types[0] is set: + new_target_modules = reduce( + operator.or_, (self.peft_config[adapter].target_modules for adapter in adapters) + ) + else: + raise TypeError(f"Invalid type {target_module_types[0]} found in target_modules") + + return combination_type, new_rank, new_target_modules + + def add_weighted_adapter( + self, + adapters: list[str], + weights: list[float], + adapter_name: str, + combination_type: str = "svd", + svd_rank: int | None = None, + svd_clamp: int | None = None, + svd_full_matrices: bool = True, + svd_driver: str | None = None, + density: float | None = None, + majority_sign_method: Literal["total", "frequency"] = "total", + ) -> None: + """ + This method adds a new adapter by merging the given adapters with the given weights. + + When using the `cat` combination_type you should be aware that rank of the resulting adapter will be equal to + the sum of all adapters ranks. So it's possible that the mixed adapter may become too big and result in OOM + errors. 
+ + Args: + adapters (`list`): + List of adapter names to be merged. + weights (`list`): + List of weights for each adapter. + adapter_name (`str`): + Name of the new adapter. + combination_type (`str`): + The merging type can be one of [`svd`, `linear`, `cat`, `ties`, `ties_svd`, `dare_ties`, `dare_linear`, + `dare_ties_svd`, `dare_linear_svd`, `magnitude_prune`, `magnitude_prune_svd`]. When using the `cat` + combination_type, the rank of the resulting adapter is equal to the sum of all adapters ranks (the + mixed adapter may be too big and result in OOM errors). + svd_rank (`int`, *optional*): + Rank of output adapter for svd. If None provided, will use max rank of merging adapters. + svd_clamp (`float`, *optional*): + A quantile threshold for clamping SVD decomposition output. If None is provided, do not perform + clamping. Defaults to None. + svd_full_matrices (`bool`, *optional*): + Controls whether to compute the full or reduced SVD, and consequently, the shape of the returned + tensors U and Vh. Defaults to True. + svd_driver (`str`, *optional*): + Name of the cuSOLVER method to be used. This keyword argument only works when merging on CUDA. Can be + one of [None, `gesvd`, `gesvdj`, `gesvda`]. For more info please refer to `torch.linalg.svd` + documentation. Defaults to None. + density (`float`, *optional*): + Value between 0 and 1. 0 means all values are pruned and 1 means no values are pruned. Should be used + with [`ties`, `ties_svd`, `dare_ties`, `dare_linear`, `dare_ties_svd`, `dare_linear_svd`, + `magnintude_prune`, `magnitude_prune_svd`] + majority_sign_method (`str`): + The method, should be one of ["total", "frequency"], to use to get the magnitude of the sign values. + Should be used with [`ties`, `ties_svd`, `dare_ties`, `dare_ties_svd`] + """ + + if adapter_name in list(self.peft_config.keys()): + return + + combination_type, new_rank, new_target_modules = self._check_add_weighted_adapter( + adapters=adapters, + combination_type=combination_type, + svd_rank=svd_rank, + ) + + self.peft_config[adapter_name] = replace( + self.peft_config[adapters[0]], + r=new_rank, + lora_alpha=new_rank, + target_modules=new_target_modules, + ) + self.inject_adapter(self.model, adapter_name) + + # Do we really need that? + _freeze_adapter(self.model, adapter_name) + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, aLoraLayer): + if adapter_name in target.lora_A: + target_lora_A = target.lora_A[adapter_name].weight + target_lora_B = target.lora_B[adapter_name].weight + elif adapter_name in target.lora_embedding_A: + target_lora_A = target.lora_embedding_A[adapter_name] + target_lora_B = target.lora_embedding_B[adapter_name] + else: + continue + + target_lora_A.data = target_lora_A.data * 0.0 + target_lora_B.data = target_lora_B.data * 0.0 + if combination_type == "cat": + loras_A, loras_B = [], [] + for adapter, weight in zip(adapters, weights): + if adapter in target.lora_A: + current_adapter_lora_A = target.lora_A[adapter].weight + current_adapter_lora_B = target.lora_B[adapter].weight + elif adapter in target.lora_embedding_A: + current_adapter_lora_A = target.lora_embedding_A[adapter] + current_adapter_lora_B = target.lora_embedding_B[adapter] + else: + continue + loras_A.append(current_adapter_lora_A.data * weight * target.scaling[adapter]) + loras_B.append(current_adapter_lora_B.data) + + if len(loras_A) == 0: + raise ValueError("No matching LoRAs found. 
Please raise an issue on GitHub.") + loras_A = torch.cat(loras_A, dim=0) + loras_B = torch.cat(loras_B, dim=1) + target_lora_A.data[: loras_A.shape[0], :] = loras_A + target_lora_B.data[:, : loras_B.shape[1]] = loras_B + elif combination_type in [ + "svd", + "ties_svd", + "dare_linear_svd", + "dare_ties_svd", + "magnitude_prune_svd", + ]: + target_lora_A.data, target_lora_B.data = self._svd_generalized_task_arithmetic_weighted_adapter( + combination_type, + adapters, + weights, + new_rank, + target, + target_lora_A, + target_lora_B, + density, + majority_sign_method, + svd_clamp, + full_matrices=svd_full_matrices, + driver=svd_driver, + ) + elif combination_type in ["linear", "ties", "dare_linear", "dare_ties", "magnitude_prune"]: + target_lora_A.data, target_lora_B.data = self._generalized_task_arithmetic_weighted_adapter( + combination_type, adapters, weights, target, density, majority_sign_method + ) + + def _svd_generalized_task_arithmetic_weighted_adapter( + self, + combination_type, + adapters, + weights, + new_rank, + target, + target_lora_A, + target_lora_B, + density, + majority_sign_method, + clamp=None, + full_matrices=True, + driver=None, + ): + valid_adapters = [] + valid_weights = [] + is_embedding = any(adapter in target.lora_embedding_A for adapter in adapters) + for adapter, weight in zip(adapters, weights): + if adapter in target.lora_A or adapter in target.lora_embedding_A: + valid_adapters.append(adapter) + valid_weights.append(weight * target.scaling[adapter]) + + # if no valid adapter, nothing to do + if len(valid_adapters) == 0: + raise ValueError("No matching LoRAs found. Please raise an issue on Github.") + delta_weight = [target.get_delta_weight(adapter) for adapter in valid_adapters] + valid_weights = torch.tensor(valid_weights).to(delta_weight[0].device) + if combination_type == "svd": + delta_weight = task_arithmetic(delta_weight, valid_weights) + elif combination_type == "ties_svd": + delta_weight = ties(delta_weight, valid_weights, density, majority_sign_method) + elif combination_type == "dare_linear_svd": + delta_weight = dare_linear(delta_weight, valid_weights, density) + elif combination_type == "dare_ties_svd": + delta_weight = dare_ties(delta_weight, valid_weights, density, majority_sign_method) + elif combination_type == "magnitude_prune_svd": + delta_weight = magnitude_prune(delta_weight, valid_weights, density) + else: + raise ValueError(f"Invalid value passed to combination type: {combination_type}") + + conv2d = False #isinstance(target, Conv2d) + if conv2d: + conv2d_1x1 = target.weight.size()[2:4] == (1, 1) + if not conv2d_1x1: + delta_weight = delta_weight.flatten(start_dim=1) + else: + delta_weight = delta_weight.squeeze() + if (hasattr(target, "fan_in_fan_out") and target.fan_in_fan_out) or is_embedding: + delta_weight = delta_weight.T + + # based on https://github.com/kohya-ss/sd-scripts/blob/main/networks/svd_merge_lora.py#L114-L131 + U, S, Vh = torch.linalg.svd(delta_weight, full_matrices=full_matrices, driver=driver) + U = U[:, :new_rank] + S = S[:new_rank] + U = U @ torch.diag(S) + Vh = Vh[:new_rank, :] + if clamp is not None: + dist = torch.cat([U.flatten(), Vh.flatten()]) + hi_val = torch.quantile(dist, clamp) + low_val = -hi_val + U = U.clamp(low_val, hi_val) + Vh = Vh.clamp(low_val, hi_val) + if conv2d: + U = U.reshape(target_lora_B.data.shape) + Vh = Vh.reshape(target_lora_A.data.shape) + return Vh, U + + def _generalized_task_arithmetic_weighted_adapter( + self, + combination_type, + adapters, + weights, + target, + density, + 
majority_sign_method, + ): + # account weights for LoRA A and B layers. + valid_weights = [] + lora_A_deltas = [] + lora_B_deltas = [] + for adapter, weight in zip(adapters, weights): + if adapter in target.lora_A: + current_adapter_lora_A = target.lora_A[adapter].weight + current_adapter_lora_B = target.lora_B[adapter].weight + elif adapter in target.lora_embedding_A: + current_adapter_lora_A = target.lora_embedding_A[adapter] + current_adapter_lora_B = target.lora_embedding_B[adapter] + else: + continue + valid_weights.append(math.sqrt(weight * target.scaling[adapter])) + lora_A_deltas.append(current_adapter_lora_A.data) + lora_B_deltas.append(current_adapter_lora_B.data) + valid_weights = torch.tensor(valid_weights).to(lora_A_deltas[0].device) + lora_deltas = [lora_A_deltas, lora_B_deltas] + dtype = lora_A_deltas[0].dtype + for i, task_tensors in enumerate(lora_deltas): + if combination_type == "linear": + lora_deltas[i] = task_arithmetic(task_tensors, valid_weights) + elif combination_type == "ties": + lora_deltas[i] = ties(task_tensors, valid_weights, density, majority_sign_method) + elif combination_type == "dare_linear": + lora_deltas[i] = dare_linear(task_tensors, valid_weights, density) + elif combination_type == "dare_ties": + lora_deltas[i] = dare_ties(task_tensors, valid_weights, density, majority_sign_method) + elif combination_type == "magnitude_prune": + lora_deltas[i] = magnitude_prune(task_tensors, valid_weights, density) + else: + raise ValueError("Invalid combination type") + lora_deltas = [delta.to(dtype) for delta in lora_deltas] + return lora_deltas + + def delete_adapter(self, adapter_name: str) -> None: + """ + Deletes an existing adapter. + + Args: + adapter_name (str): Name of the adapter to be deleted. + """ + if adapter_name not in list(self.peft_config.keys()): + raise ValueError(f"Adapter {adapter_name} does not exist") + del self.peft_config[adapter_name] + + key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] + new_adapter = None + for key in key_list: + _, target, _ = _get_submodules(self.model, key) + if isinstance(target, aLoraLayer): + target.delete_adapter(adapter_name) + if new_adapter is None: + new_adapter = target.active_adapters[:] + + self.active_adapter = new_adapter or [] + + def merge_and_unload( + self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None + ) -> torch.nn.Module: + r""" + This method merges the LoRa layers into the base model. This is needed if someone wants to use the base model + as a standalone model. + + Args: + progressbar (`bool`): + whether to show a progressbar indicating the unload and merge process + safe_merge (`bool`): + whether to activate the safe merging check to check if there is any potential Nan in the adapter + weights + adapter_names (`List[str]`, *optional*): + The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults + to `None`. 
+ Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import PeftModel + + >>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b") + >>> peft_model_id = "smangrul/falcon-40B-int4-peft-lora-sfttrainer-sample" + >>> model = PeftModel.from_pretrained(base_model, peft_model_id) + >>> merged_model = model.merge_and_unload() + ``` + """ + return self._unload_and_optionally_merge( + progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names + ) + + def unload(self) -> torch.nn.Module: + """ + Gets back the base model by removing all the lora modules without merging. This gives back the original base + model. + """ + return self._unload_and_optionally_merge(merge=False) + + def subtract_mutated_init(self, output_state_dict: dict[str, torch.Tensor], adapter_name: str, kwargs=None): + """ + This function can calculate the updates of the [PiSSA | OLoRA] by comparing the parameters of the [PiSSA | + OLoRA] adapter in `output_state_dict` with the initial values of [PiSSA | OLoRA] in `adapter_name`, thus + converting [PiSSA | OLoRA] to LoRA. + """ + for name, param in self.model.named_parameters(): + if ( + param.data.dtype != torch.float32 + and param.data.dtype != torch.float16 + and param.data.dtype != torch.bfloat16 + ) and adapter_name.startswith("pissa"): + warnings.warn( + r"Note that Quant(W_res) + AB != Quant(W) + \Delta(AB); " + "the converted LoRA, when combined with W or Quant(W), may introduce a certain gap in the fine-tuned model. " + "Therefore, we recommend directly using the Quant(W_res) in conjunction with the PiSSA adapter. " + ) + mutated_init_state_dict = get_peft_model_state_dict( + self, + state_dict=kwargs.get("state_dict", None), + adapter_name=adapter_name, + ) + tensors_lora = {} + for name in output_state_dict.keys(): + ## W = W^{res} + A_0 \times B_0, + ## W + \Delta W = W^{res} + A \times B, + ## \Delta W = A \times B - A_0 \times B_0 = [A | A_0] \times [B | -B_0]^T = A'B'. 
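+            ## Illustrative shape check (comment only, not executed): with lora_A matrices A_0, A of shape (r, d_in)
+            ## and lora_B matrices B_0, B of shape (d_out, r), the concatenations below build
+            ##     A' = cat([A, A_0], dim=0)    # shape (2r, d_in)
+            ##     B' = cat([B, -B_0], dim=1)   # shape (d_out, 2r)
+            ## so that B' @ A' = B @ A - B_0 @ A_0 = \Delta W, at the cost of doubling the adapter rank.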
+ if "lora_A" in name: + tensors_lora[name] = torch.cat( + [output_state_dict[name], mutated_init_state_dict[".".join(name.split(".")[1:])]], dim=0 + ) + elif "lora_B" in name: + tensors_lora[name] = torch.cat( + [output_state_dict[name], -mutated_init_state_dict[".".join(name.split(".")[1:])]], dim=1 + ) + + return tensors_lora + diff --git a/src/peft/tuners/alora/peft_model_alora.py b/src/peft/tuners/alora/peft_model_alora.py new file mode 100644 index 0000000000..3e036bbaaa --- /dev/null +++ b/src/peft/tuners/alora/peft_model_alora.py @@ -0,0 +1,2165 @@ + + +from __future__ import annotations + +import collections +import copy +import inspect +import os +import warnings +from contextlib import contextmanager, nullcontext +from copy import deepcopy +from dataclasses import dataclass +from typing import Any, Literal, Optional, Union +import numpy as np +import packaging.version +import torch +import transformers +from accelerate import dispatch_model, infer_auto_device_map, init_empty_weights +from accelerate.hooks import AlignDevicesHook, add_hook_to_module, remove_hook_from_submodules +from accelerate.utils import get_balanced_memory, named_module_tensors +from huggingface_hub import HfFileSystem, ModelCard, ModelCardData, hf_hub_download +from safetensors import safe_open +from safetensors.torch import save_file as safe_save_file +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers import Cache, DynamicCache, EncoderDecoderCache, PreTrainedModel +from transformers.modeling_outputs import QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput +from transformers.utils import PushToHubMixin + +from peft.utils.constants import DUMMY_MODEL_CONFIG #, PEFT_TYPE_TO_PREFIX_MAPPING + +from peft import __version__ +from peft.config import PeftConfig +import enum +from peft import ( + #LoraConfig, + PeftModel, + #PeftModelForCausalLM, + #PeftModelForSequenceClassification, + get_peft_model, + prepare_model_for_kbit_training) + +from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_eetq_available + +from .config import aLoraConfig +#from .eva import get_eva_state_dict, initialize_lora_eva_weights +#from .gptq import QuantLinear +from .layer import Linear, aLoraLayer #Conv2d, Conv3d, Embedding, Linear, aLoraLayer +from .model import aLoraModel + +from peft.tuners.lora import ( + #LoraConfig, + #LoraModel, + LoftQConfig, + EvaConfig, + LoraRuntimeConfig, + get_eva_state_dict, + initialize_lora_eva_weights, +) +from peft.tuners import ( + AdaLoraModel, + AdaptionPromptModel, + BOFTModel, + BoneModel, + CPTEmbedding, + FourierFTModel, + HRAModel, + IA3Model, + LNTuningModel, + LoHaModel, + LoKrModel, + LoraModel, + MultitaskPromptEmbedding, + OFTModel, + PolyModel, + PrefixEncoder, + PromptEmbedding, + PromptEncoder, + VBLoRAModel, + VeraModel, + XLoraConfig, + XLoraModel, +) +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer +from peft.utils import ( + SAFETENSORS_WEIGHTS_NAME, + TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, + WEIGHTS_NAME, + PeftType, + TaskType, + _get_batch_size, + _prepare_prompt_learning_config, + _set_adapter, + _set_trainable, + get_peft_model_state_dict, + id_tensor_storage, + infer_device, + load_peft_weights, + map_cache_to_layer_device_map, + set_peft_model_state_dict, + shift_tokens_right, +) + + +PEFT_TYPE_TO_MODEL_MAPPING = { + "LORA": aLoraModel, +} + + +class PeftType(str, enum.Enum): + LORA = "LORA" + + + +PEFT_TYPE_TO_PREFIX_MAPPING = { + PeftType.LORA: "lora_", +} + 
+#MODEL_TYPE_TO_PEFT_MODEL_MAPPING: dict[str, type[PeftModel]] = { + #"SEQ_CLS": PeftModelForSequenceClassification, + #"SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM, + # "CAUSAL_LM": PeftModelForCausalLM, + #"TOKEN_CLS": PeftModelForTokenClassification, + #"QUESTION_ANS": PeftModelForQuestionAnswering, + #"FEATURE_EXTRACTION": PeftModelForFeatureExtraction, + # } + +PEFT_TYPE_TO_CONFIG_MAPPING: dict[str, type[PeftConfig]] = { + #"ADAPTION_PROMPT": AdaptionPromptConfig, + #"PROMPT_TUNING": PromptTuningConfig, + #"PREFIX_TUNING": PrefixTuningConfig, + #"P_TUNING": PromptEncoderConfig, + "LORA": aLoraConfig, + #"LOHA": LoHaConfig, + #"LORAPLUS": LoraConfig, + #"LOKR": LoKrConfig, + #"ADALORA": AdaLoraConfig, + #"BOFT": BOFTConfig, + #"IA3": IA3Config, + #"MULTITASK_PROMPT_TUNING": MultitaskPromptTuningConfig, + #"OFT": OFTConfig, + #"POLY": PolyConfig, + #"LN_TUNING": LNTuningConfig, + #"VERA": VeraConfig, + #"FOURIERFT": FourierFTConfig, + #"XLORA": XLoraConfig, + #"HRA": HRAConfig, + #"VBLORA": VBLoRAConfig, + #"CPT": CPTConfig, + #"BONE": BoneConfig, + } +class PeftModelALoRA(PeftModel): + """ + Base model encompassing various Peft methods. + + Args: + model ([`~transformers.PreTrainedModel`]): The base transformer model used for Peft. + peft_config ([`PeftConfig`]): The configuration of the Peft model. + adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`. + autocast_adapter_dtype (`bool`, *optional*): + Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights + using float16 and bfloat16 to float32, as this is typically required for stable training, and only affect + select PEFT tuners. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the loading loading process. + + + + Don't use `low_cpu_mem_usage=True` when creating a new PEFT adapter for training. + + + + **Attributes**: + - **base_model** ([`torch.nn.Module`]) -- The base transformer model used for Peft. + - **peft_config** ([`PeftConfig`]) -- The configuration of the Peft model. + - **modules_to_save** (`list` of `str`) -- The list of sub-module names to save when + saving the model. + - **prompt_encoder** ([`PromptEncoder`]) -- The prompt encoder used for Peft if + using [`PromptLearningConfig`]. + - **prompt_tokens** (`torch.Tensor`) -- The virtual prompt tokens used for Peft if + using [`PromptLearningConfig`]. + - **transformer_backbone_name** (`str`) -- The name of the transformer + backbone in the base model if using [`PromptLearningConfig`]. + - **word_embeddings** (`torch.nn.Embedding`) -- The word embeddings of the transformer backbone + in the base model if using [`PromptLearningConfig`]. + """ + + def __init__( + self, + model: PreTrainedModel, + peft_config: PeftConfig, + adapter_name: str = "default", + autocast_adapter_dtype: bool = True, + low_cpu_mem_usage: bool = False, + response_token_ids = None + ) -> None: + super().__init__(model, peft_config, adapter_name) + self.modules_to_save = None + self.active_adapter = adapter_name + self.peft_type = peft_config.peft_type + # These args are special PEFT arguments that users can pass. They need to be removed before passing them to + # forward. 
+ self.special_peft_forward_args = {"adapter_names","alora_offsets"} + self.response_token_ids = response_token_ids + self._is_prompt_learning = peft_config.is_prompt_learning + if self._is_prompt_learning: + self._peft_config = {adapter_name: peft_config} + self.base_model = model + self.add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage) + else: + self._peft_config = None + cls = PEFT_TYPE_TO_MODEL_MAPPING[peft_config.peft_type] + ctx = init_empty_weights if low_cpu_mem_usage else nullcontext + with ctx(): + self.base_model = cls(model, {adapter_name: peft_config}, adapter_name) + self.set_additional_trainable_modules(peft_config, adapter_name) + + if hasattr(self.base_model, "_cast_adapter_dtype"): + self.base_model._cast_adapter_dtype( + adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype + ) + + if getattr(model, "is_gradient_checkpointing", True): + model = self._prepare_model_for_gradient_checkpointing(model) + + # the `pretraining_tp` is set for some models to simulate Tensor Parallelism during inference to avoid + # numerical differences, https://github.com/pytorch/pytorch/issues/76232 - to avoid any unexpected + # behavior we disable that in this line. + if hasattr(self.base_model, "config") and hasattr(self.base_model.config, "pretraining_tp"): + self.base_model.config.pretraining_tp = 1 + + @property + def peft_config(self) -> dict[str, PeftConfig]: + if self._is_prompt_learning: + return self._peft_config + return self.base_model.peft_config + + @property + def active_adapters(self) -> list[str]: + try: + adapters = self.base_model.active_adapters + if not isinstance(adapters, list): + # Base model is probably a transformers model, see: + # https://github.com/huggingface/transformers/pull/30790#issuecomment-2253808249 + # Unfortunately, transformers models also have an active_adapters method but it's 1) not a property and + # 2) calling it fails because the base model (usually) has no loaded adapter. The base model can be a + # transformers model for prompt learning, where the base model is not wrapped in a LoraModel or similar. + adapters = self.active_adapter + if isinstance(adapters, str): + adapters = [adapters] + except AttributeError: + adapters = self.active_adapter + if isinstance(adapters, str): + adapters = [adapters] + return adapters + + @peft_config.setter + def peft_config(self, value: dict[str, PeftConfig]): + if self._is_prompt_learning: + self._peft_config = value + else: + self.base_model.peft_config = value + + def save_pretrained( + self, + save_directory: str, + safe_serialization: bool = True, + selected_adapters: Optional[list[str]] = None, + save_embedding_layers: Union[str, bool] = "auto", + is_main_process: bool = True, + path_initial_model_for_weight_conversion: Optional[str] = None, + **kwargs: Any, + ) -> None: + r""" + This function saves the adapter model and the adapter configuration files to a directory, so that it can be + reloaded using the [`PeftModel.from_pretrained`] class method, and also used by the [`PeftModel.push_to_hub`] + method. + + Args: + save_directory (`str`): + Directory where the adapter model and configuration files will be saved (will be created if it does not + exist). + safe_serialization (`bool`, *optional*): + Whether to save the adapter files in safetensors format, defaults to `True`. + selected_adapters (`List[str]`, *optional*): + A list of adapters to be saved. If `None`, will default to all adapters. 
+ save_embedding_layers (`Union[bool, str]`, *optional*, defaults to `"auto"`): + If `True`, save the embedding layers in addition to adapter weights. If `auto`, checks the common + embedding layers `peft.utils.other.EMBEDDING_LAYER_NAMES` in config's `target_modules` when available. + and automatically sets the boolean flag. This only works for 🤗 transformers models. + is_main_process (`bool`, *optional*): + Whether the process calling this is the main process or not. Will default to `True`. Will not save the + checkpoint if not on the main process, which is important for multi device setups (e.g. DDP). + path_initial_model_for_weight_conversion (`str, *optional*`): + The path to the initialized adapter, which is obtained after initializing the model with PiSSA or OLoRA + and before performing any training. When `path_initial_model_for_weight_conversion` is not None, the + difference in adapter before and after fine-tuning is calculated. This difference can be represented as + the parameters of a standard LoRA adapter. Using this converted adapter does not require changes to the + base model, thus conveniently allowing the use of multiple PiSSA or OLoRA adapters with LoRA adapters, + and the activation or deactivation of any adapters. Note that this conversion is not supported if + `rslora` is used in combination with `rank_pattern` or `alpha_pattern`. + kwargs (additional keyword arguments, *optional*): + Additional keyword arguments passed along to the `push_to_hub` method. + + """ + if os.path.isfile(save_directory): + raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file") + + if selected_adapters is None: + selected_adapters = list(self.peft_config.keys()) + else: + if any( + selected_adapter_name not in list(self.peft_config.keys()) + for selected_adapter_name in selected_adapters + ): + raise ValueError( + f"You passed an invalid `selected_adapters` arguments, current supported adapter names are" + f" {list(self.peft_config.keys())} - got {selected_adapters}." + ) + + def save_mutated_as_lora(peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs): + if peft_config.use_rslora and (peft_config.rank_pattern or peft_config.alpha_pattern): + msg = ( + "Passing `path_initial_model_for_weight_conversion` to `save_pretrained` is not supported when " + "using `rank_pattern` or `alpha_pattern` at the same time as `use_rslora=True`." + ) + raise ValueError(msg) + + if not any( + str(peft_config.init_lora_weights).lower().startswith(prefix) for prefix in ["pissa", "olora", "true"] + ): + warnings.warn( + "`path_initial_model_for_weight_conversion` only works for converting a PiSSA or OLoRA adapter to " + "a LoRA adapter" + ) + initial_adapter_name = os.path.basename(path_initial_model_for_weight_conversion) + try: + self.load_adapter( + os.path.dirname(path_initial_model_for_weight_conversion), + subfolder=initial_adapter_name, + adapter_name=initial_adapter_name, + ) + is_pissa = str(self.peft_config[initial_adapter_name].init_lora_weights).lower().startswith("pissa") + is_olora = str(self.peft_config[initial_adapter_name].init_lora_weights).lower() == "olora" + if is_pissa or is_olora: + raise ValueError( + "The `init_lora_weights` parameter of the initial adapter should be set to `True`. " + "Otherwise, `self.load_adapter` will subtract the decomposed values again based on the " + "residual model." 
+ ) + output_state_dict = self.base_model.subtract_mutated_init( + output_state_dict, initial_adapter_name, kwargs + ) + finally: + self.delete_adapter(initial_adapter_name) + return output_state_dict + + if is_main_process: + os.makedirs(save_directory, exist_ok=True) + self.create_or_update_model_card(save_directory) + + for adapter_name in selected_adapters: + peft_config = self.peft_config[adapter_name] + # save only the trainable weights + output_state_dict = get_peft_model_state_dict( + self, + state_dict=kwargs.get("state_dict", None), + adapter_name=adapter_name, + save_embedding_layers=save_embedding_layers, + ) + output_dir = os.path.join(save_directory, adapter_name) if adapter_name != "default" else save_directory + os.makedirs(output_dir, exist_ok=True) + + if is_main_process and safe_serialization: + # Section copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L2111-L2134 + # Safetensors does not allow tensor aliasing. + # We're going to remove aliases before saving + ptrs = collections.defaultdict(list) + for name, tensor in output_state_dict.items(): + # Sometimes in the state_dict we have non-tensor objects. + # e.g. in bitsandbytes we have some `str` objects in the state_dict + if isinstance(tensor, torch.Tensor): + ptrs[id_tensor_storage(tensor)].append(name) + else: + # In the non-tensor case, fall back to the pointer of the object itself + ptrs[id(tensor)].append(name) + + # These are all the pointers of shared tensors. + shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1} + + for _, names in shared_ptrs.items(): + # Here we just clone the shared tensors to avoid tensor aliasing which is + # not supported in safetensors. + for shared_tensor_name in names[1:]: + output_state_dict[shared_tensor_name] = output_state_dict[shared_tensor_name].clone() + if path_initial_model_for_weight_conversion is not None: + peft_config = copy.deepcopy(peft_config) + peft_config.init_lora_weights = True + peft_config.save_pretrained(path_initial_model_for_weight_conversion) + output_state_dict = save_mutated_as_lora( + peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs + ) + safe_save_file( + output_state_dict, + os.path.join(output_dir, SAFETENSORS_WEIGHTS_NAME), + metadata={"format": "pt"}, + ) + elif is_main_process: + if path_initial_model_for_weight_conversion is not None: + peft_config = copy.deepcopy(peft_config) + peft_config.init_lora_weights = True + peft_config.save_pretrained(path_initial_model_for_weight_conversion) + output_state_dict = save_mutated_as_lora( + peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs + ) + torch.save(output_state_dict, os.path.join(output_dir, WEIGHTS_NAME)) + + # save the config and change the inference mode to `True` + if peft_config.base_model_name_or_path is None: + peft_config.base_model_name_or_path = ( + self.base_model.__dict__.get("name_or_path", None) + if peft_config.is_prompt_learning + else self.base_model.model.__dict__.get("name_or_path", None) + ) + inference_mode = peft_config.inference_mode + peft_config.inference_mode = True + + if peft_config.task_type is None: + # deal with auto mapping + base_model_class = self._get_base_model_class( + is_prompt_tuning=peft_config.is_prompt_learning, + ) + parent_library = base_model_class.__module__ + + auto_mapping_dict = { + "base_model_class": base_model_class.__name__, + "parent_library": parent_library, + } + else: + auto_mapping_dict = None + + if 
is_main_process: + if path_initial_model_for_weight_conversion is not None: + peft_config.init_lora_weights = True + peft_config.r *= 2 + if not peft_config.use_rslora: + peft_config.lora_alpha *= 2 + else: + # with rslora, we have scaling = alpha / sqrt(r), we thus adjust alpha to keep the same scaling + peft_config.lora_alpha *= 2**0.5 + + if peft_config.rank_pattern: + peft_config.rank_pattern = {key: 2 * val for key, val in peft_config.rank_pattern.items()} + if peft_config.alpha_pattern: + peft_config.alpha_pattern = {key: 2 * val for key, val in peft_config.alpha_pattern.items()} + + peft_config.save_pretrained(output_dir, auto_mapping_dict=auto_mapping_dict) + peft_config.inference_mode = inference_mode + + @classmethod + def from_pretrained( + cls, + model: torch.nn.Module, + model_id: Union[str, os.PathLike], + adapter_name: str = "default", + is_trainable: bool = False, + config: Optional[PeftConfig] = None, + autocast_adapter_dtype: bool = True, + ephemeral_gpu_offload: bool = False, + low_cpu_mem_usage: bool = False, + ks = None,#new + response_token_ids = None, #new + **kwargs: Any, + ) -> PeftModel: + r""" + Instantiate a PEFT model from a pretrained model and loaded PEFT weights. + + Note that the passed `model` may be modified inplace. + + Args: + model ([`torch.nn.Module`]): + The model to be adapted. For 🤗 Transformers models, the model should be initialized with the + [`~transformers.PreTrainedModel.from_pretrained`]. + model_id (`str` or `os.PathLike`): + The name of the PEFT configuration to use. Can be either: + - A string, the `model id` of a PEFT configuration hosted inside a model repo on the Hugging Face + Hub. + - A path to a directory containing a PEFT configuration file saved using the `save_pretrained` + method (`./my_peft_config_directory/`). + adapter_name (`str`, *optional*, defaults to `"default"`): + The name of the adapter to be loaded. This is useful for loading multiple adapters. + is_trainable (`bool`, *optional*, defaults to `False`): + Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be + used for inference. + config ([`~peft.PeftConfig`], *optional*): + The configuration object to use instead of an automatically loaded configuration. This configuration + object is mutually exclusive with `model_id` and `kwargs`. This is useful when configuration is already + loaded before calling `from_pretrained`. + autocast_adapter_dtype (`bool`, *optional*): + Whether to autocast the adapter dtype. Defaults to `True`. Only relevant for specific adapter types. + ephemeral_gpu_offload (`bool`, *optional*): + Whether to use ephemeral GPU offloading for partially loaded modules. Defaults to `False`. This is + useful when parts of the model and/or components (such as adapters) are kept in CPU memory until they + are needed. Rather than perform expensive operations on small data, the data is transferred to the GPU + on-demand, the operation(s) performed, and the results moved back to CPU memory. This brings a slight + momentary VRAM overhead but gives orders of magnitude speedup in certain cases. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device before loading the saved weights. Useful to speed up the + process. + torch_device (`str`, *optional*, defaults to None): + The device to load the adapter on. If `None`, the device will be inferred. + kwargs: (`optional`): + Additional keyword arguments passed along to the specific PEFT configuration class. 
+ """ + + +# from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING, PEFT_TYPE_TO_CONFIG_MAPPING + MODEL_TYPE_TO_PEFT_MODEL_MAPPING: dict[str, type[PeftModel]] = { + #"SEQ_CLS": PeftModelForSequenceClassification, + #"SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM, + "CAUSAL_LM": aLoRAPeftModelForCausalLM, + #"TOKEN_CLS": PeftModelForTokenClassification, + #"QUESTION_ANS": PeftModelForQuestionAnswering, + #"FEATURE_EXTRACTION": PeftModelForFeatureExtraction, + } + + PEFT_TYPE_TO_CONFIG_MAPPING: dict[str, type[PeftConfig]] = { + #"ADAPTION_PROMPT": AdaptionPromptConfig, + #"PROMPT_TUNING": PromptTuningConfig, + #"PREFIX_TUNING": PrefixTuningConfig, + #"P_TUNING": PromptEncoderConfig, + "LORA": aLoraConfig, + #"LOHA": LoHaConfig, + #"LORAPLUS": LoraConfig, + #"LOKR": LoKrConfig, + #"ADALORA": AdaLoraConfig, + #"BOFT": BOFTConfig, + #"IA3": IA3Config, + #"MULTITASK_PROMPT_TUNING": MultitaskPromptTuningConfig, + #"OFT": OFTConfig, + #"POLY": PolyConfig, + #"LN_TUNING": LNTuningConfig, + #"VERA": VeraConfig, + #"FOURIERFT": FourierFTConfig, + #"XLORA": XLoraConfig, + #"HRA": HRAConfig, + #"VBLORA": VBLoRAConfig, + #"CPT": CPTConfig, + #"BONE": BoneConfig, + } + +# self.disable_adapters = False #NEW + # load the config + if config is None: + config = PEFT_TYPE_TO_CONFIG_MAPPING[ + PeftConfig._get_peft_type( + model_id, + subfolder=kwargs.get("subfolder", None), + revision=kwargs.get("revision", None), + cache_dir=kwargs.get("cache_dir", None), + use_auth_token=kwargs.get("use_auth_token", None), + token=kwargs.get("token", None), + ) + ].from_pretrained(model_id, **kwargs) + elif isinstance(config, PeftConfig): + config.inference_mode = not is_trainable + else: + raise ValueError(f"The input config must be a PeftConfig, got {config.__class__}") + + # Runtime configuration, if supported + if hasattr(config, "runtime_config"): + config.runtime_config.ephemeral_gpu_offload = ephemeral_gpu_offload + else: + if ephemeral_gpu_offload: + warnings.warn("Ephemeral GPU offloading is not supported for this model. Ignoring.") + + if hasattr(model, "hf_device_map"): + weight_map = dict(named_module_tensors(model, recurse=True)) + + # recreate the offload_index for disk-offloaded modules: we need to know the location in storage of each weight + # before the offload hook is removed from the model + disk_modules = set() + index = None + for name, module in model.named_modules(): + if hasattr(module, "_hf_hook") and hasattr(module._hf_hook, "original_devices"): + if hasattr(module._hf_hook.weights_map, "dataset"): + index = module._hf_hook.weights_map.dataset.index + for key in module._hf_hook.original_devices.keys(): + if module._hf_hook.original_devices[key] == torch.device("meta"): + disk_modules.add(str(name) + "." 
+ str(key)) + + if disk_modules and not kwargs.get("use_safetensors", True): + raise ValueError("Disk offloading currently only supported for safetensors") + + if index: + offload_index = { + p: { + "safetensors_file": index[p]["safetensors_file"], + "weight_name": p, + "dtype": str(weight_map[p].dtype).replace("torch.", ""), + } + for p in weight_map.keys() + if p in disk_modules + } + kwargs["offload_index"] = offload_index + + if (getattr(model, "hf_device_map", None) is not None) and len( + set(model.hf_device_map.values()).intersection({"cpu", "disk"}) + ) > 0: + remove_hook_from_submodules(model) + + if config.is_prompt_learning and is_trainable: + raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.") + else: + config.inference_mode = not is_trainable + if isinstance(getattr(model, "base_model", None), XLoraModel): + if not isinstance(config, XLoraConfig): + raise TypeError(f"Expected 'XLoraConfig', got '{type(config)}' instead.") + if "adapters" in kwargs: + config.adapters = kwargs["adapters"] + else: + # If the path is on HF hub, then we get the adapter names to create a subfolders list which tells + # `load_adapter` where the adapters are. + if not os.path.exists(model_id): + s = HfFileSystem() + + # The names of the adapters which must be in folders + adapter_names = [ + file["name"][len(model_id) + 1 :] for file in s.ls(model_id) if file["type"] == "directory" + ] + # Prepare a dict of adapter paths, which really just point to the hf id; we will use the subfolders + adapter_paths = {} + for adapter_name in adapter_names: + adapter_paths[adapter_name] = os.path.join(model_id, model_id) + config.adapters = adapter_paths + config._subfolders = adapter_names + else: + if "adapters" not in kwargs: + raise ValueError("If model_id is a local path, then `adapters` must be passed in kwargs.") + + if config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys(): + model = cls( + model, + config, + adapter_name, + autocast_adapter_dtype=autocast_adapter_dtype, + low_cpu_mem_usage=low_cpu_mem_usage, + ) + else: + model = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[config.task_type]( + model, + config, + adapter_name, + autocast_adapter_dtype=autocast_adapter_dtype, + low_cpu_mem_usage=low_cpu_mem_usage, + ) + + load_result = model.load_adapter( + model_id, + adapter_name, + is_trainable=is_trainable, + autocast_adapter_dtype=autocast_adapter_dtype, + low_cpu_mem_usage=low_cpu_mem_usage, + **kwargs, + ) + + # 1. Remove VB-LoRA vector bank, since it's a shared parameter set via the VBLoRAModel + # 2. Remove the prompt encoder, as it does not need to be part of the checkpoint + missing_keys = [ + k for k in load_result.missing_keys if "vblora_vector_bank" not in k and "prompt_encoder" not in k + ] + if missing_keys: + # Let's warn here since (in contrast to load_adapter) we don't return the load result, so it could be quite + # difficult for users to even notice that something might have gone wrong here. As we filter out non PEFT + # keys from the missing keys, this gives no false positives. 
+ warnings.warn(f"Found missing adapter keys while loading the checkpoint: {missing_keys}") + + model.ks = ks + model.response_token_ids = response_token_ids + model.disable_adapters = False #NEW + return model + + def _setup_prompt_encoder(self, adapter_name: str): + config = self.peft_config[adapter_name] + if not hasattr(self, "prompt_encoder"): + self.prompt_encoder = torch.nn.ModuleDict({}) + self.prompt_tokens = {} + transformer_backbone = None + for name, module in self.base_model.named_children(): + for param in module.parameters(): + param.requires_grad = False + if isinstance(module, PreTrainedModel): + # Make sure to freeze Tranformers model + if transformer_backbone is None: + transformer_backbone = module + self.transformer_backbone_name = name + if transformer_backbone is None: + transformer_backbone = self.base_model + + if config.num_transformer_submodules is None: + config.num_transformer_submodules = 2 if config.task_type == TaskType.SEQ_2_SEQ_LM else 1 + + # determine the word embeddings + word_embeddings = None + try: + # First try to find the word embeddings based on the module name, this should work for models like Bert, + # Roberta, Deberta, etc. + word_embeddings = self.base_model.get_submodule("embeddings.word_embeddings") + except AttributeError: + pass + + if word_embeddings is None: + # Word embeddings could not be determined. Next try to guess them by checking which parameter has the size + # of the vocab. + for named_param, value in list(transformer_backbone.named_parameters()): + # for ZeRO-3, the tensor is sharded across accelerators and deepspeed modifies it to a tensor with shape + # [0] the actual unsharded shape is stored in "ds_shape" attribute special handling is needed in case + # the model is initialized in deepspeed.zero.Init() context or HfDeepSpeedConfig has been called before + # For reference refer to issue: https://github.com/huggingface/peft/issues/996 + deepspeed_distributed_tensor_shape = getattr(value, "ds_shape", None) + + if value.shape[0] == self.base_model.config.vocab_size or ( + deepspeed_distributed_tensor_shape is not None + and deepspeed_distributed_tensor_shape[0] == self.base_model.config.vocab_size + ): + word_embeddings = transformer_backbone.get_submodule(named_param.replace(".weight", "")) + break + + self.word_embeddings = word_embeddings + + if config.peft_type == PeftType.PROMPT_TUNING: + prompt_encoder = PromptEmbedding(config, self.word_embeddings) + elif config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: + prompt_encoder = MultitaskPromptEmbedding(config, self.word_embeddings) + elif config.peft_type == PeftType.P_TUNING: + prompt_encoder = PromptEncoder(config) + elif config.peft_type == PeftType.PREFIX_TUNING: + # prefix tuning now uses Cache but that won't work with gradient checkpointing + if any(getattr(module, "gradient_checkpointing", False) for module in self.get_base_model().modules()): + raise ValueError("Prefix tuning does not work with gradient checkpointing.") + prompt_encoder = PrefixEncoder(config) + elif config.peft_type == PeftType.CPT: + prompt_encoder = CPTEmbedding(config, self.word_embeddings) + else: + raise ValueError("Not supported") + + prompt_encoder = prompt_encoder.to(self.device) + self.prompt_encoder.update(torch.nn.ModuleDict({adapter_name: prompt_encoder})) + self.prompt_tokens[adapter_name] = torch.arange( + config.num_virtual_tokens * config.num_transformer_submodules + ).long() + + def _prepare_model_for_gradient_checkpointing(self, model: PreTrainedModel): + r""" + Prepares the 
model for gradient checkpointing if necessary + """ + if not ( + getattr(model, "is_loaded_in_8bit", False) + or getattr(model, "is_loaded_in_4bit", False) + or getattr(model, "is_quantized", False) + ): + if hasattr(model, "enable_input_require_grads"): + model.enable_input_require_grads() + elif hasattr(model, "get_input_embeddings"): + + def make_inputs_require_grad(module, input, output): + output.requires_grad_(True) + + model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) + return model + + def get_prompt_embedding_to_save(self, adapter_name: str) -> torch.Tensor: + """ + Returns the prompt embedding to save when saving the model. Only applicable when using a prompt learning + method. + """ + prompt_encoder = self.prompt_encoder[adapter_name] + prompt_tokens = ( + self.prompt_tokens[adapter_name].unsqueeze(0).expand(1, -1).to(prompt_encoder.embedding.weight.device) + ) + if self.peft_config[adapter_name].peft_type == PeftType.PREFIX_TUNING: + prompt_tokens = prompt_tokens[:, : self.peft_config[adapter_name].num_virtual_tokens] + + if self.peft_config[adapter_name].peft_type == PeftType.MULTITASK_PROMPT_TUNING: + prompt_embeddings = super(MultitaskPromptEmbedding, prompt_encoder).forward(prompt_tokens) + else: + prompt_embeddings = prompt_encoder(prompt_tokens) + + return prompt_embeddings[0].detach().cpu() + + def get_prompt(self, batch_size: int, task_ids: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + Returns the virtual prompts to use for Peft. Only applicable when using a prompt learning method. + """ + peft_config = self.active_peft_config + prompt_encoder = self.prompt_encoder[self.active_adapter] + prompt_tokens = ( + self.prompt_tokens[self.active_adapter] + .unsqueeze(0) + .expand(batch_size, -1) + .to(prompt_encoder.embedding.weight.device) + ) + if peft_config.peft_type == PeftType.PREFIX_TUNING: + prompt_tokens = prompt_tokens[:, : peft_config.num_virtual_tokens] + if peft_config.inference_mode: + past_key_values = prompt_encoder.embedding.weight.repeat(batch_size, 1, 1) + else: + past_key_values = prompt_encoder(prompt_tokens) + if self.base_model_torch_dtype is not None: + past_key_values = past_key_values.to(self.base_model_torch_dtype) + past_key_values = past_key_values.view( + batch_size, + peft_config.num_virtual_tokens, + peft_config.num_layers * 2, + peft_config.num_attention_heads, + peft_config.token_dim // peft_config.num_attention_heads, + ) + if peft_config.num_transformer_submodules == 2: + past_key_values = torch.cat([past_key_values, past_key_values], dim=2) + past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split( + peft_config.num_transformer_submodules * 2 + ) + if TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING.get(self.config.model_type, None) is not None: + post_process_fn = TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING[self.config.model_type] + past_key_values = post_process_fn(past_key_values) + elif peft_config.num_transformer_submodules == 1: + # Dont' apply this to encoder-decoder models and not to models requiring special processing. 
+ # local import in case users use a very old transformers version + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + elif peft_config.num_transformer_submodules == 2 and self.base_model._supports_cache_class: + # Dont' apply this to encoder-decoder models that don't support new Cachc format yet + # If we don't apply this, prefix-tuning fails to update cross-attn cache + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + past_key_values.cross_attention_cache = DynamicCache() + past_key_values.is_updated = { + layer_idx: False for layer_idx in range(len(past_key_values.cross_attention_cache.key_cache)) + } + map_cache_to_layer_device_map(self.get_base_model(), past_key_values) # no-op if not a Cache instance + return past_key_values + else: + if peft_config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: + prompts = prompt_encoder(prompt_tokens, task_ids) + else: + if peft_config.inference_mode: + prompts = prompt_encoder.embedding.weight + else: + # Take only one prompt token sample and expand the output instead of expanding the input, see: + # https://github.com/huggingface/peft/issues/2043#issuecomment-2321522577 + prompt_tokens = prompt_tokens[:1] + prompts = prompt_encoder(prompt_tokens) + prompts = prompts.repeat(batch_size, 1, 1) + return prompts + + def get_nb_trainable_parameters(self) -> tuple[int, int]: + r""" + Returns the number of trainable parameters and the number of all parameters in the model. + """ + trainable_params = 0 + all_param = 0 + for _, param in self.named_parameters(): + num_params = param.numel() + # if using DS Zero 3 and the weights are initialized empty + if num_params == 0 and hasattr(param, "ds_numel"): + num_params = param.ds_numel + + # Due to the design of 4bit linear layers from bitsandbytes + # one needs to multiply the number of parameters by 2 to get + # the correct number of parameters + if param.__class__.__name__ == "Params4bit": + if hasattr(param, "element_size"): + num_bytes = param.element_size() + elif not hasattr(param, "quant_storage"): + num_bytes = 1 + else: + num_bytes = param.quant_storage.itemsize + num_params = num_params * 2 * num_bytes + + all_param += num_params + if param.requires_grad: + trainable_params += num_params + + return trainable_params, all_param + + def print_trainable_parameters(self) -> None: + """ + Prints the number of trainable parameters in the model. + + Note: print_trainable_parameters() uses get_nb_trainable_parameters() which is different from + num_parameters(only_trainable=True) from huggingface/transformers. get_nb_trainable_parameters() returns + (trainable parameters, all parameters) of the Peft Model which includes modified backbone transformer model. + For techniques like LoRA, the backbone transformer model is modified in place with LoRA modules. However, for + prompt tuning, the backbone transformer model is unmodified. num_parameters(only_trainable=True) returns number + of trainable parameters of the backbone transformer model which can be different. 
+ """ + trainable_params, all_param = self.get_nb_trainable_parameters() + + print( + f"trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param:.4f}" + ) + + def __getattr__(self, name: str): + """Forward missing attributes to the wrapped module.""" + try: + return super().__getattr__(name) # defer to nn.Module's logic + except AttributeError: + if name == "base_model": # see #1892: prevent infinite recursion if class is not initialized + raise + return getattr(self.base_model, name) + + @contextmanager + def _enable_peft_forward_hooks(self, *args, **kwargs): + # If the base model has a method called _enable_peft_forward_hooks, it is invoked as a context. Otherwise, this + # runs without any changes + if hasattr(self.base_model, "_enable_peft_forward_hooks"): + with self.base_model._enable_peft_forward_hooks(*args, **kwargs): + yield + return + else: + # nothing to enable + yield + return + + def forward(self, *args: Any, **kwargs: Any): + """ + Forward pass of the model. + """ +# print('forward') +# print(args) +# print(kwargs) + with self._enable_peft_forward_hooks(*args, **kwargs): + kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} + return self.get_base_model()(*args, **kwargs) + + def generate(self, *args, **kwargs): + # print("generate") + # print(args) + # print(kwargs) + with self._enable_peft_forward_hooks(*args, **kwargs): + kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} + return self.get_base_model().generate(*args, **kwargs) + + def _get_base_model_class(self, is_prompt_tuning=False): + """ + Returns the base model class. + """ + if not is_prompt_tuning: + return self.base_model.model.__class__ + return self.base_model.__class__ + + @contextmanager + def disable_adapter(self): + """ + Context manager that disables the adapter module. Use this to run inference on the base model. + + Example: + + ```py + >>> with model.disable_adapter(): + ... model(inputs) + ``` + """ + #self.disable_adapters = True + if self.peft_config[self.active_adapter].is_prompt_learning: + try: + # TODO: consider replacing this patching of methods with a more robust mechanism: setting a flag and + # letting the underlying methods deal with it, same as how LoRA does it. + old_forward = self.forward + self.forward = self.base_model.forward + old_prepare_inputs_for_generation = self.prepare_inputs_for_generation + self.prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation + yield + finally: + self.forward = old_forward + self.prepare_inputs_for_generation = old_prepare_inputs_for_generation + + elif self.peft_config[self.active_adapter].is_adaption_prompt: + try: + self.base_model.disable_adapter_layers() + yield + finally: + self.base_model.enable_adapter_layers() + + else: # LoRA, LoHa, etc. + model_status = self.get_model_status() + if model_status.enabled == "irregular": + warnings.warn( + "The model contains some adapter layers that are enabled and others that are disabled. " + "This is most likely unintentional. After exiting the disable_adapter context, all adapters " + "will be enabled" + ) + try: + self.disable_adapters = True + self.base_model.disable_adapter_layers() + yield + finally: + if model_status.enabled is not False: + self.disable_adapters = False + # model_status.enabled is `True` or `"irregular"` + self.base_model.enable_adapter_layers() + + def get_base_model(self) -> torch.nn.Module: + """ + Returns the base model. 
+ """ + return ( + self.base_model + if (False) #(self.active_peft_config.is_prompt_learning or self.peft_type == PeftType.POLY) + else self.base_model.model + ) + + def add_adapter(self, adapter_name: str, peft_config: PeftConfig, low_cpu_mem_usage: bool = False) -> None: + """ + Add an adapter to the model based on the passed configuration. + + This adapter is not trained. To load a trained adapter, check out [`PeftModel.load_adapter`]. + + The name for the new adapter should be unique. + + The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active + adapter. + + Args: + adapter_name (`str`): + The name of the adapter to be added. + peft_config ([`PeftConfig`]): + The configuration of the adapter to be added. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device. Useful to speed up the process when loading saved + adapters. Don't use this option when creating a new PEFT adapter for training. + + """ + if peft_config.peft_type != self.peft_type: + raise ValueError( + f"Cannot combine adapters with different peft types. " + f"Found {self.peft_type} and {peft_config.peft_type}." + ) + + try: + if peft_config.is_prompt_learning: + self.peft_config[adapter_name] = peft_config + if hasattr(self.config, "to_dict"): + dict_config = self.config.to_dict() + else: + dict_config = self.config + + peft_config = _prepare_prompt_learning_config(peft_config, dict_config) + self._setup_prompt_encoder(adapter_name) + elif peft_config.is_adaption_prompt: + self.base_model.add_adapter(adapter_name, peft_config) + else: + self.peft_config[adapter_name] = peft_config + self.base_model.inject_adapter( + self.base_model.model, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage + ) + except Exception: # something went wrong, roll back + if adapter_name in self.peft_config: + del self.peft_config[adapter_name] + raise + + self.set_additional_trainable_modules(peft_config, adapter_name) + + def set_additional_trainable_modules(self, peft_config, adapter_name): + if getattr(peft_config, "modules_to_save", None) is not None: + if self.modules_to_save is None: + self.modules_to_save = set(peft_config.modules_to_save) + else: + self.modules_to_save.update(peft_config.modules_to_save) + _set_trainable(self, adapter_name) # this may add a new ModulesToSaveWrapper + + def get_layer_status(self) -> list[TunerLayerStatus]: + """Get the status of each adapter layer in the model. + + This method returns a list of `TunerLayerStatus` dataclass instances, each of which contains the following + attributes: + + - `name` (`str`): + The name of the adapter layer, e.g. `model.encoder.block.0.layer.0.SelfAttention.q`. + - `module_type` (`str`): + The type of the adapter layer, e.g. `lora.Linear`. + - `enabled` (`bool`): + Whether the adapter layer is enabled. + - `active_adapters` (`list[str]`): + The names of the active adapters, if any, e.g. `["default"]`. + - `merged_adapters` (`list[str]`): + The names of the merged adapters, if any, e.g. `["default"]`. + - `available_adapters` (`list[str]`): + The names of the available adapters, e.g. `["default"]`. + + Args: + model ([`~PeftModel`]): + The model to get the adapter layer status from. + + Returns: + list[`peft.peft_model.TunerLayerStatus`]: + A list of dataclasses, each containing the status of the corresponding adapter layer. + + """ + return get_layer_status(self) + + def get_model_status(self) -> TunerModelStatus: + """Get the status of tuners of the model. 
+ + This method returns a `TunerModelStatus` dataclass instance, which contains the following attributes: + + - `base_model_type` (`str`): + The type of the base model, e.g. `T5Model`. + - `adapter_model_type` (`str`): + The type of the adapter model, e.g. `LoraModel`. + - `peft_types` (`dict[str, str]`): + The mapping of adapter name to adapter type, e.g. `{"default": "LORA"}`. + - `trainable_params` (`int`): + The number of trainable parameters in the model. + - `total_params` (`int`): + The total number of parameters in the model. + - `num_adapter_layers` (`int`): + The number of adapter layers in the model. + - `enabled` (`bool`, `Literal["irregular"]`): + Whether all adapter layers are enabled. If some are enabled and some are not, this will be `"irregular"`. + This means that your model is in an inconsistent state and might not work as expected. + - `active_adapters` (`list[str]`, `Literal["irregular"]`): + The names of the active adapters. If the active adapters are not consistent across all layers, this will be + `"irregular"`, which means that your model is in an inconsistent state and might not work as expected. + - `merged_adapters` (`list[str]`, `Literal["irregular"]`): + The names of the merged adapters. If the merged adapters are not consistent across all layers, this will be + `"irregular"`, which means that your model is in an inconsistent state and might not work as expected. + - `available_adapters` (`list[str]`): + The names of the available adapters, e.g. `["default"]`. + + Args: + model ([`~PeftModel`]): + The model to get the adapter layer status from. + + Returns: + `peft.peft_model.TunerModelStatus`: + A dataclass containing the status of the model. + + """ + return get_model_status(self) + + @classmethod + def _split_kwargs(cls, kwargs: dict[str, Any]): + _kwargs_not_in_hf_hub_download_signature = ("use_auth_token",) + hf_hub_download_kwargs = {} + other_kwargs = {} + + for key, value in kwargs.items(): + if key in inspect.signature(hf_hub_download).parameters or key in _kwargs_not_in_hf_hub_download_signature: + hf_hub_download_kwargs[key] = value + else: + other_kwargs[key] = value + + return hf_hub_download_kwargs, other_kwargs + + def _update_offload(self, offload_index: dict[str, dict[str, str]], adapters_weights: dict[str, torch.tensor]): + """ + Update the offload_index and safetensors files for loading and mergine PeftModels with disk-offloaded modules. + + Args: + offload_index (Dict[str: str]): + Dictionary of disk-offloaded modules with their metadata and safetensors filenames + adapters_weights (Dict[str: torch.tensor]): + Dictionary of Peft adapter module names and weights + """ + + if not offload_index: + return offload_index + + prefix = "base_model.model." + # rename offload index weight and model names + adapter_names = list(self.peft_config.keys()) + for adapter_name in adapter_names: + keys = list(offload_index.keys()) + block_id = keys[0].split(".")[0] + "." 
# for writing safetensors key, + + # replace original offload index keys with PeftModel keys + for key in keys: + suffix_pos = key.rfind(".") + extended_prefix = prefix + key[:suffix_pos] + module = dict(self.named_modules())[extended_prefix] + if isinstance(module, BaseTunerLayer): + new_key = prefix + key[:suffix_pos] + ".base_layer" + key[suffix_pos:] + else: + new_key = prefix + key + offload_index[key]["weight_name"] = new_key + offload_index[new_key] = offload_index[key] + del offload_index[key] + + files_seen = set() + # rename safetensors for dispatch + for new_key in list(offload_index.keys()): + fname = offload_index[new_key]["safetensors_file"] + + # make a new file name + new_fname_list = list(fname.split(os.sep)) + for i, name in enumerate(new_fname_list): + if "--" in name: + new_fname_list[i] += "-peft" + break + new_fname = os.path.join(*new_fname_list) + + if fname in files_seen: + continue + safe_dict = {} + with safe_open(fname, framework="pt") as f: + for safe_key in f.keys(): + safe_tensor = f.get_tensor(safe_key) + metadata = f.metadata() + suffix_pos = safe_key.rfind(".") + extended_prefix = prefix + block_id + safe_key[:suffix_pos] + safe_module = dict(self.named_modules())[extended_prefix] + if isinstance(safe_module, BaseTunerLayer): + final_key = extended_prefix + ".base_layer" + safe_key[suffix_pos:] + lora_dict = {key: val for key, val in adapters_weights.items() if extended_prefix in key} + + # add LoRA keys and values to disk offload + for lora_key, lora_val in lora_dict.items(): + divide = lora_key.rfind(".") + new_key = lora_key[:divide] + f".{adapter_name}" + lora_key[divide:] + safe_dict[new_key] = lora_val + else: + final_key = prefix + block_id + safe_key + safe_dict[final_key] = safe_tensor + files_seen.add(new_fname) + + # avoid overwriting original safetensors + for key in safe_dict.keys(): + offload_index[key] = {"safetensors_file": new_fname, "weight_name": key} + + base_name = os.path.dirname(new_fname) + if not os.path.exists(base_name): + os.makedirs(base_name) + safe_save_file(safe_dict, new_fname, metadata=metadata) + + def _check_new_adapter_config(self, peft_config: PeftConfig, is_trainable: bool) -> None: + """Perform checks on newly added PEFT configs to ensure integrity.""" + if peft_config.is_prompt_learning and is_trainable: + raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.") + + # Since PiSSA/OLoRA modifies the base weights, it should not be combined with other adapters. + all_configs = [peft_config] + list(self.peft_config.values()) + if len(all_configs) > 1: + if any(getattr(config, "init_lora_weights", None) == "pissa" for config in all_configs): + msg = ( + "PiSSA changes the base weights of the model and should thus not be used with other adapters. " + "Consider converting the PiSSA adapter into a normal LoRA adapter: " + "https://github.com/huggingface/peft/tree/main/examples/pissa_finetuning#convert-pissa-to-lora" + ) + warnings.warn(msg) + elif any(getattr(config, "init_lora_weights", None) == "olora" for config in all_configs): + msg = ( + "OLoRA changes the base weights of the model and should thus not be used with other adapters. 
" + "Consider converting the OLoRA adapter into a normal LoRA adapter: " + "https://github.com/huggingface/peft/tree/main/examples/olora_finetuning#olora-and-lora" + ) + warnings.warn(msg) + + def load_adapter( + self, + model_id: Union[str, os.PathLike], + adapter_name: str, + is_trainable: bool = False, + torch_device: Optional[str] = None, + autocast_adapter_dtype: bool = True, + ephemeral_gpu_offload: bool = False, + low_cpu_mem_usage: bool = False, + **kwargs: Any, + ): + """ + Load a trained adapter into the model. + + The name for the new adapter should be unique. + + The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active + adapter. + + Args: + model_id (`str` or `os.PathLike`): + The name of the PEFT configuration to use. Can be either: + - A string, the `model id` of a PEFT configuration hosted inside a model repo on the Hugging Face + Hub. + - A path to a directory containing a PEFT configuration file saved using the `save_pretrained` + method (`./my_peft_config_directory/`). + adapter_name (`str`): + The name of the adapter to be added. + is_trainable (`bool`, *optional*, defaults to `False`): + Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be + used for inference. + torch_device (`str`, *optional*, defaults to None): + The device to load the adapter on. If `None`, the device will be inferred. + autocast_adapter_dtype (`bool`, *optional*, defaults to `True`): + Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter + weights using float16 and bfloat16 to float32, as this is typically required for stable training, and + only affect select PEFT tuners. + ephemeral_gpu_offload (`bool`, *optional*, defaults to `False`): + Whether to use ephemeral GPU offloading for partially loaded modules. Defaults to `False`. + low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): + Create empty adapter weights on meta device before loading the saved weights. Useful to speed up the + process. + kwargs: (`optional`): + Additional arguments to modify the way the adapter is loaded, e.g. the token for Hugging Face Hub. + """ +# from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING + + hf_hub_download_kwargs, kwargs = self._split_kwargs(kwargs) + if torch_device is None: + torch_device = infer_device() + + if adapter_name not in self.peft_config: + # load the config + peft_config = PEFT_TYPE_TO_CONFIG_MAPPING[ + PeftConfig._get_peft_type( + model_id, + **hf_hub_download_kwargs, + ) + ].from_pretrained( + model_id, + ephemeral_gpu_offload=ephemeral_gpu_offload, + **hf_hub_download_kwargs, + ) + self._check_new_adapter_config(peft_config, is_trainable=is_trainable) + peft_config.inference_mode = not is_trainable + self.add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage) + + adapters_weights = load_peft_weights(model_id, device=torch_device, **hf_hub_download_kwargs) + + # load the weights into the model + ignore_mismatched_sizes = kwargs.get("ignore_mismatched_sizes", False) + load_result = set_peft_model_state_dict( + self, + adapters_weights, + adapter_name=adapter_name, + ignore_mismatched_sizes=ignore_mismatched_sizes, + low_cpu_mem_usage=low_cpu_mem_usage, + ) + + tuner = self.peft_config[adapter_name].peft_type + tuner_prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(tuner, "") + adapter_missing_keys = [] + + # Filter missing keys specific to the current adapter and tuner prefix. 
+ for key in load_result.missing_keys: + if tuner_prefix in key and adapter_name in key: + adapter_missing_keys.append(key) + + load_result.missing_keys.clear() + load_result.missing_keys.extend(adapter_missing_keys) + + if ( + (getattr(self, "hf_device_map", None) is not None) + and (len(set(self.hf_device_map.values()).intersection({"cpu", "disk"})) > 0) + and len(self.peft_config) == 1 + ): + device_map = kwargs.get("device_map", "auto") + max_memory = kwargs.get("max_memory", None) + offload_dir = kwargs.get("offload_folder", None) + offload_index = kwargs.get("offload_index", None) + + dispatch_model_kwargs = {} + # Safety checker for previous `accelerate` versions + # `offload_index` was introduced in https://github.com/huggingface/accelerate/pull/873/ + if "offload_index" in inspect.signature(dispatch_model).parameters: + dispatch_model_kwargs["offload_index"] = offload_index + + no_split_module_classes = self._no_split_modules + + if device_map != "sequential": + max_memory = get_balanced_memory( + self, + max_memory=max_memory, + no_split_module_classes=no_split_module_classes, + low_zero=(device_map == "balanced_low_0"), + ) + + if isinstance(device_map, str): + device_map = infer_auto_device_map( + self, max_memory=max_memory, no_split_module_classes=no_split_module_classes + ) + + self._update_offload(offload_index, adapters_weights) + dispatch_model_kwargs["offload_index"] = offload_index + + dispatch_model( + self, + device_map=device_map, + offload_dir=offload_dir, + **dispatch_model_kwargs, + ) + + hook = AlignDevicesHook(io_same_device=True) + if self.peft_config[adapter_name].is_prompt_learning: + remove_hook_from_submodules(self.prompt_encoder) + add_hook_to_module(self.get_base_model(), hook) + + if hasattr(self.base_model, "_cast_adapter_dtype"): + self.base_model._cast_adapter_dtype( + adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype + ) + + # Set model in evaluation mode to deactivate Dropout modules by default + if not is_trainable: + self.eval() + return load_result + + def set_adapter(self, adapter_name: str) -> None: + """ + Sets the active adapter. + + Only one adapter can be active at a time. + + Additionally, this function will set the specified adapter to trainable (i.e., requires_grad=True). If this is + not desired, use the following code. + + ```py + >>> for name, param in model_peft.named_parameters(): + ... if ...: # some check on name (ex. if 'lora' in name) + ... param.requires_grad = False + ``` + + Args: + adapter_name (`str`): + The name of the adapter to be set as active. The adapter must be loaded first. + """ + if adapter_name not in self.peft_config: + raise ValueError(f"Adapter {adapter_name} not found.") + self.active_adapter = adapter_name + if not self.peft_config[adapter_name].is_prompt_learning: + self.base_model.set_adapter(adapter_name) + _set_adapter(self, adapter_name) + + @property + def base_model_torch_dtype(self): + return getattr(self.base_model, "dtype", None) + + @property + def active_peft_config(self): + return self.peft_config[self.active_adapter] + + def create_or_update_model_card(self, output_dir: str): + """ + Updates or create model card to include information about peft: + 1. Adds `peft` library tag + 2. Adds peft version + 3. Adds base model info + 4. 
Adds quantization information if it was used + """ + + filename = os.path.join(output_dir, "README.md") + + card = ModelCard.load(filename) if os.path.exists(filename) else ModelCard.from_template(ModelCardData()) + + card.data["library_name"] = "peft" + + model_config = BaseTuner.get_model_config(self) + model_config = None if model_config == DUMMY_MODEL_CONFIG else model_config + if model_config is not None and "_name_or_path" in model_config: + card.data["base_model"] = model_config["_name_or_path"] + + lines = card.text.splitlines() + + quantization_config = None + if hasattr(model_config, "quantization_config"): + quantization_config = self.config.quantization_config.to_dict() + training_config_text = "" + quantization_prefix = "The following `bitsandbytes` quantization config was used during training:" + + if isinstance(self.peft_config[self.active_adapter], aLoraConfig): + training_config_text += f"\nActivated LoRA invocation string:\n{self.peft_config[self.active_adapter].invocation_string}" + # Adds quantization information if it was used + if quantization_config is not None: + training_config_text += f"\n{quantization_prefix}\n" + training_config_text += "\n".join([f"- {name}: {value}" for name, value in quantization_config.items()]) + training_config_text += "\n" + + training_procedure_heading = "## Training procedure" + if quantization_prefix not in lines and bool(training_config_text): + if training_procedure_heading in lines: + lines.insert(lines.index(training_procedure_heading) + 2, training_config_text) + else: + lines.append(f"{training_procedure_heading}\n{training_config_text}") + + # Adds peft version + framework_block_heading = "### Framework versions" + if f"- PEFT {__version__}" not in lines: + if framework_block_heading in lines: + lines.insert(lines.index(framework_block_heading) + 2, f"- PEFT {__version__}") + else: + lines.append(f"{framework_block_heading}\n\n- PEFT {__version__}") + + card.text = "\n".join(lines) + card.save(filename) + + + + + +class aLoRAPeftModelForCausalLM(PeftModelALoRA): + """ + Peft model for causal language modeling. + + Args: + model ([`~transformers.PreTrainedModel`]): Base transformer model. + peft_config ([`PeftConfig`]): Peft config. + adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`. + autocast_adapter_dtype (`bool`, *optional*): + Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights + using float16 and bfloat16 to float32, as this is typically required for stable training, and only affect + select PEFT tuners. + + Example: + + ```py + >>> from transformers import AutoModelForCausalLM + >>> from peft import PeftModelForCausalLM, get_peft_config + + >>> config = { + ... "peft_type": "PREFIX_TUNING", + ... "task_type": "CAUSAL_LM", + ... "inference_mode": False, + ... "num_virtual_tokens": 20, + ... "token_dim": 1280, + ... "num_transformer_submodules": 1, + ... "num_attention_heads": 20, + ... "num_layers": 36, + ... "encoder_hidden_size": 1280, + ... "prefix_projection": False, + ... "postprocess_past_key_value_function": None, + ... 
}
+
+    >>> peft_config = get_peft_config(config)
+    >>> model = AutoModelForCausalLM.from_pretrained("gpt2-large")
+    >>> peft_model = PeftModelForCausalLM(model, peft_config)
+    >>> peft_model.print_trainable_parameters()
+    trainable params: 1843200 || all params: 775873280 || trainable%: 0.23756456724479544
+    ```
+    """
+
+    def __init__(
+        self,
+        model: torch.nn.Module,
+        peft_config: PeftConfig,
+        adapter_name: str = "default",
+        alora_offsets=None,
+        response_token_ids=None,
+        **kwargs,
+    ) -> None:
+        super().__init__(model, peft_config, adapter_name, **kwargs)
+        self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation
+        # Note: alora_offsets passed here is currently unused; offsets are computed per batch in forward().
+        self.response_token_ids = response_token_ids
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        task_ids=None,
+        **kwargs,
+    ):
+        # Determine alora_offsets: how many trailing positions of each sequence the adapted
+        # weights should cover. An offset of -1 disables the adapter for that batch.
+        alora_offsets = kwargs.pop("alora_offsets", None)
+        if self.disable_adapters == True:
+            alora_offsets = [-1]  # Do not use the adapter.
+        elif self.response_token_ids is not None and alora_offsets is None:
+            # Compute offsets from the configured invocation token sequence(s).
+            alora_offsets = [1] * len(input_ids)
+            for i in range(len(input_ids)):
+                response_token_ids_start_idx = None
+                for ii in range(len(self.response_token_ids)):
+                    for idx in torch.where(input_ids[i] == self.response_token_ids[ii][0])[0].tolist():
+                        # Candidate match on the first invocation token; confirm the full sequence matches.
+                        if (
+                            self.response_token_ids[ii].tolist()
+                            == input_ids[i][idx : idx + len(self.response_token_ids[ii])].tolist()
+                        ):
+                            # Keep the right-most (last) occurrence of the invocation sequence.
+                            if response_token_ids_start_idx is None or idx > response_token_ids_start_idx:
+                                response_token_ids_start_idx = idx
+                                response_token_ids_end_idx = idx + len(self.response_token_ids[ii])
+
+                if response_token_ids_start_idx is None:
+                    warnings.warn(
+                        f"Could not find the invocation sequence {self.response_token_ids} in the following "
+                        f"instance: {input_ids[i]}. Setting alora_offsets to 1."
+                    )
+                else:
+                    alora_offsets[i] = len(input_ids[i]) - response_token_ids_start_idx
+        elif alora_offsets is None:
+            warnings.warn("aLoRA offsets not available or computed. Adapter disabled.")
+            alora_offsets = [-1]  # Do not use the adapter; this must be consistent between training and inference.
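As a reading aid, a condensed, self-contained sketch of the offset computation above for a single sequence. The helper name and token ids are illustrative only and not part of the patch:

```py
import torch


def compute_alora_offset(input_ids: torch.Tensor, invocation_ids: torch.Tensor) -> int:
    """Count the trailing positions at or after the last occurrence of the invocation sequence.

    Falls back to 1 when the invocation sequence is not found, mirroring the warning branch above.
    """
    start = None
    for idx in torch.where(input_ids == invocation_ids[0])[0].tolist():
        if input_ids[idx : idx + len(invocation_ids)].tolist() == invocation_ids.tolist():
            start = idx  # keep the right-most match
    return 1 if start is None else len(input_ids) - start


# Made-up token ids: the invocation sequence [101, 102] starts 3 positions from the end.
ids = torch.tensor([5, 8, 2, 9, 101, 102, 7])
assert compute_alora_offset(ids, torch.tensor([101, 102])) == 3
```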
+ + #Pass forward to peft hooks + kwargs['alora_offsets'] = alora_offsets + + + + + + + +# print('forward') + # print(input_ids) + # print('offsets') +# print(alora_offsets) + peft_config = self.active_peft_config + if not peft_config.is_prompt_learning: + #if self.base_model.config.model_type == "mpt": + # if inputs_embeds is not None: + # raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds") + # return self.base_model( + # input_ids=input_ids, + # attention_mask=attention_mask, + # labels=labels, + # output_attentions=output_attentions, + # output_hidden_states=output_hidden_states, + # return_dict=return_dict, + # **kwargs, + # ) + +# if peft_config.peft_type == PeftType.POLY: + # kwargs["task_ids"] = task_ids + + with self._enable_peft_forward_hooks(**kwargs): + kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} + return self.base_model( + input_ids=input_ids, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + labels=labels, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + batch_size = _get_batch_size(input_ids, inputs_embeds) + if attention_mask is not None: + # concat prompt attention mask + prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) + attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) + + if kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + kwargs["position_ids"] = None + if kwargs.get("token_type_ids", None) is not None: + warnings.warn("Token type ids are not supported for parameter efficient tuning. 
Ignoring token type ids") + kwargs["token_type_ids"] = None + kwargs.update( + { + "attention_mask": attention_mask, + "labels": labels, + "output_attentions": output_attentions, + "output_hidden_states": output_hidden_states, + "return_dict": return_dict, + } + ) + + if peft_config.peft_type == PeftType.PREFIX_TUNING: + # overwrite past_kv in kwargs + kwargs["past_key_values"] = self.get_prompt(batch_size) + return self.base_model(input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs) + elif peft_config.peft_type == PeftType.CPT: + return self._cpt_forward(input_ids, inputs_embeds, peft_config, task_ids, batch_size, **kwargs) + else: + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + # concat prompt labels + if labels is not None: + prefix_labels = torch.full((batch_size, peft_config.num_virtual_tokens), -100).to(labels.device) + kwargs["labels"] = torch.cat((prefix_labels, labels), dim=1) + prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + return self.base_model(inputs_embeds=inputs_embeds, **kwargs) + + # def _cpt_forward( + # self, input_ids=None, inputs_embeds=None, peft_config=None, task_ids=None, batch_size=None, **kwargs + # ): + # # Extract labels from kwargs + # labels = kwargs.pop("labels") + # device = [i.device for i in [input_ids, inputs_embeds, labels] if i is not None][0] + # # Extract input_type_mask from kwargs and move it to the same device as labels + # if "input_type_mask" in kwargs.keys(): + # input_type_mask = kwargs.pop("input_type_mask").to(device) + # else: + # if input_ids is None: + # N_tokens = inputs_embeds.shape[1] + # else: + # N_tokens = input_ids.shape[1] + # input_type_mask = torch.ones((batch_size, N_tokens)).to(device) * 4 + + # cpt_token_ids = peft_config.cpt_token_ids + # cpt_tokens_type_mask = peft_config.cpt_tokens_type_mask + + # # Generate embeddings if not provided + # if inputs_embeds is None: + # inputs_embeds = self.word_embeddings(input_ids) + # # Get prompt and concatenate with input embeddings + # prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) + # prompts = prompts.to(inputs_embeds.dtype) + # inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) + # # If labels are provided, generate prefix labels and type mask + # cpt_labels = None + # if labels is not None: + # # Generate prefix labels and concatenate with the input labels + # prefix_labels = torch.Tensor(cpt_token_ids).long().view(1, -1) + # prefix_labels = prefix_labels.repeat(batch_size, 1).to(labels.device) + # cpt_labels = torch.cat((prefix_labels, labels), dim=1) + # # Generate prefix type mask and shift input type mask values to avoid conflicts + # prefix_type_mask = torch.Tensor(cpt_tokens_type_mask).long().view(1, -1) + # prefix_type_mask = prefix_type_mask.repeat(batch_size, 1).to(labels.device) + # adjusted_input_type_mask = input_type_mask + # adjusted_input_type_mask[adjusted_input_type_mask > 0] += prefix_type_mask.max() + # # Concatenate prefix and shifted input type masks + # cpt_type_mask = torch.cat((prefix_type_mask, adjusted_input_type_mask), dim=1) + # # Identify valid label positions and mask invalid ones with -100 + # labels_idx = (cpt_type_mask > 0) & (cpt_type_mask % 4 == 0) + # cpt_labels[~labels_idx] = -100 + # # Update kwargs with the modified labels + + # kwargs["labels"] = cpt_labels + # # Pass the modified inputs to the base model + # base_model_output = 
self.base_model(inputs_embeds=inputs_embeds, **kwargs) + # if labels is None: + # return base_model_output + # else: + # # Calculate the loss using the custom CPT loss function + # base_model_output = CPTEmbedding.calculate_loss( + # base_model_output, cpt_labels, cpt_type_mask, self.peft_config["default"] + # ) + # return base_model_output + + def generate(self, *args, **kwargs): +# print(f"adaps: {self.active_adapters}") + peft_config = self.active_peft_config + self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation + if hasattr(self.base_model, "model"): + self.base_model.model.generation_config = self.generation_config + else: + self.base_model.generation_config = self.generation_config + try: + if not peft_config.is_prompt_learning: +# print(args) +# print(kwargs) + alora_offsets = kwargs.pop("alora_offsets",None) + + + + input_ids = kwargs.get("input_ids") if not args else args[0] + if len(input_ids.shape) == 1: + input_ids = [args[0]] + if self.disable_adapters == True: + alora_offsets = [-1] #Do not use adapter. + # Figure out alora_offsets + elif alora_offsets is None and self.response_token_ids is not None:# and self.disable_adapters == False: + alora_offsets = [1]*len(input_ids) + for i in range(len(input_ids)): + response_token_ids_start_idx = None + for ii in range(len(self.response_token_ids)): + for idx in (torch.where(input_ids[i] == self.response_token_ids[ii][0])[0]).tolist(): + # `response_token_ids` is `'### Response:\n'`, here we are just making sure that the token IDs match + if ( + self.response_token_ids[ii].tolist() + == input_ids[i][idx : idx + len(self.response_token_ids[ii])].tolist() + ): + if response_token_ids_start_idx is None or idx > response_token_ids_start_idx: + response_token_ids_start_idx = idx + response_token_ids_end_idx = idx + len(self.response_token_ids[ii]) + + if response_token_ids_start_idx is None: + warnings.warn( + f"Could not find response key in the " + f'following instance' + f'{self.response_token_ids}' + f'{input_ids[i]}' + f"Setting alora_offsets to 0 (Starting aLoRA at end of prompt) " + ) + #ks[i] = 1 + + else: + alora_offsets[i] = len(input_ids[i])-1 - response_token_ids_start_idx + #elif self.alora_offsets is not None: + # alora_offsets = self.alora_offsets + elif alora_offsets is None: + warnings.warn('ALoRA offsets not available or computed. Adapter disabled.') + alora_offsets = [-1] + + +# print(ks) + #Pass forward to peft hooks + kwargs['alora_offsets'] = alora_offsets + with self._enable_peft_forward_hooks(*args, **kwargs): + kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} + outputs = self.base_model.generate(*args, **kwargs) + else: + outputs = self.base_model.generate(**kwargs) + except: + self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation + raise + else: + self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation + return outputs + + def prepare_inputs_for_generation(self, *args, task_ids: Optional[torch.Tensor] = None, **kwargs): + peft_config = self.active_peft_config + model_kwargs = self.base_model_prepare_inputs_for_generation(*args, **kwargs) + + # https://github.com/huggingface/transformers/pull/26681/ introduced new cache format + # for some architectures which requires a special fix for prompt tuning etc. + # TODO: starting with transformers 4.38, all architectures should support caching. 
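As a usage sketch for the `generate` path above: the model either derives `alora_offsets` itself from `response_token_ids`, or the caller passes the list explicitly, with `-1` disabling the adapter for that row. The names `alora_model` and `tok` below are placeholders for an aLoRA-wrapped causal LM from this patch and its tokenizer; the invocation text is illustrative only.

```py
# Hypothetical usage; assumes `alora_model` is an aLoRA-wrapped causal LM and
# `tok` its tokenizer, with "### Response:\n" used as the invocation string.
prompt = "Summarize the document above.\n### Response:\n"
inputs = tok(prompt, return_tensors="pt")

# 1) Let the model locate the invocation sequence and compute offsets itself.
out = alora_model.generate(**inputs, max_new_tokens=64)

# 2) Override the computation, e.g. to disable the adapter for every row.
out_base = alora_model.generate(**inputs, max_new_tokens=64, alora_offsets=[-1])

print(tok.decode(out[0], skip_special_tokens=True))
```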
+ uses_transformers_4_38 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.38.0") + uses_transformers_4_36 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.36.0") + transformers_new_cache_archs = ["llama", "mistral", "persimmon", "phi"] + if packaging.version.parse(transformers.__version__) > packaging.version.parse("4.43.3"): + # https://github.com/huggingface/transformers/pull/31445 + transformers_new_cache_archs.append("bloom") + + uses_cache = uses_transformers_4_38 or ( + uses_transformers_4_36 and self.base_model.config.model_type in transformers_new_cache_archs + ) + + if peft_config.peft_type == PeftType.POLY: + model_kwargs["task_ids"] = task_ids + if peft_config.is_prompt_learning: + if uses_cache and (model_kwargs.get("past_key_values", None) is not None): + # change in the logic of `prepare_inputs_for_generation` makes the below code necessary + # In prompt learning methods, past key values are longer when compared to the `input_ids`. + # As such only consider the last input ids in the autogressive generation phase. + past_key_values = model_kwargs["past_key_values"] + if isinstance(past_key_values, (tuple, list)): + seq_len = past_key_values[0][0].shape[-2] + else: # using transformers kv cache + seq_len = past_key_values.get_seq_length() + if seq_len >= model_kwargs["input_ids"].shape[1]: + model_kwargs["input_ids"] = model_kwargs["input_ids"][:, -1:] + + if model_kwargs.get("attention_mask", None) is not None: + size = model_kwargs["input_ids"].shape[0], peft_config.num_virtual_tokens + prefix_attention_mask = torch.ones(size).to(model_kwargs["input_ids"].device) + model_kwargs["attention_mask"] = torch.cat( + (prefix_attention_mask, model_kwargs["attention_mask"]), dim=1 + ) + + if model_kwargs.get("position_ids", None) is not None: + warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") + model_kwargs["position_ids"] = None + + if kwargs.get("token_type_ids", None) is not None: + warnings.warn( + "Token type ids are not supported for parameter efficient tuning. Ignoring token type ids" + ) + kwargs["token_type_ids"] = None + + # no past_key_values or past_key_values empty cache + requires_prompt_injection = (model_kwargs.get("past_key_values", None) is None) or ( + isinstance(model_kwargs["past_key_values"], transformers.Cache) + and not model_kwargs["past_key_values"].get_seq_length() + ) + + if requires_prompt_injection and peft_config.peft_type == PeftType.PREFIX_TUNING: + new_past_key_values = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0]) + model_kwargs["past_key_values"] = new_past_key_values + elif requires_prompt_injection: + inputs_embeds = self.word_embeddings(model_kwargs["input_ids"]) + prompts = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0], task_ids=task_ids) + prompts = prompts.to(inputs_embeds.dtype) + model_kwargs["inputs_embeds"] = torch.cat((prompts, inputs_embeds), dim=1) + model_kwargs["input_ids"] = None + + # For transformers>=4.38.0 - for some architectures such as Llama, `cache_position` is + # passed in the forward pass to keep track of the position ids of the cache. 
We have to + # pop that from `model_kwargs` as `cache_position` is properly created by the model, using the passed + # `inputs_embeds`: https://github.com/huggingface/transformers/blob/593230f0a1150ea9c0477b9d859f25daf73c8c33/src/transformers/models/llama/modeling_llama.py#L956 + _ = model_kwargs.pop("cache_position", None) + + return model_kwargs + + + +@dataclass +class TunerLayerStatus: + name: str + module_type: str + enabled: bool + active_adapters: list[str] + merged_adapters: list[str] + requires_grad: dict[str, bool | Literal["irregular"]] + available_adapters: list[str] + devices: dict[str, list[str]] + + +def get_layer_status(model: torch.nn.Module) -> list[TunerLayerStatus]: + """Get the status of each adapter layer in the model. + + This function returns a list of `TunerLayerStatus` dataclass instances, each of which contains the following + attributes: + + - `name` (`str`): + The name of the adapter layer, e.g. `model.encoder.block.0.layer.0.SelfAttention.q`. + - `module_type` (`str`): + The type of the adapter layer, e.g. `lora.Linear`. + - `enabled` (`bool`): + Whether the adapter layer is enabled. + - `active_adapters` (`list[str]`): + The names of the active adapters, if any, e.g. `["default"]`. + - `merged_adapters` (`list[str]`): + The names of the merged adapters, if any, e.g. `["default"]`. + - requires_grad : dict[str, bool | Literal["irregular"]] + The requires_grad status of the parameters for each adapter module. Ideally, it should be either `True` or + `False`. If the requires_grad status is not consistent across all parameters, the value will be set to + `"irregular"`. + - `available_adapters` (`list[str]`): + The names of the available adapters, e.g. `["default"]`. + - `devices` (`dict[str, list[str]]`): + The devices where the parameters of the given adapter are stored, e.g. `["cuda"]`. + + Args: + model ([Union[`~PeftModel`, `~transformers.PreTrainedModel`, `nn.Module`]]): + The model to get the adapter layer status from. + + Returns: + list[`peft.peft_model.TunerLayerStatus`]: + A list of dataclasses, each containing the status of the corresponding adapter layer. + + """ + if isinstance(model, PeftModel): + base_model = model.base_model + if not isinstance(base_model, BaseTuner): + raise TypeError( + "get_layer_status() got an invalid PeftModel instance; prefix tuning and adaption prompt are not " + "supported." 
+ ) + else: + base_model = model + + layer_status: list[TunerLayerStatus] = [] + for name, module in base_model.named_modules(): + if not isinstance(module, BaseTunerLayer): + continue + + # determine if all submodules/parameters if this module require grad or not + mapping_requires_grad_list: dict[str, list[bool]] = collections.defaultdict(list) + for adapter_module_name in module.adapter_layer_names: + adapter_module = getattr(module, adapter_module_name) + if isinstance(adapter_module, torch.nn.ModuleDict): + for key, submodule in adapter_module.items(): + for param in submodule.parameters(): + mapping_requires_grad_list[key].append(param.requires_grad) + elif isinstance(adapter_module, torch.nn.ParameterDict): + for key, param in adapter_module.items(): + mapping_requires_grad_list[key].append(param.requires_grad) + else: + # strange, we don't know how to handle this, ignore for now + pass + + def check_irrgular(vals: list[bool]) -> bool | Literal["irregular"]: + if all(vals): + return True + if not any(vals): + return False + return "irregular" + + requires_grad = {key: check_irrgular(vals) for key, vals in mapping_requires_grad_list.items()} + + devices_dd = collections.defaultdict(list) + for adapter_module_name in module.adapter_layer_names + module.other_param_names: + adapter_module = getattr(module, adapter_module_name) + if isinstance(adapter_module, torch.nn.ModuleDict): + for key, submodule in adapter_module.items(): + devices_dd[key].extend([param.device.type for param in submodule.parameters()]) + elif isinstance(adapter_module, torch.nn.ParameterDict) or ( + adapter_module.__class__.__name__ == "BufferDict" + ): # VeRA + for key, param in adapter_module.items(): + devices_dd[key].append(param.device.type) + devices = {key: sorted(set(val)) for key, val in devices_dd.items()} + + status = TunerLayerStatus( + name=name, + module_type=repr(module).partition("(")[0], + enabled=not module.disable_adapters, + active_adapters=module.active_adapters, + merged_adapters=module.merged_adapters, + requires_grad=requires_grad, + available_adapters=sorted(module._get_available_adapters()), + devices=devices, + ) + layer_status.append(status) + + if not layer_status: + raise ValueError( + "No adapter layers found in the model, please ensure that it's a PEFT model or that you have PEFT adapters " + "injected in the model." + ) + + return layer_status + + +@dataclass +class TunerModelStatus: + base_model_type: str + adapter_model_type: str + peft_types: dict[str, str] + trainable_params: int + total_params: int + num_adapter_layers: int + enabled: bool | Literal["irregular"] + active_adapters: list[str] | Literal["irregular"] + merged_adapters: list[str] | Literal["irregular"] + requires_grad: dict[str, bool | Literal["irregular"]] + available_adapters: list[str] + devices: dict[str, list[str]] + + +def get_model_status(model: torch.nn.Module) -> TunerModelStatus: + """Get the status of tuners of the model. + + This function returns a `TunerModelStatus` dataclass instance, which contains the following attributes: + + - `base_model_type` (`str`): + The type of the base model, e.g. `T5Model`. + - `adapter_model_type` (`str`): + The type of the adapter model, e.g. `LoraModel`. + - `peft_types` (`dict[str, str]`): + The mapping of adapter name to adapter type, e.g. `{"default": "LORA"}`. + - `trainable_params` (`int`): + The number of trainable parameters in the model. + - `total_params` (`int`): + The total number of parameters in the model. 
+ - `num_adapter_layers` (`int`): + The number of adapter layers in the model. + - `enabled` (`bool`, `Literal["irregular"]`): + Whether all adapter layers are enabled. If some are enabled and some are not, this will be `"irregular"`. This + means that your model is in an inconsistent state and might not work as expected. + - `active_adapters` (`list[str]`, `Literal["irregular"]`): + The names of the active adapters. If the active adapters are not consistent across all layers, this will be + `"irregular"`, which means that your model is in an inconsistent state and might not work as expected. + - `merged_adapters` (`list[str]`, `Literal["irregular"]`): + The names of the merged adapters. If the merged adapters are not consistent across all layers, this will be + `"irregular"`, which means that your model is in an inconsistent state and might not work as expected. + - `requires_grad` (`dict[str, bool | Literal["irregular"]]`): + Whether for the given adapter, all adapter layers have `requires_grad` set to `True` or `False`. If there is a + mix, this will be set to `"irregular"`, which means that your model is in an inconsistent state and might not + work as expected. + - `available_adapters` (`list[str]`): + The names of the available adapters, e.g. `["default"]`. + - `devices` (`dict[str, list[str]]`): + The devices where the parameters of the given adapter are stored, e.g. `["cuda"]`. + + Args: + model ([Union[`~PeftModel`, `~transformers.PreTrainedModel`, `nn.Module`]]): + The model to get the adapter layer status from. + + Returns: + `peft.peft_model.TunerModelStatus`: + A dataclass containing the status of the model. + + """ + if isinstance(model, PeftModel): + if not isinstance(model.base_model, BaseTuner): + raise TypeError( + "get_model_status() got an invalid PeftModel instance; prefix tuning and adaption prompt are not " + "supported." 
+ ) + base_model_type = model.get_base_model().__class__.__name__ + trainable_params, total_params = model.get_nb_trainable_parameters() + base_model = model.base_model + peft_types = {key: str(config.peft_type).partition(".")[-1] for key, config in base_model.peft_config.items()} + adapter_model_type = base_model.__class__.__name__ + elif isinstance(model, PreTrainedModel): + base_model_type = model.__class__.__name__ + trainable_params, total_params = PeftModel.get_nb_trainable_parameters(model) + base_model = model + peft_types = {} + adapter_model_type = "None" + else: + base_model_type = "other" + trainable_params, total_params = PeftModel.get_nb_trainable_parameters(model) + base_model = model + peft_types = {} + adapter_model_type = "None" + + layer_status = get_layer_status(model) + num_adapter_layers = len(layer_status) + + enabled_set: set[bool] = {status.enabled for status in layer_status} # must be {True}, {False}, or {True, False} + enabled: bool | Literal["irregular"] + if len(enabled_set) == 1: + enabled = enabled_set.pop() + else: + enabled = "irregular" + + available_adapters: list[str] = sorted(set().union(*(status.available_adapters for status in layer_status))) + + # ideally, active adapters should be consistent across all layers of the model, but we cannot guarantee it + all_active_adapters: set[tuple[str, ...]] = {tuple(status.active_adapters) for status in layer_status} + active_adapters: list[str] | Literal["irregular"] + if not all_active_adapters: + active_adapters = [] + elif len(all_active_adapters) == 1: + active_adapters = list(all_active_adapters.pop()) + else: + active_adapters = "irregular" + + # Here we determine what adapters are merged. This is not trivial because multiple adapters can be merged or not at + # the same time. Some layers may only have adapter A, some only adapter B, so it's not as easy as just checking + # which adapters are merged on each layer. + + # First, determine all adapters that are merged on at least on module. + merged_all: set[str] = set() + for status in layer_status: + merged_all.update(status.merged_adapters) + + # Next, check if on any layer, on of these adapters is not merged. 
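To exercise these status helpers, a small sketch along the following lines could be used. It assumes a standard LoRA-wrapped GPT-2 purely for illustration (the same calls apply to models carrying the aLoRA adapters added in this patch) and only reads fields listed in the docstrings above.

```py
from peft import LoraConfig, get_layer_status, get_model_status, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("gpt2")
model = get_peft_model(base, LoraConfig(r=8, target_modules=["c_attn"]))

# Per-layer view: one TunerLayerStatus entry per injected adapter layer.
for layer in get_layer_status(model)[:3]:
    print(layer.name, layer.module_type, layer.active_adapters, layer.devices)

# Aggregated view: "irregular" flags inconsistent enable/merge states.
status = get_model_status(model)
print(status.adapter_model_type, status.peft_types, status.enabled, status.merged_adapters)
```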
+ merged_adapters: list[str] | Literal["irregular"] = sorted(merged_all) + for status in layer_status: + unmerged = set(status.available_adapters) - set(status.merged_adapters) + if unmerged & merged_all: + # there is overlap between unmerged adapters and adapters that should be merged + merged_adapters = "irregular" + break + + # check status of requires_grad + # first, merge the values for all layers + requires_grad_all: dict[str, list[bool | Literal["irregular"]]] = collections.defaultdict(list) + for status in layer_status: + for key, val in status.requires_grad.items(): + requires_grad_all[key].append(val) + + # then, check if the values are consistent + def check_irrgular(vals: list[bool | Literal["irregular"]]) -> bool | Literal["irregular"]: + if all(val is True for val in vals): + return True + if all(val is False for val in vals): + return False + return "irregular" + + requires_grad = {key: check_irrgular(vals) for key, vals in requires_grad_all.items()} + + devices_dd = collections.defaultdict(list) + for status in layer_status: + for key, val in status.devices.items(): + devices_dd[key].extend(val) + devices = {key: sorted(set(val)) for key, val in devices_dd.items()} + + adapter_model_status = TunerModelStatus( + base_model_type=base_model_type, + adapter_model_type=adapter_model_type, + peft_types=peft_types, + trainable_params=trainable_params, + total_params=total_params, + num_adapter_layers=num_adapter_layers, + enabled=enabled, + active_adapters=active_adapters, + merged_adapters=merged_adapters, + requires_grad=requires_grad, + available_adapters=available_adapters, + devices=devices, + ) + return adapter_model_status + From 0b445a4052a69fa4841332f731821ef891fa2ea5 Mon Sep 17 00:00:00 2001 From: Kristjan Greenewald Date: Thu, 19 Jun 2025 10:31:00 -0400 Subject: [PATCH 03/99] initial alora-peft integration --- src/peft/peft_model.py | 153 +++++++- src/peft/tuners/alora/__init__.py | 23 ++ src/peft/tuners/alora/config.py | 632 +----------------------------- src/peft/tuners/alora/model.py | 80 +--- src/peft/utils/peft_types.py | 2 + 5 files changed, 192 insertions(+), 698 deletions(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index 056ea02bf5..49d2f6c4f9 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -47,6 +47,7 @@ from .config import PeftConfig from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING, PEFT_TYPE_TO_PREFIX_MAPPING, PEFT_TYPE_TO_TUNER_MAPPING from .utils import ( + PeftType, SAFETENSORS_WEIGHTS_NAME, TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, WEIGHTS_NAME, @@ -64,6 +65,7 @@ set_peft_model_state_dict, shift_tokens_right, ) +from .tuners.alora.config import aLoraConfig class PeftModel(PushToHubMixin, torch.nn.Module): @@ -109,13 +111,15 @@ def __init__( adapter_name: str = "default", autocast_adapter_dtype: bool = True, low_cpu_mem_usage: bool = False, + tokenizer: Optional[Any] = None, ) -> None: super().__init__() self.active_adapter = adapter_name self.peft_type = peft_config.peft_type # These args are special PEFT arguments that users can pass. They need to be removed before passing them to # forward. 
- self.special_peft_forward_args = {"adapter_names"} + self.special_peft_forward_args = {"adapter_names", "alora_offsets"} + self.tokenizer = tokenizer # Added for ALORA self._is_prompt_learning = peft_config.is_prompt_learning if self._is_prompt_learning: @@ -385,6 +389,7 @@ def from_pretrained( ephemeral_gpu_offload: bool = False, low_cpu_mem_usage: bool = False, key_mapping: Optional[dict[str, str]] = None, + tokenizer: Optional[Any] = None, # Added for ALORA **kwargs: Any, ) -> PeftModel: r""" @@ -542,6 +547,7 @@ def from_pretrained( adapter_name, autocast_adapter_dtype=autocast_adapter_dtype, low_cpu_mem_usage=low_cpu_mem_usage, + tokenizer=tokenizer, ) else: model = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[config.task_type]( @@ -550,6 +556,7 @@ def from_pretrained( adapter_name, autocast_adapter_dtype=autocast_adapter_dtype, low_cpu_mem_usage=low_cpu_mem_usage, + tokenizer=tokenizer, ) load_result = model.load_adapter( @@ -1771,11 +1778,82 @@ class PeftModelForCausalLM(PeftModel): """ def __init__( - self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", **kwargs + self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", tokenizer: Optional[Any] = None, **kwargs ) -> None: - super().__init__(model, peft_config, adapter_name, **kwargs) + super().__init__(model, peft_config, adapter_name, tokenizer=tokenizer, **kwargs) self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation + + def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optional[list[str]] = None) -> list[int]: + if input_ids is None: + return [] + + batch_size = input_ids.shape[0] + alora_offsets = [-1] * batch_size + + cached_invocation_tensors = {} + adapters_to_process_indices = collections.defaultdict(list) + + for i in range(batch_size): + current_adapter_name = adapter_names[i] if adapter_names and i < len(adapter_names) else self.active_adapter + + if current_adapter_name == "__base__": + alora_offsets[i] = -1 + continue + + if current_adapter_name not in self.peft_config: + warnings.warn(f"Adapter '{current_adapter_name}' not found in peft_config. 
Using offset -1 for row {i}.") + alora_offsets[i] = -1 + continue + + current_peft_config = self.peft_config[current_adapter_name] + + if not isinstance(current_peft_config, aLoraConfig): + alora_offsets[i] = -1 # Not an aLoRA adapter or wrong type + continue + + invocation_string = getattr(current_peft_config, 'invocation_string', None) + if not self.tokenizer or not invocation_string: + alora_offsets[i] = -1 # No way to calculate offset + continue + + if current_adapter_name not in cached_invocation_tensors: + tokenized_ids_list = self.tokenizer.encode(invocation_string, add_special_tokens=False) + cached_invocation_tensors[current_adapter_name] = torch.tensor(tokenized_ids_list, dtype=torch.long, device=input_ids.device) + + adapters_to_process_indices[current_adapter_name].append(i) + + for adapter_name_to_process, indices in adapters_to_process_indices.items(): + current_invocation_ids_tensor = cached_invocation_tensors[adapter_name_to_process] + invocation_len = len(current_invocation_ids_tensor) + + for i in indices: + sequence = input_ids[i] + seq_len = len(sequence) + best_match_start_idx = -1 + + possible_starts = (sequence == current_invocation_ids_tensor[0]).nonzero(as_tuple=True)[0] + + for start_idx_tensor in possible_starts: + idx = start_idx_tensor.item() + if idx + invocation_len <= seq_len: + if torch.equal(sequence[idx : idx + invocation_len], current_invocation_ids_tensor): + if idx > best_match_start_idx: + best_match_start_idx = idx + + if best_match_start_idx != -1: + offset_val = seq_len - best_match_start_idx + alora_offsets[i] = offset_val if offset_val > 0 else -1 + else: + warnings.warn( + f"Invocation string for adapter '{adapter_name_to_process}' not found in input row {i}. " + f"Input: {self.tokenizer.decode(input_ids[i]) if self.tokenizer else input_ids[i]}. " + f"Invocation: {self.peft_config[adapter_name_to_process].invocation_string}. " + "Adapter will be disabled for this row." + ) + alora_offsets[i] = -1 + return alora_offsets + def forward( self, input_ids=None, @@ -1789,7 +1867,33 @@ def forward( **kwargs, ): peft_config = self.active_peft_config + if not peft_config.is_prompt_learning: + adapter_names_for_offset_calc = kwargs.get("adapter_names") + + is_alora_relevant = False + if self.active_peft_config.peft_type == PeftType.ALORA: + is_alora_relevant = True + elif adapter_names_for_offset_calc: + for name in adapter_names_for_offset_calc: + if name == "__base__": + continue + config_ = self.peft_config.get(name) + if config_ and config_.peft_type == PeftType.ALORA: + is_alora_relevant = True + break + + if is_alora_relevant: + alora_offsets = kwargs.get("alora_offsets") + if alora_offsets is None: + if input_ids is None and inputs_embeds is not None: + warnings.warn("Cannot calculate aLoRA offsets when only inputs_embeds are provided. 
Disabling aLoRA for this forward pass.") + alora_offsets = [-1] * inputs_embeds.shape[0] + elif input_ids is not None: + alora_offsets = self._calculate_alora_offsets(input_ids, adapter_names=adapter_names_for_offset_calc) + else: + alora_offsets = [] # Should not happen if _get_batch_size logic is sound + kwargs['alora_offsets'] = alora_offsets if self.base_model.config.model_type == "mpt": if inputs_embeds is not None: raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds") @@ -1929,6 +2033,49 @@ def generate(self, *args, **kwargs): self.base_model.generation_config = self.generation_config try: if not peft_config.is_prompt_learning: + adapter_names_for_offset_calc = kwargs.get("adapter_names") + is_alora_relevant_in_generate = False + + if self.active_peft_config.peft_type == PeftType.ALORA: + is_alora_relevant_in_generate = True + elif adapter_names_for_offset_calc: + for name in adapter_names_for_offset_calc: + if name == "__base__": + continue + config_ = self.peft_config.get(name) + if config_ and config_.peft_type == PeftType.ALORA: + is_alora_relevant_in_generate = True + break + + if is_alora_relevant_in_generate: + alora_offsets_from_kwargs = kwargs.get("alora_offsets") + if alora_offsets_from_kwargs is None: + current_input_ids = kwargs.get("input_ids") + if not args and not current_input_ids : # args[0] is usually input_ids + if args and isinstance(args[0], torch.Tensor) and args[0].dim() >=1 : + current_input_ids = args[0] + else: + current_input_ids = None + + if current_input_ids is not None: + if current_input_ids.ndim == 1: + current_input_ids = current_input_ids.unsqueeze(0) + calculated_offsets = self._calculate_alora_offsets(current_input_ids, adapter_names=adapter_names_for_offset_calc) + kwargs['alora_offsets'] = calculated_offsets + else: + warnings.warn("Cannot calculate aLoRA offsets during generate as input_ids are not available. Disabling aLoRA.") + bs = 1 + if "attention_mask" in kwargs and kwargs["attention_mask"] is not None: + bs = kwargs["attention_mask"].shape[0] + elif "inputs_embeds" in kwargs and kwargs["inputs_embeds"] is not None: + bs = kwargs["inputs_embeds"].shape[0] + elif args and isinstance(args[0], torch.Tensor) and args[0].dim() > 0: # input_ids might be in args[0] + bs = args[0].shape[0] + elif "input_ids" in kwargs and kwargs["input_ids"] is not None: # Should have been caught by current_input_ids + bs = kwargs["input_ids"].shape[0] + + kwargs['alora_offsets'] = [-1] * bs + with self._enable_peft_forward_hooks(*args, **kwargs): kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} outputs = self.base_model.generate(*args, **kwargs) diff --git a/src/peft/tuners/alora/__init__.py b/src/peft/tuners/alora/__init__.py index e69de29bb2..5d7bb07b38 100644 --- a/src/peft/tuners/alora/__init__.py +++ b/src/peft/tuners/alora/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2023-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from peft.utils import register_peft_method + +from .config import aLoraConfig +from .layer import Linear, aLoraLayer +from .model import aLoraModel + +__all__ = ["Linear", "aLoraConfig", "aLoraLayer", "aLoraModel"] + +register_peft_method(name="alora", config_cls=aLoraConfig, model_cls=aLoraModel, prefix="alora_", is_mixed_compatible=True) diff --git a/src/peft/tuners/alora/config.py b/src/peft/tuners/alora/config.py index ab0cff4337..2c548a356f 100644 --- a/src/peft/tuners/alora/config.py +++ b/src/peft/tuners/alora/config.py @@ -5,14 +5,7 @@ from typing import Literal, Optional, Union from torch import nn -import enum -from peft.config import PeftConfig from peft.utils import PeftType -#class PeftType(str, enum.Enum): -# ALORA = "ALORA" - - - from peft import LoraConfig @@ -52,624 +45,11 @@ class aLoraConfig(LoraConfig): } ) - def __post_init__(self): #, *args, invocation_string=None, r=32, **kwargs): - # Call the parent's __post_init__ to initialize all the fields - super().__post_init__() #*args, r=r, **kwargs) - # Validate the additional field - + def __post_init__(self): + super().__post_init__() + self.peft_type = PeftType.ALORA if self.invocation_string is None: - warnings.warn("invocation_string cannot be None", UserWarning) - - - - - - - - def from_peft_type(**kwargs): - r""" - This method loads the configuration of your adapter model from a set of kwargs. - - The appropriate configuration type is determined by the `peft_type` argument. If `peft_type` is not provided, - the calling class type is instantiated. - - Args: - kwargs (configuration keyword arguments): - Keyword arguments passed along to the configuration initialization. - """ - # Avoid circular dependency .. TODO: fix this with a larger refactor - #from peft.mapping import PEFT_TYPE_TO_CONFIG_MAPPING - - # TODO: this hack is needed to fix the following issue (on commit 702f937): - # if someone saves a default config and loads it back with `PeftConfig` class it yields to - # not loading the correct config class. - # - # from peft import AdaLoraConfig, PeftConfig - # peft_config = AdaLoraConfig() - # print(peft_config) - # >>> AdaLoraConfig(peft_type=, auto_mapping=None, base_model_name_or_path=None, - # revision=None, task_type=None, inference_mode=False, r=8, target_modules=None, lora_alpha=8, lora_dropout=0.0, ... - # - # peft_config.save_pretrained("./test_config") - # peft_config = PeftConfig.from_pretrained("./test_config") - # print(peft_config) - # >>> PeftConfig(peft_type='ADALORA', auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=None, inference_mode=False) - - #if "peft_type" in kwargs: - # peft_type = kwargs["peft_type"] - config_cls = aLoraConfig #PEFT_TYPE_TO_CONFIG_MAPPING[peft_type] - #else: - - try: - config = config_cls(**kwargs) - except TypeError as exc: - # Here we potentially handle forward compatibility. Sometimes new keywords are added to configs, which makes - # new configs incompatible with older PEFT versions. We catch these and remove them to allow the program to - # continue, but warn the user about it. - - # First check if the error is due to unexpected keyword arguments, we don't want to accidentally catch - # other TypeErrors. 
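Because the subclass now sets `peft_type = PeftType.ALORA` and drops its custom `from_peft_type`, config serialization is expected to go through the standard `PeftConfig` machinery. Below is a small round-trip sketch under that assumption; the output directory, target modules, and invocation string are placeholders.

```py
# Sketch: round-trip the aLoRA config through save/load via the stock PeftConfig
# path, now that the custom from_peft_type override has been removed.
from peft import PeftConfig
from peft.tuners.alora import aLoraConfig

cfg = aLoraConfig(
    r=32,
    target_modules=["q_proj", "k_proj", "v_proj"],
    invocation_string="### Response:\n",
)
cfg.save_pretrained("./alora_config_test")

reloaded = PeftConfig.from_pretrained("./alora_config_test")
print(type(reloaded).__name__, reloaded.peft_type)  # expected: aLoraConfig, PeftType.ALORA
```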
- if "got an unexpected keyword argument" not in str(exc): - raise exc - - filtered_kwargs, unexpected_kwargs = _check_and_remove_unused_kwargs(config_cls, kwargs) - MIN_EXPECTED_CONFIG_KEYS = {"peft_type"} - if not MIN_EXPECTED_CONFIG_KEYS.issubset(set(filtered_kwargs.keys())): - raise TypeError( - f"The {cls.__name__} config that is trying to be loaded is missing required keys: " - f"{MIN_EXPECTED_CONFIG_KEYS}." - ) - - warnings.warn( - f"Unexpected keyword arguments {sorted(unexpected_kwargs)} for class {config_cls.__name__}, these are " - "ignored. This probably means that you're loading a configuration file that was saved using a " - "higher version of the library and additional parameters have been introduced since. It is " - "highly recommended to upgrade the PEFT version before continuing (e.g. by running `pip install " - "-U peft`)." - ) - config = config_cls.from_peft_type(**filtered_kwargs) - return config - - - -# from __future__ import annotations - -# import warnings -# from dataclasses import dataclass, field -# from typing import Literal, Optional, Union - -# from torch import nn -# import enum -# from peft.config import PeftConfig -# from peft.utils import PeftType -# #class PeftType(str, enum.Enum): -# # ALORA = "ALORA" - - - -# from peft import LoraConfig - -# @dataclass -# class aLoraConfig(LoraConfig): -# """ -# This is the configuration class to store the configuration of an [`aLoraModel`]. - -# It subclasses PEFT's LoraConfig, modifies the default rank r to 32 (often best), and adds an additional parameter: -# r (`int`): aLora attention dimension (the "rank"). Typically needs to be higher than used for standard Lora. Default=32. -# invocation_string (str): String intended to activate the aLoRA. The aLoRA adapted weights will activate -# 1 token after the first token in this string. This string must be present in all input data. -# """ -# r: int = field(default=32, metadata={"help": "aLora attention dimension. Typically needs to be higher than used for standard Lora. Default=32."}) -# invocation_string: str = field( -# default=None, -# metadata={ -# "help": ( -# "aLoRA invocation string. The aLoRA adapted weights will activate 1 token after the first token in " -# "this string. This string must be present in all input data." -# ) -# } -# ) - -# def __post_init__(self, *args, invocation_string=None, r=32, **kwargs): -# # Call the parent's __post_init__ to initialize all the fields -# super().__post_init__(*args, r=r, **kwargs) -# # Validate the additional field -# self.invocation_string = invocation_string -# if self.invocation_string is None: -# warnings.warn("invocation_string cannot be None", UserWarning) - - - - -# @dataclass -# class LoraRuntimeConfig: -# """ -# This is the sub-configuration class to store the runtime configurations for the model. - -# Args: -# ephemeral_gpu_offload (`bool`): -# Whether to use ephemeral GPU offloading for models partially kept in CPU memory. -# """ - -# ephemeral_gpu_offload: bool = field( -# default=False, -# metadata={ -# "help": ( -# "Whether to use ephemeral GPU offloading for models partially kept in CPU memory. Ephemeral GPU offloading result in " -# "the data involved in intense operations being momentarily copied over to the GPU, and the results copied " -# "back to CPU. There is a momentary VRAM overhead, but operations are generally orders of magnitude faster " -# "compared to performing them on the CPU. 
This is useful when parts of the model and/or components (such " -# "as adapters) are kept in CPU memory until they are needed. Rather than perform expensive operations on " -# "small data, the data is transferred to the GPU on-demand, the operation(s) performed, and the results " -# "moved back to CPU memory. Currently only affects DoRA initialization." -# ) -# }, -# ) - - -# @dataclass -# class LoftQConfig: -# """ -# This is the sub-configuration class to store the configuration of a [`LoraModel`]. - -# Args: -# bits_pattern (`dict`): The mapping from layer names or regexp expression to bits which are different from the -# default bits specified by `bits`. For example, `{model.decoder.layers.0.encoder_attn.k_proj: 2`}. -# bits (`int`): Quantization bits for LoftQ. -# iter (`int`): Alternating iterations for LoftQ. -# fake (`bool`): True: use fp16/fp32; used for first time to save weights. False: use bitsandbytes 4bit linear -# models. weights can't be saved. Recommend to set to True, save the weights and load the saved weights in 4 -# bits. -# """ - -# loftq_bits: int = field(default=4, metadata={"help": "Quantization bits for LoftQ"}) -# loftq_iter: int = field(default=1, metadata={"help": "Alternating iterations for LoftQ"}) - - -# @dataclass -# class EvaConfig: -# """ -# This is the sub-configuration class to store the configuration for a data-driven initialization via EVA. EVA was -# introduced in Explained Variance Adaptation. - -# Args: -# rho (`float`): -# Rho value for EVA redistribution (>= 1.0). The maximum rank for a layer is lora_r * rho. Default is 2.0, -# meaning the maximum rank allowed for a layer is 2r. Increasing rho will allow for a higher degree of -# redistribution of ranks across layers. Some pre-trained models might be more sensitive to a rank -# redistribution. It can therefore be beneficial to try rho=1.0 (no redistribution) if the performance is -# lower than expected. -# tau (`float`): -# Cosine similarity threshold for early stopping. Compares the cosine similarity of right-singular vectors -# between two consecutive SVD steps. If the cosine similarity is above this threshold, the SVD iteration is -# stopped. Default is 0.99. -# use_label_mask (`bool`): -# Use label mask for EVA initialization. This means that positions where labels=label_mask_value are ignored -# for the SVD computation. Setting use_label_mask=True is preferred in most cases and can be especially -# beneficial for multi-turn conversations. The default value is True. Filtering out items based on the label -# mask can sometimes lead to a small batch size and as a result instabilities in the SVD computation. For -# cases where a large share of batch items would be filtered out, set use_label_mask=False. -# label_mask_value (`int`): -# If use_label_mask=True the value to look for to mask out ignored tokens. Default is -100. -# whiten (`bool`): Apply whitening to singular vectors. Default is False. -# Whitening has been shown to be beneficial for EVA in the vision domain. -# adjust_scaling_factors (`bool`): -# Adjust LoRA scaling factors after the rank redistribution. Setting this to True means the scaling factors -# are adjusted so that all LoRA gradients have the same scale regardless of their rank. Default is True. 
-# """ - -# rho: float = field(default=2.0, metadata={"help": "Rho value for EVA redistribution"}) -# tau: float = field(default=0.99, metadata={"help": "Cosine similarity threshold for early stopping"}) -# use_label_mask: bool = field(default=True, metadata={"help": "Use label mask for EVA initialization"}) -# label_mask_value: int = field( -# default=-100, metadata={"help": "if use_label_mask=True the value to look for to mask out ignored tokens"} -# ) -# whiten: bool = field(default=False, metadata={"help": "Apply whitening to singular vectors"}) -# adjust_scaling_factors: bool = field( -# default=True, -# metadata={"help": "Adjust LoRA scaling factors after the rank redistribution"}, -# ) - -# def __post_init__(self): -# if self.rho < 1.0: -# raise ValueError("`rho` must be >= 1.0") -# if self.tau < 0.0 or self.tau > 1.0: -# raise ValueError("`tau` must be between 0.0 and 1.0.") - - -# @dataclass -# class aLoraConfig(PeftConfig): -# """ -# This is the configuration class to store the configuration of a [`aLoraModel`]. - -# Args: -# r (`int`): -# aLora attention dimension (the "rank"). -# target_modules (`Optional[Union[List[str], str]]`): -# The names of the modules to apply the adapter to. If this is specified, only the modules with the specified -# names will be replaced. When passing a string, a regex match will be performed. When passing a list of -# strings, either an exact match will be performed or it is checked if the name of the module ends with any -# of the passed strings. If this is specified as 'all-linear', then all linear/Conv1D modules are chosen, -# excluding the output layer. If this is not specified, modules will be chosen according to the model -# architecture. If the architecture is not known, an error will be raised -- in this case, you should specify -# the target modules manually. -# exclude_modules (`Optional[Union[List[str], str]]`): -# The names of the modules to not apply the adapter. When passing a string, a regex match will be performed. -# When passing a list of strings, either an exact match will be performed or it is checked if the name of the -# module ends with any of the passed strings. -# invocation_string (`str`): -# String intended to activate the aLoRA. The aLoRA adapted weights will activate 1 token after the first token in this string. -# This string must be present in all input data. -# lora_alpha (`int`): -# The alpha parameter for Lora scaling. -# lora_dropout (`float`): -# The dropout probability for Lora layers. -# fan_in_fan_out (`bool`): -# Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, gpt-2 uses -# `Conv1D` which stores weights like (fan_in, fan_out) and hence this should be set to `True`. -# bias (`str`): -# Bias type for LoRA. Can be 'none', 'all' or 'lora_only'. If 'all' or 'lora_only', the corresponding biases -# will be updated during training. Be aware that this means that, even when disabling the adapters, the model -# will not produce the same output as the base model would have without adaptation. -# use_rslora (`bool`): -# When set to True, uses Rank-Stabilized LoRA which -# sets the adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it was proven to work better. -# Otherwise, it will use the original default value of `lora_alpha/r`. -# modules_to_save (`List[str]`): -# List of modules apart from adapter layers to be set as trainable and saved in the final checkpoint. 
-# init_lora_weights (`bool` | `Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "loftq"]`): -# How to initialize the weights of the adapter layers. Passing True (default) results in the default -# initialization from the reference implementation from Microsoft. Passing 'gaussian' results in Gaussian -# initialization scaled by the LoRA rank for linear and layers. Setting the initialization to False leads to -# completely random initialization and is discouraged. Pass `'loftq'` to use LoftQ initialization. Passing -# `'eva'` results in a data-driven initialization of Explained -# Variance Adaptation. EVA initalizes LoRA based on the SVD of layer input activations and achieves SOTA -# performance due to its ability to adapt to the finetuning data. Pass `'olora'` to use OLoRA initialization. -# Passing `'pissa'` results in the initialization of Principal -# Singular values and Singular vectors Adaptation (PiSSA), which converges more rapidly than LoRA and -# ultimately achieves superior performance. Moreover, PiSSA reduces the quantization error compared to QLoRA, -# leading to further enhancements. Passing `'pissa_niter_[number of iters]'` initiates Fast-SVD-based PiSSA -# initialization, where `[number of iters]` indicates the number of subspace iterations to perform FSVD, and -# must be a nonnegative integer. When `[number of iters]` is set to 16, it can complete the initialization of -# a 7B model within seconds, and the training effect is approximately equivalent to using SVD. -# layers_to_transform (`Union[List[int], int]`): -# The layer indices to transform. If a list of ints is passed, it will apply the adapter to the layer indices -# that are specified in this list. If a single integer is passed, it will apply the transformations on the -# layer at this index. -# layers_pattern (`Optional[Union[List[str], str]]`): -# The layer pattern name, used only if `layers_to_transform` is different from `None`. This should target the -# `nn.ModuleList` of the model, which is often called `'layers'` or `'h'`. -# rank_pattern (`dict`): -# The mapping from layer names or regexp expression to ranks which are different from the default rank -# specified by `r`. -# alpha_pattern (`dict`): -# The mapping from layer names or regexp expression to alphas which are different from the default alpha -# specified by `lora_alpha`. -# megatron_config (`Optional[dict]`): -# The TransformerConfig arguments for Megatron. It is used to create LoRA's parallel linear layer. You can -# get it like this, `core_transformer_config_from_args(get_args())`, these two functions being from Megatron. -# The arguments will be used to initialize the TransformerConfig of Megatron. You need to specify this -# parameter when you want to apply LoRA to the ColumnParallelLinear and RowParallelLinear layers of megatron. -# megatron_core (`Optional[str]`): -# The core module from Megatron to use, defaults to `"megatron.core"`. -# loftq_config (`Optional[LoftQConfig]`): -# The configuration of LoftQ. If this is not None, then LoftQ will be used to quantize the backbone weights -# and initialize Lora layers. Also pass `init_lora_weights='loftq'`. Note that you should not pass a -# quantized model in this case, as LoftQ will quantize the model itself. -# eva_config (`Optional[EvaConfig]`): -# The configuration of EVA. At a minimum the dataset argument needs to be set (use the same dataset as for -# finetuning). -# use_dora (`bool`): -# Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). 
This technique decomposes the updates of the weights -# into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is -# handled by a separate learnable parameter. This can improve the performance of LoRA especially at low -# ranks. Right now, DoRA only supports linear and Conv2D layers. DoRA introduces a bigger overhead than pure -# LoRA, so it is recommended to merge weights for inference. For more information, see -# https://arxiv.org/abs/2402.09353. -# layer_replication (`List[Tuple[int, int]]`): -# Build a new stack of layers by stacking the original model layers according to the ranges specified. This -# allows expanding (or shrinking) the model without duplicating the base model weights. The new layers will -# all have separate LoRA adapters attached to them. -# runtime_config (`LoraRuntimeConfig`): -# Runtime configurations (which are not saved or restored). -# lora_bias (`bool`): -# Defaults to `False`. Whether to enable the bias term for the LoRA B parameter. Typically, this should be -# disabled. The main use case for this is when the LoRA weights were extracted from fully fine-tuned -# parameters so the bias of those parameters can be taken into account. -# """ - -# r: int = field(default=32, metadata={"help": "Lora attention dimension"}) -# target_modules: Optional[Union[list[str], str]] = field( -# default=None, -# metadata={ -# "help": ( -# "List of module names or regex expression of the module names to replace with LoRA." -# "NOTE: activated LoRA assumes only adapters in the attention weights, i.e. q,k,v." -# "For example, ['q', 'k', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|k|v)$'." -# "This can also be a wildcard 'all-linear' which matches all linear/Conv1D layers except the output layer." -# "If not specified, modules will be chosen according to the model architecture, If the architecture is " -# "not known, an error will be raised -- in this case, you should specify the target modules manually." -# ), -# }, -# ) -# exclude_modules: Optional[Union[list[str], str]] = field( -# default=None, -# metadata={"help": "List of module names or regex expression of the module names to exclude from Lora."}, -# ) -# invocation_string: str = field(default=None, metadata={"help": "aLoRA invocation string. The aLoRA adapted weights will activate 1 token after the first token in this string. This string must be present in all input data."}) -# lora_alpha: int = field(default=8, metadata={"help": "Lora alpha"}) -# lora_dropout: float = field(default=0.0, metadata={"help": "Lora dropout"}) -# fan_in_fan_out: bool = field( -# default=False, -# metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"}, -# ) -# bias: Literal["none", "all", "lora_only"] = field( -# default="none", metadata={"help": "Bias type for Lora. Can be 'none', 'all' or 'lora_only'"} -# ) -# use_rslora: bool = field( -# default=False, -# metadata={ -# "help": ( -# "When set to True, uses Rank-Stabilized LoRA" -# " which sets the adapter scaling factor to `lora_alpha/math.sqrt(r)`, since it" -# " was proven to work better. Otherwise, it will use the original default" -# " value of `lora_alpha/r`." -# ) -# }, -# ) -# modules_to_save: Optional[list[str]] = field( -# default=None, -# metadata={ -# "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. 
" -# "For example, in Sequence Classification or Token Classification tasks, " -# "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved." -# }, -# ) -# init_lora_weights: ( -# bool | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "loftq"] -# ) = field( -# default=True, -# metadata={ -# "help": ( -# "How to initialize the weights of the LoRA layers. Passing `'True'` (default) results in the default " -# "initialization from the reference implementation from Microsoft. Passing `'gaussian'` results " -# "in Gaussian initialization scaled by the LoRA rank for linear and layers. Setting the initialization " -# "to `'False'` leads to completely random initialization and *is discouraged.*" -# "Pass `'eva'` results in a data-driven initialization of Explained Variance Adaptation." -# "Passing `'olora'` results in OLoRA initialization." -# "Passing `'pissa'` results in PiSSA initialization." -# "Passing `'pissa_niter_[number of iters]'` initiates Fast-SVD-based PiSSA initialization, " -# "where [number of iters] indicates the number of subspace iterations to perform fsvd, and must be a nonnegative integer." -# "Pass `'loftq'` to use LoftQ initialization" -# ), -# }, -# ) -# layers_to_transform: Optional[Union[list[int], int]] = field( -# default=None, -# metadata={ -# "help": "The layer indexes to transform, is this argument is specified, PEFT will transform only the layers indexes that are specified inside this list. If a single integer is passed, PEFT will transform only the layer at this index. " -# "This only works when target_modules is a list of str." -# }, -# ) -# layers_pattern: Optional[Union[list[str], str]] = field( -# default=None, -# metadata={ -# "help": "The layer pattern name, used only if `layers_to_transform` is different to None and if the layer pattern is not in the common layers pattern." -# "This only works when target_modules is a list of str. This should target the `nn.ModuleList` of the " -# "model, which is often called `'layers'` or `'h'`." -# }, -# ) -# rank_pattern: Optional[dict] = field( -# default_factory=dict, -# metadata={ -# "help": ( -# "The mapping from layer names or regexp expression to ranks which are different from the default rank specified by `r`. " -# "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 8`}" -# ) -# }, -# ) -# alpha_pattern: Optional[dict] = field( -# default_factory=dict, -# metadata={ -# "help": ( -# "The mapping from layer names or regexp expression to alphas which are different from the default alpha specified by `lora_alpha`. " -# "For example, `{model.decoder.layers.0.encoder_attn.k_proj: 32`}" -# ) -# }, -# ) -# megatron_config: Optional[dict] = field( -# default=None, -# metadata={ -# "help": ( -# "The TransformerConfig from Megatron. It is used to create LoRA's parallel linear layer." -# "You can get it like this, `core_transformer_config_from_args(get_args())`, " -# "these two functions being from Megatron." -# "You need to specify this parameter when you want to apply LoRA to the ColumnParallelLinear and " -# "RowParallelLinear layers of megatron." -# "It should be noted that we may not be able to use the `save_pretrained` and `from_pretrained` " -# "functions, because TransformerConfig may not necessarily be serialized." -# "But when using megatron, we can use `get_peft_model_state_dict` function and " -# "megatron's framework, they can also save and load models and configurations." 
-# ) -# }, -# ) -# megatron_core: Optional[str] = field( -# default="megatron.core", -# metadata={ -# "help": ( -# "The core module from Megatron, it is used to create LoRA's parallel linear layer. " -# "It only needs to be passed in when you need to use your own modified megatron core module. " -# "Otherwise, it will use the default value `megatron.core`. " -# ) -# }, -# ) -# # dict type is used when loading config.json -# loftq_config: Union[LoftQConfig, dict] = field( -# default_factory=dict, -# metadata={ -# "help": ( -# "The configuration of LoftQ. If this is passed, then LoftQ will be used to quantize the backbone " -# "weights and initialize Lora layers. Also set `init_lora_weights='loftq'` in this case." -# ) -# }, -# ) -# eva_config: Optional[EvaConfig] = field( -# default=None, -# metadata={ -# "help": ( -# "The configuration of EVA. If this is passed, then EVA will be used to intialize the LoRA layers. " -# "Also set `init_lora_weights='eva'` in this case. " -# ) -# }, -# ) -# use_dora: bool = field( -# default=False, -# metadata={ -# "help": ( -# "Enable 'Weight-Decomposed Low-Rank Adaptation' (DoRA). This technique decomposes the updates of the " -# "weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the " -# "magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, " -# "especially at low ranks. Right now, DoRA only supports linear and Conv2D layers. DoRA introduces a bigger" -# "overhead than pure LoRA, so it is recommended to merge weights for inference." -# ) -# }, -# ) -# # Enables replicating layers in a model to expand it to a larger model. -# layer_replication: Optional[list[tuple[int, int]]] = field( -# default=None, -# metadata={ -# "help": ( -# "This enables using LoRA to effectively expand a transformer model to a larger size by repeating some layers. " -# "The transformation handles models (currently Llama, Bert or Falcon compatible architectures) with " -# "a module list in the model which it modifies to expand the number of modules. " -# "Base weights are shared so the memory usage is close to the original model. The intended use is these base weights " -# "remain fixed during finetuning but each layer has a separate LoRA adapter so the layers can be specialed via " -# "the adapter layers fit during fine tuning." -# "The format is a list of [start, end) pairs which specify the layer ranges to stack. For example:\n" -# " Original model has 5 layers labelled by their position in the model: `[0, 1, 2, 3, 4]`\n" -# " layer_replication: `[[0, 4], [2, 5]]`\n" -# " Final model will have this arrangement of original layers: `[0, 1, 2, 3, 2, 3, 4]`\n" -# "This format is based on what is used for pass-through merges in mergekit. It makes it simple to select sequential " -# "ranges of a model and stack them while reusing layers at either end of each sequence." -# ) -# }, -# ) -# runtime_config: LoraRuntimeConfig = field( -# default_factory=LoraRuntimeConfig, metadata={"help": "Runtime configurations"} -# ) -# lora_bias: bool = field( -# default=False, -# metadata={ -# "help": ( -# "Whether to enable the bias term for the LoRA B parameter. Typically, this should be disabled. The " -# "main use case for this is when the LoRA weights were extracted from fully fine-tuned parameters so " -# "the bias of those parameters can be taken into account." -# ) -# }, -# ) - -# def to_dict(self): -# """ -# Returns the configuration for your adapter model as a dictionary. Removes runtime configurations. 
-# """ -# rv = super().to_dict() -# rv.pop("runtime_config") -# return rv - -# def __post_init__(self): -# super().__post_init__() -# self.peft_type = PeftType.LORA -# self.target_modules = ( -# set(self.target_modules) if isinstance(self.target_modules, list) else self.target_modules -# ) -# self.exclude_modules = ( -# set(self.exclude_modules) if isinstance(self.exclude_modules, list) else self.exclude_modules -# ) - -# # if target_modules is a regex expression, then layers_to_transform should be None -# if isinstance(self.target_modules, str) and self.layers_to_transform is not None: -# raise ValueError("`layers_to_transform` cannot be used when `target_modules` is a str.") - -# # if target_modules is a regex expression, then layers_pattern should be None -# if isinstance(self.target_modules, str) and self.layers_pattern is not None: -# raise ValueError("`layers_pattern` cannot be used when `target_modules` is a str.") - -# # check for layers_to_transform and layers_pattern -# if self.layers_pattern and not self.layers_to_transform: -# raise ValueError("When `layers_pattern` is specified, `layers_to_transform` must also be specified. ") - -# if self.use_dora and self.megatron_config: -# raise ValueError("DoRA does not support megatron_core, please set `use_dora=False`.") - -# # handle init_lora_weights and loftq_config -# if self.init_lora_weights == "loftq": -# import importlib - -# if not importlib.util.find_spec("scipy"): -# raise ImportError("The required package 'scipy' is not installed. Please install it to continue.") -# if not self.loftq_config: -# raise ValueError("`loftq_config` must be specified when `init_lora_weights` is 'loftq'.") -# if not isinstance(self.loftq_config, dict): -# # convert loftq_config to dict -# self.loftq_config = vars(self.loftq_config) -# elif self.loftq_config: -# self.loftq_config = {} -# warnings.warn("`loftq_config` specified but will be ignored when `init_lora_weights` is not 'loftq'.") - -# elif self.init_lora_weights == "eva" and self.eva_config is None: -# warnings.warn("`init_lora_weights` is 'eva' but `eva_config` is not specified. Using default EVA config.") -# self.eva_config = EvaConfig() -# elif self.init_lora_weights != "eva" and self.eva_config is not None: -# warnings.warn("`eva_config` specified but will be ignored when `init_lora_weights` is not 'eva'.") - -# if self.lora_bias: -# if self.init_lora_weights not in (True, False): -# raise ValueError( -# f"The argument lora_bias=True is only supported with init_lora_weights=True or False, got " -# f"init_lora_weights={self.init_lora_weights} instead." -# ) -# if self.use_dora: -# raise ValueError("The argument lora_bias=True is not supported for DoRA, please pass use_dora=False") -# if self.invocation_string is None: -# raise ValueError("invocation_string cannot be None") -# # Using post training conversion of modified base weights to restore their initial values (PiSSA, OLoRA) cannot -# # be correctly done when using rslora + rank_pattern/alpha_pattern. We can't really know if the user intends -# # this when they'll eventually call save_pretrained (i.e. if they'll pass -# # path_initial_model_for_weight_conversionl). Therefore, we only warn but don't raise an error here. 
-# if ( -# self.use_rslora -# and (self.rank_pattern or self.alpha_pattern) -# and ( -# (isinstance(self.init_lora_weights, str) and (self.init_lora_weights.startswith("pissa"))) -# or (self.init_lora_weights == "olora") -# ) -# ): -# msg = ( -# "Using Rank-Stabilized LoRA with rank_pattern/alpha_pattern and post-training conversion of modified " -# "base weights (PiSSA, OLoRA) means that you won't be able to pass " -# "`path_initial_model_for_weight_conversion` to `save_pretrained` to restore the initial values of the " -# "base weights; if you intend to do this, please ensure not to use rslora or rank_pattern/alpha_pattern." -# ) -# warnings.warn(msg) - -# self._custom_modules: Optional[dict[type[nn.Mmodule], type[nn.Module]]] = None - -# def _register_custom_module(self, mapping: dict[type[nn.Mmodule], type[nn.Module]]) -> None: -# """ -# Experimental API to support providing custom LoRA layers. - -# This API is subject to change, you should carefully read the docs before deciding to use it: - -# https://huggingface.co/docs/peft/developer_guides/custom_models - -# To register custom LoRA module types, call this method with a `mapping` argument that is a dict that maps from -# the target layer type to the custom LoRA layer type. The dict can contain multiple items if you wish to target -# multiple layer types. The target layer type can be any nn.Module that we currently don't support in PEFT, -# whether that is an official PyTorch layer type or a custom layer type. The custom LoRA module class has to be -# implemented by the user and follow the PEFT conventions for LoRA layers. - -# """ -# if self._custom_modules is None: -# self._custom_modules = {} -# self._custom_modules.update(mapping) + warnings.warn("invocation_string cannot be None for aLoRA.", UserWarning) + # The r field with default=32 is handled by the dataclass field definition. + # LoraConfig's __post_init__ does not modify self.r. diff --git a/src/peft/tuners/alora/model.py b/src/peft/tuners/alora/model.py index 4795756bda..2b93c1f0d6 100644 --- a/src/peft/tuners/alora/model.py +++ b/src/peft/tuners/alora/model.py @@ -57,78 +57,27 @@ def _alora_offsets_pre_forward_hook(target, args, kwargs, alora_offsets): class aLoraModel(BaseTuner): """ - Creates Low Rank Adapter (LoRA) model from a pretrained transformers model. + Creates Activated Low Rank Adapter (aLoRA) model from a pretrained transformers model. - The method is described in detail in https://arxiv.org/abs/2106.09685. + The method is described in detail in https://arxiv.org/abs/2504.12397. Args: model ([`torch.nn.Module`]): The model to be adapted. - config ([`LoraConfig`]): The configuration of the Lora model. + config ([`aLoraConfig`]): The configuration of the aLora model. adapter_name (`str`): The name of the adapter, defaults to `"default"`. low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): Create empty adapter weights on meta device. Useful to speed up the loading process. Returns: - `torch.nn.Module`: The Lora model. - - Example: - - ```py - >>> from transformers import AutoModelForSeq2SeqLM - >>> from peft import LoraModel, LoraConfig - - >>> config = LoraConfig( - ... task_type="SEQ_2_SEQ_LM", - ... r=8, - ... lora_alpha=32, - ... target_modules=["q", "v"], - ... lora_dropout=0.01, - ... 
) - - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - >>> lora_model = LoraModel(model, config, "default") - ``` - - ```py - >>> import torch - >>> import transformers - >>> from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training - - >>> rank = ... - >>> target_modules = ["q_proj", "k_proj", "v_proj", "out_proj", "fc_in", "fc_out", "wte"] - >>> config = LoraConfig( - ... r=4, lora_alpha=16, target_modules=target_modules, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM" - ... ) - >>> quantization_config = transformers.BitsAndBytesConfig(load_in_8bit=True) - - >>> tokenizer = transformers.AutoTokenizer.from_pretrained( - ... "kakaobrain/kogpt", - ... revision="KoGPT6B-ryan1.5b-float16", # or float32 version: revision=KoGPT6B-ryan1.5b - ... bos_token="[BOS]", - ... eos_token="[EOS]", - ... unk_token="[UNK]", - ... pad_token="[PAD]", - ... mask_token="[MASK]", - ... ) - >>> model = transformers.GPTJForCausalLM.from_pretrained( - ... "kakaobrain/kogpt", - ... revision="KoGPT6B-ryan1.5b-float16", # or float32 version: revision=KoGPT6B-ryan1.5b - ... pad_token_id=tokenizer.eos_token_id, - ... use_cache=False, - ... device_map={"": rank}, - ... torch_dtype=torch.float16, - ... quantization_config=quantization_config, - ... ) - >>> model = prepare_model_for_kbit_training(model) - >>> lora_model = get_peft_model(model, config) - ``` + `torch.nn.Module`: The aLora model. + **Attributes**: - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted. - - **peft_config** ([`LoraConfig`]): The configuration of the Lora model. + - **peft_config** ([`aLoraConfig`]): The configuration of the aLora model. """ - prefix: str = "lora_" + prefix: str = "alora_" def __init__(self, model, config, adapter_name, low_cpu_mem_usage: bool = False) -> None: super().__init__(model, config, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage) @@ -454,8 +403,7 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): ################################ - #if ks is not None: - # raise ValueError("Multiple adapters not supported with alora yet") + if self.training: raise ValueError("Cannot pass `adapter_names` when the model is in training mode.") @@ -485,16 +433,11 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): handle.remove() def _check_merge_allowed(self): - """Verify that the configuration supports merging. - - Currently gptq quantization and replicated layers do not support merging. 
+ """Merging is not allowed for activated LoRA models """ - super()._check_merge_allowed() - if getattr(self.model, "quantization_method", None) == "gptq": - raise ValueError("Cannot merge LORA layers when the model is gptq quantized") - if self.peft_config.get("layer_replication"): - raise ValueError("Cannot merge LORA layers when base model layers are replicated") + raise ValueError("Merging of aLoRA layers is not possible by definition.") + @staticmethod def _prepare_adapter_config(peft_config, model_config): if peft_config.target_modules is None: @@ -951,4 +894,3 @@ def subtract_mutated_init(self, output_state_dict: dict[str, torch.Tensor], adap ) return tensors_lora - diff --git a/src/peft/utils/peft_types.py b/src/peft/utils/peft_types.py index 023fbaed78..00248b03d0 100644 --- a/src/peft/utils/peft_types.py +++ b/src/peft/utils/peft_types.py @@ -26,6 +26,7 @@ class PeftType(str, enum.Enum): - P_TUNING - PREFIX_TUNING - LORA + - ALORA - ADALORA - BOFT - ADAPTION_PROMPT @@ -48,6 +49,7 @@ class PeftType(str, enum.Enum): P_TUNING = "P_TUNING" PREFIX_TUNING = "PREFIX_TUNING" LORA = "LORA" + ALORA = "ALORA" ADALORA = "ADALORA" BOFT = "BOFT" ADAPTION_PROMPT = "ADAPTION_PROMPT" From 362509682d3d73a757362008efb14320276670c2 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Thu, 19 Jun 2025 16:08:53 +0000 Subject: [PATCH 04/99] fix init files --- src/peft/__init__.py | 4 ++++ src/peft/tuners/__init__.py | 3 +++ src/peft/tuners/alora/config.py | 2 +- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/peft/__init__.py b/src/peft/__init__.py index 51a42558d2..7381cd370f 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -50,6 +50,8 @@ AdaLoraModel, AdaptionPromptConfig, AdaptionPromptModel, + aLoraConfig, + aLoraModel, BOFTConfig, BOFTModel, BoneConfig, @@ -125,6 +127,8 @@ "AdaLoraModel", "AdaptionPromptConfig", "AdaptionPromptModel", + "aLoraConfig", + "aLoraModel", "AutoPeftModel", "AutoPeftModelForCausalLM", "AutoPeftModelForFeatureExtraction", diff --git a/src/peft/tuners/__init__.py b/src/peft/tuners/__init__.py index bb38230bf0..26fe17a73a 100644 --- a/src/peft/tuners/__init__.py +++ b/src/peft/tuners/__init__.py @@ -14,6 +14,7 @@ from .adalora import AdaLoraConfig, AdaLoraModel from .adaption_prompt import AdaptionPromptConfig, AdaptionPromptModel +from .alora import aLoraConfig, aLoraModel from .boft import BOFTConfig, BOFTModel from .bone import BoneConfig, BoneModel from .cpt import CPTConfig, CPTEmbedding @@ -51,6 +52,8 @@ "AdaLoraModel", "AdaptionPromptConfig", "AdaptionPromptModel", + "aLoraConfig", + "aLoraModel", "BOFTConfig", "BOFTModel", "BoneConfig", diff --git a/src/peft/tuners/alora/config.py b/src/peft/tuners/alora/config.py index 2c548a356f..9c0c60fa84 100644 --- a/src/peft/tuners/alora/config.py +++ b/src/peft/tuners/alora/config.py @@ -6,7 +6,7 @@ from torch import nn from peft.utils import PeftType -from peft import LoraConfig +from peft.tuners.lora import LoraConfig def _check_and_remove_unused_kwargs(cls, kwargs): From bde502186d73368118d6581617a19d9646ae4459 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Mon, 23 Jun 2025 17:22:07 +0000 Subject: [PATCH 05/99] bugfixes --- src/peft/peft_model.py | 19 +- src/peft/tuners/alora/config.py | 15 - src/peft/tuners/alora/layer.py | 704 +------ src/peft/tuners/alora/peft_model_alora.py | 2165 --------------------- 4 files changed, 15 insertions(+), 2888 deletions(-) delete mode 100644 
src/peft/tuners/alora/peft_model_alora.py diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index 49d2f6c4f9..80761fc4b6 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -1845,12 +1845,12 @@ def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optio offset_val = seq_len - best_match_start_idx alora_offsets[i] = offset_val if offset_val > 0 else -1 else: - warnings.warn( - f"Invocation string for adapter '{adapter_name_to_process}' not found in input row {i}. " - f"Input: {self.tokenizer.decode(input_ids[i]) if self.tokenizer else input_ids[i]}. " - f"Invocation: {self.peft_config[adapter_name_to_process].invocation_string}. " - "Adapter will be disabled for this row." - ) + #warnings.warn( + # f"Invocation string for adapter '{adapter_name_to_process}' not found in input row {i}. " + # f"Input: {self.tokenizer.decode(input_ids[i]) if self.tokenizer else input_ids[i]}. " + # f"Invocation: {self.peft_config[adapter_name_to_process].invocation_string}. " + # "Adapter will be disabled for this row." + #) alora_offsets[i] = -1 return alora_offsets @@ -2051,8 +2051,8 @@ def generate(self, *args, **kwargs): alora_offsets_from_kwargs = kwargs.get("alora_offsets") if alora_offsets_from_kwargs is None: current_input_ids = kwargs.get("input_ids") - if not args and not current_input_ids : # args[0] is usually input_ids - if args and isinstance(args[0], torch.Tensor) and args[0].dim() >=1 : + if not current_input_ids: # args[0] is usually input_ids + if args and isinstance(args[0], torch.Tensor): # and args[0].dim() >=1 : current_input_ids = args[0] else: current_input_ids = None @@ -2061,7 +2061,10 @@ def generate(self, *args, **kwargs): if current_input_ids.ndim == 1: current_input_ids = current_input_ids.unsqueeze(0) calculated_offsets = self._calculate_alora_offsets(current_input_ids, adapter_names=adapter_names_for_offset_calc) + for i in range(len(calculated_offsets)): + calculated_offsets[i] -= 1 kwargs['alora_offsets'] = calculated_offsets + else: warnings.warn("Cannot calculate aLoRA offsets during generate as input_ids are not available. Disabling aLoRA.") bs = 1 diff --git a/src/peft/tuners/alora/config.py b/src/peft/tuners/alora/config.py index 9c0c60fa84..96b8893707 100644 --- a/src/peft/tuners/alora/config.py +++ b/src/peft/tuners/alora/config.py @@ -9,21 +9,6 @@ from peft.tuners.lora import LoraConfig -def _check_and_remove_unused_kwargs(cls, kwargs): - """Make PEFT configs forward-compatible by removing unused kwargs that were added in later PEFT versions. - - This assumes that removing the unused kwargs will not affect the default behavior. - - Returns the filtered kwargs and the set of removed keys. 
- """ - # it's not pretty but eh - signature_parameters = inspect.signature(cls.__init__).parameters - unexpected_kwargs = set(kwargs.keys()) - set(signature_parameters.keys()) - for key in unexpected_kwargs: - del kwargs[key] - return kwargs, unexpected_kwargs - - @dataclass class aLoraConfig(LoraConfig): """ diff --git a/src/peft/tuners/alora/layer.py b/src/peft/tuners/alora/layer.py index fc97152659..e2c0c138e2 100644 --- a/src/peft/tuners/alora/layer.py +++ b/src/peft/tuners/alora/layer.py @@ -427,118 +427,6 @@ def __init__( ) self.is_target_conv_1d_layer = is_target_conv_1d_layer - # def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: - # """ - # Merge the active adapter weights into the base weights - - # Args: - # safe_merge (`bool`, *optional*): - # If True, the merge operation will be performed in a copy of the original weights and check for NaNs - # before merging the weights. This is useful if you want to check if the merge operation will produce - # NaNs. Defaults to `False`. - # adapter_names (`list[str]`, *optional*): - # The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults - # to `None`. - # """ - - # warnings.warn("NOT SUPPORTED FOR ASYMMETRIC LORA")# added - - # adapter_names = check_adapters_to_merge(self, adapter_names) - # if not adapter_names: - # # no adapter to merge - # return - - # for active_adapter in adapter_names: - # if active_adapter in self.lora_A.keys(): - # base_layer = self.get_base_layer() - # if safe_merge: - # # Note that safe_merge will be slower than the normal merge - # # because of the copy operation. - # orig_weights = base_layer.weight.data.clone() - # delta_weight = self.get_delta_weight(active_adapter) - # if not self.use_dora[active_adapter]: - # orig_weights += delta_weight - # else: - # # handle dora - # # since delta_weight already includes scaling, set it to 1 here - # weight_norm = ( - # self.lora_magnitude_vector[active_adapter] - # .get_weight_norm(orig_weights, transpose(delta_weight, self.fan_in_fan_out), scaling=1) - # .detach() - # ) - # # We need to cache weight_norm because it has to be based on the original weights. We - # # cannot calculate it on the fly based on the merged weights when unmerging because its a - # # different value - # self._cache_store(f"{active_adapter}-weight_norm", weight_norm) - # dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm - # dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out) - # orig_weights = dora_factor * (orig_weights + delta_weight) - - # if not torch.isfinite(orig_weights).all(): - # raise ValueError( - # f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" - # ) - - # base_layer.weight.data = orig_weights - - # if self.lora_bias[active_adapter]: - # new_bias = base_layer.bias + self.lora_B[active_adapter].bias - # if not torch.isfinite(new_bias).all(): - # raise ValueError( - # f"NaNs detected in the merged weights. 
The adapter {active_adapter} seems to be broken" - # ) - # base_layer.bias.data = new_bias - - # else: - # delta_weight = self.get_delta_weight(active_adapter) - # if not self.use_dora[active_adapter]: - # base_layer.weight.data += delta_weight - # else: - # # handle dora - # # since delta_weight already includes scaling, set it to 1 here - # weight_norm = ( - # self.lora_magnitude_vector[active_adapter] - # .get_weight_norm( - # base_layer.weight, transpose(delta_weight, self.fan_in_fan_out), scaling=1 - # ) - # .detach() - # ) - # # We need to cache weight_norm because it has to be based on the original weights. We - # # cannot calculate it on the fly based on the merged weights when unmerging because its a - # # different value - # self._cache_store(f"{active_adapter}-weight_norm", weight_norm) - # dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm - # dora_factor = transpose(dora_factor.view(-1, 1), self.fan_in_fan_out) - # new_weight = dora_factor * (base_layer.weight.data + delta_weight) - # base_layer.weight.data = new_weight - - # if self.lora_bias[active_adapter]: - # base_layer.bias.data += self.lora_B[active_adapter].bias - - # self.merged_adapters.append(active_adapter) - - # def unmerge(self) -> None: - # """ - # This method unmerges all merged adapter layers from the base weights. - # """ - # if not self.merged: - # warnings.warn("Already unmerged. Nothing to do.") - # return - # while len(self.merged_adapters) > 0: - # active_adapter = self.merged_adapters.pop() - # if active_adapter in self.lora_A.keys(): - # weight = self.get_base_layer().weight - # delta_weight = self.get_delta_weight(active_adapter) - # if not self.use_dora[active_adapter]: - # weight.data -= delta_weight - # else: - # weight_norm = self._cache_pop(f"{active_adapter}-weight_norm") - # dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm - # weight_orig = weight.data / dora_factor.view(-1, 1) - delta_weight - # weight.data = weight_orig - - # if self.lora_bias[active_adapter]: - # self.get_base_layer().bias.data -= self.lora_B[active_adapter].bias def get_delta_weight(self, adapter) -> torch.Tensor: """ @@ -603,14 +491,12 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: dropout = self.lora_dropout[active_adapter] scaling = self.scaling[active_adapter] x = x.to(lora_A.weight.dtype) - + if not self.use_dora[active_adapter]: # Only do the last k tokens -# print(f"{k} A {x[:,-k,:]}") - # k = 2 if len(ks) == 1: k = min(result.shape[1],ks[0]) -# print(k) + if k > 0: result[:,-k:,:] = result[:,-k:,:] + lora_B(lora_A(dropout(x[:,-k:,:]))) * scaling#dropout else: @@ -642,590 +528,8 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: def __repr__(self) -> str: rep = super().__repr__() - return "lora." 
+ rep - -#below not supported -# class Embedding(nn.Module, aLoraLayer): -# # LoRA implemented in a Embedding layer -# def __init__( -# self, -# base_layer: nn.Module, -# adapter_name: str, -# r: int = 0, -# lora_alpha: int = 1, -# lora_dropout: float = 0.0, -# init_lora_weights: Union[bool, str] = True, -# use_rslora: bool = False, -# use_dora: bool = False, -# lora_bias: bool = False, -# **kwargs, -# ) -> None: -# if lora_bias: -# # lora_bias=True is not supported (yet) for embedding layers, as they use nn.Parameter -# raise ValueError(f"lora_bias={lora_bias} is not supported for {self.__class__.__name__}.") - -# super().__init__() -# aLoraLayer.__init__(self, base_layer) - -# self._active_adapter = adapter_name -# self.update_layer( -# adapter_name, -# r, -# lora_alpha=lora_alpha, -# lora_dropout=lora_dropout, -# init_lora_weights=init_lora_weights, -# use_rslora=use_rslora, -# use_dora=use_dora, -# lora_bias=lora_bias, -# ) - -# def update_layer( -# self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora, lora_bias -# ): -# if r <= 0: -# raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") - -# self.r[adapter_name] = r -# self.lora_alpha[adapter_name] = lora_alpha -# if lora_dropout > 0.0: -# lora_dropout_layer = nn.Dropout(p=lora_dropout) -# else: -# lora_dropout_layer = nn.Identity() - -# self.lora_dropout[adapter_name] = lora_dropout_layer -# # Actual trainable parameters -# weight_A = torch.randn((r, self.in_features)) -# weight_B = torch.randn((self.out_features, r)) -# self.lora_embedding_A[adapter_name] = nn.Parameter(weight_A) -# self.lora_embedding_B[adapter_name] = nn.Parameter(weight_B) -# self.lora_bias[adapter_name] = lora_bias - -# if use_rslora: -# self.scaling[adapter_name] = lora_alpha / math.sqrt(r) -# else: -# self.scaling[adapter_name] = lora_alpha / r - -# if init_lora_weights == "loftq": -# self.loftq_init(adapter_name) -# elif init_lora_weights: -# self.reset_lora_parameters(adapter_name, init_lora_weights) - -# # call this before dora_init -# self._move_adapter_to_device_of_base_layer(adapter_name) - -# if use_dora: -# self.dora_init(adapter_name) -# self.use_dora[adapter_name] = True -# else: -# self.use_dora[adapter_name] = False - -# self.set_adapter(self.active_adapters) - -# def dora_init(self, adapter_name: str) -> None: -# if self.lora_magnitude_vector is None: -# # first dora layer being added, add lora_magnitude_vector to the list of learnable parameters -# self.adapter_layer_names = self.adapter_layer_names[:] + ("lora_magnitude_vector",) - -# # dora_layer = DoraEmbeddingLayer(fan_in_fan_out=True) -# # lora_embedding_A = self.lora_embedding_A[adapter_name] -# # lora_embedding_B = self.lora_embedding_B[adapter_name] -# # scaling = self.scaling[adapter_name] -# # dora_layer.update_layer( -# # base_layer=self.get_base_layer(), lora_A=lora_embedding_A, lora_B=lora_embedding_B, scaling=scaling -# # ) -# # self.lora_magnitude_vector[adapter_name] = dora_layer - -# def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: -# """ -# Merge the active adapter weights into the base weights - -# Args: -# safe_merge (`bool`, *optional*): -# If True, the merge operation will be performed in a copy of the original weights and check for NaNs -# before merging the weights. This is useful if you want to check if the merge operation will produce -# NaNs. Defaults to `False`. -# adapter_names (`list[str]`, *optional*): -# The list of adapter names that should be merged. 
If None, all active adapters will be merged. Defaults -# to `None`. -# """ -# warnings.warn("NOT SUPPORTED") -# adapter_names = check_adapters_to_merge(self, adapter_names) -# if not adapter_names: -# # no adapter to merge -# return - -# for active_adapter in adapter_names: -# if active_adapter in self.lora_embedding_A.keys(): -# base_layer = self.get_base_layer() -# if safe_merge: -# # Note that safe_merge will be slower than the normal merge -# # because of the copy operation. -# orig_weights = base_layer.weight.data.clone() -# orig_weights += self.get_delta_weight(active_adapter) - -# if not torch.isfinite(orig_weights).all(): -# raise ValueError( -# f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" -# ) - -# base_layer.weight.data = orig_weights -# else: -# base_layer.weight.data += self.get_delta_weight(active_adapter) -# self.merged_adapters.append(active_adapter) - -# def unmerge(self) -> None: -# """ -# This method unmerges all merged adapter layers from the base weights. -# """ -# if not self.merged: -# warnings.warn("Already unmerged. Nothing to do.") -# return -# while len(self.merged_adapters) > 0: -# active_adapter = self.merged_adapters.pop() -# if active_adapter in self.lora_embedding_A.keys(): -# self.get_base_layer().weight.data -= self.get_delta_weight(active_adapter) - -# def get_delta_weight(self, adapter) -> torch.Tensor: -# """ -# Compute the delta weight for the given adapter. - -# Args: -# adapter (str): -# The name of the adapter for which the delta weight should be computed. -# """ -# device = self.lora_embedding_B[adapter].device -# dtype = self.lora_embedding_A[adapter].dtype - -# # In case users wants to merge the adapter weights that are in -# # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to -# # (b)float16 because some CPUs have slow bf16/fp16 matmuls. -# cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16) - -# weight_A = self.lora_embedding_A[adapter] -# weight_B = self.lora_embedding_B[adapter] - -# if cast_to_fp32: -# weight_A = weight_A.float() -# weight_B = weight_B.float() - -# output_tensor = transpose(weight_B @ weight_A, True) * self.scaling[adapter] - -# if cast_to_fp32: -# output_tensor = output_tensor.to(dtype=dtype) - -# # cast back the weights -# self.lora_embedding_A[adapter] = weight_A.to(dtype) -# self.lora_embedding_B[adapter] = weight_B.to(dtype) - -# return output_tensor - -# def _mixed_batch_forward( -# self, x: torch.Tensor, *args: Any, adapter_names: list[str], **kwargs: Any -# ) -> torch.Tensor: -# # This is a special method that handles the case when users pass the argument `adapter_names`. This is an -# # extra argument that allows mixing different adapters in the same batch at inference time. 
-# result = self.base_layer(x, *args, **kwargs) - -# unique_adapters = set(adapter_names) -# sub_batch_indices_list = [] -# for adapter in unique_adapters: -# sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) - -# for i, active_adapter in enumerate(unique_adapters): -# if active_adapter == "__base__": -# continue -# if active_adapter not in self.lora_embedding_A.keys(): -# continue - -# embedding_A = self.lora_embedding_A[active_adapter].T -# embedding_B = self.lora_embedding_B[active_adapter].T -# scaling = self.scaling[active_adapter] - -# # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear -# # layer output -# sub_batch = x[sub_batch_indices_list[i]] -# after_A = self._embed(sub_batch, embedding_A) -# result[sub_batch_indices_list[i]] += (after_A @ embedding_B) * scaling - -# return result - -# def _embed(self, input: torch.Tensor, weight: torch.Tensor) -> torch.Tensor: -# base_layer = self.get_base_layer() -# return F.embedding( -# input, -# weight, -# padding_idx=base_layer.padding_idx, -# max_norm=base_layer.max_norm, -# norm_type=base_layer.norm_type, -# scale_grad_by_freq=base_layer.scale_grad_by_freq, -# sparse=base_layer.sparse, -# ) - -# def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: -# # TODO: no dtype conversion here, unlike in Linear, is that correct? -# self._check_forward_args(x, *args, **kwargs) -# adapter_names = kwargs.pop("adapter_names", None) - -# if self.disable_adapters: -# if self.merged: -# self.unmerge() -# result = self.base_layer(x, *args, **kwargs) -# elif adapter_names is not None: -# result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs) -# elif self.merged: -# result = self.base_layer(x, *args, **kwargs) -# else: -# result = self.base_layer(x, *args, **kwargs) -# torch_result_dtype = result.dtype -# for active_adapter in self.active_adapters: -# if active_adapter not in self.lora_embedding_A: -# continue -# embedding_A = self.lora_embedding_A[active_adapter].T -# embedding_B = self.lora_embedding_B[active_adapter].T -# scaling = self.scaling[active_adapter] - -# if not self.use_dora[active_adapter]: -# after_A = self._embed(x, embedding_A) -# result = result + (after_A @ embedding_B) * scaling -# else: -# mag_norm_scale, dora_result = self.lora_magnitude_vector[active_adapter]( -# x, -# lora_A=embedding_A, -# lora_B=embedding_B, -# scaling=scaling, -# base_layer=self.get_base_layer(), -# embed_fn=self._embed, -# ) -# result = mag_norm_scale * result + dora_result -# result = result.to(torch_result_dtype) - -# return result - -# def __repr__(self) -> str: -# rep = super().__repr__() -# return "lora." 
+ rep - - -# class _ConvNd(nn.Module, aLoraLayer): -# # Lora implemented in a conv(2,3)d layer -# def __init__( -# self, -# base_layer: nn.Module, -# adapter_name: str, -# r: int = 0, -# lora_alpha: int = 1, -# lora_dropout: float = 0.0, -# init_lora_weights: Union[bool, str] = True, -# use_rslora: bool = False, -# use_dora: bool = False, -# lora_bias: bool = False, -# **kwargs, -# ) -> None: -# super().__init__() -# aLoraLayer.__init__(self, base_layer) - -# self._active_adapter = adapter_name -# self._kernel_dim = base_layer.weight.dim() - -# self.update_layer( -# adapter_name, -# r, -# lora_alpha=lora_alpha, -# lora_dropout=lora_dropout, -# init_lora_weights=init_lora_weights, -# use_rslora=use_rslora, -# use_dora=use_dora, -# lora_bias=lora_bias, -# ) - -# def update_layer( -# self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora, lora_bias -# ): -# if r <= 0: -# raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") - -# self.r[adapter_name] = r -# self.lora_alpha[adapter_name] = lora_alpha -# if lora_dropout > 0.0: -# lora_dropout_layer = nn.Dropout(p=lora_dropout) -# else: -# lora_dropout_layer = nn.Identity() - -# self.lora_dropout[adapter_name] = lora_dropout_layer -# # Actual trainable parameters -# base_layer = self.get_base_layer() -# kernel_size = base_layer.kernel_size -# stride = base_layer.stride -# padding = base_layer.padding -# conv_layer = type(base_layer) -# out_kernel = out_stride = (1,) * (self._kernel_dim - 2) -# self.lora_A[adapter_name] = conv_layer(self.in_features, r, kernel_size, stride, padding, bias=False) -# self.lora_B[adapter_name] = conv_layer(r, self.out_features, out_kernel, out_stride, bias=lora_bias) -# self.lora_bias[adapter_name] = lora_bias - -# if use_rslora: -# self.scaling[adapter_name] = lora_alpha / math.sqrt(r) -# else: -# self.scaling[adapter_name] = lora_alpha / r - -# if init_lora_weights == "loftq": -# self.loftq_init(adapter_name) -# elif init_lora_weights: -# self.reset_lora_parameters(adapter_name, init_lora_weights) - -# # call this before dora_init -# self._move_adapter_to_device_of_base_layer(adapter_name) - -# if use_dora: -# self.dora_init(adapter_name) -# self.use_dora[adapter_name] = True -# else: -# self.use_dora[adapter_name] = False - -# self.set_adapter(self.active_adapters) - -# def _get_dora_factor_view(self): -# return (-1,) + (1,) * (self._kernel_dim - 1) - -# def dora_init(self, adapter_name: str) -> None: -# if self.lora_magnitude_vector is None: -# # first dora layer being added, add lora_magnitude_vector to the list of learnable parameters -# self.adapter_layer_names = self.adapter_layer_names[:] + ("lora_magnitude_vector",) - -# dora_layer_class = self._get_dora_layer_class() -# dora_layer = dora_layer_class(fan_in_fan_out=False) -# lora_A = self.lora_A[adapter_name].weight -# lora_B = self.lora_B[adapter_name].weight -# scaling = self.scaling[adapter_name] -# dora_layer.update_layer(base_layer=self.get_base_layer(), lora_A=lora_A, lora_B=lora_B, scaling=scaling) -# self.lora_magnitude_vector[adapter_name] = dora_layer - -# # def _get_dora_layer_class(self) -> type[_DoraConvNdLayer]: -# # # Subclasses should override this method to return the appropriate DoraLayer class -# # raise NotImplementedError - -# def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: -# """ -# Merge the active adapter weights inside the base weights - -# Args: -# safe_merge (`bool`, *optional*): -# If True, the merge operation will be 
performed in a copy of the original weights and check for NaNs -# before merging the weights. This is useful if you want to check if the merge operation will produce -# NaNs. Defaults to `False`. -# adapter_names (`list[str]`, *optional*): -# The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults -# to `None`. -# """ -# adapter_names = check_adapters_to_merge(self, adapter_names) -# if not adapter_names: -# # no adapter to merge -# return - -# for active_adapter in adapter_names: -# if active_adapter in self.lora_A.keys(): -# base_layer = self.get_base_layer() -# if safe_merge: -# # Note that safe_merge will be slower than the normal merge -# # because of the copy operation. -# orig_weights = base_layer.weight.data.clone() -# delta_weight = self.get_delta_weight(active_adapter) - -# if not self.use_dora[active_adapter]: -# orig_weights += delta_weight -# else: -# # handle dora -# # since delta_weight already includes scaling, set it to 1 here -# weight_norm = ( -# self.lora_magnitude_vector[active_adapter] -# .get_weight_norm(orig_weights, delta_weight, scaling=1) -# .detach() -# ) -# # We need to cache weight_norm because it has to be based on the original weights. We -# # cannot calculate it on the fly based on the merged weights when unmerging because its a -# # different value -# self._cache_store(f"{active_adapter}-weight_norm", weight_norm) -# dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm -# orig_weights = dora_factor.view(*self._get_dora_factor_view()) * (orig_weights + delta_weight) - -# if not torch.isfinite(orig_weights).all(): -# raise ValueError( -# f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" -# ) -# base_layer.weight.data = orig_weights - -# if self.lora_bias[active_adapter]: -# new_bias = base_layer.bias + self.lora_B[active_adapter].bias -# if not torch.isfinite(new_bias).all(): -# raise ValueError( -# f"NaNs detected in the merged weights. The adapter {active_adapter} seems to be broken" -# ) -# base_layer.bias.data = new_bias - -# else: -# delta_weight = self.get_delta_weight(active_adapter) -# if not self.use_dora[active_adapter]: -# base_layer.weight.data += delta_weight -# else: -# # handle dora -# # since delta_weight already includes scaling, set it to 1 here -# weight_norm = ( -# self.lora_magnitude_vector[active_adapter] -# .get_weight_norm(base_layer.weight, delta_weight, scaling=1) -# .detach() -# ) -# # We need to cache weight_norm because it has to be based on the original weights. We -# # cannot calculate it on the fly based on the merged weights when unmerging because its a -# # different value -# self._cache_store(f"{active_adapter}-weight_norm", weight_norm) -# dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm -# new_weight = dora_factor.view(*self._get_dora_factor_view()) * ( -# base_layer.weight.data + delta_weight -# ) -# base_layer.weight.data = new_weight - -# if self.lora_bias[active_adapter]: -# base_layer.bias.data += self.lora_B[active_adapter].bias - -# self.merged_adapters.append(active_adapter) - -# def unmerge(self) -> None: -# """ -# This method unmerges all merged adapter layers from the base weights. -# """ -# if not self.merged: -# warnings.warn("Already unmerged. 
Nothing to do.") -# return -# while len(self.merged_adapters) > 0: -# active_adapter = self.merged_adapters.pop() -# if active_adapter in self.lora_A.keys(): -# weight = self.get_base_layer().weight -# delta_weight = self.get_delta_weight(active_adapter) -# if not self.use_dora[active_adapter]: -# weight.data -= delta_weight -# else: -# weight_norm = self._cache_pop(f"{active_adapter}-weight_norm") -# dora_factor = self.lora_magnitude_vector[active_adapter].weight / weight_norm -# weight_orig = weight.data / dora_factor.view(*self._get_dora_factor_view()) - delta_weight -# weight.data = weight_orig - -# if self.lora_bias[active_adapter]: -# self.get_base_layer().bias.data -= self.lora_B[active_adapter].bias - -# def get_delta_weight(self, adapter) -> torch.Tensor: -# """ -# Compute the delta weight for the given adapter. - -# Args: -# adapter (str): -# The name of the adapter for which the delta weight should be computed. -# """ -# device = self.lora_B[adapter].weight.device -# dtype = self.lora_A[adapter].weight.dtype - -# # In case users wants to merge the adapter weights that are in -# # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to -# # (b)float16 because some CPUs have slow bf16/fp16 matmuls. -# cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16) - -# weight_A = self.lora_A[adapter].weight -# weight_B = self.lora_B[adapter].weight - -# if cast_to_fp32: -# weight_A = weight_A.float() -# weight_B = weight_B.float() - -# # https://github.com/bmaltais/kohya_ss/blob/feb6728762a8f463d15ba936d189d4c3abfaa1ab/networks/lora.py#L117 -# if self.get_base_layer().weight.size()[2:4] == (1, 1): -# # conv2d 1x1 -# output_tensor = (weight_B.squeeze(3).squeeze(2) @ weight_A.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze( -# 3 -# ) * self.scaling[adapter] -# else: -# output_tensor = ( -# self.conv_fn( -# weight_A.transpose(0, 1), -# weight_B, -# ).transpose(0, 1) -# * self.scaling[adapter] -# ) - -# if cast_to_fp32: -# output_tensor = output_tensor.to(dtype=dtype) - -# # cast back the weights -# self.lora_A[adapter].weight.data = weight_A.to(dtype) -# self.lora_B[adapter].weight.data = weight_B.to(dtype) - -# return output_tensor - -# def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: -# self._check_forward_args(x, *args, **kwargs) -# adapter_names = kwargs.pop("adapter_names", None) - -# if self.disable_adapters: -# if self.merged: -# self.unmerge() -# result = self.base_layer(x, *args, **kwargs) -# elif adapter_names is not None: -# result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs) -# elif self.merged: -# result = self.base_layer(x, *args, **kwargs) -# else: -# result = self.base_layer(x, *args, **kwargs) -# torch_result_dtype = result.dtype - -# for active_adapter in self.active_adapters: -# if active_adapter not in self.lora_A.keys(): -# continue -# lora_A = self.lora_A[active_adapter] -# lora_B = self.lora_B[active_adapter] -# dropout = self.lora_dropout[active_adapter] -# scaling = self.scaling[active_adapter] -# x = x.to(lora_A.weight.dtype) - -# if not self.use_dora[active_adapter]: -# result = result + lora_B(lora_A(dropout(x))) * scaling -# else: -# x = dropout(x) -# result = result + self.lora_magnitude_vector[active_adapter]( -# x, -# lora_A=lora_A, -# lora_B=lora_B, -# scaling=scaling, -# base_layer=self.get_base_layer(), -# ) - -# result = result.to(torch_result_dtype) -# return result - -# def __repr__(self) -> str: -# rep = 
super().__repr__() -# return "lora." + rep - - -# class Conv2d(_ConvNd): -# # Lora implemented in a conv2d layer -# def __init__(self, *args, **kwargs): -# super().__init__(*args, **kwargs) -# if not self._kernel_dim == 4: -# raise ValueError(f"Conv2d layer kernel must have 4 dimensions, not {self._kernel_dim}") -# self.conv_fn = F.conv2d - -# def _get_dora_layer_class(self): -# return 0#DoraConv2dLayer - - -# class Conv3d(_ConvNd): -# # Lora implemented in a conv3d layer -# def __init__(self, *args, **kwargs): -# super().__init__(*args, **kwargs) -# if not self._kernel_dim == 5: -# raise ValueError(f"Conv3d layer kernel must have 5 dimensions, not {self._kernel_dim}") -# self.conv_fn = F.conv3d - -# def _get_dora_layer_class(self): -# return 0#DoraConv3dLayer + return "alora." + rep + def dispatch_default( diff --git a/src/peft/tuners/alora/peft_model_alora.py b/src/peft/tuners/alora/peft_model_alora.py deleted file mode 100644 index 3e036bbaaa..0000000000 --- a/src/peft/tuners/alora/peft_model_alora.py +++ /dev/null @@ -1,2165 +0,0 @@ - - -from __future__ import annotations - -import collections -import copy -import inspect -import os -import warnings -from contextlib import contextmanager, nullcontext -from copy import deepcopy -from dataclasses import dataclass -from typing import Any, Literal, Optional, Union -import numpy as np -import packaging.version -import torch -import transformers -from accelerate import dispatch_model, infer_auto_device_map, init_empty_weights -from accelerate.hooks import AlignDevicesHook, add_hook_to_module, remove_hook_from_submodules -from accelerate.utils import get_balanced_memory, named_module_tensors -from huggingface_hub import HfFileSystem, ModelCard, ModelCardData, hf_hub_download -from safetensors import safe_open -from safetensors.torch import save_file as safe_save_file -from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss -from transformers import Cache, DynamicCache, EncoderDecoderCache, PreTrainedModel -from transformers.modeling_outputs import QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput -from transformers.utils import PushToHubMixin - -from peft.utils.constants import DUMMY_MODEL_CONFIG #, PEFT_TYPE_TO_PREFIX_MAPPING - -from peft import __version__ -from peft.config import PeftConfig -import enum -from peft import ( - #LoraConfig, - PeftModel, - #PeftModelForCausalLM, - #PeftModelForSequenceClassification, - get_peft_model, - prepare_model_for_kbit_training) - -from peft.import_utils import is_bnb_4bit_available, is_bnb_available, is_eetq_available - -from .config import aLoraConfig -#from .eva import get_eva_state_dict, initialize_lora_eva_weights -#from .gptq import QuantLinear -from .layer import Linear, aLoraLayer #Conv2d, Conv3d, Embedding, Linear, aLoraLayer -from .model import aLoraModel - -from peft.tuners.lora import ( - #LoraConfig, - #LoraModel, - LoftQConfig, - EvaConfig, - LoraRuntimeConfig, - get_eva_state_dict, - initialize_lora_eva_weights, -) -from peft.tuners import ( - AdaLoraModel, - AdaptionPromptModel, - BOFTModel, - BoneModel, - CPTEmbedding, - FourierFTModel, - HRAModel, - IA3Model, - LNTuningModel, - LoHaModel, - LoKrModel, - LoraModel, - MultitaskPromptEmbedding, - OFTModel, - PolyModel, - PrefixEncoder, - PromptEmbedding, - PromptEncoder, - VBLoRAModel, - VeraModel, - XLoraConfig, - XLoraModel, -) -from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer -from peft.utils import ( - SAFETENSORS_WEIGHTS_NAME, - 
TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, - WEIGHTS_NAME, - PeftType, - TaskType, - _get_batch_size, - _prepare_prompt_learning_config, - _set_adapter, - _set_trainable, - get_peft_model_state_dict, - id_tensor_storage, - infer_device, - load_peft_weights, - map_cache_to_layer_device_map, - set_peft_model_state_dict, - shift_tokens_right, -) - - -PEFT_TYPE_TO_MODEL_MAPPING = { - "LORA": aLoraModel, -} - - -class PeftType(str, enum.Enum): - LORA = "LORA" - - - -PEFT_TYPE_TO_PREFIX_MAPPING = { - PeftType.LORA: "lora_", -} - -#MODEL_TYPE_TO_PEFT_MODEL_MAPPING: dict[str, type[PeftModel]] = { - #"SEQ_CLS": PeftModelForSequenceClassification, - #"SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM, - # "CAUSAL_LM": PeftModelForCausalLM, - #"TOKEN_CLS": PeftModelForTokenClassification, - #"QUESTION_ANS": PeftModelForQuestionAnswering, - #"FEATURE_EXTRACTION": PeftModelForFeatureExtraction, - # } - -PEFT_TYPE_TO_CONFIG_MAPPING: dict[str, type[PeftConfig]] = { - #"ADAPTION_PROMPT": AdaptionPromptConfig, - #"PROMPT_TUNING": PromptTuningConfig, - #"PREFIX_TUNING": PrefixTuningConfig, - #"P_TUNING": PromptEncoderConfig, - "LORA": aLoraConfig, - #"LOHA": LoHaConfig, - #"LORAPLUS": LoraConfig, - #"LOKR": LoKrConfig, - #"ADALORA": AdaLoraConfig, - #"BOFT": BOFTConfig, - #"IA3": IA3Config, - #"MULTITASK_PROMPT_TUNING": MultitaskPromptTuningConfig, - #"OFT": OFTConfig, - #"POLY": PolyConfig, - #"LN_TUNING": LNTuningConfig, - #"VERA": VeraConfig, - #"FOURIERFT": FourierFTConfig, - #"XLORA": XLoraConfig, - #"HRA": HRAConfig, - #"VBLORA": VBLoRAConfig, - #"CPT": CPTConfig, - #"BONE": BoneConfig, - } -class PeftModelALoRA(PeftModel): - """ - Base model encompassing various Peft methods. - - Args: - model ([`~transformers.PreTrainedModel`]): The base transformer model used for Peft. - peft_config ([`PeftConfig`]): The configuration of the Peft model. - adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`. - autocast_adapter_dtype (`bool`, *optional*): - Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights - using float16 and bfloat16 to float32, as this is typically required for stable training, and only affect - select PEFT tuners. - low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): - Create empty adapter weights on meta device. Useful to speed up the loading loading process. - - - - Don't use `low_cpu_mem_usage=True` when creating a new PEFT adapter for training. - - - - **Attributes**: - - **base_model** ([`torch.nn.Module`]) -- The base transformer model used for Peft. - - **peft_config** ([`PeftConfig`]) -- The configuration of the Peft model. - - **modules_to_save** (`list` of `str`) -- The list of sub-module names to save when - saving the model. - - **prompt_encoder** ([`PromptEncoder`]) -- The prompt encoder used for Peft if - using [`PromptLearningConfig`]. - - **prompt_tokens** (`torch.Tensor`) -- The virtual prompt tokens used for Peft if - using [`PromptLearningConfig`]. - - **transformer_backbone_name** (`str`) -- The name of the transformer - backbone in the base model if using [`PromptLearningConfig`]. - - **word_embeddings** (`torch.nn.Embedding`) -- The word embeddings of the transformer backbone - in the base model if using [`PromptLearningConfig`]. 
- """ - - def __init__( - self, - model: PreTrainedModel, - peft_config: PeftConfig, - adapter_name: str = "default", - autocast_adapter_dtype: bool = True, - low_cpu_mem_usage: bool = False, - response_token_ids = None - ) -> None: - super().__init__(model, peft_config, adapter_name) - self.modules_to_save = None - self.active_adapter = adapter_name - self.peft_type = peft_config.peft_type - # These args are special PEFT arguments that users can pass. They need to be removed before passing them to - # forward. - self.special_peft_forward_args = {"adapter_names","alora_offsets"} - self.response_token_ids = response_token_ids - self._is_prompt_learning = peft_config.is_prompt_learning - if self._is_prompt_learning: - self._peft_config = {adapter_name: peft_config} - self.base_model = model - self.add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage) - else: - self._peft_config = None - cls = PEFT_TYPE_TO_MODEL_MAPPING[peft_config.peft_type] - ctx = init_empty_weights if low_cpu_mem_usage else nullcontext - with ctx(): - self.base_model = cls(model, {adapter_name: peft_config}, adapter_name) - self.set_additional_trainable_modules(peft_config, adapter_name) - - if hasattr(self.base_model, "_cast_adapter_dtype"): - self.base_model._cast_adapter_dtype( - adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype - ) - - if getattr(model, "is_gradient_checkpointing", True): - model = self._prepare_model_for_gradient_checkpointing(model) - - # the `pretraining_tp` is set for some models to simulate Tensor Parallelism during inference to avoid - # numerical differences, https://github.com/pytorch/pytorch/issues/76232 - to avoid any unexpected - # behavior we disable that in this line. - if hasattr(self.base_model, "config") and hasattr(self.base_model.config, "pretraining_tp"): - self.base_model.config.pretraining_tp = 1 - - @property - def peft_config(self) -> dict[str, PeftConfig]: - if self._is_prompt_learning: - return self._peft_config - return self.base_model.peft_config - - @property - def active_adapters(self) -> list[str]: - try: - adapters = self.base_model.active_adapters - if not isinstance(adapters, list): - # Base model is probably a transformers model, see: - # https://github.com/huggingface/transformers/pull/30790#issuecomment-2253808249 - # Unfortunately, transformers models also have an active_adapters method but it's 1) not a property and - # 2) calling it fails because the base model (usually) has no loaded adapter. The base model can be a - # transformers model for prompt learning, where the base model is not wrapped in a LoraModel or similar. 
- adapters = self.active_adapter - if isinstance(adapters, str): - adapters = [adapters] - except AttributeError: - adapters = self.active_adapter - if isinstance(adapters, str): - adapters = [adapters] - return adapters - - @peft_config.setter - def peft_config(self, value: dict[str, PeftConfig]): - if self._is_prompt_learning: - self._peft_config = value - else: - self.base_model.peft_config = value - - def save_pretrained( - self, - save_directory: str, - safe_serialization: bool = True, - selected_adapters: Optional[list[str]] = None, - save_embedding_layers: Union[str, bool] = "auto", - is_main_process: bool = True, - path_initial_model_for_weight_conversion: Optional[str] = None, - **kwargs: Any, - ) -> None: - r""" - This function saves the adapter model and the adapter configuration files to a directory, so that it can be - reloaded using the [`PeftModel.from_pretrained`] class method, and also used by the [`PeftModel.push_to_hub`] - method. - - Args: - save_directory (`str`): - Directory where the adapter model and configuration files will be saved (will be created if it does not - exist). - safe_serialization (`bool`, *optional*): - Whether to save the adapter files in safetensors format, defaults to `True`. - selected_adapters (`List[str]`, *optional*): - A list of adapters to be saved. If `None`, will default to all adapters. - save_embedding_layers (`Union[bool, str]`, *optional*, defaults to `"auto"`): - If `True`, save the embedding layers in addition to adapter weights. If `auto`, checks the common - embedding layers `peft.utils.other.EMBEDDING_LAYER_NAMES` in config's `target_modules` when available. - and automatically sets the boolean flag. This only works for 🤗 transformers models. - is_main_process (`bool`, *optional*): - Whether the process calling this is the main process or not. Will default to `True`. Will not save the - checkpoint if not on the main process, which is important for multi device setups (e.g. DDP). - path_initial_model_for_weight_conversion (`str, *optional*`): - The path to the initialized adapter, which is obtained after initializing the model with PiSSA or OLoRA - and before performing any training. When `path_initial_model_for_weight_conversion` is not None, the - difference in adapter before and after fine-tuning is calculated. This difference can be represented as - the parameters of a standard LoRA adapter. Using this converted adapter does not require changes to the - base model, thus conveniently allowing the use of multiple PiSSA or OLoRA adapters with LoRA adapters, - and the activation or deactivation of any adapters. Note that this conversion is not supported if - `rslora` is used in combination with `rank_pattern` or `alpha_pattern`. - kwargs (additional keyword arguments, *optional*): - Additional keyword arguments passed along to the `push_to_hub` method. - - """ - if os.path.isfile(save_directory): - raise ValueError(f"Provided path ({save_directory}) should be a directory, not a file") - - if selected_adapters is None: - selected_adapters = list(self.peft_config.keys()) - else: - if any( - selected_adapter_name not in list(self.peft_config.keys()) - for selected_adapter_name in selected_adapters - ): - raise ValueError( - f"You passed an invalid `selected_adapters` arguments, current supported adapter names are" - f" {list(self.peft_config.keys())} - got {selected_adapters}." 
- ) - - def save_mutated_as_lora(peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs): - if peft_config.use_rslora and (peft_config.rank_pattern or peft_config.alpha_pattern): - msg = ( - "Passing `path_initial_model_for_weight_conversion` to `save_pretrained` is not supported when " - "using `rank_pattern` or `alpha_pattern` at the same time as `use_rslora=True`." - ) - raise ValueError(msg) - - if not any( - str(peft_config.init_lora_weights).lower().startswith(prefix) for prefix in ["pissa", "olora", "true"] - ): - warnings.warn( - "`path_initial_model_for_weight_conversion` only works for converting a PiSSA or OLoRA adapter to " - "a LoRA adapter" - ) - initial_adapter_name = os.path.basename(path_initial_model_for_weight_conversion) - try: - self.load_adapter( - os.path.dirname(path_initial_model_for_weight_conversion), - subfolder=initial_adapter_name, - adapter_name=initial_adapter_name, - ) - is_pissa = str(self.peft_config[initial_adapter_name].init_lora_weights).lower().startswith("pissa") - is_olora = str(self.peft_config[initial_adapter_name].init_lora_weights).lower() == "olora" - if is_pissa or is_olora: - raise ValueError( - "The `init_lora_weights` parameter of the initial adapter should be set to `True`. " - "Otherwise, `self.load_adapter` will subtract the decomposed values again based on the " - "residual model." - ) - output_state_dict = self.base_model.subtract_mutated_init( - output_state_dict, initial_adapter_name, kwargs - ) - finally: - self.delete_adapter(initial_adapter_name) - return output_state_dict - - if is_main_process: - os.makedirs(save_directory, exist_ok=True) - self.create_or_update_model_card(save_directory) - - for adapter_name in selected_adapters: - peft_config = self.peft_config[adapter_name] - # save only the trainable weights - output_state_dict = get_peft_model_state_dict( - self, - state_dict=kwargs.get("state_dict", None), - adapter_name=adapter_name, - save_embedding_layers=save_embedding_layers, - ) - output_dir = os.path.join(save_directory, adapter_name) if adapter_name != "default" else save_directory - os.makedirs(output_dir, exist_ok=True) - - if is_main_process and safe_serialization: - # Section copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L2111-L2134 - # Safetensors does not allow tensor aliasing. - # We're going to remove aliases before saving - ptrs = collections.defaultdict(list) - for name, tensor in output_state_dict.items(): - # Sometimes in the state_dict we have non-tensor objects. - # e.g. in bitsandbytes we have some `str` objects in the state_dict - if isinstance(tensor, torch.Tensor): - ptrs[id_tensor_storage(tensor)].append(name) - else: - # In the non-tensor case, fall back to the pointer of the object itself - ptrs[id(tensor)].append(name) - - # These are all the pointers of shared tensors. - shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1} - - for _, names in shared_ptrs.items(): - # Here we just clone the shared tensors to avoid tensor aliasing which is - # not supported in safetensors. 
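The safetensors branch above has to break tensor aliasing before saving. The sketch below reproduces that step in isolation; it uses raw storage pointers instead of transformers' `id_tensor_storage`, so it is an approximation of the workaround rather than the exact helper used here.

```py
# Approximate, torch-only version of the aliasing workaround: safetensors
# rejects tensors that share storage, so every alias after the first is cloned.
import collections

import torch


def clone_shared_tensors(state_dict):
    ptrs = collections.defaultdict(list)
    for name, tensor in state_dict.items():
        if isinstance(tensor, torch.Tensor):
            ptrs[tensor.untyped_storage().data_ptr()].append(name)
    for names in ptrs.values():
        for shared_name in names[1:]:  # keep the first alias, clone the rest
            state_dict[shared_name] = state_dict[shared_name].clone()
    return state_dict


weight = torch.randn(4, 4)
sd = clone_shared_tensors({"a.weight": weight, "b.weight": weight})
assert sd["a.weight"].untyped_storage().data_ptr() != sd["b.weight"].untyped_storage().data_ptr()
```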
- for shared_tensor_name in names[1:]: - output_state_dict[shared_tensor_name] = output_state_dict[shared_tensor_name].clone() - if path_initial_model_for_weight_conversion is not None: - peft_config = copy.deepcopy(peft_config) - peft_config.init_lora_weights = True - peft_config.save_pretrained(path_initial_model_for_weight_conversion) - output_state_dict = save_mutated_as_lora( - peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs - ) - safe_save_file( - output_state_dict, - os.path.join(output_dir, SAFETENSORS_WEIGHTS_NAME), - metadata={"format": "pt"}, - ) - elif is_main_process: - if path_initial_model_for_weight_conversion is not None: - peft_config = copy.deepcopy(peft_config) - peft_config.init_lora_weights = True - peft_config.save_pretrained(path_initial_model_for_weight_conversion) - output_state_dict = save_mutated_as_lora( - peft_config, path_initial_model_for_weight_conversion, output_state_dict, kwargs - ) - torch.save(output_state_dict, os.path.join(output_dir, WEIGHTS_NAME)) - - # save the config and change the inference mode to `True` - if peft_config.base_model_name_or_path is None: - peft_config.base_model_name_or_path = ( - self.base_model.__dict__.get("name_or_path", None) - if peft_config.is_prompt_learning - else self.base_model.model.__dict__.get("name_or_path", None) - ) - inference_mode = peft_config.inference_mode - peft_config.inference_mode = True - - if peft_config.task_type is None: - # deal with auto mapping - base_model_class = self._get_base_model_class( - is_prompt_tuning=peft_config.is_prompt_learning, - ) - parent_library = base_model_class.__module__ - - auto_mapping_dict = { - "base_model_class": base_model_class.__name__, - "parent_library": parent_library, - } - else: - auto_mapping_dict = None - - if is_main_process: - if path_initial_model_for_weight_conversion is not None: - peft_config.init_lora_weights = True - peft_config.r *= 2 - if not peft_config.use_rslora: - peft_config.lora_alpha *= 2 - else: - # with rslora, we have scaling = alpha / sqrt(r), we thus adjust alpha to keep the same scaling - peft_config.lora_alpha *= 2**0.5 - - if peft_config.rank_pattern: - peft_config.rank_pattern = {key: 2 * val for key, val in peft_config.rank_pattern.items()} - if peft_config.alpha_pattern: - peft_config.alpha_pattern = {key: 2 * val for key, val in peft_config.alpha_pattern.items()} - - peft_config.save_pretrained(output_dir, auto_mapping_dict=auto_mapping_dict) - peft_config.inference_mode = inference_mode - - @classmethod - def from_pretrained( - cls, - model: torch.nn.Module, - model_id: Union[str, os.PathLike], - adapter_name: str = "default", - is_trainable: bool = False, - config: Optional[PeftConfig] = None, - autocast_adapter_dtype: bool = True, - ephemeral_gpu_offload: bool = False, - low_cpu_mem_usage: bool = False, - ks = None,#new - response_token_ids = None, #new - **kwargs: Any, - ) -> PeftModel: - r""" - Instantiate a PEFT model from a pretrained model and loaded PEFT weights. - - Note that the passed `model` may be modified inplace. - - Args: - model ([`torch.nn.Module`]): - The model to be adapted. For 🤗 Transformers models, the model should be initialized with the - [`~transformers.PreTrainedModel.from_pretrained`]. - model_id (`str` or `os.PathLike`): - The name of the PEFT configuration to use. Can be either: - - A string, the `model id` of a PEFT configuration hosted inside a model repo on the Hugging Face - Hub. 
-                    - A path to a directory containing a PEFT configuration file saved using the `save_pretrained`
-                      method (`./my_peft_config_directory/`).
-            adapter_name (`str`, *optional*, defaults to `"default"`):
-                The name of the adapter to be loaded. This is useful for loading multiple adapters.
-            is_trainable (`bool`, *optional*, defaults to `False`):
-                Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be
-                used for inference.
-            config ([`~peft.PeftConfig`], *optional*):
-                The configuration object to use instead of an automatically loaded configuration. This configuration
-                object is mutually exclusive with `model_id` and `kwargs`. This is useful when configuration is already
-                loaded before calling `from_pretrained`.
-            autocast_adapter_dtype (`bool`, *optional*):
-                Whether to autocast the adapter dtype. Defaults to `True`. Only relevant for specific adapter types.
-            ephemeral_gpu_offload (`bool`, *optional*):
-                Whether to use ephemeral GPU offloading for partially loaded modules. Defaults to `False`. This is
-                useful when parts of the model and/or components (such as adapters) are kept in CPU memory until they
-                are needed. Rather than perform expensive operations on small data, the data is transferred to the GPU
-                on-demand, the operation(s) performed, and the results moved back to CPU memory. This brings a slight
-                momentary VRAM overhead but gives orders of magnitude speedup in certain cases.
-            low_cpu_mem_usage (`bool`, `optional`, defaults to `False`):
-                Create empty adapter weights on meta device before loading the saved weights. Useful to speed up the
-                process.
-            torch_device (`str`, *optional*, defaults to None):
-                The device to load the adapter on. If `None`, the device will be inferred.
-            kwargs: (`optional`):
-                Additional keyword arguments passed along to the specific PEFT configuration class.
- """ - - -# from .mapping import MODEL_TYPE_TO_PEFT_MODEL_MAPPING, PEFT_TYPE_TO_CONFIG_MAPPING - MODEL_TYPE_TO_PEFT_MODEL_MAPPING: dict[str, type[PeftModel]] = { - #"SEQ_CLS": PeftModelForSequenceClassification, - #"SEQ_2_SEQ_LM": PeftModelForSeq2SeqLM, - "CAUSAL_LM": aLoRAPeftModelForCausalLM, - #"TOKEN_CLS": PeftModelForTokenClassification, - #"QUESTION_ANS": PeftModelForQuestionAnswering, - #"FEATURE_EXTRACTION": PeftModelForFeatureExtraction, - } - - PEFT_TYPE_TO_CONFIG_MAPPING: dict[str, type[PeftConfig]] = { - #"ADAPTION_PROMPT": AdaptionPromptConfig, - #"PROMPT_TUNING": PromptTuningConfig, - #"PREFIX_TUNING": PrefixTuningConfig, - #"P_TUNING": PromptEncoderConfig, - "LORA": aLoraConfig, - #"LOHA": LoHaConfig, - #"LORAPLUS": LoraConfig, - #"LOKR": LoKrConfig, - #"ADALORA": AdaLoraConfig, - #"BOFT": BOFTConfig, - #"IA3": IA3Config, - #"MULTITASK_PROMPT_TUNING": MultitaskPromptTuningConfig, - #"OFT": OFTConfig, - #"POLY": PolyConfig, - #"LN_TUNING": LNTuningConfig, - #"VERA": VeraConfig, - #"FOURIERFT": FourierFTConfig, - #"XLORA": XLoraConfig, - #"HRA": HRAConfig, - #"VBLORA": VBLoRAConfig, - #"CPT": CPTConfig, - #"BONE": BoneConfig, - } - -# self.disable_adapters = False #NEW - # load the config - if config is None: - config = PEFT_TYPE_TO_CONFIG_MAPPING[ - PeftConfig._get_peft_type( - model_id, - subfolder=kwargs.get("subfolder", None), - revision=kwargs.get("revision", None), - cache_dir=kwargs.get("cache_dir", None), - use_auth_token=kwargs.get("use_auth_token", None), - token=kwargs.get("token", None), - ) - ].from_pretrained(model_id, **kwargs) - elif isinstance(config, PeftConfig): - config.inference_mode = not is_trainable - else: - raise ValueError(f"The input config must be a PeftConfig, got {config.__class__}") - - # Runtime configuration, if supported - if hasattr(config, "runtime_config"): - config.runtime_config.ephemeral_gpu_offload = ephemeral_gpu_offload - else: - if ephemeral_gpu_offload: - warnings.warn("Ephemeral GPU offloading is not supported for this model. Ignoring.") - - if hasattr(model, "hf_device_map"): - weight_map = dict(named_module_tensors(model, recurse=True)) - - # recreate the offload_index for disk-offloaded modules: we need to know the location in storage of each weight - # before the offload hook is removed from the model - disk_modules = set() - index = None - for name, module in model.named_modules(): - if hasattr(module, "_hf_hook") and hasattr(module._hf_hook, "original_devices"): - if hasattr(module._hf_hook.weights_map, "dataset"): - index = module._hf_hook.weights_map.dataset.index - for key in module._hf_hook.original_devices.keys(): - if module._hf_hook.original_devices[key] == torch.device("meta"): - disk_modules.add(str(name) + "." 
+ str(key)) - - if disk_modules and not kwargs.get("use_safetensors", True): - raise ValueError("Disk offloading currently only supported for safetensors") - - if index: - offload_index = { - p: { - "safetensors_file": index[p]["safetensors_file"], - "weight_name": p, - "dtype": str(weight_map[p].dtype).replace("torch.", ""), - } - for p in weight_map.keys() - if p in disk_modules - } - kwargs["offload_index"] = offload_index - - if (getattr(model, "hf_device_map", None) is not None) and len( - set(model.hf_device_map.values()).intersection({"cpu", "disk"}) - ) > 0: - remove_hook_from_submodules(model) - - if config.is_prompt_learning and is_trainable: - raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.") - else: - config.inference_mode = not is_trainable - if isinstance(getattr(model, "base_model", None), XLoraModel): - if not isinstance(config, XLoraConfig): - raise TypeError(f"Expected 'XLoraConfig', got '{type(config)}' instead.") - if "adapters" in kwargs: - config.adapters = kwargs["adapters"] - else: - # If the path is on HF hub, then we get the adapter names to create a subfolders list which tells - # `load_adapter` where the adapters are. - if not os.path.exists(model_id): - s = HfFileSystem() - - # The names of the adapters which must be in folders - adapter_names = [ - file["name"][len(model_id) + 1 :] for file in s.ls(model_id) if file["type"] == "directory" - ] - # Prepare a dict of adapter paths, which really just point to the hf id; we will use the subfolders - adapter_paths = {} - for adapter_name in adapter_names: - adapter_paths[adapter_name] = os.path.join(model_id, model_id) - config.adapters = adapter_paths - config._subfolders = adapter_names - else: - if "adapters" not in kwargs: - raise ValueError("If model_id is a local path, then `adapters` must be passed in kwargs.") - - if config.task_type not in MODEL_TYPE_TO_PEFT_MODEL_MAPPING.keys(): - model = cls( - model, - config, - adapter_name, - autocast_adapter_dtype=autocast_adapter_dtype, - low_cpu_mem_usage=low_cpu_mem_usage, - ) - else: - model = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[config.task_type]( - model, - config, - adapter_name, - autocast_adapter_dtype=autocast_adapter_dtype, - low_cpu_mem_usage=low_cpu_mem_usage, - ) - - load_result = model.load_adapter( - model_id, - adapter_name, - is_trainable=is_trainable, - autocast_adapter_dtype=autocast_adapter_dtype, - low_cpu_mem_usage=low_cpu_mem_usage, - **kwargs, - ) - - # 1. Remove VB-LoRA vector bank, since it's a shared parameter set via the VBLoRAModel - # 2. Remove the prompt encoder, as it does not need to be part of the checkpoint - missing_keys = [ - k for k in load_result.missing_keys if "vblora_vector_bank" not in k and "prompt_encoder" not in k - ] - if missing_keys: - # Let's warn here since (in contrast to load_adapter) we don't return the load result, so it could be quite - # difficult for users to even notice that something might have gone wrong here. As we filter out non PEFT - # keys from the missing keys, this gives no false positives. 
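A hypothetical end-to-end call of the loading path defined in this method. The model and adapter ids, the invocation string, and the import location of `PeftModelALoRA` are placeholders or assumptions; the only details taken from the code are the `response_token_ids` keyword and its expected form, a list of 1-D token-id tensors for the invocation string.

```py
# Hypothetical usage of the from_pretrained() path above; ids are placeholders.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from peft import PeftModelALoRA  # assumed export; adjust to wherever this class is exposed

base_id = "my-org/my-base-model"        # placeholder
adapter_id = "my-org/my-alora-adapter"  # placeholder
invocation_string = "<|assistant|>"     # must match the adapter's aLoraConfig

tokenizer = AutoTokenizer.from_pretrained(base_id)
base_model = AutoModelForCausalLM.from_pretrained(base_id)

# Token ids of the invocation string, without special tokens, as a list of
# 1-D tensors, which is the form the offset search in forward()/generate() expects.
response_token_ids = [
    torch.tensor(tokenizer(invocation_string, add_special_tokens=False)["input_ids"])
]

peft_model = PeftModelALoRA.from_pretrained(
    base_model, adapter_id, response_token_ids=response_token_ids
)
```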
- warnings.warn(f"Found missing adapter keys while loading the checkpoint: {missing_keys}") - - model.ks = ks - model.response_token_ids = response_token_ids - model.disable_adapters = False #NEW - return model - - def _setup_prompt_encoder(self, adapter_name: str): - config = self.peft_config[adapter_name] - if not hasattr(self, "prompt_encoder"): - self.prompt_encoder = torch.nn.ModuleDict({}) - self.prompt_tokens = {} - transformer_backbone = None - for name, module in self.base_model.named_children(): - for param in module.parameters(): - param.requires_grad = False - if isinstance(module, PreTrainedModel): - # Make sure to freeze Tranformers model - if transformer_backbone is None: - transformer_backbone = module - self.transformer_backbone_name = name - if transformer_backbone is None: - transformer_backbone = self.base_model - - if config.num_transformer_submodules is None: - config.num_transformer_submodules = 2 if config.task_type == TaskType.SEQ_2_SEQ_LM else 1 - - # determine the word embeddings - word_embeddings = None - try: - # First try to find the word embeddings based on the module name, this should work for models like Bert, - # Roberta, Deberta, etc. - word_embeddings = self.base_model.get_submodule("embeddings.word_embeddings") - except AttributeError: - pass - - if word_embeddings is None: - # Word embeddings could not be determined. Next try to guess them by checking which parameter has the size - # of the vocab. - for named_param, value in list(transformer_backbone.named_parameters()): - # for ZeRO-3, the tensor is sharded across accelerators and deepspeed modifies it to a tensor with shape - # [0] the actual unsharded shape is stored in "ds_shape" attribute special handling is needed in case - # the model is initialized in deepspeed.zero.Init() context or HfDeepSpeedConfig has been called before - # For reference refer to issue: https://github.com/huggingface/peft/issues/996 - deepspeed_distributed_tensor_shape = getattr(value, "ds_shape", None) - - if value.shape[0] == self.base_model.config.vocab_size or ( - deepspeed_distributed_tensor_shape is not None - and deepspeed_distributed_tensor_shape[0] == self.base_model.config.vocab_size - ): - word_embeddings = transformer_backbone.get_submodule(named_param.replace(".weight", "")) - break - - self.word_embeddings = word_embeddings - - if config.peft_type == PeftType.PROMPT_TUNING: - prompt_encoder = PromptEmbedding(config, self.word_embeddings) - elif config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: - prompt_encoder = MultitaskPromptEmbedding(config, self.word_embeddings) - elif config.peft_type == PeftType.P_TUNING: - prompt_encoder = PromptEncoder(config) - elif config.peft_type == PeftType.PREFIX_TUNING: - # prefix tuning now uses Cache but that won't work with gradient checkpointing - if any(getattr(module, "gradient_checkpointing", False) for module in self.get_base_model().modules()): - raise ValueError("Prefix tuning does not work with gradient checkpointing.") - prompt_encoder = PrefixEncoder(config) - elif config.peft_type == PeftType.CPT: - prompt_encoder = CPTEmbedding(config, self.word_embeddings) - else: - raise ValueError("Not supported") - - prompt_encoder = prompt_encoder.to(self.device) - self.prompt_encoder.update(torch.nn.ModuleDict({adapter_name: prompt_encoder})) - self.prompt_tokens[adapter_name] = torch.arange( - config.num_virtual_tokens * config.num_transformer_submodules - ).long() - - def _prepare_model_for_gradient_checkpointing(self, model: PreTrainedModel): - r""" - Prepares the 
model for gradient checkpointing if necessary - """ - if not ( - getattr(model, "is_loaded_in_8bit", False) - or getattr(model, "is_loaded_in_4bit", False) - or getattr(model, "is_quantized", False) - ): - if hasattr(model, "enable_input_require_grads"): - model.enable_input_require_grads() - elif hasattr(model, "get_input_embeddings"): - - def make_inputs_require_grad(module, input, output): - output.requires_grad_(True) - - model.get_input_embeddings().register_forward_hook(make_inputs_require_grad) - return model - - def get_prompt_embedding_to_save(self, adapter_name: str) -> torch.Tensor: - """ - Returns the prompt embedding to save when saving the model. Only applicable when using a prompt learning - method. - """ - prompt_encoder = self.prompt_encoder[adapter_name] - prompt_tokens = ( - self.prompt_tokens[adapter_name].unsqueeze(0).expand(1, -1).to(prompt_encoder.embedding.weight.device) - ) - if self.peft_config[adapter_name].peft_type == PeftType.PREFIX_TUNING: - prompt_tokens = prompt_tokens[:, : self.peft_config[adapter_name].num_virtual_tokens] - - if self.peft_config[adapter_name].peft_type == PeftType.MULTITASK_PROMPT_TUNING: - prompt_embeddings = super(MultitaskPromptEmbedding, prompt_encoder).forward(prompt_tokens) - else: - prompt_embeddings = prompt_encoder(prompt_tokens) - - return prompt_embeddings[0].detach().cpu() - - def get_prompt(self, batch_size: int, task_ids: Optional[torch.Tensor] = None) -> torch.Tensor: - """ - Returns the virtual prompts to use for Peft. Only applicable when using a prompt learning method. - """ - peft_config = self.active_peft_config - prompt_encoder = self.prompt_encoder[self.active_adapter] - prompt_tokens = ( - self.prompt_tokens[self.active_adapter] - .unsqueeze(0) - .expand(batch_size, -1) - .to(prompt_encoder.embedding.weight.device) - ) - if peft_config.peft_type == PeftType.PREFIX_TUNING: - prompt_tokens = prompt_tokens[:, : peft_config.num_virtual_tokens] - if peft_config.inference_mode: - past_key_values = prompt_encoder.embedding.weight.repeat(batch_size, 1, 1) - else: - past_key_values = prompt_encoder(prompt_tokens) - if self.base_model_torch_dtype is not None: - past_key_values = past_key_values.to(self.base_model_torch_dtype) - past_key_values = past_key_values.view( - batch_size, - peft_config.num_virtual_tokens, - peft_config.num_layers * 2, - peft_config.num_attention_heads, - peft_config.token_dim // peft_config.num_attention_heads, - ) - if peft_config.num_transformer_submodules == 2: - past_key_values = torch.cat([past_key_values, past_key_values], dim=2) - past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split( - peft_config.num_transformer_submodules * 2 - ) - if TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING.get(self.config.model_type, None) is not None: - post_process_fn = TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING[self.config.model_type] - past_key_values = post_process_fn(past_key_values) - elif peft_config.num_transformer_submodules == 1: - # Dont' apply this to encoder-decoder models and not to models requiring special processing. 
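A shape-only sketch, with toy sizes, of the `past_key_values` reshape that `get_prompt` performs for prefix tuning above, for the `num_transformer_submodules == 1` case.

```py
# Toy-size walk-through of the prefix-tuning cache reshape in get_prompt().
import torch

batch_size, num_virtual_tokens = 2, 5
num_layers, num_attention_heads, token_dim = 4, 8, 64
head_dim = token_dim // num_attention_heads

past_key_values = torch.randn(batch_size, num_virtual_tokens, num_layers * 2 * token_dim)
past_key_values = past_key_values.view(
    batch_size, num_virtual_tokens, num_layers * 2, num_attention_heads, head_dim
)
# One tensor per layer, each stacking that layer's key and value states.
past_key_values = past_key_values.permute([2, 0, 3, 1, 4]).split(2)

assert len(past_key_values) == num_layers
assert past_key_values[0].shape == (2, batch_size, num_attention_heads, num_virtual_tokens, head_dim)
```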
- # local import in case users use a very old transformers version - past_key_values = DynamicCache.from_legacy_cache(past_key_values) - elif peft_config.num_transformer_submodules == 2 and self.base_model._supports_cache_class: - # Dont' apply this to encoder-decoder models that don't support new Cachc format yet - # If we don't apply this, prefix-tuning fails to update cross-attn cache - past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) - past_key_values.cross_attention_cache = DynamicCache() - past_key_values.is_updated = { - layer_idx: False for layer_idx in range(len(past_key_values.cross_attention_cache.key_cache)) - } - map_cache_to_layer_device_map(self.get_base_model(), past_key_values) # no-op if not a Cache instance - return past_key_values - else: - if peft_config.peft_type == PeftType.MULTITASK_PROMPT_TUNING: - prompts = prompt_encoder(prompt_tokens, task_ids) - else: - if peft_config.inference_mode: - prompts = prompt_encoder.embedding.weight - else: - # Take only one prompt token sample and expand the output instead of expanding the input, see: - # https://github.com/huggingface/peft/issues/2043#issuecomment-2321522577 - prompt_tokens = prompt_tokens[:1] - prompts = prompt_encoder(prompt_tokens) - prompts = prompts.repeat(batch_size, 1, 1) - return prompts - - def get_nb_trainable_parameters(self) -> tuple[int, int]: - r""" - Returns the number of trainable parameters and the number of all parameters in the model. - """ - trainable_params = 0 - all_param = 0 - for _, param in self.named_parameters(): - num_params = param.numel() - # if using DS Zero 3 and the weights are initialized empty - if num_params == 0 and hasattr(param, "ds_numel"): - num_params = param.ds_numel - - # Due to the design of 4bit linear layers from bitsandbytes - # one needs to multiply the number of parameters by 2 to get - # the correct number of parameters - if param.__class__.__name__ == "Params4bit": - if hasattr(param, "element_size"): - num_bytes = param.element_size() - elif not hasattr(param, "quant_storage"): - num_bytes = 1 - else: - num_bytes = param.quant_storage.itemsize - num_params = num_params * 2 * num_bytes - - all_param += num_params - if param.requires_grad: - trainable_params += num_params - - return trainable_params, all_param - - def print_trainable_parameters(self) -> None: - """ - Prints the number of trainable parameters in the model. - - Note: print_trainable_parameters() uses get_nb_trainable_parameters() which is different from - num_parameters(only_trainable=True) from huggingface/transformers. get_nb_trainable_parameters() returns - (trainable parameters, all parameters) of the Peft Model which includes modified backbone transformer model. - For techniques like LoRA, the backbone transformer model is modified in place with LoRA modules. However, for - prompt tuning, the backbone transformer model is unmodified. num_parameters(only_trainable=True) returns number - of trainable parameters of the backbone transformer model which can be different. 
- """ - trainable_params, all_param = self.get_nb_trainable_parameters() - - print( - f"trainable params: {trainable_params:,d} || all params: {all_param:,d} || trainable%: {100 * trainable_params / all_param:.4f}" - ) - - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped module.""" - try: - return super().__getattr__(name) # defer to nn.Module's logic - except AttributeError: - if name == "base_model": # see #1892: prevent infinite recursion if class is not initialized - raise - return getattr(self.base_model, name) - - @contextmanager - def _enable_peft_forward_hooks(self, *args, **kwargs): - # If the base model has a method called _enable_peft_forward_hooks, it is invoked as a context. Otherwise, this - # runs without any changes - if hasattr(self.base_model, "_enable_peft_forward_hooks"): - with self.base_model._enable_peft_forward_hooks(*args, **kwargs): - yield - return - else: - # nothing to enable - yield - return - - def forward(self, *args: Any, **kwargs: Any): - """ - Forward pass of the model. - """ -# print('forward') -# print(args) -# print(kwargs) - with self._enable_peft_forward_hooks(*args, **kwargs): - kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} - return self.get_base_model()(*args, **kwargs) - - def generate(self, *args, **kwargs): - # print("generate") - # print(args) - # print(kwargs) - with self._enable_peft_forward_hooks(*args, **kwargs): - kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} - return self.get_base_model().generate(*args, **kwargs) - - def _get_base_model_class(self, is_prompt_tuning=False): - """ - Returns the base model class. - """ - if not is_prompt_tuning: - return self.base_model.model.__class__ - return self.base_model.__class__ - - @contextmanager - def disable_adapter(self): - """ - Context manager that disables the adapter module. Use this to run inference on the base model. - - Example: - - ```py - >>> with model.disable_adapter(): - ... model(inputs) - ``` - """ - #self.disable_adapters = True - if self.peft_config[self.active_adapter].is_prompt_learning: - try: - # TODO: consider replacing this patching of methods with a more robust mechanism: setting a flag and - # letting the underlying methods deal with it, same as how LoRA does it. - old_forward = self.forward - self.forward = self.base_model.forward - old_prepare_inputs_for_generation = self.prepare_inputs_for_generation - self.prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation - yield - finally: - self.forward = old_forward - self.prepare_inputs_for_generation = old_prepare_inputs_for_generation - - elif self.peft_config[self.active_adapter].is_adaption_prompt: - try: - self.base_model.disable_adapter_layers() - yield - finally: - self.base_model.enable_adapter_layers() - - else: # LoRA, LoHa, etc. - model_status = self.get_model_status() - if model_status.enabled == "irregular": - warnings.warn( - "The model contains some adapter layers that are enabled and others that are disabled. " - "This is most likely unintentional. After exiting the disable_adapter context, all adapters " - "will be enabled" - ) - try: - self.disable_adapters = True - self.base_model.disable_adapter_layers() - yield - finally: - if model_status.enabled is not False: - self.disable_adapters = False - # model_status.enabled is `True` or `"irregular"` - self.base_model.enable_adapter_layers() - - def get_base_model(self) -> torch.nn.Module: - """ - Returns the base model. 
- """ - return ( - self.base_model - if (False) #(self.active_peft_config.is_prompt_learning or self.peft_type == PeftType.POLY) - else self.base_model.model - ) - - def add_adapter(self, adapter_name: str, peft_config: PeftConfig, low_cpu_mem_usage: bool = False) -> None: - """ - Add an adapter to the model based on the passed configuration. - - This adapter is not trained. To load a trained adapter, check out [`PeftModel.load_adapter`]. - - The name for the new adapter should be unique. - - The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active - adapter. - - Args: - adapter_name (`str`): - The name of the adapter to be added. - peft_config ([`PeftConfig`]): - The configuration of the adapter to be added. - low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): - Create empty adapter weights on meta device. Useful to speed up the process when loading saved - adapters. Don't use this option when creating a new PEFT adapter for training. - - """ - if peft_config.peft_type != self.peft_type: - raise ValueError( - f"Cannot combine adapters with different peft types. " - f"Found {self.peft_type} and {peft_config.peft_type}." - ) - - try: - if peft_config.is_prompt_learning: - self.peft_config[adapter_name] = peft_config - if hasattr(self.config, "to_dict"): - dict_config = self.config.to_dict() - else: - dict_config = self.config - - peft_config = _prepare_prompt_learning_config(peft_config, dict_config) - self._setup_prompt_encoder(adapter_name) - elif peft_config.is_adaption_prompt: - self.base_model.add_adapter(adapter_name, peft_config) - else: - self.peft_config[adapter_name] = peft_config - self.base_model.inject_adapter( - self.base_model.model, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage - ) - except Exception: # something went wrong, roll back - if adapter_name in self.peft_config: - del self.peft_config[adapter_name] - raise - - self.set_additional_trainable_modules(peft_config, adapter_name) - - def set_additional_trainable_modules(self, peft_config, adapter_name): - if getattr(peft_config, "modules_to_save", None) is not None: - if self.modules_to_save is None: - self.modules_to_save = set(peft_config.modules_to_save) - else: - self.modules_to_save.update(peft_config.modules_to_save) - _set_trainable(self, adapter_name) # this may add a new ModulesToSaveWrapper - - def get_layer_status(self) -> list[TunerLayerStatus]: - """Get the status of each adapter layer in the model. - - This method returns a list of `TunerLayerStatus` dataclass instances, each of which contains the following - attributes: - - - `name` (`str`): - The name of the adapter layer, e.g. `model.encoder.block.0.layer.0.SelfAttention.q`. - - `module_type` (`str`): - The type of the adapter layer, e.g. `lora.Linear`. - - `enabled` (`bool`): - Whether the adapter layer is enabled. - - `active_adapters` (`list[str]`): - The names of the active adapters, if any, e.g. `["default"]`. - - `merged_adapters` (`list[str]`): - The names of the merged adapters, if any, e.g. `["default"]`. - - `available_adapters` (`list[str]`): - The names of the available adapters, e.g. `["default"]`. - - Args: - model ([`~PeftModel`]): - The model to get the adapter layer status from. - - Returns: - list[`peft.peft_model.TunerLayerStatus`]: - A list of dataclasses, each containing the status of the corresponding adapter layer. - - """ - return get_layer_status(self) - - def get_model_status(self) -> TunerModelStatus: - """Get the status of tuners of the model. 
- - This method returns a `TunerModelStatus` dataclass instance, which contains the following attributes: - - - `base_model_type` (`str`): - The type of the base model, e.g. `T5Model`. - - `adapter_model_type` (`str`): - The type of the adapter model, e.g. `LoraModel`. - - `peft_types` (`dict[str, str]`): - The mapping of adapter name to adapter type, e.g. `{"default": "LORA"}`. - - `trainable_params` (`int`): - The number of trainable parameters in the model. - - `total_params` (`int`): - The total number of parameters in the model. - - `num_adapter_layers` (`int`): - The number of adapter layers in the model. - - `enabled` (`bool`, `Literal["irregular"]`): - Whether all adapter layers are enabled. If some are enabled and some are not, this will be `"irregular"`. - This means that your model is in an inconsistent state and might not work as expected. - - `active_adapters` (`list[str]`, `Literal["irregular"]`): - The names of the active adapters. If the active adapters are not consistent across all layers, this will be - `"irregular"`, which means that your model is in an inconsistent state and might not work as expected. - - `merged_adapters` (`list[str]`, `Literal["irregular"]`): - The names of the merged adapters. If the merged adapters are not consistent across all layers, this will be - `"irregular"`, which means that your model is in an inconsistent state and might not work as expected. - - `available_adapters` (`list[str]`): - The names of the available adapters, e.g. `["default"]`. - - Args: - model ([`~PeftModel`]): - The model to get the adapter layer status from. - - Returns: - `peft.peft_model.TunerModelStatus`: - A dataclass containing the status of the model. - - """ - return get_model_status(self) - - @classmethod - def _split_kwargs(cls, kwargs: dict[str, Any]): - _kwargs_not_in_hf_hub_download_signature = ("use_auth_token",) - hf_hub_download_kwargs = {} - other_kwargs = {} - - for key, value in kwargs.items(): - if key in inspect.signature(hf_hub_download).parameters or key in _kwargs_not_in_hf_hub_download_signature: - hf_hub_download_kwargs[key] = value - else: - other_kwargs[key] = value - - return hf_hub_download_kwargs, other_kwargs - - def _update_offload(self, offload_index: dict[str, dict[str, str]], adapters_weights: dict[str, torch.tensor]): - """ - Update the offload_index and safetensors files for loading and mergine PeftModels with disk-offloaded modules. - - Args: - offload_index (Dict[str: str]): - Dictionary of disk-offloaded modules with their metadata and safetensors filenames - adapters_weights (Dict[str: torch.tensor]): - Dictionary of Peft adapter module names and weights - """ - - if not offload_index: - return offload_index - - prefix = "base_model.model." - # rename offload index weight and model names - adapter_names = list(self.peft_config.keys()) - for adapter_name in adapter_names: - keys = list(offload_index.keys()) - block_id = keys[0].split(".")[0] + "." 
# for writing safetensors key, - - # replace original offload index keys with PeftModel keys - for key in keys: - suffix_pos = key.rfind(".") - extended_prefix = prefix + key[:suffix_pos] - module = dict(self.named_modules())[extended_prefix] - if isinstance(module, BaseTunerLayer): - new_key = prefix + key[:suffix_pos] + ".base_layer" + key[suffix_pos:] - else: - new_key = prefix + key - offload_index[key]["weight_name"] = new_key - offload_index[new_key] = offload_index[key] - del offload_index[key] - - files_seen = set() - # rename safetensors for dispatch - for new_key in list(offload_index.keys()): - fname = offload_index[new_key]["safetensors_file"] - - # make a new file name - new_fname_list = list(fname.split(os.sep)) - for i, name in enumerate(new_fname_list): - if "--" in name: - new_fname_list[i] += "-peft" - break - new_fname = os.path.join(*new_fname_list) - - if fname in files_seen: - continue - safe_dict = {} - with safe_open(fname, framework="pt") as f: - for safe_key in f.keys(): - safe_tensor = f.get_tensor(safe_key) - metadata = f.metadata() - suffix_pos = safe_key.rfind(".") - extended_prefix = prefix + block_id + safe_key[:suffix_pos] - safe_module = dict(self.named_modules())[extended_prefix] - if isinstance(safe_module, BaseTunerLayer): - final_key = extended_prefix + ".base_layer" + safe_key[suffix_pos:] - lora_dict = {key: val for key, val in adapters_weights.items() if extended_prefix in key} - - # add LoRA keys and values to disk offload - for lora_key, lora_val in lora_dict.items(): - divide = lora_key.rfind(".") - new_key = lora_key[:divide] + f".{adapter_name}" + lora_key[divide:] - safe_dict[new_key] = lora_val - else: - final_key = prefix + block_id + safe_key - safe_dict[final_key] = safe_tensor - files_seen.add(new_fname) - - # avoid overwriting original safetensors - for key in safe_dict.keys(): - offload_index[key] = {"safetensors_file": new_fname, "weight_name": key} - - base_name = os.path.dirname(new_fname) - if not os.path.exists(base_name): - os.makedirs(base_name) - safe_save_file(safe_dict, new_fname, metadata=metadata) - - def _check_new_adapter_config(self, peft_config: PeftConfig, is_trainable: bool) -> None: - """Perform checks on newly added PEFT configs to ensure integrity.""" - if peft_config.is_prompt_learning and is_trainable: - raise ValueError("Cannot set a prompt learning adapter to trainable when loading pretrained adapter.") - - # Since PiSSA/OLoRA modifies the base weights, it should not be combined with other adapters. - all_configs = [peft_config] + list(self.peft_config.values()) - if len(all_configs) > 1: - if any(getattr(config, "init_lora_weights", None) == "pissa" for config in all_configs): - msg = ( - "PiSSA changes the base weights of the model and should thus not be used with other adapters. " - "Consider converting the PiSSA adapter into a normal LoRA adapter: " - "https://github.com/huggingface/peft/tree/main/examples/pissa_finetuning#convert-pissa-to-lora" - ) - warnings.warn(msg) - elif any(getattr(config, "init_lora_weights", None) == "olora" for config in all_configs): - msg = ( - "OLoRA changes the base weights of the model and should thus not be used with other adapters. 
" - "Consider converting the OLoRA adapter into a normal LoRA adapter: " - "https://github.com/huggingface/peft/tree/main/examples/olora_finetuning#olora-and-lora" - ) - warnings.warn(msg) - - def load_adapter( - self, - model_id: Union[str, os.PathLike], - adapter_name: str, - is_trainable: bool = False, - torch_device: Optional[str] = None, - autocast_adapter_dtype: bool = True, - ephemeral_gpu_offload: bool = False, - low_cpu_mem_usage: bool = False, - **kwargs: Any, - ): - """ - Load a trained adapter into the model. - - The name for the new adapter should be unique. - - The new adapter is not automatically set as the active adapter. Use [`PeftModel.set_adapter`] to set the active - adapter. - - Args: - model_id (`str` or `os.PathLike`): - The name of the PEFT configuration to use. Can be either: - - A string, the `model id` of a PEFT configuration hosted inside a model repo on the Hugging Face - Hub. - - A path to a directory containing a PEFT configuration file saved using the `save_pretrained` - method (`./my_peft_config_directory/`). - adapter_name (`str`): - The name of the adapter to be added. - is_trainable (`bool`, *optional*, defaults to `False`): - Whether the adapter should be trainable or not. If `False`, the adapter will be frozen and can only be - used for inference. - torch_device (`str`, *optional*, defaults to None): - The device to load the adapter on. If `None`, the device will be inferred. - autocast_adapter_dtype (`bool`, *optional*, defaults to `True`): - Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter - weights using float16 and bfloat16 to float32, as this is typically required for stable training, and - only affect select PEFT tuners. - ephemeral_gpu_offload (`bool`, *optional*, defaults to `False`): - Whether to use ephemeral GPU offloading for partially loaded modules. Defaults to `False`. - low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): - Create empty adapter weights on meta device before loading the saved weights. Useful to speed up the - process. - kwargs: (`optional`): - Additional arguments to modify the way the adapter is loaded, e.g. the token for Hugging Face Hub. - """ -# from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING - - hf_hub_download_kwargs, kwargs = self._split_kwargs(kwargs) - if torch_device is None: - torch_device = infer_device() - - if adapter_name not in self.peft_config: - # load the config - peft_config = PEFT_TYPE_TO_CONFIG_MAPPING[ - PeftConfig._get_peft_type( - model_id, - **hf_hub_download_kwargs, - ) - ].from_pretrained( - model_id, - ephemeral_gpu_offload=ephemeral_gpu_offload, - **hf_hub_download_kwargs, - ) - self._check_new_adapter_config(peft_config, is_trainable=is_trainable) - peft_config.inference_mode = not is_trainable - self.add_adapter(adapter_name, peft_config, low_cpu_mem_usage=low_cpu_mem_usage) - - adapters_weights = load_peft_weights(model_id, device=torch_device, **hf_hub_download_kwargs) - - # load the weights into the model - ignore_mismatched_sizes = kwargs.get("ignore_mismatched_sizes", False) - load_result = set_peft_model_state_dict( - self, - adapters_weights, - adapter_name=adapter_name, - ignore_mismatched_sizes=ignore_mismatched_sizes, - low_cpu_mem_usage=low_cpu_mem_usage, - ) - - tuner = self.peft_config[adapter_name].peft_type - tuner_prefix = PEFT_TYPE_TO_PREFIX_MAPPING.get(tuner, "") - adapter_missing_keys = [] - - # Filter missing keys specific to the current adapter and tuner prefix. 
- for key in load_result.missing_keys: - if tuner_prefix in key and adapter_name in key: - adapter_missing_keys.append(key) - - load_result.missing_keys.clear() - load_result.missing_keys.extend(adapter_missing_keys) - - if ( - (getattr(self, "hf_device_map", None) is not None) - and (len(set(self.hf_device_map.values()).intersection({"cpu", "disk"})) > 0) - and len(self.peft_config) == 1 - ): - device_map = kwargs.get("device_map", "auto") - max_memory = kwargs.get("max_memory", None) - offload_dir = kwargs.get("offload_folder", None) - offload_index = kwargs.get("offload_index", None) - - dispatch_model_kwargs = {} - # Safety checker for previous `accelerate` versions - # `offload_index` was introduced in https://github.com/huggingface/accelerate/pull/873/ - if "offload_index" in inspect.signature(dispatch_model).parameters: - dispatch_model_kwargs["offload_index"] = offload_index - - no_split_module_classes = self._no_split_modules - - if device_map != "sequential": - max_memory = get_balanced_memory( - self, - max_memory=max_memory, - no_split_module_classes=no_split_module_classes, - low_zero=(device_map == "balanced_low_0"), - ) - - if isinstance(device_map, str): - device_map = infer_auto_device_map( - self, max_memory=max_memory, no_split_module_classes=no_split_module_classes - ) - - self._update_offload(offload_index, adapters_weights) - dispatch_model_kwargs["offload_index"] = offload_index - - dispatch_model( - self, - device_map=device_map, - offload_dir=offload_dir, - **dispatch_model_kwargs, - ) - - hook = AlignDevicesHook(io_same_device=True) - if self.peft_config[adapter_name].is_prompt_learning: - remove_hook_from_submodules(self.prompt_encoder) - add_hook_to_module(self.get_base_model(), hook) - - if hasattr(self.base_model, "_cast_adapter_dtype"): - self.base_model._cast_adapter_dtype( - adapter_name=adapter_name, autocast_adapter_dtype=autocast_adapter_dtype - ) - - # Set model in evaluation mode to deactivate Dropout modules by default - if not is_trainable: - self.eval() - return load_result - - def set_adapter(self, adapter_name: str) -> None: - """ - Sets the active adapter. - - Only one adapter can be active at a time. - - Additionally, this function will set the specified adapter to trainable (i.e., requires_grad=True). If this is - not desired, use the following code. - - ```py - >>> for name, param in model_peft.named_parameters(): - ... if ...: # some check on name (ex. if 'lora' in name) - ... param.requires_grad = False - ``` - - Args: - adapter_name (`str`): - The name of the adapter to be set as active. The adapter must be loaded first. - """ - if adapter_name not in self.peft_config: - raise ValueError(f"Adapter {adapter_name} not found.") - self.active_adapter = adapter_name - if not self.peft_config[adapter_name].is_prompt_learning: - self.base_model.set_adapter(adapter_name) - _set_adapter(self, adapter_name) - - @property - def base_model_torch_dtype(self): - return getattr(self.base_model, "dtype", None) - - @property - def active_peft_config(self): - return self.peft_config[self.active_adapter] - - def create_or_update_model_card(self, output_dir: str): - """ - Updates or create model card to include information about peft: - 1. Adds `peft` library tag - 2. Adds peft version - 3. Adds base model info - 4. 
Adds quantization information if it was used - """ - - filename = os.path.join(output_dir, "README.md") - - card = ModelCard.load(filename) if os.path.exists(filename) else ModelCard.from_template(ModelCardData()) - - card.data["library_name"] = "peft" - - model_config = BaseTuner.get_model_config(self) - model_config = None if model_config == DUMMY_MODEL_CONFIG else model_config - if model_config is not None and "_name_or_path" in model_config: - card.data["base_model"] = model_config["_name_or_path"] - - lines = card.text.splitlines() - - quantization_config = None - if hasattr(model_config, "quantization_config"): - quantization_config = self.config.quantization_config.to_dict() - training_config_text = "" - quantization_prefix = "The following `bitsandbytes` quantization config was used during training:" - - if isinstance(self.peft_config[self.active_adapter], aLoraConfig): - training_config_text += f"\nActivated LoRA invocation string:\n{self.peft_config[self.active_adapter].invocation_string}" - # Adds quantization information if it was used - if quantization_config is not None: - training_config_text += f"\n{quantization_prefix}\n" - training_config_text += "\n".join([f"- {name}: {value}" for name, value in quantization_config.items()]) - training_config_text += "\n" - - training_procedure_heading = "## Training procedure" - if quantization_prefix not in lines and bool(training_config_text): - if training_procedure_heading in lines: - lines.insert(lines.index(training_procedure_heading) + 2, training_config_text) - else: - lines.append(f"{training_procedure_heading}\n{training_config_text}") - - # Adds peft version - framework_block_heading = "### Framework versions" - if f"- PEFT {__version__}" not in lines: - if framework_block_heading in lines: - lines.insert(lines.index(framework_block_heading) + 2, f"- PEFT {__version__}") - else: - lines.append(f"{framework_block_heading}\n\n- PEFT {__version__}") - - card.text = "\n".join(lines) - card.save(filename) - - - - - -class aLoRAPeftModelForCausalLM(PeftModelALoRA): - """ - Peft model for causal language modeling. - - Args: - model ([`~transformers.PreTrainedModel`]): Base transformer model. - peft_config ([`PeftConfig`]): Peft config. - adapter_name (`str`, *optional*): The name of the adapter, defaults to `"default"`. - autocast_adapter_dtype (`bool`, *optional*): - Whether to autocast the adapter dtype. Defaults to `True`. Right now, this will only cast adapter weights - using float16 and bfloat16 to float32, as this is typically required for stable training, and only affect - select PEFT tuners. - - Example: - - ```py - >>> from transformers import AutoModelForCausalLM - >>> from peft import PeftModelForCausalLM, get_peft_config - - >>> config = { - ... "peft_type": "PREFIX_TUNING", - ... "task_type": "CAUSAL_LM", - ... "inference_mode": False, - ... "num_virtual_tokens": 20, - ... "token_dim": 1280, - ... "num_transformer_submodules": 1, - ... "num_attention_heads": 20, - ... "num_layers": 36, - ... "encoder_hidden_size": 1280, - ... "prefix_projection": False, - ... "postprocess_past_key_value_function": None, - ... 
} - - >>> peft_config = get_peft_config(config) - >>> model = AutoModelForCausalLM.from_pretrained("gpt2-large") - >>> peft_model = PeftModelForCausalLM(model, peft_config) - >>> peft_model.print_trainable_parameters() - trainable params: 1843200 || all params: 775873280 || trainable%: 0.23756456724479544 - ``` - """ - - def __init__( - self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default",alora_offsets =None,response_token_ids=None, **kwargs - ) -> None: - super().__init__(model, peft_config, adapter_name, **kwargs) - self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation - self.response_token_ids = response_token_ids - #self.alora_offsets = alora_offsets - def forward( - self, - input_ids=None, - attention_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - task_ids=None, - **kwargs, - ): - # Figure out alora_offsets - alora_offsets = kwargs.pop("alora_offsets",None) - if self.disable_adapters == True: - alora_offsets = [-1] #Do not use adapter. - - elif self.response_token_ids is not None and alora_offsets is None:#Compute offsets using defined invocation sequence # and self.disable_adapters == False: - alora_offsets = [1]*len(input_ids) - for i in range(len(input_ids)): - response_token_ids_start_idx = None - for ii in range(len(self.response_token_ids)): - for idx in (torch.where(input_ids[i] == self.response_token_ids[ii][0])[0]).tolist(): - # `response_token_ids` is `'### Response:\n'`, here we are just making sure that the token IDs match - if ( - self.response_token_ids[ii].tolist() - == input_ids[i][idx : idx + len(self.response_token_ids[ii])].tolist() - ): - if response_token_ids_start_idx is None or idx > response_token_ids_start_idx: - - response_token_ids_start_idx = idx - response_token_ids_end_idx = idx + len(self.response_token_ids[ii]) - - if response_token_ids_start_idx is None: - warnings.warn( - f"Could not find response key in the " - f'following instance' - f'{self.response_token_ids}' - f'{input_ids[i]}' - f"Setting alora_offsets to 1 " - ) - #ks[i] = 1 - - else: -# print(self.response_token_ids) - # print(input_ids[i]) - alora_offsets[i] = len(input_ids[i]) - response_token_ids_start_idx - #elif self.alora_offsets is not None: - # alora_offsets = self.alora_offsets - elif alora_offsets is None: - warnings.warn('ALoRA offsets not available or computed. Adapter disabled.') - alora_offsets = [-1] #Do not use adapter. This does need to be consistent from train to test though. 
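The offset search above can be read in isolation as follows: find the last occurrence of any invocation (response) token sequence in each row and measure its distance from the end of the prompt. `forward()` uses `len(row) - start_idx`, while `generate()` below subtracts one more token. Function and variable names in the sketch are illustrative.

```py
# Self-contained restatement of the alora_offsets computation in forward().
import torch


def compute_alora_offsets(input_ids, response_token_ids):
    offsets = [1] * len(input_ids)
    for i, row in enumerate(input_ids):
        start_idx = None
        for seq in response_token_ids:
            for idx in torch.where(row == seq[0])[0].tolist():
                # Confirm the full invocation sequence matches, not just its first token.
                if row[idx : idx + len(seq)].tolist() == seq.tolist():
                    if start_idx is None or idx > start_idx:
                        start_idx = idx
        if start_idx is not None:
            offsets[i] = len(row) - start_idx  # generate() uses len(row) - 1 - start_idx
    return offsets


row = torch.tensor([5, 6, 7, 101, 102, 8, 9])
assert compute_alora_offsets(row.unsqueeze(0), [torch.tensor([101, 102])]) == [4]
```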
- - #Pass forward to peft hooks - kwargs['alora_offsets'] = alora_offsets - - - - - - - -# print('forward') - # print(input_ids) - # print('offsets') -# print(alora_offsets) - peft_config = self.active_peft_config - if not peft_config.is_prompt_learning: - #if self.base_model.config.model_type == "mpt": - # if inputs_embeds is not None: - # raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds") - # return self.base_model( - # input_ids=input_ids, - # attention_mask=attention_mask, - # labels=labels, - # output_attentions=output_attentions, - # output_hidden_states=output_hidden_states, - # return_dict=return_dict, - # **kwargs, - # ) - -# if peft_config.peft_type == PeftType.POLY: - # kwargs["task_ids"] = task_ids - - with self._enable_peft_forward_hooks(**kwargs): - kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} - return self.base_model( - input_ids=input_ids, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - labels=labels, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - **kwargs, - ) - - batch_size = _get_batch_size(input_ids, inputs_embeds) - if attention_mask is not None: - # concat prompt attention mask - prefix_attention_mask = torch.ones(batch_size, peft_config.num_virtual_tokens).to(attention_mask.device) - attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1) - - if kwargs.get("position_ids", None) is not None: - warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") - kwargs["position_ids"] = None - if kwargs.get("token_type_ids", None) is not None: - warnings.warn("Token type ids are not supported for parameter efficient tuning. 
Ignoring token type ids") - kwargs["token_type_ids"] = None - kwargs.update( - { - "attention_mask": attention_mask, - "labels": labels, - "output_attentions": output_attentions, - "output_hidden_states": output_hidden_states, - "return_dict": return_dict, - } - ) - - if peft_config.peft_type == PeftType.PREFIX_TUNING: - # overwrite past_kv in kwargs - kwargs["past_key_values"] = self.get_prompt(batch_size) - return self.base_model(input_ids=input_ids, inputs_embeds=inputs_embeds, **kwargs) - elif peft_config.peft_type == PeftType.CPT: - return self._cpt_forward(input_ids, inputs_embeds, peft_config, task_ids, batch_size, **kwargs) - else: - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - # concat prompt labels - if labels is not None: - prefix_labels = torch.full((batch_size, peft_config.num_virtual_tokens), -100).to(labels.device) - kwargs["labels"] = torch.cat((prefix_labels, labels), dim=1) - prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) - prompts = prompts.to(inputs_embeds.dtype) - inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) - return self.base_model(inputs_embeds=inputs_embeds, **kwargs) - - # def _cpt_forward( - # self, input_ids=None, inputs_embeds=None, peft_config=None, task_ids=None, batch_size=None, **kwargs - # ): - # # Extract labels from kwargs - # labels = kwargs.pop("labels") - # device = [i.device for i in [input_ids, inputs_embeds, labels] if i is not None][0] - # # Extract input_type_mask from kwargs and move it to the same device as labels - # if "input_type_mask" in kwargs.keys(): - # input_type_mask = kwargs.pop("input_type_mask").to(device) - # else: - # if input_ids is None: - # N_tokens = inputs_embeds.shape[1] - # else: - # N_tokens = input_ids.shape[1] - # input_type_mask = torch.ones((batch_size, N_tokens)).to(device) * 4 - - # cpt_token_ids = peft_config.cpt_token_ids - # cpt_tokens_type_mask = peft_config.cpt_tokens_type_mask - - # # Generate embeddings if not provided - # if inputs_embeds is None: - # inputs_embeds = self.word_embeddings(input_ids) - # # Get prompt and concatenate with input embeddings - # prompts = self.get_prompt(batch_size=batch_size, task_ids=task_ids) - # prompts = prompts.to(inputs_embeds.dtype) - # inputs_embeds = torch.cat((prompts, inputs_embeds), dim=1) - # # If labels are provided, generate prefix labels and type mask - # cpt_labels = None - # if labels is not None: - # # Generate prefix labels and concatenate with the input labels - # prefix_labels = torch.Tensor(cpt_token_ids).long().view(1, -1) - # prefix_labels = prefix_labels.repeat(batch_size, 1).to(labels.device) - # cpt_labels = torch.cat((prefix_labels, labels), dim=1) - # # Generate prefix type mask and shift input type mask values to avoid conflicts - # prefix_type_mask = torch.Tensor(cpt_tokens_type_mask).long().view(1, -1) - # prefix_type_mask = prefix_type_mask.repeat(batch_size, 1).to(labels.device) - # adjusted_input_type_mask = input_type_mask - # adjusted_input_type_mask[adjusted_input_type_mask > 0] += prefix_type_mask.max() - # # Concatenate prefix and shifted input type masks - # cpt_type_mask = torch.cat((prefix_type_mask, adjusted_input_type_mask), dim=1) - # # Identify valid label positions and mask invalid ones with -100 - # labels_idx = (cpt_type_mask > 0) & (cpt_type_mask % 4 == 0) - # cpt_labels[~labels_idx] = -100 - # # Update kwargs with the modified labels - - # kwargs["labels"] = cpt_labels - # # Pass the modified inputs to the base model - # base_model_output = 
self.base_model(inputs_embeds=inputs_embeds, **kwargs) - # if labels is None: - # return base_model_output - # else: - # # Calculate the loss using the custom CPT loss function - # base_model_output = CPTEmbedding.calculate_loss( - # base_model_output, cpt_labels, cpt_type_mask, self.peft_config["default"] - # ) - # return base_model_output - - def generate(self, *args, **kwargs): -# print(f"adaps: {self.active_adapters}") - peft_config = self.active_peft_config - self.base_model.prepare_inputs_for_generation = self.prepare_inputs_for_generation - if hasattr(self.base_model, "model"): - self.base_model.model.generation_config = self.generation_config - else: - self.base_model.generation_config = self.generation_config - try: - if not peft_config.is_prompt_learning: -# print(args) -# print(kwargs) - alora_offsets = kwargs.pop("alora_offsets",None) - - - - input_ids = kwargs.get("input_ids") if not args else args[0] - if len(input_ids.shape) == 1: - input_ids = [args[0]] - if self.disable_adapters == True: - alora_offsets = [-1] #Do not use adapter. - # Figure out alora_offsets - elif alora_offsets is None and self.response_token_ids is not None:# and self.disable_adapters == False: - alora_offsets = [1]*len(input_ids) - for i in range(len(input_ids)): - response_token_ids_start_idx = None - for ii in range(len(self.response_token_ids)): - for idx in (torch.where(input_ids[i] == self.response_token_ids[ii][0])[0]).tolist(): - # `response_token_ids` is `'### Response:\n'`, here we are just making sure that the token IDs match - if ( - self.response_token_ids[ii].tolist() - == input_ids[i][idx : idx + len(self.response_token_ids[ii])].tolist() - ): - if response_token_ids_start_idx is None or idx > response_token_ids_start_idx: - response_token_ids_start_idx = idx - response_token_ids_end_idx = idx + len(self.response_token_ids[ii]) - - if response_token_ids_start_idx is None: - warnings.warn( - f"Could not find response key in the " - f'following instance' - f'{self.response_token_ids}' - f'{input_ids[i]}' - f"Setting alora_offsets to 0 (Starting aLoRA at end of prompt) " - ) - #ks[i] = 1 - - else: - alora_offsets[i] = len(input_ids[i])-1 - response_token_ids_start_idx - #elif self.alora_offsets is not None: - # alora_offsets = self.alora_offsets - elif alora_offsets is None: - warnings.warn('ALoRA offsets not available or computed. Adapter disabled.') - alora_offsets = [-1] - - -# print(ks) - #Pass forward to peft hooks - kwargs['alora_offsets'] = alora_offsets - with self._enable_peft_forward_hooks(*args, **kwargs): - kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} - outputs = self.base_model.generate(*args, **kwargs) - else: - outputs = self.base_model.generate(**kwargs) - except: - self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation - raise - else: - self.base_model.prepare_inputs_for_generation = self.base_model_prepare_inputs_for_generation - return outputs - - def prepare_inputs_for_generation(self, *args, task_ids: Optional[torch.Tensor] = None, **kwargs): - peft_config = self.active_peft_config - model_kwargs = self.base_model_prepare_inputs_for_generation(*args, **kwargs) - - # https://github.com/huggingface/transformers/pull/26681/ introduced new cache format - # for some architectures which requires a special fix for prompt tuning etc. - # TODO: starting with transformers 4.38, all architectures should support caching. 
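A hypothetical generation call against the class above, continuing the loading sketch shown earlier and reusing its `peft_model`, `tokenizer`, and `invocation_string`. The only behaviors taken from the code are that `generate()` accepts an optional `alora_offsets` kwarg, derives it from `response_token_ids` when absent, and treats an offset of `-1` as "adapter disabled".

```py
# Hypothetical generation with the aLoRA adapter; the prompt and offset value
# are illustrative.
prompt = "Write a one-sentence summary of the text above.\n" + invocation_string
inputs = tokenizer(prompt, return_tensors="pt")

# Let the model locate the invocation string itself ...
output_ids = peft_model.generate(**inputs, max_new_tokens=64)

# ... or pass the offset explicitly.
output_ids = peft_model.generate(**inputs, alora_offsets=[3], max_new_tokens=64)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```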
- uses_transformers_4_38 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.38.0") - uses_transformers_4_36 = packaging.version.parse(transformers.__version__) >= packaging.version.parse("4.36.0") - transformers_new_cache_archs = ["llama", "mistral", "persimmon", "phi"] - if packaging.version.parse(transformers.__version__) > packaging.version.parse("4.43.3"): - # https://github.com/huggingface/transformers/pull/31445 - transformers_new_cache_archs.append("bloom") - - uses_cache = uses_transformers_4_38 or ( - uses_transformers_4_36 and self.base_model.config.model_type in transformers_new_cache_archs - ) - - if peft_config.peft_type == PeftType.POLY: - model_kwargs["task_ids"] = task_ids - if peft_config.is_prompt_learning: - if uses_cache and (model_kwargs.get("past_key_values", None) is not None): - # change in the logic of `prepare_inputs_for_generation` makes the below code necessary - # In prompt learning methods, past key values are longer when compared to the `input_ids`. - # As such only consider the last input ids in the autogressive generation phase. - past_key_values = model_kwargs["past_key_values"] - if isinstance(past_key_values, (tuple, list)): - seq_len = past_key_values[0][0].shape[-2] - else: # using transformers kv cache - seq_len = past_key_values.get_seq_length() - if seq_len >= model_kwargs["input_ids"].shape[1]: - model_kwargs["input_ids"] = model_kwargs["input_ids"][:, -1:] - - if model_kwargs.get("attention_mask", None) is not None: - size = model_kwargs["input_ids"].shape[0], peft_config.num_virtual_tokens - prefix_attention_mask = torch.ones(size).to(model_kwargs["input_ids"].device) - model_kwargs["attention_mask"] = torch.cat( - (prefix_attention_mask, model_kwargs["attention_mask"]), dim=1 - ) - - if model_kwargs.get("position_ids", None) is not None: - warnings.warn("Position ids are not supported for parameter efficient tuning. Ignoring position ids.") - model_kwargs["position_ids"] = None - - if kwargs.get("token_type_ids", None) is not None: - warnings.warn( - "Token type ids are not supported for parameter efficient tuning. Ignoring token type ids" - ) - kwargs["token_type_ids"] = None - - # no past_key_values or past_key_values empty cache - requires_prompt_injection = (model_kwargs.get("past_key_values", None) is None) or ( - isinstance(model_kwargs["past_key_values"], transformers.Cache) - and not model_kwargs["past_key_values"].get_seq_length() - ) - - if requires_prompt_injection and peft_config.peft_type == PeftType.PREFIX_TUNING: - new_past_key_values = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0]) - model_kwargs["past_key_values"] = new_past_key_values - elif requires_prompt_injection: - inputs_embeds = self.word_embeddings(model_kwargs["input_ids"]) - prompts = self.get_prompt(batch_size=model_kwargs["input_ids"].shape[0], task_ids=task_ids) - prompts = prompts.to(inputs_embeds.dtype) - model_kwargs["inputs_embeds"] = torch.cat((prompts, inputs_embeds), dim=1) - model_kwargs["input_ids"] = None - - # For transformers>=4.38.0 - for some architectures such as Llama, `cache_position` is - # passed in the forward pass to keep track of the position ids of the cache. 
We have to - # pop that from `model_kwargs` as `cache_position` is properly created by the model, using the passed - # `inputs_embeds`: https://github.com/huggingface/transformers/blob/593230f0a1150ea9c0477b9d859f25daf73c8c33/src/transformers/models/llama/modeling_llama.py#L956 - _ = model_kwargs.pop("cache_position", None) - - return model_kwargs - - - -@dataclass -class TunerLayerStatus: - name: str - module_type: str - enabled: bool - active_adapters: list[str] - merged_adapters: list[str] - requires_grad: dict[str, bool | Literal["irregular"]] - available_adapters: list[str] - devices: dict[str, list[str]] - - -def get_layer_status(model: torch.nn.Module) -> list[TunerLayerStatus]: - """Get the status of each adapter layer in the model. - - This function returns a list of `TunerLayerStatus` dataclass instances, each of which contains the following - attributes: - - - `name` (`str`): - The name of the adapter layer, e.g. `model.encoder.block.0.layer.0.SelfAttention.q`. - - `module_type` (`str`): - The type of the adapter layer, e.g. `lora.Linear`. - - `enabled` (`bool`): - Whether the adapter layer is enabled. - - `active_adapters` (`list[str]`): - The names of the active adapters, if any, e.g. `["default"]`. - - `merged_adapters` (`list[str]`): - The names of the merged adapters, if any, e.g. `["default"]`. - - requires_grad : dict[str, bool | Literal["irregular"]] - The requires_grad status of the parameters for each adapter module. Ideally, it should be either `True` or - `False`. If the requires_grad status is not consistent across all parameters, the value will be set to - `"irregular"`. - - `available_adapters` (`list[str]`): - The names of the available adapters, e.g. `["default"]`. - - `devices` (`dict[str, list[str]]`): - The devices where the parameters of the given adapter are stored, e.g. `["cuda"]`. - - Args: - model ([Union[`~PeftModel`, `~transformers.PreTrainedModel`, `nn.Module`]]): - The model to get the adapter layer status from. - - Returns: - list[`peft.peft_model.TunerLayerStatus`]: - A list of dataclasses, each containing the status of the corresponding adapter layer. - - """ - if isinstance(model, PeftModel): - base_model = model.base_model - if not isinstance(base_model, BaseTuner): - raise TypeError( - "get_layer_status() got an invalid PeftModel instance; prefix tuning and adaption prompt are not " - "supported." 
- ) - else: - base_model = model - - layer_status: list[TunerLayerStatus] = [] - for name, module in base_model.named_modules(): - if not isinstance(module, BaseTunerLayer): - continue - - # determine if all submodules/parameters if this module require grad or not - mapping_requires_grad_list: dict[str, list[bool]] = collections.defaultdict(list) - for adapter_module_name in module.adapter_layer_names: - adapter_module = getattr(module, adapter_module_name) - if isinstance(adapter_module, torch.nn.ModuleDict): - for key, submodule in adapter_module.items(): - for param in submodule.parameters(): - mapping_requires_grad_list[key].append(param.requires_grad) - elif isinstance(adapter_module, torch.nn.ParameterDict): - for key, param in adapter_module.items(): - mapping_requires_grad_list[key].append(param.requires_grad) - else: - # strange, we don't know how to handle this, ignore for now - pass - - def check_irrgular(vals: list[bool]) -> bool | Literal["irregular"]: - if all(vals): - return True - if not any(vals): - return False - return "irregular" - - requires_grad = {key: check_irrgular(vals) for key, vals in mapping_requires_grad_list.items()} - - devices_dd = collections.defaultdict(list) - for adapter_module_name in module.adapter_layer_names + module.other_param_names: - adapter_module = getattr(module, adapter_module_name) - if isinstance(adapter_module, torch.nn.ModuleDict): - for key, submodule in adapter_module.items(): - devices_dd[key].extend([param.device.type for param in submodule.parameters()]) - elif isinstance(adapter_module, torch.nn.ParameterDict) or ( - adapter_module.__class__.__name__ == "BufferDict" - ): # VeRA - for key, param in adapter_module.items(): - devices_dd[key].append(param.device.type) - devices = {key: sorted(set(val)) for key, val in devices_dd.items()} - - status = TunerLayerStatus( - name=name, - module_type=repr(module).partition("(")[0], - enabled=not module.disable_adapters, - active_adapters=module.active_adapters, - merged_adapters=module.merged_adapters, - requires_grad=requires_grad, - available_adapters=sorted(module._get_available_adapters()), - devices=devices, - ) - layer_status.append(status) - - if not layer_status: - raise ValueError( - "No adapter layers found in the model, please ensure that it's a PEFT model or that you have PEFT adapters " - "injected in the model." - ) - - return layer_status - - -@dataclass -class TunerModelStatus: - base_model_type: str - adapter_model_type: str - peft_types: dict[str, str] - trainable_params: int - total_params: int - num_adapter_layers: int - enabled: bool | Literal["irregular"] - active_adapters: list[str] | Literal["irregular"] - merged_adapters: list[str] | Literal["irregular"] - requires_grad: dict[str, bool | Literal["irregular"]] - available_adapters: list[str] - devices: dict[str, list[str]] - - -def get_model_status(model: torch.nn.Module) -> TunerModelStatus: - """Get the status of tuners of the model. - - This function returns a `TunerModelStatus` dataclass instance, which contains the following attributes: - - - `base_model_type` (`str`): - The type of the base model, e.g. `T5Model`. - - `adapter_model_type` (`str`): - The type of the adapter model, e.g. `LoraModel`. - - `peft_types` (`dict[str, str]`): - The mapping of adapter name to adapter type, e.g. `{"default": "LORA"}`. - - `trainable_params` (`int`): - The number of trainable parameters in the model. - - `total_params` (`int`): - The total number of parameters in the model. 
- - `num_adapter_layers` (`int`): - The number of adapter layers in the model. - - `enabled` (`bool`, `Literal["irregular"]`): - Whether all adapter layers are enabled. If some are enabled and some are not, this will be `"irregular"`. This - means that your model is in an inconsistent state and might not work as expected. - - `active_adapters` (`list[str]`, `Literal["irregular"]`): - The names of the active adapters. If the active adapters are not consistent across all layers, this will be - `"irregular"`, which means that your model is in an inconsistent state and might not work as expected. - - `merged_adapters` (`list[str]`, `Literal["irregular"]`): - The names of the merged adapters. If the merged adapters are not consistent across all layers, this will be - `"irregular"`, which means that your model is in an inconsistent state and might not work as expected. - - `requires_grad` (`dict[str, bool | Literal["irregular"]]`): - Whether for the given adapter, all adapter layers have `requires_grad` set to `True` or `False`. If there is a - mix, this will be set to `"irregular"`, which means that your model is in an inconsistent state and might not - work as expected. - - `available_adapters` (`list[str]`): - The names of the available adapters, e.g. `["default"]`. - - `devices` (`dict[str, list[str]]`): - The devices where the parameters of the given adapter are stored, e.g. `["cuda"]`. - - Args: - model ([Union[`~PeftModel`, `~transformers.PreTrainedModel`, `nn.Module`]]): - The model to get the adapter layer status from. - - Returns: - `peft.peft_model.TunerModelStatus`: - A dataclass containing the status of the model. - - """ - if isinstance(model, PeftModel): - if not isinstance(model.base_model, BaseTuner): - raise TypeError( - "get_model_status() got an invalid PeftModel instance; prefix tuning and adaption prompt are not " - "supported." 
- ) - base_model_type = model.get_base_model().__class__.__name__ - trainable_params, total_params = model.get_nb_trainable_parameters() - base_model = model.base_model - peft_types = {key: str(config.peft_type).partition(".")[-1] for key, config in base_model.peft_config.items()} - adapter_model_type = base_model.__class__.__name__ - elif isinstance(model, PreTrainedModel): - base_model_type = model.__class__.__name__ - trainable_params, total_params = PeftModel.get_nb_trainable_parameters(model) - base_model = model - peft_types = {} - adapter_model_type = "None" - else: - base_model_type = "other" - trainable_params, total_params = PeftModel.get_nb_trainable_parameters(model) - base_model = model - peft_types = {} - adapter_model_type = "None" - - layer_status = get_layer_status(model) - num_adapter_layers = len(layer_status) - - enabled_set: set[bool] = {status.enabled for status in layer_status} # must be {True}, {False}, or {True, False} - enabled: bool | Literal["irregular"] - if len(enabled_set) == 1: - enabled = enabled_set.pop() - else: - enabled = "irregular" - - available_adapters: list[str] = sorted(set().union(*(status.available_adapters for status in layer_status))) - - # ideally, active adapters should be consistent across all layers of the model, but we cannot guarantee it - all_active_adapters: set[tuple[str, ...]] = {tuple(status.active_adapters) for status in layer_status} - active_adapters: list[str] | Literal["irregular"] - if not all_active_adapters: - active_adapters = [] - elif len(all_active_adapters) == 1: - active_adapters = list(all_active_adapters.pop()) - else: - active_adapters = "irregular" - - # Here we determine what adapters are merged. This is not trivial because multiple adapters can be merged or not at - # the same time. Some layers may only have adapter A, some only adapter B, so it's not as easy as just checking - # which adapters are merged on each layer. - - # First, determine all adapters that are merged on at least on module. - merged_all: set[str] = set() - for status in layer_status: - merged_all.update(status.merged_adapters) - - # Next, check if on any layer, on of these adapters is not merged. 
- merged_adapters: list[str] | Literal["irregular"] = sorted(merged_all) - for status in layer_status: - unmerged = set(status.available_adapters) - set(status.merged_adapters) - if unmerged & merged_all: - # there is overlap between unmerged adapters and adapters that should be merged - merged_adapters = "irregular" - break - - # check status of requires_grad - # first, merge the values for all layers - requires_grad_all: dict[str, list[bool | Literal["irregular"]]] = collections.defaultdict(list) - for status in layer_status: - for key, val in status.requires_grad.items(): - requires_grad_all[key].append(val) - - # then, check if the values are consistent - def check_irrgular(vals: list[bool | Literal["irregular"]]) -> bool | Literal["irregular"]: - if all(val is True for val in vals): - return True - if all(val is False for val in vals): - return False - return "irregular" - - requires_grad = {key: check_irrgular(vals) for key, vals in requires_grad_all.items()} - - devices_dd = collections.defaultdict(list) - for status in layer_status: - for key, val in status.devices.items(): - devices_dd[key].extend(val) - devices = {key: sorted(set(val)) for key, val in devices_dd.items()} - - adapter_model_status = TunerModelStatus( - base_model_type=base_model_type, - adapter_model_type=adapter_model_type, - peft_types=peft_types, - trainable_params=trainable_params, - total_params=total_params, - num_adapter_layers=num_adapter_layers, - enabled=enabled, - active_adapters=active_adapters, - merged_adapters=merged_adapters, - requires_grad=requires_grad, - available_adapters=available_adapters, - devices=devices, - ) - return adapter_model_status - From c43a6e192b257d1681ae95c88c765b8aaacb8f29 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Tue, 24 Jun 2025 11:31:20 -0400 Subject: [PATCH 06/99] Update layer.py --- src/peft/tuners/alora/layer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/peft/tuners/alora/layer.py b/src/peft/tuners/alora/layer.py index e2c0c138e2..133291f66a 100644 --- a/src/peft/tuners/alora/layer.py +++ b/src/peft/tuners/alora/layer.py @@ -18,8 +18,6 @@ from .config import aLoraConfig -#Remove -#from peft import DoraConv2dLayer, DoraConv3dLayer, DoraEmbeddingLayer, DoraLinearLayer, _DoraConvNdLayer class aLoraLayer(BaseTunerLayer): From 063716d37485d3f6090071953c6341bed0f84529 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Tue, 24 Jun 2025 15:05:35 -0400 Subject: [PATCH 07/99] Update __init__.py --- src/peft/tuners/alora/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/peft/tuners/alora/__init__.py b/src/peft/tuners/alora/__init__.py index 5d7bb07b38..d14c3db02b 100644 --- a/src/peft/tuners/alora/__init__.py +++ b/src/peft/tuners/alora/__init__.py @@ -20,4 +20,4 @@ __all__ = ["Linear", "aLoraConfig", "aLoraLayer", "aLoraModel"] -register_peft_method(name="alora", config_cls=aLoraConfig, model_cls=aLoraModel, prefix="alora_", is_mixed_compatible=True) +register_peft_method(name="alora", config_cls=aLoraConfig, model_cls=aLoraModel, prefix="lora_", is_mixed_compatible=True) From 8cde2c04a4d65c1640cca46eb3e0d540871847ec Mon Sep 17 00:00:00 2001 From: Greenewald Date: Tue, 24 Jun 2025 15:06:02 -0400 Subject: [PATCH 08/99] Update model.py --- src/peft/tuners/alora/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/peft/tuners/alora/model.py b/src/peft/tuners/alora/model.py index 2b93c1f0d6..7bf6661da9 100644 --- a/src/peft/tuners/alora/model.py +++ b/src/peft/tuners/alora/model.py @@ -77,7 +77,7 @@ class 
aLoraModel(BaseTuner): - **peft_config** ([`aLoraConfig`]): The configuration of the aLora model. """ - prefix: str = "alora_" + prefix: str = "lora_" def __init__(self, model, config, adapter_name, low_cpu_mem_usage: bool = False) -> None: super().__init__(model, config, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage) From 00f9cffa351d1a2542fe27f46d06efee30baaa0e Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 30 Jun 2025 14:41:56 -0400 Subject: [PATCH 09/99] Add tokenized invocation_tokens to config --- src/peft/tuners/alora/config.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/peft/tuners/alora/config.py b/src/peft/tuners/alora/config.py index 96b8893707..63f32e644d 100644 --- a/src/peft/tuners/alora/config.py +++ b/src/peft/tuners/alora/config.py @@ -2,7 +2,7 @@ import warnings from dataclasses import dataclass, field -from typing import Literal, Optional, Union +from typing import Literal, Optional, Union, List from torch import nn from peft.utils import PeftType @@ -29,12 +29,17 @@ class aLoraConfig(LoraConfig): ) } ) + invocation_tokens: List[int] = field( + default=None, + metadata={"help": "Tokenized version of `invocation_string` (as a list of token IDs). Use the model's default tokenizer."} + ) def __post_init__(self): super().__post_init__() self.peft_type = PeftType.ALORA if self.invocation_string is None: warnings.warn("invocation_string cannot be None for aLoRA.", UserWarning) - # The r field with default=32 is handled by the dataclass field definition. - # LoraConfig's __post_init__ does not modify self.r. + if self.invocation_tokens is None: + warnings.warn("invocation_tokens cannot be None for aLoRA.", UserWarning) + From b94faa7d3b6850fc01c9c007e8f1ed42949a63f0 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 30 Jun 2025 14:47:06 -0400 Subject: [PATCH 10/99] Get rid of tokenizer argument, now use invocation_tokens --- src/peft/peft_model.py | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index bcb24e9399..ba098e676c 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -111,7 +111,6 @@ def __init__( adapter_name: str = "default", autocast_adapter_dtype: bool = True, low_cpu_mem_usage: bool = False, - tokenizer: Optional[Any] = None, ) -> None: super().__init__() self.active_adapter = adapter_name @@ -119,8 +118,7 @@ def __init__( # These args are special PEFT arguments that users can pass. They need to be removed before passing them to # forward. 
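Following the help text of the new invocation_tokens field, a minimal sketch of building an aLoraConfig; the tokenizer checkpoint, invocation string, and target module names below are placeholders, not values taken from this PR:

from transformers import AutoTokenizer
from peft import aLoraConfig

tokenizer = AutoTokenizer.from_pretrained("base-model")   # placeholder checkpoint
invocation_string = "<|assistant|>"                       # placeholder; must occur in every input
config = aLoraConfig(
    r=32,                                                 # aLoRA default; typically higher than standard LoRA
    target_modules=["q_proj", "k_proj", "v_proj"],        # placeholder module names
    invocation_string=invocation_string,
    invocation_tokens=tokenizer.encode(invocation_string, add_special_tokens=False),
)
# The config is then used like any LoraConfig, e.g. passed to get_peft_model(base_model, config).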
self.special_peft_forward_args = {"adapter_names", "alora_offsets"} - self.tokenizer = tokenizer # Added for ALORA - + self._is_prompt_learning = peft_config.is_prompt_learning if self._is_prompt_learning: self._peft_config = {adapter_name: peft_config} @@ -389,7 +387,6 @@ def from_pretrained( ephemeral_gpu_offload: bool = False, low_cpu_mem_usage: bool = False, key_mapping: Optional[dict[str, str]] = None, - tokenizer: Optional[Any] = None, # Added for ALORA **kwargs: Any, ) -> PeftModel: r""" @@ -547,7 +544,6 @@ def from_pretrained( adapter_name, autocast_adapter_dtype=autocast_adapter_dtype, low_cpu_mem_usage=low_cpu_mem_usage, - tokenizer=tokenizer, ) else: model = MODEL_TYPE_TO_PEFT_MODEL_MAPPING[config.task_type]( @@ -556,7 +552,6 @@ def from_pretrained( adapter_name, autocast_adapter_dtype=autocast_adapter_dtype, low_cpu_mem_usage=low_cpu_mem_usage, - tokenizer=tokenizer, ) load_result = model.load_adapter( @@ -1778,9 +1773,9 @@ class PeftModelForCausalLM(PeftModel): """ def __init__( - self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", tokenizer: Optional[Any] = None, **kwargs + self, model: torch.nn.Module, peft_config: PeftConfig, adapter_name: str = "default", **kwargs ) -> None: - super().__init__(model, peft_config, adapter_name, tokenizer=tokenizer, **kwargs) + super().__init__(model, peft_config, adapter_name, **kwargs) self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation @@ -1812,14 +1807,13 @@ def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optio alora_offsets[i] = -1 # Not an aLoRA adapter or wrong type continue - invocation_string = getattr(current_peft_config, 'invocation_string', None) - if not self.tokenizer or not invocation_string: + invocation_tokens = getattr(current_peft_config, 'invocation_tokens', None) + if not invocation_tokens: alora_offsets[i] = -1 # No way to calculate offset continue if current_adapter_name not in cached_invocation_tensors: - tokenized_ids_list = self.tokenizer.encode(invocation_string, add_special_tokens=False) - cached_invocation_tensors[current_adapter_name] = torch.tensor(tokenized_ids_list, dtype=torch.long, device=input_ids.device) + cached_invocation_tensors[current_adapter_name] = torch.tensor(invocation_tokens, dtype=torch.long, device=input_ids.device) adapters_to_process_indices[current_adapter_name].append(i) @@ -1845,12 +1839,6 @@ def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optio offset_val = seq_len - best_match_start_idx alora_offsets[i] = offset_val if offset_val > 0 else -1 else: - #warnings.warn( - # f"Invocation string for adapter '{adapter_name_to_process}' not found in input row {i}. " - # f"Input: {self.tokenizer.decode(input_ids[i]) if self.tokenizer else input_ids[i]}. " - # f"Invocation: {self.peft_config[adapter_name_to_process].invocation_string}. " - # "Adapter will be disabled for this row." 
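Concretely, the per-row search performed by _calculate_alora_offsets reduces to the following standalone sketch (the helper name is illustrative): locate the last occurrence of invocation_tokens in the row and take its distance to the end of the sequence as the offset; rows with no match get -1, which disables the adapter.

import torch

def alora_offset_for_row(input_ids_row: torch.Tensor, invocation_tokens: list[int]) -> int:
    # input_ids_row: 1-D tensor of token ids for a single batch row
    inv = torch.tensor(invocation_tokens, dtype=input_ids_row.dtype, device=input_ids_row.device)
    seq_len, m = input_ids_row.shape[0], inv.shape[0]
    best_start = None
    for start in range(seq_len - m + 1):
        if torch.equal(input_ids_row[start : start + m], inv):
            best_start = start  # keep scanning so the last match wins
    if best_start is None:
        return -1               # invocation not found: adapter disabled for this row
    offset = seq_len - best_start
    return offset if offset > 0 else -1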
- #) alora_offsets[i] = -1 return alora_offsets From 26f5cf7c0e0ff05d8376a9299183922d06fbe72e Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 30 Jun 2025 14:48:05 -0400 Subject: [PATCH 11/99] tokenizer code in warning --- src/peft/tuners/alora/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/peft/tuners/alora/config.py b/src/peft/tuners/alora/config.py index 63f32e644d..e9dc0c8ee1 100644 --- a/src/peft/tuners/alora/config.py +++ b/src/peft/tuners/alora/config.py @@ -31,7 +31,7 @@ class aLoraConfig(LoraConfig): ) invocation_tokens: List[int] = field( default=None, - metadata={"help": "Tokenized version of `invocation_string` (as a list of token IDs). Use the model's default tokenizer."} + metadata={"help": "Tokenized version of `invocation_string` (as a list of token IDs). Use the model's default tokenizer. E.g. invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False)"} ) def __post_init__(self): From dcd78c5993673dc023eda799cf287440b37fb10f Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 30 Jun 2025 15:07:06 -0400 Subject: [PATCH 12/99] Update test_custom_models.py --- tests/test_custom_models.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 9af333b51c..cac884a751 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -32,6 +32,7 @@ from transformers.pytorch_utils import Conv1D from peft import ( + aLoraConfig, AdaLoraConfig, BOFTConfig, BoneConfig, @@ -830,6 +831,7 @@ PREFIXES = { IA3Config: "ia3_", + aLoraConfig: "lora_", LoraConfig: "lora_", LoHaConfig: "hada_", LoKrConfig: "lokr_", From 087781f25a2cda09143a3c3e6c4487fcca21be31 Mon Sep 17 00:00:00 2001 From: Kristjan Greenewald Date: Tue, 1 Jul 2025 17:21:54 -0400 Subject: [PATCH 13/99] alora tests --- tests/test_custom_models.py | 90 ++++++++++++++++++++++++++++++++++--- 1 file changed, 83 insertions(+), 7 deletions(-) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index cac884a751..1de5c1067f 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -302,6 +302,51 @@ }, ), ######## + # aLoRA # + ######## + ( + "Vanilla MLP 1 aLoRA", + "MLP", + aLoraConfig, + {"target_modules": "lin0", "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, + ), + ( + "Vanilla MLP 2 aLoRA", + "MLP", + aLoraConfig, + {"target_modules": ["lin0"], "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, + ), + ( + "Vanilla MLP 3 aLoRA", + "MLP", + aLoraConfig, + {"target_modules": ["lin1"], "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, + ), + ( + "Vanilla MLP 4 aLoRA", + "MLP", + aLoraConfig, + {"target_modules": ["lin0", "lin1"], "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, + ), + ( + "Vanilla MLP 5 aLoRA", + "MLP", + aLoraConfig, + {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, + ), + ( + "Vanilla MLP 6 aLoRA", + "MLP", + aLoraConfig, + { + "target_modules": ["lin0"], + "lora_alpha": 4, + "lora_dropout": 0.1, + "invocation_tokens": [1, 2, 3], + "invocation_string": '123', + }, + ), + ######## # OFT # ######## ( @@ -739,6 +784,20 @@ {"target_modules": ["lin0"], "init_lora_weights": False, "inference_mode": True, "total_step": 1}, {"target_modules": ["lin1"], "init_lora_weights": False, "inference_mode": True, "total_step": 1}, ), + ( + "aLoRA Same", + "alora", + aLoraConfig, + {"target_modules": ["lin0"], "init_lora_weights": False, 
"invocation_tokens": [1, 2, 3], "invocation_string": '123'}, + {"target_modules": ["lin0"], "init_lora_weights": False, "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, + ), + ( + "aLoRA Different", + "alora", + aLoraConfig, + {"target_modules": ["lin0"], "init_lora_weights": False, "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, + {"target_modules": ["lin1"], "init_lora_weights": False, "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, + ), ( "FourierFT Same", "fourierft", @@ -1280,6 +1339,8 @@ def test_merge_layers(self, test_name, model_id, config_cls, config_kwargs): pytest.skip( f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" ) + if issubclass(config_cls, aLoraConfig): + pytest.skip("aLoRA does not support merging.") config_kwargs = config_kwargs.copy() if issubclass(config_cls, LoraConfig): @@ -1303,6 +1364,8 @@ def test_merge_layers_fp16(self, test_name, model_id, config_cls, config_kwargs) pytest.skip( f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" ) + if issubclass(config_cls, aLoraConfig): + pytest.skip("aLoRA does not support merging.") config_kwargs = config_kwargs.copy() if issubclass(config_cls, LoraConfig): @@ -1318,6 +1381,8 @@ def test_merge_layers_is_idempotent(self, test_name, model_id, config_cls, confi pytest.skip( f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" ) + if issubclass(config_cls, aLoraConfig): + pytest.skip("aLoRA does not support merging.") # calling merge twice with the same arguments should not change the output config_kwargs = config_kwargs.copy() @@ -1427,6 +1492,8 @@ def test_forward_float16(self, test_name, model_id, config_cls, config_kwargs): if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return + if issubclass(config_cls, aLoraConfig): + pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) model(**X) @@ -1469,6 +1536,8 @@ def test_forward_bfloat16(self, test_name, model_id, config_cls, config_kwargs): if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return + if issubclass(config_cls, aLoraConfig): + pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) model(**X) @@ -1510,6 +1579,8 @@ def test_forward_float16_no_autocast(self, test_name, model_id, config_cls, conf if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return + if issubclass(config_cls, aLoraConfig): + pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) model(**X) @@ -1551,6 +1622,8 @@ def test_forward_bfloat16_no_autocast(self, test_name, model_id, config_cls, con if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return + if issubclass(config_cls, aLoraConfig): + pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) model(**X) @@ -1723,6 +1796,8 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co pytest.skip( f"Skipping test for {model_id} as merging is not supported. 
(See https://github.com/huggingface/peft/pull/2403 for details)" ) + if issubclass(config_cls, aLoraConfig): + pytest.skip("aLoRA does not support merging.") # same as test_disable_adapters, but with merging X = self.prepare_inputs_for_testing() @@ -1809,13 +1884,9 @@ def test_disable_adapter_with_bias_warns(self, test_name, model_id, config_cls, # Note: We test only with custom models since they run really fast. There is really no point in testing the same # thing with decoder, encoder_decoder, etc. - if config_cls != LoraConfig or config_cls != BOFTConfig: - # skip this test for other configs as bias is specific to Lora - pytest.skip("Testing bias warnings only for LoraConfig or BOFTConfig") - if not issubclass(config_cls, (LoraConfig, BOFTConfig)): + # skip this test for other configs as bias is specific to Lora pytest.skip("Bias argument is only supported for LoRA or BOFT models") - def run_with_disable(config_kwargs, bias): config_kwargs = config_kwargs.copy() config_kwargs["bias"] = bias @@ -1828,14 +1899,13 @@ def run_with_disable(config_kwargs, bias): with peft_model.disable_adapter(): pass # there is nothing to be done - if config_cls == LoraConfig: + if issubclass(config_cls, LoraConfig): # check that bias=all and bias=lora_only give a warning with the correct message msg_start = "Careful, disabling adapter layers with bias configured to be" with pytest.warns(UserWarning, match=msg_start): run_with_disable(config_kwargs, bias="lora_only") with pytest.warns(UserWarning, match=msg_start): run_with_disable(config_kwargs, bias="all") - if config_cls == BOFTConfig: # check that bias=all and bias=boft_only give a warning with the correct message msg_start = "Careful, disabling adapter layers with bias configured to be" @@ -2725,6 +2795,9 @@ def test_multiple_active_adapters_forward( def test_multiple_active_adapters_merge_and_unmerge( self, test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2 ): + if issubclass(config_cls, aLoraConfig): + pytest.skip("aLoRA does not support merging.") + torch.manual_seed(0) model = self.resolve_model_cls(tuner_method) @@ -2758,6 +2831,9 @@ def test_multiple_active_adapters_merge_and_unmerge( "test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2", MULTIPLE_ACTIVE_ADAPTERS_TEST_CASES ) def test_merge_layers_multi(self, test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2): + if issubclass(config_cls, aLoraConfig): + pytest.skip("aLoRA does not support merging.") + torch.manual_seed(0) model = self.resolve_model_cls(tuner_method) From 89fd2b1211300b4554b378160af912f04f47eb1d Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Tue, 1 Jul 2025 22:02:06 +0000 Subject: [PATCH 14/99] test debugging --- src/peft/tuners/alora/layer.py | 43 +++++++++------------------------- tests/test_custom_models.py | 3 ++- 2 files changed, 13 insertions(+), 33 deletions(-) diff --git a/src/peft/tuners/alora/layer.py b/src/peft/tuners/alora/layer.py index 133291f66a..6e3040ecbb 100644 --- a/src/peft/tuners/alora/layer.py +++ b/src/peft/tuners/alora/layer.py @@ -463,22 +463,16 @@ def get_delta_weight(self, adapter) -> torch.Tensor: def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: self._check_forward_args(x, *args, **kwargs) adapter_names = kwargs.pop("adapter_names", None) - ks = kwargs.pop("alora_offsets", [1]) #added -# ks = [100000] -# print("layer forward") -# print(ks) - if self.disable_adapters or ks[0] <= 0: + alora_offsets = kwargs.pop("alora_offsets", 
[1]) #Where to activate adapter weights + if self.disable_adapters or alora_offsets[0] <= 0: if self.merged: self.unmerge() result = self.base_layer(x, *args, **kwargs) elif adapter_names is not None: - result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, alora_offsets = ks, **kwargs) + result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, alora_offsets = alora_offsets, **kwargs) elif self.merged: result = self.base_layer(x, *args, **kwargs) else: - #if len(ks) == 1: - - #k = ks # Maybe change result = self.base_layer(x, *args, **kwargs) torch_result_dtype = result.dtype for active_adapter in self.active_adapters: @@ -489,37 +483,22 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: dropout = self.lora_dropout[active_adapter] scaling = self.scaling[active_adapter] x = x.to(lora_A.weight.dtype) - + if not self.use_dora[active_adapter]: + if x.dim() == 2: #only one token, so apply adapter on everything (comes up in some tests) + result = result + lora_B(lora_A(dropout(x))) * scaling # Only do the last k tokens - if len(ks) == 1: - k = min(result.shape[1],ks[0]) + elif len(alora_offsets) == 1: + k = min(result.shape[1],alora_offsets[0]) if k > 0: - result[:,-k:,:] = result[:,-k:,:] + lora_B(lora_A(dropout(x[:,-k:,:]))) * scaling#dropout + result[:,-k:,:] = result[:,-k:,:] + lora_B(lora_A(dropout(x[:,-k:,:]))) * scaling else: for i in range(result.shape[0]): - ks[i] = min(ks[i], result.shape[1]) + alora_offsets[i] = min(alora_offsets[i], result.shape[1]) if ks[i] > 0: - result[i,-ks[i]:,:] = result[i,-ks[i]:,:] + lora_B(lora_A(dropout(x[i,-ks[i]:,:]))) - else: - warnings.warn("NOT SUPPORTED") - if isinstance(dropout, nn.Identity) or not self.training: - base_result = result - else: - x = dropout(x) - base_result = None - - result = result + self.lora_magnitude_vector[active_adapter]( - x, - lora_A=lora_A, - lora_B=lora_B, - scaling=scaling, - base_layer=self.get_base_layer(), - base_result=base_result, - ) - + result[i,-alora_offsets[i]:,:] = result[i,-alora_offsets[i]:,:] + lora_B(lora_A(dropout(x[i,-alora_offsets[i]:,:]))) result = result.to(torch_result_dtype) return result diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 1de5c1067f..7ac669b172 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -1399,7 +1399,8 @@ def test_safe_merge(self, test_name, model_id, config_cls, config_kwargs): pytest.skip( f"Skipping test for {model_id} as merging is not supported. 
(See https://github.com/huggingface/peft/pull/2403 for details)" ) - + if issubclass(config_cls, aLoraConfig): + pytest.skip(f"Skipping test as merging is not supported for aLora.") # calling merge twice with the same arguments should not change the output config_kwargs = config_kwargs.copy() if issubclass(config_cls, LoraConfig): From b4b3465ae456639a5e97f7e621a1417fd2769f11 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 3 Jul 2025 12:53:26 -0400 Subject: [PATCH 15/99] refactor alora as lora variant --- src/peft/tuners/alora/config.py | 22 +- src/peft/tuners/alora/layer.py | 530 ++----------------------------- src/peft/tuners/alora/model.py | 10 +- src/peft/tuners/lora/layer.py | 12 +- src/peft/tuners/lora/variants.py | 83 ++++- 5 files changed, 137 insertions(+), 520 deletions(-) diff --git a/src/peft/tuners/alora/config.py b/src/peft/tuners/alora/config.py index e9dc0c8ee1..17001a7622 100644 --- a/src/peft/tuners/alora/config.py +++ b/src/peft/tuners/alora/config.py @@ -2,11 +2,9 @@ import warnings from dataclasses import dataclass, field -from typing import Literal, Optional, Union, List -from torch import nn -from peft.utils import PeftType from peft.tuners.lora import LoraConfig +from peft.utils import PeftType @dataclass @@ -19,7 +17,13 @@ class aLoraConfig(LoraConfig): invocation_string (str): String intended to activate the aLoRA. The aLoRA adapted weights will activate 1 token after the first token in this string. This string must be present in all input data. """ - r: int = field(default=32, metadata={"help": "aLora attention dimension. Typically needs to be higher than used for standard Lora. Default=32."}) + + r: int = field( + default=32, + metadata={ + "help": "aLora attention dimension. Typically needs to be higher than used for standard Lora. Default=32." + }, + ) invocation_string: str = field( default=None, metadata={ @@ -27,11 +31,13 @@ class aLoraConfig(LoraConfig): "aLoRA invocation string. The aLoRA adapted weights will activate 1 token after the first token in " "this string. This string must be present in all input data." ) - } + }, ) - invocation_tokens: List[int] = field( + invocation_tokens: list[int] = field( default=None, - metadata={"help": "Tokenized version of `invocation_string` (as a list of token IDs). Use the model's default tokenizer. E.g. invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False)"} + metadata={ + "help": "Tokenized version of `invocation_string` (as a list of token IDs). Use the model's default tokenizer. E.g. 
invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False)" + }, ) def __post_init__(self): @@ -41,5 +47,3 @@ def __post_init__(self): warnings.warn("invocation_string cannot be None for aLoRA.", UserWarning) if self.invocation_tokens is None: warnings.warn("invocation_tokens cannot be None for aLoRA.", UserWarning) - - diff --git a/src/peft/tuners/alora/layer.py b/src/peft/tuners/alora/layer.py index 6e3040ecbb..2893b692b8 100644 --- a/src/peft/tuners/alora/layer.py +++ b/src/peft/tuners/alora/layer.py @@ -1,352 +1,34 @@ - from __future__ import annotations -import math import warnings -from typing import Any, Optional, Union +from typing import Optional import torch import torch.nn as nn -import torch.nn.functional as F -from accelerate.utils.imports import is_xpu_available -from torch import svd_lowrank -from transformers.pytorch_utils import Conv1D -from peft.tuners.tuners_utils import BaseTunerLayer, check_adapters_to_merge -from peft.utils.integrations import dequantize_module_weight, gather_params_ctx, get_bnb_param_type -from peft.utils.other import transpose +from peft.tuners.lora.layer import Linear as LoraLinear +from peft.tuners.lora.layer import LoraLayer +from peft.tuners.lora.layer import dispatch_default as lora_dispatch_default +from peft.tuners.lora.variants import ALoraLinearVariant from .config import aLoraConfig - -class aLoraLayer(BaseTunerLayer): - # All names of layers that may contain (trainable) adapter weights - adapter_layer_names = ("lora_A", "lora_B", "lora_embedding_A", "lora_embedding_B") - # All names of other parameters that may contain adapter-related parameters - other_param_names = ("r", "lora_alpha", "scaling", "lora_dropout")#,"k") - - def __init__(self, base_layer: nn.Module, ephemeral_gpu_offload: bool = False, **kwargs) -> None: - self.base_layer = base_layer - self.r = {} - self.lora_alpha = {} - self.scaling = {} - self.lora_dropout = nn.ModuleDict({}) - self.lora_A = nn.ModuleDict({}) - self.lora_B = nn.ModuleDict({}) - # For Embedding layer - self.lora_embedding_A = nn.ParameterDict({}) - self.lora_embedding_B = nn.ParameterDict({}) - # Mark the weight as unmerged - self._disable_adapters = False - self.merged_adapters = [] - self.use_dora: dict[str, bool] = {} - self.lora_bias: dict[str, bool] = {} - self.lora_magnitude_vector = torch.nn.ModuleDict() # for DoRA - self._caches: dict[str, Any] = {} - self.ephemeral_gpu_offload: bool = ephemeral_gpu_offload - self.kwargs = kwargs - - base_layer = self.get_base_layer() - if isinstance(base_layer, nn.Linear): - in_features, out_features = base_layer.in_features, base_layer.out_features - #below not supported - # elif isinstance(base_layer, nn.Conv2d): - # in_features, out_features = base_layer.in_channels, base_layer.out_channels - # elif isinstance(base_layer, nn.Conv3d): - # in_features, out_features = base_layer.in_channels, base_layer.out_channels - # elif isinstance(base_layer, nn.Embedding): - # in_features, out_features = base_layer.num_embeddings, base_layer.embedding_dim - # elif isinstance(base_layer, Conv1D): - # in_features, out_features = ( - # base_layer.weight.ds_shape if hasattr(base_layer.weight, "ds_shape") else base_layer.weight.shape - # ) - elif hasattr(base_layer, "infeatures") and hasattr(base_layer, "outfeatures"): - # QuantLinear - in_features, out_features = base_layer.infeatures, base_layer.outfeatures - elif hasattr(base_layer, "input_size") and hasattr(base_layer, "output_size"): - # Megatron ColumnParallelLinear,RowParallelLinear - in_features, 
out_features = base_layer.input_size, base_layer.output_size - elif hasattr(base_layer, "codebooks") and base_layer.__class__.__name__ == "QuantizedLinear": - # AQLM QuantLinear - in_features, out_features = base_layer.in_features, base_layer.out_features - elif hasattr(base_layer, "w_bit") and base_layer.__class__.__name__ == "WQLinear_GEMM": - # Awq layers - in_features, out_features = base_layer.in_features, base_layer.out_features - elif base_layer.__class__.__name__ == "EetqLinear": - # Eetq layers - in_features, out_features = base_layer.in_features, base_layer.out_features - elif hasattr(base_layer, "W_q") and base_layer.__class__.__name__ == "HQQLinear": - # HQQ layers - in_features, out_features = base_layer.in_features, base_layer.out_features - else: - # possibly support user provided custom layer types using dynamic dispatch - if hasattr(base_layer, "in_features") and hasattr(base_layer, "out_features"): - in_features, out_features = base_layer.in_features, base_layer.out_features - else: - in_features, out_features = None, None - warnings.warn( - f"Unsupported layer type '{type(base_layer)}' encountered, proceed at your own risk.", UserWarning - ) - - self.in_features = in_features - self.out_features = out_features - - def update_layer( - self, - adapter_name, - r, - lora_alpha, - lora_dropout, - init_lora_weights, - use_rslora, -# k = 1,# added - use_dora: bool = False, - lora_bias: bool = False, - ): - # This code works for linear layers, override for other layer types - if r <= 0: - raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") - - self.r[adapter_name] = r - # self.k[adapter_name] = k #added - self.lora_alpha[adapter_name] = lora_alpha - if lora_dropout > 0.0: - lora_dropout_layer = nn.Dropout(p=lora_dropout) - else: - lora_dropout_layer = nn.Identity() - - self.lora_dropout.update(nn.ModuleDict({adapter_name: lora_dropout_layer})) - # Actual trainable parameters - self.lora_A[adapter_name] = nn.Linear(self.in_features, r, bias=False) - self.lora_B[adapter_name] = nn.Linear(r, self.out_features, bias=lora_bias) - self.lora_bias[adapter_name] = lora_bias - - if use_rslora: - self.scaling[adapter_name] = lora_alpha / math.sqrt(r) - else: - self.scaling[adapter_name] = lora_alpha / r - - # for inits that require access to the base weight, use gather_param_ctx so that the weight is gathered when using DeepSpeed - if isinstance(init_lora_weights, str) and init_lora_weights.startswith("pissa"): - with gather_params_ctx(self.get_base_layer().weight): - self.pissa_init(adapter_name, init_lora_weights) - elif isinstance(init_lora_weights, str) and init_lora_weights.lower() == "olora": - with gather_params_ctx(self.get_base_layer().weight): - self.olora_init(adapter_name) - elif init_lora_weights == "loftq": - with gather_params_ctx(self.get_base_layer().weight): - self.loftq_init(adapter_name) - elif init_lora_weights == "eva": - nn.init.zeros_(self.lora_B[adapter_name].weight) - elif init_lora_weights: - self.reset_lora_parameters(adapter_name, init_lora_weights) - # call this before dora_init - self._move_adapter_to_device_of_base_layer(adapter_name) - - if use_dora: - self.dora_init(adapter_name) - self.use_dora[adapter_name] = True - else: - self.use_dora[adapter_name] = False - - self.set_adapter(self.active_adapters) - - def reset_lora_parameters(self, adapter_name, init_lora_weights): - if init_lora_weights is False: - return - - if adapter_name in self.lora_A.keys(): - if init_lora_weights is True: - # initialize A the same way as 
the default for nn.Linear and B to zero - # https://github.com/microsoft/LoRA/blob/a0a92e0f26c067cf94747bdbf1ce73793fa44d19/loralib/layers.py#L124 - nn.init.kaiming_uniform_(self.lora_A[adapter_name].weight, a=math.sqrt(5)) - elif init_lora_weights.lower() == "gaussian": - nn.init.normal_(self.lora_A[adapter_name].weight, std=1 / self.r[adapter_name]) - else: - raise ValueError(f"Unknown initialization {init_lora_weights=}") - nn.init.zeros_(self.lora_B[adapter_name].weight) - if self.lora_bias[adapter_name]: - nn.init.zeros_(self.lora_B[adapter_name].bias) - if adapter_name in self.lora_embedding_A.keys(): - # Initialize A to zeros and B the same way as the default for nn.Embedding, see: - # https://github.com/microsoft/LoRA/blob/4c0333854cb905966f8cc4e9a74068c1e507c7b7/loralib/layers.py#L59-L60 - nn.init.zeros_(self.lora_embedding_A[adapter_name]) - nn.init.normal_(self.lora_embedding_B[adapter_name]) - if self.lora_bias[adapter_name]: - # embeddings are not supported at the moment, but still adding this for consistency - nn.init.zeros_(self.lora_embedding_B[adapter_name].bias) - - def olora_init(self, adapter_name): - base_layer = self.get_base_layer() - orig_weight = base_layer.weight - bnb_param_type = get_bnb_param_type(orig_weight) - dtype = orig_weight.dtype - - if bnb_param_type: - # check without importing bitsandbytes and robust to bnb_4bit_quant_storage=float* - weight_tensor = dequantize_module_weight(base_layer) - elif dtype in [torch.float32, torch.float16, torch.bfloat16]: - weight_tensor = orig_weight - else: - raise TypeError(f"Unsupported data type for the base layer. Got {dtype}.") - - scale_factor = self.scaling[adapter_name] - r = self.r[adapter_name] - weight_tensor = weight_tensor.to(torch.float32) - Q, R = torch.linalg.qr(weight_tensor.data) - - Qr, Rr = Q[:, :r], R[:r] - - self.lora_A[adapter_name].weight.data = Rr.contiguous() - self.lora_B[adapter_name].weight.data = Qr.contiguous() - - weight_tensor.data -= scale_factor * self.lora_B[adapter_name].weight @ self.lora_A[adapter_name].weight - if bnb_param_type == "4bit": - weight_tensor = orig_weight.__class__( - weight_tensor, - quant_type=orig_weight.quant_type, - quant_storage=orig_weight.quant_storage, - compress_statistics=orig_weight.compress_statistics, - module=orig_weight.module, - ).to(orig_weight.device) - base_layer.weight = weight_tensor - elif bnb_param_type == "8bit": - weight_tensor = orig_weight.__class__( - weight_tensor, - requires_grad=orig_weight.requires_grad, - has_fp16_weights=orig_weight.has_fp16_weights, - ).to(orig_weight.device) - base_layer.weight = weight_tensor - else: - weight_tensor = weight_tensor.to(dtype) - base_layer.weight.data = weight_tensor - - def pissa_init(self, adapter_name, init_lora_weights): - weight = self.get_base_layer().weight - dtype = weight.dtype - if dtype not in [torch.float32, torch.float16, torch.bfloat16]: - raise TypeError( - "Please initialize PiSSA under float32, float16, or bfloat16. " - "Subsequently, re-quantize the residual model to help minimize quantization errors." 
- ) - weight = transpose(weight.to(torch.float32), self.fan_in_fan_out) - if init_lora_weights == "pissa": - # USV^T = W <-> VSU^T = W^T, where W^T = weight.data in R^{out_channel, in_channel}, - V, S, Uh = torch.linalg.svd(weight.data, full_matrices=False) - Vr = V[:, : self.r[adapter_name]] - Sr = S[: self.r[adapter_name]] - Sr /= self.scaling[adapter_name] - Uhr = Uh[: self.r[adapter_name]] - elif len(init_lora_weights.split("_niter_")) == 2: - Vr, Sr, Ur = svd_lowrank( - weight.data, self.r[adapter_name], niter=int(init_lora_weights.split("_niter_")[-1]) - ) - Sr /= self.scaling[adapter_name] - Uhr = Ur.t() - else: - raise ValueError( - f"init_lora_weights should be 'pissa' or 'pissa_niter_[number of iters]', got {init_lora_weights} instead." - ) - - lora_A = torch.diag(torch.sqrt(Sr)) @ Uhr - lora_B = Vr @ torch.diag(torch.sqrt(Sr)) - self.lora_A[adapter_name].weight.data = lora_A - self.lora_B[adapter_name].weight.data = lora_B - weight = weight.data - self.scaling[adapter_name] * lora_B @ lora_A - weight = transpose(weight.to(dtype), self.fan_in_fan_out) - self.get_base_layer().weight.data = weight - - def loftq_init(self, adapter_name): - from peft.utils.loftq_utils import loftq_init - - weight = self.get_base_layer().weight - kwargs = { - "num_bits": self.kwargs.get("loftq_bits", 4), - "reduced_rank": self.r[adapter_name], - "num_iter": self.kwargs.get("loftq_iter", 1), - } - - qweight, lora_A, lora_B = loftq_init(weight, **kwargs) - if adapter_name in self.lora_A.keys(): - # initialize A the same way as the default for nn.Linear and B to zero - self.lora_A[adapter_name].weight.data = lora_A - self.lora_B[adapter_name].weight.data = lora_B - if adapter_name in self.lora_embedding_A.keys(): - # initialize a the same way as the default for nn.linear and b to zero - self.lora_embedding_A[adapter_name].weight.data = lora_A - self.lora_embedding_B[adapter_name].weight.data = lora_B - self.get_base_layer().weight.data = qweight - - - def _cache_store(self, key: str, value: Any) -> None: - self._caches[key] = value - - def _cache_pop(self, key: str) -> Any: - value = self._caches.pop(key) - return value - - def set_scale(self, adapter, scale): - if adapter not in self.scaling: - # Ignore the case where the adapter is not in the layer - return - self.scaling[adapter] = scale * self.lora_alpha[adapter] / self.r[adapter] - - def scale_layer(self, scale: float) -> None: - if scale == 1: - return - - for active_adapter in self.active_adapters: - if active_adapter not in self.lora_A.keys(): - continue - - self.scaling[active_adapter] *= scale - - def unscale_layer(self, scale=None) -> None: - for active_adapter in self.active_adapters: - if active_adapter not in self.lora_A.keys(): - continue - - if scale is None: - self.scaling[active_adapter] = self.lora_alpha[active_adapter] / self.r[active_adapter] - else: - self.scaling[active_adapter] /= scale - - def _check_forward_args(self, x, *args, **kwargs): - """Check if the arguments are compatible with the configs and state of the model""" - adapter_names = kwargs.get("adapter_names", None) - if adapter_names is None: - return - - if len(x) != len(adapter_names): - msg = ( - "Length of `adapter_names` should be the same as the number of inputs, but got " - f"{len(adapter_names)} and {len(x)} respectively." - ) - raise ValueError(msg) - - if self.merged: - # It is unclear what would be the right thing to do if users pass adapter_names and there are merged - # adapters. Therefore, it is better to raise an error in this case. 
- msg = "Cannot pass `adapter_names` when there are merged adapters, please call `unmerge_adapter` first." - raise ValueError(msg) - - # # DoRA is not supported (yet), check that it's not being used. Don't check "__base__", as this is the - # # placeholder for the base model. - # unique_adapters = {name for name in adapter_names if name != "__base__"} - # for adapter_name in unique_adapters: - # if self.use_dora.get(adapter_name, False): - # msg = "Cannot pass `adapter_names` when DoRA is enabled." - # raise ValueError(msg) +class aLoraLayer(LoraLayer): + def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[object]: + return ALoraLinearVariant() def _mixed_batch_forward( - self, x: torch.Tensor, *args: Any, adapter_names: list[str],alora_offsets: list[int], **kwargs: Any + self, + x: torch.Tensor, + *args, + adapter_names: list[str], + alora_offsets: list[int], + **kwargs, ) -> torch.Tensor: - # This is a special method that handles the case when users pass the argument `adapter_names`. This is an - # extra argument that allows mixing different adapters in the same batch at inference time. result = self.base_layer(x, *args, **kwargs) torch_result_dtype = result.dtype - ks =alora_offsets + ks = alora_offsets unique_adapters = set(adapter_names) sub_batch_indices_list = [] for adapter in unique_adapters: @@ -363,193 +45,39 @@ def _mixed_batch_forward( dropout = self.lora_dropout[active_adapter] scaling = self.scaling[active_adapter] - # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear - # layer output sub_batch = x[sub_batch_indices_list[i]].to(lora_A.weight.dtype) if len(ks) > 1: ks_batch = ks[sub_batch_indices_list[i]] for j in range(len(ks_batch)): k = min(ks_batch[j], result.shape[1]) - lora_output = lora_B(lora_A(dropout(sub_batch[j,-k:,:]))) * scaling - result[sub_batch_indices_list[i][j],-k:,:] += lora_output.to(torch_result_dtype) + lora_output = lora_B(lora_A(dropout(sub_batch[j, -k:, :]))) * scaling + result[sub_batch_indices_list[i][j], -k:, :] += lora_output.to(torch_result_dtype) else: ks_batch = ks - k = min(result.shape[1],ks_batch[0]) - lora_output = lora_B(lora_A(dropout(sub_batch[:,-k:,:]))) * scaling - result[sub_batch_indices_list[i],-k:,:] += lora_output.to(torch_result_dtype) + k = min(result.shape[1], ks_batch[0]) + lora_output = lora_B(lora_A(dropout(sub_batch[:, -k:, :]))) * scaling + result[sub_batch_indices_list[i], -k:, :] += lora_output.to(torch_result_dtype) return result -# Below code is based on https://github.com/microsoft/LoRA/blob/main/loralib/layers.py -# and modified to work with PyTorch FSDP - - -# ------------------------------------------------------------------------------------------ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
-# ------------------------------------------------------------------------------------------ - - -class Linear(nn.Module, aLoraLayer): - # Lora implemented in a dense layer - def __init__( - self, - base_layer, - adapter_name: str, - r: int = 0, - lora_alpha: int = 1, - lora_dropout: float = 0.0, - fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) - is_target_conv_1d_layer: bool = False, - init_lora_weights: Union[bool, str] = True, - use_rslora: bool = False, - use_dora: bool = False, - lora_bias: bool = False, - **kwargs, - ) -> None: - super().__init__() - aLoraLayer.__init__(self, base_layer, **kwargs) - self.fan_in_fan_out = fan_in_fan_out - - self._active_adapter = adapter_name - self.update_layer( - adapter_name, - r, - lora_alpha=lora_alpha, - lora_dropout=lora_dropout, - init_lora_weights=init_lora_weights, - use_rslora=use_rslora, - use_dora=use_dora, - lora_bias=lora_bias, - ) - self.is_target_conv_1d_layer = is_target_conv_1d_layer - - - def get_delta_weight(self, adapter) -> torch.Tensor: - """ - Compute the delta weight for the given adapter. - - Args: - adapter (str): - The name of the adapter for which the delta weight should be computed. - """ - device = self.lora_B[adapter].weight.device - dtype = self.lora_B[adapter].weight.dtype +class Linear(LoraLinear, aLoraLayer): + pass - # In case users wants to merge the adapter weights that are in - # (b)float16 while being on CPU, we need to cast the weights to float32, perform the merge and then cast back to - # (b)float16 because some CPUs have slow bf16/fp16 matmuls. - cast_to_fp32 = device.type == "cpu" and (dtype == torch.float16 or dtype == torch.bfloat16) - weight_A = self.lora_A[adapter].weight - weight_B = self.lora_B[adapter].weight - - if cast_to_fp32: - weight_A = weight_A.float() - weight_B = weight_B.float() - - output_tensor = transpose(weight_B @ weight_A, self.fan_in_fan_out) * self.scaling[adapter] - - if cast_to_fp32: - output_tensor = output_tensor.to(dtype=dtype) - - # cast back the weights - self.lora_A[adapter].weight.data = weight_A.to(dtype) - self.lora_B[adapter].weight.data = weight_B.to(dtype) - - return output_tensor - - def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: - self._check_forward_args(x, *args, **kwargs) - adapter_names = kwargs.pop("adapter_names", None) - alora_offsets = kwargs.pop("alora_offsets", [1]) #Where to activate adapter weights - if self.disable_adapters or alora_offsets[0] <= 0: - if self.merged: - self.unmerge() - result = self.base_layer(x, *args, **kwargs) - elif adapter_names is not None: - result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, alora_offsets = alora_offsets, **kwargs) - elif self.merged: - result = self.base_layer(x, *args, **kwargs) - else: - result = self.base_layer(x, *args, **kwargs) - torch_result_dtype = result.dtype - for active_adapter in self.active_adapters: - if active_adapter not in self.lora_A.keys(): - continue - lora_A = self.lora_A[active_adapter] - lora_B = self.lora_B[active_adapter] - dropout = self.lora_dropout[active_adapter] - scaling = self.scaling[active_adapter] - x = x.to(lora_A.weight.dtype) - - if not self.use_dora[active_adapter]: - if x.dim() == 2: #only one token, so apply adapter on everything (comes up in some tests) - result = result + lora_B(lora_A(dropout(x))) * scaling - # Only do the last k tokens - elif len(alora_offsets) == 1: - k = min(result.shape[1],alora_offsets[0]) - - if k > 0: - result[:,-k:,:] = 
result[:,-k:,:] + lora_B(lora_A(dropout(x[:,-k:,:]))) * scaling - else: - - for i in range(result.shape[0]): - alora_offsets[i] = min(alora_offsets[i], result.shape[1]) - if ks[i] > 0: - result[i,-alora_offsets[i]:,:] = result[i,-alora_offsets[i]:,:] + lora_B(lora_A(dropout(x[i,-alora_offsets[i]:,:]))) - result = result.to(torch_result_dtype) - - return result - - def __repr__(self) -> str: - rep = super().__repr__() - return "alora." + rep - - - -def dispatch_default( - target: torch.nn.Module, - adapter_name: str, - lora_config: aLoraConfig, - **kwargs, -) -> Optional[torch.nn.Module]: - new_module = None - - if isinstance(target, BaseTunerLayer): +def dispatch_default(target: nn.Module, adapter_name: str, lora_config: aLoraConfig, **kwargs) -> Optional[nn.Module]: + if isinstance(target, LoraLayer): target_base_layer = target.get_base_layer() else: target_base_layer = target - if isinstance(target_base_layer, torch.nn.Embedding): - embedding_kwargs = kwargs.copy() - embedding_kwargs.pop("fan_in_fan_out", None) - embedding_kwargs.update(lora_config.loftq_config) - new_module = Embedding(target, adapter_name, **embedding_kwargs) - elif isinstance(target_base_layer, torch.nn.Conv2d): - kwargs.update(lora_config.loftq_config) - new_module = Conv2d(target, adapter_name, **kwargs) - elif isinstance(target_base_layer, torch.nn.Conv3d): - kwargs.update(lora_config.loftq_config) - new_module = Conv3d(target, adapter_name, **kwargs) - elif isinstance(target_base_layer, torch.nn.Linear): - if kwargs["fan_in_fan_out"]: + if isinstance(target_base_layer, nn.Linear): + if kwargs.get("fan_in_fan_out", False): warnings.warn( - "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. " - "Setting fan_in_fan_out to False." + "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. Setting fan_in_fan_out to False." ) kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False kwargs.update(lora_config.loftq_config) - new_module = Linear(target, adapter_name, **kwargs) - elif isinstance(target_base_layer, Conv1D): - if not kwargs["fan_in_fan_out"]: - warnings.warn( - "fan_in_fan_out is set to False but the target module is `Conv1D`. " "Setting fan_in_fan_out to True." - ) - kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = True - kwargs.update(lora_config.loftq_config) - new_module = Linear(target, adapter_name, is_target_conv_1d_layer=True, **kwargs) - - return new_module + return Linear(target, adapter_name, **kwargs) + return lora_dispatch_default(target, adapter_name, lora_config, **kwargs) diff --git a/src/peft/tuners/alora/model.py b/src/peft/tuners/alora/model.py index 7bf6661da9..8141d4453f 100644 --- a/src/peft/tuners/alora/model.py +++ b/src/peft/tuners/alora/model.py @@ -36,10 +36,13 @@ #from peft.aqlm import dispatch_aqlm #from peft.awq import dispatch_awq from .config import aLoraConfig + #from peft.eetq import dispatch_eetq #from peft.gptq import dispatch_gptq #from peft.hqq import dispatch_hqq from .layer import aLoraLayer, dispatch_default + + #from peft.torchao import dispatch_torchao #from peft.tp_layer import dispatch_megatron @@ -70,13 +73,10 @@ class aLoraModel(BaseTuner): Returns: `torch.nn.Module`: The aLora model. - - **Attributes**: - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted. - **peft_config** ([`aLoraConfig`]): The configuration of the aLora model. 
""" - prefix: str = "lora_" def __init__(self, model, config, adapter_name, low_cpu_mem_usage: bool = False) -> None: @@ -403,7 +403,7 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): ################################ - + if self.training: raise ValueError("Cannot pass `adapter_names` when the model is in training mode.") @@ -437,7 +437,7 @@ def _check_merge_allowed(self): """ raise ValueError("Merging of aLoRA layers is not possible by definition.") - + @staticmethod def _prepare_adapter_config(peft_config, model_config): if peft_config.target_modules is None: diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index 168942c7ce..01250e69a4 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -65,7 +65,13 @@ def unmerge(module: LoraLayer, active_adapter: str, orig_weight: torch.Tensor) - """Remove the adapter weights from the original weights, then return them""" @staticmethod - def forward(module: LoraLayer, active_adapter: str, x: torch.Tensor, result: torch.Tensor) -> torch.Tensor: + def forward( + module: LoraLayer, + active_adapter: str, + x: torch.Tensor, + result: torch.Tensor, + **kwargs, + ) -> torch.Tensor: """ The forward pass of the LoRA variant, should return the overall result (not just the diff) @@ -74,6 +80,7 @@ def forward(module: LoraLayer, active_adapter: str, x: torch.Tensor, result: tor active_adapter (str): The name of the active adapter x (torch.Tensor): The input to the forward call result (torch.Tensor): The result from the base model + **kwargs: Additional arguments passed from [`LoraLayer.forward`]. """ raise NotImplementedError @@ -773,6 +780,7 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: active_adapter=active_adapter, x=x, result=result, + **kwargs, ) result = result.to(torch_result_dtype) @@ -1047,6 +1055,7 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: active_adapter=active_adapter, x=x, result=result, + **kwargs, ) result = result.to(torch_result_dtype) @@ -1323,6 +1332,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: active_adapter=active_adapter, x=x, result=result, + **kwargs, ) result = result.to(torch_result_dtype) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 6b99637390..57bf11a761 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -107,7 +107,13 @@ def unmerge(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> t return new_weight @staticmethod - def forward(module: Linear, active_adapter: str, x: torch.Tensor, result: torch.Tensor) -> torch.Tensor: + def forward( + module: Linear, + active_adapter: str, + x: torch.Tensor, + result: torch.Tensor, + **kwargs, + ) -> torch.Tensor: lora_A = module.lora_A[active_adapter] lora_B = module.lora_B[active_adapter] dropout = module.lora_dropout[active_adapter] @@ -197,7 +203,13 @@ def unmerge(module: Embedding, active_adapter: str, orig_weight: torch.Tensor) - return new_weight @staticmethod - def forward(module: Embedding, active_adapter: str, x: torch.Tensor, result: torch.Tensor) -> torch.Tensor: + def forward( + module: Embedding, + active_adapter: str, + x: torch.Tensor, + result: torch.Tensor, + **kwargs, + ) -> torch.Tensor: embedding_A = module.lora_embedding_A[active_adapter].T embedding_B = module.lora_embedding_B[active_adapter].T scaling = module.scaling[active_adapter] @@ -273,7 +285,13 @@ def unmerge(module: _ConvNd, active_adapter: str, orig_weight: 
torch.Tensor) -> return new_weight @staticmethod - def forward(module: _ConvNd, active_adapter: str, x: torch.Tensor, result: torch.Tensor) -> torch.Tensor: + def forward( + module: _ConvNd, + active_adapter: str, + x: torch.Tensor, + result: torch.Tensor, + **kwargs, + ) -> torch.Tensor: lora_A = module.lora_A[active_adapter] lora_B = module.lora_B[active_adapter] dropout = module.lora_dropout[active_adapter] @@ -380,7 +398,13 @@ def unmerge(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> t raise NotImplementedError("QALoRA for GPTQ layers does not support 'unmerge'.") @staticmethod - def forward(module: Linear, active_adapter: str, x: torch.Tensor, result: torch.Tensor) -> torch.Tensor: + def forward( + module: Linear, + active_adapter: str, + x: torch.Tensor, + result: torch.Tensor, + **kwargs, + ) -> torch.Tensor: lora_A_weight = module.lora_A[active_adapter].weight lora_B_weight = module.lora_B[active_adapter].weight dropout = module.lora_dropout[active_adapter] @@ -411,3 +435,54 @@ def forward(module: Linear, active_adapter: str, x: torch.Tensor, result: torch. delta = delta.view(orig_shape[:-1] + (delta.size(-1),)) return result + delta + + +class ALoraLinearVariant(LoraVariant): + @staticmethod + def init(module: Linear, adapter_name: str, **kwargs: Any) -> None: + pass + + @staticmethod + def merge_safe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + raise NotImplementedError("aLoRA does not support safe merging.") + + @staticmethod + def merge_unsafe(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> None: + raise NotImplementedError("aLoRA does not support merging.") + + @staticmethod + def unmerge(module: Linear, active_adapter: str, orig_weight: torch.Tensor) -> torch.Tensor: + raise NotImplementedError("aLoRA does not support unmerging.") + + @staticmethod + def forward( + module: Linear, + active_adapter: str, + x: torch.Tensor, + result: torch.Tensor, + **kwargs, + ) -> torch.Tensor: + alora_offsets = kwargs.get("alora_offsets", [1]) + + lora_A = module.lora_A[active_adapter] + lora_B = module.lora_B[active_adapter] + dropout = module.lora_dropout[active_adapter] + scaling = module.scaling[active_adapter] + + x = x.to(lora_A.weight.dtype) + + if x.dim() == 2: + result = result + lora_B(lora_A(dropout(x))) * scaling + elif len(alora_offsets) == 1: + k = min(result.shape[1], alora_offsets[0]) + if k > 0: + result[:, -k:, :] = result[:, -k:, :] + lora_B(lora_A(dropout(x[:, -k:, :]))) * scaling + else: + for i in range(result.shape[0]): + offset = min(alora_offsets[i], result.shape[1]) + if offset > 0: + result[i, -offset:, :] = ( + result[i, -offset:, :] + lora_B(lora_A(dropout(x[i, -offset:, :]))) * scaling + ) + + return result From 4de01c7b20018c0251596fcbb4a77ddcc3951c2b Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 3 Jul 2025 13:34:03 -0400 Subject: [PATCH 16/99] add alora to lora config --- src/peft/tuners/lora/config.py | 67 ++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 4 deletions(-) diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 3293e06ccc..d3df3245ff 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -65,9 +65,8 @@ class LoftQConfig: bits. 
""" - loftq_bits: int = field(default=4, metadata={"help": "Quantization bits for LoftQ"}) - loftq_iter: int = field(default=1, metadata={"help": "Alternating iterations for LoftQ"}) - + loftq_bits: str = field(default=4, metadata={"help": "Quantization bits for LoftQ"}) + loftq_iter: str = field(default=1, metadata={"help": "Alternating iterations for LoftQ"}) @dataclass class EvaConfig: @@ -300,6 +299,21 @@ class LoraConfig(PeftConfig): ranks. Right now, DoRA only supports linear and Conv2D layers. DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights for inference. For more information, see https://huggingface.co/papers/2402.09353. + use_alora (`bool`): + Enable 'Activated LoRA' (aLoRA). This technique + selectively activates the adapter weights only on tokens during and after the alora_invocation_tokens. + When used in a CausalLM, this means that the KV cache prior to invocation is interchangeable with that of + the base model (and other aLoRA adapters operating this way). As a result, in inference pipelines involving + switching between base model inference and adapter inference (e.g. agentic pipelines, see paper for many + examples), significant savings are realized (relative to LoRA) by saving prefill operations. Overall adapter + inference speedups of an order of magnitude or more can occur on vLLM, depending on the length of the shared + context. REQUIRED ARGUMENTS: alora_invocation_string, alora_invocation_tokens. These are necessary to know + when to turn on adapter weights. The invocation string therein must be present in all inputs. Note also that + merging is not possible due to the selective application of the weights. + alora_invocation_string (`str`): + Invocation string for aLoRA (must be present in model inputs). Defaults to None. + alora_invocation_tokens (`List[int]`): + Tokenized copy of alora_invocation_string for use when tokenizer is not available. layer_replication (`List[Tuple[int, int]]`): Build a new stack of layers by stacking the original model layers according to the ranges specified. This allows expanding (or shrinking) the model without duplicating the base model weights. The new layers will @@ -498,6 +512,45 @@ class LoraConfig(PeftConfig): ) }, ) + use_alora: bool = field( + default=False, + metadata={ + "help": ( + "Enable 'Activated LoRA' (aLoRA). This technique selectively activates the adapter " + "weights only on tokens during and after the alora_invocation_tokens. When used in a CausalLM, this means that the KV cache prior to invocation " + "is interchangeable with that of the base model (and other aLoRA adapters operating this way). As a result, in inference pipelines involving switching " + "between base model inference and adapter inference (e.g. agentic pipelines, see paper for many examples), significant savings are realized (relative to LoRA) " + "by saving prefill operations. Overall adapter inference speedups of an order of magnitude or more can occur on vLLM, depending on the length of the shared " + "context. " + "NOTE 1: aLoRA often requires higher rank r than LoRA. r=32 often works well." + "NOTE 2: Merging is NOT supported due to the selective application of the adapter weights." + "REQUIRED ARGUMENTS: alora_invocation_string, alora_invocation_tokens. These are necessary to know when to turn on adapter weights. The invocation string therein " + "must be present in all inputs." 
+ ) + }, + ) + alora_invocation_string: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Activated LoRA (aLoRA) invocation string. " + "The adapter weights will be activated 1 token after the last occurence of this string in the input. " + "This string must be present in all inputs. It is best to have this string begin and end with special tokens to avoid tokenizer boundary effects when " + "tokenizing the input. Only used when `use_alora=True`." + ) + }, + ) + alora_invocation_tokens: Optional[list[int]] = field( + default=None, + metadata={ + "help": ( + "Tokenized copy of the Activated LoRA (aLoRA) invocation string alora_invocation_string. " + "The adapter weights will be activated 1 token after the last occurence of this string in the input. " + "These tokens must be present in all inputs after tokenization. It is best to have alora_invocation_string begin and end with special tokens " + "to avoid tokenizer boundary effects when tokenizing the input. Only used when `use_alora=True`." + ) + }, + ) use_qalora: bool = field( default=False, metadata={ @@ -625,7 +678,13 @@ def __post_init__(self): ) if self.use_dora: raise ValueError("The argument lora_bias=True is not supported for DoRA, please pass use_dora=False") - + + #If activated LoRA (aLoRA) is enabled, check for required invocation arguments. + if self.use_alora: + if self.alora_invocation_string is None or self.alora_invocation_tokens is None: + raise ValueError( + "The fields alora_invocation_string and alora_invocation_tokens (tokenized copy of alora_invocation_string) are required to use aLoRA." + ) # Using post training conversion of modified base weights to restore their initial values PiSSA/CorDA/OLoRA cannot # be correctly done when using rslora + rank_pattern/alpha_pattern. We can't really know if the user intends # this when they'll eventually call save_pretrained (i.e. 
if they'll pass From f7cb9d88200afdae0325b2cfa8dff0001fb9e59a Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 3 Jul 2025 13:44:10 -0400 Subject: [PATCH 17/99] Update model.py --- src/peft/tuners/lora/model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index a4b80de53d..1122c30e6e 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -200,6 +200,7 @@ def _create_and_replace( "init_lora_weights": lora_config.init_lora_weights, "use_rslora": lora_config.use_rslora, "use_dora": lora_config.use_dora, + "use_alora": lora_config.use_alora, "use_qalora": lora_config.use_qalora, "qalora_group_size": lora_config.qalora_group_size, "ephemeral_gpu_offload": lora_config.runtime_config.ephemeral_gpu_offload, @@ -233,6 +234,7 @@ def _create_and_replace( init_lora_weights=lora_config.init_lora_weights, use_rslora=lora_config.use_rslora, use_dora=lora_config.use_dora, + use_alora=lora_config.use_alora, lora_bias=lora_config.lora_bias, ) else: From 10d660ff2b70f2b4998c91d750041e3bc8bf40b3 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 3 Jul 2025 13:45:40 -0400 Subject: [PATCH 18/99] Update model.py --- src/peft/tuners/lora/model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 1122c30e6e..6fa4d2c49f 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -234,7 +234,6 @@ def _create_and_replace( init_lora_weights=lora_config.init_lora_weights, use_rslora=lora_config.use_rslora, use_dora=lora_config.use_dora, - use_alora=lora_config.use_alora, lora_bias=lora_config.lora_bias, ) else: From e61a7b294872722c72064bc8e5877a498fb6ebdc Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 3 Jul 2025 14:02:10 -0400 Subject: [PATCH 19/99] Update layer.py for alora --- src/peft/tuners/lora/layer.py | 41 ++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index 01250e69a4..a907872eaf 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -191,6 +191,7 @@ def update_layer( init_lora_weights, use_rslora, use_dora: bool = False, + use_alora: bool = False, use_qalora: bool = False, lora_bias: bool = False, qalora_group_size: int = 32, @@ -205,7 +206,7 @@ def update_layer( raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") lora_variant = self.resolve_lora_variant( - use_dora=use_dora, use_qalora=use_qalora, qalora_group_size=qalora_group_size + use_dora=use_dora, use_alora=use_alora, use_qalora=use_qalora, qalora_group_size=qalora_group_size ) if lora_variant is not None: self.lora_variant[adapter_name] = lora_variant @@ -573,8 +574,18 @@ def _mixed_batch_forward( # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear # layer output sub_batch = x[sub_batch_indices_list[i]].to(lora_A.weight.dtype) - lora_output = lora_B(lora_A(dropout(sub_batch))) * scaling - result[sub_batch_indices_list[i]] += lora_output.to(torch_result_dtype) + if active_adapter not in self.lora_variant: # vanilla LoRA + lora_output = lora_B(lora_A(dropout(sub_batch))) * scaling + result[sub_batch_indices_list[i]] += lora_output.to(torch_result_dtype) + else: + lora_output = self.lora_variant[active_adapter].forward( + self, + active_adapter=active_adapter, + x=x, + result=result, + **kwargs, + ) + result[sub_batch_indices_list[i]] += 
lora_output.to(torch_result_dtype) return result @@ -603,6 +614,7 @@ def __init__( init_lora_weights: Union[bool, str] = True, use_rslora: bool = False, use_dora: bool = False, + use_alora: bool = False, lora_bias: bool = False, **kwargs, ) -> None: @@ -624,12 +636,14 @@ def __init__( self.is_target_conv_1d_layer = is_target_conv_1d_layer def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]: - if not use_dora: + if not use_dora or use_alora: return None - from .variants import DoraLinearVariant - - return DoraLinearVariant() + from .variants import DoraLinearVariant, ALoraLinearVariant + if use_alora: + return ALoraLinearVariant() + else: + return DoraLinearVariant() def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: """ @@ -828,7 +842,12 @@ def __init__( lora_bias=lora_bias, ) - def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]: + def resolve_lora_variant(self, *, use_dora: bool, use_alora: bool, **kwargs) -> Optional[LoraVariant]: + if use_alora: + ValueError( + "aLoRA does not support adapting embedding layers." + ) + return None if not use_dora: return None @@ -1078,6 +1097,7 @@ def __init__( init_lora_weights: Union[bool, str] = True, use_rslora: bool = False, use_dora: bool = False, + use_alora: bool = False, lora_bias: bool = False, **kwargs, ) -> None: @@ -1105,11 +1125,12 @@ def __init__( init_lora_weights=init_lora_weights, use_rslora=use_rslora, use_dora=use_dora, + use_alora=use_alora, lora_bias=lora_bias, ) def update_layer( - self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora, lora_bias + self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora, use_alora, lora_bias ): # collect the kwargs kwargs = locals().copy() @@ -1118,7 +1139,7 @@ def update_layer( if r <= 0: raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") - lora_variant = self.resolve_lora_variant(use_dora=use_dora) + lora_variant = self.resolve_lora_variant(use_dora=use_dora, use_alora=use_alora) if lora_variant is not None: self.lora_variant[adapter_name] = lora_variant From 94ebcb3c10fd08615c17c9b2b4a2f43a795cb1c9 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 3 Jul 2025 15:59:54 -0400 Subject: [PATCH 20/99] Update layer.py for use_alora --- src/peft/tuners/lora/layer.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index a907872eaf..e8a2b9b2ab 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -168,11 +168,11 @@ def __init__(self, base_layer: nn.Module, ephemeral_gpu_offload: bool = False, * self.in_features = in_features self.out_features = out_features - def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]: + def resolve_lora_variant(self, *, use_dora: bool, use_alora: bool, **kwargs) -> Optional[LoraVariant]: """Return a matching LoRA variant for this layer type. Given the init arguments of this layer, return the correct LoRA variant, if any. E.g., if `use_dora=True`, this - method should return the DoRA variant for the given layer. + method should return the DoRA variant for the given layer. If `use_alora=True`, same for aLoRA. If there is no fitting variant, return None. 
@@ -631,11 +631,12 @@ def __init__( init_lora_weights=init_lora_weights, use_rslora=use_rslora, use_dora=use_dora, + use_alora=use_alora, lora_bias=lora_bias, ) self.is_target_conv_1d_layer = is_target_conv_1d_layer - def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]: + def resolve_lora_variant(self, *, use_dora: bool, use_alora: bool, **kwargs) -> Optional[LoraVariant]: if not use_dora or use_alora: return None @@ -842,12 +843,8 @@ def __init__( lora_bias=lora_bias, ) - def resolve_lora_variant(self, *, use_dora: bool, use_alora: bool, **kwargs) -> Optional[LoraVariant]: - if use_alora: - ValueError( - "aLoRA does not support adapting embedding layers." - ) - return None + def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]: + if not use_dora: return None @@ -1097,13 +1094,13 @@ def __init__( init_lora_weights: Union[bool, str] = True, use_rslora: bool = False, use_dora: bool = False, - use_alora: bool = False, lora_bias: bool = False, **kwargs, ) -> None: super().__init__() LoraLayer.__init__(self, base_layer) - + if kwargs.get("use_alora", False): + raise ValueError("aLoRA does not support adapting conv layers.") if base_layer.groups > 1: warnings.warn("LoRA adapter added to ConvNd layer with groups > 1. Merging is not supported.") @@ -1125,12 +1122,11 @@ def __init__( init_lora_weights=init_lora_weights, use_rslora=use_rslora, use_dora=use_dora, - use_alora=use_alora, lora_bias=lora_bias, ) def update_layer( - self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora, use_alora, lora_bias + self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora, lora_bias ): # collect the kwargs kwargs = locals().copy() @@ -1139,7 +1135,7 @@ def update_layer( if r <= 0: raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") - lora_variant = self.resolve_lora_variant(use_dora=use_dora, use_alora=use_alora) + lora_variant = self.resolve_lora_variant(use_dora=use_dora) if lora_variant is not None: self.lora_variant[adapter_name] = lora_variant @@ -1453,7 +1449,8 @@ def __init__( if use_dora: # TODO: probably not so hard to implement raise ValueError(f"{self.__class__.__name__} does not support DoRA (yet), please set use_dora to False") - + if kwargs.get("use_alora", False): + raise ValueError(f"{self.__class__.__name__} does not support aLoRA (yet), please set use_alora to False") super().__init__() LoraLayer.__init__(self, base_layer, **kwargs) From 99046cc3eadf883417e7dfd28d7b210d5156e2e7 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 3 Jul 2025 16:28:06 -0400 Subject: [PATCH 21/99] Update __init__.py --- src/peft/tuners/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/peft/tuners/__init__.py b/src/peft/tuners/__init__.py index 43f739a46c..a6a6a06257 100644 --- a/src/peft/tuners/__init__.py +++ b/src/peft/tuners/__init__.py @@ -14,7 +14,6 @@ from .adalora import AdaLoraConfig, AdaLoraModel from .adaption_prompt import AdaptionPromptConfig, AdaptionPromptModel -from .alora import aLoraConfig, aLoraModel from .boft import BOFTConfig, BOFTModel from .bone import BoneConfig, BoneModel from .c3a import C3AConfig, C3AModel @@ -53,8 +52,6 @@ "AdaLoraModel", "AdaptionPromptConfig", "AdaptionPromptModel", - "aLoraConfig", - "aLoraModel", "BOFTConfig", "BOFTModel", "BoneConfig", From 628a84d472fcbfbfb48e50129fa3b02cfe737444 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 3 Jul 2025 16:31:53 -0400 Subject: [PATCH 22/99] 
Update peft_model.py --- src/peft/peft_model.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index ba098e676c..f5c4345795 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -65,7 +65,6 @@ set_peft_model_state_dict, shift_tokens_right, ) -from .tuners.alora.config import aLoraConfig class PeftModel(PushToHubMixin, torch.nn.Module): @@ -1803,11 +1802,11 @@ def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optio current_peft_config = self.peft_config[current_adapter_name] - if not isinstance(current_peft_config, aLoraConfig): + if not current_peft_config.use_alora: alora_offsets[i] = -1 # Not an aLoRA adapter or wrong type continue - invocation_tokens = getattr(current_peft_config, 'invocation_tokens', None) + invocation_tokens = getattr(current_peft_config, 'alora_invocation_tokens', None) if not invocation_tokens: alora_offsets[i] = -1 # No way to calculate offset continue From c2f83f1cffecfdf87d89c5481d2281c83ce08944 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 3 Jul 2025 16:34:34 -0400 Subject: [PATCH 23/99] Update config.py --- src/peft/tuners/lora/config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index d3df3245ff..0b557b8ce0 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -544,7 +544,9 @@ class LoraConfig(PeftConfig): default=None, metadata={ "help": ( - "Tokenized copy of the Activated LoRA (aLoRA) invocation string alora_invocation_string. " + "Tokenized copy of the Activated LoRA (aLoRA) invocation string alora_invocation_string " + "(as a list of token IDs). Use the model's default tokenizer. " + "E.g. alora_invocation_tokens = tokenizer.encode(alora_invocation_string, add_special_tokens=False)." "The adapter weights will be activated 1 token after the last occurence of this string in the input. " "These tokens must be present in all inputs after tokenization. It is best to have alora_invocation_string begin and end with special tokens " "to avoid tokenizer boundary effects when tokenizing the input. Only used when `use_alora=True`." From 9c737827ae4db97ee2d344df8624e016d8d1b085 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 3 Jul 2025 16:57:10 -0400 Subject: [PATCH 24/99] alora_offsets forward hook --- src/peft/tuners/lora/model.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 6fa4d2c49f..761504a468 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -63,6 +63,10 @@ def _adapter_names_pre_forward_hook(target, args, kwargs, adapter_names): kwargs["adapter_names"] = adapter_names return args, kwargs +def _alora_offsets_pre_forward_hook(target, args, kwargs, alora_offsets): + kwargs["alora_offsets"] = alora_offsets + return args, kwargs + class LoraModel(BaseTuner): """ @@ -428,12 +432,21 @@ def set_adapter(self, adapter_name: str | list[str]) -> None: @contextmanager def _enable_peft_forward_hooks(self, *args, **kwargs): + # If adapter_names is passed as an argument, we inject it into the forward arguments. 
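        # A hedged end-to-end sketch of how this hook machinery is meant to be used. The config
        # fields come from the LoraConfig changes in the earlier patches; the generate-time code
        # that calls _calculate_alora_offsets lives in peft_model.py and is assumed here, and the
        # invocation string below is purely illustrative.
        #
        #   invocation = "<|start_of_role|>certainty<|end_of_role|>"  # illustrative string
        #   cfg = LoraConfig(
        #       r=32,  # aLoRA often needs a higher rank than standard LoRA
        #       use_alora=True,
        #       alora_invocation_string=invocation,
        #       alora_invocation_tokens=tokenizer.encode(invocation, add_special_tokens=False),
        #   )
        #   peft_model = get_peft_model(base_model, cfg)
        #
        # During generation, per-sequence alora_offsets are derived from the last occurrence of
        # alora_invocation_tokens in input_ids (see _calculate_alora_offsets) and injected into
        # every LoraLayer via the pre-forward hook registered below, so the adapter only touches
        # the trailing tokens and the earlier KV cache remains interchangeable with the base model.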
adapter_names = kwargs.pop("adapter_names", None) - if adapter_names is None: + alora_offsets = kwargs.pop("alora_offsets", None) + if adapter_names is None and alora_offsets is None: # nothing to do yield return + hook_handles = [] + for layer in self.modules(): + if isinstance(layer, LoraLayer): + pre_forward = partial(_alora_offsets_pre_forward_hook, alora_offsets = alora_offsets) + handle = layer.register_forward_pre_hook(pre_forward, with_kwargs=True) + hook_handles.append(handle) + if self.training: raise ValueError("Cannot pass `adapter_names` when the model is in training mode.") @@ -463,7 +476,7 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): # encoder part. Further below, the original argument is thus restored for the encoder. adapter_names = sum(([n] * kwargs["num_beams"] for n in adapter_names), []) - hook_handles = [] + for module in self.modules(): if isinstance(module, LoraLayer) or isinstance(module, AuxiliaryTrainingWrapper): pre_forward = partial(_adapter_names_pre_forward_hook, adapter_names=adapter_names) From 4a474142d779e8581e0584d34958ca7f7660361f Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 3 Jul 2025 17:00:13 -0400 Subject: [PATCH 25/99] Delete src/peft/tuners/alora directory --- src/peft/tuners/alora/__init__.py | 23 - src/peft/tuners/alora/config.py | 49 -- src/peft/tuners/alora/layer.py | 83 --- src/peft/tuners/alora/model.py | 896 ------------------------------ 4 files changed, 1051 deletions(-) delete mode 100644 src/peft/tuners/alora/__init__.py delete mode 100644 src/peft/tuners/alora/config.py delete mode 100644 src/peft/tuners/alora/layer.py delete mode 100644 src/peft/tuners/alora/model.py diff --git a/src/peft/tuners/alora/__init__.py b/src/peft/tuners/alora/__init__.py deleted file mode 100644 index d14c3db02b..0000000000 --- a/src/peft/tuners/alora/__init__.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2023-present the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from peft.utils import register_peft_method - -from .config import aLoraConfig -from .layer import Linear, aLoraLayer -from .model import aLoraModel - -__all__ = ["Linear", "aLoraConfig", "aLoraLayer", "aLoraModel"] - -register_peft_method(name="alora", config_cls=aLoraConfig, model_cls=aLoraModel, prefix="lora_", is_mixed_compatible=True) diff --git a/src/peft/tuners/alora/config.py b/src/peft/tuners/alora/config.py deleted file mode 100644 index 17001a7622..0000000000 --- a/src/peft/tuners/alora/config.py +++ /dev/null @@ -1,49 +0,0 @@ -from __future__ import annotations - -import warnings -from dataclasses import dataclass, field - -from peft.tuners.lora import LoraConfig -from peft.utils import PeftType - - -@dataclass -class aLoraConfig(LoraConfig): - """ - This is the configuration class to store the configuration of an [`aLoraModel`]. - - It subclasses PEFT's LoraConfig, modifies the default rank r to 32 (often best), and adds an additional parameter: - r (`int`): aLora attention dimension (the "rank"). 
Typically needs to be higher than used for standard Lora. Default=32. - invocation_string (str): String intended to activate the aLoRA. The aLoRA adapted weights will activate - 1 token after the first token in this string. This string must be present in all input data. - """ - - r: int = field( - default=32, - metadata={ - "help": "aLora attention dimension. Typically needs to be higher than used for standard Lora. Default=32." - }, - ) - invocation_string: str = field( - default=None, - metadata={ - "help": ( - "aLoRA invocation string. The aLoRA adapted weights will activate 1 token after the first token in " - "this string. This string must be present in all input data." - ) - }, - ) - invocation_tokens: list[int] = field( - default=None, - metadata={ - "help": "Tokenized version of `invocation_string` (as a list of token IDs). Use the model's default tokenizer. E.g. invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False)" - }, - ) - - def __post_init__(self): - super().__post_init__() - self.peft_type = PeftType.ALORA - if self.invocation_string is None: - warnings.warn("invocation_string cannot be None for aLoRA.", UserWarning) - if self.invocation_tokens is None: - warnings.warn("invocation_tokens cannot be None for aLoRA.", UserWarning) diff --git a/src/peft/tuners/alora/layer.py b/src/peft/tuners/alora/layer.py deleted file mode 100644 index 2893b692b8..0000000000 --- a/src/peft/tuners/alora/layer.py +++ /dev/null @@ -1,83 +0,0 @@ -from __future__ import annotations - -import warnings -from typing import Optional - -import torch -import torch.nn as nn - -from peft.tuners.lora.layer import Linear as LoraLinear -from peft.tuners.lora.layer import LoraLayer -from peft.tuners.lora.layer import dispatch_default as lora_dispatch_default -from peft.tuners.lora.variants import ALoraLinearVariant - -from .config import aLoraConfig - - -class aLoraLayer(LoraLayer): - def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[object]: - return ALoraLinearVariant() - - def _mixed_batch_forward( - self, - x: torch.Tensor, - *args, - adapter_names: list[str], - alora_offsets: list[int], - **kwargs, - ) -> torch.Tensor: - result = self.base_layer(x, *args, **kwargs) - torch_result_dtype = result.dtype - ks = alora_offsets - unique_adapters = set(adapter_names) - sub_batch_indices_list = [] - for adapter in unique_adapters: - sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) - - for i, active_adapter in enumerate(unique_adapters): - if active_adapter == "__base__": - continue - if active_adapter not in self.lora_A.keys(): - continue - - lora_A = self.lora_A[active_adapter] - lora_B = self.lora_B[active_adapter] - dropout = self.lora_dropout[active_adapter] - scaling = self.scaling[active_adapter] - - sub_batch = x[sub_batch_indices_list[i]].to(lora_A.weight.dtype) - if len(ks) > 1: - ks_batch = ks[sub_batch_indices_list[i]] - for j in range(len(ks_batch)): - k = min(ks_batch[j], result.shape[1]) - lora_output = lora_B(lora_A(dropout(sub_batch[j, -k:, :]))) * scaling - result[sub_batch_indices_list[i][j], -k:, :] += lora_output.to(torch_result_dtype) - else: - ks_batch = ks - k = min(result.shape[1], ks_batch[0]) - lora_output = lora_B(lora_A(dropout(sub_batch[:, -k:, :]))) * scaling - result[sub_batch_indices_list[i], -k:, :] += lora_output.to(torch_result_dtype) - - return result - - -class Linear(LoraLinear, aLoraLayer): - pass - - -def dispatch_default(target: nn.Module, adapter_name: str, lora_config: 
aLoraConfig, **kwargs) -> Optional[nn.Module]: - if isinstance(target, LoraLayer): - target_base_layer = target.get_base_layer() - else: - target_base_layer = target - - if isinstance(target_base_layer, nn.Linear): - if kwargs.get("fan_in_fan_out", False): - warnings.warn( - "fan_in_fan_out is set to True but the target module is `torch.nn.Linear`. Setting fan_in_fan_out to False." - ) - kwargs["fan_in_fan_out"] = lora_config.fan_in_fan_out = False - kwargs.update(lora_config.loftq_config) - return Linear(target, adapter_name, **kwargs) - - return lora_dispatch_default(target, adapter_name, lora_config, **kwargs) diff --git a/src/peft/tuners/alora/model.py b/src/peft/tuners/alora/model.py deleted file mode 100644 index 8141d4453f..0000000000 --- a/src/peft/tuners/alora/model.py +++ /dev/null @@ -1,896 +0,0 @@ - -from __future__ import annotations - -import math -import operator -import warnings -from contextlib import contextmanager -from dataclasses import asdict, replace -from enum import Enum -from functools import partial, reduce -from typing import Literal, Optional - -import torch -from torch import nn -from tqdm import tqdm - -from peft.import_utils import is_bnb_4bit_available, is_bnb_available -from peft.tuners.tuners_utils import ( - BaseTuner, - BaseTunerLayer, - check_target_module_exists, - onload_layer, - replicate_layers, -) -from peft.utils import ( - TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING, - ModulesToSaveWrapper, - _freeze_adapter, - _get_submodules, - get_peft_model_state_dict, - get_quantization_config, -) -from peft.utils.merge_utils import dare_linear, dare_ties, magnitude_prune, task_arithmetic, ties -from peft.utils.other import get_pattern_key - -#from peft.aqlm import dispatch_aqlm -#from peft.awq import dispatch_awq -from .config import aLoraConfig - -#from peft.eetq import dispatch_eetq -#from peft.gptq import dispatch_gptq -#from peft.hqq import dispatch_hqq -from .layer import aLoraLayer, dispatch_default - - -#from peft.torchao import dispatch_torchao -#from peft.tp_layer import dispatch_megatron - - -def _adapter_names_pre_forward_hook(target, args, kwargs, adapter_names,alora_offsets): - # pre-forward hook to inject the adapter_names argument when using mixed adapter batches inference - kwargs["adapter_names"] = adapter_names - if alora_offsets is not None: - kwargs["alora_offsets"] = alora_offsets - return args, kwargs - -def _alora_offsets_pre_forward_hook(target, args, kwargs, alora_offsets): - kwargs["alora_offsets"] = alora_offsets - return args, kwargs - -class aLoraModel(BaseTuner): - """ - Creates Activated Low Rank Adapter (aLoRA) model from a pretrained transformers model. - - The method is described in detail in https://arxiv.org/abs/2504.12397. - - Args: - model ([`torch.nn.Module`]): The model to be adapted. - config ([`aLoraConfig`]): The configuration of the aLora model. - adapter_name (`str`): The name of the adapter, defaults to `"default"`. - low_cpu_mem_usage (`bool`, `optional`, defaults to `False`): - Create empty adapter weights on meta device. Useful to speed up the loading process. - - Returns: - `torch.nn.Module`: The aLora model. - **Attributes**: - - **model** ([`~transformers.PreTrainedModel`]) -- The model to be adapted. - - **peft_config** ([`aLoraConfig`]): The configuration of the aLora model. 
- """ - prefix: str = "lora_" - - def __init__(self, model, config, adapter_name, low_cpu_mem_usage: bool = False) -> None: - super().__init__(model, config, adapter_name, low_cpu_mem_usage=low_cpu_mem_usage) - - def _check_new_adapter_config(self, config: aLoraConfig) -> None: - """ - A helper method to check the config when a new adapter is being added. - - Raise a ValueError if there is something wrong with the config or if it conflicts with existing adapters. - - """ - # TODO: there should be a check if any of the existing adapters actually has bias != "none", or else the check - # does not fully correspond to the error message. - if (len(self.peft_config) > 1) and (config.bias != "none"): - raise ValueError( - f"{self.__class__.__name__} supports only 1 adapter with bias. When using multiple adapters, " - "set bias to 'none' for all adapters." - ) - - @staticmethod - def _check_target_module_exists(lora_config, key): - return check_target_module_exists(lora_config, key) - - def _prepare_model(self, peft_config: aLoraConfig, model: nn.Module): - r""" - A private method to modify the model structure before adapter is applied. - - Args: - peft_config (`PeftConfig`): - The prepared adapter config. - model (`nn.Module`): - The model that is going to be adapted. - """ - if peft_config.layer_replication: - replicate_layers(model, peft_config.layer_replication) - - def _create_and_replace( - self, - lora_config, - adapter_name, - target, - target_name, - parent, - current_key, - ): - if current_key is None: - raise ValueError("Current Key shouldn't be `None`") - - # Regexp matching - Find key which matches current target_name in patterns provided - r_key = get_pattern_key(lora_config.rank_pattern.keys(), current_key) - alpha_key = get_pattern_key(lora_config.alpha_pattern.keys(), current_key) - r = lora_config.rank_pattern.get(r_key, lora_config.r) - alpha = lora_config.alpha_pattern.get(alpha_key, lora_config.lora_alpha) - - kwargs = { - "r": r, - "lora_alpha": alpha, - "lora_dropout": lora_config.lora_dropout, - "fan_in_fan_out": lora_config.fan_in_fan_out, - "init_lora_weights": lora_config.init_lora_weights, - "use_rslora": lora_config.use_rslora, - "use_dora": lora_config.use_dora, - "ephemeral_gpu_offload": lora_config.runtime_config.ephemeral_gpu_offload, - "lora_bias": lora_config.lora_bias, - "loaded_in_8bit": getattr(self.model, "is_loaded_in_8bit", False), - "loaded_in_4bit": getattr(self.model, "is_loaded_in_4bit", False), - } - # for torchao merging, we need the get_apply_tensor_subclass from the quantization config - try: - kwargs["get_apply_tensor_subclass"] = operator.attrgetter( - "hf_quantizer.quantization_config.get_apply_tensor_subclass" - )(self.model) - except AttributeError: - pass - - quant_methods = ["gptq", "aqlm", "awq"] - for quant_method in quant_methods: - quantization_config = get_quantization_config(self.model, method=quant_method) - if quantization_config is not None: - kwargs[f"{quant_method}_quantization_config"] = quantization_config - - # note: AdaLoraLayer is a subclass of LoraLayer, we need to exclude it - from peft.tuners.adalora import AdaLoraLayer - - if isinstance(target, aLoraLayer) and not isinstance(target, AdaLoraLayer): - target.update_layer( - adapter_name, - r, - lora_alpha=alpha, - lora_dropout=lora_config.lora_dropout, - init_lora_weights=lora_config.init_lora_weights, - use_rslora=lora_config.use_rslora, - use_dora=lora_config.use_dora, - lora_bias=lora_config.lora_bias, - ) - else: - new_module = self._create_new_module(lora_config, 
adapter_name, target, **kwargs) - if adapter_name not in self.active_adapters: - # adding an additional adapter: it is not automatically trainable - new_module.requires_grad_(False) - self._replace_module(parent, target_name, new_module, target) - - def _replace_module(self, parent, child_name, new_module, child): - setattr(parent, child_name, new_module) - # It's not necessary to set requires_grad here, as that is handled by - # _mark_only_adapters_as_trainable - - # child layer wraps the original module, unpack it - if hasattr(child, "base_layer"): - child = child.base_layer - - if not hasattr(new_module, "base_layer"): - if hasattr(new_module, "W_q"): # HQQ - new_module.W_q = child.W_q - else: - new_module.weight = child.weight - if hasattr(child, "bias"): - new_module.bias = child.bias - - if getattr(child, "state", None) is not None: - if hasattr(new_module, "base_layer"): - new_module.base_layer.state = child.state - else: - new_module.state = child.state - new_module.to(child.weight.device) - - meta = torch.device("meta") - # dispatch to correct device - for name, module in new_module.named_modules(): - if (self.prefix in name) or ("ranknum" in name): - weight = ( - child.qweight - if hasattr(child, "qweight") - else child.W_q - if hasattr(child, "W_q") - else child.weight - if hasattr(child, "weight") - else next(child.parameters()) - ) - if not any(p.device == meta for p in module.parameters()): - module.to(weight.device) - - def _mark_only_adapters_as_trainable(self, model: nn.Module) -> None: - for n, p in model.named_parameters(): - if self.prefix not in n: - p.requires_grad = False - - for active_adapter in self.active_adapters: - bias = self.peft_config[active_adapter].bias - if bias == "none": - continue - - if bias == "all": - for n, p in model.named_parameters(): - if "bias" in n: - p.requires_grad = True - elif bias == "lora_only": - for m in model.modules(): - if isinstance(m, aLoraLayer) and hasattr(m, "bias") and m.bias is not None: - m.bias.requires_grad = True - else: - raise NotImplementedError(f"Requested bias: {bias}, is not implemented.") - - @staticmethod - def _create_new_module(lora_config, adapter_name, target, **kwargs): - # Collect dispatcher functions to decide what backend to use for the replaced LoRA layer. The order matters, - # because the first match is always used. Therefore, the default layers should be checked last. - dispatchers = [] - - if lora_config._custom_modules: - # Experimental custom LoRA module support. Allows users to pass a custom mapping for unsupported layer - # types by impelementing their own LoRA layers. 
- def dynamic_dispatch_func(target, adapter_name, lora_config, **kwargs): - new_module = None - - if isinstance(target, BaseTunerLayer): - target_base_layer = target.get_base_layer() - else: - target_base_layer = target - - for key, custom_cls in lora_config._custom_modules.items(): - if isinstance(target_base_layer, key): - new_module = custom_cls(target, adapter_name, **kwargs) - break - - return new_module - - dispatchers.append(dynamic_dispatch_func) - - # avoid eager bnb import - if is_bnb_available(): - from peft.tuners.lora.bnb import dispatch_bnb_8bit - - dispatchers.append(dispatch_bnb_8bit) - - if is_bnb_4bit_available(): - from peft.tuners.lora.bnb import dispatch_bnb_4bit - - dispatchers.append(dispatch_bnb_4bit) - - dispatchers.extend( - [ - # dispatch_eetq, - # dispatch_aqlm, -# dispatch_awq, - # dispatch_gptq, - # dispatch_hqq, - # dispatch_torchao, - # dispatch_megatron, - dispatch_default, - ] - ) - - new_module = None - for dispatcher in dispatchers: - new_module = dispatcher(target, adapter_name, lora_config=lora_config, **kwargs) - if new_module is not None: # first match wins - break - - if new_module is None: - # no module could be matched - raise ValueError( - f"Target module {target} is not supported. Currently, only the following modules are supported: " - "`torch.nn.Linear`" - ) - - return new_module - - def __getattr__(self, name: str): - """Forward missing attributes to the wrapped module.""" - try: - return super().__getattr__(name) # defer to nn.Module's logic - except AttributeError: - if name == "model": # see #1892: prevent infinite recursion if class is not initialized - raise - return getattr(self.model, name) - - def get_peft_config_as_dict(self, inference: bool = False): - config_dict = {} - for key, value in self.peft_config.items(): - config = {k: v.value if isinstance(v, Enum) else v for k, v in asdict(value).items()} - if inference: - config["inference_mode"] = True - config_dict[key] = config - return config - - def _set_adapter_layers(self, enabled: bool = True) -> None: - for module in self.model.modules(): - if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)): - module.enable_adapters(enabled) - - def enable_adapter_layers(self) -> None: - """Enable all adapters. - - Call this if you have previously disabled all adapters and want to re-enable them. - """ - self._set_adapter_layers(enabled=True) - - def disable_adapter_layers(self) -> None: - """Disable all adapters. - - When disabling all adapters, the model output corresponds to the output of the base model. - """ - for active_adapter in self.active_adapters: - val = self.peft_config[active_adapter].bias - if val != "none": - msg = ( - f"Careful, disabling adapter layers with bias configured to be '{val}' does not produce the same " - "output as the the base model would without adaption." - ) - warnings.warn(msg) - self._set_adapter_layers(enabled=False) - - def set_adapter(self, adapter_name: str | list[str]) -> None: - """Set the active adapter(s). - - Additionally, this function will set the specified adapters to trainable (i.e., requires_grad=True). If this is - not desired, use the following code. - - ```py - >>> for name, param in model_peft.named_parameters(): - ... if ...: # some check on name (ex. if 'lora' in name) - ... param.requires_grad = False - ``` - - Args: - adapter_name (`str` or `list[str]`): Name of the adapter(s) to be activated. 
- """ - for module in self.model.modules(): - if isinstance(module, aLoraLayer): - if module.merged: - warnings.warn("Adapter cannot be set when the model is merged. Unmerging the model first.") - module.unmerge() - module.set_adapter(adapter_name) - self.active_adapter = adapter_name - - @contextmanager - def _enable_peft_forward_hooks(self, *args, **kwargs): - # If adapter_names is passed as an argument, we inject it into the forward arguments. - adapter_names = kwargs.pop("adapter_names", None) - alora_offsets = kwargs.pop("alora_offsets", None) - if adapter_names is None and alora_offsets is None: - # nothing to do - yield - return - if adapter_names is None: - hook_handles = [] - for module in self.modules(): - if isinstance(module, aLoraLayer): - pre_forward = partial(_alora_offsets_pre_forward_hook, alora_offsets = alora_offsets) - handle = module.register_forward_pre_hook(pre_forward, with_kwargs=True) - hook_handles.append(handle) - yield - - for handle in hook_handles: - handle.remove() - - return - - - - - - - ################################ - - if self.training: - raise ValueError("Cannot pass `adapter_names` when the model is in training mode.") - - # Check that users only passed actually existing adapters. - # Note: We cannot do this on the layer level, as each individual layer may not have each adapter. Still, we want - # to check that there is at least one layer with the given name, or else something like typos can easily slip. - expected_adapters = set() - for layer in self.modules(): - if isinstance(layer, aLoraLayer): - expected_adapters |= layer.lora_A.keys() - expected_adapters |= layer.lora_embedding_A.keys() - unique_adapters = {name for name in adapter_names if name != "__base__"} - unexpected_adapters = unique_adapters - expected_adapters - if unexpected_adapters: - raise ValueError(f"Trying to infer with non-existing adapter(s): {', '.join(sorted(unexpected_adapters))}") - - hook_handles = [] - for module in self.modules(): - if isinstance(module, aLoraLayer) or isinstance(module, ModulesToSaveWrapper): - pre_forward = partial(_adapter_names_pre_forward_hook, adapter_names=adapter_names, alora_offsets = alora_offsets) - handle = module.register_forward_pre_hook(pre_forward, with_kwargs=True) - hook_handles.append(handle) - - yield - - for handle in hook_handles: - handle.remove() - - def _check_merge_allowed(self): - """Merging is not allowed for activated LoRA models - """ - - raise ValueError("Merging of aLoRA layers is not possible by definition.") - - @staticmethod - def _prepare_adapter_config(peft_config, model_config): - if peft_config.target_modules is None: - if model_config["model_type"] not in TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING: - raise ValueError("Please specify `target_modules` in `peft_config`") - peft_config.target_modules = set( - TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING[model_config["model_type"]] - ) - return peft_config - - def _unload_and_optionally_merge( - self, - merge=True, - progressbar: bool = False, - safe_merge: bool = False, - adapter_names: Optional[list[str]] = None, - ): - if merge: - self._check_merge_allowed() - - key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] - desc = "Unloading " + ("and merging " if merge else "") + "model" - for key in tqdm(key_list, disable=not progressbar, desc=desc): - try: - parent, target, target_name = _get_submodules(self.model, key) - except AttributeError: - continue - with onload_layer(target): - if hasattr(target, "base_layer"): - if 
merge: - target.merge(safe_merge=safe_merge, adapter_names=adapter_names) - self._replace_module(parent, target_name, target.get_base_layer(), target) - elif isinstance(target, ModulesToSaveWrapper): - # save any additional trainable modules part of `modules_to_save` - new_module = target.modules_to_save[target.active_adapter] - if hasattr(new_module, "base_layer"): - # check if the module is itself a tuner layer - if merge: - new_module.merge(safe_merge=safe_merge, adapter_names=adapter_names) - new_module = new_module.get_base_layer() - setattr(parent, target_name, new_module) - - return self.model - - def _check_add_weighted_adapter( - self, adapters: list[str], combination_type: str, svd_rank: int | None - ) -> tuple[str, int, str]: - """ - Helper function to check if the arguments to add_weighted_adapter are valid and compatible with the underlying - model. - """ - for adapter in adapters: - if adapter not in list(self.peft_config.keys()): - raise ValueError(f"Adapter {adapter} does not exist") - - # If more than one of the adapters targets the same module with modules_to_save, raise an error, as these - # modules cannot be merged. First, find the ModulesToSaveWrapper instances in the model, then check if they - # have modules for the adapters to be merged. - modules_to_save_wrappers = [module for module in self.modules() if isinstance(module, ModulesToSaveWrapper)] - problematic_wrappers = [ - wrapper - for wrapper in modules_to_save_wrappers - if sum(adapter in wrapper.modules_to_save for adapter in adapters) > 1 - ] - if problematic_wrappers: - raise ValueError( - "Cannot add weighted adapters if they target the same module with modules_to_save, but found " - f"{len(problematic_wrappers)} such instance(s)." - ) - - # if there is only one adapter, we can only use linear merging - combination_type = "linear" if len(adapters) == 1 else combination_type - - adapters_ranks = [self.peft_config[adapter].r for adapter in adapters] - if combination_type in ("linear", "ties", "dare_ties", "dare_linear", "magnitude_prune"): - # all adapters ranks should be same, new rank is just this value - if len(set(adapters_ranks)) != 1: - raise ValueError( - "All adapters must have the same r value when using combination_type linear, ties, dare_ties or " - "dare_linear." - ) - new_rank = adapters_ranks[0] - elif combination_type == "cat": - # adapters ranks may be different, new rank is sum of all ranks - # be careful, because output adapter rank may be really big if mixing a lot of adapters - new_rank = sum(adapters_ranks) - elif combination_type.endswith("svd"): - # new rank is the max of all ranks of the adapters if not provided - new_rank = svd_rank or max(adapters_ranks) - else: - raise ValueError(f"Invalid combination_type: {combination_type}") - - target_module_types = [type(self.peft_config[adapter].target_modules) for adapter in adapters] - if not target_module_types: - raise ValueError(f"Found no adapter matching the names in {adapters}") - if len(set(target_module_types)) > 1: - raise ValueError( - "all adapter configs should follow the same target modules type. " - "Combining adapters with `target_modules` type being a mix of list/set and string is not supported." 
- ) - - if target_module_types[0] is str: - new_target_modules = "|".join(f"({self.peft_config[adapter].target_modules})" for adapter in adapters) - elif target_module_types[0] is set: - new_target_modules = reduce( - operator.or_, (self.peft_config[adapter].target_modules for adapter in adapters) - ) - else: - raise TypeError(f"Invalid type {target_module_types[0]} found in target_modules") - - return combination_type, new_rank, new_target_modules - - def add_weighted_adapter( - self, - adapters: list[str], - weights: list[float], - adapter_name: str, - combination_type: str = "svd", - svd_rank: int | None = None, - svd_clamp: int | None = None, - svd_full_matrices: bool = True, - svd_driver: str | None = None, - density: float | None = None, - majority_sign_method: Literal["total", "frequency"] = "total", - ) -> None: - """ - This method adds a new adapter by merging the given adapters with the given weights. - - When using the `cat` combination_type you should be aware that rank of the resulting adapter will be equal to - the sum of all adapters ranks. So it's possible that the mixed adapter may become too big and result in OOM - errors. - - Args: - adapters (`list`): - List of adapter names to be merged. - weights (`list`): - List of weights for each adapter. - adapter_name (`str`): - Name of the new adapter. - combination_type (`str`): - The merging type can be one of [`svd`, `linear`, `cat`, `ties`, `ties_svd`, `dare_ties`, `dare_linear`, - `dare_ties_svd`, `dare_linear_svd`, `magnitude_prune`, `magnitude_prune_svd`]. When using the `cat` - combination_type, the rank of the resulting adapter is equal to the sum of all adapters ranks (the - mixed adapter may be too big and result in OOM errors). - svd_rank (`int`, *optional*): - Rank of output adapter for svd. If None provided, will use max rank of merging adapters. - svd_clamp (`float`, *optional*): - A quantile threshold for clamping SVD decomposition output. If None is provided, do not perform - clamping. Defaults to None. - svd_full_matrices (`bool`, *optional*): - Controls whether to compute the full or reduced SVD, and consequently, the shape of the returned - tensors U and Vh. Defaults to True. - svd_driver (`str`, *optional*): - Name of the cuSOLVER method to be used. This keyword argument only works when merging on CUDA. Can be - one of [None, `gesvd`, `gesvdj`, `gesvda`]. For more info please refer to `torch.linalg.svd` - documentation. Defaults to None. - density (`float`, *optional*): - Value between 0 and 1. 0 means all values are pruned and 1 means no values are pruned. Should be used - with [`ties`, `ties_svd`, `dare_ties`, `dare_linear`, `dare_ties_svd`, `dare_linear_svd`, - `magnintude_prune`, `magnitude_prune_svd`] - majority_sign_method (`str`): - The method, should be one of ["total", "frequency"], to use to get the magnitude of the sign values. - Should be used with [`ties`, `ties_svd`, `dare_ties`, `dare_ties_svd`] - """ - - if adapter_name in list(self.peft_config.keys()): - return - - combination_type, new_rank, new_target_modules = self._check_add_weighted_adapter( - adapters=adapters, - combination_type=combination_type, - svd_rank=svd_rank, - ) - - self.peft_config[adapter_name] = replace( - self.peft_config[adapters[0]], - r=new_rank, - lora_alpha=new_rank, - target_modules=new_target_modules, - ) - self.inject_adapter(self.model, adapter_name) - - # Do we really need that? 
- _freeze_adapter(self.model, adapter_name) - - key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] - for key in key_list: - _, target, _ = _get_submodules(self.model, key) - if isinstance(target, aLoraLayer): - if adapter_name in target.lora_A: - target_lora_A = target.lora_A[adapter_name].weight - target_lora_B = target.lora_B[adapter_name].weight - elif adapter_name in target.lora_embedding_A: - target_lora_A = target.lora_embedding_A[adapter_name] - target_lora_B = target.lora_embedding_B[adapter_name] - else: - continue - - target_lora_A.data = target_lora_A.data * 0.0 - target_lora_B.data = target_lora_B.data * 0.0 - if combination_type == "cat": - loras_A, loras_B = [], [] - for adapter, weight in zip(adapters, weights): - if adapter in target.lora_A: - current_adapter_lora_A = target.lora_A[adapter].weight - current_adapter_lora_B = target.lora_B[adapter].weight - elif adapter in target.lora_embedding_A: - current_adapter_lora_A = target.lora_embedding_A[adapter] - current_adapter_lora_B = target.lora_embedding_B[adapter] - else: - continue - loras_A.append(current_adapter_lora_A.data * weight * target.scaling[adapter]) - loras_B.append(current_adapter_lora_B.data) - - if len(loras_A) == 0: - raise ValueError("No matching LoRAs found. Please raise an issue on GitHub.") - loras_A = torch.cat(loras_A, dim=0) - loras_B = torch.cat(loras_B, dim=1) - target_lora_A.data[: loras_A.shape[0], :] = loras_A - target_lora_B.data[:, : loras_B.shape[1]] = loras_B - elif combination_type in [ - "svd", - "ties_svd", - "dare_linear_svd", - "dare_ties_svd", - "magnitude_prune_svd", - ]: - target_lora_A.data, target_lora_B.data = self._svd_generalized_task_arithmetic_weighted_adapter( - combination_type, - adapters, - weights, - new_rank, - target, - target_lora_A, - target_lora_B, - density, - majority_sign_method, - svd_clamp, - full_matrices=svd_full_matrices, - driver=svd_driver, - ) - elif combination_type in ["linear", "ties", "dare_linear", "dare_ties", "magnitude_prune"]: - target_lora_A.data, target_lora_B.data = self._generalized_task_arithmetic_weighted_adapter( - combination_type, adapters, weights, target, density, majority_sign_method - ) - - def _svd_generalized_task_arithmetic_weighted_adapter( - self, - combination_type, - adapters, - weights, - new_rank, - target, - target_lora_A, - target_lora_B, - density, - majority_sign_method, - clamp=None, - full_matrices=True, - driver=None, - ): - valid_adapters = [] - valid_weights = [] - is_embedding = any(adapter in target.lora_embedding_A for adapter in adapters) - for adapter, weight in zip(adapters, weights): - if adapter in target.lora_A or adapter in target.lora_embedding_A: - valid_adapters.append(adapter) - valid_weights.append(weight * target.scaling[adapter]) - - # if no valid adapter, nothing to do - if len(valid_adapters) == 0: - raise ValueError("No matching LoRAs found. 
Please raise an issue on Github.") - delta_weight = [target.get_delta_weight(adapter) for adapter in valid_adapters] - valid_weights = torch.tensor(valid_weights).to(delta_weight[0].device) - if combination_type == "svd": - delta_weight = task_arithmetic(delta_weight, valid_weights) - elif combination_type == "ties_svd": - delta_weight = ties(delta_weight, valid_weights, density, majority_sign_method) - elif combination_type == "dare_linear_svd": - delta_weight = dare_linear(delta_weight, valid_weights, density) - elif combination_type == "dare_ties_svd": - delta_weight = dare_ties(delta_weight, valid_weights, density, majority_sign_method) - elif combination_type == "magnitude_prune_svd": - delta_weight = magnitude_prune(delta_weight, valid_weights, density) - else: - raise ValueError(f"Invalid value passed to combination type: {combination_type}") - - conv2d = False #isinstance(target, Conv2d) - if conv2d: - conv2d_1x1 = target.weight.size()[2:4] == (1, 1) - if not conv2d_1x1: - delta_weight = delta_weight.flatten(start_dim=1) - else: - delta_weight = delta_weight.squeeze() - if (hasattr(target, "fan_in_fan_out") and target.fan_in_fan_out) or is_embedding: - delta_weight = delta_weight.T - - # based on https://github.com/kohya-ss/sd-scripts/blob/main/networks/svd_merge_lora.py#L114-L131 - U, S, Vh = torch.linalg.svd(delta_weight, full_matrices=full_matrices, driver=driver) - U = U[:, :new_rank] - S = S[:new_rank] - U = U @ torch.diag(S) - Vh = Vh[:new_rank, :] - if clamp is not None: - dist = torch.cat([U.flatten(), Vh.flatten()]) - hi_val = torch.quantile(dist, clamp) - low_val = -hi_val - U = U.clamp(low_val, hi_val) - Vh = Vh.clamp(low_val, hi_val) - if conv2d: - U = U.reshape(target_lora_B.data.shape) - Vh = Vh.reshape(target_lora_A.data.shape) - return Vh, U - - def _generalized_task_arithmetic_weighted_adapter( - self, - combination_type, - adapters, - weights, - target, - density, - majority_sign_method, - ): - # account weights for LoRA A and B layers. 
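# Illustrative sketch (not part of the patch series): the SVD branch above first merges
# the dense delta weights, then factors the result back into rank-limited LoRA matrices,
# with A taken from Vh and B from U @ diag(S), plus an optional quantile clamp like
# `svd_clamp`. The delta below is a hypothetical stand-in for a merged delta weight.
import torch

new_rank, clamp = 4, 0.99
delta_w = torch.randn(16, 8)                      # merged delta weight, shape (out, in)

U, S, Vh = torch.linalg.svd(delta_w, full_matrices=False)
U = U[:, :new_rank] @ torch.diag(S[:new_rank])    # plays the role of lora_B, (out, r)
Vh = Vh[:new_rank, :]                             # plays the role of lora_A, (r, in)

hi_val = torch.quantile(torch.cat([U.flatten(), Vh.flatten()]), clamp).item()
U, Vh = U.clamp(-hi_val, hi_val), Vh.clamp(-hi_val, hi_val)

approx = U @ Vh                                   # low-rank approximation of delta_w
print(torch.linalg.matrix_rank(approx))           # <= new_rank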
- valid_weights = [] - lora_A_deltas = [] - lora_B_deltas = [] - for adapter, weight in zip(adapters, weights): - if adapter in target.lora_A: - current_adapter_lora_A = target.lora_A[adapter].weight - current_adapter_lora_B = target.lora_B[adapter].weight - elif adapter in target.lora_embedding_A: - current_adapter_lora_A = target.lora_embedding_A[adapter] - current_adapter_lora_B = target.lora_embedding_B[adapter] - else: - continue - valid_weights.append(math.sqrt(weight * target.scaling[adapter])) - lora_A_deltas.append(current_adapter_lora_A.data) - lora_B_deltas.append(current_adapter_lora_B.data) - valid_weights = torch.tensor(valid_weights).to(lora_A_deltas[0].device) - lora_deltas = [lora_A_deltas, lora_B_deltas] - dtype = lora_A_deltas[0].dtype - for i, task_tensors in enumerate(lora_deltas): - if combination_type == "linear": - lora_deltas[i] = task_arithmetic(task_tensors, valid_weights) - elif combination_type == "ties": - lora_deltas[i] = ties(task_tensors, valid_weights, density, majority_sign_method) - elif combination_type == "dare_linear": - lora_deltas[i] = dare_linear(task_tensors, valid_weights, density) - elif combination_type == "dare_ties": - lora_deltas[i] = dare_ties(task_tensors, valid_weights, density, majority_sign_method) - elif combination_type == "magnitude_prune": - lora_deltas[i] = magnitude_prune(task_tensors, valid_weights, density) - else: - raise ValueError("Invalid combination type") - lora_deltas = [delta.to(dtype) for delta in lora_deltas] - return lora_deltas - - def delete_adapter(self, adapter_name: str) -> None: - """ - Deletes an existing adapter. - - Args: - adapter_name (str): Name of the adapter to be deleted. - """ - if adapter_name not in list(self.peft_config.keys()): - raise ValueError(f"Adapter {adapter_name} does not exist") - del self.peft_config[adapter_name] - - key_list = [key for key, _ in self.model.named_modules() if self.prefix not in key] - new_adapter = None - for key in key_list: - _, target, _ = _get_submodules(self.model, key) - if isinstance(target, aLoraLayer): - target.delete_adapter(adapter_name) - if new_adapter is None: - new_adapter = target.active_adapters[:] - - self.active_adapter = new_adapter or [] - - def merge_and_unload( - self, progressbar: bool = False, safe_merge: bool = False, adapter_names: Optional[list[str]] = None - ) -> torch.nn.Module: - r""" - This method merges the LoRa layers into the base model. This is needed if someone wants to use the base model - as a standalone model. - - Args: - progressbar (`bool`): - whether to show a progressbar indicating the unload and merge process - safe_merge (`bool`): - whether to activate the safe merging check to check if there is any potential Nan in the adapter - weights - adapter_names (`List[str]`, *optional*): - The list of adapter names that should be merged. If None, all active adapters will be merged. Defaults - to `None`. - Example: - - ```py - >>> from transformers import AutoModelForCausalLM - >>> from peft import PeftModel - - >>> base_model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b") - >>> peft_model_id = "smangrul/falcon-40B-int4-peft-lora-sfttrainer-sample" - >>> model = PeftModel.from_pretrained(base_model, peft_model_id) - >>> merged_model = model.merge_and_unload() - ``` - """ - return self._unload_and_optionally_merge( - progressbar=progressbar, safe_merge=safe_merge, adapter_names=adapter_names - ) - - def unload(self) -> torch.nn.Module: - """ - Gets back the base model by removing all the lora modules without merging. 
This gives back the original base - model. - """ - return self._unload_and_optionally_merge(merge=False) - - def subtract_mutated_init(self, output_state_dict: dict[str, torch.Tensor], adapter_name: str, kwargs=None): - """ - This function can calculate the updates of the [PiSSA | OLoRA] by comparing the parameters of the [PiSSA | - OLoRA] adapter in `output_state_dict` with the initial values of [PiSSA | OLoRA] in `adapter_name`, thus - converting [PiSSA | OLoRA] to LoRA. - """ - for name, param in self.model.named_parameters(): - if ( - param.data.dtype != torch.float32 - and param.data.dtype != torch.float16 - and param.data.dtype != torch.bfloat16 - ) and adapter_name.startswith("pissa"): - warnings.warn( - r"Note that Quant(W_res) + AB != Quant(W) + \Delta(AB); " - "the converted LoRA, when combined with W or Quant(W), may introduce a certain gap in the fine-tuned model. " - "Therefore, we recommend directly using the Quant(W_res) in conjunction with the PiSSA adapter. " - ) - mutated_init_state_dict = get_peft_model_state_dict( - self, - state_dict=kwargs.get("state_dict", None), - adapter_name=adapter_name, - ) - tensors_lora = {} - for name in output_state_dict.keys(): - ## W = W^{res} + A_0 \times B_0, - ## W + \Delta W = W^{res} + A \times B, - ## \Delta W = A \times B - A_0 \times B_0 = [A | A_0] \times [B | -B_0]^T = A'B'. - if "lora_A" in name: - tensors_lora[name] = torch.cat( - [output_state_dict[name], mutated_init_state_dict[".".join(name.split(".")[1:])]], dim=0 - ) - elif "lora_B" in name: - tensors_lora[name] = torch.cat( - [output_state_dict[name], -mutated_init_state_dict[".".join(name.split(".")[1:])]], dim=1 - ) - - return tensors_lora From 460835120cefc27ba725305800949ccc3fb505d6 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 3 Jul 2025 17:11:46 -0400 Subject: [PATCH 26/99] Check use_alora flag for aLoRA --- src/peft/peft_model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index e87c01b168..f26a9ff769 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -1893,14 +1893,14 @@ def forward( adapter_names_for_offset_calc = kwargs.get("adapter_names") is_alora_relevant = False - if self.active_peft_config.peft_type == PeftType.ALORA: + if getattr(self.active_peft_config, "use_alora", False): is_alora_relevant = True elif adapter_names_for_offset_calc: for name in adapter_names_for_offset_calc: if name == "__base__": continue config_ = self.peft_config.get(name) - if config_ and config_.peft_type == PeftType.ALORA: + if config_ and getattr(config_, "use_alora", False): is_alora_relevant = True break @@ -2057,14 +2057,14 @@ def generate(self, *args, **kwargs): adapter_names_for_offset_calc = kwargs.get("adapter_names") is_alora_relevant_in_generate = False - if self.active_peft_config.peft_type == PeftType.ALORA: + if getattr(self.active_peft_config, "use_alora", False): is_alora_relevant_in_generate = True elif adapter_names_for_offset_calc: for name in adapter_names_for_offset_calc: if name == "__base__": continue config_ = self.peft_config.get(name) - if config_ and config_.peft_type == PeftType.ALORA: + if config_ and getattr(config_, "use_alora", False): is_alora_relevant_in_generate = True break From 191a605a3102124c96bb9d7cb1ee3eb90a2a8df9 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Sun, 13 Jul 2025 18:28:53 +0000 Subject: [PATCH 27/99] inference working --- src/peft/__init__.py | 4 -- src/peft/peft_model.py | 15 ++++-- 
src/peft/tuners/lora/layer.py | 17 +++++-- src/peft/tuners/lora/model.py | 86 ++++++++++++++++---------------- src/peft/tuners/lora/variants.py | 25 ++++++---- 5 files changed, 82 insertions(+), 65 deletions(-) diff --git a/src/peft/__init__.py b/src/peft/__init__.py index e42da2efe7..91ca5cb14b 100644 --- a/src/peft/__init__.py +++ b/src/peft/__init__.py @@ -50,8 +50,6 @@ AdaLoraModel, AdaptionPromptConfig, AdaptionPromptModel, - aLoraConfig, - aLoraModel, BOFTConfig, BOFTModel, BoneConfig, @@ -129,8 +127,6 @@ "AdaLoraModel", "AdaptionPromptConfig", "AdaptionPromptModel", - "aLoraConfig", - "aLoraModel", "AutoPeftModel", "AutoPeftModelForCausalLM", "AutoPeftModelForFeatureExtraction", diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index 8ae84636c7..4b66142c08 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -1832,19 +1832,19 @@ def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optio continue if current_adapter_name not in self.peft_config: - warnings.warn(f"Adapter '{current_adapter_name}' not found in peft_config. Using offset -1 for row {i}.") + warnings.warn(f"Adapter '{current_adapter_name}' not found in peft_config. Using base model for row {i}.") alora_offsets[i] = -1 continue current_peft_config = self.peft_config[current_adapter_name] if not current_peft_config.use_alora: - alora_offsets[i] = -1 # Not an aLoRA adapter or wrong type + alora_offsets[i] = None # Not an aLoRA adapter or wrong type continue invocation_tokens = getattr(current_peft_config, 'alora_invocation_tokens', None) if not invocation_tokens: - alora_offsets[i] = -1 # No way to calculate offset + alora_offsets[i] = None # No way to calculate offset continue if current_adapter_name not in cached_invocation_tensors: @@ -1873,7 +1873,14 @@ def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optio if best_match_start_idx != -1: offset_val = seq_len - best_match_start_idx alora_offsets[i] = offset_val if offset_val > 0 else -1 - else: + else: # Invocation sequence not found in input + warnings.warn( + f"Could not find alora_invocation_tokens for specified aLoRA adapter in the " + f'following instance' + f'{sequence}' + f'Invocation tokens: {current_invocation_ids_tensor} \n' + f"Defaulting to base model. " + ) alora_offsets[i] = -1 return alora_offsets diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index e8a2b9b2ab..9279e9effd 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -34,6 +34,7 @@ from .config import LoraConfig +VARIANT_KWARG_KEYS = ["alora_offsets"] class LoraVariant: """ @@ -552,6 +553,7 @@ def _mixed_batch_forward( ) -> torch.Tensor: # This is a special method that handles the case when users pass the argument `adapter_names`. This is an # extra argument that allows mixing different adapters in the same batch at inference time. 
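# Illustrative sketch (not part of the patch series): the offset rule implemented by
# `_calculate_alora_offsets` above finds the last occurrence of the invocation-token
# subsequence in a row of input_ids and measures its distance to the end of the
# sequence; -1 means "fall back to the base model". Token values here are toy
# placeholders, not output of any particular tokenizer.
import torch

input_ids = torch.tensor([5, 9, 1, 2, 3, 7, 1, 2, 3, 4])   # one row of the batch
invocation = torch.tensor([1, 2, 3])                        # alora_invocation_tokens

seq_len, inv_len = input_ids.shape[0], invocation.shape[0]
best_start = -1
for start in (input_ids == invocation[0]).nonzero(as_tuple=True)[0]:
    start = start.item()
    if start + inv_len <= seq_len and torch.equal(input_ids[start : start + inv_len], invocation):
        best_start = max(best_start, start)

offset = seq_len - best_start if best_start != -1 else -1
print(offset)  # 4: the adapter weights act on the last 4 positions of this row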
+ variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer result = self.base_layer(x, *args, **kwargs) torch_result_dtype = result.dtype @@ -583,6 +585,7 @@ def _mixed_batch_forward( active_adapter=active_adapter, x=x, result=result, + **variant_kwargs, **kwargs, ) result[sub_batch_indices_list[i]] += lora_output.to(torch_result_dtype) @@ -637,7 +640,7 @@ def __init__( self.is_target_conv_1d_layer = is_target_conv_1d_layer def resolve_lora_variant(self, *, use_dora: bool, use_alora: bool, **kwargs) -> Optional[LoraVariant]: - if not use_dora or use_alora: + if not use_dora and not use_alora: return None from .variants import DoraLinearVariant, ALoraLinearVariant @@ -764,13 +767,14 @@ def get_delta_weight(self, adapter) -> torch.Tensor: def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: self._check_forward_args(x, *args, **kwargs) adapter_names = kwargs.pop("adapter_names", None) - + variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer + if self.disable_adapters: if self.merged: self.unmerge() result = self.base_layer(x, *args, **kwargs) elif adapter_names is not None: - result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs) + result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names,**variant_kwargs, **kwargs) elif self.merged: result = self.base_layer(x, *args, **kwargs) else: @@ -795,6 +799,7 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: active_adapter=active_adapter, x=x, result=result, + **variant_kwargs, **kwargs, ) @@ -1043,7 +1048,7 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: # TODO: no dtype conversion here, unlike in Linear, is that correct? self._check_forward_args(x, *args, **kwargs) adapter_names = kwargs.pop("adapter_names", None) - + variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer if self.disable_adapters: if self.merged: self.unmerge() @@ -1071,6 +1076,7 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: active_adapter=active_adapter, x=x, result=result, + **variant_kwargs, **kwargs, ) result = result.to(torch_result_dtype) @@ -1318,7 +1324,7 @@ def get_delta_weight(self, adapter) -> torch.Tensor: def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: self._check_forward_args(x, *args, **kwargs) adapter_names = kwargs.pop("adapter_names", None) - + variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer if self.disable_adapters: if self.merged: self.unmerge() @@ -1349,6 +1355,7 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: active_adapter=active_adapter, x=x, result=result, + **variant_kwargs, **kwargs, ) diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 761504a468..e3b9054870 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -447,53 +447,53 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): handle = layer.register_forward_pre_hook(pre_forward, with_kwargs=True) hook_handles.append(handle) - - if self.training: - raise ValueError("Cannot pass `adapter_names` when the model is in training mode.") - - # Check that users only passed actually existing adapters. - # Note: We cannot do this on the layer level, as each individual layer may not have each adapter. 
Still, we want - # to check that there is at least one layer with the given name, or else something like typos can easily slip. - expected_adapters = set() - for layer in self.modules(): - if isinstance(layer, LoraLayer): - expected_adapters |= layer.lora_A.keys() - expected_adapters |= layer.lora_embedding_A.keys() - unique_adapters = {name for name in adapter_names if name != "__base__"} - unexpected_adapters = unique_adapters - expected_adapters - if unexpected_adapters: - raise ValueError(f"Trying to infer with non-existing adapter(s): {', '.join(sorted(unexpected_adapters))}") - - # deal with beam search - num_beams = kwargs.get("num_beams", None) - uses_beam_search = isinstance(num_beams, int) and (num_beams > 1) - original_adapter_names = adapter_names[:] - if uses_beam_search: - if not isinstance(adapter_names, (list, tuple)): - raise TypeError(f"Got adapter names of type {type(adapter_names)}, expected a list of str.") - # When there is beam search, the inputs are repeated n times, thus we repeat each adapter name n times and - # then flatten the nested list. For encoder-decoder models, this extended list should not be applied to the - # encoder part. Further below, the original argument is thus restored for the encoder. - adapter_names = sum(([n] * kwargs["num_beams"] for n in adapter_names), []) - - - for module in self.modules(): - if isinstance(module, LoraLayer) or isinstance(module, AuxiliaryTrainingWrapper): - pre_forward = partial(_adapter_names_pre_forward_hook, adapter_names=adapter_names) - handle = module.register_forward_pre_hook(pre_forward, with_kwargs=True) - hook_handles.append(handle) - - if uses_beam_search and hasattr(self.model, "get_encoder"): - # For encoder-decoder models, even when applying beam search, the encoder part of the model should not use - # the extended adapter_names. This is because the encoder still uses the original, non-extended samples. - for module in self.model.get_encoder().modules(): + if adapter_names is not None: + if self.training: + raise ValueError("Cannot pass `adapter_names` when the model is in training mode.") + + # Check that users only passed actually existing adapters. + # Note: We cannot do this on the layer level, as each individual layer may not have each adapter. Still, we want + # to check that there is at least one layer with the given name, or else something like typos can easily slip. + expected_adapters = set() + for layer in self.modules(): + if isinstance(layer, LoraLayer): + expected_adapters |= layer.lora_A.keys() + expected_adapters |= layer.lora_embedding_A.keys() + unique_adapters = {name for name in adapter_names if name != "__base__"} + unexpected_adapters = unique_adapters - expected_adapters + if unexpected_adapters: + raise ValueError(f"Trying to infer with non-existing adapter(s): {', '.join(sorted(unexpected_adapters))}") + + # deal with beam search + num_beams = kwargs.get("num_beams", None) + uses_beam_search = isinstance(num_beams, int) and (num_beams > 1) + original_adapter_names = adapter_names[:] + if uses_beam_search: + if not isinstance(adapter_names, (list, tuple)): + raise TypeError(f"Got adapter names of type {type(adapter_names)}, expected a list of str.") + # When there is beam search, the inputs are repeated n times, thus we repeat each adapter name n times and + # then flatten the nested list. For encoder-decoder models, this extended list should not be applied to the + # encoder part. Further below, the original argument is thus restored for the encoder. 
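# Illustrative sketch (not part of the patch series): the beam-search handling described
# in the comment above duplicates each per-row adapter name num_beams times and flattens
# the result, mirroring how generate() repeats every input row once per beam.
num_beams = 3
adapter_names = ["adapter_a", "__base__"]

expanded = sum(([name] * num_beams for name in adapter_names), [])
print(expanded)
# ['adapter_a', 'adapter_a', 'adapter_a', '__base__', '__base__', '__base__']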
+ adapter_names = sum(([n] * kwargs["num_beams"] for n in adapter_names), []) + + + for module in self.modules(): if isinstance(module, LoraLayer) or isinstance(module, AuxiliaryTrainingWrapper): - # Add another hook to overwrite the kwargs with the original adapter names -- this is easier than - # trying to exclude the encoder. - pre_forward = partial(_adapter_names_pre_forward_hook, adapter_names=original_adapter_names) + pre_forward = partial(_adapter_names_pre_forward_hook, adapter_names=adapter_names) handle = module.register_forward_pre_hook(pre_forward, with_kwargs=True) hook_handles.append(handle) + if uses_beam_search and hasattr(self.model, "get_encoder"): + # For encoder-decoder models, even when applying beam search, the encoder part of the model should not use + # the extended adapter_names. This is because the encoder still uses the original, non-extended samples. + for module in self.model.get_encoder().modules(): + if isinstance(module, LoraLayer) or isinstance(module, AuxiliaryTrainingWrapper): + # Add another hook to overwrite the kwargs with the original adapter names -- this is easier than + # trying to exclude the encoder. + pre_forward = partial(_adapter_names_pre_forward_hook, adapter_names=original_adapter_names) + handle = module.register_forward_pre_hook(pre_forward, with_kwargs=True) + hook_handles.append(handle) + yield for handle in hook_handles: diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 57bf11a761..2b4edc6c73 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -470,19 +470,26 @@ def forward( scaling = module.scaling[active_adapter] x = x.to(lora_A.weight.dtype) - + if x.dim() == 2: result = result + lora_B(lora_A(dropout(x))) * scaling elif len(alora_offsets) == 1: - k = min(result.shape[1], alora_offsets[0]) - if k > 0: - result[:, -k:, :] = result[:, -k:, :] + lora_B(lora_A(dropout(x[:, -k:, :]))) * scaling + if alora_offsets[0] is None: + # run as lora + result[:, :, :] = result[:, :, :] + lora_B(lora_A(dropout(x[:, :, :]))) * scaling + else: + offset = min(result.shape[1], alora_offsets[0]) + if offset > 0: + result[:, -offset:, :] = result[:, -offset:, :] + lora_B(lora_A(dropout(x[:, -offset:, :]))) * scaling else: for i in range(result.shape[0]): - offset = min(alora_offsets[i], result.shape[1]) - if offset > 0: - result[i, -offset:, :] = ( - result[i, -offset:, :] + lora_B(lora_A(dropout(x[i, -offset:, :]))) * scaling - ) + if alora_offsets[i] is None: # run as lora + result[:, :, :] = result[:, :, :] + lora_B(lora_A(dropout(x[:, :, :]))) * scaling + else: + offset = min(alora_offsets[i], result.shape[1]) + if offset > 0: + result[i, -offset:, :] = ( + result[i, -offset:, :] + lora_B(lora_A(dropout(x[i, -offset:, :]))) * scaling + ) return result From 475ee8f04398ccf2367aeb4814e6af812fe97dea Mon Sep 17 00:00:00 2001 From: Kristjan Greenewald Date: Sun, 13 Jul 2025 17:41:33 -0400 Subject: [PATCH 28/99] update tests --- tests/test_custom_models.py | 92 ++++++++----------------------------- 1 file changed, 20 insertions(+), 72 deletions(-) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 7ac669b172..8a08c543d2 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -32,7 +32,6 @@ from transformers.pytorch_utils import Conv1D from peft import ( - aLoraConfig, AdaLoraConfig, BOFTConfig, BoneConfig, @@ -90,6 +89,15 @@ LoraConfig, {"target_modules": "lin1", "use_dora": True, "lora_alpha": 32}, ), + # Activated LoRA (aLoRA) + 
("Vanilla MLP 9 Activated LoRA (aLoRA)", "MLP", LoraConfig, {"target_modules": ["lin0"], "use_alora": True, "alora_invocation_tokens": [1, 2, 3], "alora_invocation_string": '123'}), + ("Vanilla MLP 10 Activated LoRA (aLoRA)", "MLP", LoraConfig, {"target_modules": ["lin0", "lin1"], "use_alora": True, "alora_invocation_tokens": [1, 2, 3], "alora_invocation_string": '123'}), + ( + "Vanilla MLP 11 Activated LoRA (aLoRA)", + "MLP", + LoraConfig, + {"target_modules": "lin1", "use_alora": True, "alora_invocation_tokens": [1, 2, 3], "alora_invocation_string": '123', "lora_alpha": 32}, + ), ("Embedding + transformers Conv1D 1 LoRA", "EmbConv1D", LoraConfig, {"target_modules": ["conv1d"]}), ("Embedding + transformers Conv1D 2 LoRA", "EmbConv1D", LoraConfig, {"target_modules": ["emb"]}), ("Embedding + transformers Conv1D 3 LoRA", "EmbConv1D", LoraConfig, {"target_modules": ["emb", "conv1d"]}), @@ -302,51 +310,6 @@ }, ), ######## - # aLoRA # - ######## - ( - "Vanilla MLP 1 aLoRA", - "MLP", - aLoraConfig, - {"target_modules": "lin0", "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, - ), - ( - "Vanilla MLP 2 aLoRA", - "MLP", - aLoraConfig, - {"target_modules": ["lin0"], "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, - ), - ( - "Vanilla MLP 3 aLoRA", - "MLP", - aLoraConfig, - {"target_modules": ["lin1"], "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, - ), - ( - "Vanilla MLP 4 aLoRA", - "MLP", - aLoraConfig, - {"target_modules": ["lin0", "lin1"], "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, - ), - ( - "Vanilla MLP 5 aLoRA", - "MLP", - aLoraConfig, - {"target_modules": ["lin0"], "modules_to_save": ["lin1"], "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, - ), - ( - "Vanilla MLP 6 aLoRA", - "MLP", - aLoraConfig, - { - "target_modules": ["lin0"], - "lora_alpha": 4, - "lora_dropout": 0.1, - "invocation_tokens": [1, 2, 3], - "invocation_string": '123', - }, - ), - ######## # OFT # ######## ( @@ -784,20 +747,6 @@ {"target_modules": ["lin0"], "init_lora_weights": False, "inference_mode": True, "total_step": 1}, {"target_modules": ["lin1"], "init_lora_weights": False, "inference_mode": True, "total_step": 1}, ), - ( - "aLoRA Same", - "alora", - aLoraConfig, - {"target_modules": ["lin0"], "init_lora_weights": False, "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, - {"target_modules": ["lin0"], "init_lora_weights": False, "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, - ), - ( - "aLoRA Different", - "alora", - aLoraConfig, - {"target_modules": ["lin0"], "init_lora_weights": False, "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, - {"target_modules": ["lin1"], "init_lora_weights": False, "invocation_tokens": [1, 2, 3], "invocation_string": '123'}, - ), ( "FourierFT Same", "fourierft", @@ -890,7 +839,6 @@ PREFIXES = { IA3Config: "ia3_", - aLoraConfig: "lora_", LoraConfig: "lora_", LoHaConfig: "hada_", LoKrConfig: "lokr_", @@ -1339,7 +1287,7 @@ def test_merge_layers(self, test_name, model_id, config_cls, config_kwargs): pytest.skip( f"Skipping test for {model_id} as merging is not supported. 
(See https://github.com/huggingface/peft/pull/2403 for details)" ) - if issubclass(config_cls, aLoraConfig): + if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: pytest.skip("aLoRA does not support merging.") config_kwargs = config_kwargs.copy() @@ -1364,7 +1312,7 @@ def test_merge_layers_fp16(self, test_name, model_id, config_cls, config_kwargs) pytest.skip( f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" ) - if issubclass(config_cls, aLoraConfig): + if issubclass(config_cls, LoraConfig) and LoraConfig.use_alora is True: pytest.skip("aLoRA does not support merging.") config_kwargs = config_kwargs.copy() @@ -1381,7 +1329,7 @@ def test_merge_layers_is_idempotent(self, test_name, model_id, config_cls, confi pytest.skip( f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" ) - if issubclass(config_cls, aLoraConfig): + if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: pytest.skip("aLoRA does not support merging.") # calling merge twice with the same arguments should not change the output @@ -1399,7 +1347,7 @@ def test_safe_merge(self, test_name, model_id, config_cls, config_kwargs): pytest.skip( f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" ) - if issubclass(config_cls, aLoraConfig): + if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: pytest.skip(f"Skipping test as merging is not supported for aLora.") # calling merge twice with the same arguments should not change the output config_kwargs = config_kwargs.copy() @@ -1493,7 +1441,7 @@ def test_forward_float16(self, test_name, model_id, config_cls, config_kwargs): if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return - if issubclass(config_cls, aLoraConfig): + if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) @@ -1537,7 +1485,7 @@ def test_forward_bfloat16(self, test_name, model_id, config_cls, config_kwargs): if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return - if issubclass(config_cls, aLoraConfig): + if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) @@ -1580,7 +1528,7 @@ def test_forward_float16_no_autocast(self, test_name, model_id, config_cls, conf if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return - if issubclass(config_cls, aLoraConfig): + if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) @@ -1623,7 +1571,7 @@ def test_forward_bfloat16_no_autocast(self, test_name, model_id, config_cls, con if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return - if issubclass(config_cls, aLoraConfig): + if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) @@ -1797,7 +1745,7 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co pytest.skip( f"Skipping test for {model_id} as merging is not supported. 
(See https://github.com/huggingface/peft/pull/2403 for details)" ) - if issubclass(config_cls, aLoraConfig): + if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: pytest.skip("aLoRA does not support merging.") # same as test_disable_adapters, but with merging @@ -2796,7 +2744,7 @@ def test_multiple_active_adapters_forward( def test_multiple_active_adapters_merge_and_unmerge( self, test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2 ): - if issubclass(config_cls, aLoraConfig): + if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: pytest.skip("aLoRA does not support merging.") torch.manual_seed(0) @@ -2832,7 +2780,7 @@ def test_multiple_active_adapters_merge_and_unmerge( "test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2", MULTIPLE_ACTIVE_ADAPTERS_TEST_CASES ) def test_merge_layers_multi(self, test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2): - if issubclass(config_cls, aLoraConfig): + if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: pytest.skip("aLoRA does not support merging.") torch.manual_seed(0) From 3781a81f0979aa4249e0c86bbb3362bc2a8c3d22 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Mon, 14 Jul 2025 02:49:39 +0000 Subject: [PATCH 29/99] tests passing --- src/peft/peft_model.py | 35 ++++++++++++++++---------------- src/peft/tuners/lora/config.py | 20 +++++++++--------- src/peft/tuners/lora/layer.py | 7 ++++--- src/peft/tuners/lora/variants.py | 2 +- src/peft/utils/peft_types.py | 4 +--- tests/test_custom_models.py | 24 +++++++++++----------- 6 files changed, 45 insertions(+), 47 deletions(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index 4b66142c08..eb97d95a9d 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -47,7 +47,6 @@ from .config import PeftConfig from .mapping import PEFT_TYPE_TO_CONFIG_MAPPING, PEFT_TYPE_TO_PREFIX_MAPPING, PEFT_TYPE_TO_TUNER_MAPPING from .utils import ( - PeftType, SAFETENSORS_WEIGHTS_NAME, TRANSFORMERS_MODELS_TO_PREFIX_TUNING_POSTPROCESS_MAPPING, WEIGHTS_NAME, @@ -117,7 +116,7 @@ def __init__( # These args are special PEFT arguments that users can pass. They need to be removed before passing them to # forward. self.special_peft_forward_args = {"adapter_names", "alora_offsets"} - + self._is_prompt_learning = peft_config.is_prompt_learning if self._is_prompt_learning: self._peft_config = {adapter_name: peft_config} @@ -1813,7 +1812,7 @@ def __init__( super().__init__(model, peft_config, adapter_name, **kwargs) self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation - + def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optional[list[str]] = None) -> list[int]: if input_ids is None: return [] @@ -1835,13 +1834,13 @@ def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optio warnings.warn(f"Adapter '{current_adapter_name}' not found in peft_config. 
Using base model for row {i}.") alora_offsets[i] = -1 continue - + current_peft_config = self.peft_config[current_adapter_name] - if not current_peft_config.use_alora: + if not current_peft_config.use_alora: alora_offsets[i] = None # Not an aLoRA adapter or wrong type continue - + invocation_tokens = getattr(current_peft_config, 'alora_invocation_tokens', None) if not invocation_tokens: alora_offsets[i] = None # No way to calculate offset @@ -1849,7 +1848,7 @@ def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optio if current_adapter_name not in cached_invocation_tensors: cached_invocation_tensors[current_adapter_name] = torch.tensor(invocation_tokens, dtype=torch.long, device=input_ids.device) - + adapters_to_process_indices[current_adapter_name].append(i) for adapter_name_to_process, indices in adapters_to_process_indices.items(): @@ -1860,7 +1859,7 @@ def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optio sequence = input_ids[i] seq_len = len(sequence) best_match_start_idx = -1 - + possible_starts = (sequence == current_invocation_ids_tensor[0]).nonzero(as_tuple=True)[0] for start_idx_tensor in possible_starts: @@ -1869,7 +1868,7 @@ def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optio if torch.equal(sequence[idx : idx + invocation_len], current_invocation_ids_tensor): if idx > best_match_start_idx: best_match_start_idx = idx - + if best_match_start_idx != -1: offset_val = seq_len - best_match_start_idx alora_offsets[i] = offset_val if offset_val > 0 else -1 @@ -1912,16 +1911,16 @@ def forward( if config_ and getattr(config_, "use_alora", False): is_alora_relevant = True break - + if is_alora_relevant: - alora_offsets = kwargs.get("alora_offsets") + alora_offsets = kwargs.get("alora_offsets") if alora_offsets is None: if input_ids is None and inputs_embeds is not None: warnings.warn("Cannot calculate aLoRA offsets when only inputs_embeds are provided. Disabling aLoRA for this forward pass.") alora_offsets = [-1] * inputs_embeds.shape[0] elif input_ids is not None: alora_offsets = self._calculate_alora_offsets(input_ids, adapter_names=adapter_names_for_offset_calc) - else: + else: alora_offsets = [] # Should not happen if _get_batch_size logic is sound kwargs['alora_offsets'] = alora_offsets if self.base_model.config.model_type == "mpt": @@ -2076,7 +2075,7 @@ def generate(self, *args, **kwargs): if config_ and getattr(config_, "use_alora", False): is_alora_relevant_in_generate = True break - + if is_alora_relevant_in_generate: alora_offsets_from_kwargs = kwargs.get("alora_offsets") if alora_offsets_from_kwargs is None: @@ -2086,18 +2085,18 @@ def generate(self, *args, **kwargs): current_input_ids = args[0] else: current_input_ids = None - + if current_input_ids is not None: - if current_input_ids.ndim == 1: + if current_input_ids.ndim == 1: current_input_ids = current_input_ids.unsqueeze(0) calculated_offsets = self._calculate_alora_offsets(current_input_ids, adapter_names=adapter_names_for_offset_calc) for i in range(len(calculated_offsets)): calculated_offsets[i] -= 1 kwargs['alora_offsets'] = calculated_offsets - + else: warnings.warn("Cannot calculate aLoRA offsets during generate as input_ids are not available. 
Disabling aLoRA.") - bs = 1 + bs = 1 if "attention_mask" in kwargs and kwargs["attention_mask"] is not None: bs = kwargs["attention_mask"].shape[0] elif "inputs_embeds" in kwargs and kwargs["inputs_embeds"] is not None: @@ -2108,7 +2107,7 @@ def generate(self, *args, **kwargs): bs = kwargs["input_ids"].shape[0] kwargs['alora_offsets'] = [-1] * bs - + with self._enable_peft_forward_hooks(*args, **kwargs): kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} outputs = self.base_model.generate(*args, **kwargs) diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 0b557b8ce0..b47e167087 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -300,20 +300,20 @@ class LoraConfig(PeftConfig): LoRA, so it is recommended to merge weights for inference. For more information, see https://huggingface.co/papers/2402.09353. use_alora (`bool`): - Enable 'Activated LoRA' (aLoRA). This technique - selectively activates the adapter weights only on tokens during and after the alora_invocation_tokens. - When used in a CausalLM, this means that the KV cache prior to invocation is interchangeable with that of - the base model (and other aLoRA adapters operating this way). As a result, in inference pipelines involving - switching between base model inference and adapter inference (e.g. agentic pipelines, see paper for many - examples), significant savings are realized (relative to LoRA) by saving prefill operations. Overall adapter - inference speedups of an order of magnitude or more can occur on vLLM, depending on the length of the shared - context. REQUIRED ARGUMENTS: alora_invocation_string, alora_invocation_tokens. These are necessary to know + Enable 'Activated LoRA' (aLoRA). This technique + selectively activates the adapter weights only on tokens during and after the alora_invocation_tokens. + When used in a CausalLM, this means that the KV cache prior to invocation is interchangeable with that of + the base model (and other aLoRA adapters operating this way). As a result, in inference pipelines involving + switching between base model inference and adapter inference (e.g. agentic pipelines, see paper for many + examples), significant savings are realized (relative to LoRA) by saving prefill operations. Overall adapter + inference speedups of an order of magnitude or more can occur on vLLM, depending on the length of the shared + context. REQUIRED ARGUMENTS: alora_invocation_string, alora_invocation_tokens. These are necessary to know when to turn on adapter weights. The invocation string therein must be present in all inputs. Note also that merging is not possible due to the selective application of the weights. alora_invocation_string (`str`): Invocation string for aLoRA (must be present in model inputs). Defaults to None. alora_invocation_tokens (`List[int]`): - Tokenized copy of alora_invocation_string for use when tokenizer is not available. + Tokenized copy of alora_invocation_string for use when tokenizer is not available. layer_replication (`List[Tuple[int, int]]`): Build a new stack of layers by stacking the original model layers according to the ranges specified. This allows expanding (or shrinking) the model without duplicating the base model weights. 
The new layers will @@ -680,7 +680,7 @@ def __post_init__(self): ) if self.use_dora: raise ValueError("The argument lora_bias=True is not supported for DoRA, please pass use_dora=False") - + #If activated LoRA (aLoRA) is enabled, check for required invocation arguments. if self.use_alora: if self.alora_invocation_string is None or self.alora_invocation_tokens is None: diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index 9279e9effd..f7576a2187 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -34,6 +34,7 @@ from .config import LoraConfig + VARIANT_KWARG_KEYS = ["alora_offsets"] class LoraVariant: @@ -643,8 +644,8 @@ def resolve_lora_variant(self, *, use_dora: bool, use_alora: bool, **kwargs) -> if not use_dora and not use_alora: return None - from .variants import DoraLinearVariant, ALoraLinearVariant - if use_alora: + from .variants import ALoraLinearVariant, DoraLinearVariant + if use_alora: return ALoraLinearVariant() else: return DoraLinearVariant() @@ -768,7 +769,7 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: self._check_forward_args(x, *args, **kwargs) adapter_names = kwargs.pop("adapter_names", None) variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer - + if self.disable_adapters: if self.merged: self.unmerge() diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 2b4edc6c73..c830cbdb98 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -470,7 +470,7 @@ def forward( scaling = module.scaling[active_adapter] x = x.to(lora_A.weight.dtype) - + if x.dim() == 2: result = result + lora_B(lora_A(dropout(x))) * scaling elif len(alora_offsets) == 1: diff --git a/src/peft/utils/peft_types.py b/src/peft/utils/peft_types.py index 95f169949c..4c1e2563a9 100644 --- a/src/peft/utils/peft_types.py +++ b/src/peft/utils/peft_types.py @@ -26,7 +26,6 @@ class PeftType(str, enum.Enum): - P_TUNING - PREFIX_TUNING - LORA - - ALORA - ADALORA - BOFT - ADAPTION_PROMPT @@ -49,8 +48,7 @@ class PeftType(str, enum.Enum): MULTITASK_PROMPT_TUNING = "MULTITASK_PROMPT_TUNING" P_TUNING = "P_TUNING" PREFIX_TUNING = "PREFIX_TUNING" - LORA = "LORA" - ALORA = "ALORA" + LORA = "LORA" ADALORA = "ADALORA" BOFT = "BOFT" ADAPTION_PROMPT = "ADAPTION_PROMPT" diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 8a08c543d2..9ee62600b9 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -1287,7 +1287,7 @@ def test_merge_layers(self, test_name, model_id, config_cls, config_kwargs): pytest.skip( f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" ) - if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: + if config_kwargs.get("use_alora"): pytest.skip("aLoRA does not support merging.") config_kwargs = config_kwargs.copy() @@ -1312,7 +1312,7 @@ def test_merge_layers_fp16(self, test_name, model_id, config_cls, config_kwargs) pytest.skip( f"Skipping test for {model_id} as merging is not supported. 
(See https://github.com/huggingface/peft/pull/2403 for details)" ) - if issubclass(config_cls, LoraConfig) and LoraConfig.use_alora is True: + if config_kwargs.get("use_alora"): pytest.skip("aLoRA does not support merging.") config_kwargs = config_kwargs.copy() @@ -1329,7 +1329,7 @@ def test_merge_layers_is_idempotent(self, test_name, model_id, config_cls, confi pytest.skip( f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" ) - if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: + if config_kwargs.get("use_alora"): pytest.skip("aLoRA does not support merging.") # calling merge twice with the same arguments should not change the output @@ -1347,8 +1347,8 @@ def test_safe_merge(self, test_name, model_id, config_cls, config_kwargs): pytest.skip( f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" ) - if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: - pytest.skip(f"Skipping test as merging is not supported for aLora.") + if config_kwargs.get("use_alora"): + pytest.skip("Skipping test as merging is not supported for aLora.") # calling merge twice with the same arguments should not change the output config_kwargs = config_kwargs.copy() if issubclass(config_cls, LoraConfig): @@ -1441,7 +1441,7 @@ def test_forward_float16(self, test_name, model_id, config_cls, config_kwargs): if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return - if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: + if config_kwargs.get("use_alora"): pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) @@ -1485,7 +1485,7 @@ def test_forward_bfloat16(self, test_name, model_id, config_cls, config_kwargs): if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return - if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: + if config_kwargs.get("use_alora"): pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) @@ -1528,7 +1528,7 @@ def test_forward_float16_no_autocast(self, test_name, model_id, config_cls, conf if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return - if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: + if config_kwargs.get("use_alora"): pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) @@ -1571,7 +1571,7 @@ def test_forward_bfloat16_no_autocast(self, test_name, model_id, config_cls, con if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return - if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: + if config_kwargs.get("use_alora"): pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) @@ -1745,7 +1745,7 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co pytest.skip( f"Skipping test for {model_id} as merging is not supported. 
(See https://github.com/huggingface/peft/pull/2403 for details)" ) - if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: + if config_kwargs.get("use_alora"): pytest.skip("aLoRA does not support merging.") # same as test_disable_adapters, but with merging @@ -2744,7 +2744,7 @@ def test_multiple_active_adapters_forward( def test_multiple_active_adapters_merge_and_unmerge( self, test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2 ): - if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: + if config_kwargs_1.get("use_alora") or config_kwargs_2.get("use_alora"): pytest.skip("aLoRA does not support merging.") torch.manual_seed(0) @@ -2780,7 +2780,7 @@ def test_multiple_active_adapters_merge_and_unmerge( "test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2", MULTIPLE_ACTIVE_ADAPTERS_TEST_CASES ) def test_merge_layers_multi(self, test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2): - if issubclass(config_cls, LoraConfig) and config_cls.use_alora is True: + if config_kwargs_1.get("use_alora") or config_kwargs_2.get("use_alora"): pytest.skip("aLoRA does not support merging.") torch.manual_seed(0) From b9d17456a51ee6cc53708308d2c219fcd5346428 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Mon, 14 Jul 2025 02:56:01 +0000 Subject: [PATCH 30/99] whitespace --- src/peft/tuners/lora/model.py | 2 +- src/peft/utils/peft_types.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index e3b9054870..bdd213223b 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -446,7 +446,7 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): pre_forward = partial(_alora_offsets_pre_forward_hook, alora_offsets = alora_offsets) handle = layer.register_forward_pre_hook(pre_forward, with_kwargs=True) hook_handles.append(handle) - + if adapter_names is not None: if self.training: raise ValueError("Cannot pass `adapter_names` when the model is in training mode.") diff --git a/src/peft/utils/peft_types.py b/src/peft/utils/peft_types.py index 4c1e2563a9..f0d0dc9572 100644 --- a/src/peft/utils/peft_types.py +++ b/src/peft/utils/peft_types.py @@ -48,7 +48,7 @@ class PeftType(str, enum.Enum): MULTITASK_PROMPT_TUNING = "MULTITASK_PROMPT_TUNING" P_TUNING = "P_TUNING" PREFIX_TUNING = "PREFIX_TUNING" - LORA = "LORA" + LORA = "LORA" ADALORA = "ADALORA" BOFT = "BOFT" ADAPTION_PROMPT = "ADAPTION_PROMPT" From 22177f309b42d65053632da0ea49e118ac24dd91 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Mon, 14 Jul 2025 03:00:21 +0000 Subject: [PATCH 31/99] whitespace --- src/peft/tuners/lora/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index bdd213223b..a215678be3 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -446,7 +446,7 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): pre_forward = partial(_alora_offsets_pre_forward_hook, alora_offsets = alora_offsets) handle = layer.register_forward_pre_hook(pre_forward, with_kwargs=True) hook_handles.append(handle) - + if adapter_names is not None: if self.training: raise ValueError("Cannot pass `adapter_names` when the model is in training mode.") From 498bdb19bffebd96777a9e01362c474352d2f443 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Mon, 14 
Jul 2025 03:07:11 +0000 Subject: [PATCH 32/99] format --- src/peft/peft_model.py | 69 ++++++++++++++++++++------------ src/peft/tuners/lora/config.py | 3 +- src/peft/tuners/lora/layer.py | 13 +++--- src/peft/tuners/lora/model.py | 9 +++-- src/peft/tuners/lora/variants.py | 6 ++- tests/test_custom_models.py | 33 +++++++++++++-- 6 files changed, 92 insertions(+), 41 deletions(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index eb97d95a9d..53f2808193 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -1812,8 +1812,9 @@ def __init__( super().__init__(model, peft_config, adapter_name, **kwargs) self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation - - def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optional[list[str]] = None) -> list[int]: + def _calculate_alora_offsets( + self, input_ids: torch.Tensor, adapter_names: Optional[list[str]] = None + ) -> list[int]: if input_ids is None: return [] @@ -1824,30 +1825,36 @@ def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optio adapters_to_process_indices = collections.defaultdict(list) for i in range(batch_size): - current_adapter_name = adapter_names[i] if adapter_names and i < len(adapter_names) else self.active_adapter + current_adapter_name = ( + adapter_names[i] if adapter_names and i < len(adapter_names) else self.active_adapter + ) if current_adapter_name == "__base__": alora_offsets[i] = -1 continue if current_adapter_name not in self.peft_config: - warnings.warn(f"Adapter '{current_adapter_name}' not found in peft_config. Using base model for row {i}.") + warnings.warn( + f"Adapter '{current_adapter_name}' not found in peft_config. Using base model for row {i}." + ) alora_offsets[i] = -1 continue current_peft_config = self.peft_config[current_adapter_name] if not current_peft_config.use_alora: - alora_offsets[i] = None # Not an aLoRA adapter or wrong type + alora_offsets[i] = None # Not an aLoRA adapter or wrong type continue - invocation_tokens = getattr(current_peft_config, 'alora_invocation_tokens', None) + invocation_tokens = getattr(current_peft_config, "alora_invocation_tokens", None) if not invocation_tokens: - alora_offsets[i] = None # No way to calculate offset + alora_offsets[i] = None # No way to calculate offset continue if current_adapter_name not in cached_invocation_tensors: - cached_invocation_tensors[current_adapter_name] = torch.tensor(invocation_tokens, dtype=torch.long, device=input_ids.device) + cached_invocation_tensors[current_adapter_name] = torch.tensor( + invocation_tokens, dtype=torch.long, device=input_ids.device + ) adapters_to_process_indices[current_adapter_name].append(i) @@ -1872,12 +1879,12 @@ def _calculate_alora_offsets(self, input_ids: torch.Tensor, adapter_names: Optio if best_match_start_idx != -1: offset_val = seq_len - best_match_start_idx alora_offsets[i] = offset_val if offset_val > 0 else -1 - else: # Invocation sequence not found in input + else: # Invocation sequence not found in input warnings.warn( - f"Could not find alora_invocation_tokens for specified aLoRA adapter in the " - f'following instance' - f'{sequence}' - f'Invocation tokens: {current_invocation_ids_tensor} \n' + f"Could not find alora_invocation_tokens for specified aLoRA adapter in the " + f"following instance" + f"{sequence}" + f"Invocation tokens: {current_invocation_ids_tensor} \n" f"Defaulting to base model. 
" ) alora_offsets[i] = -1 @@ -1916,13 +1923,17 @@ def forward( alora_offsets = kwargs.get("alora_offsets") if alora_offsets is None: if input_ids is None and inputs_embeds is not None: - warnings.warn("Cannot calculate aLoRA offsets when only inputs_embeds are provided. Disabling aLoRA for this forward pass.") + warnings.warn( + "Cannot calculate aLoRA offsets when only inputs_embeds are provided. Disabling aLoRA for this forward pass." + ) alora_offsets = [-1] * inputs_embeds.shape[0] elif input_ids is not None: - alora_offsets = self._calculate_alora_offsets(input_ids, adapter_names=adapter_names_for_offset_calc) + alora_offsets = self._calculate_alora_offsets( + input_ids, adapter_names=adapter_names_for_offset_calc + ) else: - alora_offsets = [] # Should not happen if _get_batch_size logic is sound - kwargs['alora_offsets'] = alora_offsets + alora_offsets = [] # Should not happen if _get_batch_size logic is sound + kwargs["alora_offsets"] = alora_offsets if self.base_model.config.model_type == "mpt": if inputs_embeds is not None: raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds") @@ -2080,8 +2091,8 @@ def generate(self, *args, **kwargs): alora_offsets_from_kwargs = kwargs.get("alora_offsets") if alora_offsets_from_kwargs is None: current_input_ids = kwargs.get("input_ids") - if not current_input_ids: # args[0] is usually input_ids - if args and isinstance(args[0], torch.Tensor): # and args[0].dim() >=1 : + if not current_input_ids: # args[0] is usually input_ids + if args and isinstance(args[0], torch.Tensor): # and args[0].dim() >=1 : current_input_ids = args[0] else: current_input_ids = None @@ -2089,24 +2100,32 @@ def generate(self, *args, **kwargs): if current_input_ids is not None: if current_input_ids.ndim == 1: current_input_ids = current_input_ids.unsqueeze(0) - calculated_offsets = self._calculate_alora_offsets(current_input_ids, adapter_names=adapter_names_for_offset_calc) + calculated_offsets = self._calculate_alora_offsets( + current_input_ids, adapter_names=adapter_names_for_offset_calc + ) for i in range(len(calculated_offsets)): calculated_offsets[i] -= 1 - kwargs['alora_offsets'] = calculated_offsets + kwargs["alora_offsets"] = calculated_offsets else: - warnings.warn("Cannot calculate aLoRA offsets during generate as input_ids are not available. Disabling aLoRA.") + warnings.warn( + "Cannot calculate aLoRA offsets during generate as input_ids are not available. Disabling aLoRA." 
+ ) bs = 1 if "attention_mask" in kwargs and kwargs["attention_mask"] is not None: bs = kwargs["attention_mask"].shape[0] elif "inputs_embeds" in kwargs and kwargs["inputs_embeds"] is not None: bs = kwargs["inputs_embeds"].shape[0] - elif args and isinstance(args[0], torch.Tensor) and args[0].dim() > 0: # input_ids might be in args[0] + elif ( + args and isinstance(args[0], torch.Tensor) and args[0].dim() > 0 + ): # input_ids might be in args[0] bs = args[0].shape[0] - elif "input_ids" in kwargs and kwargs["input_ids"] is not None: # Should have been caught by current_input_ids + elif ( + "input_ids" in kwargs and kwargs["input_ids"] is not None + ): # Should have been caught by current_input_ids bs = kwargs["input_ids"].shape[0] - kwargs['alora_offsets'] = [-1] * bs + kwargs["alora_offsets"] = [-1] * bs with self._enable_peft_forward_hooks(*args, **kwargs): kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index b47e167087..3a208a9336 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -68,6 +68,7 @@ class LoftQConfig: loftq_bits: str = field(default=4, metadata={"help": "Quantization bits for LoftQ"}) loftq_iter: str = field(default=1, metadata={"help": "Alternating iterations for LoftQ"}) + @dataclass class EvaConfig: """ @@ -681,7 +682,7 @@ def __post_init__(self): if self.use_dora: raise ValueError("The argument lora_bias=True is not supported for DoRA, please pass use_dora=False") - #If activated LoRA (aLoRA) is enabled, check for required invocation arguments. + # If activated LoRA (aLoRA) is enabled, check for required invocation arguments. if self.use_alora: if self.alora_invocation_string is None or self.alora_invocation_tokens is None: raise ValueError( diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index f7576a2187..39c669d2c9 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -37,6 +37,7 @@ VARIANT_KWARG_KEYS = ["alora_offsets"] + class LoraVariant: """ Base class for LoRA variants, e.g. DoRA. @@ -554,7 +555,7 @@ def _mixed_batch_forward( ) -> torch.Tensor: # This is a special method that handles the case when users pass the argument `adapter_names`. This is an # extra argument that allows mixing different adapters in the same batch at inference time. 
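For illustration, the mixed-batch path touched in this hunk is what ultimately serves per-row adapter selection at inference time. A minimal usage sketch, assuming this patched branch with aLoRA support, a placeholder base checkpoint "my-base-model", and a placeholder aLoRA adapter repo "my-alora-adapter"; rows marked "__base__" skip the adapter, while rows naming the aLoRA adapter get alora_offsets computed from the last occurrence of alora_invocation_tokens in that row:

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("my-base-model")        # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained("my-base-model")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token                       # needed for batched padding
model = PeftModel.from_pretrained(base, "my-alora-adapter", adapter_name="alora_adapter")

prompts = [
    "A request answered by the base model alone",
    "A request that ends with the adapter's invocation string",
]
batch = tokenizer(prompts, return_tensors="pt", padding=True)       # sketch only; padding left at tokenizer defaults

# one adapter name per row of the batch
out = model.generate(**batch, adapter_names=["__base__", "alora_adapter"], max_new_tokens=20)
print(tokenizer.batch_decode(out, skip_special_tokens=True))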
- variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer + variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer result = self.base_layer(x, *args, **kwargs) torch_result_dtype = result.dtype @@ -645,6 +646,7 @@ def resolve_lora_variant(self, *, use_dora: bool, use_alora: bool, **kwargs) -> return None from .variants import ALoraLinearVariant, DoraLinearVariant + if use_alora: return ALoraLinearVariant() else: @@ -768,14 +770,14 @@ def get_delta_weight(self, adapter) -> torch.Tensor: def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: self._check_forward_args(x, *args, **kwargs) adapter_names = kwargs.pop("adapter_names", None) - variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer + variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer if self.disable_adapters: if self.merged: self.unmerge() result = self.base_layer(x, *args, **kwargs) elif adapter_names is not None: - result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names,**variant_kwargs, **kwargs) + result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **variant_kwargs, **kwargs) elif self.merged: result = self.base_layer(x, *args, **kwargs) else: @@ -850,7 +852,6 @@ def __init__( ) def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]: - if not use_dora: return None @@ -1049,7 +1050,7 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor: # TODO: no dtype conversion here, unlike in Linear, is that correct? self._check_forward_args(x, *args, **kwargs) adapter_names = kwargs.pop("adapter_names", None) - variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer + variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer if self.disable_adapters: if self.merged: self.unmerge() @@ -1325,7 +1326,7 @@ def get_delta_weight(self, adapter) -> torch.Tensor: def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: self._check_forward_args(x, *args, **kwargs) adapter_names = kwargs.pop("adapter_names", None) - variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer + variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer if self.disable_adapters: if self.merged: self.unmerge() diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index a215678be3..dd8db3a1a1 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -63,6 +63,7 @@ def _adapter_names_pre_forward_hook(target, args, kwargs, adapter_names): kwargs["adapter_names"] = adapter_names return args, kwargs + def _alora_offsets_pre_forward_hook(target, args, kwargs, alora_offsets): kwargs["alora_offsets"] = alora_offsets return args, kwargs @@ -432,7 +433,6 @@ def set_adapter(self, adapter_name: str | list[str]) -> None: @contextmanager def _enable_peft_forward_hooks(self, *args, **kwargs): - # If adapter_names is passed as an argument, we inject it into the forward arguments. 
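The _alora_offsets_pre_forward_hook defined above is wired up with functools.partial and torch's keyword-aware pre-forward hooks, as the registration just below shows. A self-contained sketch of that mechanism, with a hypothetical ToyLayer standing in for a LoraLayer (plain PyTorch, not the PEFT classes themselves):

from functools import partial

import torch
import torch.nn as nn


class ToyLayer(nn.Module):
    # stand-in for a LoraLayer; it only echoes the kwarg injected by the hook
    def forward(self, x, alora_offsets=None):
        if alora_offsets is not None:
            print("received alora_offsets:", alora_offsets)
        return x


def offsets_pre_forward_hook(module, args, kwargs, alora_offsets):
    # same shape as _alora_offsets_pre_forward_hook: place the value in kwargs
    # so the wrapped layer sees it without changing any call sites
    kwargs["alora_offsets"] = alora_offsets
    return args, kwargs


layer = ToyLayer()
handle = layer.register_forward_pre_hook(
    partial(offsets_pre_forward_hook, alora_offsets=[3, None]), with_kwargs=True
)
layer(torch.zeros(2, 4))  # prints the offsets injected by the hook
handle.remove()           # the context manager above likewise removes its handles after the call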
adapter_names = kwargs.pop("adapter_names", None) alora_offsets = kwargs.pop("alora_offsets", None) @@ -443,7 +443,7 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): hook_handles = [] for layer in self.modules(): if isinstance(layer, LoraLayer): - pre_forward = partial(_alora_offsets_pre_forward_hook, alora_offsets = alora_offsets) + pre_forward = partial(_alora_offsets_pre_forward_hook, alora_offsets=alora_offsets) handle = layer.register_forward_pre_hook(pre_forward, with_kwargs=True) hook_handles.append(handle) @@ -462,7 +462,9 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): unique_adapters = {name for name in adapter_names if name != "__base__"} unexpected_adapters = unique_adapters - expected_adapters if unexpected_adapters: - raise ValueError(f"Trying to infer with non-existing adapter(s): {', '.join(sorted(unexpected_adapters))}") + raise ValueError( + f"Trying to infer with non-existing adapter(s): {', '.join(sorted(unexpected_adapters))}" + ) # deal with beam search num_beams = kwargs.get("num_beams", None) @@ -476,7 +478,6 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): # encoder part. Further below, the original argument is thus restored for the encoder. adapter_names = sum(([n] * kwargs["num_beams"] for n in adapter_names), []) - for module in self.modules(): if isinstance(module, LoraLayer) or isinstance(module, AuxiliaryTrainingWrapper): pre_forward = partial(_adapter_names_pre_forward_hook, adapter_names=adapter_names) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index c830cbdb98..30b1b14fe0 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -480,10 +480,12 @@ def forward( else: offset = min(result.shape[1], alora_offsets[0]) if offset > 0: - result[:, -offset:, :] = result[:, -offset:, :] + lora_B(lora_A(dropout(x[:, -offset:, :]))) * scaling + result[:, -offset:, :] = ( + result[:, -offset:, :] + lora_B(lora_A(dropout(x[:, -offset:, :]))) * scaling + ) else: for i in range(result.shape[0]): - if alora_offsets[i] is None: # run as lora + if alora_offsets[i] is None: # run as lora result[:, :, :] = result[:, :, :] + lora_B(lora_A(dropout(x[:, :, :]))) * scaling else: offset = min(alora_offsets[i], result.shape[1]) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 9ee62600b9..2b2587967e 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -90,13 +90,39 @@ {"target_modules": "lin1", "use_dora": True, "lora_alpha": 32}, ), # Activated LoRA (aLoRA) - ("Vanilla MLP 9 Activated LoRA (aLoRA)", "MLP", LoraConfig, {"target_modules": ["lin0"], "use_alora": True, "alora_invocation_tokens": [1, 2, 3], "alora_invocation_string": '123'}), - ("Vanilla MLP 10 Activated LoRA (aLoRA)", "MLP", LoraConfig, {"target_modules": ["lin0", "lin1"], "use_alora": True, "alora_invocation_tokens": [1, 2, 3], "alora_invocation_string": '123'}), + ( + "Vanilla MLP 9 Activated LoRA (aLoRA)", + "MLP", + LoraConfig, + { + "target_modules": ["lin0"], + "use_alora": True, + "alora_invocation_tokens": [1, 2, 3], + "alora_invocation_string": "123", + }, + ), + ( + "Vanilla MLP 10 Activated LoRA (aLoRA)", + "MLP", + LoraConfig, + { + "target_modules": ["lin0", "lin1"], + "use_alora": True, + "alora_invocation_tokens": [1, 2, 3], + "alora_invocation_string": "123", + }, + ), ( "Vanilla MLP 11 Activated LoRA (aLoRA)", "MLP", LoraConfig, - {"target_modules": "lin1", "use_alora": True, "alora_invocation_tokens": [1, 2, 3], "alora_invocation_string": '123', 
"lora_alpha": 32}, + { + "target_modules": "lin1", + "use_alora": True, + "alora_invocation_tokens": [1, 2, 3], + "alora_invocation_string": "123", + "lora_alpha": 32, + }, ), ("Embedding + transformers Conv1D 1 LoRA", "EmbConv1D", LoraConfig, {"target_modules": ["conv1d"]}), ("Embedding + transformers Conv1D 2 LoRA", "EmbConv1D", LoraConfig, {"target_modules": ["emb"]}), @@ -1836,6 +1862,7 @@ def test_disable_adapter_with_bias_warns(self, test_name, model_id, config_cls, if not issubclass(config_cls, (LoraConfig, BOFTConfig)): # skip this test for other configs as bias is specific to Lora pytest.skip("Bias argument is only supported for LoRA or BOFT models") + def run_with_disable(config_kwargs, bias): config_kwargs = config_kwargs.copy() config_kwargs["bias"] = bias From 6bb1d5bd9d099201cf03933b8cc2b24ea5d5385f Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Mon, 14 Jul 2025 03:16:43 +0000 Subject: [PATCH 33/99] make quality --- pyproject.toml | 1 + src/peft/tuners/lora/config.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 846a5c3b5b..5ad6a90db4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,6 +26,7 @@ ignore = [ "E501", # Line length (handled by ruff-format) "F841", # unused variable "UP007", # X | Y style Unions + "UP045", # X | Y style Unions "C420", # dict.fromkeys ] diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 3a208a9336..16c7b1d37f 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -302,15 +302,15 @@ class LoraConfig(PeftConfig): https://huggingface.co/papers/2402.09353. use_alora (`bool`): Enable 'Activated LoRA' (aLoRA). This technique - selectively activates the adapter weights only on tokens during and after the alora_invocation_tokens. - When used in a CausalLM, this means that the KV cache prior to invocation is interchangeable with that of - the base model (and other aLoRA adapters operating this way). As a result, in inference pipelines involving + selectively activates the adapter weights only on tokens during and after the alora_invocation_tokens. When + used in a CausalLM, this means that the KV cache prior to invocation is interchangeable with that of the + base model (and other aLoRA adapters operating this way). As a result, in inference pipelines involving switching between base model inference and adapter inference (e.g. agentic pipelines, see paper for many - examples), significant savings are realized (relative to LoRA) by saving prefill operations. Overall adapter - inference speedups of an order of magnitude or more can occur on vLLM, depending on the length of the shared - context. REQUIRED ARGUMENTS: alora_invocation_string, alora_invocation_tokens. These are necessary to know - when to turn on adapter weights. The invocation string therein must be present in all inputs. Note also that - merging is not possible due to the selective application of the weights. + examples), significant savings are realized (relative to LoRA) by saving prefill operations. Overall + adapter inference speedups of an order of magnitude or more can occur on vLLM, depending on the length of + the shared context. REQUIRED ARGUMENTS: alora_invocation_string, alora_invocation_tokens. These are + necessary to know when to turn on adapter weights. The invocation string therein must be present in all + inputs. 
Note also that merging is not possible due to the selective application of the weights. alora_invocation_string (`str`): Invocation string for aLoRA (must be present in model inputs). Defaults to None. alora_invocation_tokens (`List[int]`): From 9f266009c869873481ec00e1ef8c9a508e07a2ce Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 14 Jul 2025 14:59:09 -0400 Subject: [PATCH 34/99] Update pyproject.toml Co-authored-by: githubnemo --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 5ad6a90db4..58d39dc6b4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ ignore = [ "E501", # Line length (handled by ruff-format) "F841", # unused variable "UP007", # X | Y style Unions - "UP045", # X | Y style Unions + "UP045", # X | Y style Optionals "C420", # dict.fromkeys ] From e613d02357dfa7ff192c4837cb288c5b175f792d Mon Sep 17 00:00:00 2001 From: Kristjan Greenewald Date: Thu, 17 Jul 2025 14:41:41 -0700 Subject: [PATCH 35/99] streamline config --- src/peft/peft_model.py | 97 +++------------------------- src/peft/tuners/lora/config.py | 61 ++++++------------ src/peft/tuners/lora/model.py | 3 +- src/peft/tuners/lora/variants.py | 104 +++++++++++++++++++++++++------ tests/test_custom_models.py | 34 +--------- tests/test_decoder_models.py | 13 ++++ tests/testing_common.py | 14 +++-- 7 files changed, 138 insertions(+), 188 deletions(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index 53f2808193..055d0f8986 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -39,6 +39,7 @@ from transformers.utils import PushToHubMixin from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer +from peft.tuners.lora.variants import calculate_alora_offsets from peft.utils.constants import DUMMY_MODEL_CONFIG from peft.utils.integrations import init_empty_weights from peft.utils.other import create_attention_mask, set_additional_trainable_modules @@ -1812,84 +1813,6 @@ def __init__( super().__init__(model, peft_config, adapter_name, **kwargs) self.base_model_prepare_inputs_for_generation = self.base_model.prepare_inputs_for_generation - def _calculate_alora_offsets( - self, input_ids: torch.Tensor, adapter_names: Optional[list[str]] = None - ) -> list[int]: - if input_ids is None: - return [] - - batch_size = input_ids.shape[0] - alora_offsets = [-1] * batch_size - - cached_invocation_tensors = {} - adapters_to_process_indices = collections.defaultdict(list) - - for i in range(batch_size): - current_adapter_name = ( - adapter_names[i] if adapter_names and i < len(adapter_names) else self.active_adapter - ) - - if current_adapter_name == "__base__": - alora_offsets[i] = -1 - continue - - if current_adapter_name not in self.peft_config: - warnings.warn( - f"Adapter '{current_adapter_name}' not found in peft_config. Using base model for row {i}." 
- ) - alora_offsets[i] = -1 - continue - - current_peft_config = self.peft_config[current_adapter_name] - - if not current_peft_config.use_alora: - alora_offsets[i] = None # Not an aLoRA adapter or wrong type - continue - - invocation_tokens = getattr(current_peft_config, "alora_invocation_tokens", None) - if not invocation_tokens: - alora_offsets[i] = None # No way to calculate offset - continue - - if current_adapter_name not in cached_invocation_tensors: - cached_invocation_tensors[current_adapter_name] = torch.tensor( - invocation_tokens, dtype=torch.long, device=input_ids.device - ) - - adapters_to_process_indices[current_adapter_name].append(i) - - for adapter_name_to_process, indices in adapters_to_process_indices.items(): - current_invocation_ids_tensor = cached_invocation_tensors[adapter_name_to_process] - invocation_len = len(current_invocation_ids_tensor) - - for i in indices: - sequence = input_ids[i] - seq_len = len(sequence) - best_match_start_idx = -1 - - possible_starts = (sequence == current_invocation_ids_tensor[0]).nonzero(as_tuple=True)[0] - - for start_idx_tensor in possible_starts: - idx = start_idx_tensor.item() - if idx + invocation_len <= seq_len: - if torch.equal(sequence[idx : idx + invocation_len], current_invocation_ids_tensor): - if idx > best_match_start_idx: - best_match_start_idx = idx - - if best_match_start_idx != -1: - offset_val = seq_len - best_match_start_idx - alora_offsets[i] = offset_val if offset_val > 0 else -1 - else: # Invocation sequence not found in input - warnings.warn( - f"Could not find alora_invocation_tokens for specified aLoRA adapter in the " - f"following instance" - f"{sequence}" - f"Invocation tokens: {current_invocation_ids_tensor} \n" - f"Defaulting to base model. " - ) - alora_offsets[i] = -1 - return alora_offsets - def forward( self, input_ids=None, @@ -1908,14 +1831,14 @@ def forward( adapter_names_for_offset_calc = kwargs.get("adapter_names") is_alora_relevant = False - if getattr(self.active_peft_config, "use_alora", False): + if getattr(self.active_peft_config, "alora_invocation_tokens", None): is_alora_relevant = True elif adapter_names_for_offset_calc: for name in adapter_names_for_offset_calc: if name == "__base__": continue config_ = self.peft_config.get(name) - if config_ and getattr(config_, "use_alora", False): + if config_ and getattr(config_, "alora_invocation_tokens", None): is_alora_relevant = True break @@ -1926,10 +1849,10 @@ def forward( warnings.warn( "Cannot calculate aLoRA offsets when only inputs_embeds are provided. Disabling aLoRA for this forward pass." 
) - alora_offsets = [-1] * inputs_embeds.shape[0] + alora_offsets = [None] * inputs_embeds.shape[0] elif input_ids is not None: - alora_offsets = self._calculate_alora_offsets( - input_ids, adapter_names=adapter_names_for_offset_calc + alora_offsets = calculate_alora_offsets( + self.peft_config, self.active_adapter, input_ids, adapter_names=adapter_names_for_offset_calc ) else: alora_offsets = [] # Should not happen if _get_batch_size logic is sound @@ -2076,14 +1999,14 @@ def generate(self, *args, **kwargs): adapter_names_for_offset_calc = kwargs.get("adapter_names") is_alora_relevant_in_generate = False - if getattr(self.active_peft_config, "use_alora", False): + if getattr(self.active_peft_config, "alora_invocation_tokens", None): is_alora_relevant_in_generate = True elif adapter_names_for_offset_calc: for name in adapter_names_for_offset_calc: if name == "__base__": continue config_ = self.peft_config.get(name) - if config_ and getattr(config_, "use_alora", False): + if config_ and getattr(config_, "alora_invocation_tokens", None): is_alora_relevant_in_generate = True break @@ -2100,7 +2023,7 @@ def generate(self, *args, **kwargs): if current_input_ids is not None: if current_input_ids.ndim == 1: current_input_ids = current_input_ids.unsqueeze(0) - calculated_offsets = self._calculate_alora_offsets( + calculated_offsets = calculate_alora_offsets( current_input_ids, adapter_names=adapter_names_for_offset_calc ) for i in range(len(calculated_offsets)): @@ -2125,7 +2048,7 @@ def generate(self, *args, **kwargs): ): # Should have been caught by current_input_ids bs = kwargs["input_ids"].shape[0] - kwargs["alora_offsets"] = [-1] * bs + kwargs["alora_offsets"] = [None] * bs with self._enable_peft_forward_hooks(*args, **kwargs): kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 16c7b1d37f..6d3cf003c1 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -65,8 +65,8 @@ class LoftQConfig: bits. """ - loftq_bits: str = field(default=4, metadata={"help": "Quantization bits for LoftQ"}) - loftq_iter: str = field(default=1, metadata={"help": "Alternating iterations for LoftQ"}) + loftq_bits: int = field(default=4, metadata={"help": "Quantization bits for LoftQ"}) + loftq_iter: int = field(default=1, metadata={"help": "Alternating iterations for LoftQ"}) @dataclass @@ -300,21 +300,17 @@ class LoraConfig(PeftConfig): ranks. Right now, DoRA only supports linear and Conv2D layers. DoRA introduces a bigger overhead than pure LoRA, so it is recommended to merge weights for inference. For more information, see https://huggingface.co/papers/2402.09353. - use_alora (`bool`): - Enable 'Activated LoRA' (aLoRA). This technique - selectively activates the adapter weights only on tokens during and after the alora_invocation_tokens. When - used in a CausalLM, this means that the KV cache prior to invocation is interchangeable with that of the - base model (and other aLoRA adapters operating this way). As a result, in inference pipelines involving - switching between base model inference and adapter inference (e.g. agentic pipelines, see paper for many - examples), significant savings are realized (relative to LoRA) by saving prefill operations. Overall - adapter inference speedups of an order of magnitude or more can occur on vLLM, depending on the length of - the shared context. REQUIRED ARGUMENTS: alora_invocation_string, alora_invocation_tokens. 
These are - necessary to know when to turn on adapter weights. The invocation string therein must be present in all - inputs. Note also that merging is not possible due to the selective application of the weights. - alora_invocation_string (`str`): - Invocation string for aLoRA (must be present in model inputs). Defaults to None. alora_invocation_tokens (`List[int]`): - Tokenized copy of alora_invocation_string for use when tokenizer is not available. + If not None, enable 'Activated LoRA' (aLoRA), + with alora_invocation_tokens being the tokenized invocation string for the adapter (must be present in all + model input strings). This technique selectively activates the adapter weights only on tokens during and + after the alora_invocation_tokens. When used in a CausalLM, this means that the KV cache prior to invocation + is interchangeable with that of the base model (and other aLoRA adapters operating this way). As a result, + in inference pipelines involving switching between base model inference and adapter inference (e.g. agentic + pipelines, see paper for examples), significant savings are realized (relative to LoRA) by saving prefill + operations. Overall adapter inference speedups of an order of magnitude or more can occur on vLLM, depending + on the length of the shared context. Note that merging is not possible due to the selective application of + the weights. layer_replication (`List[Tuple[int, int]]`): Build a new stack of layers by stacking the original model layers according to the ranges specified. This allows expanding (or shrinking) the model without duplicating the base model weights. The new layers will @@ -513,11 +509,12 @@ class LoraConfig(PeftConfig): ) }, ) - use_alora: bool = field( - default=False, + alora_invocation_tokens: Optional[list[int]] = field( + default=None, metadata={ "help": ( - "Enable 'Activated LoRA' (aLoRA). This technique selectively activates the adapter " + "Tokenized copy of the Activated LoRA (aLoRA) invocation string (as a list of token IDs). Use the model's default tokenizer. If not None, " + "enable 'Activated LoRA' (aLoRA). This technique selectively activates the adapter " "weights only on tokens during and after the alora_invocation_tokens. When used in a CausalLM, this means that the KV cache prior to invocation " "is interchangeable with that of the base model (and other aLoRA adapters operating this way). As a result, in inference pipelines involving switching " "between base model inference and adapter inference (e.g. agentic pipelines, see paper for many examples), significant savings are realized (relative to LoRA) " @@ -525,32 +522,10 @@ class LoraConfig(PeftConfig): "context. " "NOTE 1: aLoRA often requires higher rank r than LoRA. r=32 often works well." "NOTE 2: Merging is NOT supported due to the selective application of the adapter weights." - "REQUIRED ARGUMENTS: alora_invocation_string, alora_invocation_tokens. These are necessary to know when to turn on adapter weights. The invocation string therein " - "must be present in all inputs." - ) - }, - ) - alora_invocation_string: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Activated LoRA (aLoRA) invocation string. " - "The adapter weights will be activated 1 token after the last occurence of this string in the input. " - "This string must be present in all inputs. It is best to have this string begin and end with special tokens to avoid tokenizer boundary effects when " - "tokenizing the input. Only used when `use_alora=True`." 
- ) - }, - ) - alora_invocation_tokens: Optional[list[int]] = field( - default=None, - metadata={ - "help": ( - "Tokenized copy of the Activated LoRA (aLoRA) invocation string alora_invocation_string " - "(as a list of token IDs). Use the model's default tokenizer. " - "E.g. alora_invocation_tokens = tokenizer.encode(alora_invocation_string, add_special_tokens=False)." + "Example: alora_invocation_tokens = tokenizer.encode(alora_invocation_string, add_special_tokens=False)." "The adapter weights will be activated 1 token after the last occurence of this string in the input. " "These tokens must be present in all inputs after tokenization. It is best to have alora_invocation_string begin and end with special tokens " - "to avoid tokenizer boundary effects when tokenizing the input. Only used when `use_alora=True`." + "to avoid tokenizer boundary effects when tokenizing the input." ) }, ) diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index dd8db3a1a1..77497faf4c 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -205,7 +205,7 @@ def _create_and_replace( "init_lora_weights": lora_config.init_lora_weights, "use_rslora": lora_config.use_rslora, "use_dora": lora_config.use_dora, - "use_alora": lora_config.use_alora, + "use_alora": lora_config.alora_invocation_tokens is not None, "use_qalora": lora_config.use_qalora, "qalora_group_size": lora_config.qalora_group_size, "ephemeral_gpu_offload": lora_config.runtime_config.ephemeral_gpu_offload, @@ -239,6 +239,7 @@ def _create_and_replace( init_lora_weights=lora_config.init_lora_weights, use_rslora=lora_config.use_rslora, use_dora=lora_config.use_dora, + use_alora=lora_config.alora_invocation_tokens is not None, lora_bias=lora_config.lora_bias, ) else: diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 30b1b14fe0..a549a3c3b7 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -13,7 +13,9 @@ # limitations under the License. 
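With the config collapsed onto a single alora_invocation_tokens field, enabling aLoRA amounts to tokenizing the invocation string yourself and passing the token IDs, as the help text above describes. A minimal sketch, assuming this branch of PEFT; the checkpoint name, invocation string, and target_modules are placeholders:

from transformers import AutoTokenizer
from peft import LoraConfig

base_id = "my-base-model"            # placeholder checkpoint
invocation_string = "<|assistant|>"  # placeholder; ideally begins and ends with special tokens

tokenizer = AutoTokenizer.from_pretrained(base_id)
alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False)

config = LoraConfig(
    r=32,                            # aLoRA often needs a higher rank than plain LoRA
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj"],
    alora_invocation_tokens=alora_invocation_tokens,  # presence of this field enables aLoRA
    task_type="CAUSAL_LM",
)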
from __future__ import annotations -from typing import Any +import collections +from typing import Any, Optional +import warnings import torch from accelerate.utils.imports import is_xpu_available @@ -21,6 +23,7 @@ from peft.utils.other import transpose +from .config import PeftConfig from .dora import DoraConv1dLayer, DoraConv2dLayer, DoraConv3dLayer, DoraEmbeddingLayer, DoraLinearLayer from .layer import Conv1d, Conv2d, Conv3d, Embedding, Linear, LoraVariant, _ConvNd @@ -471,27 +474,88 @@ def forward( x = x.to(lora_A.weight.dtype) - if x.dim() == 2: + if x.dim() == 2: # comes up in some single-token tests result = result + lora_B(lora_A(dropout(x))) * scaling - elif len(alora_offsets) == 1: - if alora_offsets[0] is None: - # run as lora - result[:, :, :] = result[:, :, :] + lora_B(lora_A(dropout(x[:, :, :]))) * scaling - else: - offset = min(result.shape[1], alora_offsets[0]) - if offset > 0: - result[:, -offset:, :] = ( - result[:, -offset:, :] + lora_B(lora_A(dropout(x[:, -offset:, :]))) * scaling - ) - else: + else: # typical LLM regime for i in range(result.shape[0]): - if alora_offsets[i] is None: # run as lora - result[:, :, :] = result[:, :, :] + lora_B(lora_A(dropout(x[:, :, :]))) * scaling - else: + if alora_offsets[i] is not None and alora_offsets[i] > 0: # otherwise use base model offset = min(alora_offsets[i], result.shape[1]) - if offset > 0: - result[i, -offset:, :] = ( - result[i, -offset:, :] + lora_B(lora_A(dropout(x[i, -offset:, :]))) * scaling - ) + result[i, -offset:, :] = ( + result[i, -offset:, :] + lora_B(lora_A(dropout(x[i, -offset:, :]))) * scaling + ) return result + +def calculate_alora_offsets( + peft_config: PeftConfig, active_adapter: str, input_ids: torch.Tensor, adapter_names: Optional[list[str]] = None +) -> list[int]: + if input_ids is None: + return [] + + batch_size = input_ids.shape[0] + alora_offsets = [None] * batch_size + + cached_invocation_tensors = {} + adapters_to_process_indices = collections.defaultdict(list) + + for i in range(batch_size): + current_adapter_name = ( + adapter_names[i] if adapter_names and i < len(adapter_names) else active_adapter + ) + + if current_adapter_name == "__base__": + alora_offsets[i] = None + continue + + if current_adapter_name not in peft_config: + warnings.warn( + f"Adapter '{current_adapter_name}' not found in peft_config. Using base model for row {i}." 
+ ) + alora_offsets[i] = None + continue + + current_peft_config = peft_config[current_adapter_name] + + invocation_tokens = getattr(current_peft_config, "alora_invocation_tokens", None) + if invocation_tokens is None: + alora_offsets[i] = None # Not an aLoRA adapter or wrong type + continue + + if current_adapter_name not in cached_invocation_tensors: + cached_invocation_tensors[current_adapter_name] = torch.tensor( + invocation_tokens, dtype=torch.long, device=input_ids.device + ) + + adapters_to_process_indices[current_adapter_name].append(i) + + for adapter_name_to_process, indices in adapters_to_process_indices.items(): + current_invocation_ids_tensor = cached_invocation_tensors[adapter_name_to_process] + invocation_len = len(current_invocation_ids_tensor) + + for i in indices: + sequence = input_ids[i] + seq_len = len(sequence) + best_match_start_idx = -1 + + possible_starts = (sequence == current_invocation_ids_tensor[0]).nonzero(as_tuple=True)[0] + + for start_idx_tensor in possible_starts: + idx = start_idx_tensor.item() + if idx + invocation_len <= seq_len: + if torch.equal(sequence[idx : idx + invocation_len], current_invocation_ids_tensor): + if idx > best_match_start_idx: + best_match_start_idx = idx + + if best_match_start_idx != -1: + offset_val = seq_len - best_match_start_idx + alora_offsets[i] = offset_val if offset_val > 0 else None + else: # Invocation sequence not found in input + warnings.warn( + f"Could not find alora_invocation_tokens for specified aLoRA adapter in the " + f"following instance" + f"{sequence}" + f"Invocation tokens: {current_invocation_ids_tensor} \n" + f"Defaulting to base model. " + ) + alora_offsets[i] = None + return alora_offsets \ No newline at end of file diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 2b2587967e..24fa001b21 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -96,9 +96,7 @@ LoraConfig, { "target_modules": ["lin0"], - "use_alora": True, "alora_invocation_tokens": [1, 2, 3], - "alora_invocation_string": "123", }, ), ( @@ -107,9 +105,7 @@ LoraConfig, { "target_modules": ["lin0", "lin1"], - "use_alora": True, "alora_invocation_tokens": [1, 2, 3], - "alora_invocation_string": "123", }, ), ( @@ -118,9 +114,7 @@ LoraConfig, { "target_modules": "lin1", - "use_alora": True, "alora_invocation_tokens": [1, 2, 3], - "alora_invocation_string": "123", "lora_alpha": 32, }, ), @@ -1313,8 +1307,6 @@ def test_merge_layers(self, test_name, model_id, config_cls, config_kwargs): pytest.skip( f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" ) - if config_kwargs.get("use_alora"): - pytest.skip("aLoRA does not support merging.") config_kwargs = config_kwargs.copy() if issubclass(config_cls, LoraConfig): @@ -1338,8 +1330,6 @@ def test_merge_layers_fp16(self, test_name, model_id, config_cls, config_kwargs) pytest.skip( f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" ) - if config_kwargs.get("use_alora"): - pytest.skip("aLoRA does not support merging.") config_kwargs = config_kwargs.copy() if issubclass(config_cls, LoraConfig): @@ -1355,8 +1345,6 @@ def test_merge_layers_is_idempotent(self, test_name, model_id, config_cls, confi pytest.skip( f"Skipping test for {model_id} as merging is not supported. 
(See https://github.com/huggingface/peft/pull/2403 for details)" ) - if config_kwargs.get("use_alora"): - pytest.skip("aLoRA does not support merging.") # calling merge twice with the same arguments should not change the output config_kwargs = config_kwargs.copy() @@ -1373,8 +1361,6 @@ def test_safe_merge(self, test_name, model_id, config_cls, config_kwargs): pytest.skip( f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" ) - if config_kwargs.get("use_alora"): - pytest.skip("Skipping test as merging is not supported for aLora.") # calling merge twice with the same arguments should not change the output config_kwargs = config_kwargs.copy() if issubclass(config_cls, LoraConfig): @@ -1467,8 +1453,6 @@ def test_forward_float16(self, test_name, model_id, config_cls, config_kwargs): if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return - if config_kwargs.get("use_alora"): - pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) model(**X) @@ -1511,8 +1495,6 @@ def test_forward_bfloat16(self, test_name, model_id, config_cls, config_kwargs): if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return - if config_kwargs.get("use_alora"): - pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) model(**X) @@ -1554,8 +1536,6 @@ def test_forward_float16_no_autocast(self, test_name, model_id, config_cls, conf if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return - if config_kwargs.get("use_alora"): - pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) model(**X) @@ -1597,8 +1577,6 @@ def test_forward_bfloat16_no_autocast(self, test_name, model_id, config_cls, con if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return - if config_kwargs.get("use_alora"): - pytest.skip("aLoRA does not support merging.") model.merge_adapter(safe_merge=False) model(**X) @@ -1771,8 +1749,6 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co pytest.skip( f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" ) - if config_kwargs.get("use_alora"): - pytest.skip("aLoRA does not support merging.") # same as test_disable_adapters, but with merging X = self.prepare_inputs_for_testing() @@ -1859,9 +1835,9 @@ def test_disable_adapter_with_bias_warns(self, test_name, model_id, config_cls, # Note: We test only with custom models since they run really fast. There is really no point in testing the same # thing with decoder, encoder_decoder, etc. 
- if not issubclass(config_cls, (LoraConfig, BOFTConfig)): + if config_cls != LoraConfig or config_cls != BOFTConfig: # skip this test for other configs as bias is specific to Lora - pytest.skip("Bias argument is only supported for LoRA or BOFT models") + pytest.skip("Testing bias warnings only for LoraConfig or BOFTConfig") def run_with_disable(config_kwargs, bias): config_kwargs = config_kwargs.copy() @@ -1875,7 +1851,7 @@ def run_with_disable(config_kwargs, bias): with peft_model.disable_adapter(): pass # there is nothing to be done - if issubclass(config_cls, LoraConfig): + if config_cls == LoraConfig: # check that bias=all and bias=lora_only give a warning with the correct message msg_start = "Careful, disabling adapter layers with bias configured to be" with pytest.warns(UserWarning, match=msg_start): @@ -2771,8 +2747,6 @@ def test_multiple_active_adapters_forward( def test_multiple_active_adapters_merge_and_unmerge( self, test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2 ): - if config_kwargs_1.get("use_alora") or config_kwargs_2.get("use_alora"): - pytest.skip("aLoRA does not support merging.") torch.manual_seed(0) @@ -2807,8 +2781,6 @@ def test_multiple_active_adapters_merge_and_unmerge( "test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2", MULTIPLE_ACTIVE_ADAPTERS_TEST_CASES ) def test_merge_layers_multi(self, test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2): - if config_kwargs_1.get("use_alora") or config_kwargs_2.get("use_alora"): - pytest.skip("aLoRA does not support merging.") torch.manual_seed(0) diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index d406f17727..f0a5fb4e15 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -138,6 +138,19 @@ "bias": "none", }, ), + # Activated LoRA (aLoRA) + ( + LoraConfig, + { + "task_type": "CAUSAL_LM", + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.05, + "bias": "none", + "alora_invocation_tokens": [1], + }, + ), # LoRA + trainable tokens ( LoraConfig, diff --git a/tests/testing_common.py b/tests/testing_common.py index 9050ef6e66..fab832edc1 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -629,10 +629,9 @@ def _test_load_multiple_adapters(self, model_id, config_cls, config_kwargs): assert load_result2.missing_keys == [] def _test_merge_layers_fp16(self, model_id, config_cls, config_kwargs): - if config_cls not in (LoraConfig, IA3Config, AdaLoraConfig, LoHaConfig, LoKrConfig, VBLoRAConfig): - # Merge layers only supported for LoRA and IA³ + if config_cls not in (LoraConfig, IA3Config, AdaLoraConfig, LoHaConfig, LoKrConfig, VBLoRAConfig) or config_kwargs.get("alora_invocation_tokens") is not None: + # Merge layers only supported for LoRA and IA³, and not for Activated LoRA (aLoRA) return pytest.skip(f"Test not applicable for {config_cls}") - if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") @@ -662,8 +661,8 @@ def _test_merge_layers_nan(self, model_id, config_cls, config_kwargs): LoKrConfig, VeraConfig, FourierFTConfig, - ): - # Merge layers only supported for LoRA and IA³ + ) or config_kwargs.get("alora_invocation_tokens") is not None: + # Merge layers only supported for LoRA and IA³, and not for Activated LoRA (aLoRA) return if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") @@ -742,6 +741,9 @@ def 
_test_merge_layers(self, model_id, config_cls, config_kwargs): if issubclass(config_cls, (OFTConfig, BOFTConfig)): return pytest.skip(f"Test not applicable for {config_cls}") + + if config_kwargs.get("alora_invocation_tokens") is not None: + return pytest.skip ("Merging not applicable to aLoRA") if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") @@ -844,7 +846,7 @@ def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): **config_kwargs, ) - if config.peft_type not in supported_peft_types: + if config.peft_type not in supported_peft_types or config_kwargs.get("alora_invocation_tokens") is not None: return with hub_online_once(model_id): From 9a0f9d9f20b6c3860c726553d3fb4377fff02ca2 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Fri, 18 Jul 2025 02:53:35 +0000 Subject: [PATCH 36/99] decoder tests --- src/peft/peft_model.py | 10 +++++----- src/peft/tuners/lora/config.py | 6 ------ src/peft/tuners/lora/layer.py | 16 +++++++++------- src/peft/tuners/lora/model.py | 1 - src/peft/tuners/lora/variants.py | 10 ++++------ tests/test_custom_models.py | 13 +++++++------ tests/testing_common.py | 27 ++++++++++++++++++--------- 7 files changed, 43 insertions(+), 40 deletions(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index 055d0f8986..bf631859c1 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -2014,8 +2014,8 @@ def generate(self, *args, **kwargs): alora_offsets_from_kwargs = kwargs.get("alora_offsets") if alora_offsets_from_kwargs is None: current_input_ids = kwargs.get("input_ids") - if not current_input_ids: # args[0] is usually input_ids - if args and isinstance(args[0], torch.Tensor): # and args[0].dim() >=1 : + if current_input_ids is None: # args[0] is usually input_ids + if args and isinstance(args[0], torch.Tensor): current_input_ids = args[0] else: current_input_ids = None @@ -2024,10 +2024,11 @@ def generate(self, *args, **kwargs): if current_input_ids.ndim == 1: current_input_ids = current_input_ids.unsqueeze(0) calculated_offsets = calculate_alora_offsets( - current_input_ids, adapter_names=adapter_names_for_offset_calc + self.peft_config, self.active_adapter, current_input_ids, adapter_names=adapter_names_for_offset_calc ) for i in range(len(calculated_offsets)): - calculated_offsets[i] -= 1 + if calculated_offsets[i] is not None: + calculated_offsets[i] -= 1 kwargs["alora_offsets"] = calculated_offsets else: @@ -2049,7 +2050,6 @@ def generate(self, *args, **kwargs): bs = kwargs["input_ids"].shape[0] kwargs["alora_offsets"] = [None] * bs - with self._enable_peft_forward_hooks(*args, **kwargs): kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} outputs = self.base_model.generate(*args, **kwargs) diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 6d3cf003c1..7e515eba9c 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -657,12 +657,6 @@ def __post_init__(self): if self.use_dora: raise ValueError("The argument lora_bias=True is not supported for DoRA, please pass use_dora=False") - # If activated LoRA (aLoRA) is enabled, check for required invocation arguments. - if self.use_alora: - if self.alora_invocation_string is None or self.alora_invocation_tokens is None: - raise ValueError( - "The fields alora_invocation_string and alora_invocation_tokens (tokenized copy of alora_invocation_string) are required to use aLoRA." 
- ) # Using post training conversion of modified base weights to restore their initial values PiSSA/CorDA/OLoRA cannot # be correctly done when using rslora + rank_pattern/alpha_pattern. We can't really know if the user intends # this when they'll eventually call save_pretrained (i.e. if they'll pass diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index 39c669d2c9..71c2a73b0b 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -563,7 +563,7 @@ def _mixed_batch_forward( sub_batch_indices_list = [] for adapter in unique_adapters: sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) - + alora_offsets = variant_kwargs.get("alora_offsets",None) for i, active_adapter in enumerate(unique_adapters): if active_adapter == "__base__": continue @@ -581,16 +581,18 @@ def _mixed_batch_forward( if active_adapter not in self.lora_variant: # vanilla LoRA lora_output = lora_B(lora_A(dropout(sub_batch))) * scaling result[sub_batch_indices_list[i]] += lora_output.to(torch_result_dtype) - else: + else: + if alora_offsets is not None: + variant_kwargs["alora_offsets"] = [alora_offsets[j] for j in sub_batch_indices_list[i]] lora_output = self.lora_variant[active_adapter].forward( self, active_adapter=active_adapter, - x=x, - result=result, + x=sub_batch, + result=result[sub_batch_indices_list[i]], **variant_kwargs, **kwargs, ) - result[sub_batch_indices_list[i]] += lora_output.to(torch_result_dtype) + result[sub_batch_indices_list[i]] = lora_output.to(torch_result_dtype) return result @@ -860,7 +862,7 @@ def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVari return DoraEmbeddingVariant() def update_layer( - self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora, lora_bias + self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora, lora_bias, **kwargs ): # collect the kwargs kwargs = locals().copy() @@ -1134,7 +1136,7 @@ def __init__( ) def update_layer( - self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora, lora_bias + self, adapter_name, r, lora_alpha, lora_dropout, init_lora_weights, use_rslora, use_dora, lora_bias, **kwargs ): # collect the kwargs kwargs = locals().copy() diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 77497faf4c..250b47952a 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -239,7 +239,6 @@ def _create_and_replace( init_lora_weights=lora_config.init_lora_weights, use_rslora=lora_config.use_rslora, use_dora=lora_config.use_dora, - use_alora=lora_config.alora_invocation_tokens is not None, lora_bias=lora_config.lora_bias, ) else: diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index a549a3c3b7..8dda338ed3 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -465,18 +465,16 @@ def forward( result: torch.Tensor, **kwargs, ) -> torch.Tensor: - alora_offsets = kwargs.get("alora_offsets", [1]) - + alora_offsets = kwargs.get("alora_offsets", None) lora_A = module.lora_A[active_adapter] lora_B = module.lora_B[active_adapter] dropout = module.lora_dropout[active_adapter] scaling = module.scaling[active_adapter] x = x.to(lora_A.weight.dtype) - if x.dim() == 2: # comes up in some single-token tests result = result + lora_B(lora_A(dropout(x))) * scaling - else: # typical LLM regime + elif alora_offsets is not None: # typical LLM regime for i in 
range(result.shape[0]): if alora_offsets[i] is not None and alora_offsets[i] > 0: # otherwise use base model offset = min(alora_offsets[i], result.shape[1]) @@ -547,7 +545,7 @@ def calculate_alora_offsets( best_match_start_idx = idx if best_match_start_idx != -1: - offset_val = seq_len - best_match_start_idx + offset_val = seq_len - best_match_start_idx + 1 alora_offsets[i] = offset_val if offset_val > 0 else None else: # Invocation sequence not found in input warnings.warn( @@ -558,4 +556,4 @@ def calculate_alora_offsets( f"Defaulting to base model. " ) alora_offsets[i] = None - return alora_offsets \ No newline at end of file + return alora_offsets diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 24fa001b21..34f9422c70 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -1450,7 +1450,7 @@ def test_forward_float16(self, test_name, model_id, config_cls, config_kwargs): # check that none of this raises an error model(**X) - if model_id in ["Conv2dGroups", "Conv2dGroups2"]: + if model_id in ["Conv2dGroups", "Conv2dGroups2"] or config_kwargs.get("alora_invocation_tokens") is not None: # this model does not support merging return @@ -1492,7 +1492,7 @@ def test_forward_bfloat16(self, test_name, model_id, config_cls, config_kwargs): # check that none of this raises an error model(**X) - if model_id in ["Conv2dGroups", "Conv2dGroups2"]: + if model_id in ["Conv2dGroups", "Conv2dGroups2"] or config_kwargs.get("alora_invocation_tokens") is not None: # this model does not support merging return @@ -1533,7 +1533,7 @@ def test_forward_float16_no_autocast(self, test_name, model_id, config_cls, conf # check that none of this raises an error model(**X) - if model_id in ["Conv2dGroups", "Conv2dGroups2"]: + if model_id in ["Conv2dGroups", "Conv2dGroups2"] or config_kwargs.get("alora_invocation_tokens") is not None: # this model does not support merging return @@ -1574,10 +1574,9 @@ def test_forward_bfloat16_no_autocast(self, test_name, model_id, config_cls, con # check that none of this raises an error model(**X) - if model_id in ["Conv2dGroups", "Conv2dGroups2"]: + if model_id in ["Conv2dGroups", "Conv2dGroups2"] or config_kwargs.get("alora_invocation_tokens") is not None: # this model does not support merging return - model.merge_adapter(safe_merge=False) model(**X) model.unmerge_adapter() @@ -1749,7 +1748,9 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co pytest.skip( f"Skipping test for {model_id} as merging is not supported. 
(See https://github.com/huggingface/peft/pull/2403 for details)" ) - + if config_kwargs.get("alora_invocation_tokens") is not None: + # Merge layers not supported for Activated LoRA (aLoRA) + pytest.skip(f"Test not applicable for Activated LoRA") # same as test_disable_adapters, but with merging X = self.prepare_inputs_for_testing() model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) diff --git a/tests/testing_common.py b/tests/testing_common.py index fab832edc1..dba7955698 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -631,7 +631,10 @@ def _test_load_multiple_adapters(self, model_id, config_cls, config_kwargs): def _test_merge_layers_fp16(self, model_id, config_cls, config_kwargs): if config_cls not in (LoraConfig, IA3Config, AdaLoraConfig, LoHaConfig, LoKrConfig, VBLoRAConfig) or config_kwargs.get("alora_invocation_tokens") is not None: # Merge layers only supported for LoRA and IA³, and not for Activated LoRA (aLoRA) - return pytest.skip(f"Test not applicable for {config_cls}") + if config_kwargs.get("alora_invocation_tokens") is None: + return pytest.skip(f"Test not applicable for {config_cls}") + else: + return pytest.skip("Test not applicable for Activated LoRA") if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") @@ -901,6 +904,9 @@ def _test_merge_layers_multi(self, model_id, config_cls, config_kwargs): assert torch.allclose(logits_merged_adapter_default, logits_adapter_1, atol=1e-3, rtol=1e-3) def _test_merge_layers_is_idempotent(self, model_id, config_cls, config_kwargs): + if config_kwargs.get("alora_invocation_tokens") is not None: + # Merging not supported for Activated LoRA (aLoRA) + return pytest.skip("Test not applicable for Activated LoRA (aLoRA)") with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) config = config_cls( @@ -923,6 +929,9 @@ def _test_merge_layers_is_idempotent(self, model_id, config_cls, config_kwargs): assert torch.allclose(logits_0, logits_1, atol=1e-6, rtol=1e-6) def _test_safe_merge(self, model_id, config_cls, config_kwargs): + if config_kwargs.get("alora_invocation_tokens") is not None: + # Merging not supported for Activated LoRA (aLoRA) + return pytest.skip("Test not applicable for Activated LoRA (aLoRA)") torch.manual_seed(0) with hub_online_once(model_id): model = self.transformers_class.from_pretrained(model_id) @@ -987,8 +996,7 @@ def _test_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): dummy_input = self.prepare_inputs_for_testing() # ensure that we have at least 3 samples for this test - dummy_input = {k: torch.cat([v for _ in range(3)]) for k, v in dummy_input.items()} - + dummy_input = {k: torch.cat([v for _ in range(3)]) for k, v in dummy_input.items()} with torch.inference_mode(): with model.disable_adapter(): output_base = model(**dummy_input)[0] @@ -1017,12 +1025,12 @@ def _test_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): # alternate between base model, adapter0, and adapter1 adapters = ["__base__", "adapter0", "adapter1"] - dummy_input["adapter_names"] = [adapters[i % 3] for i in (range(len(dummy_input["input_ids"])))] - + dummy_input["adapter_names"] = [adapters[i % 3] for i in (range(len(dummy_input["input_ids"])))] with torch.inference_mode(): output_mixed = model(**dummy_input)[0] logits_mixed = model.generate(**dummy_input, return_dict_in_generate=True, output_scores=True).scores[0] - + #print(output_adapter0[1::3]) + 
#print(output_mixed[1::3]) assert torch.allclose(output_base[::3], output_mixed[::3], atol=atol, rtol=rtol) assert torch.allclose(output_adapter0[1::3], output_mixed[1::3], atol=atol, rtol=rtol) assert torch.allclose(output_adapter1[2::3], output_mixed[2::3], atol=atol, rtol=rtol) @@ -1035,7 +1043,6 @@ def _test_generate_with_mixed_adapter_batches_and_beam_search(self, model_id, co # adapter_names argument. See #2283. if config_cls not in (LoraConfig,): return pytest.skip(f"Mixed adapter batches not supported for {config_cls}") - if config_kwargs.get("trainable_token_indices", None) is not None: # for some configurations this test will fail since the adapter values don't differ. # this is probably a problem with the test setup and not with the implementation. @@ -1064,8 +1071,10 @@ def _test_generate_with_mixed_adapter_batches_and_beam_search(self, model_id, co dummy_input = self.prepare_inputs_for_testing() # ensure that we have at least 3 samples for this test dummy_input = {k: torch.cat([v for _ in range(3)]) for k, v in dummy_input.items()} - - gen_kwargs = {**dummy_input, "max_length": 20, "num_beams": 10, "early_stopping": True} + num_beams = 10 + if config_kwargs.get("alora_invocation_tokens") is not None: + num_beams = 1 # beam search not yet fully supported + gen_kwargs = {**dummy_input, "max_length": 20, "num_beams": num_beams, "early_stopping": True} with torch.inference_mode(): with model.disable_adapter(): gen_base = model.generate(**gen_kwargs) From 3d1284ae6f8cd365c1a7dcaadc31827788502196 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Fri, 18 Jul 2025 03:02:11 +0000 Subject: [PATCH 37/99] make quality --- src/peft/peft_model.py | 14 +++++++---- src/peft/tuners/lora/config.py | 18 +++++++------- src/peft/tuners/lora/layer.py | 4 ++-- src/peft/tuners/lora/variants.py | 15 +++++------- tests/test_custom_models.py | 4 +--- tests/testing_common.py | 41 +++++++++++++++++++------------- 6 files changed, 52 insertions(+), 44 deletions(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index bf631859c1..cb110d7399 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -38,8 +38,8 @@ from transformers.modeling_outputs import QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput from transformers.utils import PushToHubMixin -from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer from peft.tuners.lora.variants import calculate_alora_offsets +from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer from peft.utils.constants import DUMMY_MODEL_CONFIG from peft.utils.integrations import init_empty_weights from peft.utils.other import create_attention_mask, set_additional_trainable_modules @@ -1852,7 +1852,10 @@ def forward( alora_offsets = [None] * inputs_embeds.shape[0] elif input_ids is not None: alora_offsets = calculate_alora_offsets( - self.peft_config, self.active_adapter, input_ids, adapter_names=adapter_names_for_offset_calc + self.peft_config, + self.active_adapter, + input_ids, + adapter_names=adapter_names_for_offset_calc, ) else: alora_offsets = [] # Should not happen if _get_batch_size logic is sound @@ -2015,7 +2018,7 @@ def generate(self, *args, **kwargs): if alora_offsets_from_kwargs is None: current_input_ids = kwargs.get("input_ids") if current_input_ids is None: # args[0] is usually input_ids - if args and isinstance(args[0], torch.Tensor): + if args and isinstance(args[0], torch.Tensor): current_input_ids = args[0] else: current_input_ids = None @@ -2024,7 +2027,10 
@@ def generate(self, *args, **kwargs): if current_input_ids.ndim == 1: current_input_ids = current_input_ids.unsqueeze(0) calculated_offsets = calculate_alora_offsets( - self.peft_config, self.active_adapter, current_input_ids, adapter_names=adapter_names_for_offset_calc + self.peft_config, + self.active_adapter, + current_input_ids, + adapter_names=adapter_names_for_offset_calc, ) for i in range(len(calculated_offsets)): if calculated_offsets[i] is not None: diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 7e515eba9c..b0101ed2c4 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -301,16 +301,16 @@ class LoraConfig(PeftConfig): LoRA, so it is recommended to merge weights for inference. For more information, see https://huggingface.co/papers/2402.09353. alora_invocation_tokens (`List[int]`): - If not None, enable 'Activated LoRA' (aLoRA), - with alora_invocation_tokens being the tokenized invocation string for the adapter (must be present in all - model input strings). This technique selectively activates the adapter weights only on tokens during and - after the alora_invocation_tokens. When used in a CausalLM, this means that the KV cache prior to invocation - is interchangeable with that of the base model (and other aLoRA adapters operating this way). As a result, - in inference pipelines involving switching between base model inference and adapter inference (e.g. agentic + If not None, enable 'Activated LoRA' (aLoRA), with + alora_invocation_tokens being the tokenized invocation string for the adapter (must be present in all model + input strings). This technique selectively activates the adapter weights only on tokens during and after + the alora_invocation_tokens. When used in a CausalLM, this means that the KV cache prior to invocation is + interchangeable with that of the base model (and other aLoRA adapters operating this way). As a result, in + inference pipelines involving switching between base model inference and adapter inference (e.g. agentic pipelines, see paper for examples), significant savings are realized (relative to LoRA) by saving prefill - operations. Overall adapter inference speedups of an order of magnitude or more can occur on vLLM, depending - on the length of the shared context. Note that merging is not possible due to the selective application of - the weights. + operations. Overall adapter inference speedups of an order of magnitude or more can occur on vLLM, + depending on the length of the shared context. Note that merging is not possible due to the selective + application of the weights. layer_replication (`List[Tuple[int, int]]`): Build a new stack of layers by stacking the original model layers according to the ranges specified. This allows expanding (or shrinking) the model without duplicating the base model weights. 
The new layers will diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index 71c2a73b0b..c3054bc0ed 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -563,7 +563,7 @@ def _mixed_batch_forward( sub_batch_indices_list = [] for adapter in unique_adapters: sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) - alora_offsets = variant_kwargs.get("alora_offsets",None) + alora_offsets = variant_kwargs.get("alora_offsets", None) for i, active_adapter in enumerate(unique_adapters): if active_adapter == "__base__": continue @@ -581,7 +581,7 @@ def _mixed_batch_forward( if active_adapter not in self.lora_variant: # vanilla LoRA lora_output = lora_B(lora_A(dropout(sub_batch))) * scaling result[sub_batch_indices_list[i]] += lora_output.to(torch_result_dtype) - else: + else: if alora_offsets is not None: variant_kwargs["alora_offsets"] = [alora_offsets[j] for j in sub_batch_indices_list[i]] lora_output = self.lora_variant[active_adapter].forward( diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 8dda338ed3..85bb4a8b68 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -14,8 +14,8 @@ from __future__ import annotations import collections -from typing import Any, Optional import warnings +from typing import Any, Optional import torch from accelerate.utils.imports import is_xpu_available @@ -472,9 +472,9 @@ def forward( scaling = module.scaling[active_adapter] x = x.to(lora_A.weight.dtype) - if x.dim() == 2: # comes up in some single-token tests + if x.dim() == 2: # comes up in some single-token tests result = result + lora_B(lora_A(dropout(x))) * scaling - elif alora_offsets is not None: # typical LLM regime + elif alora_offsets is not None: # typical LLM regime for i in range(result.shape[0]): if alora_offsets[i] is not None and alora_offsets[i] > 0: # otherwise use base model offset = min(alora_offsets[i], result.shape[1]) @@ -484,6 +484,7 @@ def forward( return result + def calculate_alora_offsets( peft_config: PeftConfig, active_adapter: str, input_ids: torch.Tensor, adapter_names: Optional[list[str]] = None ) -> list[int]: @@ -497,18 +498,14 @@ def calculate_alora_offsets( adapters_to_process_indices = collections.defaultdict(list) for i in range(batch_size): - current_adapter_name = ( - adapter_names[i] if adapter_names and i < len(adapter_names) else active_adapter - ) + current_adapter_name = adapter_names[i] if adapter_names and i < len(adapter_names) else active_adapter if current_adapter_name == "__base__": alora_offsets[i] = None continue if current_adapter_name not in peft_config: - warnings.warn( - f"Adapter '{current_adapter_name}' not found in peft_config. Using base model for row {i}." - ) + warnings.warn(f"Adapter '{current_adapter_name}' not found in peft_config. 
Using base model for row {i}.") alora_offsets[i] = None continue diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 34f9422c70..1df7fe5ac7 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -1750,7 +1750,7 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co ) if config_kwargs.get("alora_invocation_tokens") is not None: # Merge layers not supported for Activated LoRA (aLoRA) - pytest.skip(f"Test not applicable for Activated LoRA") + pytest.skip("Test not applicable for Activated LoRA") # same as test_disable_adapters, but with merging X = self.prepare_inputs_for_testing() model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) @@ -2748,7 +2748,6 @@ def test_multiple_active_adapters_forward( def test_multiple_active_adapters_merge_and_unmerge( self, test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2 ): - torch.manual_seed(0) model = self.resolve_model_cls(tuner_method) @@ -2782,7 +2781,6 @@ def test_multiple_active_adapters_merge_and_unmerge( "test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2", MULTIPLE_ACTIVE_ADAPTERS_TEST_CASES ) def test_merge_layers_multi(self, test_name, tuner_method, config_cls, config_kwargs_1, config_kwargs_2): - torch.manual_seed(0) model = self.resolve_model_cls(tuner_method) diff --git a/tests/testing_common.py b/tests/testing_common.py index dba7955698..0588f71b69 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -629,7 +629,10 @@ def _test_load_multiple_adapters(self, model_id, config_cls, config_kwargs): assert load_result2.missing_keys == [] def _test_merge_layers_fp16(self, model_id, config_cls, config_kwargs): - if config_cls not in (LoraConfig, IA3Config, AdaLoraConfig, LoHaConfig, LoKrConfig, VBLoRAConfig) or config_kwargs.get("alora_invocation_tokens") is not None: + if ( + config_cls not in (LoraConfig, IA3Config, AdaLoraConfig, LoHaConfig, LoKrConfig, VBLoRAConfig) + or config_kwargs.get("alora_invocation_tokens") is not None + ): # Merge layers only supported for LoRA and IA³, and not for Activated LoRA (aLoRA) if config_kwargs.get("alora_invocation_tokens") is None: return pytest.skip(f"Test not applicable for {config_cls}") @@ -656,15 +659,19 @@ def _test_merge_layers_fp16(self, model_id, config_cls, config_kwargs): _ = model.merge_and_unload() def _test_merge_layers_nan(self, model_id, config_cls, config_kwargs): - if config_cls not in ( - LoraConfig, - IA3Config, - AdaLoraConfig, - LoHaConfig, - LoKrConfig, - VeraConfig, - FourierFTConfig, - ) or config_kwargs.get("alora_invocation_tokens") is not None: + if ( + config_cls + not in ( + LoraConfig, + IA3Config, + AdaLoraConfig, + LoHaConfig, + LoKrConfig, + VeraConfig, + FourierFTConfig, + ) + or config_kwargs.get("alora_invocation_tokens") is not None + ): # Merge layers only supported for LoRA and IA³, and not for Activated LoRA (aLoRA) return if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): @@ -744,9 +751,9 @@ def _test_merge_layers(self, model_id, config_cls, config_kwargs): if issubclass(config_cls, (OFTConfig, BOFTConfig)): return pytest.skip(f"Test not applicable for {config_cls}") - + if config_kwargs.get("alora_invocation_tokens") is not None: - return pytest.skip ("Merging not applicable to aLoRA") + return pytest.skip("Merging not applicable to aLoRA") if ("gpt2" in model_id.lower()) and (config_cls != LoraConfig): self.skipTest("Merging GPT2 adapters not supported for IA³ (yet)") @@ -996,7 +1003,7 @@ def 
_test_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): dummy_input = self.prepare_inputs_for_testing() # ensure that we have at least 3 samples for this test - dummy_input = {k: torch.cat([v for _ in range(3)]) for k, v in dummy_input.items()} + dummy_input = {k: torch.cat([v for _ in range(3)]) for k, v in dummy_input.items()} with torch.inference_mode(): with model.disable_adapter(): output_base = model(**dummy_input)[0] @@ -1025,12 +1032,12 @@ def _test_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): # alternate between base model, adapter0, and adapter1 adapters = ["__base__", "adapter0", "adapter1"] - dummy_input["adapter_names"] = [adapters[i % 3] for i in (range(len(dummy_input["input_ids"])))] + dummy_input["adapter_names"] = [adapters[i % 3] for i in (range(len(dummy_input["input_ids"])))] with torch.inference_mode(): output_mixed = model(**dummy_input)[0] logits_mixed = model.generate(**dummy_input, return_dict_in_generate=True, output_scores=True).scores[0] - #print(output_adapter0[1::3]) - #print(output_mixed[1::3]) + # print(output_adapter0[1::3]) + # print(output_mixed[1::3]) assert torch.allclose(output_base[::3], output_mixed[::3], atol=atol, rtol=rtol) assert torch.allclose(output_adapter0[1::3], output_mixed[1::3], atol=atol, rtol=rtol) assert torch.allclose(output_adapter1[2::3], output_mixed[2::3], atol=atol, rtol=rtol) @@ -1073,7 +1080,7 @@ def _test_generate_with_mixed_adapter_batches_and_beam_search(self, model_id, co dummy_input = {k: torch.cat([v for _ in range(3)]) for k, v in dummy_input.items()} num_beams = 10 if config_kwargs.get("alora_invocation_tokens") is not None: - num_beams = 1 # beam search not yet fully supported + num_beams = 1 # beam search not yet fully supported gen_kwargs = {**dummy_input, "max_length": 20, "num_beams": num_beams, "early_stopping": True} with torch.inference_mode(): with model.disable_adapter(): From 373462345dc61bdc31e165314724db5c67949e26 Mon Sep 17 00:00:00 2001 From: Kristjan Greenewald Date: Thu, 17 Jul 2025 20:31:29 -0700 Subject: [PATCH 38/99] moving more alora_offsets to variants.py --- src/peft/peft_model.py | 95 ++------------------------------ src/peft/tuners/lora/variants.py | 89 +++++++++++++++++++++++++++++- 2 files changed, 93 insertions(+), 91 deletions(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index cb110d7399..5631e26ef1 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -38,7 +38,7 @@ from transformers.modeling_outputs import QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput from transformers.utils import PushToHubMixin -from peft.tuners.lora.variants import calculate_alora_offsets +from peft.tuners.lora.variants import get_alora_offsets_for_generate, get_alora_offsets_for_forward from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer from peft.utils.constants import DUMMY_MODEL_CONFIG from peft.utils.integrations import init_empty_weights @@ -1828,38 +1828,8 @@ def forward( peft_config = self.active_peft_config if not peft_config.is_prompt_learning: - adapter_names_for_offset_calc = kwargs.get("adapter_names") - - is_alora_relevant = False - if getattr(self.active_peft_config, "alora_invocation_tokens", None): - is_alora_relevant = True - elif adapter_names_for_offset_calc: - for name in adapter_names_for_offset_calc: - if name == "__base__": - continue - config_ = self.peft_config.get(name) - if config_ and getattr(config_, "alora_invocation_tokens", None): - is_alora_relevant = True - break - - if 
is_alora_relevant: - alora_offsets = kwargs.get("alora_offsets") - if alora_offsets is None: - if input_ids is None and inputs_embeds is not None: - warnings.warn( - "Cannot calculate aLoRA offsets when only inputs_embeds are provided. Disabling aLoRA for this forward pass." - ) - alora_offsets = [None] * inputs_embeds.shape[0] - elif input_ids is not None: - alora_offsets = calculate_alora_offsets( - self.peft_config, - self.active_adapter, - input_ids, - adapter_names=adapter_names_for_offset_calc, - ) - else: - alora_offsets = [] # Should not happen if _get_batch_size logic is sound - kwargs["alora_offsets"] = alora_offsets + # For aLoRA + kwargs = get_alora_offsets_for_forward(self, input_ids, inputs_embeds,**kwargs) if self.base_model.config.model_type == "mpt": if inputs_embeds is not None: raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds") @@ -1999,63 +1969,8 @@ def generate(self, *args, **kwargs): self.base_model.generation_config = self.generation_config try: if not peft_config.is_prompt_learning: - adapter_names_for_offset_calc = kwargs.get("adapter_names") - is_alora_relevant_in_generate = False - - if getattr(self.active_peft_config, "alora_invocation_tokens", None): - is_alora_relevant_in_generate = True - elif adapter_names_for_offset_calc: - for name in adapter_names_for_offset_calc: - if name == "__base__": - continue - config_ = self.peft_config.get(name) - if config_ and getattr(config_, "alora_invocation_tokens", None): - is_alora_relevant_in_generate = True - break - - if is_alora_relevant_in_generate: - alora_offsets_from_kwargs = kwargs.get("alora_offsets") - if alora_offsets_from_kwargs is None: - current_input_ids = kwargs.get("input_ids") - if current_input_ids is None: # args[0] is usually input_ids - if args and isinstance(args[0], torch.Tensor): - current_input_ids = args[0] - else: - current_input_ids = None - - if current_input_ids is not None: - if current_input_ids.ndim == 1: - current_input_ids = current_input_ids.unsqueeze(0) - calculated_offsets = calculate_alora_offsets( - self.peft_config, - self.active_adapter, - current_input_ids, - adapter_names=adapter_names_for_offset_calc, - ) - for i in range(len(calculated_offsets)): - if calculated_offsets[i] is not None: - calculated_offsets[i] -= 1 - kwargs["alora_offsets"] = calculated_offsets - - else: - warnings.warn( - "Cannot calculate aLoRA offsets during generate as input_ids are not available. Disabling aLoRA." - ) - bs = 1 - if "attention_mask" in kwargs and kwargs["attention_mask"] is not None: - bs = kwargs["attention_mask"].shape[0] - elif "inputs_embeds" in kwargs and kwargs["inputs_embeds"] is not None: - bs = kwargs["inputs_embeds"].shape[0] - elif ( - args and isinstance(args[0], torch.Tensor) and args[0].dim() > 0 - ): # input_ids might be in args[0] - bs = args[0].shape[0] - elif ( - "input_ids" in kwargs and kwargs["input_ids"] is not None - ): # Should have been caught by current_input_ids - bs = kwargs["input_ids"].shape[0] - - kwargs["alora_offsets"] = [None] * bs + # for aLoRA, None otherwise. 
+ kwargs["alora_offsets"] = get_alora_offsets_for_generate(self, *args, **kwargs) with self._enable_peft_forward_hooks(*args, **kwargs): kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} outputs = self.base_model.generate(*args, **kwargs) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 85bb4a8b68..5e86bbc324 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -22,7 +22,7 @@ from torch import nn from peft.utils.other import transpose - +from peft import PeftModel from .config import PeftConfig from .dora import DoraConv1dLayer, DoraConv2dLayer, DoraConv3dLayer, DoraEmbeddingLayer, DoraLinearLayer from .layer import Conv1d, Conv2d, Conv3d, Embedding, Linear, LoraVariant, _ConvNd @@ -554,3 +554,90 @@ def calculate_alora_offsets( ) alora_offsets[i] = None return alora_offsets + +def is_alora_relevant_in_batch(model: PeftModel, adapter_names: Optional[list[str]] = None): + is_alora_relevant = False + if getattr(model.active_peft_config, "alora_invocation_tokens", None): + is_alora_relevant = True + elif adapter_names: + for name in adapter_names: + if name == "__base__": + continue + config_ = model.peft_config.get(name) + if config_ and getattr(config_, "alora_invocation_tokens", None): + is_alora_relevant = True + break + + return is_alora_relevant + +def get_alora_offsets_for_forward(model: PeftModel, input_ids: torch.Tensor, inputs_embeds: torch.Tensor, **kwargs): + adapter_names_for_offset_calc = kwargs.get("adapter_names", None) + if is_alora_relevant_in_batch(model, adapter_names_for_offset_calc): + alora_offsets = kwargs.get("alora_offsets") + if alora_offsets is None: + if input_ids is None and inputs_embeds is not None: + warnings.warn( + "Cannot calculate aLoRA offsets when only inputs_embeds are provided. Disabling aLoRA for this forward pass." + ) + alora_offsets = [None] * inputs_embeds.shape[0] + elif input_ids is not None: + alora_offsets = calculate_alora_offsets( + model.peft_config, + model.active_adapter, + input_ids, + adapter_names=adapter_names_for_offset_calc, + ) + else: + alora_offsets = None + kwargs["alora_offsets"] = alora_offsets + return kwargs + + + +def get_alora_offsets_for_generate(model: PeftModel, *args, **kwargs): + adapter_names_for_offset_calc = kwargs.get("adapter_names") + if is_alora_relevant_in_batch(model, adapter_names_for_offset_calc): + alora_offsets_from_kwargs = kwargs.get("alora_offsets") + if alora_offsets_from_kwargs is None: + current_input_ids = kwargs.get("input_ids") + if current_input_ids is None: # args[0] is usually input_ids + if args and isinstance(args[0], torch.Tensor): + current_input_ids = args[0] + else: + current_input_ids = None + + if current_input_ids is not None: + if current_input_ids.ndim == 1: + current_input_ids = current_input_ids.unsqueeze(0) + calculated_offsets = calculate_alora_offsets( + model.peft_config, + model.active_adapter, + current_input_ids, + adapter_names=adapter_names_for_offset_calc, + ) + for i in range(len(calculated_offsets)): + if calculated_offsets[i] is not None: + calculated_offsets[i] -= 1 + alora_offsets = calculated_offsets + + else: + warnings.warn( + "Cannot calculate aLoRA offsets during generate as input_ids are not available. Disabling aLoRA." 
+ ) + bs = 1 + if "attention_mask" in kwargs and kwargs["attention_mask"] is not None: + bs = kwargs["attention_mask"].shape[0] + elif "inputs_embeds" in kwargs and kwargs["inputs_embeds"] is not None: + bs = kwargs["inputs_embeds"].shape[0] + elif ( + args and isinstance(args[0], torch.Tensor) and args[0].dim() > 0 + ): # input_ids might be in args[0] + bs = args[0].shape[0] + elif ( + "input_ids" in kwargs and kwargs["input_ids"] is not None + ): # Should have been caught by current_input_ids + bs = kwargs["input_ids"].shape[0] + + alora_offsets = [None] * bs + kwargs["alora_offsets"] = alora_offsets + return kwargs From fa59f51ebde57063b02c7db4d6aaf26f0f922e48 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Fri, 18 Jul 2025 04:19:32 +0000 Subject: [PATCH 39/99] fixes --- src/peft/peft_model.py | 6 +++--- src/peft/tuners/lora/variants.py | 27 ++++++++++++--------------- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index 5631e26ef1..ef5ba66fd5 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -38,7 +38,7 @@ from transformers.modeling_outputs import QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput from transformers.utils import PushToHubMixin -from peft.tuners.lora.variants import get_alora_offsets_for_generate, get_alora_offsets_for_forward +from peft.tuners.lora.variants import get_alora_offsets_for_forward, get_alora_offsets_for_generate from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer from peft.utils.constants import DUMMY_MODEL_CONFIG from peft.utils.integrations import init_empty_weights @@ -1829,7 +1829,7 @@ def forward( if not peft_config.is_prompt_learning: # For aLoRA - kwargs = get_alora_offsets_for_forward(self, input_ids, inputs_embeds,**kwargs) + kwargs = get_alora_offsets_for_forward(self, input_ids, inputs_embeds, **kwargs) if self.base_model.config.model_type == "mpt": if inputs_embeds is not None: raise AssertionError("forward in MPTForCausalLM does not support inputs_embeds") @@ -1970,7 +1970,7 @@ def generate(self, *args, **kwargs): try: if not peft_config.is_prompt_learning: # for aLoRA, None otherwise. 
- kwargs["alora_offsets"] = get_alora_offsets_for_generate(self, *args, **kwargs) + kwargs = get_alora_offsets_for_generate(self, *args, **kwargs) with self._enable_peft_forward_hooks(*args, **kwargs): kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} outputs = self.base_model.generate(*args, **kwargs) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 5e86bbc324..2bf1ecb95c 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -22,7 +22,7 @@ from torch import nn from peft.utils.other import transpose -from peft import PeftModel + from .config import PeftConfig from .dora import DoraConv1dLayer, DoraConv2dLayer, DoraConv3dLayer, DoraEmbeddingLayer, DoraLinearLayer from .layer import Conv1d, Conv2d, Conv3d, Embedding, Linear, LoraVariant, _ConvNd @@ -555,7 +555,8 @@ def calculate_alora_offsets( alora_offsets[i] = None return alora_offsets -def is_alora_relevant_in_batch(model: PeftModel, adapter_names: Optional[list[str]] = None): + +def is_alora_relevant_in_batch(model: nn.module, adapter_names: Optional[list[str]] = None): is_alora_relevant = False if getattr(model.active_peft_config, "alora_invocation_tokens", None): is_alora_relevant = True @@ -570,7 +571,8 @@ def is_alora_relevant_in_batch(model: PeftModel, adapter_names: Optional[list[st return is_alora_relevant -def get_alora_offsets_for_forward(model: PeftModel, input_ids: torch.Tensor, inputs_embeds: torch.Tensor, **kwargs): + +def get_alora_offsets_for_forward(model: nn.module, input_ids: torch.Tensor, inputs_embeds: torch.Tensor, **kwargs): adapter_names_for_offset_calc = kwargs.get("adapter_names", None) if is_alora_relevant_in_batch(model, adapter_names_for_offset_calc): alora_offsets = kwargs.get("alora_offsets") @@ -579,22 +581,20 @@ def get_alora_offsets_for_forward(model: PeftModel, input_ids: torch.Tensor, inp warnings.warn( "Cannot calculate aLoRA offsets when only inputs_embeds are provided. Disabling aLoRA for this forward pass." 
) - alora_offsets = [None] * inputs_embeds.shape[0] + kwargs["alora_offsets"] = [None] * inputs_embeds.shape[0] elif input_ids is not None: - alora_offsets = calculate_alora_offsets( + kwargs["alora_offsets"] = calculate_alora_offsets( model.peft_config, model.active_adapter, input_ids, adapter_names=adapter_names_for_offset_calc, ) else: - alora_offsets = None - kwargs["alora_offsets"] = alora_offsets + kwargs["alora_offsets"] = None return kwargs - -def get_alora_offsets_for_generate(model: PeftModel, *args, **kwargs): +def get_alora_offsets_for_generate(model: nn.module, *args, **kwargs): adapter_names_for_offset_calc = kwargs.get("adapter_names") if is_alora_relevant_in_batch(model, adapter_names_for_offset_calc): alora_offsets_from_kwargs = kwargs.get("alora_offsets") @@ -618,7 +618,7 @@ def get_alora_offsets_for_generate(model: PeftModel, *args, **kwargs): for i in range(len(calculated_offsets)): if calculated_offsets[i] is not None: calculated_offsets[i] -= 1 - alora_offsets = calculated_offsets + kwargs["alora_offsets"] = calculated_offsets else: warnings.warn( @@ -629,15 +629,12 @@ def get_alora_offsets_for_generate(model: PeftModel, *args, **kwargs): bs = kwargs["attention_mask"].shape[0] elif "inputs_embeds" in kwargs and kwargs["inputs_embeds"] is not None: bs = kwargs["inputs_embeds"].shape[0] - elif ( - args and isinstance(args[0], torch.Tensor) and args[0].dim() > 0 - ): # input_ids might be in args[0] + elif args and isinstance(args[0], torch.Tensor) and args[0].dim() > 0: # input_ids might be in args[0] bs = args[0].shape[0] elif ( "input_ids" in kwargs and kwargs["input_ids"] is not None ): # Should have been caught by current_input_ids bs = kwargs["input_ids"].shape[0] - alora_offsets = [None] * bs - kwargs["alora_offsets"] = alora_offsets + kwargs["alora_offsets"] = [None] * bs return kwargs From 8e418c012bf52016a52284be0d6d1529b2120fb8 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 30 Jul 2025 13:16:35 -0400 Subject: [PATCH 40/99] Update peft_model.py --- src/peft/peft_model.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index ef5ba66fd5..e895c32d5f 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -1969,7 +1969,6 @@ def generate(self, *args, **kwargs): self.base_model.generation_config = self.generation_config try: if not peft_config.is_prompt_learning: - # for aLoRA, None otherwise. kwargs = get_alora_offsets_for_generate(self, *args, **kwargs) with self._enable_peft_forward_hooks(*args, **kwargs): kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} From dd6b670bd0d30aeb43ec174c901fe2e0424b8159 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 30 Jul 2025 13:20:48 -0400 Subject: [PATCH 41/99] Update config.py --- src/peft/tuners/lora/config.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 8d505c8f8f..f6b6d4f084 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -526,19 +526,16 @@ class LoraConfig(PeftConfig): default=None, metadata={ "help": ( - "Tokenized copy of the Activated LoRA (aLoRA) invocation string (as a list of token IDs). Use the model's default tokenizer. If not None, " - "enable 'Activated LoRA' (aLoRA). This technique selectively activates the adapter " - "weights only on tokens during and after the alora_invocation_tokens. 
When used in a CausalLM, this means that the KV cache prior to invocation " - "is interchangeable with that of the base model (and other aLoRA adapters operating this way). As a result, in inference pipelines involving switching " - "between base model inference and adapter inference (e.g. agentic pipelines, see paper for many examples), significant savings are realized (relative to LoRA) " - "by saving prefill operations. Overall adapter inference speedups of an order of magnitude or more can occur on vLLM, depending on the length of the shared " - "context. " - "NOTE 1: aLoRA often requires higher rank r than LoRA. r=32 often works well." - "NOTE 2: Merging is NOT supported due to the selective application of the adapter weights." - "Example: alora_invocation_tokens = tokenizer.encode(alora_invocation_string, add_special_tokens=False)." - "The adapter weights will be activated 1 token after the last occurence of this string in the input. " - "These tokens must be present in all inputs after tokenization. It is best to have alora_invocation_string begin and end with special tokens " - "to avoid tokenizer boundary effects when tokenizing the input." + "If not None, enable 'Activated LoRA' (aLoRA), with " + "alora_invocation_tokens being the tokenized invocation string for the adapter (must be present in all model " + "input strings). This technique selectively activates the adapter weights only on tokens during and after " + "the alora_invocation_tokens. When used in a CausalLM, this means that the KV cache prior to invocation is " + "interchangeable with that of the base model (and other aLoRA adapters operating this way). As a result, in " + "inference pipelines involving switching between base model inference and adapter inference (e.g. agentic " + "pipelines, see paper for examples), significant savings are realized (relative to LoRA) by saving prefill " + "operations. Overall adapter inference speedups of an order of magnitude or more can occur on vLLM, " + "depending on the length of the shared context. Note that merging is not possible due to the selective " + "application of the weights." ) }, ) From 183c6a6e2b028fb789d564d0f88da65f67711e78 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 30 Jul 2025 13:53:04 -0400 Subject: [PATCH 42/99] Update variants.py --- src/peft/tuners/lora/variants.py | 163 ++++++++++++++++++------------- 1 file changed, 95 insertions(+), 68 deletions(-) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 2bf1ecb95c..a303157ade 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -472,15 +472,20 @@ def forward( scaling = module.scaling[active_adapter] x = x.to(lora_A.weight.dtype) - if x.dim() == 2: # comes up in some single-token tests - result = result + lora_B(lora_A(dropout(x))) * scaling - elif alora_offsets is not None: # typical LLM regime - for i in range(result.shape[0]): - if alora_offsets[i] is not None and alora_offsets[i] > 0: # otherwise use base model - offset = min(alora_offsets[i], result.shape[1]) - result[i, -offset:, :] = ( - result[i, -offset:, :] + lora_B(lora_A(dropout(x[i, -offset:, :]))) * scaling - ) + if alora_offsets is not None: # should never be None + if x.dim() == 2: + # If x is 2-dimensional (unusual but comes up in certain tests), this means that for all inputs, + # there is only 1 token position being processed and we should adapt its weights. 
+ result = result + lora_B(lora_A(dropout(x))) * scaling + else: #Typical regime + for i in range(result.shape[0]): + # If alora_offsets[i] is None, this means that the invocation sequence was not found in the + # input. As a result, the weights should not be activated anywhere (equivalent to base model). + if alora_offsets[i] is not None and alora_offsets[i] > 0: + offset = min(alora_offsets[i], result.shape[1]) + result[i, -offset:, :] = ( + result[i, -offset:, :] + lora_B(lora_A(dropout(x[i, -offset:, :]))) * scaling + ) return result @@ -488,6 +493,13 @@ def forward( def calculate_alora_offsets( peft_config: PeftConfig, active_adapter: str, input_ids: torch.Tensor, adapter_names: Optional[list[str]] = None ) -> list[int]: + """ + This is a helper function for Activated LoRA (aLoRA) that searches each input token sequence for the last occurence + of the appropriate "alora_invocation_tokens" invocation sequence. If adapter_names is passed, then each input uses + the appropriate invocation sequence for the specified adapter for that row. Logic is provided to handle mixed collections + of adapters for which not all are aLoRAs (e.g. some base model, some LoRA). If the invocation sequence is not present, the + corresponding alora_offset is set to None and a warning is printed. + """ if input_ids is None: return [] @@ -556,7 +568,10 @@ def calculate_alora_offsets( return alora_offsets -def is_alora_relevant_in_batch(model: nn.module, adapter_names: Optional[list[str]] = None): +def is_alora_relevant_in_batch(model: nn.Module, adapter_names: Optional[list[str]] = None): + """ + Helper function to determine if the current batch has any aLoRA adapters. + """ is_alora_relevant = False if getattr(model.active_peft_config, "alora_invocation_tokens", None): is_alora_relevant = True @@ -572,69 +587,81 @@ def is_alora_relevant_in_batch(model: nn.module, adapter_names: Optional[list[st return is_alora_relevant -def get_alora_offsets_for_forward(model: nn.module, input_ids: torch.Tensor, inputs_embeds: torch.Tensor, **kwargs): +def get_alora_offsets_for_forward(model: nn.Module, input_ids: torch.Tensor, inputs_embeds: torch.Tensor, **kwargs): + """ + Wrapper around calculate_alora_offsets, for the .forward of the model. It only calculates alora_offsets if the batch + contains aLoRA adapters. + """ adapter_names_for_offset_calc = kwargs.get("adapter_names", None) - if is_alora_relevant_in_batch(model, adapter_names_for_offset_calc): - alora_offsets = kwargs.get("alora_offsets") - if alora_offsets is None: - if input_ids is None and inputs_embeds is not None: - warnings.warn( - "Cannot calculate aLoRA offsets when only inputs_embeds are provided. Disabling aLoRA for this forward pass." - ) - kwargs["alora_offsets"] = [None] * inputs_embeds.shape[0] - elif input_ids is not None: - kwargs["alora_offsets"] = calculate_alora_offsets( - model.peft_config, - model.active_adapter, - input_ids, - adapter_names=adapter_names_for_offset_calc, - ) - else: - kwargs["alora_offsets"] = None + if not is_alora_relevant_in_batch(model, adapter_names_for_offset_calc): + # Nothing to compute + return kwargs + alora_offsets = kwargs.get("alora_offsets") + if alora_offsets is None: + if input_ids is None and inputs_embeds is not None: + warnings.warn( + "Cannot calculate aLoRA offsets when only inputs_embeds are provided. Disabling aLoRA for this forward pass." 
+ ) + kwargs["alora_offsets"] = [None] * inputs_embeds.shape[0] + elif input_ids is not None: + kwargs["alora_offsets"] = calculate_alora_offsets( + model.peft_config, + model.active_adapter, + input_ids, + adapter_names=adapter_names_for_offset_calc, + ) + else: + kwargs["alora_offsets"] = None return kwargs def get_alora_offsets_for_generate(model: nn.module, *args, **kwargs): + """ + Wrapper around calculate_alora_offsets, for the .generate of the model. It only calculates alora_offsets if the batch + contains aLoRA adapters. + """ adapter_names_for_offset_calc = kwargs.get("adapter_names") - if is_alora_relevant_in_batch(model, adapter_names_for_offset_calc): - alora_offsets_from_kwargs = kwargs.get("alora_offsets") - if alora_offsets_from_kwargs is None: - current_input_ids = kwargs.get("input_ids") - if current_input_ids is None: # args[0] is usually input_ids - if args and isinstance(args[0], torch.Tensor): - current_input_ids = args[0] - else: - current_input_ids = None - - if current_input_ids is not None: - if current_input_ids.ndim == 1: - current_input_ids = current_input_ids.unsqueeze(0) - calculated_offsets = calculate_alora_offsets( - model.peft_config, - model.active_adapter, - current_input_ids, - adapter_names=adapter_names_for_offset_calc, - ) - for i in range(len(calculated_offsets)): - if calculated_offsets[i] is not None: - calculated_offsets[i] -= 1 - kwargs["alora_offsets"] = calculated_offsets - + if not is_alora_relevant_in_batch(model, adapter_names_for_offset_calc): + # Nothing to compute + return kwargs + alora_offsets_from_kwargs = kwargs.get("alora_offsets") + if alora_offsets_from_kwargs is None: + current_input_ids = kwargs.get("input_ids") + if current_input_ids is None: # args[0] is usually input_ids + if args and isinstance(args[0], torch.Tensor): + current_input_ids = args[0] else: - warnings.warn( - "Cannot calculate aLoRA offsets during generate as input_ids are not available. Disabling aLoRA." - ) - bs = 1 - if "attention_mask" in kwargs and kwargs["attention_mask"] is not None: - bs = kwargs["attention_mask"].shape[0] - elif "inputs_embeds" in kwargs and kwargs["inputs_embeds"] is not None: - bs = kwargs["inputs_embeds"].shape[0] - elif args and isinstance(args[0], torch.Tensor) and args[0].dim() > 0: # input_ids might be in args[0] - bs = args[0].shape[0] - elif ( - "input_ids" in kwargs and kwargs["input_ids"] is not None - ): # Should have been caught by current_input_ids - bs = kwargs["input_ids"].shape[0] - - kwargs["alora_offsets"] = [None] * bs + current_input_ids = None + + if current_input_ids is not None: + if current_input_ids.ndim == 1: + current_input_ids = current_input_ids.unsqueeze(0) + calculated_offsets = calculate_alora_offsets( + model.peft_config, + model.active_adapter, + current_input_ids, + adapter_names=adapter_names_for_offset_calc, + ) + for i in range(len(calculated_offsets)): + if calculated_offsets[i] is not None: + calculated_offsets[i] -= 1 + kwargs["alora_offsets"] = calculated_offsets + + else: + warnings.warn( + "Cannot calculate aLoRA offsets during generate as input_ids are not available. Disabling aLoRA." 
+ ) + bs = 1 + if "attention_mask" in kwargs and kwargs["attention_mask"] is not None: + bs = kwargs["attention_mask"].shape[0] + elif "inputs_embeds" in kwargs and kwargs["inputs_embeds"] is not None: + bs = kwargs["inputs_embeds"].shape[0] + elif args and isinstance(args[0], torch.Tensor) and args[0].dim() > 0: # input_ids might be in args[0] + bs = args[0].shape[0] + elif ( + "input_ids" in kwargs and kwargs["input_ids"] is not None + ): # Should have been caught by current_input_ids + bs = kwargs["input_ids"].shape[0] + + kwargs["alora_offsets"] = [None] * bs return kwargs From 1df3c9cfb86684a8b2c0380f64bd1db2662ec1c0 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 30 Jul 2025 14:12:25 -0400 Subject: [PATCH 43/99] Update variants.py --- src/peft/tuners/lora/variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index a303157ade..a04a35f758 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -663,5 +663,5 @@ def get_alora_offsets_for_generate(model: nn.module, *args, **kwargs): ): # Should have been caught by current_input_ids bs = kwargs["input_ids"].shape[0] - kwargs["alora_offsets"] = [None] * bs + kwargs["alora_offsets"] = None return kwargs From 6c129c0e1ca27b392b621666ce877828d38c832c Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 30 Jul 2025 14:15:16 -0400 Subject: [PATCH 44/99] Update testing_common.py --- tests/testing_common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/testing_common.py b/tests/testing_common.py index aff0de0faf..f0cb3c47ae 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -1039,8 +1039,7 @@ def _test_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): with torch.inference_mode(): output_mixed = model(**dummy_input)[0] logits_mixed = model.generate(**dummy_input, return_dict_in_generate=True, output_scores=True).scores[0] - # print(output_adapter0[1::3]) - # print(output_mixed[1::3]) + assert torch.allclose(output_base[::3], output_mixed[::3], atol=atol, rtol=rtol) assert torch.allclose(output_adapter0[1::3], output_mixed[1::3], atol=atol, rtol=rtol) assert torch.allclose(output_adapter1[2::3], output_mixed[2::3], atol=atol, rtol=rtol) From 9de9c18dc24ec12a2a43631325ca90191bf9ff97 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 30 Jul 2025 14:26:20 -0400 Subject: [PATCH 45/99] Update testing_common.py --- tests/testing_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testing_common.py b/tests/testing_common.py index f307cd97ec..083218d2b0 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -1053,6 +1053,8 @@ def _test_generate_with_mixed_adapter_batches_and_beam_search(self, model_id, co # adapter_names argument. See #2283. if config_cls not in (LoraConfig,): return pytest.skip(f"Mixed adapter batches not supported for {config_cls}") + if config_kwargs.get("alora_invocation_tokens") is not None: + return pytest.skip(f"Beam search not yet supported for aLoRA") # beam search not yet fully supported if config_kwargs.get("trainable_token_indices", None) is not None: # for some configurations this test will fail since the adapter values don't differ. # this is probably a problem with the test setup and not with the implementation. 
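Aside: the mixed-adapter-batch tests touched above route each row of a batch to a different adapter through the `adapter_names` argument, and the aLoRA offsets are then computed per row from that adapter's invocation tokens. A minimal sketch of that calling pattern follows; it assumes a `PeftModel` named `model` with a trained aLoRA adapter `"adapter0"` and a second adapter `"adapter1"` already loaded, plus a tokenized batch `inputs` with three rows that all contain the invocation tokens. These names and inputs are placeholders for illustration, not part of the patch.

```python
import torch

# One adapter name per row of the batch: base model, aLoRA adapter, second adapter.
adapter_names = ["__base__", "adapter0", "adapter1"]

with torch.inference_mode():
    # forward(): alora_offsets are derived internally from the last occurrence of
    # each adapter's invocation tokens in the corresponding row.
    logits = model(**inputs, adapter_names=adapter_names)[0]

    # generate(): same mechanism; beam search is not yet supported for aLoRA,
    # so keep the default num_beams=1 for batches that contain aLoRA rows.
    generated = model.generate(**inputs, adapter_names=adapter_names, max_new_tokens=20)
```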
@@ -1082,8 +1084,6 @@ def _test_generate_with_mixed_adapter_batches_and_beam_search(self, model_id, co # ensure that we have at least 3 samples for this test dummy_input = {k: torch.cat([v for _ in range(3)]) for k, v in dummy_input.items()} num_beams = 10 - if config_kwargs.get("alora_invocation_tokens") is not None: - num_beams = 1 # beam search not yet fully supported gen_kwargs = {**dummy_input, "max_length": 20, "num_beams": num_beams, "early_stopping": True} with torch.inference_mode(): with model.disable_adapter(): From 6b7242c36027a581765c83609535f032cd88e4ad Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 30 Jul 2025 14:30:06 -0400 Subject: [PATCH 46/99] Update config.py --- src/peft/tuners/lora/config.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 8cd7cc6294..87e6eb8a16 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -685,6 +685,9 @@ def __post_init__(self): if self.use_dora: raise ValueError("The argument lora_bias=True is not supported for DoRA, please pass use_dora=False") + if self.alora_invocation_tokens is not None and self.task_type is not "CAUSAL_LM": + raise ValueError("aLoRA is currently only supported for CAUSAL_LM task.") + # Using post training conversion of modified base weights to restore their initial values PiSSA/CorDA/OLoRA cannot # be correctly done when using rslora + rank_pattern/alpha_pattern. We can't really know if the user intends # this when they'll eventually call save_pretrained (i.e. if they'll pass From 21a4054fa6bba34522f8b8ec44c6910fd979a14a Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 30 Jul 2025 14:36:22 -0400 Subject: [PATCH 47/99] Update model.py --- src/peft/tuners/lora/model.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 76b3d2e181..90c6effbcc 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -481,6 +481,8 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): uses_beam_search = isinstance(num_beams, int) and (num_beams > 1) original_adapter_names = adapter_names[:] if uses_beam_search: + if alora_offsets is not None: + raise ValueError("Beam search not yet supported for aLoRA.") if not isinstance(adapter_names, (list, tuple)): raise TypeError(f"Got adapter names of type {type(adapter_names)}, expected a list of str.") # When there is beam search, the inputs are repeated n times, thus we repeat each adapter name n times and From c9fb0856aa2019b26eab4da05fd9875ee8ba3bb7 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 30 Jul 2025 14:40:00 -0400 Subject: [PATCH 48/99] Update peft_model.py --- src/peft/peft_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/peft/peft_model.py b/src/peft/peft_model.py index 7a7e3b53cf..b638d18a70 100644 --- a/src/peft/peft_model.py +++ b/src/peft/peft_model.py @@ -1831,7 +1831,7 @@ def forward( peft_config = self.active_peft_config if not peft_config.is_prompt_learning: - # For aLoRA + # Adds alora_offsets to kwargs if relevant. No other modifications. kwargs = get_alora_offsets_for_forward(self, input_ids, inputs_embeds, **kwargs) if self.base_model.config.model_type == "mpt": if inputs_embeds is not None: @@ -1972,6 +1972,7 @@ def generate(self, *args, **kwargs): self.base_model.generation_config = self.generation_config try: if not peft_config.is_prompt_learning: + # Adds alora_offsets to kwargs if relevant. No other changes. 
kwargs = get_alora_offsets_for_generate(self, *args, **kwargs) with self._enable_peft_forward_hooks(*args, **kwargs): kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args} From 3bde17de5d4396402d81c89b1083f8aca8c8f1df Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 30 Jul 2025 15:51:09 -0400 Subject: [PATCH 49/99] Update lora.md --- docs/source/developer_guides/lora.md | 44 ++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index df3f58ffb8..cd19776df5 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -173,6 +173,50 @@ from peft import LoraConfig config = LoraConfig(use_rslora=True, ...) ``` +### Activated LoRA (aLoRA) + +Activated LoRA (aLoRA) is a low rank adapter architecture for Causal LMs that allows for reusing existing base model KV cache for more efficient inference. This approach is best suited for inference pipelines which rely on the base model for most tasks/generations, but use aLoRA adapter(s) to perform specialized task(s) within the chain. For example, checking or correcting generated outputs of the base model. In these settings, inference times can be sped up by an order of magnitude or more. For more information on aLoRA and many example use cases, see https://huggingface.co/papers/2504.12397. + +This technique scans for the last occurence of an invocation sequence (`alora_invocation_tokens`) in each input (this can be as short as 1 token), and activates the adapter weights on tokens starting 1 token after the beginning of the invocation sequence. Weights on prior tokens are left un-adapted -- making the cache for those tokens interchangeable with base model cache due to the causal attention mask in Causal LMs. Usage is very similar to standard LoRA, with the key difference that this invocation sequence must be specified when the adapter is created: + +```py +from peft import LoraConfig + +config = LoraConfig(alora_invocation_tokens=alora_invocation_tokens, ...) +``` + +where `alora_invocation_tokens` is a list of integer token ids. Given a desired invocation string, this can be obtained as +``` +invocation_string = "placeholder" +alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False). +``` +where the tokenizer is the tokenizer for the base model. + +**Notes** +* aLoRA is only supported for CAUSAL_LM tasks due to its focus on cache reuse. +* Since the weights are adapted on fewer tokens, often (not always) aLoRA requires higher rank (`r`) than LoRA. `r=32` can be a good starting point. +* aLoRA weights cannot be merged into the base model by definition, since the adapter weights are selectively applied to a subset of tokens. Attempts to merge will throw errors. +* Beam search is not yet supported. +* It is generally not recommended to add new tokens to the tokenizer that are not present in the base model, as this can complicate the target use case of both the base model and adapter model operating on overlapping context. + +#### Choice of invocation sequence and SFT design + +Each input must have the `alora_invocation_tokens` sequence present, it is not added automatically. To maximize model performance without compromising cache reuse, it is recommended to have the adapter weights activated early, i.e. at the start of any adapter-specific prompting, but after any long inputs such as prior generations or documents. 
As with any model, +formatting should be consistent between train and test. + +Consider the following example, where the base model has a chat template, +and the goal it to train the adapter to generate a desired output. + +* Option 1: If there is no task-specific prompt, i.e. the input is a chat history with the `assistant` prompt, then the chat template's `assistant` prompt (e.g. `<|start_of_role|>assistant<|end_of_role|>`) is a natural choice for the invocation string. See the model's chat template to find the prompt for the model. +* Option 2: If there is a task-specific prompt for the adapter that describes the task the adapter is learning, and that prompt is put as a `user` turn immediately prior to the generation, then the chat template's `user` prompt (e.g. `<|start_of_role|>user<|end_of_role|>`) is a natural choice for the invocation string. + +Once deciding on an invocation string, get the model tokenizer and obtain `alora_invocation_tokens` as +``` +alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False). +``` + +**Note** If using custom strings for the invocation string, make sure that the start and end of the string are special tokens to avoid issues with tokenization at the boundaries. + ### Weight-Decomposed Low-Rank Adaptation (DoRA) From 0e5047566c962227ffabe115e46b44f0a48f3b47 Mon Sep 17 00:00:00 2001 From: Kristjan Greenewald Date: Tue, 5 Aug 2025 13:30:41 -0400 Subject: [PATCH 50/99] variants tests and example --- examples/alora_finetuning/README.md | 68 +++++ examples/alora_finetuning/alora_finetuning.py | 265 ++++++++++++++++++ src/peft/tuners/lora/variants.py | 2 +- tests/test_lora_variants.py | 66 ++++- 4 files changed, 399 insertions(+), 2 deletions(-) create mode 100644 examples/alora_finetuning/README.md create mode 100644 examples/alora_finetuning/alora_finetuning.py diff --git a/examples/alora_finetuning/README.md b/examples/alora_finetuning/README.md new file mode 100644 index 0000000000..49295a7884 --- /dev/null +++ b/examples/alora_finetuning/README.md @@ -0,0 +1,68 @@ +# Activated LoRA (aLoRA) + +## Introduction +Activated LoRA (aLoRA) is an adapter that selectively activates its weights only after a given invocation sequence, ensuring that hidden states match the base model prior to this point. This allows reusing the base model KVs (stored in the KV cache) for tokens before the invocation, +enabling much faster real-world inference (e.g. vLLM) when switching between generation with the base model and generation with adapters. +See the [paper](https://huggingface.co/papers/2504.12397) for more details. 
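The Introduction notes that tokens before the invocation sequence produce the same hidden states, and therefore the same KV cache, as the base model, so a cache prefilled by the base model can be handed directly to the aLoRA adapter. Before the training quick start, here is a minimal inference sketch of that reuse pattern, condensed from the `model_inference` helper added later in this patch; `base_model`, `tokenizer`, the `"alora-adapter"` path and the prompt are placeholders.

```python
import copy

import torch
from transformers import DynamicCache

from peft import PeftModel

# Load a trained aLoRA adapter on top of an already-instantiated base model.
alora_model = PeftModel.from_pretrained(base_model, "alora-adapter", adapter_name="adapter")
inputs = tokenizer(prompt_text, return_tensors="pt").to(base_model.device)

# Prefill the KV cache with the adapter disabled, i.e. with pure base-model weights.
alora_model.set_adapter(None)
kv_cache = DynamicCache()
with torch.no_grad():
    kv_cache = alora_model(**inputs, past_key_values=kv_cache).past_key_values

# Switch the adapter on and generate, reusing the base-model cache.
alora_model.set_adapter("adapter")
output = alora_model.generate(**inputs, past_key_values=copy.deepcopy(kv_cache))
print(tokenizer.decode(output[0]))
```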
+ +## Quick start +```python +import torch +from peft import LoraConfig, get_peft_model +from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer +from datasets import load_dataset + +model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3", device_map="cuda") +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3") +dataset = load_dataset("Lots-of-LoRAs/task1660_super_glue_question_generation", split="train") + +invocation_string = "[/INST]" # End of user turn in Mistral chat template +invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False) + +lora_config = LoraConfig( + alora_invocation_tokens=invocation_tokens, + r=32, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], +) + +peft_model = get_peft_model(model, lora_config) +trainer = Trainer( + model=peft_model, + train_dataset=dataset, + dataset_text_field="text", + max_seq_length=2048, + tokenizer=tokenizer, +) +trainer.train() +peft_model.save_pretrained("alora-mistral-7b") +``` + +Pass an invocation string with `--invocation_string` when running the example +script: +```bash +python examples/alora_finetuning/alora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --data_path timdettmers/openassistant-guanaco --invocation_string "<|start_of_turn|>assistant" +``` + +### Full example of the script +```bash +python alora_finetuning.py \ + --base_model "PATH_TO_MODEL" \ + --data_path "PATH_TO_DATASET" \ + --output_dir "PATH_TO_OUTPUT_DIR" \ + --batch_size 1 \ + --num_epochs 3 \ + --learning_rate 3e-4 \ + --cutoff_len 512 \ + --val_set_size 500 \ + --invocation_string "<|start_of_turn|>assistant" \ + --quantize \ + --eval_step 10 \ + --save_step 100 \ + --device "cuda:0" \ + --lora_r 32 \ + --lora_alpha 32 \ + --lora_dropout 0.05 \ + --lora_target_modules "q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj" \ + --hub_model_id "YOUR_HF_REPO" \ + --push_to_hub +``` \ No newline at end of file diff --git a/examples/alora_finetuning/alora_finetuning.py b/examples/alora_finetuning/alora_finetuning.py new file mode 100644 index 0000000000..d657012c95 --- /dev/null +++ b/examples/alora_finetuning/alora_finetuning.py @@ -0,0 +1,265 @@ +import os, copy + +import torch +from datasets import load_dataset +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, + BitsAndBytesConfig, + DataCollatorForCompletionOnlyLM, + DynamicCache, + Trainer, + TrainingArguments, +) + +from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel + + + +def train_model( + base_model: str, + data_path: str, + output_dir: str, + batch_size: int, + num_epochs: int, + learning_rate: float, + cutoff_len: int, + val_set_size: int, + invocation_string: str, + quantize: bool, + eval_step: int, + save_step: int, + device: str, + lora_r: int, + lora_alpha: int, + lora_dropout: float, + lora_target_modules: str, + hub_model_id: str, + push_to_hub: bool, +): + os.environ["TOKENIZERS_PARALLELISM"] = "false" + hf_token = os.getenv("HF_TOKEN") + + device = torch.device(device) + print(f"Using device: {device}") + + tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token) + + invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False) + + if quantize: + model = AutoModelForCausalLM.from_pretrained( + base_model, + token=hf_token, + quantization_config=BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=( + torch.bfloat16 if torch.cuda.is_available() and 
torch.cuda.is_bf16_supported() else torch.float16 + ), + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ), + ) + model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True) + else: + model = AutoModelForCausalLM.from_pretrained(base_model, token=hf_token) + + lora_config = LoraConfig( + alora_invocation_tokens=invocation_tokens, + r=lora_r, + lora_alpha=lora_alpha, + target_modules=( + lora_target_modules.split(",") + if lora_target_modules + else ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] + ), + lora_dropout=lora_dropout, + bias="none", + ) + + model = get_peft_model(model, lora_config) + + model.to(device) + tokenizer.pad_token = tokenizer.eos_token + + dataset = load_dataset(data_path) + + def formatting_prompts_func(example): + output_texts = [] + for i in range(len(example['input'])): + chat = [{ + "role": "user", + "content": example['input'][i] + }, + { + "role": "assistant", + "content": example['output'][i] + }] + text = tokenizer.apply_chat_template(chat, tokenize=False,add_generation_prompt=False) + output_texts.append(text) + return output_texts + + data_collator = DataCollatorForCompletionOnlyLM(invocation_string, tokenizer=tokenizer) + + training_args = TrainingArguments( + output_dir=output_dir, + num_train_epochs=num_epochs, + per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size, + warmup_steps=100, + weight_decay=0.01, + logging_dir="./logs", + logging_steps=eval_step, + save_steps=save_step, + save_total_limit=2, + push_to_hub=push_to_hub, + hub_model_id=hub_model_id, + gradient_accumulation_steps=16, + fp16=True, + learning_rate=learning_rate, + hub_token=hf_token, + ) + + torch.cuda.empty_cache() + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset["train"], + eval_dataset=dataset["test"], + formatting_func=formatting_prompts_func, + data_collator=data_collator, + ) + + trainer.train() + + if push_to_hub: + trainer.push_to_hub(commit_message="Fine-tuned model") + + model.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) + +def model_inference(model_path: str, adapter_path: str, prompt: str=None, data_path: str=None, reuse_cache: bool = True): + ''' + Simple inference with the tuned aLoRA adapter. Optionally (reuse_cache = True) demonstrates + that the aLoRA adapter can (but does not need to) use KV cache created by the base model, + perhaps during a prior generation turn. + + Purely for demonstration purposes. See the [paper](https://huggingface.co/papers/2504.12397) + for realistic multiturn cache reuse examples. 
+ ''' + if prompt is None: + # Use first row of test data + dataset = load_dataset(data_path) + prompt = dataset["test"][0]["input"] + + tokenizer = AutoTokenizer.from_pretrained(adapter_path) + base_model = AutoModelForCausalLM.from_pretrained(model_path) + alora_model = PeftModel.from_pretrained(base_model, adapter_path,adapter_name="adapter") + + chat = [{ + "role": "user", + "content": prompt + }] + text = tokenizer.apply_chat_template(chat, tokenize=False,add_generation_prompt=True) + inputs = tokenizer(text, return_tensors="pt").to(base_model.device) + + if reuse_cache: + # Input through the end of the last turn + text_input = tokenizer.apply_chat_template(chat, tokenize=False,add_generation_prompt=False) + alora_model.set_adapter(None) + kv_cache = DynamicCache() + inputs_prefill = tokenizer(text_input,return_tensors="pt").to(base_model.device) + # prefill input with base model + with torch.no_grad(): + kv_cache = alora_model(**inputs_prefill, past_key_values=kv_cache).past_key_values + + # Generate answer with adapter + alora_model.set_adapter("adapter") + output_dict = alora_model.generate(**inputs,past_key_values=copy.deepcopy(kv_cache),return_dict_in_generate=True) + alora_outputs = output_dict.sequences + + # Generate answer with base model for comparison + alora_model.set_adapter(None) + output_dict = alora_model.generate(**inputs,past_key_values=copy.deepcopy(kv_cache),return_dict_in_generate=True) + base_outputs = output_dict.sequences + else: + # Simpler inference calls (output equivalent to the above) + # Generate answer with adapter + alora_model.set_adapter("adapter") + output_dict = alora_model.generate(**inputs,return_dict_in_generate=True) + alora_outputs = output_dict.sequences + + # Generate answer with base model for comparison + alora_model.set_adapter(None) + output_dict = alora_model.generate(**inputs,return_dict_in_generate=True) + base_outputs = output_dict.sequences + # Print results + print(f"Prompt: {text}") + print(f"Base model response: {tokenizer.decode(base_outputs[0]).rsplit(text,1)[1]}") + print(f"Trained adapter response: {tokenizer.decode(alora_outputs[0]).rsplit(text,1)[1]}") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Fine-tune Mistral with Activated LoRA") + parser.add_argument("--base_model", type=str, default="mistralai/Mistral-7B-Instruct-v0.3", help="Base model path or name") + parser.add_argument( + "--data_path", type=str, default="timdettmers/openassistant-guanaco", help="Dataset path or name" + ) + parser.add_argument( + "--output_dir", type=str, default="path/to/output", help="Output directory for the fine-tuned model" + ) + parser.add_argument("--batch_size", type=int, default=1, help="Batch size") + parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs") + parser.add_argument("--learning_rate", type=float, default=3e-4, help="Learning rate") + parser.add_argument("--cutoff_len", type=int, default=512, help="Cutoff length for tokenization") + parser.add_argument("--val_set_size", type=int, default=500, help="Validation set size") + parser.add_argument( + "--invocation_string", + type=str, + default="[/INST]", + help="String that activates the aLoRA adapter. 
Model dependent.", + ) + parser.add_argument("--quantize", action="store_true", help="Use quantization") + parser.add_argument("--eval_step", type=int, default=10, help="Evaluation step interval") + parser.add_argument("--save_step", type=int, default=100, help="Save step interval") + parser.add_argument("--device", type=str, default="cuda:0", help="Device to use for training") + parser.add_argument("--lora_r", type=int, default=32, help="LoRA rank") + parser.add_argument("--lora_alpha", type=int, default=32, help="LoRA alpha") + parser.add_argument("--lora_dropout", type=float, default=0.05, help="LoRA dropout rate") + parser.add_argument( + "--lora_target_modules", type=str, default=None, help="Comma-separated list of target modules for LoRA" + ) + parser.add_argument( + "--hub_model_id", + type=str, + default="path/to/repo", + help="Repository name to push the model on the Hugging Face Hub", + ) + parser.add_argument("--push_to_hub", action="store_true", help="Whether to push the model to Hugging Face Hub") + args = parser.parse_args() + train_model( + base_model=args.base_model, + data_path=args.data_path, + output_dir=args.output_dir, + batch_size=args.batch_size, + num_epochs=args.num_epochs, + learning_rate=args.learning_rate, + cutoff_len=args.cutoff_len, + val_set_size=args.val_set_size, + invocation_string=args.invocation_string, + quantize=args.quantize, + eval_step=args.eval_step, + save_step=args.save_step, + device=args.device, + lora_r=args.lora_r, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout, + lora_target_modules=args.lora_target_modules, + hub_model_id=args.hub_model_id, + push_to_hub=args.push_to_hub, + ) + print("Model trained. Running test inference.") + model_inference(model_path = args.base_model, adapter_path = args.output_dir, data_path=args.data_path, reuse_cache = True) \ No newline at end of file diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 5e86bbc324..c31a576c3b 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -594,7 +594,7 @@ def get_alora_offsets_for_forward(model: PeftModel, input_ids: torch.Tensor, inp -def get_alora_offsets_for_generate(model: PeftModel, *args, **kwargs): +def get_alora_offsets_for_generate(model, *args, **kwargs): adapter_names_for_offset_calc = kwargs.get("adapter_names") if is_alora_relevant_in_batch(model, adapter_names_for_offset_calc): alora_offsets_from_kwargs = kwargs.get("alora_offsets") diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index 9112400473..12336e1469 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import copy import pytest import torch from torch import nn @@ -26,9 +27,12 @@ DoraConv2dVariant, DoraEmbeddingVariant, DoraLinearVariant, + calculate_alora_offsets, + get_alora_offsets_for_forward, + get_alora_offsets_for_generate, ) - +# Used for Dora class CustomModel(nn.Module): """pytorch module that contains common targetable layers (linear, embedding, conv, ...)""" @@ -60,6 +64,16 @@ def forward(self, input_ids, dummy_image_input): output = self.linear2(output) return output +# Used for testing alora_offsets for aLoRA +class DummyLM(nn.Module): + def __init__(self, vocab_size: int = 10, hidden_dim: int = 8): + super().__init__() + self.embed = nn.Embedding(vocab_size, hidden_dim) + self.linear = nn.Linear(hidden_dim, vocab_size) + + def forward(self, input_ids): + hidden = self.embed(input_ids) + return self.linear(hidden) VARIANT_MAP = { "dora": { @@ -124,3 +138,53 @@ def test_dora_params_have_gradients(self): for layer in layer_names: assert getattr(peft_model.base_model.model, layer).lora_magnitude_vector["default"].weight.grad is not None + +# Make sure warning is sent when invocation sequence is not present +def test_calculate_alora_offsets_basic_and_warning(): + config = LoraConfig(task_type="CAUSAL_LM", alora_invocation_tokens=[1, 2]) + peft_config = {"default": config} + input_ids = torch.tensor([[0, 1, 2, 3], [0, 4, 5, 6]]) + + # second row lacks invocation sequence -> warning and None offset + with pytest.warns(UserWarning): + offsets = calculate_alora_offsets(peft_config, "default", input_ids) + + assert offsets[0] == input_ids.shape[1] - 2 + 1 + assert offsets[1] is None + +# Verify alora_offsets are correct with multiple adapters +def test_calculate_alora_offsets_with_adapter_names(): + cfg1 = LoraConfig(task_type="CAUSAL_LM", alora_invocation_tokens=[1]) + cfg2 = LoraConfig(task_type="CAUSAL_LM", alora_invocation_tokens=[2]) + peft_config = {"a1": cfg1, "a2": cfg2} + input_ids = torch.tensor([[0, 1, 1], [0, 2, 2]]) + + offsets = calculate_alora_offsets(peft_config, "a1", input_ids, adapter_names=["a1", "a2"]) + + assert offsets == [input_ids.shape[1] - 1 + 1, input_ids.shape[1] - 2 + 1] + +# Make sure that attempting to pass in embeddings rather than token ids gives a warning +def test_get_alora_offsets_forward_inputs_embeds_warning(): + cfg = LoraConfig(task_type="CAUSAL_LM", alora_invocation_tokens=[1]) + model = get_peft_model(DummyLM(), cfg) + embeds = model.base_model.model.embed(torch.tensor([[1, 2, 3]])) + + with pytest.warns(UserWarning): + kwargs = get_alora_offsets_for_forward(model, None, embeds) + assert kwargs["alora_offsets"] == [None] + +# Verify that the adapter does not modify outputs prior to invocation point +def test_alora_activation_matches_base_until_invocation(): + base_model = DummyLM() + cfg = LoraConfig(task_type="CAUSAL_LM", target_modules=["linear"], alora_invocation_tokens=[1]) + lora_model = get_peft_model(copy.deepcopy(base_model), cfg) + + input_ids = torch.tensor([[0, 1, 2, 3]]) + base_out = base_model(input_ids) + offsets = calculate_alora_offsets(lora_model.peft_config, lora_model.active_adapter, input_ids) + lora_out = lora_model(input_ids, alora_offsets=offsets) + + start = input_ids.shape[1] - offsets[0] + assert torch.allclose(lora_out[:, :start], base_out[:, :start]) + assert not torch.allclose(lora_out[:, start:], base_out[:, start:]) + From 0a24c72da7e3627b71c044b9d040008b4a597aaa Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Tue, 5 Aug 2025 21:03:29 +0000 Subject: [PATCH 51/99] fixes 
--- examples/alora_finetuning/alora_finetuning.py | 12 ++-- method_comparison/MetaMathQA/run.py | 6 +- method_comparison/MetaMathQA/utils.py | 3 +- src/peft/tuners/lora/config.py | 4 +- src/peft/tuners/lora/variants.py | 4 +- tests/test_lora_variants.py | 67 +++++++++++-------- tests/testing_common.py | 2 +- 7 files changed, 55 insertions(+), 43 deletions(-) diff --git a/examples/alora_finetuning/alora_finetuning.py b/examples/alora_finetuning/alora_finetuning.py index d657012c95..1eda77a435 100644 --- a/examples/alora_finetuning/alora_finetuning.py +++ b/examples/alora_finetuning/alora_finetuning.py @@ -1,4 +1,5 @@ -import os, copy +import copy +import os import torch from datasets import load_dataset @@ -12,8 +13,7 @@ TrainingArguments, ) -from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel - +from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training def train_model( @@ -83,7 +83,7 @@ def train_model( tokenizer.pad_token = tokenizer.eos_token dataset = load_dataset(data_path) - + def formatting_prompts_func(example): output_texts = [] for i in range(len(example['input'])): @@ -172,7 +172,7 @@ def model_inference(model_path: str, adapter_path: str, prompt: str=None, data_p inputs_prefill = tokenizer(text_input,return_tensors="pt").to(base_model.device) # prefill input with base model with torch.no_grad(): - kv_cache = alora_model(**inputs_prefill, past_key_values=kv_cache).past_key_values + kv_cache = alora_model(**inputs_prefill, past_key_values=kv_cache).past_key_values # Generate answer with adapter alora_model.set_adapter("adapter") @@ -262,4 +262,4 @@ def model_inference(model_path: str, adapter_path: str, prompt: str=None, data_p push_to_hub=args.push_to_hub, ) print("Model trained. 
Running test inference.") - model_inference(model_path = args.base_model, adapter_path = args.output_dir, data_path=args.data_path, reuse_cache = True) \ No newline at end of file + model_inference(model_path = args.base_model, adapter_path = args.output_dir, data_path=args.data_path, reuse_cache = True) diff --git a/method_comparison/MetaMathQA/run.py b/method_comparison/MetaMathQA/run.py index 0d159220eb..c03c801f5c 100644 --- a/method_comparison/MetaMathQA/run.py +++ b/method_comparison/MetaMathQA/run.py @@ -25,11 +25,12 @@ import sys import textwrap import time -from contextlib import AbstractContextManager, nullcontext +from contextlib import nullcontext from functools import partial from typing import Any, Callable, Literal, Optional import torch +from data import get_train_valid_test_datasets from torch import nn from torch.amp import GradScaler, autocast from tqdm import tqdm @@ -53,9 +54,8 @@ validate_experiment_path, ) -from data import get_train_valid_test_datasets from peft import AdaLoraConfig, PeftConfig -from peft.utils import infer_device, CONFIG_NAME +from peft.utils import CONFIG_NAME, infer_device # # suppress all warnings diff --git a/method_comparison/MetaMathQA/utils.py b/method_comparison/MetaMathQA/utils.py index d48a301b35..531554a6a0 100644 --- a/method_comparison/MetaMathQA/utils.py +++ b/method_comparison/MetaMathQA/utils.py @@ -44,7 +44,8 @@ import peft from peft import PeftConfig, get_peft_model, prepare_model_for_kbit_training from peft.optimizers import create_lorafa_optimizer, create_loraplus_optimizer -from peft.utils import infer_device, SAFETENSORS_WEIGHTS_NAME +from peft.utils import SAFETENSORS_WEIGHTS_NAME, infer_device + device = infer_device() diff --git a/src/peft/tuners/lora/config.py b/src/peft/tuners/lora/config.py index 87e6eb8a16..742e414040 100644 --- a/src/peft/tuners/lora/config.py +++ b/src/peft/tuners/lora/config.py @@ -685,8 +685,8 @@ def __post_init__(self): if self.use_dora: raise ValueError("The argument lora_bias=True is not supported for DoRA, please pass use_dora=False") - if self.alora_invocation_tokens is not None and self.task_type is not "CAUSAL_LM": - raise ValueError("aLoRA is currently only supported for CAUSAL_LM task.") + if self.alora_invocation_tokens is not None and self.task_type != "CAUSAL_LM": + warnings.warn("aLoRA is currently only supported for CAUSAL_LM task.") # Using post training conversion of modified base weights to restore their initial values PiSSA/CorDA/OLoRA cannot # be correctly done when using rslora + rank_pattern/alpha_pattern. We can't really know if the user intends diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index a04a35f758..1d84a3b7a4 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -479,7 +479,7 @@ def forward( result = result + lora_B(lora_A(dropout(x))) * scaling else: #Typical regime for i in range(result.shape[0]): - # If alora_offsets[i] is None, this means that the invocation sequence was not found in the + # If alora_offsets[i] is None, this means that the invocation sequence was not found in the # input. As a result, the weights should not be activated anywhere (equivalent to base model). 
if alora_offsets[i] is not None and alora_offsets[i] > 0: offset = min(alora_offsets[i], result.shape[1]) @@ -587,7 +587,7 @@ def is_alora_relevant_in_batch(model: nn.Module, adapter_names: Optional[list[st return is_alora_relevant -def get_alora_offsets_for_forward(model: nn.Module, input_ids: torch.Tensor, inputs_embeds: torch.Tensor, **kwargs): +def get_alora_offsets_for_forward(model: nn.Module, input_ids: torch.Tensor, inputs_embeds: torch.Tensor = None, **kwargs): """ Wrapper around calculate_alora_offsets, for the .forward of the model. It only calculates alora_offsets if the batch contains aLoRA adapters. diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index 12336e1469..c58ed1bc6e 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import copy import pytest import torch from torch import nn @@ -29,9 +28,9 @@ DoraLinearVariant, calculate_alora_offsets, get_alora_offsets_for_forward, - get_alora_offsets_for_generate, ) + # Used for Dora class CustomModel(nn.Module): """pytorch module that contains common targetable layers (linear, embedding, conv, ...)""" @@ -64,17 +63,34 @@ def forward(self, input_ids, dummy_image_input): output = self.linear2(output) return output -# Used for testing alora_offsets for aLoRA +# Used for testing alora_offsets for aLoRA class DummyLM(nn.Module): def __init__(self, vocab_size: int = 10, hidden_dim: int = 8): super().__init__() self.embed = nn.Embedding(vocab_size, hidden_dim) self.linear = nn.Linear(hidden_dim, vocab_size) - def forward(self, input_ids): - hidden = self.embed(input_ids) + def forward(self, X): + hidden = self.embed(X) return self.linear(hidden) +class MockTransformerWrapper: + """Mock class to behave like a transformers model. + + This is needed because the tests initialize the model by calling transformers_class.from_pretrained. 
+ + """ + + @classmethod + def from_pretrained(cls): + # set the seed so that from_pretrained always returns the same model + torch.manual_seed(0) + + torch_dtype = torch.float32 + + return DummyLM().to(torch_dtype) + + VARIANT_MAP = { "dora": { LoraLinear: DoraLinearVariant, @@ -141,7 +157,7 @@ def test_dora_params_have_gradients(self): # Make sure warning is sent when invocation sequence is not present def test_calculate_alora_offsets_basic_and_warning(): - config = LoraConfig(task_type="CAUSAL_LM", alora_invocation_tokens=[1, 2]) + config = LoraConfig(alora_invocation_tokens=[1, 2]) peft_config = {"default": config} input_ids = torch.tensor([[0, 1, 2, 3], [0, 4, 5, 6]]) @@ -149,42 +165,37 @@ def test_calculate_alora_offsets_basic_and_warning(): with pytest.warns(UserWarning): offsets = calculate_alora_offsets(peft_config, "default", input_ids) - assert offsets[0] == input_ids.shape[1] - 2 + 1 + assert offsets[0] == input_ids.shape[1] - 1 + 1 assert offsets[1] is None # Verify alora_offsets are correct with multiple adapters def test_calculate_alora_offsets_with_adapter_names(): - cfg1 = LoraConfig(task_type="CAUSAL_LM", alora_invocation_tokens=[1]) - cfg2 = LoraConfig(task_type="CAUSAL_LM", alora_invocation_tokens=[2]) + cfg1 = LoraConfig(alora_invocation_tokens=[1]) + cfg2 = LoraConfig(alora_invocation_tokens=[2]) peft_config = {"a1": cfg1, "a2": cfg2} input_ids = torch.tensor([[0, 1, 1], [0, 2, 2]]) offsets = calculate_alora_offsets(peft_config, "a1", input_ids, adapter_names=["a1", "a2"]) - assert offsets == [input_ids.shape[1] - 1 + 1, input_ids.shape[1] - 2 + 1] - -# Make sure that attempting to pass in embeddings rather than token ids gives a warning -def test_get_alora_offsets_forward_inputs_embeds_warning(): - cfg = LoraConfig(task_type="CAUSAL_LM", alora_invocation_tokens=[1]) - model = get_peft_model(DummyLM(), cfg) - embeds = model.base_model.model.embed(torch.tensor([[1, 2, 3]])) - - with pytest.warns(UserWarning): - kwargs = get_alora_offsets_for_forward(model, None, embeds) - assert kwargs["alora_offsets"] == [None] + assert offsets == [input_ids.shape[1] - 2 + 1, input_ids.shape[1] - 2 + 1] # Verify that the adapter does not modify outputs prior to invocation point def test_alora_activation_matches_base_until_invocation(): - base_model = DummyLM() - cfg = LoraConfig(task_type="CAUSAL_LM", target_modules=["linear"], alora_invocation_tokens=[1]) - lora_model = get_peft_model(copy.deepcopy(base_model), cfg) + transformers_class = MockTransformerWrapper + base_model = transformers_class.from_pretrained() + cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2],init_lora_weights = False) + lora_model = get_peft_model(base_model, cfg) + lora_model.eval() input_ids = torch.tensor([[0, 1, 2, 3]]) - base_out = base_model(input_ids) - offsets = calculate_alora_offsets(lora_model.peft_config, lora_model.active_adapter, input_ids) - lora_out = lora_model(input_ids, alora_offsets=offsets) - - start = input_ids.shape[1] - offsets[0] + with lora_model.disable_adapter(): + with torch.no_grad(): + base_out = lora_model(X = input_ids) + + kwargs = get_alora_offsets_for_forward(lora_model, input_ids) + with torch.no_grad(): + lora_out = lora_model(X = input_ids,**kwargs) + start = input_ids.shape[1] - kwargs['alora_offsets'][0] assert torch.allclose(lora_out[:, :start], base_out[:, :start]) assert not torch.allclose(lora_out[:, start:], base_out[:, start:]) diff --git a/tests/testing_common.py b/tests/testing_common.py index 2dec675d5c..d197851d54 100644 --- 
a/tests/testing_common.py +++ b/tests/testing_common.py @@ -1010,7 +1010,7 @@ def _test_generate_with_mixed_adapter_batches_and_beam_search(self, model_id, co if config_cls not in (LoraConfig,): return pytest.skip(f"Mixed adapter batches not supported for {config_cls}") if config_kwargs.get("alora_invocation_tokens") is not None: - return pytest.skip(f"Beam search not yet supported for aLoRA") # beam search not yet fully supported + return pytest.skip("Beam search not yet supported for aLoRA") # beam search not yet fully supported if config_kwargs.get("trainable_token_indices", None) is not None: # for some configurations this test will fail since the adapter values don't differ. # this is probably a problem with the test setup and not with the implementation. From 6fe25db8f892bc7c01332d032a3b457b56dff1d5 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Tue, 5 Aug 2025 21:05:03 +0000 Subject: [PATCH 52/99] amend --- method_comparison/MetaMathQA/run.py | 6 +++--- method_comparison/MetaMathQA/utils.py | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/method_comparison/MetaMathQA/run.py b/method_comparison/MetaMathQA/run.py index c03c801f5c..0d159220eb 100644 --- a/method_comparison/MetaMathQA/run.py +++ b/method_comparison/MetaMathQA/run.py @@ -25,12 +25,11 @@ import sys import textwrap import time -from contextlib import nullcontext +from contextlib import AbstractContextManager, nullcontext from functools import partial from typing import Any, Callable, Literal, Optional import torch -from data import get_train_valid_test_datasets from torch import nn from torch.amp import GradScaler, autocast from tqdm import tqdm @@ -54,8 +53,9 @@ validate_experiment_path, ) +from data import get_train_valid_test_datasets from peft import AdaLoraConfig, PeftConfig -from peft.utils import CONFIG_NAME, infer_device +from peft.utils import infer_device, CONFIG_NAME # # suppress all warnings diff --git a/method_comparison/MetaMathQA/utils.py b/method_comparison/MetaMathQA/utils.py index 531554a6a0..d48a301b35 100644 --- a/method_comparison/MetaMathQA/utils.py +++ b/method_comparison/MetaMathQA/utils.py @@ -44,8 +44,7 @@ import peft from peft import PeftConfig, get_peft_model, prepare_model_for_kbit_training from peft.optimizers import create_lorafa_optimizer, create_loraplus_optimizer -from peft.utils import SAFETENSORS_WEIGHTS_NAME, infer_device - +from peft.utils import infer_device, SAFETENSORS_WEIGHTS_NAME device = infer_device() From 06bf2a2937274b5dd22bb6f535a74328251b5420 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Wed, 6 Aug 2025 03:46:07 +0000 Subject: [PATCH 53/99] new changes --- examples/alora_finetuning/README.md | 8 +- examples/alora_finetuning/alora_finetuning.py | 142 ++++++++---------- src/peft/tuners/lora/bnb.py | 87 ++++++++--- src/peft/tuners/lora/layer.py | 2 +- src/peft/tuners/lora/variants.py | 30 ++-- tests/test_lora_variants.py | 14 +- tests/testing_common.py | 2 +- 7 files changed, 163 insertions(+), 122 deletions(-) diff --git a/examples/alora_finetuning/README.md b/examples/alora_finetuning/README.md index 49295a7884..2947ffbc98 100644 --- a/examples/alora_finetuning/README.md +++ b/examples/alora_finetuning/README.md @@ -37,10 +37,10 @@ trainer.train() peft_model.save_pretrained("alora-mistral-7b") ``` -Pass an invocation string with `--invocation_string` when running the example +Pass the invocation string with `--invocation_string` when running the training example script: 
```bash -python examples/alora_finetuning/alora_finetuning.py --base_model meta-llama/Meta-Llama-3-8B --data_path timdettmers/openassistant-guanaco --invocation_string "<|start_of_turn|>assistant" +python examples/alora_finetuning/alora_finetuning.py --base_model mistralai/Mistral-7B-Instruct-v0.3 --data_path Lots-of-LoRAs/task1660_super_glue_question_generation --invocation_string "[/INST]" ``` ### Full example of the script @@ -54,7 +54,7 @@ python alora_finetuning.py \ --learning_rate 3e-4 \ --cutoff_len 512 \ --val_set_size 500 \ - --invocation_string "<|start_of_turn|>assistant" \ + --invocation_string "[/INST]" \ --quantize \ --eval_step 10 \ --save_step 100 \ @@ -65,4 +65,4 @@ python alora_finetuning.py \ --lora_target_modules "q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj" \ --hub_model_id "YOUR_HF_REPO" \ --push_to_hub -``` \ No newline at end of file +``` diff --git a/examples/alora_finetuning/alora_finetuning.py b/examples/alora_finetuning/alora_finetuning.py index 1eda77a435..41468278bf 100644 --- a/examples/alora_finetuning/alora_finetuning.py +++ b/examples/alora_finetuning/alora_finetuning.py @@ -1,4 +1,3 @@ -import copy import os import torch @@ -7,8 +6,7 @@ AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, - DataCollatorForCompletionOnlyLM, - DynamicCache, + DataCollatorForLanguageModeling, Trainer, TrainingArguments, ) @@ -44,7 +42,7 @@ def train_model( print(f"Using device: {device}") tokenizer = AutoTokenizer.from_pretrained(base_model, token=hf_token) - + tokenizer.pad_token = tokenizer.unk_token invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False) if quantize: @@ -65,14 +63,11 @@ def train_model( model = AutoModelForCausalLM.from_pretrained(base_model, token=hf_token) lora_config = LoraConfig( + task_type="CAUSAL_LM", alora_invocation_tokens=invocation_tokens, r=lora_r, lora_alpha=lora_alpha, - target_modules=( - lora_target_modules.split(",") - if lora_target_modules - else ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"] - ), + target_modules=(lora_target_modules.split(",") if lora_target_modules else ["q_proj", "k_proj", "v_proj"]), lora_dropout=lora_dropout, bias="none", ) @@ -84,22 +79,39 @@ def train_model( dataset = load_dataset(data_path) - def formatting_prompts_func(example): - output_texts = [] - for i in range(len(example['input'])): - chat = [{ - "role": "user", - "content": example['input'][i] - }, - { - "role": "assistant", - "content": example['output'][i] - }] - text = tokenizer.apply_chat_template(chat, tokenize=False,add_generation_prompt=False) - output_texts.append(text) - return output_texts - - data_collator = DataCollatorForCompletionOnlyLM(invocation_string, tokenizer=tokenizer) + def tokenize_function(examples): + formatted_texts = [ + tokenizer.apply_chat_template( + [ + {"role": "user", "content": user_msg}, + {"role": "assistant", "content": assistant_msg}, + ], + tokenize=False, # get plain text first + add_generation_prompt=False, + ) + for user_msg, assistant_msg in zip(examples["input"], examples["output"]) + ] + + # 2) Tokenize those texts + model_inputs = tokenizer( + formatted_texts, + padding="max_length", + truncation=True, + max_length=cutoff_len, + ) + + labels = [] + for ids in model_inputs["input_ids"]: + labels.append([(token_id if token_id != tokenizer.pad_token_id else -100) for token_id in ids]) + model_inputs["labels"] = labels + + return model_inputs + + # Tokenize the dataset and prepare for training + tokenized_datasets = 
dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names) + + # Data collator to dynamically pad the batched examples + data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) training_args = TrainingArguments( output_dir=output_dir, @@ -125,9 +137,8 @@ def formatting_prompts_func(example): trainer = Trainer( model=model, args=training_args, - train_dataset=dataset["train"], - eval_dataset=dataset["test"], - formatting_func=formatting_prompts_func, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["test"], data_collator=data_collator, ) @@ -139,82 +150,57 @@ def formatting_prompts_func(example): model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) -def model_inference(model_path: str, adapter_path: str, prompt: str=None, data_path: str=None, reuse_cache: bool = True): - ''' + +def model_inference(model_path: str, adapter_path: str, prompt: str = None, data_path: str = None): + """ Simple inference with the tuned aLoRA adapter. Optionally (reuse_cache = True) demonstrates that the aLoRA adapter can (but does not need to) use KV cache created by the base model, perhaps during a prior generation turn. Purely for demonstration purposes. See the [paper](https://huggingface.co/papers/2504.12397) for realistic multiturn cache reuse examples. - ''' + """ if prompt is None: # Use first row of test data dataset = load_dataset(data_path) prompt = dataset["test"][0]["input"] - - tokenizer = AutoTokenizer.from_pretrained(adapter_path) + tokenizer = AutoTokenizer.from_pretrained(model_path) base_model = AutoModelForCausalLM.from_pretrained(model_path) - alora_model = PeftModel.from_pretrained(base_model, adapter_path,adapter_name="adapter") - - chat = [{ - "role": "user", - "content": prompt - }] - text = tokenizer.apply_chat_template(chat, tokenize=False,add_generation_prompt=True) + alora_model = PeftModel.from_pretrained(base_model, adapter_path) + chat = [{"role": "user", "content": prompt}] + text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True) inputs = tokenizer(text, return_tensors="pt").to(base_model.device) - if reuse_cache: - # Input through the end of the last turn - text_input = tokenizer.apply_chat_template(chat, tokenize=False,add_generation_prompt=False) - alora_model.set_adapter(None) - kv_cache = DynamicCache() - inputs_prefill = tokenizer(text_input,return_tensors="pt").to(base_model.device) - # prefill input with base model - with torch.no_grad(): - kv_cache = alora_model(**inputs_prefill, past_key_values=kv_cache).past_key_values - - # Generate answer with adapter - alora_model.set_adapter("adapter") - output_dict = alora_model.generate(**inputs,past_key_values=copy.deepcopy(kv_cache),return_dict_in_generate=True) - alora_outputs = output_dict.sequences - - # Generate answer with base model for comparison - alora_model.set_adapter(None) - output_dict = alora_model.generate(**inputs,past_key_values=copy.deepcopy(kv_cache),return_dict_in_generate=True) - base_outputs = output_dict.sequences - else: - # Simpler inference calls (output equivalent to the above) - # Generate answer with adapter - alora_model.set_adapter("adapter") - output_dict = alora_model.generate(**inputs,return_dict_in_generate=True) - alora_outputs = output_dict.sequences - - # Generate answer with base model for comparison - alora_model.set_adapter(None) - output_dict = alora_model.generate(**inputs,return_dict_in_generate=True) - base_outputs = output_dict.sequences + # Generate answer 
with adapter + + output_dict = alora_model.generate(**inputs, return_dict_in_generate=True, max_new_tokens=20) + alora_outputs = output_dict.sequences + # Print results print(f"Prompt: {text}") - print(f"Base model response: {tokenizer.decode(base_outputs[0]).rsplit(text,1)[1]}") - print(f"Trained adapter response: {tokenizer.decode(alora_outputs[0]).rsplit(text,1)[1]}") + print(f"Trained adapter response: {tokenizer.decode(alora_outputs[0]).rsplit(text, 1)[1]}") if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="Fine-tune Mistral with Activated LoRA") - parser.add_argument("--base_model", type=str, default="mistralai/Mistral-7B-Instruct-v0.3", help="Base model path or name") parser.add_argument( - "--data_path", type=str, default="timdettmers/openassistant-guanaco", help="Dataset path or name" + "--base_model", type=str, default="mistralai/Mistral-7B-Instruct-v0.3", help="Base model path or name" + ) + parser.add_argument( + "--data_path", + type=str, + default="Lots-of-LoRAs/task1660_super_glue_question_generation", + help="Dataset path or name", ) parser.add_argument( "--output_dir", type=str, default="path/to/output", help="Output directory for the fine-tuned model" ) - parser.add_argument("--batch_size", type=int, default=1, help="Batch size") + parser.add_argument("--batch_size", type=int, default=2, help="Batch size") parser.add_argument("--num_epochs", type=int, default=1, help="Number of training epochs") - parser.add_argument("--learning_rate", type=float, default=3e-4, help="Learning rate") - parser.add_argument("--cutoff_len", type=int, default=512, help="Cutoff length for tokenization") + parser.add_argument("--learning_rate", type=float, default=1e-4, help="Learning rate") + parser.add_argument("--cutoff_len", type=int, default=2048, help="Cutoff length for tokenization") parser.add_argument("--val_set_size", type=int, default=500, help="Validation set size") parser.add_argument( "--invocation_string", @@ -262,4 +248,4 @@ def model_inference(model_path: str, adapter_path: str, prompt: str=None, data_p push_to_hub=args.push_to_hub, ) print("Model trained. 
Running test inference.") - model_inference(model_path = args.base_model, adapter_path = args.output_dir, data_path=args.data_path, reuse_cache = True) + model_inference(model_path=args.base_model, adapter_path=args.output_dir, data_path=args.data_path) diff --git a/src/peft/tuners/lora/bnb.py b/src/peft/tuners/lora/bnb.py index 4dd5b2af9a..8f7f0c3c3a 100644 --- a/src/peft/tuners/lora/bnb.py +++ b/src/peft/tuners/lora/bnb.py @@ -27,6 +27,8 @@ from .layer import LoraLayer, LoraVariant +VARIANT_KWARG_KEYS = ["alora_offsets"] + if is_bnb_available(): class Linear8bitLt(torch.nn.Module, LoraLayer): @@ -40,6 +42,7 @@ def __init__( lora_dropout: float = 0.0, init_lora_weights: bool = True, use_rslora: bool = False, + use_alora: bool = False, use_dora: bool = False, lora_bias: bool = False, **kwargs, @@ -57,16 +60,20 @@ def __init__( init_lora_weights=init_lora_weights, use_rslora=use_rslora, use_dora=use_dora, + use_alora=use_alora, lora_bias=lora_bias, ) - def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]: - if not use_dora: + def resolve_lora_variant(self, *, use_dora: bool, use_alora: bool, **kwargs) -> Optional[LoraVariant]: + if not use_dora and not use_alora: return None - from .variants import DoraLinearVariant + from .variants import ALoraLinearVariant, DoraLinearVariant - return DoraLinearVariant() + if use_alora: + return ALoraLinearVariant() + else: + return DoraLinearVariant() def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: """ @@ -178,13 +185,14 @@ def _mixed_batch_forward( ) -> torch.Tensor: # This is a special method that handles the case when users pass the argument `adapter_names`. This is an # extra argument that allows mixing different adapters in the same batch at inference time. 
+ variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer result = self.base_layer(x, *args, **kwargs) unique_adapters = set(adapter_names) sub_batch_indices_list = [] for adapter in unique_adapters: sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) - + alora_offsets = variant_kwargs.get("alora_offsets", None) for i, active_adapter in enumerate(unique_adapters): if active_adapter == "__base__": continue @@ -204,23 +212,39 @@ def _mixed_batch_forward( # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear # layer output sub_batch = x[sub_batch_indices_list[i]] - output = lora_B(lora_A(dropout(sub_batch))) * scaling - if requires_conversion: - output = output.to(expected_dtype) - result[sub_batch_indices_list[i]] += output + if active_adapter not in self.lora_variant: # vanilla LoRA: + output = lora_B(lora_A(dropout(sub_batch))) * scaling + if requires_conversion: + output = output.to(expected_dtype) + result[sub_batch_indices_list[i]] += output + else: + if alora_offsets is not None: + variant_kwargs["alora_offsets"] = [alora_offsets[j] for j in sub_batch_indices_list[i]] + output = self.lora_variant[active_adapter].forward( + self, + active_adapter=active_adapter, + x=sub_batch, + result=result[sub_batch_indices_list[i]], + **variant_kwargs, + **kwargs, + ) + if requires_conversion: + output = output.to(expected_dtype) + result[sub_batch_indices_list[i]] = output return result def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: self._check_forward_args(x, *args, **kwargs) adapter_names = kwargs.pop("adapter_names", None) + variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer if self.disable_adapters: if self.merged: self.unmerge() result = self.base_layer(x, *args, **kwargs) elif adapter_names is not None: - result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs) + result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **variant_kwargs, **kwargs) elif self.merged: result = self.base_layer(x, *args, **kwargs) else: @@ -249,6 +273,8 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: active_adapter=active_adapter, x=x, result=result, + **variant_kwargs, + **kwargs, ) if requires_conversion: result = result.to(expected_dtype) @@ -315,13 +341,16 @@ def __init__( lora_bias=lora_bias, ) - def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]: - if not use_dora: + def resolve_lora_variant(self, *, use_dora: bool, use_alora: bool, **kwargs) -> Optional[LoraVariant]: + if not use_dora and not use_alora: return None - from .variants import DoraLinearVariant + from .variants import ALoraLinearVariant, DoraLinearVariant - return DoraLinearVariant() + if use_alora: + return ALoraLinearVariant() + else: + return DoraLinearVariant() def merge(self, safe_merge: bool = False, adapter_names: Optional[list[str]] = None) -> None: """ @@ -431,6 +460,7 @@ def _mixed_batch_forward( ) -> torch.Tensor: # This is a special method that handles the case when users pass the argument `adapter_names`. This is an # extra argument that allows mixing different adapters in the same batch at inference time. 
+ variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer result = self.base_layer(x, *args, **kwargs) unique_adapters = set(adapter_names) @@ -438,6 +468,7 @@ def _mixed_batch_forward( for adapter in unique_adapters: sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) + alora_offsets = variant_kwargs.get("alora_offsets", None) for i, active_adapter in enumerate(unique_adapters): if active_adapter == "__base__": continue @@ -457,23 +488,39 @@ def _mixed_batch_forward( # getting the sub-batch, passing it to LoRA layers and updating the corresponding indices of the linear # layer output sub_batch = x[sub_batch_indices_list[i]] - output = lora_B(lora_A(dropout(sub_batch))) * scaling - if requires_conversion: - output = output.to(expected_dtype) - result[sub_batch_indices_list[i]] += output + if active_adapter not in self.lora_variant: # vanilla LoRA + output = lora_B(lora_A(dropout(sub_batch))) * scaling + if requires_conversion: + output = output.to(expected_dtype) + result[sub_batch_indices_list[i]] += output + else: + if alora_offsets is not None: + variant_kwargs["alora_offsets"] = [alora_offsets[j] for j in sub_batch_indices_list[i]] + output = self.lora_variant[active_adapter].forward( + self, + active_adapter=active_adapter, + x=sub_batch, + result=result[sub_batch_indices_list[i]], + **variant_kwargs, + **kwargs, + ) + if requires_conversion: + output = output.to(expected_dtype) + result[sub_batch_indices_list[i]] = output return result def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: self._check_forward_args(x, *args, **kwargs) adapter_names = kwargs.pop("adapter_names", None) + variant_kwargs = {k: kwargs.pop(k, None) for k in VARIANT_KWARG_KEYS} # don't pass these to base_layer if self.disable_adapters: if self.merged: self.unmerge() result = self.base_layer(x, *args, **kwargs) elif adapter_names is not None: - result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **kwargs) + result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **variant_kwargs, **kwargs) elif self.merged: result = self.base_layer(x, *args, **kwargs) else: @@ -509,6 +556,8 @@ def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor: active_adapter=active_adapter, x=x, result=result, + **variant_kwargs, + **kwargs, ) if requires_conversion: result = result.to(expected_dtype) diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py index 4b88da71bc..1a2a702a09 100644 --- a/src/peft/tuners/lora/layer.py +++ b/src/peft/tuners/lora/layer.py @@ -173,7 +173,7 @@ def __init__(self, base_layer: nn.Module, ephemeral_gpu_offload: bool = False, * self.in_features = in_features self.out_features = out_features - def resolve_lora_variant(self, *, use_dora: bool, use_alora: bool, **kwargs) -> Optional[LoraVariant]: + def resolve_lora_variant(self, *, use_dora: bool, **kwargs) -> Optional[LoraVariant]: """Return a matching LoRA variant for this layer type. Given the init arguments of this layer, return the correct LoRA variant, if any. 
E.g., if `use_dora=True`, this diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 1d84a3b7a4..3013ff5bfa 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -472,12 +472,12 @@ def forward( scaling = module.scaling[active_adapter] x = x.to(lora_A.weight.dtype) - if alora_offsets is not None: # should never be None - if x.dim() == 2: - # If x is 2-dimensional (unusual but comes up in certain tests), this means that for all inputs, - # there is only 1 token position being processed and we should adapt its weights. - result = result + lora_B(lora_A(dropout(x))) * scaling - else: #Typical regime + if x.dim() == 2: + # If x is 2-dimensional (unusual but comes up in certain tests), this means that for all inputs, + # there is only 1 token position being processed and we should adapt its weights. + result = result + lora_B(lora_A(dropout(x))) * scaling + else: # Typical regime + if alora_offsets is not None: for i in range(result.shape[0]): # If alora_offsets[i] is None, this means that the invocation sequence was not found in the # input. As a result, the weights should not be activated anywhere (equivalent to base model). @@ -496,9 +496,9 @@ def calculate_alora_offsets( """ This is a helper function for Activated LoRA (aLoRA) that searches each input token sequence for the last occurence of the appropriate "alora_invocation_tokens" invocation sequence. If adapter_names is passed, then each input uses - the appropriate invocation sequence for the specified adapter for that row. Logic is provided to handle mixed collections - of adapters for which not all are aLoRAs (e.g. some base model, some LoRA). If the invocation sequence is not present, the - corresponding alora_offset is set to None and a warning is printed. + the appropriate invocation sequence for the specified adapter for that row. Logic is provided to handle mixed + collections of adapters for which not all are aLoRAs (e.g. some base model, some LoRA). If the invocation sequence + is not present, the corresponding alora_offset is set to None and a warning is printed. """ if input_ids is None: return [] @@ -587,10 +587,12 @@ def is_alora_relevant_in_batch(model: nn.Module, adapter_names: Optional[list[st return is_alora_relevant -def get_alora_offsets_for_forward(model: nn.Module, input_ids: torch.Tensor, inputs_embeds: torch.Tensor = None, **kwargs): +def get_alora_offsets_for_forward( + model: nn.Module, input_ids: torch.Tensor, inputs_embeds: torch.Tensor = None, **kwargs +): """ - Wrapper around calculate_alora_offsets, for the .forward of the model. It only calculates alora_offsets if the batch - contains aLoRA adapters. + Wrapper around calculate_alora_offsets, for the .forward of the model. It only calculates alora_offsets if the + batch contains aLoRA adapters. """ adapter_names_for_offset_calc = kwargs.get("adapter_names", None) if not is_alora_relevant_in_batch(model, adapter_names_for_offset_calc): @@ -617,8 +619,8 @@ def get_alora_offsets_for_forward(model: nn.Module, input_ids: torch.Tensor, inp def get_alora_offsets_for_generate(model: nn.module, *args, **kwargs): """ - Wrapper around calculate_alora_offsets, for the .generate of the model. It only calculates alora_offsets if the batch - contains aLoRA adapters. + Wrapper around calculate_alora_offsets, for the .generate of the model. It only calculates alora_offsets if the + batch contains aLoRA adapters. 
""" adapter_names_for_offset_calc = kwargs.get("adapter_names") if not is_alora_relevant_in_batch(model, adapter_names_for_offset_calc): diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index c58ed1bc6e..947e08c230 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -63,6 +63,7 @@ def forward(self, input_ids, dummy_image_input): output = self.linear2(output) return output + # Used for testing alora_offsets for aLoRA class DummyLM(nn.Module): def __init__(self, vocab_size: int = 10, hidden_dim: int = 8): @@ -74,6 +75,7 @@ def forward(self, X): hidden = self.embed(X) return self.linear(hidden) + class MockTransformerWrapper: """Mock class to behave like a transformers model. @@ -155,6 +157,7 @@ def test_dora_params_have_gradients(self): for layer in layer_names: assert getattr(peft_model.base_model.model, layer).lora_magnitude_vector["default"].weight.grad is not None + # Make sure warning is sent when invocation sequence is not present def test_calculate_alora_offsets_basic_and_warning(): config = LoraConfig(alora_invocation_tokens=[1, 2]) @@ -168,6 +171,7 @@ def test_calculate_alora_offsets_basic_and_warning(): assert offsets[0] == input_ids.shape[1] - 1 + 1 assert offsets[1] is None + # Verify alora_offsets are correct with multiple adapters def test_calculate_alora_offsets_with_adapter_names(): cfg1 = LoraConfig(alora_invocation_tokens=[1]) @@ -179,23 +183,23 @@ def test_calculate_alora_offsets_with_adapter_names(): assert offsets == [input_ids.shape[1] - 2 + 1, input_ids.shape[1] - 2 + 1] + # Verify that the adapter does not modify outputs prior to invocation point def test_alora_activation_matches_base_until_invocation(): transformers_class = MockTransformerWrapper base_model = transformers_class.from_pretrained() - cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2],init_lora_weights = False) + cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) lora_model = get_peft_model(base_model, cfg) lora_model.eval() input_ids = torch.tensor([[0, 1, 2, 3]]) with lora_model.disable_adapter(): with torch.no_grad(): - base_out = lora_model(X = input_ids) + base_out = lora_model(X=input_ids) kwargs = get_alora_offsets_for_forward(lora_model, input_ids) with torch.no_grad(): - lora_out = lora_model(X = input_ids,**kwargs) - start = input_ids.shape[1] - kwargs['alora_offsets'][0] + lora_out = lora_model(X=input_ids, **kwargs) + start = input_ids.shape[1] - kwargs["alora_offsets"][0] assert torch.allclose(lora_out[:, :start], base_out[:, :start]) assert not torch.allclose(lora_out[:, start:], base_out[:, start:]) - diff --git a/tests/testing_common.py b/tests/testing_common.py index d197851d54..86f2994e44 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -1010,7 +1010,7 @@ def _test_generate_with_mixed_adapter_batches_and_beam_search(self, model_id, co if config_cls not in (LoraConfig,): return pytest.skip(f"Mixed adapter batches not supported for {config_cls}") if config_kwargs.get("alora_invocation_tokens") is not None: - return pytest.skip("Beam search not yet supported for aLoRA") # beam search not yet fully supported + return pytest.skip("Beam search not yet supported for aLoRA") # beam search not yet fully supported if config_kwargs.get("trainable_token_indices", None) is not None: # for some configurations this test will fail since the adapter values don't differ. # this is probably a problem with the test setup and not with the implementation. 
From 2d49f38013e75b9b21a14a4cb8a1da93125c8d75 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Tue, 5 Aug 2025 23:55:03 -0400 Subject: [PATCH 54/99] Update lora.md --- docs/source/developer_guides/lora.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index cd19776df5..0fb17377ab 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -217,6 +217,7 @@ alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens **Note** If using custom strings for the invocation string, make sure that the start and end of the string are special tokens to avoid issues with tokenization at the boundaries. +An example inference setup is at [alora finetuning](https://github.com/huggingface/peft/blob/main/examples/alora_finetuning/alora_finetuning.py). ### Weight-Decomposed Low-Rank Adaptation (DoRA) From cff5b070b206282e13729ce54a4ccdcd6465041a Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 6 Aug 2025 14:00:51 -0400 Subject: [PATCH 55/99] Update lora.md --- docs/source/developer_guides/lora.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index 0fb17377ab..2cf0bc30cd 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -182,7 +182,7 @@ This technique scans for the last occurence of an invocation sequence (`alora_in ```py from peft import LoraConfig -config = LoraConfig(alora_invocation_tokens=alora_invocation_tokens, ...) +config = LoraConfig(alora_invocation_tokens=alora_invocation_tokens, task_type="CAUSAL_LM", ...) ``` where `alora_invocation_tokens` is a list of integer token ids. Given a desired invocation string, this can be obtained as @@ -193,7 +193,7 @@ alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens where the tokenizer is the tokenizer for the base model. **Notes** -* aLoRA is only supported for CAUSAL_LM tasks due to its focus on cache reuse. +* aLoRA is only supported for `task_type=CAUSAL_LM` tasks due to its focus on cache reuse. * Since the weights are adapted on fewer tokens, often (not always) aLoRA requires higher rank (`r`) than LoRA. `r=32` can be a good starting point. * aLoRA weights cannot be merged into the base model by definition, since the adapter weights are selectively applied to a subset of tokens. Attempts to merge will throw errors. * Beam search is not yet supported. From 79240398a055a58c2bde0d1e31e9996799cc72d5 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 14 Aug 2025 21:11:35 -0400 Subject: [PATCH 56/99] Update docs/source/developer_guides/lora.md Co-authored-by: githubnemo --- docs/source/developer_guides/lora.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index 2cf0bc30cd..27c7e573db 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -205,7 +205,7 @@ Each input must have the `alora_invocation_tokens` sequence present, it is not a formatting should be consistent between train and test. Consider the following example, where the base model has a chat template, -and the goal it to train the adapter to generate a desired output. +and the goal is to train the adapter to generate a desired output. * Option 1: If there is no task-specific prompt, i.e. 
the input is a chat history with the `assistant` prompt, then the chat template's `assistant` prompt (e.g. `<|start_of_role|>assistant<|end_of_role|>`) is a natural choice for the invocation string. See the model's chat template to find the prompt for the model. * Option 2: If there is a task-specific prompt for the adapter that describes the task the adapter is learning, and that prompt is put as a `user` turn immediately prior to the generation, then the chat template's `user` prompt (e.g. `<|start_of_role|>user<|end_of_role|>`) is a natural choice for the invocation string. From 6f1e284214e91d3bf5deba8128babbe1340a9deb Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 14 Aug 2025 21:19:12 -0400 Subject: [PATCH 57/99] Update lora.md --- docs/source/developer_guides/lora.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index 64b00e0b35..b4acc24356 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -190,7 +190,7 @@ where `alora_invocation_tokens` is a list of integer token ids. Given a desired invocation_string = "placeholder" alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False). ``` -where the tokenizer is the tokenizer for the base model. +where the tokenizer is the tokenizer for the base model. Note that we have `add_special_tokens=False` to avoid adding SOS/EOS tokens in our search string (which will most likely cause failure to find). **Notes** * aLoRA is only supported for `task_type=CAUSAL_LM` tasks due to its focus on cache reuse. From 21ceb56200a6966b34ae98f52b465cdcb671f70d Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 14 Aug 2025 21:35:57 -0400 Subject: [PATCH 58/99] Update variants.py --- src/peft/tuners/lora/variants.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 3013ff5bfa..dcf7c2b267 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -653,17 +653,6 @@ def get_alora_offsets_for_generate(model: nn.module, *args, **kwargs): warnings.warn( "Cannot calculate aLoRA offsets during generate as input_ids are not available. Disabling aLoRA." ) - bs = 1 - if "attention_mask" in kwargs and kwargs["attention_mask"] is not None: - bs = kwargs["attention_mask"].shape[0] - elif "inputs_embeds" in kwargs and kwargs["inputs_embeds"] is not None: - bs = kwargs["inputs_embeds"].shape[0] - elif args and isinstance(args[0], torch.Tensor) and args[0].dim() > 0: # input_ids might be in args[0] - bs = args[0].shape[0] - elif ( - "input_ids" in kwargs and kwargs["input_ids"] is not None - ): # Should have been caught by current_input_ids - bs = kwargs["input_ids"].shape[0] - + kwargs["alora_offsets"] = None return kwargs From d1d31e7db7f8c263b8154af6aff910b6c4faa0e1 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 14 Aug 2025 21:36:51 -0400 Subject: [PATCH 59/99] Update src/peft/tuners/lora/variants.py Co-authored-by: githubnemo --- src/peft/tuners/lora/variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index dcf7c2b267..f08e36439a 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -604,7 +604,7 @@ def get_alora_offsets_for_forward( warnings.warn( "Cannot calculate aLoRA offsets when only inputs_embeds are provided. 
Disabling aLoRA for this forward pass." ) - kwargs["alora_offsets"] = [None] * inputs_embeds.shape[0] + kwargs["alora_offsets"] = None elif input_ids is not None: kwargs["alora_offsets"] = calculate_alora_offsets( model.peft_config, From 076411f812639e82b130a2f27c81b201bf02d8cd Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 14 Aug 2025 21:41:02 -0400 Subject: [PATCH 60/99] Update variants.py --- src/peft/tuners/lora/variants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index f08e36439a..5657a49a57 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -644,6 +644,7 @@ def get_alora_offsets_for_generate(model: nn.module, *args, **kwargs): current_input_ids, adapter_names=adapter_names_for_offset_calc, ) + # Subtract 1 from offsets for generate, due to position difference between "forward" and generate forward pass for i in range(len(calculated_offsets)): if calculated_offsets[i] is not None: calculated_offsets[i] -= 1 From 3c267ce907297c362c28372fd9ef6907fead1565 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Thu, 14 Aug 2025 22:01:13 -0400 Subject: [PATCH 61/99] Update tests/test_lora_variants.py Co-authored-by: githubnemo --- tests/test_lora_variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index 947e08c230..374c3efcdc 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -31,7 +31,7 @@ ) -# Used for Dora +# Custom model featuring embeddings and a 'visual stack' class CustomModel(nn.Module): """pytorch module that contains common targetable layers (linear, embedding, conv, ...)""" From 56455a85fec1eeb64fb8dbb4b52b8e30aee7ebfe Mon Sep 17 00:00:00 2001 From: Greenewald Date: Fri, 15 Aug 2025 09:47:10 -0400 Subject: [PATCH 62/99] Update test_custom_models.py --- tests/test_custom_models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index b2eb8d21b3..7b1986456a 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -99,7 +99,7 @@ LoraConfig, { "target_modules": ["lin0"], - "alora_invocation_tokens": [1, 2, 3], + "alora_invocation_tokens": [1, 2, 3], #placeholder, not important for tests in this file }, ), ( @@ -108,7 +108,7 @@ LoraConfig, { "target_modules": ["lin0", "lin1"], - "alora_invocation_tokens": [1, 2, 3], + "alora_invocation_tokens": [1, 2, 3], #placeholder, not important for tests in this file }, ), ( @@ -117,7 +117,7 @@ LoraConfig, { "target_modules": "lin1", - "alora_invocation_tokens": [1, 2, 3], + "alora_invocation_tokens": [1, 2, 3], #placeholder, not important for tests in this file "lora_alpha": 32, }, ), From 14752be3b98a5970f45bb7e5e8f1e0e951e333df Mon Sep 17 00:00:00 2001 From: Greenewald Date: Fri, 15 Aug 2025 09:53:05 -0400 Subject: [PATCH 63/99] Update model.py --- src/peft/tuners/lora/model.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 293dce9685..7d38e89d72 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -460,11 +460,12 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): yield return hook_handles = [] - for layer in self.modules(): - if isinstance(layer, LoraLayer): - pre_forward = partial(_alora_offsets_pre_forward_hook, alora_offsets=alora_offsets) - handle = 
layer.register_forward_pre_hook(pre_forward, with_kwargs=True) - hook_handles.append(handle) + if alora_offsets is not None: + for layer in self.modules(): + if isinstance(layer, LoraLayer): + pre_forward = partial(_alora_offsets_pre_forward_hook, alora_offsets=alora_offsets) + handle = layer.register_forward_pre_hook(pre_forward, with_kwargs=True) + hook_handles.append(handle) if adapter_names is not None: if self.training: From 57313a51ed7e8cf88f8522c01589f55c0ecda5ca Mon Sep 17 00:00:00 2001 From: Greenewald Date: Fri, 15 Aug 2025 09:55:37 -0400 Subject: [PATCH 64/99] Update testing_common.py --- tests/testing_common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/testing_common.py b/tests/testing_common.py index dbefe502ff..f55ce890f5 100644 --- a/tests/testing_common.py +++ b/tests/testing_common.py @@ -1047,8 +1047,7 @@ def _test_generate_with_mixed_adapter_batches_and_beam_search(self, model_id, co dummy_input = self.prepare_inputs_for_testing() # ensure that we have at least 3 samples for this test dummy_input = {k: torch.cat([v for _ in range(3)]) for k, v in dummy_input.items()} - num_beams = 10 - gen_kwargs = {**dummy_input, "max_length": 20, "num_beams": num_beams, "early_stopping": True} + gen_kwargs = {**dummy_input, "max_length": 20, "num_beams": 10, "early_stopping": True} with torch.inference_mode(): with model.disable_adapter(): gen_base = model.generate(**gen_kwargs) From 089d304ac4427b39f337278b90e7bf5444fb42a4 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 18 Aug 2025 13:21:00 -0400 Subject: [PATCH 65/99] Update bnb.py --- src/peft/tuners/lora/bnb.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/peft/tuners/lora/bnb.py b/src/peft/tuners/lora/bnb.py index 8f7f0c3c3a..f4cfb24288 100644 --- a/src/peft/tuners/lora/bnb.py +++ b/src/peft/tuners/lora/bnb.py @@ -192,7 +192,7 @@ def _mixed_batch_forward( sub_batch_indices_list = [] for adapter in unique_adapters: sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) - alora_offsets = variant_kwargs.get("alora_offsets", None) + for i, active_adapter in enumerate(unique_adapters): if active_adapter == "__base__": continue @@ -218,6 +218,7 @@ def _mixed_batch_forward( output = output.to(expected_dtype) result[sub_batch_indices_list[i]] += output else: + alora_offsets = variant_kwargs.get("alora_offsets", None) if alora_offsets is not None: variant_kwargs["alora_offsets"] = [alora_offsets[j] for j in sub_batch_indices_list[i]] output = self.lora_variant[active_adapter].forward( @@ -468,7 +469,6 @@ def _mixed_batch_forward( for adapter in unique_adapters: sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) - alora_offsets = variant_kwargs.get("alora_offsets", None) for i, active_adapter in enumerate(unique_adapters): if active_adapter == "__base__": continue @@ -494,6 +494,7 @@ def _mixed_batch_forward( output = output.to(expected_dtype) result[sub_batch_indices_list[i]] += output else: + alora_offsets = variant_kwargs.get("alora_offsets", None) if alora_offsets is not None: variant_kwargs["alora_offsets"] = [alora_offsets[j] for j in sub_batch_indices_list[i]] output = self.lora_variant[active_adapter].forward( From 35c7aaeba81f8b2e04f04203e2f849d67c9f322e Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 18 Aug 2025 13:25:11 -0400 Subject: [PATCH 66/99] Update test_lora_variants.py --- tests/test_lora_variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index 374c3efcdc..c2327dde3d 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -200,6 +200,6 @@ def test_alora_activation_matches_base_until_invocation(): kwargs = get_alora_offsets_for_forward(lora_model, input_ids) with torch.no_grad(): lora_out = lora_model(X=input_ids, **kwargs) - start = input_ids.shape[1] - kwargs["alora_offsets"][0] + start = 2 #index of invocation token assert torch.allclose(lora_out[:, :start], base_out[:, :start]) assert not torch.allclose(lora_out[:, start:], base_out[:, start:]) From cb794110dd66be77fe9f66487e5b8c518aee4780 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 18 Aug 2025 13:25:45 -0400 Subject: [PATCH 67/99] Update test_lora_variants.py --- tests/test_lora_variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index c2327dde3d..e2b72adfd6 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -193,6 +193,7 @@ def test_alora_activation_matches_base_until_invocation(): lora_model.eval() input_ids = torch.tensor([[0, 1, 2, 3]]) + start = 2 #index of invocation token with lora_model.disable_adapter(): with torch.no_grad(): base_out = lora_model(X=input_ids) @@ -200,6 +201,5 @@ def test_alora_activation_matches_base_until_invocation(): kwargs = get_alora_offsets_for_forward(lora_model, input_ids) with torch.no_grad(): lora_out = lora_model(X=input_ids, **kwargs) - start = 2 #index of invocation token assert torch.allclose(lora_out[:, :start], base_out[:, :start]) assert not torch.allclose(lora_out[:, start:], base_out[:, start:]) From 99dc4fa0635cc324130cc165f61755b4ca4a4993 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 18 Aug 2025 13:50:21 -0400 Subject: [PATCH 68/99] Update test_lora_variants.py new tests --- tests/test_lora_variants.py | 38 ++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index e2b72adfd6..3c30399f5d 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -71,9 +71,10 @@ def __init__(self, vocab_size: int = 10, hidden_dim: int = 8): self.embed = nn.Embedding(vocab_size, hidden_dim) self.linear = nn.Linear(hidden_dim, vocab_size) - def forward(self, X): - hidden = self.embed(X) - return self.linear(hidden) + def forward(self, X=None, embeds=None): + if X is not None: + embeds = self.embed(X) + return self.linear(embeds) class MockTransformerWrapper: @@ -168,7 +169,7 @@ def test_calculate_alora_offsets_basic_and_warning(): with pytest.warns(UserWarning): offsets = calculate_alora_offsets(peft_config, "default", input_ids) - assert offsets[0] == input_ids.shape[1] - 1 + 1 + assert offsets[0] == 4 assert offsets[1] is None @@ -181,7 +182,7 @@ def test_calculate_alora_offsets_with_adapter_names(): offsets = calculate_alora_offsets(peft_config, "a1", input_ids, adapter_names=["a1", "a2"]) - assert offsets == [input_ids.shape[1] - 2 + 1, input_ids.shape[1] - 2 + 1] + assert offsets == [2, 2] # Verify that the adapter does not modify outputs prior to invocation point @@ -203,3 +204,30 @@ def test_alora_activation_matches_base_until_invocation(): lora_out = lora_model(X=input_ids, **kwargs) assert torch.allclose(lora_out[:, :start], base_out[:, :start]) assert not torch.allclose(lora_out[:, start:], base_out[:, start:]) + +# Verify that warning is given for alora when providing embeddings only +def 
test_input_embeds_warning(): + transformers_class = MockTransformerWrapper + base_model = transformers_class.from_pretrained() + cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) + lora_model = get_peft_model(base_model, cfg) + lora_model.eval() + + input_ids = torch.tensor([[0, 1, 2, 3]]) + input_embeds = base_model.embed(input_ids) + with pytest.warns(UserWarning): + with torch.no_grad(): + lora_out = lora_model(embeds=input_embeds) + +# Verify that error is raised when requesting num_beams > 1 for alora +def test_num_beams_error(): + transformers_class = MockTransformerWrapper + base_model = transformers_class.from_pretrained() + cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) + lora_model = get_peft_model(base_model, cfg) + lora_model.eval() + + input_ids = torch.tensor([[0, 1, 2, 3]]) + with pytest.pytest.raises(ValueError): + with torch.no_grad(): + lora_out = lora_model(X=input_ids,num_beams=2) From 133183a4e7aafea102a0979b75ddde526b64ffa1 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 18 Aug 2025 13:58:11 -0400 Subject: [PATCH 69/99] workaround for new tokens --- docs/source/developer_guides/lora.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index b4acc24356..ec7f898383 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -197,7 +197,7 @@ where the tokenizer is the tokenizer for the base model. Note that we have `add_ * Since the weights are adapted on fewer tokens, often (not always) aLoRA requires higher rank (`r`) than LoRA. `r=32` can be a good starting point. * aLoRA weights cannot be merged into the base model by definition, since the adapter weights are selectively applied to a subset of tokens. Attempts to merge will throw errors. * Beam search is not yet supported. -* It is generally not recommended to add new tokens to the tokenizer that are not present in the base model, as this can complicate the target use case of both the base model and adapter model operating on overlapping context. +* It is generally not recommended to add new tokens to the tokenizer that are not present in the base model, as this can complicate the target use case of both the base model and adapter model operating on overlapping context. That said, there is a possible workaround by first efficiently adding [trainable tokens](https://huggingface.co/docs/peft/en/package_reference/trainable_tokens) to the base model prior to training the adapter. 
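A possible shape of the trainable-tokens workaround mentioned above is sketched below. The new token string, the `embed_tokens` layer name, and in particular the `trainable_token_indices` argument are assumptions based on the linked trainable-tokens documentation, not a verified recipe; only `alora_invocation_tokens` is the argument introduced in this PR.

```python
# Hypothetical sketch: add new token(s) first, make only those embedding rows trainable,
# then train the aLoRA adapter on top. Argument names marked below are assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

model_id = "mistralai/Mistral-7B-Instruct-v0.3"  # example base model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

new_tokens = ["<|custom_invocation|>"]           # illustrative new token
tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))
new_token_ids = tokenizer.convert_tokens_to_ids(new_tokens)

invocation_tokens = tokenizer.encode("<|custom_invocation|>", add_special_tokens=False)
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    alora_invocation_tokens=invocation_tokens,
    target_modules=["q_proj", "k_proj", "v_proj"],
    # Assumed argument: train only the newly added embedding rows (see the trainable tokens docs).
    trainable_token_indices={"embed_tokens": new_token_ids},
)
peft_model = get_peft_model(model, config)
```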
#### Choice of invocation sequence and SFT design From 0b7b16404e93dd6d8be3b628b00a8b4eafb7f301 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Tue, 19 Aug 2025 12:04:38 -0400 Subject: [PATCH 70/99] Update test_lora_variants.py --- tests/test_lora_variants.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index 3c30399f5d..09ffbf7fcd 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -159,15 +159,14 @@ def test_dora_params_have_gradients(self): assert getattr(peft_model.base_model.model, layer).lora_magnitude_vector["default"].weight.grad is not None -# Make sure warning is sent when invocation sequence is not present -def test_calculate_alora_offsets_basic_and_warning(): +# Make sure None is set when invocation sequence is not present +def test_calculate_alora_offsets(): config = LoraConfig(alora_invocation_tokens=[1, 2]) peft_config = {"default": config} input_ids = torch.tensor([[0, 1, 2, 3], [0, 4, 5, 6]]) - # second row lacks invocation sequence -> warning and None offset - with pytest.warns(UserWarning): - offsets = calculate_alora_offsets(peft_config, "default", input_ids) + # second row lacks invocation sequence -> None offset + offsets = calculate_alora_offsets(peft_config, "default", input_ids) assert offsets[0] == 4 assert offsets[1] is None From 7d05034e41908cec63562d95d0667cc829a76134 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Tue, 19 Aug 2025 12:05:42 -0400 Subject: [PATCH 71/99] Update variants.py --- src/peft/tuners/lora/variants.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 5657a49a57..3f2c3de52a 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -557,13 +557,6 @@ def calculate_alora_offsets( offset_val = seq_len - best_match_start_idx + 1 alora_offsets[i] = offset_val if offset_val > 0 else None else: # Invocation sequence not found in input - warnings.warn( - f"Could not find alora_invocation_tokens for specified aLoRA adapter in the " - f"following instance" - f"{sequence}" - f"Invocation tokens: {current_invocation_ids_tensor} \n" - f"Defaulting to base model. " - ) alora_offsets[i] = None return alora_offsets From 1d16e13d90ced86aeb66ace3322f413575da894d Mon Sep 17 00:00:00 2001 From: Greenewald Date: Tue, 19 Aug 2025 12:08:04 -0400 Subject: [PATCH 72/99] Update lora.md --- docs/source/developer_guides/lora.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index ec7f898383..c46e96ce76 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -215,10 +215,12 @@ Once deciding on an invocation string, get the model tokenizer and obtain `alora alora_invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False). ``` -**Note** If using custom strings for the invocation string, make sure that the start and end of the string are special tokens to avoid issues with tokenization at the boundaries. - An example inference setup is at [alora finetuning](https://github.com/huggingface/peft/blob/main/examples/alora_finetuning/alora_finetuning.py). +**Note** If using custom strings for the invocation string, make sure that the start and end of the string are special tokens to avoid issues with tokenization at the boundaries. 
+ +To see why, imagine that 'a', 'b', 'c', and 'ab' are tokens in your tokenizer (numbers 1, 2, 3, 4 respectively). Suppose that your alora_invocation_tokens = [2, 3]. Now imagine your input string is "abc". Because "ab" is a token, this will get tokenized as [4,3]. So the alora_invocation_tokens will fail to be found, despite the string "bc" being in it. If the start and end of the invocation string are special tokens, however, this failure case will never happen since special tokens are never tokenized into the same token with other characters. + ### Weight-Decomposed Low-Rank Adaptation (DoRA) This technique decomposes the updates of the weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, especially at low ranks. For more information on DoRA, see https://huggingface.co/papers/2402.09353. From 31fcfcc4fc6666e3336c0b47af5474cdd02850a2 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Tue, 19 Aug 2025 18:35:38 +0000 Subject: [PATCH 73/99] tests and example --- examples/alora_finetuning/README.md | 16 ++++++++++++---- examples/alora_finetuning/alora_finetuning.py | 4 ++-- src/peft/tuners/lora/bnb.py | 2 +- src/peft/tuners/lora/variants.py | 2 +- tests/test_custom_models.py | 6 +++--- tests/test_lora_variants.py | 19 +++++++++++++++---- 6 files changed, 34 insertions(+), 15 deletions(-) diff --git a/examples/alora_finetuning/README.md b/examples/alora_finetuning/README.md index 2947ffbc98..e6b8da0bcd 100644 --- a/examples/alora_finetuning/README.md +++ b/examples/alora_finetuning/README.md @@ -5,11 +5,11 @@ Activated LoRA (aLoRA) is an adapter that selectively activates its weights only enabling much faster real-world inference (e.g. vLLM) when switching between generation with the base model and generation with adapters. See the [paper](https://huggingface.co/papers/2504.12397) for more details. -## Quick start +## Quick start (shown for Mistral 7B) ```python import torch from peft import LoraConfig, get_peft_model -from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer +from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, DataCollatorForLanguageModeling from datasets import load_dataset model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3", device_map="cuda") @@ -20,28 +20,36 @@ invocation_string = "[/INST]" # End of user turn in Mistral chat template invocation_tokens = tokenizer.encode(invocation_string, add_special_tokens=False) lora_config = LoraConfig( + task_type="CAUSAL_LM", alora_invocation_tokens=invocation_tokens, r=32, - target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], + target_modules=["q_proj", "k_proj", "v_proj"], ) peft_model = get_peft_model(model, lora_config) +data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False) trainer = Trainer( model=peft_model, train_dataset=dataset, dataset_text_field="text", max_seq_length=2048, tokenizer=tokenizer, + data_collator=data_collator, ) trainer.train() peft_model.save_pretrained("alora-mistral-7b") ``` +### Use the training example script directly Pass the invocation string with `--invocation_string` when running the training example -script: +script. 
For Mistral 7B, do: ```bash python examples/alora_finetuning/alora_finetuning.py --base_model mistralai/Mistral-7B-Instruct-v0.3 --data_path Lots-of-LoRAs/task1660_super_glue_question_generation --invocation_string "[/INST]" ``` +and similarly for Llama-3.2-3B-Instruct: +```bash +python examples/alora_finetuning/alora_finetuning.py --base_model meta-llama/Llama-3.2-3B-Instruct --data_path Lots-of-LoRAs/task1660_super_glue_question_generation --invocation_string "<|start_header_id|>assistant<|end_header_id|>" +``` ### Full example of the script ```bash diff --git a/examples/alora_finetuning/alora_finetuning.py b/examples/alora_finetuning/alora_finetuning.py index 41468278bf..67d1033c7e 100644 --- a/examples/alora_finetuning/alora_finetuning.py +++ b/examples/alora_finetuning/alora_finetuning.py @@ -172,13 +172,13 @@ def model_inference(model_path: str, adapter_path: str, prompt: str = None, data inputs = tokenizer(text, return_tensors="pt").to(base_model.device) # Generate answer with adapter - output_dict = alora_model.generate(**inputs, return_dict_in_generate=True, max_new_tokens=20) alora_outputs = output_dict.sequences # Print results print(f"Prompt: {text}") - print(f"Trained adapter response: {tokenizer.decode(alora_outputs[0]).rsplit(text, 1)[1]}") + response = tokenizer.decode(alora_outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True) + print(f"Trained adapter response: {response}") # {tokenizer.decode(alora_outputs[0]).rsplit(text, 1)[1]}") if __name__ == "__main__": diff --git a/src/peft/tuners/lora/bnb.py b/src/peft/tuners/lora/bnb.py index f4cfb24288..ad167faf05 100644 --- a/src/peft/tuners/lora/bnb.py +++ b/src/peft/tuners/lora/bnb.py @@ -192,7 +192,7 @@ def _mixed_batch_forward( sub_batch_indices_list = [] for adapter in unique_adapters: sub_batch_indices_list.append([index for index, item in enumerate(adapter_names) if item == adapter]) - + for i, active_adapter in enumerate(unique_adapters): if active_adapter == "__base__": continue diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 3f2c3de52a..d7be192c2d 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -647,6 +647,6 @@ def get_alora_offsets_for_generate(model: nn.module, *args, **kwargs): warnings.warn( "Cannot calculate aLoRA offsets during generate as input_ids are not available. Disabling aLoRA." 
) - + kwargs["alora_offsets"] = None return kwargs diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 7b1986456a..c1e18641ee 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -99,7 +99,7 @@ LoraConfig, { "target_modules": ["lin0"], - "alora_invocation_tokens": [1, 2, 3], #placeholder, not important for tests in this file + "alora_invocation_tokens": [1, 2, 3], # placeholder, not important for tests in this file }, ), ( @@ -108,7 +108,7 @@ LoraConfig, { "target_modules": ["lin0", "lin1"], - "alora_invocation_tokens": [1, 2, 3], #placeholder, not important for tests in this file + "alora_invocation_tokens": [1, 2, 3], # placeholder, not important for tests in this file }, ), ( @@ -117,7 +117,7 @@ LoraConfig, { "target_modules": "lin1", - "alora_invocation_tokens": [1, 2, 3], #placeholder, not important for tests in this file + "alora_invocation_tokens": [1, 2, 3], # placeholder, not important for tests in this file "lora_alpha": 32, }, ), diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index 09ffbf7fcd..220e5b1db7 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -22,6 +22,7 @@ from peft.tuners.lora.layer import Embedding as LoraEmbedding from peft.tuners.lora.layer import Linear as LoraLinear from peft.tuners.lora.variants import ( + ALoraLinearVariant, DoraConv1dVariant, DoraConv2dVariant, DoraEmbeddingVariant, @@ -100,7 +101,10 @@ def from_pretrained(cls): LoraEmbedding: DoraEmbeddingVariant, LoraConv1d: DoraConv1dVariant, LoraConv2d: DoraConv2dVariant, - } + }, + "alora": { + LoraLinear: ALoraLinearVariant, + }, } @@ -110,6 +114,11 @@ def from_pretrained(cls): LoraConfig, {"target_modules": ["linear1", "linear2", "conv1d", "conv2d", "embedding"], "use_dora": True}, ), + ( + "alora", + LoraConfig, + {"target_modules": ["linear1", "linear2"], "alora_invocation_tokens": [1]}, + ), ] @@ -193,7 +202,7 @@ def test_alora_activation_matches_base_until_invocation(): lora_model.eval() input_ids = torch.tensor([[0, 1, 2, 3]]) - start = 2 #index of invocation token + start = 1 with lora_model.disable_adapter(): with torch.no_grad(): base_out = lora_model(X=input_ids) @@ -204,6 +213,7 @@ def test_alora_activation_matches_base_until_invocation(): assert torch.allclose(lora_out[:, :start], base_out[:, :start]) assert not torch.allclose(lora_out[:, start:], base_out[:, start:]) + # Verify that warning is given for alora when providing embeddings only def test_input_embeds_warning(): transformers_class = MockTransformerWrapper @@ -218,6 +228,7 @@ def test_input_embeds_warning(): with torch.no_grad(): lora_out = lora_model(embeds=input_embeds) + # Verify that error is raised when requesting num_beams > 1 for alora def test_num_beams_error(): transformers_class = MockTransformerWrapper @@ -227,6 +238,6 @@ def test_num_beams_error(): lora_model.eval() input_ids = torch.tensor([[0, 1, 2, 3]]) - with pytest.pytest.raises(ValueError): + with pytest.raises(ValueError): with torch.no_grad(): - lora_out = lora_model(X=input_ids,num_beams=2) + lora_out = lora_model(X=input_ids, num_beams=2) From 45be76827ca2234d9811f0024d25647605eeaacb Mon Sep 17 00:00:00 2001 From: Greenewald Date: Tue, 19 Aug 2025 17:37:46 -0400 Subject: [PATCH 74/99] Update test_lora_variants.py --- tests/test_lora_variants.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index 220e5b1db7..37d0eb19fd 100644 --- a/tests/test_lora_variants.py +++ 
b/tests/test_lora_variants.py @@ -72,7 +72,7 @@ def __init__(self, vocab_size: int = 10, hidden_dim: int = 8): self.embed = nn.Embedding(vocab_size, hidden_dim) self.linear = nn.Linear(hidden_dim, vocab_size) - def forward(self, X=None, embeds=None): + def forward(self, X=None, embeds=None, num_beams=None): if X is not None: embeds = self.embed(X) return self.linear(embeds) @@ -225,8 +225,11 @@ def test_input_embeds_warning(): input_ids = torch.tensor([[0, 1, 2, 3]]) input_embeds = base_model.embed(input_ids) with pytest.warns(UserWarning): - with torch.no_grad(): - lora_out = lora_model(embeds=input_embeds) + kwargs = get_alora_offsets_for_forward(lora_model, input_embeds=input_embeds) + assert kwargs.get("alora_offsets") is None + with pytest.warns(UserWarning): + kwargs = get_alora_offsets_for_generate(lora_model, input_embeds=input_embeds) + assert kwargs.get("alora_offsets") is None # Verify that error is raised when requesting num_beams > 1 for alora From de4b88677e688ef08a8a057cd063abce3c9ee8f6 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Wed, 20 Aug 2025 02:50:34 +0000 Subject: [PATCH 75/99] offsets_change --- examples/alora_finetuning/alora_finetuning.py | 2 +- src/peft/tuners/lora/model.py | 10 +++++----- src/peft/tuners/lora/variants.py | 9 ++------- tests/test_lora_variants.py | 15 ++++++++------- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/examples/alora_finetuning/alora_finetuning.py b/examples/alora_finetuning/alora_finetuning.py index 67d1033c7e..fb7073d6f0 100644 --- a/examples/alora_finetuning/alora_finetuning.py +++ b/examples/alora_finetuning/alora_finetuning.py @@ -178,7 +178,7 @@ def model_inference(model_path: str, adapter_path: str, prompt: str = None, data # Print results print(f"Prompt: {text}") response = tokenizer.decode(alora_outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True) - print(f"Trained adapter response: {response}") # {tokenizer.decode(alora_outputs[0]).rsplit(text, 1)[1]}") + print(f"Trained adapter response: {response}") if __name__ == "__main__": diff --git a/src/peft/tuners/lora/model.py b/src/peft/tuners/lora/model.py index 7d38e89d72..bdf235cb0a 100644 --- a/src/peft/tuners/lora/model.py +++ b/src/peft/tuners/lora/model.py @@ -466,7 +466,11 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): pre_forward = partial(_alora_offsets_pre_forward_hook, alora_offsets=alora_offsets) handle = layer.register_forward_pre_hook(pre_forward, with_kwargs=True) hook_handles.append(handle) - + num_beams = kwargs.get("num_beams", None) + uses_beam_search = isinstance(num_beams, int) and (num_beams > 1) + if uses_beam_search: + if alora_offsets is not None: + raise ValueError("Beam search not yet supported for aLoRA.") if adapter_names is not None: if self.training: raise ValueError("Cannot pass `adapter_names` when the model is in training mode.") @@ -487,12 +491,8 @@ def _enable_peft_forward_hooks(self, *args, **kwargs): ) # deal with beam search - num_beams = kwargs.get("num_beams", None) - uses_beam_search = isinstance(num_beams, int) and (num_beams > 1) original_adapter_names = adapter_names[:] if uses_beam_search: - if alora_offsets is not None: - raise ValueError("Beam search not yet supported for aLoRA.") if not isinstance(adapter_names, (list, tuple)): raise TypeError(f"Got adapter names of type {type(adapter_names)}, expected a list of str.") # When there is beam search, the inputs are repeated n times, thus we repeat each adapter name n times and diff --git 
a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index d7be192c2d..54e0367231 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -470,7 +470,6 @@ def forward( lora_B = module.lora_B[active_adapter] dropout = module.lora_dropout[active_adapter] scaling = module.scaling[active_adapter] - x = x.to(lora_A.weight.dtype) if x.dim() == 2: # If x is 2-dimensional (unusual but comes up in certain tests), this means that for all inputs, @@ -554,7 +553,7 @@ def calculate_alora_offsets( best_match_start_idx = idx if best_match_start_idx != -1: - offset_val = seq_len - best_match_start_idx + 1 + offset_val = seq_len - best_match_start_idx alora_offsets[i] = offset_val if offset_val > 0 else None else: # Invocation sequence not found in input alora_offsets[i] = None @@ -581,7 +580,7 @@ def is_alora_relevant_in_batch(model: nn.Module, adapter_names: Optional[list[st def get_alora_offsets_for_forward( - model: nn.Module, input_ids: torch.Tensor, inputs_embeds: torch.Tensor = None, **kwargs + model: nn.Module, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, **kwargs ): """ Wrapper around calculate_alora_offsets, for the .forward of the model. It only calculates alora_offsets if the @@ -637,10 +636,6 @@ def get_alora_offsets_for_generate(model: nn.module, *args, **kwargs): current_input_ids, adapter_names=adapter_names_for_offset_calc, ) - # Subtract 1 from offsets for generate, due to position difference between "forward" and generate forward pass - for i in range(len(calculated_offsets)): - if calculated_offsets[i] is not None: - calculated_offsets[i] -= 1 kwargs["alora_offsets"] = calculated_offsets else: diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index 37d0eb19fd..a9685c9ff1 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -29,6 +29,7 @@ DoraLinearVariant, calculate_alora_offsets, get_alora_offsets_for_forward, + get_alora_offsets_for_generate, ) @@ -72,7 +73,7 @@ def __init__(self, vocab_size: int = 10, hidden_dim: int = 8): self.embed = nn.Embedding(vocab_size, hidden_dim) self.linear = nn.Linear(hidden_dim, vocab_size) - def forward(self, X=None, embeds=None, num_beams=None): + def forward(self, X=None, embeds=None, num_beams=None, alora_offsets=None): if X is not None: embeds = self.embed(X) return self.linear(embeds) @@ -177,7 +178,7 @@ def test_calculate_alora_offsets(): # second row lacks invocation sequence -> None offset offsets = calculate_alora_offsets(peft_config, "default", input_ids) - assert offsets[0] == 4 + assert offsets[0] == 3 assert offsets[1] is None @@ -190,7 +191,7 @@ def test_calculate_alora_offsets_with_adapter_names(): offsets = calculate_alora_offsets(peft_config, "a1", input_ids, adapter_names=["a1", "a2"]) - assert offsets == [2, 2] + assert offsets == [1, 1] # Verify that the adapter does not modify outputs prior to invocation point @@ -202,7 +203,7 @@ def test_alora_activation_matches_base_until_invocation(): lora_model.eval() input_ids = torch.tensor([[0, 1, 2, 3]]) - start = 1 + start = 2 with lora_model.disable_adapter(): with torch.no_grad(): base_out = lora_model(X=input_ids) @@ -225,10 +226,10 @@ def test_input_embeds_warning(): input_ids = torch.tensor([[0, 1, 2, 3]]) input_embeds = base_model.embed(input_ids) with pytest.warns(UserWarning): - kwargs = get_alora_offsets_for_forward(lora_model, input_embeds=input_embeds) + kwargs = get_alora_offsets_for_forward(lora_model, inputs_embeds=input_embeds) assert 
kwargs.get("alora_offsets") is None with pytest.warns(UserWarning): - kwargs = get_alora_offsets_for_generate(lora_model, input_embeds=input_embeds) + kwargs = get_alora_offsets_for_generate(lora_model, inputs_embeds=input_embeds) assert kwargs.get("alora_offsets") is None @@ -243,4 +244,4 @@ def test_num_beams_error(): input_ids = torch.tensor([[0, 1, 2, 3]]) with pytest.raises(ValueError): with torch.no_grad(): - lora_out = lora_model(X=input_ids, num_beams=2) + lora_out = lora_model(X=input_ids, num_beams=2, alora_offsets=[3]) From 5bba212107903bf51ec7d8bdec5c53bb67ba73d9 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 1 Sep 2025 08:57:25 -0400 Subject: [PATCH 76/99] Update pyproject.toml --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index bb3e810377..4f09d81e44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,6 @@ ignore = [ "E501", # Line length (handled by ruff-format) "F841", # unused variable "UP007", # X | Y style Unions - "UP045", # X | Y style Optionals "C420", # dict.fromkeys "UP045", # don't force replacing Optional[X] with X | None ] From 3bd61960449f56afa39e6c190d7e70d45560bab9 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 1 Sep 2025 09:26:07 -0400 Subject: [PATCH 77/99] Update test_lora_variants.py --- tests/test_lora_variants.py | 159 +++++++++++++++++++----------------- 1 file changed, 83 insertions(+), 76 deletions(-) diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index a9685c9ff1..64519fbbd8 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -168,80 +168,87 @@ def test_dora_params_have_gradients(self): for layer in layer_names: assert getattr(peft_model.base_model.model, layer).lora_magnitude_vector["default"].weight.grad is not None - -# Make sure None is set when invocation sequence is not present -def test_calculate_alora_offsets(): - config = LoraConfig(alora_invocation_tokens=[1, 2]) - peft_config = {"default": config} - input_ids = torch.tensor([[0, 1, 2, 3], [0, 4, 5, 6]]) - - # second row lacks invocation sequence -> None offset - offsets = calculate_alora_offsets(peft_config, "default", input_ids) - - assert offsets[0] == 3 - assert offsets[1] is None - - -# Verify alora_offsets are correct with multiple adapters -def test_calculate_alora_offsets_with_adapter_names(): - cfg1 = LoraConfig(alora_invocation_tokens=[1]) - cfg2 = LoraConfig(alora_invocation_tokens=[2]) - peft_config = {"a1": cfg1, "a2": cfg2} - input_ids = torch.tensor([[0, 1, 1], [0, 2, 2]]) - - offsets = calculate_alora_offsets(peft_config, "a1", input_ids, adapter_names=["a1", "a2"]) - - assert offsets == [1, 1] - - -# Verify that the adapter does not modify outputs prior to invocation point -def test_alora_activation_matches_base_until_invocation(): - transformers_class = MockTransformerWrapper - base_model = transformers_class.from_pretrained() - cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) - lora_model = get_peft_model(base_model, cfg) - lora_model.eval() - - input_ids = torch.tensor([[0, 1, 2, 3]]) - start = 2 - with lora_model.disable_adapter(): - with torch.no_grad(): - base_out = lora_model(X=input_ids) - - kwargs = get_alora_offsets_for_forward(lora_model, input_ids) - with torch.no_grad(): - lora_out = lora_model(X=input_ids, **kwargs) - assert torch.allclose(lora_out[:, :start], base_out[:, :start]) - assert not torch.allclose(lora_out[:, start:], base_out[:, start:]) - - -# Verify that warning is given for 
alora when providing embeddings only -def test_input_embeds_warning(): - transformers_class = MockTransformerWrapper - base_model = transformers_class.from_pretrained() - cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) - lora_model = get_peft_model(base_model, cfg) - lora_model.eval() - - input_ids = torch.tensor([[0, 1, 2, 3]]) - input_embeds = base_model.embed(input_ids) - with pytest.warns(UserWarning): - kwargs = get_alora_offsets_for_forward(lora_model, inputs_embeds=input_embeds) - assert kwargs.get("alora_offsets") is None - with pytest.warns(UserWarning): - kwargs = get_alora_offsets_for_generate(lora_model, inputs_embeds=input_embeds) - assert kwargs.get("alora_offsets") is None - - -# Verify that error is raised when requesting num_beams > 1 for alora -def test_num_beams_error(): - transformers_class = MockTransformerWrapper - base_model = transformers_class.from_pretrained() - cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) - lora_model = get_peft_model(base_model, cfg) - lora_model.eval() - - input_ids = torch.tensor([[0, 1, 2, 3]]) - with pytest.raises(ValueError): +class TestActivatedLora: + @pytest.mark.parametrize('input_ids, alora_invocation_tokens, expected_offsets', [ + ([[0, 1, 2, 3], [0, 4, 5, 6]], [1, 2], [3, None]), + ([[1, 2, 1, 2], [0, 4, 1, 2]], [1, 2], [2, 2]), + ([[1, 2, 3, 4], [0, 4, 1, 4]], [1, 2], [4, None]), + ([[1, 2, 3, 4]], None, [None]), + ]) + # Verify alora_offsets are calculated correctly + def test_calculate_alora_offsets(input_ids, alora_invocation_tokens, expected_offsets): + config = LoraConfig(alora_invocation_tokens=alora_invocation_tokens) + peft_config = {"default": config} + + # compute offsets + offsets = calculate_alora_offsets(peft_config, "default", torch.tensor(input_ids)) + + assert offsets == expected_offsets + + @pytest.mark.parametrize('input_ids, alora_invocations, expected_offsets', [ + ([[0, 1, 1], [0, 2, 2]], {"a1": [1], "a2": [2]}, [1, 1]), + ([[0, 1, 1], [0, 2, 2]], {"a1": [1], "a2": None}, [1, None]), + ]) + # Verify alora_offsets are correct with adapter names + def test_calculate_alora_offsets_with_adapter_names(): + peft_config = {} + for alora_name in alora_invocations.keys(): + peft_config[alora_name] = LoraConfig(alora_invocations[alora_name]) + + adapter_names = list(alora_invocations.keys()) + offsets = calculate_alora_offsets(peft_config, adapter_names[0], torch.tensor(input_ids), adapter_names=adapter_names) + + assert offsets == expected_offsets + + + # Verify that the adapter does not modify outputs prior to invocation point + def test_alora_activation_matches_base_until_invocation(): + transformers_class = MockTransformerWrapper + base_model = transformers_class.from_pretrained() + cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) + lora_model = get_peft_model(base_model, cfg) + lora_model.eval() + + input_ids = torch.tensor([[0, 1, 2, 3]]) + start = 2 + with lora_model.disable_adapter(): + with torch.no_grad(): + base_out = lora_model(X=input_ids) + + kwargs = get_alora_offsets_for_forward(lora_model, input_ids) with torch.no_grad(): - lora_out = lora_model(X=input_ids, num_beams=2, alora_offsets=[3]) + lora_out = lora_model(X=input_ids, **kwargs) + assert torch.allclose(lora_out[:, :start], base_out[:, :start]) + assert not torch.allclose(lora_out[:, start:], base_out[:, start:]) + + + # Verify that warning is given for alora when providing embeddings only + def 
test_input_embeds_warning(): + transformers_class = MockTransformerWrapper + base_model = transformers_class.from_pretrained() + cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) + lora_model = get_peft_model(base_model, cfg) + lora_model.eval() + + input_ids = torch.tensor([[0, 1, 2, 3]]) + input_embeds = base_model.embed(input_ids) + with pytest.warns(UserWarning): + kwargs = get_alora_offsets_for_forward(lora_model, inputs_embeds=input_embeds) + assert kwargs.get("alora_offsets") is None + with pytest.warns(UserWarning): + kwargs = get_alora_offsets_for_generate(lora_model, inputs_embeds=input_embeds) + assert kwargs.get("alora_offsets") is None + + + # Verify that error is raised when requesting num_beams > 1 for alora + def test_num_beams_error(): + transformers_class = MockTransformerWrapper + base_model = transformers_class.from_pretrained() + cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) + lora_model = get_peft_model(base_model, cfg) + lora_model.eval() + + input_ids = torch.tensor([[0, 1, 2, 3]]) + with pytest.raises(ValueError): + with torch.no_grad(): + lora_out = lora_model(X=input_ids, num_beams=2, alora_offsets=[3]) From b541cff56e363060a7685c323182f915df7c0649 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 1 Sep 2025 09:30:34 -0400 Subject: [PATCH 78/99] Update test_lora_variants.py --- tests/test_lora_variants.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index 64519fbbd8..ad3b0a1c57 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -249,6 +249,7 @@ def test_num_beams_error(): lora_model.eval() input_ids = torch.tensor([[0, 1, 2, 3]]) - with pytest.raises(ValueError): + with pytest.raises(ValueError) as e: with torch.no_grad(): lora_out = lora_model(X=input_ids, num_beams=2, alora_offsets=[3]) + assert "num_beams is not supported" in str(e.value) From 783cf909da019b03fb89903354f43f91de958503 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 1 Sep 2025 11:23:07 -0400 Subject: [PATCH 79/99] Update test_lora_variants.py --- tests/test_lora_variants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index ad3b0a1c57..3c996d9fd0 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -232,10 +232,10 @@ def test_input_embeds_warning(): input_ids = torch.tensor([[0, 1, 2, 3]]) input_embeds = base_model.embed(input_ids) - with pytest.warns(UserWarning): + with pytest.warns(UserWarning, match="Cannot calculate aLoRA offsets when only inputs_embeds are provided. Disabling aLoRA for this forward pass."): kwargs = get_alora_offsets_for_forward(lora_model, inputs_embeds=input_embeds) assert kwargs.get("alora_offsets") is None - with pytest.warns(UserWarning): + with pytest.warns(UserWarning, match="Cannot calculate aLoRA offsets during generate as input_ids are not available. 
Disabling aLoRA."): kwargs = get_alora_offsets_for_generate(lora_model, inputs_embeds=input_embeds) assert kwargs.get("alora_offsets") is None From 92e1305962516b34913405ad4b2f0d5adc738b55 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 1 Sep 2025 11:32:06 -0400 Subject: [PATCH 80/99] Update test_custom_models.py --- tests/test_custom_models.py | 45 ++++++++----------------------------- 1 file changed, 9 insertions(+), 36 deletions(-) diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index c9e2c1a3c1..765d8c3599 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -93,35 +93,6 @@ LoraConfig, {"target_modules": "lin1", "use_dora": True, "lora_alpha": 32}, ), - # Activated LoRA (aLoRA) - ( - "Vanilla MLP 9 Activated LoRA (aLoRA)", - "MLP", - LoraConfig, - { - "target_modules": ["lin0"], - "alora_invocation_tokens": [1, 2, 3], # placeholder, not important for tests in this file - }, - ), - ( - "Vanilla MLP 10 Activated LoRA (aLoRA)", - "MLP", - LoraConfig, - { - "target_modules": ["lin0", "lin1"], - "alora_invocation_tokens": [1, 2, 3], # placeholder, not important for tests in this file - }, - ), - ( - "Vanilla MLP 11 Activated LoRA (aLoRA)", - "MLP", - LoraConfig, - { - "target_modules": "lin1", - "alora_invocation_tokens": [1, 2, 3], # placeholder, not important for tests in this file - "lora_alpha": 32, - }, - ), ("Embedding + transformers Conv1D 1 LoRA", "EmbConv1D", LoraConfig, {"target_modules": ["conv1d"]}), ("Embedding + transformers Conv1D 2 LoRA", "EmbConv1D", LoraConfig, {"target_modules": ["emb"]}), ("Embedding + transformers Conv1D 3 LoRA", "EmbConv1D", LoraConfig, {"target_modules": ["emb", "conv1d"]}), @@ -1738,7 +1709,7 @@ def test_forward_float16(self, test_name, model_id, config_cls, config_kwargs): # check that none of this raises an error model(**X) - if model_id in ["Conv2dGroups", "Conv2dGroups2"] or config_kwargs.get("alora_invocation_tokens") is not None: + if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return @@ -1780,7 +1751,7 @@ def test_forward_bfloat16(self, test_name, model_id, config_cls, config_kwargs): # check that none of this raises an error model(**X) - if model_id in ["Conv2dGroups", "Conv2dGroups2"] or config_kwargs.get("alora_invocation_tokens") is not None: + if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return @@ -1821,7 +1792,7 @@ def test_forward_float16_no_autocast(self, test_name, model_id, config_cls, conf # check that none of this raises an error model(**X) - if model_id in ["Conv2dGroups", "Conv2dGroups2"] or config_kwargs.get("alora_invocation_tokens") is not None: + if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return @@ -1862,9 +1833,10 @@ def test_forward_bfloat16_no_autocast(self, test_name, model_id, config_cls, con # check that none of this raises an error model(**X) - if model_id in ["Conv2dGroups", "Conv2dGroups2"] or config_kwargs.get("alora_invocation_tokens") is not None: + if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return + model.merge_adapter(safe_merge=False) model(**X) model.unmerge_adapter() @@ -2041,9 +2013,7 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co pytest.skip( f"Skipping test for {model_id} as merging is not supported. 
(See https://github.com/huggingface/peft/pull/2403 for details)" ) - if config_kwargs.get("alora_invocation_tokens") is not None: - # Merge layers not supported for Activated LoRA (aLoRA) - pytest.skip("Test not applicable for Activated LoRA") + # same as test_disable_adapters, but with merging X = self.prepare_inputs_for_testing() model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) @@ -2129,6 +2099,8 @@ def test_disable_adapter_with_bias_warns(self, test_name, model_id, config_cls, if config_cls != LoraConfig or config_cls != BOFTConfig: # skip this test for other configs as bias is specific to Lora pytest.skip("Testing bias warnings only for LoraConfig or BOFTConfig") + if not issubclass(config_cls, (LoraConfig, BOFTConfig)): + pytest.skip("Bias argument is only supported for LoRA or BOFT models") def run_with_disable(config_kwargs, bias): config_kwargs = config_kwargs.copy() @@ -2149,6 +2121,7 @@ def run_with_disable(config_kwargs, bias): run_with_disable(config_kwargs, bias="lora_only") with pytest.warns(UserWarning, match=msg_start): run_with_disable(config_kwargs, bias="all") + if config_cls == BOFTConfig: # check that bias=all and bias=boft_only give a warning with the correct message msg_start = "Careful, disabling adapter layers with bias configured to be" From e536b1a588b73e40901cdbbf3314867e30384c65 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 1 Sep 2025 11:37:37 -0400 Subject: [PATCH 81/99] Update test_decoder_models.py --- tests/test_decoder_models.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index 8131907c4d..c61c7e976f 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -165,6 +165,19 @@ "alora_invocation_tokens": [1], }, ), + ( + LoraConfig, + { + "task_type": "CAUSAL_LM", + "r": 8, + "lora_alpha": 32, + "target_modules": None, + "lora_dropout": 0.05, + "bias": "none", + # not one test input sequence will ever have this token, this should do nothing at all + "alora_invocation_tokens": [1000], + }, + ), # LoRA + trainable tokens ( LoraConfig, @@ -285,6 +298,9 @@ def _skip_adalora_oft_hra_bone_for_gpt2(model_id, config_cls): ]: pytest.skip("Skipping AdaLora/BOFT/HRA/OFT/Bone/MiSS for GPT2LMHeadModel") +def _skip_alora_no_activation(config_cls, config_kwargs): + if config_cls is LoraConfig and config_kwargs.get("alora_invocation_tokens") == [1000]: + pytest.skip("Skipping aLoRA no-activation-case because the test expects changed output which there won't be.") class TestDecoderModels(PeftCommonTester): transformers_class = AutoModelForCausalLM @@ -424,6 +440,7 @@ def test_merge_layers_nan(self, model_id, config_cls, config_kwargs): def test_mixed_adapter_batches(self, model_id, config_cls, config_kwargs): if config_cls != LoraConfig: pytest.skip("Mixed adapter batches not supported for this config.") + _skip_alora_no_activation(config_cls, config_kwargs) config_kwargs = set_init_weights_false(config_cls, config_kwargs) self._test_mixed_adapter_batches(model_id, config_cls, config_kwargs.copy()) @@ -513,6 +530,7 @@ def test_adding_multiple_adapters_with_bias_raises(self, model_id, config_cls, c def test_unload_adapter(self, model_id, config_cls, config_kwargs): _skip_adalora_oft_hra_bone_for_gpt2(model_id, config_cls) _skip_if_not_conv1d_supported(model_id, config_cls) + _skip_alora_no_activation(config_cls, config_kwargs) config_kwargs = set_init_weights_false(config_cls, config_kwargs) self._test_unload_adapter(model_id, 
config_cls, config_kwargs.copy()) @@ -531,6 +549,7 @@ def test_training_prompt_learning_tasks(self, model_id, config_cls, config_kwarg @pytest.mark.parametrize("config_cls,config_kwargs", ALL_CONFIGS) def test_disable_adapter(self, model_id, config_cls, config_kwargs): _skip_if_not_conv1d_supported(model_id, config_cls) + _skip_alora_no_activation(config_cls, config_kwargs) config_kwargs = set_init_weights_false(config_cls, config_kwargs) self._test_disable_adapter(model_id, config_cls, config_kwargs.copy()) From 43a2fc2468f03846e2f2d110b0fcd2a2c3396232 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 1 Sep 2025 11:47:32 -0400 Subject: [PATCH 82/99] Update variants.py --- src/peft/tuners/lora/variants.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 54e0367231..b879f6edd9 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -471,20 +471,19 @@ def forward( dropout = module.lora_dropout[active_adapter] scaling = module.scaling[active_adapter] x = x.to(lora_A.weight.dtype) - if x.dim() == 2: - # If x is 2-dimensional (unusual but comes up in certain tests), this means that for all inputs, - # there is only 1 token position being processed and we should adapt its weights. - result = result + lora_B(lora_A(dropout(x))) * scaling - else: # Typical regime - if alora_offsets is not None: - for i in range(result.shape[0]): - # If alora_offsets[i] is None, this means that the invocation sequence was not found in the - # input. As a result, the weights should not be activated anywhere (equivalent to base model). - if alora_offsets[i] is not None and alora_offsets[i] > 0: - offset = min(alora_offsets[i], result.shape[1]) - result[i, -offset:, :] = ( - result[i, -offset:, :] + lora_B(lora_A(dropout(x[i, -offset:, :]))) * scaling - ) + if alora_offsets is None or all(x is None for x in alora_offsets): + # make a cheap dummy calculation to avoid training scenario where we did not match any offset + # and therefore won't adapt but are training and expecting gradients (which would lead to an exception). + result += (lora_A.weight[0, 0] - lora_A.weight[0, 0]) + (lora_B.weight[0,0] - lora_B.weight[0, 0]) + else: + for i in range(result.shape[0]): + # If alora_offsets[i] is None, this means that the invocation sequence was not found in the + # input. As a result, the weights should not be activated anywhere (equivalent to base model). + if alora_offsets[i] is not None and alora_offsets[i] > 0: + offset = min(alora_offsets[i], result.shape[1]) + result[i, -offset:, :] = ( + result[i, -offset:, :] + lora_B(lora_A(dropout(x[i, -offset:, :]))) * scaling + ) return result From ea964fd125d07cea07c0fa422d24c462ab3e63c5 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 1 Sep 2025 15:47:14 -0400 Subject: [PATCH 83/99] Update variants.py --- src/peft/tuners/lora/variants.py | 40 +++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index b879f6edd9..174de5d2a1 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -476,15 +476,37 @@ def forward( # and therefore won't adapt but are training and expecting gradients (which would lead to an exception). 
result += (lora_A.weight[0, 0] - lora_A.weight[0, 0]) + (lora_B.weight[0,0] - lora_B.weight[0, 0]) else: - for i in range(result.shape[0]): - # If alora_offsets[i] is None, this means that the invocation sequence was not found in the - # input. As a result, the weights should not be activated anywhere (equivalent to base model). - if alora_offsets[i] is not None and alora_offsets[i] > 0: - offset = min(alora_offsets[i], result.shape[1]) - result[i, -offset:, :] = ( - result[i, -offset:, :] + lora_B(lora_A(dropout(x[i, -offset:, :]))) * scaling - ) - + # for i in range(result.shape[0]): + # # If alora_offsets[i] is None, this means that the invocation sequence was not found in the + # # input. As a result, the weights should not be activated anywhere (equivalent to base model). + # if alora_offsets[i] is not None and alora_offsets[i] > 0: + # offset = min(alora_offsets[i], result.shape[1]) + # result[i, -offset:, :] = ( + # result[i, -offset:, :] + lora_B(lora_A(dropout(x[i, -offset:, :]))) * scaling + # ) + result_shape = result.shape + T = result_shape[-2] #tokens + D = result_shape[-1] #dimensions + device = result.device + # If alora_offsets[i] is None, this means that the invocation sequence was not found in the + # input. As a result, the weights should not be activated anywhere (equivalent to base model). + # Convert None -> 0 and clip to [0, T] + offsets = torch.tensor( + [0 if o is None else min(int(o), T) for o in alora_offsets], + device=device, + dtype=torch.long, + ) + # Mask True on the last `offsets[i]` positions for each row i + pos = torch.arange(T, device=device).unsqueeze(0) # [1, T] + mask = pos >= (T - offsets).unsqueeze(1) + + # Flatten for vectorization + x_flat = x.view(-1, D) + res_flat = result.view(-1, D) + mask_flat = mask.view(-1) + + # Compute adapter on the selected tokens only + res_flat[mask_flat] += lora_B(lora_A(dropout(x_flat[mask_flat]))) * scaling return result From 7bf294302b9c724f4140edac14e2381a9b9906ae Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 1 Sep 2025 18:30:02 -0400 Subject: [PATCH 84/99] Update test_gpu_examples.py --- tests/test_gpu_examples.py | 66 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py index c2b5a37da4..7d366f8dfa 100644 --- a/tests/test_gpu_examples.py +++ b/tests/test_gpu_examples.py @@ -4851,7 +4851,73 @@ def test_eva_initialization_consistency(self, model_fixture, dataset, peft_confi f"Mean absolute cosine similarity {mean_cosine_similarity:.4f} " f"is not greater than {self.COSINE_SIMILARITY_THRESHOLD}" ) +class TestALoRAInferenceGPU: + """GPU inference for Activated LoRA.""" + # Constants for test configuration + NUM_SEEDS = 3 + LORA_DIM = 8 + LORA_ALPHA = 1 + DEVICE = infer_device() + + @pytest.fixture + def tokenizer(self): + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + @pytest.fixture + def model(self): + model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") + model.model.decoder.layers = model.model.decoder.layers[:2] # truncate to 2 layers + return model.to(self.DEVICE) + + @pytest.fixture + def model_bnb(self): + bnb_config = BitsAndBytesConfig(load_in_4bit=True) + model = AutoModelForCausalLM.from_pretrained( + "facebook/opt-125m", + quantization_config=bnb_config, + ) + model.model.decoder.layers = model.model.decoder.layers[:2] # truncate to 2 layers + model = prepare_model_for_kbit_training(model) + return model + + 
@pytest.fixture + def peft_config(self): + return LoraConfig( + r=self.LORA_DIM, + task_type="CAUSAL_LM", + lora_alpha=self.LORA_ALPHA, + target_modules=["q_proj"], + alora_invocation_tokens=[2], #id for + init_lora_weights=False, + ) + + @require_non_cpu + @require_bitsandbytes + @pytest.mark.single_gpu_tests + def test_alora_forward_consistency(self, peft_config): + """Test that the forwards of the model with adapter are similar across quantizations.""" + for seed in range(self.NUM_SEEDS): + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + peft_model = get_peft_model(deepcopy(model), peft_config) + torch.manual_seed(seed) + random.seed(seed) + np.random.seed(seed) + peft_model_bnb = get_peft_model(deepcopy(model_bnb), peft_config) + peft_model.eval() + peft_model_bnb.eval() + input_ids = torch.tensor([[0, 1, 2, 3]]).to(DEVICE) + with torch.no_grad(): + peft_out = peft_model(input_ids = input_ids) + peft_out_bnb = peft_model_bnb(input_ids = input_ids) + a = peft_out.detach().to(torch.float32).cpu() + b = peft_out_bnb.detach().to(torch.float32).cpu() + assert torch.allclose(a, b, rtol=1e-1, atol=2e-2) + @pytest.mark.multi_gpu_tests class TestPrefixTuning: From c1e6a397c4122980bad64f36ec49e3ad003a1597 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Tue, 2 Sep 2025 00:10:33 +0000 Subject: [PATCH 85/99] latest requests --- src/peft/tuners/lora/variants.py | 21 +++------ tests/test_custom_models.py | 6 +-- tests/test_decoder_models.py | 2 + tests/test_gpu_examples.py | 23 ++++++---- tests/test_lora_variants.py | 78 ++++++++++++++++++-------------- 5 files changed, 71 insertions(+), 59 deletions(-) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 174de5d2a1..99fa50a457 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -474,19 +474,12 @@ def forward( if alora_offsets is None or all(x is None for x in alora_offsets): # make a cheap dummy calculation to avoid training scenario where we did not match any offset # and therefore won't adapt but are training and expecting gradients (which would lead to an exception). - result += (lora_A.weight[0, 0] - lora_A.weight[0, 0]) + (lora_B.weight[0,0] - lora_B.weight[0, 0]) + result += (lora_A.weight[0, 0] - lora_A.weight[0, 0]) + (lora_B.weight[0, 0] - lora_B.weight[0, 0]) else: - # for i in range(result.shape[0]): - # # If alora_offsets[i] is None, this means that the invocation sequence was not found in the - # # input. As a result, the weights should not be activated anywhere (equivalent to base model). - # if alora_offsets[i] is not None and alora_offsets[i] > 0: - # offset = min(alora_offsets[i], result.shape[1]) - # result[i, -offset:, :] = ( - # result[i, -offset:, :] + lora_B(lora_A(dropout(x[i, -offset:, :]))) * scaling - # ) result_shape = result.shape - T = result_shape[-2] #tokens - D = result_shape[-1] #dimensions + T = result_shape[-2] # tokens + D = result_shape[-1] # dimensions + Dx = x.shape[-1] device = result.device # If alora_offsets[i] is None, this means that the invocation sequence was not found in the # input. As a result, the weights should not be activated anywhere (equivalent to base model). 
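The variants.py hunks around this point rework how per-sample offsets become a boolean token mask and how the low-rank update is applied in one vectorized step. A self-contained sketch of that idea is below; shapes, random tensors, and the plain matrices standing in for `lora_A`/`lora_B` are made up for illustration, so this is not the library code:

```python
# Sketch: apply a low-rank update only to the last `offset` token positions of each sequence.
import torch

B, T, D, r = 2, 5, 4, 2                      # batch, tokens, hidden dim, rank (illustrative)
x = torch.randn(B, T, D)
result = x.clone()                           # stand-in for the base layer output
A = torch.randn(r, D)                        # stand-in for lora_A
Bm = torch.randn(D, r)                       # stand-in for lora_B
scaling = 1.0
alora_offsets = [3, None]                    # None -> invocation not found -> row stays unadapted

# Convert None -> 0 and clip to [0, T]
offsets = torch.tensor([0 if o is None else min(int(o), T) for o in alora_offsets])
pos = torch.arange(T).unsqueeze(0)           # [1, T]
mask = pos >= (T - offsets).unsqueeze(1)     # True on the last offsets[i] positions of row i

# Flatten batch and token dims, update only the masked rows.
x_flat = x.reshape(-1, D)
res_flat = result.reshape(-1, D)
mask_flat = mask.reshape(-1)
res_flat[mask_flat] += (x_flat[mask_flat] @ A.T @ Bm.T) * scaling
result = res_flat.reshape(B, T, D)
```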
@@ -497,14 +490,14 @@ def forward( dtype=torch.long, ) # Mask True on the last `offsets[i]` positions for each row i - pos = torch.arange(T, device=device).unsqueeze(0) # [1, T] + pos = torch.arange(T, device=device).unsqueeze(0) # [1, T] mask = pos >= (T - offsets).unsqueeze(1) # Flatten for vectorization - x_flat = x.view(-1, D) + x_flat = x.view(-1, Dx) res_flat = result.view(-1, D) mask_flat = mask.view(-1) - + # Compute adapter on the selected tokens only res_flat[mask_flat] += lora_B(lora_A(dropout(x_flat[mask_flat]))) * scaling return result diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py index 765d8c3599..1dd531a081 100644 --- a/tests/test_custom_models.py +++ b/tests/test_custom_models.py @@ -1836,7 +1836,7 @@ def test_forward_bfloat16_no_autocast(self, test_name, model_id, config_cls, con if model_id in ["Conv2dGroups", "Conv2dGroups2"]: # this model does not support merging return - + model.merge_adapter(safe_merge=False) model(**X) model.unmerge_adapter() @@ -2013,7 +2013,7 @@ def test_disable_adapters_with_merging(self, test_name, model_id, config_cls, co pytest.skip( f"Skipping test for {model_id} as merging is not supported. (See https://github.com/huggingface/peft/pull/2403 for details)" ) - + # same as test_disable_adapters, but with merging X = self.prepare_inputs_for_testing() model = self.transformers_class.from_pretrained(model_id).to(self.torch_device) @@ -2121,7 +2121,7 @@ def run_with_disable(config_kwargs, bias): run_with_disable(config_kwargs, bias="lora_only") with pytest.warns(UserWarning, match=msg_start): run_with_disable(config_kwargs, bias="all") - + if config_cls == BOFTConfig: # check that bias=all and bias=boft_only give a warning with the correct message msg_start = "Careful, disabling adapter layers with bias configured to be" diff --git a/tests/test_decoder_models.py b/tests/test_decoder_models.py index c61c7e976f..2c0c402b1c 100644 --- a/tests/test_decoder_models.py +++ b/tests/test_decoder_models.py @@ -298,10 +298,12 @@ def _skip_adalora_oft_hra_bone_for_gpt2(model_id, config_cls): ]: pytest.skip("Skipping AdaLora/BOFT/HRA/OFT/Bone/MiSS for GPT2LMHeadModel") + def _skip_alora_no_activation(config_cls, config_kwargs): if config_cls is LoraConfig and config_kwargs.get("alora_invocation_tokens") == [1000]: pytest.skip("Skipping aLoRA no-activation-case because the test expects changed output which there won't be.") + class TestDecoderModels(PeftCommonTester): transformers_class = AutoModelForCausalLM diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py index 7d366f8dfa..80c9b70d2b 100644 --- a/tests/test_gpu_examples.py +++ b/tests/test_gpu_examples.py @@ -24,6 +24,7 @@ import numpy as np import pytest +import random import torch from accelerate import infer_auto_device_map from accelerate.test_utils.testing import run_command @@ -4897,26 +4898,30 @@ def peft_config(self): @require_non_cpu @require_bitsandbytes @pytest.mark.single_gpu_tests - def test_alora_forward_consistency(self, peft_config): + def test_alora_forward_consistency(self, model, model_bnb, peft_config): """Test that the forwards of the model with adapter are similar across quantizations.""" for seed in range(self.NUM_SEEDS): torch.manual_seed(seed) - random.seed(seed) + # random.seed(seed) np.random.seed(seed) peft_model = get_peft_model(deepcopy(model), peft_config) torch.manual_seed(seed) - random.seed(seed) + #random.seed(seed) np.random.seed(seed) peft_model_bnb = get_peft_model(deepcopy(model_bnb), peft_config) peft_model.eval() 
peft_model_bnb.eval() - input_ids = torch.tensor([[0, 1, 2, 3]]).to(DEVICE) + input_ids = torch.tensor([[0, 1, 2, 3]]).to(self.DEVICE) with torch.no_grad(): - peft_out = peft_model(input_ids = input_ids) - peft_out_bnb = peft_model_bnb(input_ids = input_ids) - a = peft_out.detach().to(torch.float32).cpu() - b = peft_out_bnb.detach().to(torch.float32).cpu() - assert torch.allclose(a, b, rtol=1e-1, atol=2e-2) + peft_out = peft_model(input_ids = input_ids,return_dict=True,output_hidden_states=True) + peft_out_bnb=peft_model_bnb(input_ids=input_ids,return_dict=True,output_hidden_states=True) + h_fp = peft_out.hidden_states[-1] + h_4b = peft_out_bnb.hidden_states[-1] + a = h_fp.detach().to(torch.float32).cpu() + b = h_4b.detach().to(torch.float32).cpu() + import torch.nn.functional as F + cos = F.cosine_similarity(a.flatten(), b.flatten(), dim=0).item() + assert cos > 0.9 @pytest.mark.multi_gpu_tests diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index 3c996d9fd0..dcab41894c 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -168,88 +168,100 @@ def test_dora_params_have_gradients(self): for layer in layer_names: assert getattr(peft_model.base_model.model, layer).lora_magnitude_vector["default"].weight.grad is not None + class TestActivatedLora: - @pytest.mark.parametrize('input_ids, alora_invocation_tokens, expected_offsets', [ - ([[0, 1, 2, 3], [0, 4, 5, 6]], [1, 2], [3, None]), - ([[1, 2, 1, 2], [0, 4, 1, 2]], [1, 2], [2, 2]), - ([[1, 2, 3, 4], [0, 4, 1, 4]], [1, 2], [4, None]), - ([[1, 2, 3, 4]], None, [None]), - ]) + @pytest.mark.parametrize( + "input_ids, alora_invocation_tokens, expected_offsets", + [ + ([[0, 1, 2, 3], [0, 4, 5, 6]], [1, 2], [3, None]), + ([[1, 2, 1, 2], [0, 4, 1, 2]], [1, 2], [2, 2]), + ([[1, 2, 3, 4], [0, 4, 1, 4]], [1, 2], [4, None]), + ([[1, 2, 3, 4]], None, [None]), + ], + ) # Verify alora_offsets are calculated correctly - def test_calculate_alora_offsets(input_ids, alora_invocation_tokens, expected_offsets): + def test_calculate_alora_offsets(self,input_ids, alora_invocation_tokens, expected_offsets): config = LoraConfig(alora_invocation_tokens=alora_invocation_tokens) peft_config = {"default": config} - + # compute offsets offsets = calculate_alora_offsets(peft_config, "default", torch.tensor(input_ids)) assert offsets == expected_offsets - - @pytest.mark.parametrize('input_ids, alora_invocations, expected_offsets', [ - ([[0, 1, 1], [0, 2, 2]], {"a1": [1], "a2": [2]}, [1, 1]), - ([[0, 1, 1], [0, 2, 2]], {"a1": [1], "a2": None}, [1, None]), - ]) + + @pytest.mark.parametrize( + "input_ids, alora_invocations, expected_offsets", + [ + ([[0, 1, 1], [0, 2, 2]], {"a1": [1], "a2": [2]}, [1, 1]), + ([[0, 1, 1], [0, 2, 2]], {"a1": [1], "a2": None}, [1, None]), + ], + ) # Verify alora_offsets are correct with adapter names - def test_calculate_alora_offsets_with_adapter_names(): + def test_calculate_alora_offsets_with_adapter_names(self,input_ids, alora_invocations, expected_offsets): peft_config = {} for alora_name in alora_invocations.keys(): - peft_config[alora_name] = LoraConfig(alora_invocations[alora_name]) + peft_config[alora_name] = LoraConfig(alora_invocation_tokens=alora_invocations[alora_name]) adapter_names = list(alora_invocations.keys()) - offsets = calculate_alora_offsets(peft_config, adapter_names[0], torch.tensor(input_ids), adapter_names=adapter_names) - + offsets = calculate_alora_offsets( + peft_config, adapter_names[0], torch.tensor(input_ids), adapter_names=adapter_names + ) + assert offsets == 
expected_offsets - - + # Verify that the adapter does not modify outputs prior to invocation point - def test_alora_activation_matches_base_until_invocation(): + def test_alora_activation_matches_base_until_invocation(self): transformers_class = MockTransformerWrapper base_model = transformers_class.from_pretrained() cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) lora_model = get_peft_model(base_model, cfg) lora_model.eval() - + input_ids = torch.tensor([[0, 1, 2, 3]]) start = 2 with lora_model.disable_adapter(): with torch.no_grad(): base_out = lora_model(X=input_ids) - + kwargs = get_alora_offsets_for_forward(lora_model, input_ids) with torch.no_grad(): lora_out = lora_model(X=input_ids, **kwargs) assert torch.allclose(lora_out[:, :start], base_out[:, :start]) assert not torch.allclose(lora_out[:, start:], base_out[:, start:]) - - + # Verify that warning is given for alora when providing embeddings only - def test_input_embeds_warning(): + def test_input_embeds_warning(self): transformers_class = MockTransformerWrapper base_model = transformers_class.from_pretrained() cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) lora_model = get_peft_model(base_model, cfg) lora_model.eval() - + input_ids = torch.tensor([[0, 1, 2, 3]]) input_embeds = base_model.embed(input_ids) - with pytest.warns(UserWarning, match="Cannot calculate aLoRA offsets when only inputs_embeds are provided. Disabling aLoRA for this forward pass."): + with pytest.warns( + UserWarning, + match="Cannot calculate aLoRA offsets when only inputs_embeds are provided. Disabling aLoRA for this forward pass.", + ): kwargs = get_alora_offsets_for_forward(lora_model, inputs_embeds=input_embeds) assert kwargs.get("alora_offsets") is None - with pytest.warns(UserWarning, match="Cannot calculate aLoRA offsets during generate as input_ids are not available. Disabling aLoRA."): + with pytest.warns( + UserWarning, + match="Cannot calculate aLoRA offsets during generate as input_ids are not available. Disabling aLoRA.", + ): kwargs = get_alora_offsets_for_generate(lora_model, inputs_embeds=input_embeds) assert kwargs.get("alora_offsets") is None - - + # Verify that error is raised when requesting num_beams > 1 for alora - def test_num_beams_error(): + def test_num_beams_error(self): transformers_class = MockTransformerWrapper base_model = transformers_class.from_pretrained() cfg = LoraConfig(target_modules=["linear"], alora_invocation_tokens=[2], init_lora_weights=False) lora_model = get_peft_model(base_model, cfg) lora_model.eval() - + input_ids = torch.tensor([[0, 1, 2, 3]]) with pytest.raises(ValueError) as e: with torch.no_grad(): lora_out = lora_model(X=input_ids, num_beams=2, alora_offsets=[3]) - assert "num_beams is not supported" in str(e.value) + assert "Beam search not yet supported for aLoRA." 
in str(e.value) From af761629b4f6b880053b99a27c97d682950a5d8d Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Tue, 2 Sep 2025 00:11:50 +0000 Subject: [PATCH 86/99] latest requests --- tests/test_gpu_examples.py | 28 +++++++++++++++------------- tests/test_lora_variants.py | 4 ++-- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/tests/test_gpu_examples.py b/tests/test_gpu_examples.py index 80c9b70d2b..267a29f0cf 100644 --- a/tests/test_gpu_examples.py +++ b/tests/test_gpu_examples.py @@ -24,7 +24,6 @@ import numpy as np import pytest -import random import torch from accelerate import infer_auto_device_map from accelerate.test_utils.testing import run_command @@ -4852,6 +4851,8 @@ def test_eva_initialization_consistency(self, model_fixture, dataset, peft_confi f"Mean absolute cosine similarity {mean_cosine_similarity:.4f} " f"is not greater than {self.COSINE_SIMILARITY_THRESHOLD}" ) + + class TestALoRAInferenceGPU: """GPU inference for Activated LoRA.""" @@ -4860,7 +4861,7 @@ class TestALoRAInferenceGPU: LORA_DIM = 8 LORA_ALPHA = 1 DEVICE = infer_device() - + @pytest.fixture def tokenizer(self): tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m") @@ -4870,7 +4871,7 @@ def tokenizer(self): @pytest.fixture def model(self): model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m") - model.model.decoder.layers = model.model.decoder.layers[:2] # truncate to 2 layers + model.model.decoder.layers = model.model.decoder.layers[:2] # truncate to 2 layers return model.to(self.DEVICE) @pytest.fixture @@ -4880,7 +4881,7 @@ def model_bnb(self): "facebook/opt-125m", quantization_config=bnb_config, ) - model.model.decoder.layers = model.model.decoder.layers[:2] # truncate to 2 layers + model.model.decoder.layers = model.model.decoder.layers[:2] # truncate to 2 layers model = prepare_model_for_kbit_training(model) return model @@ -4891,7 +4892,7 @@ def peft_config(self): task_type="CAUSAL_LM", lora_alpha=self.LORA_ALPHA, target_modules=["q_proj"], - alora_invocation_tokens=[2], #id for + alora_invocation_tokens=[2], # id for init_lora_weights=False, ) @@ -4902,27 +4903,28 @@ def test_alora_forward_consistency(self, model, model_bnb, peft_config): """Test that the forwards of the model with adapter are similar across quantizations.""" for seed in range(self.NUM_SEEDS): torch.manual_seed(seed) - # random.seed(seed) + # random.seed(seed) np.random.seed(seed) peft_model = get_peft_model(deepcopy(model), peft_config) torch.manual_seed(seed) - #random.seed(seed) + # random.seed(seed) np.random.seed(seed) peft_model_bnb = get_peft_model(deepcopy(model_bnb), peft_config) peft_model.eval() peft_model_bnb.eval() input_ids = torch.tensor([[0, 1, 2, 3]]).to(self.DEVICE) with torch.no_grad(): - peft_out = peft_model(input_ids = input_ids,return_dict=True,output_hidden_states=True) - peft_out_bnb=peft_model_bnb(input_ids=input_ids,return_dict=True,output_hidden_states=True) - h_fp = peft_out.hidden_states[-1] - h_4b = peft_out_bnb.hidden_states[-1] + peft_out = peft_model(input_ids=input_ids, return_dict=True, output_hidden_states=True) + peft_out_bnb = peft_model_bnb(input_ids=input_ids, return_dict=True, output_hidden_states=True) + h_fp = peft_out.hidden_states[-1] + h_4b = peft_out_bnb.hidden_states[-1] a = h_fp.detach().to(torch.float32).cpu() b = h_4b.detach().to(torch.float32).cpu() import torch.nn.functional as F + cos = F.cosine_similarity(a.flatten(), b.flatten(), dim=0).item() - assert cos > 0.9 - + assert cos > 0.9 + 
@pytest.mark.multi_gpu_tests class TestPrefixTuning: diff --git a/tests/test_lora_variants.py b/tests/test_lora_variants.py index dcab41894c..1c2a3c20a2 100644 --- a/tests/test_lora_variants.py +++ b/tests/test_lora_variants.py @@ -180,7 +180,7 @@ class TestActivatedLora: ], ) # Verify alora_offsets are calculated correctly - def test_calculate_alora_offsets(self,input_ids, alora_invocation_tokens, expected_offsets): + def test_calculate_alora_offsets(self, input_ids, alora_invocation_tokens, expected_offsets): config = LoraConfig(alora_invocation_tokens=alora_invocation_tokens) peft_config = {"default": config} @@ -197,7 +197,7 @@ def test_calculate_alora_offsets(self,input_ids, alora_invocation_tokens, expect ], ) # Verify alora_offsets are correct with adapter names - def test_calculate_alora_offsets_with_adapter_names(self,input_ids, alora_invocations, expected_offsets): + def test_calculate_alora_offsets_with_adapter_names(self, input_ids, alora_invocations, expected_offsets): peft_config = {} for alora_name in alora_invocations.keys(): peft_config[alora_name] = LoraConfig(alora_invocation_tokens=alora_invocations[alora_name]) From 1ae715517c1d46bf2ba16f8df80b547083e0aa5a Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 1 Sep 2025 20:15:46 -0400 Subject: [PATCH 87/99] Update variants.py --- src/peft/tuners/lora/variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index 99fa50a457..f6b67c8476 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -485,7 +485,7 @@ def forward( # input. As a result, the weights should not be activated anywhere (equivalent to base model). # Convert None -> 0 and clip to [0, T] offsets = torch.tensor( - [0 if o is None else min(int(o), T) for o in alora_offsets], + [0 if o is None else max(1,min(int(o),T)) for o in alora_offsets], device=device, dtype=torch.long, ) From bd15f77a237f5d541880757569eebcd99531662a Mon Sep 17 00:00:00 2001 From: Greenewald Date: Mon, 1 Sep 2025 20:32:40 -0400 Subject: [PATCH 88/99] Update variants.py --- src/peft/tuners/lora/variants.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index f6b67c8476..b396dc10eb 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -471,16 +471,18 @@ def forward( dropout = module.lora_dropout[active_adapter] scaling = module.scaling[active_adapter] x = x.to(lora_A.weight.dtype) - if alora_offsets is None or all(x is None for x in alora_offsets): - # make a cheap dummy calculation to avoid training scenario where we did not match any offset - # and therefore won't adapt but are training and expecting gradients (which would lead to an exception). 
- result += (lora_A.weight[0, 0] - lora_A.weight[0, 0]) + (lora_B.weight[0, 0] - lora_B.weight[0, 0]) + result_shape = result.shape + B = result_shape[0] # batch + if len(result_shape) == 3: + T = result_shape[1] # tokens + else: + T = 1 + D = result_shape[-1] # dimensions + Dx = x.shape[-1] + device = result.device + if alora_offsets is None: # use base model only, but ensure 0 gradient + mask = torch.zeros((B, T), dtype=torch.bool) else: - result_shape = result.shape - T = result_shape[-2] # tokens - D = result_shape[-1] # dimensions - Dx = x.shape[-1] - device = result.device # If alora_offsets[i] is None, this means that the invocation sequence was not found in the # input. As a result, the weights should not be activated anywhere (equivalent to base model). # Convert None -> 0 and clip to [0, T] @@ -493,13 +495,13 @@ def forward( pos = torch.arange(T, device=device).unsqueeze(0) # [1, T] mask = pos >= (T - offsets).unsqueeze(1) - # Flatten for vectorization - x_flat = x.view(-1, Dx) - res_flat = result.view(-1, D) - mask_flat = mask.view(-1) + # Flatten for vectorization + x_flat = x.view(-1, Dx) + res_flat = result.view(-1, D) + mask_flat = mask.view(-1) - # Compute adapter on the selected tokens only - res_flat[mask_flat] += lora_B(lora_A(dropout(x_flat[mask_flat]))) * scaling + # Compute adapter on the selected tokens only + res_flat[mask_flat] += lora_B(lora_A(dropout(x_flat[mask_flat]))) * scaling return result From 4641d604294fb18d54ca25bb5fd305e5adb321f0 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Tue, 2 Sep 2025 01:45:14 +0000 Subject: [PATCH 89/99] make test --- src/peft/tuners/lora/variants.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index b396dc10eb..f8ff619042 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -480,14 +480,14 @@ def forward( D = result_shape[-1] # dimensions Dx = x.shape[-1] device = result.device - if alora_offsets is None: # use base model only, but ensure 0 gradient + if alora_offsets is None: # use base model only, but ensure 0 gradient mask = torch.zeros((B, T), dtype=torch.bool) else: # If alora_offsets[i] is None, this means that the invocation sequence was not found in the # input. As a result, the weights should not be activated anywhere (equivalent to base model). # Convert None -> 0 and clip to [0, T] offsets = torch.tensor( - [0 if o is None else max(1,min(int(o),T)) for o in alora_offsets], + [0 if o is None else max(1, min(int(o), T)) for o in alora_offsets], device=device, dtype=torch.long, ) From c99cd903a966fc105f13d72f27b857f3c1c28f78 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Tue, 2 Sep 2025 15:43:57 -0400 Subject: [PATCH 90/99] Update variants.py --- src/peft/tuners/lora/variants.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index f8ff619042..b03ef53b7b 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -485,9 +485,9 @@ def forward( else: # If alora_offsets[i] is None, this means that the invocation sequence was not found in the # input. As a result, the weights should not be activated anywhere (equivalent to base model). 
- # Convert None -> 0 and clip to [0, T] + # Convert None -> 0 and clip to T offsets = torch.tensor( - [0 if o is None else max(1, min(int(o), T)) for o in alora_offsets], + [0 if o is None else min(int(o), T) for o in alora_offsets], device=device, dtype=torch.long, ) @@ -510,10 +510,10 @@ def calculate_alora_offsets( ) -> list[int]: """ This is a helper function for Activated LoRA (aLoRA) that searches each input token sequence for the last occurence - of the appropriate "alora_invocation_tokens" invocation sequence. If adapter_names is passed, then each input uses - the appropriate invocation sequence for the specified adapter for that row. Logic is provided to handle mixed - collections of adapters for which not all are aLoRAs (e.g. some base model, some LoRA). If the invocation sequence - is not present, the corresponding alora_offset is set to None and a warning is printed. + of the appropriate "alora_invocation_tokens" invocation sequence. The calculated alora_offset is the location of the + *start* of the invocation tokens, counting backward from the end (will therefore always be >= len(alora_invocation_tokens). + If adapter_names is passed, then each input uses the appropriate invocation sequence for the specified adapter for that row. + Logic is provided to handle mixed collections of adapters for which not all are aLoRAs (e.g. some base model, some LoRA). """ if input_ids is None: return [] From 012879397c5b0b0b5116ef13fa10f21f7368cc3b Mon Sep 17 00:00:00 2001 From: Greenewald Date: Tue, 2 Sep 2025 15:51:49 -0400 Subject: [PATCH 91/99] Update lora.md --- docs/source/developer_guides/lora.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index c46e96ce76..fcb22813f4 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -177,7 +177,7 @@ config = LoraConfig(use_rslora=True, ...) Activated LoRA (aLoRA) is a low rank adapter architecture for Causal LMs that allows for reusing existing base model KV cache for more efficient inference. This approach is best suited for inference pipelines which rely on the base model for most tasks/generations, but use aLoRA adapter(s) to perform specialized task(s) within the chain. For example, checking or correcting generated outputs of the base model. In these settings, inference times can be sped up by an order of magnitude or more. For more information on aLoRA and many example use cases, see https://huggingface.co/papers/2504.12397. -This technique scans for the last occurence of an invocation sequence (`alora_invocation_tokens`) in each input (this can be as short as 1 token), and activates the adapter weights on tokens starting 1 token after the beginning of the invocation sequence. Weights on prior tokens are left un-adapted -- making the cache for those tokens interchangeable with base model cache due to the causal attention mask in Causal LMs. Usage is very similar to standard LoRA, with the key difference that this invocation sequence must be specified when the adapter is created: +This technique scans for the last occurence of an invocation sequence (`alora_invocation_tokens`) in each input (this can be as short as 1 token), and activates the adapter weights on tokens starting with the beginning of the invocation sequence (any inputs after the invocation sequence are also adapted, and all generated tokens will use the adapted weights). 
Weights on prior tokens are left un-adapted -- making the cache for those tokens interchangeable with base model cache due to the causal attention mask in Causal LMs. Usage is very similar to standard LoRA, with the key difference that this invocation sequence must be specified when the adapter is created: ```py from peft import LoraConfig From 4e79da05190a4540083334451f95ad512759cae9 Mon Sep 17 00:00:00 2001 From: "Kristjan Greenewald Kristjan.H.Greenewald@ibm.com" Date: Tue, 2 Sep 2025 20:14:45 +0000 Subject: [PATCH 92/99] make style --- src/peft/tuners/lora/variants.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/peft/tuners/lora/variants.py b/src/peft/tuners/lora/variants.py index b03ef53b7b..bea62e48d9 100644 --- a/src/peft/tuners/lora/variants.py +++ b/src/peft/tuners/lora/variants.py @@ -510,10 +510,11 @@ def calculate_alora_offsets( ) -> list[int]: """ This is a helper function for Activated LoRA (aLoRA) that searches each input token sequence for the last occurence - of the appropriate "alora_invocation_tokens" invocation sequence. The calculated alora_offset is the location of the - *start* of the invocation tokens, counting backward from the end (will therefore always be >= len(alora_invocation_tokens). - If adapter_names is passed, then each input uses the appropriate invocation sequence for the specified adapter for that row. - Logic is provided to handle mixed collections of adapters for which not all are aLoRAs (e.g. some base model, some LoRA). + of the appropriate "alora_invocation_tokens" invocation sequence. The calculated alora_offset is the location of + the *start* of the invocation tokens, counting backward from the end (will therefore always be >= + len(alora_invocation_tokens). If adapter_names is passed, then each input uses the appropriate invocation sequence + for the specified adapter for that row. Logic is provided to handle mixed collections of adapters for which not all + are aLoRAs (e.g. some base model, some LoRA). """ if input_ids is None: return [] From f2ab507ed34622e1e28d82e5d2de28a1df7cb411 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Tue, 2 Sep 2025 22:17:33 -0400 Subject: [PATCH 93/99] Update lora.md --- docs/source/developer_guides/lora.md | 53 ++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index fcb22813f4..71a38ca33c 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -221,6 +221,59 @@ An example inference setup is at [alora finetuning](https://github.com/huggingfa To see why, imagine that 'a', 'b', 'c', and 'ab' are tokens in your tokenizer (numbers 1, 2, 3, 4 respectively). Suppose that your alora_invocation_tokens = [2, 3]. Now imagine your input string is "abc". Because "ab" is a token, this will get tokenized as [4,3]. So the alora_invocation_tokens will fail to be found, despite the string "bc" being in it. If the start and end of the invocation string are special tokens, however, this failure case will never happen since special tokens are never tokenized into the same token with other characters. +#### Using (and reusing) cache for generation +The main purpose of Activated LoRA is to make KV cache interchangeable between the base model and aLoRA adapter models **prior to the invocation sequence**. Specifically, keys and values stored during one model generation can be used in subsequent generations to avoid expensive prefill operations for context tokens. 
When sharing cache between the base model and aLoRA adapters, there are 2 main patterns: +1. The base model has generated something, and an aLoRA adapter is then called to do a followup generation. +2. An aLoRA adapter has generated something, and the base model or a different aLoRA adapter is called to do a followup generation where there is partial context overlap with the original aLoRA. + +**Important** Note that cache reuse is smoothly and automatically handled in more dedicated/optimized inference libraries, with no need to consider the below. See e.g. PRs in [vLLM](https://github.com/vllm-project/vllm/pull/19710), and [llama.cpp](https://github.com/ggml-org/llama.cpp/pull/15327). + +The above behaviors can be demonstrated using [DynamicCache](https://huggingface.co/docs/transformers/en/kv_cache) from `transformers`. Since `DynamicCache` does not cleanly support rewinding the cache to an earlier token position, care must be taken to ensure all cache objects are the correct tokens. In particular, an extra step is required for sharing cache when there is partial context overlap. It is also important to remember that cache for the `alora_invocation_tokens` and following are created with adapted weights and not usable by the base model. + +**Pattern 1: Base model followed by aLoRA** Here, the entire input and generation from the base model is input into the aLoRA adapter, along with the invocation sequence: +``` +from transformers import DynamicCache +... +cache = DynamicCache() +inputs_base = tokenizer(prompt_base, return_tensors="pt") +# Generate from base model and save cache +with model_alora.disable_adapter(): + output = model_alora.generate(inputs_base["input_ids"].to(device),attention_mask=inputs_base["attention_mask"].to(device),past_key_values = cache,return_dict_in_generate=True) +output_text_base = tokenizer.decode(output.sequences[0]) +cache = output.past_key_values + +# Generate with aLoRA adapter from cache +prompt_alora = output_text + INVOCATION_STRING +inputs_alora = tokenizer(prompt_alora, return_tensors="pt") +output = model_alora.generate(inputs_alora["input_ids"].to(device),attention_mask=inputs_alora["attention_mask"].to(device), past_key_values=cache,return_dict_in_generate=True) +output_text_alora = tokenizer.decode(output_alora.sequences[0]) +``` +**Pattern 2: aLoRA generation followed by base model (or another aLoRA) with partial context overlap** Here, we prefill the shared context using the base model, and then generate. +``` +from transformers import DynamicCache +import copy +... 
+cache = DynamicCache() +inputs_shared = tokenizer(prompt_shared, return_tensors="pt") +# Prefill from base model and save cache +with model_alora.disable_adapter(): + with torch.no_grad(): + cache = model_alora(inputs_shared["input_ids"].to(device), attention_mask=inputs_shared["attention_mask"].to(device), past_key_values=cache).past_key_values +cache_copy = copy.deepcopy(cache) +# Generate aLoRA +prompt_alora = prompt_shared + INVOCATION_STRING +inputs_alora = tokenizer(prompt_alora, return_tensors="pt") +output = model_alora.generate(inputs_alora["input_ids"].to(device),attention_mask=inputs_alora["attention_mask"].to(device), past_key_values=cache,return_dict_in_generate=True) +output_text_alora = tokenizer.decode(output_alora.sequences[0]) +# Generate base +prompt_base = prompt_shared +inputs_base = tokenizer(prompt_base, return_tensors="pt") +# Generate from base model and save cache +with model_alora.disable_adapter(): + output = model_alora.generate(inputs_base["input_ids"].to(device),attention_mask=inputs_base["attention_mask"].to(device),past_key_values = cache_copy,return_dict_in_generate=True) +output_text_base = tokenizer.decode(output.sequences[0]) +``` + ### Weight-Decomposed Low-Rank Adaptation (DoRA) This technique decomposes the updates of the weights into two parts, magnitude and direction. Direction is handled by normal LoRA, whereas the magnitude is handled by a separate learnable parameter. This can improve the performance of LoRA, especially at low ranks. For more information on DoRA, see https://huggingface.co/papers/2402.09353. From b27a6dcc6a63727f13e4961787e082ec74747705 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 3 Sep 2025 07:54:34 -0400 Subject: [PATCH 94/99] Update docs/source/developer_guides/lora.md Co-authored-by: githubnemo --- docs/source/developer_guides/lora.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index 71a38ca33c..e8efc4c2ae 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -226,7 +226,6 @@ The main purpose of Activated LoRA is to make KV cache interchangeable between t 1. The base model has generated something, and an aLoRA adapter is then called to do a followup generation. 2. An aLoRA adapter has generated something, and the base model or a different aLoRA adapter is called to do a followup generation where there is partial context overlap with the original aLoRA. -**Important** Note that cache reuse is smoothly and automatically handled in more dedicated/optimized inference libraries, with no need to consider the below. See e.g. PRs in [vLLM](https://github.com/vllm-project/vllm/pull/19710), and [llama.cpp](https://github.com/ggml-org/llama.cpp/pull/15327). The above behaviors can be demonstrated using [DynamicCache](https://huggingface.co/docs/transformers/en/kv_cache) from `transformers`. Since `DynamicCache` does not cleanly support rewinding the cache to an earlier token position, care must be taken to ensure all cache objects are the correct tokens. In particular, an extra step is required for sharing cache when there is partial context overlap. It is also important to remember that cache for the `alora_invocation_tokens` and following are created with adapted weights and not usable by the base model. 
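
For reference alongside the `variants.py` changes in the patches above, the per-token activation behavior can be sketched outside of PEFT. This is a minimal illustration only — `alora_mask` is a hypothetical helper, not a library function — showing how an `alora_offsets` value (counted back from the end of the sequence, `None` when no invocation sequence was found) turns into the boolean mask that gates where the adapter update is applied.

```python
# Minimal sketch of the aLoRA activation mask (illustration only; alora_mask is
# not part of PEFT). Offsets count back from the end of the sequence.
import torch

def alora_mask(alora_offsets, batch_size, seq_len):
    # No offsets at all: behave like the base model everywhere.
    if alora_offsets is None:
        return torch.zeros((batch_size, seq_len), dtype=torch.bool)
    # None for a row means the invocation sequence was not found in that row,
    # so the adapter stays inactive there; otherwise clip the offset to [1, seq_len].
    offsets = torch.tensor(
        [0 if o is None else max(1, min(int(o), seq_len)) for o in alora_offsets],
        dtype=torch.long,
    )
    pos = torch.arange(seq_len).unsqueeze(0)        # [1, T]
    return pos >= (seq_len - offsets).unsqueeze(1)  # [B, T]; True = adapted token

# Batch of 2, sequence length 6: row 0 activates on the last 3 tokens,
# row 1 (no invocation found) stays on the base weights throughout.
print(alora_mask([3, None], batch_size=2, seq_len=6))
# tensor([[False, False, False,  True,  True,  True],
#         [False, False, False, False, False, False]])
```
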
From 082d4176363eb6cf607030b6593ad51a8d4607c0 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 3 Sep 2025 07:55:35 -0400 Subject: [PATCH 95/99] Update docs/source/developer_guides/lora.md Co-authored-by: githubnemo --- docs/source/developer_guides/lora.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index e8efc4c2ae..d4a65f69b2 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -227,7 +227,7 @@ The main purpose of Activated LoRA is to make KV cache interchangeable between t 2. An aLoRA adapter has generated something, and the base model or a different aLoRA adapter is called to do a followup generation where there is partial context overlap with the original aLoRA. -The above behaviors can be demonstrated using [DynamicCache](https://huggingface.co/docs/transformers/en/kv_cache) from `transformers`. Since `DynamicCache` does not cleanly support rewinding the cache to an earlier token position, care must be taken to ensure all cache objects are the correct tokens. In particular, an extra step is required for sharing cache when there is partial context overlap. It is also important to remember that cache for the `alora_invocation_tokens` and following are created with adapted weights and not usable by the base model. +To demonstrate the above behaviors when using caching, we're using [DynamicCache](https://huggingface.co/docs/transformers/en/kv_cache) from `transformers`. Care must be taken to ensure that adapted cache values are not mixed with base cache values. In particular, an extra step is required for sharing the cache when there is partial context overlap (pattern 2). **Pattern 1: Base model followed by aLoRA** Here, the entire input and generation from the base model is input into the aLoRA adapter, along with the invocation sequence: ``` From 582b043b2f1df7d89a32f0abeb363f1a61f239dd Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 3 Sep 2025 07:56:58 -0400 Subject: [PATCH 96/99] Update docs/source/developer_guides/lora.md Co-authored-by: githubnemo --- docs/source/developer_guides/lora.md | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index d4a65f69b2..0e720af91d 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -253,24 +253,26 @@ from transformers import DynamicCache import copy ... 
cache = DynamicCache() -inputs_shared = tokenizer(prompt_shared, return_tensors="pt") +inputs_shared = tokenizer(prompt_shared, return_tensors="pt").to(device) + # Prefill from base model and save cache with model_alora.disable_adapter(): with torch.no_grad(): - cache = model_alora(inputs_shared["input_ids"].to(device), attention_mask=inputs_shared["attention_mask"].to(device), past_key_values=cache).past_key_values + model_alora(**inputs_shared, past_key_values=cache) cache_copy = copy.deepcopy(cache) -# Generate aLoRA + +# Generate from aLoRA using prefilled cache prompt_alora = prompt_shared + INVOCATION_STRING -inputs_alora = tokenizer(prompt_alora, return_tensors="pt") -output = model_alora.generate(inputs_alora["input_ids"].to(device),attention_mask=inputs_alora["attention_mask"].to(device), past_key_values=cache,return_dict_in_generate=True) -output_text_alora = tokenizer.decode(output_alora.sequences[0]) -# Generate base +inputs_alora = tokenizer(prompt_alora, return_tensors="pt").to(device) +output = model_alora.generate(**inputs_alora, past_key_values=cache) +output_text_alora = tokenizer.decode(output[0]) + +# Generate from base model using saved cache not tainted by aLoRA KV values prompt_base = prompt_shared -inputs_base = tokenizer(prompt_base, return_tensors="pt") -# Generate from base model and save cache +inputs_base = tokenizer(prompt_base, return_tensors="pt").to(device) with model_alora.disable_adapter(): - output = model_alora.generate(inputs_base["input_ids"].to(device),attention_mask=inputs_base["attention_mask"].to(device),past_key_values = cache_copy,return_dict_in_generate=True) -output_text_base = tokenizer.decode(output.sequences[0]) + output = model_alora.generate(**inputs_base, past_key_values=cache_copy) +output_text_base = tokenizer.decode(output[0]) ``` ### Weight-Decomposed Low-Rank Adaptation (DoRA) From 2d3fadfb62c606947df7f9881e4b341ac0a24407 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 3 Sep 2025 07:57:40 -0400 Subject: [PATCH 97/99] Update docs/source/developer_guides/lora.md Co-authored-by: githubnemo --- docs/source/developer_guides/lora.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index 0e720af91d..4028235efd 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -243,10 +243,11 @@ cache = output.past_key_values # Generate with aLoRA adapter from cache prompt_alora = output_text + INVOCATION_STRING -inputs_alora = tokenizer(prompt_alora, return_tensors="pt") -output = model_alora.generate(inputs_alora["input_ids"].to(device),attention_mask=inputs_alora["attention_mask"].to(device), past_key_values=cache,return_dict_in_generate=True) -output_text_alora = tokenizer.decode(output_alora.sequences[0]) -``` +inputs_alora = tokenizer(prompt_alora, return_tensors="pt").to(device) +output = model_alora.generate(**inputs_alora, past_key_values=cache) +output_text_alora = tokenizer.decode(output[0]) + +# Note: cache is now tainted with adapter values and cannot be used in base model from here on! **Pattern 2: aLoRA generation followed by base model (or another aLoRA) with partial context overlap** Here, we prefill the shared context using the base model, and then generate. 
``` from transformers import DynamicCache From 9a207442c810afa84f1dfc0098b3131b4cdf96a4 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 3 Sep 2025 07:58:10 -0400 Subject: [PATCH 98/99] Update docs/source/developer_guides/lora.md Co-authored-by: githubnemo --- docs/source/developer_guides/lora.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index 4028235efd..82f89b6d88 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -222,7 +222,7 @@ An example inference setup is at [alora finetuning](https://github.com/huggingfa To see why, imagine that 'a', 'b', 'c', and 'ab' are tokens in your tokenizer (numbers 1, 2, 3, 4 respectively). Suppose that your alora_invocation_tokens = [2, 3]. Now imagine your input string is "abc". Because "ab" is a token, this will get tokenized as [4,3]. So the alora_invocation_tokens will fail to be found, despite the string "bc" being in it. If the start and end of the invocation string are special tokens, however, this failure case will never happen since special tokens are never tokenized into the same token with other characters. #### Using (and reusing) cache for generation -The main purpose of Activated LoRA is to make KV cache interchangeable between the base model and aLoRA adapter models **prior to the invocation sequence**. Specifically, keys and values stored during one model generation can be used in subsequent generations to avoid expensive prefill operations for context tokens. When sharing cache between the base model and aLoRA adapters, there are 2 main patterns: +The main purpose of Activated LoRA is to make KV cache interchangeable between the base model and aLoRA adapter models **prior to the invocation sequence** since base and adapted KV values are not compatible. Specifically, keys and values stored during one model generation can be used in subsequent generations to avoid expensive prefill operations for context tokens. When sharing cache between the base model and aLoRA adapters, there are 2 main patterns: 1. The base model has generated something, and an aLoRA adapter is then called to do a followup generation. 2. An aLoRA adapter has generated something, and the base model or a different aLoRA adapter is called to do a followup generation where there is partial context overlap with the original aLoRA. From dbd56e7cd1e94bf8f56dd4d9b425101095913cc4 Mon Sep 17 00:00:00 2001 From: Greenewald Date: Wed, 3 Sep 2025 08:03:52 -0400 Subject: [PATCH 99/99] Update lora.md --- docs/source/developer_guides/lora.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/developer_guides/lora.md b/docs/source/developer_guides/lora.md index 82f89b6d88..b2049f6147 100644 --- a/docs/source/developer_guides/lora.md +++ b/docs/source/developer_guides/lora.md @@ -223,8 +223,8 @@ To see why, imagine that 'a', 'b', 'c', and 'ab' are tokens in your tokenizer (n #### Using (and reusing) cache for generation The main purpose of Activated LoRA is to make KV cache interchangeable between the base model and aLoRA adapter models **prior to the invocation sequence** since base and adapted KV values are not compatible. Specifically, keys and values stored during one model generation can be used in subsequent generations to avoid expensive prefill operations for context tokens. When sharing cache between the base model and aLoRA adapters, there are 2 main patterns: -1. 
The base model has generated something, and an aLoRA adapter is then called to do a followup generation. -2. An aLoRA adapter has generated something, and the base model or a different aLoRA adapter is called to do a followup generation where there is partial context overlap with the original aLoRA. +1. The base model has generated something, and an aLoRA adapter is then called to do a followup generation. Example: the base model answers a question, and an aLoRA trained to detect hallucinations checks the base model response. +2. An aLoRA adapter has generated something, and the base model or a different aLoRA adapter is called to do a followup generation where there is partial context overlap with the original aLoRA. Example: The user provides a query, and an aLoRA rewrites the query to be more self-contained and improve retrieval in a RAG system. Then, documents are retrieved and loaded into context, an aLoRA checks if these documents are indeed relevant to the question, and then the base model generates an answer. To demonstrate the above behaviors when using caching, we're using [DynamicCache](https://huggingface.co/docs/transformers/en/kv_cache) from `transformers`. Care must be taken to ensure that adapted cache values are not mixed with base cache values. In particular, an extra step is required for sharing the cache when there is partial context overlap (pattern 2).
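
As a closing illustration of the documented workflow, the sketch below derives `alora_invocation_tokens` from an invocation string and creates the adapter. The model name, invocation string, and target modules here are placeholders chosen for illustration, not recommendations; ideally the invocation string starts and ends with special tokens so that it always tokenizes the same way, as cautioned in the documentation above.

```python
# Minimal end-to-end sketch (placeholder model and invocation string).
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

model_id = "facebook/opt-125m"  # placeholder base model
tokenizer = AutoTokenizer.from_pretrained(model_id)
base_model = AutoModelForCausalLM.from_pretrained(model_id)

INVOCATION_STRING = "<|assistant|>"  # hypothetical invocation marker
alora_invocation_tokens = tokenizer.encode(INVOCATION_STRING, add_special_tokens=False)

config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    alora_invocation_tokens=alora_invocation_tokens,
)
model_alora = get_peft_model(base_model, config)
model_alora.print_trainable_parameters()
```

The adapter is then trained and used like any other LoRA adapter, with the difference that its weights only take effect from the last occurrence of the invocation sequence onward, which is what makes the earlier KV cache interchangeable with the base model's.
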