diff --git a/ppdiffusers/ppdiffusers/__init__.py b/ppdiffusers/ppdiffusers/__init__.py
index 8b5154083..00a0f09a9 100644
--- a/ppdiffusers/ppdiffusers/__init__.py
+++ b/ppdiffusers/ppdiffusers/__init__.py
@@ -63,7 +63,6 @@
LVDMAutoencoderKL,
LVDMUNet3DModel,
ModelMixin,
- MotionAdapter,
MultiAdapter,
PriorTransformer,
T2IAdapter,
@@ -73,7 +72,6 @@
UNet2DConditionModel,
UNet2DModel,
UNet3DConditionModel,
- UNetMotionModel,
VQModel,
)
from .optimization import (
diff --git a/ppdiffusers/ppdiffusers/loaders.py b/ppdiffusers/ppdiffusers/loaders.py
index 991a70a4c..325260567 100644
--- a/ppdiffusers/ppdiffusers/loaders.py
+++ b/ppdiffusers/ppdiffusers/loaders.py
@@ -17,15 +17,13 @@
import re
import warnings
from collections import defaultdict
-from contextlib import nullcontext
from io import BytesIO
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union
import paddle
import paddle.nn as nn
-
-# import paddle.nn.functional as F # TODO: remove
+import paddle.nn.functional as F
import requests
from huggingface_hub import hf_hub_download
from huggingface_hub.file_download import _request_wrapper, hf_raise_for_status
@@ -68,8 +66,6 @@
PretrainedTokenizer,
)
-logger = logging.get_logger(__name__)
-
TEXT_ENCODER_NAME = "text_encoder"
UNET_NAME = "unet"
@@ -121,73 +117,7 @@ def __init__(self, regular_linear_layer, lora_scale=1, network_alpha=None, rank=
self.lora_scale = lora_scale
- # overwrite PyTorch's `state_dict` to be sure that only the 'regular_linear_layer' weights are saved
- # when saving the whole text encoder model and when LoRA is unloaded or fused
- def state_dict(self, *args, destination=None, prefix="", keep_vars=False):
- if self.lora_linear_layer is None:
- return self.regular_linear_layer.state_dict(
- *args, destination=destination, prefix=prefix, keep_vars=keep_vars
- )
-
- return super().state_dict(*args, destination=destination, prefix=prefix, keep_vars=keep_vars)
-
- def _fuse_lora(self, lora_scale=1.0):
- if self.lora_linear_layer is None:
- return
-
- dtype = self.regular_linear_layer.weight.dtype
-
- w_orig = self.regular_linear_layer.weight.astype(paddle.get_default_dtype())
- w_up = self.lora_linear_layer.up.weight.astype(paddle.get_default_dtype())
- w_down = self.lora_linear_layer.down.weight.astype(paddle.get_default_dtype())
-
- if self.lora_linear_layer.network_alpha is not None:
- w_up = w_up * self.lora_linear_layer.network_alpha / self.lora_linear_layer.rank
-
- # Weight is reversed in the torch and paddle, use .T
- fused_weight = w_orig + (lora_scale * paddle.bmm(w_up.T[None, :], w_down.T[None, :])[0]).T
- out_0 = fused_weight.cast(dtype=dtype)
- self.regular_linear_layer.weight = self.create_parameter(
- shape=out_0.shape,
- default_initializer=nn.initializer.Assign(out_0),
- )
-
- # we can drop the lora layer now
- self.lora_linear_layer = None
-
- # offload the up and down matrices to CPU to not blow the memory
- self.w_up = w_up.cpu()
- self.w_down = w_down.cpu()
- self.lora_scale = lora_scale
-
- def _unfuse_lora(self):
- if not (hasattr(self, "w_up") and hasattr(self, "w_down")):
- return
-
- fused_weight = self.regular_linear_layer.weight
- dtype = fused_weight.dtype
-
- w_up = self.w_up.astype(paddle.get_default_dtype())
- w_down = self.w_down.astype(paddle.get_default_dtype())
-
- unfused_weight = (
- fused_weight.astype(paddle.get_default_dtype())
- - (self.lora_scale * paddle.bmm(w_up.T[None, :], w_down.T[None, :])[0]).T
- )
- out_0 = unfused_weight.cast(dtype=dtype)
- self.regular_linear_layer.weight = self.create_parameter(
- shape=out_0.shape,
- default_initializer=nn.initializer.Assign(out_0),
- )
-
- self.w_up = None
- self.w_down = None
-
def forward(self, input):
- if self.lora_scale is None:
- self.lora_scale = 1.0
- if self.lora_linear_layer is None:
- return self.regular_linear_layer(input)
return self.regular_linear_layer(input) + self.lora_scale * self.lora_linear_layer(input)
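
For reference, the forward pass restored above is the standard LoRA formulation: the frozen base projection plus a scaled low-rank correction. Below is a minimal, self-contained Paddle sketch of that idea; `ToyLoRALinear` and its shapes are illustrative stand-ins, not the actual `PatchedLoraProjection`.

```python
import paddle
import paddle.nn as nn

class ToyLoRALinear(nn.Layer):
    """Illustrative stand-in: base Linear plus a scaled low-rank (down -> up) correction."""

    def __init__(self, in_features, out_features, rank=4, scale=1.0):
        super().__init__()
        self.regular_linear_layer = nn.Linear(in_features, out_features)
        self.down = nn.Linear(in_features, rank, bias_attr=False)
        self.up = nn.Linear(rank, out_features, bias_attr=False)
        self.lora_scale = scale

    def forward(self, x):
        # Mirrors the restored forward above: base output + scale * up(down(x)).
        return self.regular_linear_layer(x) + self.lora_scale * self.up(self.down(x))

layer = ToyLoRALinear(8, 8, rank=2, scale=0.5)
y = layer(paddle.randn([1, 8]))
```
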
@@ -327,19 +257,17 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
information.
"""
- # TODO: remove
- # from .models.attention_processor import (
- # AttnAddedKVProcessor,
- # AttnAddedKVProcessor2_5,
- # CustomDiffusionAttnProcessor,
- # LoRAAttnAddedKVProcessor,
- # LoRAAttnProcessor,
- # LoRAAttnProcessor2_5,
- # LoRAXFormersAttnProcessor,
- # SlicedAttnAddedKVProcessor,
- # XFormersAttnProcessor,
- # )
- from .models.attention_processor import CustomDiffusionAttnProcessor
+ from .models.attention_processor import (
+ AttnAddedKVProcessor,
+ AttnAddedKVProcessor2_5,
+ CustomDiffusionAttnProcessor,
+ LoRAAttnAddedKVProcessor,
+ LoRAAttnProcessor,
+ LoRAAttnProcessor2_5,
+ LoRAXFormersAttnProcessor,
+ SlicedAttnAddedKVProcessor,
+ XFormersAttnProcessor,
+ )
from .models.lora import (
LoRACompatibleConv,
LoRACompatibleLinear,
@@ -364,7 +292,6 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
# This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
# See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
network_alphas = kwargs.pop("network_alphas", None)
- is_network_alphas_none = network_alphas is None
if from_diffusers and use_safetensors and not is_safetensors_available():
raise ValueError(
@@ -441,7 +368,8 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
state_dict = pretrained_model_name_or_path_or_dict
# fill attn processors
- lora_layers_list = []
+ attn_processors = {}
+ non_attn_lora_layers = []
is_lora = all(("lora" in k or k.endswith(".alpha")) for k in state_dict.keys())
is_custom_diffusion = any("custom_diffusion" in k for k in state_dict.keys())
@@ -450,13 +378,17 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
state_dict = transpose_state_dict(state_dict)
if is_lora:
- # attn_processors = {} # TODO: remove
- # correct keys
- state_dict, network_alphas = self.convert_state_dict_legacy_attn_format(state_dict, network_alphas)
-
- if network_alphas is not None:
- network_alphas_keys = list(network_alphas.keys())
- used_network_alphas_keys = set()
+ is_new_lora_format = all(
+ key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys()
+ )
+ if is_new_lora_format:
+ # Strip the `"unet"` prefix.
+ is_text_encoder_present = any(key.startswith(self.text_encoder_name) for key in state_dict.keys())
+ if is_text_encoder_present:
+ warn_message = "The state_dict contains LoRA params corresponding to the text encoder which are not being used here. To use both UNet and text encoder related LoRA params, use [`pipe.load_lora_weights()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights)."
+ warnings.warn(warn_message)
+ unet_keys = [k for k in state_dict.keys() if k.startswith(self.unet_name)]
+ state_dict = {k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys}
lora_grouped_dict = defaultdict(dict)
mapped_network_alphas = {}
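
To make the restored prefix handling concrete, here is a small sketch of how "new format" keys are filtered down to the UNet entries and stripped of their prefix before grouping; the sample keys are made up for illustration.

```python
# Hypothetical keys in the "new" LoRA serialization format.
state_dict = {
    "unet.mid_block.attentions.0.processor.to_q_lora.down.weight": "w0",
    "text_encoder.text_model.encoder.layers.0.self_attn.q_proj.lora.down.weight": "w1",
}
unet_name = "unet"

# Keep only the UNet entries and drop the prefix, mirroring the branch above.
unet_keys = [k for k in state_dict if k.startswith(unet_name)]
state_dict = {k.replace(f"{unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys}
print(state_dict)
# {'mid_block.attentions.0.processor.to_q_lora.down.weight': 'w0'}
```
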
@@ -471,20 +403,13 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
# Create another `mapped_network_alphas` dictionary so that we can properly map them.
if network_alphas is not None:
- for k in network_alphas_keys:
+ for k in network_alphas:
if k.replace(".alpha", "") in key:
- mapped_network_alphas.update({attn_processor_key: network_alphas.get(k)})
- used_network_alphas_keys.add(k)
-
- if not is_network_alphas_none:
- if len(set(network_alphas_keys) - used_network_alphas_keys) > 0:
- raise ValueError(
- f"The `network_alphas` has to be empty at this point but has the following keys \n\n {', '.join(network_alphas.keys())}"
- )
+ mapped_network_alphas.update({attn_processor_key: network_alphas[k]})
if len(state_dict) > 0:
raise ValueError(
- f"The `state_dict` has to be empty at this point but has the following keys \n\n {', '.join(state_dict.keys())}"
+ f"The state_dict has to be empty at this point but has the following keys \n\n {', '.join(state_dict.keys())}"
)
for key, value_dict in lora_grouped_dict.items():
@@ -494,7 +419,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
# Process non-attention layers, which don't have to_{k,v,q,out_proj}_lora layers
# or add_{k,v,q,out_proj}_proj_lora layers.
- if "lora.down.weight" in value_dict: # TODO: remove this line
+ if "lora.down.weight" in value_dict:
if value_dict["lora.down.weight"].ndim == 2:
rank = value_dict["lora.down.weight"].shape[1]
else:
@@ -526,67 +451,66 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
value_dict = {k.replace("lora.", ""): v for k, v in value_dict.items()}
lora.load_dict(value_dict)
- lora_layers_list.append((attn_processor, lora))
- # TODO: remove
- # else:
- # # To handle SDXL.
- # rank_mapping = {}
- # hidden_size_mapping = {}
- # for projection_id in ["to_k", "to_q", "to_v", "to_out"]:
- # rank = value_dict[f"{projection_id}_lora.down.weight"].shape[1]
- # hidden_size = value_dict[f"{projection_id}_lora.up.weight"].shape[1]
-
- # rank_mapping.update({f"{projection_id}_lora.down.weight": rank})
- # hidden_size_mapping.update({f"{projection_id}_lora.up.weight": hidden_size})
-
- # if isinstance(
- # attn_processor, (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_5)
- # ):
- # cross_attention_dim = value_dict["add_k_proj_lora.down.weight"].shape[0]
- # attn_processor_class = LoRAAttnAddedKVProcessor
- # else:
- # cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[0]
- # if isinstance(attn_processor, (XFormersAttnProcessor, LoRAXFormersAttnProcessor)):
- # attn_processor_class = LoRAXFormersAttnProcessor
- # else:
- # attn_processor_class = (
- # LoRAAttnProcessor2_5
- # if hasattr(F, "scaled_dot_product_attention_")
- # else LoRAAttnProcessor
- # )
-
- # if attn_processor_class is not LoRAAttnAddedKVProcessor:
- # attn_processors[key] = attn_processor_class(
- # rank=rank_mapping.get("to_k_lora.down.weight"),
- # hidden_size=hidden_size_mapping.get("to_k_lora.up.weight"),
- # cross_attention_dim=cross_attention_dim,
- # network_alpha=mapped_network_alphas.get(key),
- # q_rank=rank_mapping.get("to_q_lora.down.weight"),
- # q_hidden_size=hidden_size_mapping.get("to_q_lora.up.weight"),
- # v_rank=rank_mapping.get("to_v_lora.down.weight"),
- # v_hidden_size=hidden_size_mapping.get("to_v_lora.up.weight"),
- # out_rank=rank_mapping.get("to_out_lora.down.weight"),
- # out_hidden_size=hidden_size_mapping.get("to_out_lora.up.weight"),
- # # rank=rank_mapping.get("to_k_lora.down.weight", None),
- # # hidden_size=hidden_size_mapping.get("to_k_lora.up.weight", None),
- # # q_rank=rank_mapping.get("to_q_lora.down.weight", None),
- # # q_hidden_size=hidden_size_mapping.get("to_q_lora.up.weight", None),
- # # v_rank=rank_mapping.get("to_v_lora.down.weight", None),
- # # v_hidden_size=hidden_size_mapping.get("to_v_lora.up.weight", None),
- # # out_rank=rank_mapping.get("to_out_lora.down.weight", None),
- # # out_hidden_size=hidden_size_mapping.get("to_out_lora.up.weight", None),
- # )
- # else:
- # attn_processors[key] = attn_processor_class(
- # rank=rank_mapping.get("to_k_lora.down.weight", None),
- # hidden_size=hidden_size_mapping.get("to_k_lora.up.weight", None),
- # cross_attention_dim=cross_attention_dim,
- # network_alpha=mapped_network_alphas.get(key),
- # )
-
- # attn_processors[key].load_dict(value_dict)
+ non_attn_lora_layers.append((attn_processor, lora))
+ else:
+ # To handle SDXL.
+ rank_mapping = {}
+ hidden_size_mapping = {}
+ for projection_id in ["to_k", "to_q", "to_v", "to_out"]:
+ rank = value_dict[f"{projection_id}_lora.down.weight"].shape[1]
+ hidden_size = value_dict[f"{projection_id}_lora.up.weight"].shape[1]
+
+ rank_mapping.update({f"{projection_id}_lora.down.weight": rank})
+ hidden_size_mapping.update({f"{projection_id}_lora.up.weight": hidden_size})
+
+ if isinstance(
+ attn_processor, (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_5)
+ ):
+ cross_attention_dim = value_dict["add_k_proj_lora.down.weight"].shape[0]
+ attn_processor_class = LoRAAttnAddedKVProcessor
+ else:
+ cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[0]
+ if isinstance(attn_processor, (XFormersAttnProcessor, LoRAXFormersAttnProcessor)):
+ attn_processor_class = LoRAXFormersAttnProcessor
+ else:
+ attn_processor_class = (
+ LoRAAttnProcessor2_5
+ if hasattr(F, "scaled_dot_product_attention_")
+ else LoRAAttnProcessor
+ )
+
+ if attn_processor_class is not LoRAAttnAddedKVProcessor:
+ attn_processors[key] = attn_processor_class(
+ rank=rank_mapping.get("to_k_lora.down.weight"),
+ hidden_size=hidden_size_mapping.get("to_k_lora.up.weight"),
+ cross_attention_dim=cross_attention_dim,
+ network_alpha=mapped_network_alphas.get(key),
+ q_rank=rank_mapping.get("to_q_lora.down.weight"),
+ q_hidden_size=hidden_size_mapping.get("to_q_lora.up.weight"),
+ v_rank=rank_mapping.get("to_v_lora.down.weight"),
+ v_hidden_size=hidden_size_mapping.get("to_v_lora.up.weight"),
+ out_rank=rank_mapping.get("to_out_lora.down.weight"),
+ out_hidden_size=hidden_size_mapping.get("to_out_lora.up.weight"),
+ # rank=rank_mapping.get("to_k_lora.down.weight", None),
+ # hidden_size=hidden_size_mapping.get("to_k_lora.up.weight", None),
+ # q_rank=rank_mapping.get("to_q_lora.down.weight", None),
+ # q_hidden_size=hidden_size_mapping.get("to_q_lora.up.weight", None),
+ # v_rank=rank_mapping.get("to_v_lora.down.weight", None),
+ # v_hidden_size=hidden_size_mapping.get("to_v_lora.up.weight", None),
+ # out_rank=rank_mapping.get("to_out_lora.down.weight", None),
+ # out_hidden_size=hidden_size_mapping.get("to_out_lora.up.weight", None),
+ )
+ else:
+ attn_processors[key] = attn_processor_class(
+ rank=rank_mapping.get("to_k_lora.down.weight", None),
+ hidden_size=hidden_size_mapping.get("to_k_lora.up.weight", None),
+ cross_attention_dim=cross_attention_dim,
+ network_alpha=mapped_network_alphas.get(key),
+ )
+
+ attn_processors[key].load_dict(value_dict)
+
elif is_custom_diffusion:
- attn_processors = {}
custom_diffusion_grouped_dict = defaultdict(dict)
for key, value in state_dict.items():
if len(value) == 0:
@@ -620,47 +544,23 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict
cross_attention_dim=cross_attention_dim,
)
attn_processors[key].load_dict(value_dict)
-
- self.set_attn_processor(attn_processors)
else:
raise ValueError(
f"{model_file} does not seem to be in the correct format expected by LoRA or Custom Diffusion training."
)
-
- # set lora layers
- for target_module, lora_layer in lora_layers_list:
+ # set correct dtype & device
+ attn_processors = {k: v.to(dtype=self.dtype) for k, v in attn_processors.items()}
+ non_attn_lora_layers = [(t, l.to(dtype=self.dtype)) for t, l in non_attn_lora_layers]
+ # set layers
+ self.set_attn_processor(attn_processors)
+
+ # set ff layers
+ for target_module, lora_layer in non_attn_lora_layers:
target_module.set_lora_layer(lora_layer)
# It should raise an error if we don't have a set lora here
# if hasattr(target_module, "set_lora_layer"):
# target_module.set_lora_layer(lora_layer)
- def convert_state_dict_legacy_attn_format(self, state_dict, network_alphas):
- is_new_lora_format = all(
- key.startswith(self.unet_name) or key.startswith(self.text_encoder_name) for key in state_dict.keys()
- )
- if is_new_lora_format:
- # Strip the `"unet"` prefix.
- is_text_encoder_present = any(key.startswith(self.text_encoder_name) for key in state_dict.keys())
- if is_text_encoder_present:
- warn_message = "The state_dict contains LoRA params corresponding to the text encoder which are not being used here. To use both UNet and text encoder related LoRA params, use [`pipe.load_lora_weights()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.load_lora_weights)."
- logger.warn(warn_message)
- unet_keys = [k for k in state_dict.keys() if k.startswith(self.unet_name)]
- state_dict = {k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys}
-
- # change processor format to 'pure' LoRACompatibleLinear format
- if any("processor" in k.split(".") for k in state_dict.keys()):
-
- def format_to_lora_compatible(key):
- if "processor" not in key.split("."):
- return key
- return key.replace(".processor", "").replace("to_out_lora", "to_out.0.lora").replace("_lora", ".lora")
-
- state_dict = {format_to_lora_compatible(k): v for k, v in state_dict.items()}
-
- if network_alphas is not None:
- network_alphas = {format_to_lora_compatible(k): v for k, v in network_alphas.items()}
- return state_dict, network_alphas
-
def save_attn_procs(
self,
save_directory: Union[str, os.PathLike],
@@ -762,21 +662,6 @@ def save_function(weights, filename):
logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}")
- def fuse_lora(self, lora_scale=1.0):
- self.lora_scale = lora_scale
- self.apply(self._fuse_lora_apply)
-
- def _fuse_lora_apply(self, module):
- if hasattr(module, "_fuse_lora"):
- module._fuse_lora(self.lora_scale)
-
- def unfuse_lora(self):
- self.apply(self._unfuse_lora_apply)
-
- def _unfuse_lora_apply(self, module):
- if hasattr(module, "_unfuse_lora"):
- module._unfuse_lora()
-
class TextualInversionLoaderMixin:
r"""
@@ -788,15 +673,6 @@ def maybe_convert_prompt(self, prompt: Union[str, List[str]], tokenizer: "Pretra
Processes prompts that include a special token corresponding to a multi-vector textual inversion embedding to
be replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual
inversion token or if the textual inversion token is a single vector, the input prompt is returned.
-
- Parameters:
- prompt (`str` or list of `str`):
- The prompt or prompts to guide the image generation.
- tokenizer (`PreTrainedTokenizer`):
- The tokenizer responsible for encoding the prompt into input tokens.
-
- Returns:
- `str` or list of `str`: The converted prompt
"""
if not isinstance(prompt, List):
prompts = [prompt]
@@ -810,7 +686,7 @@ def maybe_convert_prompt(self, prompt: Union[str, List[str]], tokenizer: "Pretra
return prompts
- def _maybe_convert_prompt(self, prompt: str, tokenizer: "PreTrainedTokenizer"): # noqa: F821
+ def _maybe_convert_prompt(self, prompt: str, tokenizer: "PretrainedTokenizer"):
r"""
Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds
to a multi-vector textual inversion embedding, this function will process the prompt so that the special token
@@ -819,7 +695,7 @@ def _maybe_convert_prompt(self, prompt: str, tokenizer: "PreTrainedTokenizer"):
Parameters:
prompt (`str`):
The prompt to guide the image generation.
- tokenizer (`PreTrainedTokenizer`):
+ tokenizer (`PretrainedTokenizer`):
The tokenizer responsible for encoding the prompt into input tokens.
Returns:
`str`: The converted prompt
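
As a concrete illustration of the multi-vector expansion this mixin performs, here is a minimal sketch; the token names and the simplified vocabulary lookup are illustrative, not the tokenizer API.

```python
def expand_multi_vector(prompt: str, vocab: set) -> str:
    # If "<token>" has sibling entries "<token>_1", "<token>_2", ... in the vocabulary,
    # replace it by the whole sequence so every learned vector gets its own token.
    for token in sorted(vocab):
        if token in prompt and f"{token}_1" in vocab:
            replacement, i = token, 1
            while f"{token}_{i}" in vocab:
                replacement += f" {token}_{i}"
                i += 1
            prompt = prompt.replace(token, replacement)
    return prompt

vocab = {"<cat-toy>", "<cat-toy>_1", "<cat-toy>_2"}
print(expand_multi_vector("a photo of <cat-toy>", vocab))
# a photo of <cat-toy> <cat-toy>_1 <cat-toy>_2
```
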
@@ -842,8 +718,6 @@ def load_textual_inversion(
self,
pretrained_model_name_or_path: Union[str, List[str], Dict[str, paddle.Tensor], List[Dict[str, paddle.Tensor]]],
token: Optional[Union[str, List[str]]] = None,
- tokenizer: Optional["PreTrainedTokenizer"] = None, # noqa: F821
- text_encoder: Optional["PreTrainedModel"] = None, # noqa: F821
**kwargs,
):
r"""
@@ -862,11 +736,6 @@ def load_textual_inversion(
token (`str` or `List[str]`, *optional*):
Override the token to use for the textual inversion weights. If `pretrained_model_name_or_path` is a
list, then `token` must also be a list of equal length.
- text_encoder ([`~transformers.CLIPTextModel`], *optional*):
- Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
- If not specified, function will take self.tokenizer.
- tokenizer ([`~transformers.CLIPTokenizer`], *optional*):
- A `CLIPTokenizer` to tokenize text. If not specified, function will take self.tokenizer.
weight_name (`str`, *optional*):
Name of a custom weight file. This should be used when:
- The saved textual inversion file is in 🤗 Diffusers format, but was saved under a specific weight
@@ -924,9 +793,6 @@ def load_textual_inversion(
image.save("character.png")
```
"""
- tokenizer = tokenizer or getattr(self, "tokenizer", None)
- text_encoder = text_encoder or getattr(self, "text_encoder", None)
-
if not hasattr(self, "tokenizer") or not isinstance(self.tokenizer, PretrainedTokenizer):
raise ValueError(
f"{self.__class__.__name__} requires `self.tokenizer` of type `PretrainedTokenizer` for calling"
@@ -1134,7 +1000,6 @@ class LoraLoaderMixin:
"""
text_encoder_name = TEXT_ENCODER_NAME
unet_name = UNET_NAME
- num_fused_loras = 0
def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, paddle.Tensor]], **kwargs):
"""
@@ -1508,23 +1373,22 @@ def load_lora_into_text_encoder(cls, state_dict, network_alphas, text_encoder, p
How much to scale the output of the lora linear layer before it is added with the output of the regular
lora layer.
"""
+
# If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
# then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as
# their prefixes.
keys = list(state_dict.keys())
prefix = cls.text_encoder_name if prefix is None else prefix
- # Safe prefix to check with.
if any(cls.text_encoder_name in key for key in keys):
# Load the layers corresponding to text encoder and make necessary adjustments.
- text_encoder_keys = [k for k in keys if k.startswith(prefix) and k.split(".")[0] == prefix]
+ text_encoder_keys = [k for k in keys if k.startswith(prefix)]
text_encoder_lora_state_dict = {
k.replace(f"{prefix}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys
}
if len(text_encoder_lora_state_dict) > 0:
logger.info(f"Loading {prefix}.")
- rank = {}
if any("to_out_lora" in k for k in text_encoder_lora_state_dict.keys()):
# Convert from the old naming convention to the new naming convention.
@@ -1563,34 +1427,23 @@ def load_lora_into_text_encoder(cls, state_dict, network_alphas, text_encoder, p
f"{name}.out_proj.lora_linear_layer.down.weight"
] = text_encoder_lora_state_dict.pop(f"{name}.to_out_lora.down.weight")
- for name, _ in text_encoder_attn_modules(text_encoder):
- rank_key = f"{name}.out_proj.lora_linear_layer.up.weight"
- # torch use 1, paddle use 0
- rank.update({rank_key: text_encoder_lora_state_dict[rank_key].shape[0]})
-
+ rank = text_encoder_lora_state_dict[
+ "text_model.transformer.layers.0.self_attn.out_proj.lora_linear_layer.up.weight"
+ ].shape[0]
patch_mlp = any(".linear1." in key for key in text_encoder_lora_state_dict.keys())
- if patch_mlp:
- for name, _ in text_encoder_mlp_modules(text_encoder):
- rank_key_fc1 = f"{name}.linear1.lora_linear_layer.up.weight"
- rank_key_fc2 = f"{name}.linear2.lora_linear_layer.up.weight"
- rank.update({rank_key_fc1: text_encoder_lora_state_dict[rank_key_fc1].shape[0]})
- rank.update({rank_key_fc2: text_encoder_lora_state_dict[rank_key_fc2].shape[0]})
- if network_alphas is not None:
- alpha_keys = [
- k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix
- ]
- network_alphas = {
- k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys
- }
-
- cls._modify_text_encoder(text_encoder, lora_scale, network_alphas, rank=rank, patch_mlp=patch_mlp)
+ cls._modify_text_encoder(
+ text_encoder,
+ lora_scale,
+ network_alphas,
+ rank=rank,
+ patch_mlp=patch_mlp,
+ )
# set correct dtype & device
text_encoder_lora_state_dict = {
k: v._to(dtype=text_encoder.dtype) for k, v in text_encoder_lora_state_dict.items()
}
-
text_encoder.load_dict(text_encoder_lora_state_dict)
# load_state_dict_results = text_encoder.load_dict(text_encoder_lora_state_dict)
# if len(load_state_dict_results.unexpected_keys) != 0:
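
The renaming handled above maps old `to_*_lora` text encoder keys onto the patched `*_proj.lora_linear_layer` modules; a tiny sketch of that convention, with an illustrative module path:

```python
# Hypothetical old-style text encoder LoRA key.
old_key = "text_model.transformer.layers.0.self_attn.to_out_lora.up.weight"

# New naming convention expected by the monkey-patched projections.
new_key = old_key.replace("to_out_lora", "out_proj.lora_linear_layer")
print(new_key)
# text_model.transformer.layers.0.self_attn.out_proj.lora_linear_layer.up.weight
```
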
@@ -1611,16 +1464,16 @@ def _remove_text_encoder_monkey_patch(self):
def _remove_text_encoder_monkey_patch_classmethod(cls, text_encoder):
for _, attn_module in text_encoder_attn_modules(text_encoder):
if isinstance(attn_module.q_proj, PatchedLoraProjection):
- attn_module.q_proj.lora_linear_layer = None
- attn_module.k_proj.lora_linear_layer = None
- attn_module.v_proj.lora_linear_layer = None
- attn_module.out_proj.lora_linear_layer = None
+ attn_module.q_proj = attn_module.q_proj.regular_linear_layer
+ attn_module.k_proj = attn_module.k_proj.regular_linear_layer
+ attn_module.v_proj = attn_module.v_proj.regular_linear_layer
+ attn_module.out_proj = attn_module.out_proj.regular_linear_layer
for _, mlp_module in text_encoder_mlp_modules(text_encoder):
if isinstance(mlp_module.linear1, PatchedLoraProjection):
- mlp_module.linear1.lora_linear_layer = None
+ mlp_module.linear1 = mlp_module.linear1.regular_linear_layer
if isinstance(mlp_module.linear2, PatchedLoraProjection):
- mlp_module.linear2.lora_linear_layer = None
+ mlp_module.linear2 = mlp_module.linear2.regular_linear_layer
# @classmethod
# def _modify_text_encoder(
@@ -1735,7 +1588,7 @@ def _modify_text_encoder(
text_encoder,
lora_scale=1,
network_alphas=None,
- rank: Union[Dict[str, int], int] = 4,
+ rank=4,
dtype=None,
patch_mlp=False,
):
@@ -1743,65 +1596,52 @@ def _modify_text_encoder(
Monkey-patches the forward passes of attention modules of the text encoder.
"""
- def create_patched_linear_lora(model, network_alpha, rank, dtype, lora_parameters):
- linear_layer = model.regular_linear_layer if isinstance(model, PatchedLoraProjection) else model
- ctx = nullcontext
- with ctx():
- model = PatchedLoraProjection(linear_layer, lora_scale, network_alpha, rank, dtype=dtype)
-
- lora_parameters.extend(model.lora_linear_layer.parameters())
- return model
-
# First, remove any monkey-patch that might have been applied before
cls._remove_text_encoder_monkey_patch_classmethod(text_encoder)
lora_parameters = []
network_alphas = {} if network_alphas is None else network_alphas
- is_network_alphas_populated = len(network_alphas) > 0
for name, attn_module in text_encoder_attn_modules(text_encoder):
- query_alpha = network_alphas.pop(name + ".to_q_lora.down.weight.alpha", None)
- key_alpha = network_alphas.pop(name + ".to_k_lora.down.weight.alpha", None)
- value_alpha = network_alphas.pop(name + ".to_v_lora.down.weight.alpha", None)
- out_alpha = network_alphas.pop(name + ".to_out_lora.down.weight.alpha", None)
+            query_alpha = network_alphas.get(name + ".q.proj.alpha")
+            key_alpha = network_alphas.get(name + ".k.proj.alpha")
+            value_alpha = network_alphas.get(name + ".v.proj.alpha")
+            proj_alpha = network_alphas.get(name + ".out.proj.alpha")
- if isinstance(rank, dict):
- current_rank = rank.pop(f"{name}.out_proj.lora_linear_layer.up.weight")
- else:
- current_rank = rank
-
- attn_module.q_proj = create_patched_linear_lora(
- attn_module.q_proj, query_alpha, current_rank, dtype, lora_parameters
+ attn_module.q_proj = PatchedLoraProjection(
+ attn_module.q_proj, lora_scale, network_alpha=query_alpha, rank=rank, dtype=dtype
)
- attn_module.k_proj = create_patched_linear_lora(
- attn_module.k_proj, key_alpha, current_rank, dtype, lora_parameters
+ lora_parameters.extend(attn_module.q_proj.lora_linear_layer.parameters())
+
+ attn_module.k_proj = PatchedLoraProjection(
+ attn_module.k_proj, lora_scale, network_alpha=key_alpha, rank=rank, dtype=dtype
)
- attn_module.v_proj = create_patched_linear_lora(
- attn_module.v_proj, value_alpha, current_rank, dtype, lora_parameters
+ lora_parameters.extend(attn_module.k_proj.lora_linear_layer.parameters())
+
+ attn_module.v_proj = PatchedLoraProjection(
+ attn_module.v_proj, lora_scale, network_alpha=value_alpha, rank=rank, dtype=dtype
)
- attn_module.out_proj = create_patched_linear_lora(
- attn_module.out_proj, out_alpha, current_rank, dtype, lora_parameters
+ lora_parameters.extend(attn_module.v_proj.lora_linear_layer.parameters())
+
+ attn_module.out_proj = PatchedLoraProjection(
+ attn_module.out_proj, lora_scale, network_alpha=proj_alpha, rank=rank, dtype=dtype
)
+ lora_parameters.extend(attn_module.out_proj.lora_linear_layer.parameters())
if patch_mlp:
for name, mlp_module in text_encoder_mlp_modules(text_encoder):
- fc1_alpha = network_alphas.pop(name + ".linear1.lora_linear_layer.down.weight.alpha", None)
- fc2_alpha = network_alphas.pop(name + ".linear2.lora_linear_layer.down.weight.alpha", None)
-
- current_rank_fc1 = rank.pop(f"{name}.linear1.lora_linear_layer.up.weight")
- current_rank_fc2 = rank.pop(f"{name}.linear2.lora_linear_layer.up.weight")
+ fc1_alpha = network_alphas.get(name + ".linear1.alpha")
+ fc2_alpha = network_alphas.get(name + ".linear2.alpha")
- mlp_module.linear1 = create_patched_linear_lora(
- mlp_module.linear1, fc1_alpha, current_rank_fc1, dtype, lora_parameters
- )
- mlp_module.linear2 = create_patched_linear_lora(
- mlp_module.linear2, fc2_alpha, current_rank_fc2, dtype, lora_parameters
+ mlp_module.linear1 = PatchedLoraProjection(
+ mlp_module.linear1, lora_scale, network_alpha=fc1_alpha, rank=rank, dtype=dtype
)
+ lora_parameters.extend(mlp_module.linear1.lora_linear_layer.parameters())
- if is_network_alphas_populated and len(network_alphas) > 0:
- raise ValueError(
- f"The `network_alphas` has to be empty at this point but has the following keys \n\n {', '.join(network_alphas.keys())}"
- )
+ mlp_module.linear2 = PatchedLoraProjection(
+ mlp_module.linear2, lora_scale, network_alpha=fc2_alpha, rank=rank, dtype=dtype
+ )
+ lora_parameters.extend(mlp_module.linear2.lora_linear_layer.parameters())
return lora_parameters
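
The monkey-patching pattern above boils down to: wrap each Linear projection, keep the frozen base, and hand only the new low-rank parameters to the optimizer. A self-contained sketch under that reading; `LoRAWrapper` and `ToyAttention` are illustrative, not library classes (note that Paddle stores Linear weights as `[in_features, out_features]`).

```python
import paddle
import paddle.nn as nn

class LoRAWrapper(nn.Layer):
    """Illustrative wrapper, not the library's PatchedLoraProjection."""

    def __init__(self, base, rank=4, scale=1.0):
        super().__init__()
        # Paddle Linear weights are [in_features, out_features].
        in_features, out_features = base.weight.shape
        self.base = base
        self.down = nn.Linear(in_features, rank, bias_attr=False)
        self.up = nn.Linear(rank, out_features, bias_attr=False)
        self.scale = scale

    def forward(self, x):
        return self.base(x) + self.scale * self.up(self.down(x))

class ToyAttention(nn.Layer):
    def __init__(self, dim=8):
        super().__init__()
        self.q_proj = nn.Linear(dim, dim)
        self.out_proj = nn.Linear(dim, dim)

attn = ToyAttention()
lora_parameters = []
for name in ("q_proj", "out_proj"):
    wrapped = LoRAWrapper(getattr(attn, name))
    setattr(attn, name, wrapped)
    # Only the low-rank matrices are collected; the base projection stays frozen.
    lora_parameters.extend(list(wrapped.down.parameters()) + list(wrapped.up.parameters()))

out = attn.q_proj(paddle.randn([1, 8]))
```
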
@@ -2098,128 +1938,39 @@ def unload_lora_weights(self):
>>> ...
```
"""
- # TODO: remove
- # from .models.attention_processor import (
- # LORA_ATTENTION_PROCESSORS,
- # AttnProcessor,
- # AttnProcessor2_5,
- # LoRAAttnAddedKVProcessor,
- # LoRAAttnProcessor,
- # LoRAAttnProcessor2_5,
- # LoRAXFormersAttnProcessor,
- # XFormersAttnProcessor,
- # )
-
- # unet_attention_classes = {type(processor) for _, processor in self.unet.attn_processors.items()}
-
- # if unet_attention_classes.issubset(LORA_ATTENTION_PROCESSORS):
- # # Handle attention processors that are a mix of regular attention and AddedKV
- # # attention.
- # if len(unet_attention_classes) > 1 or LoRAAttnAddedKVProcessor in unet_attention_classes:
- # self.unet.set_default_attn_processor()
- # else:
- # regular_attention_classes = {
- # LoRAAttnProcessor: AttnProcessor,
- # LoRAAttnProcessor2_5: AttnProcessor2_5,
- # LoRAXFormersAttnProcessor: XFormersAttnProcessor,
- # }
- # [attention_proc_class] = unet_attention_classes
- # self.unet.set_attn_processor(regular_attention_classes[attention_proc_class]())
-
- for _, module in self.unet.named_sublayers():
- if hasattr(module, "set_lora_layer"):
- module.set_lora_layer(None)
-
- # Safe to call the following regardless of LoRA.
- self._remove_text_encoder_monkey_patch()
-
- def fuse_lora(self, fuse_unet: bool = True, fuse_text_encoder: bool = True, lora_scale: float = 1.0):
- r"""
- Fuses the LoRA parameters into the original parameters of the corresponding blocks.
-
-
-
- This is an experimental API.
-
-
-
- Args:
- fuse_unet (`bool`, defaults to `True`): Whether to fuse the UNet LoRA parameters.
- fuse_text_encoder (`bool`, defaults to `True`):
- Whether to fuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the
- LoRA parameters then it won't have any effect.
- lora_scale (`float`, defaults to 1.0):
- Controls how much to influence the outputs with the LoRA parameters.
- """
- if fuse_unet or fuse_text_encoder:
- self.num_fused_loras += 1
- if self.num_fused_loras > 1:
- logger.warn(
- "The current API is supported for operating with a single LoRA file. You are trying to load and fuse more than one LoRA which is not well-supported.",
- )
-
- if fuse_unet:
- self.unet.fuse_lora(lora_scale)
-
- def fuse_text_encoder_lora(text_encoder):
- for _, attn_module in text_encoder_attn_modules(text_encoder):
- if isinstance(attn_module.q_proj, PatchedLoraProjection):
- attn_module.q_proj._fuse_lora(lora_scale)
- attn_module.k_proj._fuse_lora(lora_scale)
- attn_module.v_proj._fuse_lora(lora_scale)
- attn_module.out_proj._fuse_lora(lora_scale)
-
- for _, mlp_module in text_encoder_mlp_modules(text_encoder):
- if isinstance(mlp_module.linear1, PatchedLoraProjection):
- mlp_module.linear1._fuse_lora(lora_scale)
- mlp_module.linear2._fuse_lora(lora_scale)
-
- if fuse_text_encoder:
- if hasattr(self, "text_encoder"):
- fuse_text_encoder_lora(self.text_encoder)
- if hasattr(self, "text_encoder_2"):
- fuse_text_encoder_lora(self.text_encoder_2)
-
- def unfuse_lora(self, unfuse_unet: bool = True, unfuse_text_encoder: bool = True):
- r"""
- Reverses the effect of
- [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.fuse_lora).
+ from .models.attention_processor import (
+ LORA_ATTENTION_PROCESSORS,
+ AttnProcessor,
+ AttnProcessor2_5,
+ LoRAAttnAddedKVProcessor,
+ LoRAAttnProcessor,
+ LoRAAttnProcessor2_5,
+ LoRAXFormersAttnProcessor,
+ XFormersAttnProcessor,
+ )
-
+ unet_attention_classes = {type(processor) for _, processor in self.unet.attn_processors.items()}
- This is an experimental API.
+ if unet_attention_classes.issubset(LORA_ATTENTION_PROCESSORS):
+ # Handle attention processors that are a mix of regular attention and AddedKV
+ # attention.
+ if len(unet_attention_classes) > 1 or LoRAAttnAddedKVProcessor in unet_attention_classes:
+ self.unet.set_default_attn_processor()
+ else:
+ regular_attention_classes = {
+ LoRAAttnProcessor: AttnProcessor,
+ LoRAAttnProcessor2_5: AttnProcessor2_5,
+ LoRAXFormersAttnProcessor: XFormersAttnProcessor,
+ }
+ [attention_proc_class] = unet_attention_classes
+ self.unet.set_attn_processor(regular_attention_classes[attention_proc_class]())
-
+ for _, module in self.unet.named_sublayers():
+ if hasattr(module, "set_lora_layer"):
+ module.set_lora_layer(None)
- Args:
- unfuse_unet (`bool`, defaults to `True`): Whether to unfuse the UNet LoRA parameters.
- unfuse_text_encoder (`bool`, defaults to `True`):
- Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the
- LoRA parameters then it won't have any effect.
- """
- if unfuse_unet:
- self.unet.unfuse_lora()
-
- def unfuse_text_encoder_lora(text_encoder):
- for _, attn_module in text_encoder_attn_modules(text_encoder):
- if isinstance(attn_module.q_proj, PatchedLoraProjection):
- attn_module.q_proj._unfuse_lora()
- attn_module.k_proj._unfuse_lora()
- attn_module.v_proj._unfuse_lora()
- attn_module.out_proj._unfuse_lora()
-
- for _, mlp_module in text_encoder_mlp_modules(text_encoder):
- if isinstance(mlp_module.linear1, PatchedLoraProjection):
- mlp_module.linear1._unfuse_lora()
- mlp_module.linear2._unfuse_lora()
-
- if unfuse_text_encoder:
- if hasattr(self, "text_encoder"):
- unfuse_text_encoder_lora(self.text_encoder)
- if hasattr(self, "text_encoder_2"):
- unfuse_text_encoder_lora(self.text_encoder_2)
-
- self.num_fused_loras -= 1
+ # Safe to call the following regardless of LoRA.
+ self._remove_text_encoder_monkey_patch()
class FromSingleFileMixin:
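
End to end, the restored loader/unloader pair is meant to be used roughly as below; the checkpoint id and LoRA file name are placeholders, not verified assets.

```python
from ppdiffusers import DiffusionPipeline

# Load a base pipeline and attach LoRA weights (placeholder file name).
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipe.load_lora_weights(".", weight_name="my_lora.safetensors")
image = pipe("a pokemon with blue eyes").images[0]

# Detach the LoRA layers again: UNet attention processors fall back to their
# regular classes and the text encoder monkey-patch is removed.
pipe.unload_lora_weights()
image = pipe("a pokemon with blue eyes").images[0]
```
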
diff --git a/ppdiffusers/ppdiffusers/models/__init__.py b/ppdiffusers/ppdiffusers/models/__init__.py
index 431b806c9..08192f41d 100644
--- a/ppdiffusers/ppdiffusers/models/__init__.py
+++ b/ppdiffusers/ppdiffusers/models/__init__.py
@@ -37,7 +37,6 @@
from .unet_2d import UNet2DModel
from .unet_2d_condition import UNet2DConditionModel
from .unet_3d_condition import UNet3DConditionModel
- from .unet_motion_model import MotionAdapter, UNetMotionModel
from .vq_model import VQModel
try:
diff --git a/ppdiffusers/ppdiffusers/models/activations.py b/ppdiffusers/ppdiffusers/models/activations.py
index 61d66cc4b..8091cb327 100644
--- a/ppdiffusers/ppdiffusers/models/activations.py
+++ b/ppdiffusers/ppdiffusers/models/activations.py
@@ -13,94 +13,14 @@
# limitations under the License.
import paddle.nn as nn
-import paddle.nn.functional as F
-from ..utils import USE_PEFT_BACKEND
-from .lora import LoRACompatibleLinear
-ACTIVATION_FUNCTIONS = {
- "swish": nn.Silu(),
- "silu": nn.Silu(),
- "mish": nn.Mish(),
- "gelu": nn.GELU(),
- "relu": nn.ReLU(),
-}
-
-
-def get_activation(act_fn: str) -> nn.Layer:
- """Helper function to get activation function from string.
-
- Args:
- act_fn (str): Name of activation function.
-
- Returns:
- nn.Layer: Activation function.
- """
-
- act_fn = act_fn.lower()
- if act_fn in ACTIVATION_FUNCTIONS:
- return ACTIVATION_FUNCTIONS[act_fn]
+def get_activation(act_fn):
+ if act_fn in ["swish", "silu"]:
+ return nn.Silu()
+ elif act_fn == "mish":
+ return nn.Mish()
+ elif act_fn == "gelu":
+ return nn.GELU()
else:
raise ValueError(f"Unsupported activation function: {act_fn}")
-
-
-class GELU(nn.Layer):
- r"""
- GELU activation function with tanh approximation support with `approximate="tanh"`.
-
- Parameters:
- dim_in (`int`): The number of channels in the input.
- dim_out (`int`): The number of channels in the output.
- approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation.
- """
-
- def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"):
- super().__init__()
- self.proj = LoRACompatibleLinear(dim_in, dim_out)
- self.approximate = approximate
- self.approximate_bool = approximate == "tanh"
-
- def forward(self, hidden_states):
- hidden_states = self.proj(hidden_states)
- hidden_states = F.gelu(hidden_states, approximate=self.approximate_bool)
- return hidden_states
-
-
-class GEGLU(nn.Layer):
- r"""
- A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function.
-
- Parameters:
- dim_in (`int`): The number of channels in the input.
- dim_out (`int`): The number of channels in the output.
- """
-
- def __init__(self, dim_in: int, dim_out: int):
- super().__init__()
- linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear
-
- self.proj = linear_cls(dim_in, dim_out * 2)
-
- def forward(self, hidden_states, scale: float = 1.0):
- args = () if USE_PEFT_BACKEND else (scale,)
- hidden_states, gate = self.proj(hidden_states, *args).chunk(2, axis=-1)
- return hidden_states * F.gelu(gate)
-
-
-class ApproximateGELU(nn.Layer):
- r"""
- The approximate form of the Gaussian Error Linear Unit (GELU). For more details, see section 2 of this
- [paper](https://arxiv.org/abs/1606.08415).
-
- Parameters:
- dim_in (`int`): The number of channels in the input.
- dim_out (`int`): The number of channels in the output.
- """
-
- def __init__(self, dim_in: int, dim_out: int):
- super().__init__()
- self.proj = nn.Linear(dim_in, dim_out)
-
- def forward(self, x):
- x = self.proj(x)
- return x * F.sigmoid(1.702 * x)
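
After the revert, `get_activation` is back to a plain string-to-layer lookup; a short usage sketch (note that with this version an unsupported name such as "relu" raises a ValueError):

```python
import paddle
from ppdiffusers.models.activations import get_activation

act = get_activation("silu")   # returns nn.Silu()
y = act(paddle.randn([2, 4]))

act = get_activation("gelu")   # returns nn.GELU()
```
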
diff --git a/ppdiffusers/ppdiffusers/models/adapter.py b/ppdiffusers/ppdiffusers/models/adapter.py
index cd41c7218..783fde367 100644
--- a/ppdiffusers/ppdiffusers/models/adapter.py
+++ b/ppdiffusers/ppdiffusers/models/adapter.py
@@ -12,17 +12,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import os
-from typing import Callable, List, Optional, Union
+
+from typing import List, Optional
import paddle
import paddle.nn as nn
from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import logging
from .modeling_utils import ModelMixin
-
-logger = logging.get_logger(__name__)
+from .resnet import Downsample2D
class MultiAdapter(ModelMixin):
@@ -40,39 +38,9 @@ class MultiAdapter(ModelMixin):
def __init__(self, adapters: List["T2IAdapter"]):
super(MultiAdapter, self).__init__()
-
self.num_adapter = len(adapters)
self.adapters = nn.LayerList(adapters)
- if len(adapters) == 0:
- raise ValueError("Expecting at least one adapter")
-
- if len(adapters) == 1:
- raise ValueError("For a single adapter, please use the `T2IAdapter` class instead of `MultiAdapter`")
-
- # The outputs from each adapter are added together with a weight.
- # This means that the change in dimensions from downsampling must
- # be the same for all adapters. Inductively, it also means the
- # downscale_factor and total_downscale_factor must be the same for all
- # adapters.
- first_adapter_total_downscale_factor = adapters[0].total_downscale_factor
- first_adapter_downscale_factor = adapters[0].downscale_factor
- for idx in range(1, len(adapters)):
- if (
- adapters[idx].total_downscale_factor != first_adapter_total_downscale_factor
- or adapters[idx].downscale_factor != first_adapter_downscale_factor
- ):
- raise ValueError(
- f"Expecting all adapters to have the same downscaling behavior, but got:\n"
- f"adapters[0].total_downscale_factor={first_adapter_total_downscale_factor}\n"
- f"adapters[0].downscale_factor={first_adapter_downscale_factor}\n"
- f"adapter[`{idx}`].total_downscale_factor={adapters[idx].total_downscale_factor}\n"
- f"adapter[`{idx}`].downscale_factor={adapters[idx].downscale_factor}"
- )
-
- self.total_downscale_factor = first_adapter_total_downscale_factor
- self.downscale_factor = first_adapter_downscale_factor
-
def forward(self, xs: paddle.Tensor, adapter_weights: Optional[List[float]] = None) -> List[paddle.Tensor]:
"""
Args:
@@ -87,132 +55,21 @@ def forward(self, xs: paddle.Tensor, adapter_weights: Optional[List[float]] = No
adapter_weights = paddle.to_tensor([1 / self.num_adapter] * self.num_adapter)
else:
adapter_weights = paddle.to_tensor(adapter_weights)
-
+ if xs.shape[1] % self.num_adapter != 0:
+ raise ValueError(
+ f"Expecting multi-adapter's input have number of channel that cab be evenly divisible by num_adapter: {xs.shape[1]} % {self.num_adapter} != 0"
+ )
+ x_list = paddle.chunk(xs, chunks=self.num_adapter, axis=1)
accume_state = None
- for x, w, adapter in zip(xs, adapter_weights, self.adapters):
+ for x, w, adapter in zip(x_list, adapter_weights, self.adapters):
features = adapter(x)
if accume_state is None:
accume_state = features
- for i in range(len(accume_state)):
- accume_state[i] = w * accume_state[i]
else:
for i in range(len(features)):
accume_state[i] += w * features[i]
return accume_state
- def save_pretrained(
- self,
- save_directory: Union[str, os.PathLike],
- is_main_process: bool = True,
- save_function: Callable = None,
- safe_serialization: bool = True,
- variant: Optional[str] = None,
- ):
- """
- Save a model and its configuration file to a directory, so that it can be re-loaded using the
- `[`~models.adapter.MultiAdapter.from_pretrained`]` class method.
-
- Arguments:
- save_directory (`str` or `os.PathLike`):
- Directory to which to save. Will be created if it doesn't exist.
- is_main_process (`bool`, *optional*, defaults to `True`):
- Whether the process calling this is the main process or not. Useful when in distributed training like
- TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
- the main process to avoid race conditions.
- save_function (`Callable`):
- The function to use to save the state dictionary. Useful on distributed training like TPUs when one
- need to replace `torch.save` by another method. Can be configured with the environment variable
- `DIFFUSERS_SAVE_MODE`.
- safe_serialization (`bool`, *optional*, defaults to `True`):
- Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
- variant (`str`, *optional*):
- If specified, weights are saved in the format pytorch_model..bin.
- """
- idx = 0
- model_path_to_save = save_directory
- for adapter in self.adapters:
- adapter.save_pretrained(
- model_path_to_save,
- is_main_process=is_main_process,
- save_function=save_function,
- safe_serialization=safe_serialization,
- variant=variant,
- )
-
- idx += 1
- model_path_to_save = model_path_to_save + f"_{idx}"
-
- @classmethod
- def from_pretrained(cls, pretrained_model_path: Optional[Union[str, os.PathLike]], **kwargs):
- r"""
- Instantiate a pretrained MultiAdapter model from multiple pre-trained adapter models.
-
- The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train
- the model, you should first set it back in training mode with `model.train()`.
-
- The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
- pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
- task.
-
- The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
- weights are discarded.
-
- Parameters:
- pretrained_model_path (`os.PathLike`):
- A path to a *directory* containing model weights saved using
- [`~diffusers.models.adapter.MultiAdapter.save_pretrained`], e.g., `./my_model_directory/adapter`.
- torch_dtype (`str` or `torch.dtype`, *optional*):
- Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype
- will be automatically derived from the model's weights.
- output_loading_info(`bool`, *optional*, defaults to `False`):
- Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages.
- device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
- A map that specifies where each submodule should go. It doesn't need to be refined to each
- parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the
- same device.
-
- To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For
- more information about each option see [designing a device
- map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map).
- max_memory (`Dict`, *optional*):
- A dictionary device identifier to maximum memory. Will default to the maximum memory available for each
- GPU and the available CPU RAM if unset.
- low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
- Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
- also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
- model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
- setting this argument to `True` will raise an error.
- variant (`str`, *optional*):
- If specified load weights from `variant` filename, *e.g.* pytorch_model..bin. `variant` is
- ignored when using `from_flax`.
- use_safetensors (`bool`, *optional*, defaults to `None`):
- If set to `None`, the `safetensors` weights will be downloaded if they're available **and** if the
- `safetensors` library is installed. If set to `True`, the model will be forcibly loaded from
- `safetensors` weights. If set to `False`, loading will *not* use `safetensors`.
- """
- idx = 0
- adapters = []
-
- # load adapter and append to list until no adapter directory exists anymore
- # first adapter has to be saved under `./mydirectory/adapter` to be compliant with `DiffusionPipeline.from_pretrained`
- # second, third, ... adapters have to be saved under `./mydirectory/adapter_1`, `./mydirectory/adapter_2`, ...
- model_path_to_load = pretrained_model_path
- while os.path.isdir(model_path_to_load):
- adapter = T2IAdapter.from_pretrained(model_path_to_load, **kwargs)
- adapters.append(adapter)
-
- idx += 1
- model_path_to_load = pretrained_model_path + f"_{idx}"
-
- logger.info(f"{len(adapters)} adapters loaded from {pretrained_model_path}.")
-
- if len(adapters) == 0:
- raise ValueError(
- f"No T2IAdapters found under {os.path.dirname(pretrained_model_path)}. Expected at least {pretrained_model_path + '_0'}."
- )
-
- return cls(adapters)
-
class T2IAdapter(ModelMixin, ConfigMixin):
r"""
@@ -222,10 +79,8 @@ class T2IAdapter(ModelMixin, ConfigMixin):
[Adapter](https://github.com/TencentARC/T2I-Adapter/blob/686de4681515662c0ac2ffa07bf5dda83af1038a/ldm/modules/encoders/adapter.py#L97)
and
[AdapterLight](https://github.com/TencentARC/T2I-Adapter/blob/686de4681515662c0ac2ffa07bf5dda83af1038a/ldm/modules/encoders/adapter.py#L235).
-
This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library
implements for all the model (such as downloading or saving, etc.)
-
Parameters:
in_channels (`int`, *optional*, defaults to 3):
Number of channels of Aapter's input(*control image*). Set this parameter to 1 if you're using gray scale
@@ -234,11 +89,7 @@ class T2IAdapter(ModelMixin, ConfigMixin):
The number of channel of each downsample block's output hidden state. The `len(block_out_channels)` will
also determine the number of downsample blocks in the Adapter.
num_res_blocks (`int`, *optional*, defaults to 2):
- Number of ResNet blocks in each downsample block.
- downscale_factor (`int`, *optional*, defaults to 8):
- A factor that determines the total downscale factor of the Adapter.
- adapter_type (`str`, *optional*, defaults to `full_adapter`):
- The type of Adapter to use. Choose either `full_adapter` or `full_adapter_xl` or `light_adapter`.
+            Number of ResNet blocks in each downsample block.
"""
@register_to_config
@@ -254,45 +105,23 @@ def __init__(
if adapter_type == "full_adapter":
self.adapter = FullAdapter(in_channels, channels, num_res_blocks, downscale_factor)
- elif adapter_type == "full_adapter_xl":
- self.adapter = FullAdapterXL(in_channels, channels, num_res_blocks, downscale_factor)
elif adapter_type == "light_adapter":
self.adapter = LightAdapter(in_channels, channels, num_res_blocks, downscale_factor)
else:
- raise ValueError(
- f"Unsupported adapter_type: '{adapter_type}'. Choose either 'full_adapter' or "
- "'full_adapter_xl' or 'light_adapter'."
- )
+ raise ValueError(f"unknown adapter_type: {type}. Choose either 'full_adapter' or 'simple_adapter'")
def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]:
- r"""
- This function processes the input tensor `x` through the adapter model and returns a list of feature tensors,
- each representing information extracted at a different scale from the input. The length of the list is
- determined by the number of downsample blocks in the Adapter, as specified by the `channels` and
- `num_res_blocks` parameters during initialization.
- """
return self.adapter(x)
@property
def total_downscale_factor(self):
return self.adapter.total_downscale_factor
- @property
- def downscale_factor(self):
- """The downscale factor applied in the T2I-Adapter's initial pixel unshuffle operation. If an input image's dimensions are
- not evenly divisible by the downscale_factor then an exception will be raised.
- """
- return self.adapter.unshuffle.downscale_factor
-
# full adapter
class FullAdapter(nn.Layer):
- r"""
- See [`T2IAdapter`] for more information.
- """
-
def __init__(
self,
in_channels: int = 3,
@@ -320,62 +149,6 @@ def __init__(
self.total_downscale_factor = downscale_factor * 2 ** (len(channels) - 1)
def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]:
- r"""
- This method processes the input tensor `x` through the FullAdapter model and performs operations including
- pixel unshuffling, convolution, and a stack of AdapterBlocks. It returns a list of feature tensors, each
- capturing information at a different stage of processing within the FullAdapter model. The number of feature
- tensors in the list is determined by the number of downsample blocks specified during initialization.
- """
- x = self.unshuffle(x)
- x = self.conv_in(x)
-
- features = []
-
- for block in self.body:
- x = block(x)
- features.append(x)
-
- return features
-
-
-class FullAdapterXL(nn.Layer):
- r"""
- See [`T2IAdapter`] for more information.
- """
-
- def __init__(
- self,
- in_channels: int = 3,
- channels: List[int] = [320, 640, 1280, 1280],
- num_res_blocks: int = 2,
- downscale_factor: int = 16,
- ):
- super().__init__()
-
- in_channels = in_channels * downscale_factor**2
-
- self.unshuffle = nn.PixelUnshuffle(downscale_factor)
- self.conv_in = nn.Conv2D(in_channels, channels[0], kernel_size=3, padding=1)
-
- self.body = []
- # blocks to extract XL features with dimensions of [320, 64, 64], [640, 64, 64], [1280, 32, 32], [1280, 32, 32]
- for i in range(len(channels)):
- if i == 1:
- self.body.append(AdapterBlock(channels[i - 1], channels[i], num_res_blocks))
- elif i == 2:
- self.body.append(AdapterBlock(channels[i - 1], channels[i], num_res_blocks, down=True))
- else:
- self.body.append(AdapterBlock(channels[i], channels[i], num_res_blocks))
-
- self.body = nn.ModuleList(self.body)
- # XL has only one downsampling AdapterBlock.
- self.total_downscale_factor = downscale_factor * 2
-
- def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]:
- r"""
- This method takes the tensor x as input and processes it through FullAdapterXL model. It consists of operations
- including unshuffling pixels, applying convolution layer and appending each block into list of feature tensors.
- """
x = self.unshuffle(x)
x = self.conv_in(x)
@@ -389,27 +162,12 @@ def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]:
class AdapterBlock(nn.Layer):
- r"""
- An AdapterBlock is a helper model that contains multiple ResNet-like blocks. It is used in the `FullAdapter` and
- `FullAdapterXL` models.
-
- Parameters:
- in_channels (`int`):
- Number of channels of AdapterBlock's input.
- out_channels (`int`):
- Number of channels of AdapterBlock's output.
- num_res_blocks (`int`):
- Number of ResNet blocks in the AdapterBlock.
- down (`bool`, *optional*, defaults to `False`):
- Whether to perform downsampling on AdapterBlock's input.
- """
-
- def __init__(self, in_channels: int, out_channels: int, num_res_blocks: int, down: bool = False):
+ def __init__(self, in_channels, out_channels, num_res_blocks, down=False):
super().__init__()
self.downsample = None
if down:
- self.downsample = nn.AvgPool2D(kernel_size=2, stride=2, ceil_mode=True)
+ self.downsample = Downsample2D(in_channels)
self.in_conv = None
if in_channels != out_channels:
@@ -419,12 +177,7 @@ def __init__(self, in_channels: int, out_channels: int, num_res_blocks: int, dow
*[AdapterResnetBlock(out_channels) for _ in range(num_res_blocks)],
)
- def forward(self, x: paddle.Tensor) -> paddle.Tensor:
- r"""
- This method takes tensor x as input and performs operations downsampling and convolutional layers if the
- self.downsample and self.in_conv properties of AdapterBlock model are specified. Then it applies a series of
- residual blocks to the input tensor.
- """
+ def forward(self, x):
if self.downsample is not None:
x = self.downsample(x)
@@ -437,14 +190,6 @@ def forward(self, x: paddle.Tensor) -> paddle.Tensor:
class AdapterResnetBlock(nn.Layer):
- r"""
- An `AdapterResnetBlock` is a helper model that implements a ResNet-like block.
-
- Parameters:
- channels (`int`):
- Number of channels of AdapterResnetBlock's input and output.
- """
-
def __init__(self, channels):
super().__init__()
self.block1 = nn.Conv2D(channels, channels, kernel_size=3, padding=1)
@@ -452,12 +197,9 @@ def __init__(self, channels):
self.block2 = nn.Conv2D(channels, channels, kernel_size=1)
def forward(self, x):
- r"""
- This method takes input tensor x and applies a convolutional layer, ReLU activation, and another convolutional
- layer on the input tensor. It returns addition with the input tensor.
- """
-
- h = self.act(self.block1(x))
+ h = x
+ h = self.block1(h)
+ h = self.act(h)
h = self.block2(h)
return h + x
@@ -467,10 +209,6 @@ def forward(self, x):
class LightAdapter(nn.Layer):
- r"""
- See [`T2IAdapter`] for more information.
- """
-
def __init__(
self,
in_channels: int = 3,
@@ -497,11 +235,7 @@ def __init__(
self.total_downscale_factor = downscale_factor * (2 ** len(channels))
- def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]:
- r"""
- This method takes the input tensor x and performs downscaling and appends it in list of feature tensors. Each
- feature tensor corresponds to a different level of processing within the LightAdapter.
- """
+ def forward(self, x):
x = self.unshuffle(x)
features = []
@@ -514,38 +248,19 @@ def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]:
class LightAdapterBlock(nn.Layer):
- r"""
- A `LightAdapterBlock` is a helper model that contains multiple `LightAdapterResnetBlocks`. It is used in the
- `LightAdapter` model.
-
- Parameters:
- in_channels (`int`):
- Number of channels of LightAdapterBlock's input.
- out_channels (`int`):
- Number of channels of LightAdapterBlock's output.
- num_res_blocks (`int`):
- Number of LightAdapterResnetBlocks in the LightAdapterBlock.
- down (`bool`, *optional*, defaults to `False`):
- Whether to perform downsampling on LightAdapterBlock's input.
- """
-
def __init__(self, in_channels, out_channels, num_res_blocks, down=False):
super().__init__()
mid_channels = out_channels // 4
self.downsample = None
if down:
- self.downsample = nn.AvgPool2D(kernel_size=2, stride=2, ceil_mode=True)
+ self.downsample = Downsample2D(in_channels)
self.in_conv = nn.Conv2D(in_channels, mid_channels, kernel_size=1)
self.resnets = nn.Sequential(*[LightAdapterResnetBlock(mid_channels) for _ in range(num_res_blocks)])
self.out_conv = nn.Conv2D(mid_channels, out_channels, kernel_size=1)
- def forward(self, x: paddle.Tensor) -> paddle.Tensor:
- r"""
- This method takes tensor x as input and performs downsampling if required. Then it applies in convolution
- layer, a sequence of residual blocks, and out convolutional layer.
- """
+ def forward(self, x):
if self.downsample is not None:
x = self.downsample(x)
@@ -557,28 +272,16 @@ def forward(self, x: paddle.Tensor) -> paddle.Tensor:
class LightAdapterResnetBlock(nn.Layer):
- """
- A `LightAdapterResnetBlock` is a helper model that implements a ResNet-like block with a slightly different
- architecture than `AdapterResnetBlock`.
-
- Parameters:
- channels (`int`):
- Number of channels of LightAdapterResnetBlock's input and output.
- """
-
def __init__(self, channels):
super().__init__()
self.block1 = nn.Conv2D(channels, channels, kernel_size=3, padding=1)
self.act = nn.ReLU()
self.block2 = nn.Conv2D(channels, channels, kernel_size=3, padding=1)
- def forward(self, x: paddle.Tensor) -> paddle.Tensor:
- r"""
- This function takes input tensor x and processes it through one convolutional layer, ReLU activation, and
- another convolutional layer and adds it to input tensor.
- """
-
- h = self.act(self.block1(x))
+ def forward(self, x):
+ h = x
+ h = self.block1(h)
+ h = self.act(h)
h = self.block2(h)
return h + x
diff --git a/ppdiffusers/ppdiffusers/models/attention.py b/ppdiffusers/ppdiffusers/models/attention.py
index 3343636d3..bc6f8cdcf 100644
--- a/ppdiffusers/ppdiffusers/models/attention.py
+++ b/ppdiffusers/ppdiffusers/models/attention.py
@@ -19,12 +19,11 @@
import paddle.nn.functional as F
from paddle import nn
-from ..utils import is_ppxformers_available, maybe_allow_in_graph
-from .activations import GEGLU, GELU, ApproximateGELU
+from ..utils import is_ppxformers_available
+from .activations import get_activation
from .attention_processor import Attention
-from .embeddings import SinusoidalPositionalEmbedding
+from .embeddings import CombinedTimestepLabelEmbeddings
from .lora import LoRACompatibleLinear
-from .normalization import AdaLayerNorm, AdaLayerNormZero
def drop_path(input, drop_prob: float = 0.0, training: bool = False):
@@ -229,56 +228,6 @@ def forward(self, hidden_states):
return hidden_states
-@maybe_allow_in_graph
-class GatedSelfAttentionDense(nn.Layer):
- r"""
- A gated self-attention dense layer that combines visual features and object features.
-
- Parameters:
- query_dim (`int`): The number of channels in the query.
- context_dim (`int`): The number of channels in the context.
- n_heads (`int`): The number of heads to use for attention.
- d_head (`int`): The number of channels in each head.
- """
-
- def __init__(self, query_dim: int, context_dim: int, n_heads: int, d_head: int):
- super().__init__()
-
- # we need a linear projection since we need cat visual feature and obj feature
- self.linear = nn.Linear(context_dim, query_dim)
-
- self.attn = Attention(query_dim=query_dim, heads=n_heads, dim_head=d_head)
- self.ff = FeedForward(query_dim, activation_fn="geglu")
-
- self.norm1 = nn.LayerNorm(query_dim)
- self.norm2 = nn.LayerNorm(query_dim)
-
- tmp1 = paddle.to_tensor(0.0)
- tmp2 = paddle.to_tensor(0.0)
- self.register_parameter(
- "alpha_attn",
- self.create_parameter(tmp1.shape, tmp1.dtype, default_initializer=paddle.nn.initializer.Assign(tmp1)),
- )
- self.register_parameter(
- "alpha_dense",
- self.create_parameter(tmp2.shape, tmp2.dtype, default_initializer=paddle.nn.initializer.Assign(tmp2)),
- )
-
- self.enabled = True
-
- def forward(self, x, objs):
- if not self.enabled:
- return x
-
- n_visual = x.shape[1]
- objs = self.linear(objs)
-
- x = x + self.alpha_attn.tanh() * self.attn(self.norm1(paddle.concat([x, objs], axis=1)))[:, :n_visual, :]
- x = x + self.alpha_dense.tanh() * self.ff(self.norm2(x))
-
- return x
-
-
class BasicTransformerBlock(nn.Layer):
r"""
A basic Transformer block.
@@ -289,29 +238,15 @@ class BasicTransformerBlock(nn.Layer):
attention_head_dim (`int`): The number of channels in each head.
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+ only_cross_attention (`bool`, *optional*):
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
+ double_self_attention (`bool`, *optional*):
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
num_embeds_ada_norm (:
obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
attention_bias (:
obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
- only_cross_attention (`bool`, *optional*):
- Whether to use only cross-attention layers. In this case two cross attention layers are used.
- double_self_attention (`bool`, *optional*):
- Whether to use two self-attention layers. In this case no cross attention layers are used.
- upcast_attention (`bool`, *optional*):
- Whether to upcast the attention computation to float32. This is useful for mixed precision training.
- norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
- Whether to use learnable elementwise affine parameters for normalization.
- norm_type (`str`, *optional*, defaults to `"layer_norm"`):
- The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
- final_dropout (`bool` *optional*, defaults to False):
- Whether to apply a final dropout after the last feed-forward layer.
- attention_type (`str`, *optional*, defaults to `"default"`):
- The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
- positional_embeddings (`str`, *optional*, defaults to `None`):
- The type of positional embeddings to apply to.
- num_positional_embeddings (`int`, *optional*, defaults to `None`):
- The maximum number of positional embeddings to apply.
"""
def __init__(
@@ -328,20 +263,14 @@ def __init__(
double_self_attention: bool = False,
upcast_attention: bool = False,
norm_elementwise_affine: bool = True,
- norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single'
- norm_eps: float = 1e-5,
+ norm_type: str = "layer_norm",
final_dropout: bool = False,
- attention_type: str = "default",
- positional_embeddings: Optional[str] = None,
- num_positional_embeddings: Optional[int] = None,
):
super().__init__()
self.only_cross_attention = only_cross_attention
self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
- self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
- self.use_layer_norm = norm_type == "layer_norm"
if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
raise ValueError(
@@ -354,16 +283,6 @@ def __init__(
else:
norm_kwargs = {}
- if positional_embeddings and (num_positional_embeddings is None):
- raise ValueError(
- "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
- )
-
- if positional_embeddings == "sinusoidal":
- self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
- else:
- self.pos_embed = None
-
# Define 3 blocks. Each block has its own normalization layer.
# 1. Self-Attn
if self.use_ada_layer_norm:
@@ -404,22 +323,9 @@ def __init__(
self.attn2 = None
# 3. Feed-forward
- if not self.use_ada_layer_norm_single:
- self.norm3 = nn.LayerNorm(dim, **norm_kwargs)
-
+ self.norm3 = nn.LayerNorm(dim, **norm_kwargs)
self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)
- # 4. Fuser
- if attention_type == "gated" or attention_type == "gated-text-image":
- self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
-
- # 5. Scale-shift for PixArt-Alpha.
- if self.use_ada_layer_norm_single:
- tmp_param = paddle.randn([6, dim]) / dim**0.5
- self.scale_shift_table = self.create_parameter(
- shape=tmp_param.shape, default_initializer=paddle.nn.initializer.Assign(tmp_param)
- )
-
# let chunk size default to None
self._chunk_size = None
self._chunk_dim = 0
@@ -440,37 +346,17 @@ def forward(
class_labels: Optional[paddle.Tensor] = None,
):
# Notice that normalization is always applied before the real computation in the following blocks.
- # 0. Self-Attention
- batch_size = hidden_states.shape[0]
-
+ # 1. Self-Attention
if self.use_ada_layer_norm:
norm_hidden_states = self.norm1(hidden_states, timestep)
elif self.use_ada_layer_norm_zero:
norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
)
- elif self.use_layer_norm:
- norm_hidden_states = self.norm1(hidden_states)
- elif self.use_ada_layer_norm_single:
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
- self.scale_shift_table[None] + timestep.reshape([batch_size, 6, -1])
- ).chunk(6, dim=1)
- norm_hidden_states = self.norm1(hidden_states)
- norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
- norm_hidden_states = norm_hidden_states.squeeze(1)
else:
- raise ValueError("Incorrect norm used")
-
- if self.pos_embed is not None:
- norm_hidden_states = self.pos_embed(norm_hidden_states)
-
- # 1. Retrieve lora scale.
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
-
- # 2. Prepare GLIGEN inputs
- cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
- gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
+ norm_hidden_states = self.norm1(hidden_states)
+ cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
attn_output = self.attn1(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
@@ -479,33 +365,14 @@ def forward(
)
if self.use_ada_layer_norm_zero:
attn_output = gate_msa.unsqueeze(1) * attn_output
- elif self.use_ada_layer_norm_single:
- attn_output = gate_msa * attn_output
-
hidden_states = attn_output + hidden_states
- if hidden_states.ndim == 4:
- hidden_states = hidden_states.squeeze(1)
- # 2.5 GLIGEN Control
- if gligen_kwargs is not None:
- hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
-
- # 3. Cross-Attention
if self.attn2 is not None:
- if self.use_ada_layer_norm:
- norm_hidden_states = self.norm2(hidden_states, timestep)
- elif self.use_ada_layer_norm_zero or self.use_layer_norm:
- norm_hidden_states = self.norm2(hidden_states)
- elif self.use_ada_layer_norm_single:
- # For PixArt norm2 isn't applied here:
- # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
- norm_hidden_states = hidden_states
- else:
- raise ValueError("Incorrect norm")
-
- if self.pos_embed is not None and self.use_ada_layer_norm_single is False:
- norm_hidden_states = self.pos_embed(norm_hidden_states)
+ norm_hidden_states = (
+ self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+ )
+ # 2. Cross-Attention
attn_output = self.attn2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
@@ -514,17 +381,12 @@ def forward(
)
hidden_states = attn_output + hidden_states
- # 4. Feed-forward
- if not self.use_ada_layer_norm_single:
- norm_hidden_states = self.norm3(hidden_states)
+ # 3. Feed-forward
+ norm_hidden_states = self.norm3(hidden_states)
if self.use_ada_layer_norm_zero:
norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
- if self.use_ada_layer_norm_single:
- norm_hidden_states = self.norm2(hidden_states)
- norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
-
if self._chunk_size is not None:
# "feed_forward_chunk_size" can be used to save memory
if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
@@ -534,23 +396,16 @@ def forward(
num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
ff_output = paddle.concat(
- [
- self.ff(hid_slice, scale=lora_scale)
- for hid_slice in norm_hidden_states.chunk(num_chunks, axis=self._chunk_dim)
- ],
+ [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, axis=self._chunk_dim)],
axis=self._chunk_dim,
)
else:
- ff_output = self.ff(norm_hidden_states, scale=lora_scale)
+ ff_output = self.ff(norm_hidden_states)
if self.use_ada_layer_norm_zero:
ff_output = gate_mlp.unsqueeze(1) * ff_output
- elif self.use_ada_layer_norm_single:
- ff_output = gate_mlp * ff_output
hidden_states = ff_output + hidden_states
- if hidden_states.ndim == 4:
- hidden_states = hidden_states.squeeze(1)
return hidden_states
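
The chunked feed-forward path above trades one large MLP call for several smaller ones along `_chunk_dim`. A small self-contained sketch of the same idea; the MLP and the sizes are made up for illustration.

    import paddle
    import paddle.nn as nn

    # When a chunk size is set, the sequence axis is split into equal pieces that are pushed
    # through the feed-forward network one at a time, capping peak activation memory.
    ff = nn.Sequential(nn.Linear(64, 256), nn.GELU(), nn.Linear(256, 64))
    hidden = paddle.randn([2, 128, 64])

    chunk_size, chunk_dim = 32, 1
    num_chunks = hidden.shape[chunk_dim] // chunk_size  # must divide evenly, as the check above enforces
    out = paddle.concat(
        [ff(piece) for piece in hidden.chunk(num_chunks, axis=chunk_dim)],
        axis=chunk_dim,
    )
    print(paddle.allclose(out, ff(hidden), atol=1e-5))  # same result as the unchunked pass
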
@@ -601,10 +456,137 @@ def __init__(
if final_dropout:
self.net.append(nn.Dropout(dropout))
- def forward(self, hidden_states, scale: float = 1.0):
+ def forward(self, hidden_states):
for module in self.net:
- if isinstance(module, (LoRACompatibleLinear, GEGLU)):
- hidden_states = module(hidden_states, scale)
- else:
- hidden_states = module(hidden_states)
+ hidden_states = module(hidden_states)
return hidden_states
+
+
+class GELU(nn.Layer):
+ r"""
+ GELU activation function with optional tanh approximation, selected with `approximate="tanh"`.
+ """
+
+ def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"):
+ super().__init__()
+ self.proj = LoRACompatibleLinear(dim_in, dim_out)
+ self.approximate = approximate
+ self.approximate_bool = approximate == "tanh"
+
+ def forward(self, hidden_states):
+ hidden_states = self.proj(hidden_states)
+ hidden_states = F.gelu(hidden_states, approximate=self.approximate_bool)
+ return hidden_states
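
The `approximate` string is mapped to paddle's boolean flag, so `approximate="tanh"` selects the tanh-based curve. A quick sanity check, assuming only paddle is installed:

    import paddle
    import paddle.nn.functional as F

    x = paddle.linspace(-3.0, 3.0, 7)
    exact = F.gelu(x)                           # erf-based GELU (approximate="none")
    tanh_based = F.gelu(x, approximate=True)    # what GELU(..., approximate="tanh") dispatches to
    print(paddle.abs(exact - tanh_based).max()) # tiny difference, on the order of 1e-3 or below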
+
+
+class GEGLU(nn.Layer):
+ r"""
+ A variant of the gated linear unit activation function from https://arxiv.org/abs/2002.05202.
+
+ Parameters:
+ dim_in (`int`): The number of channels in the input.
+ dim_out (`int`): The number of channels in the output.
+ """
+
+ def __init__(self, dim_in: int, dim_out: int):
+ super().__init__()
+ self.proj = LoRACompatibleLinear(dim_in, dim_out * 2)
+
+ def forward(self, hidden_states):
+ hidden_states, gate = self.proj(hidden_states).chunk(2, axis=-1)
+ return hidden_states * F.gelu(gate)
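
GEGLU runs a single projection to twice the output width and splits it into a value half and a gate half; a standalone sketch of that gating (sizes are arbitrary):

    import paddle
    import paddle.nn as nn
    import paddle.nn.functional as F

    dim_in, dim_out = 8, 16
    proj = nn.Linear(dim_in, dim_out * 2)     # one projection produces both halves
    x = paddle.randn([2, 5, dim_in])
    hidden, gate = proj(x).chunk(2, axis=-1)  # each half is [2, 5, dim_out]
    out = hidden * F.gelu(gate)               # gate the value half, as GEGLU.forward does
    print(out.shape)                          # [2, 5, 16]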
+
+
+class ApproximateGELU(nn.Layer):
+ """
+ The approximate form of the Gaussian Error Linear Unit (GELU).
+
+ For more details, see section 2: https://arxiv.org/abs/1606.08415
+ """
+
+ def __init__(self, dim_in: int, dim_out: int):
+ super().__init__()
+ self.proj = nn.Linear(dim_in, dim_out)
+
+ def forward(self, x):
+ x = self.proj(x)
+ return x * F.sigmoid(1.702 * x)
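
The sigmoid form `x * sigmoid(1.702 * x)` tracks the exact GELU closely; a quick numeric comparison:

    import paddle
    import paddle.nn.functional as F

    x = paddle.linspace(-3.0, 3.0, 7)
    approx = x * F.sigmoid(1.702 * x)            # the approximation used by ApproximateGELU
    print(paddle.abs(approx - F.gelu(x)).max())  # deviation from exact GELU is on the order of 1e-2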
+
+
+class AdaLayerNorm(nn.Layer):
+ """
+ Norm layer modified to incorporate timestep embeddings.
+ """
+
+ def __init__(self, embedding_dim, num_embeddings):
+ super().__init__()
+ self.emb = nn.Embedding(num_embeddings, embedding_dim)
+ self.silu = nn.Silu()
+ self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
+ # no learnable affine params (torch: elementwise_affine=False)
+ norm_kwargs = {"weight_attr": False, "bias_attr": False}
+ self.norm = nn.LayerNorm(embedding_dim, **norm_kwargs)
+
+ def forward(self, x, timestep):
+ emb = self.linear(self.silu(self.emb(timestep)))
+ # must chunk along the last axis here; this differs from the pytorch reference implementation
+ scale, shift = paddle.chunk(emb, 2, axis=-1)
+ x = self.norm(x) * (1 + scale) + shift
+ return x
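
A self-contained sketch of the modulation AdaLayerNorm performs: a timestep index selects an embedding that is mapped to per-channel (scale, shift) applied on top of a parameter-free LayerNorm. The dimensions below are illustrative.

    import paddle
    import paddle.nn as nn
    import paddle.nn.functional as F

    dim, num_steps = 64, 1000
    emb = nn.Embedding(num_steps, dim)
    to_scale_shift = nn.Linear(dim, dim * 2)
    norm = nn.LayerNorm(dim, weight_attr=False, bias_attr=False)  # no learnable affine

    x = paddle.randn([2, 16, dim])      # [batch, seq, dim]
    t = paddle.to_tensor([10])          # a shared timestep index
    scale, shift = to_scale_shift(F.silu(emb(t))).chunk(2, axis=-1)
    out = norm(x) * (1 + scale) + shift # broadcast over batch and sequence
    print(out.shape)                    # [2, 16, 64]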
+
+
+class AdaLayerNormZero(nn.Layer):
+ """
+ Norm layer adaptive layer norm zero (adaLN-Zero).
+ """
+
+ def __init__(self, embedding_dim, num_embeddings):
+ super().__init__()
+
+ self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim)
+
+ self.silu = nn.Silu()
+ self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias_attr=True)
+ # no learnable affine params (torch: elementwise_affine=False)
+ norm_kwargs = {"weight_attr": False, "bias_attr": False}
+ self.norm = nn.LayerNorm(embedding_dim, epsilon=1e-6, **norm_kwargs)
+
+ def forward(self, x, timestep, class_labels, hidden_dtype=None):
+ emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)))
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, axis=1)
+ x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
+ return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
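
adaLN-Zero projects one conditioning vector to six chunks: shift/scale/gate for the attention branch and shift/scale/gate for the feed-forward branch. A sketch with a random stand-in for the combined timestep/label embedding:

    import paddle
    import paddle.nn as nn
    import paddle.nn.functional as F

    dim = 64
    linear = nn.Linear(dim, 6 * dim)
    norm = nn.LayerNorm(dim, weight_attr=False, bias_attr=False)

    x = paddle.randn([2, 16, dim])
    cond = paddle.randn([2, dim])  # stands in for the CombinedTimestepLabelEmbeddings output
    shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = linear(F.silu(cond)).chunk(6, axis=1)
    x_mod = norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]  # fed into self-attention
    print(x_mod.shape, gate_msa.shape)  # [2, 16, 64] [2, 64]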
+
+
+class AdaGroupNorm(nn.Layer):
+ """
+ GroupNorm layer modified to incorporate timestep embeddings.
+ """
+
+ def __init__(
+ self, embedding_dim: int, out_dim: int, num_groups: int, act_fn: Optional[str] = None, eps: float = 1e-5
+ ):
+ super().__init__()
+ self.num_groups = num_groups
+ self.eps = eps
+ if act_fn is None:
+ self.act = None
+ else:
+ self.act = get_activation(act_fn)
+
+ self.linear = nn.Linear(embedding_dim, out_dim * 2)
+ # no learnable affine params (torch: affine=False)
+ norm_kwargs = {"weight_attr": False, "bias_attr": False}
+ self.group_norm = nn.GroupNorm(num_groups, out_dim, epsilon=eps, **norm_kwargs)
+ self.group_norm.weight = None
+ self.group_norm.bias = None
+
+ def forward(self, x, emb):
+ if self.act:
+ emb = self.act(emb)
+ emb = self.linear(emb)
+ emb = emb[:, :, None, None]
+ scale, shift = emb.chunk(2, axis=1)
+ x = self.group_norm(x)
+ x = x * (1 + scale) + shift
+ return x
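
Same idea over NCHW feature maps: the embedding becomes per-channel (scale, shift) applied after a parameter-free GroupNorm. A minimal sketch with made-up sizes:

    import paddle
    import paddle.nn as nn

    channels, emb_dim, groups = 32, 128, 8
    linear = nn.Linear(emb_dim, channels * 2)
    group_norm = nn.GroupNorm(groups, channels, weight_attr=False, bias_attr=False)

    x = paddle.randn([2, channels, 16, 16])
    emb = linear(paddle.randn([2, emb_dim]))[:, :, None, None]  # [2, 2 * channels, 1, 1]
    scale, shift = emb.chunk(2, axis=1)
    out = group_norm(x) * (1 + scale) + shift
    print(out.shape)  # [2, 32, 16, 16]
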
diff --git a/ppdiffusers/ppdiffusers/models/attention_processor.py b/ppdiffusers/ppdiffusers/models/attention_processor.py
index e7789de33..0ed4e1e5d 100644
--- a/ppdiffusers/ppdiffusers/models/attention_processor.py
+++ b/ppdiffusers/ppdiffusers/models/attention_processor.py
@@ -12,15 +12,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from importlib import import_module
from typing import Optional, Union
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
-from ..utils import USE_PEFT_BACKEND, deprecate, is_ppxformers_available, logging
-from .lora import LoRACompatibleLinear, LoRALinearLayer
+from ..utils import deprecate, is_ppxformers_available, logging
+from .lora import LoRALinearLayer
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -30,50 +29,14 @@ class Attention(nn.Layer):
A cross attention layer.
Parameters:
- query_dim (`int`):
- The number of channels in the query.
+ query_dim (`int`): The number of channels in the query.
cross_attention_dim (`int`, *optional*):
The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
- heads (`int`, *optional*, defaults to 8):
- The number of heads to use for multi-head attention.
- dim_head (`int`, *optional*, defaults to 64):
- The number of channels in each head.
- dropout (`float`, *optional*, defaults to 0.0):
- The dropout probability to use.
+ heads (`int`, *optional*, defaults to 8): The number of heads to use for multi-head attention.
+ dim_head (`int`, *optional*, defaults to 64): The number of channels in each head.
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
bias (`bool`, *optional*, defaults to False):
Set to `True` for the query, key, and value linear layers to contain a bias parameter.
- upcast_attention (`bool`, *optional*, defaults to False):
- Set to `True` to upcast the attention computation to `float32`.
- upcast_softmax (`bool`, *optional*, defaults to False):
- Set to `True` to upcast the softmax computation to `float32`.
- cross_attention_norm (`str`, *optional*, defaults to `None`):
- The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
- cross_attention_norm_num_groups (`int`, *optional*, defaults to 32):
- The number of groups to use for the group norm in the cross attention.
- added_kv_proj_dim (`int`, *optional*, defaults to `None`):
- The number of channels to use for the added key and value projections. If `None`, no projection is used.
- norm_num_groups (`int`, *optional*, defaults to `None`):
- The number of groups to use for the group norm in the attention.
- spatial_norm_dim (`int`, *optional*, defaults to `None`):
- The number of channels to use for the spatial normalization.
- out_bias (`bool`, *optional*, defaults to `True`):
- Set to `True` to use a bias in the output linear layer.
- scale_qk (`bool`, *optional*, defaults to `True`):
- Set to `True` to scale the query and key by `1 / sqrt(dim_head)`.
- only_cross_attention (`bool`, *optional*, defaults to `False`):
- Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if
- `added_kv_proj_dim` is not `None`.
- eps (`float`, *optional*, defaults to 1e-5):
- An additional value added to the denominator in group normalization that is used for numerical stability.
- rescale_output_factor (`float`, *optional*, defaults to 1.0):
- A factor to rescale the output by dividing it with this value.
- residual_connection (`bool`, *optional*, defaults to `False`):
- Set to `True` to add the residual connection to the output.
- _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`):
- Set to `True` if the attention block is loaded from a deprecated state dict.
- processor (`AttnProcessor`, *optional*, defaults to `None`):
- The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and
- `AttnProcessor` otherwise.
"""
def __init__(
@@ -83,7 +46,7 @@ def __init__(
heads: int = 8,
dim_head: int = 64,
dropout: float = 0.0,
- bias: bool = False,
+ bias=False,
upcast_attention: bool = False,
upcast_softmax: bool = False,
cross_attention_norm: Optional[str] = None,
@@ -97,12 +60,12 @@ def __init__(
eps: float = 1e-5,
rescale_output_factor: float = 1.0,
residual_connection: bool = False,
- _from_deprecated_attn_block: bool = False,
+ _from_deprecated_attn_block=False,
processor: Optional["AttnProcessor"] = None,
):
super().__init__()
- self.inner_dim = dim_head * heads
- self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
+ inner_dim = dim_head * heads
+ cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
self.upcast_attention = upcast_attention
self.upcast_softmax = upcast_softmax
self.rescale_output_factor = rescale_output_factor
@@ -144,7 +107,7 @@ def __init__(
if cross_attention_norm is None:
self.norm_cross = None
elif cross_attention_norm == "layer_norm":
- self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
+ self.norm_cross = nn.LayerNorm(cross_attention_dim)
elif cross_attention_norm == "group_norm":
if self.added_kv_proj_dim is not None:
# The given `encoder_hidden_states` are initially of shape
@@ -154,7 +117,7 @@ def __init__(
# the number of channels for the group norm.
norm_cross_num_channels = added_kv_proj_dim
else:
- norm_cross_num_channels = self.cross_attention_dim
+ norm_cross_num_channels = cross_attention_dim
self.norm_cross = nn.GroupNorm(
num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, epsilon=1e-5
@@ -164,22 +127,22 @@ def __init__(
f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
)
- self.to_q = LoRACompatibleLinear(query_dim, self.inner_dim, bias_attr=bias)
+ self.to_q = nn.Linear(query_dim, inner_dim, bias_attr=bias)
if not self.only_cross_attention:
# only relevant for the `AddedKVProcessor` classes
- self.to_k = LoRACompatibleLinear(self.cross_attention_dim, self.inner_dim, bias_attr=bias)
- self.to_v = LoRACompatibleLinear(self.cross_attention_dim, self.inner_dim, bias_attr=bias)
+ self.to_k = nn.Linear(cross_attention_dim, inner_dim, bias_attr=bias)
+ self.to_v = nn.Linear(cross_attention_dim, inner_dim, bias_attr=bias)
else:
self.to_k = None
self.to_v = None
if self.added_kv_proj_dim is not None:
- self.add_k_proj = LoRACompatibleLinear(added_kv_proj_dim, self.inner_dim)
- self.add_v_proj = LoRACompatibleLinear(added_kv_proj_dim, self.inner_dim)
+ self.add_k_proj = nn.Linear(added_kv_proj_dim, inner_dim)
+ self.add_v_proj = nn.Linear(added_kv_proj_dim, inner_dim)
self.to_out = nn.LayerList([])
- self.to_out.append(LoRACompatibleLinear(self.inner_dim, query_dim, bias_attr=out_bias))
+ self.to_out.append(nn.Linear(inner_dim, query_dim, bias_attr=out_bias))
self.to_out.append(nn.Dropout(dropout))
# set attention processor
@@ -189,17 +152,7 @@ def __init__(
def set_use_memory_efficient_attention_xformers(
self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[str] = None
- ) -> None:
- r"""
- Set whether to use memory efficient attention from `xformers` or not.
-
- Args:
- use_memory_efficient_attention_xformers (`bool`):
- Whether to use memory efficient attention from `xformers` or not.
- attention_op (`Callable`, *optional*):
- The attention operation to use. Defaults to `None` which uses the default attention operation from
- `xformers`.
- """
+ ):
is_lora = hasattr(self, "processor") and isinstance(
self.processor,
LORA_ATTENTION_PROCESSORS,
@@ -217,7 +170,6 @@ def set_use_memory_efficient_attention_xformers(
LoRAAttnAddedKVProcessor,
),
)
-
if use_memory_efficient_attention_xformers:
if is_added_kv_processor and (is_lora or is_custom_diffusion):
raise NotImplementedError(
@@ -300,13 +252,6 @@ def set_use_memory_efficient_attention_xformers(
self.set_processor(processor)
def set_attention_slice(self, slice_size):
- r"""
- Set the slice size for attention computation.
-
- Args:
- slice_size (`int`):
- The slice size for attention computation.
- """
if slice_size is not None and slice_size > self.sliceable_head_dim:
raise ValueError(f"slice_size {slice_size} has to be smaller or equal to {self.sliceable_head_dim}.")
@@ -321,32 +266,7 @@ def set_attention_slice(self, slice_size):
self.set_processor(processor)
- def set_processor(self, processor: "AttnProcessor", _remove_lora: bool = False) -> None:
- r"""
- Set the attention processor to use.
-
- Args:
- processor (`AttnProcessor`):
- The attention processor to use.
- _remove_lora (`bool`, *optional*, defaults to `False`):
- Set to `True` to remove LoRA layers from the model.
- """
- if (
- hasattr(self, "processor")
- and not isinstance(processor, LORA_ATTENTION_PROCESSORS)
- and self.to_q.lora_layer is not None
- ):
- deprecate(
- "set_processor to offload LoRA",
- "0.26.0",
- "In detail, removing LoRA layers via calling `set_default_attn_processor` is deprecated. Please make sure to call `pipe.unload_lora_weights()` instead.",
- )
- # TODO(Patrick, Sayak) - this can be deprecated once PEFT LoRA integration is complete
- # We need to remove all LoRA layers
- for module in self.sublayers():
- if hasattr(module, "set_lora_layer"):
- module.set_lora_layer(None)
-
+ def set_processor(self, processor: "AttnProcessor"):
# if current processor is in `self._sub_layers` and if passed `processor` is not, we need to
# pop `processor` from `self._sub_layers`
if hasattr(self, "processor") and isinstance(self.processor, nn.Layer) and not isinstance(processor, nn.Layer):
@@ -355,113 +275,7 @@ def set_processor(self, processor: "AttnProcessor", _remove_lora: bool = False)
self.processor = processor
- def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor":
- r"""
- Get the attention processor in use.
-
- Args:
- return_deprecated_lora (`bool`, *optional*, defaults to `False`):
- Set to `True` to return the deprecated LoRA attention processor.
-
- Returns:
- "AttentionProcessor": The attention processor in use.
- """
- if not return_deprecated_lora:
- return self.processor
-
- # TODO(Sayak, Patrick). The rest of the function is needed to ensure backwards compatible
- # serialization format for LoRA Attention Processors. It should be deleted once the integration
- # with PEFT is completed.
- is_lora_activated = {
- name: module.lora_layer is not None
- for name, module in self.named_sublayers()
- if hasattr(module, "lora_layer")
- }
-
- # 1. if no layer has a LoRA activated we can return the processor as usual
- if not any(is_lora_activated.values()):
- return self.processor
-
- # If doesn't apply LoRA do `add_k_proj` or `add_v_proj`
- is_lora_activated.pop("add_k_proj", None)
- is_lora_activated.pop("add_v_proj", None)
- # 2. else it is not posssible that only some layers have LoRA activated
- if not all(is_lora_activated.values()):
- raise ValueError(
- f"Make sure that either all layers or no layers have LoRA activated, but have {is_lora_activated}"
- )
-
- # 3. And we need to merge the current LoRA layers into the corresponding LoRA attention processor
- non_lora_processor_cls_name = self.processor.__class__.__name__
- lora_processor_cls = getattr(import_module(__name__), "LoRA" + non_lora_processor_cls_name)
-
- hidden_size = self.inner_dim
-
- # now create a LoRA attention processor from the LoRA layers
- if lora_processor_cls in [LoRAAttnProcessor, LoRAAttnProcessor2_5, LoRAXFormersAttnProcessor]:
- kwargs = {
- "cross_attention_dim": self.cross_attention_dim,
- "rank": self.to_q.lora_layer.rank,
- "network_alpha": self.to_q.lora_layer.network_alpha,
- "q_rank": self.to_q.lora_layer.rank,
- "q_hidden_size": self.to_q.lora_layer.out_features,
- "k_rank": self.to_k.lora_layer.rank,
- "k_hidden_size": self.to_k.lora_layer.out_features,
- "v_rank": self.to_v.lora_layer.rank,
- "v_hidden_size": self.to_v.lora_layer.out_features,
- "out_rank": self.to_out[0].lora_layer.rank,
- "out_hidden_size": self.to_out[0].lora_layer.out_features,
- }
-
- if hasattr(self.processor, "attention_op"):
- kwargs["attention_op"] = self.prcoessor.attention_op
-
- lora_processor = lora_processor_cls(hidden_size, **kwargs)
- lora_processor.to_q_lora.load_dict(self.to_q.lora_layer.state_dict())
- lora_processor.to_k_lora.load_dict(self.to_k.lora_layer.state_dict())
- lora_processor.to_v_lora.load_dict(self.to_v.lora_layer.state_dict())
- lora_processor.to_out_lora.load_dict(self.to_out[0].lora_layer.state_dict())
- elif lora_processor_cls == LoRAAttnAddedKVProcessor:
- lora_processor = lora_processor_cls(
- hidden_size,
- cross_attention_dim=self.add_k_proj.weight.shape[0],
- rank=self.to_q.lora_layer.rank,
- network_alpha=self.to_q.lora_layer.network_alpha,
- )
- lora_processor.to_q_lora.load_dict(self.to_q.lora_layer.state_dict())
- lora_processor.to_k_lora.load_dict(self.to_k.lora_layer.state_dict())
- lora_processor.to_v_lora.load_dict(self.to_v.lora_layer.state_dict())
- lora_processor.to_out_lora.load_dict(self.to_out[0].lora_layer.state_dict())
-
- # only save if used
- if self.add_k_proj.lora_layer is not None:
- lora_processor.add_k_proj_lora.load_dict(self.add_k_proj.lora_layer.state_dict())
- lora_processor.add_v_proj_lora.load_dict(self.add_v_proj.lora_layer.state_dict())
- else:
- lora_processor.add_k_proj_lora = None
- lora_processor.add_v_proj_lora = None
- else:
- raise ValueError(f"{lora_processor_cls} does not exist.")
-
- return lora_processor
-
def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs):
- r"""
- The forward method of the `Attention` class.
-
- Args:
- hidden_states (`paddle.Tensor`):
- The hidden states of the query.
- encoder_hidden_states (`paddle.Tensor`, *optional*):
- The hidden states of the encoder.
- attention_mask (`paddle.Tensor`, *optional*):
- The attention mask to use. If `None`, no mask is applied.
- **cross_attention_kwargs:
- Additional keyword arguments to pass along to the cross attention.
-
- Returns:
- `paddle.Tensor`: The output of the attention layer.
- """
# The `Attention` class can call different attention processors / attention functions
# here we simply pass along all tensors to the selected processor class
# For standard processors that are defined here, `**cross_attention_kwargs` is empty
@@ -518,7 +332,7 @@ def prepare_attention_mask(self, attention_mask, target_length, batch_size=None,
if batch_size is None:
deprecate(
"batch_size=None",
- "0.22.0",
+ "0.0.15",
message=(
"Not passing the `batch_size` parameter to `prepare_attention_mask` can lead to incorrect"
" attention mask preparation and is deprecated behavior. Please make sure to pass `batch_size` to"
@@ -580,12 +394,10 @@ def __call__(
encoder_hidden_states=None,
attention_mask=None,
temb=None,
- scale=1.0,
**cross_attention_kwargs
):
- residual = hidden_states
-
- args = () if USE_PEFT_BACKEND else (scale,)
+ if attn.residual_connection:
+ residual = hidden_states
if attn.spatial_norm is not None:
hidden_states = attn.spatial_norm(hidden_states, temb)
@@ -604,17 +416,123 @@ def __call__(
if attn.group_norm is not None:
hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
- query = attn.to_q(hidden_states, *args)
+ query = attn.to_q(hidden_states)
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
- key = attn.to_k(encoder_hidden_states, *args)
- value = attn.to_v(encoder_hidden_states, *args)
+ key = attn.to_k(encoder_hidden_states)
+ value = attn.to_v(encoder_hidden_states)
+
+ query = attn.head_to_batch_dim(query)
+ key = attn.head_to_batch_dim(key)
+ value = attn.head_to_batch_dim(value)
+
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
+ hidden_states = paddle.matmul(attention_probs, value)
+ hidden_states = attn.batch_to_head_dim(hidden_states)
+
+ # linear proj
+ hidden_states = attn.to_out[0](hidden_states)
+ # dropout
+ hidden_states = attn.to_out[1](hidden_states)
+
+ if input_ndim == 4:
+ hidden_states = hidden_states.transpose([0, 2, 1]).reshape([batch_size, channel, height, width])
+
+ if attn.residual_connection:
+ hidden_states = hidden_states + residual
+
+ hidden_states = hidden_states / attn.rescale_output_factor
+
+ return hidden_states
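
A shape-level sketch of what the default processor computes between the q/k/v projections and `to_out`: attention scores, a softmax over the key axis, and a weighted sum of values. The multi-head reshapes done by `head_to_batch_dim` / `batch_to_head_dim` are assumed to have happened already.

    import paddle
    import paddle.nn.functional as F

    batch_heads, q_len, kv_len, head_dim = 4, 16, 77, 40
    q = paddle.randn([batch_heads, q_len, head_dim])
    k = paddle.randn([batch_heads, kv_len, head_dim])
    v = paddle.randn([batch_heads, kv_len, head_dim])

    scores = paddle.matmul(q, k, transpose_y=True) * (head_dim ** -0.5)  # roughly what get_attention_scores builds
    probs = F.softmax(scores, axis=-1)  # [4, 16, 77]
    out = paddle.matmul(probs, v)       # [4, 16, 40], then projected by attn.to_out[0] / attn.to_out[1]
    print(out.shape)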
+
+
+class LoRAAttnProcessor(nn.Layer):
+ r"""
+ Processor for implementing the LoRA attention mechanism.
+
+ Args:
+ hidden_size (`int`, *optional*):
+ The hidden size of the attention layer.
+ cross_attention_dim (`int`, *optional*):
+ The number of channels in the `encoder_hidden_states`.
+ rank (`int`, defaults to 4):
+ The dimension of the LoRA update matrices.
+ network_alpha (`int`, *optional*):
+ Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
+ """
+
+ def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None, **kwargs):
+ super().__init__()
+
+ self.hidden_size = hidden_size
+ self.cross_attention_dim = cross_attention_dim
+ self.rank = rank
+
+ q_rank = kwargs.pop("q_rank", None)
+ q_hidden_size = kwargs.pop("q_hidden_size", None)
+ q_rank = q_rank if q_rank is not None else rank
+ q_hidden_size = q_hidden_size if q_hidden_size is not None else hidden_size
+
+ v_rank = kwargs.pop("v_rank", None)
+ v_hidden_size = kwargs.pop("v_hidden_size", None)
+ v_rank = v_rank if v_rank is not None else rank
+ v_hidden_size = v_hidden_size if v_hidden_size is not None else hidden_size
+
+ out_rank = kwargs.pop("out_rank", None)
+ out_hidden_size = kwargs.pop("out_hidden_size", None)
+ out_rank = out_rank if out_rank is not None else rank
+ out_hidden_size = out_hidden_size if out_hidden_size is not None else hidden_size
+
+ self.to_q_lora = LoRALinearLayer(q_hidden_size, q_hidden_size, q_rank, network_alpha)
+ self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+ self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha)
+ self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha)
+
+ def __call__(
+ self,
+ attn: Attention,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ scale=1.0,
+ temb=None,
+ **cross_attention_kwargs
+ ):
+ if attn.residual_connection:
+ residual = hidden_states
+
+ if attn.spatial_norm is not None:
+ hidden_states = attn.spatial_norm(hidden_states, temb)
+
+ input_ndim = hidden_states.ndim
+
+ if input_ndim == 4:
+ batch_size, channel, height, width = hidden_states.shape
+ hidden_states = hidden_states.reshape([batch_size, channel, height * width]).transpose([0, 2, 1])
+
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
+
+ query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
query = attn.head_to_batch_dim(query)
+
+ if encoder_hidden_states is None:
+ encoder_hidden_states = hidden_states
+ elif attn.norm_cross:
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+ key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
+ value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
+
key = attn.head_to_batch_dim(key)
value = attn.head_to_batch_dim(value)
@@ -623,7 +541,7 @@ def __call__(
hidden_states = attn.batch_to_head_dim(hidden_states)
# linear proj
- hidden_states = attn.to_out[0](hidden_states, *args)
+ hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
# dropout
hidden_states = attn.to_out[1](hidden_states)
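
The processor adds a low-rank correction on top of each frozen projection. A generic sketch of that update (this is not ppdiffusers' `LoRALinearLayer` itself; when `network_alpha` is used, the update is additionally rescaled, typically by `network_alpha / rank`):

    import paddle
    import paddle.nn as nn

    dim, rank, scale = 64, 4, 1.0
    to_q = nn.Linear(dim, dim)                    # frozen base projection (plays the role of attn.to_q)
    down = nn.Linear(dim, rank, bias_attr=False)  # LoRA down-projection: dim -> rank
    up = nn.Linear(rank, dim, bias_attr=False)    # LoRA up-projection: rank -> dim

    x = paddle.randn([2, 16, dim])
    query = to_q(x) + scale * up(down(x))         # same form as attn.to_q(h) + scale * self.to_q_lora(h)
    print(query.shape)  # [2, 16, 64]
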
@@ -659,12 +577,12 @@ class CustomDiffusionAttnProcessor(nn.Layer):
def __init__(
self,
- train_kv: bool = True,
- train_q_out: bool = True,
- hidden_size: Optional[int] = None,
- cross_attention_dim: Optional[int] = None,
- out_bias: bool = True,
- dropout: float = 0.0,
+ train_kv=True,
+ train_q_out=True,
+ hidden_size=None,
+ cross_attention_dim=None,
+ out_bias=True,
+ dropout=0.0,
):
super().__init__()
self.train_kv = train_kv
@@ -746,6 +664,87 @@ class AttnAddedKVProcessor:
encoder.
"""
+ def __call__(
+ self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs
+ ):
+ residual = hidden_states
+ hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose(
+ [0, 2, 1]
+ )
+ batch_size, sequence_length, _ = hidden_states.shape
+
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+ if encoder_hidden_states is None:
+ encoder_hidden_states = hidden_states
+ elif attn.norm_cross:
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+ hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
+
+ query = attn.to_q(hidden_states)
+ query = attn.head_to_batch_dim(query)
+
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+ encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
+ encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
+
+ if not attn.only_cross_attention:
+ key = attn.to_k(hidden_states)
+ value = attn.to_v(hidden_states)
+ key = attn.head_to_batch_dim(key)
+ value = attn.head_to_batch_dim(value)
+ key = paddle.concat([encoder_hidden_states_key_proj, key], axis=2)
+ value = paddle.concat([encoder_hidden_states_value_proj, value], axis=2)
+ else:
+ key = encoder_hidden_states_key_proj
+ value = encoder_hidden_states_value_proj
+
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
+ hidden_states = paddle.matmul(attention_probs, value)
+ hidden_states = attn.batch_to_head_dim(hidden_states)
+
+ # linear proj
+ hidden_states = attn.to_out[0](hidden_states)
+ # dropout
+ hidden_states = attn.to_out[1](hidden_states)
+
+ hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape)
+ hidden_states = hidden_states + residual
+
+ return hidden_states
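
Conceptually, the "added KV" processors let the latent queries attend over both the latent sequence and extra keys/values projected from the encoder sequence. A sketch of that concatenation, with shapes shown before any multi-head reshape:

    import paddle
    import paddle.nn as nn

    dim = 64
    to_k, to_v = nn.Linear(dim, dim), nn.Linear(dim, dim)
    add_k_proj, add_v_proj = nn.Linear(dim, dim), nn.Linear(dim, dim)

    latents = paddle.randn([2, 16, dim])   # hidden_states sequence
    encoder = paddle.randn([2, 77, dim])   # encoder_hidden_states sequence
    key = paddle.concat([add_k_proj(encoder), to_k(latents)], axis=1)    # [2, 77 + 16, 64]
    value = paddle.concat([add_v_proj(encoder), to_v(latents)], axis=1)  # [2, 93, 64]
    print(key.shape, value.shape)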
+
+
+class LoRAAttnAddedKVProcessor(nn.Layer):
+ r"""
+ Processor for implementing the LoRA attention mechanism with extra learnable key and value matrices for the text
+ encoder.
+
+ Args:
+ hidden_size (`int`, *optional*):
+ The hidden size of the attention layer.
+ cross_attention_dim (`int`, *optional*, defaults to `None`):
+ The number of channels in the `encoder_hidden_states`.
+ rank (`int`, defaults to 4):
+ The dimension of the LoRA update matrices.
+
+ """
+
+ def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None):
+ super().__init__()
+
+ self.hidden_size = hidden_size
+ self.cross_attention_dim = cross_attention_dim
+ self.rank = rank
+
+ self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+ self.add_k_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+ self.add_v_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+ self.to_k_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+ self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+ self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
+
def __call__(
self,
attn: Attention,
@@ -756,9 +755,6 @@ def __call__(
**cross_attention_kwargs
):
residual = hidden_states
-
- args = () if USE_PEFT_BACKEND else (scale,)
-
hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose(
[0, 2, 1]
)
@@ -773,21 +769,25 @@ def __call__(
hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
- query = attn.to_q(hidden_states, *args)
+ query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
query = attn.head_to_batch_dim(query)
- encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states, scale=scale)
- encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states, scale=scale)
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + scale * self.add_k_proj_lora(
+ encoder_hidden_states
+ )
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + scale * self.add_v_proj_lora(
+ encoder_hidden_states
+ )
encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
if not attn.only_cross_attention:
- key = attn.to_k(hidden_states, *args)
- value = attn.to_v(hidden_states, *args)
+ key = attn.to_k(hidden_states) + scale * self.to_k_lora(hidden_states)
+ value = attn.to_v(hidden_states) + scale * self.to_v_lora(hidden_states)
key = attn.head_to_batch_dim(key)
value = attn.head_to_batch_dim(value)
- key = paddle.concat([encoder_hidden_states_key_proj, key], axis=2)
- value = paddle.concat([encoder_hidden_states_value_proj, value], axis=2)
+ key = paddle.concat([encoder_hidden_states_key_proj, key], axis=1)
+ value = paddle.concat([encoder_hidden_states_value_proj, value], axis=1)
else:
key = encoder_hidden_states_key_proj
value = encoder_hidden_states_value_proj
@@ -797,7 +797,7 @@ def __call__(
hidden_states = attn.batch_to_head_dim(hidden_states)
# linear proj
- hidden_states = attn.to_out[0](hidden_states, *args)
+ hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
# dropout
hidden_states = attn.to_out[1](hidden_states)
@@ -906,12 +906,10 @@ def __call__(
encoder_hidden_states: Optional[paddle.Tensor] = None,
attention_mask: Optional[paddle.Tensor] = None,
temb: Optional[paddle.Tensor] = None,
- scale: float = 1.0,
**cross_attention_kwargs
):
- residual = hidden_states
-
- args = () if USE_PEFT_BACKEND else (scale,)
+ if attn.residual_connection:
+ residual = hidden_states
if attn.spatial_norm is not None:
hidden_states = attn.spatial_norm(hidden_states, temb)
@@ -940,15 +938,15 @@ def __call__(
if attn.group_norm is not None:
hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
- query = attn.to_q(hidden_states, *args)
+ query = attn.to_q(hidden_states)
if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
- key = attn.to_k(encoder_hidden_states, *args)
- value = attn.to_v(encoder_hidden_states, *args)
+ key = attn.to_k(encoder_hidden_states)
+ value = attn.to_v(encoder_hidden_states)
# if transpose = False, query's shape will be [batch_size, seq_len, num_head, head_dim]
query = attn.head_to_batch_dim(query, transpose=False)
@@ -969,7 +967,7 @@ def __call__(
# hidden_states = hidden_states.cast(query.dtype)
hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False)
# linear proj
- hidden_states = attn.to_out[0](hidden_states, *args)
+ hidden_states = attn.to_out[0](hidden_states)
# dropout
hidden_states = attn.to_out[1](hidden_states)
@@ -984,21 +982,149 @@ def __call__(
return hidden_states
-class CustomDiffusionXFormersAttnProcessor(nn.Layer):
+class LoRAXFormersAttnProcessor(nn.Layer):
r"""
- Processor for implementing memory efficient attention using xFormers for the Custom Diffusion method.
+ Processor for implementing the LoRA attention mechanism with memory efficient attention using xFormers.
Args:
- train_kv (`bool`, defaults to `True`):
- Whether to newly train the key and value matrices corresponding to the text features.
- train_q_out (`bool`, defaults to `True`):
- Whether to newly train query matrices corresponding to the latent image features.
- hidden_size (`int`, *optional*, defaults to `None`):
- The hidden size of the attention layer.
- cross_attention_dim (`int`, *optional*, defaults to `None`):
- The number of channels in the `encoder_hidden_states`.
- out_bias (`bool`, defaults to `True`):
- Whether to include the bias parameter in `train_q_out`.
+ hidden_size (`int`, *optional*):
+ The hidden size of the attention layer.
+ cross_attention_dim (`int`, *optional*):
+ The number of channels in the `encoder_hidden_states`.
+ rank (`int`, defaults to 4):
+ The dimension of the LoRA update matrices.
+ attention_op (`Callable`, *optional*, defaults to `None`):
+ The base
+ [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
+ use as the attention operator. It is recommended to set this to `None` and allow xFormers to choose the best
+ operator.
+ network_alpha (`int`, *optional*):
+ Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
+
+ """
+
+ def __init__(
+ self,
+ hidden_size,
+ cross_attention_dim,
+ rank=4,
+ attention_op: Optional[str] = None,
+ network_alpha=None,
+ **kwargs,
+ ):
+ super().__init__()
+
+ self.hidden_size = hidden_size
+ self.cross_attention_dim = cross_attention_dim
+ self.rank = rank
+ self.attention_op = attention_op
+
+ q_rank = kwargs.pop("q_rank", None)
+ q_hidden_size = kwargs.pop("q_hidden_size", None)
+ q_rank = q_rank if q_rank is not None else rank
+ q_hidden_size = q_hidden_size if q_hidden_size is not None else hidden_size
+
+ v_rank = kwargs.pop("v_rank", None)
+ v_hidden_size = kwargs.pop("v_hidden_size", None)
+ v_rank = v_rank if v_rank is not None else rank
+ v_hidden_size = v_hidden_size if v_hidden_size is not None else hidden_size
+
+ out_rank = kwargs.pop("out_rank", None)
+ out_hidden_size = kwargs.pop("out_hidden_size", None)
+ out_rank = out_rank if out_rank is not None else rank
+ out_hidden_size = out_hidden_size if out_hidden_size is not None else hidden_size
+
+ self.to_q_lora = LoRALinearLayer(q_hidden_size, q_hidden_size, q_rank, network_alpha)
+ self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
+ self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha)
+ self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha)
+
+ def __call__(
+ self,
+ attn: Attention,
+ hidden_states,
+ encoder_hidden_states=None,
+ attention_mask=None,
+ scale=1.0,
+ temb=None,
+ **cross_attention_kwargs
+ ):
+ if attn.residual_connection:
+ residual = hidden_states
+
+ if attn.spatial_norm is not None:
+ hidden_states = attn.spatial_norm(hidden_states, temb)
+
+ input_ndim = hidden_states.ndim
+
+ if input_ndim == 4:
+ batch_size, channel, height, width = hidden_states.shape
+ hidden_states = hidden_states.reshape([batch_size, channel, height * width]).transpose([0, 2, 1])
+
+ batch_size, sequence_length, _ = (
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+ )
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False)
+ if attn.group_norm is not None:
+ hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
+
+ query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
+ query = attn.head_to_batch_dim(query, transpose=False)
+
+ if encoder_hidden_states is None:
+ encoder_hidden_states = hidden_states
+ elif attn.norm_cross:
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+ key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
+ value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
+
+ key = attn.head_to_batch_dim(key, transpose=False)
+ value = attn.head_to_batch_dim(value, transpose=False)
+
+ hidden_states = F.scaled_dot_product_attention_(
+ query,
+ key,
+ value,
+ attn_mask=attention_mask,
+ scale=attn.scale,
+ dropout_p=0.0,
+ training=attn.training,
+ attention_op=self.attention_op,
+ )
+
+ hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False)
+ # linear proj
+ hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
+ # dropout
+ hidden_states = attn.to_out[1](hidden_states)
+
+ if input_ndim == 4:
+ hidden_states = hidden_states.transpose([0, 2, 1]).reshape([batch_size, channel, height, width])
+
+ if attn.residual_connection:
+ hidden_states = hidden_states + residual
+
+ hidden_states = hidden_states / attn.rescale_output_factor
+
+ return hidden_states
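
A usage sketch for the LoRA processors defined above: build an `Attention` layer, attach a processor with `set_processor`, and control the LoRA strength through the `scale` kwarg at call time. The import path is assumed to be `ppdiffusers.models.attention_processor` after this patch, and the plain `LoRAAttnProcessor` is used so the snippet does not depend on the fused xFormers-style kernel; the same wiring applies to `LoRAXFormersAttnProcessor`.

    import paddle
    from ppdiffusers.models.attention_processor import Attention, LoRAAttnProcessor

    attn = Attention(query_dim=320, cross_attention_dim=768, heads=8, dim_head=40)
    attn.set_processor(LoRAAttnProcessor(hidden_size=320, cross_attention_dim=768, rank=4))

    hidden = paddle.randn([1, 64, 320])   # latent tokens
    encoder = paddle.randn([1, 77, 768])  # text-encoder tokens
    out = attn(hidden, encoder_hidden_states=encoder, scale=0.8)  # `scale` is forwarded to the LoRA processor
    print(out.shape)  # [1, 64, 320]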
+
+
+class CustomDiffusionXFormersAttnProcessor(nn.Layer):
+ r"""
+ Processor for implementing memory efficient attention using xFormers for the Custom Diffusion method.
+
+ Args:
+ train_kv (`bool`, defaults to `True`):
+ Whether to newly train the key and value matrices corresponding to the text features.
+ train_q_out (`bool`, defaults to `True`):
+ Whether to newly train query matrices corresponding to the latent image features.
+ hidden_size (`int`, *optional*, defaults to `None`):
+ The hidden size of the attention layer.
+ cross_attention_dim (`int`, *optional*, defaults to `None`):
+ The number of channels in the `encoder_hidden_states`.
+ out_bias (`bool`, defaults to `True`):
+ Whether to include the bias parameter in `train_q_out`.
dropout (`float`, *optional*, defaults to 0.0):
The dropout probability to use.
attention_op (`Callable`, *optional*, defaults to `None`):
@@ -1289,6 +1415,34 @@ def __call__(
return hidden_states
+AttnProcessor2_5 = XFormersAttnProcessor
+AttnAddedKVProcessor2_5 = XFormersAttnAddedKVProcessor
+LoRAAttnProcessor2_5 = LoRAXFormersAttnProcessor
+AttentionProcessor = Union[
+ AttnProcessor,
+ AttnProcessor2_5,
+ XFormersAttnProcessor,
+ SlicedAttnProcessor,
+ AttnAddedKVProcessor,
+ SlicedAttnAddedKVProcessor,
+ AttnAddedKVProcessor2_5,
+ XFormersAttnAddedKVProcessor,
+ LoRAAttnProcessor,
+ LoRAXFormersAttnProcessor,
+ LoRAAttnProcessor2_5,
+ LoRAAttnAddedKVProcessor,
+ CustomDiffusionAttnProcessor,
+ CustomDiffusionXFormersAttnProcessor,
+]
+
+LORA_ATTENTION_PROCESSORS = (
+ LoRAAttnProcessor,
+ LoRAAttnProcessor2_5,
+ LoRAXFormersAttnProcessor,
+ LoRAAttnAddedKVProcessor,
+)
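
A small sketch of how these aliases and groupings are meant to be consumed (same import-path assumption as above): the `*_2_5` names resolve to the xFormers-style classes, and `LORA_ATTENTION_PROCESSORS` backs the `isinstance` check used in `set_use_memory_efficient_attention_xformers`.

    from ppdiffusers.models.attention_processor import (
        LORA_ATTENTION_PROCESSORS,
        AttnProcessor2_5,
        LoRAAttnProcessor,
        XFormersAttnProcessor,
    )

    print(AttnProcessor2_5 is XFormersAttnProcessor)   # True: the alias points at the same class
    proc = LoRAAttnProcessor(hidden_size=320, rank=4)
    print(isinstance(proc, LORA_ATTENTION_PROCESSORS)) # True: recognised as a LoRA processor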
+
+
class SpatialNorm(nn.Layer):
"""
Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002
@@ -1314,414 +1468,3 @@ def forward(self, f, zq):
norm_f = self.norm_layer(f)
new_f = norm_f * self.conv_y(zq) + self.conv_b(zq)
return new_f
-
-
-# Deprecated
-class LoRAAttnProcessor(nn.Layer):
- r"""
- Processor for implementing the LoRA attention mechanism.
-
- Args:
- hidden_size (`int`, *optional*):
- The hidden size of the attention layer.
- cross_attention_dim (`int`, *optional*):
- The number of channels in the `encoder_hidden_states`.
- rank (`int`, defaults to 4):
- The dimension of the LoRA update matrices.
- network_alpha (`int`, *optional*):
- Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs.
- kwargs (`dict`):
- Additional keyword arguments to pass to the `LoRALinearLayer` layers.
- """
-
- def __init__(
- self,
- hidden_size: int,
- cross_attention_dim: Optional[int] = None,
- rank: int = 4,
- network_alpha: Optional[int] = None,
- **kwargs,
- ):
- super().__init__()
-
- self.hidden_size = hidden_size
- self.cross_attention_dim = cross_attention_dim
- self.rank = rank
-
- q_rank = kwargs.pop("q_rank", None)
- q_hidden_size = kwargs.pop("q_hidden_size", None)
- q_rank = q_rank if q_rank is not None else rank
- q_hidden_size = q_hidden_size if q_hidden_size is not None else hidden_size
-
- v_rank = kwargs.pop("v_rank", None)
- v_hidden_size = kwargs.pop("v_hidden_size", None)
- v_rank = v_rank if v_rank is not None else rank
- v_hidden_size = v_hidden_size if v_hidden_size is not None else hidden_size
-
- out_rank = kwargs.pop("out_rank", None)
- out_hidden_size = kwargs.pop("out_hidden_size", None)
- out_rank = out_rank if out_rank is not None else rank
- out_hidden_size = out_hidden_size if out_hidden_size is not None else hidden_size
-
- self.to_q_lora = LoRALinearLayer(q_hidden_size, q_hidden_size, q_rank, network_alpha)
- self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
- self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha)
- self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha)
-
- def __call__(
- self,
- attn: Attention,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- scale=1.0,
- temb=None,
- **cross_attention_kwargs
- ):
- if attn.residual_connection:
- residual = hidden_states
-
- if attn.spatial_norm is not None:
- hidden_states = attn.spatial_norm(hidden_states, temb)
-
- input_ndim = hidden_states.ndim
-
- if input_ndim == 4:
- batch_size, channel, height, width = hidden_states.shape
- hidden_states = hidden_states.reshape([batch_size, channel, height * width]).transpose([0, 2, 1])
-
- batch_size, sequence_length, _ = (
- hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
- )
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-
- if attn.group_norm is not None:
- hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
-
- query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
- query = attn.head_to_batch_dim(query)
-
- if encoder_hidden_states is None:
- encoder_hidden_states = hidden_states
- elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
- key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
- value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
-
- key = attn.head_to_batch_dim(key)
- value = attn.head_to_batch_dim(value)
-
- attention_probs = attn.get_attention_scores(query, key, attention_mask)
- hidden_states = paddle.matmul(attention_probs, value)
- hidden_states = attn.batch_to_head_dim(hidden_states)
-
- # linear proj
- hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
- # dropout
- hidden_states = attn.to_out[1](hidden_states)
-
- if input_ndim == 4:
- hidden_states = hidden_states.transpose([0, 2, 1]).reshape([batch_size, channel, height, width])
-
- if attn.residual_connection:
- hidden_states = hidden_states + residual
-
- hidden_states = hidden_states / attn.rescale_output_factor
-
- return hidden_states
-
-
-class LoRAXFormersAttnProcessor(nn.Layer):
- r"""
- Processor for implementing the LoRA attention mechanism with memory efficient attention using xFormers.
-
- Args:
- hidden_size (`int`, *optional*):
- The hidden size of the attention layer.
- cross_attention_dim (`int`, *optional*):
- The number of channels in the `encoder_hidden_states`.
- rank (`int`, defaults to 4):
- The dimension of the LoRA update matrices.
- attention_op (`Callable`, *optional*, defaults to `None`):
- The base
- [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
- use as the attention operator. It is recommended to set it to `None` and allow xFormers to choose the best
- operator.
- network_alpha (`int`, *optional*):
- Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
- kwargs (`dict`):
- Additional keyword arguments to pass to the `LoRALinearLayer` layers.
- """
-
- def __init__(
- self,
- hidden_size,
- cross_attention_dim,
- rank=4,
- attention_op: Optional[str] = None,
- network_alpha=None,
- **kwargs,
- ):
- super().__init__()
-
- self.hidden_size = hidden_size
- self.cross_attention_dim = cross_attention_dim
- self.rank = rank
- self.attention_op = attention_op
-
- q_rank = kwargs.pop("q_rank", None)
- q_hidden_size = kwargs.pop("q_hidden_size", None)
- q_rank = q_rank if q_rank is not None else rank
- q_hidden_size = q_hidden_size if q_hidden_size is not None else hidden_size
-
- v_rank = kwargs.pop("v_rank", None)
- v_hidden_size = kwargs.pop("v_hidden_size", None)
- v_rank = v_rank if v_rank is not None else rank
- v_hidden_size = v_hidden_size if v_hidden_size is not None else hidden_size
-
- out_rank = kwargs.pop("out_rank", None)
- out_hidden_size = kwargs.pop("out_hidden_size", None)
- out_rank = out_rank if out_rank is not None else rank
- out_hidden_size = out_hidden_size if out_hidden_size is not None else hidden_size
-
- self.to_q_lora = LoRALinearLayer(q_hidden_size, q_hidden_size, q_rank, network_alpha)
- self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
- self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha)
- self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha)
-
- def __call__(
- self,
- attn: Attention,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- scale=1.0,
- temb=None,
- **cross_attention_kwargs
- ):
- if attn.residual_connection:
- residual = hidden_states
-
- if attn.spatial_norm is not None:
- hidden_states = attn.spatial_norm(hidden_states, temb)
-
- input_ndim = hidden_states.ndim
-
- if input_ndim == 4:
- batch_size, channel, height, width = hidden_states.shape
- hidden_states = hidden_states.reshape([batch_size, channel, height * width]).transpose([0, 2, 1])
-
- batch_size, sequence_length, _ = (
- hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
- )
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, transpose=False)
- if attn.group_norm is not None:
- hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
-
- query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
- query = attn.head_to_batch_dim(query, transpose=False)
-
- if encoder_hidden_states is None:
- encoder_hidden_states = hidden_states
- elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
- key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states)
- value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states)
-
- key = attn.head_to_batch_dim(key, transpose=False)
- value = attn.head_to_batch_dim(value, transpose=False)
-
- hidden_states = F.scaled_dot_product_attention_(
- query,
- key,
- value,
- attn_mask=attention_mask,
- scale=attn.scale,
- dropout_p=0.0,
- training=attn.training,
- attention_op=self.attention_op,
- )
-
- hidden_states = attn.batch_to_head_dim(hidden_states, transpose=False)
- # linear proj
- hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
- # dropout
- hidden_states = attn.to_out[1](hidden_states)
-
- if input_ndim == 4:
- hidden_states = hidden_states.transpose([0, 2, 1]).reshape([batch_size, channel, height, width])
-
- if attn.residual_connection:
- hidden_states = hidden_states + residual
-
- hidden_states = hidden_states / attn.rescale_output_factor
-
- return hidden_states
-
-
-class LoRAAttnAddedKVProcessor(nn.Layer):
- r"""
- Processor for implementing the LoRA attention mechanism with extra learnable key and value matrices for the text
- encoder.
-
- Args:
- hidden_size (`int`, *optional*):
- The hidden size of the attention layer.
- cross_attention_dim (`int`, *optional*, defaults to `None`):
- The number of channels in the `encoder_hidden_states`.
- rank (`int`, defaults to 4):
- The dimension of the LoRA update matrices.
- network_alpha (`int`, *optional*):
- Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
- kwargs (`dict`):
- Additional keyword arguments to pass to the `LoRALinearLayer` layers.
- """
-
- def __init__(
- self,
- hidden_size: int,
- cross_attention_dim: Optional[int] = None,
- rank: int = 4,
- network_alpha: Optional[int] = None,
- ):
- super().__init__()
-
- self.hidden_size = hidden_size
- self.cross_attention_dim = cross_attention_dim
- self.rank = rank
-
- self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
- self.add_k_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
- self.add_v_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha)
- self.to_k_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
- self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
- self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha)
-
- def __call__(
- self,
- attn: Attention,
- hidden_states,
- encoder_hidden_states=None,
- attention_mask=None,
- scale=1.0,
- **cross_attention_kwargs
- ):
- residual = hidden_states
- hidden_states = hidden_states.reshape([hidden_states.shape[0], hidden_states.shape[1], -1]).transpose(
- [0, 2, 1]
- )
- batch_size, sequence_length, _ = hidden_states.shape
-
- attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-
- if encoder_hidden_states is None:
- encoder_hidden_states = hidden_states
- elif attn.norm_cross:
- encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
- hidden_states = attn.group_norm(hidden_states.transpose([0, 2, 1])).transpose([0, 2, 1])
-
- query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states)
- query = attn.head_to_batch_dim(query)
-
- encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + scale * self.add_k_proj_lora(
- encoder_hidden_states
- )
- encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + scale * self.add_v_proj_lora(
- encoder_hidden_states
- )
- encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
- encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
-
- if not attn.only_cross_attention:
- key = attn.to_k(hidden_states) + scale * self.to_k_lora(hidden_states)
- value = attn.to_v(hidden_states) + scale * self.to_v_lora(hidden_states)
- key = attn.head_to_batch_dim(key)
- value = attn.head_to_batch_dim(value)
- key = paddle.concat([encoder_hidden_states_key_proj, key], axis=1)
- value = paddle.concat([encoder_hidden_states_value_proj, value], axis=1)
- else:
- key = encoder_hidden_states_key_proj
- value = encoder_hidden_states_value_proj
-
- attention_probs = attn.get_attention_scores(query, key, attention_mask)
- hidden_states = paddle.matmul(attention_probs, value)
- hidden_states = attn.batch_to_head_dim(hidden_states)
-
- # linear proj
- hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states)
- # dropout
- hidden_states = attn.to_out[1](hidden_states)
-
- hidden_states = hidden_states.transpose([0, 2, 1]).reshape(residual.shape)
- hidden_states = hidden_states + residual
-
- return hidden_states
-
-
-AttnProcessor2_0 = XFormersAttnProcessor
-AttnAddedKVProcessor2_0 = XFormersAttnAddedKVProcessor
-LoRAAttnProcessor2_0 = LoRAXFormersAttnProcessor
-AttnProcessor2_5 = XFormersAttnProcessor
-AttnAddedKVProcessor2_5 = XFormersAttnAddedKVProcessor
-LoRAAttnProcessor2_5 = LoRAXFormersAttnProcessor
-
-LORA_ATTENTION_PROCESSORS = (
- LoRAAttnProcessor,
- LoRAAttnProcessor2_0,
- LoRAAttnProcessor2_5,
- LoRAXFormersAttnProcessor,
- LoRAAttnAddedKVProcessor,
-)
-
-ADDED_KV_ATTENTION_PROCESSORS = (
- AttnAddedKVProcessor,
- SlicedAttnAddedKVProcessor,
- AttnAddedKVProcessor2_0,
- AttnAddedKVProcessor2_5,
- XFormersAttnAddedKVProcessor,
- LoRAAttnAddedKVProcessor,
-)
-
-CROSS_ATTENTION_PROCESSORS = (
- AttnProcessor,
- AttnProcessor2_0,
- AttnProcessor2_5,
- XFormersAttnProcessor,
- SlicedAttnProcessor,
- LoRAAttnProcessor,
- LoRAAttnProcessor2_0,
- LoRAAttnProcessor2_5,
- LoRAXFormersAttnProcessor,
-)
-
-AttentionProcessor = Union[
- AttnProcessor,
- AttnProcessor2_0,
- AttnProcessor2_5,
- XFormersAttnProcessor,
- SlicedAttnProcessor,
- AttnAddedKVProcessor,
- SlicedAttnAddedKVProcessor,
- AttnAddedKVProcessor2_0,
- AttnAddedKVProcessor2_5,
- XFormersAttnAddedKVProcessor,
- CustomDiffusionAttnProcessor,
- CustomDiffusionXFormersAttnProcessor,
- # deprecated
- LoRAAttnProcessor,
- LoRAAttnProcessor2_0,
- LoRAAttnProcessor2_5,
- LoRAXFormersAttnProcessor,
- LoRAAttnAddedKVProcessor,
-]
-
-LORA_ATTENTION_PROCESSORS = (
- LoRAAttnProcessor,
- LoRAAttnProcessor2_0,
- LoRAAttnProcessor2_5,
- LoRAXFormersAttnProcessor,
- LoRAAttnAddedKVProcessor,
-)
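These tuples act as membership tables: callers compare a model's current processors against them before deciding how to reset or fuse attention. A small sketch of such a check, assuming a model that exposes the `attn_processors` property (the helper name is illustrative, not part of the library):

    def uses_only_lora_processors(model) -> bool:
        # True when every Attention layer on the model currently runs a LoRA processor variant.
        return all(
            proc.__class__ in LORA_ATTENTION_PROCESSORS
            for proc in model.attn_processors.values()
        )
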
diff --git a/ppdiffusers/ppdiffusers/models/autoencoder_kl.py b/ppdiffusers/ppdiffusers/models/autoencoder_kl.py
index bf1078ff1..9a0e4eb87 100644
--- a/ppdiffusers/ppdiffusers/models/autoencoder_kl.py
+++ b/ppdiffusers/ppdiffusers/models/autoencoder_kl.py
@@ -21,13 +21,7 @@
from ..configuration_utils import ConfigMixin, register_to_config
from ..loaders import FromOriginalVAEMixin
from ..utils import BaseOutput, apply_forward_hook
-from .attention_processor import (
- ADDED_KV_ATTENTION_PROCESSORS,
- CROSS_ATTENTION_PROCESSORS,
- AttentionProcessor,
- AttnAddedKVProcessor,
- AttnProcessor,
-)
+from .attention_processor import AttentionProcessor, AttnProcessor
from .modeling_utils import ModelMixin
from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
@@ -189,8 +183,8 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
processors = {}
def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
- if hasattr(module, "get_processor"):
- processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+ if hasattr(module, "set_processor"):
+ processors[f"{name}.processor"] = module.processor
for sub_name, child in module.named_children():
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
@@ -203,20 +197,15 @@ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[st
return processors
# Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
- def set_attn_processor(
- self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
- ):
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
r"""
Sets the attention processor to use to compute attention.
-
Parameters:
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
The instantiated processor class or a dictionary of processor classes that will be set as the processor
for **all** `Attention` layers.
-
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
processor. This is strongly recommended when setting trainable attention processors.
-
"""
count = len(self.attn_processors.keys())
@@ -229,9 +218,9 @@ def set_attn_processor(
def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
if hasattr(module, "set_processor"):
if not isinstance(processor, dict):
- module.set_processor(processor, _remove_lora=_remove_lora)
+ module.set_processor(processor)
else:
- module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
+ module.set_processor(processor.pop(f"{name}.processor"))
for sub_name, child in module.named_children():
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
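Both `attn_processors` and `set_attn_processor` rely on the same recursive walk over sublayers, keyed by dotted module paths, treating any layer that exposes `set_processor` as an attention layer. A simplified sketch of that traversal (not the library code itself):

    import paddle.nn as nn

    def collect_processors(root: nn.Layer) -> dict:
        # Walk the layer tree and record each attention processor under its dotted path.
        processors = {}

        def recurse(name: str, module: nn.Layer):
            if hasattr(module, "set_processor"):
                processors[f"{name}.processor"] = module.processor
            for sub_name, child in module.named_children():
                recurse(f"{name}.{sub_name}", child)

        for name, module in root.named_children():
            recurse(name, module)
        return processors
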
@@ -244,33 +233,10 @@ def set_default_attn_processor(self):
"""
Disables custom attention processors and sets the default attention implementation.
"""
- if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
- processor = AttnAddedKVProcessor()
- elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
- processor = AttnProcessor()
- else:
- raise ValueError(
- f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
- )
-
- self.set_attn_processor(processor, _remove_lora=True)
+ self.set_attn_processor(AttnProcessor())
@apply_forward_hook
- def encode(
- self, x: paddle.Tensor, return_dict: bool = True
- ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
- """
- Encode a batch of images into latents.
-
- Args:
- x (`torch.FloatTensor`): Input batch of images.
- return_dict (`bool`, *optional*, defaults to `True`):
- Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
-
- Returns:
- The latent representations of the encoded images. If `return_dict` is True, a
- [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
- """
+ def encode(self, x: paddle.Tensor, return_dict: bool = True) -> AutoencoderKLOutput:
# TODO junnyu, support float16
x = x.cast(self.encoder.conv_in.weight.dtype)
if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
@@ -281,7 +247,6 @@ def encode(
h = paddle.concat(encoded_slices)
else:
h = self.encoder(x)
-
moments = self.quant_conv(h)
posterior = DiagonalGaussianDistribution(moments)
@@ -303,23 +268,7 @@ def _decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOu
return DecoderOutput(sample=dec)
@apply_forward_hook
- def decode(
- self, z: paddle.Tensor, return_dict: bool = True, generator=None
- ) -> Union[DecoderOutput, paddle.Tensor]:
- """
- Decode a batch of images.
-
- Args:
- z (`torch.FloatTensor`): Input batch of latent vectors.
- return_dict (`bool`, *optional*, defaults to `True`):
- Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
-
- Returns:
- [`~models.vae.DecoderOutput`] or `tuple`:
- If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
- returned.
-
- """
+ def decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
# TODO junnyu, add this to support pure fp16
z = z.cast(self.post_quant_conv.weight.dtype)
if self.use_slicing and z.shape[0] > 1:
@@ -348,18 +297,15 @@ def blend_h(self, a, b, blend_extent):
def tiled_encode(self, x: paddle.Tensor, return_dict: bool = True) -> AutoencoderKLOutput:
r"""Encode a batch of images using a tiled encoder.
-
When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is
different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the
tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the
output, but they should be much less noticeable.
-
Args:
x (`paddle.Tensor`): Input batch of images.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
-
Returns:
[`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
@@ -403,12 +349,10 @@ def tiled_encode(self, x: paddle.Tensor, return_dict: bool = True) -> Autoencode
def tiled_decode(self, z: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
r"""
Decode a batch of images using a tiled decoder.
-
Args:
z (`paddle.Tensor`): Input batch of latent vectors.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
-
Returns:
[`~models.vae.DecoderOutput`] or `tuple`:
If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
diff --git a/ppdiffusers/ppdiffusers/models/autoencoder_tiny.py b/ppdiffusers/ppdiffusers/models/autoencoder_tiny.py
deleted file mode 100644
index e16464b99..000000000
--- a/ppdiffusers/ppdiffusers/models/autoencoder_tiny.py
+++ /dev/null
@@ -1,347 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-# Copyright 2023 Ollin Boer Bohan and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from dataclasses import dataclass
-from typing import Optional, Tuple, Union
-
-import paddle
-
-from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput
-from .modeling_utils import ModelMixin
-from .vae import DecoderOutput, DecoderTiny, EncoderTiny
-
-
-@dataclass
-class AutoencoderTinyOutput(BaseOutput):
- """
- Output of AutoencoderTiny encoding method.
-
- Args:
- latents (`paddle.Tensor`): Encoded outputs of the `Encoder`.
-
- """
-
- latents: paddle.Tensor
-
-
-class AutoencoderTiny(ModelMixin, ConfigMixin):
- r"""
- A tiny distilled VAE model for encoding images into latents and decoding latent representations into images.
-
- [`AutoencoderTiny`] is a wrapper around the original implementation of `TAESD`.
-
- This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented for
- all models (such as downloading or saving).
-
- Parameters:
- in_channels (`int`, *optional*, defaults to 3): Number of channels in the input image.
- out_channels (`int`, *optional*, defaults to 3): Number of channels in the output.
- encoder_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64, 64, 64, 64)`):
- Tuple of integers representing the number of output channels for each encoder block. The length of the
- tuple should be equal to the number of encoder blocks.
- decoder_block_out_channels (`Tuple[int]`, *optional*, defaults to `(64, 64, 64, 64)`):
- Tuple of integers representing the number of output channels for each decoder block. The length of the
- tuple should be equal to the number of decoder blocks.
- act_fn (`str`, *optional*, defaults to `"relu"`):
- Activation function to be used throughout the model.
- latent_channels (`int`, *optional*, defaults to 4):
- Number of channels in the latent representation. The latent space acts as a compressed representation of
- the input image.
- upsampling_scaling_factor (`int`, *optional*, defaults to 2):
- Scaling factor for upsampling in the decoder. It determines the size of the output image during the
- upsampling process.
- num_encoder_blocks (`Tuple[int]`, *optional*, defaults to `(1, 3, 3, 3)`):
- Tuple of integers representing the number of encoder blocks at each stage of the encoding process. The
- length of the tuple should be equal to the number of stages in the encoder. Each stage has a different
- number of encoder blocks.
- num_decoder_blocks (`Tuple[int]`, *optional*, defaults to `(3, 3, 3, 1)`):
- Tuple of integers representing the number of decoder blocks at each stage of the decoding process. The
- length of the tuple should be equal to the number of stages in the decoder. Each stage has a different
- number of decoder blocks.
- latent_magnitude (`int`, *optional*, defaults to 3):
- Magnitude of the latent representation. This parameter scales the latent representation values to control
- the extent of information preservation.
- latent_shift (float, *optional*, defaults to 0.5):
- Shift applied to the latent representation. This parameter controls the center of the latent space.
- scaling_factor (`float`, *optional*, defaults to 1.0):
- The component-wise standard deviation of the trained latent space computed using the first batch of the
- training set. This is used to scale the latent space to have unit variance when training the diffusion
- model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
- diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
- / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
- Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. For this Autoencoder,
- however, no such scaling factor was used, hence the value of 1.0 as the default.
- force_upcast (`bool`, *optional*, defaults to `False`):
- If enabled, the VAE is forced to run in float32 for high-resolution image pipelines, such as SD-XL. The VAE
- can be fine-tuned / trained to a lower range without losing too much precision, in which case
- `force_upcast` can be set to `False` (see this fp16-friendly
- [AutoEncoder](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix)).
- """
- _supports_gradient_checkpointing = True
-
- @register_to_config
- def __init__(
- self,
- in_channels=3,
- out_channels=3,
- encoder_block_out_channels: Tuple[int] = (64, 64, 64, 64),
- decoder_block_out_channels: Tuple[int] = (64, 64, 64, 64),
- act_fn: str = "relu",
- latent_channels: int = 4,
- upsampling_scaling_factor: int = 2,
- num_encoder_blocks: Tuple[int] = (1, 3, 3, 3),
- num_decoder_blocks: Tuple[int] = (3, 3, 3, 1),
- latent_magnitude: int = 3,
- latent_shift: float = 0.5,
- force_upcast: bool = False,
- scaling_factor: float = 1.0,
- ):
- super().__init__()
-
- if len(encoder_block_out_channels) != len(num_encoder_blocks):
- raise ValueError("`encoder_block_out_channels` should have the same length as `num_encoder_blocks`.")
- if len(decoder_block_out_channels) != len(num_decoder_blocks):
- raise ValueError("`decoder_block_out_channels` should have the same length as `num_decoder_blocks`.")
-
- self.encoder = EncoderTiny(
- in_channels=in_channels,
- out_channels=latent_channels,
- num_blocks=num_encoder_blocks,
- block_out_channels=encoder_block_out_channels,
- act_fn=act_fn,
- )
-
- self.decoder = DecoderTiny(
- in_channels=latent_channels,
- out_channels=out_channels,
- num_blocks=num_decoder_blocks,
- block_out_channels=decoder_block_out_channels,
- upsampling_scaling_factor=upsampling_scaling_factor,
- act_fn=act_fn,
- )
-
- self.latent_magnitude = latent_magnitude
- self.latent_shift = latent_shift
- self.scaling_factor = scaling_factor
-
- self.use_slicing = False
- self.use_tiling = False
-
- # only relevant if vae tiling is enabled
- self.spatial_scale_factor = 2**out_channels
- self.tile_overlap_factor = 0.125
- self.tile_sample_min_size = 512
- self.tile_latent_min_size = self.tile_sample_min_size // self.spatial_scale_factor
-
- def _set_gradient_checkpointing(self, module, value=False):
- if isinstance(module, (EncoderTiny, DecoderTiny)):
- module.gradient_checkpointing = value
-
- def scale_latents(self, x):
- """raw latents -> [0, 1]"""
- return (x / (2 * self.latent_magnitude) + self.latent_shift).clip(0, 1)
-
- def unscale_latents(self, x):
- """[0, 1] -> raw latents"""
- return (x - self.latent_shift) * (2 * self.latent_magnitude)
-
- def enable_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.use_slicing = True
-
- def disable_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
- decoding in one step.
- """
- self.use_slicing = False
-
- def enable_tiling(self, use_tiling: bool = True):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.use_tiling = use_tiling
-
- def disable_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
- decoding in one step.
- """
- self.enable_tiling(False)
-
- def _tiled_encode(self, x: paddle.Tensor) -> paddle.Tensor:
- r"""Encode a batch of images using a tiled encoder.
-
- When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
- steps. This is useful to keep memory use constant regardless of image size. To avoid tiling artifacts, the
- tiles overlap and are blended together to form a smooth output.
-
- Args:
- x (`paddle.Tensor`): Input batch of images.
- return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.autoencoder_tiny.AutoencoderTinyOutput`] instead of a plain tuple.
-
- Returns:
- [`~models.autoencoder_tiny.AutoencoderTinyOutput`] or `tuple`:
- If return_dict is True, a [`~models.autoencoder_tiny.AutoencoderTinyOutput`] is returned, otherwise a
- plain `tuple` is returned.
- """
- # scale of encoder output relative to input
- sf = self.spatial_scale_factor
- tile_size = self.tile_sample_min_size
-
- # number of pixels to blend and to traverse between tiles
- blend_size = int(tile_size * self.tile_overlap_factor)
- traverse_size = tile_size - blend_size
-
- # tiles index (up/left)
- ti = range(0, x.shape[-2], traverse_size)
- tj = range(0, x.shape[-1], traverse_size)
-
- # mask for blending
- blend_masks = paddle.stack(
- paddle.meshgrid([paddle.arange(tile_size / sf) / (blend_size / sf - 1)] * 2)
- )
- blend_masks = blend_masks.clip(0, 1)
-
- # output array
- out = paddle.zeros([x.shape[0], 4, x.shape[-2] // sf, x.shape[-1] // sf])
- for i in ti:
- for j in tj:
- tile_in = x[..., i : i + tile_size, j : j + tile_size]
- # tile result
- tile_out = out[..., i // sf : (i + tile_size) // sf, j // sf : (j + tile_size) // sf]
- tile = self.encoder(tile_in)
- h, w = tile.shape[-2], tile.shape[-1]
- # blend tile result into output
- blend_mask_i = paddle.ones_like(blend_masks[0]) if i == 0 else blend_masks[0]
- blend_mask_j = paddle.ones_like(blend_masks[1]) if j == 0 else blend_masks[1]
- blend_mask = blend_mask_i * blend_mask_j
- tile, blend_mask = tile[..., :h, :w], blend_mask[..., :h, :w]
- # pytorch: tile_out.copy_(blend_mask * tile + (1 - blend_mask) * tile_out)
- tile_out = blend_mask * tile + (1 - blend_mask) * tile_out
- return out
-
- def _tiled_decode(self, x: paddle.Tensor) -> paddle.Tensor:
- r"""Encode a batch of images using a tiled encoder.
-
- When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several
- steps. This is useful to keep memory use constant regardless of image size. To avoid tiling artifacts, the
- tiles overlap and are blended together to form a smooth output.
-
- Args:
- x (`paddle.Tensor`): Input batch of images.
- return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.autoencoder_tiny.AutoencoderTinyOutput`] instead of a plain tuple.
-
- Returns:
- [`~models.vae.DecoderOutput`] or `tuple`:
- If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
- returned.
- """
- # scale of decoder output relative to input
- sf = self.spatial_scale_factor
- tile_size = self.tile_latent_min_size
-
- # number of pixels to blend and to traverse between tiles
- blend_size = int(tile_size * self.tile_overlap_factor)
- traverse_size = tile_size - blend_size
-
- # tiles index (up/left)
- ti = range(0, x.shape[-2], traverse_size)
- tj = range(0, x.shape[-1], traverse_size)
-
- # mask for blending
- blend_masks = paddle.stack(
- paddle.meshgrid([paddle.arange(tile_size * sf) / (blend_size * sf - 1)] * 2)
- )
- blend_masks = blend_masks.clip(0, 1)
-
- # output array
- out = paddle.zeros([x.shape[0], 3, x.shape[-2] * sf, x.shape[-1] * sf])
- for i in ti:
- for j in tj:
- tile_in = x[..., i : i + tile_size, j : j + tile_size]
- # tile result
- tile_out = out[..., i * sf : (i + tile_size) * sf, j * sf : (j + tile_size) * sf]
- tile = self.decoder(tile_in)
- h, w = tile.shape[-2], tile.shape[-1]
- # blend tile result into output
- blend_mask_i = paddle.ones_like(blend_masks[0]) if i == 0 else blend_masks[0]
- blend_mask_j = paddle.ones_like(blend_masks[1]) if j == 0 else blend_masks[1]
- blend_mask = (blend_mask_i * blend_mask_j)[..., :h, :w]
- # pytorch: tile_out.copy_(blend_mask * tile + (1 - blend_mask) * tile_out)
- tile_out = blend_mask * tile + (1 - blend_mask) * tile_out
- return out
-
- def encode(self, x: paddle.Tensor, return_dict: bool = True) -> Union[AutoencoderTinyOutput, Tuple[paddle.Tensor]]:
- if self.use_slicing and x.shape[0] > 1:
- output = [self._tiled_encode(x_slice) if self.use_tiling else self.encoder(x_slice) for x_slice in x.split(1)]
- output = paddle.concat(output)
- else:
- output = self._tiled_encode(x) if self.use_tiling else self.encoder(x)
-
- if not return_dict:
- return (output,)
-
- return AutoencoderTinyOutput(latents=output)
-
- def decode(
- self, x: paddle.Tensor, generator: Optional[paddle.Generator] = None, return_dict: bool = True
- ) -> Union[DecoderOutput, Tuple[paddle.Tensor]]:
- if self.use_slicing and x.shape[0] > 1:
- output = [self._tiled_decode(x_slice) if self.use_tiling else self.decoder(x_slice) for x_slice in x.split(1)]
- output = paddle.concat(output)
- else:
- output = self._tiled_decode(x) if self.use_tiling else self.decoder(x)
-
- if not return_dict:
- return (output,)
-
- return DecoderOutput(sample=output)
-
- def forward(
- self,
- sample: paddle.Tensor,
- return_dict: bool = True,
- ) -> Union[DecoderOutput, Tuple[paddle.Tensor]]:
- r"""
- Args:
- sample (`paddle.Tensor`): Input sample.
- return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`DecoderOutput`] instead of a plain tuple.
- """
- enc = self.encode(sample).latents
-
- # scale latents to be in [0, 1], then quantize latents to a byte tensor,
- # as if we were storing the latents in an RGBA uint8 image.
- scaled_enc = (self.scale_latents(enc) * 255.0).round().astype("uint8")
-
- # unquantize latents back into [0, 1], then unscale latents back to their original range,
- # as if we were loading the latents from an RGBA uint8 image.
- unscaled_enc = self.unscale_latents(scaled_enc.astype(enc.dtype) / 255.0)
-
- dec = self.decode(unscaled_enc)
-
- if not return_dict:
- return (dec,)
- return DecoderOutput(sample=dec)
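The `forward` above round-trips latents through a uint8, image-style encoding: scale to [0, 1], quantize to bytes, then undo both steps before decoding. A minimal standalone sketch of that round trip with the default `latent_magnitude=3` and `latent_shift=0.5` (not the deleted class itself):

    import paddle

    latent_magnitude, latent_shift = 3.0, 0.5

    def scale_latents(x):
        # raw latents -> [0, 1]
        return (x / (2 * latent_magnitude) + latent_shift).clip(0, 1)

    def unscale_latents(x):
        # [0, 1] -> raw latents
        return (x - latent_shift) * (2 * latent_magnitude)

    raw = paddle.randn([1, 4, 8, 8])
    stored = (scale_latents(raw) * 255.0).round().astype("uint8")  # as if written to an RGBA uint8 image
    restored = unscale_latents(stored.astype("float32") / 255.0)   # lossy only through the 8-bit quantization
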
diff --git a/ppdiffusers/ppdiffusers/models/controlnet.py b/ppdiffusers/ppdiffusers/models/controlnet.py
index df9755fc2..091b4d5db 100644
--- a/ppdiffusers/ppdiffusers/models/controlnet.py
+++ b/ppdiffusers/ppdiffusers/models/controlnet.py
@@ -23,13 +23,7 @@
from ..initializer import zeros_
from ..loaders import FromOriginalControlnetMixin
from ..utils import BaseOutput, logging
-from .attention_processor import (
- ADDED_KV_ATTENTION_PROCESSORS,
- CROSS_ATTENTION_PROCESSORS,
- AttentionProcessor,
- AttnAddedKVProcessor,
- AttnProcessor,
-)
+from .attention_processor import AttentionProcessor, AttnProcessor
from .embeddings import (
TextImageProjection,
TextImageTimeEmbedding,
@@ -53,7 +47,6 @@
class ControlNetOutput(BaseOutput):
"""
The output of [`ControlNetModel`].
-
Args:
down_block_res_samples (`tuple[paddle.Tensor]`):
A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
@@ -76,12 +69,12 @@ def forward(self, hidden_states):
class ControlNetConditioningEmbedding(nn.Layer):
"""
- Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
- [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
- training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
- convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides
- (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
- model) to encode image-space conditions ... into feature maps ..."
+ "Stable Diffusion uses a pre-processing method similar to VQ-GAN [11] to convert the entire dataset of 512 × 512
+ images into smaller 64 × 64 “latent images” for stabilized training. This requires ControlNets to convert
+ image-based conditions to 64 × 64 feature space to match the convolution size. We use a tiny network E(·) of four
+ convolution layers with 4 × 4 kernels and 2 × 2 strides (activated by ReLU, channels are 16, 32, 64, 128,
+ initialized with Gaussian weights, trained jointly with the full model) to encode image-space conditions ... into
+ feature maps ..."
"""
def __init__(
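The quoted passage describes a small strided-convolution encoder that brings an image-space condition down to the latent resolution before it is added to the UNet input. A rough sketch of that idea, with a layer layout chosen purely for illustration (it does not reproduce the real `ControlNetConditioningEmbedding`):

    import paddle
    import paddle.nn as nn

    cond_encoder = nn.Sequential(
        nn.Conv2D(3, 16, kernel_size=4, stride=2, padding=1), nn.ReLU(),   # 512 -> 256
        nn.Conv2D(16, 32, kernel_size=4, stride=2, padding=1), nn.ReLU(),  # 256 -> 128
        nn.Conv2D(32, 64, kernel_size=4, stride=2, padding=1), nn.ReLU(),  # 128 -> 64
        nn.Conv2D(64, 128, kernel_size=3, stride=1, padding=1),            # stays at 64 x 64
    )

    cond = paddle.randn([1, 3, 512, 512])
    features = cond_encoder(cond)  # [1, 128, 64, 64], matching the 64 x 64 latent grid
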
@@ -122,7 +115,6 @@ def forward(self, conditioning):
class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
"""
A ControlNet model.
-
Args:
in_channels (`int`, defaults to 4):
The number of channels in the input sample.
@@ -229,7 +221,6 @@ def __init__(
resnet_pre_temb_non_linearity: bool = False,
):
super().__init__()
-
# If `num_attention_heads` is not defined (which is the case for most models)
# it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
# The reason for this behavior is to correct for incorrectly named variables that were introduced
@@ -266,8 +257,10 @@ def __init__(
# time
time_embed_dim = block_out_channels[0] * 4
+
self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
timestep_input_dim = block_out_channels[0]
+
self.time_embedding = TimestepEmbedding(
timestep_input_dim,
time_embed_dim,
@@ -458,7 +451,6 @@ def from_unet(
):
r"""
Instantiate a [`ControlNetModel`] from [`UNet2DConditionModel`].
-
Parameters:
unet (`UNet2DConditionModel`):
The UNet model weights to copy to the [`ControlNetModel`]. All configuration options are also copied
@@ -473,7 +465,6 @@ def from_unet(
addition_time_embed_dim = (
unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None
)
-
controlnet = cls(
encoder_hid_dim=encoder_hid_dim,
encoder_hid_dim_type=encoder_hid_dim_type,
@@ -520,7 +511,6 @@ def from_unet(
return controlnet
@property
- # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
def attn_processors(self) -> Dict[str, AttentionProcessor]:
r"""
Returns:
@@ -531,8 +521,8 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
processors = {}
def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
- if hasattr(module, "get_processor"):
- processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+ if hasattr(module, "set_processor"):
+ processors[f"{name}.processor"] = module.processor
for sub_name, child in module.named_children():
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
@@ -544,21 +534,15 @@ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[st
return processors
- # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
- def set_attn_processor(
- self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
- ):
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
r"""
Sets the attention processor to use to compute attention.
-
Parameters:
processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
The instantiated processor class or a dictionary of processor classes that will be set as the processor
for **all** `Attention` layers.
-
If `processor` is a dict, the key needs to define the path to the corresponding cross attention
processor. This is strongly recommended when setting trainable attention processors.
-
"""
count = len(self.attn_processors.keys())
@@ -571,9 +555,9 @@ def set_attn_processor(
def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
if hasattr(module, "set_processor"):
if not isinstance(processor, dict):
- module.set_processor(processor, _remove_lora=_remove_lora)
+ module.set_processor(processor)
else:
- module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
+ module.set_processor(processor.pop(f"{name}.processor"))
for sub_name, child in module.named_children():
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
@@ -586,25 +570,13 @@ def set_default_attn_processor(self):
"""
Disables custom attention processors and sets the default attention implementation.
"""
- if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
- processor = AttnAddedKVProcessor()
- elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
- processor = AttnProcessor()
- else:
- raise ValueError(
- f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
- )
-
- self.set_attn_processor(processor, _remove_lora=True)
+ self.set_attn_processor(AttnProcessor())
- # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attention_slice
def set_attention_slice(self, slice_size):
r"""
Enable sliced attention computation.
-
When this option is enabled, the attention module splits the input tensor in slices to compute attention in
several steps. This is useful for saving some memory in exchange for a small decrease in speed.
-
Args:
slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
@@ -684,7 +656,6 @@ def forward(
) -> Union[ControlNetOutput, Tuple]:
"""
The [`ControlNetModel`] forward method.
-
Args:
sample (`paddle.Tensor`):
The noisy input tensor.
@@ -699,13 +670,7 @@ def forward(
class_labels (`paddle.Tensor`, *optional*, defaults to `None`):
Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
timestep_cond (`paddle.Tensor`, *optional*, defaults to `None`):
- Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
- timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
- embeddings.
attention_mask (`paddle.Tensor`, *optional*, defaults to `None`):
- An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
- is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
- negative values to the attention scores corresponding to "discard" tokens.
added_cond_kwargs (`dict`):
Additional conditions for the Stable Diffusion XL UNet.
cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
@@ -715,7 +680,6 @@ def forward(
you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
return_dict (`bool`, defaults to `True`):
Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
-
Returns:
[`~models.controlnet.ControlNetOutput`] **or** `tuple`:
If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
@@ -763,7 +727,6 @@ def forward(
aug_emb = None
emb = self.time_embedding(t_emb, timestep_cond)
- aug_emb = None
if self.class_embedding is not None:
if class_labels is None:
@@ -780,7 +743,7 @@ def forward(
class_emb = self.class_embedding(class_labels).cast(sample.dtype)
emb = emb + class_emb
- if "addition_embed_type" in self.config and self.config.addition_embed_type is not None:
+ if "addition_embed_type" in self.config:
if self.config.addition_embed_type == "text":
aug_emb = self.add_embedding(encoder_hidden_states)
@@ -813,10 +776,12 @@ def forward(
sample = self.conv_in(sample)
controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
+
sample = sample + controlnet_cond
# 3. down
down_block_res_samples = (sample,)
+
for downsample_block in self.down_blocks:
if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
sample, res_samples = downsample_block(
diff --git a/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py b/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py
index 036fa40fa..d6f680e81 100644
--- a/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py
+++ b/ppdiffusers/ppdiffusers/models/dual_transformer_2d.py
@@ -116,10 +116,6 @@ def forward(
Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step.
attention_mask (`paddle.Tensor`, *optional*):
Optional attention mask to be applied in Attention
- cross_attention_kwargs (`dict`, *optional*):
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
- `self.processor` in
- [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
diff --git a/ppdiffusers/ppdiffusers/models/embeddings.py b/ppdiffusers/ppdiffusers/models/embeddings.py
index bfc502690..4331e9e70 100644
--- a/ppdiffusers/ppdiffusers/models/embeddings.py
+++ b/ppdiffusers/ppdiffusers/models/embeddings.py
@@ -64,22 +64,17 @@ def get_timestep_embedding(
return emb
-def get_2d_sincos_pos_embed(
- embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16
-):
+def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
"""
grid_size: int of the grid height and width return: pos_embed: [grid_size*grid_size, embed_dim] or
[1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
"""
- if isinstance(grid_size, int):
- grid_size = (grid_size, grid_size)
-
- grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size) / interpolation_scale
- grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size) / interpolation_scale
+ grid_h = np.arange(grid_size, dtype=np.float32)
+ grid_w = np.arange(grid_size, dtype=np.float32)
grid = np.meshgrid(grid_w, grid_h) # here w goes first
grid = np.stack(grid, axis=0)
- grid = grid.reshape([2, 1, grid_size[1], grid_size[0]])
+ grid = grid.reshape([2, 1, grid_size, grid_size])
pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
if cls_token and extra_tokens > 0:
pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0)
@@ -132,7 +127,6 @@ def __init__(
layer_norm=False,
flatten=True,
bias=True,
- interpolation_scale=1,
add_pos_embed=True,
):
super().__init__()
@@ -150,47 +144,19 @@ def __init__(
else:
self.norm = None
- self.patch_size = patch_size
- # See:
- # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L161
- self.height, self.width = height // patch_size, width // patch_size
- self.base_size = height // patch_size
- self.interpolation_scale = interpolation_scale
self.add_pos_embed = add_pos_embed
if add_pos_embed:
- pos_embed = get_2d_sincos_pos_embed(
- embed_dim,
- int(num_patches**0.5),
- base_size=self.base_size,
- interpolation_scale=self.interpolation_scale,
- )
+ pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5))
self.register_buffer(
"pos_embed", paddle.to_tensor(pos_embed).cast("float32").unsqueeze(0), persistable=False
)
def forward(self, latent):
- height, width = latent.shape[-2] // self.patch_size, latent.shape[-1] // self.patch_size
-
latent = self.proj(latent)
if self.flatten:
latent = latent.flatten(2).transpose([0, 2, 1]) # BCHW -> BNC
if self.layer_norm:
latent = self.norm(latent)
-
- # Interpolate positional embeddings if needed.
- # (For PixArt-Alpha: https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L162C151-L162C160)
- if self.height != height or self.width != width:
- pos_embed = get_2d_sincos_pos_embed(
- embed_dim=self.pos_embed.shape[-1],
- grid_size=(height, width),
- base_size=self.base_size,
- interpolation_scale=self.interpolation_scale,
- )
- pos_embed = paddle.to_tensor(pos_embed)
- pos_embed = pos_embed.cast("float32").unsqueeze(0)
- else:
- pos_embed = self.pos_embed
-
if self.add_pos_embed:
return latent + self.pos_embed
else:
@@ -293,42 +259,6 @@ def forward(self, x):
return out
-class SinusoidalPositionalEmbedding(nn.Layer):
- """Apply positional information to a sequence of embeddings.
-
- Takes in a sequence of embeddings with shape (batch_size, seq_length, embed_dim) and adds positional embeddings to
- them
-
- Args:
- embed_dim: (int): Dimension of the positional embedding.
- max_seq_length: Maximum sequence length to apply positional embeddings
-
- """
-
- def __init__(self, embed_dim: int, max_seq_length: int = 32):
- super().__init__()
- position = paddle.arange(max_seq_length).unsqueeze(1)
- div_term = paddle.exp(paddle.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
- pe = paddle.zeros([1, max_seq_length, embed_dim])
- pe[0, :, 0::2] = paddle.sin(
- (position * div_term).cast(paddle.float32)  # using paddle.get_default_dtype() here breaks CI
- ).cast(
- pe.dtype
- )  # paddle.sin does not support int64, so cast to float32 first
- pe[0, :, 1::2] = paddle.cos((position * div_term).cast(paddle.float32)).cast(
- pe.dtype
- )  # paddle.cos does not support int64, so cast to float32 first
- # register_buffer breaks CI in UNetMotionModelTests::test_from_save_pretrained_dtype
- # self.register_buffer("pe", pe)
- pe.stop_gradient = True
- self.pe = pe
-
- def forward(self, x):
- _, seq_length, _ = x.shape
- x = x + self.pe[:, :seq_length]
- return x
-
-
class ImagePositionalEmbeddings(nn.Layer):
"""
Converts latent image classes into vector embeddings. Sums the vector embeddings with positional embeddings for the
@@ -633,201 +563,3 @@ def shape(x):
a = a.reshape([bs, -1, 1]).transpose([0, 2, 1])
return a[:, 0, :] # cls_token
-
-
-class FourierEmbedder(nn.Layer):
- def __init__(self, num_freqs=64, temperature=100):
- super().__init__()
-
- self.num_freqs = num_freqs
- self.temperature = temperature
-
- freq_bands = temperature ** (paddle.arange(num_freqs) / num_freqs)
- freq_bands = freq_bands[None, None, None]
- self.register_buffer("freq_bands", freq_bands, persistent=False)
-
- def __call__(self, x):
- x = self.freq_bands * x.unsqueeze(-1)
- return paddle.stack((x.sin(), x.cos()), axis=-1).transpose([0, 1, 3, 4, 2]).reshape([*x.shape[:2], -1])
-
-
-class PositionNet(nn.Layer):
- def __init__(self, positive_len, out_dim, feature_type="text-only", fourier_freqs=8):
- super().__init__()
- self.positive_len = positive_len
- self.out_dim = out_dim
-
- self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs)
- self.position_dim = fourier_freqs * 2 * 4 # 2: sin/cos, 4: xyxy
-
- if isinstance(out_dim, tuple):
- out_dim = out_dim[0]
-
- if feature_type == "text-only":
- self.linears = nn.Sequential(
- nn.Linear(self.positive_len + self.position_dim, 512),
- nn.SiLU(),
- nn.Linear(512, 512),
- nn.SiLU(),
- nn.Linear(512, out_dim),
- )
- tmp_0 = paddle.zeros([self.positive_len])
- self.null_positive_feature = self.create_parameter(
- shape=tmp_0.shape, default_initializer=paddle.nn.initializer.Assign(tmp_0)
- )
-
- elif feature_type == "text-image":
- self.linears_text = nn.Sequential(
- nn.Linear(self.positive_len + self.position_dim, 512),
- nn.SiLU(),
- nn.Linear(512, 512),
- nn.SiLU(),
- nn.Linear(512, out_dim),
- )
- self.linears_image = nn.Sequential(
- nn.Linear(self.positive_len + self.position_dim, 512),
- nn.SiLU(),
- nn.Linear(512, 512),
- nn.SiLU(),
- nn.Linear(512, out_dim),
- )
- tmp_0 = paddle.zeros([self.positive_len])
- self.null_text_feature = self.create_parameter(
- shape=tmp_0.shape, default_initializer=paddle.nn.initializer.Assign(tmp_0)
- )
- tmp_1 = paddle.zeros([self.positive_len])
- self.null_image_feature = self.create_parameter(
- shape=tmp_1.shape, default_initializer=paddle.nn.initializer.Assign(tmp_1)
- )
-
- tmp = paddle.zeros([self.position_dim])
- self.null_position_feature = self.create_parameter(
- shape=tmp.shape, default_initializer=paddle.nn.initializer.Assign(tmp)
- )
-
- def forward(
- self,
- boxes,
- masks,
- positive_embeddings=None,
- phrases_masks=None,
- image_masks=None,
- phrases_embeddings=None,
- image_embeddings=None,
- ):
- masks = masks.unsqueeze(-1)
-
- # embedding position (it may include padding as a placeholder)
- xyxy_embedding = self.fourier_embedder(boxes) # B*N*4 -> B*N*C
-
- # learnable null embedding
- xyxy_null = self.null_position_feature.view(1, 1, -1)
-
- # replace padding with learnable null embedding
- xyxy_embedding = xyxy_embedding * masks + (1 - masks) * xyxy_null
-
- # PositionNet with text-only information
- if positive_embeddings is not None:
- # learnable null embedding
- positive_null = self.null_positive_feature.view(1, 1, -1)
-
- # replace padding with learnable null embedding
- positive_embeddings = positive_embeddings * masks + (1 - masks) * positive_null
-
- objs = self.linears(paddle.concat([positive_embeddings, xyxy_embedding], axis=-1))
-
- # PositionNet with text and image information
- else:
- phrases_masks = phrases_masks.unsqueeze(-1)
- image_masks = image_masks.unsqueeze(-1)
-
- # learnable null embedding
- text_null = self.null_text_feature.view(1, 1, -1)
- image_null = self.null_image_feature.view(1, 1, -1)
-
- # replace padding with learnable null embedding
- phrases_embeddings = phrases_embeddings * phrases_masks + (1 - phrases_masks) * text_null
- image_embeddings = image_embeddings * image_masks + (1 - image_masks) * image_null
-
- objs_text = self.linears_text(paddle.concat([phrases_embeddings, xyxy_embedding], axis=-1))
- objs_image = self.linears_image(paddle.concat([image_embeddings, xyxy_embedding], axis=-1))
- objs = paddle.concat([objs_text, objs_image], axis=1)
-
- return objs
-
-
-class CombinedTimestepSizeEmbeddings(nn.Layer):
- """
- For PixArt-Alpha.
-
- Reference:
- https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L164C9-L168C29
- """
-
- def __init__(self, embedding_dim, size_emb_dim, use_additional_conditions: bool = False):
- super().__init__()
-
- self.outdim = size_emb_dim
- self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
- self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
-
- self.use_additional_conditions = use_additional_conditions
- if use_additional_conditions:
- self.use_additional_conditions = True
- self.additional_condition_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0)
- self.resolution_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
- self.aspect_ratio_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=size_emb_dim)
-
- def apply_condition(self, size: paddle.Tensor, batch_size: int, embedder: nn.Layer):
- if size.ndim == 1:
- size = size[:, None]
-
- if size.shape[0] != batch_size:
- size = size.tile([batch_size // size.shape[0], 1])
- if size.shape[0] != batch_size:
- raise ValueError(f"`batch_size` should be {size.shape[0]} but found {batch_size}.")
-
- current_batch_size, dims = size.shape[0], size.shape[1]
- size = size.reshape([-1])
- size_freq = self.additional_condition_proj(size).to(size.dtype)
-
- size_emb = embedder(size_freq)
- size_emb = size_emb.reshape([current_batch_size, dims * self.outdim])
- return size_emb
-
- def forward(self, timestep, resolution, aspect_ratio, batch_size, hidden_dtype):
- timesteps_proj = self.time_proj(timestep)
- timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (N, D)
-
- if self.use_additional_conditions:
- resolution = self.apply_condition(resolution, batch_size=batch_size, embedder=self.resolution_embedder)
- aspect_ratio = self.apply_condition(
- aspect_ratio, batch_size=batch_size, embedder=self.aspect_ratio_embedder
- )
- conditioning = timesteps_emb + paddle.concat([resolution, aspect_ratio], axis=1)
- else:
- conditioning = timesteps_emb
-
- return conditioning
-
-
-class CaptionProjection(nn.Layer):
- """
- Projects caption embeddings. Also handles dropout for classifier-free guidance.
-
- Adapted from https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py
- """
-
- def __init__(self, in_features, hidden_size, num_tokens=120):
- super().__init__()
- self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias_attr=True)
- self.act_1 = nn.GELU(approximate=True)
- self.linear_2 = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias_attr=True)
- tmp_0 = paddle.randn([num_tokens, in_features]) / in_features**0.5
- self.register_buffer("y_embedding", self.create_parameter(tmp_0.shape, default_initializer=paddle.nn.initializer.Assign(tmp_0)))
-
- def forward(self, caption, force_drop_ids=None):
- hidden_states = self.linear_1(caption)
- hidden_states = self.act_1(hidden_states)
- hidden_states = self.linear_2(hidden_states)
- return hidden_states
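The `get_2d_sincos_pos_embed` change above reverts to the simple square-grid form: build a frequency bank, encode row and column indices separately, and concatenate the two halves. A compact NumPy sketch of that construction (helper names are illustrative, not the module's API):

    import numpy as np

    def sincos_1d(embed_dim, pos):
        # positions of any shape -> [pos.size, embed_dim] sin/cos features
        omega = 1.0 / 10000 ** (np.arange(embed_dim // 2, dtype=np.float32) / (embed_dim / 2.0))
        out = np.einsum("m,d->md", pos.reshape(-1).astype(np.float32), omega)
        return np.concatenate([np.sin(out), np.cos(out)], axis=1)

    def sincos_2d(embed_dim, grid_size):
        grid = np.arange(grid_size, dtype=np.float32)
        grid_w, grid_h = np.meshgrid(grid, grid)       # w varies fastest, as in the code above
        emb_h = sincos_1d(embed_dim // 2, grid_h)      # encode the row index
        emb_w = sincos_1d(embed_dim // 2, grid_w)      # encode the column index
        return np.concatenate([emb_h, emb_w], axis=1)  # [grid_size * grid_size, embed_dim]

    print(sincos_2d(64, 16).shape)  # (256, 64)
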
diff --git a/ppdiffusers/ppdiffusers/models/lora.py b/ppdiffusers/ppdiffusers/models/lora.py
index 09b47192d..5b2cd9196 100644
--- a/ppdiffusers/ppdiffusers/models/lora.py
+++ b/ppdiffusers/ppdiffusers/models/lora.py
@@ -14,61 +14,28 @@
from typing import Optional
-import paddle
import paddle.nn as nn
-import paddle.nn.functional as F
from ..initializer import normal_, zeros_
-from ..loaders import (
- PatchedLoraProjection,
- text_encoder_attn_modules,
- text_encoder_mlp_modules,
-)
-from ..utils import logging
-
-logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-
-
-def adjust_lora_scale_text_encoder(text_encoder, lora_scale: float = 1.0):
- for _, attn_module in text_encoder_attn_modules(text_encoder):
- if isinstance(attn_module.q_proj, PatchedLoraProjection):
- attn_module.q_proj.lora_scale = lora_scale
- attn_module.k_proj.lora_scale = lora_scale
- attn_module.v_proj.lora_scale = lora_scale
- attn_module.out_proj.lora_scale = lora_scale
-
- for _, mlp_module in text_encoder_mlp_modules(text_encoder):
- if isinstance(mlp_module.linear1, PatchedLoraProjection):
- mlp_module.linear1.lora_scale = lora_scale
- mlp_module.linear2.lora_scale = lora_scale
class LoRALinearLayer(nn.Layer):
- r"""
- A linear layer that is used with LoRA.
-
- Parameters:
- in_features (`int`):
- Number of input features.
- out_features (`int`):
- Number of output features.
- rank (`int`, `optional`, defaults to 4):
- The rank of the LoRA layer.
- network_alpha (`float`, `optional`, defaults to `None`):
- The value of the network alpha used for stable learning and preventing underflow. This value has the same
- meaning as the `--network_alpha` option in the kohya-ss trainer script. See
- https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
- device (`torch.device`, `optional`, defaults to `None`):
- The device to use for the layer's weights.
- dtype (`torch.dtype`, `optional`, defaults to `None`):
- The dtype to use for the layer's weights.
- """
-
def __init__(self, in_features, out_features, rank=4, network_alpha=None, device=None, dtype=None):
super().__init__()
- self.down = nn.Linear(in_features, rank, bias_attr=False)
- self.up = nn.Linear(rank, out_features, bias_attr=False)
+ if rank > min(in_features, out_features):
+ raise ValueError(f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}")
+
+ self.down = nn.Linear(
+ in_features,
+ rank,
+ bias_attr=False,
+ )
+ self.up = nn.Linear(
+ rank,
+ out_features,
+ bias_attr=False,
+ )
if device is not None:
self.down.to(device=device)
self.up.to(device=device)
@@ -80,8 +47,6 @@ def __init__(self, in_features, out_features, rank=4, network_alpha=None, device
# See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
self.network_alpha = network_alpha
self.rank = rank
- self.out_features = out_features
- self.in_features = in_features
normal_(self.down.weight, std=1 / rank)
zeros_(self.up.weight)
@@ -89,7 +54,6 @@ def __init__(self, in_features, out_features, rank=4, network_alpha=None, device
def forward(self, hidden_states):
orig_dtype = hidden_states.dtype
dtype = self.down.weight.dtype
-
down_hidden_states = self.down(hidden_states.cast(dtype))
up_hidden_states = self.up(down_hidden_states)
@@ -100,33 +64,14 @@ def forward(self, hidden_states):
class LoRAConv2dLayer(nn.Layer):
- r"""
- A convolutional layer that is used with LoRA.
-
- Parameters:
- in_features (`int`):
- Number of input features.
- out_features (`int`):
- Number of output features.
- rank (`int`, `optional`, defaults to 4):
- The rank of the LoRA layer.
- kernel_size (`int` or `tuple` of two `int`, `optional`, defaults to 1):
- The kernel size of the convolution.
- stride (`int` or `tuple` of two `int`, `optional`, defaults to 1):
- The stride of the convolution.
- padding (`int` or `tuple` of two `int` or `str`, `optional`, defaults to 0):
- The padding of the convolution.
- network_alpha (`float`, `optional`, defaults to `None`):
- The value of the network alpha used for stable learning and preventing underflow. This value has the same
- meaning as the `--network_alpha` option in the kohya-ss trainer script. See
- https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
- """
-
def __init__(
self, in_features, out_features, rank=4, kernel_size=(1, 1), stride=(1, 1), padding=0, network_alpha=None
):
super().__init__()
+ if rank > min(in_features, out_features):
+ raise ValueError(f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}")
+
self.down = nn.Conv2D(
in_features, rank, kernel_size=kernel_size, stride=stride, padding=padding, bias_attr=False
)
@@ -167,78 +112,17 @@ def __init__(self, *args, lora_layer: Optional[LoRAConv2dLayer] = None, **kwargs
def set_lora_layer(self, lora_layer: Optional[LoRAConv2dLayer]):
self.lora_layer = lora_layer
- def _fuse_lora(self, lora_scale: float = 1.0, safe_fusing: bool = False):
- if self.lora_layer is None:
- return
-
- dtype = self.weight.dtype
-
- w_orig = self.weight.astype(paddle.get_default_dtype())
- w_up = self.lora_layer.up.weight.astype(paddle.get_default_dtype())
- w_down = self.lora_layer.down.weight.astype(paddle.get_default_dtype())
-
- if self.lora_layer.network_alpha is not None:
- w_up = w_up * self.lora_layer.network_alpha / self.lora_layer.rank
-
- fusion = paddle.mm(w_up.flatten(start_axis=1), w_down.flatten(start_axis=1))
- fusion = fusion.reshape((w_orig.shape))
- fused_weight = w_orig + (lora_scale * fusion)
-
- if safe_fusing and paddle.isnan(fused_weight).any().item():
- raise ValueError(
- "This LoRA weight seems to be broken. "
- f"Encountered NaN values when trying to fuse LoRA weights for {self}."
- "LoRA weights will not be fused."
- )
-
- out_0 = fused_weight.cast(dtype=dtype)
- self.weight = self.create_parameter(
- shape=out_0.shape,
- default_initializer=nn.initializer.Assign(out_0),
- )
-
- # we can drop the lora layer now
- self.lora_layer = None
-
- # offload the up and down matrices to CPU to not blow the memory
- self.w_up = w_up.cpu()
- self.w_down = w_down.cpu()
- self._lora_scale = lora_scale
-
- def _unfuse_lora(self):
- if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None):
- return
-
- fused_weight = self.weight
- dtype = fused_weight.dtype
-
- self.w_up = self.w_up.astype(paddle.get_default_dtype())
- self.w_down = self.w_down.astype(paddle.get_default_dtype())
-
- fusion = paddle.mm(self.w_up.flatten(start_axis=1), self.w_down.flatten(start_axis=1))
- fusion = fusion.reshape((fused_weight.shape))
- unfused_weight = fused_weight.astype(paddle.get_default_dtype()) - (self._lora_scale * fusion)
- out_0 = unfused_weight.cast(dtype=dtype)
- self.weight = self.create_parameter(
- shape=out_0.shape,
- default_initializer=nn.initializer.Assign(out_0),
- )
-
- self.w_up = None
- self.w_down = None
-
- def forward(self, hidden_states, scale: float = 1.0):
+ def forward(self, x):
if self.lora_layer is None:
            # make sure to use the functional conv2d function as otherwise torch.compile's graph will break
# see: https://github.com/huggingface/diffusers/pull/4315
- return F.conv2d(
- hidden_states, self.weight, self.bias, self._stride, self._padding, self._dilation, self._groups
+ return nn.functional.conv2d(
+ x, self.weight, self.bias, self._stride, self._padding, self._dilation, self._groups
)
+ # return super().forward(x)
+ # return nn.functional.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
else:
- original_outputs = F.conv2d(
- hidden_states, self.weight, self.bias, self._stride, self._padding, self._dilation, self._groups
- )
- return original_outputs + (scale * self.lora_layer(hidden_states))
+ return super().forward(x) + self.lora_layer(x)
class LoRACompatibleLinear(nn.Linear):
@@ -250,76 +134,17 @@ def __init__(self, *args, lora_layer: Optional[LoRALinearLayer] = None, **kwargs
super().__init__(*args, **kwargs)
self.lora_layer = lora_layer
- def set_lora_layer(self, lora_layer: Optional[LoRALinearLayer]):
+    def set_lora_layer(self, lora_layer: Optional[LoRALinearLayer]):
self.lora_layer = lora_layer
- def _fuse_lora(self, lora_scale: float = 1.0, safe_fusing: bool = False):
- if self.lora_layer is None:
- return
-
- dtype = self.weight.dtype
-
- w_orig = self.weight.astype(paddle.get_default_dtype())
- w_up = self.lora_layer.up.weight.astype(paddle.get_default_dtype())
- w_down = self.lora_layer.down.weight.astype(paddle.get_default_dtype())
-
- if self.lora_layer.network_alpha is not None:
- w_up = w_up * self.lora_layer.network_alpha / self.lora_layer.rank
-
- fused_weight = w_orig + (lora_scale * paddle.bmm(w_up.T[None, :], w_down.T[None, :])[0]).T
-
- if safe_fusing and paddle.isnan(fused_weight).any().item():
- raise ValueError(
- "This LoRA weight seems to be broken. "
- f"Encountered NaN values when trying to fuse LoRA weights for {self}."
- "LoRA weights will not be fused."
- )
-
- out_0 = fused_weight.cast(dtype=dtype)
- self.weight = self.create_parameter(
- shape=out_0.shape,
- default_initializer=nn.initializer.Assign(out_0),
- )
-
- # we can drop the lora layer now
- self.lora_layer = None
-
- # offload the up and down matrices to CPU to not blow the memory
- self.w_up = w_up.cpu()
- self.w_down = w_down.cpu()
- self._lora_scale = lora_scale
-
- def _unfuse_lora(self):
- if not (getattr(self, "w_up", None) is not None and getattr(self, "w_down", None) is not None):
- return
-
- fused_weight = self.weight
- dtype = fused_weight.dtype
-
- w_up = self.w_up.astype(paddle.get_default_dtype())
- w_down = self.w_down.astype(paddle.get_default_dtype())
-
- unfused_weight = (
- fused_weight.astype(paddle.get_default_dtype())
- - (self._lora_scale * paddle.bmm(w_up.T[None, :], w_down.T[None, :])[0]).T
- )
- out_0 = unfused_weight.cast(dtype=dtype)
- self.weight = self.create_parameter(
- shape=out_0.shape,
- default_initializer=nn.initializer.Assign(out_0),
- )
-
- self.w_up = None
- self.w_down = None
-
- def forward(self, hidden_states, scale: float = 1.0):
+ def forward(self, x):
# breakpoint()
if self.lora_layer is None:
- # return super().forward(hidden_states)
+ # return super().forward(x)
return nn.functional.linear(
- hidden_states,
+ x,
self.weight,
self.bias,
)
else:
- return super().forward(hidden_states) + (scale * self.lora_layer(hidden_states))
+ return super().forward(x) + self.lora_layer(x)
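
# --- Example (not part of the patch): a minimal sketch of the LoRA composition pattern
# that LoRALinearLayer / LoRACompatibleLinear implement after this change. The base
# projection is left untouched and a low-rank down->up path is added on top, with the
# up weight zero-initialized so the layer starts out identical to the base model.
# `TinyLoRALinear` and the sizes below are illustrative, not ppdiffusers API.
import paddle
import paddle.nn as nn

class TinyLoRALinear(nn.Layer):
    def __init__(self, in_features, out_features, rank=4):
        super().__init__()
        if rank > min(in_features, out_features):
            raise ValueError(f"LoRA rank {rank} must be less than or equal to {min(in_features, out_features)}")
        self.base = nn.Linear(in_features, out_features)
        self.down = nn.Linear(in_features, rank, bias_attr=False)
        self.up = nn.Linear(rank, out_features, bias_attr=False)
        # zero-init the up projection so base(x) + lora(x) == base(x) at step 0
        with paddle.no_grad():
            self.up.weight.set_value(paddle.zeros_like(self.up.weight))

    def forward(self, x):
        return self.base(x) + self.up(self.down(x))

# usage: TinyLoRALinear(768, 768, rank=4)(paddle.randn([2, 77, 768])).shape -> [2, 77, 768]
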
diff --git a/ppdiffusers/ppdiffusers/models/modeling_utils.py b/ppdiffusers/ppdiffusers/models/modeling_utils.py
index 8d81a7f6f..a86140987 100644
--- a/ppdiffusers/ppdiffusers/models/modeling_utils.py
+++ b/ppdiffusers/ppdiffusers/models/modeling_utils.py
@@ -251,8 +251,6 @@ def save_pretrained(
safe_serialization: bool = False,
variant: Optional[str] = None,
to_diffusers: Optional[bool] = None,
- push_to_hub: bool = False,
- **kwargs,
):
"""
Save a model and its configuration file to a directory so that it can be reloaded using the
diff --git a/ppdiffusers/ppdiffusers/models/normalization.py b/ppdiffusers/ppdiffusers/models/normalization.py
deleted file mode 100644
index 5a0726f19..000000000
--- a/ppdiffusers/ppdiffusers/models/normalization.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Dict, Optional, Tuple
-
-import paddle
-import paddle.nn as nn
-
-from .activations import get_activation
-from .embeddings import CombinedTimestepLabelEmbeddings, CombinedTimestepSizeEmbeddings
-
-
-class AdaLayerNorm(nn.Layer):
- r"""
- Norm layer modified to incorporate timestep embeddings.
-
- Parameters:
- embedding_dim (`int`): The size of each embedding vector.
- num_embeddings (`int`): The size of the embeddings dictionary.
- """
-
- def __init__(self, embedding_dim: int, num_embeddings: int):
- super().__init__()
- self.emb = nn.Embedding(num_embeddings, embedding_dim)
- self.silu = nn.Silu()
- self.linear = nn.Linear(embedding_dim, embedding_dim * 2)
- # elementwise_affine=False
- norm_kwargs = {"weight_attr": False, "bias_attr": False}
- self.norm = nn.LayerNorm(embedding_dim, **norm_kwargs)
-
- def forward(self, x, timestep):
- emb = self.linear(self.silu(self.emb(timestep)))
- # must set axis=-1, paddle vs pytorch
- scale, shift = paddle.chunk(emb, 2, axis=-1)
- x = self.norm(x) * (1 + scale) + shift
- return x
-
-
-class AdaLayerNormZero(nn.Layer):
- r"""
- Norm layer adaptive layer norm zero (adaLN-Zero).
-
- Parameters:
- embedding_dim (`int`): The size of each embedding vector.
- num_embeddings (`int`): The size of the embeddings dictionary.
- """
-
- def __init__(self, embedding_dim: int, num_embeddings: int):
- super().__init__()
-
- self.emb = CombinedTimestepLabelEmbeddings(num_embeddings, embedding_dim)
-
- self.silu = nn.Silu()
- self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias_attr=True)
- # elementwise_affine=False
- norm_kwargs = {"weight_attr": False, "bias_attr": False}
- self.norm = nn.LayerNorm(embedding_dim, epsilon=1e-6, **norm_kwargs)
-
- def forward(self, x, timestep, class_labels, hidden_dtype=None):
- emb = self.linear(self.silu(self.emb(timestep, class_labels, hidden_dtype=hidden_dtype)))
- shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = emb.chunk(6, axis=1)
- x = self.norm(x) * (1 + scale_msa[:, None]) + shift_msa[:, None]
- return x, gate_msa, shift_mlp, scale_mlp, gate_mlp
-
-
-class AdaLayerNormSingle(nn.Layer):
- r"""
- Norm layer adaptive layer norm single (adaLN-single).
-
- As proposed in PixArt-Alpha (see: https://arxiv.org/abs/2310.00426; Section 2.3).
-
- Parameters:
- embedding_dim (`int`): The size of each embedding vector.
- use_additional_conditions (`bool`): To use additional conditions for normalization or not.
- """
-
- def __init__(self, embedding_dim: int, use_additional_conditions: bool = False):
- super().__init__()
-
- self.emb = CombinedTimestepSizeEmbeddings(
- embedding_dim, size_emb_dim=embedding_dim // 3, use_additional_conditions=use_additional_conditions
- )
-
- self.silu = nn.Silu()
- self.linear = nn.Linear(embedding_dim, 6 * embedding_dim, bias=True)
-
- def forward(
- self,
- timestep: paddle.Tensor,
- added_cond_kwargs: Dict[str, paddle.Tensor] = None,
- batch_size: int = None,
- hidden_dtype: Optional[paddle.dtype] = None,
- ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
- # No modulation happening here.
- embedded_timestep = self.emb(timestep, **added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_dtype)
- return self.linear(self.silu(embedded_timestep)), embedded_timestep
-
-
-class AdaGroupNorm(nn.Layer):
- r"""
- GroupNorm layer modified to incorporate timestep embeddings.
-
- Parameters:
- embedding_dim (`int`): The size of each embedding vector.
- num_embeddings (`int`): The size of the embeddings dictionary.
- num_groups (`int`): The number of groups to separate the channels into.
- act_fn (`str`, *optional*, defaults to `None`): The activation function to use.
- eps (`float`, *optional*, defaults to `1e-5`): The epsilon value to use for numerical stability.
- """
-
- def __init__(
- self, embedding_dim: int, out_dim: int, num_groups: int, act_fn: Optional[str] = None, eps: float = 1e-5
- ):
- super().__init__()
- self.num_groups = num_groups
- self.eps = eps
-
- if act_fn is None:
- self.act = None
- else:
- self.act = get_activation(act_fn)
-
- self.linear = nn.Linear(embedding_dim, out_dim * 2)
- # elementwise_affine=False
- norm_kwargs = {"weight_attr": False, "bias_attr": False}
- self.group_norm = nn.GroupNorm(num_groups, out_dim, epsilon=eps, **norm_kwargs)
- self.group_norm.weight = None
- self.group_norm.bias = None
-
- def forward(self, x: paddle.Tensor, emb: paddle.Tensor) -> paddle.Tensor:
- if self.act:
- emb = self.act(emb)
- emb = self.linear(emb)
- emb = emb[:, :, None, None]
- scale, shift = emb.chunk(2, axis=1)
- x = self.group_norm(x)
- x = x * (1 + scale) + shift
- return x
diff --git a/ppdiffusers/ppdiffusers/models/prior_transformer.py b/ppdiffusers/ppdiffusers/models/prior_transformer.py
index f10fa2596..d8f0b97d6 100644
--- a/ppdiffusers/ppdiffusers/models/prior_transformer.py
+++ b/ppdiffusers/ppdiffusers/models/prior_transformer.py
@@ -21,16 +21,9 @@
import paddle.nn.functional as F
from ..configuration_utils import ConfigMixin, register_to_config
-from ..loaders import UNet2DConditionLoadersMixin
from ..utils import NEG_INF, BaseOutput
from .attention import BasicTransformerBlock
-from .attention_processor import (
- ADDED_KV_ATTENTION_PROCESSORS,
- CROSS_ATTENTION_PROCESSORS,
- AttentionProcessor,
- AttnAddedKVProcessor,
- AttnProcessor,
-)
+from .attention_processor import AttentionProcessor, AttnProcessor
from .embeddings import TimestepEmbedding, Timesteps
from .modeling_utils import ModelMixin
@@ -48,7 +41,7 @@ class PriorTransformerOutput(BaseOutput):
predicted_image_embedding: paddle.Tensor
-class PriorTransformer(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+class PriorTransformer(ModelMixin, ConfigMixin):
"""
A Prior Transformer model.
@@ -204,8 +197,8 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
processors = {}
def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
- if hasattr(module, "get_processor"):
- processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+ if hasattr(module, "set_processor"):
+ processors[f"{name}.processor"] = module.processor
for sub_name, child in module.named_children():
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
@@ -218,9 +211,7 @@ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[st
return processors
# Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
- def set_attn_processor(
- self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
- ):
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
r"""
Sets the attention processor to use to compute attention.
Parameters:
@@ -241,9 +232,9 @@ def set_attn_processor(
def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
if hasattr(module, "set_processor"):
if not isinstance(processor, dict):
- module.set_processor(processor, _remove_lora=_remove_lora)
+ module.set_processor(processor)
else:
- module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
+ module.set_processor(processor.pop(f"{name}.processor"))
for sub_name, child in module.named_children():
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
@@ -256,16 +247,7 @@ def set_default_attn_processor(self):
"""
Disables custom attention processors and sets the default attention implementation.
"""
- if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
- processor = AttnAddedKVProcessor()
- elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
- processor = AttnProcessor()
- else:
- raise ValueError(
- f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
- )
-
- self.set_attn_processor(processor, _remove_lora=True)
+ self.set_attn_processor(AttnProcessor())
def forward(
self,
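
# --- Example (not part of the patch): a hedged sketch of the recursive walk that
# PriorTransformer.attn_processors / set_attn_processor now rely on: any sublayer
# exposing `set_processor` contributes (or receives) a processor keyed by its dotted
# module path. `DummyAttention` and `collect_attn_processors` are illustrative
# stand-ins, not ppdiffusers classes.
import paddle.nn as nn

class DummyAttention(nn.Layer):
    def __init__(self):
        super().__init__()
        self.processor = None

    def set_processor(self, processor):
        self.processor = processor

def collect_attn_processors(root: nn.Layer):
    processors = {}

    def _walk(name, module):
        if hasattr(module, "set_processor"):
            processors[f"{name}.processor"] = module.processor
        for sub_name, child in module.named_children():
            _walk(f"{name}.{sub_name}", child)

    for name, module in root.named_children():
        _walk(name, module)
    return processors
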
diff --git a/ppdiffusers/ppdiffusers/models/resnet.py b/ppdiffusers/ppdiffusers/models/resnet.py
index 168b828b5..69a5cb502 100644
--- a/ppdiffusers/ppdiffusers/models/resnet.py
+++ b/ppdiffusers/ppdiffusers/models/resnet.py
@@ -23,9 +23,9 @@
from ..initializer import zeros_
from .activations import get_activation
+from .attention import AdaGroupNorm
from .attention_processor import SpatialNorm
from .lora import LoRACompatibleConv, LoRACompatibleLinear
-from .normalization import AdaGroupNorm
class Upsample1D(nn.Layer):
@@ -40,8 +40,6 @@ class Upsample1D(nn.Layer):
option to use a convolution transpose.
out_channels (`int`, optional):
number of output channels. Defaults to `channels`.
- name (`str`, default `conv`):
- name of the upsampling 1D layer.
"""
def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
@@ -83,8 +81,6 @@ class Downsample1D(nn.Layer):
number of output channels. Defaults to `channels`.
padding (`int`, default `1`):
padding for the convolution.
- name (`str`, default `conv`):
- name of the downsampling 1D layer.
"""
def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
@@ -119,8 +115,6 @@ class Upsample2D(nn.Layer):
option to use a convolution transpose.
out_channels (`int`, optional):
number of output channels. Defaults to `channels`.
- name (`str`, default `conv`):
- name of the upsampling 2D layer.
"""
def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
@@ -143,7 +137,7 @@ def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_chann
else:
self.Conv2d_0 = conv
- def forward(self, hidden_states, output_size=None, scale: float = 1.0):
+ def forward(self, hidden_states, output_size=None):
assert hidden_states.shape[1] == self.channels
if self.use_conv_transpose:
@@ -156,10 +150,6 @@ def forward(self, hidden_states, output_size=None, scale: float = 1.0):
if dtype == paddle.bfloat16:
hidden_states = hidden_states.cast("float32")
- # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
- if hidden_states.shape[0] >= 64:
- hidden_states = hidden_states.contiguous() if hasattr(hidden_states, "contiguous") else hidden_states
-
# if `output_size` is passed we force the interpolation output
# size and do not make use of `scale_factor=2`
if output_size is None:
@@ -174,15 +164,9 @@ def forward(self, hidden_states, output_size=None, scale: float = 1.0):
# TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
if self.use_conv:
if self.name == "conv":
- if isinstance(self.conv, LoRACompatibleConv):
- hidden_states = self.conv(hidden_states, scale)
- else:
- hidden_states = self.conv(hidden_states)
+ hidden_states = self.conv(hidden_states)
else:
- if isinstance(self.Conv2d_0, LoRACompatibleConv):
- hidden_states = self.Conv2d_0(hidden_states, scale)
- else:
- hidden_states = self.Conv2d_0(hidden_states)
+ hidden_states = self.Conv2d_0(hidden_states)
return hidden_states
@@ -199,8 +183,6 @@ class Downsample2D(nn.Layer):
number of output channels. Defaults to `channels`.
padding (`int`, default `1`):
padding for the convolution.
- name (`str`, default `conv`):
- name of the downsampling 2D layer.
"""
def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
@@ -227,17 +209,14 @@ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name=
else:
self.conv = conv
- def forward(self, hidden_states, scale: float = 1.0):
+ def forward(self, hidden_states):
assert hidden_states.shape[1] == self.channels
if self.use_conv and self.padding == 0:
pad = (0, 1, 0, 1)
hidden_states = F.pad(hidden_states, pad, mode="constant", value=0)
assert hidden_states.shape[1] == self.channels
- if isinstance(self.conv, LoRACompatibleConv):
- hidden_states = self.conv(hidden_states, scale)
- else:
- hidden_states = self.conv(hidden_states)
+ hidden_states = self.conv(hidden_states)
return hidden_states
@@ -444,12 +423,6 @@ def forward(self, hidden_states):
# downsample/upsample layer used in k-upscaler, might be able to use FirDownsample2D/DirUpsample2D instead
class KDownsample2D(nn.Layer):
- r"""A 2D K-downsampling layer.
-
- Parameters:
- pad_mode (`str`, *optional*, default to `"reflect"`): the padding mode to use.
- """
-
def __init__(self, pad_mode="reflect"):
super().__init__()
self.pad_mode = pad_mode
@@ -469,12 +442,6 @@ def forward(self, inputs):
class KUpsample2D(nn.Layer):
- r"""A 2D K-upsampling layer.
-
- Parameters:
- pad_mode (`str`, *optional*, default to `"reflect"`): the padding mode to use.
- """
-
def __init__(self, pad_mode="reflect"):
super().__init__()
self.pad_mode = pad_mode
@@ -625,7 +592,7 @@ def __init__(
in_channels, conv_2d_out_channels, kernel_size=1, stride=1, padding=0, bias_attr=conv_shortcut_bias
)
- def forward(self, input_tensor, temb, scale: float = 1.0):
+ def forward(self, input_tensor, temb):
hidden_states = input_tensor
if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial":
@@ -636,34 +603,18 @@ def forward(self, input_tensor, temb, scale: float = 1.0):
hidden_states = self.nonlinearity(hidden_states)
if self.upsample is not None:
- input_tensor = (
- self.upsample(input_tensor, scale=scale)
- if isinstance(self.upsample, Upsample2D)
- else self.upsample(input_tensor)
- )
- hidden_states = (
- self.upsample(hidden_states, scale=scale)
- if isinstance(self.upsample, Upsample2D)
- else self.upsample(hidden_states)
- )
+ input_tensor = self.upsample(input_tensor)
+ hidden_states = self.upsample(hidden_states)
elif self.downsample is not None:
- input_tensor = (
- self.downsample(input_tensor, scale=scale)
- if isinstance(self.downsample, Downsample2D)
- else self.downsample(input_tensor)
- )
- hidden_states = (
- self.downsample(hidden_states, scale=scale)
- if isinstance(self.downsample, Downsample2D)
- else self.downsample(hidden_states)
- )
+ input_tensor = self.downsample(input_tensor)
+ hidden_states = self.downsample(hidden_states)
- hidden_states = self.conv1(hidden_states, scale)
+ hidden_states = self.conv1(hidden_states)
if self.time_emb_proj is not None:
if not self.pre_temb_non_linearity and not self.skip_time_act:
temb = self.nonlinearity(temb)
- temb = self.time_emb_proj(temb, scale)[:, :, None, None]
+ temb = self.time_emb_proj(temb)[:, :, None, None]
if temb is not None and self.time_embedding_norm == "default":
hidden_states = hidden_states + temb
@@ -680,10 +631,10 @@ def forward(self, input_tensor, temb, scale: float = 1.0):
hidden_states = self.nonlinearity(hidden_states)
hidden_states = self.dropout(hidden_states)
- hidden_states = self.conv2(hidden_states, scale)
+ hidden_states = self.conv2(hidden_states)
if self.conv_shortcut is not None:
- input_tensor = self.conv_shortcut(input_tensor, scale)
+ input_tensor = self.conv_shortcut(input_tensor)
        # TODO: this may result in -inf; input_tensor's min value is -57644, hidden_states's min value is -10000
output_tensor = (input_tensor + hidden_states) / self.output_scale_factor
@@ -706,21 +657,14 @@ def rearrange_dims(tensor):
class Conv1dBlock(nn.Layer):
"""
Conv1d --> GroupNorm --> Mish
-
- Parameters:
- inp_channels (`int`): Number of input channels.
- out_channels (`int`): Number of output channels.
- kernel_size (`int` or `tuple`): Size of the convolving kernel.
- n_groups (`int`, default `8`): Number of groups to separate the channels into.
- activation (`str`, defaults `mish`): Name of the activation function.
"""
- def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8, activation: str = "mish"):
+ def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8):
super().__init__()
self.conv1d = nn.Conv1D(inp_channels, out_channels, kernel_size, padding=kernel_size // 2)
self.group_norm = nn.GroupNorm(n_groups, out_channels)
- self.mish = get_activation(activation)
+ self.mish = nn.Mish()
def forward(self, inputs):
intermediate_repr = self.conv1d(inputs)
@@ -733,23 +677,12 @@ def forward(self, inputs):
# unet_rl.py
class ResidualTemporalBlock1D(nn.Layer):
- """
- Residual 1D block with temporal convolutions.
-
- Parameters:
- inp_channels (`int`): Number of input channels.
- out_channels (`int`): Number of output channels.
- embed_dim (`int`): Embedding dimension.
- kernel_size (`int` or `tuple`): Size of the convolving kernel.
- activation (`str`, defaults `mish`): It is possible to choose the right activation function.
- """
-
- def __init__(self, inp_channels, out_channels, embed_dim, kernel_size=5, activation: str = "mish"):
+ def __init__(self, inp_channels, out_channels, embed_dim, kernel_size=5):
super().__init__()
self.conv_in = Conv1dBlock(inp_channels, out_channels, kernel_size)
self.conv_out = Conv1dBlock(out_channels, out_channels, kernel_size)
- self.time_emb_act = get_activation(activation)
+ self.time_emb_act = nn.Mish()
self.time_emb = nn.Linear(embed_dim, out_channels)
self.residual_conv = (
@@ -919,11 +852,6 @@ class TemporalConvLayer(nn.Layer):
"""
Temporal convolutional layer that can be used for video (sequence of images) input Code mostly copied from:
https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/models/multi_modal/video_synthesis/unet_sd.py#L1016
-
- Parameters:
- in_dim (`int`): Number of input channels.
- out_dim (`int`): Number of output channels.
- dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
"""
def __init__(self, in_dim, out_dim=None, dropout=0.0):
@@ -955,8 +883,6 @@ def __init__(self, in_dim, out_dim=None, dropout=0.0):
nn.Dropout(p=dropout),
nn.Conv3D(in_channels=out_dim, out_channels=in_dim, kernel_size=(3, 1, 1), padding=(1, 0, 0)),
)
-
- # zero out the last layer params,so the conv block is identity
zeros_(self.conv4[-1].weight)
zeros_(self.conv4[-1].bias)
@@ -966,15 +892,12 @@ def forward(self, hidden_states, num_frames=1):
.reshape((-1, num_frames) + tuple(hidden_states.shape[1:]))
.transpose(perm=[0, 2, 1, 3, 4])
)
-
identity = hidden_states
hidden_states = self.conv1(hidden_states)
hidden_states = self.conv2(hidden_states)
hidden_states = self.conv3(hidden_states)
hidden_states = self.conv4(hidden_states)
-
hidden_states = identity + hidden_states
-
hidden_states = hidden_states.transpose(perm=[0, 2, 1, 3, 4]).reshape(
(hidden_states.shape[0] * hidden_states.shape[2], -1) + tuple(hidden_states.shape[3:])
)
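
# --- Example (not part of the patch): a simplified sketch of the ResnetBlock2D data
# flow after the `scale` argument is dropped: norm -> SiLU -> conv1, add the projected
# time embedding, norm -> SiLU -> conv2 (dropout omitted here), then an optional 1x1
# shortcut. `TinyResnetBlock2D` and its layer sizes are illustrative only ("default"
# time_embedding_norm, no up/downsampling).
import paddle
import paddle.nn as nn

class TinyResnetBlock2D(nn.Layer):
    def __init__(self, in_channels, out_channels, temb_channels, groups=32):
        super().__init__()
        self.norm1 = nn.GroupNorm(groups, in_channels)
        self.conv1 = nn.Conv2D(in_channels, out_channels, 3, padding=1)
        self.time_emb_proj = nn.Linear(temb_channels, out_channels)
        self.norm2 = nn.GroupNorm(groups, out_channels)
        self.conv2 = nn.Conv2D(out_channels, out_channels, 3, padding=1)
        self.nonlinearity = nn.Silu()
        self.conv_shortcut = nn.Conv2D(in_channels, out_channels, 1) if in_channels != out_channels else None

    def forward(self, x, temb):
        h = self.conv1(self.nonlinearity(self.norm1(x)))
        # inject the time embedding as a per-channel bias
        h = h + self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None]
        h = self.conv2(self.nonlinearity(self.norm2(h)))
        if self.conv_shortcut is not None:
            x = self.conv_shortcut(x)
        return x + h
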
diff --git a/ppdiffusers/ppdiffusers/models/t5_film_transformer.py b/ppdiffusers/ppdiffusers/models/t5_film_transformer.py
index 515538951..33986470c 100644
--- a/ppdiffusers/ppdiffusers/models/t5_film_transformer.py
+++ b/ppdiffusers/ppdiffusers/models/t5_film_transformer.py
@@ -24,28 +24,6 @@
class T5FilmDecoder(ModelMixin, ConfigMixin):
- r"""
- T5 style decoder with FiLM conditioning.
-
- Args:
- input_dims (`int`, *optional*, defaults to `128`):
- The number of input dimensions.
- targets_length (`int`, *optional*, defaults to `256`):
- The length of the targets.
- d_model (`int`, *optional*, defaults to `768`):
- Size of the input hidden states.
- num_layers (`int`, *optional*, defaults to `12`):
- The number of `DecoderLayer`'s to use.
- num_heads (`int`, *optional*, defaults to `12`):
- The number of attention heads to use.
- d_kv (`int`, *optional*, defaults to `64`):
- Size of the key-value projection vectors.
- d_ff (`int`, *optional*, defaults to `2048`):
- The number of dimensions in the intermediate feed-forward layer of `DecoderLayer`'s.
- dropout_rate (`float`, *optional*, defaults to `0.1`):
- Dropout probability.
- """
-
@register_to_config
def __init__(
self,
@@ -92,9 +70,7 @@ def encoder_decoder_mask(self, query_input, key_input):
def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time):
batch, _, _ = decoder_input_tokens.shape
- assert decoder_noise_time.shape == [
- batch,
- ]
+ assert decoder_noise_time.shape[0] == batch
# decoder_noise_time is in [0, 1), so rescale to expected timing range.
time_steps = get_timestep_embedding(
@@ -149,27 +125,7 @@ def forward(self, encodings_and_masks, decoder_input_tokens, decoder_noise_time)
class DecoderLayer(nn.Layer):
- r"""
- T5 decoder layer.
-
- Args:
- d_model (`int`):
- Size of the input hidden states.
- d_kv (`int`):
- Size of the key-value projection vectors.
- num_heads (`int`):
- Number of attention heads.
- d_ff (`int`):
- Size of the intermediate feed-forward layer.
- dropout_rate (`float`):
- Dropout probability.
- layer_norm_epsilon (`float`, *optional*, defaults to `1e-6`):
- A small value used for numerical stability to avoid dividing by zero.
- """
-
- def __init__(
- self, d_model: int, d_kv: int, num_heads: int, d_ff: int, dropout_rate: float, layer_norm_epsilon: float = 1e-6
- ):
+ def __init__(self, d_model, d_kv, num_heads, d_ff, dropout_rate, layer_norm_epsilon=1e-6):
super().__init__()
self.layer = nn.LayerList()
@@ -227,21 +183,7 @@ def forward(
class T5LayerSelfAttentionCond(nn.Layer):
- r"""
- T5 style self-attention layer with conditioning.
-
- Args:
- d_model (`int`):
- Size of the input hidden states.
- d_kv (`int`):
- Size of the key-value projection vectors.
- num_heads (`int`):
- Number of attention heads.
- dropout_rate (`float`):
- Dropout probability.
- """
-
- def __init__(self, d_model: int, d_kv: int, num_heads: int, dropout_rate: float):
+ def __init__(self, d_model, d_kv, num_heads, dropout_rate):
super().__init__()
self.layer_norm = T5LayerNorm(d_model)
self.FiLMLayer = T5FiLMLayer(in_features=d_model * 4, out_features=d_model)
@@ -269,23 +211,7 @@ def forward(
class T5LayerCrossAttention(nn.Layer):
- r"""
- T5 style cross-attention layer.
-
- Args:
- d_model (`int`):
- Size of the input hidden states.
- d_kv (`int`):
- Size of the key-value projection vectors.
- num_heads (`int`):
- Number of attention heads.
- dropout_rate (`float`):
- Dropout probability.
- layer_norm_epsilon (`float`):
- A small value used for numerical stability to avoid dividing by zero.
- """
-
- def __init__(self, d_model: int, d_kv: int, num_heads: int, dropout_rate: float, layer_norm_epsilon: float):
+ def __init__(self, d_model, d_kv, num_heads, dropout_rate, layer_norm_epsilon):
super().__init__()
self.attention = Attention(query_dim=d_model, heads=num_heads, dim_head=d_kv, out_bias=False, scale_qk=False)
self.layer_norm = T5LayerNorm(d_model, eps=layer_norm_epsilon)
@@ -308,21 +234,7 @@ def forward(
class T5LayerFFCond(nn.Layer):
- r"""
- T5 style feed-forward conditional layer.
-
- Args:
- d_model (`int`):
- Size of the input hidden states.
- d_ff (`int`):
- Size of the intermediate feed-forward layer.
- dropout_rate (`float`):
- Dropout probability.
- layer_norm_epsilon (`float`):
- A small value used for numerical stability to avoid dividing by zero.
- """
-
- def __init__(self, d_model: int, d_ff: int, dropout_rate: float, layer_norm_epsilon: float):
+ def __init__(self, d_model, d_ff, dropout_rate, layer_norm_epsilon):
super().__init__()
self.DenseReluDense = T5DenseGatedActDense(d_model=d_model, d_ff=d_ff, dropout_rate=dropout_rate)
self.film = T5FiLMLayer(in_features=d_model * 4, out_features=d_model)
@@ -340,19 +252,7 @@ def forward(self, hidden_states, conditioning_emb=None):
class T5DenseGatedActDense(nn.Layer):
- r"""
- T5 style feed-forward layer with gated activations and dropout.
-
- Args:
- d_model (`int`):
- Size of the input hidden states.
- d_ff (`int`):
- Size of the intermediate feed-forward layer.
- dropout_rate (`float`):
- Dropout probability.
- """
-
- def __init__(self, d_model: int, d_ff: int, dropout_rate: float):
+ def __init__(self, d_model, d_ff, dropout_rate):
super().__init__()
self.wi_0 = nn.Linear(d_model, d_ff, bias_attr=False)
self.wi_1 = nn.Linear(d_model, d_ff, bias_attr=False)
@@ -371,20 +271,11 @@ def forward(self, hidden_states):
class T5LayerNorm(nn.Layer):
- r"""
- T5 style layer normalization module.
-
- Args:
- hidden_size (`int`):
- Size of the input hidden states.
- eps (`float`, `optional`, defaults to `1e-6`):
- A small value used for numerical stability to avoid dividing by zero.
+ """
+    Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
"""
- def __init__(self, hidden_size: int, eps: float = 1e-6):
- """
- Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
- """
+ def __init__(self, hidden_size, eps=1e-6):
super().__init__()
self.weight = self.create_parameter(shape=[hidden_size], default_initializer=nn.initializer.Constant(1.0))
self.variance_epsilon = eps
@@ -418,16 +309,10 @@ def forward(self, input: paddle.Tensor) -> paddle.Tensor:
class T5FiLMLayer(nn.Layer):
"""
- T5 style FiLM Layer.
-
- Args:
- in_features (`int`):
- Number of input features.
- out_features (`int`):
- Number of output features.
+ FiLM Layer
"""
- def __init__(self, in_features: int, out_features: int):
+ def __init__(self, in_features, out_features):
super().__init__()
self.scale_bias = nn.Linear(in_features, out_features * 2, bias_attr=False)
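
# --- Example (not part of the patch): a minimal sketch of the T5-style layer norm
# described in the docstring above: scale by the RMS of the hidden states, computed in
# float32 for stability, with no mean subtraction and no bias. `TinyT5LayerNorm` is an
# illustrative stand-in for T5LayerNorm.
import paddle
import paddle.nn as nn

class TinyT5LayerNorm(nn.Layer):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = self.create_parameter(shape=[hidden_size], default_initializer=nn.initializer.Constant(1.0))
        self.variance_epsilon = eps

    def forward(self, x):
        # root-mean-square normalization over the last axis, no mean subtraction
        variance = x.astype("float32").pow(2).mean(axis=-1, keepdim=True)
        hidden = x.astype("float32") * paddle.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden.astype(self.weight.dtype)
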
diff --git a/ppdiffusers/ppdiffusers/models/transformer_2d.py b/ppdiffusers/ppdiffusers/models/transformer_2d.py
index 3a77c5cca..e039a9e17 100644
--- a/ppdiffusers/ppdiffusers/models/transformer_2d.py
+++ b/ppdiffusers/ppdiffusers/models/transformer_2d.py
@@ -18,16 +18,14 @@
import paddle
import paddle.nn.functional as F
from paddle import nn
-from paddle.distributed.fleet.utils import recompute
from ..configuration_utils import ConfigMixin, register_to_config
from ..models.embeddings import ImagePositionalEmbeddings
from ..utils import BaseOutput, deprecate
from .attention import BasicTransformerBlock
-from .embeddings import CaptionProjection, PatchEmbed
+from .embeddings import PatchEmbed
from .lora import LoRACompatibleConv, LoRACompatibleLinear
from .modeling_utils import ModelMixin
-from .normalization import AdaLayerNormSingle
@dataclass
@@ -91,13 +89,9 @@ def __init__(
num_embeds_ada_norm: Optional[int] = None,
use_linear_projection: bool = False,
only_cross_attention: bool = False,
- double_self_attention: bool = False,
upcast_attention: bool = False,
norm_type: str = "layer_norm",
norm_elementwise_affine: bool = True,
- norm_eps: float = 1e-5,
- attention_type: str = "default",
- caption_channels: int = None,
):
super().__init__()
self.use_linear_projection = use_linear_projection
@@ -166,15 +160,12 @@ def __init__(
self.width = sample_size
self.patch_size = patch_size
- interpolation_scale = self.config.sample_size // 64 # => 64 (= 512 pixart) has interpolation scale 1
- interpolation_scale = max(interpolation_scale, 1)
self.pos_embed = PatchEmbed(
height=sample_size,
width=sample_size,
patch_size=patch_size,
in_channels=in_channels,
embed_dim=inner_dim,
- interpolation_scale=interpolation_scale,
)
# 3. Define transformers blocks
@@ -190,12 +181,9 @@ def __init__(
num_embeds_ada_norm=num_embeds_ada_norm,
attention_bias=attention_bias,
only_cross_attention=only_cross_attention,
- double_self_attention=double_self_attention,
upcast_attention=upcast_attention,
norm_type=norm_type,
norm_elementwise_affine=norm_elementwise_affine,
- norm_eps=norm_eps,
- attention_type=attention_type,
)
for d in range(num_layers)
]
@@ -212,42 +200,18 @@ def __init__(
elif self.is_input_vectorized:
self.norm_out = nn.LayerNorm(inner_dim)
self.out = nn.Linear(inner_dim, self.num_vector_embeds - 1)
- elif self.is_input_patches and norm_type != "ada_norm_single":
+ elif self.is_input_patches:
# elementwise_affine=False
norm_kwargs = {"weight_attr": False, "bias_attr": False}
self.norm_out = nn.LayerNorm(inner_dim, epsilon=1e-6, **norm_kwargs)
self.proj_out_1 = nn.Linear(inner_dim, 2 * inner_dim)
self.proj_out_2 = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
- elif self.is_input_patches and norm_type == "ada_norm_single":
- norm_kwargs = {"weight_attr": False, "bias_attr": False}
- self.norm_out = nn.LayerNorm(inner_dim, epsilon=1e-6, **norm_kwargs)
- tmp_param = paddle.randn(2, inner_dim) / inner_dim**0.5
- self.scale_shift_table = self.create_parameter(
- shape=tmp_param.shape, default_initializer=paddle.nn.initializer.Assign(tmp_param)
- )
- self.proj_out = nn.Linear(inner_dim, patch_size * patch_size * self.out_channels)
-
- # 5. PixArt-Alpha blocks.
- self.adaln_single = None
- self.use_additional_conditions = False
- if norm_type == "ada_norm_single":
- self.use_additional_conditions = self.config.sample_size == 128
- # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
- # additional conditions until we find better name
- self.adaln_single = AdaLayerNormSingle(inner_dim, use_additional_conditions=self.use_additional_conditions)
-
- self.caption_projection = None
- if caption_channels is not None:
- self.caption_projection = CaptionProjection(in_features=caption_channels, hidden_size=inner_dim)
-
- self.gradient_checkpointing = False
def forward(
self,
hidden_states: paddle.Tensor,
encoder_hidden_states: Optional[paddle.Tensor] = None,
timestep: Optional[paddle.Tensor] = None,
- added_cond_kwargs: Dict[str, paddle.Tensor] = None,
class_labels: Optional[paddle.Tensor] = None,
cross_attention_kwargs: Dict[str, Any] = None,
attention_mask: Optional[paddle.Tensor] = None,
@@ -268,14 +232,6 @@ def forward(
class_labels ( `paddle.Tensor` of shape `(batch size, num classes)`, *optional*):
Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
`AdaLayerZeroNorm`.
- cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
- `self.processor` in
- [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
- attention_mask ( `paddle.Tensor`, *optional*):
- An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
- is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
- negative values to the attention scores corresponding to "discard" tokens.
encoder_attention_mask ( `paddle.Tensor`, *optional*):
Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
@@ -316,95 +272,40 @@ def forward(
encoder_attention_mask = (1 - encoder_attention_mask.cast(hidden_states.dtype)) * -10000.0
encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
- # Retrieve lora scale.
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
-
# 1. Input
if self.is_input_continuous:
- batch, _, height, width = paddle.shape(hidden_states)
+ _, _, height, width = paddle.shape(hidden_states)
residual = hidden_states
hidden_states = self.norm(hidden_states)
if not self.use_linear_projection:
- hidden_states = self.proj_in(hidden_states, scale=lora_scale)
+ hidden_states = self.proj_in(hidden_states)
hidden_states = hidden_states.transpose([0, 2, 3, 1]).flatten(1, 2)
if self.use_linear_projection:
- hidden_states = self.proj_in(hidden_states, scale=lora_scale)
+ hidden_states = self.proj_in(hidden_states)
elif self.is_input_vectorized:
- # pytorch code:
- # hidden_states = self.latent_image_embedding(hidden_states)
- # paddle code, _C_ops.embedding not support float32, need convert to int64:
hidden_states = self.latent_image_embedding(hidden_states.cast("int64"))
elif self.is_input_patches:
- height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
hidden_states = self.pos_embed(hidden_states)
- if self.adaln_single is not None:
- if self.use_additional_conditions and added_cond_kwargs is None:
- raise ValueError(
- "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
- )
- batch_size = hidden_states.shape[0]
- timestep, embedded_timestep = self.adaln_single(
- timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
- )
-
# 2. Blocks
- if self.caption_projection is not None:
- batch_size = hidden_states.shape[0]
- encoder_hidden_states = self.caption_projection(encoder_hidden_states)
- encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
-
for block in self.transformer_blocks:
- if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
- # hidden_states = torch.utils.checkpoint.checkpoint(
- # block,
- # hidden_states,
- # attention_mask,
- # encoder_hidden_states,
- # encoder_attention_mask,
- # timestep,
- # cross_attention_kwargs,
- # class_labels,
- # use_reentrant=False,
- # )
- def create_custom_forward(module, return_dict=None):
- def custom_forward(*inputs):
- if return_dict is not None:
- return module(*inputs, return_dict=return_dict)[0] # move [0] when paddlepaddle <= 2.4.1
- else:
- return module(*inputs)
-
- return custom_forward
-
- hidden_states = recompute(
- create_custom_forward(block),
- hidden_states,
- attention_mask,
- encoder_hidden_states,
- encoder_attention_mask,
- timestep,
- cross_attention_kwargs,
- class_labels,
- # use_reentrant=False,
- ) # [0]
- else:
- hidden_states = block(
- hidden_states,
- attention_mask=attention_mask,
- encoder_hidden_states=encoder_hidden_states,
- encoder_attention_mask=encoder_attention_mask,
- timestep=timestep,
- cross_attention_kwargs=cross_attention_kwargs,
- class_labels=class_labels,
- )
+ hidden_states = block(
+ hidden_states,
+ attention_mask=attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ timestep=timestep,
+ cross_attention_kwargs=cross_attention_kwargs,
+ class_labels=class_labels,
+ )
# 3. Output
if self.is_input_continuous:
if self.use_linear_projection:
- hidden_states = self.proj_out(hidden_states, scale=lora_scale)
- hidden_states = hidden_states.reshape([batch, height, width, self.inner_dim]).transpose([0, 3, 1, 2])
+ hidden_states = self.proj_out(hidden_states)
+ hidden_states = hidden_states.reshape([-1, height, width, self.inner_dim]).transpose([0, 3, 1, 2])
if not self.use_linear_projection:
- hidden_states = self.proj_out(hidden_states, scale=lora_scale)
+ hidden_states = self.proj_out(hidden_states)
output = hidden_states + residual
elif self.is_input_vectorized:
hidden_states = self.norm_out(hidden_states)
@@ -414,25 +315,17 @@ def custom_forward(*inputs):
# log(p(x_0))
output = F.log_softmax(logits.cast("float64"), axis=1).cast("float32")
- if self.is_input_patches:
- if self.config.norm_type != "ada_norm_single":
- conditioning = self.transformer_blocks[0].norm1.emb(
- timestep, class_labels, hidden_dtype=hidden_states.dtype
- )
- shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, axis=1)
- hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
- hidden_states = self.proj_out_2(hidden_states)
- elif self.config.norm_type == "ada_norm_single":
- shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, axis=1)
- hidden_states = self.norm_out(hidden_states)
- # Modulation
- hidden_states = hidden_states * (1 + scale) + shift
- hidden_states = self.proj_out(hidden_states)
- hidden_states = hidden_states.squeeze(1)
+ elif self.is_input_patches:
+ # TODO: cleanup!
+ conditioning = self.transformer_blocks[0].norm1.emb(
+ timestep, class_labels, hidden_dtype=hidden_states.dtype
+ )
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, axis=1)
+ hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
+ hidden_states = self.proj_out_2(hidden_states)
# unpatchify
- if self.adaln_single is None:
- height = width = int(hidden_states.shape[1] ** 0.5)
+ height = width = int(hidden_states.shape[1] ** 0.5)
hidden_states = hidden_states.reshape(
(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
)
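
# --- Example (not part of the patch): a hedged sketch of the "unpatchify" step at the
# end of Transformer2DModel.forward for patch inputs: tokens of shape (B, h*w, p*p*C)
# are folded back into an image of shape (B, C, h*p, w*p). The helper name and the
# shapes below are illustrative only.
import paddle

def unpatchify(tokens, patch_size, out_channels):
    batch = tokens.shape[0]
    height = width = int(tokens.shape[1] ** 0.5)
    x = tokens.reshape((batch, height, width, patch_size, patch_size, out_channels))
    x = paddle.einsum("nhwpqc->nchpwq", x)  # (B, h, w, p, p, C) -> (B, C, h, p, w, p)
    return x.reshape((batch, out_channels, height * patch_size, width * patch_size))

# e.g. unpatchify(paddle.randn([2, 256, 2 * 2 * 4]), patch_size=2, out_channels=4).shape -> [2, 4, 32, 32]
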
diff --git a/ppdiffusers/ppdiffusers/models/transformer_temporal.py b/ppdiffusers/ppdiffusers/models/transformer_temporal.py
index 75e80f6ab..84847ffa5 100644
--- a/ppdiffusers/ppdiffusers/models/transformer_temporal.py
+++ b/ppdiffusers/ppdiffusers/models/transformer_temporal.py
@@ -14,7 +14,7 @@
# limitations under the License.
from dataclasses import dataclass
-from typing import Any, Dict, Optional
+from typing import Optional
import paddle
import paddle.nn as nn
@@ -50,21 +50,13 @@ class TransformerTemporalModel(ModelMixin, ConfigMixin):
num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
- attention_bias (`bool`, *optional*):
- Configure if the `TransformerBlock` attention should contain a bias parameter.
sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
This is fixed during training since it is used to learn a number of position embeddings.
- activation_fn (`str`, *optional*, defaults to `"geglu"`):
- Activation function to use in feed-forward. See `diffusers.models.activations.get_activation` for supported
- activation functions.
- norm_elementwise_affine (`bool`, *optional*):
- Configure if the `TransformerBlock` should use learnable elementwise affine parameters for normalization.
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
+ attention_bias (`bool`, *optional*):
+ Configure if the `TransformerBlock` attention should contain a bias parameter.
double_self_attention (`bool`, *optional*):
Configure if each `TransformerBlock` should contain two self-attention layers.
- positional_embeddings: (`str`, *optional*):
- The type of positional embeddings to apply to the sequence input before passing use.
- num_positional_embeddings: (`int`, *optional*):
- The maximum length of the sequence over which to apply positional embeddings.
"""
@register_to_config
@@ -83,19 +75,14 @@ def __init__(
activation_fn: str = "geglu",
norm_elementwise_affine: bool = True,
double_self_attention: bool = True,
- positional_embeddings: Optional[str] = None,
- num_positional_embeddings: Optional[int] = None,
):
super().__init__()
self.num_attention_heads = num_attention_heads
self.attention_head_dim = attention_head_dim
inner_dim = num_attention_heads * attention_head_dim
-
self.in_channels = in_channels
-
self.norm = nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, epsilon=1e-06)
self.proj_in = nn.Linear(in_channels, inner_dim)
-
# 3. Define transformers blocks
self.transformer_blocks = nn.LayerList(
[
@@ -109,25 +96,22 @@ def __init__(
attention_bias=attention_bias,
double_self_attention=double_self_attention,
norm_elementwise_affine=norm_elementwise_affine,
- positional_embeddings=positional_embeddings,
- num_positional_embeddings=num_positional_embeddings,
)
for d in range(num_layers)
]
)
-
self.proj_out = nn.Linear(inner_dim, in_channels)
def forward(
self,
- hidden_states: paddle.Tensor,
- encoder_hidden_states: Optional[paddle.Tensor] = None,
- timestep: Optional[paddle.Tensor] = None,
- class_labels: paddle.Tensor = None,
- num_frames: int = 1,
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+ hidden_states,
+ encoder_hidden_states=None,
+ timestep=None,
+ class_labels=None,
+ num_frames=1,
+ cross_attention_kwargs=None,
return_dict: bool = True,
- ) -> TransformerTemporalModelOutput:
+ ):
"""
The [`TransformerTemporal`] forward method.
@@ -142,12 +126,6 @@ def forward(
class_labels ( `paddle.Tensor` of shape `(batch size, num classes)`, *optional*):
Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
`AdaLayerZeroNorm`.
- num_frames (`int`, *optional*, defaults to 1):
- The number of frames to be processed per batch. This is used to reshape the hidden states.
- cross_attention_kwargs (`dict`, *optional*):
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
- `self.processor` in
- [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
tuple.
@@ -160,18 +138,14 @@ def forward(
# 1. Input
batch_frames, channel, height, width = hidden_states.shape
batch_size = batch_frames // num_frames
-
residual = hidden_states
-
hidden_states = hidden_states[None, :].reshape((batch_size, num_frames, channel, height, width))
hidden_states = hidden_states.transpose([0, 2, 1, 3, 4])
-
hidden_states = self.norm(hidden_states)
hidden_states = hidden_states.transpose([0, 3, 4, 2, 1]).reshape(
(batch_size * height * width, num_frames, channel)
)
hidden_states = self.proj_in(hidden_states)
-
# 2. Blocks
for block in self.transformer_blocks:
hidden_states = block(
@@ -181,7 +155,6 @@ def forward(
cross_attention_kwargs=cross_attention_kwargs,
class_labels=class_labels,
)
-
# 3. Output
hidden_states = self.proj_out(hidden_states)
hidden_states = (
@@ -190,10 +163,7 @@ def forward(
.transpose([0, 3, 4, 1, 2])
)
hidden_states = hidden_states.reshape((batch_frames, channel, height, width))
-
output = hidden_states + residual
-
if not return_dict:
return (output,)
-
return TransformerTemporalModelOutput(sample=output)
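
# --- Example (not part of the patch): an illustrative sketch of the reshape that
# TransformerTemporalModel.forward performs before its attention blocks: a
# (batch*frames, C, H, W) video batch is regrouped so attention runs over the frame
# axis at every spatial location, i.e. (batch*H*W, frames, C). The helper name is
# hypothetical; the group norm applied in the real forward is omitted.
import paddle

def frames_to_temporal_tokens(hidden_states, num_frames):
    batch_frames, channel, height, width = hidden_states.shape
    batch_size = batch_frames // num_frames
    x = hidden_states.reshape((batch_size, num_frames, channel, height, width))
    x = x.transpose([0, 3, 4, 1, 2])  # (B, F, C, H, W) -> (B, H, W, F, C)
    return x.reshape((batch_size * height * width, num_frames, channel))

# e.g. frames_to_temporal_tokens(paddle.randn([16, 320, 4, 4]), num_frames=8).shape -> [32, 8, 320]
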
diff --git a/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py
index 8ace1771a..efcce95db 100644
--- a/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py
+++ b/ppdiffusers/ppdiffusers/models/unet_1d_blocks.py
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
-from typing import Optional, Union
+from typing import Optional
import paddle
import paddle.nn.functional as F
@@ -27,17 +27,17 @@
class DownResnetBlock1D(nn.Layer):
def __init__(
self,
- in_channels: int,
- out_channels: Optional[int] = None,
- num_layers: int = 1,
- conv_shortcut: bool = False,
- temb_channels: int = 32,
- groups: int = 32,
- groups_out: Optional[int] = None,
- non_linearity: Optional[str] = None,
- time_embedding_norm: str = "default",
- output_scale_factor: float = 1.0,
- add_downsample: bool = True,
+ in_channels,
+ out_channels=None,
+ num_layers=1,
+ conv_shortcut=False,
+ temb_channels=32,
+ groups=32,
+ groups_out=None,
+ non_linearity=None,
+ time_embedding_norm="default",
+ output_scale_factor=1.0,
+ add_downsample=True,
):
super().__init__()
self.in_channels = in_channels
@@ -89,16 +89,16 @@ def forward(self, hidden_states, temb=None):
class UpResnetBlock1D(nn.Layer):
def __init__(
self,
- in_channels: int,
- out_channels: Optional[int] = None,
- num_layers: int = 1,
- temb_channels: int = 32,
- groups: int = 32,
- groups_out: Optional[int] = None,
- non_linearity: Optional[str] = None,
- time_embedding_norm: str = "default",
- output_scale_factor: float = 1.0,
- add_upsample: bool = True,
+ in_channels,
+ out_channels=None,
+ num_layers=1,
+ temb_channels=32,
+ groups=32,
+ groups_out=None,
+ non_linearity=None,
+ time_embedding_norm="default",
+ output_scale_factor=1.0,
+ add_upsample=True,
):
super().__init__()
self.in_channels = in_channels
@@ -169,13 +169,13 @@ def forward(self, x, temb=None):
class MidResTemporalBlock1D(nn.Layer):
def __init__(
self,
- in_channels: int,
- out_channels: int,
- embed_dim: int,
+ in_channels,
+ out_channels,
+ embed_dim,
num_layers: int = 1,
add_downsample: bool = False,
add_upsample: bool = False,
- non_linearity: Optional[str] = None,
+ non_linearity=None,
):
super().__init__()
self.in_channels = in_channels
@@ -220,7 +220,7 @@ def forward(self, hidden_states, temb):
class OutConv1DBlock(nn.Layer):
- def __init__(self, num_groups_out: int, out_channels: int, embed_dim: int, act_fn: str):
+ def __init__(self, num_groups_out, out_channels, embed_dim, act_fn):
super().__init__()
self.final_conv1d_1 = nn.Conv1D(embed_dim, embed_dim, 5, padding=2)
self.final_conv1d_gn = nn.GroupNorm(num_groups_out, embed_dim)
@@ -238,7 +238,7 @@ def forward(self, hidden_states, temb=None):
class OutValueFunctionBlock(nn.Layer):
- def __init__(self, fc_dim: int, embed_dim: int, act_fn: str = "mish"):
+ def __init__(self, fc_dim, embed_dim, act_fn="mish"):
super().__init__()
self.final_block = nn.LayerList(
[
@@ -278,7 +278,7 @@ def forward(self, hidden_states, temb):
class Downsample1d(nn.Layer):
- def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"):
+ def __init__(self, kernel="linear", pad_mode="reflect"):
super().__init__()
self.pad_mode = pad_mode
kernel_1d = paddle.to_tensor(_kernels[kernel])
@@ -296,7 +296,7 @@ def forward(self, hidden_states):
class Upsample1d(nn.Layer):
- def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"):
+ def __init__(self, kernel="linear", pad_mode="reflect"):
super().__init__()
self.pad_mode = pad_mode
kernel_1d = paddle.to_tensor(_kernels[kernel])
@@ -314,7 +314,7 @@ def forward(self, hidden_states, temb=None):
class SelfAttention1d(nn.Layer):
- def __init__(self, in_channels: int, n_head: int = 1, dropout_rate: float = 0.0):
+ def __init__(self, in_channels, n_head=1, dropout_rate=0.0):
super().__init__()
self.channels = in_channels
self.group_norm = nn.GroupNorm(1, num_channels=in_channels)
@@ -333,12 +333,6 @@ def __init__(self, in_channels: int, n_head: int = 1, dropout_rate: float = 0.0)
self._use_memory_efficient_attention_xformers = False
self._attention_op = None
- def transpose_for_scores(self, projection: paddle.Tensor) -> paddle.Tensor:
- new_projection_shape = projection.shape[:-1] + (self.num_heads, -1)
- # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
- new_projection = projection.reshape(new_projection_shape).transpose([0, 2, 1, 3])
- return new_projection
-
def reshape_heads_to_batch_dim(self, tensor, transpose=True):
tensor = tensor.reshape([0, 0, self.num_heads, self.head_size])
if transpose:
@@ -378,7 +372,6 @@ def set_use_memory_efficient_attention_xformers(
def forward(self, hidden_states):
residual = hidden_states
- # batch, channel_dim, seq = hidden_states.shape
hidden_states = self.group_norm(hidden_states)
hidden_states = hidden_states.transpose([0, 2, 1])
@@ -610,7 +603,7 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
class UpBlock1D(nn.Layer):
- def __init__(self, in_channels: int, out_channels: int, mid_channels: Optional[int] = None):
+ def __init__(self, in_channels, out_channels, mid_channels=None):
super().__init__()
mid_channels = in_channels if mid_channels is None else mid_channels
@@ -658,20 +651,7 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
return hidden_states
-DownBlockType = Union[DownResnetBlock1D, DownBlock1D, AttnDownBlock1D, DownBlock1DNoSkip]
-MidBlockType = Union[MidResTemporalBlock1D, ValueFunctionMidBlock1D, UNetMidBlock1D]
-OutBlockType = Union[OutConv1DBlock, OutValueFunctionBlock]
-UpBlockType = Union[UpResnetBlock1D, UpBlock1D, AttnUpBlock1D, UpBlock1DNoSkip]
-
-
-def get_down_block(
- down_block_type: str,
- num_layers: int,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- add_downsample: bool,
-) -> DownBlockType:
+def get_down_block(down_block_type, num_layers, in_channels, out_channels, temb_channels, add_downsample):
if down_block_type == "DownResnetBlock1D":
return DownResnetBlock1D(
in_channels=in_channels,
@@ -689,9 +669,7 @@ def get_down_block(
raise ValueError(f"{down_block_type} does not exist.")
-def get_up_block(
- up_block_type: str, num_layers: int, in_channels: int, out_channels: int, temb_channels: int, add_upsample: bool
-) -> UpBlockType:
+def get_up_block(up_block_type, num_layers, in_channels, out_channels, temb_channels, add_upsample):
if up_block_type == "UpResnetBlock1D":
return UpResnetBlock1D(
in_channels=in_channels,
@@ -709,15 +687,7 @@ def get_up_block(
raise ValueError(f"{up_block_type} does not exist.")
-def get_mid_block(
- mid_block_type: str,
- num_layers: int,
- in_channels: int,
- mid_channels: int,
- out_channels: int,
- embed_dim: int,
- add_downsample: bool,
-) -> MidBlockType:
+def get_mid_block(mid_block_type, num_layers, in_channels, mid_channels, out_channels, embed_dim, add_downsample):
if mid_block_type == "MidResTemporalBlock1D":
return MidResTemporalBlock1D(
num_layers=num_layers,
@@ -733,9 +703,7 @@ def get_mid_block(
raise ValueError(f"{mid_block_type} does not exist.")
-def get_out_block(
- *, out_block_type: str, num_groups_out: int, embed_dim: int, out_channels: int, act_fn: str, fc_dim: int
-) -> Optional[OutBlockType]:
+def get_out_block(*, out_block_type, num_groups_out, embed_dim, out_channels, act_fn, fc_dim):
if out_block_type == "OutConv1DBlock":
return OutConv1DBlock(num_groups_out, out_channels, embed_dim, act_fn)
elif out_block_type == "ValueFunction":
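The three 1D factories above keep the same call pattern after losing their annotations; only the signatures are slimmer. A minimal sketch of wiring them together, assuming the module path `ppdiffusers.models.unet_1d_blocks`; the block-type strings match the branches shown above, while every numeric value is an illustrative placeholder rather than something taken from this diff:

    # Sketch only: illustrative sizes, not values from this change.
    from ppdiffusers.models.unet_1d_blocks import (
        get_down_block,
        get_mid_block,
        get_out_block,
    )

    down = get_down_block(
        "DownResnetBlock1D", num_layers=2, in_channels=32,
        out_channels=64, temb_channels=128, add_downsample=True,
    )
    mid = get_mid_block(
        "MidResTemporalBlock1D", num_layers=1, in_channels=64,
        mid_channels=64, out_channels=64, embed_dim=128, add_downsample=False,
    )
    out = get_out_block(
        out_block_type="OutConv1DBlock", num_groups_out=8, embed_dim=128,
        out_channels=14, act_fn="mish", fc_dim=64,
    )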
diff --git a/ppdiffusers/ppdiffusers/models/unet_2d.py b/ppdiffusers/ppdiffusers/models/unet_2d.py
index 6be3582e8..b7eb75f02 100644
--- a/ppdiffusers/ppdiffusers/models/unet_2d.py
+++ b/ppdiffusers/ppdiffusers/models/unet_2d.py
@@ -72,14 +72,9 @@ class UNet2DModel(ModelMixin, ConfigMixin):
The downsample type for downsampling layers. Choose between "conv" and "resnet"
upsample_type (`str`, *optional*, defaults to `conv`):
The upsample type for upsampling layers. Choose between "conv" and "resnet"
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension.
norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for normalization.
- attn_norm_num_groups (`int`, *optional*, defaults to `None`):
- If set to an integer, a group norm layer will be created in the mid block's [`Attention`] layer with the
- given number of groups. If left as `None`, the group norm layer will only be created if
- `resnet_time_scale_shift` is set to `default`, and if created will have `norm_num_groups` groups.
norm_eps (`float`, *optional*, defaults to `1e-5`): The epsilon for normalization.
resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
@@ -109,17 +104,14 @@ def __init__(
downsample_padding: int = 1,
downsample_type: str = "conv",
upsample_type: str = "conv",
- dropout: float = 0.0,
act_fn: str = "silu",
attention_head_dim: Optional[int] = 8,
norm_num_groups: int = 32,
- attn_norm_num_groups: Optional[int] = None,
norm_eps: float = 1e-5,
resnet_time_scale_shift: str = "default",
add_attention: bool = True,
class_embed_type: Optional[str] = None,
num_class_embeds: Optional[int] = None,
- num_train_timesteps: Optional[int] = None,
resnet_pre_temb_non_linearity: Optional[bool] = False,
):
super().__init__()
@@ -148,9 +140,6 @@ def __init__(
elif time_embedding_type == "positional":
self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
timestep_input_dim = block_out_channels[0]
- elif time_embedding_type == "learned":
- self.time_proj = nn.Embedding(num_train_timesteps, block_out_channels[0])
- timestep_input_dim = block_out_channels[0]
self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
@@ -194,7 +183,6 @@ def __init__(
downsample_padding=downsample_padding,
resnet_time_scale_shift=resnet_time_scale_shift,
downsample_type=downsample_type,
- dropout=dropout,
resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
)
self.down_blocks.append(down_block)
@@ -203,14 +191,12 @@ def __init__(
self.mid_block = UNetMidBlock2D(
in_channels=block_out_channels[-1],
temb_channels=time_embed_dim,
- dropout=dropout,
resnet_eps=norm_eps,
resnet_act_fn=act_fn,
output_scale_factor=mid_block_scale_factor,
resnet_time_scale_shift=resnet_time_scale_shift,
attention_head_dim=attention_head_dim if attention_head_dim is not None else block_out_channels[-1],
resnet_groups=norm_num_groups,
- attn_groups=attn_norm_num_groups,
add_attention=add_attention,
resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
)
@@ -239,7 +225,6 @@ def __init__(
attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel,
resnet_time_scale_shift=resnet_time_scale_shift,
upsample_type=upsample_type,
- dropout=dropout,
resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
)
self.up_blocks.append(up_block)
@@ -319,8 +304,6 @@ def forward(
class_labels = class_labels.cast(paddle.int64)
class_emb = self.class_embedding(class_labels).cast(sample.dtype)
emb = emb + class_emb
- elif self.class_embedding is None and class_labels is not None:
- raise ValueError("class_embedding needs to be initialized in order to use class conditioning")
# 2. pre-process
skip_sample = sample
@@ -365,7 +348,7 @@ def forward(
sample += skip_sample
if self.config.time_embedding_type == "fourier":
- timesteps = timesteps.reshape((sample.shape[0], *([1] * len(sample.shape[1:]))))
+ timesteps = timesteps.reshape([sample.shape[0], *([1] * len(sample.shape[1:]))])
sample = sample / timesteps
if not return_dict:
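After the `unet_2d.py` hunks, `UNet2DModel` no longer accepts `dropout`, `attn_norm_num_groups`, or `num_train_timesteps`, and the `"learned"` time-embedding branch is gone. A minimal construction sketch under those constraints; the sizes and block types below are illustrative assumptions, not values taken from this diff:

    from ppdiffusers import UNet2DModel

    # Illustrative config restricted to the arguments this diff keeps.
    model = UNet2DModel(
        sample_size=32,
        in_channels=3,
        out_channels=3,
        layers_per_block=2,
        block_out_channels=(64, 128, 128),
        down_block_types=("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"),
        up_block_types=("AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"),
        time_embedding_type="positional",  # "learned" is no longer available here
    )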
diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py
index 86ebdd19c..2f75a2607 100644
--- a/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py
+++ b/ppdiffusers/ppdiffusers/models/unet_2d_blocks.py
@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Tuple
import numpy as np
import paddle
@@ -20,15 +20,13 @@
from paddle.distributed.fleet.utils import recompute
from ..utils import is_ppxformers_available, logging
-from ..utils.paddle_utils import apply_freeu
-from .activations import get_activation
+from .attention import AdaGroupNorm
from .attention_processor import (
Attention,
AttnAddedKVProcessor,
AttnAddedKVProcessor2_5,
)
from .dual_transformer_2d import DualTransformer2DModel
-from .normalization import AdaGroupNorm
from .resnet import (
Downsample2D,
FirDownsample2D,
@@ -44,31 +42,29 @@
def get_down_block(
- down_block_type: str,
- num_layers: int,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- add_downsample: bool,
- resnet_eps: float,
- resnet_act_fn: str,
- transformer_layers_per_block: int = 1,
- num_attention_heads: Optional[int] = None,
- resnet_groups: Optional[int] = None,
- cross_attention_dim: Optional[int] = None,
- downsample_padding: Optional[int] = None,
- dual_cross_attention: bool = False,
- use_linear_projection: bool = False,
- only_cross_attention: bool = False,
- upcast_attention: bool = False,
- resnet_time_scale_shift: str = "default",
- attention_type: str = "default",
- resnet_skip_time_act: bool = False,
- resnet_out_scale_factor: float = 1.0,
- cross_attention_norm: Optional[str] = None,
- attention_head_dim: Optional[int] = None,
- downsample_type: Optional[str] = None,
- dropout: float = 0.0,
+ down_block_type,
+ num_layers,
+ in_channels,
+ out_channels,
+ temb_channels,
+ add_downsample,
+ resnet_eps,
+ resnet_act_fn,
+ transformer_layers_per_block=1,
+ num_attention_heads=None,
+ resnet_groups=None,
+ cross_attention_dim=None,
+ downsample_padding=None,
+ dual_cross_attention=False,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ upcast_attention=False,
+ resnet_time_scale_shift="default",
+ resnet_skip_time_act=False,
+ resnet_out_scale_factor=1.0,
+ cross_attention_norm=None,
+ attention_head_dim=None,
+ downsample_type=None,
resnet_pre_temb_non_linearity=False,
):
# If attn head dim is not defined, we default it to the number of heads
@@ -85,7 +81,6 @@ def get_down_block(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
- dropout=dropout,
add_downsample=add_downsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -100,7 +95,6 @@ def get_down_block(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
- dropout=dropout,
add_downsample=add_downsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -120,7 +114,6 @@ def get_down_block(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
- dropout=dropout,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
resnet_groups=resnet_groups,
@@ -139,7 +132,6 @@ def get_down_block(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
- dropout=dropout,
add_downsample=add_downsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -152,7 +144,6 @@ def get_down_block(
only_cross_attention=only_cross_attention,
upcast_attention=upcast_attention,
resnet_time_scale_shift=resnet_time_scale_shift,
- attention_type=attention_type,
resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
)
elif down_block_type == "SimpleCrossAttnDownBlock2D":
@@ -163,7 +154,6 @@ def get_down_block(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
- dropout=dropout,
add_downsample=add_downsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -183,7 +173,6 @@ def get_down_block(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
- dropout=dropout,
add_downsample=add_downsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -197,7 +186,6 @@ def get_down_block(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
- dropout=dropout,
add_downsample=add_downsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -210,7 +198,6 @@ def get_down_block(
num_layers=num_layers,
in_channels=in_channels,
out_channels=out_channels,
- dropout=dropout,
add_downsample=add_downsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -224,7 +211,6 @@ def get_down_block(
num_layers=num_layers,
in_channels=in_channels,
out_channels=out_channels,
- dropout=dropout,
add_downsample=add_downsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -240,7 +226,6 @@ def get_down_block(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
- dropout=dropout,
add_downsample=add_downsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -252,7 +237,6 @@ def get_down_block(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
- dropout=dropout,
add_downsample=add_downsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -265,32 +249,29 @@ def get_down_block(
def get_up_block(
- up_block_type: str,
- num_layers: int,
- in_channels: int,
- out_channels: int,
- prev_output_channel: int,
- temb_channels: int,
- add_upsample: bool,
- resnet_eps: float,
- resnet_act_fn: str,
- resolution_idx: Optional[int] = None,
- transformer_layers_per_block: int = 1,
- num_attention_heads: Optional[int] = None,
- resnet_groups: Optional[int] = None,
- cross_attention_dim: Optional[int] = None,
- dual_cross_attention: bool = False,
- use_linear_projection: bool = False,
- only_cross_attention: bool = False,
- upcast_attention: bool = False,
- resnet_time_scale_shift: str = "default",
- attention_type: str = "default",
- resnet_skip_time_act: bool = False,
- resnet_out_scale_factor: float = 1.0,
- cross_attention_norm: Optional[str] = None,
- attention_head_dim: Optional[int] = None,
- upsample_type: Optional[str] = None,
- dropout: float = 0.0,
+ up_block_type,
+ num_layers,
+ in_channels,
+ out_channels,
+ prev_output_channel,
+ temb_channels,
+ add_upsample,
+ resnet_eps,
+ resnet_act_fn,
+ transformer_layers_per_block=1,
+ num_attention_heads=None,
+ resnet_groups=None,
+ cross_attention_dim=None,
+ dual_cross_attention=False,
+ use_linear_projection=False,
+ only_cross_attention=False,
+ upcast_attention=False,
+ resnet_time_scale_shift="default",
+ resnet_skip_time_act=False,
+ resnet_out_scale_factor=1.0,
+ cross_attention_norm=None,
+ attention_head_dim=None,
+ upsample_type=None,
resnet_pre_temb_non_linearity=False,
):
# If attn head dim is not defined, we default it to the number of heads
@@ -308,8 +289,6 @@ def get_up_block(
out_channels=out_channels,
prev_output_channel=prev_output_channel,
temb_channels=temb_channels,
- resolution_idx=resolution_idx,
- dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -324,8 +303,6 @@ def get_up_block(
out_channels=out_channels,
prev_output_channel=prev_output_channel,
temb_channels=temb_channels,
- resolution_idx=resolution_idx,
- dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -345,8 +322,6 @@ def get_up_block(
out_channels=out_channels,
prev_output_channel=prev_output_channel,
temb_channels=temb_channels,
- resolution_idx=resolution_idx,
- dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -358,7 +333,6 @@ def get_up_block(
only_cross_attention=only_cross_attention,
upcast_attention=upcast_attention,
resnet_time_scale_shift=resnet_time_scale_shift,
- attention_type=attention_type,
resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
)
elif up_block_type == "SimpleCrossAttnUpBlock2D":
@@ -370,8 +344,6 @@ def get_up_block(
out_channels=out_channels,
prev_output_channel=prev_output_channel,
temb_channels=temb_channels,
- resolution_idx=resolution_idx,
- dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -397,8 +369,6 @@ def get_up_block(
out_channels=out_channels,
prev_output_channel=prev_output_channel,
temb_channels=temb_channels,
- resolution_idx=resolution_idx,
- dropout=dropout,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
resnet_groups=resnet_groups,
@@ -414,8 +384,6 @@ def get_up_block(
out_channels=out_channels,
prev_output_channel=prev_output_channel,
temb_channels=temb_channels,
- resolution_idx=resolution_idx,
- dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -429,8 +397,6 @@ def get_up_block(
out_channels=out_channels,
prev_output_channel=prev_output_channel,
temb_channels=temb_channels,
- resolution_idx=resolution_idx,
- dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -443,8 +409,6 @@ def get_up_block(
num_layers=num_layers,
in_channels=in_channels,
out_channels=out_channels,
- resolution_idx=resolution_idx,
- dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -458,8 +422,6 @@ def get_up_block(
num_layers=num_layers,
in_channels=in_channels,
out_channels=out_channels,
- resolution_idx=resolution_idx,
- dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -475,8 +437,6 @@ def get_up_block(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
- resolution_idx=resolution_idx,
- dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -488,8 +448,6 @@ def get_up_block(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
- resolution_idx=resolution_idx,
- dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -501,74 +459,7 @@ def get_up_block(
raise ValueError(f"{up_block_type} does not exist.")
-class AutoencoderTinyBlock(nn.Layer):
- """
- Tiny Autoencoder block used in [`AutoencoderTiny`]. It is a mini residual module consisting of plain conv + ReLU
- blocks.
-
- Args:
- in_channels (`int`): The number of input channels.
- out_channels (`int`): The number of output channels.
- act_fn (`str`):
- ` The activation function to use. Supported values are `"swish"`, `"mish"`, `"gelu"`, and `"relu"`.
-
- Returns:
- `paddle.Tensor`: A tensor with the same shape as the input tensor, but with the number of channels equal to
- `out_channels`.
- """
-
- def __init__(self, in_channels: int, out_channels: int, act_fn: str):
- super().__init__()
- act_fn = get_activation(act_fn)
- self.conv = nn.Sequential(
- nn.Conv2D(in_channels, out_channels, kernel_size=3, padding=1),
- act_fn,
- nn.Conv2D(out_channels, out_channels, kernel_size=3, padding=1),
- act_fn,
- nn.Conv2D(out_channels, out_channels, kernel_size=3, padding=1),
- )
- self.skip = (
- nn.Conv2D(in_channels, out_channels, kernel_size=1, bias_attr=False)
- if in_channels != out_channels
- else nn.Identity()
- )
- self.fuse = nn.ReLU()
-
- def forward(self, x):
- return self.fuse(self.conv(x) + self.skip(x))
-
-
class UNetMidBlock2D(nn.Layer):
- """
- A 2D UNet mid-block [`UNetMidBlock2D`] with multiple residual blocks and optional attention blocks.
-
- Args:
- in_channels (`int`): The number of input channels.
- temb_channels (`int`): The number of temporal embedding channels.
- dropout (`float`, *optional*, defaults to 0.0): The dropout rate.
- num_layers (`int`, *optional*, defaults to 1): The number of residual blocks.
- resnet_eps (`float`, *optional*, 1e-6 ): The epsilon value for the resnet blocks.
- resnet_time_scale_shift (`str`, *optional*, defaults to `default`):
- The type of normalization to apply to the time embeddings. This can help to improve the performance of the
- model on tasks with long-range temporal dependencies.
- resnet_act_fn (`str`, *optional*, defaults to `swish`): The activation function for the resnet blocks.
- resnet_groups (`int`, *optional*, defaults to 32):
- The number of groups to use in the group normalization layers of the resnet blocks.
- attn_groups (`Optional[int]`, *optional*, defaults to None): The number of groups for the attention blocks.
- resnet_pre_norm (`bool`, *optional*, defaults to `True`):
- Whether to use pre-normalization for the resnet blocks.
- add_attention (`bool`, *optional*, defaults to `True`): Whether to add attention blocks.
- attention_head_dim (`int`, *optional*, defaults to 1):
- Dimension of a single attention head. The number of attention heads is determined based on this value and
- the number of input channels.
- output_scale_factor (`float`, *optional*, defaults to 1.0): The output scale factor.
-
- Returns:
- `paddle.Tensor`: The output of the last residual block, which is a tensor of shape `(batch_size,
- in_channels, height, width)`.
-
- """
-
def __init__(
self,
in_channels: int,
@@ -579,7 +470,6 @@ def __init__(
resnet_time_scale_shift: str = "default", # default, spatial
resnet_act_fn: str = "swish",
resnet_groups: int = 32,
- attn_groups: Optional[int] = None,
resnet_pre_norm: bool = True,
add_attention: bool = True,
attention_head_dim: int = 1,
@@ -590,9 +480,6 @@ def __init__(
resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
self.add_attention = add_attention
- if attn_groups is None:
- attn_groups = resnet_groups if resnet_time_scale_shift == "default" else None
-
# there is always at least one resnet
resnets = [
ResnetBlock2D(
@@ -626,7 +513,7 @@ def __init__(
dim_head=attention_head_dim,
rescale_output_factor=output_scale_factor,
eps=resnet_eps,
- norm_num_groups=attn_groups,
+ norm_num_groups=resnet_groups if resnet_time_scale_shift == "default" else None,
spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None,
residual_connection=True,
bias=True,
@@ -673,7 +560,7 @@ def __init__(
temb_channels: int,
dropout: float = 0.0,
num_layers: int = 1,
- transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+ transformer_layers_per_block: int = 1,
resnet_eps: float = 1e-6,
resnet_time_scale_shift: str = "default",
resnet_act_fn: str = "swish",
@@ -685,7 +572,6 @@ def __init__(
dual_cross_attention: bool = False,
use_linear_projection: bool = False,
upcast_attention: bool = False,
- attention_type: str = "default",
resnet_pre_temb_non_linearity: bool = False,
):
super().__init__()
@@ -694,10 +580,6 @@ def __init__(
self.num_attention_heads = num_attention_heads
resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
- # support for variable transformer layers per block
- if isinstance(transformer_layers_per_block, int):
- transformer_layers_per_block = [transformer_layers_per_block] * num_layers
-
# there is always at least one resnet
resnets = [
ResnetBlock2D(
@@ -716,19 +598,18 @@ def __init__(
]
attentions = []
- for i in range(num_layers):
+ for _ in range(num_layers):
if not dual_cross_attention:
attentions.append(
Transformer2DModel(
num_attention_heads,
in_channels // num_attention_heads,
in_channels=in_channels,
- num_layers=transformer_layers_per_block[i],
+ num_layers=transformer_layers_per_block,
cross_attention_dim=cross_attention_dim,
norm_num_groups=resnet_groups,
use_linear_projection=use_linear_projection,
upcast_attention=upcast_attention,
- attention_type=attention_type,
)
)
else:
@@ -761,8 +642,6 @@ def __init__(
self.attentions = nn.LayerList(attentions)
self.resnets = nn.LayerList(resnets)
- self.gradient_checkpointing = False
-
def forward(
self,
hidden_states: paddle.Tensor,
@@ -772,39 +651,17 @@ def forward(
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
encoder_attention_mask: Optional[paddle.Tensor] = None,
) -> paddle.Tensor:
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
hidden_states = self.resnets[0](hidden_states, temb)
for attn, resnet in zip(self.attentions, self.resnets[1:]):
- if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
-
- def create_custom_forward(module, return_dict=None):
- def custom_forward(*inputs):
- if return_dict is not None:
- return module(*inputs, return_dict=return_dict)[0] # move [0] when paddlepaddle <= 2.4.1
- else:
- return module(*inputs)
-
- return custom_forward
-
- hidden_states = recompute(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- attention_mask,
- encoder_attention_mask,
- ) # [0]
- hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
- else:
- hidden_states = attn(
- hidden_states,
- encoder_hidden_states=encoder_hidden_states,
- cross_attention_kwargs=cross_attention_kwargs,
- attention_mask=attention_mask,
- encoder_attention_mask=encoder_attention_mask,
- return_dict=False,
- )[0]
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+ hidden_states = attn(
+ hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ cross_attention_kwargs=cross_attention_kwargs,
+ attention_mask=attention_mask,
+ encoder_attention_mask=encoder_attention_mask,
+ return_dict=False,
+ )[0]
+ hidden_states = resnet(hidden_states, temb)
return hidden_states
@@ -824,9 +681,9 @@ def __init__(
attention_head_dim: int = 1,
output_scale_factor: float = 1.0,
cross_attention_dim: int = 1280,
- skip_time_act: bool = False,
- only_cross_attention: bool = False,
- cross_attention_norm: Optional[str] = None,
+ skip_time_act=False,
+ only_cross_attention=False,
+ cross_attention_norm=None,
resnet_pre_temb_non_linearity: bool = False,
):
super().__init__()
@@ -903,10 +760,8 @@ def forward(
attention_mask: Optional[paddle.Tensor] = None,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
encoder_attention_mask: Optional[paddle.Tensor] = None,
- ) -> paddle.Tensor:
+ ):
cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
- lora_scale = cross_attention_kwargs.get("scale", 1.0)
-
if attention_mask is None:
# if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
mask = None if encoder_hidden_states is None else encoder_attention_mask
@@ -918,7 +773,7 @@ def forward(
# mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask
mask = attention_mask
- hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
+ hidden_states = self.resnets[0](hidden_states, temb)
for attn, resnet in zip(self.attentions, self.resnets[1:]):
# attn
hidden_states = attn(
@@ -929,7 +784,7 @@ def forward(
)
# resnet
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+ hidden_states = resnet(hidden_states, temb)
return hidden_states
@@ -950,7 +805,7 @@ def __init__(
attention_head_dim: int = 1,
output_scale_factor: float = 1.0,
downsample_padding: int = 1,
- downsample_type: str = "conv",
+ downsample_type="conv",
resnet_pre_temb_non_linearity: bool = False,
):
super().__init__()
@@ -1028,31 +883,20 @@ def __init__(
else:
self.downsamplers = None
- def forward(
- self,
- hidden_states: paddle.Tensor,
- temb: Optional[paddle.Tensor] = None,
- upsample_size: Optional[int] = None,
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
- ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...]]:
- cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
-
- lora_scale = cross_attention_kwargs.get("scale", 1.0)
-
+ def forward(self, hidden_states, temb=None, upsample_size=None):
output_states = ()
for resnet, attn in zip(self.resnets, self.attentions):
- cross_attention_kwargs.update({"scale": lora_scale})
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
- hidden_states = attn(hidden_states, **cross_attention_kwargs)
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = attn(hidden_states)
output_states = output_states + (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
if self.downsample_type == "resnet":
- hidden_states = downsampler(hidden_states, temb=temb, scale=lora_scale)
+ hidden_states = downsampler(hidden_states, temb=temb)
else:
- hidden_states = downsampler(hidden_states, scale=lora_scale)
+ hidden_states = downsampler(hidden_states)
output_states += (hidden_states,)
@@ -1067,7 +911,7 @@ def __init__(
temb_channels: int,
dropout: float = 0.0,
num_layers: int = 1,
- transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+ transformer_layers_per_block: int = 1,
resnet_eps: float = 1e-6,
resnet_time_scale_shift: str = "default",
resnet_act_fn: str = "swish",
@@ -1082,7 +926,6 @@ def __init__(
use_linear_projection: bool = False,
only_cross_attention: bool = False,
upcast_attention: bool = False,
- attention_type: str = "default",
resnet_pre_temb_non_linearity: bool = False,
):
super().__init__()
@@ -1091,8 +934,6 @@ def __init__(
self.has_cross_attention = True
self.num_attention_heads = num_attention_heads
- if isinstance(transformer_layers_per_block, int):
- transformer_layers_per_block = [transformer_layers_per_block] * num_layers
for i in range(num_layers):
in_channels = in_channels if i == 0 else out_channels
@@ -1117,13 +958,12 @@ def __init__(
num_attention_heads,
out_channels // num_attention_heads,
in_channels=out_channels,
- num_layers=transformer_layers_per_block[i],
+ num_layers=transformer_layers_per_block,
cross_attention_dim=cross_attention_dim,
norm_num_groups=resnet_groups,
use_linear_projection=use_linear_projection,
only_cross_attention=only_cross_attention,
upcast_attention=upcast_attention,
- attention_type=attention_type,
)
)
else:
@@ -1161,12 +1001,11 @@ def forward(
attention_mask: Optional[paddle.Tensor] = None,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
encoder_attention_mask: Optional[paddle.Tensor] = None,
- additional_residuals: Optional[paddle.Tensor] = None,
- ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...]]:
+ additional_residuals=None,
+ ):
+ # TODO(Patrick, William) - attention mask is not used
output_states = ()
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
-
blocks = list(zip(self.resnets, self.attentions))
for i, (resnet, attn) in enumerate(blocks):
@@ -1193,7 +1032,7 @@ def custom_forward(*inputs):
encoder_attention_mask,
) # [0]
else:
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+ hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
@@ -1218,7 +1057,7 @@ def custom_forward(*inputs):
if self.downsamplers is not None:
for downsampler in self.downsamplers:
- hidden_states = downsampler(hidden_states, scale=lora_scale)
+ hidden_states = downsampler(hidden_states)
output_states = output_states + (hidden_states,)
@@ -1279,9 +1118,7 @@ def __init__(
self.gradient_checkpointing = False
- def forward(
- self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None, scale: float = 1.0
- ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...]]:
+ def forward(self, hidden_states, temb=None):
output_states = ()
for resnet in self.resnets:
@@ -1295,13 +1132,13 @@ def custom_forward(*inputs):
hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
else:
- hidden_states = resnet(hidden_states, temb, scale=scale)
+ hidden_states = resnet(hidden_states, temb)
output_states = output_states + (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
- hidden_states = downsampler(hidden_states, scale=scale)
+ hidden_states = downsampler(hidden_states)
output_states = output_states + (hidden_states,)
@@ -1359,13 +1196,13 @@ def __init__(
else:
self.downsamplers = None
- def forward(self, hidden_states: paddle.Tensor, scale: float = 1.0) -> paddle.Tensor:
+ def forward(self, hidden_states):
for resnet in self.resnets:
- hidden_states = resnet(hidden_states, temb=None, scale=scale)
+ hidden_states = resnet(hidden_states, temb=None)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
- hidden_states = downsampler(hidden_states, scale)
+ hidden_states = downsampler(hidden_states)
return hidden_states
@@ -1444,15 +1281,14 @@ def __init__(
else:
self.downsamplers = None
- def forward(self, hidden_states: paddle.Tensor, scale: float = 1.0) -> paddle.Tensor:
+ def forward(self, hidden_states):
for resnet, attn in zip(self.resnets, self.attentions):
- hidden_states = resnet(hidden_states, temb=None, scale=scale)
- cross_attention_kwargs = {"scale": scale}
- hidden_states = attn(hidden_states, **cross_attention_kwargs)
+ hidden_states = resnet(hidden_states, temb=None)
+ hidden_states = attn(hidden_states)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
- hidden_states = downsampler(hidden_states, scale)
+ hidden_states = downsampler(hidden_states)
return hidden_states
@@ -1540,23 +1376,16 @@ def __init__(
self.downsamplers = None
self.skip_conv = None
- def forward(
- self,
- hidden_states: paddle.Tensor,
- temb: Optional[paddle.Tensor] = None,
- skip_sample: Optional[paddle.Tensor] = None,
- scale: float = 1.0,
- ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...], paddle.Tensor]:
+ def forward(self, hidden_states, temb=None, skip_sample=None):
output_states = ()
for resnet, attn in zip(self.resnets, self.attentions):
- hidden_states = resnet(hidden_states, temb, scale=scale)
- cross_attention_kwargs = {"scale": scale}
- hidden_states = attn(hidden_states, **cross_attention_kwargs)
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = attn(hidden_states)
output_states += (hidden_states,)
if self.downsamplers is not None:
- hidden_states = self.resnet_down(hidden_states, temb, scale=scale)
+ hidden_states = self.resnet_down(hidden_states, temb)
for downsampler in self.downsamplers:
skip_sample = downsampler(skip_sample)
@@ -1630,21 +1459,15 @@ def __init__(
self.downsamplers = None
self.skip_conv = None
- def forward(
- self,
- hidden_states: paddle.Tensor,
- temb: Optional[paddle.Tensor] = None,
- skip_sample: Optional[paddle.Tensor] = None,
- scale: float = 1.0,
- ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...], paddle.Tensor]:
+ def forward(self, hidden_states, temb=None, skip_sample=None):
output_states = ()
for resnet in self.resnets:
- hidden_states = resnet(hidden_states, temb, scale)
+ hidden_states = resnet(hidden_states, temb)
output_states += (hidden_states,)
if self.downsamplers is not None:
- hidden_states = self.resnet_down(hidden_states, temb, scale)
+ hidden_states = self.resnet_down(hidden_states, temb)
for downsampler in self.downsamplers:
skip_sample = downsampler(skip_sample)
@@ -1722,9 +1545,7 @@ def __init__(
self.gradient_checkpointing = False
- def forward(
- self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None, scale: float = 1.0
- ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...]]:
+ def forward(self, hidden_states, temb=None):
output_states = ()
for resnet in self.resnets:
@@ -1738,13 +1559,13 @@ def custom_forward(*inputs):
hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
else:
- hidden_states = resnet(hidden_states, temb, scale)
+ hidden_states = resnet(hidden_states, temb)
output_states = output_states + (hidden_states,)
if self.downsamplers is not None:
for downsampler in self.downsamplers:
- hidden_states = downsampler(hidden_states, temb, scale)
+ hidden_states = downsampler(hidden_states, temb)
output_states = output_states + (hidden_states,)
@@ -1768,9 +1589,9 @@ def __init__(
cross_attention_dim: int = 1280,
output_scale_factor: float = 1.0,
add_downsample: bool = True,
- skip_time_act: bool = False,
- only_cross_attention: bool = False,
- cross_attention_norm: Optional[str] = None,
+ skip_time_act=False,
+ only_cross_attention=False,
+ cross_attention_norm=None,
resnet_pre_temb_non_linearity: bool = False,
):
super().__init__()
@@ -1857,8 +1678,6 @@ def forward(
output_states = ()
cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
- lora_scale = cross_attention_kwargs.get("scale", 1.0)
-
if attention_mask is None:
# if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
mask = None if encoder_hidden_states is None else encoder_attention_mask
@@ -1891,7 +1710,7 @@ def custom_forward(*inputs):
cross_attention_kwargs,
)
else:
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+ hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
hidden_states,
@@ -1904,7 +1723,7 @@ def custom_forward(*inputs):
if self.downsamplers is not None:
for downsampler in self.downsamplers:
- hidden_states = downsampler(hidden_states, temb, scale=lora_scale)
+ hidden_states = downsampler(hidden_states, temb)
output_states = output_states + (hidden_states,)
@@ -1959,9 +1778,7 @@ def __init__(
self.gradient_checkpointing = False
- def forward(
- self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None, scale: float = 1.0
- ) -> Tuple[paddle.Tensor, Tuple[paddle.Tensor, ...]]:
+ def forward(self, hidden_states, temb=None):
output_states = ()
for resnet in self.resnets:
@@ -1975,7 +1792,7 @@ def custom_forward(*inputs):
hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
else:
- hidden_states = resnet(hidden_states, temb, scale)
+ hidden_states = resnet(hidden_states, temb)
output_states += (hidden_states,)
@@ -1996,7 +1813,7 @@ def __init__(
dropout: float = 0.0,
num_layers: int = 4,
resnet_group_size: int = 32,
- add_downsample: bool = True,
+ add_downsample=True,
attention_head_dim: int = 64,
add_self_attention: bool = False,
resnet_eps: float = 1e-5,
@@ -2063,7 +1880,6 @@ def forward(
encoder_attention_mask: Optional[paddle.Tensor] = None,
):
output_states = ()
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
for resnet, attn in zip(self.resnets, self.attentions):
if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
@@ -2088,7 +1904,7 @@ def custom_forward(*inputs):
encoder_attention_mask,
)
else:
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+ hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
@@ -2117,7 +1933,6 @@ def __init__(
prev_output_channel: int,
out_channels: int,
temb_channels: int,
- resolution_idx: int = None,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2127,7 +1942,7 @@ def __init__(
resnet_pre_norm: bool = True,
attention_head_dim: int = 1,
output_scale_factor: float = 1.0,
- upsample_type: str = "conv",
+ upsample_type="conv",
resnet_pre_temb_non_linearity: bool = False,
):
super().__init__()
@@ -2202,32 +2017,22 @@ def __init__(
else:
self.upsamplers = None
- self.resolution_idx = resolution_idx
-
- def forward(
- self,
- hidden_states: paddle.Tensor,
- res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
- temb: Optional[paddle.Tensor] = None,
- upsample_size: Optional[int] = None,
- scale: float = 1.0,
- ) -> paddle.Tensor:
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
for resnet, attn in zip(self.resnets, self.attentions):
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
- hidden_states = resnet(hidden_states, temb, scale=scale)
- cross_attention_kwargs = {"scale": scale}
- hidden_states = attn(hidden_states, **cross_attention_kwargs)
+ hidden_states = resnet(hidden_states, temb)
+ hidden_states = attn(hidden_states)
if self.upsamplers is not None:
for upsampler in self.upsamplers:
if self.upsample_type == "resnet":
- hidden_states = upsampler(hidden_states, temb=temb, scale=scale)
+ hidden_states = upsampler(hidden_states, temb=temb)
else:
- hidden_states = upsampler(hidden_states, scale=scale)
+ hidden_states = upsampler(hidden_states)
return hidden_states
@@ -2239,10 +2044,9 @@ def __init__(
out_channels: int,
prev_output_channel: int,
temb_channels: int,
- resolution_idx: Optional[int] = None,
dropout: float = 0.0,
num_layers: int = 1,
- transformer_layers_per_block: Union[int, Tuple[int]] = 1,
+ transformer_layers_per_block: int = 1,
resnet_eps: float = 1e-6,
resnet_time_scale_shift: str = "default",
resnet_act_fn: str = "swish",
@@ -2256,7 +2060,6 @@ def __init__(
use_linear_projection: bool = False,
only_cross_attention: bool = False,
upcast_attention: bool = False,
- attention_type: str = "default",
resnet_pre_temb_non_linearity: bool = False,
):
super().__init__()
@@ -2266,9 +2069,6 @@ def __init__(
self.has_cross_attention = True
self.num_attention_heads = num_attention_heads
- if isinstance(transformer_layers_per_block, int):
- transformer_layers_per_block = [transformer_layers_per_block] * num_layers
-
for i in range(num_layers):
res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
resnet_in_channels = prev_output_channel if i == 0 else out_channels
@@ -2294,13 +2094,12 @@ def __init__(
num_attention_heads,
out_channels // num_attention_heads,
in_channels=out_channels,
- num_layers=transformer_layers_per_block[i],
+ num_layers=transformer_layers_per_block,
cross_attention_dim=cross_attention_dim,
norm_num_groups=resnet_groups,
use_linear_projection=use_linear_projection,
only_cross_attention=only_cross_attention,
upcast_attention=upcast_attention,
- attention_type=attention_type,
)
)
else:
@@ -2323,7 +2122,6 @@ def __init__(
self.upsamplers = None
self.gradient_checkpointing = False
- self.resolution_idx = resolution_idx
def forward(
self,
@@ -2335,32 +2133,11 @@ def forward(
upsample_size: Optional[int] = None,
attention_mask: Optional[paddle.Tensor] = None,
encoder_attention_mask: Optional[paddle.Tensor] = None,
- ) -> paddle.Tensor:
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
- is_freeu_enabled = (
- getattr(self, "s1", None)
- and getattr(self, "s2", None)
- and getattr(self, "b1", None)
- and getattr(self, "b2", None)
- )
-
+ ):
for resnet, attn in zip(self.resnets, self.attentions):
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-
- # FreeU: Only operate on the first two stages
- if is_freeu_enabled:
- hidden_states, res_hidden_states = apply_freeu(
- self.resolution_idx,
- hidden_states,
- res_hidden_states,
- s1=self.s1,
- s2=self.s2,
- b1=self.b1,
- b2=self.b2,
- )
-
hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
@@ -2386,7 +2163,7 @@ def custom_forward(*inputs):
encoder_attention_mask,
)
else:
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+ hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
@@ -2398,7 +2175,7 @@ def custom_forward(*inputs):
if self.upsamplers is not None:
for upsampler in self.upsamplers:
- hidden_states = upsampler(hidden_states, upsample_size, scale=lora_scale)
+ hidden_states = upsampler(hidden_states, upsample_size)
return hidden_states
@@ -2410,7 +2187,6 @@ def __init__(
prev_output_channel: int,
out_channels: int,
temb_channels: int,
- resolution_idx: Optional[int] = None,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2453,40 +2229,12 @@ def __init__(
self.upsamplers = None
self.gradient_checkpointing = False
- self.resolution_idx = resolution_idx
-
- def forward(
- self,
- hidden_states: paddle.Tensor,
- res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
- temb: Optional[paddle.Tensor] = None,
- upsample_size: Optional[int] = None,
- scale: float = 1.0,
- ) -> paddle.Tensor:
- is_freeu_enabled = (
- getattr(self, "s1", None)
- and getattr(self, "s2", None)
- and getattr(self, "b1", None)
- and getattr(self, "b2", None)
- )
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
for resnet in self.resnets:
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-
- # FreeU: Only operate on the first two stages
- if is_freeu_enabled:
- hidden_states, res_hidden_states = apply_freeu(
- self.resolution_idx,
- hidden_states,
- res_hidden_states,
- s1=self.s1,
- s2=self.s2,
- b1=self.b1,
- b2=self.b2,
- )
-
hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
@@ -2513,7 +2261,6 @@ def __init__(
self,
in_channels: int,
out_channels: int,
- resolution_idx: Optional[int] = None,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2523,7 +2270,7 @@ def __init__(
resnet_pre_norm: bool = True,
output_scale_factor: float = 1.0,
add_upsample: bool = True,
- temb_channels: Optional[int] = None,
+ temb_channels=None,
resnet_pre_temb_non_linearity: bool = False,
):
super().__init__()
@@ -2555,13 +2302,9 @@ def __init__(
else:
self.upsamplers = None
- self.resolution_idx = resolution_idx
-
- def forward(
- self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None, scale: float = 1.0
- ) -> paddle.Tensor:
+ def forward(self, hidden_states, temb=None):
for resnet in self.resnets:
- hidden_states = resnet(hidden_states, temb=temb, scale=scale)
+ hidden_states = resnet(hidden_states, temb=temb)
if self.upsamplers is not None:
for upsampler in self.upsamplers:
@@ -2575,7 +2318,6 @@ def __init__(
self,
in_channels: int,
out_channels: int,
- resolution_idx: Optional[int] = None,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2586,7 +2328,7 @@ def __init__(
attention_head_dim: int = 1,
output_scale_factor: float = 1.0,
add_upsample: bool = True,
- temb_channels: Optional[int] = None,
+ temb_channels=None,
resnet_pre_temb_non_linearity: bool = False,
):
super().__init__()
@@ -2641,19 +2383,14 @@ def __init__(
else:
self.upsamplers = None
- self.resolution_idx = resolution_idx
-
- def forward(
- self, hidden_states: paddle.Tensor, temb: Optional[paddle.Tensor] = None, scale: float = 1.0
- ) -> paddle.Tensor:
+ def forward(self, hidden_states, temb=None):
for resnet, attn in zip(self.resnets, self.attentions):
- hidden_states = resnet(hidden_states, temb=temb, scale=scale)
- cross_attention_kwargs = {"scale": scale}
- hidden_states = attn(hidden_states, temb=temb, **cross_attention_kwargs)
+ hidden_states = resnet(hidden_states, temb=temb)
+ hidden_states = attn(hidden_states, temb=temb)
if self.upsamplers is not None:
for upsampler in self.upsamplers:
- hidden_states = upsampler(hidden_states, scale=scale)
+ hidden_states = upsampler(hidden_states)
return hidden_states
@@ -2665,7 +2402,6 @@ def __init__(
prev_output_channel: int,
out_channels: int,
temb_channels: int,
- resolution_idx: Optional[int] = None,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2752,26 +2488,16 @@ def __init__(
self.skip_norm = None
self.act = None
- self.resolution_idx = resolution_idx
-
- def forward(
- self,
- hidden_states: paddle.Tensor,
- res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
- temb: Optional[paddle.Tensor] = None,
- skip_sample=None,
- scale: float = 1.0,
- ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
for resnet in self.resnets:
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
- hidden_states = resnet(hidden_states, temb, scale=scale)
+ hidden_states = resnet(hidden_states, temb)
- cross_attention_kwargs = {"scale": scale}
- hidden_states = self.attentions[0](hidden_states, **cross_attention_kwargs)
+ hidden_states = self.attentions[0](hidden_states)
if skip_sample is not None:
skip_sample = self.upsampler(skip_sample)
@@ -2785,7 +2511,7 @@ def forward(
skip_sample = skip_sample + skip_sample_states
- hidden_states = self.resnet_up(hidden_states, temb, scale=scale)
+ hidden_states = self.resnet_up(hidden_states, temb)
return hidden_states, skip_sample
@@ -2797,7 +2523,6 @@ def __init__(
prev_output_channel: int,
out_channels: int,
temb_channels: int,
- resolution_idx: Optional[int] = None,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2863,23 +2588,14 @@ def __init__(
self.skip_norm = None
self.act = None
- self.resolution_idx = resolution_idx
-
- def forward(
- self,
- hidden_states: paddle.Tensor,
- res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
- temb: Optional[paddle.Tensor] = None,
- skip_sample=None,
- scale: float = 1.0,
- ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None):
for resnet in self.resnets:
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
- hidden_states = resnet(hidden_states, temb, scale=scale)
+ hidden_states = resnet(hidden_states, temb)
if skip_sample is not None:
skip_sample = self.upsampler(skip_sample)
@@ -2893,7 +2609,7 @@ def forward(
skip_sample = skip_sample + skip_sample_states
- hidden_states = self.resnet_up(hidden_states, temb, scale=scale)
+ hidden_states = self.resnet_up(hidden_states, temb)
return hidden_states, skip_sample
@@ -2905,7 +2621,6 @@ def __init__(
prev_output_channel: int,
out_channels: int,
temb_channels: int,
- resolution_idx: Optional[int] = None,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2915,7 +2630,7 @@ def __init__(
resnet_pre_norm: bool = True,
output_scale_factor: float = 1.0,
add_upsample: bool = True,
- skip_time_act: bool = False,
+ skip_time_act=False,
resnet_pre_temb_non_linearity: bool = False,
):
super().__init__()
@@ -2968,16 +2683,8 @@ def __init__(
self.upsamplers = None
self.gradient_checkpointing = False
- self.resolution_idx = resolution_idx
- def forward(
- self,
- hidden_states: paddle.Tensor,
- res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
- temb: Optional[paddle.Tensor] = None,
- upsample_size: Optional[int] = None,
- scale: float = 1.0,
- ) -> paddle.Tensor:
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
for resnet in self.resnets:
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
@@ -2994,11 +2701,11 @@ def custom_forward(*inputs):
hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
else:
- hidden_states = resnet(hidden_states, temb, scale=scale)
+ hidden_states = resnet(hidden_states, temb)
if self.upsamplers is not None:
for upsampler in self.upsamplers:
- hidden_states = upsampler(hidden_states, temb, scale=scale)
+ hidden_states = upsampler(hidden_states, temb)
return hidden_states
@@ -3010,7 +2717,6 @@ def __init__(
out_channels: int,
prev_output_channel: int,
temb_channels: int,
- resolution_idx: Optional[int] = None,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -3022,9 +2728,9 @@ def __init__(
cross_attention_dim: int = 1280,
output_scale_factor: float = 1.0,
add_upsample: bool = True,
- skip_time_act: bool = False,
- only_cross_attention: bool = False,
- cross_attention_norm: Optional[str] = None,
+ skip_time_act=False,
+ only_cross_attention=False,
+ cross_attention_norm=None,
resnet_pre_temb_non_linearity: bool = False,
):
super().__init__()
@@ -3099,7 +2805,6 @@ def __init__(
self.upsamplers = None
self.gradient_checkpointing = False
- self.resolution_idx = resolution_idx
def forward(
self,
@@ -3111,10 +2816,9 @@ def forward(
attention_mask: Optional[paddle.Tensor] = None,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
encoder_attention_mask: Optional[paddle.Tensor] = None,
- ) -> paddle.Tensor:
+ ):
cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {}
- lora_scale = cross_attention_kwargs.get("scale", 1.0)
if attention_mask is None:
# if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask.
mask = None if encoder_hidden_states is None else encoder_attention_mask
@@ -3153,7 +2857,7 @@ def custom_forward(*inputs):
cross_attention_kwargs,
)
else:
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+ hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
hidden_states,
@@ -3164,7 +2868,7 @@ def custom_forward(*inputs):
if self.upsamplers is not None:
for upsampler in self.upsamplers:
- hidden_states = upsampler(hidden_states, temb, scale=lora_scale)
+ hidden_states = upsampler(hidden_states, temb)
return hidden_states
@@ -3175,7 +2879,6 @@ def __init__(
in_channels: int,
out_channels: int,
temb_channels: int,
- resolution_idx: int,
dropout: float = 0.0,
num_layers: int = 5,
resnet_eps: float = 1e-5,
@@ -3219,16 +2922,8 @@ def __init__(
self.upsamplers = None
self.gradient_checkpointing = False
- self.resolution_idx = resolution_idx
- def forward(
- self,
- hidden_states: paddle.Tensor,
- res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
- temb: Optional[paddle.Tensor] = None,
- upsample_size: Optional[int] = None,
- scale: float = 1.0,
- ) -> paddle.Tensor:
+ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
res_hidden_states_tuple = res_hidden_states_tuple[-1]
if res_hidden_states_tuple is not None:
hidden_states = paddle.concat([hidden_states, res_hidden_states_tuple], axis=1)
@@ -3259,13 +2954,12 @@ def __init__(
in_channels: int,
out_channels: int,
temb_channels: int,
- resolution_idx: int,
dropout: float = 0.0,
num_layers: int = 4,
resnet_eps: float = 1e-5,
resnet_act_fn: str = "gelu",
resnet_group_size: int = 32,
- attention_head_dim: int = 1, # attention dim_head
+ attention_head_dim=1, # attention dim_head
cross_attention_dim: int = 768,
add_upsample: bool = True,
upcast_attention: bool = False,
@@ -3355,7 +3049,6 @@ def forward(
if res_hidden_states_tuple is not None:
hidden_states = paddle.concat([hidden_states, res_hidden_states_tuple], axis=1)
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
for resnet, attn in zip(self.resnets, self.attentions):
if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
@@ -3383,7 +3076,7 @@ def custom_forward(*inputs):
encoder_attention_mask,
)
else:
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
+ hidden_states = resnet(hidden_states, temb)
hidden_states = attn(
hidden_states,
encoder_hidden_states=encoder_hidden_states,
@@ -3411,18 +3104,11 @@ class KAttentionBlock(nn.Layer):
attention_head_dim (`int`): The number of channels in each head.
dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
- attention_bias (`bool`, *optional*, defaults to `False`):
- Configure if the attention layers should contain a bias parameter.
- upcast_attention (`bool`, *optional*, defaults to `False`):
- Set to `True` to upcast the attention computation to `float32`.
- temb_channels (`int`, *optional*, defaults to 768):
- The number of channels in the token embedding.
- add_self_attention (`bool`, *optional*, defaults to `False`):
- Set to `True` to add self-attention to the block.
- cross_attention_norm (`str`, *optional*, defaults to `None`):
- The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
- group_size (`int`, *optional*, defaults to 32):
- The number of groups to separate the channels into for group normalization.
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+ num_embeds_ada_norm (`int`, *optional*):
+ The number of diffusion steps used during training. See `Transformer2DModel`.
+ attention_bias (`bool`, *optional*, defaults to `False`):
+ Configure if the attention layers should contain a bias parameter.
"""
def __init__(
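The `unet_2d_blocks.py` hunks above shrink the 2D factory signatures (no `dropout`, `attention_type`, or `resolution_idx`) and drop the per-call LoRA `scale` argument from every block forward. A hedged sketch of calling the simplified factory, assuming the module path `ppdiffusers.models.unet_2d_blocks`; the channel and head counts are illustrative only:

    from ppdiffusers.models.unet_2d_blocks import get_down_block

    # Illustrative values; `dropout=` and `attention_type=` are no longer accepted.
    block = get_down_block(
        "CrossAttnDownBlock2D",
        num_layers=2,
        in_channels=320,
        out_channels=640,
        temb_channels=1280,
        add_downsample=True,
        resnet_eps=1e-5,
        resnet_act_fn="silu",
        resnet_groups=32,
        num_attention_heads=8,
        cross_attention_dim=768,
        downsample_padding=1,
    )
    # Forward calls now take no `scale=` keyword, e.g. (sketch):
    # hidden_states, output_states = block(hidden_states, temb=temb, encoder_hidden_states=ctx)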
diff --git a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
index c76bb0fb1..23904b256 100644
--- a/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
+++ b/ppdiffusers/ppdiffusers/models/unet_2d_condition.py
@@ -20,21 +20,14 @@
from ..configuration_utils import ConfigMixin, register_to_config
from ..loaders import UNet2DConditionLoadersMixin
-from ..utils import NEG_INF, BaseOutput, deprecate, logging
+from ..utils import NEG_INF, BaseOutput, logging
from .activations import get_activation
-from .attention_processor import (
- ADDED_KV_ATTENTION_PROCESSORS,
- CROSS_ATTENTION_PROCESSORS,
- AttentionProcessor,
- AttnAddedKVProcessor,
- AttnProcessor,
-)
+from .attention_processor import AttentionProcessor, AttnProcessor
from .embeddings import (
GaussianFourierProjection,
ImageHintTimeEmbedding,
ImageProjection,
ImageTimeEmbedding,
- PositionNet,
TextImageProjection,
TextImageTimeEmbedding,
TextTimeEmbedding,
@@ -43,9 +36,12 @@
)
from .modeling_utils import ModelMixin
from .unet_2d_blocks import (
- UNetMidBlock2D,
+ CrossAttnDownBlock2D,
+ CrossAttnUpBlock2D,
+ DownBlock2D,
UNetMidBlock2DCrossAttn,
UNetMidBlock2DSimpleCrossAttn,
+ UpBlock2D,
get_down_block,
get_up_block,
)
@@ -86,7 +82,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
The tuple of downsample blocks to use.
mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
- Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
+ Block type for the middle of the UNet. It can be either `UNetMidBlock2DCrossAttn` or
`UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
The tuple of upsample blocks to use.
@@ -98,22 +94,16 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
- dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
If `None`, normalization and activation layers is skipped in post-processing.
norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
The dimension of the cross attention features.
- transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
+ transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
[`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
[`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
- reverse_transformer_layers_per_block : (`Tuple[Tuple]`, *optional*, defaults to None):
- The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
- blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
- [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
- [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
encoder_hid_dim (`int`, *optional*, defaults to None):
If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
dimension to `cross_attention_dim`.
@@ -147,9 +137,9 @@ class conditioning with `class_embed_type` equal to `None`.
The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
time_cond_proj_dim (`int`, *optional*, defaults to `None`):
The dimension of `cond_proj` layer in the timestep embedding.
- conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`,
- *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`,
- *optional*): The dimension of the `class_labels` input when
+ conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer.
+ conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer.
+ projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
`class_embed_type="projection"`. Required when `class_embed_type="projection"`.
class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
embeddings with the class embeddings.
@@ -184,13 +174,11 @@ def __init__(
layers_per_block: Union[int, Tuple[int]] = 2,
downsample_padding: int = 1,
mid_block_scale_factor: float = 1,
- dropout: float = 0.0,
act_fn: str = "silu",
norm_num_groups: Optional[int] = 32,
norm_eps: float = 1e-5,
cross_attention_dim: Union[int, Tuple[int]] = 1280,
- transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
- reverse_transformer_layers_per_block: Optional[Tuple[Tuple[int]]] = None,
+ transformer_layers_per_block: Union[int, Tuple[int]] = 1,
encoder_hid_dim: Optional[int] = None,
encoder_hid_dim_type: Optional[str] = None,
attention_head_dim: Union[int, Tuple[int]] = 8,
@@ -213,7 +201,6 @@ def __init__(
conv_in_kernel: int = 3,
conv_out_kernel: int = 3,
projection_class_embeddings_input_dim: Optional[int] = None,
- attention_type: str = "default",
class_embeddings_concat: bool = False,
mid_block_only_cross_attention: Optional[bool] = None,
cross_attention_norm: Optional[str] = None,
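A hedged construction sketch for the trimmed signature above; only arguments visible in this patch are used, and the concrete values are illustrative rather than a recommended configuration.

from ppdiffusers import UNet2DConditionModel

unet = UNet2DConditionModel(
    block_out_channels=(32, 64),
    down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
    mid_block_type="UNetMidBlock2DCrossAttn",
    up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
    layers_per_block=1,
    cross_attention_dim=32,
    attention_head_dim=8,
    norm_num_groups=32,
)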
@@ -272,10 +259,6 @@ def __init__(
raise ValueError(
f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
)
- if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
- for layer_number_per_block in transformer_layers_per_block:
- if isinstance(layer_number_per_block, list):
- raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
# input
conv_in_padding = (conv_in_kernel - 1) // 2
@@ -472,12 +455,10 @@ def __init__(
only_cross_attention=only_cross_attention[i],
upcast_attention=upcast_attention,
resnet_time_scale_shift=resnet_time_scale_shift,
- attention_type=attention_type,
resnet_skip_time_act=resnet_skip_time_act,
resnet_out_scale_factor=resnet_out_scale_factor,
cross_attention_norm=cross_attention_norm,
attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
- dropout=dropout,
resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
)
self.down_blocks.append(down_block)
@@ -488,7 +469,6 @@ def __init__(
transformer_layers_per_block=transformer_layers_per_block[-1],
in_channels=block_out_channels[-1],
temb_channels=blocks_time_embed_dim,
- dropout=dropout,
resnet_eps=norm_eps,
resnet_act_fn=act_fn,
output_scale_factor=mid_block_scale_factor,
@@ -499,14 +479,12 @@ def __init__(
dual_cross_attention=dual_cross_attention,
use_linear_projection=use_linear_projection,
upcast_attention=upcast_attention,
- attention_type=attention_type,
resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
)
elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
self.mid_block = UNetMidBlock2DSimpleCrossAttn(
in_channels=block_out_channels[-1],
temb_channels=blocks_time_embed_dim,
- dropout=dropout,
resnet_eps=norm_eps,
resnet_act_fn=act_fn,
output_scale_factor=mid_block_scale_factor,
@@ -519,20 +497,6 @@ def __init__(
cross_attention_norm=cross_attention_norm,
resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
)
- elif mid_block_type == "UNetMidBlock2D":
- self.mid_block = UNetMidBlock2D(
- in_channels=block_out_channels[-1],
- temb_channels=blocks_time_embed_dim,
- dropout=dropout,
- num_layers=0,
- resnet_eps=norm_eps,
- resnet_act_fn=act_fn,
- output_scale_factor=mid_block_scale_factor,
- resnet_groups=norm_num_groups,
- resnet_time_scale_shift=resnet_time_scale_shift,
- add_attention=False,
- resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
- )
elif mid_block_type is None:
self.mid_block = None
else:
@@ -546,12 +510,8 @@ def __init__(
reversed_num_attention_heads = list(reversed(num_attention_heads))
reversed_layers_per_block = list(reversed(layers_per_block))
reversed_cross_attention_dim = list(reversed(cross_attention_dim))
- reversed_transformer_layers_per_block = (
- list(reversed(transformer_layers_per_block))
- if reverse_transformer_layers_per_block is None
- else reverse_transformer_layers_per_block
- )
- only_cross_attention = list(reversed(only_cross_attention))
+ reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
+ reversed_only_cross_attention = list(reversed(only_cross_attention))
output_channel = reversed_block_out_channels[0]
for i, up_block_type in enumerate(up_block_types):
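The up path walks the per-level settings in the opposite order of the encoder, which is why the lists above are reversed before this loop; a small sketch with assumed values:

only_cross_attention = [True, False, False, False]   # one flag per down block
reversed_only_cross_attention = list(reversed(only_cross_attention))

up_block_types = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")
for i, up_block_type in enumerate(up_block_types):
    # up block i mirrors down block len(down_block_types) - 1 - i
    print(up_block_type, "only_cross_attention =", reversed_only_cross_attention[i])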
@@ -579,21 +539,18 @@ def __init__(
add_upsample=add_upsample,
resnet_eps=norm_eps,
resnet_act_fn=act_fn,
- resolution_idx=i,
resnet_groups=norm_num_groups,
cross_attention_dim=reversed_cross_attention_dim[i],
num_attention_heads=reversed_num_attention_heads[i],
dual_cross_attention=dual_cross_attention,
use_linear_projection=use_linear_projection,
- only_cross_attention=only_cross_attention[i],
+ only_cross_attention=reversed_only_cross_attention[i],
upcast_attention=upcast_attention,
resnet_time_scale_shift=resnet_time_scale_shift,
- attention_type=attention_type,
resnet_skip_time_act=resnet_skip_time_act,
resnet_out_scale_factor=resnet_out_scale_factor,
cross_attention_norm=cross_attention_norm,
attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel,
- dropout=dropout,
resnet_pre_temb_non_linearity=resnet_pre_temb_non_linearity,
)
self.up_blocks.append(up_block)
@@ -614,18 +571,6 @@ def __init__(
block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
)
- if attention_type in ["gated", "gated-text-image"]:
- positive_len = 768
- if isinstance(cross_attention_dim, int):
- positive_len = cross_attention_dim
- elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
- positive_len = cross_attention_dim[0]
-
- feature_type = "text-only" if attention_type == "gated" else "text-image"
- self.position_net = PositionNet(
- positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
- )
-
@property
def attn_processors(self) -> Dict[str, AttentionProcessor]:
r"""
@@ -637,8 +582,8 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
processors = {}
def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
- if hasattr(module, "get_processor"):
- processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+ if hasattr(module, "set_processor"):
+ processors[f"{name}.processor"] = module.processor
for sub_name, child in module.named_children():
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
@@ -650,9 +595,7 @@ def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[st
return processors
- def set_attn_processor(
- self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
- ):
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
r"""
Sets the attention processor to use to compute attention.
@@ -676,9 +619,9 @@ def set_attn_processor(
def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
if hasattr(module, "set_processor"):
if not isinstance(processor, dict):
- module.set_processor(processor, _remove_lora=_remove_lora)
+ module.set_processor(processor)
else:
- module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
+ module.set_processor(processor.pop(f"{name}.processor"))
for sub_name, child in module.named_children():
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
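Usage sketch for the simplified `set_attn_processor`: either a single processor shared by every attention layer, or a dict keyed exactly like `attn_processors`. It reuses the `unet` from the construction sketch above; the import path is assumed to match the relative import in this file.

from ppdiffusers.models.attention_processor import AttnProcessor

# one processor shared by every attention layer
unet.set_attn_processor(AttnProcessor())

# or one entry per key reported by unet.attn_processors
unet.set_attn_processor({name: AttnProcessor() for name in unet.attn_processors.keys()})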
@@ -690,16 +633,7 @@ def set_default_attn_processor(self):
"""
Disables custom attention processors and sets the default attention implementation.
"""
- if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
- processor = AttnAddedKVProcessor()
- elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
- processor = AttnProcessor()
- else:
- raise ValueError(
- f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
- )
-
- self.set_attn_processor(processor, _remove_lora=True)
+ self.set_attn_processor(AttnProcessor())
def set_attention_slice(self, slice_size):
r"""
@@ -767,41 +701,9 @@ def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]):
fn_recursive_set_attention_slice(module, reversed_slice_size)
def _set_gradient_checkpointing(self, module, value=False):
- if hasattr(module, "gradient_checkpointing"):
+ if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, CrossAttnUpBlock2D, UpBlock2D)):
module.gradient_checkpointing = value
- def enable_freeu(self, s1, s2, b1, b2):
- r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stage blocks where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
- are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate the "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate the "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- for i, upsample_block in enumerate(self.up_blocks):
- setattr(upsample_block, "s1", s1)
- setattr(upsample_block, "s2", s2)
- setattr(upsample_block, "b1", b1)
- setattr(upsample_block, "b2", b2)
-
- def disable_freeu(self):
- """Disables the FreeU mechanism."""
- freeu_keys = {"s1", "s2", "b1", "b2"}
- for i, upsample_block in enumerate(self.up_blocks):
- for k in freeu_keys:
- if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
- setattr(upsample_block, k, None)
-
def forward(
self,
sample: paddle.Tensor,
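What the isinstance-based hook above amounts to, written as a generic walk over sublayers; in practice the flag is usually toggled through the model's gradient-checkpointing helper, which is assumed here rather than shown in this patch.

def set_gradient_checkpointing(model, value: bool = True):
    # flip the flag on every block that supports recompute
    for layer in model.sublayers(include_self=True):
        if hasattr(layer, "gradient_checkpointing"):
            layer.gradient_checkpointing = value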
@@ -814,7 +716,6 @@ def forward(
added_cond_kwargs: Optional[Dict[str, paddle.Tensor]] = None,
down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
mid_block_additional_residual: Optional[paddle.Tensor] = None,
- down_intrablock_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
encoder_attention_mask: Optional[paddle.Tensor] = None,
return_dict: bool = True,
) -> Union[UNet2DConditionOutput, Tuple]:
@@ -822,32 +723,12 @@ def forward(
The [`UNet2DConditionModel`] forward method.
Args:
- sample (`torch.FloatTensor`):
+ sample (`paddle.Tensor`):
The noisy input tensor with the following shape `(batch, channel, height, width)`.
- timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
- encoder_hidden_states (`torch.FloatTensor`):
+ timestep (`paddle.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
+ encoder_hidden_states (`paddle.Tensor`):
The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
- class_labels (`torch.Tensor`, *optional*, defaults to `None`):
- Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
- timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`):
- Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
- through the `self.time_embedding` layer to obtain the timestep embeddings.
- attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
- An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
- is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
- negative values to the attention scores corresponding to "discard" tokens.
- cross_attention_kwargs (`dict`, *optional*):
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
- `self.processor` in
- [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
- added_cond_kwargs: (`dict`, *optional*):
- A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
- are passed along to the UNet blocks.
- down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
- A tuple of tensors that if specified are added to the residuals of down unet blocks.
- mid_block_additional_residual: (`torch.Tensor`, *optional*):
- A tensor that if specified is added to the residual of the middle unet block.
- encoder_attention_mask (`torch.Tensor`):
+ encoder_attention_mask (`paddle.Tensor`):
A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
`True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
which adds large negative values to the attention scores corresponding to "discard" tokens.
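A sketch of the mask-to-bias conversion described above: kept positions add 0 to the attention scores, discarded positions a large negative value. The constant stands in for the library's NEG_INF utility.

import paddle

encoder_attention_mask = paddle.to_tensor([[1, 1, 1, 0, 0]], dtype="bool")  # (batch, key_tokens)
bias = (1 - encoder_attention_mask.astype("float32")) * -1e4
bias = bias.unsqueeze(1)   # broadcast over query tokens
print(bias)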
@@ -859,13 +740,6 @@ def forward(
added_cond_kwargs: (`dict`, *optional*):
A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
are passed along to the UNet blocks.
- down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
- additional residuals to be added to UNet long skip connections from down blocks to up blocks for
- example from ControlNet side model(s)
- mid_block_additional_residual (`torch.Tensor`, *optional*):
- additional residual to be added to UNet mid block output, for example from ControlNet side model
- down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
- additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
Returns:
[`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
@@ -1044,40 +918,18 @@ def forward(
# 2. pre-process
sample = self.conv_in(sample)
- # 2.5 GLIGEN position net
- if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None:
- cross_attention_kwargs = cross_attention_kwargs.copy()
- gligen_args = cross_attention_kwargs.pop("gligen")
- cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}
-
# 3. down
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None
- # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets
- is_adapter = down_intrablock_additional_residuals is not None
- # maintain backward compatibility for legacy usage, where
- # T2I-Adapter and ControlNet both use down_block_additional_residuals arg
- # but can only use one or the other
- if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None:
- deprecate(
- "T2I should not use down_block_additional_residuals",
- "1.3.0",
- "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \
- and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \
- for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ",
- standard_warn=False,
- )
- down_intrablock_additional_residuals = down_block_additional_residuals
- is_adapter = True
+ is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None
down_block_res_samples = (sample,)
for downsample_block in self.down_blocks:
if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
# For t2i-adapter CrossAttnDownBlock2D
additional_residuals = {}
- if is_adapter and len(down_intrablock_additional_residuals) > 0:
- additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0)
+ if is_adapter and len(down_block_additional_residuals) > 0:
+ additional_residuals["additional_residuals"] = down_block_additional_residuals.pop(0)
sample, res_samples = downsample_block(
hidden_states=sample,
@@ -1089,18 +941,10 @@ def forward(
**additional_residuals,
)
else:
- # paddle original code:
- # sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
-
- # if is_adapter and len(down_block_additional_residuals) > 0:
- # sample += down_block_additional_residuals.pop(0)
- # # westfish: add to align with torch features
- # res_samples = tuple(res_samples[:-1]) + (sample,)
-
- sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale)
- if is_adapter and len(down_intrablock_additional_residuals) > 0:
- sample += down_intrablock_additional_residuals.pop(0)
- # pytorch without the following line:
+ sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+ if is_adapter and len(down_block_additional_residuals) > 0:
+ sample += down_block_additional_residuals.pop(0)
# westfish: add to align with torch features
res_samples = tuple(res_samples[:-1]) + (sample,)
down_block_res_samples += res_samples
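A sketch of the ControlNet path that survives this patch: one residual per down-block output plus one for the mid block, added onto the UNet's skip connections. The shapes and the zip pattern below are illustrative, not copied from this file.

import paddle

down_block_res_samples = [paddle.randn([1, 320, 64, 64]) for _ in range(3)]
down_block_additional_residuals = [paddle.randn([1, 320, 64, 64]) for _ in range(3)]

# element-wise addition of the side-model residuals onto the skip features
down_block_res_samples = [
    skip + residual
    for skip, residual in zip(down_block_res_samples, down_block_additional_residuals)
]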
@@ -1118,25 +962,14 @@ def forward(
# 4. mid
if self.mid_block is not None:
- if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
- sample = self.mid_block(
- sample,
- emb,
- encoder_hidden_states=encoder_hidden_states,
- attention_mask=attention_mask,
- cross_attention_kwargs=cross_attention_kwargs,
- encoder_attention_mask=encoder_attention_mask,
- )
- else:
- sample = self.mid_block(sample, emb)
-
- # To support T2I-Adapter-XL
- if (
- is_adapter
- and len(down_intrablock_additional_residuals) > 0
- and sample.shape == down_intrablock_additional_residuals[0].shape
- ):
- sample += down_intrablock_additional_residuals.pop(0)
+ sample = self.mid_block(
+ sample,
+ emb,
+ encoder_hidden_states=encoder_hidden_states,
+ attention_mask=attention_mask,
+ cross_attention_kwargs=cross_attention_kwargs,
+ encoder_attention_mask=encoder_attention_mask,
+ )
if is_controlnet:
sample = sample + mid_block_additional_residual
@@ -1170,7 +1003,6 @@ def forward(
temb=emb,
res_hidden_states_tuple=res_samples,
upsample_size=upsample_size,
- scale=lora_scale,
)
# 6. post-process
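A minimal forward-pass sketch through the trimmed UNet, reusing the small `unet` built earlier; shapes follow the docstring above and the values are illustrative.

import paddle

sample = paddle.randn([1, 4, 32, 32])              # (batch, channel, height, width)
timestep = paddle.to_tensor([10])
encoder_hidden_states = paddle.randn([1, 77, 32])  # last dim matches cross_attention_dim=32

out = unet(sample, timestep, encoder_hidden_states=encoder_hidden_states)
print(out.sample.shape)                            # [1, 4, 32, 32]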
diff --git a/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py b/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py
index 36abadec7..d395de074 100644
--- a/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py
+++ b/ppdiffusers/ppdiffusers/models/unet_3d_blocks.py
@@ -13,14 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from typing import Any, Dict, Optional, Tuple
-
import paddle
import paddle.nn as nn
-from paddle.distributed.fleet.utils import recompute
-from ..utils.paddle_utils import apply_freeu
-from .dual_transformer_2d import DualTransformer2DModel
from .resnet import Downsample2D, ResnetBlock2D, TemporalConvLayer, Upsample2D
from .transformer_2d import Transformer2DModel
from .transformer_temporal import TransformerTemporalModel
@@ -44,8 +39,6 @@ def get_down_block(
only_cross_attention=False,
upcast_attention=False,
resnet_time_scale_shift="default",
- temporal_num_attention_heads=8,
- temporal_max_seq_length=32,
):
if down_block_type == "DownBlock3D":
return DownBlock3D(
@@ -81,45 +74,6 @@ def get_down_block(
upcast_attention=upcast_attention,
resnet_time_scale_shift=resnet_time_scale_shift,
)
- if down_block_type == "DownBlockMotion":
- return DownBlockMotion(
- num_layers=num_layers,
- in_channels=in_channels,
- out_channels=out_channels,
- temb_channels=temb_channels,
- add_downsample=add_downsample,
- resnet_eps=resnet_eps,
- resnet_act_fn=resnet_act_fn,
- resnet_groups=resnet_groups,
- downsample_padding=downsample_padding,
- resnet_time_scale_shift=resnet_time_scale_shift,
- temporal_num_attention_heads=temporal_num_attention_heads,
- temporal_max_seq_length=temporal_max_seq_length,
- )
- elif down_block_type == "CrossAttnDownBlockMotion":
- if cross_attention_dim is None:
- raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlockMotion")
- return CrossAttnDownBlockMotion(
- num_layers=num_layers,
- in_channels=in_channels,
- out_channels=out_channels,
- temb_channels=temb_channels,
- add_downsample=add_downsample,
- resnet_eps=resnet_eps,
- resnet_act_fn=resnet_act_fn,
- resnet_groups=resnet_groups,
- downsample_padding=downsample_padding,
- cross_attention_dim=cross_attention_dim,
- num_attention_heads=num_attention_heads,
- dual_cross_attention=dual_cross_attention,
- use_linear_projection=use_linear_projection,
- only_cross_attention=only_cross_attention,
- upcast_attention=upcast_attention,
- resnet_time_scale_shift=resnet_time_scale_shift,
- temporal_num_attention_heads=temporal_num_attention_heads,
- temporal_max_seq_length=temporal_max_seq_length,
- )
-
raise ValueError(f"{down_block_type} does not exist.")
@@ -134,7 +88,6 @@ def get_up_block(
resnet_eps,
resnet_act_fn,
num_attention_heads,
- resolution_idx=None,
resnet_groups=None,
cross_attention_dim=None,
dual_cross_attention=False,
@@ -142,9 +95,6 @@ def get_up_block(
only_cross_attention=False,
upcast_attention=False,
resnet_time_scale_shift="default",
- temporal_num_attention_heads=8,
- temporal_cross_attention_dim=None,
- temporal_max_seq_length=32,
):
if up_block_type == "UpBlock3D":
return UpBlock3D(
@@ -158,7 +108,6 @@ def get_up_block(
resnet_act_fn=resnet_act_fn,
resnet_groups=resnet_groups,
resnet_time_scale_shift=resnet_time_scale_shift,
- resolution_idx=resolution_idx,
)
elif up_block_type == "CrossAttnUpBlock3D":
if cross_attention_dim is None:
@@ -180,47 +129,6 @@ def get_up_block(
only_cross_attention=only_cross_attention,
upcast_attention=upcast_attention,
resnet_time_scale_shift=resnet_time_scale_shift,
- resolution_idx=resolution_idx,
- )
- if up_block_type == "UpBlockMotion":
- return UpBlockMotion(
- num_layers=num_layers,
- in_channels=in_channels,
- out_channels=out_channels,
- prev_output_channel=prev_output_channel,
- temb_channels=temb_channels,
- add_upsample=add_upsample,
- resnet_eps=resnet_eps,
- resnet_act_fn=resnet_act_fn,
- resnet_groups=resnet_groups,
- resnet_time_scale_shift=resnet_time_scale_shift,
- resolution_idx=resolution_idx,
- temporal_num_attention_heads=temporal_num_attention_heads,
- temporal_max_seq_length=temporal_max_seq_length,
- )
- elif up_block_type == "CrossAttnUpBlockMotion":
- if cross_attention_dim is None:
- raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlockMotion")
- return CrossAttnUpBlockMotion(
- num_layers=num_layers,
- in_channels=in_channels,
- out_channels=out_channels,
- prev_output_channel=prev_output_channel,
- temb_channels=temb_channels,
- add_upsample=add_upsample,
- resnet_eps=resnet_eps,
- resnet_act_fn=resnet_act_fn,
- resnet_groups=resnet_groups,
- cross_attention_dim=cross_attention_dim,
- num_attention_heads=num_attention_heads,
- dual_cross_attention=dual_cross_attention,
- use_linear_projection=use_linear_projection,
- only_cross_attention=only_cross_attention,
- upcast_attention=upcast_attention,
- resnet_time_scale_shift=resnet_time_scale_shift,
- resolution_idx=resolution_idx,
- temporal_num_attention_heads=temporal_num_attention_heads,
- temporal_max_seq_length=temporal_max_seq_length,
)
raise ValueError(f"{up_block_type} does not exist.")
@@ -569,21 +477,17 @@ def __init__(
use_linear_projection=False,
only_cross_attention=False,
upcast_attention=False,
- resolution_idx=None,
):
super().__init__()
resnets = []
temp_convs = []
attentions = []
temp_attentions = []
-
self.has_cross_attention = True
self.num_attention_heads = num_attention_heads
-
for i in range(num_layers):
res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
resnet_in_channels = prev_output_channel if i == 0 else out_channels
-
resnets.append(
ResnetBlock2D(
in_channels=resnet_in_channels + res_skip_channels,
@@ -638,9 +542,7 @@ def __init__(
)
else:
self.upsamplers = None
-
self.gradient_checkpointing = False
- self.resolution_idx = resolution_idx
def forward(
self,
@@ -653,13 +555,6 @@ def forward(
num_frames=1,
cross_attention_kwargs=None,
):
- is_freeu_enabled = (
- getattr(self, "s1", None)
- and getattr(self, "s2", None)
- and getattr(self, "b1", None)
- and getattr(self, "b2", None)
- )
-
# TODO(Patrick, William) - attention mask is not used
for resnet, temp_conv, attn, temp_attn in zip(
self.resnets, self.temp_convs, self.attentions, self.temp_attentions
@@ -667,19 +562,6 @@ def forward(
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-
- # FreeU: Only operate on the first two stages
- if is_freeu_enabled:
- hidden_states, res_hidden_states = apply_freeu(
- self.resolution_idx,
- hidden_states,
- res_hidden_states,
- s1=self.s1,
- s2=self.s2,
- b1=self.b1,
- b2=self.b2,
- )
-
hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
hidden_states = resnet(hidden_states, temb)
hidden_states = temp_conv(hidden_states, num_frames=num_frames)
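The skip-connection handling in the up blocks above follows a pop-and-concat pattern; a self-contained sketch with stand-in shapes (the channel slice stands in for the resnet that maps the concatenated features back down):

import paddle

hidden_states = paddle.randn([1, 64, 8, 8])
res_hidden_states_tuple = tuple(paddle.randn([1, 64, 8, 8]) for _ in range(3))

while res_hidden_states_tuple:
    res_hidden_states = res_hidden_states_tuple[-1]           # pop from the end
    res_hidden_states_tuple = res_hidden_states_tuple[:-1]
    hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
    hidden_states = hidden_states[:, :64]                     # stand-in for ResnetBlock2D

print(hidden_states.shape)  # [1, 64, 8, 8]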
@@ -717,16 +599,13 @@ def __init__(
resnet_pre_norm: bool = True,
output_scale_factor=1.0,
add_upsample=True,
- resolution_idx=None,
):
super().__init__()
resnets = []
temp_convs = []
-
for i in range(num_layers):
res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
resnet_in_channels = prev_output_channel if i == 0 else out_channels
-
resnets.append(
ResnetBlock2D(
in_channels=resnet_in_channels + res_skip_channels,
@@ -754,794 +633,17 @@ def __init__(
self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
else:
self.upsamplers = None
-
self.gradient_checkpointing = False
- self.resolution_idx = resolution_idx
def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, num_frames=1):
- is_freeu_enabled = (
- getattr(self, "s1", None)
- and getattr(self, "s2", None)
- and getattr(self, "b1", None)
- and getattr(self, "b2", None)
- )
for resnet, temp_conv in zip(self.resnets, self.temp_convs):
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-
- # FreeU: Only operate on the first two stages
- if is_freeu_enabled:
- hidden_states, res_hidden_states = apply_freeu(
- self.resolution_idx,
- hidden_states,
- res_hidden_states,
- s1=self.s1,
- s2=self.s2,
- b1=self.b1,
- b2=self.b2,
- )
-
hidden_states = paddle.concat(x=[hidden_states, res_hidden_states], axis=1)
hidden_states = resnet(hidden_states, temb)
hidden_states = temp_conv(hidden_states, num_frames=num_frames)
if self.upsamplers is not None:
for upsampler in self.upsamplers:
hidden_states = upsampler(hidden_states, upsample_size)
-
- return hidden_states
-
-
-class DownBlockMotion(nn.Layer):
- def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float = 0.0,
- num_layers: int = 1,
- resnet_eps: float = 1e-6,
- resnet_time_scale_shift: str = "default",
- resnet_act_fn: str = "swish",
- resnet_groups: int = 32,
- resnet_pre_norm: bool = True,
- output_scale_factor=1.0,
- add_downsample=True,
- downsample_padding=1,
- temporal_num_attention_heads=1,
- temporal_cross_attention_dim=None,
- temporal_max_seq_length=32,
- ):
- super().__init__()
- resnets = []
- motion_modules = []
-
- for i in range(num_layers):
- in_channels = in_channels if i == 0 else out_channels
- resnets.append(
- ResnetBlock2D(
- in_channels=in_channels,
- out_channels=out_channels,
- temb_channels=temb_channels,
- eps=resnet_eps,
- groups=resnet_groups,
- dropout=dropout,
- time_embedding_norm=resnet_time_scale_shift,
- non_linearity=resnet_act_fn,
- output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm,
- )
- )
- motion_modules.append(
- TransformerTemporalModel(
- num_attention_heads=temporal_num_attention_heads,
- in_channels=out_channels,
- norm_num_groups=resnet_groups,
- cross_attention_dim=temporal_cross_attention_dim,
- attention_bias=False,
- activation_fn="geglu",
- positional_embeddings="sinusoidal",
- num_positional_embeddings=temporal_max_seq_length,
- attention_head_dim=out_channels // temporal_num_attention_heads,
- )
- )
-
- self.resnets = nn.LayerList(resnets)
- self.motion_modules = nn.LayerList(motion_modules)
-
- if add_downsample:
- self.downsamplers = nn.LayerList(
- [
- Downsample2D(
- out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
- )
- ]
- )
- else:
- self.downsamplers = None
-
- self.gradient_checkpointing = False
-
- def forward(self, hidden_states, temb=None, scale: float = 1.0, num_frames=1):
- output_states = ()
-
- blocks = zip(self.resnets, self.motion_modules)
- for resnet, motion_module in blocks:
- if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
-
- def create_custom_forward(module):
- def custom_forward(*inputs):
- return module(*inputs)
-
- return custom_forward
-
- hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
- hidden_states = recompute(
- create_custom_forward(motion_module), hidden_states.requires_grad_(), temb, num_frames
- )
- else:
- hidden_states = resnet(hidden_states, temb, scale=scale)
- hidden_states = motion_module(hidden_states, num_frames=num_frames)[0]
-
- output_states = output_states + (hidden_states,)
-
- if self.downsamplers is not None:
- for downsampler in self.downsamplers:
- hidden_states = downsampler(hidden_states, scale=scale)
-
- output_states = output_states + (hidden_states,)
-
- return hidden_states, output_states
-
-
-class CrossAttnDownBlockMotion(nn.Layer):
- def __init__(
- self,
- in_channels: int,
- out_channels: int,
- temb_channels: int,
- dropout: float = 0.0,
- num_layers: int = 1,
- transformer_layers_per_block: int = 1,
- resnet_eps: float = 1e-6,
- resnet_time_scale_shift: str = "default",
- resnet_act_fn: str = "swish",
- resnet_groups: int = 32,
- resnet_pre_norm: bool = True,
- num_attention_heads=1,
- cross_attention_dim=1280,
- output_scale_factor=1.0,
- downsample_padding=1,
- add_downsample=True,
- dual_cross_attention=False,
- use_linear_projection=False,
- only_cross_attention=False,
- upcast_attention=False,
- attention_type="default",
- temporal_cross_attention_dim=None,
- temporal_num_attention_heads=8,
- temporal_max_seq_length=32,
- ):
- super().__init__()
- resnets = []
- attentions = []
- motion_modules = []
-
- self.has_cross_attention = True
- self.num_attention_heads = num_attention_heads
-
- for i in range(num_layers):
- in_channels = in_channels if i == 0 else out_channels
- resnets.append(
- ResnetBlock2D(
- in_channels=in_channels,
- out_channels=out_channels,
- temb_channels=temb_channels,
- eps=resnet_eps,
- groups=resnet_groups,
- dropout=dropout,
- time_embedding_norm=resnet_time_scale_shift,
- non_linearity=resnet_act_fn,
- output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm,
- )
- )
-
- if not dual_cross_attention:
- attentions.append(
- Transformer2DModel(
- num_attention_heads,
- out_channels // num_attention_heads,
- in_channels=out_channels,
- num_layers=transformer_layers_per_block,
- cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups,
- use_linear_projection=use_linear_projection,
- only_cross_attention=only_cross_attention,
- upcast_attention=upcast_attention,
- attention_type=attention_type,
- )
- )
- else:
- attentions.append(
- DualTransformer2DModel(
- num_attention_heads,
- out_channels // num_attention_heads,
- in_channels=out_channels,
- num_layers=1,
- cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups,
- )
- )
-
- motion_modules.append(
- TransformerTemporalModel(
- num_attention_heads=temporal_num_attention_heads,
- in_channels=out_channels,
- norm_num_groups=resnet_groups,
- cross_attention_dim=temporal_cross_attention_dim,
- attention_bias=False,
- activation_fn="geglu",
- positional_embeddings="sinusoidal",
- num_positional_embeddings=temporal_max_seq_length,
- attention_head_dim=out_channels // temporal_num_attention_heads,
- )
- )
-
- self.attentions = nn.LayerList(attentions)
- self.resnets = nn.LayerList(resnets)
- self.motion_modules = nn.LayerList(motion_modules)
-
- if add_downsample:
- self.downsamplers = nn.LayerList(
- [
- Downsample2D(
- out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
- )
- ]
- )
- else:
- self.downsamplers = None
-
- self.gradient_checkpointing = False
-
- def forward(
- self,
- hidden_states,
- temb=None,
- encoder_hidden_states=None,
- attention_mask=None,
- num_frames=1,
- encoder_attention_mask=None,
- cross_attention_kwargs=None,
- additional_residuals=None,
- ):
- output_states = ()
-
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
-
- blocks = list(zip(self.resnets, self.attentions, self.motion_modules))
- for i, (resnet, attn, motion_module) in enumerate(blocks):
- if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
-
- def create_custom_forward(module, return_dict=None):
- def custom_forward(*inputs):
- if return_dict is not None:
- return module(*inputs, return_dict=return_dict)[0] # move [0] when paddlepaddle <= 2.4.1
- else:
- return module(*inputs)
-
- return custom_forward
-
- hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
- hidden_states = recompute(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- attention_mask,
- encoder_attention_mask,
- ) # [0]
- else:
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
- hidden_states = attn(
- hidden_states,
- encoder_hidden_states=encoder_hidden_states,
- cross_attention_kwargs=cross_attention_kwargs,
- attention_mask=attention_mask,
- encoder_attention_mask=encoder_attention_mask,
- return_dict=False,
- )[0]
- hidden_states = motion_module(
- hidden_states,
- num_frames=num_frames,
- )[0]
-
- # apply additional residuals to the output of the last pair of resnet and attention blocks
- if i == len(blocks) - 1 and additional_residuals is not None:
- hidden_states = hidden_states + additional_residuals
-
- output_states = output_states + (hidden_states,)
-
- if self.downsamplers is not None:
- for downsampler in self.downsamplers:
- hidden_states = downsampler(hidden_states, scale=lora_scale)
-
- output_states = output_states + (hidden_states,)
-
- return hidden_states, output_states
-
-
-class CrossAttnUpBlockMotion(nn.Layer):
- def __init__(
- self,
- in_channels: int,
- out_channels: int,
- prev_output_channel: int,
- temb_channels: int,
- resolution_idx: int = None,
- dropout: float = 0.0,
- num_layers: int = 1,
- transformer_layers_per_block: int = 1,
- resnet_eps: float = 1e-6,
- resnet_time_scale_shift: str = "default",
- resnet_act_fn: str = "swish",
- resnet_groups: int = 32,
- resnet_pre_norm: bool = True,
- num_attention_heads=1,
- cross_attention_dim=1280,
- output_scale_factor=1.0,
- add_upsample=True,
- dual_cross_attention=False,
- use_linear_projection=False,
- only_cross_attention=False,
- upcast_attention=False,
- attention_type="default",
- temporal_cross_attention_dim=None,
- temporal_num_attention_heads=8,
- temporal_max_seq_length=32,
- ):
- super().__init__()
- resnets = []
- attentions = []
- motion_modules = []
-
- self.has_cross_attention = True
- self.num_attention_heads = num_attention_heads
-
- for i in range(num_layers):
- res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
- resnet_in_channels = prev_output_channel if i == 0 else out_channels
-
- resnets.append(
- ResnetBlock2D(
- in_channels=resnet_in_channels + res_skip_channels,
- out_channels=out_channels,
- temb_channels=temb_channels,
- eps=resnet_eps,
- groups=resnet_groups,
- dropout=dropout,
- time_embedding_norm=resnet_time_scale_shift,
- non_linearity=resnet_act_fn,
- output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm,
- )
- )
-
- if not dual_cross_attention:
- attentions.append(
- Transformer2DModel(
- num_attention_heads,
- out_channels // num_attention_heads,
- in_channels=out_channels,
- num_layers=transformer_layers_per_block,
- cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups,
- use_linear_projection=use_linear_projection,
- only_cross_attention=only_cross_attention,
- upcast_attention=upcast_attention,
- attention_type=attention_type,
- )
- )
- else:
- attentions.append(
- DualTransformer2DModel(
- num_attention_heads,
- out_channels // num_attention_heads,
- in_channels=out_channels,
- num_layers=1,
- cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups,
- )
- )
- motion_modules.append(
- TransformerTemporalModel(
- num_attention_heads=temporal_num_attention_heads,
- in_channels=out_channels,
- norm_num_groups=resnet_groups,
- cross_attention_dim=temporal_cross_attention_dim,
- attention_bias=False,
- activation_fn="geglu",
- positional_embeddings="sinusoidal",
- num_positional_embeddings=temporal_max_seq_length,
- attention_head_dim=out_channels // temporal_num_attention_heads,
- )
- )
-
- self.attentions = nn.LayerList(attentions)
- self.resnets = nn.LayerList(resnets)
- self.motion_modules = nn.LayerList(motion_modules)
-
- if add_upsample:
- self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
- else:
- self.upsamplers = None
-
- self.gradient_checkpointing = False
- self.resolution_idx = resolution_idx
-
- def forward(
- self,
- hidden_states: paddle.Tensor,
- res_hidden_states_tuple: Tuple[paddle.Tensor, ...],
- temb: Optional[paddle.Tensor] = None,
- encoder_hidden_states: Optional[paddle.Tensor] = None,
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
- upsample_size: Optional[int] = None,
- attention_mask: Optional[paddle.Tensor] = None,
- encoder_attention_mask: Optional[paddle.Tensor] = None,
- num_frames=1,
- ):
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
- is_freeu_enabled = (
- getattr(self, "s1", None)
- and getattr(self, "s2", None)
- and getattr(self, "b1", None)
- and getattr(self, "b2", None)
- )
-
- blocks = zip(self.resnets, self.attentions, self.motion_modules)
- for resnet, attn, motion_module in blocks:
- # pop res hidden states
- res_hidden_states = res_hidden_states_tuple[-1]
- res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-
- # FreeU: Only operate on the first two stages
- if is_freeu_enabled:
- hidden_states, res_hidden_states = apply_freeu(
- self.resolution_idx,
- hidden_states,
- res_hidden_states,
- s1=self.s1,
- s2=self.s2,
- b1=self.b1,
- b2=self.b2,
- )
-
- hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
-
- if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
-
- def create_custom_forward(module, return_dict=None):
- def custom_forward(*inputs):
- if return_dict is not None:
- return module(*inputs, return_dict=return_dict)[0] # move [0] when paddlepaddle <= 2.4.1
- else:
- return module(*inputs)
-
- return custom_forward
-
- hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
- hidden_states = recompute(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- attention_mask,
- encoder_attention_mask,
- ) # [0]
- else:
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
- hidden_states = attn(
- hidden_states,
- encoder_hidden_states=encoder_hidden_states,
- cross_attention_kwargs=cross_attention_kwargs,
- attention_mask=attention_mask,
- encoder_attention_mask=encoder_attention_mask,
- return_dict=False,
- )[0]
- hidden_states = motion_module(
- hidden_states,
- num_frames=num_frames,
- )[0]
-
- if self.upsamplers is not None:
- for upsampler in self.upsamplers:
- hidden_states = upsampler(hidden_states, upsample_size, scale=lora_scale)
-
- return hidden_states
-
-
-class UpBlockMotion(nn.Layer):
- def __init__(
- self,
- in_channels: int,
- prev_output_channel: int,
- out_channels: int,
- temb_channels: int,
- resolution_idx: int = None,
- dropout: float = 0.0,
- num_layers: int = 1,
- resnet_eps: float = 1e-6,
- resnet_time_scale_shift: str = "default",
- resnet_act_fn: str = "swish",
- resnet_groups: int = 32,
- resnet_pre_norm: bool = True,
- output_scale_factor=1.0,
- add_upsample=True,
- temporal_norm_num_groups=32,
- temporal_cross_attention_dim=None,
- temporal_num_attention_heads=8,
- temporal_max_seq_length=32,
- ):
- super().__init__()
- resnets = []
- motion_modules = []
-
- for i in range(num_layers):
- res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
- resnet_in_channels = prev_output_channel if i == 0 else out_channels
-
- resnets.append(
- ResnetBlock2D(
- in_channels=resnet_in_channels + res_skip_channels,
- out_channels=out_channels,
- temb_channels=temb_channels,
- eps=resnet_eps,
- groups=resnet_groups,
- dropout=dropout,
- time_embedding_norm=resnet_time_scale_shift,
- non_linearity=resnet_act_fn,
- output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm,
- )
- )
-
- motion_modules.append(
- TransformerTemporalModel(
- num_attention_heads=temporal_num_attention_heads,
- in_channels=out_channels,
- norm_num_groups=temporal_norm_num_groups,
- cross_attention_dim=temporal_cross_attention_dim,
- attention_bias=False,
- activation_fn="geglu",
- positional_embeddings="sinusoidal",
- num_positional_embeddings=temporal_max_seq_length,
- attention_head_dim=out_channels // temporal_num_attention_heads,
- )
- )
-
- self.resnets = nn.LayerList(resnets)
- self.motion_modules = nn.LayerList(motion_modules)
-
- if add_upsample:
- self.upsamplers = nn.LayerList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)])
- else:
- self.upsamplers = None
-
- self.gradient_checkpointing = False
- self.resolution_idx = resolution_idx
-
- def forward(
- self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, scale: float = 1.0, num_frames=1
- ):
- is_freeu_enabled = (
- getattr(self, "s1", None)
- and getattr(self, "s2", None)
- and getattr(self, "b1", None)
- and getattr(self, "b2", None)
- )
-
- blocks = zip(self.resnets, self.motion_modules)
-
- for resnet, motion_module in blocks:
- # pop res hidden states
- res_hidden_states = res_hidden_states_tuple[-1]
- res_hidden_states_tuple = res_hidden_states_tuple[:-1]
-
- # FreeU: Only operate on the first two stages
- if is_freeu_enabled:
- hidden_states, res_hidden_states = apply_freeu(
- self.resolution_idx,
- hidden_states,
- res_hidden_states,
- s1=self.s1,
- s2=self.s2,
- b1=self.b1,
- b2=self.b2,
- )
-
- hidden_states = paddle.concat([hidden_states, res_hidden_states], axis=1)
-
- if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
-
- def create_custom_forward(module):
- def custom_forward(*inputs):
- return module(*inputs)
-
- return custom_forward
-
- hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
- hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
-
- else:
- hidden_states = resnet(hidden_states, temb, scale=scale)
- hidden_states = motion_module(hidden_states, num_frames=num_frames)[0]
-
- if self.upsamplers is not None:
- for upsampler in self.upsamplers:
- hidden_states = upsampler(hidden_states, upsample_size, scale=scale)
-
- return hidden_states
-
-
-class UNetMidBlockCrossAttnMotion(nn.Layer):
- def __init__(
- self,
- in_channels: int,
- temb_channels: int,
- dropout: float = 0.0,
- num_layers: int = 1,
- transformer_layers_per_block: int = 1,
- resnet_eps: float = 1e-6,
- resnet_time_scale_shift: str = "default",
- resnet_act_fn: str = "swish",
- resnet_groups: int = 32,
- resnet_pre_norm: bool = True,
- num_attention_heads=1,
- output_scale_factor=1.0,
- cross_attention_dim=1280,
- dual_cross_attention=False,
- use_linear_projection=False,
- upcast_attention=False,
- attention_type="default",
- temporal_num_attention_heads=1,
- temporal_cross_attention_dim=None,
- temporal_max_seq_length=32,
- ):
- super().__init__()
-
- self.has_cross_attention = True
- self.num_attention_heads = num_attention_heads
- resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
-
- # there is always at least one resnet
- resnets = [
- ResnetBlock2D(
- in_channels=in_channels,
- out_channels=in_channels,
- temb_channels=temb_channels,
- eps=resnet_eps,
- groups=resnet_groups,
- dropout=dropout,
- time_embedding_norm=resnet_time_scale_shift,
- non_linearity=resnet_act_fn,
- output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm,
- )
- ]
- attentions = []
- motion_modules = []
-
- for _ in range(num_layers):
- if not dual_cross_attention:
- attentions.append(
- Transformer2DModel(
- num_attention_heads,
- in_channels // num_attention_heads,
- in_channels=in_channels,
- num_layers=transformer_layers_per_block,
- cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups,
- use_linear_projection=use_linear_projection,
- upcast_attention=upcast_attention,
- attention_type=attention_type,
- )
- )
- else:
- attentions.append(
- DualTransformer2DModel(
- num_attention_heads,
- in_channels // num_attention_heads,
- in_channels=in_channels,
- num_layers=1,
- cross_attention_dim=cross_attention_dim,
- norm_num_groups=resnet_groups,
- )
- )
- resnets.append(
- ResnetBlock2D(
- in_channels=in_channels,
- out_channels=in_channels,
- temb_channels=temb_channels,
- eps=resnet_eps,
- groups=resnet_groups,
- dropout=dropout,
- time_embedding_norm=resnet_time_scale_shift,
- non_linearity=resnet_act_fn,
- output_scale_factor=output_scale_factor,
- pre_norm=resnet_pre_norm,
- )
- )
- motion_modules.append(
- TransformerTemporalModel(
- num_attention_heads=temporal_num_attention_heads,
- attention_head_dim=in_channels // temporal_num_attention_heads,
- in_channels=in_channels,
- norm_num_groups=resnet_groups,
- cross_attention_dim=temporal_cross_attention_dim,
- attention_bias=False,
- positional_embeddings="sinusoidal",
- num_positional_embeddings=temporal_max_seq_length,
- activation_fn="geglu",
- )
- )
-
- self.attentions = nn.LayerList(attentions)
- self.resnets = nn.LayerList(resnets)
- self.motion_modules = nn.LayerList(motion_modules)
-
- self.gradient_checkpointing = False
-
- def forward(
- self,
- hidden_states: paddle.Tensor,
- temb: Optional[paddle.Tensor] = None,
- encoder_hidden_states: Optional[paddle.Tensor] = None,
- attention_mask: Optional[paddle.Tensor] = None,
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
- encoder_attention_mask: Optional[paddle.Tensor] = None,
- num_frames=1,
- ) -> paddle.Tensor:
- lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
- hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale)
-
- blocks = zip(self.attentions, self.resnets[1:], self.motion_modules)
- for attn, resnet, motion_module in blocks:
- if self.training and self.gradient_checkpointing and not hidden_states.stop_gradient:
-
- def create_custom_forward(module, return_dict=None):
- def custom_forward(*inputs):
- if return_dict is not None:
- return module(*inputs, return_dict=return_dict)
- else:
- return module(*inputs)
-
- return custom_forward
-
- hidden_states = recompute(
- create_custom_forward(attn, return_dict=False),
- hidden_states,
- encoder_hidden_states,
- cross_attention_kwargs,
- attention_mask,
- encoder_attention_mask,
- ) # [0]
- hidden_states = recompute(create_custom_forward(motion_module), hidden_states, temb)
- hidden_states = recompute(create_custom_forward(resnet), hidden_states, temb)
- else:
- hidden_states = attn(
- hidden_states,
- encoder_hidden_states=encoder_hidden_states,
- cross_attention_kwargs=cross_attention_kwargs,
- attention_mask=attention_mask,
- encoder_attention_mask=encoder_attention_mask,
- return_dict=False,
- )[0]
- hidden_states = motion_module(
- hidden_states,
- num_frames=num_frames,
- )[0]
- hidden_states = resnet(hidden_states, temb, scale=lora_scale)
-
return hidden_states
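For orientation, the Motion blocks deleted above interleave a spatial ResnetBlock2D with a TransformerTemporalModel ("motion module") in each layer; a stand-in sketch of that interleaving, with placeholder layers so the example stays self-contained:

import paddle
import paddle.nn as nn

class TinyMotionBlock(nn.Layer):
    def __init__(self, channels: int, num_layers: int = 2):
        super().__init__()
        # placeholders for ResnetBlock2D and TransformerTemporalModel
        self.resnets = nn.LayerList([nn.Conv2D(channels, channels, 3, padding=1) for _ in range(num_layers)])
        self.motion_modules = nn.LayerList([nn.Identity() for _ in range(num_layers)])

    def forward(self, hidden_states, num_frames=1):
        for resnet, motion_module in zip(self.resnets, self.motion_modules):
            hidden_states = resnet(hidden_states)         # spatial mixing
            hidden_states = motion_module(hidden_states)  # temporal mixing over num_frames
        return hidden_states

x = paddle.randn([2 * 4, 8, 16, 16])  # (batch * frames, channels, h, w)
print(TinyMotionBlock(8)(x, num_frames=4).shape)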
diff --git a/ppdiffusers/ppdiffusers/models/unet_3d_condition.py b/ppdiffusers/ppdiffusers/models/unet_3d_condition.py
index 0ad7c5875..1224a5c29 100644
--- a/ppdiffusers/ppdiffusers/models/unet_3d_condition.py
+++ b/ppdiffusers/ppdiffusers/models/unet_3d_condition.py
@@ -22,13 +22,7 @@
from ..configuration_utils import ConfigMixin, register_to_config
from ..loaders import UNet2DConditionLoadersMixin
from ..utils import BaseOutput, logging
-from .attention_processor import (
- ADDED_KV_ATTENTION_PROCESSORS,
- CROSS_ATTENTION_PROCESSORS,
- AttentionProcessor,
- AttnAddedKVProcessor,
- AttnProcessor,
-)
+from .attention_processor import AttentionProcessor, AttnProcessor
from .embeddings import TimestepEmbedding, Timesteps
from .modeling_utils import ModelMixin
from .transformer_temporal import TransformerTemporalModel
@@ -241,12 +235,9 @@ def __init__(
cross_attention_dim=cross_attention_dim,
num_attention_heads=reversed_num_attention_heads[i],
dual_cross_attention=False,
- resolution_idx=i,
)
self.up_blocks.append(up_block)
prev_output_channel = output_channel
-
- # out
if norm_num_groups is not None:
self.conv_norm_out = nn.GroupNorm(
num_channels=block_out_channels[0],
@@ -277,8 +268,8 @@ def attn_processors(self) -> Dict[str, AttentionProcessor]:
processors = {}
def fn_recursive_add_processors(name: str, module: nn.Layer, processors: Dict[str, AttentionProcessor]):
- if hasattr(module, "get_processor"):
- processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+ if hasattr(module, "set_processor"):
+ processors[f"{name}.processor"] = module.processor
for sub_name, child in module.named_children():
fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
@@ -357,9 +348,7 @@ def fn_recursive_set_attention_slice(module: nn.Layer, slice_size: List[int]):
fn_recursive_set_attention_slice(module, reversed_slice_size)
# Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
- def set_attn_processor(
- self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
- ):
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
r"""
Sets the attention processor to use to compute attention.
@@ -383,9 +372,9 @@ def set_attn_processor(
def fn_recursive_attn_processor(name: str, module: nn.Layer, processor):
if hasattr(module, "set_processor"):
if not isinstance(processor, dict):
- module.set_processor(processor, _remove_lora=_remove_lora)
+ module.set_processor(processor)
else:
- module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
+ module.set_processor(processor.pop(f"{name}.processor"))
for sub_name, child in module.named_children():
fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
@@ -439,55 +428,12 @@ def set_default_attn_processor(self):
"""
Disables custom attention processors and sets the default attention implementation.
"""
- if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
- processor = AttnAddedKVProcessor()
- elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
- processor = AttnProcessor()
- else:
- raise ValueError(
- f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
- )
-
- self.set_attn_processor(processor)
+ self.set_attn_processor(AttnProcessor())
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
module.gradient_checkpointing = value
- # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.enable_freeu
- def enable_freeu(self, s1, s2, b1, b2):
- r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stage blocks where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
- are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate the "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate the "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- for i, upsample_block in enumerate(self.up_blocks):
- setattr(upsample_block, "s1", s1)
- setattr(upsample_block, "s2", s2)
- setattr(upsample_block, "b1", b1)
- setattr(upsample_block, "b2", b2)
-
- # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism."""
- freeu_keys = {"s1", "s2", "b1", "b2"}
- for i, upsample_block in enumerate(self.up_blocks):
- for k in freeu_keys:
- if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
- setattr(upsample_block, k, None)
-
def forward(
self,
sample: paddle.Tensor,
@@ -510,23 +456,6 @@ def forward(
timestep (`paddle.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
encoder_hidden_states (`paddle.Tensor`):
The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
- class_labels (`paddle.Tensor`, *optional*, defaults to `None`):
- Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
- timestep_cond: (`paddle.Tensor`, *optional*, defaults to `None`):
- Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
- through the `self.time_embedding` layer to obtain the timestep embeddings.
- attention_mask (`paddle.Tensor`, *optional*, defaults to `None`):
- An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
- is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
- negative values to the attention scores corresponding to "discard" tokens.
- cross_attention_kwargs (`dict`, *optional*):
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
- `self.processor` in
- [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
- down_block_additional_residuals: (`tuple` of `paddle.Tensor`, *optional*):
- A tuple of tensors that if specified are added to the residuals of down unet blocks.
- mid_block_additional_residual: (`paddle.Tensor`, *optional*):
- A tensor that if specified is added to the residual of the middle unet block.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
tuple.
@@ -584,8 +513,6 @@ def forward(
emb = self.time_embedding(t_emb, timestep_cond)
emb = emb.repeat_interleave(repeats=num_frames, axis=0)
encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, axis=0)
-
- # 2. pre-process
sample = sample.transpose([0, 2, 1, 3, 4]).reshape(
(sample.shape[0] * num_frames, -1) + tuple(sample.shape[3:])
)
@@ -596,7 +523,6 @@ def forward(
cross_attention_kwargs=cross_attention_kwargs,
return_dict=False,
)[0]
-
# 3. down
down_block_res_samples = (sample,)
for downsample_block in self.down_blocks:
diff --git a/ppdiffusers/ppdiffusers/models/unet_motion_model.py b/ppdiffusers/ppdiffusers/models/unet_motion_model.py
deleted file mode 100644
index 5a74528d5..000000000
--- a/ppdiffusers/ppdiffusers/models/unet_motion_model.py
+++ /dev/null
@@ -1,874 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Any, Dict, Optional, Tuple, Union
-
-import paddle
-import paddle.nn as nn
-
-from ..configuration_utils import ConfigMixin, register_to_config
-from ..loaders import UNet2DConditionLoadersMixin
-from ..utils import logging
-from .attention_processor import (
- ADDED_KV_ATTENTION_PROCESSORS,
- CROSS_ATTENTION_PROCESSORS,
- AttentionProcessor,
- AttnAddedKVProcessor,
- AttnProcessor,
-)
-from .embeddings import TimestepEmbedding, Timesteps
-from .modeling_utils import ModelMixin
-from .transformer_temporal import TransformerTemporalModel
-from .unet_2d_blocks import UNetMidBlock2DCrossAttn
-from .unet_2d_condition import UNet2DConditionModel
-from .unet_3d_blocks import (
- CrossAttnDownBlockMotion,
- CrossAttnUpBlockMotion,
- DownBlockMotion,
- UNetMidBlockCrossAttnMotion,
- UpBlockMotion,
- get_down_block,
- get_up_block,
-)
-from .unet_3d_condition import UNet3DConditionOutput
-
-logger = logging.get_logger(__name__) # pylint: disable=invalid-name
-
-
-class MotionModules(nn.Layer):
- def __init__(
- self,
- in_channels,
- layers_per_block=2,
- num_attention_heads=8,
- attention_bias=False,
- cross_attention_dim=None,
- activation_fn="geglu",
- norm_num_groups=32,
- max_seq_length=32,
- ):
- super().__init__()
- self.motion_modules = nn.LayerList([])
-
- for i in range(layers_per_block):
- self.motion_modules.append(
- TransformerTemporalModel(
- in_channels=in_channels,
- norm_num_groups=norm_num_groups,
- cross_attention_dim=cross_attention_dim,
- activation_fn=activation_fn,
- attention_bias=attention_bias,
- num_attention_heads=num_attention_heads,
- attention_head_dim=in_channels // num_attention_heads,
- positional_embeddings="sinusoidal",
- num_positional_embeddings=max_seq_length,
- )
- )
-
-
-class MotionAdapter(ModelMixin, ConfigMixin):
- @register_to_config
- def __init__(
- self,
- block_out_channels=(320, 640, 1280, 1280),
- motion_layers_per_block=2,
- motion_mid_block_layers_per_block=1,
- motion_num_attention_heads=8,
- motion_norm_num_groups=32,
- motion_max_seq_length=32,
- use_motion_mid_block=True,
- ):
- """Container to store AnimateDiff Motion Modules
-
- Args:
- block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
- The tuple of output channels for each UNet block.
- motion_layers_per_block (`int`, *optional*, defaults to 2):
- The number of motion layers per UNet block.
- motion_mid_block_layers_per_block (`int`, *optional*, defaults to 1):
- The number of motion layers in the middle UNet block.
- motion_num_attention_heads (`int`, *optional*, defaults to 8):
- The number of heads to use in each attention layer of the motion module.
- motion_norm_num_groups (`int`, *optional*, defaults to 32):
- The number of groups to use in each group normalization layer of the motion module.
- motion_max_seq_length (`int`, *optional*, defaults to 32):
- The maximum sequence length to use in the motion module.
- use_motion_mid_block (`bool`, *optional*, defaults to True):
- Whether to use a motion module in the middle of the UNet.
- """
-
- super().__init__()
- down_blocks = []
- up_blocks = []
-
- for i, channel in enumerate(block_out_channels):
- output_channel = block_out_channels[i]
- down_blocks.append(
- MotionModules(
- in_channels=output_channel,
- norm_num_groups=motion_norm_num_groups,
- cross_attention_dim=None,
- activation_fn="geglu",
- attention_bias=False,
- num_attention_heads=motion_num_attention_heads,
- max_seq_length=motion_max_seq_length,
- layers_per_block=motion_layers_per_block,
- )
- )
-
- if use_motion_mid_block:
- self.mid_block = MotionModules(
- in_channels=block_out_channels[-1],
- norm_num_groups=motion_norm_num_groups,
- cross_attention_dim=None,
- activation_fn="geglu",
- attention_bias=False,
- num_attention_heads=motion_num_attention_heads,
- layers_per_block=motion_mid_block_layers_per_block,
- max_seq_length=motion_max_seq_length,
- )
- else:
- self.mid_block = None
-
- reversed_block_out_channels = list(reversed(block_out_channels))
- output_channel = reversed_block_out_channels[0]
- for i, channel in enumerate(reversed_block_out_channels):
- output_channel = reversed_block_out_channels[i]
- up_blocks.append(
- MotionModules(
- in_channels=output_channel,
- norm_num_groups=motion_norm_num_groups,
- cross_attention_dim=None,
- activation_fn="geglu",
- attention_bias=False,
- num_attention_heads=motion_num_attention_heads,
- max_seq_length=motion_max_seq_length,
- layers_per_block=motion_layers_per_block + 1,
- )
- )
-
- self.down_blocks = nn.LayerList(down_blocks)
- self.up_blocks = nn.LayerList(up_blocks)
-
- def forward(self, sample):
- pass
-
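As the docstring above describes, `MotionAdapter` only stacks one `MotionModules` container per entry in `block_out_channels` (plus an optional mid block), with `motion_layers_per_block` temporal transformers in each down block and one extra layer per up block. A standalone sketch of the resulting layer count for the default configuration (plain Python, no ppdiffusers import):

```python
# Standalone sketch: reproduce the layer layout MotionAdapter.__init__ builds
# from its default configuration.
block_out_channels = (320, 640, 1280, 1280)
motion_layers_per_block = 2
motion_mid_block_layers_per_block = 1
use_motion_mid_block = True

down_layers = [motion_layers_per_block for _ in block_out_channels]               # [2, 2, 2, 2]
up_layers = [motion_layers_per_block + 1 for _ in reversed(block_out_channels)]   # [3, 3, 3, 3]
mid_layers = motion_mid_block_layers_per_block if use_motion_mid_block else 0

print(sum(down_layers) + sum(up_layers) + mid_layers)  # 21 temporal transformer stacks in total
```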
-
-class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
- r"""
- A modified conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a
- sample shaped output.
-
- This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
- for all models (such as downloading or saving).
- """
- _supports_gradient_checkpointing = True
-
- @register_to_config
- def __init__(
- self,
- sample_size: Optional[int] = None,
- in_channels: int = 4,
- out_channels: int = 4,
- down_block_types: Tuple[str] = (
- "CrossAttnDownBlockMotion",
- "CrossAttnDownBlockMotion",
- "CrossAttnDownBlockMotion",
- "DownBlockMotion",
- ),
- up_block_types: Tuple[str] = (
- "UpBlockMotion",
- "CrossAttnUpBlockMotion",
- "CrossAttnUpBlockMotion",
- "CrossAttnUpBlockMotion",
- ),
- block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
- layers_per_block: int = 2,
- downsample_padding: int = 1,
- mid_block_scale_factor: float = 1,
- act_fn: str = "silu",
- norm_num_groups: Optional[int] = 32,
- norm_eps: float = 1e-5,
- cross_attention_dim: int = 1280,
- use_linear_projection: bool = False,
- num_attention_heads: Optional[Union[int, Tuple[int]]] = 8,
- motion_max_seq_length: Optional[int] = 32,
- motion_num_attention_heads: int = 8,
- use_motion_mid_block: int = True,
- ):
- super().__init__()
-
- self.sample_size = sample_size
-
- # Check inputs
- if len(down_block_types) != len(up_block_types):
- raise ValueError(
- f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
- )
-
- if len(block_out_channels) != len(down_block_types):
- raise ValueError(
- f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
- )
-
- if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
- raise ValueError(
- f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
- )
-
- # input
- conv_in_kernel = 3
- conv_out_kernel = 3
- conv_in_padding = (conv_in_kernel - 1) // 2
- self.conv_in = nn.Conv2D(
- in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
- )
-
- # time
- time_embed_dim = block_out_channels[0] * 4
- self.time_proj = Timesteps(block_out_channels[0], True, 0)
- timestep_input_dim = block_out_channels[0]
-
- self.time_embedding = TimestepEmbedding(
- timestep_input_dim,
- time_embed_dim,
- act_fn=act_fn,
- )
-
- # class embedding
- self.down_blocks = nn.LayerList([])
- self.up_blocks = nn.LayerList([])
-
- if isinstance(num_attention_heads, int):
- num_attention_heads = (num_attention_heads,) * len(down_block_types)
-
- # down
- output_channel = block_out_channels[0]
- for i, down_block_type in enumerate(down_block_types):
- input_channel = output_channel
- output_channel = block_out_channels[i]
- is_final_block = i == len(block_out_channels) - 1
-
- down_block = get_down_block(
- down_block_type,
- num_layers=layers_per_block,
- in_channels=input_channel,
- out_channels=output_channel,
- temb_channels=time_embed_dim,
- add_downsample=not is_final_block,
- resnet_eps=norm_eps,
- resnet_act_fn=act_fn,
- resnet_groups=norm_num_groups,
- cross_attention_dim=cross_attention_dim,
- num_attention_heads=num_attention_heads[i],
- downsample_padding=downsample_padding,
- use_linear_projection=use_linear_projection,
- dual_cross_attention=False,
- temporal_num_attention_heads=motion_num_attention_heads,
- temporal_max_seq_length=motion_max_seq_length,
- )
- self.down_blocks.append(down_block)
-
- # mid
- if use_motion_mid_block:
- self.mid_block = UNetMidBlockCrossAttnMotion(
- in_channels=block_out_channels[-1],
- temb_channels=time_embed_dim,
- resnet_eps=norm_eps,
- resnet_act_fn=act_fn,
- output_scale_factor=mid_block_scale_factor,
- cross_attention_dim=cross_attention_dim,
- num_attention_heads=num_attention_heads[-1],
- resnet_groups=norm_num_groups,
- dual_cross_attention=False,
- temporal_num_attention_heads=motion_num_attention_heads,
- temporal_max_seq_length=motion_max_seq_length,
- )
-
- else:
- self.mid_block = UNetMidBlock2DCrossAttn(
- in_channels=block_out_channels[-1],
- temb_channels=time_embed_dim,
- resnet_eps=norm_eps,
- resnet_act_fn=act_fn,
- output_scale_factor=mid_block_scale_factor,
- cross_attention_dim=cross_attention_dim,
- num_attention_heads=num_attention_heads[-1],
- resnet_groups=norm_num_groups,
- dual_cross_attention=False,
- )
-
- # count how many layers upsample the images
- self.num_upsamplers = 0
-
- # up
- reversed_block_out_channels = list(reversed(block_out_channels))
- reversed_num_attention_heads = list(reversed(num_attention_heads))
-
- output_channel = reversed_block_out_channels[0]
- for i, up_block_type in enumerate(up_block_types):
- is_final_block = i == len(block_out_channels) - 1
-
- prev_output_channel = output_channel
- output_channel = reversed_block_out_channels[i]
- input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
-
- # add upsample block for all BUT final layer
- if not is_final_block:
- add_upsample = True
- self.num_upsamplers += 1
- else:
- add_upsample = False
-
- up_block = get_up_block(
- up_block_type,
- num_layers=layers_per_block + 1,
- in_channels=input_channel,
- out_channels=output_channel,
- prev_output_channel=prev_output_channel,
- temb_channels=time_embed_dim,
- add_upsample=add_upsample,
- resnet_eps=norm_eps,
- resnet_act_fn=act_fn,
- resnet_groups=norm_num_groups,
- cross_attention_dim=cross_attention_dim,
- num_attention_heads=reversed_num_attention_heads[i],
- dual_cross_attention=False,
- resolution_idx=i,
- use_linear_projection=use_linear_projection,
- temporal_num_attention_heads=motion_num_attention_heads,
- temporal_max_seq_length=motion_max_seq_length,
- )
- self.up_blocks.append(up_block)
- prev_output_channel = output_channel
-
- # out
- if norm_num_groups is not None:
- self.conv_norm_out = nn.GroupNorm(
- num_channels=block_out_channels[0], num_groups=norm_num_groups, epsilon=norm_eps
- )
- self.conv_act = nn.Silu()
- else:
- self.conv_norm_out = None
- self.conv_act = None
-
- conv_out_padding = (conv_out_kernel - 1) // 2
- self.conv_out = nn.Conv2D(
- block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
- )
-
- @classmethod
- def from_unet2d(
- cls,
- unet: UNet2DConditionModel,
- motion_adapter: Optional[MotionAdapter] = None,
- load_weights: bool = True,
- ):
- has_motion_adapter = motion_adapter is not None
-
- # based on https://github.com/guoyww/AnimateDiff/blob/895f3220c06318ea0760131ec70408b466c49333/animatediff/models/unet.py#L459
- config = unet.config
- config["_class_name"] = cls.__name__
-
- down_blocks = []
- for down_blocks_type in config["down_block_types"]:
- if "CrossAttn" in down_blocks_type:
- down_blocks.append("CrossAttnDownBlockMotion")
- else:
- down_blocks.append("DownBlockMotion")
- config["down_block_types"] = down_blocks
-
- up_blocks = []
- for down_blocks_type in config["up_block_types"]:
- if "CrossAttn" in down_blocks_type:
- up_blocks.append("CrossAttnUpBlockMotion")
- else:
- up_blocks.append("UpBlockMotion")
-
- config["up_block_types"] = up_blocks
-
- if has_motion_adapter:
- config["motion_num_attention_heads"] = motion_adapter.config["motion_num_attention_heads"]
- config["motion_max_seq_length"] = motion_adapter.config["motion_max_seq_length"]
- config["use_motion_mid_block"] = motion_adapter.config["use_motion_mid_block"]
-
- # Need this for backwards compatibility with UNet2DConditionModel checkpoints
- if not config.get("num_attention_heads"):
- config["num_attention_heads"] = config["attention_head_dim"]
-
- model = cls.from_config(config)
-
- if not load_weights:
- return model
-
- model.conv_in.load_dict(unet.conv_in.state_dict())
- model.time_proj.load_dict(unet.time_proj.state_dict())
- model.time_embedding.load_dict(unet.time_embedding.state_dict())
-
- for i, down_block in enumerate(unet.down_blocks):
- model.down_blocks[i].resnets.load_dict(down_block.resnets.state_dict())
- if hasattr(model.down_blocks[i], "attentions"):
- model.down_blocks[i].attentions.load_dict(down_block.attentions.state_dict())
- if model.down_blocks[i].downsamplers:
- model.down_blocks[i].downsamplers.load_dict(down_block.downsamplers.state_dict())
-
- for i, up_block in enumerate(unet.up_blocks):
- model.up_blocks[i].resnets.load_dict(up_block.resnets.state_dict())
- if hasattr(model.up_blocks[i], "attentions"):
- model.up_blocks[i].attentions.load_dict(up_block.attentions.state_dict())
- if model.up_blocks[i].upsamplers:
- model.up_blocks[i].upsamplers.load_dict(up_block.upsamplers.state_dict())
-
- model.mid_block.resnets.load_dict(unet.mid_block.resnets.state_dict())
- model.mid_block.attentions.load_dict(unet.mid_block.attentions.state_dict())
-
- if unet.conv_norm_out is not None:
- model.conv_norm_out.load_dict(unet.conv_norm_out.state_dict())
- if unet.conv_act is not None:
- model.conv_act.load_dict(unet.conv_act.state_dict())
- model.conv_out.load_dict(unet.conv_out.state_dict())
-
- if has_motion_adapter:
- model.load_motion_modules(motion_adapter)
-
- # ensure that the Motion UNet is the same dtype as the UNet2DConditionModel
- model.to(dtype=unet.dtype)
-
- return model
-
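`from_unet2d` copies the spatial weights of an existing `UNet2DConditionModel` into the motion UNet and, if given, loads the adapter's temporal weights on top. A hedged usage sketch of the removed conversion path; `UNetMotionModel` and `MotionAdapter` are the pre-patch classes and are no longer exported after this change, so the commented lines would only run against an older ppdiffusers:

```python
# Hedged sketch of the removed conversion path. Only the UNet2DConditionModel
# part still exists after this patch; the motion classes are shown as comments.
from ppdiffusers import UNet2DConditionModel

unet2d = UNet2DConditionModel(
    sample_size=32,
    in_channels=4,
    out_channels=4,
    layers_per_block=1,
    block_out_channels=(32, 64),
    down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
    up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
    cross_attention_dim=32,
    attention_head_dim=8,
)

# Before this patch one could do:
# from ppdiffusers import MotionAdapter, UNetMotionModel
# adapter = MotionAdapter(block_out_channels=(32, 64))
# motion_unet = UNetMotionModel.from_unet2d(unet2d, motion_adapter=adapter)
# Spatial weights (conv_in, time embedding, resnets, attentions, conv_out) come
# from `unet2d`; only the motion_modules come from the adapter.
```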
- def freeze_unet2d_params(self):
- """Freeze the weights of just the UNet2DConditionModel, and leave the motion modules
- unfrozen for fine tuning.
- """
- # Freeze everything
- for param in self.parameters():
- param.stop_gradient = True
-
- # Unfreeze Motion Modules
- for down_block in self.down_blocks:
- motion_modules = down_block.motion_modules
- for param in motion_modules.parameters():
- param.stop_gradient = False
-
- for up_block in self.up_blocks:
- motion_modules = up_block.motion_modules
- for param in motion_modules.parameters():
- param.stop_gradient = False
-
- if hasattr(self.mid_block, "motion_modules"):
- motion_modules = self.mid_block.motion_modules
- for param in motion_modules.parameters():
- param.stop_gradient = False
-
- return
-
- def load_motion_modules(self, motion_adapter: Optional[MotionAdapter]):
- for i, down_block in enumerate(motion_adapter.down_blocks):
- self.down_blocks[i].motion_modules.load_dict(down_block.motion_modules.state_dict())
- for i, up_block in enumerate(motion_adapter.up_blocks):
- self.up_blocks[i].motion_modules.load_dict(up_block.motion_modules.state_dict())
-
- # to support older motion modules that don't have a mid_block
- if hasattr(self.mid_block, "motion_modules"):
- self.mid_block.motion_modules.load_dict(motion_adapter.mid_block.motion_modules.state_dict())
-
- def save_motion_modules(
- self,
- save_directory: str,
- is_main_process: bool = True,
- safe_serialization: bool = True,
- variant: Optional[str] = None,
- push_to_hub: bool = False,
- **kwargs,
- ):
- state_dict = self.state_dict()
-
- # Extract all motion modules
- motion_state_dict = {}
- for k, v in state_dict.items():
- if "motion_modules" in k:
- motion_state_dict[k] = v
-
- adapter = MotionAdapter(
- block_out_channels=self.config["block_out_channels"],
- motion_layers_per_block=self.config["layers_per_block"],
- motion_norm_num_groups=self.config["norm_num_groups"],
- motion_num_attention_heads=self.config["motion_num_attention_heads"],
- motion_max_seq_length=self.config["motion_max_seq_length"],
- use_motion_mid_block=self.config["use_motion_mid_block"],
- )
- adapter.load_dict(motion_state_dict)
- adapter.save_pretrained(
- save_directory=save_directory,
- is_main_process=is_main_process,
- safe_serialization=safe_serialization,
- variant=variant,
- push_to_hub=push_to_hub,
- **kwargs,
- )
-
- @property
- # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
- def attn_processors(self) -> Dict[str, AttentionProcessor]:
- r"""
- Returns:
- `dict` of attention processors: A dictionary containing all attention processors used in the model,
- indexed by their weight names.
- """
- # set recursively
- processors = {}
-
- def fn_recursive_add_processors(name: str, module: paddle.nn.Layer, processors: Dict[str, AttentionProcessor]):
- if hasattr(module, "get_processor"):
- processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
-
- for sub_name, child in module.named_children():
- fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
- return processors
-
- for name, module in self.named_children():
- fn_recursive_add_processors(name, module, processors)
-
- return processors
-
- # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor
- def set_attn_processor(
- self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]], _remove_lora=False
- ):
- r"""
- Sets the attention processor to use to compute attention.
-
- Parameters:
- processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
- The instantiated processor class or a dictionary of processor classes that will be set as the processor
- for **all** `Attention` layers.
-
- If `processor` is a dict, the key needs to define the path to the corresponding cross attention
- processor. This is strongly recommended when setting trainable attention processors.
-
- """
- count = len(self.attn_processors.keys())
-
- if isinstance(processor, dict) and len(processor) != count:
- raise ValueError(
- f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
- f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
- )
-
- def fn_recursive_attn_processor(name: str, module: paddle.nn.Layer, processor):
- if hasattr(module, "set_processor"):
- if not isinstance(processor, dict):
- module.set_processor(processor, _remove_lora=_remove_lora)
- else:
- module.set_processor(processor.pop(f"{name}.processor"), _remove_lora=_remove_lora)
-
- for sub_name, child in module.named_children():
- fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
- for name, module in self.named_children():
- fn_recursive_attn_processor(name, module, processor)
-
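Following the docstring above, `set_attn_processor` accepts either a single processor instance, applied to every `Attention` layer, or a dict with one entry per layer keyed by the `"<module path>.processor"` names that `attn_processors` returns. A hedged sketch (`model` is assumed to be any ppdiffusers model exposing these two members):

```python
# Hedged sketch: set attention processors globally or per layer. `model` is an
# assumption here (e.g. a UNet2DConditionModel instance).
from ppdiffusers.models.attention_processor import AttnProcessor


def use_default_processors(model):
    # A single instance is applied to every Attention layer in the model.
    model.set_attn_processor(AttnProcessor())


def use_per_layer_processors(model):
    # A dict must contain one entry per attention layer, keyed by the
    # "<module path>.processor" names returned by `attn_processors`.
    processors = {name: AttnProcessor() for name in model.attn_processors.keys()}
    model.set_attn_processor(processors)
```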
- # Copied from ppdiffusers.models.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
- def enable_forward_chunking(self, chunk_size=None, dim=0):
- """
- Sets the feed-forward layers to run in chunks ([feed forward
- chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers)) to reduce peak activation memory.
-
- Parameters:
- chunk_size (`int`, *optional*):
- The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
- over each tensor of dim=`dim`.
- dim (`int`, *optional*, defaults to `0`):
- The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
- or dim=1 (sequence length).
- """
- if dim not in [0, 1]:
- raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
-
- # By default chunk size is 1
- chunk_size = chunk_size or 1
-
- def fn_recursive_feed_forward(module: paddle.nn.Layer, chunk_size: int, dim: int):
- if hasattr(module, "set_chunk_feed_forward"):
- module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
-
- for child in module.children():
- fn_recursive_feed_forward(child, chunk_size, dim)
-
- for module in self.children():
- fn_recursive_feed_forward(module, chunk_size, dim)
-
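The chunking that `enable_forward_chunking` turns on applies the feed-forward block to slices along `dim` and concatenates the results, lowering peak activation memory at a small speed cost. A standalone numeric sketch of that computation (not the ppdiffusers implementation):

```python
# Standalone sketch of what feed-forward chunking computes: apply the block to
# slices along `dim`, then concatenate. Same output, smaller activation peak.
import paddle


def chunked_feed_forward(ff, hidden_states, chunk_size, dim=0):
    num_chunks = hidden_states.shape[dim] // chunk_size
    outputs = [ff(chunk) for chunk in hidden_states.chunk(num_chunks, axis=dim)]
    return paddle.concat(outputs, axis=dim)


ff = paddle.nn.Linear(16, 16)   # stand-in for a transformer FeedForward block
x = paddle.randn([8, 77, 16])   # (batch * num_frames, seq_len, dim)
y_chunked = chunked_feed_forward(ff, x, chunk_size=2, dim=0)
y_full = ff(x)
print(paddle.allclose(y_chunked, y_full))  # same result as the unchunked pass
```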
- # Copied from ppdiffusers.models.unet_3d_condition.UNet3DConditionModel.disable_forward_chunking
- def disable_forward_chunking(self):
- def fn_recursive_feed_forward(module: paddle.nn.Layer, chunk_size: int, dim: int):
- if hasattr(module, "set_chunk_feed_forward"):
- module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
-
- for child in module.children():
- fn_recursive_feed_forward(child, chunk_size, dim)
-
- for module in self.children():
- fn_recursive_feed_forward(module, None, 0)
-
- # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
- def set_default_attn_processor(self):
- """
- Disables custom attention processors and sets the default attention implementation.
- """
- if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
- processor = AttnAddedKVProcessor()
- elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
- processor = AttnProcessor()
- else:
- raise ValueError(
- f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
- )
-
- self.set_attn_processor(processor, _remove_lora=True)
-
- def _set_gradient_checkpointing(self, module, value=False):
- if isinstance(module, (CrossAttnDownBlockMotion, DownBlockMotion, CrossAttnUpBlockMotion, UpBlockMotion)):
- module.gradient_checkpointing = value
-
- # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.enable_freeu
- def enable_freeu(self, s1, s2, b1, b2):
- r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stage blocks where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that
- are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate the "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate the "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- for i, upsample_block in enumerate(self.up_blocks):
- setattr(upsample_block, "s1", s1)
- setattr(upsample_block, "s2", s2)
- setattr(upsample_block, "b1", b1)
- setattr(upsample_block, "b2", b2)
-
- # Copied from ppdiffusers.models.unet_2d_condition.UNet2DConditionModel.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism."""
- freeu_keys = {"s1", "s2", "b1", "b2"}
- for i, upsample_block in enumerate(self.up_blocks):
- for k in freeu_keys:
- if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
- setattr(upsample_block, k, None)
-
- def forward(
- self,
- sample: paddle.Tensor,
- timestep: Union[paddle.Tensor, float, int],
- encoder_hidden_states: paddle.Tensor,
- timestep_cond: Optional[paddle.Tensor] = None,
- attention_mask: Optional[paddle.Tensor] = None,
- cross_attention_kwargs: Optional[Dict[str, Any]] = None,
- down_block_additional_residuals: Optional[Tuple[paddle.Tensor]] = None,
- mid_block_additional_residual: Optional[paddle.Tensor] = None,
- return_dict: bool = True,
- ) -> Union[UNet3DConditionOutput, Tuple]:
- r"""
- The [`UNetMotionModel`] forward method.
-
- Args:
- sample (`paddle.Tensor`):
- The noisy input tensor with the following shape `(batch, channel, num_frames, height, width)`.
- timestep (`paddle.Tensor` or `float` or `int`): The number of timesteps to denoise an input.
- encoder_hidden_states (`paddle.Tensor`):
- The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
- timestep_cond: (`paddle.Tensor`, *optional*, defaults to `None`):
- Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed
- through the `self.time_embedding` layer to obtain the timestep embeddings.
- attention_mask (`paddle.Tensor`, *optional*, defaults to `None`):
- An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
- is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
- negative values to the attention scores corresponding to "discard" tokens.
- cross_attention_kwargs (`dict`, *optional*):
- A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
- `self.processor` in
- [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
- down_block_additional_residuals: (`tuple` of `paddle.Tensor`, *optional*):
- A tuple of tensors that if specified are added to the residuals of down unet blocks.
- mid_block_additional_residual: (`paddle.Tensor`, *optional*):
- A tensor that if specified is added to the residual of the middle unet block.
- return_dict (`bool`, *optional*, defaults to `True`):
- Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain
- tuple.
-
- Returns:
- [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`:
- If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise
- a `tuple` is returned where the first element is the sample tensor.
- """
- # By default samples have to be at least a multiple of the overall upsampling factor.
- # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
- # However, the upsampling interpolation output size can be forced to fit any upsampling size
- # on the fly if necessary.
- default_overall_up_factor = 2**self.num_upsamplers
-
- # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
- forward_upsample_size = False
- upsample_size = None
-
- if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
- logger.info("Forward upsample size to force interpolation output size.")
- forward_upsample_size = True
-
- # prepare attention_mask
- if attention_mask is not None:
- attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
- attention_mask = attention_mask.unsqueeze(1)
-
- # 1. time
- timesteps = timestep
- if not isinstance(timesteps, paddle.Tensor):
- # # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
- # # This would be a good case for the `match` statement (Python 3.10+)
- # is_mps = sample.device.type == "mps"
- # if isinstance(timestep, float):
- # dtype = paddle.float32 if is_mps else paddle.float64
- # else:
- # dtype = paddle.int32 if is_mps else paddle.int64
- timesteps = paddle.to_tensor([timesteps], dtype=paddle.int64, place=sample.place)
- elif len(timesteps.shape) == 0:
- timesteps = timesteps[None]
-
- # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
- num_frames = sample.shape[2]
- timesteps = timesteps.expand([sample.shape[0]])
-
- t_emb = self.time_proj(timesteps)
-
- # timesteps does not contain any weights and will always return f32 tensors
- # but time_embedding might actually be running in fp16. so we need to cast here.
- # there might be better ways to encapsulate this.
- t_emb = t_emb.cast(dtype=self.dtype) # paddle develop has .to()
-
- emb = self.time_embedding(t_emb, timestep_cond)
- emb = emb.repeat_interleave(repeats=num_frames, axis=0)
- encoder_hidden_states = encoder_hidden_states.repeat_interleave(repeats=num_frames, axis=0)
-
- # 2. pre-process
- shape1 = [sample.shape[0] * num_frames, -1] + sample.shape[3:]
- sample = sample.transpose((0, 2, 1, 3, 4)).reshape(shape1)
- sample = self.conv_in(sample)
-
- # 3. down
- down_block_res_samples = (sample,)
- for downsample_block in self.down_blocks:
- if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
- sample, res_samples = downsample_block(
- hidden_states=sample,
- temb=emb,
- encoder_hidden_states=encoder_hidden_states,
- attention_mask=attention_mask,
- num_frames=num_frames,
- cross_attention_kwargs=cross_attention_kwargs,
- )
- else:
- sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames)
-
- down_block_res_samples += res_samples
-
- if down_block_additional_residuals is not None:
- new_down_block_res_samples = ()
-
- for down_block_res_sample, down_block_additional_residual in zip(
- down_block_res_samples, down_block_additional_residuals
- ):
- down_block_res_sample = down_block_res_sample + down_block_additional_residual
- new_down_block_res_samples += (down_block_res_sample,)
-
- down_block_res_samples = new_down_block_res_samples
-
- # 4. mid
- if self.mid_block is not None:
- # To support older versions of motion modules that don't have a mid_block
- if hasattr(self.mid_block, "motion_modules"):
- sample = self.mid_block(
- sample,
- emb,
- encoder_hidden_states=encoder_hidden_states,
- attention_mask=attention_mask,
- num_frames=num_frames,
- cross_attention_kwargs=cross_attention_kwargs,
- )
- else:
- sample = self.mid_block(
- sample,
- emb,
- encoder_hidden_states=encoder_hidden_states,
- attention_mask=attention_mask,
- cross_attention_kwargs=cross_attention_kwargs,
- )
-
- if mid_block_additional_residual is not None:
- sample = sample + mid_block_additional_residual
-
- # 5. up
- for i, upsample_block in enumerate(self.up_blocks):
- is_final_block = i == len(self.up_blocks) - 1
-
- res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
- down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
-
- # if we have not reached the final block and need to forward the
- # upsample size, we do it here
- if not is_final_block and forward_upsample_size:
- upsample_size = down_block_res_samples[-1].shape[2:]
-
- if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
- sample = upsample_block(
- hidden_states=sample,
- temb=emb,
- res_hidden_states_tuple=res_samples,
- encoder_hidden_states=encoder_hidden_states,
- upsample_size=upsample_size,
- attention_mask=attention_mask,
- num_frames=num_frames,
- cross_attention_kwargs=cross_attention_kwargs,
- )
- else:
- sample = upsample_block(
- hidden_states=sample,
- temb=emb,
- res_hidden_states_tuple=res_samples,
- upsample_size=upsample_size,
- num_frames=num_frames,
- )
-
- # 6. post-process
- if self.conv_norm_out:
- sample = self.conv_norm_out(sample)
- sample = self.conv_act(sample)
-
- sample = self.conv_out(sample)
-
- # reshape to (batch, channel, framerate, width, height)
- sample = sample[None, :].reshape([-1, num_frames] + sample.shape[1:]).transpose((0, 2, 1, 3, 4))
-
- if not return_dict:
- return (sample,)
-
- return UNet3DConditionOutput(sample=sample)
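The forward pass above flattens the frame axis into the batch axis before the 2D blocks run and restores it at the end. A standalone shape sketch of that round trip, matching the transposes used in the deleted code:

```python
# Standalone sketch of the frame flattening in the forward pass above:
# (batch, channel, num_frames, height, width) -> (batch * num_frames, channel, height, width)
# before the 2D blocks, and back again afterwards.
import paddle

batch, channels, num_frames, height, width = 2, 4, 8, 32, 32
sample = paddle.randn([batch, channels, num_frames, height, width])

flat = sample.transpose([0, 2, 1, 3, 4]).reshape(
    [batch * num_frames, channels, height, width]
)
print(flat.shape)  # [16, 4, 32, 32]

restored = flat.reshape([batch, num_frames, channels, height, width]).transpose([0, 2, 1, 3, 4])
print(bool(paddle.all(restored == sample)))  # True: the round trip is lossless
```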
diff --git a/ppdiffusers/ppdiffusers/models/vae.py b/ppdiffusers/ppdiffusers/models/vae.py
index 3b08d1bf2..019cacee0 100644
--- a/ppdiffusers/ppdiffusers/models/vae.py
+++ b/ppdiffusers/ppdiffusers/models/vae.py
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
-from typing import Optional, Tuple
+from typing import Optional
import numpy as np
import paddle
@@ -21,14 +21,8 @@
from paddle.distributed.fleet.utils import recompute
from ..utils import BaseOutput, randn_tensor
-from .activations import get_activation
from .attention_processor import SpatialNorm
-from .unet_2d_blocks import (
- AutoencoderTinyBlock,
- UNetMidBlock2D,
- get_down_block,
- get_up_block,
-)
+from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block
try:
from paddle.amp.auto_cast import amp_state
@@ -59,39 +53,16 @@ class DecoderOutput(BaseOutput):
class Encoder(nn.Layer):
- r"""
- The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
-
- Args:
- in_channels (`int`, *optional*, defaults to 3):
- The number of input channels.
- out_channels (`int`, *optional*, defaults to 3):
- The number of output channels.
- down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
- The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available
- options.
- block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
- The number of output channels for each block.
- layers_per_block (`int`, *optional*, defaults to 2):
- The number of layers per block.
- norm_num_groups (`int`, *optional*, defaults to 32):
- The number of groups for normalization.
- act_fn (`str`, *optional*, defaults to `"silu"`):
- The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
- double_z (`bool`, *optional*, defaults to `True`):
- Whether to double the number of output channels for the last block.
- """
-
def __init__(
self,
- in_channels: int = 3,
- out_channels: int = 3,
- down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
- block_out_channels: Tuple[int, ...] = (64,),
- layers_per_block: int = 2,
- norm_num_groups: int = 32,
- act_fn: str = "silu",
- double_z: bool = True,
+ in_channels=3,
+ out_channels=3,
+ down_block_types=("DownEncoderBlock2D",),
+ block_out_channels=(64,),
+ layers_per_block=2,
+ norm_num_groups=32,
+ act_fn="silu",
+ double_z=True,
):
super().__init__()
self.layers_per_block = layers_per_block
@@ -145,9 +116,8 @@ def __init__(
self.conv_out = nn.Conv2D(block_out_channels[-1], conv_out_channels, 3, padding=1)
self.gradient_checkpointing = False
- def forward(self, sample: paddle.Tensor) -> paddle.Tensor:
- r"""The forward method of the `Encoder` class."""
-
+ def forward(self, x):
+ sample = x
sample = self.conv_in(sample)
if self.training and self.gradient_checkpointing and not sample.stop_gradient:
@@ -182,38 +152,16 @@ def custom_forward(*inputs):
class Decoder(nn.Layer):
- r"""
- The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.
-
- Args:
- in_channels (`int`, *optional*, defaults to 3):
- The number of input channels.
- out_channels (`int`, *optional*, defaults to 3):
- The number of output channels.
- up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
- The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
- block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
- The number of output channels for each block.
- layers_per_block (`int`, *optional*, defaults to 2):
- The number of layers per block.
- norm_num_groups (`int`, *optional*, defaults to 32):
- The number of groups for normalization.
- act_fn (`str`, *optional*, defaults to `"silu"`):
- The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
- norm_type (`str`, *optional*, defaults to `"group"`):
- The normalization type to use. Can be either `"group"` or `"spatial"`.
- """
-
def __init__(
self,
- in_channels: int = 3,
- out_channels: int = 3,
- up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
- block_out_channels: Tuple[int, ...] = (64,),
- layers_per_block: int = 2,
- norm_num_groups: int = 32,
- act_fn: str = "silu",
- norm_type: str = "group", # group, spatial
+ in_channels=3,
+ out_channels=3,
+ up_block_types=("UpDecoderBlock2D",),
+ block_out_channels=(64,),
+ layers_per_block=2,
+ norm_num_groups=32,
+ act_fn="silu",
+ norm_type="group", # group, spatial
):
super().__init__()
self.layers_per_block = layers_per_block
@@ -273,9 +221,8 @@ def __init__(
self.conv_out = nn.Conv2D(block_out_channels[0], out_channels, 3, padding=1)
self.gradient_checkpointing = False
- def forward(self, sample: paddle.Tensor, latent_embeds: Optional[paddle.Tensor] = None) -> paddle.Tensor:
- r"""The forward method of the `Decoder` class."""
-
+ def forward(self, z, latent_embeds=None):
+ sample = z
sample = self.conv_in(sample)
upscale_dtype = self.up_blocks.dtype
@@ -323,16 +270,6 @@ def custom_forward(*inputs):
class UpSample(nn.Layer):
- r"""
- The `UpSample` layer of a variational autoencoder that upsamples its input.
-
- Args:
- in_channels (`int`, *optional*, defaults to 3):
- The number of input channels.
- out_channels (`int`, *optional*, defaults to 3):
- The number of output channels.
- """
-
def __init__(
self,
in_channels: int,
@@ -344,7 +281,6 @@ def __init__(
self.deconv = nn.Conv2DTranspose(in_channels, out_channels, kernel_size=4, stride=2, padding=1)
def forward(self, x: paddle.Tensor) -> paddle.Tensor:
- r"""The forward method of the `UpSample` class."""
x = paddle.nn.functional.relu(x)
x = self.deconv(x)
return x
@@ -393,7 +329,6 @@ def __init__(
self.layers = nn.Sequential(*layers)
def forward(self, x: paddle.Tensor, mask=None) -> paddle.Tensor:
- r"""The forward method of the `MaskConditionEncoder` class."""
out = {}
for l in range(len(self.layers)):
layer = self.layers[l]
@@ -404,38 +339,19 @@ def forward(self, x: paddle.Tensor, mask=None) -> paddle.Tensor:
class MaskConditionDecoder(nn.Layer):
- r"""The `MaskConditionDecoder` should be used in combination with [`AsymmetricAutoencoderKL`] to enhance the model's
- decoder with a conditioner on the mask and masked image.
-
- Args:
- in_channels (`int`, *optional*, defaults to 3):
- The number of input channels.
- out_channels (`int`, *optional*, defaults to 3):
- The number of output channels.
- up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
- The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options.
- block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
- The number of output channels for each block.
- layers_per_block (`int`, *optional*, defaults to 2):
- The number of layers per block.
- norm_num_groups (`int`, *optional*, defaults to 32):
- The number of groups for normalization.
- act_fn (`str`, *optional*, defaults to `"silu"`):
- The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
- norm_type (`str`, *optional*, defaults to `"group"`):
- The normalization type to use. Can be either `"group"` or `"spatial"`.
- """
+ """The `MaskConditionDecoder` should be used in combination with [`AsymmetricAutoencoderKL`] to enhance the model's
+ decoder with a conditioner on the mask and masked image."""
def __init__(
self,
- in_channels: int = 3,
- out_channels: int = 3,
- up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
- block_out_channels: Tuple[int, ...] = (64,),
- layers_per_block: int = 2,
- norm_num_groups: int = 32,
- act_fn: str = "silu",
- norm_type: str = "group", # group, spatial
+ in_channels=3,
+ out_channels=3,
+ up_block_types=("UpDecoderBlock2D",),
+ block_out_channels=(64,),
+ layers_per_block=2,
+ norm_num_groups=32,
+ act_fn="silu",
+ norm_type="group", # group, spatial
):
super().__init__()
self.layers_per_block = layers_per_block
@@ -510,14 +426,7 @@ def __init__(
self.gradient_checkpointing = False
- def forward(
- self,
- z: paddle.Tensor,
- image: Optional[paddle.Tensor] = None,
- mask: Optional[paddle.Tensor] = None,
- latent_embeds: Optional[paddle.Tensor] = None,
- ) -> paddle.Tensor:
- r"""The forward method of the `MaskConditionDecoder` class."""
+ def forward(self, z, image=None, mask=None, latent_embeds=None):
sample = z
sample = self.conv_in(sample)
@@ -589,14 +498,7 @@ class VectorQuantizer(nn.Layer):
# backwards compatibility we use the buggy version by default, but you can
# specify legacy=False to fix it.
def __init__(
- self,
- n_e: int,
- vq_embed_dim: int,
- beta: float,
- remap=None,
- unknown_index: str = "random",
- sane_index_shape: bool = False,
- legacy: bool = True,
+ self, n_e, vq_embed_dim, beta, remap=None, unknown_index="random", sane_index_shape=False, legacy=True
):
super().__init__()
self.n_e = n_e
@@ -625,7 +527,7 @@ def __init__(
self.sane_index_shape = sane_index_shape
- def remap_to_used(self, inds: paddle.Tensor) -> paddle.Tensor:
+ def remap_to_used(self, inds):
ishape = inds.shape
assert len(ishape) > 1
inds = inds.reshape([ishape[0], -1])
@@ -639,7 +541,7 @@ def remap_to_used(self, inds: paddle.Tensor) -> paddle.Tensor:
new[unknown] = self.unknown_index
return new.reshape(ishape)
- def unmap_to_all(self, inds: paddle.Tensor) -> paddle.Tensor:
+ def unmap_to_all(self, inds):
ishape = inds.shape
assert len(ishape) > 1
inds = inds.reshape([ishape[0], -1])
@@ -649,7 +551,7 @@ def unmap_to_all(self, inds: paddle.Tensor) -> paddle.Tensor:
back = paddle.take_along_axis(used[None, :][inds.shape[0] * [0], :], inds, axis=1)
return back.reshape(ishape)
- def forward(self, z: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor, Tuple]:
+ def forward(self, z):
# reshape z -> (batch, height, width, channel) and flatten
z = z.transpose([0, 2, 3, 1])
z_flattened = z.reshape([-1, self.vq_embed_dim])
@@ -688,7 +590,7 @@ def forward(self, z: paddle.Tensor) -> Tuple[paddle.Tensor, paddle.Tensor, Tuple
return z_q, loss, (perplexity, min_encodings, min_encoding_indices)
- def get_codebook_entry(self, indices: paddle.Tensor, shape: Tuple[int, ...]) -> paddle.Tensor:
+ def get_codebook_entry(self, indices, shape):
# shape specifying (batch, height, width, channel)
if self.remap is not None:
indices = indices.reshape([shape[0], -1]) # add batch axis
@@ -711,7 +613,7 @@ def get_codebook_entry(self, indices: paddle.Tensor, shape: Tuple[int, ...]) ->
class DiagonalGaussianDistribution(object):
- def __init__(self, parameters: paddle.Tensor, deterministic: bool = False):
+ def __init__(self, parameters, deterministic=False):
self.parameters = parameters
self.mean, self.logvar = paddle.chunk(parameters, 2, axis=1)
self.logvar = paddle.clip(self.logvar, -30.0, 20.0)
@@ -727,7 +629,7 @@ def sample(self, generator: Optional[paddle.Generator] = None) -> paddle.Tensor:
x = self.mean + self.std * sample
return x
- def kl(self, other: "DiagonalGaussianDistribution" = None) -> paddle.Tensor:
+ def kl(self, other=None):
if self.deterministic:
return paddle.to_tensor([0.0])
else:
@@ -751,141 +653,3 @@ def nll(self, sample, axis=[1, 2, 3]):
def mode(self):
return self.mean
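For reference, `DiagonalGaussianDistribution.kl()` with `other=None` evaluates the closed-form KL divergence against a standard normal, `0.5 * sum(mu^2 + var - 1 - logvar)`, reduced over the non-batch axes. A standalone numeric sketch; the reduction axes follow the upstream implementation and are an assumption here, since the method body is not shown in this hunk:

```python
# Standalone sketch of the closed-form KL divergence against a standard normal,
# matching what DiagonalGaussianDistribution.kl() computes when `other` is None.
# The reduction over axes [1, 2, 3] is assumed from the upstream implementation.
import paddle

mean = paddle.randn([2, 4, 8, 8])
logvar = paddle.clip(paddle.randn([2, 4, 8, 8]), -30.0, 20.0)
var = paddle.exp(logvar)

kl = 0.5 * paddle.sum(mean.pow(2) + var - 1.0 - logvar, axis=[1, 2, 3])
print(kl.shape)  # [2]: one KL value per batch element
```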
-
-
-class EncoderTiny(nn.Layer):
- r"""
- The `EncoderTiny` layer is a simpler version of the `Encoder` layer.
-
- Args:
- in_channels (`int`):
- The number of input channels.
- out_channels (`int`):
- The number of output channels.
- num_blocks (`Tuple[int, ...]`):
- Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to
- use.
- block_out_channels (`Tuple[int, ...]`):
- The number of output channels for each block.
- act_fn (`str`):
- The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
- """
-
- def __init__(
- self,
- in_channels: int,
- out_channels: int,
- num_blocks: Tuple[int, ...],
- block_out_channels: Tuple[int, ...],
- act_fn: str,
- ):
- super().__init__()
-
- layers = []
- for i, num_block in enumerate(num_blocks):
- num_channels = block_out_channels[i]
-
- if i == 0:
- layers.append(nn.Conv2D(in_channels, num_channels, kernel_size=3, padding=1))
- else:
- layers.append(nn.Conv2D(num_channels, num_channels, kernel_size=3, padding=1, stride=2, bias_attr=False))
-
- for _ in range(num_block):
- layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn))
-
- layers.append(nn.Conv2D(block_out_channels[-1], out_channels, kernel_size=3, padding=1))
-
- self.layers = nn.Sequential(*layers)
- self.gradient_checkpointing = False
-
- def forward(self, x: paddle.Tensor) -> paddle.Tensor:
- r"""The forward method of the `EncoderTiny` class."""
- if self.training and self.gradient_checkpointing and not x.stop_gradient:
-
- def create_custom_forward(module):
- def custom_forward(*inputs):
- return module(*inputs)
-
- return custom_forward
-
- x = recompute(create_custom_forward(self.layers), x)
-
- else:
- # scale image from [-1, 1] to [0, 1] to match TAESD convention
- x = self.layers(x.add(1).div(2))
-
- return x
-
-
-class DecoderTiny(nn.Layer):
- r"""
- The `DecoderTiny` layer is a simpler version of the `Decoder` layer.
-
- Args:
- in_channels (`int`):
- The number of input channels.
- out_channels (`int`):
- The number of output channels.
- num_blocks (`Tuple[int, ...]`):
- Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to
- use.
- block_out_channels (`Tuple[int, ...]`):
- The number of output channels for each block.
- upsampling_scaling_factor (`int`):
- The scaling factor to use for upsampling.
- act_fn (`str`):
- The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
- """
-
- def __init__(
- self,
- in_channels: int,
- out_channels: int,
- num_blocks: Tuple[int, ...],
- block_out_channels: Tuple[int, ...],
- upsampling_scaling_factor: int,
- act_fn: str,
- ):
- super().__init__()
-
- layers = [
- nn.Conv2D(in_channels, block_out_channels[0], kernel_size=3, padding=1),
- get_activation(act_fn),
- ]
-
- for i, num_block in enumerate(num_blocks):
- is_final_block = i == (len(num_blocks) - 1)
- num_channels = block_out_channels[i]
-
- for _ in range(num_block):
- layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn))
-
- if not is_final_block:
- layers.append(nn.Upsample(scale_factor=upsampling_scaling_factor))
-
- conv_out_channel = num_channels if not is_final_block else out_channels
- layers.append(nn.Conv2D(num_channels, conv_out_channel, kernel_size=3, padding=1, bias_attr=is_final_block))
-
- self.layers = nn.Sequential(*layers)
- self.gradient_checkpointing = False
-
- def forward(self, x: paddle.Tensor) -> paddle.Tensor:
- r"""The forward method of the `DecoderTiny` class."""
- # Clamp.
- x = paddle.tanh(x / 3) * 3
-
- if self.training and self.gradient_checkpointing:
-
- def create_custom_forward(module):
- def custom_forward(*inputs):
- return module(*inputs)
-
- return custom_forward
-
- x = recompute(create_custom_forward(self.layers), x)
-
- else:
- x = self.layers(x)
-
- # scale image from [0, 1] to [-1, 1] to match diffusers convention
- return x.multiply(paddle.to_tensor(2)).subtract(paddle.to_tensor(1))
diff --git a/ppdiffusers/ppdiffusers/models/vq_model.py b/ppdiffusers/ppdiffusers/models/vq_model.py
index 68de2e271..1d6dd5a60 100644
--- a/ppdiffusers/ppdiffusers/models/vq_model.py
+++ b/ppdiffusers/ppdiffusers/models/vq_model.py
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
-from typing import Optional, Tuple, Union
+from typing import Optional, Tuple
import paddle
import paddle.nn as nn
@@ -53,12 +53,10 @@ class VQModel(ModelMixin, ConfigMixin):
Tuple of upsample block types.
block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
Tuple of block output channels.
- layers_per_block (`int`, *optional*, defaults to `1`): Number of layers per block.
act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space.
sample_size (`int`, *optional*, defaults to `32`): Sample input size.
num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE.
- norm_num_groups (`int`, *optional*, defaults to `32`): Number of groups for normalization layers.
vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE.
scaling_factor (`float`, *optional*, defaults to `0.18215`):
The component-wise standard deviation of the trained latent space computed using the first batch of the
@@ -67,8 +65,6 @@ class VQModel(ModelMixin, ConfigMixin):
diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1
/ scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image
Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
- norm_type (`str`, *optional*, defaults to `"group"`):
- Type of normalization layer to use. Can be one of `"group"` or `"spatial"`.
"""
@register_to_config
@@ -76,9 +72,9 @@ def __init__(
self,
in_channels: int = 3,
out_channels: int = 3,
- down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",),
- up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",),
- block_out_channels: Tuple[int, ...] = (64,),
+ down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
+ up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
+ block_out_channels: Tuple[int] = (64,),
layers_per_block: int = 1,
act_fn: str = "silu",
latent_channels: int = 3,
@@ -132,9 +128,7 @@ def encode(self, x: paddle.Tensor, return_dict: bool = True):
return VQEncoderOutput(latents=h)
@apply_forward_hook
- def decode(
- self, h: paddle.Tensor, force_not_quantize: bool = False, return_dict: bool = True
- ) -> Union[DecoderOutput, paddle.Tensor]:
+ def decode(self, h: paddle.Tensor, force_not_quantize: bool = False, return_dict: bool = True):
# cast h to float16 / float32
h = h.cast(self.dtype)
# also go through quantization layer
@@ -150,7 +144,7 @@ def decode(
return DecoderOutput(sample=dec)
- def forward(self, sample: paddle.Tensor, return_dict: bool = True) -> Union[DecoderOutput, paddle.Tensor]:
+ def forward(self, sample: paddle.Tensor, return_dict: bool = True):
r"""
The [`VQModel`] forward method.
@@ -164,8 +158,8 @@ def forward(self, sample: paddle.Tensor, return_dict: bool = True) -> Union[Deco
If return_dict is True, a [`~models.vq_model.VQEncoderOutput`] is returned, otherwise a plain `tuple`
is returned.
"""
-
- h = self.encode(sample).latents
+ x = sample
+ h = self.encode(x).latents
dec = self.decode(h).sample
if not return_dict:
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
index f4b3dd3a4..4fe3c5823 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
@@ -13,8 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
-
-# import warnings
+import warnings
from typing import Any, Callable, Dict, List, Optional, Union
import paddle
@@ -25,7 +24,6 @@
from ...image_processor import VaeImageProcessor
from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
-from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import deprecate, logging, randn_tensor, replace_example_docstring
from ..pipeline_utils import DiffusionPipeline
@@ -196,34 +194,6 @@ def _encode_prompt(
prompt_embeds: Optional[paddle.Tensor] = None,
negative_prompt_embeds: Optional[paddle.Tensor] = None,
lora_scale: Optional[float] = None,
- ):
- deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
- deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
-
- prompt_embeds_tuple = self.encode_prompt(
- prompt=prompt,
- num_images_per_prompt=num_images_per_prompt,
- do_classifier_free_guidance=do_classifier_free_guidance,
- negative_prompt=negative_prompt,
- prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds,
- lora_scale=lora_scale,
- )
-
- # concatenate for backwards comp
- prompt_embeds = paddle.concat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
-
- return prompt_embeds
-
- def encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor] = None,
- negative_prompt_embeds: Optional[paddle.Tensor] = None,
- lora_scale: Optional[float] = None,
):
"""
Encodes the prompt into text encoder hidden states.
@@ -253,10 +223,6 @@ def encode_prompt(
# function of text encoder can correctly access it
if lora_scale is not None and isinstance(self, LoraLoaderMixin):
self._lora_scale = lora_scale
-
- # dynamically adjust the LoRA scale
- adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
-
if prompt is not None and isinstance(prompt, str):
batch_size = 1
elif prompt is not None and isinstance(prompt, list):
@@ -292,16 +258,7 @@ def encode_prompt(
attention_mask = None
prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask)
prompt_embeds = prompt_embeds[0]
-
- if self.text_encoder is not None:
- prompt_embeds_dtype = self.text_encoder.dtype
- elif self.unet is not None:
- prompt_embeds_dtype = self.unet.dtype
- else:
- prompt_embeds_dtype = prompt_embeds.dtype
-
- prompt_embeds = prompt_embeds.cast(dtype=prompt_embeds_dtype)
-
+ prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype)
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1])
@@ -348,8 +305,8 @@ def encode_prompt(
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
- # prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds])
- return prompt_embeds, negative_prompt_embeds
+ prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds])
+ return prompt_embeds
def run_safety_checker(self, image, dtype):
if self.safety_checker is None:
@@ -366,9 +323,10 @@ def run_safety_checker(self, image, dtype):
return image, has_nsfw_concept
def decode_latents(self, latents):
- deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
- deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
-
+ warnings.warn(
+ "The decode_latents method is deprecated and will be removed in a future version. Please use VaeImageProcessor instead",
+ FutureWarning,
+ )
latents = 1 / self.vae.config.scaling_factor * latents
image = self.vae.decode(latents, return_dict=False)[0]
image = (image / 2 + 0.5).clip(min=0, max=1)
@@ -552,12 +510,10 @@ def __call__(
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
# corresponds to doing no classifier free guidance.
do_classifier_free_guidance = guidance_scale > 1.0
-
- # 3. Encode input prompt
text_encoder_lora_scale = (
cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None
)
- prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+ prompt_embeds = self._encode_prompt(
prompt,
num_images_per_prompt,
do_classifier_free_guidance,
@@ -566,14 +522,9 @@ def __call__(
negative_prompt_embeds=negative_prompt_embeds,
lora_scale=text_encoder_lora_scale,
)
- # For classifier free guidance, we need to do two forward passes.
- # Here we concatenate the unconditional and text embeddings into a single batch
- # to avoid doing two forward passes
- if do_classifier_free_guidance:
- prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
+ self.scheduler.set_timesteps(num_inference_steps)
# 4. Prepare timesteps
- self.scheduler.set_timesteps(num_inference_steps)
timesteps = self.scheduler.timesteps
# 5. Prepare latent variables
@@ -590,9 +541,9 @@ def __call__(
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
# 7. Denoising loop
- num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
# expand the latents if we are doing classifier free guidance
@@ -620,7 +571,7 @@ def __call__(
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
# call the callback, if provided
- if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
@@ -637,6 +588,5 @@ def __call__(
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
if not return_dict:
- return (image, has_nsfw_concept)
-
+ return image, has_nsfw_concept
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
index 39a66de41..7e8439c7b 100644
--- a/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
+++ b/ppdiffusers/ppdiffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
@@ -22,12 +22,9 @@
from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, UNet2DConditionModel
from ...models.attention_processor import (
- AttnProcessor2_5,
- LoRAAttnProcessor2_5,
LoRAXFormersAttnProcessor,
XFormersAttnProcessor,
)
-from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
is_invisible_watermark_available,
@@ -72,7 +69,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
return noise_cfg
-class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin):
+class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoaderMixin):
"""
Pipeline for text-to-image generation using Stable Diffusion XL.
@@ -110,13 +107,6 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoad
scheduler ([`SchedulerMixin`]):
A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
[`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
- force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
- Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
- `stabilityai/stable-diffusion-xl-base-1-0`.
- add_watermarker (`bool`, *optional*):
- Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
- watermark output images. If not defined, it will default to True if the package is installed, otherwise no
- watermarker will be used.
"""
def __init__(
@@ -151,39 +141,6 @@ def __init__(
else:
self.watermark = None
- # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
- def enable_vae_slicing(self):
- r"""
- Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
- compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
- """
- self.vae.enable_slicing()
-
- # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
- def disable_vae_slicing(self):
- r"""
- Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_slicing()
-
- # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
- def enable_vae_tiling(self):
- r"""
- Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
- compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
- processing larger images.
- """
- self.vae.enable_tiling()
-
- # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
- def disable_vae_tiling(self):
- r"""
- Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
- computing decoding in one step.
- """
- self.vae.disable_tiling()
-
def encode_prompt(
self,
prompt: str,
@@ -239,11 +196,6 @@ def encode_prompt(
# function of text encoder can correctly access it
if lora_scale is not None and isinstance(self, LoraLoaderMixin):
self._lora_scale = lora_scale
-
- # dynamically adjust the LoRA scale
- adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
- adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
-
if prompt is not None and isinstance(prompt, str):
batch_size = 1
elif prompt is not None and isinstance(prompt, list):
@@ -273,15 +225,14 @@ def encode_prompt(
)
text_input_ids = text_inputs.input_ids
untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pd").input_ids
-
if (
untruncated_ids.shape[-1] >= text_input_ids.shape[-1]
and not paddle.equal_all(x=text_input_ids, y=untruncated_ids).item()
):
removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1])
logger.warning(
- "The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {tokenizer.model_max_length} tokens: {removed_text}"
+ f"The following part of your input was truncated because CLIP can only handle sequences up to {tokenizer.model_max_length} tokens: {removed_text}"
)
prompt_embeds = text_encoder(text_input_ids, output_hidden_states=True)
@@ -302,16 +253,13 @@ def encode_prompt(
uncond_tokens: List[str]
if prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
- f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}."
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} != {type(prompt)}."
)
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt, negative_prompt_2]
elif batch_size != len(negative_prompt):
raise ValueError(
- f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
- f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`."
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`: {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches the batch size of `prompt`."
)
else:
uncond_tokens = [negative_prompt, negative_prompt_2]
@@ -331,8 +279,8 @@ def encode_prompt(
negative_prompt_embeds_list.append(negative_prompt_embeds)
negative_prompt_embeds = paddle.concat(x=negative_prompt_embeds_list, axis=-1)
prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder_2.dtype)
- bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
+ bs_embed, seq_len, _ = prompt_embeds.shape
prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1])
prompt_embeds = prompt_embeds.reshape([bs_embed * num_images_per_prompt, seq_len, -1])
if do_classifier_free_guidance:
@@ -390,18 +338,15 @@ def check_inputs(
and (not isinstance(callback_steps, int) or callback_steps <= 0)
):
raise ValueError(
- f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
- f" {type(callback_steps)}."
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type {type(callback_steps)}."
)
if prompt is not None and prompt_embeds is not None:
raise ValueError(
- f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two."
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two."
)
elif prompt_2 is not None and prompt_embeds is not None:
raise ValueError(
- f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
- " only forward one of the two."
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to only forward one of the two."
)
elif prompt is None and prompt_embeds is None:
raise ValueError(
@@ -413,13 +358,11 @@ def check_inputs(
raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
if negative_prompt is not None and negative_prompt_embeds is not None:
raise ValueError(
- f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
- f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two."
)
elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
raise ValueError(
- f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
- f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+ f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to only forward one of the two."
)
if prompt_embeds is not None and negative_prompt_embeds is not None:
if prompt_embeds.shape != negative_prompt_embeds.shape:
@@ -465,16 +408,14 @@ def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, d
def upcast_vae(self):
dtype = self.vae.dtype
self.vae.to(dtype="float32")
- use_2_5_or_xformers = isinstance(
+ use_xformers = isinstance(
self.vae.decoder.mid_block.attentions[0].processor,
(
- AttnProcessor2_5,
XFormersAttnProcessor,
LoRAXFormersAttnProcessor,
- LoRAAttnProcessor2_5,
),
)
- if use_2_5_or_xformers:
+ if use_xformers:
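+            # xformers attention does not require float32, so these blocks can stay in the original dtype to save memory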
self.vae.post_quant_conv.to(dtype=dtype)
self.vae.decoder.conv_in.to(dtype=dtype)
self.vae.decoder.mid_block.to(dtype=dtype)
@@ -509,9 +450,6 @@ def __call__(
original_size: Optional[Tuple[int, int]] = None,
crops_coords_top_left: Tuple[int, int] = (0, 0),
target_size: Optional[Tuple[int, int]] = None,
- negative_original_size: Optional[Tuple[int, int]] = None,
- negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
- negative_target_size: Optional[Tuple[int, int]] = None,
):
"""
Function invoked when calling the pipeline for generation.
@@ -537,7 +475,7 @@ def __call__(
scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
"Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
- guidance_scale (`float`, *optional*, defaults to 5.0):
+ guidance_scale (`float`, *optional*, defaults to 7.5):
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
@@ -610,21 +548,6 @@ def __call__(
For most cases, `target_size` should be set to the desired height and width of the generated image. If
not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
- negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
- To negatively condition the generation process based on a specific image resolution. Part of SDXL's
- micro-conditioning as explained in section 2.2 of
- [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
- information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
- negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
- To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
- micro-conditioning as explained in section 2.2 of
- [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
- information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
- negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
- To negatively condition the generation process based on a target image resolution. It should be as same
- as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
- [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
- information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
Examples:
@@ -710,20 +633,10 @@ def __call__(
add_time_ids = self._get_add_time_ids(
original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype
)
- if negative_original_size is not None and negative_target_size is not None:
- negative_add_time_ids = self._get_add_time_ids(
- negative_original_size,
- negative_crops_coords_top_left,
- negative_target_size,
- dtype=prompt_embeds.dtype,
- )
- else:
- negative_add_time_ids = add_time_ids
-
if do_classifier_free_guidance:
prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds], axis=0)
add_text_embeds = paddle.concat(x=[negative_pooled_prompt_embeds, add_text_embeds], axis=0)
- add_time_ids = paddle.concat(x=[negative_add_time_ids, add_time_ids], axis=0)
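+            # negative micro-conditioning was removed, so the same time ids are reused for the unconditional branch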
+ add_time_ids = paddle.concat(x=[add_time_ids, add_time_ids], axis=0)
add_time_ids = add_time_ids.tile(repeat_times=[batch_size * num_images_per_prompt, 1])
@@ -768,7 +681,7 @@ def __call__(
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
# call the callback, if provided
- if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
progress_bar.update()
if callback is not None and i % callback_steps == 0:
callback(i, t, latents)
diff --git a/ppdiffusers/ppdiffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py b/ppdiffusers/ppdiffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
index 573e9c2bf..d6aa74e1e 100644
--- a/ppdiffusers/ppdiffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
+++ b/ppdiffusers/ppdiffusers/pipelines/t2i_adapter/pipeline_stable_diffusion_adapter.py
@@ -25,12 +25,10 @@
from ...image_processor import VaeImageProcessor
from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL, MultiAdapter, T2IAdapter, UNet2DConditionModel
-from ...models.lora import adjust_lora_scale_text_encoder
from ...schedulers import KarrasDiffusionSchedulers
from ...utils import (
PIL_INTERPOLATION,
BaseOutput,
- deprecate,
logging,
randn_tensor,
replace_example_docstring,
@@ -91,12 +89,9 @@ def _preprocess_adapter_image(image, height, width):
return image
elif isinstance(image, PIL.Image.Image):
image = [image]
-
if isinstance(image[0], PIL.Image.Image):
image = [np.array(i.resize((width, height), resample=PIL_INTERPOLATION["lanczos"])) for i in image]
- image = [
- i[None, ..., None] if i.ndim == 2 else i[None, ...] for i in image
- ] # expand [h, w] or [h, w, c] to [b, h, w, c]
+        image = [(i[None, ..., None] if i.ndim == 2 else i[None, ...]) for i in image]  # expand [h, w] or [h, w, c] to [b, h, w, c]
image = np.concatenate(image, axis=0)
image = np.array(image).astype(np.float32) / 255.0
image = image.transpose(0, 3, 1, 2)
@@ -166,17 +161,11 @@ def __init__(
super().__init__()
if safety_checker is None and requires_safety_checker:
logger.warning(
- f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
- " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
- " results in services or applications open to the public. Both the diffusers team and Hugging Face"
- " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
- " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
- " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
)
if safety_checker is not None and feature_extractor is None:
raise ValueError(
- "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
- " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+ "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
)
if isinstance(adapter, (list, tuple)):
adapter = MultiAdapter(adapter, adapter_weights=adapter_weights)
@@ -220,44 +209,12 @@ def _encode_prompt(
prompt_embeds: Optional[paddle.Tensor] = None,
negative_prompt_embeds: Optional[paddle.Tensor] = None,
lora_scale: Optional[float] = None,
- **kwargs,
- ):
- deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
- deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
-
- prompt_embeds_tuple = self.encode_prompt(
- prompt=prompt,
- num_images_per_prompt=num_images_per_prompt,
- do_classifier_free_guidance=do_classifier_free_guidance,
- negative_prompt=negative_prompt,
- prompt_embeds=prompt_embeds,
- negative_prompt_embeds=negative_prompt_embeds,
- lora_scale=lora_scale,
- **kwargs,
- )
-
- # concatenate for backwards comp
- prompt_embeds = paddle.concat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
-
- return prompt_embeds
-
- # Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
- def encode_prompt(
- self,
- prompt,
- num_images_per_prompt,
- do_classifier_free_guidance,
- negative_prompt=None,
- prompt_embeds: Optional[paddle.Tensor] = None,
- negative_prompt_embeds: Optional[paddle.Tensor] = None,
- lora_scale: Optional[float] = None,
- clip_skip: Optional[int] = None,
):
"""
Encodes the prompt into text encoder hidden states.
Args:
- prompt (`str` or `List[str]`, *optional*):
+ prompt (`str` or `List[str]`, *optional*):
prompt to be encoded
num_images_per_prompt (`int`):
number of images that should be generated per prompt
@@ -275,19 +232,12 @@ def encode_prompt(
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
lora_scale (`float`, *optional*):
- A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
- clip_skip (`int`, *optional*):
- Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
- the output of the pre-final layer will be used for computing the prompt embeddings.
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
"""
# set lora scale so that monkey patched LoRA
# function of text encoder can correctly access it
if lora_scale is not None and isinstance(self, LoraLoaderMixin):
self._lora_scale = lora_scale
-
- # dynamically adjust the LoRA scale
- adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
-
if prompt is not None and isinstance(prompt, str):
batch_size = 1
elif prompt is not None and isinstance(prompt, list):
@@ -315,39 +265,15 @@ def encode_prompt(
untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
)
logger.warning(
- "The following part of your input was truncated because CLIP can only handle sequences up to"
- f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+ f"The following part of your input was truncated because CLIP can only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}"
)
if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
attention_mask = text_inputs.attention_mask
else:
attention_mask = None
-
- if clip_skip is None:
- prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask)
- prompt_embeds = prompt_embeds[0]
- else:
- prompt_embeds = self.text_encoder(
- text_input_ids, attention_mask=attention_mask, output_hidden_states=True
- )
- # Access the `hidden_states` first, that contains a tuple of
- # all the hidden states from the encoder layers. Then index into
- # the tuple to access the hidden states from the desired layer.
- prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
- # We also need to apply the final LayerNorm here to not mess with the
- # representations. The `last_hidden_states` that we typically use for
- # obtaining the final prompt representations passes through the LayerNorm
- # layer.
- prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
-
- if self.text_encoder is not None:
- prompt_embeds_dtype = self.text_encoder.dtype
- elif self.unet is not None:
- prompt_embeds_dtype = self.unet.dtype
- else:
- prompt_embeds_dtype = prompt_embeds.dtype
-
- prompt_embeds = prompt_embeds.cast(dtype=prompt_embeds_dtype)
+ prompt_embeds = self.text_encoder(text_input_ids, attention_mask=attention_mask)
+ prompt_embeds = prompt_embeds[0]
+ prompt_embeds = prompt_embeds.cast(dtype=self.text_encoder.dtype)
bs_embed, seq_len, _ = prompt_embeds.shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds = prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1])
@@ -360,16 +286,13 @@ def encode_prompt(
uncond_tokens = [""] * batch_size
elif prompt is not None and type(prompt) is not type(negative_prompt):
raise TypeError(
- f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
- f" {type(prompt)}."
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} != {type(prompt)}."
)
elif isinstance(negative_prompt, str):
uncond_tokens = [negative_prompt]
elif batch_size != len(negative_prompt):
raise ValueError(
- f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
- f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
- " the batch size of `prompt`."
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`: {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches the batch size of `prompt`."
)
else:
uncond_tokens = negative_prompt
@@ -377,7 +300,6 @@ def encode_prompt(
# textual inversion: procecss multi-vector tokens if necessary
if isinstance(self, TextualInversionLoaderMixin):
uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
-
max_length = prompt_embeds.shape[1]
uncond_input = self.tokenizer(
uncond_tokens, padding="max_length", max_length=max_length, truncation=True, return_tensors="pd"
@@ -395,7 +317,11 @@ def encode_prompt(
negative_prompt_embeds = negative_prompt_embeds.tile(repeat_times=[1, num_images_per_prompt, 1])
negative_prompt_embeds = negative_prompt_embeds.reshape([batch_size * num_images_per_prompt, seq_len, -1])
- return prompt_embeds, negative_prompt_embeds
+ # For classifier free guidance, we need to do two forward passes.
+ # Here we concatenate the unconditional and text embeddings into a single batch
+ # to avoid doing two forward passes
+ prompt_embeds = paddle.concat(x=[negative_prompt_embeds, prompt_embeds])
+ return prompt_embeds
# Copied from ppdiffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
def run_safety_checker(self, image, dtype):
@@ -488,8 +414,7 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype
shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
- f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
- f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}. Make sure the batch size matches the length of the generators."
)
if latents is None:
latents = randn_tensor(shape, generator=generator, dtype=dtype)
@@ -522,34 +447,6 @@ def _default_height_width(self, height, width, image):
width = width // self.adapter.total_downscale_factor * self.adapter.total_downscale_factor
return height, width
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
- def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
- r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
- The suffixes after the scaling factors represent the stages where they are being applied.
-
- Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
- that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
- Args:
- s1 (`float`):
- Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- s2 (`float`):
- Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
- mitigate "oversmoothing effect" in the enhanced denoising process.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if not hasattr(self, "unet"):
- raise ValueError("The pipeline must have `unet` for using FreeU.")
- self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
- # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
- def disable_freeu(self):
- """Disables the FreeU mechanism if enabled."""
- self.unet.disable_freeu()
-
@paddle.no_grad()
@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
@@ -573,7 +470,6 @@ def __call__(
callback_steps: int = 1,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
adapter_conditioning_scale: Union[float, List[float]] = 1.0,
- clip_skip: Optional[int] = None,
):
"""
Function invoked when calling the pipeline for generation.
@@ -642,9 +538,7 @@ def __call__(
The outputs of the adapter are multiplied by `adapter_conditioning_scale` before they are added to the
residual in the original unet. If multiple adapters are specified in init, you can set the
corresponding scale as a list.
- clip_skip (`int`, *optional*):
- Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
- the output of the pre-final layer will be used for computing the prompt embeddings.
+
Examples:
Returns:
@@ -661,26 +555,14 @@ def __call__(
self.check_inputs(
prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
)
-
- # is_multi_adapter = isinstance(self.adapter, MultiAdapter)
- # if is_multi_adapter:
- # adapter_input = [_preprocess_adapter_image(img, height, width) for img in image]
- # n, c, h, w = adapter_input[0].shape
- # adapter_input = paddle.stack(x=[x.reshape([n * c, h, w]) for x in adapter_input])
- # else:
- # adapter_input = _preprocess_adapter_image(image, height, width)
- # adapter_input = adapter_input.cast(self.adapter.dtype)
-
- if isinstance(self.adapter, MultiAdapter):
- adapter_input = []
-
- for one_image in image:
- one_image = _preprocess_adapter_image(one_image, height, width)
- one_image = one_image.cast(dtype=self.adapter.dtype)
- adapter_input.append(one_image)
+ is_multi_adapter = isinstance(self.adapter, MultiAdapter)
+ if is_multi_adapter:
+ adapter_input = [_preprocess_adapter_image(img, height, width) for img in image]
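+            # flatten each conditioning image to (n * c, h, w) and stack them into a single tensor for MultiAdapter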
+ n, c, h, w = adapter_input[0].shape
+ adapter_input = paddle.stack(x=[x.reshape([n * c, h, w]) for x in adapter_input])
else:
adapter_input = _preprocess_adapter_image(image, height, width)
- adapter_input = adapter_input.cast(dtype=self.adapter.dtype)
+ adapter_input = adapter_input.cast(self.adapter.dtype)
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
@@ -696,20 +578,14 @@ def __call__(
do_classifier_free_guidance = guidance_scale > 1.0
# 3. Encode input prompt
- prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+ prompt_embeds = self._encode_prompt(
prompt,
num_images_per_prompt,
do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
- clip_skip=clip_skip,
)
- # For classifier free guidance, we need to do two forward passes.
- # Here we concatenate the unconditional and text embeddings into a single batch
- # to avoid doing two forward passes
- if do_classifier_free_guidance:
- prompt_embeds = paddle.concat([negative_prompt_embeds, prompt_embeds])
# 4. Prepare timesteps
self.scheduler.set_timesteps(num_inference_steps)
@@ -731,21 +607,15 @@ def __call__(
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 7. Denoising loop
- if isinstance(self.adapter, MultiAdapter):
- adapter_state = self.adapter(adapter_input, adapter_conditioning_scale)
- for k, v in enumerate(adapter_state):
- adapter_state[k] = v
- else:
- adapter_state = self.adapter(adapter_input)
- for k, v in enumerate(adapter_state):
- adapter_state[k] = v * adapter_conditioning_scale
+ adapter_state = self.adapter(adapter_input)
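+        # scale every adapter feature map by adapter_conditioning_scale before it is injected into the UNet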
+ for k, v in enumerate(adapter_state):
+ adapter_state[k] = v * adapter_conditioning_scale
if num_images_per_prompt > 1:
for k, v in enumerate(adapter_state):
adapter_state[k] = v.tile(repeat_times=[num_images_per_prompt, 1, 1, 1])
if do_classifier_free_guidance:
for k, v in enumerate(adapter_state):
adapter_state[k] = paddle.concat(x=[v] * 2, axis=0)
-
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
@@ -759,9 +629,8 @@ def __call__(
t,
encoder_hidden_states=prompt_embeds,
cross_attention_kwargs=cross_attention_kwargs,
- down_intrablock_additional_residuals=[state.clone() for state in adapter_state],
- return_dict=False,
- )[0]
+ down_block_additional_residuals=[state.clone() for state in adapter_state],
+ ).sample
# perform guidance
if do_classifier_free_guidance:
@@ -772,12 +641,10 @@ def __call__(
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
# call the callback, if provided
- if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+ if i == len(timesteps) - 1 or i + 1 > num_warmup_steps and (i + 1) % self.scheduler.order == 0:
progress_bar.update()
if callback is not None and i % callback_steps == 0:
- step_idx = i // getattr(self.scheduler, "order", 1)
- callback(step_idx, t, latents)
-
+ callback(i, t, latents)
if output_type == "latent":
image = latents
has_nsfw_concept = None
@@ -798,6 +665,5 @@ def __call__(
image, has_nsfw_concept = self.run_safety_checker(image, prompt_embeds.dtype)
if not return_dict:
- return (image, has_nsfw_concept)
-
+ return image, has_nsfw_concept
return StableDiffusionAdapterPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
diff --git a/ppdiffusers/ppdiffusers/utils/__init__.py b/ppdiffusers/ppdiffusers/utils/__init__.py
index 3bf46fdea..ec3dda5e2 100644
--- a/ppdiffusers/ppdiffusers/utils/__init__.py
+++ b/ppdiffusers/ppdiffusers/utils/__init__.py
@@ -45,7 +45,6 @@
TO_DIFFUSERS,
TORCH_SAFETENSORS_WEIGHTS_NAME,
TORCH_WEIGHTS_NAME,
- USE_PEFT_BACKEND,
WEIGHTS_NAME,
get_map_location_default,
str2bool,
diff --git a/ppdiffusers/ppdiffusers/utils/constants.py b/ppdiffusers/ppdiffusers/utils/constants.py
index 0d370ea66..5ce92bfc8 100644
--- a/ppdiffusers/ppdiffusers/utils/constants.py
+++ b/ppdiffusers/ppdiffusers/utils/constants.py
@@ -29,11 +29,10 @@ def str2bool(v):
else:
raise ValueError("Not supported value: {}".format(v))
-
# make sure we have abs path
-ppnlp_cache_home = os.path.abspath(
- os.path.expanduser(os.getenv("PPNLP_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "paddlenlp")))
-)
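+# PPNLP_HOME takes precedence; otherwise the cache resolves to $XDG_CACHE_HOME/paddlenlp (default ~/.cache/paddlenlp)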
+ppnlp_cache_home = os.path.abspath(os.path.expanduser(
+ os.getenv("PPNLP_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "paddlenlp"))
+))
ppdiffusers_default_cache_path = os.path.abspath(os.path.join(ppnlp_cache_home, "ppdiffusers"))
diffusers_default_cache_path = os.path.abspath(HUGGINGFACE_HUB_CACHE)
@@ -51,9 +50,7 @@ def str2bool(v):
PPDIFFUSERS_DYNAMIC_MODULE_NAME = "ppdiffusers_modules"
# make sure we have abs path
HF_MODULES_CACHE = os.path.abspath(os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules")))
-PPDIFFUSERS_MODULES_CACHE = os.path.abspath(
- os.getenv("PPDIFFUSERS_MODULES_CACHE", os.path.join(ppnlp_cache_home, "modules"))
-)
+PPDIFFUSERS_MODULES_CACHE = os.path.abspath(os.getenv("PPDIFFUSERS_MODULES_CACHE", os.path.join(ppnlp_cache_home, "modules")))
PADDLE_WEIGHTS_NAME = "model_state.pdparams"
FASTDEPLOY_WEIGHTS_NAME = "inference.pdiparams"
@@ -76,35 +73,22 @@ def str2bool(v):
# FOR tests
if bool(os.getenv("PATCH_ALLCLOSE", False)):
- from pprint import pprint
-
- import numpy
import paddle
-
+ import numpy
+ from pprint import pprint
numpy.set_printoptions(precision=4)
paddle_raw_all_close = paddle.allclose
-
+
def allclose_pd(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None):
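+        # debug shim: print both tensors before delegating to the original paddle.allclose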
pprint(x.numpy())
pprint(y.numpy())
return paddle_raw_all_close(x, y, rtol=rtol, atol=atol, equal_nan=equal_nan, name=name)
-
paddle.allclose = allclose_pd
numpy_raw_all_close = numpy.allclose
-
- def allclose_np(
- a,
- b,
- rtol=1e-05,
- atol=1e-08,
- equal_nan=False,
- ):
+    def allclose_np(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
pprint(a)
pprint(b)
return numpy_raw_all_close(a, b, rtol=rtol, atol=atol, equal_nan=equal_nan)
-
- numpy.allclose = allclose_np
-
-USE_PEFT_BACKEND = False
+ numpy.allclose = allclose_np
\ No newline at end of file
diff --git a/ppdiffusers/ppdiffusers/utils/paddle_utils.py b/ppdiffusers/ppdiffusers/utils/paddle_utils.py
index b626db689..21856f3e6 100644
--- a/ppdiffusers/ppdiffusers/utils/paddle_utils.py
+++ b/ppdiffusers/ppdiffusers/utils/paddle_utils.py
@@ -225,63 +225,3 @@ def no_init_weights(_enable=True):
yield
finally:
_init_weights = old_init_weights
-
-
-def fourier_filter(x_in, threshold, scale):
- from paddle.fft import fftn, fftshift, ifftn, ifftshift
-
- """Fourier filter as introduced in FreeU (https://arxiv.org/abs/2309.11497).
-
- This version of the method comes from here:
- https://github.com/huggingface/diffusers/pull/5164#issuecomment-1732638706
- """
- x = x_in
- B, C, H, W = x.shape
-
- # Non-power of 2 images must be float32
- if (W & (W - 1)) != 0 or (H & (H - 1)) != 0:
- x = x.to(dtype=paddle.float32)
-
- # FFT
- x_freq = fftn(x, axes=(-2, -1))
- x_freq = fftshift(x_freq, axes=(-2, -1))
-
- B, C, H, W = x_freq.shape
- mask = paddle.ones((B, C, H, W))
-
- crow, ccol = H // 2, W // 2
- mask[..., crow - threshold : crow + threshold, ccol - threshold : ccol + threshold] = scale
- x_freq = x_freq * mask
-
- # IFFT
- x_freq = ifftshift(x_freq, axes=(-2, -1))
- x_filtered = ifftn(x_freq, axes=(-2, -1)).real
-
- return x_filtered.to(dtype=x_in.dtype)
-
-
-def apply_freeu(
- resolution_idx: int, hidden_states: paddle.Tensor, res_hidden_states: paddle.Tensor, **freeu_kwargs
-) -> Tuple[paddle.Tensor, paddle.Tensor]:
- """Applies the FreeU mechanism as introduced in https:
- //arxiv.org/abs/2309.11497. Adapted from the official code repository: https://github.com/ChenyangSi/FreeU.
-
- Args:
- resolution_idx (`int`): Integer denoting the UNet block where FreeU is being applied.
- hidden_states (`paddle.Tensor`): Inputs to the underlying block.
- res_hidden_states (`paddle.Tensor`): Features from the skip block corresponding to the underlying block.
- s1 (`float`): Scaling factor for stage 1 to attenuate the contributions of the skip features.
- s2 (`float`): Scaling factor for stage 2 to attenuate the contributions of the skip features.
- b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
- b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
- """
- if resolution_idx == 0:
- num_half_channels = hidden_states.shape[1] // 2
- hidden_states[:, :num_half_channels] = hidden_states[:, :num_half_channels] * freeu_kwargs["b1"]
- res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=freeu_kwargs["s1"])
- if resolution_idx == 1:
- num_half_channels = hidden_states.shape[1] // 2
- hidden_states[:, :num_half_channels] = hidden_states[:, :num_half_channels] * freeu_kwargs["b2"]
- res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=freeu_kwargs["s2"])
-
- return hidden_states, res_hidden_states
diff --git a/ppdiffusers/tests/lora/test_lora_layers_old_backend.py b/ppdiffusers/tests/lora/test_lora_layers_old_backend.py
deleted file mode 100644
index 9ee791202..000000000
--- a/ppdiffusers/tests/lora/test_lora_layers_old_backend.py
+++ /dev/null
@@ -1,1862 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import copy
-import os
-import tempfile
-import time
-import unittest
-
-import numpy as np
-import paddle
-from huggingface_hub.repocard import RepoCard
-from paddlenlp.transformers import (
- CLIPTextConfig,
- CLIPTextModel,
- CLIPTextModelWithProjection,
- CLIPTokenizer,
-)
-
-from ppdiffusers import (
- AutoencoderKL,
- DDIMScheduler,
- DiffusionPipeline,
- EulerDiscreteScheduler,
- StableDiffusionPipeline,
- StableDiffusionXLPipeline,
- UNet2DConditionModel,
- UNet3DConditionModel,
-)
-from ppdiffusers.loaders import (
- PADDLE_LORA_WEIGHT_NAME,
- TORCH_LORA_WEIGHT_NAME,
- AttnProcsLayers,
- LoraLoaderMixin,
- PatchedLoraProjection,
- text_encoder_attn_modules,
-)
-from ppdiffusers.models.attention_processor import (
- Attention,
- AttnProcessor,
- AttnProcessor2_5,
- LoRAAttnProcessor,
- LoRAAttnProcessor2_5,
- XFormersAttnProcessor,
-)
-from ppdiffusers.utils import floats_tensor, is_ppxformers_available, str2bool
-from ppdiffusers.utils.testing_utils import require_paddle_gpu, slow
-
-lora_weights_name = TORCH_LORA_WEIGHT_NAME if str2bool(os.getenv("TO_DIFFUSERS", False)) else PADDLE_LORA_WEIGHT_NAME
-
-
-def create_lora_layers(model, mock_weights: bool = True):
- lora_attn_procs = {}
- for name in model.attn_processors.keys():
- cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
- if name.startswith("mid_block"):
- hidden_size = model.config.block_out_channels[-1]
- elif name.startswith("up_blocks"):
- block_id = int(name[len("up_blocks.")])
- hidden_size = list(reversed(model.config.block_out_channels))[block_id]
- elif name.startswith("down_blocks"):
- block_id = int(name[len("down_blocks.")])
- hidden_size = model.config.block_out_channels[block_id]
- lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
- if mock_weights:
- with paddle.no_grad():
- lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1)
- lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1)
- lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1)
- lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1)
- return lora_attn_procs
-
-
-def create_unet_lora_layers(unet: paddle.nn.Layer):
- lora_attn_procs = {}
- for name in unet.attn_processors.keys():
- cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
- if name.startswith("mid_block"):
- hidden_size = unet.config.block_out_channels[-1]
- elif name.startswith("up_blocks"):
- block_id = int(name[len("up_blocks.")])
- hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
- elif name.startswith("down_blocks"):
- block_id = int(name[len("down_blocks.")])
- hidden_size = unet.config.block_out_channels[block_id]
- lora_attn_processor_class = (
- LoRAAttnProcessor2_5
- if hasattr(paddle.nn.functional, "scaled_dot_product_attention_")
- else LoRAAttnProcessor
- )
- lora_attn_procs[name] = lora_attn_processor_class(
- hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
- )
- unet_lora_layers = AttnProcsLayers(lora_attn_procs)
- return lora_attn_procs, unet_lora_layers
-
-
-def create_text_encoder_lora_attn_procs(text_encoder: paddle.nn.Layer):
- text_lora_attn_procs = {}
- lora_attn_processor_class = (
- LoRAAttnProcessor2_5 if hasattr(paddle.nn.functional, "scaled_dot_product_attention_") else LoRAAttnProcessor
- )
- for name, module in text_encoder_attn_modules(text_encoder):
- if isinstance(module.out_proj, paddle.nn.Linear):
- out_features = module.out_proj.weight.shape[1]
- elif isinstance(module.out_proj, PatchedLoraProjection):
- out_features = module.out_proj.regular_linear_layer.weight.shape[1]
- else:
- assert False, module.out_proj.__class__
- text_lora_attn_procs[name] = lora_attn_processor_class(hidden_size=out_features, cross_attention_dim=None)
- return text_lora_attn_procs
-
-
-def create_text_encoder_lora_layers(text_encoder: paddle.nn.Layer):
- text_lora_attn_procs = create_text_encoder_lora_attn_procs(text_encoder)
- text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs)
- return text_encoder_lora_layers
-
-
-def create_lora_3d_layers(model, mock_weights: bool = True):
- lora_attn_procs = {}
- for name in model.attn_processors.keys():
- has_cross_attention = name.endswith("attn2.processor") and not (
- name.startswith("transformer_in") or "temp_attentions" in name.split(".")
- )
- cross_attention_dim = model.config.cross_attention_dim if has_cross_attention else None
- if name.startswith("mid_block"):
- hidden_size = model.config.block_out_channels[-1]
- elif name.startswith("up_blocks"):
- block_id = int(name[len("up_blocks.")])
- hidden_size = list(reversed(model.config.block_out_channels))[block_id]
- elif name.startswith("down_blocks"):
- block_id = int(name[len("down_blocks.")])
- hidden_size = model.config.block_out_channels[block_id]
- elif name.startswith("transformer_in"):
- # Note that the `8 * ...` comes from: https://github.com/huggingface/diffusers/blob/7139f0e874f10b2463caa8cbd585762a309d12d6/src/diffusers/models/unet_3d_condition.py#L148
- hidden_size = 8 * model.config.attention_head_dim
-
- lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
-
- if mock_weights:
- # add 1 to weights to mock trained weights
- with paddle.no_grad():
- lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1)
- lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1)
- lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1)
- lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1)
- return lora_attn_procs
-
-
-def set_lora_weights(lora_attn_parameters, randn_weight=False, var=1.0):
- with paddle.no_grad():
- for parameter in lora_attn_parameters:
- if randn_weight:
- parameter[:] = paddle.randn(shape=parameter.shape, dtype=parameter.dtype) * var
- else:
- parameter.zero_()
-
-
-def state_dicts_almost_equal(sd1, sd2):
- sd1 = dict(sorted(sd1.items()))
- sd2 = dict(sorted(sd2.items()))
-
- models_are_equal = True
- for ten1, ten2 in zip(sd1.values(), sd2.values()):
- if (ten1 - ten2).abs().max() > 1e-3:
- models_are_equal = False
-
- return models_are_equal
-
-
-class LoraLoaderMixinTests(unittest.TestCase):
- def get_dummy_components(self):
- paddle.seed(0)
- unet = UNet2DConditionModel(
- block_out_channels=(32, 64),
- layers_per_block=2,
- sample_size=32,
- in_channels=4,
- out_channels=4,
- down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
- up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- cross_attention_dim=32,
- )
- scheduler = DDIMScheduler(
- beta_start=0.00085,
- beta_end=0.012,
- beta_schedule="scaled_linear",
- clip_sample=False,
- set_alpha_to_one=False,
- steps_offset=1,
- )
- paddle.seed(0)
- vae = AutoencoderKL(
- block_out_channels=[32, 64],
- in_channels=3,
- out_channels=3,
- down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
- up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4,
- )
- text_encoder_config = CLIPTextConfig(
- bos_token_id=0,
- eos_token_id=2,
- hidden_size=32,
- intermediate_size=37,
- layer_norm_eps=1e-05,
- num_attention_heads=4,
- num_hidden_layers=5,
- pad_token_id=1,
- vocab_size=1000,
- )
- text_encoder = CLIPTextModel(text_encoder_config)
- tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
- unet_lora_attn_procs, unet_lora_layers = create_unet_lora_layers(unet)
- text_encoder_lora_layers = create_text_encoder_lora_layers(text_encoder)
-
- pipeline_components = {
- "unet": unet,
- "scheduler": scheduler,
- "vae": vae,
- "text_encoder": text_encoder,
- "tokenizer": tokenizer,
- "safety_checker": None,
- "feature_extractor": None,
- }
- lora_components = {
- "unet_lora_layers": unet_lora_layers,
- "text_encoder_lora_layers": text_encoder_lora_layers,
- "unet_lora_attn_procs": unet_lora_attn_procs,
- }
- return pipeline_components, lora_components
-
- def get_dummy_inputs(self, with_generator=True):
- batch_size = 1
- sequence_length = 10
- num_channels = 4
- sizes = 32, 32
-
- generator = paddle.Generator().manual_seed(0)
- noise = floats_tensor((batch_size, num_channels) + sizes)
- input_ids = paddle.randint(low=1, high=sequence_length, shape=(batch_size, sequence_length))
-
- pipeline_inputs = {
- "prompt": "A painting of a squirrel eating a burger",
- "num_inference_steps": 2,
- "guidance_scale": 6.0,
- "output_type": "np",
- }
- if with_generator:
- pipeline_inputs.update({"generator": generator})
-
- return noise, input_ids, pipeline_inputs
-
- def get_dummy_tokens(self):
- max_seq_length = 77
- inputs = paddle.randint(low=2, high=56, shape=(1, max_seq_length))
- prepared_inputs = {}
- prepared_inputs["input_ids"] = inputs
- return prepared_inputs
-
- def create_lora_weight_file(self, tmpdirname):
- _, lora_components = self.get_dummy_components()
- LoraLoaderMixin.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
-
- def test_lora_save_load(self):
- pipeline_components, lora_components = self.get_dummy_components()
- sd_pipe = StableDiffusionPipeline(**pipeline_components)
- sd_pipe.set_progress_bar_config(disable=None)
- _, _, pipeline_inputs = self.get_dummy_inputs()
- original_images = sd_pipe(**pipeline_inputs).images
- orig_image_slice = original_images[0, -3:, -3:, -1]
- with tempfile.TemporaryDirectory() as tmpdirname:
- LoraLoaderMixin.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(tmpdirname)
- lora_images = sd_pipe(**pipeline_inputs).images
- lora_image_slice = lora_images[0, -3:, -3:, -1]
-
- # Outputs shouldn't match.
- self.assertFalse(
- paddle.allclose(
- x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
- ).item()
- )
-
- # def test_lora_save_load_safetensors(self):
- # pipeline_components, lora_components = self.get_dummy_components()
- # sd_pipe = StableDiffusionPipeline(**pipeline_components)
- # sd_pipe.set_progress_bar_config(disable=None)
- # _, _, pipeline_inputs = self.get_dummy_inputs()
- # original_images = sd_pipe(**pipeline_inputs).images
- # orig_image_slice = original_images[0, -3:, -3:, -1]
- # with tempfile.TemporaryDirectory() as tmpdirname:
- # LoraLoaderMixin.save_lora_weights(
- # save_directory=tmpdirname,
- # unet_lora_layers=lora_components["unet_lora_layers"],
- # text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
- # safe_serialization=True,
- # )
- # self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
- # sd_pipe.load_lora_weights(tmpdirname)
- # lora_images = sd_pipe(**pipeline_inputs).images
- # lora_image_slice = lora_images[0, -3:, -3:, -1]
- # # Outputs shouldn't match.
- # self.assertFalse(
- # paddle.allclose(
- # x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
- # ).item()
- # )
-
- def test_lora_save_load_legacy(self):
- pipeline_components, lora_components = self.get_dummy_components()
- unet_lora_attn_procs = lora_components["unet_lora_attn_procs"]
- sd_pipe = StableDiffusionPipeline(**pipeline_components)
- sd_pipe.set_progress_bar_config(disable=None)
- _, _, pipeline_inputs = self.get_dummy_inputs()
- original_images = sd_pipe(**pipeline_inputs).images
- orig_image_slice = original_images[0, -3:, -3:, -1]
- with tempfile.TemporaryDirectory() as tmpdirname:
- unet = sd_pipe.unet
- unet.set_attn_processor(unet_lora_attn_procs)
- unet.save_attn_procs(tmpdirname)
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(tmpdirname)
- lora_images = sd_pipe(**pipeline_inputs).images
- lora_image_slice = lora_images[0, -3:, -3:, -1]
-
- # Outputs shouldn't match.
- self.assertFalse(
- paddle.allclose(
- x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
- ).item()
- )
-
- def test_text_encoder_lora_monkey_patch(self):
- pipeline_components, _ = self.get_dummy_components()
- pipe = StableDiffusionPipeline(**pipeline_components)
- dummy_tokens = self.get_dummy_tokens()
- # inference without lora
- outputs_without_lora = pipe.text_encoder(**dummy_tokens)[0]
- assert outputs_without_lora.shape == [1, 77, 32]
- # monkey patch
- params = pipe._modify_text_encoder(pipe.text_encoder, pipe.lora_scale)
- set_lora_weights(params, randn_weight=False)
- # inference with lora
- outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0]
- assert outputs_with_lora.shape == [1, 77, 32]
- assert paddle.allclose(
- x=outputs_without_lora, y=outputs_with_lora
- ).item(), "lora_up_weight are all zero, so the lora outputs should be the same to without lora outputs"
- # create lora_attn_procs with randn up.weights
- create_text_encoder_lora_attn_procs(pipe.text_encoder)
- # monkey patch
- params = pipe._modify_text_encoder(pipe.text_encoder, pipe.lora_scale)
- set_lora_weights(params, randn_weight=True)
- # inference with lora
- outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0]
- assert outputs_with_lora.shape == [1, 77, 32]
- assert not paddle.allclose(
- x=outputs_without_lora, y=outputs_with_lora
- ).item(), "lora_up_weight are not zero, so the lora outputs should be different to without lora outputs"
-
- def test_text_encoder_lora_remove_monkey_patch(self):
- pipeline_components, _ = self.get_dummy_components()
- pipe = StableDiffusionPipeline(**pipeline_components)
- dummy_tokens = self.get_dummy_tokens()
- # inference without lora
- outputs_without_lora = pipe.text_encoder(**dummy_tokens)[0]
- assert outputs_without_lora.shape == [1, 77, 32]
- # monkey patch
- params = pipe._modify_text_encoder(pipe.text_encoder, pipe.lora_scale)
- set_lora_weights(params, randn_weight=True)
- # inference with lora
- outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0]
- assert outputs_with_lora.shape == [1, 77, 32]
- assert not paddle.allclose(
- x=outputs_without_lora, y=outputs_with_lora
- ).item(), "lora outputs should be different to without lora outputs"
- # remove monkey patch
- pipe._remove_text_encoder_monkey_patch()
- # inference with removed lora
- outputs_without_lora_removed = pipe.text_encoder(**dummy_tokens)[0]
- assert outputs_without_lora_removed.shape == [1, 77, 32]
- assert paddle.allclose(
- x=outputs_without_lora, y=outputs_without_lora_removed
- ).item(), "remove lora monkey patch should restore the original outputs"
-
- def test_text_encoder_lora_scale(self):
- pipeline_components, lora_components = self.get_dummy_components()
- sd_pipe = StableDiffusionPipeline(**pipeline_components)
- sd_pipe.set_progress_bar_config(disable=None)
- _, _, pipeline_inputs = self.get_dummy_inputs()
- with tempfile.TemporaryDirectory() as tmpdirname:
- LoraLoaderMixin.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(tmpdirname)
- lora_images = sd_pipe(**pipeline_inputs).images
- lora_image_slice = lora_images[0, -3:, -3:, -1]
- lora_images_with_scale = sd_pipe(**pipeline_inputs, cross_attention_kwargs={"scale": 0.5}).images
- lora_image_with_scale_slice = lora_images_with_scale[0, -3:, -3:, -1]
- # Outputs shouldn't match.
- self.assertFalse(
- paddle.allclose(
- x=paddle.to_tensor(data=lora_image_slice), y=paddle.to_tensor(data=lora_image_with_scale_slice)
- ).item()
- )
-
- def test_lora_unet_attn_processors(self):
- with tempfile.TemporaryDirectory() as tmpdirname:
- self.create_lora_weight_file(tmpdirname)
- pipeline_components, _ = self.get_dummy_components()
- sd_pipe = StableDiffusionPipeline(**pipeline_components)
- sd_pipe.set_progress_bar_config(disable=None)
- # check if vanilla attention processors are used
- for _, module in sd_pipe.unet.named_sublayers():
- if isinstance(module, Attention):
- self.assertIsInstance(module.processor, (AttnProcessor, AttnProcessor2_5))
- # load LoRA weight file
- sd_pipe.load_lora_weights(tmpdirname)
- # check if lora attention processors are used
- for _, module in sd_pipe.unet.named_sublayers():
- # if isinstance(module, Attention):
- # attn_proc_class = (
- # LoRAAttnProcessor2_5
- # if hasattr(paddle.nn.functional, "scaled_dot_product_attention_")
- # else LoRAAttnProcessor
- # )
- # self.assertIsInstance(module.processor, attn_proc_class)
- if isinstance(module, Attention):
- self.assertIsNotNone(module.to_q.lora_layer)
- self.assertIsNotNone(module.to_k.lora_layer)
- self.assertIsNotNone(module.to_v.lora_layer)
- self.assertIsNotNone(module.to_out[0].lora_layer)
-
- def test_unload_lora_sd(self):
- pipeline_components, lora_components = self.get_dummy_components()
- _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
- sd_pipe = StableDiffusionPipeline(**pipeline_components)
- original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- orig_image_slice = original_images[0, -3:, -3:, -1]
- # Emulate training.
- set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
- set_lora_weights(lora_components["text_encoder_lora_layers"].parameters(), randn_weight=True)
- with tempfile.TemporaryDirectory() as tmpdirname:
- LoraLoaderMixin.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(tmpdirname)
- lora_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- lora_image_slice = lora_images[0, -3:, -3:, -1]
- # Unload LoRA parameters.
- sd_pipe.unload_lora_weights()
- original_images_two = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- orig_image_slice_two = original_images_two[0, -3:, -3:, -1]
- assert not np.allclose(
- orig_image_slice, lora_image_slice
- ), "LoRA parameters should lead to a different image slice."
- assert not np.allclose(
- orig_image_slice_two, lora_image_slice
- ), "LoRA parameters should lead to a different image slice."
- assert np.allclose(
- orig_image_slice, orig_image_slice_two, atol=0.001
- ), "Unloading LoRA parameters should lead to results similar to what was obtained with the pipeline without any LoRA parameters."
-
- def test_lora_unet_attn_processors_with_xformers(self):
- with tempfile.TemporaryDirectory() as tmpdirname:
- self.create_lora_weight_file(tmpdirname)
- pipeline_components, _ = self.get_dummy_components()
- sd_pipe = StableDiffusionPipeline(**pipeline_components)
- sd_pipe.set_progress_bar_config(disable=None)
- # enable XFormers
- sd_pipe.enable_xformers_memory_efficient_attention()
- # check if xFormers attention processors are used
- for _, module in sd_pipe.unet.named_sublayers():
- if isinstance(module, Attention):
- self.assertIsInstance(module.processor, XFormersAttnProcessor)
- # load LoRA weight file
- sd_pipe.load_lora_weights(tmpdirname)
- # check if lora attention processors are used
- for _, module in sd_pipe.unet.named_sublayers():
- # if isinstance(module, Attention):
- # self.assertIsInstance(module.processor, LoRAXFormersAttnProcessor)
- if isinstance(module, Attention):
- self.assertIsNotNone(module.to_q.lora_layer)
- self.assertIsNotNone(module.to_k.lora_layer)
- self.assertIsNotNone(module.to_v.lora_layer)
- self.assertIsNotNone(module.to_out[0].lora_layer)
- # unload lora weights
- sd_pipe.unload_lora_weights()
- # check if attention processors are reverted back to xFormers
- for _, module in sd_pipe.unet.named_sublayers():
- if isinstance(module, Attention):
- self.assertIsInstance(module.processor, XFormersAttnProcessor)
-
- def test_lora_save_load_with_xformers(self):
- pipeline_components, lora_components = self.get_dummy_components()
- sd_pipe = StableDiffusionPipeline(**pipeline_components)
- sd_pipe.set_progress_bar_config(disable=None)
- _, _, pipeline_inputs = self.get_dummy_inputs()
- # enable XFormers
- sd_pipe.enable_xformers_memory_efficient_attention()
- original_images = sd_pipe(**pipeline_inputs).images
- orig_image_slice = original_images[0, -3:, -3:, -1]
- with tempfile.TemporaryDirectory() as tmpdirname:
- LoraLoaderMixin.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(tmpdirname)
- lora_images = sd_pipe(**pipeline_inputs).images
- lora_image_slice = lora_images[0, -3:, -3:, -1]
- # Outputs shouldn't match.
- self.assertFalse(
- paddle.allclose(
- x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
- ).item()
- )
-
-
-class SDXLLoraLoaderMixinTests(unittest.TestCase):
- def get_dummy_components(self):
- paddle.seed(0)
- unet = UNet2DConditionModel(
- block_out_channels=(32, 64),
- layers_per_block=2,
- sample_size=32,
- in_channels=4,
- out_channels=4,
- down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
- up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
- # SDXL-specific config below
- attention_head_dim=(2, 4),
- use_linear_projection=True,
- addition_embed_type="text_time",
- addition_time_embed_dim=8,
- transformer_layers_per_block=(1, 2),
- projection_class_embeddings_input_dim=80, # 6 * 8 + 32
- cross_attention_dim=64,
- )
- scheduler = EulerDiscreteScheduler(
- beta_start=0.00085,
- beta_end=0.012,
- steps_offset=1,
- beta_schedule="scaled_linear",
- timestep_spacing="leading",
- )
- paddle.seed(0)
- vae = AutoencoderKL(
- block_out_channels=[32, 64],
- in_channels=3,
- out_channels=3,
- down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
- up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
- latent_channels=4,
- sample_size=128,
- )
- paddle.seed(0)
- text_encoder_config = CLIPTextConfig(
- bos_token_id=0,
- eos_token_id=2,
- hidden_size=32,
- intermediate_size=37,
- layer_norm_eps=1e-05,
- num_attention_heads=4,
- num_hidden_layers=5,
- pad_token_id=1,
- vocab_size=1000,
- # SDXL-specific config below
- hidden_act="gelu",
- projection_dim=32,
- )
- text_encoder = CLIPTextModel(text_encoder_config)
- tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
- text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
- tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
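- # SDXL uses two text encoders, so dummy LoRA layers are created for the UNet and for both text encoders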
- unet_lora_attn_procs, unet_lora_layers = create_unet_lora_layers(unet)
- text_encoder_one_lora_layers = create_text_encoder_lora_layers(text_encoder)
- text_encoder_two_lora_layers = create_text_encoder_lora_layers(text_encoder_2)
-
- pipeline_components = {
- "unet": unet,
- "scheduler": scheduler,
- "vae": vae,
- "text_encoder": text_encoder,
- "text_encoder_2": text_encoder_2,
- "tokenizer": tokenizer,
- "tokenizer_2": tokenizer_2,
- }
- lora_components = {
- "unet_lora_layers": unet_lora_layers,
- "text_encoder_one_lora_layers": text_encoder_one_lora_layers,
- "text_encoder_two_lora_layers": text_encoder_two_lora_layers,
- "unet_lora_attn_procs": unet_lora_attn_procs,
- }
- return pipeline_components, lora_components
-
- def get_dummy_inputs(self, with_generator=True):
- batch_size = 1
- sequence_length = 10
- num_channels = 4
- sizes = (32, 32)
-
- generator = paddle.Generator().manual_seed(0)
- noise = floats_tensor((batch_size, num_channels) + sizes)
- input_ids = paddle.randint(low=1, high=sequence_length, shape=(batch_size, sequence_length))
- pipeline_inputs = {
- "prompt": "A painting of a squirrel eating a burger",
- "num_inference_steps": 2,
- "guidance_scale": 6.0,
- "output_type": "np",
- }
- if with_generator:
- pipeline_inputs.update({"generator": generator})
-
- return noise, input_ids, pipeline_inputs
-
- def test_lora_save_load(self):
- pipeline_components, lora_components = self.get_dummy_components()
- sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
- sd_pipe.set_progress_bar_config(disable=None)
- _, _, pipeline_inputs = self.get_dummy_inputs()
- original_images = sd_pipe(**pipeline_inputs).images
- orig_image_slice = original_images[0, -3:, -3:, -1]
- with tempfile.TemporaryDirectory() as tmpdirname:
- StableDiffusionXLPipeline.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
- text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(tmpdirname)
- lora_images = sd_pipe(**pipeline_inputs).images
- lora_image_slice = lora_images[0, -3:, -3:, -1]
- # Outputs shouldn't match.
- self.assertFalse(
- paddle.allclose(
- x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
- ).item()
- )
-
- def test_unload_lora_sdxl(self):
- pipeline_components, lora_components = self.get_dummy_components()
- _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
- sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
- original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- orig_image_slice = original_images[0, -3:, -3:, -1]
- # Emulate training.
- set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
- set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True)
- set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True)
- with tempfile.TemporaryDirectory() as tmpdirname:
- StableDiffusionXLPipeline.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
- text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(tmpdirname)
- lora_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- lora_image_slice = lora_images[0, -3:, -3:, -1]
- # Unload LoRA parameters.
- sd_pipe.unload_lora_weights()
- original_images_two = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- orig_image_slice_two = original_images_two[0, -3:, -3:, -1]
- assert not np.allclose(
- orig_image_slice, lora_image_slice
- ), "LoRA parameters should lead to a different image slice."
- assert not np.allclose(
- orig_image_slice_two, lora_image_slice
- ), "LoRA parameters should lead to a different image slice."
- assert np.allclose(
- orig_image_slice, orig_image_slice_two, atol=0.001
- ), "Unloading LoRA parameters should lead to results similar to what was obtained with the pipeline without any LoRA parameters."
-
- def test_load_lora_locally(self):
- pipeline_components, lora_components = self.get_dummy_components()
- sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
- sd_pipe.set_progress_bar_config(disable=None)
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- StableDiffusionXLPipeline.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
- text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
- safe_serialization=False,
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(os.path.join(tmpdirname, lora_weights_name))
-
- sd_pipe.unload_lora_weights()
-
- # Loaded LoRA layers wrap the original linear layers, so strip the "regular_linear_layer." prefix from state_dict keys before comparing.
- def replace_regular(self, keys):
- keys_new = []
- for i in keys:
- keys_new.append(i.replace("regular_linear_layer.", ""))
- return keys_new
-
- def test_text_encoder_lora_state_dict_unchanged(self):
- pipeline_components, lora_components = self.get_dummy_components()
- sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
-
- text_encoder_1_sd_keys = sorted(sd_pipe.text_encoder.state_dict().keys())
- text_encoder_2_sd_keys = sorted(sd_pipe.text_encoder_2.state_dict().keys())
-
- sd_pipe.set_progress_bar_config(disable=None)
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- StableDiffusionXLPipeline.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
- text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
- safe_serialization=False,
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(os.path.join(tmpdirname, lora_weights_name))
-
- text_encoder_1_sd_keys_2 = sorted(self.replace_regular(sd_pipe.text_encoder.state_dict().keys()))
- text_encoder_2_sd_keys_2 = sorted(self.replace_regular(sd_pipe.text_encoder_2.state_dict().keys()))
-
- sd_pipe.unload_lora_weights()
-
- text_encoder_1_sd_keys_3 = sorted(self.replace_regular(sd_pipe.text_encoder.state_dict().keys()))
- text_encoder_2_sd_keys_3 = sorted(self.replace_regular(sd_pipe.text_encoder_2.state_dict().keys()))
-
- # default & unloaded LoRA weights should have identical state_dicts
- assert text_encoder_1_sd_keys == text_encoder_1_sd_keys_3
- # default & loaded LoRA weights should NOT have identical state_dicts
- assert text_encoder_1_sd_keys != text_encoder_1_sd_keys_2
-
- # default & unloaded LoRA weights should have identical state_dicts
- assert text_encoder_2_sd_keys == text_encoder_2_sd_keys_3
- # default & loaded LoRA weights should NOT have identical state_dicts
- assert text_encoder_2_sd_keys != text_encoder_2_sd_keys_2
-
- def test_load_lora_locally_safetensors(self):
- pipeline_components, lora_components = self.get_dummy_components()
- sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
- sd_pipe.set_progress_bar_config(disable=None)
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- StableDiffusionXLPipeline.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
- text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
- safe_serialization=True,
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(os.path.join(tmpdirname, lora_weights_name))
-
- sd_pipe.unload_lora_weights()
-
- def test_lora_fusion(self):
- pipeline_components, lora_components = self.get_dummy_components()
- sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
- sd_pipe.set_progress_bar_config(disable=None)
-
- _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
-
- original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- orig_image_slice = original_images[0, -3:, -3:, -1]
-
- # Emulate training.
- set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
- set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True)
- set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True)
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- StableDiffusionXLPipeline.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
- text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
- safe_serialization=True,
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(os.path.join(tmpdirname, lora_weights_name))
-
- sd_pipe.fuse_lora()
- lora_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- lora_image_slice = lora_images[0, -3:, -3:, -1]
-
- self.assertFalse(np.allclose(orig_image_slice, lora_image_slice, atol=1e-3))
-
- def test_unfuse_lora(self):
- pipeline_components, lora_components = self.get_dummy_components()
- sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
- sd_pipe.set_progress_bar_config(disable=None)
-
- _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
-
- original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- orig_image_slice = original_images[0, -3:, -3:, -1]
-
- # Emulate training.
- set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
- set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True)
- set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True)
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- StableDiffusionXLPipeline.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
- text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
- safe_serialization=True,
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(os.path.join(tmpdirname, lora_weights_name))
-
- sd_pipe.fuse_lora()
- lora_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- lora_image_slice = lora_images[0, -3:, -3:, -1]
-
- # Reverse LoRA fusion.
- sd_pipe.unfuse_lora()
- original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- orig_image_slice_two = original_images[0, -3:, -3:, -1]
-
- assert not np.allclose(
- orig_image_slice, lora_image_slice
- ), "Fusion of LoRAs should lead to a different image slice."
- assert not np.allclose(
- orig_image_slice_two, lora_image_slice
- ), "Fusion of LoRAs should lead to a different image slice."
- assert np.allclose(
- orig_image_slice, orig_image_slice_two, atol=5e-3
- ), "Reversing LoRA fusion should lead to results similar to what was obtained with the pipeline without any LoRA parameters."
-
- def test_lora_fusion_is_not_affected_by_unloading(self):
- paddle.seed(0)
- pipeline_components, lora_components = self.get_dummy_components()
- sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
- sd_pipe.set_progress_bar_config(disable=None)
-
- _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
-
- _ = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
-
- # Emulate training.
- set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
- set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True)
- set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True)
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- StableDiffusionXLPipeline.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
- text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
- safe_serialization=True,
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(os.path.join(tmpdirname, lora_weights_name))
-
- sd_pipe.fuse_lora()
- lora_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- lora_image_slice = lora_images[0, -3:, -3:, -1]
-
- # Unload LoRA parameters.
- sd_pipe.unload_lora_weights()
- images_with_unloaded_lora = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- images_with_unloaded_lora_slice = images_with_unloaded_lora[0, -3:, -3:, -1]
-
- assert (
- np.abs(lora_image_slice - images_with_unloaded_lora_slice).max() < 2e-1 # diffusers 0.23
- ), "`unload_lora_weights()` should have not effect on the semantics of the results as the LoRA parameters were fused."
-
- def test_fuse_lora_with_different_scales(self):
- pipeline_components, lora_components = self.get_dummy_components()
- sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
- sd_pipe.set_progress_bar_config(disable=None)
-
- _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
-
- _ = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
-
- # Emulate training.
- set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
- set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True)
- set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True)
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- StableDiffusionXLPipeline.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
- text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
- safe_serialization=True,
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(os.path.join(tmpdirname, lora_weights_name))
-
- sd_pipe.fuse_lora(lora_scale=1.0)
- lora_images_scale_one = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- lora_image_slice_scale_one = lora_images_scale_one[0, -3:, -3:, -1]
-
- # Reverse LoRA fusion.
- sd_pipe.unfuse_lora()
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- StableDiffusionXLPipeline.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
- text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
- safe_serialization=True,
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(os.path.join(tmpdirname, lora_weights_name))
-
- sd_pipe.fuse_lora(lora_scale=0.5)
- lora_images_scale_0_5 = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- lora_image_slice_scale_0_5 = lora_images_scale_0_5[0, -3:, -3:, -1]
-
- assert not np.allclose(
- lora_image_slice_scale_one, lora_image_slice_scale_0_5, atol=1e-03
- ), "Different LoRA scales should influence the outputs accordingly."
-
- def test_with_different_scales(self):
- pipeline_components, lora_components = self.get_dummy_components()
- sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
- sd_pipe.set_progress_bar_config(disable=None)
-
- _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
- original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- original_image_slice = original_images[0, -3:, -3:, -1]
-
- # Emulate training.
- set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
- set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True)
- set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True)
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- StableDiffusionXLPipeline.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
- text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
- safe_serialization=True,
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(os.path.join(tmpdirname, lora_weights_name))
-
- lora_images_scale_one = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- lora_image_slice_scale_one = lora_images_scale_one[0, -3:, -3:, -1]
-
- lora_images_scale_0_5 = sd_pipe(
- **pipeline_inputs, generator=paddle.Generator().manual_seed(0), cross_attention_kwargs={"scale": 0.5}
- ).images
- lora_image_slice_scale_0_5 = lora_images_scale_0_5[0, -3:, -3:, -1]
-
- lora_images_scale_0_0 = sd_pipe(
- **pipeline_inputs, generator=paddle.Generator().manual_seed(0), cross_attention_kwargs={"scale": 0.0}
- ).images
- lora_image_slice_scale_0_0 = lora_images_scale_0_0[0, -3:, -3:, -1]
-
- assert not np.allclose(
- lora_image_slice_scale_one, lora_image_slice_scale_0_5, atol=1e-03
- ), "Different LoRA scales should influence the outputs accordingly."
-
- assert np.allclose(
- original_image_slice, lora_image_slice_scale_0_0, atol=1e-03
- ), "LoRA scale of 0.0 shouldn't be different from the results without LoRA."
-
- def test_with_different_scales_fusion_equivalence(self):
- pipeline_components, lora_components = self.get_dummy_components()
- sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
- # sd_pipe.unet.set_default_attn_processor()
- sd_pipe.set_progress_bar_config(disable=None)
-
- _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
-
- images = sd_pipe(
- **pipeline_inputs,
- generator=paddle.Generator().manual_seed(0),
- ).images
- images_slice = images[0, -3:, -3:, -1]
-
- # Emulate training.
- set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True, var=0.1)
- set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True, var=0.1)
- set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True, var=0.1)
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- StableDiffusionXLPipeline.save_lora_weights(
- save_directory=tmpdirname,
- unet_lora_layers=lora_components["unet_lora_layers"],
- text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
- text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
- safe_serialization=True,
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- sd_pipe.load_lora_weights(os.path.join(tmpdirname, lora_weights_name))
-
- lora_images_scale_0_5 = sd_pipe(
- **pipeline_inputs,
- generator=paddle.Generator().manual_seed(0),
- cross_attention_kwargs={"scale": 0.5},
- ).images
- lora_image_slice_scale_0_5 = lora_images_scale_0_5[0, -3:, -3:, -1]
-
- sd_pipe.fuse_lora(lora_scale=0.5)
- lora_images_scale_0_5_fusion = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- lora_image_slice_scale_0_5_fusion = lora_images_scale_0_5_fusion[0, -3:, -3:, -1]
-
- assert np.allclose(
- lora_image_slice_scale_0_5, lora_image_slice_scale_0_5_fusion, atol=5e-03
- ), "Fusion shouldn't affect the results when calling the pipeline with a non-default LoRA scale."
-
- sd_pipe.unfuse_lora()
- images_unfused = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
- images_slice_unfused = images_unfused[0, -3:, -3:, -1]
-
- assert np.allclose(images_slice, images_slice_unfused, atol=5e-03), "Unfused should match no LoRA"
-
- assert not np.allclose(
- images_slice, lora_image_slice_scale_0_5, atol=5e-03
- ), "0.5 scale and no scale shouldn't match"
-
-
-# @deprecate_after_peft_backend
-class UNet2DConditionLoRAModelTests(unittest.TestCase):
- model_class = UNet2DConditionModel
- main_input_name = "sample"
-
- @property
- def dummy_input(self):
- batch_size = 4
- num_channels = 4
- sizes = (32, 32)
-
- noise = floats_tensor((batch_size, num_channels) + sizes)
- time_step = paddle.to_tensor([10])
- encoder_hidden_states = floats_tensor((batch_size, 4, 32))
-
- return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
-
- @property
- def input_shape(self):
- return (4, 32, 32)
-
- @property
- def output_shape(self):
- return (4, 32, 32)
-
- def prepare_init_args_and_inputs_for_common(self):
- init_dict = {
- "block_out_channels": (32, 64),
- "down_block_types": ("CrossAttnDownBlock2D", "DownBlock2D"),
- "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D"),
- "cross_attention_dim": 32,
- "attention_head_dim": 8,
- "out_channels": 4,
- "in_channels": 4,
- "layers_per_block": 2,
- "sample_size": 32,
- }
- inputs_dict = self.dummy_input
- return init_dict, inputs_dict
-
- def test_lora_processors(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
- init_dict["attention_head_dim"] = 8, 16
- model = self.model_class(**init_dict)
- with paddle.no_grad():
- sample1 = model(**inputs_dict).sample
- lora_attn_procs = create_lora_layers(model)
- model.set_attn_processor(lora_attn_procs)
- model.set_attn_processor(model.attn_processors)
- with paddle.no_grad():
- sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
- sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
- sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
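- # scale=0.0 disables the LoRA contribution, so sample2 should match the base output (sample1);
- # scale=0.5 applies it, so sample3 and sample4 should match each other but differ from sample2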
- assert (sample1 - sample2).abs().max() < 3e-3
- assert (sample3 - sample4).abs().max() < 3e-3
- assert (sample2 - sample3).abs().max() > 3e-3
-
- def test_lora_save_load(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
- init_dict["attention_head_dim"] = 8, 16
- paddle.seed(0)
- model = self.model_class(**init_dict)
- with paddle.no_grad():
- old_sample = model(**inputs_dict).sample
- lora_attn_procs = create_lora_layers(model)
- model.set_attn_processor(lora_attn_procs)
- with paddle.no_grad():
- sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
- with tempfile.TemporaryDirectory() as tmpdirname:
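- # to_diffusers=False saves the processors in the native Paddle format (checked via lora_weights_name below)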
- model.save_attn_procs(tmpdirname, to_diffusers=False)
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- paddle.seed(0)
- new_model = self.model_class(**init_dict)
- new_model.load_attn_procs(tmpdirname, from_diffusers=False)
-
- with paddle.no_grad():
- new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-
- assert (sample - new_sample).abs().max() < 1e-4
-
- # LoRA and no LoRA should NOT be the same
- assert (sample - old_sample).abs().max() > 1e-4
-
- def test_lora_save_load_safetensors(self):
- # round-trip the LoRA attention processors through safetensors (diffusers format)
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
- init_dict["attention_head_dim"] = (8, 16)
-
- paddle.seed(0)
- model = self.model_class(**init_dict)
-
- with paddle.no_grad():
- old_sample = model(**inputs_dict).sample
-
- lora_attn_procs = create_lora_layers(model)
- model.set_attn_processor(lora_attn_procs)
-
- with paddle.no_grad():
- sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True)
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
- paddle.seed(0)
- new_model = self.model_class(**init_dict)
- new_model.load_attn_procs(tmpdirname, from_diffusers=True, use_safetensors=True)
- with paddle.no_grad():
- new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
- assert (sample - new_sample).abs().max() < 0.0001
- assert (sample - old_sample).abs().max() > 0.0001
-
- # def test_lora_save_safetensors_load_torch(self):
- # # enable deterministic behavior for gradient checkpointing
- # init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
- # init_dict["attention_head_dim"] = (8, 16)
-
- # paddle.seed(0)
- # model = self.model_class(**init_dict)
-
- # lora_attn_procs = create_lora_layers(model, mock_weights=False)
- # model.set_attn_processor(lora_attn_procs)
- # # Saving as paddle, properly reloads with directly filename
- # with tempfile.TemporaryDirectory() as tmpdirname:
- # model.save_attn_procs(tmpdirname, to_diffusers=True)
- # self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
- # paddle.seed(0)
- # new_model = self.model_class(**init_dict)
- # new_model.load_attn_procs(
- # tmpdirname, weight_name="pytorch_lora_weights.bin", from_diffusers=True, use_safetensors=False
- # )
-
- def test_lora_save_torch_force_load_safetensors_error(self):
- pass
-
- def test_lora_on_off(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
- init_dict["attention_head_dim"] = 8, 16
- paddle.seed(0)
- model = self.model_class(**init_dict)
- with paddle.no_grad():
- old_sample = model(**inputs_dict).sample
- lora_attn_procs = create_lora_layers(model)
- model.set_attn_processor(lora_attn_procs)
- with paddle.no_grad():
- sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
- model.set_default_attn_processor()
- with paddle.no_grad():
- new_sample = model(**inputs_dict).sample
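- # with scale=0.0 the LoRA processors are effectively a no-op, so both runs should match the original output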
- assert (sample - new_sample).abs().max() < 0.0001
- assert (sample - old_sample).abs().max() < 3e-3
-
- @unittest.skipIf(
- not is_ppxformers_available(),
- reason="scaled_dot_product_attention attention is only available with CUDA and `scaled_dot_product_attention` installed",
- )
- def test_lora_xformers_on_off(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
- init_dict["attention_head_dim"] = 8, 16
- paddle.seed(0)
- model = self.model_class(**init_dict)
- lora_attn_procs = create_lora_layers(model)
- model.set_attn_processor(lora_attn_procs)
- with paddle.no_grad():
- sample = model(**inputs_dict).sample
- model.enable_xformers_memory_efficient_attention()
- on_sample = model(**inputs_dict).sample
- model.disable_xformers_memory_efficient_attention()
- off_sample = model(**inputs_dict).sample
- assert (sample - on_sample).abs().max() < 0.05
- assert (sample - off_sample).abs().max() < 0.05
-
-
-# @deprecate_after_peft_backend
-class UNet3DConditionModelTests(unittest.TestCase):
- model_class = UNet3DConditionModel
- main_input_name = "sample"
-
- @property
- def dummy_input(self):
- batch_size = 4
- num_channels = 4
- num_frames = 4
- sizes = (32, 32)
-
- noise = floats_tensor((batch_size, num_channels, num_frames) + sizes)
- time_step = paddle.to_tensor([10])
- encoder_hidden_states = floats_tensor((batch_size, 4, 32))
-
- return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
-
- @property
- def input_shape(self):
- return (4, 4, 32, 32)
-
- @property
- def output_shape(self):
- return (4, 4, 32, 32)
-
- def prepare_init_args_and_inputs_for_common(self):
- init_dict = {
- "block_out_channels": (32, 64),
- "down_block_types": (
- "CrossAttnDownBlock3D",
- "DownBlock3D",
- ),
- "up_block_types": ("UpBlock3D", "CrossAttnUpBlock3D"),
- "cross_attention_dim": 32,
- "attention_head_dim": 8,
- "out_channels": 4,
- "in_channels": 4,
- "layers_per_block": 1,
- "sample_size": 32,
- }
- inputs_dict = self.dummy_input
- return init_dict, inputs_dict
-
- def test_lora_processors(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
- init_dict["attention_head_dim"] = 8
-
- model = self.model_class(**init_dict)
-
- with paddle.no_grad():
- sample1 = model(**inputs_dict).sample
-
- lora_attn_procs = create_lora_3d_layers(model)
-
- # make sure we can set a list of attention processors
- model.set_attn_processor(lora_attn_procs)
-
- # test that attn processors can be set to itself
- model.set_attn_processor(model.attn_processors)
-
- with paddle.no_grad():
- sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
- sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
- sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-
- assert (sample1 - sample2).abs().max() < 3e-3
- assert (sample3 - sample4).abs().max() < 3e-3
-
- # sample 2 and sample 3 should be different
- assert (sample2 - sample3).abs().max() > 3e-3
-
- def test_lora_save_load(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
- init_dict["attention_head_dim"] = 8
-
- paddle.seed(0)
- model = self.model_class(**init_dict)
-
- with paddle.no_grad():
- old_sample = model(**inputs_dict).sample
-
- lora_attn_procs = create_lora_3d_layers(model)
- model.set_attn_processor(lora_attn_procs)
-
- with paddle.no_grad():
- sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- model.save_attn_procs(
- tmpdirname,
- to_diffusers=False,
- )
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
- paddle.seed(0)
- new_model = self.model_class(**init_dict)
- new_model.load_attn_procs(tmpdirname, from_diffusers=False)
-
- with paddle.no_grad():
- new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-
- assert (sample - new_sample).abs().max() < 1e-3
-
- # LoRA and no LoRA should NOT be the same
- assert (sample - old_sample).abs().max() > 1e-4
-
- def test_lora_save_load_safetensors(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
- init_dict["attention_head_dim"] = 8
-
- paddle.seed(0)
- model = self.model_class(**init_dict)
-
- with paddle.no_grad():
- old_sample = model(**inputs_dict).sample
-
- lora_attn_procs = create_lora_3d_layers(model)
- model.set_attn_processor(lora_attn_procs)
-
- with paddle.no_grad():
- sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True)
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
- paddle.seed(0)
- new_model = self.model_class(**init_dict)
- new_model.load_attn_procs(tmpdirname, use_safetensors=True, from_diffusers=True)
-
- with paddle.no_grad():
- new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
-
- assert (sample - new_sample).abs().max() < 3e-3
-
- # LoRA and no LoRA should NOT be the same
- assert (sample - old_sample).abs().max() > 1e-4
-
- # def test_lora_save_safetensors_load_torch(self):
- # # enable deterministic behavior for gradient checkpointing
- # init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
- # init_dict["attention_head_dim"] = 8
-
- # paddle.seed(0)
- # model = self.model_class(**init_dict)
-
- # lora_attn_procs = create_lora_layers(model, mock_weights=False)
- # model.set_attn_processor(lora_attn_procs)
- # # Saving as paddle, properly reloads with directly filename
- # with tempfile.TemporaryDirectory() as tmpdirname:
- # model.save_attn_procs(tmpdirname, to_diffusers=True)
- # self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
- # paddle.seed(0)
- # new_model = self.model_class(**init_dict)
- # new_model.load_attn_procs(
- # tmpdirname, weight_name="pytorch_lora_weights.bin", use_safetensors=False, from_diffusers=True
- # )
-
- def test_lora_save_paddle_force_load_safetensors_error(self):
- pass
-
- def test_lora_on_off(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
- init_dict["attention_head_dim"] = 8
-
- paddle.seed(0)
- model = self.model_class(**init_dict)
-
- with paddle.no_grad():
- old_sample = model(**inputs_dict).sample
-
- lora_attn_procs = create_lora_3d_layers(model)
- model.set_attn_processor(lora_attn_procs)
-
- with paddle.no_grad():
- sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
-
- model.set_attn_processor(AttnProcessor())
-
- with paddle.no_grad():
- new_sample = model(**inputs_dict).sample
-
- assert (sample - new_sample).abs().max() < 1e-4
- assert (sample - old_sample).abs().max() < 3e-3
-
- @unittest.skipIf(
- not is_ppxformers_available(),
- reason="XFormers attention is only available with CUDA and `xformers` installed",
- )
- def test_lora_xformers_on_off(self):
- # compare default attention against xFormers memory-efficient attention with LoRA processors set
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
- init_dict["attention_head_dim"] = 4
-
- paddle.seed(0)
- model = self.model_class(**init_dict)
- lora_attn_procs = create_lora_3d_layers(model)
- model.set_attn_processor(lora_attn_procs)
-
- # default
- with paddle.no_grad():
- sample = model(**inputs_dict).sample
-
- model.enable_xformers_memory_efficient_attention()
- on_sample = model(**inputs_dict).sample
-
- model.disable_xformers_memory_efficient_attention()
- off_sample = model(**inputs_dict).sample
-
- assert (sample - on_sample).abs().max() < 0.005
- assert (sample - off_sample).abs().max() < 0.005
-
-
-@slow
-@require_paddle_gpu
-class LoraIntegrationTests(unittest.TestCase):
- def test_dreambooth_old_format(self):
- generator = paddle.Generator().manual_seed(0)
- lora_model_id = "hf-internal-testing/lora_dreambooth_dog_example"
- card = RepoCard.load(lora_model_id)
- base_model_id = card.data.to_dict()["base_model"]
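- # the LoRA repo's model card records which base checkpoint the adapter was trained on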
- pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
- pipe.load_lora_weights(lora_model_id)
- images = pipe(
- "A photo of a sks dog floating in the river", output_type="np", generator=generator, num_inference_steps=5
- ).images
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array([0.8065, 0.8114, 0.7948, 0.822, 0.8092, 0.8031, 0.8052, 0.8012, 0.7677])
- self.assertTrue(np.allclose(images, expected, atol=0.0001))
-
- def test_dreambooth_text_encoder_new_format(self):
- generator = paddle.Generator().manual_seed(0)
- lora_model_id = "hf-internal-testing/lora-trained"
- card = RepoCard.load(lora_model_id)
- base_model_id = card.data.to_dict()["base_model"]
- pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
- pipe.load_lora_weights(lora_model_id)
- images = pipe("A photo of a sks dog", output_type="np", generator=generator, num_inference_steps=5).images
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array([0.3782, 0.367, 0.3792, 0.3758, 0.3719, 0.3851, 0.382, 0.3842, 0.4009])
- self.assertTrue(np.allclose(images, expected, atol=0.0001))
-
- def test_a1111(self):
- generator = paddle.Generator().manual_seed(0)
- # hf-internal-testing/Counterfeit-V2.5 -> gsdf/Counterfeit-V2.5
- pipe = StableDiffusionPipeline.from_pretrained("gsdf/Counterfeit-V2.5", safety_checker=None)
- lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
- lora_filename = "light_and_shadow.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, from_diffusers=True)
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array(
- [0.5849, 0.5824, 0.5898, 0.5802, 0.5676, 0.5974, 0.5921, 0.5987, 0.6081],
- )
-
- self.assertTrue(np.allclose(images, expected, atol=0.0001))
-
- def test_a1111_with_model_cpu_offload(self):
- generator = paddle.Generator().manual_seed(0)
-
- pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None)
- pipe.enable_model_cpu_offload()
- lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
- lora_filename = "light_and_shadow.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
-
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
-
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array([0.3636, 0.3708, 0.3694, 0.3679, 0.3829, 0.3677, 0.3692, 0.3688, 0.3292])
-
- self.assertTrue(np.allclose(images, expected, atol=1e-3))
-
- def test_a1111_with_sequential_cpu_offload(self):
- generator = paddle.Generator().manual_seed(0)
-
- pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/Counterfeit-V2.5", safety_checker=None)
- pipe.enable_sequential_cpu_offload()
- lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
- lora_filename = "light_and_shadow.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
-
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
-
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array([0.3636, 0.3708, 0.3694, 0.3679, 0.3829, 0.3677, 0.3692, 0.3688, 0.3292])
-
- self.assertTrue(np.allclose(images, expected, atol=1e-3))
-
- def test_kohya_sd_v15_with_higher_dimensions(self):
- generator = paddle.Generator().manual_seed(0)
-
- pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None)
- lora_model_id = "hf-internal-testing/urushisato-lora"
- lora_filename = "urushisato_v15.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
-
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
-
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array([0.7165, 0.6616, 0.5833, 0.7504, 0.6718, 0.587, 0.6871, 0.6361, 0.5694])
-
- self.assertTrue(np.allclose(images, expected, atol=1e-3))
-
- def test_vanilla_finetuning(self):
- generator = paddle.Generator().manual_seed(0)
- lora_model_id = "hf-internal-testing/sd-model-finetuned-lora-t4"
- card = RepoCard.load(lora_model_id)
- base_model_id = card.data.to_dict()["base_model"]
- pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
- pipe.load_lora_weights(lora_model_id, from_diffusers=True)
- images = pipe("A pokemon with blue eyes.", output_type="np", generator=generator, num_inference_steps=5).images
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array(
- [0.719, 0.716, 0.6854, 0.7449, 0.7188, 0.7124, 0.7403, 0.7287, 0.6979],
- )
- self.assertTrue(np.allclose(images, expected, atol=0.0001))
-
- def test_unload_kohya_lora(self):
- generator = paddle.Generator().manual_seed(0)
- prompt = "masterpiece, best quality, mountain"
- num_inference_steps = 5
- pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None)
- initial_images = pipe(
- prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
- ).images
- initial_images = initial_images[0, -3:, -3:, -1].flatten()
- lora_model_id = "hf-internal-testing/civitai-colored-icons-lora"
- lora_filename = "Colored_Icons_by_vizsumit.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, from_diffusers=True)
- generator = paddle.Generator().manual_seed(0)
- lora_images = pipe(
- prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
- ).images
- lora_images = lora_images[0, -3:, -3:, -1].flatten()
- pipe.unload_lora_weights()
- generator = paddle.Generator().manual_seed(0)
- unloaded_lora_images = pipe(
- prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
- ).images
- unloaded_lora_images = unloaded_lora_images[0, -3:, -3:, -1].flatten()
- self.assertFalse(np.allclose(initial_images, lora_images))
- self.assertTrue(np.allclose(initial_images, unloaded_lora_images, atol=0.001))
-
- def test_load_unload_load_kohya_lora(self):
- # This test ensures that a Kohya-style LoRA can be safely unloaded and then loaded
- # without introducing any side-effects. Even though the test uses a Kohya-style
- # LoRA, the underlying adapter handling mechanism is format-agnostic.
- generator = paddle.Generator().manual_seed(0)
- prompt = "masterpiece, best quality, mountain"
- num_inference_steps = 5
- pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None)
- initial_images = pipe(
- prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
- ).images
- initial_images = initial_images[0, -3:, -3:, -1].flatten()
-
- lora_model_id = "hf-internal-testing/civitai-colored-icons-lora"
- lora_filename = "Colored_Icons_by_vizsumit.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, from_diffusers=True)
- generator = paddle.Generator().manual_seed(0)
- lora_images = pipe(
- prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
- ).images
- lora_images = lora_images[0, -3:, -3:, -1].flatten()
-
- pipe.unload_lora_weights()
- generator = paddle.Generator().manual_seed(0)
- unloaded_lora_images = pipe(
- prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
- ).images
- unloaded_lora_images = unloaded_lora_images[0, -3:, -3:, -1].flatten()
-
- self.assertFalse(np.allclose(initial_images, lora_images))
- self.assertTrue(np.allclose(initial_images, unloaded_lora_images, atol=0.001))
- # make sure we can load a LoRA again after unloading and they don't have
- # any undesired effects.
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, from_diffusers=True)
- generator = paddle.Generator().manual_seed(0)
- lora_images_again = pipe(
- prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
- ).images
- lora_images_again = lora_images_again[0, -3:, -3:, -1].flatten()
- self.assertTrue(np.allclose(lora_images, lora_images_again, atol=0.001))
-
- def test_sdxl_0_9_lora_one(self):
- generator = paddle.Generator().manual_seed(0)
-
- pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9")
- lora_model_id = "hf-internal-testing/sdxl-0.9-daiton-lora"
- lora_filename = "daiton-xl-lora-test.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
- pipe.enable_model_cpu_offload()
-
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
-
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array([0.3838, 0.3482, 0.3588, 0.3162, 0.319, 0.3369, 0.338, 0.3366, 0.3213])
-
- self.assertTrue(np.allclose(images, expected, atol=1e-3))
-
- def test_sdxl_0_9_lora_two(self):
- generator = paddle.Generator().manual_seed(0)
-
- pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9")
- lora_model_id = "hf-internal-testing/sdxl-0.9-costumes-lora"
- lora_filename = "saijo.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
- pipe.enable_model_cpu_offload()
-
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
-
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array([0.3137, 0.3269, 0.3355, 0.255, 0.2577, 0.2563, 0.2679, 0.2758, 0.2626])
-
- self.assertTrue(np.allclose(images, expected, atol=1e-3))
-
- def test_sdxl_0_9_lora_three(self):
- generator = paddle.Generator().manual_seed(0)
-
- pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-0.9")
- lora_model_id = "hf-internal-testing/sdxl-0.9-kamepan-lora"
- lora_filename = "kame_sdxl_v2-000020-16rank.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
- pipe.enable_model_cpu_offload()
-
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
-
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array([0.4015, 0.3761, 0.3616, 0.3745, 0.3462, 0.3337, 0.3564, 0.3649, 0.3468])
-
- self.assertTrue(np.allclose(images, expected, atol=5e-3))
-
- def test_sdxl_1_0_lora(self):
- generator = paddle.Generator().manual_seed(0)
-
- pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
- pipe.enable_model_cpu_offload()
- lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
- lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
-
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
-
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535])
-
- self.assertTrue(np.allclose(images, expected, atol=1e-4))
-
- def test_sdxl_1_0_lora_fusion(self):
- generator = paddle.Generator().manual_seed(0)
-
- pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
- lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
- lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
- pipe.fuse_lora()
- pipe.enable_model_cpu_offload()
-
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
-
- images = images[0, -3:, -3:, -1].flatten()
- # This way we also test equivalence between LoRA fusion and the non-fusion behaviour.
- expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535])
-
- self.assertTrue(np.allclose(images, expected, atol=1e-4))
-
- def test_sdxl_1_0_lora_unfusion(self):
- generator = paddle.Generator().manual_seed(0)
-
- pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
- lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
- lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
- pipe.fuse_lora()
- pipe.enable_model_cpu_offload()
-
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
- images_with_fusion = images[0, -3:, -3:, -1].flatten()
-
- pipe.unfuse_lora()
- generator = paddle.Generator().manual_seed(0)
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
- images_without_fusion = images[0, -3:, -3:, -1].flatten()
-
- self.assertFalse(np.allclose(images_with_fusion, images_without_fusion, atol=1e-3))
-
- def test_sdxl_1_0_lora_unfusion_effectivity(self):
- pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
- pipe.enable_model_cpu_offload()
-
- generator = paddle.Generator().manual_seed(0)
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
- original_image_slice = images[0, -3:, -3:, -1].flatten()
-
- lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
- lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
- pipe.fuse_lora()
-
- generator = paddle.Generator().manual_seed(0)
- _ = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
-
- pipe.unfuse_lora()
- generator = paddle.Generator().manual_seed(0)
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
- images_without_fusion_slice = images[0, -3:, -3:, -1].flatten()
-
- self.assertTrue(np.allclose(original_image_slice, images_without_fusion_slice, atol=1e-3))
-
- def test_sdxl_1_0_lora_fusion_efficiency(self):
- generator = paddle.Generator().manual_seed(0)
- lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
- lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
-
- pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
- pipe.enable_model_cpu_offload()
-
- start_time = time.time()
- for _ in range(3):
- pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
- end_time = time.time()
- elapsed_time_non_fusion = end_time - start_time
-
- del pipe
-
- pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
- pipe.fuse_lora()
- pipe.enable_model_cpu_offload()
-
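- # with the LoRA fused into the base weights, inference skips the extra LoRA matmuls and should be faster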
- start_time = time.time()
- generator = paddle.Generator().manual_seed(0)
- for _ in range(3):
- pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
- end_time = time.time()
- elapsed_time_fusion = end_time - start_time
-
- self.assertTrue(elapsed_time_fusion < elapsed_time_non_fusion)
-
- def test_sdxl_1_0_last_ben(self):
- generator = paddle.Generator().manual_seed(0)
-
- pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
- pipe.enable_model_cpu_offload()
- lora_model_id = "TheLastBen/Papercut_SDXL"
- lora_filename = "papercut.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
-
- images = pipe("papercut.safetensors", output_type="np", generator=generator, num_inference_steps=2).images
-
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array([0.5244, 0.4347, 0.4312, 0.4246, 0.4398, 0.4409, 0.4884, 0.4938, 0.4094])
-
- self.assertTrue(np.allclose(images, expected, atol=1e-3))
-
- def test_sdxl_1_0_fuse_unfuse_all(self):
- pipe = DiffusionPipeline.from_pretrained(
- "stabilityai/stable-diffusion-xl-base-1.0", paddle_dtype=paddle.float16
- )
- text_encoder_1_sd = copy.deepcopy(pipe.text_encoder.state_dict())
- text_encoder_2_sd = copy.deepcopy(pipe.text_encoder_2.state_dict())
- unet_sd = copy.deepcopy(pipe.unet.state_dict())
-
- pipe.load_lora_weights(
- "davizca87/sun-flower", weight_name="snfw3rXL-000004.safetensors", paddle_dtype=paddle.float16
- )
- pipe.fuse_lora()
- pipe.unload_lora_weights()
- pipe.unfuse_lora()
-
- assert state_dicts_almost_equal(text_encoder_1_sd, pipe.text_encoder.state_dict())
- assert state_dicts_almost_equal(text_encoder_2_sd, pipe.text_encoder_2.state_dict())
- assert state_dicts_almost_equal(unet_sd, pipe.unet.state_dict())
-
- def test_sdxl_1_0_lora_with_sequential_cpu_offloading(self):
- generator = paddle.Generator().manual_seed(0)
-
- pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
- pipe.enable_sequential_cpu_offload()
- lora_model_id = "hf-internal-testing/sdxl-1.0-lora"
- lora_filename = "sd_xl_offset_example-lora_1.0.safetensors"
- pipe.load_lora_weights(lora_model_id, weight_name=lora_filename)
-
- images = pipe(
- "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
- ).images
-
- images = images[0, -3:, -3:, -1].flatten()
- expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535])
-
- self.assertTrue(np.allclose(images, expected, atol=1e-3))
diff --git a/ppdiffusers/tests/models/test_activations.py b/ppdiffusers/tests/models/test_activations.py
deleted file mode 100644
index a5d9c2412..000000000
--- a/ppdiffusers/tests/models/test_activations.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import paddle
-from paddle import nn
-
-from ppdiffusers.models.activations import get_activation
-
-
-class ActivationsTests(unittest.TestCase):
- def test_swish(self):
- act = get_activation("swish")
-
- self.assertIsInstance(act, nn.Silu)
-
- self.assertEqual(act(paddle.to_tensor(-100, dtype=paddle.float32)).item(), 0)
- self.assertNotEqual(act(paddle.to_tensor(-1, dtype=paddle.float32)).item(), 0)
- self.assertEqual(act(paddle.to_tensor(0, dtype=paddle.float32)).item(), 0)
- self.assertEqual(act(paddle.to_tensor(20, dtype=paddle.float32)).item(), 20)
-
- def test_silu(self):
- act = get_activation("silu")
-
- self.assertIsInstance(act, nn.Silu)
-
- self.assertEqual(act(paddle.to_tensor(-100, dtype=paddle.float32)).item(), 0)
- self.assertNotEqual(act(paddle.to_tensor(-1, dtype=paddle.float32)).item(), 0)
- self.assertEqual(act(paddle.to_tensor(0, dtype=paddle.float32)).item(), 0)
- self.assertEqual(act(paddle.to_tensor(20, dtype=paddle.float32)).item(), 20)
-
- def test_mish(self):
- act = get_activation("mish")
-
- self.assertIsInstance(act, nn.Mish)
-
- self.assertEqual(act(paddle.to_tensor(-200, dtype=paddle.float32)).item(), 0)
- self.assertNotEqual(act(paddle.to_tensor(-1, dtype=paddle.float32)).item(), 0)
- self.assertEqual(act(paddle.to_tensor(0, dtype=paddle.float32)).item(), 0)
- self.assertEqual(act(paddle.to_tensor(20, dtype=paddle.float32)).item(), 20)
-
- def test_gelu(self):
- act = get_activation("gelu")
-
- self.assertIsInstance(act, nn.GELU)
-
- self.assertEqual(act(paddle.to_tensor(-100, dtype=paddle.float32)).item(), 0)
- self.assertNotEqual(act(paddle.to_tensor(-1, dtype=paddle.float32)).item(), 0)
- self.assertEqual(act(paddle.to_tensor(0, dtype=paddle.float32)).item(), 0)
- self.assertEqual(act(paddle.to_tensor(20, dtype=paddle.float32)).item(), 20)
diff --git a/ppdiffusers/tests/models/test_lora_layers.py b/ppdiffusers/tests/models/test_lora_layers.py
new file mode 100644
index 000000000..de3263cdb
--- /dev/null
+++ b/ppdiffusers/tests/models/test_lora_layers.py
@@ -0,0 +1,744 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+import unittest
+
+import numpy as np
+import paddle
+from huggingface_hub.repocard import RepoCard
+from paddlenlp.transformers import (
+ CLIPTextConfig,
+ CLIPTextModel,
+ CLIPTextModelWithProjection,
+ CLIPTokenizer,
+)
+
+from ppdiffusers import (
+ AutoencoderKL,
+ DDIMScheduler,
+ EulerDiscreteScheduler,
+ StableDiffusionPipeline,
+ StableDiffusionXLPipeline,
+ UNet2DConditionModel,
+)
+from ppdiffusers.loaders import (
+ AttnProcsLayers,
+ LoraLoaderMixin,
+ PatchedLoraProjection,
+ text_encoder_attn_modules,
+)
+from ppdiffusers.models.attention_processor import (
+ Attention,
+ AttnProcessor,
+ AttnProcessor2_5,
+ LoRAAttnProcessor,
+ LoRAAttnProcessor2_5,
+ LoRAXFormersAttnProcessor,
+ XFormersAttnProcessor,
+)
+from ppdiffusers.utils import floats_tensor, str2bool
+from ppdiffusers.utils.testing_utils import require_paddle_gpu, slow
+
+from ppdiffusers.loaders import TORCH_LORA_WEIGHT_NAME, PADDLE_LORA_WEIGHT_NAME
+
+lora_weights_name = TORCH_LORA_WEIGHT_NAME if str2bool(os.getenv("TO_DIFFUSERS", "False")) else PADDLE_LORA_WEIGHT_NAME
+
+
+def create_unet_lora_layers(unet: paddle.nn.Layer):
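+ # Build one LoRA attention processor per attention layer of the UNet, picking hidden_size from the
+ # owning block and using the 2.5 variant when paddle.nn.functional exposes scaled_dot_product_attention_.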
+ lora_attn_procs = {}
+ for name in unet.attn_processors.keys():
+ cross_attention_dim = None if name.endswith("attn1.processor") else unet.config.cross_attention_dim
+ if name.startswith("mid_block"):
+ hidden_size = unet.config.block_out_channels[-1]
+ elif name.startswith("up_blocks"):
+ block_id = int(name[len("up_blocks.")])
+ hidden_size = list(reversed(unet.config.block_out_channels))[block_id]
+ elif name.startswith("down_blocks"):
+ block_id = int(name[len("down_blocks.")])
+ hidden_size = unet.config.block_out_channels[block_id]
+ lora_attn_processor_class = (
+ LoRAAttnProcessor2_5
+ if hasattr(paddle.nn.functional, "scaled_dot_product_attention_")
+ else LoRAAttnProcessor
+ )
+ lora_attn_procs[name] = lora_attn_processor_class(
+ hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
+ )
+ unet_lora_layers = AttnProcsLayers(lora_attn_procs)
+ return lora_attn_procs, unet_lora_layers
+
+
+def create_text_encoder_lora_attn_procs(text_encoder: paddle.nn.Layer):
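+ # One LoRA attention processor per text-encoder attention module; Paddle Linear weights are stored as
+ # [in_features, out_features], so shape[1] gives the output size of out_proj.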
+ text_lora_attn_procs = {}
+ lora_attn_processor_class = (
+ LoRAAttnProcessor2_5 if hasattr(paddle.nn.functional, "scaled_dot_product_attention_") else LoRAAttnProcessor
+ )
+ for name, module in text_encoder_attn_modules(text_encoder):
+ if isinstance(module.out_proj, paddle.nn.Linear):
+ out_features = module.out_proj.weight.shape[1]
+ elif isinstance(module.out_proj, PatchedLoraProjection):
+ out_features = module.out_proj.regular_linear_layer.weight.shape[1]
+ else:
+ assert False, module.out_proj.__class__
+ text_lora_attn_procs[name] = lora_attn_processor_class(hidden_size=out_features, cross_attention_dim=None)
+ return text_lora_attn_procs
+
+
+def create_text_encoder_lora_layers(text_encoder: paddle.nn.Layer):
+ text_lora_attn_procs = create_text_encoder_lora_attn_procs(text_encoder)
+ text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs)
+ return text_encoder_lora_layers
+
+
+def set_lora_weights(lora_attn_parameters, randn_weight=False):
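+ # Fill the LoRA parameters with random values to emulate training, or zero them so the LoRA path is a no-op.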
+ with paddle.no_grad():
+ for parameter in lora_attn_parameters:
+ if randn_weight:
+ parameter[:] = paddle.randn(shape=parameter.shape, dtype=parameter.dtype)
+ else:
+ parameter.zero_()
+
+
+class LoraLoaderMixinTests(unittest.TestCase):
+ def get_dummy_components(self):
+ paddle.seed(0)
+ unet = UNet2DConditionModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ sample_size=32,
+ in_channels=4,
+ out_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+ cross_attention_dim=32,
+ )
+ scheduler = DDIMScheduler(
+ beta_start=0.00085,
+ beta_end=0.012,
+ beta_schedule="scaled_linear",
+ clip_sample=False,
+ set_alpha_to_one=False,
+ steps_offset=1,
+ )
+ paddle.seed(0)
+ vae = AutoencoderKL(
+ block_out_channels=[32, 64],
+ in_channels=3,
+ out_channels=3,
+ down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+ up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+ latent_channels=4,
+ )
+ text_encoder_config = CLIPTextConfig(
+ bos_token_id=0,
+ eos_token_id=2,
+ hidden_size=32,
+ intermediate_size=37,
+ layer_norm_eps=1e-05,
+ num_attention_heads=4,
+ num_hidden_layers=5,
+ pad_token_id=1,
+ vocab_size=1000,
+ )
+ text_encoder = CLIPTextModel(text_encoder_config)
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+ unet_lora_attn_procs, unet_lora_layers = create_unet_lora_layers(unet)
+ text_encoder_lora_layers = create_text_encoder_lora_layers(text_encoder)
+ pipeline_components = {
+ "unet": unet,
+ "scheduler": scheduler,
+ "vae": vae,
+ "text_encoder": text_encoder,
+ "tokenizer": tokenizer,
+ "safety_checker": None,
+ "feature_extractor": None,
+ }
+ lora_components = {
+ "unet_lora_layers": unet_lora_layers,
+ "text_encoder_lora_layers": text_encoder_lora_layers,
+ "unet_lora_attn_procs": unet_lora_attn_procs,
+ }
+ return pipeline_components, lora_components
+
+ def get_dummy_inputs(self, with_generator=True):
+ batch_size = 1
+ sequence_length = 10
+ num_channels = 4
+ sizes = 32, 32
+ generator = paddle.Generator().manual_seed(0)
+ noise = floats_tensor((batch_size, num_channels) + sizes)
+ input_ids = paddle.randint(low=1, high=sequence_length, shape=(batch_size, sequence_length))
+ pipeline_inputs = {
+ "prompt": "A painting of a squirrel eating a burger",
+ "num_inference_steps": 2,
+ "guidance_scale": 6.0,
+ "output_type": "np",
+ }
+ if with_generator:
+ pipeline_inputs.update({"generator": generator})
+ return noise, input_ids, pipeline_inputs
+
+ def get_dummy_tokens(self):
+ max_seq_length = 77
+ inputs = paddle.randint(low=2, high=56, shape=(1, max_seq_length))
+ prepared_inputs = {}
+ prepared_inputs["input_ids"] = inputs
+ return prepared_inputs
+
+ def create_lora_weight_file(self, tmpdirname):
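+ # Save dummy LoRA layers into tmpdirname so tests can exercise the weight-loading path.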
+ _, lora_components = self.get_dummy_components()
+ LoraLoaderMixin.save_lora_weights(
+ save_directory=tmpdirname,
+ unet_lora_layers=lora_components["unet_lora_layers"],
+ text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
+ )
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
+
+ def test_lora_save_load(self):
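+ # Round-trip: save dummy LoRA layers, load them back into the pipeline, and verify the outputs change.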
+ pipeline_components, lora_components = self.get_dummy_components()
+ sd_pipe = StableDiffusionPipeline(**pipeline_components)
+ sd_pipe.set_progress_bar_config(disable=None)
+ _, _, pipeline_inputs = self.get_dummy_inputs()
+ original_images = sd_pipe(**pipeline_inputs).images
+ orig_image_slice = original_images[0, -3:, -3:, -1]
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ LoraLoaderMixin.save_lora_weights(
+ save_directory=tmpdirname,
+ unet_lora_layers=lora_components["unet_lora_layers"],
+ text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
+ )
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
+ sd_pipe.load_lora_weights(tmpdirname)
+ lora_images = sd_pipe(**pipeline_inputs).images
+ lora_image_slice = lora_images[0, -3:, -3:, -1]
+
+ # Outputs shouldn't match.
+ self.assertFalse(
+ paddle.allclose(
+ x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
+ ).item()
+ )
+
+ # def test_lora_save_load_safetensors(self):
+ # pipeline_components, lora_components = self.get_dummy_components()
+ # sd_pipe = StableDiffusionPipeline(**pipeline_components)
+ # sd_pipe.set_progress_bar_config(disable=None)
+ # _, _, pipeline_inputs = self.get_dummy_inputs()
+ # original_images = sd_pipe(**pipeline_inputs).images
+ # orig_image_slice = original_images[0, -3:, -3:, -1]
+ # with tempfile.TemporaryDirectory() as tmpdirname:
+ # LoraLoaderMixin.save_lora_weights(
+ # save_directory=tmpdirname,
+ # unet_lora_layers=lora_components["unet_lora_layers"],
+ # text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
+ # safe_serialization=True,
+ # )
+ # self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
+ # sd_pipe.load_lora_weights(tmpdirname)
+ # lora_images = sd_pipe(**pipeline_inputs).images
+ # lora_image_slice = lora_images[0, -3:, -3:, -1]
+ # # Outputs shouldn't match.
+ # self.assertFalse(
+ # paddle.allclose(
+ # x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
+ # ).item()
+ # )
+
+ def test_lora_save_load_legacy(self):
+ pipeline_components, lora_components = self.get_dummy_components()
+ unet_lora_attn_procs = lora_components["unet_lora_attn_procs"]
+ sd_pipe = StableDiffusionPipeline(**pipeline_components)
+ sd_pipe.set_progress_bar_config(disable=None)
+ _, _, pipeline_inputs = self.get_dummy_inputs()
+ original_images = sd_pipe(**pipeline_inputs).images
+ orig_image_slice = original_images[0, -3:, -3:, -1]
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ unet = sd_pipe.unet
+ unet.set_attn_processor(unet_lora_attn_procs)
+ unet.save_attn_procs(tmpdirname)
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
+ sd_pipe.load_lora_weights(tmpdirname)
+ lora_images = sd_pipe(**pipeline_inputs).images
+ lora_image_slice = lora_images[0, -3:, -3:, -1]
+
+ # Outputs shouldn't match.
+ self.assertFalse(
+ paddle.allclose(
+ x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
+ ).item()
+ )
+
+ def test_text_encoder_lora_monkey_patch(self):
+ pipeline_components, _ = self.get_dummy_components()
+ pipe = StableDiffusionPipeline(**pipeline_components)
+ dummy_tokens = self.get_dummy_tokens()
+ # inference without lora
+ outputs_without_lora = pipe.text_encoder(**dummy_tokens)[0]
+ assert outputs_without_lora.shape == [1, 77, 32]
+ # monkey patch
+ params = pipe._modify_text_encoder(pipe.text_encoder, pipe.lora_scale)
+ set_lora_weights(params, randn_weight=False)
+ # inference with lora
+ outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0]
+ assert outputs_with_lora.shape == [1, 77, 32]
+ assert paddle.allclose(
+ x=outputs_without_lora, y=outputs_with_lora
+ ).item(), "lora_up_weight are all zero, so the lora outputs should be the same to without lora outputs"
+ # create lora_attn_procs with randn up.weights
+ create_text_encoder_lora_attn_procs(pipe.text_encoder)
+ # monkey patch
+ params = pipe._modify_text_encoder(pipe.text_encoder, pipe.lora_scale)
+ set_lora_weights(params, randn_weight=True)
+ # inference with lora
+ outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0]
+ assert outputs_with_lora.shape == [1, 77, 32]
+ assert not paddle.allclose(
+ x=outputs_without_lora, y=outputs_with_lora
+ ).item(), "lora_up_weight are not zero, so the lora outputs should be different to without lora outputs"
+
+ def test_text_encoder_lora_remove_monkey_patch(self):
+ pipeline_components, _ = self.get_dummy_components()
+ pipe = StableDiffusionPipeline(**pipeline_components)
+ dummy_tokens = self.get_dummy_tokens()
+ # inference without lora
+ outputs_without_lora = pipe.text_encoder(**dummy_tokens)[0]
+ assert outputs_without_lora.shape == [1, 77, 32]
+ # monkey patch
+ params = pipe._modify_text_encoder(pipe.text_encoder, pipe.lora_scale)
+ set_lora_weights(params, randn_weight=True)
+ # inference with lora
+ outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0]
+ assert outputs_with_lora.shape == [1, 77, 32]
+ assert not paddle.allclose(
+ x=outputs_without_lora, y=outputs_with_lora
+ ).item(), "lora outputs should be different to without lora outputs"
+ # remove monkey patch
+ pipe._remove_text_encoder_monkey_patch()
+ # inference with removed lora
+ outputs_without_lora_removed = pipe.text_encoder(**dummy_tokens)[0]
+ assert outputs_without_lora_removed.shape == [1, 77, 32]
+ assert paddle.allclose(
+ x=outputs_without_lora, y=outputs_without_lora_removed
+ ).item(), "remove lora monkey patch should restore the original outputs"
+
+ def test_text_encoder_lora_scale(self):
+ pipeline_components, lora_components = self.get_dummy_components()
+ sd_pipe = StableDiffusionPipeline(**pipeline_components)
+ sd_pipe.set_progress_bar_config(disable=None)
+ _, _, pipeline_inputs = self.get_dummy_inputs()
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ LoraLoaderMixin.save_lora_weights(
+ save_directory=tmpdirname,
+ unet_lora_layers=lora_components["unet_lora_layers"],
+ text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
+ )
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
+ sd_pipe.load_lora_weights(tmpdirname)
+ lora_images = sd_pipe(**pipeline_inputs).images
+ lora_image_slice = lora_images[0, -3:, -3:, -1]
+ lora_images_with_scale = sd_pipe(**pipeline_inputs, cross_attention_kwargs={"scale": 0.5}).images
+ lora_image_with_scale_slice = lora_images_with_scale[0, -3:, -3:, -1]
+ # Outputs shouldn't match.
+ self.assertFalse(
+ paddle.allclose(
+ x=paddle.to_tensor(data=lora_image_slice), y=paddle.to_tensor(data=lora_image_with_scale_slice)
+ ).item()
+ )
+
+ def test_lora_unet_attn_processors(self):
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ self.create_lora_weight_file(tmpdirname)
+ pipeline_components, _ = self.get_dummy_components()
+ sd_pipe = StableDiffusionPipeline(**pipeline_components)
+ sd_pipe.set_progress_bar_config(disable=None)
+ # check if vanilla attention processors are used
+ for _, module in sd_pipe.unet.named_sublayers():
+ if isinstance(module, Attention):
+ self.assertIsInstance(module.processor, (AttnProcessor, AttnProcessor2_5))
+ # load LoRA weight file
+ sd_pipe.load_lora_weights(tmpdirname)
+ # check if lora attention processors are used
+ for _, module in sd_pipe.unet.named_sublayers():
+ if isinstance(module, Attention):
+ attn_proc_class = (
+ LoRAAttnProcessor2_5
+ if hasattr(paddle.nn.functional, "scaled_dot_product_attention_")
+ else LoRAAttnProcessor
+ )
+ self.assertIsInstance(module.processor, attn_proc_class)
+
+ def test_unload_lora_sd(self):
+ pipeline_components, lora_components = self.get_dummy_components()
+ _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
+ sd_pipe = StableDiffusionPipeline(**pipeline_components)
+ original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+ orig_image_slice = original_images[0, -3:, -3:, -1]
+ # Emulate training.
+ set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
+ set_lora_weights(lora_components["text_encoder_lora_layers"].parameters(), randn_weight=True)
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ LoraLoaderMixin.save_lora_weights(
+ save_directory=tmpdirname,
+ unet_lora_layers=lora_components["unet_lora_layers"],
+ text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
+ )
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
+ sd_pipe.load_lora_weights(tmpdirname)
+ lora_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+ lora_image_slice = lora_images[0, -3:, -3:, -1]
+ # Unload LoRA parameters.
+ sd_pipe.unload_lora_weights()
+ original_images_two = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+ orig_image_slice_two = original_images_two[0, -3:, -3:, -1]
+ assert not np.allclose(
+ orig_image_slice, lora_image_slice
+ ), "LoRA parameters should lead to a different image slice."
+ assert not np.allclose(
+ orig_image_slice_two, lora_image_slice
+ ), "LoRA parameters should lead to a different image slice."
+ assert np.allclose(
+ orig_image_slice, orig_image_slice_two, atol=0.001
+ ), "Unloading LoRA parameters should lead to results similar to what was obtained with the pipeline without any LoRA parameters."
+
+ def test_lora_unet_attn_processors_with_xformers(self):
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ self.create_lora_weight_file(tmpdirname)
+ pipeline_components, _ = self.get_dummy_components()
+ sd_pipe = StableDiffusionPipeline(**pipeline_components)
+ sd_pipe.set_progress_bar_config(disable=None)
+ # enable XFormers
+ sd_pipe.enable_xformers_memory_efficient_attention()
+ # check if xFormers attention processors are used
+ for _, module in sd_pipe.unet.named_sublayers():
+ if isinstance(module, Attention):
+ self.assertIsInstance(module.processor, XFormersAttnProcessor)
+ # load LoRA weight file
+ sd_pipe.load_lora_weights(tmpdirname)
+ # check if lora attention processors are used
+ for _, module in sd_pipe.unet.named_sublayers():
+ if isinstance(module, Attention):
+ self.assertIsInstance(module.processor, LoRAXFormersAttnProcessor)
+ # unload lora weights
+ sd_pipe.unload_lora_weights()
+ # check if attention processors are reverted back to xFormers
+ for _, module in sd_pipe.unet.named_sublayers():
+ if isinstance(module, Attention):
+ self.assertIsInstance(module.processor, XFormersAttnProcessor)
+
+ def test_lora_save_load_with_xformers(self):
+ pipeline_components, lora_components = self.get_dummy_components()
+ sd_pipe = StableDiffusionPipeline(**pipeline_components)
+ sd_pipe.set_progress_bar_config(disable=None)
+ _, _, pipeline_inputs = self.get_dummy_inputs()
+ # enable XFormers
+ sd_pipe.enable_xformers_memory_efficient_attention()
+ original_images = sd_pipe(**pipeline_inputs).images
+ orig_image_slice = original_images[0, -3:, -3:, -1]
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ LoraLoaderMixin.save_lora_weights(
+ save_directory=tmpdirname,
+ unet_lora_layers=lora_components["unet_lora_layers"],
+ text_encoder_lora_layers=lora_components["text_encoder_lora_layers"],
+ )
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
+ sd_pipe.load_lora_weights(tmpdirname)
+ lora_images = sd_pipe(**pipeline_inputs).images
+ lora_image_slice = lora_images[0, -3:, -3:, -1]
+ # Outputs shouldn't match.
+ self.assertFalse(
+ paddle.allclose(
+ x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
+ ).item()
+ )
+
+
+class SDXLLoraLoaderMixinTests(unittest.TestCase):
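+ # Same coverage as LoraLoaderMixinTests, but for SDXL, which has two text encoders and therefore two
+ # sets of text-encoder LoRA layers.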
+ def get_dummy_components(self):
+ paddle.seed(0)
+ unet = UNet2DConditionModel(
+ block_out_channels=(32, 64),
+ layers_per_block=2,
+ sample_size=32,
+ in_channels=4,
+ out_channels=4,
+ down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
+ up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
+ # SDXL-specific config below
+ attention_head_dim=(2, 4),
+ use_linear_projection=True,
+ addition_embed_type="text_time",
+ addition_time_embed_dim=8,
+ transformer_layers_per_block=(1, 2),
+ projection_class_embeddings_input_dim=80, # 6 * 8 + 32
+ cross_attention_dim=64,
+ )
+ scheduler = EulerDiscreteScheduler(
+ beta_start=0.00085,
+ beta_end=0.012,
+ steps_offset=1,
+ beta_schedule="scaled_linear",
+ timestep_spacing="leading",
+ )
+ paddle.seed(0)
+ vae = AutoencoderKL(
+ block_out_channels=[32, 64],
+ in_channels=3,
+ out_channels=3,
+ down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
+ up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
+ latent_channels=4,
+ sample_size=128,
+ )
+ paddle.seed(0)
+ text_encoder_config = CLIPTextConfig(
+ bos_token_id=0,
+ eos_token_id=2,
+ hidden_size=32,
+ intermediate_size=37,
+ layer_norm_eps=1e-05,
+ num_attention_heads=4,
+ num_hidden_layers=5,
+ pad_token_id=1,
+ vocab_size=1000,
+ # SDXL-specific config below
+ hidden_act="gelu",
+ projection_dim=32,
+ )
+ text_encoder = CLIPTextModel(text_encoder_config)
+ tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+ text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
+ tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+ unet_lora_attn_procs, unet_lora_layers = create_unet_lora_layers(unet)
+ text_encoder_one_lora_layers = create_text_encoder_lora_layers(text_encoder)
+ text_encoder_two_lora_layers = create_text_encoder_lora_layers(text_encoder_2)
+ pipeline_components = {
+ "unet": unet,
+ "scheduler": scheduler,
+ "vae": vae,
+ "text_encoder": text_encoder,
+ "text_encoder_2": text_encoder_2,
+ "tokenizer": tokenizer,
+ "tokenizer_2": tokenizer_2,
+ }
+ lora_components = {
+ "unet_lora_layers": unet_lora_layers,
+ "text_encoder_one_lora_layers": text_encoder_one_lora_layers,
+ "text_encoder_two_lora_layers": text_encoder_two_lora_layers,
+ "unet_lora_attn_procs": unet_lora_attn_procs,
+ }
+ return pipeline_components, lora_components
+
+ def get_dummy_inputs(self, with_generator=True):
+ batch_size = 1
+ sequence_length = 10
+ num_channels = 4
+ sizes = 32, 32
+ generator = paddle.Generator().manual_seed(0)
+ noise = floats_tensor((batch_size, num_channels) + sizes)
+ input_ids = paddle.randint(low=1, high=sequence_length, shape=(batch_size, sequence_length))
+ pipeline_inputs = {
+ "prompt": "A painting of a squirrel eating a burger",
+ "num_inference_steps": 2,
+ "guidance_scale": 6.0,
+ "output_type": "np",
+ }
+ if with_generator:
+ pipeline_inputs.update({"generator": generator})
+ return noise, input_ids, pipeline_inputs
+
+ def test_lora_save_load(self):
+ pipeline_components, lora_components = self.get_dummy_components()
+ sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
+ sd_pipe.set_progress_bar_config(disable=None)
+ _, _, pipeline_inputs = self.get_dummy_inputs()
+ original_images = sd_pipe(**pipeline_inputs).images
+ orig_image_slice = original_images[0, -3:, -3:, -1]
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ StableDiffusionXLPipeline.save_lora_weights(
+ save_directory=tmpdirname,
+ unet_lora_layers=lora_components["unet_lora_layers"],
+ text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
+ text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
+ )
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
+ sd_pipe.load_lora_weights(tmpdirname)
+ lora_images = sd_pipe(**pipeline_inputs).images
+ lora_image_slice = lora_images[0, -3:, -3:, -1]
+ # Outputs shouldn't match.
+ self.assertFalse(
+ paddle.allclose(
+ x=paddle.to_tensor(data=orig_image_slice), y=paddle.to_tensor(data=lora_image_slice)
+ ).item()
+ )
+
+ def test_unload_lora_sdxl(self):
+ pipeline_components, lora_components = self.get_dummy_components()
+ _, _, pipeline_inputs = self.get_dummy_inputs(with_generator=False)
+ sd_pipe = StableDiffusionXLPipeline(**pipeline_components)
+ original_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+ orig_image_slice = original_images[0, -3:, -3:, -1]
+ # Emulate training.
+ set_lora_weights(lora_components["unet_lora_layers"].parameters(), randn_weight=True)
+ set_lora_weights(lora_components["text_encoder_one_lora_layers"].parameters(), randn_weight=True)
+ set_lora_weights(lora_components["text_encoder_two_lora_layers"].parameters(), randn_weight=True)
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ StableDiffusionXLPipeline.save_lora_weights(
+ save_directory=tmpdirname,
+ unet_lora_layers=lora_components["unet_lora_layers"],
+ text_encoder_lora_layers=lora_components["text_encoder_one_lora_layers"],
+ text_encoder_2_lora_layers=lora_components["text_encoder_two_lora_layers"],
+ )
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, lora_weights_name)))
+ sd_pipe.load_lora_weights(tmpdirname)
+ lora_images = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+ lora_image_slice = lora_images[0, -3:, -3:, -1]
+ # Unload LoRA parameters.
+ sd_pipe.unload_lora_weights()
+ original_images_two = sd_pipe(**pipeline_inputs, generator=paddle.Generator().manual_seed(0)).images
+ orig_image_slice_two = original_images_two[0, -3:, -3:, -1]
+ assert not np.allclose(
+ orig_image_slice, lora_image_slice
+ ), "LoRA parameters should lead to a different image slice."
+ assert not np.allclose(
+ orig_image_slice_two, lora_image_slice
+ ), "LoRA parameters should lead to a different image slice."
+ assert np.allclose(
+ orig_image_slice, orig_image_slice_two, atol=0.001
+ ), "Unloading LoRA parameters should lead to results similar to what was obtained with the pipeline without any LoRA parameters."
+
+
+@slow
+@require_paddle_gpu
+class LoraIntegrationTests(unittest.TestCase):
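+ # These tests download real checkpoints and LoRA weights from the Hugging Face Hub and only run on GPU
+ # under the slow marker.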
+ def test_dreambooth_old_format(self):
+ generator = paddle.Generator().manual_seed(0)
+ lora_model_id = "hf-internal-testing/lora_dreambooth_dog_example"
+ card = RepoCard.load(lora_model_id)
+ base_model_id = card.data.to_dict()["base_model"]
+ pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
+ pipe.load_lora_weights(lora_model_id)
+ images = pipe(
+ "A photo of a sks dog floating in the river", output_type="np", generator=generator, num_inference_steps=5
+ ).images
+ images = images[0, -3:, -3:, -1].flatten()
+ expected = np.array([0.8065, 0.8114, 0.7948, 0.8220, 0.8092, 0.8031, 0.8052, 0.8012, 0.7677])
+ self.assertTrue(np.allclose(images, expected, atol=0.0001))
+
+ def test_dreambooth_text_encoder_new_format(self):
+ generator = paddle.Generator().manual_seed(0)
+ lora_model_id = "hf-internal-testing/lora-trained"
+ card = RepoCard.load(lora_model_id)
+ base_model_id = card.data.to_dict()["base_model"]
+ pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
+ pipe.load_lora_weights(lora_model_id)
+ images = pipe("A photo of a sks dog", output_type="np", generator=generator, num_inference_steps=5).images
+ images = images[0, -3:, -3:, -1].flatten()
+ expected = np.array([0.3782, 0.3670, 0.3792, 0.3758, 0.3719, 0.3851, 0.3820, 0.3842, 0.4009])
+ self.assertTrue(np.allclose(images, expected, atol=0.0001))
+
+ def test_a1111(self):
+ generator = paddle.Generator().manual_seed(0)
+ # hf-internal-testing/Counterfeit-V2.5 -> gsdf/Counterfeit-V2.5
+ pipe = StableDiffusionPipeline.from_pretrained("gsdf/Counterfeit-V2.5", safety_checker=None)
+ lora_model_id = "hf-internal-testing/civitai-light-shadow-lora"
+ lora_filename = "light_and_shadow.safetensors"
+ pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, from_diffusers=True)
+ images = pipe(
+ "masterpiece, best quality, mountain", output_type="np", generator=generator, num_inference_steps=2
+ ).images
+ images = images[0, -3:, -3:, -1].flatten()
+ expected = np.array([0.5849, 0.5824, 0.5898, 0.5802, 0.5676, 0.5974, 0.5921, 0.5987, 0.6081])
+
+ self.assertTrue(np.allclose(images, expected, atol=0.0001))
+
+ def test_vanilla_finetuning(self):
+ generator = paddle.Generator().manual_seed(0)
+ lora_model_id = "hf-internal-testing/sd-model-finetuned-lora-t4"
+ card = RepoCard.load(lora_model_id)
+ base_model_id = card.data.to_dict()["base_model"]
+ pipe = StableDiffusionPipeline.from_pretrained(base_model_id, safety_checker=None)
+ pipe.load_lora_weights(lora_model_id, from_diffusers=True)
+ images = pipe("A pokemon with blue eyes.", output_type="np", generator=generator, num_inference_steps=5).images
+ images = images[0, -3:, -3:, -1].flatten()
+ expected = np.array([0.7190, 0.7160, 0.6854, 0.7449, 0.7188, 0.7124, 0.7403, 0.7287, 0.6979])
+ self.assertTrue(np.allclose(images, expected, atol=0.0001))
+
+ def test_unload_lora(self):
+ generator = paddle.Generator().manual_seed(0)
+ prompt = "masterpiece, best quality, mountain"
+ num_inference_steps = 5
+ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None)
+ initial_images = pipe(
+ prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+ ).images
+ initial_images = initial_images[0, -3:, -3:, -1].flatten()
+ lora_model_id = "hf-internal-testing/civitai-colored-icons-lora"
+ lora_filename = "Colored_Icons_by_vizsumit.safetensors"
+ pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, from_diffusers=True)
+ generator = paddle.Generator().manual_seed(0)
+ lora_images = pipe(
+ prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+ ).images
+ lora_images = lora_images[0, -3:, -3:, -1].flatten()
+ pipe.unload_lora_weights()
+ generator = paddle.Generator().manual_seed(0)
+ unloaded_lora_images = pipe(
+ prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+ ).images
+ unloaded_lora_images = unloaded_lora_images[0, -3:, -3:, -1].flatten()
+ self.assertFalse(np.allclose(initial_images, lora_images))
+ self.assertTrue(np.allclose(initial_images, unloaded_lora_images, atol=0.001))
+
+ def test_load_unload_load_kohya_lora(self):
+ # This test ensures that a Kohya-style LoRA can be safely unloaded and then loaded
+ # without introducing any side-effects. Even though the test uses a Kohya-style
+ # LoRA, the underlying adapter handling mechanism is format-agnostic.
+ generator = paddle.Generator().manual_seed(0)
+ prompt = "masterpiece, best quality, mountain"
+ num_inference_steps = 5
+ pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None)
+ initial_images = pipe(
+ prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+ ).images
+ initial_images = initial_images[0, -3:, -3:, -1].flatten()
+ lora_model_id = "hf-internal-testing/civitai-colored-icons-lora"
+ lora_filename = "Colored_Icons_by_vizsumit.safetensors"
+ pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, from_diffusers=True)
+ generator = paddle.Generator().manual_seed(0)
+ lora_images = pipe(
+ prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+ ).images
+ lora_images = lora_images[0, -3:, -3:, -1].flatten()
+ pipe.unload_lora_weights()
+ generator = paddle.Generator().manual_seed(0)
+ unloaded_lora_images = pipe(
+ prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+ ).images
+ unloaded_lora_images = unloaded_lora_images[0, -3:, -3:, -1].flatten()
+ self.assertFalse(np.allclose(initial_images, lora_images))
+ self.assertTrue(np.allclose(initial_images, unloaded_lora_images, atol=0.001))
+ # make sure we can load a LoRA again after unloading and that it doesn't have
+ # any undesired side effects.
+ pipe.load_lora_weights(lora_model_id, weight_name=lora_filename, from_diffusers=True)
+ generator = paddle.Generator().manual_seed(0)
+ lora_images_again = pipe(
+ prompt, output_type="np", generator=generator, num_inference_steps=num_inference_steps
+ ).images
+ lora_images_again = lora_images_again[0, -3:, -3:, -1].flatten()
+ self.assertTrue(np.allclose(lora_images, lora_images_again, atol=0.001))
diff --git a/ppdiffusers/tests/models/test_modeling_common.py b/ppdiffusers/tests/models/test_modeling_common.py
index 5efe6861a..557c19f4a 100644
--- a/ppdiffusers/tests/models/test_modeling_common.py
+++ b/ppdiffusers/tests/models/test_modeling_common.py
@@ -155,7 +155,7 @@ class ModelTesterMixin:
main_input_name = None # overwrite in model specific tester class
base_precision = 1e-3
- def test_from_save_pretrained(self, expected_max_diff=1e-01):
+ def test_from_save_pretrained(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**init_dict)
if hasattr(model, "set_default_attn_processor"):
@@ -174,7 +174,7 @@ def test_from_save_pretrained(self, expected_max_diff=1e-01):
if isinstance(new_image, dict):
new_image = new_image.to_tuple()[0]
max_diff = (image - new_image).abs().sum().item()
- self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
+ self.assertLessEqual(max_diff, 1e-01, "Models give different forward passes")
def test_getattr_is_correct(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -259,7 +259,7 @@ def test_set_attn_processor_for_determinism(self):
assert paddle.allclose(x=output_2, y=output_5, atol=self.base_precision).item()
assert paddle.allclose(x=output_2, y=output_6, atol=self.base_precision).item()
- def test_from_save_pretrained_variant(self, expected_max_diff=1e-01):
+ def test_from_save_pretrained_variant(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
model = self.model_class(**init_dict)
if hasattr(model, "set_default_attn_processor"):
@@ -289,7 +289,7 @@ def test_from_save_pretrained_variant(self, expected_max_diff=1e-01):
if isinstance(new_image, dict):
new_image = new_image.to_tuple()[0]
max_diff = (image - new_image).abs().sum().item()
- self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
+ self.assertLessEqual(max_diff, 1e-01, "Models give different forward passes")
def test_from_save_pretrained_dtype(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -393,7 +393,6 @@ def test_ema_training(self):
def test_outputs_equivalence(self):
def set_nan_tensor_to_zero(t):
# t[t != t] = 0
- t = paddle.nan_to_num(t, 0, 0, 0)
return t
def recursive_check(tuple_object, dict_object):
@@ -408,9 +407,7 @@ def recursive_check(tuple_object, dict_object):
else:
self.assertTrue(
paddle.allclose(
- set_nan_tensor_to_zero(tuple_object),
- set_nan_tensor_to_zero(dict_object),
- atol=5e-05, # 1e-05, The original test needs to increase the error.
+ set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-05
),
msg=f"Tuple and dict output are not equal. Difference: {paddle.max(x=paddle.abs(x=tuple_object - dict_object))}. Tuple has `nan`: {paddle.isnan(x=tuple_object).any()} and `inf`: {paddle.isinf(x=tuple_object)}. Dict has `nan`: {paddle.isnan(x=dict_object).any()} and `inf`: {paddle.isinf(x=dict_object)}.",
)
diff --git a/ppdiffusers/tests/models/test_models_unet_1d.py b/ppdiffusers/tests/models/test_models_unet_1d.py
index a26179fb0..a0b39333f 100644
--- a/ppdiffusers/tests/models/test_models_unet_1d.py
+++ b/ppdiffusers/tests/models/test_models_unet_1d.py
@@ -59,9 +59,6 @@ def test_outputs_equivalence(self):
def test_from_save_pretrained(self):
super().test_from_save_pretrained()
- def test_from_save_pretrained_variant(self):
- super().test_from_save_pretrained_variant()
-
def test_model_from_pretrained(self):
super().test_model_from_pretrained()
diff --git a/ppdiffusers/tests/models/test_models_unet_2d.py b/ppdiffusers/tests/models/test_models_unet_2d.py
index fd5e08043..07885618b 100644
--- a/ppdiffusers/tests/models/test_models_unet_2d.py
+++ b/ppdiffusers/tests/models/test_models_unet_2d.py
@@ -65,35 +65,6 @@ def prepare_init_args_and_inputs_for_common(self):
inputs_dict = self.dummy_input
return init_dict, inputs_dict
- def test_mid_block_attn_groups(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
- init_dict["norm_num_groups"] = 16
- init_dict["add_attention"] = True
- init_dict["attn_norm_num_groups"] = 8
-
- model = self.model_class(**init_dict)
- model.eval()
-
- self.assertIsNotNone(
- model.mid_block.attentions[0].group_norm, "Mid block Attention group norm should exist but does not."
- )
- self.assertEqual(
- model.mid_block.attentions[0].group_norm._num_groups,
- init_dict["attn_norm_num_groups"],
- "Mid block Attention group norm does not have the expected number of groups.",
- )
-
- with paddle.no_grad():
- output = model(**inputs_dict)
-
- if isinstance(output, dict):
- output = output.to_tuple()[0]
-
- self.assertIsNotNone(output)
- expected_shape = inputs_dict["sample"].shape
- self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
-
class UNetLDMModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
model_class = UNet2DModel
diff --git a/ppdiffusers/tests/models/test_models_unet_2d_condition.py b/ppdiffusers/tests/models/test_models_unet_2d_condition.py
index f73282a24..963a7ddf4 100644
--- a/ppdiffusers/tests/models/test_models_unet_2d_condition.py
+++ b/ppdiffusers/tests/models/test_models_unet_2d_condition.py
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import copy
import gc
import os
import tempfile
@@ -25,7 +24,10 @@
from pytest import mark
from ppdiffusers import UNet2DConditionModel
-from ppdiffusers.models.attention_processor import CustomDiffusionAttnProcessor
+from ppdiffusers.models.attention_processor import (
+ CustomDiffusionAttnProcessor,
+ LoRAAttnProcessor,
+)
from ppdiffusers.utils import (
floats_tensor,
load_hf_numpy,
@@ -44,6 +46,28 @@
enable_full_determinism()
+def create_lora_layers(model, mock_weights: bool = True):
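+ # Attach a LoRAAttnProcessor to every attention processor of the model; with mock_weights=True the
+ # up-projection weights are shifted by +1 so the LoRA branch has a visible effect on the output.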
+ lora_attn_procs = {}
+ for name in model.attn_processors.keys():
+ cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
+ if name.startswith("mid_block"):
+ hidden_size = model.config.block_out_channels[-1]
+ elif name.startswith("up_blocks"):
+ block_id = int(name[len("up_blocks.")])
+ hidden_size = list(reversed(model.config.block_out_channels))[block_id]
+ elif name.startswith("down_blocks"):
+ block_id = int(name[len("down_blocks.")])
+ hidden_size = model.config.block_out_channels[block_id]
+ lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
+ if mock_weights:
+ with paddle.no_grad():
+ lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1)
+ lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1)
+ lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1)
+ lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1)
+ return lora_attn_procs
+
+
def create_custom_ppdiffusion_layers(model, mock_weights: bool = True):
train_kv = True
train_q_out = True
@@ -160,12 +184,11 @@ def test_gradient_checkpointing(self):
model_2.clear_gradients()
loss_2 = (out_2 - labels).mean()
loss_2.backward()
- # UNetMidBlock2DCrossAttn create_custom_forward associates the difference.
- self.assertTrue((loss - loss_2).abs() < 1e-5)
+ self.assertTrue((loss - loss_2).abs() < 1e-05)
named_params = dict(model.named_parameters())
named_params_2 = dict(model_2.named_parameters())
for name, param in named_params.items():
- self.assertTrue(paddle_all_close(param.grad, named_params_2[name].grad, atol=5e-5))
+ self.assertTrue(paddle_all_close(param.grad, named_params_2[name].grad, atol=5e-05))
def test_model_with_attention_head_dim_tuple(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
@@ -290,42 +313,6 @@ def check_sliceable_dim_attr(module: paddle.nn.Layer):
for module in model.children():
check_sliceable_dim_attr(module)
- def test_gradient_checkpointing_is_applied(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
- init_dict["attention_head_dim"] = (8, 16)
-
- model_class_copy = copy.copy(self.model_class)
-
- modules_with_gc_enabled = {}
-
- # now monkey patch the following function:
- # def _set_gradient_checkpointing(self, module, value=False):
- # if hasattr(module, "gradient_checkpointing"):
- # module.gradient_checkpointing = value
-
- def _set_gradient_checkpointing_new(self, module, value=False):
- if hasattr(module, "gradient_checkpointing"):
- module.gradient_checkpointing = value
- modules_with_gc_enabled[module.__class__.__name__] = True
-
- model_class_copy._set_gradient_checkpointing = _set_gradient_checkpointing_new
-
- model = model_class_copy(**init_dict)
- model.enable_gradient_checkpointing()
-
- EXPECTED_SET = {
- "CrossAttnUpBlock2D",
- "CrossAttnDownBlock2D",
- "UNetMidBlock2DCrossAttn",
- "UpBlock2D",
- "Transformer2DModel",
- "DownBlock2D",
- }
-
- assert set(modules_with_gc_enabled.keys()) == EXPECTED_SET
- assert all(modules_with_gc_enabled.values()), "All modules should be enabled"
-
def test_special_attn_proc(self):
class AttnEasyProc(nn.Layer):
def __init__(self, num):
@@ -418,6 +405,139 @@ def test_model_xattn_padding(self):
y=keeplast_out, rtol=1e-3, atol=1e-5
).item(), "a mask with fewer tokens than condition, will be padded with 'keep' tokens. a 'discard-all' mask missing the final token is thus equivalent to a 'keep last' mask."
+ def test_lora_processors(self):
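+ # With the mocked (+1) up-weights, scale=0.0 must reproduce the base output, identical scales must
+ # agree, and different scales must differ.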
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ init_dict["attention_head_dim"] = 8, 16
+ model = self.model_class(**init_dict)
+ with paddle.no_grad():
+ sample1 = model(**inputs_dict).sample
+ lora_attn_procs = create_lora_layers(model)
+ model.set_attn_processor(lora_attn_procs)
+ model.set_attn_processor(model.attn_processors)
+ with paddle.no_grad():
+ sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
+ sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ assert (sample1 - sample2).abs().max() < 3e-3
+ assert (sample3 - sample4).abs().max() < 3e-3
+ assert (sample2 - sample3).abs().max() > 3e-3
+
+ def test_lora_save_load(self):
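+ # Save the LoRA attention processors in Paddle format, reload them into a fresh model, and check that
+ # the outputs round-trip while still differing from the LoRA-free baseline.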
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ init_dict["attention_head_dim"] = 8, 16
+ paddle.seed(0)
+ model = self.model_class(**init_dict)
+ with paddle.no_grad():
+ old_sample = model(**inputs_dict).sample
+ lora_attn_procs = create_lora_layers(model)
+ model.set_attn_processor(lora_attn_procs)
+ with paddle.no_grad():
+ sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ model.save_attn_procs(tmpdirname, to_diffusers=False)
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+ paddle.seed(0)
+ new_model = self.model_class(**init_dict)
+ new_model.load_attn_procs(tmpdirname, from_diffusers=False)
+
+ with paddle.no_grad():
+ new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+ assert (sample - new_sample).abs().max() < 1e-4
+
+ # LoRA and no LoRA should NOT be the same
+ assert (sample - old_sample).abs().max() > 1e-4
+
+ def test_lora_save_load_safetensors(self):
+ # save the LoRA attention processors as safetensors in diffusers format, then reload them into a fresh model
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+ init_dict["attention_head_dim"] = (8, 16)
+
+ paddle.seed(0)
+ model = self.model_class(**init_dict)
+
+ with paddle.no_grad():
+ old_sample = model(**inputs_dict).sample
+
+ lora_attn_procs = create_lora_layers(model)
+ model.set_attn_processor(lora_attn_procs)
+
+ with paddle.no_grad():
+ sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True)
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
+ paddle.seed(0)
+ new_model = self.model_class(**init_dict)
+ new_model.load_attn_procs(tmpdirname, from_diffusers=True, use_safetensors=True)
+ with paddle.no_grad():
+ new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ assert (sample - new_sample).abs().max() < 0.0001
+ assert (sample - old_sample).abs().max() > 0.0001
+
+ # def test_lora_save_safetensors_load_torch(self):
+ # # enable deterministic behavior for gradient checkpointing
+ # init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+ # init_dict["attention_head_dim"] = (8, 16)
+
+ # paddle.seed(0)
+ # model = self.model_class(**init_dict)
+
+ # lora_attn_procs = create_lora_layers(model, mock_weights=False)
+ # model.set_attn_processor(lora_attn_procs)
+ # # Saving as paddle, properly reloads with directly filename
+ # with tempfile.TemporaryDirectory() as tmpdirname:
+ # model.save_attn_procs(tmpdirname, to_diffusers=True)
+ # self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
+ # paddle.seed(0)
+ # new_model = self.model_class(**init_dict)
+ # new_model.load_attn_procs(
+ # tmpdirname, weight_name="pytorch_lora_weights.bin", from_diffusers=True, use_safetensors=False
+ # )
+
+ def test_lora_save_torch_force_load_safetensors_error(self):
+ pass
+
+ def test_lora_on_off(self):
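+ # Both scale=0.0 and restoring the default attention processor should reproduce the LoRA-free output.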
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ init_dict["attention_head_dim"] = 8, 16
+ paddle.seed(0)
+ model = self.model_class(**init_dict)
+ with paddle.no_grad():
+ old_sample = model(**inputs_dict).sample
+ lora_attn_procs = create_lora_layers(model)
+ model.set_attn_processor(lora_attn_procs)
+ with paddle.no_grad():
+ sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
+ model.set_default_attn_processor()
+ with paddle.no_grad():
+ new_sample = model(**inputs_dict).sample
+ assert (sample - new_sample).abs().max() < 0.0001
+ assert (sample - old_sample).abs().max() < 3e-3
+
+ @unittest.skipIf(
+ not is_ppxformers_available(),
+ reason="scaled_dot_product_attention attention is only available with CUDA and `scaled_dot_product_attention` installed",
+ )
+ def test_lora_xformers_on_off(self):
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+ init_dict["attention_head_dim"] = 8, 16
+ paddle.seed(0)
+ model = self.model_class(**init_dict)
+ lora_attn_procs = create_lora_layers(model)
+ model.set_attn_processor(lora_attn_procs)
+ with paddle.no_grad():
+ sample = model(**inputs_dict).sample
+ model.enable_xformers_memory_efficient_attention()
+ on_sample = model(**inputs_dict).sample
+ model.disable_xformers_memory_efficient_attention()
+ off_sample = model(**inputs_dict).sample
+ assert (sample - on_sample).abs().max() < 0.05
+ assert (sample - off_sample).abs().max() < 0.05
+
def test_custom_diffusion_processors(self):
# enable deterministic behavior for gradient checkpointing
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
diff --git a/ppdiffusers/tests/models/test_models_unet_3d_condition.py b/ppdiffusers/tests/models/test_models_unet_3d_condition.py
index 5eebe37e2..e1fcdee1c 100644
--- a/ppdiffusers/tests/models/test_models_unet_3d_condition.py
+++ b/ppdiffusers/tests/models/test_models_unet_3d_condition.py
@@ -12,12 +12,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import os
+import tempfile
import unittest
import numpy as np
import paddle
from ppdiffusers.models import UNet3DConditionModel
+from ppdiffusers.models.attention_processor import AttnProcessor, LoRAAttnProcessor
from ppdiffusers.utils import floats_tensor, logging
from ppdiffusers.utils.import_utils import is_ppxformers_available
from ppdiffusers.utils.testing_utils import enable_full_determinism
@@ -29,6 +32,37 @@
logger = logging.get_logger(__name__)
+def create_lora_layers(model, mock_weights: bool = True):
+ lora_attn_procs = {}
+ for name in model.attn_processors.keys():
+ has_cross_attention = name.endswith("attn2.processor") and not (
+ name.startswith("transformer_in") or "temp_attentions" in name.split(".")
+ )
+ cross_attention_dim = model.config.cross_attention_dim if has_cross_attention else None
+ if name.startswith("mid_block"):
+ hidden_size = model.config.block_out_channels[-1]
+ elif name.startswith("up_blocks"):
+ block_id = int(name[len("up_blocks.")])
+ hidden_size = list(reversed(model.config.block_out_channels))[block_id]
+ elif name.startswith("down_blocks"):
+ block_id = int(name[len("down_blocks.")])
+ hidden_size = model.config.block_out_channels[block_id]
+ elif name.startswith("transformer_in"):
+ # Note that the `8 * ...` comes from: https://github.com/huggingface/diffusers/blob/7139f0e874f10b2463caa8cbd585762a309d12d6/src/diffusers/models/unet_3d_condition.py#L148
+ hidden_size = 8 * model.config.attention_head_dim
+
+ lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim)
+
+ if mock_weights:
+ # add 1 to weights to mock trained weights
+ with paddle.no_grad():
+ lora_attn_procs[name].to_q_lora.up.weight.set_value(lora_attn_procs[name].to_q_lora.up.weight + 1)
+ lora_attn_procs[name].to_k_lora.up.weight.set_value(lora_attn_procs[name].to_k_lora.up.weight + 1)
+ lora_attn_procs[name].to_v_lora.up.weight.set_value(lora_attn_procs[name].to_v_lora.up.weight + 1)
+ lora_attn_procs[name].to_out_lora.up.weight.set_value(lora_attn_procs[name].to_out_lora.up.weight + 1)
+ return lora_attn_procs
+
+
class UNet3DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
model_class = UNet3DConditionModel
main_input_name = "sample"
@@ -138,6 +172,179 @@ def test_model_attention_slicing(self):
output = model(**inputs_dict)
assert output is not None
+ def test_lora_processors(self):
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+ init_dict["attention_head_dim"] = 8
+
+ model = self.model_class(**init_dict)
+
+ with paddle.no_grad():
+ sample1 = model(**inputs_dict).sample
+
+ lora_attn_procs = create_lora_layers(model)
+
+ # make sure we can set a list of attention processors
+ model.set_attn_processor(lora_attn_procs)
+
+ # test that attn processors can be set to itself
+ model.set_attn_processor(model.attn_processors)
+
+ with paddle.no_grad():
+ sample2 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
+ sample3 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+ sample4 = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+ assert (sample1 - sample2).abs().max() < 3e-3
+ assert (sample3 - sample4).abs().max() < 3e-3
+
+ # sample 2 and sample 3 should be different
+ assert (sample2 - sample3).abs().max() > 3e-3
+
+ def test_lora_save_load(self):
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+ init_dict["attention_head_dim"] = 8
+
+ paddle.seed(0)
+ model = self.model_class(**init_dict)
+
+ with paddle.no_grad():
+ old_sample = model(**inputs_dict).sample
+
+ lora_attn_procs = create_lora_layers(model)
+ model.set_attn_processor(lora_attn_procs)
+
+ with paddle.no_grad():
+ sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ model.save_attn_procs(
+ tmpdirname,
+ to_diffusers=False,
+ )
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "paddle_lora_weights.pdparams")))
+ paddle.seed(0)
+ new_model = self.model_class(**init_dict)
+ new_model.load_attn_procs(tmpdirname, from_diffusers=False)
+
+ with paddle.no_grad():
+ new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+ assert (sample - new_sample).abs().max() < 1e-3
+
+ # LoRA and no LoRA should NOT be the same
+ assert (sample - old_sample).abs().max() > 1e-4
+
+ def test_lora_save_load_safetensors(self):
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+ init_dict["attention_head_dim"] = 8
+
+ paddle.seed(0)
+ model = self.model_class(**init_dict)
+
+ with paddle.no_grad():
+ old_sample = model(**inputs_dict).sample
+
+ lora_attn_procs = create_lora_layers(model)
+ model.set_attn_processor(lora_attn_procs)
+
+ with paddle.no_grad():
+ sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+ with tempfile.TemporaryDirectory() as tmpdirname:
+ model.save_attn_procs(tmpdirname, safe_serialization=True, to_diffusers=True)
+ self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.safetensors")))
+ paddle.seed(0)
+ new_model = self.model_class(**init_dict)
+ new_model.load_attn_procs(tmpdirname, use_safetensors=True, from_diffusers=True)
+
+ with paddle.no_grad():
+ new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample
+
+ assert (sample - new_sample).abs().max() < 3e-3
+
+ # LoRA and no LoRA should NOT be the same
+ assert (sample - old_sample).abs().max() > 1e-4
+
+ # def test_lora_save_safetensors_load_torch(self):
+ # # enable deterministic behavior for gradient checkpointing
+ # init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+ # init_dict["attention_head_dim"] = 8
+
+ # paddle.seed(0)
+ # model = self.model_class(**init_dict)
+
+ # lora_attn_procs = create_lora_layers(model, mock_weights=False)
+ # model.set_attn_processor(lora_attn_procs)
+ # # Saving in diffusers format (pytorch_lora_weights.bin), then reloading by passing the weight filename directly
+ # with tempfile.TemporaryDirectory() as tmpdirname:
+ # model.save_attn_procs(tmpdirname, to_diffusers=True)
+ # self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin")))
+ # paddle.seed(0)
+ # new_model = self.model_class(**init_dict)
+ # new_model.load_attn_procs(
+ # tmpdirname, weight_name="pytorch_lora_weights.bin", use_safetensors=False, from_diffusers=True
+ # )
+
+ def test_lora_save_paddle_force_load_safetensors_error(self):
+ pass
+
+ def test_lora_on_off(self):
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+ init_dict["attention_head_dim"] = 8
+
+ paddle.seed(0)
+ model = self.model_class(**init_dict)
+
+ with paddle.no_grad():
+ old_sample = model(**inputs_dict).sample
+
+ lora_attn_procs = create_lora_layers(model)
+ model.set_attn_processor(lora_attn_procs)
+
+ with paddle.no_grad():
+ sample = model(**inputs_dict, cross_attention_kwargs={"scale": 0.0}).sample
+
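+ # restoring the default AttnProcessor should match both the LoRA scale=0.0 output and the original sample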
+ model.set_attn_processor(AttnProcessor())
+
+ with paddle.no_grad():
+ new_sample = model(**inputs_dict).sample
+
+ assert (sample - new_sample).abs().max() < 1e-4
+ assert (sample - old_sample).abs().max() < 3e-3
+
+ @unittest.skipIf(
+ not is_ppxformers_available(),
+ reason="XFormers attention is only available with CUDA and `xformers` installed",
+ )
+ def test_lora_xformers_on_off(self):
+ # toggling xformers memory-efficient attention should leave the output (nearly) unchanged
+ init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
+
+ init_dict["attention_head_dim"] = 4
+
+ paddle.seed(0)
+ model = self.model_class(**init_dict)
+ lora_attn_procs = create_lora_layers(model)
+ model.set_attn_processor(lora_attn_procs)
+
+ # default
+ with paddle.no_grad():
+ sample = model(**inputs_dict).sample
+
+ model.enable_xformers_memory_efficient_attention()
+ on_sample = model(**inputs_dict).sample
+
+ model.disable_xformers_memory_efficient_attention()
+ off_sample = model(**inputs_dict).sample
+
+ assert (sample - on_sample).abs().max() < 0.005
+ assert (sample - off_sample).abs().max() < 0.005
+
def test_feed_forward_chunking(self):
init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
init_dict["norm_num_groups"] = 32
diff --git a/ppdiffusers/tests/models/test_models_unet_motion.py b/ppdiffusers/tests/models/test_models_unet_motion.py
deleted file mode 100644
index 11c730095..000000000
--- a/ppdiffusers/tests/models/test_models_unet_motion.py
+++ /dev/null
@@ -1,290 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import copy
-import os
-import tempfile
-import unittest
-
-import paddle
-
-from ppdiffusers import MotionAdapter, UNet2DConditionModel, UNetMotionModel
-from ppdiffusers.utils import logging
-from ppdiffusers.utils.testing_utils import enable_full_determinism, floats_tensor
-
-from .test_modeling_common import ModelTesterMixin, UNetTesterMixin
-
-logger = logging.get_logger(__name__)
-
-enable_full_determinism()
-
-
-class UNetMotionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
- model_class = UNetMotionModel
- main_input_name = "sample"
-
- @property
- def dummy_input(self):
- batch_size = 4
- num_channels = 4
- num_frames = 8
- sizes = (32, 32)
-
- noise = floats_tensor((batch_size, num_channels, num_frames) + sizes)
- time_step = paddle.to_tensor([10])
- encoder_hidden_states = floats_tensor((batch_size, 4, 32))
-
- return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}
-
- @property
- def input_shape(self):
- return (4, 8, 32, 32)
-
- @property
- def output_shape(self):
- return (4, 8, 32, 32)
-
- def prepare_init_args_and_inputs_for_common(self):
- init_dict = {
- "block_out_channels": (32, 64),
- "down_block_types": ("CrossAttnDownBlockMotion", "DownBlockMotion"),
- "up_block_types": ("UpBlockMotion", "CrossAttnUpBlockMotion"),
- "cross_attention_dim": 32,
- "num_attention_heads": 4,
- "out_channels": 4,
- "in_channels": 4,
- "layers_per_block": 1,
- "sample_size": 32,
- }
- inputs_dict = self.dummy_input
- return init_dict, inputs_dict
-
- def test_from_unet2d(self):
- paddle.seed(0)
- unet2d = UNet2DConditionModel()
-
- paddle.seed(1)
- model = self.model_class.from_unet2d(unet2d)
- model_state_dict = model.state_dict()
-
- for param_name, param_value in unet2d.named_parameters():
- self.assertTrue(paddle.equal_all(model_state_dict[param_name], param_value))
-
- def test_freeze_unet2d(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
- model = self.model_class(**init_dict)
- model.freeze_unet2d_params()
-
- for param_name, param_value in model.named_parameters():
- if "motion_modules" not in param_name:
- self.assertFalse(not param_value.stop_gradient)
- else:
- self.assertTrue(not param_value.stop_gradient)
-
- def test_loading_motion_adapter(self):
- model = self.model_class()
- adapter = MotionAdapter()
- model.load_motion_modules(adapter)
-
- for idx, down_block in enumerate(model.down_blocks):
- adapter_state_dict = adapter.down_blocks[idx].motion_modules.state_dict()
- for param_name, param_value in down_block.motion_modules.named_parameters():
- self.assertTrue(paddle.equal_all(adapter_state_dict[param_name], param_value))
-
- for idx, up_block in enumerate(model.up_blocks):
- adapter_state_dict = adapter.up_blocks[idx].motion_modules.state_dict()
- for param_name, param_value in up_block.motion_modules.named_parameters():
- self.assertTrue(paddle.equal_all(adapter_state_dict[param_name], param_value))
-
- mid_block_adapter_state_dict = adapter.mid_block.motion_modules.state_dict()
- for param_name, param_value in model.mid_block.motion_modules.named_parameters():
- self.assertTrue(paddle.equal_all(mid_block_adapter_state_dict[param_name], param_value))
-
- def test_saving_motion_modules(self):
- paddle.seed(0)
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
- model = self.model_class(**init_dict)
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- model.save_motion_modules(tmpdirname)
- # pytorch: diffusion_pytorch_model.safetensors
- self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "model_state.pdparams")))
-
- adapter_loaded = MotionAdapter.from_pretrained(tmpdirname)
-
- paddle.seed(0)
- model_loaded = self.model_class(**init_dict)
- model_loaded.load_motion_modules(adapter_loaded)
-
- with paddle.no_grad():
- output = model(**inputs_dict)[0]
- output_loaded = model_loaded(**inputs_dict)[0]
-
- max_diff = (output - output_loaded).abs().max().item()
- self.assertLessEqual(max_diff, 1e-4, "Models give different forward passes")
-
- # @unittest.skipIf(
- # torch_device != "cuda" or not is_xformers_available(),
- # reason="XFormers attention is only available with CUDA and `xformers` installed",
- # )
- # def test_xformers_enable_works(self):
- # init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
- # model = self.model_class(**init_dict)
-
- # model.enable_xformers_memory_efficient_attention()
-
- # assert (
- # model.mid_block.attentions[0].transformer_blocks[0].attn1.processor.__class__.__name__
- # == "XFormersAttnProcessor"
- # ), "xformers is not enabled"
-
- def test_gradient_checkpointing_is_applied(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
- model_class_copy = copy.copy(self.model_class)
-
- modules_with_gc_enabled = {}
-
- # now monkey patch the following function:
- # def _set_gradient_checkpointing(self, module, value=False):
- # if hasattr(module, "gradient_checkpointing"):
- # module.gradient_checkpointing = value
-
- def _set_gradient_checkpointing_new(self, module, value=False):
- if hasattr(module, "gradient_checkpointing"):
- module.gradient_checkpointing = value
- modules_with_gc_enabled[module.__class__.__name__] = True
-
- model_class_copy._set_gradient_checkpointing = _set_gradient_checkpointing_new
-
- model = model_class_copy(**init_dict)
- model.enable_gradient_checkpointing()
-
- EXPECTED_SET = {
- "CrossAttnUpBlockMotion",
- "CrossAttnDownBlockMotion",
- "UNetMidBlockCrossAttnMotion",
- "UpBlockMotion",
- "Transformer2DModel",
- "DownBlockMotion",
- }
-
- assert set(modules_with_gc_enabled.keys()) == EXPECTED_SET
- assert all(modules_with_gc_enabled.values()), "All modules should be enabled"
-
- def test_feed_forward_chunking(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
- init_dict["norm_num_groups"] = 32
-
- model = self.model_class(**init_dict)
- model.eval()
-
- with paddle.no_grad():
- output = model(**inputs_dict)[0]
-
- model.enable_forward_chunking()
- with paddle.no_grad():
- output_2 = model(**inputs_dict)[0]
-
- self.assertEqual(output.shape, output_2.shape, "Shape doesn't match")
- assert paddle.abs(output.cpu() - output_2.cpu()).max() < 1e-2
-
- # paddle.Tensor not support pickle
- # def test_pickle(self):
- # # enable deterministic behavior for gradient checkpointing
- # init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
- # model = self.model_class(**init_dict)
-
- # with paddle.no_grad():
- # sample = model(**inputs_dict).sample
-
- # sample_copy = copy.copy(sample)
-
- # assert (sample - sample_copy).abs().max() < 1e-4
-
- def test_from_save_pretrained(self, expected_max_diff=5e-5):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
- paddle.seed(0)
- model = self.model_class(**init_dict)
- model.eval()
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- model.save_pretrained(tmpdirname, safe_serialization=False)
- paddle.seed(0)
- new_model = self.model_class.from_pretrained(tmpdirname)
-
- with paddle.no_grad():
- image = model(**inputs_dict)
- if isinstance(image, dict):
- image = image.to_tuple()[0]
-
- new_image = new_model(**inputs_dict)
-
- if isinstance(new_image, dict):
- new_image = new_image.to_tuple()[0]
-
- max_diff = (image - new_image).abs().max().item()
- self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
-
- def test_from_save_pretrained_variant(self, expected_max_diff=5e-5):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
- paddle.seed(0)
- model = self.model_class(**init_dict)
- model.eval()
-
- with tempfile.TemporaryDirectory() as tmpdirname:
- model.save_pretrained(tmpdirname, variant="fp16", safe_serialization=False)
-
- paddle.seed(0)
- new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16")
- # non-variant cannot be loaded
- with self.assertRaises(OSError) as error_context:
- self.model_class.from_pretrained(tmpdirname)
-
- # make sure that error message states what keys are missing
- assert "Error no file named model_state.pdparams found in directory" in str(error_context.exception)
-
- with paddle.no_grad():
- image = model(**inputs_dict)
- if isinstance(image, dict):
- image = image.to_tuple()[0]
-
- new_image = new_model(**inputs_dict)
-
- if isinstance(new_image, dict):
- new_image = new_image.to_tuple()[0]
-
- max_diff = (image - new_image).abs().max().item()
- self.assertLessEqual(max_diff, expected_max_diff, "Models give different forward passes")
-
- def test_forward_with_norm_groups(self):
- init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-
- init_dict["norm_num_groups"] = 16
- init_dict["block_out_channels"] = (16, 32)
-
- model = self.model_class(**init_dict)
- model.eval()
-
- with paddle.no_grad():
- output = model(**inputs_dict)
-
- if isinstance(output, dict):
- output = output.to_tuple()[0]
-
- self.assertIsNotNone(output)
- expected_shape = inputs_dict["sample"].shape
- self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match")
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py
index f355e1804..65bfc6d6a 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion.py
@@ -14,8 +14,7 @@
# limitations under the License.
import gc
-
-# import tempfile
+import tempfile
import unittest
import numpy as np
@@ -37,7 +36,7 @@
from ppdiffusers.utils import nightly, slow
from ppdiffusers.utils.testing_utils import CaptureLogger, require_paddle_gpu
-from ...lora.test_lora_layers_old_backend import create_lora_layers
+from ...models.test_models_unet_2d_condition import create_lora_layers
from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
from ..test_pipelines_common import PipelineTesterMixin
diff --git a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
index 965f70e60..0b8fca03d 100644
--- a/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
+++ b/ppdiffusers/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py
@@ -40,7 +40,7 @@
require_paddle_gpu,
)
-from ...lora.test_lora_layers_old_backend import create_lora_layers
+from ...models.test_models_unet_2d_condition import create_lora_layers
from ..pipeline_params import (
TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS,
TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,