
[IP-Adapter] Support multiple IP-Adapters #6573

Merged: 40 commits (branch multi-ipadapter), Jan 31, 2024.
Changes shown below are from 11 commits.
- `7e1bf9b` allow list (Jan 14, 2024)
- `3b55a1e` update (Jan 14, 2024)
- `345b4d6` save (Jan 14, 2024)
- `f6bae6e` add (Jan 15, 2024)
- `baa7b83` remove print lines (Jan 15, 2024)
- `9024698` style (Jan 15, 2024)
- `27fc796` support multi-image (Jan 17, 2024)
- `67908bf` Merge branch 'main' into multi-ipadapter (yiyixuxu, Jan 17, 2024)
- `afd91e3` fix a typo (Jan 17, 2024)
- `7d42455` Merge branch 'multi-ipadapter' of github.com:huggingface/diffusers in… (Jan 17, 2024)
- `1fdcd7b` fix (Jan 17, 2024)
- `45fb582` fix a bug! (yiyixuxu, Jan 18, 2024)
- `1a4c6b1` fix (Jan 19, 2024)
- `d924c47` update (Jan 19, 2024)
- `cc2aa1b` fix (Jan 19, 2024)
- `2c86534` Merge branch 'main' into multi-ipadapter (yiyixuxu, Jan 19, 2024)
- `0049e44` Apply suggestions from code review (yiyixuxu, Jan 24, 2024)
- `4a3df90` ImageProjectionLayers -> image_projection_layers (Jan 24, 2024)
- `ff96407` merge (Jan 24, 2024)
- `193d6e8` fix (Jan 24, 2024)
- `f7f2465` fix-copies (Jan 24, 2024)
- `efa704a` update test (Jan 25, 2024)
- `9abdcf9` add test (Jan 25, 2024)
- `5e47ceb` add prepare_ip_adapter_image_embeds method so pipelines can copy from (Jan 25, 2024)
- `c6670de` update all pipelines support ip-adapter (Jan 25, 2024)
- `711387e` deprecate image_embeds as 3d tensor (Jan 25, 2024)
- `bce309f` corrent num_images_per_prompt behavior (Jan 25, 2024)
- `fae861e` fix batching behavior (Jan 26, 2024)
- `21da205` revert for lcm and sd safe (Jan 26, 2024)
- `accee6b` correct tests (Jan 26, 2024)
- `816578f` update attention processer so backward compatible (Jan 29, 2024)
- `e57103f` revert changes made to ipadapter attention processor to follow the de… (Jan 30, 2024)
- `6cfa34b` Merge branch 'main' into multi-ipadapter (yiyixuxu, Jan 30, 2024)
- `1e68c64` add doc (Jan 30, 2024)
- `2cc1561` update doc (Jan 30, 2024)
- `475046e` update docstring (Jan 30, 2024)
- `98fa0c2` add a slow test for multi (Jan 30, 2024)
- `3a52ecb` remove ddim config (Jan 30, 2024)
- `e742cf4` style (Jan 31, 2024)
- `dcdde9c` Merge branch 'main' into multi-ipadapter (yiyixuxu, Jan 31, 2024)
137 changes: 83 additions & 54 deletions src/diffusers/loaders/ip_adapter.py
@@ -11,8 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import Dict, Union
from typing import Dict, List, Union

import torch
from huggingface_hub.utils import validate_hf_hub_args
@@ -45,14 +46,15 @@ class IPAdapterMixin:
@validate_hf_hub_args
def load_ip_adapter(
self,
pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
subfolder: str,
weight_name: str,
pretrained_model_name_or_path_or_dict: Union[str, List[str], Dict[str, torch.Tensor]],
subfolder: Union[str, List[str]],
weight_name: Union[str, List[str]],
**kwargs,
):
"""
Parameters:
pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
Can be either:

- A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
Expand Down Expand Up @@ -87,6 +89,26 @@ def load_ip_adapter(
The subfolder location of a model file within a larger model repository on the Hub or locally.
"""

# handle the list inputs for multiple IP Adapters
if not isinstance(weight_name, list):
weight_name = [weight_name]

if not isinstance(pretrained_model_name_or_path_or_dict, list):
pretrained_model_name_or_path_or_dict = [pretrained_model_name_or_path_or_dict]
if len(pretrained_model_name_or_path_or_dict) == 1:
pretrained_model_name_or_path_or_dict = pretrained_model_name_or_path_or_dict * len(weight_name)
Comment on lines +97 to +98 (Member): Does it cater to the case where you have a single model_id and multiple weight names of different IP Adapters?


if not isinstance(subfolder, list):
subfolder = [subfolder]
if len(subfolder) == 1:
subfolder = subfolder * len(weight_name)

if len(weight_name) != len(pretrained_model_name_or_path_or_dict):
raise ValueError("`weight_name` and `pretrained_model_name_or_path_or_dict` must have the same length.")

if len(weight_name) != len(subfolder):
raise ValueError("`weight_name` and `subfolder` must have the same length.")
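
The broadcasting rules in this hunk (wrap scalars in lists, repeat a length-1 list, then require equal lengths) can be sketched as a standalone helper. This is a hedged illustration; `broadcast_to_list` and the repo/file names are invented for the example, not part of diffusers:

```python
def broadcast_to_list(value, target_len):
    """Mirror load_ip_adapter's normalization: wrap a scalar in a list,
    repeat a length-1 list to target_len, then insist on equal lengths."""
    if not isinstance(value, list):
        value = [value]
    if len(value) == 1:
        value = value * target_len
    if len(value) != target_len:
        raise ValueError("argument lists must have the same length.")
    return value

# One repo id paired with two weight files (names illustrative):
weight_names = ["adapter_a.safetensors", "adapter_b.safetensors"]
repos = broadcast_to_list("some/repo-id", len(weight_names))
subfolders = broadcast_to_list("models", len(weight_names))
```

Under these rules a single repo id or subfolder is simply repeated to match however many weight names are given.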

# Load the main state dict first.
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
@@ -100,61 +122,68 @@
"file_type": "attn_procs_weights",
"framework": "pytorch",
}

if not isinstance(pretrained_model_name_or_path_or_dict, dict):
model_file = _get_model_file(
pretrained_model_name_or_path_or_dict,
weights_name=weight_name,
cache_dir=cache_dir,
force_download=force_download,
resume_download=resume_download,
proxies=proxies,
local_files_only=local_files_only,
token=token,
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
)
if weight_name.endswith(".safetensors"):
state_dict = {"image_proj": {}, "ip_adapter": {}}
with safe_open(model_file, framework="pt", device="cpu") as f:
for key in f.keys():
if key.startswith("image_proj."):
state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
elif key.startswith("ip_adapter."):
state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
else:
state_dict = torch.load(model_file, map_location="cpu")
else:
state_dict = pretrained_model_name_or_path_or_dict

keys = list(state_dict.keys())
if keys != ["image_proj", "ip_adapter"]:
raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.")

# load CLIP image encoder here if it has not been registered to the pipeline yet
if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
state_dicts = []
for pretrained_model_name_or_path_or_dict, weight_name, subfolder in zip(
pretrained_model_name_or_path_or_dict, weight_name, subfolder
):
if not isinstance(pretrained_model_name_or_path_or_dict, dict):
logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}")
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
model_file = _get_model_file(
pretrained_model_name_or_path_or_dict,
subfolder=Path(subfolder, "image_encoder").as_posix(),
).to(self.device, dtype=self.dtype)
self.image_encoder = image_encoder
self.register_to_config(image_encoder=["transformers", "CLIPVisionModelWithProjection"])
weights_name=weight_name,
cache_dir=cache_dir,
force_download=force_download,
resume_download=resume_download,
proxies=proxies,
local_files_only=local_files_only,
token=token,
revision=revision,
subfolder=subfolder,
user_agent=user_agent,
)
if weight_name.endswith(".safetensors"):
state_dict = {"image_proj": {}, "ip_adapter": {}}
with safe_open(model_file, framework="pt", device="cpu") as f:
for key in f.keys():
if key.startswith("image_proj."):
state_dict["image_proj"][key.replace("image_proj.", "")] = f.get_tensor(key)
elif key.startswith("ip_adapter."):
state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
else:
state_dict = torch.load(model_file, map_location="cpu")
else:
raise ValueError("`image_encoder` cannot be None when using IP Adapters.")

# create feature extractor if it has not been registered to the pipeline yet
if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
self.feature_extractor = CLIPImageProcessor()
self.register_to_config(feature_extractor=["transformers", "CLIPImageProcessor"])

# load ip-adapter into unet
state_dict = pretrained_model_name_or_path_or_dict
Review comment (sayakpaul, Member, Jan 31, 2024): This is the case when someone passes the state_dict directly. Shouldn't this statement then be guarded with a check? Something like `if isinstance(pretrained_model_name_or_path_or_dict, dict)`?

Reply (yiyixuxu, Collaborator, Author): I think it is already done; it's part of this if/else:

if not isinstance(pretrained_model_name_or_path_or_dict, dict):
        ....
else:
        state_dict = pretrained_model_name_or_path_or_dict


state_dicts.append(state_dict)

keys = list(state_dict.keys())
if keys != ["image_proj", "ip_adapter"]:
raise ValueError("Required keys are (`image_proj` and `ip_adapter`) missing from the state dict.")
Comment on lines +155 to +157 (Member): We are relying on the state_dict to check if we should raise the error or not. I don't think we have to wait till this position to raise this error. As soon as we have access to the final state_dict, we could raise it. In this case, that could be before state_dicts.append(). Generally, it's better to raise errors as early as possible.

Reply (Collaborator, Author): moved up :)
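
The safetensors branch above groups a flat checkpoint by key prefix into exactly the two required top-level entries. A framework-free sketch of that grouping, with placeholder strings standing in for tensors (`split_ip_adapter_checkpoint` is an illustrative name):

```python
def split_ip_adapter_checkpoint(flat):
    """Group flat 'image_proj.*' / 'ip_adapter.*' keys into nested dicts,
    mirroring the safetensors loading loop in load_ip_adapter."""
    state_dict = {"image_proj": {}, "ip_adapter": {}}
    for key, value in flat.items():
        if key.startswith("image_proj."):
            state_dict["image_proj"][key.replace("image_proj.", "")] = value
        elif key.startswith("ip_adapter."):
            state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = value
    return state_dict

flat = {"image_proj.proj.weight": "w0", "ip_adapter.1.to_k_ip.weight": "w1"}
sd = split_ip_adapter_checkpoint(flat)
```

The resulting top-level keys are exactly `["image_proj", "ip_adapter"]`, which is what the validation check requires.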


# load CLIP image encoder here if it has not been registered to the pipeline yet
if hasattr(self, "image_encoder") and getattr(self, "image_encoder", None) is None:
if not isinstance(pretrained_model_name_or_path_or_dict, dict):
logger.info(f"loading image_encoder from {pretrained_model_name_or_path_or_dict}")
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
pretrained_model_name_or_path_or_dict,
subfolder=Path(subfolder, "image_encoder").as_posix(),
).to(self.device, dtype=self.dtype)
self.image_encoder = image_encoder
self.register_to_config(image_encoder=["transformers", "CLIPVisionModelWithProjection"])
else:
raise ValueError("`image_encoder` cannot be None when using IP Adapters.")

# create feature extractor if it has not been registered to the pipeline yet
if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
self.feature_extractor = CLIPImageProcessor()
self.register_to_config(feature_extractor=["transformers", "CLIPImageProcessor"])

# load ip-adapter into unet
unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
unet._load_ip_adapter_weights(state_dict)
unet._load_ip_adapter_weights(state_dicts)

def set_ip_adapter_scale(self, scale):
if not isinstance(scale, list):
scale = [scale]
unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
for attn_processor in unet.attn_processors.values():
if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
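
`set_ip_adapter_scale` applies the same scalar-to-list normalization, so a plain float keeps working for a single adapter. A hedged sketch; the helper name and the explicit length check are assumptions added here for illustration, not code from the diff:

```python
def normalize_scales(scale, num_adapters):
    """Accept one float applied to a single adapter, or a per-adapter list.
    The length check is an added assumption, not present in the shown diff."""
    if not isinstance(scale, list):
        scale = [scale]
    if len(scale) != num_adapters:
        raise ValueError(f"expected {num_adapters} scale value(s), got {len(scale)}.")
    return scale

single = normalize_scales(0.6, 1)
per_adapter = normalize_scales([0.5, 0.8], 2)
```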
57 changes: 37 additions & 20 deletions src/diffusers/loaders/unet.py
@@ -24,7 +24,12 @@
from huggingface_hub.utils import validate_hf_hub_args
from torch import nn

from ..models.embeddings import ImageProjection, IPAdapterFullImageProjection, IPAdapterPlusImageProjection
from ..models.embeddings import (
ImageProjection,
IPAdapterFullImageProjection,
IPAdapterPlusImageProjection,
MultiIPAdapterImageProjection,
)
from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta
from ..utils import (
USE_PEFT_BACKEND,
@@ -761,28 +766,14 @@ def _convert_ip_adapter_image_proj_to_diffusers(self, state_dict):
image_projection.load_state_dict(updated_state_dict)
return image_projection

def _load_ip_adapter_weights(self, state_dict):
def _convert_ip_adapter_attn_to_diffusers(self, state_dicts):
Review comment (Member): 100 percent the right choice!

from ..models.attention_processor import (
AttnProcessor,
AttnProcessor2_0,
IPAdapterAttnProcessor,
IPAdapterAttnProcessor2_0,
)
Comment on lines 772 to 777 (Member): I think we can move the imports to the top, no?

Reply (Collaborator, Author): I was following the same pattern in this file: https://github.com/huggingface/diffusers/blob/87a92f779c5ba9c180aec4b90c38149eb108d888/src/diffusers/loaders/unet.py#L449. I thought it was to avoid a circular import or something, but I'm not really sure. I can look into it, maybe in a separate PR, to see if we can move all the imports to the top.


if "proj.weight" in state_dict["image_proj"]:
# IP-Adapter
num_image_text_embeds = 4
elif "proj.3.weight" in state_dict["image_proj"]:
# IP-Adapter Full Face
num_image_text_embeds = 257 # 256 CLIP tokens + 1 CLS token
else:
# IP-Adapter Plus
num_image_text_embeds = state_dict["image_proj"]["latents"].shape[1]

# Set encoder_hid_proj after loading ip_adapter weights,
# because `IPAdapterPlusImageProjection` also has `attn_processors`.
self.encoder_hid_proj = None

# set ip-adapter cross-attention processors & load state_dict
attn_procs = {}
key_id = 1
@@ -796,6 +787,7 @@ def _load_ip_adapter_weights(self, state_dict):
elif name.startswith("down_blocks"):
block_id = int(name[len("down_blocks.")])
hidden_size = self.config.block_out_channels[block_id]

if cross_attention_dim is None or "motion_modules" in name:
attn_processor_class = (
AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
@@ -805,6 +797,18 @@
attn_processor_class = (
IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor
)
num_image_text_embeds = []
for state_dict in state_dicts:
if "proj.weight" in state_dict["image_proj"]:
# IP-Adapter
num_image_text_embeds += [4]
Review comment (Member): Let's give "4" a variable.

Follow-up (Member): I think this is yet to be addressed?

elif "proj.3.weight" in state_dict["image_proj"]:
# IP-Adapter Full Face
num_image_text_embeds += [257] # 256 CLIP tokens + 1 CLS token
else:
# IP-Adapter Plus
num_image_text_embeds += [state_dict["image_proj"]["latents"].shape[1]]
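
The branch above infers how many image tokens each adapter variant contributes from which projection weights its checkpoint contains. A sketch with fabricated stand-in state dicts (the shapes and the `_FakeLatents` helper are illustrative only):

```python
def count_image_text_embeds(state_dicts):
    """Infer the per-adapter image token count, mirroring the detection
    branch in _convert_ip_adapter_attn_to_diffusers."""
    num_image_text_embeds = []
    for state_dict in state_dicts:
        proj = state_dict["image_proj"]
        if "proj.weight" in proj:
            num_image_text_embeds += [4]    # vanilla IP-Adapter
        elif "proj.3.weight" in proj:
            num_image_text_embeds += [257]  # Full Face: 256 CLIP tokens + 1 CLS token
        else:
            # IP-Adapter Plus: token count comes from the latents shape
            num_image_text_embeds += [proj["latents"].shape[1]]
    return num_image_text_embeds

class _FakeLatents:
    shape = (1, 16, 768)  # (batch, tokens, dim); values are made up

plain = {"image_proj": {"proj.weight": None}}
plus = {"image_proj": {"latents": _FakeLatents()}}
```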

attn_procs[name] = attn_processor_class(
hidden_size=hidden_size,
cross_attention_dim=cross_attention_dim,
@@ -813,16 +817,29 @@
).to(dtype=self.dtype, device=self.device)

value_dict = {}
for k, w in attn_procs[name].state_dict().items():
value_dict.update({f"{k}": state_dict["ip_adapter"][f"{key_id}.{k}"]})
for i, state_dict in enumerate(state_dicts):
value_dict.update({f"to_k_ip.{i}.weight": state_dict["ip_adapter"][f"{key_id}.to_k_ip.weight"]})
value_dict.update({f"to_v_ip.{i}.weight": state_dict["ip_adapter"][f"{key_id}.to_v_ip.weight"]})

attn_procs[name].load_state_dict(value_dict)
key_id += 2

return attn_procs
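
Stacking several adapters into one attention processor renames each checkpoint's key/value projection weights with an adapter index, as in the `value_dict` loop above. A pure-Python sketch with placeholder strings instead of tensors:

```python
def build_value_dict(state_dicts, key_id):
    """Map adapter i's '{key_id}.to_k_ip.weight' / '{key_id}.to_v_ip.weight'
    onto the processor's 'to_k_ip.{i}.weight' / 'to_v_ip.{i}.weight'."""
    value_dict = {}
    for i, state_dict in enumerate(state_dicts):
        value_dict[f"to_k_ip.{i}.weight"] = state_dict["ip_adapter"][f"{key_id}.to_k_ip.weight"]
        value_dict[f"to_v_ip.{i}.weight"] = state_dict["ip_adapter"][f"{key_id}.to_v_ip.weight"]
    return value_dict

sd_a = {"ip_adapter": {"1.to_k_ip.weight": "ka", "1.to_v_ip.weight": "va"}}
sd_b = {"ip_adapter": {"1.to_k_ip.weight": "kb", "1.to_v_ip.weight": "vb"}}
vd = build_value_dict([sd_a, sd_b], key_id=1)
```

Each adapter keeps its own key/value projections inside the shared processor, indexed by position in the list.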

def _load_ip_adapter_weights(self, state_dicts):
# Set encoder_hid_proj after loading ip_adapter weights,
# because `IPAdapterPlusImageProjection` also has `attn_processors`.
self.encoder_hid_proj = None
Comment on lines +834 to +836 (Member): Sorry, I don't understand the comment fully. Could you elaborate?

Reply (Collaborator, Author): I can't 😛 I assume it's notes from the contributor of IP-Adapter Plus.


attn_procs = self._convert_ip_adapter_attn_to_diffusers(state_dicts)
self.set_attn_processor(attn_procs)

# convert IP-Adapter Image Projection layers to diffusers
image_projection = self._convert_ip_adapter_image_proj_to_diffusers(state_dict["image_proj"])
image_projection_layers = []
for state_dict in state_dicts:
image_projection_layer = self._convert_ip_adapter_image_proj_to_diffusers(state_dict["image_proj"])
image_projection_layer.to(device=self.device, dtype=self.dtype)
image_projection_layers.append(image_projection_layer)
Comment on lines +842 to +846 (Member): Very nice delegation. First convert to attention procs and then handle the rest of the stuff, like projection, with a dedicated method.


self.encoder_hid_proj = image_projection.to(device=self.device, dtype=self.dtype)
self.encoder_hid_proj = MultiIPAdapterImageProjection(image_projection_layers)
self.config.encoder_hid_dim_type = "ip_image_proj"
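
`MultiIPAdapterImageProjection` holds one projection layer per adapter and applies each to its own image embedding. A framework-free sketch of that fan-out (the class below is a simplification for illustration, not the real torch module):

```python
class MultiProjectionSketch:
    """Apply projection layer i to image embedding i and collect the results,
    the way MultiIPAdapterImageProjection fans out over its layers."""
    def __init__(self, projection_layers):
        self.projection_layers = list(projection_layers)

    def __call__(self, image_embeds):
        if len(image_embeds) != len(self.projection_layers):
            raise ValueError("need one image embedding per projection layer.")
        return [layer(embed) for layer, embed in zip(self.projection_layers, image_embeds)]

# Two toy "projections" standing in for per-adapter image projection layers:
proj = MultiProjectionSketch([lambda e: e * 2, lambda e: e + 1])
out = proj([3, 10])
```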