From 1a8d588c571e27716097e542e5a3b2333770ed00 Mon Sep 17 00:00:00 2001 From: Inigo Goiri Date: Sun, 30 Mar 2025 09:51:20 -0700 Subject: [PATCH 1/7] Add support for passing image embeddings to the pipeline. It allows computing the image embeddings externally and using them. --- .../pipelines/wan/pipeline_wan_i2v.py | 31 +++++++++++++++---- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py index df724894c478..fc62b50b4314 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py @@ -223,12 +223,15 @@ def _get_t5_prompt_embeds( def encode_image( self, image: PipelineImageInput, + image_embeds: Optional[torch.Tensor] = None, device: Optional[torch.device] = None, ): - device = device or self._execution_device - image = self.image_processor(images=image, return_tensors="pt").to(device) - image_embeds = self.image_encoder(**image, output_hidden_states=True) - return image_embeds.hidden_states[-2] + if image_embeds is None: + device = device or self._execution_device + image = self.image_processor(images=image, return_tensors="pt").to(device) + image_embeds = self.image_encoder(**image, output_hidden_states=True) + image_embeds = image_embeds.hidden_states[-2] + return image_embeds # Copied from diffusers.pipelines.wan.pipeline_wan.WanPipeline.encode_prompt def encode_prompt( @@ -321,9 +324,18 @@ def check_inputs( width, prompt_embeds=None, negative_prompt_embeds=None, + image_embeds=None, callback_on_step_end_tensor_inputs=None, ): - if not isinstance(image, torch.Tensor) and not isinstance(image, PIL.Image.Image): + if image is not None and image_embeds is not None: + raise ValueError( + f"Cannot forward both `image`: {image} and `image_embeds`: {image_embeds}. Please make sure to" + " only forward one of the two." 
+ if image is None and image_embeds is None: + raise ValueError( + "Provide either `image` or `prompt_embeds`. Cannot leave both `image` and `image_embeds` undefined." + ) + if image is not None and not isinstance(image, torch.Tensor) and not isinstance(image, PIL.Image.Image): raise ValueError("`image` has to be of type `torch.Tensor` or `PIL.Image.Image` but is" f" {type(image)}") if height % 16 != 0 or width % 16 != 0: raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.") @@ -463,6 +475,7 @@ def __call__( latents: Optional[torch.Tensor] = None, prompt_embeds: Optional[torch.Tensor] = None, negative_prompt_embeds: Optional[torch.Tensor] = None, + image_embeds: Optional[torch.Tensor] = None, output_type: Optional[str] = "np", return_dict: bool = True, attention_kwargs: Optional[Dict[str, Any]] = None, @@ -512,6 +525,12 @@ def __call__( prompt_embeds (`torch.Tensor`, *optional*): Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `negative_prompt` input argument. + image_embeds (`torch.Tensor`, *optional*): + Pre-generated image embeddings. Can be used to easily tweak image inputs (weighting). If not + provided, image embeddings are generated from the `image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. 
return_dict (`bool`, *optional*, defaults to `True`): @@ -592,7 +611,7 @@ def __call__( if negative_prompt_embeds is not None: negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype) - image_embeds = self.encode_image(image, device) + image_embeds = self.encode_image(image, image_embeds, device) image_embeds = image_embeds.repeat(batch_size, 1, 1) image_embeds = image_embeds.to(transformer_dtype) From 8477f3f3f429f35b3a275428b75fc6ce6a8c084e Mon Sep 17 00:00:00 2001 From: hlky Date: Mon, 31 Mar 2025 07:25:29 +0100 Subject: [PATCH 2/7] Update src/diffusers/pipelines/wan/pipeline_wan_i2v.py --- src/diffusers/pipelines/wan/pipeline_wan_i2v.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py index fc62b50b4314..3eddb0132170 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py @@ -331,6 +331,7 @@ def check_inputs( raise ValueError( f"Cannot forward both `image`: {image} and `image_embeds`: {image_embeds}. Please make sure to" " only forward one of the two." + ) if image is None and image_embeds is None: raise ValueError( "Provide either `image` or `prompt_embeds`. Cannot leave both `image` and `image_embeds` undefined." From c0470218844d9da06cc6cd22b613e376da16e746 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 31 Mar 2025 06:27:38 +0000 Subject: [PATCH 3/7] Apply style fixes --- src/diffusers/pipelines/wan/pipeline_wan_i2v.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py index 3eddb0132170..0ab06a7e6ee1 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py @@ -530,8 +530,8 @@ def __call__( Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). 
If not provided, text embeddings are generated from the `negative_prompt` input argument. image_embeds (`torch.Tensor`, *optional*): - Pre-generated image embeddings. Can be used to easily tweak image inputs (weighting). If not - provided, image embeddings are generated from the `image` input argument. + Pre-generated image embeddings. Can be used to easily tweak image inputs (weighting). If not provided, + image embeddings are generated from the `image` input argument. output_type (`str`, *optional*, defaults to `"pil"`): The output format of the generated image. Choose between `PIL.Image` or `np.array`. return_dict (`bool`, *optional*, defaults to `True`): From 48e8e15eb4ebaa121842ad612c6e87c2c98cad05 Mon Sep 17 00:00:00 2001 From: Inigo Goiri Date: Mon, 31 Mar 2025 15:05:17 -0700 Subject: [PATCH 4/7] Update src/diffusers/pipelines/wan/pipeline_wan_i2v.py Co-authored-by: YiYi Xu --- src/diffusers/pipelines/wan/pipeline_wan_i2v.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py index 0fa7ac136ef9..05caa7d49e25 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py @@ -619,7 +619,8 @@ def __call__( if negative_prompt_embeds is not None: negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype) - image_embeds = self.encode_image(image, image_embeds, device) + if not image_embeds: + image_embeds = self.encode_image(image, device) image_embeds = image_embeds.repeat(batch_size, 1, 1) image_embeds = image_embeds.to(transformer_dtype) From b16b706c92d3e0efb46db9cb126079593193e1c4 Mon Sep 17 00:00:00 2001 From: Inigo Goiri Date: Mon, 31 Mar 2025 15:06:26 -0700 Subject: [PATCH 5/7] Avoid changes in encode_image --- src/diffusers/pipelines/wan/pipeline_wan_i2v.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py 
b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py index 05caa7d49e25..876be95d2b84 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py @@ -223,15 +223,12 @@ def _get_t5_prompt_embeds( def encode_image( self, image: PipelineImageInput, - image_embeds: Optional[torch.Tensor] = None, device: Optional[torch.device] = None, ): - if image_embeds is None: - device = device or self._execution_device - image = self.image_processor(images=image, return_tensors="pt").to(device) - image_embeds = self.image_encoder(**image, output_hidden_states=True) - image_embeds = image_embeds.hidden_states[-2] - return image_embeds + device = device or self._execution_device + image = self.image_processor(images=image, return_tensors="pt").to(device) + image_embeds = self.image_encoder(**image, output_hidden_states=True) + return image_embeds.hidden_states[-2] # Copied from diffusers.pipelines.wan.pipeline_wan.WanPipeline.encode_prompt def encode_prompt( From 2ae6f772750048040909508ebebab1b49cb74406 Mon Sep 17 00:00:00 2001 From: Inigo Goiri Date: Tue, 1 Apr 2025 16:48:15 -0700 Subject: [PATCH 6/7] use is None Co-authored-by: hlky --- src/diffusers/pipelines/wan/pipeline_wan_i2v.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py index 876be95d2b84..4abc7c153052 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py @@ -616,7 +616,7 @@ def __call__( if negative_prompt_embeds is not None: negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype) - if not image_embeds: + if image_embeds is None: image_embeds = self.encode_image(image, device) image_embeds = image_embeds.repeat(batch_size, 1, 1) image_embeds = image_embeds.to(transformer_dtype) From f9059091f56b6e2fc9beeef2946a1c439681a84b Mon Sep 17 00:00:00 2001 From: Inigo Goiri Date: Wed, 2 Apr 
2025 13:32:43 -0700 Subject: [PATCH 7/7] Fix check_inputs call. The arguments are passed positionally, so callback_on_step_end_tensor_inputs was being passed as image_embeds. --- src/diffusers/pipelines/wan/pipeline_wan_i2v.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py index 4abc7c153052..487ad2d80ac6 100644 --- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py +++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py @@ -573,6 +573,7 @@ def __call__( width, prompt_embeds, negative_prompt_embeds, + image_embeds, callback_on_step_end_tensor_inputs, )