From e9fc161e0493a4c686a62b493a698ca69e8d5f89 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Fri, 28 Mar 2025 15:30:09 +0100
Subject: [PATCH 1/2] update

---
 src/diffusers/pipelines/wan/pipeline_wan.py     | 4 ++++
 src/diffusers/pipelines/wan/pipeline_wan_i2v.py | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/src/diffusers/pipelines/wan/pipeline_wan.py b/src/diffusers/pipelines/wan/pipeline_wan.py
index 6fab997e6660..3332347a33d7 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan.py
@@ -265,12 +265,15 @@ def check_inputs(
         negative_prompt,
         height,
         width,
+        num_frames,
         prompt_embeds=None,
         negative_prompt_embeds=None,
         callback_on_step_end_tensor_inputs=None,
     ):
         if height % 16 != 0 or width % 16 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
+        if num_frames % 4 != 1:
+            raise ValueError("`num_frames` must be of the form 4 * k + 1, for k >= 0")

         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -453,6 +456,7 @@ def __call__(
             negative_prompt,
             height,
             width,
+            num_frames,
             prompt_embeds,
             negative_prompt_embeds,
             callback_on_step_end_tensor_inputs,
diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
index df724894c478..6596169a063e 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
@@ -319,6 +319,7 @@ def check_inputs(
         image,
         height,
         width,
+        num_frames,
         prompt_embeds=None,
         negative_prompt_embeds=None,
         callback_on_step_end_tensor_inputs=None,
@@ -327,6 +328,8 @@ def check_inputs(
             raise ValueError("`image` has to be of type `torch.Tensor` or `PIL.Image.Image` but is" f" {type(image)}")
         if height % 16 != 0 or width % 16 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
+        if num_frames % 4 != 1:
+            raise ValueError("`num_frames` must be of the form 4 * k + 1, for k >= 0")

         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -554,6 +557,7 @@ def __call__(
             image,
             height,
             width,
+            num_frames,
             prompt_embeds,
             negative_prompt_embeds,
             callback_on_step_end_tensor_inputs,

From b7ed5153b763cdf45ee92d852aa05095aa0f73e8 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Sat, 29 Mar 2025 11:31:46 +0100
Subject: [PATCH 2/2] raise warning and round to nearest multiple of scale factor

---
 src/diffusers/pipelines/wan/pipeline_wan.py     | 11 +++++++----
 src/diffusers/pipelines/wan/pipeline_wan_i2v.py | 11 +++++++----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/diffusers/pipelines/wan/pipeline_wan.py b/src/diffusers/pipelines/wan/pipeline_wan.py
index 3332347a33d7..3294e9a56a07 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan.py
@@ -265,15 +265,12 @@ def check_inputs(
         negative_prompt,
         height,
         width,
-        num_frames,
         prompt_embeds=None,
         negative_prompt_embeds=None,
         callback_on_step_end_tensor_inputs=None,
     ):
         if height % 16 != 0 or width % 16 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
-        if num_frames % 4 != 1:
-            raise ValueError("`num_frames` must be of the form 4 * k + 1, for k >= 0")

         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -456,12 +453,18 @@ def __call__(
             negative_prompt,
             height,
             width,
-            num_frames,
             prompt_embeds,
             negative_prompt_embeds,
             callback_on_step_end_tensor_inputs,
         )

+        if num_frames % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+            )
+            num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+            num_frames = max(num_frames, 1)
+
         self._guidance_scale = guidance_scale
         self._attention_kwargs = attention_kwargs
         self._current_timestep = None
diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
index 6596169a063e..fd1d90849a66 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
@@ -319,7 +319,6 @@ def check_inputs(
         image,
         height,
         width,
-        num_frames,
         prompt_embeds=None,
         negative_prompt_embeds=None,
         callback_on_step_end_tensor_inputs=None,
@@ -328,8 +327,6 @@ def check_inputs(
             raise ValueError("`image` has to be of type `torch.Tensor` or `PIL.Image.Image` but is" f" {type(image)}")
         if height % 16 != 0 or width % 16 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
-        if num_frames % 4 != 1:
-            raise ValueError("`num_frames` must be of the form 4 * k + 1, for k >= 0")

         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -557,12 +554,18 @@ def __call__(
             image,
             height,
             width,
-            num_frames,
             prompt_embeds,
             negative_prompt_embeds,
             callback_on_step_end_tensor_inputs,
         )

+        if num_frames % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+            )
+            num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+            num_frames = max(num_frames, 1)
+
         self._guidance_scale = guidance_scale
         self._attention_kwargs = attention_kwargs
         self._current_timestep = None
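
Note on the behavior after PATCH 2/2: an invalid `num_frames` no longer raises in `check_inputs`; instead `__call__` warns and snaps the value to `num_frames // vae_scale_factor_temporal * vae_scale_factor_temporal + 1`, clamped to at least 1. Below is a minimal standalone sketch of that snapping, assuming a temporal scale factor of 4 (the value implied by the `4 * k + 1` check in PATCH 1/2) and a hypothetical helper name `snap_num_frames`; it is not the pipeline code itself.

    # Sketch only; mirrors the rounding added in __call__ by PATCH 2/2.
    def snap_num_frames(num_frames: int, vae_scale_factor_temporal: int = 4) -> int:
        # Valid frame counts are of the form k * scale + 1 (e.g. 1, 5, 9, ... for scale 4).
        if num_frames % vae_scale_factor_temporal != 1:
            num_frames = num_frames // vae_scale_factor_temporal * vae_scale_factor_temporal + 1
            num_frames = max(num_frames, 1)  # never fewer than one frame
        return num_frames

    print(snap_num_frames(81))  # 81, already valid (4 * 20 + 1)
    print(snap_num_frames(83))  # 81, snapped down
    print(snap_num_frames(84))  # 85, exact multiples of 4 snap up by one
    print(snap_num_frames(0))   # 1, clamped by max(..., 1)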