From e9fc161e0493a4c686a62b493a698ca69e8d5f89 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Fri, 28 Mar 2025 15:30:09 +0100
Subject: [PATCH 1/2] update

---
 src/diffusers/pipelines/wan/pipeline_wan.py     | 4 ++++
 src/diffusers/pipelines/wan/pipeline_wan_i2v.py | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/src/diffusers/pipelines/wan/pipeline_wan.py b/src/diffusers/pipelines/wan/pipeline_wan.py
index 6fab997e6660..3332347a33d7 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan.py
@@ -265,12 +265,15 @@ def check_inputs(
         negative_prompt,
         height,
         width,
+        num_frames,
         prompt_embeds=None,
         negative_prompt_embeds=None,
         callback_on_step_end_tensor_inputs=None,
     ):
         if height % 16 != 0 or width % 16 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
+        if num_frames % 4 != 1:
+            raise ValueError("`num_frames` must be of the form 4 * k + 1, for k >= 0")

         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -453,6 +456,7 @@ def __call__(
             negative_prompt,
             height,
             width,
+            num_frames,
             prompt_embeds,
             negative_prompt_embeds,
             callback_on_step_end_tensor_inputs,
diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
index df724894c478..6596169a063e 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
@@ -319,6 +319,7 @@ def check_inputs(
         image,
         height,
         width,
+        num_frames,
         prompt_embeds=None,
         negative_prompt_embeds=None,
         callback_on_step_end_tensor_inputs=None,
@@ -327,6 +328,8 @@ def check_inputs(
             raise ValueError("`image` has to be of type `torch.Tensor` or `PIL.Image.Image` but is" f" {type(image)}")
         if height % 16 != 0 or width % 16 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
+        if num_frames % 4 != 1:
+            raise ValueError("`num_frames` must be of the form 4 * k + 1, for k >= 0")

         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -554,6 +557,7 @@ def __call__(
             image,
             height,
             width,
+            num_frames,
             prompt_embeds,
             negative_prompt_embeds,
             callback_on_step_end_tensor_inputs,

From b7ed5153b763cdf45ee92d852aa05095aa0f73e8 Mon Sep 17 00:00:00 2001
From: Aryan
Date: Sat, 29 Mar 2025 11:31:46 +0100
Subject: [PATCH 2/2] raise warning and round to nearest multiple of scale factor

---
 src/diffusers/pipelines/wan/pipeline_wan.py     | 11 +++++++----
 src/diffusers/pipelines/wan/pipeline_wan_i2v.py | 11 +++++++----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/diffusers/pipelines/wan/pipeline_wan.py b/src/diffusers/pipelines/wan/pipeline_wan.py
index 3332347a33d7..3294e9a56a07 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan.py
@@ -265,15 +265,12 @@ def check_inputs(
         negative_prompt,
         height,
         width,
-        num_frames,
         prompt_embeds=None,
         negative_prompt_embeds=None,
         callback_on_step_end_tensor_inputs=None,
     ):
         if height % 16 != 0 or width % 16 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
-        if num_frames % 4 != 1:
-            raise ValueError("`num_frames` must be of the form 4 * k + 1, for k >= 0")

         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -456,12 +453,18 @@ def __call__(
             negative_prompt,
             height,
             width,
-            num_frames,
             prompt_embeds,
             negative_prompt_embeds,
             callback_on_step_end_tensor_inputs,
         )

+        if num_frames % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+            )
+            num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+            num_frames = max(num_frames, 1)
+
         self._guidance_scale = guidance_scale
         self._attention_kwargs = attention_kwargs
         self._current_timestep = None
diff --git a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
index 6596169a063e..fd1d90849a66 100644
--- a/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
+++ b/src/diffusers/pipelines/wan/pipeline_wan_i2v.py
@@ -319,7 +319,6 @@ def check_inputs(
         image,
         height,
         width,
-        num_frames,
         prompt_embeds=None,
         negative_prompt_embeds=None,
         callback_on_step_end_tensor_inputs=None,
@@ -328,8 +327,6 @@ def check_inputs(
             raise ValueError("`image` has to be of type `torch.Tensor` or `PIL.Image.Image` but is" f" {type(image)}")
         if height % 16 != 0 or width % 16 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
-        if num_frames % 4 != 1:
-            raise ValueError("`num_frames` must be of the form 4 * k + 1, for k >= 0")

         if callback_on_step_end_tensor_inputs is not None and not all(
             k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
@@ -557,12 +554,18 @@ def __call__(
             image,
             height,
             width,
-            num_frames,
             prompt_embeds,
             negative_prompt_embeds,
             callback_on_step_end_tensor_inputs,
         )

+        if num_frames % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+            )
+            num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+            num_frames = max(num_frames, 1)
+
         self._guidance_scale = guidance_scale
         self._attention_kwargs = attention_kwargs
         self._current_timestep = None
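
Note on the behavior after PATCH 2/2: an invalid `num_frames` no longer raises in `check_inputs`; instead `__call__` warns and snaps the value to `num_frames // vae_scale_factor_temporal * vae_scale_factor_temporal + 1`, clamped to at least 1. Below is a minimal standalone sketch of that snapping, assuming a temporal scale factor of 4 (the value implied by the `4 * k + 1` check in PATCH 1/2) and a hypothetical helper name `snap_num_frames`; it is not the pipeline code itself.

    # Sketch only; mirrors the rounding added in __call__ by PATCH 2/2.
    def snap_num_frames(num_frames: int, vae_scale_factor_temporal: int = 4) -> int:
        # Valid frame counts are of the form k * scale + 1 (e.g. 1, 5, 9, ... for scale 4).
        if num_frames % vae_scale_factor_temporal != 1:
            num_frames = num_frames // vae_scale_factor_temporal * vae_scale_factor_temporal + 1
            num_frames = max(num_frames, 1)  # never fewer than one frame
        return num_frames

    print(snap_num_frames(81))  # 81, already valid (4 * 20 + 1)
    print(snap_num_frames(83))  # 81, snapped down
    print(snap_num_frames(84))  # 85, exact multiples of 4 snap up by one
    print(snap_num_frames(0))   # 1, clamped by max(..., 1)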