fix AnimateAnyone&OpenSora (#591)
Tsaiyue authored Jul 9, 2024
1 parent 6f7e730 commit 19ee777
Showing 3 changed files with 6 additions and 6 deletions.
4 changes: 2 additions & 2 deletions ppdiffusers/examples/AnimateAnyone/README.md
@@ -16,8 +16,8 @@
# Clone the PaddleMIX repository
git clone https://github.com/PaddlePaddle/PaddleMIX

-# Install paddlepaddle-gpu 2.6.0 (here the CUDA 12.0 build); see https://www.paddlepaddle.org.cn/ to pick the build that matches your environment
-python -m pip install paddlepaddle-gpu==2.6.0.post120 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html
+# Install paddlepaddle-gpu 2.6.1 (here the CUDA 12.0 build); see https://www.paddlepaddle.org.cn/ to pick the build that matches your environment
+python -m pip install paddlepaddle-gpu==2.6.1.post120 -f https://www.paddlepaddle.org.cn/whl/linux/mkl/avx/stable.html

# Enter the ppdiffusers directory
cd PaddleMIX/ppdiffusers
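
To confirm the install before moving on, a minimal check (our sketch, not part of the README; assumes the 2.6.1 GPU build from the commands above):

```python
import paddle

# Paddle's built-in installation check: runs a small program on the device.
paddle.utils.run_check()

print(paddle.__version__)                  # expect 2.6.1
print(paddle.device.cuda.device_count())   # visible CUDA devices; should be >= 1
```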
4 changes: 2 additions & 2 deletions ppdiffusers/examples/Open-Sora/README.md
@@ -39,7 +39,7 @@ tar -xzvf OpenSoraData.tar.gz
```

### 3.2 Single-machine multi-GPU training
-The training script is built on paddlenlp.trainer. The GPU ids used for training can be set via `--gpus`, and on multi-GPU machines group sharding can be enabled via `--sharding` to reduce GPU memory usage.
+The training script is built on paddlenlp.trainer. The GPU ids used for training can be set via `--gpus`, and on multi-GPU machines group sharding can be enabled via `--sharding` to reduce GPU memory usage. On a 32 GB NVIDIA V100 you also need to shrink the model, e.g. reduce the layer count [self.depth](https://github.com/PaddlePaddle/PaddleMIX/blob/develop/ppdiffusers/examples/Open-Sora/models/stdit/stdit2.py#L253) to 1, to cut memory usage enough for the training pipeline to run end to end.
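
A rough, standalone illustration (our sketch, not the project's code) of why cutting the layer count works: the parameters of a transformer stack grow linearly with its depth. The hidden size and head count below are placeholders, not STDiT2's real configuration:

```python
import numpy as np
import paddle

def stack_params(depth: int, hidden: int = 1152, heads: int = 16) -> int:
    # One generic transformer block; STDiT2's blocks differ in detail but
    # scale the same way: total parameters = depth * parameters per block.
    block = paddle.nn.TransformerEncoderLayer(hidden, heads, 4 * hidden)
    per_block = sum(int(np.prod(p.shape)) for p in block.parameters())
    return depth * per_block

print(stack_params(28))  # a full-size stack
print(stack_params(1))   # the reduced stack that fits a 32 GB V100
```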
```bash
ppdiffusers_path=PaddleMIX/ppdiffusers
export PYTHONPATH=$ppdiffusers_path:$PYTHONPATH
@@ -111,7 +111,7 @@ export PYTHONPATH=$ppdiffusers_path:$PYTHONPATH
python scripts/inference-long.py --num-frames 12 --image-size 240 240 --sample-name video_extend --prompt 'A car driving on the ocean.{"reference_path": "./assets/videos/d0_proc.mp4","mask_strategy": "0,0,0,-6,6"}'

# video editing
-python scripts/inference-long.py --num-frames 16 --image-size 256 256 --sample-name edit --prompt 'A cyberpunk-style car at New York city.{"reference_path": "./assets/videos/d0_proc.mp4","mask_strategy": "0,0,0,0,16,0.4"}'
+python scripts/inference-long.py --num-frames 7 --image-size 256 256 --sample-name edit --prompt 'A cyberpunk-style car at New York city.{"reference_path": "./assets/videos/d0_proc.mp4","mask_strategy": "0,0,0,0,7,0.4"}'
```
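
Note that the fifth field of `mask_strategy` tracks `--num-frames` (16 to 7 in the change above). A hypothetical helper to keep the two in sync, assuming Open-Sora's documented six-field order (loop id, reference id, reference start, target start, length, edit ratio); the function is ours, not the repo's:

```python
# Hypothetical helper: build a mask_strategy string whose length field
# matches --num-frames. Field order assumed from Open-Sora's docs.
def mask_strategy(num_frames: int, edit_ratio: float = 0.4) -> str:
    return f"0,0,0,0,{num_frames},{edit_ratio}"

print(mask_strategy(7))  # "0,0,0,0,7,0.4", matching the edited command
```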


@@ -106,8 +106,8 @@ def __init__(
self.attention_head_dim = attention_head_dim
inner_dim = num_attention_heads * attention_head_dim

-conv_cls = paddle.nn.Conv2D if USE_PEFT_BACKEND else LoRACompatibleConv
-linear_cls = paddle.nn.Linear if USE_PEFT_BACKEND else LoRACompatibleLinear
+conv_cls = LoRACompatibleConv if USE_PEFT_BACKEND else paddle.nn.Conv2D
+linear_cls = LoRACompatibleLinear if USE_PEFT_BACKEND else paddle.nn.Linear

# 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
# Define whether input is continuous or discrete depending on configuration
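
For context, a minimal standalone sketch of the selection pattern after this change (class names mirror the diff; `USE_PEFT_BACKEND` is hard-coded here only for the demo):

```python
import paddle

USE_PEFT_BACKEND = True  # demo value; ppdiffusers derives this flag itself

class LoRACompatibleConv(paddle.nn.Conv2D):
    """Stand-in for ppdiffusers' LoRA-aware conv layer."""

# After the commit, the LoRA-compatible layer is picked when the PEFT
# backend is active; otherwise the plain paddle.nn layer is used.
conv_cls = LoRACompatibleConv if USE_PEFT_BACKEND else paddle.nn.Conv2D
conv = conv_cls(in_channels=4, out_channels=8, kernel_size=3)
print(type(conv).__name__)  # LoRACompatibleConv
```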
Expand Down
