From 3917928abaf36ec12827cd9377bfbfda0c5eda6f Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Thu, 15 Aug 2024 22:48:24 +0800
Subject: [PATCH 1/5] update reference
---
README.md | 13 +++++--------
README_ja.md | 13 +++++--------
README_zh.md | 13 +++++--------
sat/finetune_multi_gpus.sh | 2 +-
4 files changed, 16 insertions(+), 25 deletions(-)
diff --git a/README.md b/README.md
index 05cd6cd6..a9477d7d 100644
--- a/README.md
+++ b/README.md
@@ -219,14 +219,11 @@ hands-on practice on text-to-video generation. *The original input is in Chinese
🌟 If you find our work helpful, please leave us a star and cite our paper.
```
-@misc{yang2024cogvideoxtexttovideodiffusionmodels,
- title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer},
- author={Zhuoyi Yang and Jiayan Teng and Wendi Zheng and Ming Ding and Shiyu Huang and Jiazheng Xu and Yuanming Yang and Wenyi Hong and Xiaohan Zhang and Guanyu Feng and Da Yin and Xiaotao Gu and Yuxuan Zhang and Weihan Wang and Yean Cheng and Ting Liu and Bin Xu and Yuxiao Dong and Jie Tang},
- year={2024},
- eprint={2408.06072},
- archivePrefix={arXiv},
- primaryClass={cs.CV},
- url={https://arxiv.org/abs/2408.06072},
+@article{yang2024cogvideox,
+ title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer},
+ author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others},
+ journal={arXiv preprint arXiv:2408.06072},
+ year={2024}
}
@article{hong2022cogvideo,
title={CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers},
diff --git a/README_ja.md b/README_ja.md
index 01be69f2..1caaf409 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -211,14 +211,11 @@ CogVideoのデモは [https://models.aminer.cn/cogvideo](https://models.aminer.c
🌟 私たちの仕事が役立つと思われた場合、ぜひスターを付けていただき、論文を引用してください。
```
-@misc{yang2024cogvideoxtexttovideodiffusionmodels,
- title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer},
- author={Zhuoyi Yang and Jiayan Teng and Wendi Zheng and Ming Ding and Shiyu Huang and Jiazheng Xu and Yuanming Yang and Wenyi Hong and Xiaohan Zhang and Guanyu Feng and Da Yin and Xiaotao Gu and Yuxuan Zhang and Weihan Wang and Yean Cheng and Ting Liu and Bin Xu and Yuxiao Dong and Jie Tang},
- year={2024},
- eprint={2408.06072},
- archivePrefix={arXiv},
- primaryClass={cs.CV},
- url={https://arxiv.org/abs/2408.06072},
+@article{yang2024cogvideox,
+ title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer},
+ author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others},
+ journal={arXiv preprint arXiv:2408.06072},
+ year={2024}
}
@article{hong2022cogvideo,
title={CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers},
diff --git a/README_zh.md b/README_zh.md
index 0e5515ca..424a705f 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -191,14 +191,11 @@ CogVideo的demo网站在[https://models.aminer.cn/cogvideo](https://models.amine
🌟 如果您发现我们的工作有所帮助,欢迎引用我们的文章,留下宝贵的stars
```
-@misc{yang2024cogvideoxtexttovideodiffusionmodels,
- title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer},
- author={Zhuoyi Yang and Jiayan Teng and Wendi Zheng and Ming Ding and Shiyu Huang and Jiazheng Xu and Yuanming Yang and Wenyi Hong and Xiaohan Zhang and Guanyu Feng and Da Yin and Xiaotao Gu and Yuxuan Zhang and Weihan Wang and Yean Cheng and Ting Liu and Bin Xu and Yuxiao Dong and Jie Tang},
- year={2024},
- eprint={2408.06072},
- archivePrefix={arXiv},
- primaryClass={cs.CV},
- url={https://arxiv.org/abs/2408.06072},
+@article{yang2024cogvideox,
+ title={CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer},
+ author={Yang, Zhuoyi and Teng, Jiayan and Zheng, Wendi and Ding, Ming and Huang, Shiyu and Xu, Jiazheng and Yang, Yuanming and Hong, Wenyi and Zhang, Xiaohan and Feng, Guanyu and others},
+ journal={arXiv preprint arXiv:2408.06072},
+ year={2024}
}
@article{hong2022cogvideo,
title={CogVideo: Large-scale Pretraining for Text-to-Video Generation via Transformers},
diff --git a/sat/finetune_multi_gpus.sh b/sat/finetune_multi_gpus.sh
index d6b6383f..bf1df4af 100644
--- a/sat/finetune_multi_gpus.sh
+++ b/sat/finetune_multi_gpus.sh
@@ -2,7 +2,7 @@
echo "RUN on `hostname`, CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-run_cmd="torchrun --standalone --nproc_per_node=4 train_video.py --base configs/cogvideox_2b_sft.yaml --seed $RANDOM“
+run_cmd="torchrun --standalone --nproc_per_node=4 train_video.py --base configs/cogvideox_2b_sft.yaml --seed $RANDOM"
echo ${run_cmd}
eval ${run_cmd}
From 013961525282388d1b1241337f190e8451d79a3d Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Thu, 15 Aug 2024 23:20:17 +0800
Subject: [PATCH 2/5] update cli demo readme
---
README.md | 6 ++++--
README_ja.md | 10 ++++++++--
README_zh.md | 2 +-
3 files changed, 13 insertions(+), 5 deletions(-)
diff --git a/README.md b/README.md
index a9477d7d..de4fe93c 100644
--- a/README.md
+++ b/README.md
@@ -22,10 +22,12 @@
## Update and News
-- 🔥🔥 **News**: ```2024/8/15```: The `SwissArmyTransformer` dependency in CogVideoX has been upgraded to `0.4.12`. Fine-tuning
+- 🔥🔥 **News**: ```2024/8/15```: The `SwissArmyTransformer` dependency in CogVideoX has been upgraded to `0.4.12`.
+ Fine-tuning
no longer requires installing `SwissArmyTransformer` from source. Additionally, the `Tied VAE` technique has been
applied in the implementation within the `diffusers` library. Please install `diffusers` and `accelerate` libraries
- from source. Inference for CogVideoX now requires only 12GB of VRAM.
+  from source. Inference for CogVideoX now requires only 12GB of VRAM. Existing inference code needs to be updated
+  accordingly; please check [cli_demo](inference/cli_demo.py).
- 🔥 **News**: ```2024/8/12```: The CogVideoX paper has been uploaded to arxiv. Feel free to check out
the [paper](https://arxiv.org/abs/2408.06072).
- 🔥 **News**: ```2024/8/7```: CogVideoX has been integrated into `diffusers` version 0.30.0. Inference can now be
diff --git a/README_ja.md b/README_ja.md
index 1caaf409..d1d25aac 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -21,8 +21,14 @@
## 更新とニュース
-- 🔥🔥 **ニュース**: 2024/8/15: CogVideoX の依存関係である`SwissArmyTransformer`の依存が`0.4.12`にアップグレードされました。これにより、微調整の際に`SwissArmyTransformer`をソースコードからインストールする必要がなくなりました。同時に、`Tied VAE` 技術が `diffusers` ライブラリの実装に適用されました。`diffusers` と `accelerate` ライブラリをソースコードからインストールしてください。CogVdideoX の推論には 12GB の VRAM だけが必要です。
-- 🔥 **ニュース**: ```2024/8/12```: CogVideoX 論文がarxivにアップロードされました。ぜひ[論文](https://arxiv.org/abs/2408.06072)をご覧ください。
+
+- 🔥🔥 **ニュース**: 2024/8/15: CogVideoX の依存関係である`SwissArmyTransformer`の依存が`0.4.12`
+ にアップグレードされました。これにより、微調整の際に`SwissArmyTransformer`
+ をソースコードからインストールする必要がなくなりました。同時に、`Tied VAE` 技術が `diffusers`
+  ライブラリの実装に適用されました。`diffusers` と `accelerate` ライブラリをソースコードからインストールしてください。CogVideoX
+  の推論には 12GB の VRAM だけが必要です。推論コードの修正が必要です。[cli_demo](inference/cli_demo.py)をご確認ください。
+- 🔥 **ニュース**: ```2024/8/12```: CogVideoX
+ 論文がarxivにアップロードされました。ぜひ[論文](https://arxiv.org/abs/2408.06072)をご覧ください。
- 🔥 **ニュース**: ```2024/8/7```: CogVideoX は `diffusers` バージョン 0.30.0 に統合されました。単一の 3090 GPU
で推論を実行できます。詳細については [コード](inference/cli_demo.py) を参照してください。
- 🔥 **ニュース**: ```2024/8/6```: **CogVideoX-2B** で使用される **3D Causal VAE** もオープンソース化しました。これにより、ビデオをほぼ無損失で再構築できます。
diff --git a/README_zh.md b/README_zh.md
index 424a705f..c3dfb0a0 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -25,7 +25,7 @@
- 🔥🔥 **News**: ```2024/8/15```: CogVideoX 依赖中`SwissArmyTransformer`依赖升级到`0.4.12`,
微调不再需要从源代码安装`SwissArmyTransformer`。同时,`Tied VAE` 技术已经被应用到 `diffusers`
- 库中的实现,请从源代码安装 `diffusers` 和 `accelerate` 库,推理 CogVdideoX 仅需 12GB显存。
+  库中的实现,请从源代码安装 `diffusers` 和 `accelerate` 库,推理 CogVideoX 仅需 12GB显存。推理代码需要修改,请查看 [cli_demo](inference/cli_demo.py)
- 🔥 **News**: ```2024/8/12```: CogVideoX 论文已上传到arxiv,欢迎查看[论文](https://arxiv.org/abs/2408.06072)。
- 🔥 **News**: ```2024/8/7```: CogVideoX 已经合并入 `diffusers`
0.30.0版本,单张3090可以推理,详情请见[代码](inference/cli_demo.py)。
From a490c3c8954971765c0a28fb965e850bf5c07710 Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Mon, 19 Aug 2024 16:47:51 +0800
Subject: [PATCH 3/5] update finetune
---
README.md | 2 +-
README_ja.md | 2 +-
README_zh.md | 2 +-
requirements.txt | 4 +-
sat/README.md | 160 +++++++-----------
sat/README_ja.md | 85 +++++++---
sat/README_zh.md | 47 ++++-
...ogvideox_2b_sft.yaml => cogvideox_2b.yaml} | 79 +--------
...x_2b_infer.yaml => cogvideox_2b_lora.yaml} | 26 ++-
sat/configs/inference.yaml | 15 ++
sat/configs/sft.yaml | 65 +++++++
sat/finetune_multi_gpus.sh | 4 +-
sat/finetune_single_gpu.sh | 2 +-
sat/inference.sh | 2 +-
14 files changed, 273 insertions(+), 222 deletions(-)
rename sat/configs/{cogvideox_2b_sft.yaml => cogvideox_2b.yaml} (70%)
rename sat/configs/{cogvideox_2b_infer.yaml => cogvideox_2b_lora.yaml} (91%)
create mode 100644 sat/configs/inference.yaml
create mode 100644 sat/configs/sft.yaml
diff --git a/README.md b/README.md
index de4fe93c..371b5e03 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@
📚 Check here to view Paper
- 👋 Join our WeChat and Discord
+ 👋 Join our WeChat and Discord
📍 Visit 清影 and API Platform to experience larger-scale commercial video generation models.
diff --git a/README_ja.md b/README_ja.md
index d1d25aac..3bcea003 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -14,7 +14,7 @@
📚 論文 をチェック
- 👋 WeChat と Discord に参加
+ 👋 WeChat と Discord に参加
📍 清影 と APIプラットフォーム を訪問して、より大規模な商用ビデオ生成モデルを体験
diff --git a/README_zh.md b/README_zh.md
index c3dfb0a0..2186c1b0 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -15,7 +15,7 @@
📚 查看 论文
- 👋 加入我们的 微信 和 Discord
+ 👋 加入我们的 微信 和 Discord
📍 前往 清影 和 API平台 体验更大规模的商业版视频生成模型。
diff --git a/requirements.txt b/requirements.txt
index 05e7d5c0..6dc6a201 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-diffusers==0.30.0
+git+https://github.com/huggingface/diffusers.git@main#egg=diffusers
transformers==4.44.0
-accelerate==0.33.0
+git+https://github.com/huggingface/accelerate.git@main#egg=accelerate
sentencepiece==0.2.0 # T5
SwissArmyTransformer==0.4.12 # Inference
torch==2.4.0 # Tested in 2.2 2.3 2.4 and 2.5
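
With these pins, `diffusers` and `accelerate` are installed from source rather than from PyPI. For reference, the equivalent direct pip commands (matching the URLs above) are:

```shell
# Install diffusers and accelerate from the main branch, as required by the updated CogVideoX inference path
pip install "git+https://github.com/huggingface/diffusers.git@main#egg=diffusers"
pip install "git+https://github.com/huggingface/accelerate.git@main#egg=accelerate"
```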
diff --git a/sat/README.md b/sat/README.md
index 1bf1be3a..2293dd98 100644
--- a/sat/README.md
+++ b/sat/README.md
@@ -4,7 +4,6 @@
[日本語で読む](./README_ja.md)
-
This folder contains the inference code using [SAT](https://github.com/THUDM/SwissArmyTransformer) weights and the
fine-tuning code for SAT weights.
@@ -69,110 +68,49 @@ loading it into Deepspeed in Finetune.
0 directories, 8 files
```
-3. Modify the file `configs/cogvideox_2b_infer.yaml`.
-
-```yaml
-load: "{your_CogVideoX-2b-sat_path}/transformer" ## Transformer model path
-
-conditioner_config:
- target: sgm.modules.GeneralConditioner
- params:
- emb_models:
- - is_trainable: false
- input_key: txt
- ucg_rate: 0.1
- target: sgm.modules.encoders.modules.FrozenT5Embedder
- params:
- model_dir: "google/t5-v1_1-xxl" ## T5 model path
- max_length: 226
-
-first_stage_config:
- target: sgm.models.autoencoder.VideoAutoencoderInferenceWrapper
- params:
- cp_size: 1
- ckpt_path: "{your_CogVideoX-2b-sat_path}/vae/3d-vae.pt" ## VAE model path
-```
-
-+ If using txt to save multiple prompts, please refer to `configs/test.txt` for modification. One prompt per line. If
- you don't know how to write prompts, you can first use [this code](../inference/convert_demo.py) to call LLM for
- refinement.
-+ If using the command line as input, modify
-
-```yaml
-input_type: cli
-```
-
-so that prompts can be entered from the command line.
-
-If you want to change the output video directory, you can modify:
-
-```yaml
-output_dir: outputs/
-```
-
-The default is saved in the `.outputs/` folder.
-
-4. Run the inference code to start inference
-
-```shell
-bash inference.sh
-```
+Each text file shares the same name as its corresponding video, serving as the label for that video. Videos and labels
+should be matched one-to-one. Generally, a single video should not be associated with multiple labels.
-## Fine-Tuning the Model
+For style fine-tuning, please prepare at least 50 videos and labels with similar styles to ensure proper fitting.
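+
+A quick sanity check before training can catch mismatches (an illustrative shell sketch, not part of the repository; adjust the `videos/` and `labels/` directory names to your dataset layout):
+
+```shell
+# List videos that have no matching label file (assumes videos/*.mp4 paired with labels/*.txt)
+for v in videos/*.mp4; do
+  [ -f "labels/$(basename "$v" .mp4).txt" ] || echo "missing label for $v"
+done
+```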
-### Preparing the Dataset
+### Modifying Configuration Files
-The dataset format should be as follows:
+We support two fine-tuning methods: `Lora` and full-parameter fine-tuning. Please note that both methods only fine-tune
+the `transformer` part and do not modify the `VAE` section. `T5` is used solely as an Encoder. Please modify
+the `configs/sft.yaml` (for full-parameter fine-tuning) file as follows:
```
-.
-├── labels
-│ ├── 1.txt
-│ ├── 2.txt
-│ ├── ...
-└── videos
- ├── 1.mp4
- ├── 2.mp4
- ├── ...
-```
-
-Each txt file should have the same name as its corresponding video file and contain the labels for that video. Each
-video should have a one-to-one correspondence with a label. Typically, a video should not have multiple labels.
-
-For style fine-tuning, please prepare at least 50 videos and labels with similar styles to facilitate fitting.
-
-### Modifying the Configuration File
-
-We support both `Lora` and `full-parameter fine-tuning` methods. Please note that both fine-tuning methods only apply to
-the `transformer` part. The `VAE part` is not modified. `T5` is only used as an Encoder.
-
-the `configs/cogvideox_2b_sft.yaml` (for full fine-tuning) as follows.
-
-```yaml
- # checkpoint_activations: True ## using gradient checkpointing (both checkpoint_activations in the configuration file need to be set to True)
+ # checkpoint_activations: True ## Using gradient checkpointing (Both checkpoint_activations in the config file need to be set to True)
model_parallel_size: 1 # Model parallel size
- experiment_name: lora-disney # Experiment name (do not change)
- mode: finetune # Mode (do not change)
- load: "{your_CogVideoX-2b-sat_path}/transformer" # Transformer model path
- no_load_rng: True # Whether to load the random seed
- train_iters: 1000 # Number of training iterations
- eval_iters: 1 # Number of evaluation iterations
- eval_interval: 100 # Evaluation interval
- eval_batch_size: 1 # Batch size for evaluation
+ experiment_name: lora-disney # Experiment name (do not modify)
+ mode: finetune # Mode (do not modify)
+ load: "{your_CogVideoX-2b-sat_path}/transformer" ## Transformer model path
+ no_load_rng: True # Whether to load random seed
+ train_iters: 1000 # Training iterations
+ eval_iters: 1 # Evaluation iterations
+ eval_interval: 100 # Evaluation interval
+ eval_batch_size: 1 # Evaluation batch size
save: ckpts # Model save path
save_interval: 100 # Model save interval
log_interval: 20 # Log output interval
train_data: [ "your train data path" ]
- valid_data: [ "your val data path" ] # Training and validation sets can be the same
- split: 1,0,0 # Ratio of training, validation, and test sets
- num_workers: 8 # Number of worker threads for data loading
- force_train: True # Allow missing keys when loading ckpt (refer to T5 and VAE which are loaded independently)
- only_log_video_latents: True # Avoid using VAE decoder when eval to save memory
+ valid_data: [ "your val data path" ] # Training and validation datasets can be the same
+ split: 1,0,0 # Training, validation, and test set ratio
+ num_workers: 8 # Number of worker threads for data loader
+ force_train: True # Allow missing keys when loading checkpoint (T5 and VAE are loaded separately)
+ only_log_video_latents: True # Avoid memory overhead caused by VAE decode
+ deepspeed:
+ bf16:
+ enabled: False # For CogVideoX-2B set to False and for CogVideoX-5B set to True
+ fp16:
+ enabled: True # For CogVideoX-2B set to True and for CogVideoX-5B set to False
```
-If you wish to use Lora fine-tuning, you also need to modify:
+If you wish to use Lora fine-tuning, you also need to modify the corresponding `cogvideox_<model_parameters>_lora` file:
+
+Here, take `CogVideoX-2B` as a reference:
-```yaml
+```
model:
scale_factor: 1.15258426
disable_first_stage_autocast: true
@@ -186,15 +124,47 @@ model:
r: 256
```
-### Fine-Tuning and Validation
+### Modifying Run Scripts
-1. Run the inference code to start fine-tuning.
+Edit `finetune_single_gpu.sh` or `finetune_multi_gpus.sh` to select the configuration file. Below are two examples:
-```shell
+1. If you want to use the `CogVideoX-2B` model and the `Lora` method, you need to modify `finetune_single_gpu.sh`
+ or `finetune_multi_gpus.sh`:
+
+```
+run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b_lora.yaml configs/sft.yaml --seed $RANDOM"
+```
+
+2. If you want to use the `CogVideoX-2B` model and the `full-parameter fine-tuning` method, you need to
+ modify `finetune_single_gpu.sh` or `finetune_multi_gpus.sh`:
+
+```
+run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b.yaml configs/sft.yaml --seed $RANDOM"
+```
+
+### Fine-Tuning and Evaluation
+
+Run the inference code to start fine-tuning.
+
+```
bash finetune_single_gpu.sh # Single GPU
bash finetune_multi_gpus.sh # Multi GPUs
```
+### Using the Fine-Tuned Model
+
+The fine-tuned weights cannot be merged into the base model; instead, modify the inference script `inference.sh` as follows:
+
+```
+run_cmd="$environs python sample_video.py --base configs/cogvideox__lora.yaml configs/inference.yaml --seed 42"
+```
+
+Then, execute the code:
+
+```
+bash inference.sh
+```
+
### Converting to Huggingface Diffusers Supported Weights
The SAT weight format is different from Huggingface's weight format and needs to be converted. Please run:
diff --git a/sat/README_ja.md b/sat/README_ja.md
index 957b89e1..5c0e8526 100644
--- a/sat/README_ja.md
+++ b/sat/README_ja.md
@@ -140,57 +140,94 @@ bash inference.sh
### 設定ファイルの変更
-`Lora` と
-全パラメータファインチューニングの2つの方法をサポートしています。これらのファインチューニング方法は `transformer`
-部分にのみ適用されます。`VAE` 部分は変更されません。`T5` はエンコーダーとしてのみ使用されます。
+`Lora` とフルパラメータ微調整の2つの方法をサポートしています。両方の微調整方法は、`transformer` 部分のみを微調整し、`VAE`
+部分には変更を加えないことに注意してください。`T5` はエンコーダーとしてのみ使用されます。以下のように `configs/sft.yaml` (
+フルパラメータ微調整用) ファイルを変更してください。
-`configs/cogvideox_2b_sft.yaml` (全量ファインチューニング用) を次のように変更します。
-
-```yaml
- # checkpoint_activations: True ## using gradient checkpointing (設定ファイル内の2つのcheckpoint_activationsを両方Trueに設定する必要があります)
+```
+ # checkpoint_activations: True ## 勾配チェックポイントを使用する場合 (設定ファイル内の2つの checkpoint_activations を True に設定する必要があります)
model_parallel_size: 1 # モデル並列サイズ
experiment_name: lora-disney # 実験名 (変更しないでください)
mode: finetune # モード (変更しないでください)
- load: "{your_CogVideoX-2b-sat_path}/transformer" # Transformer モデルパス
- no_load_rng: True # ランダムシードをロードするかどうか
+ load: "{your_CogVideoX-2b-sat_path}/transformer" ## Transformer モデルのパス
+ no_load_rng: True # 乱数シードを読み込むかどうか
train_iters: 1000 # トレーニングイテレーション数
eval_iters: 1 # 評価イテレーション数
- eval_interval: 100 # 評価間隔
- eval_batch_size: 1 # 評価のバッチサイズ
+ eval_interval: 100 # 評価間隔
+ eval_batch_size: 1 # 評価バッチサイズ
save: ckpts # モデル保存パス
save_interval: 100 # モデル保存間隔
log_interval: 20 # ログ出力間隔
train_data: [ "your train data path" ]
- valid_data: [ "your val data path" ] # トレーニングセットと検証セットは同じでもかまいません
- split: 1,0,0 # トレーニングセット、検証セット、テストセットの比率
+ valid_data: [ "your val data path" ] # トレーニングデータと評価データは同じでも構いません
+ split: 1,0,0 # トレーニングセット、評価セット、テストセットの割合
num_workers: 8 # データローダーのワーカースレッド数
- force_train: True # ckpt をロードする際に missing keys を許可するかどうか (T5 と VAE は独立してロードされます)
- only_log_video_latents: True # VAE デコーダーを使用しないようにしてメモリを節約します
+ force_train: True # チェックポイントをロードするときに欠落したキーを許可 (T5 と VAE は別々にロードされます)
+ only_log_video_latents: True # VAE のデコードによるメモリオーバーヘッドを回避
+ deepspeed:
+ bf16:
+ enabled: False # CogVideoX-2B の場合は False に設定し、CogVideoX-5B の場合は True に設定
+ fp16:
+ enabled: True # CogVideoX-2B の場合は True に設定し、CogVideoX-5B の場合は False に設定
```
-Lora ファインチューニングを使用する場合は、次のように変更する必要があります:
+Lora 微調整を使用したい場合は、対応する `cogvideox_<モデルパラメータ>_lora` ファイルも変更する必要があります。
-```yaml
+ここでは、`CogVideoX-2B` を参考にします。
+
+```
model:
scale_factor: 1.15258426
disable_first_stage_autocast: true
- not_trainable_prefixes: [ 'all' ] ## コメント解除
+ not_trainable_prefixes: [ 'all' ] ## コメントを解除
log_keys:
- txt'
- lora_config: ## コメント解除
+ lora_config: ## コメントを解除
target: sat.model.finetune.lora2.LoraMixin
params:
r: 256
```
-### ファインチューニングと検証
+### 実行スクリプトの変更
-1. 推論コードを実行してファインチューニングを開始します。
+設定ファイルを選択するために `finetune_single_gpu.sh` または `finetune_multi_gpus.sh` を編集します。以下に2つの例を示します。
-```shell
-bash finetune_single_gpu.sh # Single GPU
-bash finetune_multi_gpus.sh # Multi GPUs
+1. `CogVideoX-2B` モデルを使用し、`Lora` 手法を利用する場合は、`finetune_single_gpu.sh` または `finetune_multi_gpus.sh`
+ を変更する必要があります。
+
+```
+run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b_lora.yaml configs/sft.yaml --seed $RANDOM"
+```
+
+2. `CogVideoX-2B` モデルを使用し、`フルパラメータ微調整` 手法を利用する場合は、`finetune_single_gpu.sh`
+ または `finetune_multi_gpus.sh` を変更する必要があります。
+
+```
+run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b.yaml configs/sft.yaml --seed $RANDOM"
+```
+
+### 微調整と評価
+
+推論コードを実行して微調整を開始します。
+
+```
+bash finetune_single_gpu.sh # シングルGPU
+bash finetune_multi_gpus.sh # マルチGPU
+```
+
+### 微調整後のモデルの使用
+
+微調整されたモデルは統合できません。ここでは、推論設定ファイル `inference.sh` を変更する方法を示します。
+
+```
+run_cmd="$environs python sample_video.py --base configs/cogvideox__lora.yaml configs/inference.yaml --seed 42"
+```
+
+その後、次のコードを実行します。
+
+```
+bash inference.sh
```
### Huggingface Diffusers サポートのウェイトに変換
diff --git a/sat/README_zh.md b/sat/README_zh.md
index b90da30b..807b1333 100644
--- a/sat/README_zh.md
+++ b/sat/README_zh.md
@@ -50,7 +50,9 @@ git clone https://huggingface.co/THUDM/CogVideoX-2b.git
mkdir t5-v1_1-xxl
mv CogVideoX-2b/text_encoder/* CogVideoX-2b/tokenizer/* t5-v1_1-xxl
```
+
通过上述方案,你将会得到一个 safetensor 格式的T5文件,确保在 Deepspeed微调过程中读入的时候不会报错。
+
```
├── added_tokens.json
├── config.json
@@ -63,6 +65,7 @@ mv CogVideoX-2b/text_encoder/* CogVideoX-2b/tokenizer/* t5-v1_1-xxl
0 directories, 8 files
```
+
3. 修改`configs/cogvideox_2b_infer.yaml`中的文件。
```yaml
@@ -138,7 +141,7 @@ bash inference.sh
我们支持 `Lora` 和 全参数微调两种方式。请注意,两种微调方式都仅仅对 `transformer` 部分进行微调。不改动 `VAE` 部分。`T5`仅作为
Encoder 使用。
-部分。 请按照以下方式修改`configs/cogvideox_2b_sft.yaml`(全量微调) 中的文件。
+部分。 请按照以下方式修改`configs/sft.yaml`(全量微调) 中的文件。
```yaml
# checkpoint_activations: True ## using gradient checkpointing (配置文件中的两个checkpoint_activations都需要设置为True)
@@ -160,9 +163,16 @@ Encoder 使用。
num_workers: 8 # 数据加载器的工作线程数
force_train: True # 在加载checkpoint时允许missing keys (T5 和 VAE 单独加载)
only_log_video_latents: True # 避免VAE decode带来的显存开销
+ deepspeed:
+ bf16:
+      enabled: False # Set to False for CogVideoX-2B and True for CogVideoX-5B
+    fp16:
+      enabled: True # Set to True for CogVideoX-2B and False for CogVideoX-5B
```
-如果你希望使用 Lora 微调,你还需要修改:
+如果你希望使用 Lora 微调,你还需要修改`cogvideox_<模型参数>_lora` 文件:
+
+这里以 `CogVideoX-2B` 为参考:
```yaml
model:
@@ -178,15 +188,46 @@ model:
r: 256
```
+### 修改运行脚本
+
+编辑`finetune_single_gpu.sh` 或者 `finetune_multi_gpus.sh`,选择配置文件。下面是两个例子:
+
+1. 如果您想使用 `CogVideoX-2B` 模型并使用`Lora`方案,您需要修改`finetune_single_gpu.sh` 或者 `finetune_multi_gpus.sh`:
+
+```
+run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b_lora.yaml configs/sft.yaml --seed $RANDOM"
+```
+
+2. 如果您想使用 `CogVideoX-2B` 模型并使用`全量微调`方案,您需要修改`finetune_single_gpu.sh`
+ 或者 `finetune_multi_gpus.sh`:
+
+```
+run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b.yaml configs/sft.yaml --seed $RANDOM"
+```
+
### 微调和验证
-1. 运行推理代码,即可开始微调。
+运行推理代码,即可开始微调。
```shell
bash finetune_single_gpu.sh # Single GPU
bash finetune_multi_gpus.sh # Multi GPUs
```
+### 使用微调后的模型
+
+微调后的模型无法合并,这里展现了如何修改推理配置文件 `inference.sh`
+
+```
+run_cmd="$environs python sample_video.py --base configs/cogvideox_<模型参数>_lora.yaml configs/inference.yaml --seed 42"
+```
+
+然后,执行代码:
+
+```
+bash inference.sh
+```
+
### 转换到 Huggingface Diffusers 库支持的权重
SAT 权重格式与 Huggingface 的权重格式不同,需要转换。请运行
diff --git a/sat/configs/cogvideox_2b_sft.yaml b/sat/configs/cogvideox_2b.yaml
similarity index 70%
rename from sat/configs/cogvideox_2b_sft.yaml
rename to sat/configs/cogvideox_2b.yaml
index 1cac09b0..f142b627 100644
--- a/sat/configs/cogvideox_2b_sft.yaml
+++ b/sat/configs/cogvideox_2b.yaml
@@ -1,75 +1,9 @@
-args:
- checkpoint_activations: True ## using gradient checkpointing
- model_parallel_size: 1
- experiment_name: lora-disney
- mode: finetune
- load: "CogVideoX-2b-sat/transformer"
- no_load_rng: True
- train_iters: 1000
- eval_iters: 1
- eval_interval: 100
- eval_batch_size: 1
- save: ckpts
- save_interval: 100
- log_interval: 20
- train_data: ["disney"]
- valid_data: ["disney"]
- split: 1,0,0
- num_workers: 8
- force_train: True
- only_log_video_latents: True
-
-data:
- target: data_video.SFTDataset
- params:
- video_size: [480, 720]
- fps: 8
- max_num_frames: 49
- skip_frms_num: 3.
-
-deepspeed:
- train_micro_batch_size_per_gpu: 1
- gradient_accumulation_steps: 1
- steps_per_print: 50
- gradient_clipping: 0.1
- zero_optimization:
- stage: 2
- cpu_offload: false
- contiguous_gradients: false
- overlap_comm: true
- reduce_scatter: true
- reduce_bucket_size: 1000000000
- allgather_bucket_size: 1000000000
- load_from_fp32_weights: false
- zero_allow_untested_optimizer: true
- bf16:
- enabled: False
- fp16:
- enabled: True
- loss_scale: 0
- loss_scale_window: 400
- hysteresis: 2
- min_loss_scale: 1
- optimizer:
- type: sat.ops.FusedEmaAdam
- params:
- lr: 0.0002
- betas: [0.9, 0.95]
- eps: 1e-8
- weight_decay: 1e-4
- activation_checkpointing:
- partition_activations: false
- contiguous_memory_optimization: false
- wall_clock_breakdown: false
-
-
model:
scale_factor: 1.15258426
disable_first_stage_autocast: true
- not_trainable_prefixes: ['all'] ## Using Lora
log_keys:
- txt
-
+
denoiser_config:
target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
params:
@@ -119,11 +53,6 @@ model:
height_interpolation: 1.875
width_interpolation: 1.875
- lora_config: ## Using Lora
- target: sat.model.finetune.lora2.LoraMixin
- params:
- r: 128
-
patch_embed_config:
target: dit_video_concat.ImagePatchEmbeddingMixin
params:
@@ -146,14 +75,14 @@ model:
ucg_rate: 0.1
target: sgm.modules.encoders.modules.FrozenT5Embedder
params:
- model_dir: "google/t5-v1_1-xxl"
+ model_dir: "t5-v1_1-xxl"
max_length: 226
first_stage_config:
target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
params:
cp_size: 1
- ckpt_path: "CogVideoX-2b-sat/vae/3d-vae.pt"
+ ckpt_path: "cogvideox-2b-sat/vae/3d-vae.pt"
ignore_keys: [ 'loss' ]
loss_config:
@@ -190,7 +119,7 @@ model:
attn_resolutions: [ ]
num_res_blocks: 3
dropout: 0.0
- gather_norm: false
+ gather_norm: False
loss_fn_config:
target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
diff --git a/sat/configs/cogvideox_2b_infer.yaml b/sat/configs/cogvideox_2b_lora.yaml
similarity index 91%
rename from sat/configs/cogvideox_2b_infer.yaml
rename to sat/configs/cogvideox_2b_lora.yaml
index adf9de21..af04479a 100644
--- a/sat/configs/cogvideox_2b_infer.yaml
+++ b/sat/configs/cogvideox_2b_lora.yaml
@@ -1,19 +1,7 @@
-args:
- latent_channels: 16
- mode: inference
- load: "CogVideoX-2b-sat/transformer"
- batch_size: 1
- input_type: txt
- input_file: test.txt
- sampling_num_frames: 13 # Must be 13, 11 or 9
- sampling_fps: 8
- fp16: True
- output_dir: outputs/
- force_inference: True
-
model:
scale_factor: 1.15258426
disable_first_stage_autocast: true
+ not_trainable_prefixes: ['all'] ## Using Lora
log_keys:
- txt
@@ -50,6 +38,7 @@ model:
num_attention_heads: 30
transformer_args:
+ checkpoint_activations: True ## using gradient checkpointing
vocab_size: 1
max_sequence_length: 64
layernorm_order: pre
@@ -65,6 +54,11 @@ model:
height_interpolation: 1.875
width_interpolation: 1.875
+ lora_config:
+ target: sat.model.finetune.lora2.LoraMixin
+ params:
+ r: 128
+
patch_embed_config:
target: dit_video_concat.ImagePatchEmbeddingMixin
params:
@@ -87,14 +81,14 @@ model:
ucg_rate: 0.1
target: sgm.modules.encoders.modules.FrozenT5Embedder
params:
- model_dir: "google/t5-v1_1-xxl"
+ model_dir: "t5-v1_1-xxl"
max_length: 226
first_stage_config:
target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper
params:
cp_size: 1
- ckpt_path: "CogVideoX-2b-sat/vae/3d-vae.pt"
+ ckpt_path: "cogvideox-2b-sat/vae/3d-vae.pt"
ignore_keys: [ 'loss' ]
loss_config:
@@ -131,7 +125,7 @@ model:
attn_resolutions: [ ]
num_res_blocks: 3
dropout: 0.0
- gather_norm: false
+ gather_norm: False
loss_fn_config:
target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss
diff --git a/sat/configs/inference.yaml b/sat/configs/inference.yaml
new file mode 100644
index 00000000..bf90d34e
--- /dev/null
+++ b/sat/configs/inference.yaml
@@ -0,0 +1,15 @@
+args:
+ latent_channels: 16
+ mode: inference
+ # load: "{your_CogVideoX-2b-sat_path}/transformer" # This is for Full model without lora adapter
+  # load: "{your lora folder} such as zRzRzRzRzRzRzR/lora-disney-08-20-13-28" # This is for the model fine-tuned with a LoRA adapter
+
+ batch_size: 1
+ input_type: txt
+ input_file: configs/test.txt
+ sampling_num_frames: 13 # Must be 13, 11 or 9
+ sampling_fps: 8
+ fp16: True # For CogVideoX-2B
+# bf16: True # For CogVideoX-5B
+ output_dir: outputs/
+ force_inference: True
\ No newline at end of file
diff --git a/sat/configs/sft.yaml b/sat/configs/sft.yaml
new file mode 100644
index 00000000..bbdf1a78
--- /dev/null
+++ b/sat/configs/sft.yaml
@@ -0,0 +1,65 @@
+args:
+ checkpoint_activations: True ## using gradient checkpointing
+ model_parallel_size: 1
+ experiment_name: lora-disney
+ mode: finetune
+ load: "cogvideox-2b-sat/transformer"
+ no_load_rng: True
+  train_iters: 1000 # Suggested: more than 1000 for LoRA; for SFT, 500 is enough
+ eval_iters: 1
+ eval_interval: 100
+ eval_batch_size: 1
+ save: ckpts_2b_lora
+ save_interval: 500
+ log_interval: 20
+ train_data: [ "disney" ] # Train data path
+ valid_data: [ "disney" ] # Validation data path, can be the same as train_data(not recommended)
+ split: 1,0,0
+ num_workers: 8
+ force_train: True
+ only_log_video_latents: True
+
+data:
+ target: data_video.SFTDataset
+ params:
+ video_size: [ 480, 720 ]
+ fps: 8
+ max_num_frames: 49
+ skip_frms_num: 3.
+
+deepspeed:
+  # Minimum of 16 videos per global batch across ALL GPUs (e.g. 8 GPUs x micro-batch 2 x gradient accumulation 1); this setting targets 8 x A100 GPUs
+ train_micro_batch_size_per_gpu: 2
+ gradient_accumulation_steps: 1
+ steps_per_print: 50
+ gradient_clipping: 0.1
+ zero_optimization:
+ stage: 2
+ cpu_offload: false
+ contiguous_gradients: false
+ overlap_comm: true
+ reduce_scatter: true
+ reduce_bucket_size: 1000000000
+ allgather_bucket_size: 1000000000
+ load_from_fp32_weights: false
+ zero_allow_untested_optimizer: true
+  bf16:
+    enabled: False # Set to False for CogVideoX-2B and True for CogVideoX-5B
+  fp16:
+    enabled: True # Set to True for CogVideoX-2B and False for CogVideoX-5B
+ loss_scale: 0
+ loss_scale_window: 400
+ hysteresis: 2
+ min_loss_scale: 1
+
+ optimizer:
+ type: sat.ops.FusedEmaAdam
+ params:
+      lr: 0.001 # Between 1E-3 and 5E-4 for LoRA; use 1E-5 for SFT
+ betas: [ 0.9, 0.95 ]
+ eps: 1e-8
+ weight_decay: 1e-4
+ activation_checkpointing:
+ partition_activations: false
+ contiguous_memory_optimization: false
+ wall_clock_breakdown: false
\ No newline at end of file
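
This file is not used on its own; the finetune scripts pass it to `train_video.py` together with a model config. For example, `finetune_multi_gpus.sh` launches LoRA fine-tuning as:

```shell
# sft.yaml supplies the training and deepspeed arguments; the model config (cogvideox_2b.yaml or cogvideox_2b_lora.yaml) supplies the model definition
run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b_lora.yaml configs/sft.yaml --seed $RANDOM"
echo ${run_cmd}
eval ${run_cmd}
```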
diff --git a/sat/finetune_multi_gpus.sh b/sat/finetune_multi_gpus.sh
index bf1df4af..ef56701c 100644
--- a/sat/finetune_multi_gpus.sh
+++ b/sat/finetune_multi_gpus.sh
@@ -1,8 +1,8 @@
#! /bin/bash
-echo "RUN on `hostname`, CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
+echo "RUN on $(hostname), CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
-run_cmd="torchrun --standalone --nproc_per_node=4 train_video.py --base configs/cogvideox_2b_sft.yaml --seed $RANDOM"
+run_cmd="torchrun --standalone --nproc_per_node=8 train_video.py --base configs/cogvideox_2b_lora.yaml configs/sft.yaml --seed $RANDOM"
echo ${run_cmd}
eval ${run_cmd}
diff --git a/sat/finetune_single_gpu.sh b/sat/finetune_single_gpu.sh
index da312478..13591720 100644
--- a/sat/finetune_single_gpu.sh
+++ b/sat/finetune_single_gpu.sh
@@ -4,7 +4,7 @@ echo "RUN on `hostname`, CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
environs="WORLD_SIZE=1 RANK=0 LOCAL_RANK=0 LOCAL_WORLD_SIZE=1"
-run_cmd="$environs python train_video.py --base configs/cogvideox_2b_sft.yaml --seed $RANDOM"
+run_cmd="$environs python train_video.py --base configs/cogvideox_2b_lora.yaml configs/sft.yaml --seed $RANDOM"
echo ${run_cmd}
eval ${run_cmd}
diff --git a/sat/inference.sh b/sat/inference.sh
index 8b446eee..11c50a60 100755
--- a/sat/inference.sh
+++ b/sat/inference.sh
@@ -4,7 +4,7 @@ echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
environs="WORLD_SIZE=1 RANK=0 LOCAL_RANK=0 LOCAL_WORLD_SIZE=1"
-run_cmd="$environs python sample_video.py --base configs/cogvideox_2b_infer.yaml"
+run_cmd="$environs python sample_video.py --base configs/cogvideox_2b.yaml configs/inference.yaml --seed $RANDOM"
echo ${run_cmd}
eval ${run_cmd}
From 17e6ed86853b9a1e5403fdb7dcb4ecf863415d05 Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Tue, 20 Aug 2024 15:43:49 +0800
Subject: [PATCH 4/5] upload venhancer
---
README.md | 8 +--
README_ja.md | 5 +-
README_zh.md | 7 ++-
tools/venhancer/README.md | 98 ++++++++++++++++++++++++++++++++++
tools/venhancer/README_ja.md | 91 +++++++++++++++++++++++++++++++
tools/venhancer/README_zh.md | 100 +++++++++++++++++++++++++++++++++++
6 files changed, 303 insertions(+), 6 deletions(-)
create mode 100644 tools/venhancer/README.md
create mode 100644 tools/venhancer/README_ja.md
create mode 100644 tools/venhancer/README_zh.md
diff --git a/README.md b/README.md
index 371b5e03..97db40bf 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,10 @@
## Update and News
-- 🔥🔥 **News**: ```2024/8/15```: The `SwissArmyTransformer` dependency in CogVideoX has been upgraded to `0.4.12`.
+- 🔥🔥 **News**: ```2024/8/20```: [VEnhancer](https://github.com/Vchitect/VEnhancer) now supports enhancing videos generated by
+ CogVideoX, achieving higher resolution and higher quality video rendering. We welcome you to try it out by following
+  the [tutorial](tools/venhancer/README.md).
+- 🔥 **News**: ```2024/8/15```: The `SwissArmyTransformer` dependency in CogVideoX has been upgraded to `0.4.12`.
Fine-tuning
no longer requires installing `SwissArmyTransformer` from source. Additionally, the `Tied VAE` technique has been
applied in the implementation within the `diffusers` library. Please install `diffusers` and `accelerate` libraries
@@ -34,8 +37,7 @@
performed
on a single 3090 GPU. For more details, please refer to the [code](inference/cli_demo.py).
- 🔥 **News**: ```2024/8/6```: We have also open-sourced **3D Causal VAE** used in **CogVideoX-2B**, which can
- reconstruct
- the video almost losslessly.
+ reconstruct the video almost losslessly.
- 🔥 **News**: ```2024/8/6```: We have open-sourced **CogVideoX-2B**,the first model in the CogVideoX series of video
generation models.
- 🌱 **Source**: ```2022/5/19```: We have open-sourced **CogVideo** (now you can see in `CogVideo` branch),the **first**
diff --git a/README_ja.md b/README_ja.md
index 3bcea003..27bf1423 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -22,7 +22,10 @@
## 更新とニュース
-- 🔥🔥 **ニュース**: 2024/8/15: CogVideoX の依存関係である`SwissArmyTransformer`の依存が`0.4.12`
+- 🔥🔥 **ニュース**: ```2024/8/20```: [VEnhancer](https://github.com/Vchitect/VEnhancer) は CogVideoX
+ が生成したビデオの強化をサポートしました。より高い解像度とより高品質なビデオレンダリングを実現します。[チュートリアル](tools/venhancer/README_ja.md)
+ に従って、ぜひお試しください。
+- 🔥**ニュース**: 2024/8/15: CogVideoX の依存関係である`SwissArmyTransformer`の依存が`0.4.12`
にアップグレードされました。これにより、微調整の際に`SwissArmyTransformer`
をソースコードからインストールする必要がなくなりました。同時に、`Tied VAE` 技術が `diffusers`
  ライブラリの実装に適用されました。`diffusers` と `accelerate` ライブラリをソースコードからインストールしてください。CogVideoX
diff --git a/README_zh.md b/README_zh.md
index 2186c1b0..a5a2bc00 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -23,9 +23,12 @@
## 项目更新
-- 🔥🔥 **News**: ```2024/8/15```: CogVideoX 依赖中`SwissArmyTransformer`依赖升级到`0.4.12`,
+- 🔥🔥**News**: ```2024/8/20```: [VEnhancer](https://github.com/Vchitect/VEnhancer) 已经支持对 CogVideoX
+ 生成的视频进行增强,实现更高分辨率,更高质量的视频渲染。欢迎大家按照[教程](tools/venhancer/README_zh.md)体验使用。
+- 🔥**News**: ```2024/8/15```: CogVideoX 依赖中`SwissArmyTransformer`依赖升级到`0.4.12`,
微调不再需要从源代码安装`SwissArmyTransformer`。同时,`Tied VAE` 技术已经被应用到 `diffusers`
- 库中的实现,请从源代码安装 `diffusers` 和 `accelerate` 库,推理 CogVdideoX 仅需 12GB显存。推理代码需要修改,请查看 [cli_demo](inference/cli_demo.py)
+  库中的实现,请从源代码安装 `diffusers` 和 `accelerate` 库,推理 CogVideoX 仅需
+ 12GB显存。推理代码需要修改,请查看 [cli_demo](inference/cli_demo.py)
- 🔥 **News**: ```2024/8/12```: CogVideoX 论文已上传到arxiv,欢迎查看[论文](https://arxiv.org/abs/2408.06072)。
- 🔥 **News**: ```2024/8/7```: CogVideoX 已经合并入 `diffusers`
0.30.0版本,单张3090可以推理,详情请见[代码](inference/cli_demo.py)。
diff --git a/tools/venhancer/README.md b/tools/venhancer/README.md
new file mode 100644
index 00000000..b5f947e6
--- /dev/null
+++ b/tools/venhancer/README.md
@@ -0,0 +1,98 @@
+# Enhance CogVideoX Generated Videos with VEnhancer
+
+This tutorial will guide you through using the VEnhancer tool to enhance videos generated by CogVideoX, including
+achieving higher frame rates and higher resolutions.
+
+## Model Introduction
+
+VEnhancer implements spatial super-resolution, temporal super-resolution (frame interpolation), and video refinement in
+a unified framework. It can flexibly adapt to different upsampling factors (e.g., 1x~8x) for spatial or temporal
+super-resolution. Additionally, it provides flexible control to modify the refinement strength, enabling it to handle
+diverse video artifacts.
+
+VEnhancer follows the design of ControlNet, copying the architecture and weights of the multi-frame encoder and middle
+block from a pre-trained video diffusion model to build a trainable conditional network. This video ControlNet accepts
+low-resolution keyframes and noisy full-frame latents as inputs. In addition to the time step t and prompt, our proposed
+video-aware conditioning also includes noise augmentation level σ and downscaling factor s as additional network
+conditioning inputs.
+
+## Hardware Requirements
+
++ Operating System: Linux (requires xformers dependency)
++ Hardware: NVIDIA GPU with at least 60GB of VRAM per card. Machines such as H100, A100 are recommended.
+
+## Quick Start
+
+1. Clone the repository and install dependencies as per the official instructions:
+
+```shell
+git clone https://github.com/Vchitect/VEnhancer.git
+cd VEnhancer
+## Torch and other dependencies can use those from CogVideoX. If you need to create a new environment, use the following commands:
+pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2
+
+## Install required dependencies
+pip install -r requirements.txt
+```
+
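+2. Run the code (this is the same invocation given in the Japanese and Chinese versions of this tutorial):
+
+```shell
+python enhance_a_video.py --up_scale 4 --target_fps 24 --noise_aug 250 --solver_mode 'fast' --steps 15 --input_path inputs/000000.mp4 --prompt 'Wide-angle aerial shot at dawn, soft morning light casting long shadows, an elderly man walking his dog through a quiet, foggy park, trees and benches in the background, peaceful and serene atmosphere' --save_dir 'results/'
+```
+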
+Where:
+
+- `input_path` is the path to the input video
+- `prompt` is the description of the video content. The prompt used by this tool should be shorter, not exceeding 77
+ words. You may need to simplify the prompt used for generating the CogVideoX video.
+- `up_scale` is the upsampling factor, which can be set to 2, 4, or 8
+- `target_fps` is the target frame rate for the video. Typically, 16 fps is already smooth, with 24 fps as the default
+ value.
+- `noise_aug` controls the strength of noise augmentation, typically set to 250
+- `steps` indicates the number of optimization steps, usually set to 15. If you want faster model generation, you can
+ reduce this number, but the quality will significantly decrease.
+
+The code will automatically download the required models from Hugging Face during execution.
+
+Typical runtime logs are as follows:
+
+```shell
+/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/xformers/ops/fmha/flash.py:211: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch.
+ @torch.library.impl_abstract("xformers_flash::flash_fwd")
+/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/xformers/ops/fmha/flash.py:344: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch.
+ @torch.library.impl_abstract("xformers_flash::flash_bwd")
+2024-08-20 13:25:17,553 - video_to_video - INFO - checkpoint_path: ./ckpts/venhancer_paper.pt
+/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/open_clip/factory.py:88: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+ checkpoint = torch.load(checkpoint_path, map_location=map_location)
+2024-08-20 13:25:37,486 - video_to_video - INFO - Build encoder with FrozenOpenCLIPEmbedder
+/share/home/zyx/Code/VEnhancer/video_to_video/video_to_video_model.py:35: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+ load_dict = torch.load(cfg.model_path, map_location='cpu')
+2024-08-20 13:25:55,391 - video_to_video - INFO - Load model path ./ckpts/venhancer_paper.pt, with local status
+2024-08-20 13:25:55,392 - video_to_video - INFO - Build diffusion with GaussianDiffusion
+2024-08-20 13:26:16,092 - video_to_video - INFO - input video path: inputs/000000.mp4
+2024-08-20 13:26:16,093 - video_to_video - INFO - text: Wide-angle aerial shot at dawn,soft morning light casting long shadows,an elderly man walking his dog through a quiet,foggy park,trees and benches in the background,peaceful and serene atmosphere
+2024-08-20 13:26:16,156 - video_to_video - INFO - input frames length: 49
+2024-08-20 13:26:16,156 - video_to_video - INFO - input fps: 8.0
+2024-08-20 13:26:16,156 - video_to_video - INFO - target_fps: 24.0
+2024-08-20 13:26:16,311 - video_to_video - INFO - input resolution: (480, 720)
+2024-08-20 13:26:16,312 - video_to_video - INFO - target resolution: (1320, 1982)
+2024-08-20 13:26:16,312 - video_to_video - INFO - noise augmentation: 250
+2024-08-20 13:26:16,312 - video_to_video - INFO - scale s is set to: 8
+2024-08-20 13:26:16,399 - video_to_video - INFO - video_data shape: torch.Size([145, 3, 1320, 1982])
+/share/home/zyx/Code/VEnhancer/video_to_video/video_to_video_model.py:108: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+ with amp.autocast(enabled=True):
+2024-08-20 13:27:19,605 - video_to_video - INFO - step: 0
+2024-08-20 13:30:12,020 - video_to_video - INFO - step: 1
+2024-08-20 13:33:04,956 - video_to_video - INFO - step: 2
+2024-08-20 13:35:58,691 - video_to_video - INFO - step: 3
+2024-08-20 13:38:51,254 - video_to_video - INFO - step: 4
+2024-08-20 13:41:44,150 - video_to_video - INFO - step: 5
+2024-08-20 13:44:37,017 - video_to_video - INFO - step: 6
+2024-08-20 13:47:30,037 - video_to_video - INFO - step: 7
+2024-08-20 13:50:22,838 - video_to_video - INFO - step: 8
+2024-08-20 13:53:15,844 - video_to_video - INFO - step: 9
+2024-08-20 13:56:08,657 - video_to_video - INFO - step: 10
+2024-08-20 13:59:01,648 - video_to_video - INFO - step: 11
+2024-08-20 14:01:54,541 - video_to_video - INFO - step: 12
+2024-08-20 14:04:47,488 - video_to_video - INFO - step: 13
+2024-08-20 14:10:13,637 - video_to_video - INFO - sampling, finished.
+
+```
+
+Running on a single A100 GPU, enhancing each 6-second CogVideoX generated video with default settings will consume 60GB
+of VRAM and take 40-50 minutes.
\ No newline at end of file
diff --git a/tools/venhancer/README_ja.md b/tools/venhancer/README_ja.md
new file mode 100644
index 00000000..b0973df9
--- /dev/null
+++ b/tools/venhancer/README_ja.md
@@ -0,0 +1,91 @@
+
+# VEnhancer で CogVideoX によって生成されたビデオを強化する
+
+このチュートリアルでは、VEnhancer ツールを使用して、CogVideoX で生成されたビデオを強化し、より高いフレームレートと高い解像度を実現する方法を説明します。
+
+## モデルの紹介
+
+VEnhancer は、空間超解像、時間超解像(フレーム補間)、およびビデオのリファインメントを統一されたフレームワークで実現します。空間または時間の超解像のために、さまざまなアップサンプリング係数(例:1x〜8x)に柔軟に対応できます。さらに、多様なビデオアーティファクトを処理するために、リファインメント強度を変更する柔軟な制御を提供します。
+
+VEnhancer は ControlNet の設計に従い、事前訓練されたビデオ拡散モデルのマルチフレームエンコーダーとミドルブロックのアーキテクチャとウェイトをコピーして、トレーニング可能な条件ネットワークを構築します。このビデオ ControlNet は、低解像度のキーフレームとノイズを含む完全なフレームを入力として受け取ります。さらに、タイムステップ t とプロンプトに加えて、提案されたビデオ対応条件により、ノイズ増幅レベル σ およびダウンスケーリングファクター s が追加のネットワーク条件として使用されます。
+
+## ハードウェア要件
+
++ オペレーティングシステム: Linux (xformers 依存関係が必要)
++ ハードウェア: 単一カードあたり少なくとも 60GB の VRAM を持つ NVIDIA GPU。H100、A100 などのマシンを推奨します。
+
+## クイックスタート
+
+1. 公式の指示に従ってリポジトリをクローンし、依存関係をインストールします。
+
+```shell
+git clone https://github.com/Vchitect/VEnhancer.git
+cd VEnhancer
+## Torch などの依存関係は CogVideoX の依存関係を使用できます。新しい環境を作成する必要がある場合は、以下のコマンドを使用してください。
+pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2
+
+## 必須の依存関係をインストールします。
+pip install -r requirements.txt
+```
+
+2. コードを実行します。
+
+```shell
+python enhance_a_video.py --up_scale 4 --target_fps 24 --noise_aug 250 --solver_mode 'fast' --steps 15 --input_path inputs/000000.mp4 --prompt 'Wide-angle aerial shot at dawn, soft morning light casting long shadows, an elderly man walking his dog through a quiet, foggy park, trees and benches in the background, peaceful and serene atmosphere' --save_dir 'results/'
+```
+
+次の設定を行います:
+
+- `input_path` は入力ビデオのパスです。
+- `prompt` はビデオの内容を説明するプロンプトです。このツールで使用されるプロンプトは短く、77 単語を超えないようにする必要があります。CogVideoX の生成ビデオのプロンプトを適宜簡略化することをお勧めします。
+- `up_scale` はアップサンプリング係数で、2、4、8 に設定できます。
+- `target_fps` はビデオの目標フレームレートです。通常、16 fps であれば十分にスムーズですが、デフォルト値は 24 fps です。
+- `noise_aug` はノイズ増幅の強度を制御し、通常は 250 に設定します。
+- `steps` は最適化ステップ数を示します。通常 15 に設定されますが、より速いモデル生成を望む場合はこの値を減らすことができますが、品質が大幅に低下します。
+
+コードの実行中に、必要なモデルは Hugging Face から自動的にダウンロードされます。
+
+```shell
+/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/xformers/ops/fmha/flash.py:211: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch.
+ @torch.library.impl_abstract("xformers_flash::flash_fwd")
+/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/xformers/ops/fmha/flash.py:344: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch.
+ @torch.library.impl_abstract("xformers_flash::flash_bwd")
+2024-08-20 13:25:17,553 - video_to_video - INFO - checkpoint_path: ./ckpts/venhancer_paper.pt
+/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/open_clip/factory.py:88: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+ checkpoint = torch.load(checkpoint_path, map_location=map_location)
+2024-08-20 13:25:37,486 - video_to_video - INFO - Build encoder with FrozenOpenCLIPEmbedder
+/share/home/zyx/Code/VEnhancer/video_to_video/video_to_video_model.py:35: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+ load_dict = torch.load(cfg.model_path, map_location='cpu')
+2024-08-20 13:25:55,391 - video_to_video - INFO - Load model path ./ckpts/venhancer_paper.pt, with local status
+2024-08-20 13:25:55,392 - video_to_video - INFO - Build diffusion with GaussianDiffusion
+2024-08-20 13:26:16,092 - video_to_video - INFO - input video path: inputs/000000.mp4
+2024-08-20 13:26:16,093 - video_to_video - INFO - text: Wide-angle aerial shot at dawn,soft morning light casting long shadows,an elderly man walking his dog through a quiet,foggy park,trees and benches in the background,peaceful and serene atmosphere
+2024-08-20 13:26:16,156 - video_to_video - INFO - input frames length: 49
+2024-08-20 13:26:16,156 - video_to_video - INFO - input fps: 8.0
+2024-08-20 13:26:16,156 - video_to_video - INFO - target_fps: 24.0
+2024-08-20 13:26:16,311 - video_to_video - INFO - input resolution: (480, 720)
+2024-08-20 13:26:16,312 - video_to_video - INFO - target resolution: (1320, 1982)
+2024-08-20 13:26:16,312 - video_to_video - INFO - noise augmentation: 250
+2024-08-20 13:26:16,312 - video_to_video - INFO - scale s is set to: 8
+2024-08-20 13:26:16,399 - video_to_video - INFO - video_data shape: torch.Size([145, 3, 1320, 1982])
+/share/home/zyx/Code/VEnhancer/video_to_video/video_to_video_model.py:108: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+ with amp.autocast(enabled=True):
+2024-08-20 13:27:19,605 - video_to_video - INFO - step: 0
+2024-08-20 13:30:12,020 - video_to_video - INFO - step: 1
+2024-08-20 13:33:04,956 - video_to_video - INFO - step: 2
+2024-08-20 13:35:58,691 - video_to_video - INFO - step: 3
+2024-08-20 13:38:51,254 - video_to_video - INFO - step: 4
+2024-08-20 13:41:44,150 - video_to_video - INFO - step: 5
+2024-08-20 13:44:37,017 - video_to_video - INFO - step: 6
+2024-08-20 13:47:30,037 - video_to_video - INFO - step: 7
+2024-08-20 13:50:22,838 - video_to_video - INFO - step: 8
+2024-08-20 13:53:15,844 - video_to_video - INFO - step: 9
+2024-08-20 13:56:08,657 - video_to_video - INFO - step: 10
+2024-08-20 13:59:01,648 - video_to_video - INFO - step: 11
+2024-08-20 14:01:54,541 - video_to_video - INFO - step: 12
+2024-08-20 14:04:47,488 - video_to_video - INFO - step: 13
+2024-08-20 14:10:13,637 - video_to_video - INFO - sampling, finished.
+
+```
+
+A100 GPU を単一で使用している場合、CogVideoX によって生成された 6 秒間のビデオを強化するには、デフォルト設定で 60GB の VRAM を消費し、40〜50 分かかります。
diff --git a/tools/venhancer/README_zh.md b/tools/venhancer/README_zh.md
new file mode 100644
index 00000000..2738d3f2
--- /dev/null
+++ b/tools/venhancer/README_zh.md
@@ -0,0 +1,100 @@
+# 使用 VEnhancer 对 CogVideoX 生成视频进行增强
+
+本教程将使用 VEnhancer 工具对 CogVideoX 生成的视频进行增强,包括更高的帧率和更高的分辨率。
+
+## 模型介绍
+
+VEnhancer 在一个统一的框架中实现了空间超分辨率、时间超分辨率(帧插值)和视频优化。它可以灵活地适应不同的上采样因子(例如,1x~
+8x)用于空间或时间超分辨率。此外,它提供了灵活的控制,以修改优化强度,从而处理多样化的视频伪影。
+
+VEnhancer 遵循 ControlNet 的设计,复制了预训练的视频扩散模型的多帧编码器和中间块的架构和权重,构建了一个可训练的条件网络。这个视频
+ControlNet 接受低分辨率关键帧和包含噪声的完整帧作为输入。此外,除了时间步 t 和提示词外,我们提出的视频感知条件还将噪声增强的噪声级别
+σ 和降尺度因子 s 作为附加的网络条件输入。
+
+## 硬件需求
+
++ 操作系统: Linux (需要依赖xformers)
++ 硬件: NVIDIA GPU 并至少保证单卡显存超过60G,推荐使用 H100,A100等机器。
+
+## 快速上手
+
+1. 按照官方指引克隆仓库并安装依赖
+
+```shell
+git clone https://github.com/Vchitect/VEnhancer.git
+cd VEnhancer
+## torch等依赖可以使用CogVideoX的依赖,如果你需要创建一个新的环境,可以使用以下命令
+pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2
+
+## 安装必须的依赖
+pip install -r requirements.txt
+```
+
+2. 运行代码
+
+```shell
+python enhance_a_video.py \
+--up_scale 4 --target_fps 24 --noise_aug 250 \
+--solver_mode 'fast' --steps 15 \
+--input_path inputs/000000.mp4 \
+--prompt 'Wide-angle aerial shot at dawn,soft morning light casting long shadows,an elderly man walking his dog through a quiet,foggy park,trees and benches in the background,peaceful and serene atmosphere' \
+--save_dir 'results/'
+```
+
+其中:
+
+- `input_path` 是输入视频的路径
+- `prompt` 是描述视频内容的提示词,本工具使用的提示词更短,不能超过77个单词,您可以适当简化 CogVideoX 生成视频的提示词。
+- `up_scale` 是上采样因子,可以设置为 2, 4, 8
+- `target_fps` 是目标视频的帧率,通常来说,16帧就已经流畅,24帧是默认值
+- `noise_aug` 是噪声增强的强度,通常设置为250
+- `step` 是优化步数,通常设置为15,如果你想更快的生成模型,可以调低,但是质量会大幅下降。
+
+代码运行过程中,会自动从Huggingface拉取需要的模型
+
+运行日志通常如下:
+
+```shell
+/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/xformers/ops/fmha/flash.py:211: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch.
+ @torch.library.impl_abstract("xformers_flash::flash_fwd")
+/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/xformers/ops/fmha/flash.py:344: FutureWarning: `torch.library.impl_abstract` was renamed to `torch.library.register_fake`. Please use that instead; we will remove `torch.library.impl_abstract` in a future version of PyTorch.
+ @torch.library.impl_abstract("xformers_flash::flash_bwd")
+2024-08-20 13:25:17,553 - video_to_video - INFO - checkpoint_path: ./ckpts/venhancer_paper.pt
+/share/home/zyx/.conda/envs/cogvideox/lib/python3.10/site-packages/open_clip/factory.py:88: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+ checkpoint = torch.load(checkpoint_path, map_location=map_location)
+2024-08-20 13:25:37,486 - video_to_video - INFO - Build encoder with FrozenOpenCLIPEmbedder
+/share/home/zyx/Code/VEnhancer/video_to_video/video_to_video_model.py:35: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
+ load_dict = torch.load(cfg.model_path, map_location='cpu')
+2024-08-20 13:25:55,391 - video_to_video - INFO - Load model path ./ckpts/venhancer_paper.pt, with local status
+2024-08-20 13:25:55,392 - video_to_video - INFO - Build diffusion with GaussianDiffusion
+2024-08-20 13:26:16,092 - video_to_video - INFO - input video path: inputs/000000.mp4
+2024-08-20 13:26:16,093 - video_to_video - INFO - text: Wide-angle aerial shot at dawn,soft morning light casting long shadows,an elderly man walking his dog through a quiet,foggy park,trees and benches in the background,peaceful and serene atmosphere
+2024-08-20 13:26:16,156 - video_to_video - INFO - input frames length: 49
+2024-08-20 13:26:16,156 - video_to_video - INFO - input fps: 8.0
+2024-08-20 13:26:16,156 - video_to_video - INFO - target_fps: 24.0
+2024-08-20 13:26:16,311 - video_to_video - INFO - input resolution: (480, 720)
+2024-08-20 13:26:16,312 - video_to_video - INFO - target resolution: (1320, 1982)
+2024-08-20 13:26:16,312 - video_to_video - INFO - noise augmentation: 250
+2024-08-20 13:26:16,312 - video_to_video - INFO - scale s is set to: 8
+2024-08-20 13:26:16,399 - video_to_video - INFO - video_data shape: torch.Size([145, 3, 1320, 1982])
+/share/home/zyx/Code/VEnhancer/video_to_video/video_to_video_model.py:108: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
+ with amp.autocast(enabled=True):
+2024-08-20 13:27:19,605 - video_to_video - INFO - step: 0
+2024-08-20 13:30:12,020 - video_to_video - INFO - step: 1
+2024-08-20 13:33:04,956 - video_to_video - INFO - step: 2
+2024-08-20 13:35:58,691 - video_to_video - INFO - step: 3
+2024-08-20 13:38:51,254 - video_to_video - INFO - step: 4
+2024-08-20 13:41:44,150 - video_to_video - INFO - step: 5
+2024-08-20 13:44:37,017 - video_to_video - INFO - step: 6
+2024-08-20 13:47:30,037 - video_to_video - INFO - step: 7
+2024-08-20 13:50:22,838 - video_to_video - INFO - step: 8
+2024-08-20 13:53:15,844 - video_to_video - INFO - step: 9
+2024-08-20 13:56:08,657 - video_to_video - INFO - step: 10
+2024-08-20 13:59:01,648 - video_to_video - INFO - step: 11
+2024-08-20 14:01:54,541 - video_to_video - INFO - step: 12
+2024-08-20 14:04:47,488 - video_to_video - INFO - step: 13
+2024-08-20 14:10:13,637 - video_to_video - INFO - sampling, finished.
+
+```
+
+使用A100单卡运行,对于每个CogVideoX生成的6秒视频,按照默认配置,会消耗60G显存,并用时40-50分钟。
\ No newline at end of file
From 457864526797ede7a23d517a30112ae51e3b9d1d Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Tue, 20 Aug 2024 21:19:45 +0800
Subject: [PATCH 5/5] suggestion parm
---
tools/venhancer/README.md | 12 ++++++------
tools/venhancer/README_ja.md | 13 +++++++------
tools/venhancer/README_zh.md | 11 ++++++-----
3 files changed, 19 insertions(+), 17 deletions(-)
diff --git a/tools/venhancer/README.md b/tools/venhancer/README.md
index b5f947e6..cc6f45c6 100644
--- a/tools/venhancer/README.md
+++ b/tools/venhancer/README.md
@@ -40,14 +40,14 @@ Where:
- `input_path` is the path to the input video
- `prompt` is the description of the video content. The prompt used by this tool should be shorter, not exceeding 77
words. You may need to simplify the prompt used for generating the CogVideoX video.
-- `up_scale` is the upsampling factor, which can be set to 2, 4, or 8
- `target_fps` is the target frame rate for the video. Typically, 16 fps is already smooth, with 24 fps as the default
value.
-- `noise_aug` controls the strength of noise augmentation, typically set to 250
-- `steps` indicates the number of optimization steps, usually set to 15. If you want faster model generation, you can
- reduce this number, but the quality will significantly decrease.
-
-The code will automatically download the required models from Hugging Face during execution.
+- `up_scale` is recommended to be set to 2, 3, or 4. The target resolution is limited to around 2K and below.
+- `noise_aug` depends on the input video quality: lower-quality input needs a higher noise level, which corresponds to
+  stronger refinement. Use 250~300 for very low-quality videos and <= 200 for good ones.
+- `steps` if you want fewer steps, change `solver_mode` to "normal" first, then reduce the number of steps; the
+  "fast" solver_mode uses a fixed number of steps (15).
+
+The code will automatically download the required models from Hugging Face during execution.
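+
+For example, a gentler pass over an already clean CogVideoX clip could use the command below (the flags are the ones shown in the Quick Start command; the values are only illustrative choices within the ranges suggested above):
+
+```shell
+python enhance_a_video.py --up_scale 2 --target_fps 24 --noise_aug 200 --solver_mode 'normal' --steps 12 --input_path inputs/000000.mp4 --prompt 'An elderly man walking his dog through a quiet, foggy park at dawn' --save_dir 'results/'
+```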
Typical runtime logs are as follows:
diff --git a/tools/venhancer/README_ja.md b/tools/venhancer/README_ja.md
index b0973df9..70f2d74d 100644
--- a/tools/venhancer/README_ja.md
+++ b/tools/venhancer/README_ja.md
@@ -36,12 +36,13 @@ python enhance_a_video.py --up_scale 4 --target_fps 24 --noise_aug 250 --solver_
次の設定を行います:
-- `input_path` は入力ビデオのパスです。
-- `prompt` はビデオの内容を説明するプロンプトです。このツールで使用されるプロンプトは短く、77 単語を超えないようにする必要があります。CogVideoX の生成ビデオのプロンプトを適宜簡略化することをお勧めします。
-- `up_scale` はアップサンプリング係数で、2、4、8 に設定できます。
-- `target_fps` はビデオの目標フレームレートです。通常、16 fps であれば十分にスムーズですが、デフォルト値は 24 fps です。
-- `noise_aug` はノイズ増幅の強度を制御し、通常は 250 に設定します。
-- `steps` は最適化ステップ数を示します。通常 15 に設定されますが、より速いモデル生成を望む場合はこの値を減らすことができますが、品質が大幅に低下します。
+- `input_path` は入力ビデオのパスです。
+- `prompt` はビデオ内容の説明です。このツールで使用するプロンプトは短く、77語を超えないようにしてください。CogVideoX の生成に使用したプロンプトを適宜簡略化してください。
+- `target_fps` はビデオの目標フレームレートです。通常、16 fps で十分滑らかですが、デフォルト値は 24 fps です。
+- `up_scale` は 2、3、4 のいずれかを推奨します。目標解像度は約 2K 以下に制限されます。
+- `noise_aug` の値は入力ビデオの品質に依存します。品質が低いほど高いノイズレベルが必要となり、より強いリファインメントに対応します。非常に低品質なビデオには 250〜300、高品質なビデオには 200 以下を設定してください。
+- `steps` ステップ数を減らしたい場合は、まず solver_mode を「normal」に変更してから、ステップ数を減らしてください。「fast」solver_mode のステップ数は固定(15)です。
コードの実行中に、必要なモデルは Hugging Face から自動的にダウンロードされます。
diff --git a/tools/venhancer/README_zh.md b/tools/venhancer/README_zh.md
index 2738d3f2..a481cd17 100644
--- a/tools/venhancer/README_zh.md
+++ b/tools/venhancer/README_zh.md
@@ -44,11 +44,12 @@ python enhance_a_video.py \
其中:
- `input_path` 是输入视频的路径
-- `prompt` 是描述视频内容的提示词,本工具使用的提示词更短,不能超过77个单词,您可以适当简化 CogVideoX 生成视频的提示词。
-- `up_scale` 是上采样因子,可以设置为 2, 4, 8
-- `target_fps` 是目标视频的帧率,通常来说,16帧就已经流畅,24帧是默认值
-- `noise_aug` 是噪声增强的强度,通常设置为250
-- `step` 是优化步数,通常设置为15,如果你想更快的生成模型,可以调低,但是质量会大幅下降。
+- `prompt` 是视频内容的描述。此工具使用的提示词应更短,不超过77个字。您可能需要简化用于生成CogVideoX视频的提示词。
+- `target_fps` 是视频的目标帧率。通常,16 fps已经很流畅,默认值为24 fps。
+- `up_scale` 推荐设置为2、3或4。目标分辨率限制在2k左右及以下。
+- `noise_aug` 的值取决于输入视频的质量。质量较低的视频需要更高的噪声级别,这对应于更强的优化。250~300适用于非常低质量的视频。对于高质量视频,设置为≤200。
+- `steps` 如果想减少步数,请先将solver_mode改为“normal”,然后减少步数。“fast”模式的步数是固定的(15步)。
代码运行过程中,会自动从Huggingface拉取需要的模型