From 5677af571fdd7e601ea84b0ee9700d54147a83ba Mon Sep 17 00:00:00 2001
From: FengWen <109639975+ccssu@users.noreply.github.com>
Date: Thu, 13 Jun 2024 15:46:55 +0800
Subject: [PATCH] Support comfyui sd3 speedup (#946)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## 安装
## Run WorkFlow
- 28 step image 1024x1024 mode: max-optimize:max-autotune:low-precision
| Accelerator | Baseline (non-optimized) | OneDiff (optimized) | Percentage improvement |
| --------------------- | ------------------------ | ------------------- | ---------------------- |
| NVIDIA A800-SXM4-80GB | ~4.03 sec | ~2.93 sec | ~27.29 % |
```
动态 shape
print("Test run with multiple resolutions...")
sizes = [1024, 512, 768, 256]
for h in sizes:
for w in sizes:
```
![workflow (17)](https://github.com/siliconflow/onediff/assets/109639975/a385fac5-1f82-4905-a941-4c71ff1c616e)
---------
Co-authored-by: Xiaoyu Xu
---
.../extras_nodes/nodes_nexfort_booster.py | 3 +-
onediff_comfy_nodes/sd3_demo/README.md | 97 +++++++++++++++++++
onediff_comfy_nodes/sd3_demo/requirements.txt | 74 ++++++++++++++
3 files changed, 173 insertions(+), 1 deletion(-)
create mode 100644 onediff_comfy_nodes/sd3_demo/README.md
create mode 100644 onediff_comfy_nodes/sd3_demo/requirements.txt
diff --git a/onediff_comfy_nodes/extras_nodes/nodes_nexfort_booster.py b/onediff_comfy_nodes/extras_nodes/nodes_nexfort_booster.py
index a265f9d0d..bdd0535e0 100644
--- a/onediff_comfy_nodes/extras_nodes/nodes_nexfort_booster.py
+++ b/onediff_comfy_nodes/extras_nodes/nodes_nexfort_booster.py
@@ -7,7 +7,8 @@
{
"jit:disable-runtime-fusion:low-precision": "This compiles super quickly, but the performance might not be optimized very noticeably.",
"jit:benchmark:low-precision:freezing:cudagraphs": "This compiles the model very quickly, but the performance might be not as good as `TorchInductor` optimized models.",
- "max-autotune:low-precision": "This will deliver a good performance and adapt quickly to shape changes.",
+ "max-optimize:max-autotune:low-precision": "This will deliver a good performance and adapt quickly to shape changes.",
+ # "max-optimize:max-autotune:low-precision": "",
"max-autotune:benchmark:low-precision:cudagraphs": "This is the most suggested combination of compiler modes. It will deliver a good balance between performance and compilation time.",
"max-optimize:max-autotune:benchmark:low-precision:freezing:cudagraphs": "This is the most aggressive combination of compiler modes. It will deliver the best performance but might slow down the compilation significantly.",
}
diff --git a/onediff_comfy_nodes/sd3_demo/README.md b/onediff_comfy_nodes/sd3_demo/README.md
new file mode 100644
index 000000000..135543da4
--- /dev/null
+++ b/onediff_comfy_nodes/sd3_demo/README.md
@@ -0,0 +1,97 @@
+## Accelerate SD3 by using onediff
+huggingface: https://huggingface.co/stabilityai/stable-diffusion-3-medium
+
+
+### Feature
+- ✅ Multiple resolutions
+
+### Performance
+
+- Timings for 28 steps at 1024x1024
+- OneDiff[Nexfort] Compile mode: max-optimize:max-autotune:low-precision
+
+| Accelerator | Baseline (non-optimized) | OneDiff (optimized) | Percentage improvement |
+| --------------------- | ------------------------ | ------------------- | ---------------------- |
+| NVIDIA A800-SXM4-80GB | ~4.03 sec | ~2.93 sec | ~27.29 % |
+
+
+
+
+The following table compares the generated images (seed=1): Baseline (non-optimized) on the left and OneDiff (optimized) on the right.
+
+| | |
+| -------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
+| ![sd3_baseline_00001_](https://github.com/siliconflow/onediff/assets/109639975/c86f2dc8-fc6f-4cc7-b85d-d4d973594ee6) | ![sd3_speedup_00001_](https://github.com/siliconflow/onediff/assets/109639975/c81b3fc9-d588-4ba1-9911-ae3a8a8d2454) |
+
+
+### Multiple resolutions
+Tested with multiple resolutions; switching between shapes is supported. The tested resolutions can be enumerated in a single line of Python code:
+```
+[print(f"Testing resolution: {h}x{w}") for h in [1024, 512, 768, 256] for w in [1024, 512, 768, 256]]
+```
+
+## Usage Example
+
+### Install
+
+```shell
+# python 3.10
+COMFYUI_DIR="$PWD/ComfyUI"
+git clone https://github.com/siliconflow/onediff.git
+(cd onediff && pip install -r onediff_comfy_nodes/sd3_demo/requirements.txt && pip install -e .)
+git clone https://github.com/comfyanonymous/ComfyUI.git
+ln -s "$PWD/onediff/onediff_comfy_nodes" "$COMFYUI_DIR/custom_nodes/"
+```
+
+
+Save the following script as `test_install.py` and run it to verify the installation:
+
+```python
+# Compile arbitrary models (torch.nn.Module)
+import torch
+from onediff.utils.import_utils import is_nexfort_available
+assert is_nexfort_available() == True
+
+import onediff.infer_compiler as infer_compiler
+
+class MyModule(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.lin = torch.nn.Linear(100, 10)
+
+ def forward(self, x):
+ return torch.nn.functional.relu(self.lin(x))
+
+mod = MyModule().to("cuda").half()
+with torch.inference_mode():
+ compiled_mod = infer_compiler.compile(mod,
+ backend="nexfort",
+ options={"mode": "max-autotune:cudagraphs", "dynamic": True, "fullgraph": True},
+ )
+ print(compiled_mod(torch.randn(10, 100, device="cuda").half()).shape)
+
+print("Successfully installed~")
+```
+
+
+### Run ComfyUI
+```shell
+# run comfyui
+# For CUDA Graph
+export NEXFORT_FX_CUDAGRAPHS=1
+# For best performance
+export TORCHINDUCTOR_MAX_AUTOTUNE=1
+# Enable CUDNN benchmark
+export NEXFORT_FX_CONV_BENCHMARK=1
+# Faster float32 matmul
+export NEXFORT_FX_MATMUL_ALLOW_TF32=1
+# For graph cache to speedup compilation
+export TORCHINDUCTOR_FX_GRAPH_CACHE=1
+# For persistent cache dir
+export TORCHINDUCTOR_CACHE_DIR=~/.torchinductor_cache
+cd $COMFYUI_DIR && python main.py --gpu-only --disable-cuda-malloc
+```
+
+### WorkFlow
+![WorkFlow](https://github.com/siliconflow/onediff/assets/109639975/a385fac5-1f82-4905-a941-4c71ff1c616e)
+
diff --git a/onediff_comfy_nodes/sd3_demo/requirements.txt b/onediff_comfy_nodes/sd3_demo/requirements.txt
new file mode 100644
index 000000000..7b86a7fae
--- /dev/null
+++ b/onediff_comfy_nodes/sd3_demo/requirements.txt
@@ -0,0 +1,74 @@
+accelerate==0.31.0
+aiohttp==3.9.5
+aiosignal==1.3.1
+async-timeout==4.0.3
+attrs==23.2.0
+certifi==2024.6.2
+cffi==1.16.0
+charset-normalizer==3.3.2
+click==8.1.7
+cryptography==42.0.8
+Deprecated==1.2.14
+diffusers==0.29.0
+einops==0.8.0
+filelock==3.15.1
+frozenlist==1.4.1
+fsspec==2024.6.0
+gitdb==4.0.11
+GitPython==3.1.43
+huggingface-hub==0.23.3
+idna==3.7
+importlib_metadata==7.1.0
+Jinja2==3.1.4
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matrix-client==0.4.0
+mdurl==0.1.2
+mpmath==1.3.0
+multidict==6.0.5
+networkx==3.3
+nexfort==0.1.dev242
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.5.40
+nvidia-nvtx-cu12==12.1.105
+packaging==24.1
+pillow==10.3.0
+psutil==5.9.8
+pycparser==2.22
+Pygments==2.18.0
+PyJWT==2.8.0
+PyNaCl==1.5.0
+PyYAML==6.0.1
+regex==2024.5.15
+requests==2.32.3
+rich==13.7.1
+safetensors==0.4.3
+scipy==1.13.1
+shellingham==1.5.4
+smmap==5.0.1
+sympy==1.12.1
+tokenizers==0.19.1
+torch==2.3.1
+torchaudio==2.3.1
+torchsde==0.2.6
+torchvision==0.18.1
+tqdm==4.66.4
+trampoline==0.1.2
+transformers==4.41.2
+triton==2.3.1
+typer==0.12.3
+typing_extensions==4.12.2
+urllib3==1.26.18
+wrapt==1.16.0
+yarl==1.9.4
+zipp==3.19.2