From 5677af571fdd7e601ea84b0ee9700d54147a83ba Mon Sep 17 00:00:00 2001 From: FengWen <109639975+ccssu@users.noreply.github.com> Date: Thu, 13 Jun 2024 15:46:55 +0800 Subject: [PATCH] Support comfyui sd3 speedup (#946) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## 安装
SD3 Speedup Quick Start Open In
Kaggle
## Run WorkFlow - 28 step image 1024x1024 mode: max-optimize:max-autotune:low-precision | Accelerator | Baseline (non-optimized) | OneDiff (optimized) | Percentage improvement | | --------------------- | ------------------------ | ------------------- | ---------------------- | | NVIDIA A800-SXM4-80GB | ~4.03 sec | ~2.93 sec | ~27.29 % | ``` 动态 shape print("Test run with multiple resolutions...") sizes = [1024, 512, 768, 256] for h in sizes: for w in sizes: ``` ![workflow (17)](https://github.com/siliconflow/onediff/assets/109639975/a385fac5-1f82-4905-a941-4c71ff1c616e) --------- Co-authored-by: Xiaoyu Xu --- .../extras_nodes/nodes_nexfort_booster.py | 3 +- onediff_comfy_nodes/sd3_demo/README.md | 97 +++++++++++++++++++ onediff_comfy_nodes/sd3_demo/requirements.txt | 74 ++++++++++++++ 3 files changed, 173 insertions(+), 1 deletion(-) create mode 100644 onediff_comfy_nodes/sd3_demo/README.md create mode 100644 onediff_comfy_nodes/sd3_demo/requirements.txt diff --git a/onediff_comfy_nodes/extras_nodes/nodes_nexfort_booster.py b/onediff_comfy_nodes/extras_nodes/nodes_nexfort_booster.py index a265f9d0d..bdd0535e0 100644 --- a/onediff_comfy_nodes/extras_nodes/nodes_nexfort_booster.py +++ b/onediff_comfy_nodes/extras_nodes/nodes_nexfort_booster.py @@ -7,7 +7,8 @@ { "jit:disable-runtime-fusion:low-precision": "This compiles super quickly, but the performance might not be optimized very noticeably.", "jit:benchmark:low-precision:freezing:cudagraphs": "This compiles the model very quickly, but the performance might be not as good as `TorchInductor` optimized models.", - "max-autotune:low-precision": "This will deliver a good performance and adapt quickly to shape changes.", + "max-optimize:max-autotune:low-precision": "This will deliver a good performance and adapt quickly to shape changes.", + # "max-optimize:max-autotune:low-precision": "", "max-autotune:benchmark:low-precision:cudagraphs": "This is the most suggested combination of compiler modes. 
It will deliver a good balance between performance and compilation time.", "max-optimize:max-autotune:benchmark:low-precision:freezing:cudagraphs": "This is the most aggressive combination of compiler modes. It will deliver the best performance but might slow down the compilation significantly.", } diff --git a/onediff_comfy_nodes/sd3_demo/README.md b/onediff_comfy_nodes/sd3_demo/README.md new file mode 100644 index 000000000..135543da4 --- /dev/null +++ b/onediff_comfy_nodes/sd3_demo/README.md @@ -0,0 +1,97 @@ +## Accelerate SD3 by using onediff +huggingface: https://huggingface.co/stabilityai/stable-diffusion-3-medium + + +### Feature +- ✅ Multiple resolutions + +### Performance + +- Timings for 28 steps at 1024x1024 +- OneDiff[Nexfort] Compile mode: max-optimize:max-autotune:low-precision + +| Accelerator | Baseline (non-optimized) | OneDiff (optimized) | Percentage improvement | +| --------------------- | ------------------------ | ------------------- | ---------------------- | +| NVIDIA A800-SXM4-80GB | ~4.03 sec | ~2.93 sec | ~27.29 % | + + + + +The following table compares the images generated with seed=1: Baseline (non-optimized) on the left and OneDiff (optimized) on the right. + +| | | +| -------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- | +| ![sd3_baseline_00001_](https://github.com/siliconflow/onediff/assets/109639975/c86f2dc8-fc6f-4cc7-b85d-d4d973594ee6) | ![sd3_speedup_00001_](https://github.com/siliconflow/onediff/assets/109639975/c81b3fc9-d588-4ba1-9911-ae3a8a8d2454) | + + +### Multiple resolutions +Test with multiple resolutions and switch shapes in a single line of Python code: +``` +[print(f"Testing resolution: {h}x{w}") for h in [1024, 512, 768, 256] for w in [1024, 512, 768, 256]] +``` + +## Usage Example + +### Install + +```shell +# python 3.10 
+COMFYUI_DIR=$PWD/ComfyUI +git clone https://github.com/siliconflow/onediff.git +cd onediff && pip install -r onediff_comfy_nodes/sd3_demo/requirements.txt && pip install -e . +git clone https://github.com/comfyanonymous/ComfyUI.git $COMFYUI_DIR +ln -s $PWD/onediff_comfy_nodes $COMFYUI_DIR/custom_nodes +``` + +
+ test_install.py + +```python +# Compile arbitrary models (torch.nn.Module) +import torch +from onediff.utils.import_utils import is_nexfort_available +assert is_nexfort_available() == True + +import onediff.infer_compiler as infer_compiler + +class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.lin = torch.nn.Linear(100, 10) + + def forward(self, x): + return torch.nn.functional.relu(self.lin(x)) + +mod = MyModule().to("cuda").half() +with torch.inference_mode(): + compiled_mod = infer_compiler.compile(mod, + backend="nexfort", + options={"mode": "max-autotune:cudagraphs", "dynamic": True, "fullgraph": True}, + ) + print(compiled_mod(torch.randn(10, 100, device="cuda").half()).shape) + +print("Successfully installed~") +``` +
+ +### Run ComfyUI +```shell +# run comfyui +# For CUDA Graph +export NEXFORT_FX_CUDAGRAPHS=1 +# For best performance +export TORCHINDUCTOR_MAX_AUTOTUNE=1 +# Enable CUDNN benchmark +export NEXFORT_FX_CONV_BENCHMARK=1 +# Faster float32 matmul +export NEXFORT_FX_MATMUL_ALLOW_TF32=1 +# For graph cache to speedup compilation +export TORCHINDUCTOR_FX_GRAPH_CACHE=1 +# For persistent cache dir +export TORCHINDUCTOR_CACHE_DIR=~/.torchinductor_cache +cd $COMFYUI_DIR && python main.py --gpu-only --disable-cuda-malloc +``` + +### WorkFlow +![WorkFlow](https://github.com/siliconflow/onediff/assets/109639975/a385fac5-1f82-4905-a941-4c71ff1c616e) + diff --git a/onediff_comfy_nodes/sd3_demo/requirements.txt b/onediff_comfy_nodes/sd3_demo/requirements.txt new file mode 100644 index 000000000..7b86a7fae --- /dev/null +++ b/onediff_comfy_nodes/sd3_demo/requirements.txt @@ -0,0 +1,74 @@ +accelerate==0.31.0 +aiohttp==3.9.5 +aiosignal==1.3.1 +async-timeout==4.0.3 +attrs==23.2.0 +certifi==2024.6.2 +cffi==1.16.0 +charset-normalizer==3.3.2 +click==8.1.7 +cryptography==42.0.8 +Deprecated==1.2.14 +diffusers==0.29.0 +einops==0.8.0 +filelock==3.15.1 +frozenlist==1.4.1 +fsspec==2024.6.0 +gitdb==4.0.11 +GitPython==3.1.43 +huggingface-hub==0.23.3 +idna==3.7 +importlib_metadata==7.1.0 +Jinja2==3.1.4 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matrix-client==0.4.0 +mdurl==0.1.2 +mpmath==1.3.0 +multidict==6.0.5 +networkx==3.3 +nexfort==0.1.dev242 +numpy==1.26.4 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.5.40 +nvidia-nvtx-cu12==12.1.105 +packaging==24.1 +pillow==10.3.0 +psutil==5.9.8 +pycparser==2.22 +Pygments==2.18.0 +PyJWT==2.8.0 +PyNaCl==1.5.0 +PyYAML==6.0.1 +regex==2024.5.15 +requests==2.32.3 
+rich==13.7.1 +safetensors==0.4.3 +scipy==1.13.1 +shellingham==1.5.4 +smmap==5.0.1 +sympy==1.12.1 +tokenizers==0.19.1 +torch==2.3.1 +torchaudio==2.3.1 +torchsde==0.2.6 +torchvision==0.18.1 +tqdm==4.66.4 +trampoline==0.1.2 +transformers==4.41.2 +triton==2.3.1 +typer==0.12.3 +typing_extensions==4.12.2 +urllib3==1.26.18 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.19.2