From d29477c4452bb950198ff1e2e6254c1a4d9ec786 Mon Sep 17 00:00:00 2001 From: FengWen Date: Thu, 13 Jun 2024 17:54:32 +0800 Subject: [PATCH 1/9] Fix onediff_comfy_nodes/sd3_demo/README.md --- onediff_comfy_nodes/sd3_demo/README.md | 54 +++++++++++++++++++------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/onediff_comfy_nodes/sd3_demo/README.md b/onediff_comfy_nodes/sd3_demo/README.md index 135543da4..5287837d9 100644 --- a/onediff_comfy_nodes/sd3_demo/README.md +++ b/onediff_comfy_nodes/sd3_demo/README.md @@ -7,15 +7,14 @@ huggingface: https://huggingface.co/stabilityai/stable-diffusion-3-medium ### Performance -- Timings for 28 steps at 1024x1024 -- OneDiff[Nexfort] Compile mode: max-optimize:max-autotune:low-precision - -| Accelerator | Baseline (non-optimized) | OneDiff (optimized) | Percentage improvement | -| --------------------- | ------------------------ | ------------------- | ---------------------- | -| NVIDIA A800-SXM4-80GB | ~4.03 sec | ~2.93 sec | ~27.29 % | - - +- Testing on NVIDIA A800-SXM4-80GB, with image size of 1024*1024, iterating 28 steps. +- OneDiff[Nexfort] Compile mode: +`max-optimize:max-autotune:low-precision` +| | Iteration speed | E2E Inference Time | Max CUDA Memory Used | +| ------------------------ | ------------------ | ------------------ | -------------------- | +| Baseline (non-optimized) | 7.44it/s | 4.03 s | 18.827 GiB | +| OneDiff (optimized) | 10.51it/s (+41.2%) | 2.96 s (-26.5%) | 20.766 GiB | The following table shows the comparison of the plot, seed=1, Baseline (non optimized) on the left, and OneDiff (optimized) on the right @@ -29,18 +28,18 @@ test with multiple resolutions and support shape switching in a single line of P ``` [print(f"Testing resolution: {h}x{w}") for h in [1024, 512, 768, 256] for w in [1024, 512, 768, 256]] ``` - -## Usage Example - -### Install - +## Environment setup +### Set SD3 requirements ```shell # python 3.10 COMFYUI_DIR=$pwd/ComfyUI +# install ComfyUI +git clone https://github.com/comfyanonymous/ComfyUI.git + +# install onediff & onediff_comfy_nodes git clone https://github.com/siliconflow/onediff.git cd onediff && pip install -r onediff_comfy_nodes/sd3_demo/requirements.txt && pip install -e . ln -s $pwd/onediff/onediff_comfy_nodes $COMFYUI_DIR/custom_nodes -git clone https://github.com/comfyanonymous/ComfyUI.git ```
@@ -72,8 +71,34 @@ with torch.inference_mode(): print("Successfully installed~") ``` +
+### Download relevant models + +- step1: Get User Access Tokens here https://huggingface.co/settings/tokens + +- step2: Download relevant models +```shell +export ACCESS_TOKEN="User Access Tokens" +wget --header="Authorization: Bearer $ACCESS_TOKEN" \ +https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium.safetensors -O models/checkpoints/sd3_medium.safetensors + +wget --header="Authorization: Bearer $ACCESS_TOKEN" \ +https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/clip_g.safetensors -O models/clip/clip_g.safetensors + +wget --header="Authorization: Bearer $ACCESS_TOKEN" \ +https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/clip_l.safetensors -O models/clip/clip_l.safetensors + +# wget --header="Authorization: Bearer $ACCESS_TOKEN" \ +# https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/t5xxl_fp16.safetensors -O models/clip/t5xxl_fp16.safetensors + +wget --header="Authorization: Bearer $ACCESS_TOKEN" \ +https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/t5xxl_fp8_e4m3fn.safetensors -O models/clip/t5xxl_fp8_e4m3fn.safetensors +``` + +## Usage Example + ### Run ComfyUI ```shell # run comfyui @@ -93,5 +118,6 @@ cd $COMFYUI_DIR && python main.py --gpu-only --disable-cuda-malloc ``` ### WorkFlow +Here is a very basic example how to use it: ![WorkFlow](https://github.com/siliconflow/onediff/assets/109639975/a385fac5-1f82-4905-a941-4c71ff1c616e) From 160bba5167888055cb7b98bf73476ff2c8d2ee46 Mon Sep 17 00:00:00 2001 From: FengWen Date: Fri, 14 Jun 2024 18:36:54 +0800 Subject: [PATCH 2/9] warmup prompt --- onediff_comfy_nodes/sd3_demo/main.py | 70 ++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 onediff_comfy_nodes/sd3_demo/main.py diff --git a/onediff_comfy_nodes/sd3_demo/main.py b/onediff_comfy_nodes/sd3_demo/main.py new file mode 100644 index 000000000..89899eb4f --- /dev/null +++ b/onediff_comfy_nodes/sd3_demo/main.py @@ -0,0 +1,70 @@ +import json +from urllib import request + +workflow_api_path = "./workflow_api.json" + +def queue_prompt(prompt): + p = {"prompt": prompt} + data = json.dumps(p).encode('utf-8') + req = request.Request("http://127.0.0.1:9999/prompt", data=data) # comfyui start port + request.urlopen(req) + +with open(workflow_api_path, "r") as fp: + prompt = json.load(fp) + + +def generate_texts(min_length=50, max_length=302): + # 50 world + base_text = "a female character with long, flowing hair that appears to be made of ethereal, swirling patterns resembling the Northern Lights or Aurora Borealis. The background is dominated by deep blues and purples, creating a mysterious and dramatic atmosphere. The character's face is serene, with pale skin and striking features. 
She" + + # Additional words pool + additional_words = [ + "gracefully", + "beautifully", + "elegant", + "radiant", + "mysteriously", + "vibrant", + "softly", + "gently", + "luminescent", + "sparkling", + "delicately", + "glowing", + "brightly", + "shimmering", + "enchanting", + "gloriously", + "magnificent", + "majestic", + "fantastically", + "dazzlingly", + ] + for i in range(min_length, max_length): + idx = i % len(additional_words) + base_text = base_text + " " + additional_words[idx] + yield base_text + + + +generated_texts = list(generate_texts(max_length=101)) +generated_texts.reverse() + +cout = 0 +dimensions = [ + (1024, 1024), (1024, 768), (1024, 576), + (1024, 512), (512, 1024), (768, 512), (512, 512) +] + +for width, height in dimensions: + # Set the width and height in the prompt + prompt["135"]["inputs"]["width"] = width + prompt["135"]["inputs"]["height"] = height + + # Loop through each generated text and send the prompt to the server + for text in generated_texts: + prompt["6"]["inputs"]["text"] = text + queue_prompt(prompt) + print(f'{cout=}') + cout += 1 + break \ No newline at end of file From 5069a70b4b803befb160f250864bdc1deb7d8aad Mon Sep 17 00:00:00 2001 From: FengWen Date: Thu, 20 Jun 2024 11:25:16 +0800 Subject: [PATCH 3/9] refine --- onediff_comfy_nodes/sd3_demo/README.md | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/onediff_comfy_nodes/sd3_demo/README.md b/onediff_comfy_nodes/sd3_demo/README.md index 5287837d9..8f4836ee7 100644 --- a/onediff_comfy_nodes/sd3_demo/README.md +++ b/onediff_comfy_nodes/sd3_demo/README.md @@ -6,15 +6,26 @@ huggingface: https://huggingface.co/stabilityai/stable-diffusion-3-medium - ✅ Multiple resolutions ### Performance +### Metric - Testing on NVIDIA A800-SXM4-80GB, with image size of 1024*1024, iterating 28 steps. - OneDiff[Nexfort] Compile mode: `max-optimize:max-autotune:low-precision` -| | Iteration speed | E2E Inference Time | Max CUDA Memory Used | -| ------------------------ | ------------------ | ------------------ | -------------------- | -| Baseline (non-optimized) | 7.44it/s | 4.03 s | 18.827 GiB | -| OneDiff (optimized) | 10.51it/s (+41.2%) | 2.96 s (-26.5%) | 20.766 GiB | + +| Metric | NVIDIA GeForce RTX 4090 (1024 * 1024) | +| ------------------------------------------------ | ------------------------------------- | +| Data update date(yyyy-mm-dd) | 2024-06-19 | +| PyTorch E2E time | 4.27 s | +| OneDiff E2E time | 3.17 s(-25.7%) | +| PyTorch Max Mem Used | 18.445GiB | +| OneDiff Max Mem Used | 19.199GiB | +| PyTorch Warmup with Run time | 10s | +| OneDiff Warmup with Compilation time1 | 209s | +| OneDiff Warmup with Cache time | 45s | + + 1 OneDiff Warmup with Compilation time is tested on AMD EPYC 7543 32-Core Processor CPU. Note this is just for reference, and it varies a lot on different CPU. 
+ The following table shows the comparison of the plot, seed=1, Baseline (non optimized) on the left, and OneDiff (optimized) on the right @@ -103,7 +114,7 @@ https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_e ```shell # run comfyui # For CUDA Graph -export NEXFORT_FX_CUDAGRAPHS=1 +# export NEXFORT_FX_CUDAGRAPHS=1 # For best performance export TORCHINDUCTOR_MAX_AUTOTUNE=1 # Enable CUDNN benchmark @@ -119,5 +130,6 @@ cd $COMFYUI_DIR && python main.py --gpu-only --disable-cuda-malloc ### WorkFlow Here is a very basic example how to use it: -![WorkFlow](https://github.com/siliconflow/onediff/assets/109639975/a385fac5-1f82-4905-a941-4c71ff1c616e) +[workflow_sd3_speedup.json](https://github.com/user-attachments/files/15907863/sd3_suppedup.json) +![sd3_speedup_workflow](https://github.com/siliconflow/onediff/assets/109639975/c1e955ae-7cc5-4197-9635-7cc05d5fd7a6) From d2a018fe991a852a104ba9ce917392efe2312c28 Mon Sep 17 00:00:00 2001 From: FengWen Date: Thu, 20 Jun 2024 11:33:41 +0800 Subject: [PATCH 4/9] refine --- onediff_comfy_nodes/sd3_demo/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onediff_comfy_nodes/sd3_demo/README.md b/onediff_comfy_nodes/sd3_demo/README.md index 8f4836ee7..94f346bbe 100644 --- a/onediff_comfy_nodes/sd3_demo/README.md +++ b/onediff_comfy_nodes/sd3_demo/README.md @@ -8,7 +8,7 @@ huggingface: https://huggingface.co/stabilityai/stable-diffusion-3-medium ### Performance ### Metric -- Testing on NVIDIA A800-SXM4-80GB, with image size of 1024*1024, iterating 28 steps. +- Testing on NVIDIA GeForce RTX 4090, with image size of 1024*1024, iterating 28 steps. - OneDiff[Nexfort] Compile mode: `max-optimize:max-autotune:low-precision` From 044d26d5b5ee4a9cb47ca201375308c22c265675 Mon Sep 17 00:00:00 2001 From: FengWen Date: Tue, 25 Jun 2024 11:53:17 +0800 Subject: [PATCH 5/9] refine --- onediff_comfy_nodes/sd3/README.md | 133 +++++++++++++++++++++++ onediff_comfy_nodes/sd3/main.py | 78 +++++++++++++ onediff_comfy_nodes/sd3/requirements.txt | 74 +++++++++++++ 3 files changed, 285 insertions(+) create mode 100644 onediff_comfy_nodes/sd3/README.md create mode 100644 onediff_comfy_nodes/sd3/main.py create mode 100644 onediff_comfy_nodes/sd3/requirements.txt diff --git a/onediff_comfy_nodes/sd3/README.md b/onediff_comfy_nodes/sd3/README.md new file mode 100644 index 000000000..5ef838d0b --- /dev/null +++ b/onediff_comfy_nodes/sd3/README.md @@ -0,0 +1,133 @@ +## Accelerate SD3 by using onediff +huggingface: https://huggingface.co/stabilityai/stable-diffusion-3-medium + +## Environment setup +### Set UP requirements +```shell +# python 3.10 +COMFYUI_DIR=$pwd/ComfyUI +# install ComfyUI +git clone https://github.com/comfyanonymous/ComfyUI.git + +# install onediff & onediff_comfy_nodes +git clone https://github.com/siliconflow/onediff.git +cd onediff && pip install -r onediff_comfy_nodes/sd3_demo/requirements.txt && pip install -e . +ln -s $pwd/onediff/onediff_comfy_nodes $COMFYUI_DIR/custom_nodes +``` + +
+ test_install.py + +```python +# Compile arbitrary models (torch.nn.Module) +import torch +from onediff.utils.import_utils import is_nexfort_available +assert is_nexfort_available() == True + +import onediff.infer_compiler as infer_compiler + +class MyModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.lin = torch.nn.Linear(100, 10) + + def forward(self, x): + return torch.nn.functional.relu(self.lin(x)) + +mod = MyModule().to("cuda").half() +with torch.inference_mode(): + compiled_mod = infer_compiler.compile(mod, + backend="nexfort", + options={"mode": "max-autotune:cudagraphs", "dynamic": True, "fullgraph": True}, + ) + print(compiled_mod(torch.randn(10, 100, device="cuda").half()).shape) + +print("Successfully installed~") +``` + +
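The performance sections of this README quote the Nexfort compile mode `max-optimize:max-autotune:low-precision`. For reference, below is a minimal sketch of passing a mode string through the same `infer_compiler.compile` call used in the snippet above; the toy module is a stand-in, and the idea that this mode string is accepted via `options["mode"]` in the same way as `"max-autotune:cudagraphs"` is an assumption based on that snippet. In ComfyUI itself, compilation is applied through the onediff_comfy_nodes in the workflow rather than by calling `compile` directly.

```python
# Illustrative sketch only (not part of the patch above): reuse the compile call
# from test_install.py with the mode string quoted in the performance section.
# Assumption: the mode string is passed via options["mode"] exactly like
# "max-autotune:cudagraphs" in the snippet above.
import torch
import onediff.infer_compiler as infer_compiler


class TinyBlock(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.lin = torch.nn.Linear(100, 10)

    def forward(self, x):
        return torch.nn.functional.relu(self.lin(x))


mod = TinyBlock().to("cuda").half()
with torch.inference_mode():
    compiled_mod = infer_compiler.compile(
        mod,
        backend="nexfort",
        options={"mode": "max-optimize:max-autotune:low-precision", "dynamic": True},
    )
    print(compiled_mod(torch.randn(10, 100, device="cuda").half()).shape)
```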
+
+### Download relevant models
+
+- Step 1: Get a User Access Token here: https://huggingface.co/settings/tokens
+
+- Step 2: Download the relevant models
+```shell
+export ACCESS_TOKEN="<your User Access Token>"
+wget --header="Authorization: Bearer $ACCESS_TOKEN" \
+https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium.safetensors -O models/checkpoints/sd3_medium.safetensors
+
+wget --header="Authorization: Bearer $ACCESS_TOKEN" \
+https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/clip_g.safetensors -O models/clip/clip_g.safetensors
+
+wget --header="Authorization: Bearer $ACCESS_TOKEN" \
+https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/clip_l.safetensors -O models/clip/clip_l.safetensors
+
+# wget --header="Authorization: Bearer $ACCESS_TOKEN" \
+# https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/t5xxl_fp16.safetensors -O models/clip/t5xxl_fp16.safetensors
+
+wget --header="Authorization: Bearer $ACCESS_TOKEN" \
+https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/t5xxl_fp8_e4m3fn.safetensors -O models/clip/t5xxl_fp8_e4m3fn.safetensors
+```
+
+
+## Usage Example
+### Run ComfyUI
+```shell
+# run comfyui
+# For CUDA Graph
+# export NEXFORT_FX_CUDAGRAPHS=1
+# For best performance
+export TORCHINDUCTOR_MAX_AUTOTUNE=1
+# Enable CUDNN benchmark
+export NEXFORT_FX_CONV_BENCHMARK=1
+# Faster float32 matmul
+export NEXFORT_FX_MATMUL_ALLOW_TF32=1
+# For graph cache to speedup compilation
+export TORCHINDUCTOR_FX_GRAPH_CACHE=1
+# For persistent cache dir
+export TORCHINDUCTOR_CACHE_DIR=~/.torchinductor_cache
+cd $COMFYUI_DIR && python main.py --gpu-only --disable-cuda-malloc
+```
+
+### WorkFlow
+Here is a very basic example of how to use it:
+[workflow_sd3_speedup.json](https://github.com/user-attachments/files/15907863/sd3_suppedup.json)
+![sd3_speedup_workflow](https://github.com/siliconflow/onediff/assets/109639975/c1e955ae-7cc5-4197-9635-7cc05d5fd7a6)
+
+
+## Performance Comparison
+
+- Tested on an NVIDIA GeForce RTX 4090, with an image size of 1024*1024, iterating 28 steps.
+- OneDiff[Nexfort] Compile mode:
+`max-optimize:max-autotune:low-precision`
+
+
+| Metric                                   | NVIDIA GeForce RTX 4090 (1024 * 1024) |
+| ---------------------------------------- | ------------------------------------- |
+| Data update date (yyyy-mm-dd)            | 2024-06-19                             |
+| PyTorch E2E time                         | 4.27 s                                 |
+| OneDiff E2E time                         | 3.17 s (-25.7%)                        |
+| PyTorch Max Mem Used                     | 18.445 GiB                             |
+| OneDiff Max Mem Used                     | 19.199 GiB                             |
+| PyTorch Warmup with Run time             | 10 s                                   |
+| OneDiff Warmup with Compilation time [1] | 209 s                                  |
+| OneDiff Warmup with Cache time           | 45 s                                   |
+
+[1] OneDiff Warmup with Compilation time is measured on an AMD EPYC 7543 32-Core Processor CPU. Note that this is just for reference; it varies a lot across different CPUs.
+
+
+- ✅ Multiple resolutions
+test with multiple resolutions and support shape switching in a single line of Python code
+```
+[print(f"Testing resolution: {h}x{w}") for h in [1024, 512, 768, 256] for w in [1024, 512, 768, 256]]
+```
+
+## Quality
+
+The following table compares the generated images (seed=1), with the Baseline (non-optimized) on the left and OneDiff (optimized) on the right:
+
+|                                                                                                                       |                                                                                                                      |
+| --------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------- |
+| ![sd3_baseline_00001_](https://github.com/siliconflow/onediff/assets/109639975/c86f2dc8-fc6f-4cc7-b85d-d4d973594ee6) | ![sd3_speedup_00001_](https://github.com/siliconflow/onediff/assets/109639975/c81b3fc9-d588-4ba1-9911-ae3a8a8d2454) |
diff --git a/onediff_comfy_nodes/sd3/main.py b/onediff_comfy_nodes/sd3/main.py
new file mode 100644
index 000000000..186a2d8ff
--- /dev/null
+++ b/onediff_comfy_nodes/sd3/main.py
@@ -0,0 +1,78 @@
+import json
+from urllib import request
+
+workflow_api_path = "./workflow_api.json"
+
+
+def queue_prompt(prompt):
+    p = {"prompt": prompt}
+    data = json.dumps(p).encode("utf-8")
+    req = request.Request(
+        "http://127.0.0.1:9999/prompt", data=data
+    )  # ComfyUI server port (matches the port ComfyUI was started with)
+    request.urlopen(req)
+
+
+with open(workflow_api_path, "r") as fp:
+    prompt = json.load(fp)
+
+
+def generate_texts(min_length=50, max_length=302):
+    # base prompt of about 50 words
+    base_text = "a female character with long, flowing hair that appears to be made of ethereal, swirling patterns resembling the Northern Lights or Aurora Borealis. The background is dominated by deep blues and purples, creating a mysterious and dramatic atmosphere. The character's face is serene, with pale skin and striking features. 
She" + + # Additional words pool + additional_words = [ + "gracefully", + "beautifully", + "elegant", + "radiant", + "mysteriously", + "vibrant", + "softly", + "gently", + "luminescent", + "sparkling", + "delicately", + "glowing", + "brightly", + "shimmering", + "enchanting", + "gloriously", + "magnificent", + "majestic", + "fantastically", + "dazzlingly", + ] + for i in range(min_length, max_length): + idx = i % len(additional_words) + base_text = base_text + " " + additional_words[idx] + yield base_text + + +generated_texts = list(generate_texts(max_length=101)) +generated_texts.reverse() + +cout = 0 +dimensions = [ + (1024, 1024), + (1024, 768), + (1024, 576), + (1024, 512), + (512, 1024), + (768, 512), + (512, 512), +] + +for width, height in dimensions: + # Set the width and height in the prompt + prompt["135"]["inputs"]["width"] = width + prompt["135"]["inputs"]["height"] = height + + # Loop through each generated text and send the prompt to the server + for text in generated_texts: + prompt["6"]["inputs"]["text"] = text + queue_prompt(prompt) + print(f"{cout=}") + cout += 1 + break diff --git a/onediff_comfy_nodes/sd3/requirements.txt b/onediff_comfy_nodes/sd3/requirements.txt new file mode 100644 index 000000000..7b86a7fae --- /dev/null +++ b/onediff_comfy_nodes/sd3/requirements.txt @@ -0,0 +1,74 @@ +accelerate==0.31.0 +aiohttp==3.9.5 +aiosignal==1.3.1 +async-timeout==4.0.3 +attrs==23.2.0 +certifi==2024.6.2 +cffi==1.16.0 +charset-normalizer==3.3.2 +click==8.1.7 +cryptography==42.0.8 +Deprecated==1.2.14 +diffusers==0.29.0 +einops==0.8.0 +filelock==3.15.1 +frozenlist==1.4.1 +fsspec==2024.6.0 +gitdb==4.0.11 +GitPython==3.1.43 +huggingface-hub==0.23.3 +idna==3.7 +importlib_metadata==7.1.0 +Jinja2==3.1.4 +markdown-it-py==3.0.0 +MarkupSafe==2.1.5 +matrix-client==0.4.0 +mdurl==0.1.2 +mpmath==1.3.0 +multidict==6.0.5 +networkx==3.3 +nexfort==0.1.dev242 +numpy==1.26.4 +nvidia-cublas-cu12==12.1.3.1 +nvidia-cuda-cupti-cu12==12.1.105 +nvidia-cuda-nvrtc-cu12==12.1.105 +nvidia-cuda-runtime-cu12==12.1.105 +nvidia-cudnn-cu12==8.9.2.26 +nvidia-cufft-cu12==11.0.2.54 +nvidia-curand-cu12==10.3.2.106 +nvidia-cusolver-cu12==11.4.5.107 +nvidia-cusparse-cu12==12.1.0.106 +nvidia-nccl-cu12==2.20.5 +nvidia-nvjitlink-cu12==12.5.40 +nvidia-nvtx-cu12==12.1.105 +packaging==24.1 +pillow==10.3.0 +psutil==5.9.8 +pycparser==2.22 +Pygments==2.18.0 +PyJWT==2.8.0 +PyNaCl==1.5.0 +PyYAML==6.0.1 +regex==2024.5.15 +requests==2.32.3 +rich==13.7.1 +safetensors==0.4.3 +scipy==1.13.1 +shellingham==1.5.4 +smmap==5.0.1 +sympy==1.12.1 +tokenizers==0.19.1 +torch==2.3.1 +torchaudio==2.3.1 +torchsde==0.2.6 +torchvision==0.18.1 +tqdm==4.66.4 +trampoline==0.1.2 +transformers==4.41.2 +triton==2.3.1 +typer==0.12.3 +typing_extensions==4.12.2 +urllib3==1.26.18 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.19.2 From 366e222bae645d0fa9c5d16465a7f4eb7fdfd4ad Mon Sep 17 00:00:00 2001 From: FengWen Date: Tue, 25 Jun 2024 11:55:06 +0800 Subject: [PATCH 6/9] del onediff_comfy_nodes/sd3_demo/ --- onediff_comfy_nodes/sd3_demo/README.md | 135 ------------------ onediff_comfy_nodes/sd3_demo/main.py | 70 --------- onediff_comfy_nodes/sd3_demo/requirements.txt | 74 ---------- 3 files changed, 279 deletions(-) delete mode 100644 onediff_comfy_nodes/sd3_demo/README.md delete mode 100644 onediff_comfy_nodes/sd3_demo/main.py delete mode 100644 onediff_comfy_nodes/sd3_demo/requirements.txt diff --git a/onediff_comfy_nodes/sd3_demo/README.md b/onediff_comfy_nodes/sd3_demo/README.md deleted file mode 100644 index 94f346bbe..000000000 --- 
a/onediff_comfy_nodes/sd3_demo/README.md +++ /dev/null @@ -1,135 +0,0 @@ -## Accelerate SD3 by using onediff -huggingface: https://huggingface.co/stabilityai/stable-diffusion-3-medium - - -### Feature -- ✅ Multiple resolutions - -### Performance -### Metric - -- Testing on NVIDIA GeForce RTX 4090, with image size of 1024*1024, iterating 28 steps. -- OneDiff[Nexfort] Compile mode: -`max-optimize:max-autotune:low-precision` - - -| Metric | NVIDIA GeForce RTX 4090 (1024 * 1024) | -| ------------------------------------------------ | ------------------------------------- | -| Data update date(yyyy-mm-dd) | 2024-06-19 | -| PyTorch E2E time | 4.27 s | -| OneDiff E2E time | 3.17 s(-25.7%) | -| PyTorch Max Mem Used | 18.445GiB | -| OneDiff Max Mem Used | 19.199GiB | -| PyTorch Warmup with Run time | 10s | -| OneDiff Warmup with Compilation time1 | 209s | -| OneDiff Warmup with Cache time | 45s | - - 1 OneDiff Warmup with Compilation time is tested on AMD EPYC 7543 32-Core Processor CPU. Note this is just for reference, and it varies a lot on different CPU. - - -The following table shows the comparison of the plot, seed=1, Baseline (non optimized) on the left, and OneDiff (optimized) on the right - -| | | -| -------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- | -| ![sd3_baseline_00001_](https://github.com/siliconflow/onediff/assets/109639975/c86f2dc8-fc6f-4cc7-b85d-d4d973594ee6) | ![sd3_speedup_00001_](https://github.com/siliconflow/onediff/assets/109639975/c81b3fc9-d588-4ba1-9911-ae3a8a8d2454) | - - -### Multiple resolutions -test with multiple resolutions and support shape switching in a single line of Python code -``` -[print(f"Testing resolution: {h}x{w}") for h in [1024, 512, 768, 256] for w in [1024, 512, 768, 256]] -``` -## Environment setup -### Set SD3 requirements -```shell -# python 3.10 -COMFYUI_DIR=$pwd/ComfyUI -# install ComfyUI -git clone https://github.com/comfyanonymous/ComfyUI.git - -# install onediff & onediff_comfy_nodes -git clone https://github.com/siliconflow/onediff.git -cd onediff && pip install -r onediff_comfy_nodes/sd3_demo/requirements.txt && pip install -e . -ln -s $pwd/onediff/onediff_comfy_nodes $COMFYUI_DIR/custom_nodes -``` - -
- test_install.py - -```python -# Compile arbitrary models (torch.nn.Module) -import torch -from onediff.utils.import_utils import is_nexfort_available -assert is_nexfort_available() == True - -import onediff.infer_compiler as infer_compiler - -class MyModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.lin = torch.nn.Linear(100, 10) - - def forward(self, x): - return torch.nn.functional.relu(self.lin(x)) - -mod = MyModule().to("cuda").half() -with torch.inference_mode(): - compiled_mod = infer_compiler.compile(mod, - backend="nexfort", - options={"mode": "max-autotune:cudagraphs", "dynamic": True, "fullgraph": True}, - ) - print(compiled_mod(torch.randn(10, 100, device="cuda").half()).shape) - -print("Successfully installed~") -``` - -
- -### Download relevant models - -- step1: Get User Access Tokens here https://huggingface.co/settings/tokens - -- step2: Download relevant models -```shell -export ACCESS_TOKEN="User Access Tokens" -wget --header="Authorization: Bearer $ACCESS_TOKEN" \ -https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium.safetensors -O models/checkpoints/sd3_medium.safetensors - -wget --header="Authorization: Bearer $ACCESS_TOKEN" \ -https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/clip_g.safetensors -O models/clip/clip_g.safetensors - -wget --header="Authorization: Bearer $ACCESS_TOKEN" \ -https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/clip_l.safetensors -O models/clip/clip_l.safetensors - -# wget --header="Authorization: Bearer $ACCESS_TOKEN" \ -# https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/t5xxl_fp16.safetensors -O models/clip/t5xxl_fp16.safetensors - -wget --header="Authorization: Bearer $ACCESS_TOKEN" \ -https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/t5xxl_fp8_e4m3fn.safetensors -O models/clip/t5xxl_fp8_e4m3fn.safetensors -``` - -## Usage Example - -### Run ComfyUI -```shell -# run comfyui -# For CUDA Graph -# export NEXFORT_FX_CUDAGRAPHS=1 -# For best performance -export TORCHINDUCTOR_MAX_AUTOTUNE=1 -# Enable CUDNN benchmark -export NEXFORT_FX_CONV_BENCHMARK=1 -# Faster float32 matmul -export NEXFORT_FX_MATMUL_ALLOW_TF32=1 -# For graph cache to speedup compilation -export TORCHINDUCTOR_FX_GRAPH_CACHE=1 -# For persistent cache dir -export TORCHINDUCTOR_CACHE_DIR=~/.torchinductor_cache -cd $COMFYUI_DIR && python main.py --gpu-only --disable-cuda-malloc -``` - -### WorkFlow -Here is a very basic example how to use it: -[workflow_sd3_speedup.json](https://github.com/user-attachments/files/15907863/sd3_suppedup.json) -![sd3_speedup_workflow](https://github.com/siliconflow/onediff/assets/109639975/c1e955ae-7cc5-4197-9635-7cc05d5fd7a6) - diff --git a/onediff_comfy_nodes/sd3_demo/main.py b/onediff_comfy_nodes/sd3_demo/main.py deleted file mode 100644 index 89899eb4f..000000000 --- a/onediff_comfy_nodes/sd3_demo/main.py +++ /dev/null @@ -1,70 +0,0 @@ -import json -from urllib import request - -workflow_api_path = "./workflow_api.json" - -def queue_prompt(prompt): - p = {"prompt": prompt} - data = json.dumps(p).encode('utf-8') - req = request.Request("http://127.0.0.1:9999/prompt", data=data) # comfyui start port - request.urlopen(req) - -with open(workflow_api_path, "r") as fp: - prompt = json.load(fp) - - -def generate_texts(min_length=50, max_length=302): - # 50 world - base_text = "a female character with long, flowing hair that appears to be made of ethereal, swirling patterns resembling the Northern Lights or Aurora Borealis. The background is dominated by deep blues and purples, creating a mysterious and dramatic atmosphere. The character's face is serene, with pale skin and striking features. 
She" - - # Additional words pool - additional_words = [ - "gracefully", - "beautifully", - "elegant", - "radiant", - "mysteriously", - "vibrant", - "softly", - "gently", - "luminescent", - "sparkling", - "delicately", - "glowing", - "brightly", - "shimmering", - "enchanting", - "gloriously", - "magnificent", - "majestic", - "fantastically", - "dazzlingly", - ] - for i in range(min_length, max_length): - idx = i % len(additional_words) - base_text = base_text + " " + additional_words[idx] - yield base_text - - - -generated_texts = list(generate_texts(max_length=101)) -generated_texts.reverse() - -cout = 0 -dimensions = [ - (1024, 1024), (1024, 768), (1024, 576), - (1024, 512), (512, 1024), (768, 512), (512, 512) -] - -for width, height in dimensions: - # Set the width and height in the prompt - prompt["135"]["inputs"]["width"] = width - prompt["135"]["inputs"]["height"] = height - - # Loop through each generated text and send the prompt to the server - for text in generated_texts: - prompt["6"]["inputs"]["text"] = text - queue_prompt(prompt) - print(f'{cout=}') - cout += 1 - break \ No newline at end of file diff --git a/onediff_comfy_nodes/sd3_demo/requirements.txt b/onediff_comfy_nodes/sd3_demo/requirements.txt deleted file mode 100644 index 7b86a7fae..000000000 --- a/onediff_comfy_nodes/sd3_demo/requirements.txt +++ /dev/null @@ -1,74 +0,0 @@ -accelerate==0.31.0 -aiohttp==3.9.5 -aiosignal==1.3.1 -async-timeout==4.0.3 -attrs==23.2.0 -certifi==2024.6.2 -cffi==1.16.0 -charset-normalizer==3.3.2 -click==8.1.7 -cryptography==42.0.8 -Deprecated==1.2.14 -diffusers==0.29.0 -einops==0.8.0 -filelock==3.15.1 -frozenlist==1.4.1 -fsspec==2024.6.0 -gitdb==4.0.11 -GitPython==3.1.43 -huggingface-hub==0.23.3 -idna==3.7 -importlib_metadata==7.1.0 -Jinja2==3.1.4 -markdown-it-py==3.0.0 -MarkupSafe==2.1.5 -matrix-client==0.4.0 -mdurl==0.1.2 -mpmath==1.3.0 -multidict==6.0.5 -networkx==3.3 -nexfort==0.1.dev242 -numpy==1.26.4 -nvidia-cublas-cu12==12.1.3.1 -nvidia-cuda-cupti-cu12==12.1.105 -nvidia-cuda-nvrtc-cu12==12.1.105 -nvidia-cuda-runtime-cu12==12.1.105 -nvidia-cudnn-cu12==8.9.2.26 -nvidia-cufft-cu12==11.0.2.54 -nvidia-curand-cu12==10.3.2.106 -nvidia-cusolver-cu12==11.4.5.107 -nvidia-cusparse-cu12==12.1.0.106 -nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.5.40 -nvidia-nvtx-cu12==12.1.105 -packaging==24.1 -pillow==10.3.0 -psutil==5.9.8 -pycparser==2.22 -Pygments==2.18.0 -PyJWT==2.8.0 -PyNaCl==1.5.0 -PyYAML==6.0.1 -regex==2024.5.15 -requests==2.32.3 -rich==13.7.1 -safetensors==0.4.3 -scipy==1.13.1 -shellingham==1.5.4 -smmap==5.0.1 -sympy==1.12.1 -tokenizers==0.19.1 -torch==2.3.1 -torchaudio==2.3.1 -torchsde==0.2.6 -torchvision==0.18.1 -tqdm==4.66.4 -trampoline==0.1.2 -transformers==4.41.2 -triton==2.3.1 -typer==0.12.3 -typing_extensions==4.12.2 -urllib3==1.26.18 -wrapt==1.16.0 -yarl==1.9.4 -zipp==3.19.2 From 2f1e40dc01eca172791fe6abfd80b21c4781898c Mon Sep 17 00:00:00 2001 From: FengWen Date: Tue, 25 Jun 2024 13:56:27 +0800 Subject: [PATCH 7/9] Structure Refine --- onediff_comfy_nodes/sd3/README.md | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/onediff_comfy_nodes/sd3/README.md b/onediff_comfy_nodes/sd3/README.md index 5ef838d0b..05fa747e1 100644 --- a/onediff_comfy_nodes/sd3/README.md +++ b/onediff_comfy_nodes/sd3/README.md @@ -74,15 +74,6 @@ https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_e ## Usage Example ### Run ComfyUI ```shell -# run comfyui -# For CUDA Graph -# export NEXFORT_FX_CUDAGRAPHS=1 -# For best performance -export 
TORCHINDUCTOR_MAX_AUTOTUNE=1
-# Enable CUDNN benchmark
-export NEXFORT_FX_CONV_BENCHMARK=1
-# Faster float32 matmul
-export NEXFORT_FX_MATMUL_ALLOW_TF32=1
 # For graph cache to speedup compilation
 export TORCHINDUCTOR_FX_GRAPH_CACHE=1
 # For persistent cache dir
@@ -118,11 +109,13 @@ cd $COMFYUI_DIR && python main.py --gpu-only --disable-cuda-malloc
 ```
 
 ### WorkFlow
 Here is a very basic example of how to use it:
 [workflow_sd3_speedup.json](https://github.com/user-attachments/files/15907863/sd3_suppedup.json)
 ![sd3_speedup_workflow](https://github.com/siliconflow/onediff/assets/109639975/c1e955ae-7cc5-4197-9635-7cc05d5fd7a6)
 
 
-- ✅ Multiple resolutions
-test with multiple resolutions and support shape switching in a single line of Python code
-```
-[print(f"Testing resolution: {h}x{w}") for h in [1024, 512, 768, 256] for w in [1024, 512, 768, 256]]
-```
+## Dynamic shape for SD3
+
+**Q: How to use different resolutions in a production environment?**
+
+A: Warmup: Perform inference at different resolutions before deployment to ensure stability and performance; (utilize NVIDIA AUTO TUNE: Automatically optimize GPU settings when switching resolutions to enhance performance.)
+
+
 
 ## Quality
 
From 957d9c753fdb732dcc67024e7c9b2a50b0845b93 Mon Sep 17 00:00:00 2001
From: FengWen
Date: Tue, 25 Jun 2024 14:33:40 +0800
Subject: [PATCH 8/9] refine

---
 onediff_comfy_nodes/sd3/README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/onediff_comfy_nodes/sd3/README.md b/onediff_comfy_nodes/sd3/README.md
index 05fa747e1..5c001bdfd 100644
--- a/onediff_comfy_nodes/sd3/README.md
+++ b/onediff_comfy_nodes/sd3/README.md
@@ -113,9 +113,13 @@ Here is a very basic example of how to use it:
 
 **Q: How to use different resolutions in a production environment?**
 
-A: Warmup: Perform inference at different resolutions before deployment to ensure stability and performance; (utilize NVIDIA AUTO TUNE: Automatically optimize GPU settings when switching resolutions to enhance performance.)
+A: Warmup: Perform inference at different resolutions before deployment to ensure stability and performance.
 
+**Q: Why is warmup necessary when switching resolutions?**
+
+A: Warmup is necessary because autotuning automatically optimizes GPU settings for each resolution during this process, which keeps inference efficient when switching resolutions.
+
 
 ## Quality
 
From d0e59869aeb19645b06d02ffe7659e13a4eea631 Mon Sep 17 00:00:00 2001
From: FengWen <109639975+ccssu@users.noreply.github.com>
Date: Tue, 25 Jun 2024 15:25:23 +0800
Subject: [PATCH 9/9] Update README.md

---
 onediff_comfy_nodes/sd3/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onediff_comfy_nodes/sd3/README.md b/onediff_comfy_nodes/sd3/README.md
index 5c001bdfd..845d2074b 100644
--- a/onediff_comfy_nodes/sd3/README.md
+++ b/onediff_comfy_nodes/sd3/README.md
@@ -11,7 +11,7 @@ git clone https://github.com/comfyanonymous/ComfyUI.git
 # install onediff & onediff_comfy_nodes
 git clone https://github.com/siliconflow/onediff.git
-cd onediff && pip install -r onediff_comfy_nodes/sd3_demo/requirements.txt && pip install -e .
+cd onediff && pip install -r onediff_comfy_nodes/sd3/requirements.txt && pip install -e .
 ln -s $pwd/onediff/onediff_comfy_nodes $COMFYUI_DIR/custom_nodes
 ```
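As a closing illustration of the warmup recommended in the dynamic-shape Q&A above, the sketch below queues one generation per resolution before serving real traffic, following the same pattern as the `sd3/main.py` script added in this series. The ComfyUI port (9999), the node IDs ("135" for the latent size and "6" for the prompt text), and the `workflow_api.json` path are taken from that script and will differ for other workflows.

```python
# Warmup sketch based on sd3/main.py above: run each target resolution once so
# kernels are tuned (and the compilation cache is populated) before deployment.
import json
from urllib import request

COMFYUI_URL = "http://127.0.0.1:9999/prompt"  # port ComfyUI was started with
RESOLUTIONS = [(1024, 1024), (1024, 768), (768, 512), (512, 512)]

with open("./workflow_api.json", "r") as fp:  # exported from the speedup workflow
    prompt = json.load(fp)

for width, height in RESOLUTIONS:
    prompt["135"]["inputs"]["width"] = width   # node IDs follow workflow_api.json
    prompt["135"]["inputs"]["height"] = height
    prompt["6"]["inputs"]["text"] = "warmup prompt"
    data = json.dumps({"prompt": prompt}).encode("utf-8")
    request.urlopen(request.Request(COMFYUI_URL, data=data))
    print(f"queued warmup for {width}x{height}")
```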