
Fix onediff_comfy_nodes/sd3_demo/README.md #949

Merged (12 commits, Jun 25, 2024)
64 changes: 51 additions & 13 deletions onediff_comfy_nodes/sd3_demo/README.md
@@ -6,15 +6,25 @@ huggingface: https://huggingface.co/stabilityai/stable-diffusion-3-medium
- ✅ Multiple resolutions

### Performance
### Metric

- Timings for 28 steps at 1024x1024
- OneDiff[Nexfort] Compile mode: max-optimize:max-autotune:low-precision
- Testing on an NVIDIA A800-SXM4-80GB with an image size of 1024x1024, iterating 28 steps.
- OneDiff[Nexfort] Compile mode:
`max-optimize:max-autotune:low-precision`

| Accelerator | Baseline (non-optimized) | OneDiff (optimized) | Percentage improvement |
| --------------------- | ------------------------ | ------------------- | ---------------------- |
| NVIDIA A800-SXM4-80GB | ~4.03 sec | ~2.93 sec | ~27.29 % |

| Metric | NVIDIA GeForce RTX 4090 (1024 * 1024) |
| ------------------------------------------------ | ------------------------------------- |
| Data update date(yyyy-mm-dd) | 2024-06-19 |
| PyTorch E2E time | 4.27 s |
| OneDiff E2E time | 3.17 s(-25.7%) |
| PyTorch Max Mem Used | 18.445GiB |
| OneDiff Max Mem Used | 19.199GiB |
| PyTorch Warmup with Run time | 10s |
| OneDiff Warmup with Compilation time<sup>1</sup> | 209s |
| OneDiff Warmup with Cache time | 45s |

<sup>1</sup> OneDiff warmup with compilation time was measured on an AMD EPYC 7543 32-Core Processor. This figure is for reference only; it varies considerably across CPUs.
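As a quick sanity check of the percentage-improvement figures quoted in the tables above (the rows round slightly differently), the reduction can be recomputed:

```python
# Recompute the end-to-end speedup percentages from the tables above.
def improvement(baseline_s: float, optimized_s: float) -> float:
    """Percentage reduction in E2E time relative to the baseline."""
    return (baseline_s - optimized_s) / baseline_s * 100.0

print(f"A800:     {improvement(4.03, 2.93):.2f}%")  # ~27.3%
print(f"RTX 4090: {improvement(4.27, 3.17):.2f}%")  # ~25.8%
```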


The following comparison shows generated images at seed=1: Baseline (non-optimized) on the left, OneDiff (optimized) on the right.
@@ -29,18 +39,18 @@
We test multiple resolutions, switching shapes in a single line of Python:
```python
[print(f"Testing resolution: {h}x{w}") for h in [1024, 512, 768, 256] for w in [1024, 512, 768, 256]]
```
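The comprehension above is used only for its printing side effect; an equivalent explicit loop makes the 16 tested combinations clearer:

```python
# Same 4x4 grid of resolutions as the one-liner above.
sizes = [1024, 512, 768, 256]
resolutions = [(h, w) for h in sizes for w in sizes]

for h, w in resolutions:
    print(f"Testing resolution: {h}x{w}")
```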

## Usage Example

### Install

## Environment setup
### Set SD3 requirements
```shell
# Requires Python 3.10
COMFYUI_DIR=$(pwd)/ComfyUI
# install ComfyUI
git clone https://github.com/comfyanonymous/ComfyUI.git

# install onediff & onediff_comfy_nodes
git clone https://github.com/siliconflow/onediff.git
cd onediff && pip install -r onediff_comfy_nodes/sd3_demo/requirements.txt && pip install -e .
ln -s "$(pwd)/onediff_comfy_nodes" "$COMFYUI_DIR/custom_nodes"
```

<details close>
@@ -72,13 +82,39 @@ with torch.inference_mode():

print("Successfully installed~")
```

</details>

### Download relevant models

- Step 1: Get a User Access Token from https://huggingface.co/settings/tokens

- Step 2: Download the relevant models
```shell
export ACCESS_TOKEN="<your User Access Token>"
wget --header="Authorization: Bearer $ACCESS_TOKEN" \
https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/sd3_medium.safetensors -O models/checkpoints/sd3_medium.safetensors

wget --header="Authorization: Bearer $ACCESS_TOKEN" \
https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/clip_g.safetensors -O models/clip/clip_g.safetensors

wget --header="Authorization: Bearer $ACCESS_TOKEN" \
https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/clip_l.safetensors -O models/clip/clip_l.safetensors

# wget --header="Authorization: Bearer $ACCESS_TOKEN" \
# https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/t5xxl_fp16.safetensors -O models/clip/t5xxl_fp16.safetensors

wget --header="Authorization: Bearer $ACCESS_TOKEN" \
https://huggingface.co/stabilityai/stable-diffusion-3-medium/resolve/main/text_encoders/t5xxl_fp8_e4m3fn.safetensors -O models/clip/t5xxl_fp8_e4m3fn.safetensors
```
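As an alternative to the wget commands above, the same files can be fetched with the huggingface_hub client library. This is an addition, not part of the repo's instructions; it assumes `pip install huggingface_hub`, and note that `hf_hub_download` preserves the repo subdirectory (e.g. `text_encoders/`) under `local_dir`:

```python
import os

REPO = "stabilityai/stable-diffusion-3-medium"

# (remote path in the repo, local target directory) -- mirrors the wget calls above
FILES = [
    ("sd3_medium.safetensors", "models/checkpoints"),
    ("text_encoders/clip_g.safetensors", "models/clip"),
    ("text_encoders/clip_l.safetensors", "models/clip"),
    ("text_encoders/t5xxl_fp8_e4m3fn.safetensors", "models/clip"),
]

def download_all(token: str) -> None:
    # Imported lazily so the file list can be inspected without the dependency.
    from huggingface_hub import hf_hub_download
    for filename, target_dir in FILES:
        hf_hub_download(repo_id=REPO, filename=filename,
                        token=token, local_dir=target_dir)

# Usage: download_all(os.environ["ACCESS_TOKEN"])
```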

## Usage Example

### Run ComfyUI
```shell
# run ComfyUI
# For CUDA Graph (optional)
# export NEXFORT_FX_CUDAGRAPHS=1
# For best performance
export TORCHINDUCTOR_MAX_AUTOTUNE=1
# Enable CUDNN benchmark
# (collapsed diff: @@ -93,5 +129,7 @@)
cd $COMFYUI_DIR && python main.py --gpu-only --disable-cuda-malloc
```

### WorkFlow
![WorkFlow](https://github.com/siliconflow/onediff/assets/109639975/a385fac5-1f82-4905-a941-4c71ff1c616e)
Here is a very basic example of how to use it:
[workflow_sd3_speedup.json](https://github.com/user-attachments/files/15907863/sd3_suppedup.json)
![sd3_speedup_workflow](https://github.com/siliconflow/onediff/assets/109639975/c1e955ae-7cc5-4197-9635-7cc05d5fd7a6)

70 changes: 70 additions & 0 deletions onediff_comfy_nodes/sd3_demo/main.py
@@ -0,0 +1,70 @@
import json
from urllib import request

workflow_api_path = "./workflow_api.json"

def queue_prompt(prompt):
    p = {"prompt": prompt}
    data = json.dumps(p).encode("utf-8")
    req = request.Request("http://127.0.0.1:9999/prompt", data=data)  # ComfyUI server port
    request.urlopen(req)

with open(workflow_api_path, "r") as fp:
    prompt = json.load(fp)


def generate_texts(min_length=50, max_length=302):
    # A ~50-word base description, extended one word at a time below
    base_text = "a female character with long, flowing hair that appears to be made of ethereal, swirling patterns resembling the Northern Lights or Aurora Borealis. The background is dominated by deep blues and purples, creating a mysterious and dramatic atmosphere. The character's face is serene, with pale skin and striking features. She"

    # Additional words pool
    additional_words = [
        "gracefully",
        "beautifully",
        "elegant",
        "radiant",
        "mysteriously",
        "vibrant",
        "softly",
        "gently",
        "luminescent",
        "sparkling",
        "delicately",
        "glowing",
        "brightly",
        "shimmering",
        "enchanting",
        "gloriously",
        "magnificent",
        "majestic",
        "fantastically",
        "dazzlingly",
    ]
    for i in range(min_length, max_length):
        idx = i % len(additional_words)
        base_text = base_text + " " + additional_words[idx]
        yield base_text



generated_texts = list(generate_texts(max_length=101))
generated_texts.reverse()

count = 0
dimensions = [
    (1024, 1024), (1024, 768), (1024, 576),
    (1024, 512), (512, 1024), (768, 512), (512, 512)
]

for width, height in dimensions:
    # Set the width and height in the prompt
    prompt["135"]["inputs"]["width"] = width
    prompt["135"]["inputs"]["height"] = height

    # Loop through each generated text and send the prompt to the server
    for text in generated_texts:
        prompt["6"]["inputs"]["text"] = text
        queue_prompt(prompt)
        print(f"{count=}")
        count += 1
    break  # only the first resolution is exercised in this demo