diff --git a/onediff_diffusers_extensions/examples/sd3/README.md b/onediff_diffusers_extensions/examples/sd3/README.md
index 79149df3e..71d9c72f7 100644
--- a/onediff_diffusers_extensions/examples/sd3/README.md
+++ b/onediff_diffusers_extensions/examples/sd3/README.md
@@ -1,5 +1,17 @@
# Run SD3 with nexfort backend (Beta Release)
+1. [Environment Setup](#environment-setup)
+ - [Set Up OneDiff](#set-up-onediff)
+ - [Set Up NexFort Backend](#set-up-nexfort-backend)
+ - [Set Up Diffusers Library](#set-up-diffusers-library)
+ - [Download SD3 Model for Diffusers](#download-sd3-model-for-diffusers)
+2. [Execution Instructions](#execution-instructions)
+ - [Run Without Compilation (Baseline)](#run-without-compilation-baseline)
+ - [Run With Compilation](#run-with-compilation)
+3. [Performance Comparison](#performance-comparison)
+4. [Dynamic Shape for SD3](#dynamic-shape-for-sd3)
+5. [Quality](#quality)
+
## Environment setup
### Set up onediff
https://github.com/siliconflow/onediff?tab=readme-ov-file#installation
@@ -10,10 +22,11 @@ https://github.com/siliconflow/onediff/tree/main/src/onediff/infer_compiler/back
### Set up diffusers
```
-pip install git+https://github.com/huggingface/diffusers.git@main
+# Ensure diffusers includes the SD3 pipeline.
+pip3 install --upgrade diffusers[torch]
```
### Set up SD3
-Model version for diffusers: https://huggingface.co/stabilityai/stable-diffusion-3-medium/tree/refs%2Fpr%2F26
+Model version for diffusers: https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers
HF pipeline: https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md
@@ -29,25 +42,44 @@ python3 onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py \
```
python3 onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py \
- --compiler-config '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision:cudagraphs", "memory_format": "channels_last"}' \
+ --compiler-config '{"mode": "max-optimize:max-autotune:low-precision:cache-all:freezing:benchmark", "memory_format": "channels_last"}' \
--saved-image sd3_compile.png
```
## Performance comparation
-Testing on H800, with image size of 1024*1024, iterating 28 steps.
-
-| | Iteration speed | E2E Inference Time | Max CUDA Memory Used |
-| --------------- | -------------------- | ------------------ | -------------------- |
-| Baseline | 15.56 it/s | 1.96 s | 18.784 GiB |
-| Nexfort compile | 25.91 it/s (+66.5%) | 1.15 s (-41.3%) | 18.324 GiB |
-
-Testing on A100-PCIE-40GB, with image size of 1024*1024, iterating 28 steps.
-
-| | Iteration speed | E2E Inference Time | Max CUDA Memory Used |
-| --------------- | ------------------ | ------------------ | -------------------- |
-| Baseline | 6.66 it/s | 4.50 s | 18.762 GiB |
-| Nexfort compile | 9.39 it/s (+40.9%) | 3.15 s (-30.0%) | 17.939 GiB |
+Testing on H800-NVL-80GB, with image size of 1024*1024, iterating 28 steps:
+| Metric                                           | NVIDIA H800-NVL-80GB (1024 * 1024)  |
+| ------------------------------------------------ | ----------------------------------- |
+| Data update date (yyyy-mm-dd)                    | 2024-06-24                          |
+| PyTorch iteration speed | 15.56 it/s |
+| OneDiff iteration speed | 25.91 it/s (+66.5%) |
+| PyTorch E2E time | 1.96 s |
+| OneDiff E2E time | 1.15 s (-41.3%) |
+| PyTorch Max Mem Used | 18.784 GiB |
+| OneDiff Max Mem Used | 18.324 GiB |
+| PyTorch Warmup with Run time | 2.86 s |
+| OneDiff Warmup with Compilation time1 | 889.25 s |
+| OneDiff Warmup with Cache time | 44.38 s |
+
+1 OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Platinum 8468. Note this is just for reference, and it varies a lot across different CPUs.
+
+
+Testing on 4090:
+| Metric                                           | NVIDIA GeForce RTX 4090 (1024 * 1024) |
+| ------------------------------------------------ | ----------------------------------- |
+| Data update date (yyyy-mm-dd)                    | 2024-06-24                          |
+| PyTorch iteration speed | 6.67 it/s |
+| OneDiff iteration speed | 12.24 it/s (+83.3%) |
+| PyTorch E2E time | 4.90 s |
+| OneDiff E2E time | 2.48 s (-49.4%) |
+| PyTorch Max Mem Used | 18.799 GiB |
+| OneDiff Max Mem Used | 17.902 GiB |
+| PyTorch Warmup with Run time | 4.99 s |
+| OneDiff Warmup with Compilation time2 | 302.79 s |
+| OneDiff Warmup with Cache time | 51.96 s |
+
+ 2 OneDiff Warmup with Compilation time is tested on AMD EPYC 7543 32-Core Processor. Note this is just for reference, and it varies a lot across different CPUs.
## Dynamic shape for SD3.
@@ -55,6 +87,7 @@ Testing on A100-PCIE-40GB, with image size of 1024*1024, iterating 28 steps.
Run:
```
+# The best practice mode configuration for dynamic shape is `max-optimize:max-autotune:low-precision`.
python3 onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py \
--compiler-config '{"mode": "max-optimize:max-autotune:low-precision", "memory_format": "channels_last", "dynamic": true}' \
--height 512 \
diff --git a/onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py b/onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py
index b651e1703..4809c9f07 100644
--- a/onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py
+++ b/onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py
@@ -14,7 +14,7 @@ def parse_args():
parser.add_argument(
"--model",
type=str,
- default="stabilityai/stable-diffusion-3-medium",
+ default="stabilityai/stable-diffusion-3-medium-diffusers",
help="Model path or identifier.",
)
parser.add_argument(
@@ -29,12 +29,21 @@ def parse_args():
default="photo of a dog and a cat both standing on a red box, with a blue ball in the middle with a parrot standing on top of the ball. The box has the text 'onediff'",
help="Prompt for the image generation.",
)
+ parser.add_argument(
+ "--negative_prompt",
+ type=str,
+ default="",
+ help="Negative prompt for the image generation.",
+ )
parser.add_argument(
"--height", type=int, default=1024, help="Height of the generated image."
)
parser.add_argument(
"--width", type=int, default=1024, help="Width of the generated image."
)
+ parser.add_argument(
+ "--guidance_scale", type=float, default=4.5, help="The scale factor for the guidance."
+ )
parser.add_argument(
"--num-inference-steps", type=int, default=28, help="Number of inference steps."
)
@@ -119,9 +128,12 @@ def warmup(self, gen_args, warmup_iterations=1):
warmup_args["generator"] = torch.Generator(device=device).manual_seed(0)
print("Starting warmup...")
+ start_time = time.time()
for _ in range(warmup_iterations):
self.pipe(**warmup_args)
+ end_time = time.time()
print("Warmup complete.")
+ print(f"Warmup time: {end_time - start_time:.2f} seconds")
def generate(self, gen_args):
gen_args["generator"] = torch.Generator(device=device).manual_seed(args.seed)
@@ -166,6 +178,8 @@ def main():
"num_inference_steps": args.num_inference_steps,
"height": args.height,
"width": args.width,
+ "guidance_scale": args.guidance_scale,
+ "negative_prompt": args.negative_prompt,
}
sd3.warmup(gen_args)