diff --git a/onediff_diffusers_extensions/examples/sd3/README.md b/onediff_diffusers_extensions/examples/sd3/README.md index 79149df3e..71d9c72f7 100644 --- a/onediff_diffusers_extensions/examples/sd3/README.md +++ b/onediff_diffusers_extensions/examples/sd3/README.md @@ -1,5 +1,17 @@ # Run SD3 with nexfort backend (Beta Release) +1. [Environment Setup](#environment-setup) + - [Set Up OneDiff](#set-up-onediff) + - [Set Up NexFort Backend](#set-up-nexfort-backend) + - [Set Up Diffusers Library](#set-up-diffusers-library) + - [Download SD3 Model for Diffusers](#download-sd3-model-for-diffusers) +2. [Execution Instructions](#execution-instructions) + - [Run Without Compilation (Baseline)](#run-without-compilation-baseline) + - [Run With Compilation](#run-with-compilation) +3. [Performance Comparison](#performance-comparison) +4. [Dynamic Shape for SD3](#dynamic-shape-for-sd3) +5. [Quality](#quality) + ## Environment setup ### Set up onediff https://github.com/siliconflow/onediff?tab=readme-ov-file#installation @@ -10,10 +22,11 @@ https://github.com/siliconflow/onediff/tree/main/src/onediff/infer_compiler/back ### Set up diffusers ``` -pip install git+https://github.com/huggingface/diffusers.git@main +# Ensure diffusers include the SD3 pipeline. 
+pip3 install --upgrade diffusers[torch]
 ```
 
 ### Set up SD3
-Model version for diffusers: https://huggingface.co/stabilityai/stable-diffusion-3-medium/tree/refs%2Fpr%2F26
+Model version for diffusers: https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers
 
 HF pipeline: https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_3.md
 
@@ -29,25 +42,44 @@ python3 onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py \
 ```
 python3 onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py \
-    --compiler-config '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision:cudagraphs", "memory_format": "channels_last"}' \
+    --compiler-config '{"mode": "max-optimize:max-autotune:low-precision:cache-all:freezing:benchmark", "memory_format": "channels_last"}' \
     --saved-image sd3_compile.png
 ```
 
-## Performance comparation
+## Performance comparison
-Testing on H800, with image size of 1024*1024, iterating 28 steps.
-
-|                 | Iteration speed      | E2E Inference Time | Max CUDA Memory Used |
-| --------------- | -------------------- | ------------------ | -------------------- |
-| Baseline        | 15.56 it/s           | 1.96 s             | 18.784 GiB           |
-| Nexfort compile | 25.91 it/s (+66.5%)  | 1.15 s (-41.3%)    | 18.324 GiB           |
-
-Testing on A100-PCIE-40GB, with image size of 1024*1024, iterating 28 steps.
-
-|                 | Iteration speed    | E2E Inference Time | Max CUDA Memory Used |
-| --------------- | ------------------ | ------------------ | -------------------- |
-| Baseline        | 6.66 it/s          | 4.50 s             | 18.762 GiB           |
-| Nexfort compile | 9.39 it/s (+40.9%) | 3.15 s (-30.0%)    | 17.939 GiB           |
+Testing on H800-NVL-80GB, with image size of 1024*1024, iterating 28 steps:
+| Metric                                           | NVIDIA H800-NVL-80GB (1024 * 1024)  |
+| ------------------------------------------------ | ----------------------------------- |
+| Data update date(yyyy-mm-dd)                     | 2024-06-24                          |
+| PyTorch iteration speed                          | 15.56 it/s                          |
+| OneDiff iteration speed                          | 25.91 it/s (+66.5%)                 |
+| PyTorch E2E time                                 | 1.96 s                              |
+| OneDiff E2E time                                 | 1.15 s (-41.3%)                     |
+| PyTorch Max Mem Used                             | 18.784 GiB                          |
+| OneDiff Max Mem Used                             | 18.324 GiB                          |
+| PyTorch Warmup with Run time                     | 2.86 s                              |
+| OneDiff Warmup with Compilation time<sup>1</sup> | 889.25 s                            |
+| OneDiff Warmup with Cache time                   | 44.38 s                             |
+
+<sup>1</sup> OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Platinum 8468. Note this is just for reference, and it varies a lot on different CPUs.
+
+
+Testing on 4090:
+| Metric                                           | NVIDIA GeForce RTX 4090 (1024 * 1024) |
+| ------------------------------------------------ | ------------------------------------- |
+| Data update date(yyyy-mm-dd)                     | 2024-06-24                            |
+| PyTorch iteration speed                          | 6.67 it/s                             |
+| OneDiff iteration speed                          | 12.24 it/s (+83.3%)                   |
+| PyTorch E2E time                                 | 4.90 s                                |
+| OneDiff E2E time                                 | 2.48 s (-49.4%)                       |
+| PyTorch Max Mem Used                             | 18.799 GiB                            |
+| OneDiff Max Mem Used                             | 17.902 GiB                            |
+| PyTorch Warmup with Run time                     | 4.99 s                                |
+| OneDiff Warmup with Compilation time<sup>2</sup> | 302.79 s                              |
+| OneDiff Warmup with Cache time                   | 51.96 s                               |
+
+<sup>2</sup> AMD EPYC 7543 32-Core Processor
 
 ## Dynamic shape for SD3.
 
@@ -55,6 +87,7 @@ Testing on A100-PCIE-40GB, with image size of 1024*1024, iterating 28 steps.
 
 Run:
 ```
+# The best practice mode configuration for dynamic shape is `max-optimize:max-autotune:low-precision`.
python3 onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py \ --compiler-config '{"mode": "max-optimize:max-autotune:low-precision", "memory_format": "channels_last", "dynamic": true}' \ --height 512 \ diff --git a/onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py b/onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py index b651e1703..4809c9f07 100644 --- a/onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py +++ b/onediff_diffusers_extensions/examples/sd3/text_to_image_sd3.py @@ -14,7 +14,7 @@ def parse_args(): parser.add_argument( "--model", type=str, - default="stabilityai/stable-diffusion-3-medium", + default="stabilityai/stable-diffusion-3-medium-diffusers", help="Model path or identifier.", ) parser.add_argument( @@ -29,12 +29,21 @@ def parse_args(): default="photo of a dog and a cat both standing on a red box, with a blue ball in the middle with a parrot standing on top of the ball. The box has the text 'onediff'", help="Prompt for the image generation.", ) + parser.add_argument( + "--negative_prompt", + type=str, + default="", + help="Negative prompt for the image generation.", + ) parser.add_argument( "--height", type=int, default=1024, help="Height of the generated image." ) parser.add_argument( "--width", type=int, default=1024, help="Width of the generated image." ) + parser.add_argument( + "--guidance_scale", type=float, default=4.5, help="The scale factor for the guidance." + ) parser.add_argument( "--num-inference-steps", type=int, default=28, help="Number of inference steps." 
) @@ -119,9 +128,12 @@ def warmup(self, gen_args, warmup_iterations=1): warmup_args["generator"] = torch.Generator(device=device).manual_seed(0) print("Starting warmup...") + start_time = time.time() for _ in range(warmup_iterations): self.pipe(**warmup_args) + end_time = time.time() print("Warmup complete.") + print(f"Warmup time: {end_time - start_time:.2f} seconds") def generate(self, gen_args): gen_args["generator"] = torch.Generator(device=device).manual_seed(args.seed) @@ -166,6 +178,8 @@ def main(): "num_inference_steps": args.num_inference_steps, "height": args.height, "width": args.width, + "guidance_scale": args.guidance_scale, + "negative_prompt": args.negative_prompt, } sd3.warmup(gen_args)