Support latte (#964)

onediff 适配 Latte T2V 模型， https://github.com/Vchitect/Latte 依赖nexfort siliconflow/nexfort#90 合并
siliconflow · Jul 1, 2024 · 31f1601 · 31f1601
1 parent c19064f
commit 31f1601
Show file tree

Hide file tree

Showing 4 changed files with 365 additions and 1 deletion.
diff --git a/benchmarks/text_to_video_latte.py b/benchmarks/text_to_video_latte.py
@@ -0,0 +1,275 @@
+MODEL = "maxin-cn/Latte-1"
+CKPT = "t2v_v20240523.pt"
+VARIANT = None
+CUSTOM_PIPELINE = None
+# SAMPLE_METHOD = "DDIM"
+BETA_START = 0.0001
+BETA_END = 0.02
+BREA_SCHEDULE = "linear"
+VARIANCE_TYPE = "learned_range"
+STEPS = 50
+SEED = 25
+WARMUPS = 1
+BATCH = 1
+HEIGHT = 512
+WIDTH = 512
+VIDEO_LENGTH = 16
+FPS = 8
+GUIDANCE_SCALE = 7.5
+ENABLE_TEMPORAL_ATTENTIONS = "true"
+ENABLE_VAE_TEMPORAL_DECODER = "true"
+OUTPUT_VIDEO = "output.mp4"
+
+PROMPT = "An epic tornado attacking above aglowing city at night."
+
+EXTRA_CALL_KWARGS = None
+ATTENTION_FP16_SCORE_ACCUM_MAX_M = 0
+
+COMPILER_CONFIG = None
+
+
+import os
+import importlib
+import inspect
+import argparse
+import time
+import json
+import random
+from PIL import Image, ImageDraw
+
+import torch
+import oneflow as flow
+from onediffx import compile_pipe, OneflowCompileOptions
+from diffusers.utils import load_image, export_to_video
+from diffusers.schedulers import DDIMScheduler
+from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder
+from transformers import T5EncoderModel, T5Tokenizer
+import imageio
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str, default=MODEL)
+    parser.add_argument("--ckpt", type=str, default=CKPT)
+    parser.add_argument("--prompt", type=str, default=PROMPT)
+    parser.add_argument("--save_graph", action="store_true")
+    parser.add_argument("--load_graph", action="store_true")
+    parser.add_argument("--variant", type=str, default=VARIANT)
+    parser.add_argument("--custom-pipeline", type=str, default=CUSTOM_PIPELINE)
+    # parser.add_argument("--sample-method", type=str, default=SAMPLE_METHOD)
+    parser.add_argument("--beta-start", type=float, default=BETA_START)
+    parser.add_argument("--beta-end", type=float, default=BETA_END)
+    parser.add_argument("--beta-schedule", type=str, default=BREA_SCHEDULE)
+    parser.add_argument(
+        "--enable_temporal_attentions",
+        type=(lambda x: str(x).lower() in ["true", "1", "yes"]),
+        default=ENABLE_TEMPORAL_ATTENTIONS,
+    )
+    parser.add_argument(
+        "--enable_vae_temporal_decoder",
+        type=(lambda x: str(x).lower() in ["true", "1", "yes"]),
+        default=ENABLE_VAE_TEMPORAL_DECODER,
+    )
+    parser.add_argument("--guidance-scale", type=float, default=GUIDANCE_SCALE)
+    parser.add_argument("--variance-type", type=str, default=VARIANCE_TYPE)
+    parser.add_argument("--steps", type=int, default=STEPS)
+    parser.add_argument("--seed", type=int, default=SEED)
+    parser.add_argument("--warmups", type=int, default=WARMUPS)
+    parser.add_argument("--batch", type=int, default=BATCH)
+    parser.add_argument("--height", type=int, default=HEIGHT)
+    parser.add_argument("--width", type=int, default=WIDTH)
+    parser.add_argument("--video-length", type=int, default=VIDEO_LENGTH)
+    parser.add_argument("--fps", type=int, default=FPS)
+    parser.add_argument("--extra-call-kwargs", type=str, default=EXTRA_CALL_KWARGS)
+    parser.add_argument("--output-video", type=str, default=OUTPUT_VIDEO)
+    parser.add_argument(
+        "--compiler",
+        type=str,
+        default="nexfort",
+        choices=["none", "nexfort", "compile"],
+    )
+    parser.add_argument(
+        "--compiler-config", type=str, default=COMPILER_CONFIG,
+    )
+    parser.add_argument(
+        "--attention-fp16-score-accum-max-m",
+        type=int,
+        default=ATTENTION_FP16_SCORE_ACCUM_MAX_M,
+    )
+    return parser.parse_args()
+
+
+class IterationProfiler:
+    def __init__(self):
+        self.begin = None
+        self.end = None
+        self.num_iterations = 0
+
+    def get_iter_per_sec(self):
+        if self.begin is None or self.end is None:
+            return None
+        self.end.synchronize()
+        dur = self.begin.elapsed_time(self.end)
+        return self.num_iterations / dur * 1000.0
+
+    def callback_on_step_end(self, pipe, i, t, callback_kwargs={}):
+        if self.begin is None:
+            event = torch.cuda.Event(enable_timing=True)
+            event.record()
+            self.begin = event
+        else:
+            event = torch.cuda.Event(enable_timing=True)
+            event.record()
+            self.end = event
+            self.num_iterations += 1
+        return callback_kwargs
+
+
+def main():
+    args = parse_args()
+
+    if os.path.exists(args.model):
+        model_path = args.model
+    else:
+        from huggingface_hub import snapshot_download
+
+        model_path = snapshot_download(repo_id=args.model)
+
+    torch.set_grad_enabled(False)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    from models.latte_t2v import LatteT2V
+    from sample.pipeline_latte  import LattePipeline
+
+    transformer_model = LatteT2V.from_pretrained(
+        model_path, subfolder="transformer", video_length=args.video_length
+    ).to(device, dtype=torch.float16)
+
+    if args.enable_vae_temporal_decoder:
+        vae = AutoencoderKLTemporalDecoder.from_pretrained(
+            args.model, subfolder="vae_temporal_decoder", torch_dtype=torch.float16
+        ).to(device)
+    else:
+        vae = AutoencoderKL.from_pretrained(
+            args.model, subfolder="vae", torch_dtype=torch.float16
+        ).to(device)
+    tokenizer = T5Tokenizer.from_pretrained(args.model, subfolder="tokenizer")
+    text_encoder = T5EncoderModel.from_pretrained(
+        args.model, subfolder="text_encoder", torch_dtype=torch.float16
+    ).to(device)
+
+    # set eval mode
+    transformer_model.eval()
+    vae.eval()
+    text_encoder.eval()
+
+    scheduler = DDIMScheduler.from_pretrained(
+        model_path,
+        subfolder="scheduler",
+        beta_start=args.beta_start,
+        beta_end=args.beta_end,
+        beta_schedule=args.beta_schedule,
+        variance_type=args.variance_type,
+        clip_sample=False,
+    )
+
+    pipe = LattePipeline(
+        vae=vae,
+        text_encoder=text_encoder,
+        tokenizer=tokenizer,
+        scheduler=scheduler,
+        transformer=transformer_model,
+    ).to(device)
+
+    if args.compiler == "none":
+        pass
+    elif args.compiler == "nexfort":
+        print("Nexfort backend is now active...")
+        if args.compiler_config is not None:
+            # config with dict
+            options = json.loads(args.compiler_config)
+        else:
+            # config with string
+            options = '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision",             \
+                "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, \
+                "triton.fuse_attention_allow_fp16_reduction": false}}'
+        pipe = compile_pipe(
+            pipe, backend="nexfort", options=options, fuse_qkv_projections=True
+        )
+    elif args.compiler == "compile":
+        if hasattr(pipe, "unet"):
+            pipe.unet = torch.compile(pipe.unet)
+        if hasattr(pipe, "transformer"):
+            pipe.transformer = torch.compile(pipe.transformer)
+        if hasattr(pipe, "controlnet"):
+            pipe.controlnet = torch.compile(pipe.controlnet)
+        pipe.vae = torch.compile(pipe.vae)
+    else:
+        raise ValueError(f"Unknown compiler: {args.compiler}")
+
+    def get_kwarg_inputs():
+        kwarg_inputs = dict(
+            prompt=args.prompt,
+            video_length=args.video_length,
+            height=args.height,
+            width=args.width,
+            num_inference_steps=args.steps,
+            guidance_scale=args.guidance_scale,
+            enable_temporal_attentions=args.enable_temporal_attentions,
+            num_images_per_prompt=1,
+            mask_feature=True,
+            enable_vae_temporal_decoder=args.enable_vae_temporal_decoder,
+            **(
+                dict()
+                if args.extra_call_kwargs is None
+                else json.loads(args.extra_call_kwargs)
+            ),
+        )
+        return kwarg_inputs
+
+    if args.warmups > 0:
+        print("=======================================")
+        print("Begin warmup")
+        begin = time.time()
+        for _ in range(args.warmups):
+            pipe(**get_kwarg_inputs()).video
+        end = time.time()
+        print("End warmup")
+        print(f"Warmup time: {end - begin:.3f}s")
+
+        print("=======================================")
+
+    kwarg_inputs = get_kwarg_inputs()
+    iter_profiler = IterationProfiler()
+    if "callback_on_step_end" in inspect.signature(pipe).parameters:
+        kwarg_inputs["callback_on_step_end"] = iter_profiler.callback_on_step_end
+    elif "callback" in inspect.signature(pipe).parameters:
+        kwarg_inputs["callback"] = iter_profiler.callback_on_step_end
+    torch.manual_seed(args.seed)
+    begin = time.time()
+    videos = pipe(**kwarg_inputs).video
+    end = time.time()
+
+    print(f"Inference time: {end - begin:.3f}s")
+    iter_per_sec = iter_profiler.get_iter_per_sec()
+    if iter_per_sec is not None:
+        print(f"Iterations per second: {iter_per_sec:.3f}")
+    cuda_mem_after_used = flow._oneflow_internal.GetCUDAMemoryUsed()
+    host_mem_after_used = flow._oneflow_internal.GetCPUMemoryUsed()
+    print(f"CUDA Mem after: {cuda_mem_after_used / 1024:.3f}GiB")
+    print(f"Host Mem after: {host_mem_after_used / 1024:.3f}GiB")
+
+    if args.output_video is not None:
+        # export_to_video(output_frames[0], args.output_video, fps=args.fps)
+        try:
+            imageio.mimwrite(
+                args.output_video, videos[0], fps=8, quality=9
+            )  # highest quality is 10, lowest is 0
+        except:
+            print("Error when saving {}".format(prompt))
+    else:
+        print("Please set `--output-video` to save the output video")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/imgs/latte_nexfort.gif b/imgs/latte_nexfort.gif
diff --git a/onediff_diffusers_extensions/examples/latte/README.md b/onediff_diffusers_extensions/examples/latte/README.md
@@ -0,0 +1,89 @@
+# Run Latte with nexfort backend(Beta Release)
+
+
+1. [Environment Setup](#environment-setup)
+   - [Set up Latte](#set-up-latte)
+   - [Set up nexfort backend](#set-up-nexfort-backend)
+   - [Set up onediff](#set-up-onediff)
+2. [Run](#run)
+   - [Run without compile](#run-without-compile)
+   - [Run with compile](#run-with-compile)
+3. [Performance Comparison](#performance-comparison)
+4. [Quality](#quality)
+
+## Environment setup
+### Set up Latte
+HF model: https://huggingface.co/maxin-cn/Latte-1
+```bash
+git clone -b run https://github.com/siliconflow/dit_latte/
+cd dit_latte
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Set up nexfort backend
+https://github.com/siliconflow/onediff/tree/main/src/onediff/infer_compiler/backends/nexfort
+
+### Set up onediff
+https://github.com/siliconflow/onediff?tab=readme-ov-file#installation
+
+## Run
+model_id_or_path_to_latte is the model id or model path of latte, such as `maxin-cn/Latte-1` or `/data/hf_models/Latte-1/`
+
+### Go to the onediff folder
+```
+cd onediff
+```
+
+### Run without compile(the original pytorch HF diffusers pipeline)
+```
+python3 ./benchmarks/text_to_video_latte.py \
+--model maxin-cn/Latte-1 \
+--steps 50 \
+--compiler none \
+----output-video ./latte.mp4 \
+--prompt "An epic tornado attacking above aglowing city at night."
+```
+
+### Run with compile
+```
+python3 ./benchmarks/text_to_video_latte.py \
+--model maxin-cn/Latte-1 \
+--steps 50 \
+--compiler nexfort \
+----output-video ./latte_compile.mp4 \
+--prompt "An epic tornado attacking above aglowing city at night."
+```
+
+## Performance Comparison
+
+### Metric
+
+#### On A100
+| Metric                                           | NVIDIA A100-PCIE-40GB (512 * 512) |
+| ------------------------------------------------ | ----------------------------------- |
+| Data update date(yyyy-mm-dd)                     | 2024-06-19                          |
+| PyTorch iteration speed                          | 1.60it/s                            |
+| OneDiff iteration speed                          | 2.27it/s(+41.9%)                    |
+| PyTorch E2E time                                 | 32.618s                             |
+| OneDiff E2E time                                 | 22.601s(-30.7%)                     |
+| PyTorch Max Mem Used                             | 28.208GiB                           |
+| OneDiff Max Mem Used                             | 24.753GiB                           |
+| PyTorch Warmup with Run time                     | 33.291s                             |
+| OneDiff Warmup with Compilation time<sup>1</sup> | 572.877s                            |
+| OneDiff Warmup with Cache time                   | 148.068s                            |
+
+ <sup>1</sup> OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz. Note this is just for reference, and it varies a lot on different CPU.
+
+#### nexfort compile config and warmup cost
+- compiler-config 
+  - setting `--compiler-config '{"mode": "max-optimize:max-autotune:freezing:benchmark:low-precision", "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, "triton.fuse_attention_allow_fp16_reduction": false}}` will help to make the best performance but the compilation time is about 572 seconds
+  - setting `--compiler-config '{"mode": "max-autotune", "memory_format": "channels_last", "options": {"inductor.optimize_linear_epilogue": false, "triton.fuse_attention_allow_fp16_reduction": false}}` will reduce compilation time to about 236 seconds and just slightly reduce the performance
+- fuse_qkv_projections: True
+
+## Quality
+
+When using nexfort as the backend for onediff compilation acceleration (right video), the generated video are lossless.
+
+<p align="center">
+<img src="../../../imgs/latte_nexfort.gif">
+</p>
diff --git a/onediff_diffusers_extensions/examples/pixart_alpha/README.md b/onediff_diffusers_extensions/examples/pixart_alpha/README.md
@@ -57,7 +57,7 @@ python3 ./benchmarks/text_to_image.py \
 --prompt "product photography, world of warcraft orc warrior, white background"
 ```
 
-## Performance comparation
+## Performance Comparison
 
 ### Metric