fix latte max mem used #1019

Merged: 3 commits on Jul 17, 2024 (showing changes from all commits)
16 changes: 8 additions & 8 deletions benchmarks/text_to_video_latte.py

```diff
@@ -38,9 +38,7 @@
 from PIL import Image, ImageDraw
 
 import torch
-import oneflow as flow
-from onediffx import compile_pipe, OneflowCompileOptions
-from diffusers.utils import load_image, export_to_video
+from onediffx import compile_pipe
 from diffusers.schedulers import DDIMScheduler
 from diffusers.models import AutoencoderKL, AutoencoderKLTemporalDecoder
 from transformers import T5EncoderModel, T5Tokenizer
@@ -250,14 +248,16 @@ def get_kwarg_inputs():
 videos = pipe(**kwarg_inputs).video
 end = time.time()
 
+print("=======================================")
 print(f"Inference time: {end - begin:.3f}s")
 iter_per_sec = iter_profiler.get_iter_per_sec()
 if iter_per_sec is not None:
     print(f"Iterations per second: {iter_per_sec:.3f}")
-cuda_mem_after_used = flow._oneflow_internal.GetCUDAMemoryUsed()
-host_mem_after_used = flow._oneflow_internal.GetCPUMemoryUsed()
-print(f"CUDA Mem after: {cuda_mem_after_used / 1024:.3f}GiB")
-print(f"Host Mem after: {host_mem_after_used / 1024:.3f}GiB")
+cuda_mem_max_used = torch.cuda.max_memory_allocated() / (1024**3)
+cuda_mem_max_reserved = torch.cuda.max_memory_reserved() / (1024**3)
+print(f"Max used CUDA memory : {cuda_mem_max_used:.3f}GiB")
+print(f"Max reserved CUDA memory : {cuda_mem_max_reserved:.3f}GiB")
+print("=======================================")
 
 if args.output_video is not None:
     # export_to_video(output_frames[0], args.output_video, fps=args.fps)
@@ -266,7 +266,7 @@ def get_kwarg_inputs():
             args.output_video, videos[0], fps=8, quality=9
         )  # highest quality is 10, lowest is 0
     except:
-        print("Error when saving {}".format(prompt))
+        print("Error when saving {}".format(args.prompt))
 else:
     print("Please set `--output-video` to save the output video")
```
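The measurement switch also pins down the units: `torch.cuda.max_memory_allocated()` and `torch.cuda.max_memory_reserved()` return peak byte counts, so the script divides by `1024**3` to report GiB. A minimal sketch of that conversion and formatting, using a hypothetical helper name and made-up byte counts (plain Python, no GPU required):

```python
def format_mem_report(max_allocated_bytes: int, max_reserved_bytes: int) -> str:
    """Mirror the benchmark's reporting: raw byte counts -> GiB, 3 decimals."""
    gib = 1024 ** 3  # torch.cuda.max_memory_allocated/_reserved return bytes
    return (
        f"Max used CUDA memory : {max_allocated_bytes / gib:.3f}GiB\n"
        f"Max reserved CUDA memory : {max_reserved_bytes / gib:.3f}GiB"
    )

# Illustrative byte counts only, not real measurements:
print(format_mem_report(20 * 1024**3, 22 * 1024**3))
```

In a real run, `torch.cuda.reset_peak_memory_stats()` can be called before the timed section so the peaks reflect only the workload being benchmarked.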
26 changes: 13 additions & 13 deletions onediff_diffusers_extensions/examples/latte/README.md

````diff
@@ -40,7 +40,7 @@ python3 ./benchmarks/text_to_video_latte.py \
   --model maxin-cn/Latte-1 \
   --steps 50 \
   --compiler none \
-  ----output-video ./latte.mp4 \
+  --output-video ./latte.mp4 \
   --prompt "An epic tornado attacking above aglowing city at night."
 ```
 
@@ -50,7 +50,7 @@ python3 ./benchmarks/text_to_video_latte.py \
   --model maxin-cn/Latte-1 \
   --steps 50 \
   --compiler nexfort \
-  ----output-video ./latte_compile.mp4 \
+  --output-video ./latte_compile.mp4 \
   --prompt "An epic tornado attacking above aglowing city at night."
 ```
 
@@ -60,17 +60,17 @@
 
 #### On A100
 | Metric                                           | NVIDIA A100-PCIE-40GB (512 * 512) |
-| ------------------------------------------------ | ----------------------------------- |
-| Data update date(yyyy-mm-dd)                     | 2024-06-19                          |
-| PyTorch iteration speed                          | 1.60it/s                            |
-| OneDiff iteration speed                          | 2.27it/s(+41.9%)                    |
-| PyTorch E2E time                                 | 32.618s                             |
-| OneDiff E2E time                                 | 22.601s(-30.7%)                     |
-| PyTorch Max Mem Used                             | 28.208GiB                           |
-| OneDiff Max Mem Used                             | 24.753GiB                           |
-| PyTorch Warmup with Run time                     | 33.291s                             |
-| OneDiff Warmup with Compilation time<sup>1</sup> | 572.877s                            |
-| OneDiff Warmup with Cache time                   | 148.068s                            |
+| ------------------------------------------------ | --------------------------------- |
+| Data update date(yyyy-mm-dd)                     | 2024-06-19                        |
+| PyTorch iteration speed                          | 1.60 it/s                         |
+| OneDiff iteration speed                          | 2.27 it/s(+41.9%)                 |
+| PyTorch E2E time                                 | 32.618 s                          |
+| OneDiff E2E time                                 | 22.601 s(-30.7%)                  |
+| PyTorch Max Mem Used                             | 19.9 GiB                          |
+| OneDiff Max Mem Used                             | 19.9 GiB                          |
+| PyTorch Warmup with Run time                     | 33.291 s                          |
+| OneDiff Warmup with Compilation time<sup>1</sup> | 572.877 s                         |
+| OneDiff Warmup with Cache time                   | 148.068 s                         |
 
 <sup>1</sup> OneDiff Warmup with Compilation time is tested on Intel(R) Xeon(R) Gold 6348 CPU @ 2.60GHz. Note this is just for reference, and it varies a lot on different CPU.
````
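For context on the README fix: the doubled dashes in `----output-video` match no defined flag, so an argparse-based CLI (which these benchmark scripts appear to use) rejects the command instead of saving the video. A small standalone sketch of the difference, assuming a flag defined like the documented one:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--output-video", default=None)

# The corrected spelling parses cleanly:
args = parser.parse_args(["--output-video", "./latte.mp4"])
print(args.output_video)  # ./latte.mp4

# The four-dash form matches no defined option; parse_known_args surfaces
# it as unrecognized (plain parse_args would exit with an error instead):
_, unknown = parser.parse_known_args(["----output-video", "./latte.mp4"])
print(unknown)
```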