stable diffusion stdcase (FlagOpen#191)
* bert

* fix

* add

* add MFU

* vit

* addsrc

* sd
shh2000 authored and zhoujiamin01 committed Aug 31, 2023
1 parent 2931093 commit c012dc5
Showing 23 changed files with 1,841 additions and 0 deletions.
60 changes: 60 additions & 0 deletions inference/benchmarks/stable_diffusion_v1_4/README.md
@@ -0,0 +1,60 @@
### 1. Inference Dataset


### 2. Model and Weights

* Model implementation
  * pytorch: diffusers.UNet2DConditionModel
* Weight download
  * pytorch: from_pretrained("CompVis/stable-diffusion-v1-4") (see the loading sketch below)

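For reference, a minimal loading sketch using the Hugging Face diffusers package is shown below; this is an assumption about tooling, not necessarily the exact code in the case's model.py (not shown in this excerpt).

```python
# Hedged sketch: loading the SD v1.4 UNet (the module exported to ONNX in this case)
# and, alternatively, the full text-to-image pipeline. Assumes diffusers is installed.
import torch
from diffusers import StableDiffusionPipeline, UNet2DConditionModel

# Only the denoising UNet
unet = UNet2DConditionModel.from_pretrained(
    "CompVis/stable-diffusion-v1-4", subfolder="unet",
    torch_dtype=torch.float16).cuda()

# Or the complete pipeline (text encoder + UNet + VAE + scheduler)
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16).to("cuda")
```
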
### 3. Hardware and Software Configuration and Run Information

#### 3.1 Nvidia A100

- ##### Hardware environment
  - Machine and accelerator model: NVIDIA_A100-SXM4-40GB
  - Multi-node network type and bandwidth: InfiniBand, 200Gb/s

- ##### Software environment
  - OS version: Ubuntu 20.04
  - OS kernel version: 5.4.0-113-generic
  - Accelerator driver version: 470.129.06
  - Docker version: 20.10.16
  - Framework version: pytorch-2.1.0a0+4136153
  - Dependency versions:
    - cuda: 12.1

- Inference toolkit

  - TensorRT 8.6.1 (a hedged engine-build sketch follows this section)

- Other notes

  - At large batch sizes this case involves tensors exceeding 4B, so it should not be used as a performance or MFU baseline in large-batch offline batch-inference scenarios.

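The sketch below illustrates one way to build a TensorRT engine from the ONNX file produced by export.py, using the TensorRT Python API; the file path and build options are illustrative assumptions, not the case's exact invocation.

```python
# Hedged sketch: parse an exported ONNX file and build a serialized TensorRT engine.
# The ONNX path follows export.py's naming scheme and is an assumption.
import tensorrt as trt

onnx_path = "onnxs/stable_diffusion_v1_4_bs2_pytorch_fp16True.onnx"

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open(onnx_path, "rb") as f:
    if not parser.parse(f.read()):
        raise RuntimeError("failed to parse ONNX model")

config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.FP16)  # match the fp16 export

engine_bytes = builder.build_serialized_network(network, config)
with open(onnx_path.replace(".onnx", ".plan"), "wb") as f:
    f.write(engine_bytes)
```
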
### 4. Run Results

* Metric list

| Metric name | Metric key | Notes |
| ------------------ | ---------------- | -------------------------------------------- |
| Data precision | precision | fp32 or fp16 |
| Batch size | bs | |
| Device memory usage | mem | commonly called "VRAM", in GiB |
| End-to-end time | e2e_time | total time plus Perf initialization time, etc. |
| Overall validation throughput | p_val_whole | validated prompts divided by total validation time |
| Validation compute throughput | p_val_core | excludes I/O time |
| Overall inference throughput | p_infer_whole | inferred prompts divided by total inference time |
| **Inference compute throughput** | **\*p_infer_core** | excludes I/O time |
| **Accelerator utilization** | **\*MFU** | model FLOPs utilization (see the sketch below) |
| Inference result | CLIP Score (inference/validation) | text-to-image alignment score (see the sketch after the metric values) |

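For the MFU row above, the sketch below gives a rough illustration of the definition (achieved model FLOPs per second over the accelerator's peak); the numbers in the example are illustrative only, not measured values from this case.

```python
# Hypothetical illustration of model FLOPs utilization (MFU).
def model_flops_utilization(flops_per_sample: float,
                            samples_per_second: float,
                            peak_flops_per_second: float) -> float:
    """Return MFU as a fraction in [0, 1]."""
    return flops_per_sample * samples_per_second / peak_flops_per_second

# Illustrative numbers only: 1e12 FLOPs per prompt at 40 prompts/s
# on a card with 312e12 FLOPs/s peak (A100 FP16 tensor-core peak).
print(model_flops_utilization(1e12, 40.0, 312e12))  # ~0.128, i.e. ~12.8%
```
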
* Metric values

| Inference toolkit | precision | bs | e2e_time | p_val_whole | p_val_core | p_infer_whole | \*p_infer_core | \*MFU | CLIP Score | mem |
| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- |
| tensorrt | fp16 | 2 |1674.9 | 11.4 | 45.2 | 10.6 | 60.6 | 13.2% | 17.1/25.2 | 13.3/40.0 |
| tensorrt | fp32 | 2 | 1807.4 | 8.2 | 20.6 | 7.2 | 16.1 | 7.0% | 25.2/25.3 | 39.2/40.0 |
| null | fp16 | 16 | / | 11.7 | 60.7 | / | / | 13.2% | -/25.2 | 5.7/40.0 |
| null | fp32 | 8 | / | 9.3 | 27.3 | / | / | 11.9% | -/25.3 | 6.3/40.0 |
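
For the CLIP Score column, the sketch below shows one common way to score an image/prompt pair with torchmetrics; whether this matches the metric object passed to evaluator() in this case is an assumption.

```python
# Hedged sketch: CLIP-based scoring of a generated image against its prompt.
import torch
from torchmetrics.multimodal.clip_score import CLIPScore

metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")

# Dummy uint8 image in (C, H, W) layout standing in for a decoded sample
image = torch.randint(0, 255, (3, 512, 512), dtype=torch.uint8)
score = metric(image, "a photograph of an astronaut riding a horse")
print(float(score))  # higher means stronger text-image agreement
```
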
5 changes: 5 additions & 0 deletions inference/benchmarks/stable_diffusion_v1_4/pytorch/__init__.py
@@ -0,0 +1,5 @@
from .dataloader import build_dataloader
from .model import create_model
from .export import export_model
from .evaluator import evaluator
from .forward import model_forward, engine_forward
31 changes: 31 additions & 0 deletions inference/benchmarks/stable_diffusion_v1_4/pytorch/dataloader.py
@@ -0,0 +1,31 @@
from torch.utils.data import DataLoader as dl
import torch
import json
import random


def build_dataset(config):
    # Load the caption annotations (COCO-style JSON) and keep the caption strings
    df = json.load(open(config.data_dir + "/" + config.prompts))["annotations"]
    prompts = []
    for item in df:
        prompts.append(item["caption"])
    # Keep prompts that fit within prompt_max_len, leaving room for special tokens
    dataset = [
        item for item in prompts if len(item) < config.prompt_max_len - 2
    ]
    # Draw a reproducible fixed-size sample of prompts
    random.seed(config.random_seed)
    dataset = random.sample(dataset, config.prompt_samples)

    return dataset


def build_dataloader(config):
    dataset = build_dataset(config)
    loader = dl(dataset,
                batch_size=config.batch_size,
                shuffle=False,
                drop_last=True,
                num_workers=config.num_workers,
                pin_memory=True)

    return loader
12 changes: 12 additions & 0 deletions inference/benchmarks/stable_diffusion_v1_4/pytorch/evaluator.py
@@ -0,0 +1,12 @@
import torch


def evaluator(metric, image, prompt, config):
    scores = []
    # Map decoded images from [-1, 1] back to [0, 1], then to uint8 HWC arrays
    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
    image = (image * 255).round().astype("uint8")
    image = torch.tensor(image)
    # Score each image in the batch against its prompt (e.g. a CLIP-based metric)
    for i in range(config.batch_size):
        scores.append(float(metric(image[i], prompt[i])))
    return scores
41 changes: 41 additions & 0 deletions inference/benchmarks/stable_diffusion_v1_4/pytorch/export.py
@@ -0,0 +1,41 @@
import torch
import os


def export_model(model, config):
    # Reuse a pre-exported ONNX file when one is supplied
    if config.exist_onnx_path is not None:
        return config.exist_onnx_path

    # Name the ONNX file after the case, batch size, framework and precision
    filename = config.case + "_bs" + str(config.batch_size)
    filename = filename + "_" + str(config.framework)
    filename = filename + "_fp16" + str(config.fp16)
    filename = "onnxs/" + filename + ".onnx"
    onnx_path = config.perf_dir + "/" + filename

    # Dummy UNet inputs: noisy latents, timestep and text embeddings.
    # The batch is doubled, matching classifier-free guidance
    # (unconditional + text-conditioned halves).
    latent = torch.randn(config.batch_size * 2, config.in_channels,
                         config.height // config.scale_size,
                         config.width // config.scale_size).cuda().float()
    t = torch.randn([]).cuda().int()
    embed = torch.randn(config.batch_size * 2, config.prompt_max_len,
                        config.embed_hidden_size).cuda().float()

    if config.fp16:
        latent = latent.half()
        embed = embed.half()

    dummy_input = (latent, t, embed)

    dir_onnx_path = os.path.dirname(onnx_path)
    os.makedirs(dir_onnx_path, exist_ok=True)

    # Trace the model and export it to ONNX with constant folding enabled
    with torch.no_grad():
        torch.onnx.export(model,
                          dummy_input,
                          onnx_path,
                          verbose=False,
                          input_names=["input_0", "input_1", "input_2"],
                          output_names=["output_0"],
                          training=torch.onnx.TrainingMode.EVAL,
                          do_constant_folding=True)

    return onnx_path