-
Notifications
You must be signed in to change notification settings - Fork 103
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* bert * fix * add * add MFU * vit * addsrc * sd
- Loading branch information
Showing
23 changed files
with
1,841 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
### 1. 推理数据集 | ||
|
||
|
||
### 2. 模型与权重 | ||
|
||
* 模型实现 | ||
* pytorch:diffusers.UNet2DConditionModel | ||
* 权重下载 | ||
* pytorch:from_pretrained("CompVis/stable-diffusion-v1-4") | ||
|
||
### 2. 软硬件配置与运行信息参考 | ||
|
||
#### 2.1 Nvidia A100 | ||
|
||
- ##### 硬件环境 | ||
- 机器、加速卡型号: NVIDIA_A100-SXM4-40GB | ||
- 多机网络类型、带宽: InfiniBand,200Gb/s | ||
|
||
- ##### 软件环境 | ||
- OS版本:Ubuntu 20.04 | ||
- OS kernel版本: 5.4.0-113-generic | ||
- 加速卡驱动版本:470.129.06 | ||
- Docker 版本:20.10.16 | ||
- 训练框架版本:pytorch-2.1.0a0+4136153 | ||
- 依赖软件版本: | ||
- cuda: 12.1 | ||
|
||
- 推理工具包 | ||
|
||
- TensorRT 8.6.1 | ||
|
||
- 其他说明 | ||
|
||
- 本case在大批尺寸情况下涉及到了张量超过4B的情况,因此在大批尺寸离线批推理场景下,不宜作为性能及MFU基准。 | ||
|
||
### 3. 运行情况 | ||
|
||
* 指标列表 | ||
|
||
| 指标名称 | 指标值索引 | 特殊说明 | | ||
| ------------------ | ---------------- | -------------------------------------------- | | ||
| 数据精度 | precision | 可选fp32/fp16 | | ||
| 批尺寸 | bs | | | ||
| 硬件存储使用 | mem | 通常称为“显存”,单位为GiB | | ||
| 端到端时间 | e2e_time | 总时间+Perf初始化等时间 | | ||
| 验证总吞吐量 | p_val_whole | 实际验证prompts数除以总验证时间 | | ||
| 验证计算吞吐量 | p_val_core | 不包含IO部分耗时 | | ||
| 推理总吞吐量 | p_infer_whole | 实际推理prompts数除以总推理时间 | | ||
| **推理计算吞吐量** | **\*p_infer_core** | 不包含IO部分耗时 | | ||
| **计算卡使用率** | **\*MFU** | model flops utilization | | ||
| 推理结果 | CLIP Score(推理/验证) | 单位为text2img耦合度分数 | | ||
|
||
* 指标值 | ||
|
||
| 推理工具 | precision | bs | e2e_time | p_val_whole | p_val_core | p_infer_whole | \*p_infer_core | \*MFU | CLIP Score | mem | | ||
| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | | ||
| tensorrt | fp16 | 2 |1674.9 | 11.4 | 45.2 | 10.6 | 60.6 | 13.2% | 17.1/25.2 | 13.3/40.0 | | ||
| tensorrt | fp32 | 2 | 1807.4 | 8.2 | 20.6 | 7.2 | 16.1 | 7.0% | 25.2/25.3 | 39.2/40.0 | | ||
| null | fp16 | 16 | / | 11.7 | 60.7 | / | / | 13.2% | -/25.2 | 5.7/40.0 | | ||
| null | fp32 | 8 | / | 9.3 | 27.3 | / | / | 11.9% | -/25.3 | 6.3/40.0 | |
5 changes: 5 additions & 0 deletions
5
inference/benchmarks/stable_diffusion_v1_4/pytorch/__init__.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from .dataloader import build_dataloader | ||
from .model import create_model | ||
from .export import export_model | ||
from .evaluator import evaluator | ||
from .forward import model_forward, engine_forward |
31 changes: 31 additions & 0 deletions
31
inference/benchmarks/stable_diffusion_v1_4/pytorch/dataloader.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from torch.utils.data import DataLoader as dl | ||
import torch | ||
import json | ||
import random | ||
|
||
|
||
def build_dataset(config):
    """Load captions from an annotation file and draw a seeded random subset.

    The file at ``config.data_dir + "/" + config.prompts`` is parsed as JSON.
    The ``"caption"`` field of every entry under the top-level
    ``"annotations"`` key is collected, captions whose character length is
    not below ``config.prompt_max_len - 2`` are dropped, and
    ``config.prompt_samples`` of the remaining captions are returned.

    Args:
        config: object exposing ``data_dir``, ``prompts``, ``prompt_max_len``,
            ``random_seed`` and ``prompt_samples``.

    Returns:
        A list of ``config.prompt_samples`` caption strings.

    Raises:
        ValueError: raised by ``random.sample`` when fewer captions survive
            the length filter than ``config.prompt_samples``.
    """
    annotation_path = config.data_dir + "/" + config.prompts

    # Use a context manager so the file handle is closed deterministically,
    # and read as UTF-8 (the JSON default) instead of the locale encoding.
    with open(annotation_path, encoding="utf-8") as annotation_file:
        annotations = json.load(annotation_file)["annotations"]

    # NOTE(review): the "- 2" presumably reserves room for start/end tokens,
    # but the comparison is on characters, not tokens - confirm with caller.
    length_limit = config.prompt_max_len - 2
    candidates = [
        entry["caption"] for entry in annotations
        if len(entry["caption"]) < length_limit
    ]

    # Seed the module-level RNG (as before) so the sample is reproducible.
    random.seed(config.random_seed)
    return random.sample(candidates, config.prompt_samples)
|
||
|
||
def build_dataloader(config):
    """Wrap the sampled prompt list in a torch ``DataLoader``.

    The prompts come from ``build_dataset(config)``. Batches are served in
    their stored order (no shuffling) and a trailing incomplete batch is
    dropped, so every batch holds exactly ``config.batch_size`` prompts.

    Args:
        config: object exposing ``batch_size`` and ``num_workers`` plus the
            attributes ``build_dataset`` reads.

    Returns:
        The configured ``DataLoader`` instance.
    """
    prompts = build_dataset(config)
    loader_options = {
        "batch_size": config.batch_size,
        "shuffle": False,
        "drop_last": True,
        "num_workers": config.num_workers,
        "pin_memory": True,
    }
    return dl(prompts, **loader_options)
12 changes: 12 additions & 0 deletions
12
inference/benchmarks/stable_diffusion_v1_4/pytorch/evaluator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import torch | ||
|
||
|
||
def evaluator(metric, image, prompt, config):
    """Score each image of a batch against its prompt with ``metric``.

    The batch is rescaled with ``x / 2 + 0.5``, clamped to ``[0, 1]``, moved
    to the CPU, permuted from axis order (0, 1, 2, 3) to (0, 2, 3, 1) and
    quantised to ``uint8`` in ``[0, 255]`` before being handed to ``metric``
    one image at a time.

    Args:
        metric: callable invoked as ``metric(image_i, prompt_i)``; its result
            is converted with ``float``.
        image: 4-D tensor batch (presumably model output in ``[-1, 1]`` -
            TODO confirm with caller).
        prompt: indexable collection of prompts aligned with ``image``.
        config: object exposing ``batch_size``, the number of items scored.

    Returns:
        A list of ``config.batch_size`` float scores.
    """
    rescaled = (image / 2 + 0.5).clamp(0, 1)
    channels_last = rescaled.detach().cpu().permute(0, 2, 3, 1).numpy()
    pixels = torch.tensor((channels_last * 255).round().astype("uint8"))
    return [
        float(metric(pixels[idx], prompt[idx]))
        for idx in range(config.batch_size)
    ]
41 changes: 41 additions & 0 deletions
41
inference/benchmarks/stable_diffusion_v1_4/pytorch/export.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import torch | ||
import os | ||
|
||
|
||
def export_model(model, config):
    """Export ``model`` to an ONNX file and return that file's path.

    When ``config.exist_onnx_path`` is not ``None`` it is returned as-is and
    no export happens. Otherwise the model is traced with random dummy inputs
    on the CUDA device and written to
    ``<perf_dir>/onnxs/<case>_bs<batch_size>_<framework>_fp16<fp16>.onnx``;
    the directory is created if it is missing.

    Args:
        model: module passed to ``torch.onnx.export``.
        config: object exposing ``exist_onnx_path``, ``case``, ``batch_size``,
            ``framework``, ``fp16``, ``perf_dir``, ``in_channels``, ``height``,
            ``width``, ``scale_size``, ``prompt_max_len`` and
            ``embed_hidden_size``.

    Returns:
        Path of the existing or newly exported ONNX file.
    """
    reuse_path = config.exist_onnx_path
    if reuse_path is not None:
        return reuse_path

    name_parts = [
        config.case,
        "_bs", str(config.batch_size),
        "_", str(config.framework),
        "_fp16", str(config.fp16),
    ]
    onnx_path = config.perf_dir + "/" + "onnxs/" + "".join(name_parts) + ".onnx"

    # NOTE(review): the leading dimension is twice the batch size, presumably
    # for paired unconditional/conditional inputs - confirm with the caller.
    doubled_batch = config.batch_size * 2
    latent_shape = (
        doubled_batch,
        config.in_channels,
        config.height // config.scale_size,
        config.width // config.scale_size,
    )
    latent = torch.randn(*latent_shape).cuda().float()
    # Scalar (0-dim) integer tensor used as the second traced input.
    t = torch.randn([]).cuda().int()
    embed = torch.randn(doubled_batch, config.prompt_max_len,
                        config.embed_hidden_size).cuda().float()

    if config.fp16:
        latent, embed = latent.half(), embed.half()

    os.makedirs(os.path.dirname(onnx_path), exist_ok=True)

    export_options = {
        "verbose": False,
        "input_names": ["input_0", "input_1", "input_2"],
        "output_names": ["output_0"],
        "training": torch.onnx.TrainingMode.EVAL,
        "do_constant_folding": True,
    }
    with torch.no_grad():
        torch.onnx.export(model, (latent, t, embed), onnx_path,
                          **export_options)

    return onnx_path
Oops, something went wrong.