From 05ba7a0f03fd8a928f3d4c312fab38a12ebc97ea Mon Sep 17 00:00:00 2001
From: shh2000 <13820618441@163.com>
Date: Thu, 3 Aug 2023 16:56:04 +0800
Subject: [PATCH 1/7] bert

---
 inference/benchmarks/bertLarge/README.md      |  66 ++++++++++
 .../benchmarks/bertLarge/pytorch/__init__.py  |   5 +
 .../bertLarge/pytorch/dataloader.py           |  67 +++++++++++
 .../benchmarks/bertLarge/pytorch/evaluator.py |  11 ++
 .../benchmarks/bertLarge/pytorch/export.py    |  30 +++++
 .../benchmarks/bertLarge/pytorch/forward.py   | 113 ++++++++++++++++++
 .../benchmarks/bertLarge/pytorch/model.py     |  13 ++
 .../bertLarge/pytorch/requirements.txt        |   1 +
 inference/benchmarks/resnet50/README.md       |  11 +-
 .../benchmarks/resnet50/pytorch/forward.py    |   7 +-
 .../configs/bertLarge/configurations.yaml     |  14 +++
 inference/configs/bertLarge/parameters.yaml   |   6 +
 .../vendor_config/nvidia_configurations.yaml  |   3 +
 inference/inference_engine/nvidia/tensorrt.py |  41 +++++--
 inference/run_inference.py                    |   8 +-
 15 files changed, 372 insertions(+), 24 deletions(-)
 create mode 100644 inference/benchmarks/bertLarge/README.md
 create mode 100644 inference/benchmarks/bertLarge/pytorch/__init__.py
 create mode 100644 inference/benchmarks/bertLarge/pytorch/dataloader.py
 create mode 100644 inference/benchmarks/bertLarge/pytorch/evaluator.py
 create mode 100644 inference/benchmarks/bertLarge/pytorch/export.py
 create mode 100644 inference/benchmarks/bertLarge/pytorch/forward.py
 create mode 100644 inference/benchmarks/bertLarge/pytorch/model.py
 create mode 100644 inference/benchmarks/bertLarge/pytorch/requirements.txt
 create mode 100644 inference/configs/bertLarge/configurations.yaml
 create mode 100644 inference/configs/bertLarge/parameters.yaml
 create mode 100644 inference/configs/bertLarge/vendor_config/nvidia_configurations.yaml

diff --git a/inference/benchmarks/bertLarge/README.md b/inference/benchmarks/bertLarge/README.md
new file mode 100644
index 000000000..d653f3c7a
--- /dev/null
+++ b/inference/benchmarks/bertLarge/README.md
@@ -0,0 +1,66 @@
+### 1. 推理数据集
+
+● 下载地址:`https://drive.google.com/drive/folders/1cywmDnAsrP5-2vsr8GDc6QUc7VWe-M3v`
+
+```
+文件列表:
+results_text.tar.gz
+bert_reference_results_text_md5.txt
+```
+
+* 解压后将eval.txt放置在目录下
+
+### 2. 模型与权重
+
+* 模型实现
+  * pytorch:transformers.BertForMaskedLM
+* 权重下载
+  * pytorch:BertForMaskedLM.from_pretrained("bert-large/base-uncased")
+* 权重选择
+  * 使用save_pretrained将加载的bert-large或bert-base权重保存到/路径下
+
+### 3. 软硬件配置与运行信息参考
+
+#### 2.1 Nvidia A100
+
+- ##### 硬件环境
+  - 机器、加速卡型号: NVIDIA_A100-SXM4-40GB
+  - 多机网络类型、带宽: InfiniBand,200Gb/s
+
+- ##### 软件环境
+  - OS版本:Ubuntu 20.04
+  - OS kernel版本: 5.4.0-113-generic
+  - 加速卡驱动版本:470.129.06
+  - Docker 版本:20.10.16
+  - 训练框架版本:pytorch-1.13.0a0+937e930
+  - 依赖软件版本:
+    - cuda: 11.8
+
+- 推理工具包
+
+  - TensorRT 8.5.1.7
+
+### 4. 运行情况(BERT-Large)
+
+* 指标列表
+
+| 指标名称 | 指标值索引 | 特殊说明 |
+| ------------------ | ----------------- | ----------------------------------------------------------- |
+| 数据精度 | precision | 可选fp32/fp16 |
+| 批尺寸 | bs | 此外,对于bert-large seq_length==128 |
+| 批输入大小 | byte_per_batch | 每一个batch包含的字节数 |
+| 硬件存储使用 | mem | 通常称为“显存”,单位为GiB |
+| 端到端时间 | e2e_time | 总时间+Perf初始化等时间 |
+| 验证总吞吐量 | p_val_whole | 实际验证序列数除以总验证时间 |
+| 验证计算吞吐量 | *p_val_core | 不包含IO部分耗时 |
+| 推理总吞吐量 | p_infer_whole | 实际推理序列数除以总推理时间 |
+| **推理计算吞吐量** | ***p_infer_core** | 不包含IO部分耗时。此外,此值*seq_length即为token per second |
+| 推理单样本耗时 | infer_time | 1/p_infer_core,单位为毫秒(ms)或微秒(μs) |
+| 推理结果 | acc(推理/验证) | 单位为top1MaskedLM准确率(acc1) |
+
+* 指标值
+
+| 推理工具 | precision | bs | byte_per_batch | e2e_time | p_val_whole | \*p_val_core | p_infer_whole | \*p_infer_core | acc | mem |
+| -------- | --------- | ---- | -------------- | -------- | ----------- | ------------ | ------------- | -------------- | ----------- | --------- |
+| tensorrt | fp16 | 32 | 32768 | 1283.9 | 257.3 | 260.4 | 408.3 | 418.1 | 0.600/0.638 | 17.4/40.0 |
+
diff --git a/inference/benchmarks/bertLarge/pytorch/__init__.py b/inference/benchmarks/bertLarge/pytorch/__init__.py
new file mode 100644
index 000000000..1f6cdf49b
--- /dev/null
+++ b/inference/benchmarks/bertLarge/pytorch/__init__.py
@@ -0,0 +1,5 @@
+from .dataloader import build_dataloader
+from .model import create_model
+from .export import export_model
+from .evaluator import evaluator
+from .forward import model_forward, engine_forward
diff --git a/inference/benchmarks/bertLarge/pytorch/dataloader.py b/inference/benchmarks/bertLarge/pytorch/dataloader.py
new file mode 100644
index 000000000..52ebc4d8d
--- /dev/null
+++ b/inference/benchmarks/bertLarge/pytorch/dataloader.py
@@ -0,0 +1,67 @@
+from transformers import BertTokenizer
+from torch.utils.data import DataLoader, Dataset
+import torch
+import random
+
+
+class BertInferDataset(Dataset):
+
+    def __init__(self, input_ids, label_ids, random_dupo, seq_length):
+        self.input_ids = input_ids
+        self.label_ids = label_ids
+        self.random_dupo = random_dupo
+        self.seq_length = seq_length
+
+    def __len__(self):
+        return len(self.input_ids) // self.seq_length * self.random_dupo
+
+    def __getitem__(self, idx):
+        idx_global = idx // self.random_dupo
+        start_idx = idx_global * self.seq_length
+        chunk_input = self.input_ids[start_idx:start_idx + self.seq_length]
+        chunk_label = self.label_ids[start_idx:start_idx + self.seq_length]
+
+        chunk_input = torch.tensor(chunk_input).int()
+        chunk_label = torch.tensor(chunk_label).int()
+
+        return (chunk_input, chunk_label)
+
+
+def build_dataset(config):
+
+    random.seed(config.random_seed)
+
+    with open(config.data_dir + "/" + config.eval_file, "r") as file:
+        text = file.read()
+
+    tokenizer = BertTokenizer.from_pretrained(config.data_dir + "/" +
+                                              config.weight_dir)
+    tokens = tokenizer.tokenize(text)
+
+    label_ids = tokenizer.convert_tokens_to_ids(tokens)
+    label_ids = [tokenizer.cls_token_id] + label_ids + [tokenizer.sep_token_id]
+
+    masked_tokens = []
+    for token in tokens:
+        if token != "[CLS]" and token != "[SEP]":
+            masked_tokens.append(
+                "[MASK]" if random.random() < config.mask_ratio else token)
+    input_ids = tokenizer.convert_tokens_to_ids(masked_tokens)
+    input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
+
+    dataset = BertInferDataset(input_ids, label_ids, config.random_dupo,
+                               config.seq_length)
+
+    return dataset
+
+
+def build_dataloader(config):
+    dataset = build_dataset(config)
+    loader = 
DataLoader(dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=True, + num_workers=config.num_workers, + pin_memory=True) + + return loader diff --git a/inference/benchmarks/bertLarge/pytorch/evaluator.py b/inference/benchmarks/bertLarge/pytorch/evaluator.py new file mode 100644 index 000000000..1dac4c977 --- /dev/null +++ b/inference/benchmarks/bertLarge/pytorch/evaluator.py @@ -0,0 +1,11 @@ +import torch + + +def evaluator(pred, x, y): + mask = x == 103 + masked_pred = pred[mask] + masked_y = y[mask] + + correct = masked_pred[masked_pred == masked_y] + + return len(correct), len(masked_y) diff --git a/inference/benchmarks/bertLarge/pytorch/export.py b/inference/benchmarks/bertLarge/pytorch/export.py new file mode 100644 index 000000000..f9eba5109 --- /dev/null +++ b/inference/benchmarks/bertLarge/pytorch/export.py @@ -0,0 +1,30 @@ +import torch +import os + + +def export_model(model, config): + if config.exist_onnx_path is not None: + return config.exist_onnx_path + + filename = config.case + "_bs" + str(config.batch_size) + filename = filename + "_" + str(config.framework) + filename = filename + "_fp16" + str(config.fp16) + filename = "onnxs/" + filename + ".onnx" + onnx_path = config.perf_dir + "/" + filename + + dummy_input = torch.ones(config.batch_size, config.seq_length).int().cuda() + + dir_onnx_path = os.path.dirname(onnx_path) + os.makedirs(dir_onnx_path, exist_ok=True) + + with torch.no_grad(): + torch.onnx.export(model, + dummy_input, + onnx_path, + verbose=False, + input_names=["input"], + output_names=["output"], + training=torch.onnx.TrainingMode.EVAL, + do_constant_folding=True) + + return onnx_path diff --git a/inference/benchmarks/bertLarge/pytorch/forward.py b/inference/benchmarks/bertLarge/pytorch/forward.py new file mode 100644 index 000000000..05ba97385 --- /dev/null +++ b/inference/benchmarks/bertLarge/pytorch/forward.py @@ -0,0 +1,113 @@ +from loguru import logger +import torch +import numpy as np +import time +from tools import torch_sync + + +def cal_perf(config, dataloader_len, duration, core_time, str_prefix): + model_forward_perf = config.repeat * dataloader_len * config.batch_size / duration + logger.info(str_prefix + "(" + config.framework + ") Perf: " + + str(model_forward_perf) + " qps") + model_forward_core_perf = config.repeat * dataloader_len * config.batch_size / core_time + logger.info(str_prefix + "(" + config.framework + ") core Perf: " + + str(model_forward_core_perf) + " qps") + return round(model_forward_perf, 3), round(model_forward_core_perf, 3) + + +def model_forward(model, dataloader, evaluator, config): + if config.no_validation: + return None, None, None + start = time.time() + core_time = 0.0 + + correct = 1 + whole = 1 + + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + + all_top1 = [] + for step, (x, y) in enumerate(dataloader): + torch_sync(config) + core_time_start = time.time() + + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + x = x.cuda() + y = y.cuda() + + pred = model(x) + torch_sync(config) + core_time += time.time() - core_time_start + + pred = pred[0] + pred = torch.argmax(pred, dim=2) + correct_iter, whole_iter = evaluator(pred, x, y) + + correct += correct_iter + whole += whole_iter + + acc = correct / whole + + logger.info("MaskedLM Acc: " + str(acc)) + + duration = time.time() - start + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time, 
"Validation") + + return model_forward_perf, model_forward_core_perf, round(acc, 3) + + +def engine_forward(model, dataloader, evaluator, config): + start = time.time() + core_time = 0.0 + foo_time = 0.0 + + correct = 1 + whole = 1 + + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + + all_top1 = [] + for step, (x, y) in enumerate(dataloader): + torch_sync(config) + core_time_start = time.time() + + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + + outputs = model([x]) + pred = outputs[0] + foo_time += outputs[1] + + torch_sync(config) + core_time += time.time() - core_time_start + + pred = pred[0] + pred = pred.reshape(config.batch_size, config.seq_length, -1) + pred = torch.argmax(pred, dim=2) + pred = pred.cpu() + correct_iter, whole_iter = evaluator(pred, x, y) + + correct += correct_iter + whole += whole_iter + + acc = correct / whole + + logger.info("MaskedLM Acc: " + str(acc)) + + duration = time.time() - start - foo_time + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time - foo_time, "Inference") + + return model_forward_perf, model_forward_core_perf, round(acc, 3) diff --git a/inference/benchmarks/bertLarge/pytorch/model.py b/inference/benchmarks/bertLarge/pytorch/model.py new file mode 100644 index 000000000..c8d1e4833 --- /dev/null +++ b/inference/benchmarks/bertLarge/pytorch/model.py @@ -0,0 +1,13 @@ +from transformers import BertForMaskedLM + + +def create_model(config): + model = BertForMaskedLM.from_pretrained(config.data_dir + "/" + + config.weight_dir, + torchscript=True) + model.cuda() + model.eval() + if config.fp16: + model.half() + + return model diff --git a/inference/benchmarks/bertLarge/pytorch/requirements.txt b/inference/benchmarks/bertLarge/pytorch/requirements.txt new file mode 100644 index 000000000..976a2b1f3 --- /dev/null +++ b/inference/benchmarks/bertLarge/pytorch/requirements.txt @@ -0,0 +1 @@ +transformers diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md index 88b2f9aae..e7436d315 100644 --- a/inference/benchmarks/resnet50/README.md +++ b/inference/benchmarks/resnet50/README.md @@ -68,6 +68,7 @@ find ./val -name "*JPEG" | wc -l | ------------------ | ---------------- | -------------------------------------------- | | 数据精度 | precision | 可选fp32/fp16 | | 批尺寸 | bs | | +| 批输入大小 | byte_per_batch | 每一个batch包含的字节数 | | 硬件存储使用 | mem | 通常称为“显存”,单位为GiB | | 端到端时间 | e2e_time | 总时间+Perf初始化等时间 | | 验证总吞吐量 | p_val_whole | 实际验证图片数除以总验证时间 | @@ -78,9 +79,9 @@ find ./val -name "*JPEG" | wc -l * 指标值 -| 推理工具 | precision | bs | e2e_time | p_val_whole | \*p_val_core | p_infer_whole | \*p_infer_core | acc | mem | -| ----------- | --------- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ---------- | -| tensorrt | fp16 | 256 | 613.4 | 1358.9 | 4263.3 | 1391.4 | 12406.0 | 76.2/76.2 | 19.7/40.0 | -| tensorrt | fp32 | 256 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 76.2/76.2 | 28.86/40.0 | -| torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 76.2/76.2 | 9.42/40.0 | +| 推理工具 | precision | bs | byte_per_batch | e2e_time | p_val_whole | \*p_val_core | p_infer_whole | \*p_infer_core | acc | mem | +| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ---------- | +| tensorrt | fp16 | 256 | 77070336 | 613.4 | 1358.9 | 4263.3 | 1391.4 | 12406.0 | 
76.2/76.2 | 19.7/40.0 | +| tensorrt | fp32 | 256 | 77070336 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 76.2/76.2 | 28.86/40.0 | +| torchtrt | fp16 | 256 | 77070336 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 76.2/76.2 | 9.42/40.0 | diff --git a/inference/benchmarks/resnet50/pytorch/forward.py b/inference/benchmarks/resnet50/pytorch/forward.py index 5619760df..77ff03bd3 100644 --- a/inference/benchmarks/resnet50/pytorch/forward.py +++ b/inference/benchmarks/resnet50/pytorch/forward.py @@ -81,12 +81,15 @@ def engine_forward(model, dataloader, evaluator, config): with torch.no_grad(): outputs = model([x]) - pred = outputs[0][0] + pred = outputs[0] foo_time += outputs[1] - pred = pred.float() + torch_sync(config) core_time += time.time() - core_time_start + pred = pred[0].float() + pred = pred.reshape(config.batch_size, -1) + pred = pred.cpu() top1 = evaluator(pred, y) all_top1.extend(top1.cpu()) diff --git a/inference/configs/bertLarge/configurations.yaml b/inference/configs/bertLarge/configurations.yaml new file mode 100644 index 000000000..2f24464c7 --- /dev/null +++ b/inference/configs/bertLarge/configurations.yaml @@ -0,0 +1,14 @@ +batch_size: 32 +# 512 length seq(1 item in x) +input_size: 512 +fp16: true +compiler: tensorrt +num_workers: 8 +log_freq: 100 +repeat: 1 +# skip validation(will also skip create_model, export onnx). Assert exist_onnx_path != null +no_validation: false +# set a real onnx_path to use exist, or set it to anything but null to avoid export onnx manually(like torch-tensorrt) +exist_onnx_path: null +# set a exist path of engine file like resnet50.trt/resnet50.plan/resnet50.engine +exist_compiler_path: null \ No newline at end of file diff --git a/inference/configs/bertLarge/parameters.yaml b/inference/configs/bertLarge/parameters.yaml new file mode 100644 index 000000000..464095c5f --- /dev/null +++ b/inference/configs/bertLarge/parameters.yaml @@ -0,0 +1,6 @@ +seq_length: 512 +mask_ratio: 0.10 +weight_dir: "weights" +eval_file: "eval.txt" +random_dupo: 10 +random_seed: 0 \ No newline at end of file diff --git a/inference/configs/bertLarge/vendor_config/nvidia_configurations.yaml b/inference/configs/bertLarge/vendor_config/nvidia_configurations.yaml new file mode 100644 index 000000000..b10bc4faf --- /dev/null +++ b/inference/configs/bertLarge/vendor_config/nvidia_configurations.yaml @@ -0,0 +1,3 @@ +trt_tmp_path: nvidia_tmp/bertLarge.trt +has_dynamic_axis: false +torchtrt_full_compile: true \ No newline at end of file diff --git a/inference/inference_engine/nvidia/tensorrt.py b/inference/inference_engine/nvidia/tensorrt.py index fb215ba17..b2ac1b29f 100644 --- a/inference/inference_engine/nvidia/tensorrt.py +++ b/inference/inference_engine/nvidia/tensorrt.py @@ -27,6 +27,8 @@ def __repr__(self): return self.__str__() def __init__(self, config, onnx_path, model): + self.config = config + self.logger = trt.Logger(trt.Logger.WARNING) self.runtime = trt.Runtime(self.logger) @@ -49,6 +51,19 @@ def __init__(self, config, onnx_path, model): np.complex64: torch.complex64, np.complex128: torch.complex128, } + self.str_to_torch_dtype_dict = { + "bool": torch.bool, + "uint8": torch.uint8, + "int8": torch.int8, + "int16": torch.int16, + "int32": torch.int32, + "int64": torch.int64, + "float16": torch.float16, + "float32": torch.float32, + "float64": torch.float64, + "complex64": torch.complex64, + "complex128": torch.complex128, + } def build_engine(self, config, onnx_path): if config.exist_compiler_path is None: @@ -99,15 +114,10 @@ def allocate_buffers(self, engine): def 
__call__(self, model_inputs: list): - batch_size = np.unique(np.array([i.size(dim=0) for i in model_inputs])) - batch_size = batch_size[0] + batch_size = self.config.batch_size for i, model_input in enumerate(model_inputs): - binding_name = self.engine[i] - binding_dtype = trt.nptype( - self.engine.get_binding_dtype(binding_name)) - model_input = model_input.to( - self.numpy_to_torch_dtype_dict[binding_dtype]) + model_input = model_input.cuda() cuda.memcpy_dtod_async( self.inputs[i].device, @@ -118,12 +128,17 @@ def __call__(self, model_inputs: list): self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle) + result = [] for out in self.outputs: - cuda.memcpy_dtoh_async(out.host, out.device, self.stream) + out_tensor = torch.empty(out.host.shape, device="cuda").to( + self.str_to_torch_dtype_dict[str(out.host.dtype)]) + cuda.memcpy_dtod_async( + out_tensor.data_ptr(), + out.device, + out_tensor.element_size() * out_tensor.nelement(), + self.stream, + ) + result.append(out_tensor) self.stream.synchronize() - - return [ - torch.from_numpy(out.host.reshape(batch_size, -1)) - for out in self.outputs - ], 0 + return result, 0 diff --git a/inference/run_inference.py b/inference/run_inference.py index ff6ba2acb..ff42c393f 100644 --- a/inference/run_inference.py +++ b/inference/run_inference.py @@ -151,10 +151,10 @@ def parse_args(): "batchsize": config.batch_size, "byte_per_batch": batch_input_byte, "e2e_time(second)": e2e_time, - "p_validation_whole(items per second)": p_forward, - "*p_validation_core(items per second)": p_forward_core, - "p_inference_whole(items per second)": p_infer, - "*p_inference_core(items per second)": p_infer_core, + "p_validation_whole(qps)": p_forward, + "*p_validation_core(qps)": p_forward_core, + "p_inference_whole(qps)": p_infer, + "*p_inference_core(qps)": p_infer_core, "val_average_acc": val_acc, "infer_average_acc": infer_acc } From 1b365f2a2db5e7939a1a3b16aecb899f509c80ba Mon Sep 17 00:00:00 2001 From: shh2000 <13820618441@163.com> Date: Thu, 3 Aug 2023 17:37:44 +0800 Subject: [PATCH 2/7] fix --- inference/benchmarks/bertLarge/pytorch/dataloader.py | 11 ++++------- inference/configs/bertLarge/configurations.yaml | 2 +- inference/configs/bertLarge/parameters.yaml | 1 - 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/inference/benchmarks/bertLarge/pytorch/dataloader.py b/inference/benchmarks/bertLarge/pytorch/dataloader.py index 52ebc4d8d..64542cb81 100644 --- a/inference/benchmarks/bertLarge/pytorch/dataloader.py +++ b/inference/benchmarks/bertLarge/pytorch/dataloader.py @@ -6,18 +6,16 @@ class BertInferDataset(Dataset): - def __init__(self, input_ids, label_ids, random_dupo, seq_length): + def __init__(self, input_ids, label_ids, seq_length): self.input_ids = input_ids self.label_ids = label_ids - self.random_dupo = random_dupo self.seq_length = seq_length def __len__(self): - return len(self.input_ids) // self.seq_length * self.random_dupo + return len(self.input_ids) // self.seq_length def __getitem__(self, idx): - idx_global = idx // self.random_dupo - start_idx = idx_global * self.seq_length + start_idx = idx * self.seq_length chunk_input = self.input_ids[start_idx:start_idx + self.seq_length] chunk_label = self.label_ids[start_idx:start_idx + self.seq_length] @@ -49,8 +47,7 @@ def build_dataset(config): input_ids = tokenizer.convert_tokens_to_ids(masked_tokens) input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id] - dataset = BertInferDataset(input_ids, label_ids, config.random_dupo, - 
config.seq_length) + dataset = BertInferDataset(input_ids, label_ids, config.seq_length) return dataset diff --git a/inference/configs/bertLarge/configurations.yaml b/inference/configs/bertLarge/configurations.yaml index 2f24464c7..515a95180 100644 --- a/inference/configs/bertLarge/configurations.yaml +++ b/inference/configs/bertLarge/configurations.yaml @@ -5,7 +5,7 @@ fp16: true compiler: tensorrt num_workers: 8 log_freq: 100 -repeat: 1 +repeat: 10 # skip validation(will also skip create_model, export onnx). Assert exist_onnx_path != null no_validation: false # set a real onnx_path to use exist, or set it to anything but null to avoid export onnx manually(like torch-tensorrt) diff --git a/inference/configs/bertLarge/parameters.yaml b/inference/configs/bertLarge/parameters.yaml index 464095c5f..9125bfa74 100644 --- a/inference/configs/bertLarge/parameters.yaml +++ b/inference/configs/bertLarge/parameters.yaml @@ -2,5 +2,4 @@ seq_length: 512 mask_ratio: 0.10 weight_dir: "weights" eval_file: "eval.txt" -random_dupo: 10 random_seed: 0 \ No newline at end of file From 4b8c20a2223993aa89101b3e816209522b5d2c06 Mon Sep 17 00:00:00 2001 From: shh2000 <13820618441@163.com> Date: Fri, 4 Aug 2023 10:00:29 +0800 Subject: [PATCH 3/7] add --- inference/benchmarks/bertLarge/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/inference/benchmarks/bertLarge/README.md b/inference/benchmarks/bertLarge/README.md index d653f3c7a..6c7cb0c29 100644 --- a/inference/benchmarks/bertLarge/README.md +++ b/inference/benchmarks/bertLarge/README.md @@ -63,4 +63,5 @@ bert_reference_results_text_md5.txt | 推理工具 | precision | bs | byte_per_batch | e2e_time | p_val_whole | \*p_val_core | p_infer_whole | \*p_infer_core | acc | mem | | -------- | --------- | ---- | -------------- | -------- | ----------- | ------------ | ------------- | -------------- | ----------- | --------- | | tensorrt | fp16 | 32 | 32768 | 1283.9 | 257.3 | 260.4 | 408.3 | 418.1 | 0.600/0.638 | 17.4/40.0 | +| tensorrt | fp32 | 32 | 65536 | 1868.8 | 150.4 | 152.2 | 190.4 | 194.1 | 0.638/0.638 | 16.9/40.0 | From 189c0e11b1d854dc76044fc81f7555e204451e27 Mon Sep 17 00:00:00 2001 From: shh2000 <13820618441@163.com> Date: Fri, 4 Aug 2023 12:56:24 +0800 Subject: [PATCH 4/7] add MFU --- inference/benchmarks/bertLarge/README.md | 19 ++++++++++--------- inference/benchmarks/resnet50/README.md | 14 +++++++------- .../configs/bertLarge/configurations.yaml | 6 ++++-- .../configs/resnet50/configurations.yaml | 6 ++++-- .../docker_images/nvidia/nvidia_analysis.py | 3 ++- inference/run.py | 7 ++++++- inference/run_inference.py | 8 +++----- 7 files changed, 36 insertions(+), 27 deletions(-) diff --git a/inference/benchmarks/bertLarge/README.md b/inference/benchmarks/bertLarge/README.md index 6c7cb0c29..f84a474eb 100644 --- a/inference/benchmarks/bertLarge/README.md +++ b/inference/benchmarks/bertLarge/README.md @@ -47,21 +47,22 @@ bert_reference_results_text_md5.txt | 指标名称 | 指标值索引 | 特殊说明 | | ------------------ | ----------------- | ----------------------------------------------------------- | | 数据精度 | precision | 可选fp32/fp16 | -| 批尺寸 | bs | 此外,对于bert-large seq_length==128 | -| 批输入大小 | byte_per_batch | 每一个batch包含的字节数 | +| 批尺寸 | bs | | | 硬件存储使用 | mem | 通常称为“显存”,单位为GiB | | 端到端时间 | e2e_time | 总时间+Perf初始化等时间 | | 验证总吞吐量 | p_val_whole | 实际验证序列数除以总验证时间 | -| 验证计算吞吐量 | *p_val_core | 不包含IO部分耗时 | +| 验证计算吞吐量 | p_val_core | 不包含IO部分耗时 | | 推理总吞吐量 | p_infer_whole | 实际推理序列数除以总推理时间 | -| **推理计算吞吐量** | ***p_infer_core** | 不包含IO部分耗时。此外,此值*seq_length即为token per second | -| 推理单样本耗时 | 
infer_time | 1/p_infer_core,单位为毫秒(ms)或微秒(μs) | +| **推理计算吞吐量** | **\*p_infer_core** | 不包含IO部分耗时 | +| **计算卡使用率** | **\*MFU** | model flops utilization | | 推理结果 | acc(推理/验证) | 单位为top1MaskedLM准确率(acc1) | * 指标值 -| 推理工具 | precision | bs | byte_per_batch | e2e_time | p_val_whole | \*p_val_core | p_infer_whole | \*p_infer_core | acc | mem | -| -------- | --------- | ---- | -------------- | -------- | ----------- | ------------ | ------------- | -------------- | ----------- | --------- | -| tensorrt | fp16 | 32 | 32768 | 1283.9 | 257.3 | 260.4 | 408.3 | 418.1 | 0.600/0.638 | 17.4/40.0 | -| tensorrt | fp32 | 32 | 65536 | 1868.8 | 150.4 | 152.2 | 190.4 | 194.1 | 0.638/0.638 | 16.9/40.0 | + +| 推理工具 | precision | bs | e2e_time | p_val_whole | p_val_core | p_infer_whole | \*p_infer_core | \*MFU | acc | mem | +| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | +| tensorrt | fp16 | 32 | 1283.9 | 257.3 | 260.4 | 408.3 | 418.1 | 45.3% | 0.600/0.638 | 17.4/40.0 | +| tensorrt | fp32 | 32 | 1868.8 | 150.4 | 152.2 | 190.4 | 194.1 | 42.0% | 0.638/0.638 | 16.9/40.0 | + diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md index e7436d315..024b7f417 100644 --- a/inference/benchmarks/resnet50/README.md +++ b/inference/benchmarks/resnet50/README.md @@ -68,20 +68,20 @@ find ./val -name "*JPEG" | wc -l | ------------------ | ---------------- | -------------------------------------------- | | 数据精度 | precision | 可选fp32/fp16 | | 批尺寸 | bs | | -| 批输入大小 | byte_per_batch | 每一个batch包含的字节数 | | 硬件存储使用 | mem | 通常称为“显存”,单位为GiB | | 端到端时间 | e2e_time | 总时间+Perf初始化等时间 | | 验证总吞吐量 | p_val_whole | 实际验证图片数除以总验证时间 | -| 验证计算吞吐量 | \*p_val_core | 不包含IO部分耗时 | +| 验证计算吞吐量 | p_val_core | 不包含IO部分耗时 | | 推理总吞吐量 | p_infer_whole | 实际推理图片数除以总推理时间 | | **推理计算吞吐量** | **\*p_infer_core** | 不包含IO部分耗时 | +| **计算卡使用率** | **\*MFU** | model flops utilization | | 推理结果 | acc(推理/验证) | 单位为top1分类准确率(acc1) | * 指标值 -| 推理工具 | precision | bs | byte_per_batch | e2e_time | p_val_whole | \*p_val_core | p_infer_whole | \*p_infer_core | acc | mem | -| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ---------- | -| tensorrt | fp16 | 256 | 77070336 | 613.4 | 1358.9 | 4263.3 | 1391.4 | 12406.0 | 76.2/76.2 | 19.7/40.0 | -| tensorrt | fp32 | 256 | 77070336 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 76.2/76.2 | 28.86/40.0 | -| torchtrt | fp16 | 256 | 77070336 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 76.2/76.2 | 9.42/40.0 | +| 推理工具 | precision | bs | e2e_time | p_val_whole | p_val_core | p_infer_whole | \*p_infer_core | \*MFU | acc | mem | +| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | ---------- | +| tensorrt | fp16 | 256 |613.4 | 1358.9 | 4469.4 | 1391.4 | 12698.7 | 16.8% | 76.2/76.2 | 19.7/40.0 | +| tensorrt | fp32 | 256 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 16.1% | 76.2/76.2 | 28.86/40.0 | +| torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 | diff --git a/inference/configs/bertLarge/configurations.yaml b/inference/configs/bertLarge/configurations.yaml index 515a95180..fbfb2f523 100644 --- a/inference/configs/bertLarge/configurations.yaml +++ b/inference/configs/bertLarge/configurations.yaml @@ -1,6 +1,8 @@ batch_size: 32 -# 512 length seq(1 item in x) -input_size: 512 +# 1 item(like 1 sequence, 1 image) flops +# Attention! 
For transformer decoder like bert, 1 token cause 2*param flops, so we need 2*length*params like 2*512*0.33B here +# format: a_1*a*2*...*a_nea_0,like 2*512*0.33e9(bert) or 4.12e9(resnet50) +flops: 2*512*0.33e9 fp16: true compiler: tensorrt num_workers: 8 diff --git a/inference/configs/resnet50/configurations.yaml b/inference/configs/resnet50/configurations.yaml index fa1739983..814dc58a3 100644 --- a/inference/configs/resnet50/configurations.yaml +++ b/inference/configs/resnet50/configurations.yaml @@ -1,6 +1,8 @@ batch_size: 256 -# 3*224*224(1 item in x) -input_size: 150528 +# 1 item(like 1 sequence, 1 image) flops +# Attention! For transformer decoder like bert, 1 token cause 2*param flops, so we need 2*length*params like 2*512*0.33B here +# format: a_1*a*2*...*a_nea_0,like 2*512*0.33e9(bert) or 4.12e9(resnet50) +flops: 4.12e9 fp16: true compiler: tensorrt num_workers: 8 diff --git a/inference/docker_images/nvidia/nvidia_analysis.py b/inference/docker_images/nvidia/nvidia_analysis.py index 26132d19d..697148933 100644 --- a/inference/docker_images/nvidia/nvidia_analysis.py +++ b/inference/docker_images/nvidia/nvidia_analysis.py @@ -11,4 +11,5 @@ def analysis_log(logpath): max_mem = line.split(" ")[3] max_mem = float(max_mem[:-3]) - return round(max_usage / 1024.0, 2), round(max_mem / 1024.0, 2) + return round(max_usage / 1024.0, + 2), round(max_mem / 1024.0, 2), eval("156e12"), eval("312e12") diff --git a/inference/run.py b/inference/run.py index d452d83e6..cf768e364 100644 --- a/inference/run.py +++ b/inference/run.py @@ -446,11 +446,16 @@ def compilation_result(case_log_path, config): vendor_module = importlib.import_module("docker_images." + config.VENDOR + "." + config.VENDOR + "_analysis") - vendor_usage, vendor_maxmem = vendor_module.analysis_log(vendor_usage_path) + vendor_usage, vendor_maxmem, fp32, fp16 = vendor_module.analysis_log( + vendor_usage_path) case_perf["vendor_usage(GiB)"] = vendor_usage case_perf["vendor_max_mem(GiB)"] = vendor_maxmem + theory = fp32 if case_perf["precision"] == "fp32" else fp16 + mfu = case_perf["flops"] / theory + case_perf["*MFU"] = str(round(mfu * 100, 1)) + "%" + for key in case_perf.keys(): padding_str = str(key).ljust(43) + " : " + str( case_perf[key]).ljust(23) diff --git a/inference/run_inference.py b/inference/run_inference.py index ff42c393f..e56fd5527 100644 --- a/inference/run_inference.py +++ b/inference/run_inference.py @@ -140,19 +140,17 @@ def parse_args(): e2e_time = time.time() - e2e_start e2e_time = round(float(e2e_time), 3) - input_byte = 2 if config.fp16 else 4 - batch_input_byte = config.batch_size * config.input_size * input_byte - batch_input_byte = int(batch_input_byte) + flops = eval(config.flops) * p_infer_core infer_info = { "vendor": config.vendor, "compiler": config.compiler, "precision": "fp16" if config.fp16 else "fp32", "batchsize": config.batch_size, - "byte_per_batch": batch_input_byte, + "flops": flops, "e2e_time(second)": e2e_time, "p_validation_whole(qps)": p_forward, - "*p_validation_core(qps)": p_forward_core, + "p_validation_core(qps)": p_forward_core, "p_inference_whole(qps)": p_infer, "*p_inference_core(qps)": p_infer_core, "val_average_acc": val_acc, From d34c7a314a05835bac7297162f124def0a09f585 Mon Sep 17 00:00:00 2001 From: shh2000 <13820618441@163.com> Date: Tue, 8 Aug 2023 11:22:54 +0800 Subject: [PATCH 5/7] vit --- inference/benchmarks/vit_l_16/README.md | 86 ++++++++++++++ .../benchmarks/vit_l_16/pytorch/__init__.py | 5 + .../benchmarks/vit_l_16/pytorch/dataloader.py | 49 ++++++++ 
.../benchmarks/vit_l_16/pytorch/evaluator.py | 10 ++ .../benchmarks/vit_l_16/pytorch/export.py | 34 ++++++ .../benchmarks/vit_l_16/pytorch/forward.py | 106 ++++++++++++++++++ .../benchmarks/vit_l_16/pytorch/model.py | 14 +++ .../vit_l_16/pytorch/requirements.txt | 1 + .../configs/vit_l_16/configurations.yaml | 16 +++ inference/configs/vit_l_16/parameters.yaml | 1 + .../vendor_config/nvidia_configurations.yaml | 3 + 11 files changed, 325 insertions(+) create mode 100644 inference/benchmarks/vit_l_16/README.md create mode 100644 inference/benchmarks/vit_l_16/pytorch/__init__.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/dataloader.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/evaluator.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/export.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/forward.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/model.py create mode 100644 inference/benchmarks/vit_l_16/pytorch/requirements.txt create mode 100644 inference/configs/vit_l_16/configurations.yaml create mode 100644 inference/configs/vit_l_16/parameters.yaml create mode 100644 inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml diff --git a/inference/benchmarks/vit_l_16/README.md b/inference/benchmarks/vit_l_16/README.md new file mode 100644 index 000000000..391eac738 --- /dev/null +++ b/inference/benchmarks/vit_l_16/README.md @@ -0,0 +1,86 @@ +### 1. 推理数据集 +> Download website:https://image-net.org/ + +We use ImageNet2012 Validation Images: +| Dataset | FileName | Size | Checksum | +| ----------------------------- | ---------------------- | ----- | ------------------------------------- | +| Validation images (all tasks) | ILSVRC2012_img_val.tar | 6.3GB | MD5: 29b22e2961454d5413ddabcf34fc5622 | +Dataset format conversion: +https://github.com/pytorch/examples/blob/main/imagenet/extract_ILSVRC.sh + +make sure ILSVRC2012_img_train.tar & ILSVRC2012_img_val.tar are in the same directory with extract_ILSVRC.sh. +```bash +sh extract_ILSVRC.sh +``` + +preview directory structures of decompressed dataset. + +```bash +tree -d -L 1 +``` + +``` +. +├── train +└── val +``` +dataset samples size + +```bash +find ./val -name "*JPEG" | wc -l +50000 +``` + +### 2. 模型与权重 + +* 模型实现 + * pytorch:transformers.ViTForImageClassification +* 权重下载 + * pytorch:from_pretrained("google/vit-large-patch16-224") + +### 2. 软硬件配置与运行信息参考 + +#### 2.1 Nvidia A100 + +- ##### 硬件环境 + - 机器、加速卡型号: NVIDIA_A100-SXM4-40GB + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-113-generic + - 加速卡驱动版本:470.129.06 + - Docker 版本:20.10.16 + - 训练框架版本:pytorch-1.13.0a0+937e930 + - 依赖软件版本: + - cuda: 11.8 + +- 推理工具包 + + - TensorRT 8.5.1.7 + - torch_tensorrt 1.3.0 + +### 3. 
运行情况 + +* 指标列表 + +| 指标名称 | 指标值索引 | 特殊说明 | +| ------------------ | ---------------- | -------------------------------------------- | +| 数据精度 | precision | 可选fp32/fp16 | +| 批尺寸 | bs | | +| 硬件存储使用 | mem | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time | 总时间+Perf初始化等时间 | +| 验证总吞吐量 | p_val_whole | 实际验证图片数除以总验证时间 | +| 验证计算吞吐量 | p_val_core | 不包含IO部分耗时 | +| 推理总吞吐量 | p_infer_whole | 实际推理图片数除以总推理时间 | +| **推理计算吞吐量** | **\*p_infer_core** | 不包含IO部分耗时 | +| **计算卡使用率** | **\*MFU** | model flops utilization | +| 推理结果 | acc(推理/验证) | 单位为top1分类准确率(acc1) | + +* 指标值 + +| 推理工具 | precision | bs | e2e_time | p_val_whole | p_val_core | p_infer_whole | \*p_infer_core | \*MFU | acc | mem | +| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | +| tensorrt | fp16 | 64 |1009.7 | 777.8 | 796.7 | 825.8 | 1329.2 | 26.2% | 79.0/79.3 | 35.0/40.0 | +| tensorrt | fp32 | 32 | 1275.9 | 482.4 | 491.1 | 555.5 | 590.5 | 23.3% | 79.3/79.3 | 35.0/40.0 | + diff --git a/inference/benchmarks/vit_l_16/pytorch/__init__.py b/inference/benchmarks/vit_l_16/pytorch/__init__.py new file mode 100644 index 000000000..1f6cdf49b --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/__init__.py @@ -0,0 +1,5 @@ +from .dataloader import build_dataloader +from .model import create_model +from .export import export_model +from .evaluator import evaluator +from .forward import model_forward, engine_forward diff --git a/inference/benchmarks/vit_l_16/pytorch/dataloader.py b/inference/benchmarks/vit_l_16/pytorch/dataloader.py new file mode 100644 index 000000000..d08453f1e --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/dataloader.py @@ -0,0 +1,49 @@ +import torchvision as tv +from torch.utils.data import DataLoader as dl +import torch +import tqdm + + +def build_dataset(config): + crop = 256 + c_crop = 224 + mean = (0.485, 0.456, 0.406) + std = (0.229, 0.224, 0.225) + + if config.fp16: + + class ToFloat16(object): + + def __call__(self, tensor): + return tensor.to(dtype=torch.float16) + + tx = tv.transforms.Compose([ + tv.transforms.Resize(crop), + tv.transforms.CenterCrop(c_crop), + tv.transforms.ToTensor(), + ToFloat16(), + tv.transforms.Normalize(mean=mean, std=std), + ]) + dataset = tv.datasets.ImageFolder(config.data_dir, tx) + else: + tx = tv.transforms.Compose([ + tv.transforms.Resize(crop), + tv.transforms.CenterCrop(c_crop), + tv.transforms.ToTensor(), + tv.transforms.Normalize(mean=mean, std=std), + ]) + dataset = tv.datasets.ImageFolder(config.data_dir, tx) + + return dataset + + +def build_dataloader(config): + dataset = build_dataset(config) + loader = dl(dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=True, + num_workers=config.num_workers, + pin_memory=True) + + return loader diff --git a/inference/benchmarks/vit_l_16/pytorch/evaluator.py b/inference/benchmarks/vit_l_16/pytorch/evaluator.py new file mode 100644 index 000000000..5481c5e5b --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/evaluator.py @@ -0,0 +1,10 @@ +def topk(output, target, ks=(1, )): + _, pred = output.topk(max(ks), 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + return [correct[:k].max(0)[0] for k in ks] + + +def evaluator(pred, ground_truth): + top1, top5 = topk(pred, ground_truth, ks=(1, 5)) + return top1 diff --git a/inference/benchmarks/vit_l_16/pytorch/export.py b/inference/benchmarks/vit_l_16/pytorch/export.py new file mode 100644 index 000000000..3df1a821b --- /dev/null +++ 
b/inference/benchmarks/vit_l_16/pytorch/export.py @@ -0,0 +1,34 @@ +import torch +import os + + +def export_model(model, config): + if config.exist_onnx_path is not None: + return config.exist_onnx_path + + filename = config.case + "_bs" + str(config.batch_size) + filename = filename + "_" + str(config.framework) + filename = filename + "_fp16" + str(config.fp16) + filename = "onnxs/" + filename + ".onnx" + onnx_path = config.perf_dir + "/" + filename + + dummy_input = torch.randn(config.batch_size, 3, 224, 224) + + if config.fp16: + dummy_input = dummy_input.half() + dummy_input = dummy_input.cuda() + + dir_onnx_path = os.path.dirname(onnx_path) + os.makedirs(dir_onnx_path, exist_ok=True) + + with torch.no_grad(): + torch.onnx.export(model, + dummy_input, + onnx_path, + verbose=False, + input_names=["input"], + output_names=["output"], + training=torch.onnx.TrainingMode.EVAL, + do_constant_folding=True) + + return onnx_path diff --git a/inference/benchmarks/vit_l_16/pytorch/forward.py b/inference/benchmarks/vit_l_16/pytorch/forward.py new file mode 100644 index 000000000..a61caf685 --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/forward.py @@ -0,0 +1,106 @@ +from loguru import logger +import torch +import numpy as np +import time +from tools import torch_sync + + +def cal_perf(config, dataloader_len, duration, core_time, str_prefix): + model_forward_perf = config.repeat * dataloader_len * config.batch_size / duration + logger.info(str_prefix + "(" + config.framework + ") Perf: " + + str(model_forward_perf) + " ips") + model_forward_core_perf = config.repeat * dataloader_len * config.batch_size / core_time + logger.info(str_prefix + "(" + config.framework + ") core Perf: " + + str(model_forward_core_perf) + " ips") + return round(model_forward_perf, 3), round(model_forward_core_perf, 3) + + +def model_forward(model, dataloader, evaluator, config): + if config.no_validation: + return None, None, None + start = time.time() + core_time = 0.0 + acc = [] + + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + + all_top1 = [] + for step, (x, y) in enumerate(dataloader): + torch_sync(config) + core_time_start = time.time() + + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + + x = x.cuda() + y = y.cuda() + pred = model(x)[0] + torch_sync(config) + core_time += time.time() - core_time_start + + top1 = evaluator(pred, y) + + all_top1.extend(top1.cpu()) + + acc.append(np.mean(all_top1)) + + logger.info("Top1 Acc: " + str(acc)) + + duration = time.time() - start + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time, "Validation") + + return model_forward_perf, model_forward_core_perf, round( + float(np.mean(acc)), 3) + + +def engine_forward(model, dataloader, evaluator, config): + start = time.time() + core_time = 0.0 + foo_time = 0.0 + acc = [] + + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + + all_top1 = [] + for step, (x, y) in enumerate(dataloader): + torch_sync(config) + core_time_start = time.time() + + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + + outputs = model([x]) + pred = outputs[0] + foo_time += outputs[1] + + torch_sync(config) + core_time += time.time() - core_time_start + + pred = pred[0].float() + pred = pred.reshape(config.batch_size, -1) + pred = pred.cpu() + top1 = evaluator(pred, y) + + 
all_top1.extend(top1.cpu()) + + acc.append(np.mean(all_top1)) + + logger.info("Top1 Acc: " + str(acc)) + + duration = time.time() - start - foo_time + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time - foo_time, "Inference") + + return model_forward_perf, model_forward_core_perf, round( + float(np.mean(acc)), 3) diff --git a/inference/benchmarks/vit_l_16/pytorch/model.py b/inference/benchmarks/vit_l_16/pytorch/model.py new file mode 100644 index 000000000..186148119 --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/model.py @@ -0,0 +1,14 @@ +from transformers import ViTForImageClassification as vit + + +def create_model(config): + if config.no_validation: + assert config.exist_onnx_path is not None + return None + model = vit.from_pretrained(config.weights) + model.cuda() + model.eval() + if config.fp16: + model.half() + + return model diff --git a/inference/benchmarks/vit_l_16/pytorch/requirements.txt b/inference/benchmarks/vit_l_16/pytorch/requirements.txt new file mode 100644 index 000000000..976a2b1f3 --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/requirements.txt @@ -0,0 +1 @@ +transformers diff --git a/inference/configs/vit_l_16/configurations.yaml b/inference/configs/vit_l_16/configurations.yaml new file mode 100644 index 000000000..da9354aa0 --- /dev/null +++ b/inference/configs/vit_l_16/configurations.yaml @@ -0,0 +1,16 @@ +batch_size: 32 +# 1 item(like 1 sequence, 1 image) flops +# Attention! For transformer decoder like bert, 1 token cause 2*param flops, so we need 2*length*params like 2*512*0.33B here +# format: a_1*a*2*...*a_nea_0,like 2*512*0.33e9(bert) or 4.12e9(resnet50) +flops: 6.16e10 +fp16: false +compiler: tensorrt +num_workers: 8 +log_freq: 30 +repeat: 5 +# skip validation(will also skip create_model, export onnx). Assert exist_onnx_path != null +no_validation: false +# set a real onnx_path to use exist, or set it to anything but null to avoid export onnx manually(like torch-tensorrt) +exist_onnx_path: null +# set a exist path of engine file like resnet50.trt/resnet50.plan/resnet50.engine +exist_compiler_path: null \ No newline at end of file diff --git a/inference/configs/vit_l_16/parameters.yaml b/inference/configs/vit_l_16/parameters.yaml new file mode 100644 index 000000000..d5d7da9dd --- /dev/null +++ b/inference/configs/vit_l_16/parameters.yaml @@ -0,0 +1 @@ +weights: "google/vit-large-patch16-224" \ No newline at end of file diff --git a/inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml b/inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml new file mode 100644 index 000000000..5fc40bbf6 --- /dev/null +++ b/inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml @@ -0,0 +1,3 @@ +trt_tmp_path: nvidia_tmp/vit.trt +has_dynamic_axis: false +torchtrt_full_compile: true \ No newline at end of file From 45a744425a46aeff3a699b5d174e7b732e301866 Mon Sep 17 00:00:00 2001 From: shh2000 <13820618441@163.com> Date: Tue, 8 Aug 2023 11:47:08 +0800 Subject: [PATCH 6/7] addsrc --- inference/benchmarks/vit_l_16/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inference/benchmarks/vit_l_16/README.md b/inference/benchmarks/vit_l_16/README.md index 391eac738..5998c0cf9 100644 --- a/inference/benchmarks/vit_l_16/README.md +++ b/inference/benchmarks/vit_l_16/README.md @@ -34,9 +34,9 @@ find ./val -name "*JPEG" | wc -l ### 2. 
模型与权重 * 模型实现 - * pytorch:transformers.ViTForImageClassification + * pytorch:transformers.ViTForImageClassification(hugging face) * 权重下载 - * pytorch:from_pretrained("google/vit-large-patch16-224") + * pytorch:from_pretrained("google/vit-large-patch16-224")(hugging face) ### 2. 软硬件配置与运行信息参考 From 6f9ad1b323ef8b636516d653e1cc2f983a7d1609 Mon Sep 17 00:00:00 2001 From: shh2000 <13820618441@163.com> Date: Wed, 9 Aug 2023 11:29:26 +0800 Subject: [PATCH 7/7] sd --- .../stable_diffusion_v1_4/README.md | 60 + .../stable_diffusion_v1_4/pytorch/__init__.py | 5 + .../pytorch/dataloader.py | 31 + .../pytorch/evaluator.py | 12 + .../stable_diffusion_v1_4/pytorch/export.py | 41 + .../stable_diffusion_v1_4/pytorch/forward.py | 251 ++++ .../stable_diffusion_v1_4/pytorch/model.py | 16 + .../pytorch/model_utils/unet2d.py | 1064 +++++++++++++++++ .../pytorch/requirements.txt | 3 + .../stable_diffusion_v1_4/configurations.yaml | 16 + .../stable_diffusion_v1_4/parameters.yaml | 14 + .../vendor_config/nvidia_configurations.yaml | 3 + 12 files changed, 1516 insertions(+) create mode 100644 inference/benchmarks/stable_diffusion_v1_4/README.md create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/__init__.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/dataloader.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/evaluator.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/export.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/forward.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/model.py create mode 100755 inference/benchmarks/stable_diffusion_v1_4/pytorch/model_utils/unet2d.py create mode 100644 inference/benchmarks/stable_diffusion_v1_4/pytorch/requirements.txt create mode 100644 inference/configs/stable_diffusion_v1_4/configurations.yaml create mode 100644 inference/configs/stable_diffusion_v1_4/parameters.yaml create mode 100644 inference/configs/stable_diffusion_v1_4/vendor_config/nvidia_configurations.yaml diff --git a/inference/benchmarks/stable_diffusion_v1_4/README.md b/inference/benchmarks/stable_diffusion_v1_4/README.md new file mode 100644 index 000000000..07aade914 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/README.md @@ -0,0 +1,60 @@ +### 1. 推理数据集 + + +### 2. 模型与权重 + +* 模型实现 + * pytorch:transformers.UNet2DConditionalModel +* 权重下载 + * pytorch:from_pretrained("CompViz/stable-diffusion-v1-4") + +### 2. 软硬件配置与运行信息参考 + +#### 2.1 Nvidia A100 + +- ##### 硬件环境 + - 机器、加速卡型号: NVIDIA_A100-SXM4-40GB + - 多机网络类型、带宽: InfiniBand,200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.4.0-113-generic + - 加速卡驱动版本:470.129.06 + - Docker 版本:20.10.16 + - 训练框架版本:pytorch-2.1.0a0+4136153 + - 依赖软件版本: + - cuda: 12.1 + +- 推理工具包 + + - TensorRT 8.6.1 + +- 其他说明 + + - 本case在大批尺寸情况下涉及到了张量超过4B的情况,因此在大批尺寸离线批推理场景下,不宜作为性能及MFU基准。 + +### 3. 
运行情况 + +* 指标列表 + +| 指标名称 | 指标值索引 | 特殊说明 | +| ------------------ | ---------------- | -------------------------------------------- | +| 数据精度 | precision | 可选fp32/fp16 | +| 批尺寸 | bs | | +| 硬件存储使用 | mem | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time | 总时间+Perf初始化等时间 | +| 验证总吞吐量 | p_val_whole | 实际验证prompts数除以总验证时间 | +| 验证计算吞吐量 | p_val_core | 不包含IO部分耗时 | +| 推理总吞吐量 | p_infer_whole | 实际推理prompts数除以总推理时间 | +| **推理计算吞吐量** | **\*p_infer_core** | 不包含IO部分耗时 | +| **计算卡使用率** | **\*MFU** | model flops utilization | +| 推理结果 | CLIP Score(推理/验证) | 单位为text2img耦合度分数 | + +* 指标值 + +| 推理工具 | precision | bs | e2e_time | p_val_whole | p_val_core | p_infer_whole | \*p_infer_core | \*MFU | CLIP Score | mem | +| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- | +| tensorrt | fp16 | 2 |1674.9 | 11.4 | 45.2 | 10.6 | 60.6 | 13.2% | 17.1/25.2 | 13.3/40.0 | +| tensorrt | fp32 | 2 | 1807.4 | 8.2 | 20.6 | 7.2 | 16.1 | 7.0% | 25.2/25.3 | 39.2/40.0 | +| null | fp16 | 16 | / | 11.7 | 60.7 | / | / | 13.2% | -/25.2 | 5.7/40.0 | +| null | fp32 | 8 | / | 9.3 | 27.3 | / | / | 11.9% | -/25.3 | 6.3/40.0 | diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/__init__.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/__init__.py new file mode 100644 index 000000000..1f6cdf49b --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/__init__.py @@ -0,0 +1,5 @@ +from .dataloader import build_dataloader +from .model import create_model +from .export import export_model +from .evaluator import evaluator +from .forward import model_forward, engine_forward diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/dataloader.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/dataloader.py new file mode 100644 index 000000000..94f00f2f2 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/dataloader.py @@ -0,0 +1,31 @@ +from torch.utils.data import DataLoader as dl +import torch +import json +import random + + +def build_dataset(config): + + df = json.load(open(config.data_dir + "/" + config.prompts))["annotations"] + prompts = [] + for item in df: + prompts.append(item["caption"]) + dataset = [ + item for item in prompts if len(item) < config.prompt_max_len - 2 + ] + random.seed(config.random_seed) + dataset = random.sample(dataset, config.prompt_samples) + + return dataset + + +def build_dataloader(config): + dataset = build_dataset(config) + loader = dl(dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=True, + num_workers=config.num_workers, + pin_memory=True) + + return loader diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/evaluator.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/evaluator.py new file mode 100644 index 000000000..824323809 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/evaluator.py @@ -0,0 +1,12 @@ +import torch + + +def evaluator(metric, image, prompt, config): + scores = [] + image = (image / 2 + 0.5).clamp(0, 1) + image = image.detach().cpu().permute(0, 2, 3, 1).numpy() + image = (image * 255).round().astype("uint8") + image = torch.tensor(image) + for i in range(config.batch_size): + scores.append(float(metric(image[i], prompt[i]))) + return scores diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/export.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/export.py new file mode 100644 index 000000000..60fa8fbb8 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/export.py @@ 
-0,0 +1,41 @@ +import torch +import os + + +def export_model(model, config): + if config.exist_onnx_path is not None: + return config.exist_onnx_path + + filename = config.case + "_bs" + str(config.batch_size) + filename = filename + "_" + str(config.framework) + filename = filename + "_fp16" + str(config.fp16) + filename = "onnxs/" + filename + ".onnx" + onnx_path = config.perf_dir + "/" + filename + + latent = torch.randn(config.batch_size * 2, config.in_channels, + config.height // config.scale_size, + config.width // config.scale_size).cuda().float() + t = torch.randn([]).cuda().int() + embed = torch.randn(config.batch_size * 2, config.prompt_max_len, + config.embed_hidden_size).cuda().float() + + if config.fp16: + latent = latent.half() + embed = embed.half() + + dummy_input = (latent, t, embed) + + dir_onnx_path = os.path.dirname(onnx_path) + os.makedirs(dir_onnx_path, exist_ok=True) + + with torch.no_grad(): + torch.onnx.export(model, + dummy_input, + onnx_path, + verbose=False, + input_names=["input_0", "input_1", "input_2"], + output_names=["output_0"], + training=torch.onnx.TrainingMode.EVAL, + do_constant_folding=True) + + return onnx_path diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/forward.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/forward.py new file mode 100644 index 000000000..a9314a90a --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/forward.py @@ -0,0 +1,251 @@ +from loguru import logger +import torch +import numpy as np +import time +from tools import torch_sync +from diffusers import AutoencoderKL, UNet2DConditionModel, DDIMScheduler +from transformers import CLIPTextModel, CLIPTokenizer +from torchmetrics.multimodal import CLIPScore + + +def cal_perf(config, dataloader_len, duration, core_time, str_prefix): + model_forward_perf = config.repeat * dataloader_len * config.batch_size * config.num_inference_steps / duration + logger.info(str_prefix + "(" + config.framework + ") Perf: " + + str(model_forward_perf) + " ips") + model_forward_core_perf = config.repeat * dataloader_len * config.batch_size * config.num_inference_steps / core_time + logger.info(str_prefix + "(" + config.framework + ") core Perf: " + + str(model_forward_core_perf) + " ips") + return round(model_forward_perf, 3), round(model_forward_core_perf, 3) + + +def model_forward(model, dataloader, evaluator, config): + if config.no_validation: + return None, None, None + vae = AutoencoderKL.from_pretrained(config.data_dir + "/" + config.weights, + subfolder="vae") + tokenizer = CLIPTokenizer.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="tokenizer") + text_encoder = CLIPTextModel.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="text_encoder") + noise_scheduler = DDIMScheduler( + num_train_timesteps=config.num_train_timesteps, + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + vae.eval() + text_encoder.eval() + + metric = CLIPScore(model_name_or_path=config.data_dir + "/" + + config.eval_weights) + metric.eval() + + generator = torch.Generator().manual_seed(config.random_seed) + + start = time.time() + core_time = 0.0 + scores = [] + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + for step, prompt in enumerate(dataloader): + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + text_input = tokenizer(prompt, + 
padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt") + + text_embeddings = text_encoder(text_input.input_ids)[0] + + max_length = text_input.input_ids.shape[-1] + uncond_input = tokenizer([""] * config.batch_size, + padding="max_length", + max_length=max_length, + return_tensors="pt") + + uncond_embeddings = text_encoder(uncond_input.input_ids)[0] + text_embeddings = torch.cat( + [uncond_embeddings, text_embeddings]) + + latents = torch.randn( + (config.batch_size, config.in_channels, config.height // + config.scale_size, config.width // config.scale_size), + generator=generator) + + noise_scheduler.set_timesteps(config.num_inference_steps) + + timesteps_tensor = torch.linspace( + config.num_train_timesteps - + config.num_train_timesteps // config.num_inference_steps, + 0, config.num_inference_steps).int() + + for t in timesteps_tensor: + latent_model_input = torch.cat([latents] * 2) + + torch_sync(config) + core_time_start = time.time() + if config.fp16: + noise_pred = model( + latent_model_input.cuda().to(torch.float16), + t.cuda(), + text_embeddings.cuda().to(torch.float16)) + else: + noise_pred = model(latent_model_input.cuda(), t.cuda(), + text_embeddings.cuda()) + + torch_sync(config) + core_time += time.time() - core_time_start + + noise_pred = noise_pred.to(torch.float32).cpu() + + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + config.guidance_scale * ( + noise_pred_text - noise_pred_uncond) + + latents = noise_scheduler.step(noise_pred, t, + latents).prev_sample + + latents = 1 / 0.18215 * latents + image = vae.decode(latents).sample + + scores_iter = evaluator(metric, image, prompt, config) + for score in scores_iter: + scores.append(score) + + duration = time.time() - start + logger.info("CLIP Scores: " + str(np.mean(scores))) + + duration = time.time() - start + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time, "Validation") + + return model_forward_perf, model_forward_core_perf, round( + float(np.mean(scores)), 3) + + +def engine_forward(model, dataloader, evaluator, config): + vae = AutoencoderKL.from_pretrained(config.data_dir + "/" + config.weights, + subfolder="vae") + tokenizer = CLIPTokenizer.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="tokenizer") + text_encoder = CLIPTextModel.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="text_encoder") + noise_scheduler = DDIMScheduler( + num_train_timesteps=config.num_train_timesteps, + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + vae.eval() + text_encoder.eval() + + metric = CLIPScore(model_name_or_path=config.data_dir + "/" + + config.eval_weights) + metric.eval() + + generator = torch.Generator().manual_seed(config.random_seed) + + start = time.time() + core_time = 0.0 + scores = [] + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + for step, prompt in enumerate(dataloader): + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + text_input = tokenizer(prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt") + + text_embeddings = text_encoder(text_input.input_ids)[0] + + max_length = text_input.input_ids.shape[-1] + uncond_input = tokenizer([""] * config.batch_size, + padding="max_length", 
+ max_length=max_length, + return_tensors="pt") + + uncond_embeddings = text_encoder(uncond_input.input_ids)[0] + text_embeddings = torch.cat( + [uncond_embeddings, text_embeddings]) + + latents = torch.randn( + (config.batch_size, config.in_channels, config.height // + config.scale_size, config.width // config.scale_size), + generator=generator) + + noise_scheduler.set_timesteps(config.num_inference_steps) + + timesteps_tensor = torch.linspace( + config.num_train_timesteps - + config.num_train_timesteps // config.num_inference_steps, + 0, config.num_inference_steps).int() + + for t in timesteps_tensor: + latent_model_input = torch.cat([latents] * 2) + + inputs = [latent_model_input, t, text_embeddings] + if config.fp16: + inputs = [ + latent_model_input.to(torch.float16), t, + text_embeddings.to(torch.float16) + ] + + torch_sync(config) + core_time_start = time.time() + outputs = model(inputs) + noise_pred = outputs[0] + foo_time = outputs[1] + + torch_sync(config) + core_time += time.time() - core_time_start + + noise_pred = noise_pred[0].float() + noise_pred = noise_pred.reshape( + config.batch_size * 2, config.in_channels, + config.height // config.scale_size, + config.width // config.scale_size) + noise_pred = noise_pred.cpu() + + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + config.guidance_scale * ( + noise_pred_text - noise_pred_uncond) + + latents = noise_scheduler.step(noise_pred, t, + latents).prev_sample + + latents = 1 / 0.18215 * latents + image = vae.decode(latents).sample + + scores_iter = evaluator(metric, image, prompt, config) + for score in scores_iter: + scores.append(score) + + duration = time.time() - start + logger.info("CLIP Scores: " + str(np.mean(scores))) + + duration = time.time() - start + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time, "Inference") + + return model_forward_perf, model_forward_core_perf, round( + float(np.mean(scores)), 3) diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/model.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/model.py new file mode 100644 index 000000000..e1b4db5cb --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/model.py @@ -0,0 +1,16 @@ +from .model_utils.unet2d import UNet2DConditionModel + + +def create_model(config): + if config.no_validation: + assert config.exist_onnx_path is not None + return None + model = UNet2DConditionModel.from_pretrained(config.data_dir + "/" + + config.weights, + subfolder="unet") + model.cuda() + model.eval() + if config.fp16: + model.half() + + return model diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/model_utils/unet2d.py b/inference/benchmarks/stable_diffusion_v1_4/pytorch/model_utils/unet2d.py new file mode 100755 index 000000000..cc803a9f3 --- /dev/null +++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/model_utils/unet2d.py @@ -0,0 +1,1064 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.loaders import UNet2DConditionLoadersMixin +from diffusers.utils import BaseOutput, logging +from diffusers.models.activations import get_activation +from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor +from diffusers.models.embeddings import ( + GaussianFourierProjection, + ImageHintTimeEmbedding, + ImageProjection, + ImageTimeEmbedding, + TextImageProjection, + TextImageTimeEmbedding, + TextTimeEmbedding, + TimestepEmbedding, + Timesteps, +) +from diffusers.models.modeling_utils import ModelMixin +from diffusers.models.unet_2d_blocks import ( + CrossAttnDownBlock2D, + CrossAttnUpBlock2D, + DownBlock2D, + UNetMidBlock2DCrossAttn, + UNetMidBlock2DSimpleCrossAttn, + UpBlock2D, + get_down_block, + get_up_block, +) + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class UNet2DConditionOutput(BaseOutput): + """ + The output of [`UNet2DConditionModel`]. + + Args: + sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. + """ + + sample: torch.FloatTensor = None + + +class UNet2DConditionModel(ModelMixin, ConfigMixin, + UNet2DConditionLoadersMixin): + r""" + A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample + shaped output. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). + + Parameters: + sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): + Height and width of input/output sample. + in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 4): Number of channels in the output. + center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample. + flip_sin_to_cos (`bool`, *optional*, defaults to `False`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding. + down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): + The tuple of downsample blocks to use. + mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`): + Block type for middle of UNet, it can be either `UNetMidBlock2DCrossAttn` or + `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`): + The tuple of upsample blocks to use. + only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`): + Whether to include self-attention in the basic transformer blocks, see + [`~models.attention.BasicTransformerBlock`]. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. 
+ layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block. + downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution. + mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. + If `None`, normalization and activation layers is skipped in post-processing. + norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. + cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): + The dimension of the cross attention features. + transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for + [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`], + [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`]. + encoder_hid_dim (`int`, *optional*, defaults to None): + If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` + dimension to `cross_attention_dim`. + encoder_hid_dim_type (`str`, *optional*, defaults to `None`): + If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text + embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. + attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. + num_attention_heads (`int`, *optional*): + The number of attention heads. If not defined, defaults to `attention_head_dim` + resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config + for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`. + class_embed_type (`str`, *optional*, defaults to `None`): + The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, + `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + addition_embed_type (`str`, *optional*, defaults to `None`): + Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or + "text". "text" will use the `TextTimeEmbedding` layer. + addition_time_embed_dim: (`int`, *optional*, defaults to `None`): + Dimension for the timestep embeddings. + num_class_embeds (`int`, *optional*, defaults to `None`): + Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing + class conditioning with `class_embed_type` equal to `None`. + time_embedding_type (`str`, *optional*, defaults to `positional`): + The type of position embedding to use for timesteps. Choose from `positional` or `fourier`. + time_embedding_dim (`int`, *optional*, defaults to `None`): + An optional override for the dimension of the projected time embedding. + time_embedding_act_fn (`str`, *optional*, defaults to `None`): + Optional activation function to use only once on the time embeddings before they are passed to the rest of + the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`. + timestep_post_act (`str`, *optional*, defaults to `None`): + The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`. 
+ time_cond_proj_dim (`int`, *optional*, defaults to `None`): + The dimension of `cond_proj` layer in the timestep embedding. + conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. + conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. + projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when + `class_embed_type="projection"`. Required when `class_embed_type="projection"`. + class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time + embeddings with the class embeddings. + mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`): + Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If + `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the + `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False` + otherwise. + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + sample_size: Optional[int] = None, + in_channels: int = 4, + out_channels: int = 4, + center_input_sample: bool = False, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", + up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D"), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int] = (320, 640, 1280, 1280), + layers_per_block: Union[int, Tuple[int]] = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: Union[int, Tuple[int]] = 1280, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, + encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, + attention_head_dim: Union[int, Tuple[int]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, + dual_cross_attention: bool = False, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, + addition_time_embed_dim: Optional[int] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + resnet_skip_time_act: bool = False, + resnet_out_scale_factor: int = 1.0, + time_embedding_type: str = "positional", + time_embedding_dim: Optional[int] = None, + time_embedding_act_fn: Optional[str] = None, + timestep_post_act: Optional[str] = None, + time_cond_proj_dim: Optional[int] = None, + conv_in_kernel: int = 3, + conv_out_kernel: int = 3, + projection_class_embeddings_input_dim: Optional[int] = None, + class_embeddings_concat: bool = False, + mid_block_only_cross_attention: Optional[bool] = None, + cross_attention_norm: Optional[str] = None, + addition_embed_type_num_heads=64, + ): + super().__init__() + + self.sample_size = sample_size + + if num_attention_heads is not None: + raise ValueError( + "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. 
Passing `num_attention_heads` will only be supported in diffusers v0.19." + ) + + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + + # Check inputs + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." + ) + + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance( + only_cross_attention, + bool) and len(only_cross_attention) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." + ) + + if not isinstance( + num_attention_heads, + int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." + ) + + if not isinstance( + attention_head_dim, + int) and len(attention_head_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." + ) + + if isinstance( + cross_attention_dim, + list) and len(cross_attention_dim) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}." + ) + + if not isinstance( + layers_per_block, + int) and len(layers_per_block) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}." + ) + + # input + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in = nn.Conv2d(in_channels, + block_out_channels[0], + kernel_size=conv_in_kernel, + padding=conv_in_padding) + + # time + if time_embedding_type == "fourier": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 2 + if time_embed_dim % 2 != 0: + raise ValueError( + f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}." 
+ ) + self.time_proj = GaussianFourierProjection( + time_embed_dim // 2, + set_W_to_weight=False, + log=False, + flip_sin_to_cos=flip_sin_to_cos) + timestep_input_dim = time_embed_dim + elif time_embedding_type == "positional": + time_embed_dim = time_embedding_dim or block_out_channels[0] * 4 + + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, + freq_shift) + timestep_input_dim = block_out_channels[0] + else: + raise ValueError( + f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`." + ) + + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + post_act_fn=timestep_post_act, + cond_proj_dim=time_cond_proj_dim, + ) + + if encoder_hid_dim_type is None and encoder_hid_dim is not None: + encoder_hid_dim_type = "text_proj" + self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type) + logger.info( + "encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined." + ) + + if encoder_hid_dim is None and encoder_hid_dim_type is not None: + raise ValueError( + f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}." + ) + + if encoder_hid_dim_type == "text_proj": + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, + cross_attention_dim) + elif encoder_hid_dim_type == "text_image_proj": + # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)` + self.encoder_hid_proj = TextImageProjection( + text_embed_dim=encoder_hid_dim, + image_embed_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim, + ) + elif encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 + self.encoder_hid_proj = ImageProjection( + image_embed_dim=encoder_hid_dim, + cross_attention_dim=cross_attention_dim, + ) + elif encoder_hid_dim_type is not None: + raise ValueError( + f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." + ) + else: + self.encoder_hid_proj = None + + # class embedding + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, + time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding(timestep_input_dim, + time_embed_dim, + act_fn=act_fn) + elif class_embed_type == "identity": + self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) + elif class_embed_type == "projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" + ) + # The projection `class_embed_type` is the same as the timestep `class_embed_type` except + # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings + # 2. it projects from an arbitrary input dimension. + # + # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. + # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. + # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
+ self.class_embedding = TimestepEmbedding( + projection_class_embeddings_input_dim, time_embed_dim) + elif class_embed_type == "simple_projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set" + ) + self.class_embedding = nn.Linear( + projection_class_embeddings_input_dim, time_embed_dim) + else: + self.class_embedding = None + + if addition_embed_type == "text": + if encoder_hid_dim is not None: + text_time_embedding_from_dim = encoder_hid_dim + else: + text_time_embedding_from_dim = cross_attention_dim + + self.add_embedding = TextTimeEmbedding( + text_time_embedding_from_dim, + time_embed_dim, + num_heads=addition_embed_type_num_heads) + elif addition_embed_type == "text_image": + # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)` + self.add_embedding = TextImageTimeEmbedding( + text_embed_dim=cross_attention_dim, + image_embed_dim=cross_attention_dim, + time_embed_dim=time_embed_dim) + elif addition_embed_type == "text_time": + self.add_time_proj = Timesteps(addition_time_embed_dim, + flip_sin_to_cos, freq_shift) + self.add_embedding = TimestepEmbedding( + projection_class_embeddings_input_dim, time_embed_dim) + elif addition_embed_type == "image": + # Kandinsky 2.2 + self.add_embedding = ImageTimeEmbedding( + image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) + elif addition_embed_type == "image_hint": + # Kandinsky 2.2 ControlNet + self.add_embedding = ImageHintTimeEmbedding( + image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) + elif addition_embed_type is not None: + raise ValueError( + f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'." + ) + + if time_embedding_act_fn is None: + self.time_embed_act = None + else: + self.time_embed_act = get_activation(time_embedding_act_fn) + + self.down_blocks = nn.ModuleList([]) + self.up_blocks = nn.ModuleList([]) + + if isinstance(only_cross_attention, bool): + if mid_block_only_cross_attention is None: + mid_block_only_cross_attention = only_cross_attention + + only_cross_attention = [only_cross_attention + ] * len(down_block_types) + + if mid_block_only_cross_attention is None: + mid_block_only_cross_attention = False + + if isinstance(num_attention_heads, int): + num_attention_heads = ( + num_attention_heads, ) * len(down_block_types) + + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim, ) * len(down_block_types) + + if isinstance(cross_attention_dim, int): + cross_attention_dim = ( + cross_attention_dim, ) * len(down_block_types) + + if isinstance(layers_per_block, int): + layers_per_block = [layers_per_block] * len(down_block_types) + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block + ] * len(down_block_types) + + if class_embeddings_concat: + # The time embeddings are concatenated with the class embeddings. 
The dimension of the + # time embeddings passed to the down, middle, and up blocks is twice the dimension of the + # regular time embeddings + blocks_time_embed_dim = time_embed_dim * 2 + else: + blocks_time_embed_dim = time_embed_dim + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block[i], + transformer_layers_per_block=transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + temb_channels=blocks_time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim[i], + num_attention_heads=num_attention_heads[i], + downsample_padding=downsample_padding, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] + if attention_head_dim[i] is not None else output_channel, + ) + self.down_blocks.append(down_block) + + # mid + if mid_block_type == "UNetMidBlock2DCrossAttn": + self.mid_block = UNetMidBlock2DCrossAttn( + transformer_layers_per_block=transformer_layers_per_block[-1], + in_channels=block_out_channels[-1], + temb_channels=blocks_time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim[-1], + num_attention_heads=num_attention_heads[-1], + resnet_groups=norm_num_groups, + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + ) + elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn": + self.mid_block = UNetMidBlock2DSimpleCrossAttn( + in_channels=block_out_channels[-1], + temb_channels=blocks_time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + cross_attention_dim=cross_attention_dim[-1], + attention_head_dim=attention_head_dim[-1], + resnet_groups=norm_num_groups, + resnet_time_scale_shift=resnet_time_scale_shift, + skip_time_act=resnet_skip_time_act, + only_cross_attention=mid_block_only_cross_attention, + cross_attention_norm=cross_attention_norm, + ) + elif mid_block_type is None: + self.mid_block = None + else: + raise ValueError(f"unknown mid_block_type : {mid_block_type}") + + # count how many layers upsample the images + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) + reversed_layers_per_block = list(reversed(layers_per_block)) + reversed_cross_attention_dim = list(reversed(cross_attention_dim)) + reversed_transformer_layers_per_block = list( + reversed(transformer_layers_per_block)) + only_cross_attention = list(reversed(only_cross_attention)) + + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = 
output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min( + i + 1, + len(block_out_channels) - 1)] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=reversed_layers_per_block[i] + 1, + transformer_layers_per_block= + reversed_transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=blocks_time_embed_dim, + add_upsample=add_upsample, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=reversed_cross_attention_dim[i], + num_attention_heads=reversed_num_attention_heads[i], + dual_cross_attention=dual_cross_attention, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + resnet_skip_time_act=resnet_skip_time_act, + resnet_out_scale_factor=resnet_out_scale_factor, + cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] + if attention_head_dim[i] is not None else output_channel, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + if norm_num_groups is not None: + self.conv_norm_out = nn.GroupNorm( + num_channels=block_out_channels[0], + num_groups=norm_num_groups, + eps=norm_eps) + + self.conv_act = get_activation(act_fn) + + else: + self.conv_norm_out = None + self.conv_act = None + + conv_out_padding = (conv_out_kernel - 1) // 2 + self.conv_out = nn.Conv2d(block_out_channels[0], + out_channels, + kernel_size=conv_out_kernel, + padding=conv_out_padding) + + @property + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, + processors: Dict[str, + AttentionProcessor]): + if hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.processor + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, + processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + def set_attn_processor(self, processor: Union[AttentionProcessor, + Dict[str, + AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, + processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, + processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + self.set_attn_processor(AttnProcessor()) + + def set_attention_slice(self, slice_size): + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. + """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + + num_sliceable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_sliceable_layers * [1] + + slice_size = num_sliceable_layers * [slice_size] if not isinstance( + slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError( + f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. 
+ # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: torch.nn.Module, + slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D, + CrossAttnUpBlock2D, UpBlock2D)): + module.gradient_checkpointing = value + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[UNet2DConditionOutput, Tuple]: + r""" + The [`UNet2DConditionModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + encoder_attention_mask (`torch.Tensor`): + A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If + `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias, + which adds large negative values to the attention scores corresponding to "discard" tokens. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. + added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. + + Returns: + [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. + """ + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. 
+ default_overall_up_factor = 2**self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]): + logger.info( + "Forward upsample size to force interpolation output size.") + forward_upsample_size = True + + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) + if attention_mask is not None: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None: + encoder_attention_mask = ( + 1 - encoder_attention_mask.to(sample.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + # 0. center input if necessary + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], + dtype=dtype, + device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=sample.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError( + "class_labels should be provided when num_class_embeds > 0" + ) + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # there might be better ways to encapsulate this. 
+ class_labels = class_labels.to(dtype=sample.dtype) + + class_emb = self.class_embedding(class_labels).to( + dtype=sample.dtype) + + if self.config.class_embeddings_concat: + emb = torch.cat([emb, class_emb], dim=-1) + else: + emb = emb + class_emb + + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + elif self.config.addition_embed_type == "text_image": + # Kandinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + + image_embs = added_cond_kwargs.get("image_embeds") + text_embs = added_cond_kwargs.get("text_embeds", + encoder_hidden_states) + aug_emb = self.add_embedding(text_embs, image_embs) + elif self.config.addition_embed_type == "text_time": + # SDXL - style + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(emb.dtype) + aug_emb = self.add_embedding(add_embeds) + elif self.config.addition_embed_type == "image": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + aug_emb = self.add_embedding(image_embs) + elif self.config.addition_embed_type == "image_hint": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + hint = added_cond_kwargs.get("hint") + aug_emb, hint = self.add_embedding(image_embs, hint) + sample = torch.cat([sample, hint], dim=1) + + emb = emb + aug_emb if aug_emb is not None else emb + + if self.time_embed_act is not None: + emb = self.time_embed_act(emb) + + if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj": + encoder_hidden_states = self.encoder_hid_proj( + encoder_hidden_states) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj": + # Kadinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj( + encoder_hidden_states, 
image_embeds) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(image_embeds) + # 2. pre-process + sample = self.conv_in(sample) + + # 3. down + + is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None + is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None + + down_block_res_samples = (sample, ) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention" + ) and downsample_block.has_cross_attention: + # For t2i-adapter CrossAttnDownBlock2D + additional_residuals = {} + if is_adapter and len(down_block_additional_residuals) > 0: + additional_residuals[ + "additional_residuals"] = down_block_additional_residuals.pop( + 0) + + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + **additional_residuals, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, + temb=emb) + + if is_adapter and len(down_block_additional_residuals) > 0: + sample += down_block_additional_residuals.pop(0) + + down_block_res_samples += res_samples + + if is_controlnet: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples = new_down_block_res_samples + ( + down_block_res_sample, ) + + down_block_res_samples = new_down_block_res_samples + + # 4. mid + if self.mid_block is not None: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + ) + + if is_controlnet: + sample = sample + mid_block_additional_residual + + # 5. up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets):] + down_block_res_samples = down_block_res_samples[:-len( + upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention" + ) and upsample_block.has_cross_attention: + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + ) + else: + sample = upsample_block(hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size) + + # 6. 
post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+        return sample
diff --git a/inference/benchmarks/stable_diffusion_v1_4/pytorch/requirements.txt b/inference/benchmarks/stable_diffusion_v1_4/pytorch/requirements.txt
new file mode 100644
index 000000000..2bd1558a3
--- /dev/null
+++ b/inference/benchmarks/stable_diffusion_v1_4/pytorch/requirements.txt
@@ -0,0 +1,3 @@
+transformers
+diffusers
+torchmetrics
diff --git a/inference/configs/stable_diffusion_v1_4/configurations.yaml b/inference/configs/stable_diffusion_v1_4/configurations.yaml
new file mode 100644
index 000000000..77014a03b
--- /dev/null
+++ b/inference/configs/stable_diffusion_v1_4/configurations.yaml
@@ -0,0 +1,16 @@
+batch_size: 2
+# FLOPs for 1 item (e.g. 1 sequence or 1 image)
+# Attention! For a transformer model like BERT, 1 token costs 2*params FLOPs, so use 2*length*params, e.g. 2*512*0.33e9
+# format: a_1*a_2*...*a_n*a_0, e.g. 2*512*0.33e9 (bert) or 4.12e9 (resnet50)
+flops: 6.78e11
+fp16: false
+compiler: tensorrt
+num_workers: 8
+log_freq: 5
+repeat: 1
+# skip validation (also skips create_model and ONNX export); requires exist_onnx_path != null
+no_validation: false
+# set a real onnx_path to reuse an existing ONNX file, or set it to anything non-null to skip the manual ONNX export (e.g. for torch-tensorrt)
+exist_onnx_path: null
+# set an existing engine-file path, e.g. resnet50.trt/resnet50.plan/resnet50.engine
+exist_compiler_path: null
diff --git a/inference/configs/stable_diffusion_v1_4/parameters.yaml b/inference/configs/stable_diffusion_v1_4/parameters.yaml
new file mode 100644
index 000000000..b8d6d33f0
--- /dev/null
+++ b/inference/configs/stable_diffusion_v1_4/parameters.yaml
@@ -0,0 +1,14 @@
+weights: "weights_v1_4"
+eval_weights: "weights_evaluator"
+prompts: "data_vizwiz/val.json"
+random_seed: 0
+prompt_max_len: 77
+in_channels: 4
+height: 512
+width: 512
+scale_size: 8
+num_inference_steps: 50
+guidance_scale: 7.5
+prompt_samples: 10
+num_train_timesteps: 1000
+embed_hidden_size: 768
diff --git a/inference/configs/stable_diffusion_v1_4/vendor_config/nvidia_configurations.yaml b/inference/configs/stable_diffusion_v1_4/vendor_config/nvidia_configurations.yaml
new file mode 100644
index 000000000..130eff42e
--- /dev/null
+++ b/inference/configs/stable_diffusion_v1_4/vendor_config/nvidia_configurations.yaml
@@ -0,0 +1,3 @@
+trt_tmp_path: nvidia_tmp/unet.trt
+has_dynamic_axis: false
+torchtrt_full_compile: true
\ No newline at end of file
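
As a quick sanity check on the configuration values above, here is a minimal sketch (not part of the patch) of how `configurations.yaml` and `parameters.yaml` determine the UNet input shapes, the timestep grid, and the classifier-free-guidance step used in `forward.py`. The tensors are random stand-ins for real model outputs; all constants are assumed from the config files above.

```python
import torch

# values assumed from configurations.yaml / parameters.yaml above
batch_size, in_channels = 2, 4
height = width = 512
scale_size = 8
guidance_scale = 7.5
num_train_timesteps, num_inference_steps = 1000, 50

# latents in forward.py: (bs, C, H/8, W/8) -> (2, 4, 64, 64)
latents = torch.randn(batch_size, in_channels,
                      height // scale_size, width // scale_size)

# classifier-free guidance doubles the batch (unconditional + conditional)
latent_model_input = torch.cat([latents] * 2)  # (4, 4, 64, 64)

# DDIM-style timestep grid from forward.py: roughly [980, 960, ..., 20, 0]
timesteps = torch.linspace(
    num_train_timesteps - num_train_timesteps // num_inference_steps, 0,
    num_inference_steps).int()

# stand-in for the UNet output, followed by the guidance combination
noise_pred = torch.randn_like(latent_model_input)
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text -
                                                   noise_pred_uncond)

print(latent_model_input.shape)                   # torch.Size([4, 4, 64, 64])
print(timesteps[0].item(), timesteps[-1].item())  # 980 0
print(noise_pred.shape)                           # torch.Size([2, 4, 64, 64])
```

The doubled batch here is why `engine_forward` reshapes the engine output to `config.batch_size * 2` channels-first latents before splitting it back into the unconditional and conditional halves.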