From 05ba7a0f03fd8a928f3d4c312fab38a12ebc97ea Mon Sep 17 00:00:00 2001
From: shh2000 <13820618441@163.com>
Date: Thu, 3 Aug 2023 16:56:04 +0800
Subject: [PATCH 1/6] bert

---
 inference/benchmarks/bertLarge/README.md      |  66 ++++++++++
 .../benchmarks/bertLarge/pytorch/__init__.py  |   5 +
 .../bertLarge/pytorch/dataloader.py           |  67 +++++++++++
 .../benchmarks/bertLarge/pytorch/evaluator.py |  11 ++
 .../benchmarks/bertLarge/pytorch/export.py    |  30 +++++
 .../benchmarks/bertLarge/pytorch/forward.py   | 113 ++++++++++++++++++
 .../benchmarks/bertLarge/pytorch/model.py     |  13 ++
 .../bertLarge/pytorch/requirements.txt        |   1 +
 inference/benchmarks/resnet50/README.md       |  11 +-
 .../benchmarks/resnet50/pytorch/forward.py    |   7 +-
 .../configs/bertLarge/configurations.yaml     |  14 +++
 inference/configs/bertLarge/parameters.yaml   |   6 +
 .../vendor_config/nvidia_configurations.yaml  |   3 +
 inference/inference_engine/nvidia/tensorrt.py |  41 +++++--
 inference/run_inference.py                    |   8 +-
 15 files changed, 372 insertions(+), 24 deletions(-)
 create mode 100644 inference/benchmarks/bertLarge/README.md
 create mode 100644 inference/benchmarks/bertLarge/pytorch/__init__.py
 create mode 100644 inference/benchmarks/bertLarge/pytorch/dataloader.py
 create mode 100644 inference/benchmarks/bertLarge/pytorch/evaluator.py
 create mode 100644 inference/benchmarks/bertLarge/pytorch/export.py
 create mode 100644 inference/benchmarks/bertLarge/pytorch/forward.py
 create mode 100644 inference/benchmarks/bertLarge/pytorch/model.py
 create mode 100644 inference/benchmarks/bertLarge/pytorch/requirements.txt
 create mode 100644 inference/configs/bertLarge/configurations.yaml
 create mode 100644 inference/configs/bertLarge/parameters.yaml
 create mode 100644 inference/configs/bertLarge/vendor_config/nvidia_configurations.yaml

diff --git a/inference/benchmarks/bertLarge/README.md b/inference/benchmarks/bertLarge/README.md
new file mode 100644
index 000000000..d653f3c7a
--- /dev/null
+++ b/inference/benchmarks/bertLarge/README.md
@@ -0,0 +1,66 @@
+### 1. Inference Dataset
+
+● Download URL: `https://drive.google.com/drive/folders/1cywmDnAsrP5-2vsr8GDc6QUc7VWe-M3v`
+
+```
+File list:
+results_text.tar.gz
+bert_reference_results_text_md5.txt
+```
+
+* After extraction, place eval.txt in the data directory (data_dir)
+
+### 2. Model and Weights
+
+* Model implementation
+  * pytorch: transformers.BertForMaskedLM
+* Weight download
+  * pytorch: BertForMaskedLM.from_pretrained("bert-large-uncased") (or "bert-base-uncased")
+* Weight selection
+  * Use save_pretrained to save the loaded bert-large or bert-base weights under the weight_dir path
+
+### 3. Hardware/Software Configuration and Run Information Reference
+
+#### 3.1 Nvidia A100
+
+- ##### Hardware environment
+  - Machine and accelerator model: NVIDIA_A100-SXM4-40GB
+  - Inter-node network type and bandwidth: InfiniBand, 200Gb/s
+
+- ##### Software environment
+  - OS version: Ubuntu 20.04
+  - OS kernel version: 5.4.0-113-generic
+  - Accelerator driver version: 470.129.06
+  - Docker version: 20.10.16
+  - Framework version: pytorch-1.13.0a0+937e930
+  - Dependency versions:
+    - cuda: 11.8
+
+- Inference toolkit
+
+  - TensorRT 8.5.1.7
+
+### 4. Results (BERT-Large)
+
+* Metric list
+
+| Metric name | Metric key | Notes |
+| ------------------ | ----------------- | ----------------------------------------------------------- |
+| Data precision | precision | fp32 or fp16 |
+| Batch size | bs | note that seq_length == 512 for this bert-large case |
+| Batch input size | byte_per_batch | number of input bytes per batch |
+| Device memory usage | mem | commonly called "VRAM", in GiB |
+| End-to-end time | e2e_time | total time, including Perf initialization etc. |
+| Overall validation throughput | p_val_whole | validated sequences divided by total validation time |
+| Validation compute throughput | *p_val_core | excludes I/O time |
+| Overall inference throughput | p_infer_whole | inferred sequences divided by total inference time |
+| **Inference compute throughput** | **\*p_infer_core** | excludes I/O time; this value times seq_length gives tokens per second |
+| Per-sample inference time | infer_time | 1/p_infer_core, in milliseconds (ms) or microseconds (μs) |
+| Inference result | acc (inference/validation) | top-1 MaskedLM accuracy (acc1) |
+
+* Metric values
+
+| Engine | precision | bs | byte_per_batch | e2e_time | p_val_whole | \*p_val_core | p_infer_whole | \*p_infer_core | acc | mem |
+| -------- | --------- | ---- | -------------- | -------- | ----------- | ------------ | ------------- | -------------- | ----------- | --------- |
+| tensorrt | fp16 | 32 | 32768 | 1283.9 | 257.3 | 260.4 | 408.3 | 418.1 | 0.600/0.638 | 17.4/40.0 |
+
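For reference, the weight-preparation step described in section 2 of the README can be scripted as below — a minimal sketch, not part of this patch; `/data/bertLarge` stands in for the benchmark's data_dir, and the `weights` subdirectory name follows `weight_dir` in parameters.yaml:

```python
from transformers import BertForMaskedLM, BertTokenizer

# Hypothetical target directory; <data_dir>/weights is what model.py and
# dataloader.py read (config.data_dir + "/" + config.weight_dir).
save_dir = "/data/bertLarge/weights"

# Load the reference weights from the Hugging Face hub
# ("bert-base-uncased" works the same way).
model = BertForMaskedLM.from_pretrained("bert-large-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

# save_pretrained() writes config, weights and vocab, so the benchmark
# can then run fully offline from <data_dir>/weights.
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
```
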
diff --git a/inference/benchmarks/bertLarge/pytorch/__init__.py b/inference/benchmarks/bertLarge/pytorch/__init__.py
new file mode 100644
index 000000000..1f6cdf49b
--- /dev/null
+++ b/inference/benchmarks/bertLarge/pytorch/__init__.py
@@ -0,0 +1,5 @@
+from .dataloader import build_dataloader
+from .model import create_model
+from .export import export_model
+from .evaluator import evaluator
+from .forward import model_forward, engine_forward
diff --git a/inference/benchmarks/bertLarge/pytorch/dataloader.py b/inference/benchmarks/bertLarge/pytorch/dataloader.py
new file mode 100644
index 000000000..52ebc4d8d
--- /dev/null
+++ b/inference/benchmarks/bertLarge/pytorch/dataloader.py
@@ -0,0 +1,67 @@
+from transformers import BertTokenizer
+from torch.utils.data import DataLoader, Dataset
+import torch
+import random
+
+
+class BertInferDataset(Dataset):
+
+    def __init__(self, input_ids, label_ids, random_dupo, seq_length):
+        self.input_ids = input_ids
+        self.label_ids = label_ids
+        self.random_dupo = random_dupo
+        self.seq_length = seq_length
+
+    def __len__(self):
+        return len(self.input_ids) // self.seq_length * self.random_dupo
+
+    def __getitem__(self, idx):
+        idx_global = idx // self.random_dupo  # each chunk repeats random_dupo times
+        start_idx = idx_global * self.seq_length
+        chunk_input = self.input_ids[start_idx:start_idx + self.seq_length]
+        chunk_label = self.label_ids[start_idx:start_idx + self.seq_length]
+
+        chunk_input = torch.tensor(chunk_input).int()
+        chunk_label = torch.tensor(chunk_label).int()
+
+        return (chunk_input, chunk_label)
+
+
+def build_dataset(config):
+
+    random.seed(config.random_seed)
+
+    with open(config.data_dir + "/" + config.eval_file, "r") as file:
+        text = file.read()
+
+    tokenizer = BertTokenizer.from_pretrained(config.data_dir + "/" +
+                                              config.weight_dir)
+    tokens = tokenizer.tokenize(text)
+
+    label_ids = tokenizer.convert_tokens_to_ids(tokens)
+    label_ids = [tokenizer.cls_token_id] + label_ids + [tokenizer.sep_token_id]
+
+    masked_tokens = []
+    for token in tokens:
+        if token != "[CLS]" and token != "[SEP]":
+            masked_tokens.append(
+                "[MASK]" if random.random() < config.mask_ratio else token)
+    input_ids = tokenizer.convert_tokens_to_ids(masked_tokens)
+    input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id]
+
+    dataset = BertInferDataset(input_ids, label_ids, config.random_dupo,
+                               config.seq_length)
+
+    return dataset
+
+
+def build_dataloader(config):
+    dataset = build_dataset(config)
+    loader = DataLoader(dataset,
+                        batch_size=config.batch_size,
+                        shuffle=False,
+                        drop_last=True,
+                        num_workers=config.num_workers,
+                        pin_memory=True)
+
+    return loader
diff --git a/inference/benchmarks/bertLarge/pytorch/evaluator.py b/inference/benchmarks/bertLarge/pytorch/evaluator.py
new file mode 100644
index 000000000..1dac4c977
--- /dev/null
+++ b/inference/benchmarks/bertLarge/pytorch/evaluator.py
@@ -0,0 +1,11 @@
+import torch
+
+
+def evaluator(pred, x, y):
+    mask = x == 103  # 103 is the [MASK] token id in the BERT uncased vocab
+    masked_pred = pred[mask]
+    masked_y = y[mask]
+
+    correct = masked_pred[masked_pred == masked_y]
+
+    return len(correct), len(masked_y)
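The hard-coded `103` above matches the [MASK] id of the stock uncased BERT vocabularies; if this evaluator is ever reused with a different vocabulary, the id can be looked up instead. A small sketch under that assumption (not part of this patch; the weights path mirrors dataloader.py's `config.data_dir + "/" + config.weight_dir` convention):

```python
from transformers import BertTokenizer

# Look the id up from the tokenizer that produced the inputs,
# instead of hard-coding 103.
tokenizer = BertTokenizer.from_pretrained("/data/bertLarge/weights")
mask_id = tokenizer.mask_token_id  # 103 for bert-base/large-uncased

def evaluator(pred, x, y, mask_id=mask_id):
    mask = x == mask_id
    return int((pred[mask] == y[mask]).sum()), int(mask.sum())
```
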
diff --git a/inference/benchmarks/bertLarge/pytorch/export.py b/inference/benchmarks/bertLarge/pytorch/export.py
new file mode 100644
index 000000000..f9eba5109
--- /dev/null
+++ b/inference/benchmarks/bertLarge/pytorch/export.py
@@ -0,0 +1,30 @@
+import torch
+import os
+
+
+def export_model(model, config):
+    if config.exist_onnx_path is not None:
+        return config.exist_onnx_path
+
+    filename = config.case + "_bs" + str(config.batch_size)
+    filename = filename + "_" + str(config.framework)
+    filename = filename + "_fp16" + str(config.fp16)
+    filename = "onnxs/" + filename + ".onnx"
+    onnx_path = config.perf_dir + "/" + filename
+
+    dummy_input = torch.ones(config.batch_size, config.seq_length).int().cuda()  # int token ids
+
+    dir_onnx_path = os.path.dirname(onnx_path)
+    os.makedirs(dir_onnx_path, exist_ok=True)
+
+    with torch.no_grad():
+        torch.onnx.export(model,
+                          dummy_input,
+                          onnx_path,
+                          verbose=False,
+                          input_names=["input"],
+                          output_names=["output"],
+                          training=torch.onnx.TrainingMode.EVAL,
+                          do_constant_folding=True)
+
+    return onnx_path
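Before handing the exported graph to TensorRT, a quick numerical sanity check can save a failed engine build — a minimal sketch assuming `onnxruntime` is installed (not part of this patch; the file name is an example of what export_model builds, and "input"/"output" are the names chosen in export.py above):

```python
import numpy as np
import onnxruntime as ort

# Example of the name export_model() would build for this config.
onnx_path = "perf/onnxs/bertLarge_bs32_pytorch_fp16True.onnx"

session = ort.InferenceSession(onnx_path, providers=["CUDAExecutionProvider"])
dummy = np.ones((32, 512), dtype=np.int32)  # batch_size x seq_length token ids

# "input"/"output" match input_names/output_names passed to torch.onnx.export.
logits = session.run(["output"], {"input": dummy})[0]
print(logits.shape)  # expected (32, 512, vocab_size)
```
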
"Validation") + + return model_forward_perf, model_forward_core_perf, round(acc, 3) + + +def engine_forward(model, dataloader, evaluator, config): + start = time.time() + core_time = 0.0 + foo_time = 0.0 + + correct = 1 + whole = 1 + + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + + all_top1 = [] + for step, (x, y) in enumerate(dataloader): + torch_sync(config) + core_time_start = time.time() + + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + + outputs = model([x]) + pred = outputs[0] + foo_time += outputs[1] + + torch_sync(config) + core_time += time.time() - core_time_start + + pred = pred[0] + pred = pred.reshape(config.batch_size, config.seq_length, -1) + pred = torch.argmax(pred, dim=2) + pred = pred.cpu() + correct_iter, whole_iter = evaluator(pred, x, y) + + correct += correct_iter + whole += whole_iter + + acc = correct / whole + + logger.info("MaskedLM Acc: " + str(acc)) + + duration = time.time() - start - foo_time + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time - foo_time, "Inference") + + return model_forward_perf, model_forward_core_perf, round(acc, 3) diff --git a/inference/benchmarks/bertLarge/pytorch/model.py b/inference/benchmarks/bertLarge/pytorch/model.py new file mode 100644 index 000000000..c8d1e4833 --- /dev/null +++ b/inference/benchmarks/bertLarge/pytorch/model.py @@ -0,0 +1,13 @@ +from transformers import BertForMaskedLM + + +def create_model(config): + model = BertForMaskedLM.from_pretrained(config.data_dir + "/" + + config.weight_dir, + torchscript=True) + model.cuda() + model.eval() + if config.fp16: + model.half() + + return model diff --git a/inference/benchmarks/bertLarge/pytorch/requirements.txt b/inference/benchmarks/bertLarge/pytorch/requirements.txt new file mode 100644 index 000000000..976a2b1f3 --- /dev/null +++ b/inference/benchmarks/bertLarge/pytorch/requirements.txt @@ -0,0 +1 @@ +transformers diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md index 88b2f9aae..e7436d315 100644 --- a/inference/benchmarks/resnet50/README.md +++ b/inference/benchmarks/resnet50/README.md @@ -68,6 +68,7 @@ find ./val -name "*JPEG" | wc -l | ------------------ | ---------------- | -------------------------------------------- | | 数据精度 | precision | 可选fp32/fp16 | | 批尺寸 | bs | | +| 批输入大小 | byte_per_batch | 每一个batch包含的字节数 | | 硬件存储使用 | mem | 通常称为“显存”,单位为GiB | | 端到端时间 | e2e_time | 总时间+Perf初始化等时间 | | 验证总吞吐量 | p_val_whole | 实际验证图片数除以总验证时间 | @@ -78,9 +79,9 @@ find ./val -name "*JPEG" | wc -l * 指标值 -| 推理工具 | precision | bs | e2e_time | p_val_whole | \*p_val_core | p_infer_whole | \*p_infer_core | acc | mem | -| ----------- | --------- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ---------- | -| tensorrt | fp16 | 256 | 613.4 | 1358.9 | 4263.3 | 1391.4 | 12406.0 | 76.2/76.2 | 19.7/40.0 | -| tensorrt | fp32 | 256 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 76.2/76.2 | 28.86/40.0 | -| torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 76.2/76.2 | 9.42/40.0 | +| 推理工具 | precision | bs | byte_per_batch | e2e_time | p_val_whole | \*p_val_core | p_infer_whole | \*p_infer_core | acc | mem | +| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ---------- | +| tensorrt | fp16 | 256 | 77070336 | 613.4 | 1358.9 | 4263.3 | 1391.4 | 12406.0 | 
76.2/76.2 | 19.7/40.0 |
+| tensorrt | fp32 | 256 | 77070336 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 76.2/76.2 | 28.86/40.0 |
+| torchtrt | fp16 | 256 | 77070336 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 76.2/76.2 | 9.42/40.0 |
 
diff --git a/inference/benchmarks/resnet50/pytorch/forward.py b/inference/benchmarks/resnet50/pytorch/forward.py
index 5619760df..77ff03bd3 100644
--- a/inference/benchmarks/resnet50/pytorch/forward.py
+++ b/inference/benchmarks/resnet50/pytorch/forward.py
@@ -81,12 +81,15 @@ def engine_forward(model, dataloader, evaluator, config):
             with torch.no_grad():
 
                 outputs = model([x])
-                pred = outputs[0][0]
+                pred = outputs[0]
                 foo_time += outputs[1]
 
-                pred = pred.float()
+                torch_sync(config)
                 core_time += time.time() - core_time_start
 
+                pred = pred[0].float()
+                pred = pred.reshape(config.batch_size, -1)
+                pred = pred.cpu()
                 top1 = evaluator(pred, y)
 
                 all_top1.extend(top1.cpu())
diff --git a/inference/configs/bertLarge/configurations.yaml b/inference/configs/bertLarge/configurations.yaml
new file mode 100644
index 000000000..2f24464c7
--- /dev/null
+++ b/inference/configs/bertLarge/configurations.yaml
@@ -0,0 +1,14 @@
+batch_size: 32
+# 512 length seq(1 item in x)
+input_size: 512
+fp16: true
+compiler: tensorrt
+num_workers: 8
+log_freq: 100
+repeat: 1
+# skip validation (will also skip create_model and ONNX export); requires exist_onnx_path != null
+no_validation: false
+# set a real onnx_path to use an existing file, or set it to anything non-null to skip manual ONNX export (e.g. for torch-tensorrt)
+exist_onnx_path: null
+# set an existing engine file path, e.g. resnet50.trt/resnet50.plan/resnet50.engine
+exist_compiler_path: null
\ No newline at end of file
diff --git a/inference/configs/bertLarge/parameters.yaml b/inference/configs/bertLarge/parameters.yaml
new file mode 100644
index 000000000..464095c5f
--- /dev/null
+++ b/inference/configs/bertLarge/parameters.yaml
@@ -0,0 +1,6 @@
+seq_length: 512
+mask_ratio: 0.10
+weight_dir: "weights"
+eval_file: "eval.txt"
+random_dupo: 10
+random_seed: 0
\ No newline at end of file
diff --git a/inference/configs/bertLarge/vendor_config/nvidia_configurations.yaml b/inference/configs/bertLarge/vendor_config/nvidia_configurations.yaml
new file mode 100644
index 000000000..b10bc4faf
--- /dev/null
+++ b/inference/configs/bertLarge/vendor_config/nvidia_configurations.yaml
@@ -0,0 +1,3 @@
+trt_tmp_path: nvidia_tmp/bertLarge.trt
+has_dynamic_axis: false
+torchtrt_full_compile: true
\ No newline at end of file
diff --git a/inference/inference_engine/nvidia/tensorrt.py b/inference/inference_engine/nvidia/tensorrt.py
index fb215ba17..b2ac1b29f 100644
--- a/inference/inference_engine/nvidia/tensorrt.py
+++ b/inference/inference_engine/nvidia/tensorrt.py
@@ -27,6 +27,8 @@ def __repr__(self):
         return self.__str__()
 
     def __init__(self, config, onnx_path, model):
+        self.config = config
+
         self.logger = trt.Logger(trt.Logger.WARNING)
         self.runtime = trt.Runtime(self.logger)
 
@@ -49,6 +51,19 @@ def __init__(self, config, onnx_path, model):
             np.complex64: torch.complex64,
             np.complex128: torch.complex128,
         }
+        self.str_to_torch_dtype_dict = {
+            "bool": torch.bool,
+            "uint8": torch.uint8,
+            "int8": torch.int8,
+            "int16": torch.int16,
+            "int32": torch.int32,
+            "int64": torch.int64,
+            "float16": torch.float16,
+            "float32": torch.float32,
+            "float64": torch.float64,
+            "complex64": torch.complex64,
+            "complex128": torch.complex128,
+        }
 
     def build_engine(self, config, onnx_path):
         if config.exist_compiler_path is None:
@@ -99,15 +114,10 @@ def allocate_buffers(self, engine):
 
     def 
__call__(self, model_inputs: list): - batch_size = np.unique(np.array([i.size(dim=0) for i in model_inputs])) - batch_size = batch_size[0] + batch_size = self.config.batch_size for i, model_input in enumerate(model_inputs): - binding_name = self.engine[i] - binding_dtype = trt.nptype( - self.engine.get_binding_dtype(binding_name)) - model_input = model_input.to( - self.numpy_to_torch_dtype_dict[binding_dtype]) + model_input = model_input.cuda() cuda.memcpy_dtod_async( self.inputs[i].device, @@ -118,12 +128,17 @@ def __call__(self, model_inputs: list): self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle) + result = [] for out in self.outputs: - cuda.memcpy_dtoh_async(out.host, out.device, self.stream) + out_tensor = torch.empty(out.host.shape, device="cuda").to( + self.str_to_torch_dtype_dict[str(out.host.dtype)]) + cuda.memcpy_dtod_async( + out_tensor.data_ptr(), + out.device, + out_tensor.element_size() * out_tensor.nelement(), + self.stream, + ) + result.append(out_tensor) self.stream.synchronize() - - return [ - torch.from_numpy(out.host.reshape(batch_size, -1)) - for out in self.outputs - ], 0 + return result, 0 diff --git a/inference/run_inference.py b/inference/run_inference.py index ff6ba2acb..ff42c393f 100644 --- a/inference/run_inference.py +++ b/inference/run_inference.py @@ -151,10 +151,10 @@ def parse_args(): "batchsize": config.batch_size, "byte_per_batch": batch_input_byte, "e2e_time(second)": e2e_time, - "p_validation_whole(items per second)": p_forward, - "*p_validation_core(items per second)": p_forward_core, - "p_inference_whole(items per second)": p_infer, - "*p_inference_core(items per second)": p_infer_core, + "p_validation_whole(qps)": p_forward, + "*p_validation_core(qps)": p_forward_core, + "p_inference_whole(qps)": p_infer, + "*p_inference_core(qps)": p_infer_core, "val_average_acc": val_acc, "infer_average_acc": infer_acc } From 1b365f2a2db5e7939a1a3b16aecb899f509c80ba Mon Sep 17 00:00:00 2001 From: shh2000 <13820618441@163.com> Date: Thu, 3 Aug 2023 17:37:44 +0800 Subject: [PATCH 2/6] fix --- inference/benchmarks/bertLarge/pytorch/dataloader.py | 11 ++++------- inference/configs/bertLarge/configurations.yaml | 2 +- inference/configs/bertLarge/parameters.yaml | 1 - 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/inference/benchmarks/bertLarge/pytorch/dataloader.py b/inference/benchmarks/bertLarge/pytorch/dataloader.py index 52ebc4d8d..64542cb81 100644 --- a/inference/benchmarks/bertLarge/pytorch/dataloader.py +++ b/inference/benchmarks/bertLarge/pytorch/dataloader.py @@ -6,18 +6,16 @@ class BertInferDataset(Dataset): - def __init__(self, input_ids, label_ids, random_dupo, seq_length): + def __init__(self, input_ids, label_ids, seq_length): self.input_ids = input_ids self.label_ids = label_ids - self.random_dupo = random_dupo self.seq_length = seq_length def __len__(self): - return len(self.input_ids) // self.seq_length * self.random_dupo + return len(self.input_ids) // self.seq_length def __getitem__(self, idx): - idx_global = idx // self.random_dupo - start_idx = idx_global * self.seq_length + start_idx = idx * self.seq_length chunk_input = self.input_ids[start_idx:start_idx + self.seq_length] chunk_label = self.label_ids[start_idx:start_idx + self.seq_length] @@ -49,8 +47,7 @@ def build_dataset(config): input_ids = tokenizer.convert_tokens_to_ids(masked_tokens) input_ids = [tokenizer.cls_token_id] + input_ids + [tokenizer.sep_token_id] - dataset = BertInferDataset(input_ids, label_ids, config.random_dupo, - 
config.seq_length)
+    dataset = BertInferDataset(input_ids, label_ids, config.seq_length)
 
     return dataset
 
diff --git a/inference/configs/bertLarge/configurations.yaml b/inference/configs/bertLarge/configurations.yaml
index 2f24464c7..515a95180 100644
--- a/inference/configs/bertLarge/configurations.yaml
+++ b/inference/configs/bertLarge/configurations.yaml
@@ -5,7 +5,7 @@ fp16: true
 compiler: tensorrt
 num_workers: 8
 log_freq: 100
-repeat: 1
+repeat: 10
 # skip validation (will also skip create_model and ONNX export); requires exist_onnx_path != null
 no_validation: false
 # set a real onnx_path to use an existing file, or set it to anything non-null to skip manual ONNX export (e.g. for torch-tensorrt)
diff --git a/inference/configs/bertLarge/parameters.yaml b/inference/configs/bertLarge/parameters.yaml
index 464095c5f..9125bfa74 100644
--- a/inference/configs/bertLarge/parameters.yaml
+++ b/inference/configs/bertLarge/parameters.yaml
@@ -2,5 +2,4 @@ seq_length: 512
 mask_ratio: 0.10
 weight_dir: "weights"
 eval_file: "eval.txt"
-random_dupo: 10
 random_seed: 0
\ No newline at end of file

From 4b8c20a2223993aa89101b3e816209522b5d2c06 Mon Sep 17 00:00:00 2001
From: shh2000 <13820618441@163.com>
Date: Fri, 4 Aug 2023 10:00:29 +0800
Subject: [PATCH 3/6] add

---
 inference/benchmarks/bertLarge/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/inference/benchmarks/bertLarge/README.md b/inference/benchmarks/bertLarge/README.md
index d653f3c7a..6c7cb0c29 100644
--- a/inference/benchmarks/bertLarge/README.md
+++ b/inference/benchmarks/bertLarge/README.md
@@ -63,4 +63,5 @@ bert_reference_results_text_md5.txt
 | Engine | precision | bs | byte_per_batch | e2e_time | p_val_whole | \*p_val_core | p_infer_whole | \*p_infer_core | acc | mem |
 | -------- | --------- | ---- | -------------- | -------- | ----------- | ------------ | ------------- | -------------- | ----------- | --------- |
 | tensorrt | fp16 | 32 | 32768 | 1283.9 | 257.3 | 260.4 | 408.3 | 418.1 | 0.600/0.638 | 17.4/40.0 |
+| tensorrt | fp32 | 32 | 65536 | 1868.8 | 150.4 | 152.2 | 190.4 | 194.1 | 0.638/0.638 | 16.9/40.0 |
 
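The byte_per_batch values in the two bertLarge rows follow directly from the formula in run_inference.py (batch_size × input_size × bytes per element); as a quick check of the 32768/65536 figures above:

```python
batch_size, input_size = 32, 512         # from configs/bertLarge/configurations.yaml
input_byte = {True: 2, False: 4}         # fp16 -> 2 bytes, fp32 -> 4 bytes, as in run_inference.py

assert batch_size * input_size * input_byte[True] == 32768   # fp16 row
assert batch_size * input_size * input_byte[False] == 65536  # fp32 row
```
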
From 189c0e11b1d854dc76044fc81f7555e204451e27 Mon Sep 17 00:00:00 2001
From: shh2000 <13820618441@163.com>
Date: Fri, 4 Aug 2023 12:56:24 +0800
Subject: [PATCH 4/6] add MFU

---
 inference/benchmarks/bertLarge/README.md    | 19 ++++++++++---------
 inference/benchmarks/resnet50/README.md     | 14 +++++++-------
 .../configs/bertLarge/configurations.yaml   |  6 ++++--
 .../configs/resnet50/configurations.yaml    |  6 ++++--
 .../docker_images/nvidia/nvidia_analysis.py |  3 ++-
 inference/run.py                            |  7 ++++++-
 inference/run_inference.py                  |  8 +++-----
 7 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/inference/benchmarks/bertLarge/README.md b/inference/benchmarks/bertLarge/README.md
index 6c7cb0c29..f84a474eb 100644
--- a/inference/benchmarks/bertLarge/README.md
+++ b/inference/benchmarks/bertLarge/README.md
@@ -47,21 +47,22 @@ bert_reference_results_text_md5.txt
 | Metric name | Metric key | Notes |
 | ------------------ | ----------------- | ----------------------------------------------------------- |
 | Data precision | precision | fp32 or fp16 |
-| Batch size | bs | note that seq_length == 512 for this bert-large case |
-| Batch input size | byte_per_batch | number of input bytes per batch |
+| Batch size | bs | |
 | Device memory usage | mem | commonly called "VRAM", in GiB |
 | End-to-end time | e2e_time | total time, including Perf initialization etc. |
 | Overall validation throughput | p_val_whole | validated sequences divided by total validation time |
-| Validation compute throughput | *p_val_core | excludes I/O time |
+| Validation compute throughput | p_val_core | excludes I/O time |
 | Overall inference throughput | p_infer_whole | inferred sequences divided by total inference time |
-| **Inference compute throughput** | **\*p_infer_core** | excludes I/O time; this value times seq_length gives tokens per second |
-| Per-sample inference time | infer_time | 1/p_infer_core, in milliseconds (ms) or microseconds (μs) |
+| **Inference compute throughput** | **\*p_infer_core** | excludes I/O time |
+| **Accelerator utilization** | **\*MFU** | model FLOPs utilization |
 | Inference result | acc (inference/validation) | top-1 MaskedLM accuracy (acc1) |
 
 * Metric values
 
-| Engine | precision | bs | byte_per_batch | e2e_time | p_val_whole | \*p_val_core | p_infer_whole | \*p_infer_core | acc | mem |
-| -------- | --------- | ---- | -------------- | -------- | ----------- | ------------ | ------------- | -------------- | ----------- | --------- |
-| tensorrt | fp16 | 32 | 32768 | 1283.9 | 257.3 | 260.4 | 408.3 | 418.1 | 0.600/0.638 | 17.4/40.0 |
-| tensorrt | fp32 | 32 | 65536 | 1868.8 | 150.4 | 152.2 | 190.4 | 194.1 | 0.638/0.638 | 16.9/40.0 |
+
+| Engine | precision | bs | e2e_time | p_val_whole | p_val_core | p_infer_whole | \*p_infer_core | \*MFU | acc | mem |
+| ----------- | --------- | ---- | -------- | ----------- | ---------- | ------------- | -------------- | ----- | ----------- | --------- |
+| tensorrt | fp16 | 32 | 1283.9 | 257.3 | 260.4 | 408.3 | 418.1 | 45.3% | 0.600/0.638 | 17.4/40.0 |
+| tensorrt | fp32 | 32 | 1868.8 | 150.4 | 152.2 | 190.4 | 194.1 | 42.0% | 0.638/0.638 | 16.9/40.0 |
 
diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md
index e7436d315..024b7f417 100644
--- a/inference/benchmarks/resnet50/README.md
+++ b/inference/benchmarks/resnet50/README.md
@@ -68,20 +68,20 @@ find ./val -name "*JPEG" | wc -l
 | ------------------ | ---------------- | -------------------------------------------- |
 | Data precision | precision | fp32 or fp16 |
 | Batch size | bs | |
-| Batch input size | byte_per_batch | number of input bytes per batch |
 | Device memory usage | mem | commonly called "VRAM", in GiB |
 | End-to-end time | e2e_time | total time, including Perf initialization etc. |
 | Overall validation throughput | p_val_whole | validated images divided by total validation time |
-| Validation compute throughput | \*p_val_core | excludes I/O time |
+| Validation compute throughput | p_val_core | excludes I/O time |
 | Overall inference throughput | p_infer_whole | inferred images divided by total inference time |
 | **Inference compute throughput** | **\*p_infer_core** | excludes I/O time |
+| **Accelerator utilization** | **\*MFU** | model FLOPs utilization |
 | Inference result | acc (inference/validation) | top-1 classification accuracy (acc1) |
 
 * Metric values
 
-| Engine | precision | bs | byte_per_batch | e2e_time | p_val_whole | \*p_val_core | p_infer_whole | \*p_infer_core | acc | mem |
-| ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ---------- |
-| tensorrt | fp16 | 256 | 77070336 | 613.4 | 1358.9 | 4263.3 | 1391.4 | 12406.0 | 76.2/76.2 | 19.7/40.0 |
-| tensorrt | fp32 | 256 | 77070336 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 76.2/76.2 | 28.86/40.0 |
-| torchtrt | fp16 | 256 | 77070336 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 76.2/76.2 | 9.42/40.0 |
+| Engine | precision | bs | e2e_time | p_val_whole | p_val_core | p_infer_whole | \*p_infer_core | \*MFU | acc | mem |
+| ----------- | --------- | ---- | -------- | ----------- | ---------- | ------------- | -------------- | ----- | ----------- | ---------- |
+| tensorrt | fp16 | 256 | 613.4 | 1358.9 | 4469.4 | 1391.4 | 12698.7 | 16.8% | 76.2/76.2 | 19.7/40.0 |
+| tensorrt | fp32 | 256 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 16.1% | 76.2/76.2 | 28.86/40.0 |
+| torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 |
 
diff --git a/inference/configs/bertLarge/configurations.yaml b/inference/configs/bertLarge/configurations.yaml
index 515a95180..fbfb2f523 100644
--- a/inference/configs/bertLarge/configurations.yaml
+++ b/inference/configs/bertLarge/configurations.yaml
@@ -1,6 +1,8 @@
 batch_size: 32
-# 512 length seq(1 item in x)
-input_size: 512
+# flops for 1 item (e.g. 1 sequence, 1 image)
+# Attention! For transformer models like bert, 1 token costs roughly 2*params flops, so we need 2*length*params, i.e. 2*512*0.33B here
+# format: a_1*a_2*...*a_n, e.g. 2*512*0.33e9 (bert) or 4.12e9 (resnet50)
+flops: 2*512*0.33e9
 fp16: true
 compiler: tensorrt
 num_workers: 8
diff --git a/inference/configs/resnet50/configurations.yaml b/inference/configs/resnet50/configurations.yaml
index fa1739983..814dc58a3 100644
--- a/inference/configs/resnet50/configurations.yaml
+++ b/inference/configs/resnet50/configurations.yaml
@@ -1,6 +1,8 @@
 batch_size: 256
-# 3*224*224(1 item in x)
-input_size: 150528
+# flops for 1 item (e.g. 1 sequence, 1 image)
+# Attention! For transformer models like bert, 1 token costs roughly 2*params flops, so we need 2*length*params, i.e. 2*512*0.33B here
+# format: a_1*a_2*...*a_n, e.g. 2*512*0.33e9 (bert) or 4.12e9 (resnet50)
+flops: 4.12e9
 fp16: true
 compiler: tensorrt
 num_workers: 8
diff --git a/inference/docker_images/nvidia/nvidia_analysis.py b/inference/docker_images/nvidia/nvidia_analysis.py
index 26132d19d..697148933 100644
--- a/inference/docker_images/nvidia/nvidia_analysis.py
+++ b/inference/docker_images/nvidia/nvidia_analysis.py
@@ -11,4 +11,5 @@ def analysis_log(logpath):
                 max_mem = line.split(" ")[3]
                 max_mem = float(max_mem[:-3])
 
-    return round(max_usage / 1024.0, 2), round(max_mem / 1024.0, 2)
+    return round(max_usage / 1024.0,
+                 2), round(max_mem / 1024.0, 2), eval("156e12"), eval("312e12")
diff --git a/inference/run.py b/inference/run.py
index d452d83e6..cf768e364 100644
--- a/inference/run.py
+++ b/inference/run.py
@@ -446,11 +446,16 @@ def compilation_result(case_log_path, config):
 
     vendor_module = importlib.import_module("docker_images." + config.VENDOR +
                                             "." + config.VENDOR + "_analysis")
-    vendor_usage, vendor_maxmem = vendor_module.analysis_log(vendor_usage_path)
+    vendor_usage, vendor_maxmem, fp32, fp16 = vendor_module.analysis_log(
+        vendor_usage_path)
 
     case_perf["vendor_usage(GiB)"] = vendor_usage
     case_perf["vendor_max_mem(GiB)"] = vendor_maxmem
 
+    theory = fp32 if case_perf["precision"] == "fp32" else fp16
+    mfu = case_perf["flops"] / theory
+    case_perf["*MFU"] = str(round(mfu * 100, 1)) + "%"
+
     for key in case_perf.keys():
         padding_str = str(key).ljust(43) + " : " + str(
             case_perf[key]).ljust(23)
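Worked through for the BERT-Large fp16 row (the 45.3% in the table above), mirroring the arithmetic run_inference.py and run.py now perform; the 312e12 peak is the fp16 value nvidia_analysis.py returns for the A100:

```python
flops_per_item = 2 * 512 * 0.33e9   # eval(config.flops): 2 * seq_length * params
p_infer_core = 418.1                # qps, from the bertLarge fp16 row

achieved = flops_per_item * p_infer_core  # ~1.41e14 flops per second
theory = 312e12                           # fp16 peak from nvidia_analysis.py

print(str(round(achieved / theory * 100, 1)) + "%")  # -> 45.3%
```
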
diff --git a/inference/run_inference.py b/inference/run_inference.py
index ff42c393f..e56fd5527 100644
--- a/inference/run_inference.py
+++ b/inference/run_inference.py
@@ -140,19 +140,17 @@ def parse_args():
     e2e_time = time.time() - e2e_start
     e2e_time = round(float(e2e_time), 3)
 
-    input_byte = 2 if config.fp16 else 4
-    batch_input_byte = config.batch_size * config.input_size * input_byte
-    batch_input_byte = int(batch_input_byte)
+    flops = eval(config.flops) * p_infer_core
 
     infer_info = {
         "vendor": config.vendor,
         "compiler": config.compiler,
         "precision": "fp16" if config.fp16 else "fp32",
         "batchsize": config.batch_size,
-        "byte_per_batch": batch_input_byte,
+        "flops": flops,
         "e2e_time(second)": e2e_time,
         "p_validation_whole(qps)": p_forward,
-        "*p_validation_core(qps)": p_forward_core,
+        "p_validation_core(qps)": p_forward_core,
         "p_inference_whole(qps)": p_infer,
         "*p_inference_core(qps)": p_infer_core,
         "val_average_acc": val_acc,
         "infer_average_acc": infer_acc
     }

From d34c7a314a05835bac7297162f124def0a09f585 Mon Sep 17 00:00:00 2001
From: shh2000 <13820618441@163.com>
Date: Tue, 8 Aug 2023 11:22:54 +0800
Subject: [PATCH 5/6] vit

---
 inference/benchmarks/vit_l_16/README.md       |  86 ++++++++++++++
 .../benchmarks/vit_l_16/pytorch/__init__.py   |   5 +
 .../benchmarks/vit_l_16/pytorch/dataloader.py |  49 ++++++++
 .../benchmarks/vit_l_16/pytorch/evaluator.py  |  10 ++
 .../benchmarks/vit_l_16/pytorch/export.py     |  34 ++++++
 .../benchmarks/vit_l_16/pytorch/forward.py    | 106 ++++++++++++++++++
 .../benchmarks/vit_l_16/pytorch/model.py      |  14 +++
 .../vit_l_16/pytorch/requirements.txt         |   1 +
 .../configs/vit_l_16/configurations.yaml      |  16 +++
 inference/configs/vit_l_16/parameters.yaml    |   1 +
 .../vendor_config/nvidia_configurations.yaml  |   3 +
 11 files changed, 325 insertions(+)
 create mode 100644 inference/benchmarks/vit_l_16/README.md
 create mode 100644 inference/benchmarks/vit_l_16/pytorch/__init__.py
 create mode 100644 inference/benchmarks/vit_l_16/pytorch/dataloader.py
 create mode 100644 inference/benchmarks/vit_l_16/pytorch/evaluator.py
 create mode 100644 inference/benchmarks/vit_l_16/pytorch/export.py
 create mode 100644 inference/benchmarks/vit_l_16/pytorch/forward.py
 create mode 100644 inference/benchmarks/vit_l_16/pytorch/model.py
 create mode 100644 inference/benchmarks/vit_l_16/pytorch/requirements.txt
 create mode 100644 inference/configs/vit_l_16/configurations.yaml
 create mode 100644 inference/configs/vit_l_16/parameters.yaml
 create mode 100644 inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml

diff --git a/inference/benchmarks/vit_l_16/README.md b/inference/benchmarks/vit_l_16/README.md
new file mode 100644
index 000000000..391eac738
--- /dev/null
+++ b/inference/benchmarks/vit_l_16/README.md
@@ -0,0 +1,86 @@
+### 1. Inference Dataset
+> Download website: https://image-net.org/
+
+We use ImageNet2012 Validation Images:
+| Dataset | FileName | Size | Checksum |
+| ----------------------------- | ---------------------- | ----- | ------------------------------------- |
+| Validation images (all tasks) | ILSVRC2012_img_val.tar | 6.3GB | MD5: 29b22e2961454d5413ddabcf34fc5622 |
+Dataset format conversion:
+https://github.com/pytorch/examples/blob/main/imagenet/extract_ILSVRC.sh
+
+Make sure ILSVRC2012_img_train.tar & ILSVRC2012_img_val.tar are in the same directory as extract_ILSVRC.sh.
+```bash
+sh extract_ILSVRC.sh
+```
+
+Preview the directory structure of the decompressed dataset.
+
+```bash
+tree -d -L 1
+```
+
+```
+.
+├── train
+└── val
+```
+Count the validation samples:
+
+```bash
+find ./val -name "*JPEG" | wc -l
+50000
+```
+
+### 2. Model and Weights
+
+* Model implementation
+  * pytorch: transformers.ViTForImageClassification
+* Weight download
+  * pytorch: from_pretrained("google/vit-large-patch16-224")
+
+### 3. Hardware/Software Configuration and Run Information Reference
+
+#### 3.1 Nvidia A100
+
+- ##### Hardware environment
+  - Machine and accelerator model: NVIDIA_A100-SXM4-40GB
+  - Inter-node network type and bandwidth: InfiniBand, 200Gb/s
+
+- ##### Software environment
+  - OS version: Ubuntu 20.04
+  - OS kernel version: 5.4.0-113-generic
+  - Accelerator driver version: 470.129.06
+  - Docker version: 20.10.16
+  - Framework version: pytorch-1.13.0a0+937e930
+  - Dependency versions:
+    - cuda: 11.8
+
+- Inference toolkit
+
+  - TensorRT 8.5.1.7
+  - torch_tensorrt 1.3.0
+
+### 4. Results
+
+* Metric list
+
+| Metric name | Metric key | Notes |
+| ------------------ | ---------------- | -------------------------------------------- |
+| Data precision | precision | fp32 or fp16 |
+| Batch size | bs | |
+| Device memory usage | mem | commonly called "VRAM", in GiB |
+| End-to-end time | e2e_time | total time, including Perf initialization etc. |
+| Overall validation throughput | p_val_whole | validated images divided by total validation time |
+| Validation compute throughput | p_val_core | excludes I/O time |
+| Overall inference throughput | p_infer_whole | inferred images divided by total inference time |
+| **Inference compute throughput** | **\*p_infer_core** | excludes I/O time |
+| **Accelerator utilization** | **\*MFU** | model FLOPs utilization |
+| Inference result | acc (inference/validation) | top-1 classification accuracy (acc1) |
+
+* Metric values
+
+| Engine | precision | bs | e2e_time | p_val_whole | p_val_core | p_infer_whole | \*p_infer_core | \*MFU | acc | mem |
+| ----------- | --------- | ---- | -------- | ----------- | ---------- | ------------- | -------------- | ----- | --------- | --------- |
+| tensorrt | fp16 | 64 | 1009.7 | 777.8 | 796.7 | 825.8 | 1329.2 | 26.2% | 79.0/79.3 | 35.0/40.0 |
+| tensorrt | fp32 | 32 | 1275.9 | 482.4 | 491.1 | 555.5 | 590.5 | 23.3% | 79.3/79.3 | 35.0/40.0 |
+
diff --git a/inference/benchmarks/vit_l_16/pytorch/__init__.py b/inference/benchmarks/vit_l_16/pytorch/__init__.py
new file mode 100644
index 000000000..1f6cdf49b
--- /dev/null
+++ b/inference/benchmarks/vit_l_16/pytorch/__init__.py
@@ -0,0 +1,5 @@
+from .dataloader import build_dataloader
+from .model import create_model
+from .export import export_model
+from .evaluator import evaluator
+from .forward import model_forward, engine_forward
diff --git a/inference/benchmarks/vit_l_16/pytorch/dataloader.py b/inference/benchmarks/vit_l_16/pytorch/dataloader.py
new file mode 100644
index 000000000..d08453f1e
--- /dev/null
+++ b/inference/benchmarks/vit_l_16/pytorch/dataloader.py
@@ -0,0 +1,49 @@
+import torchvision as tv
+from torch.utils.data import DataLoader as dl
+import torch
+import tqdm
+
+
+def build_dataset(config):
+    crop = 256  # resize edge; the actual center crop below is 224
+    c_crop = 224
+    mean = (0.485, 0.456, 0.406)
+    std = (0.229, 0.224, 0.225)
+
+    if config.fp16:
+
+        class ToFloat16(object):
+
+            def __call__(self, tensor):
+                return tensor.to(dtype=torch.float16)
+
+        tx = tv.transforms.Compose([
+            tv.transforms.Resize(crop),
+            tv.transforms.CenterCrop(c_crop),
+            tv.transforms.ToTensor(),
+            ToFloat16(),
+            tv.transforms.Normalize(mean=mean, std=std),
+        ])
+        dataset = tv.datasets.ImageFolder(config.data_dir, tx)
+    else:
+        tx = tv.transforms.Compose([
+            tv.transforms.Resize(crop),
+            tv.transforms.CenterCrop(c_crop),
+            tv.transforms.ToTensor(),
+            tv.transforms.Normalize(mean=mean, std=std),
+        ])
+        dataset = tv.datasets.ImageFolder(config.data_dir, tx)
+
+    return dataset
+
+
+def build_dataloader(config):
+    dataset = build_dataset(config)
+    loader = dl(dataset,
+                batch_size=config.batch_size,
+                shuffle=False,
+                drop_last=True,
+                num_workers=config.num_workers,
+                pin_memory=True)
+
+    return loader
diff --git a/inference/benchmarks/vit_l_16/pytorch/evaluator.py b/inference/benchmarks/vit_l_16/pytorch/evaluator.py
new file mode 100644
index 000000000..5481c5e5b
--- /dev/null
+++ b/inference/benchmarks/vit_l_16/pytorch/evaluator.py
@@ -0,0 +1,10 @@
+def topk(output, target, ks=(1, )):
+    _, pred = output.topk(max(ks), 1, True, True)
+    pred = pred.t()
+    correct = pred.eq(target.view(1, -1).expand_as(pred))
+    return [correct[:k].max(0)[0] for k in ks]
+
+
+def evaluator(pred, ground_truth):
+    top1, top5 = topk(pred, ground_truth, ks=(1, 5))
+    return top1
diff --git a/inference/benchmarks/vit_l_16/pytorch/export.py b/inference/benchmarks/vit_l_16/pytorch/export.py
new file mode 100644
index 000000000..3df1a821b
--- /dev/null
+++ 
b/inference/benchmarks/vit_l_16/pytorch/export.py @@ -0,0 +1,34 @@ +import torch +import os + + +def export_model(model, config): + if config.exist_onnx_path is not None: + return config.exist_onnx_path + + filename = config.case + "_bs" + str(config.batch_size) + filename = filename + "_" + str(config.framework) + filename = filename + "_fp16" + str(config.fp16) + filename = "onnxs/" + filename + ".onnx" + onnx_path = config.perf_dir + "/" + filename + + dummy_input = torch.randn(config.batch_size, 3, 224, 224) + + if config.fp16: + dummy_input = dummy_input.half() + dummy_input = dummy_input.cuda() + + dir_onnx_path = os.path.dirname(onnx_path) + os.makedirs(dir_onnx_path, exist_ok=True) + + with torch.no_grad(): + torch.onnx.export(model, + dummy_input, + onnx_path, + verbose=False, + input_names=["input"], + output_names=["output"], + training=torch.onnx.TrainingMode.EVAL, + do_constant_folding=True) + + return onnx_path diff --git a/inference/benchmarks/vit_l_16/pytorch/forward.py b/inference/benchmarks/vit_l_16/pytorch/forward.py new file mode 100644 index 000000000..a61caf685 --- /dev/null +++ b/inference/benchmarks/vit_l_16/pytorch/forward.py @@ -0,0 +1,106 @@ +from loguru import logger +import torch +import numpy as np +import time +from tools import torch_sync + + +def cal_perf(config, dataloader_len, duration, core_time, str_prefix): + model_forward_perf = config.repeat * dataloader_len * config.batch_size / duration + logger.info(str_prefix + "(" + config.framework + ") Perf: " + + str(model_forward_perf) + " ips") + model_forward_core_perf = config.repeat * dataloader_len * config.batch_size / core_time + logger.info(str_prefix + "(" + config.framework + ") core Perf: " + + str(model_forward_core_perf) + " ips") + return round(model_forward_perf, 3), round(model_forward_core_perf, 3) + + +def model_forward(model, dataloader, evaluator, config): + if config.no_validation: + return None, None, None + start = time.time() + core_time = 0.0 + acc = [] + + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + + all_top1 = [] + for step, (x, y) in enumerate(dataloader): + torch_sync(config) + core_time_start = time.time() + + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + + x = x.cuda() + y = y.cuda() + pred = model(x)[0] + torch_sync(config) + core_time += time.time() - core_time_start + + top1 = evaluator(pred, y) + + all_top1.extend(top1.cpu()) + + acc.append(np.mean(all_top1)) + + logger.info("Top1 Acc: " + str(acc)) + + duration = time.time() - start + model_forward_perf, model_forward_core_perf = cal_perf( + config, len(dataloader), duration, core_time, "Validation") + + return model_forward_perf, model_forward_core_perf, round( + float(np.mean(acc)), 3) + + +def engine_forward(model, dataloader, evaluator, config): + start = time.time() + core_time = 0.0 + foo_time = 0.0 + acc = [] + + for times in range(config.repeat): + + logger.debug("Repeat: " + str(times + 1)) + + all_top1 = [] + for step, (x, y) in enumerate(dataloader): + torch_sync(config) + core_time_start = time.time() + + if step % config.log_freq == 0: + logger.debug("Step: " + str(step) + " / " + + str(len(dataloader))) + + with torch.no_grad(): + + outputs = model([x]) + pred = outputs[0] + foo_time += outputs[1] + + torch_sync(config) + core_time += time.time() - core_time_start + + pred = pred[0].float() + pred = pred.reshape(config.batch_size, -1) + pred = pred.cpu() + top1 = evaluator(pred, y) + + 
all_top1.extend(top1.cpu())
+
+        acc.append(np.mean(all_top1))
+
+    logger.info("Top1 Acc: " + str(acc))
+
+    duration = time.time() - start - foo_time
+    model_forward_perf, model_forward_core_perf = cal_perf(
+        config, len(dataloader), duration, core_time - foo_time, "Inference")
+
+    return model_forward_perf, model_forward_core_perf, round(
+        float(np.mean(acc)), 3)
diff --git a/inference/benchmarks/vit_l_16/pytorch/model.py b/inference/benchmarks/vit_l_16/pytorch/model.py
new file mode 100644
index 000000000..186148119
--- /dev/null
+++ b/inference/benchmarks/vit_l_16/pytorch/model.py
@@ -0,0 +1,14 @@
+from transformers import ViTForImageClassification as vit
+
+
+def create_model(config):
+    if config.no_validation:
+        assert config.exist_onnx_path is not None
+        return None
+    model = vit.from_pretrained(config.weights)
+    model.cuda()
+    model.eval()
+    if config.fp16:
+        model.half()
+
+    return model
diff --git a/inference/benchmarks/vit_l_16/pytorch/requirements.txt b/inference/benchmarks/vit_l_16/pytorch/requirements.txt
new file mode 100644
index 000000000..976a2b1f3
--- /dev/null
+++ b/inference/benchmarks/vit_l_16/pytorch/requirements.txt
@@ -0,0 +1 @@
+transformers
diff --git a/inference/configs/vit_l_16/configurations.yaml b/inference/configs/vit_l_16/configurations.yaml
new file mode 100644
index 000000000..da9354aa0
--- /dev/null
+++ b/inference/configs/vit_l_16/configurations.yaml
@@ -0,0 +1,16 @@
+batch_size: 32
+# flops for 1 item (e.g. 1 sequence, 1 image)
+# Attention! For transformer models like bert, 1 token costs roughly 2*params flops, so we need 2*length*params, i.e. 2*512*0.33B here
+# format: a_1*a_2*...*a_n, e.g. 2*512*0.33e9 (bert) or 4.12e9 (resnet50)
+flops: 6.16e10
+fp16: false
+compiler: tensorrt
+num_workers: 8
+log_freq: 30
+repeat: 5
+# skip validation (will also skip create_model and ONNX export); requires exist_onnx_path != null
+no_validation: false
+# set a real onnx_path to use an existing file, or set it to anything non-null to skip manual ONNX export (e.g. for torch-tensorrt)
+exist_onnx_path: null
+# set an existing engine file path, e.g. resnet50.trt/resnet50.plan/resnet50.engine
+exist_compiler_path: null
\ No newline at end of file
diff --git a/inference/configs/vit_l_16/parameters.yaml b/inference/configs/vit_l_16/parameters.yaml
new file mode 100644
index 000000000..d5d7da9dd
--- /dev/null
+++ b/inference/configs/vit_l_16/parameters.yaml
@@ -0,0 +1 @@
+weights: "google/vit-large-patch16-224"
\ No newline at end of file
diff --git a/inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml b/inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml
new file mode 100644
index 000000000..5fc40bbf6
--- /dev/null
+++ b/inference/configs/vit_l_16/vendor_config/nvidia_configurations.yaml
@@ -0,0 +1,3 @@
+trt_tmp_path: nvidia_tmp/vit.trt
+has_dynamic_axis: false
+torchtrt_full_compile: true
\ No newline at end of file

From 45a744425a46aeff3a699b5d174e7b732e301866 Mon Sep 17 00:00:00 2001
From: shh2000 <13820618441@163.com>
Date: Tue, 8 Aug 2023 11:47:08 +0800
Subject: [PATCH 6/6] addsrc

---
 inference/benchmarks/vit_l_16/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/inference/benchmarks/vit_l_16/README.md b/inference/benchmarks/vit_l_16/README.md
index 391eac738..5998c0cf9 100644
--- a/inference/benchmarks/vit_l_16/README.md
+++ b/inference/benchmarks/vit_l_16/README.md
@@ -34,9 +34,9 @@ find ./val -name "*JPEG" | wc -l
 
 ### 2. Model and Weights
 
 * Model implementation
-  * pytorch: transformers.ViTForImageClassification
+  * pytorch: transformers.ViTForImageClassification (Hugging Face)
 * Weight download
-  * pytorch: from_pretrained("google/vit-large-patch16-224")
+  * pytorch: from_pretrained("google/vit-large-patch16-224") (Hugging Face)
 
 ### 3. Hardware/Software Configuration and Run Information Reference
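
To close, a quick way to verify the Hugging Face weights referenced above before running the full benchmark — a minimal sketch, not part of the patch series, with a random tensor standing in for a preprocessed ImageNet image:

```python
import torch
from transformers import ViTForImageClassification

# "google/vit-large-patch16-224" is the weights id from parameters.yaml.
model = ViTForImageClassification.from_pretrained("google/vit-large-patch16-224")
model.eval()

dummy = torch.randn(1, 3, 224, 224)  # shape matches the export.py dummy input
with torch.no_grad():
    logits = model(dummy).logits
print(logits.shape)       # expected: torch.Size([1, 1000])
print(logits.argmax(-1))  # predicted ImageNet class id
```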