diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md
index 024b7f417..f8ebdf6be 100644
--- a/inference/benchmarks/resnet50/README.md
+++ b/inference/benchmarks/resnet50/README.md
@@ -60,6 +60,24 @@ find ./val -name "*JPEG" | wc -l
 - TensorRT 8.5.1.7
 - torch_tensorrt 1.3.0
 
+#### 2.2 Kunlunxin R200
+
+- ##### Hardware environment
+  - Machine and accelerator model: R200
+
+- ##### Software environment
+  - OS version: Ubuntu 20.04
+  - OS kernel version: 5.15.0-56-generic
+  - Accelerator driver version: 4.0
+  - Docker version: 20.10.21
+  - Dependency versions:
+    - pytorch: 1.13.0+cpu
+    - onnx: 1.14.0
+
+- Inference toolkit
+
+  - XTCL 2.1
+
 ### 3. Results
 
 * Metric list
 
@@ -84,4 +102,5 @@
 | tensorrt | fp16 | 256 | 613.4 | 1358.9 | 4469.4 | 1391.4 | 12698.7 | 16.8% | 76.2/76.2 | 19.7/40.0 |
 | tensorrt | fp32 | 256 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 16.1% | 76.2/76.2 | 28.86/40.0 |
 | torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 |
+| kunlunxin_xtcl | fp32 | 128 | 311.215 | / | / | 837.507 | 1234.727 | / | 76.2/76.2 | / |
diff --git a/inference/configs/resnet50/vendor_config/kunlunxin_configurations.yaml b/inference/configs/resnet50/vendor_config/kunlunxin_configurations.yaml
new file mode 100644
index 000000000..4b2b5ffcb
--- /dev/null
+++ b/inference/configs/resnet50/vendor_config/kunlunxin_configurations.yaml
@@ -0,0 +1,4 @@
+fp16: false
+compiler: xtcl
+no_validation: true
+exist_onnx_path: onnxs/resnet50_bs256_pytorch_fp16False.onnx
diff --git a/inference/docker_images/kunlunxin/kunlunxin_analysis.py b/inference/docker_images/kunlunxin/kunlunxin_analysis.py
new file mode 100644
index 000000000..388f89cee
--- /dev/null
+++ b/inference/docker_images/kunlunxin/kunlunxin_analysis.py
@@ -0,0 +1,23 @@
+def analysis_log(logpath):
+    max_usage = 0.0  # peak used device memory, MiB
+    max_mem = 0.0  # total device memory, MiB
+    with open(logpath) as logfile:
+        for line in logfile.readlines():
+            # sample lines look like " xpu_smi <temp> <power> <mem> <w_mem> <use_rate>"
+            if "xpu_smi" in line:
+                fields = line[:-1].split(" ")
+                max_usage = max(max_usage, float(fields[4]))  # used memory
+                max_mem = float(fields[5])  # total memory
+
+    # MiB -> GiB; the trailing constants are the theoretical peak FLOPS
+    # figures that FlagPerf consumes downstream
+    return round(max_usage / 1024.0,
+                 2), round(max_mem / 1024.0, 2), 32e12, 128e12
+
+
+if __name__ == "__main__":
+    max1, max2, max3, max4 = analysis_log(
+        "/home/zhoujiamin01/workspace/zjm_flag/FlagPerf/inference/result/run20230809192313/resnet50:pytorch_1.13/127.0.0.1_noderank0/kunlunxin_monitor.log")
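For orientation, here is a minimal sketch of how `analysis_log` would be driven once `kunlunxin_monitor.py` (below) has produced a log. The result path is only illustrative of the run layout FlagPerf writes, and the last two returned values are the fixed theoretical-peak-FLOPS constants from the function above.

```python
# Hypothetical driver for analysis_log; the log path is an example of the
# run layout FlagPerf produces, not a path guaranteed to exist.
from kunlunxin_analysis import analysis_log

used_gib, total_gib, peak_flops_a, peak_flops_b = analysis_log(
    "result/run0000/resnet50:pytorch_1.13/127.0.0.1_noderank0/kunlunxin_monitor.log")
print(f"peak XPU memory: {used_gib} GiB of {total_gib} GiB")
```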
diff --git a/inference/docker_images/kunlunxin/kunlunxin_monitor.py b/inference/docker_images/kunlunxin/kunlunxin_monitor.py
new file mode 100644
index 000000000..ba5a877a1
--- /dev/null
+++ b/inference/docker_images/kunlunxin/kunlunxin_monitor.py
@@ -0,0 +1,256 @@
+# !/usr/bin/env python3
+# encoding: utf-8
+'''
+Usage: python3 kunlunxin_monitor.py -o operation -l [log_path]
+            -o, --operation   start|stop|restart|status
+            -l, --log         log path, ./logs/ by default
+'''
+
+import os
+import sys
+import time
+import signal
+import atexit
+import argparse
+import datetime
+from multiprocessing import Process
+import subprocess
+import schedule
+
+
+class Daemon:
+    '''
+    Daemon subprocess class.
+    Usage: subclass this Daemon and override the run() method.
+    xpu_monitor.pid: kept in /tmp/, deleted automatically on unexpected exit.
+    verbose: debug mode, disabled by default.
+    '''
+
+    def __init__(self,
+                 pid_file,
+                 log_file,
+                 err_file,
+                 gpu_log,
+                 log_path,
+                 rate=5,
+                 stdin=os.devnull,
+                 stdout=os.devnull,
+                 stderr=os.devnull,
+                 home_dir='.',
+                 umask=0o22,
+                 verbose=0):
+        self.stdin = stdin
+        self.stdout = stdout
+        self.stderr = stderr
+        self.home_dir = home_dir
+        self.verbose = verbose
+        self.pidfile = pid_file
+        self.logfile = log_file
+        self.errfile = err_file
+        self.gpufile = gpu_log
+        self.logpath = log_path
+        self.rate = rate
+        self.umask = umask
+        self.daemon_alive = True
+
+    def get_pid(self):
+        try:
+            with open(self.pidfile, 'r') as pf:
+                pid = int(pf.read().strip())
+        except IOError:
+            pid = None
+        except SystemExit:
+            pid = None
+        return pid
+
+    def del_pid(self):
+        if os.path.exists(self.pidfile):
+            os.remove(self.pidfile)
+
+    def run(self):
+        '''
+        NOTE: override this method in a subclass
+        '''
+
+        def gpu_mon(file):
+            TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
+            # fields: temp power mem w_mem use_rate
+            cmd = "xpu_smi |grep '/dev/xpu0'|awk '{print $29,$27,$22,$24,$14}'"
+            process = subprocess.Popen(cmd,
+                                       shell=True,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.STDOUT,
+                                       encoding='utf-8')
+            try:
+                out = process.communicate(timeout=10)
+            except subprocess.TimeoutExpired:
+                process.kill()
+                out = process.communicate()
+
+            if process.returncode != 0:
+                # use a distinct marker so analysis_log skips failed samples
+                result = TIMESTAMP + "\n error " + out[0] + "\n"
+            else:
+                result = TIMESTAMP + "\n xpu_smi " + out[0] + "\n"
+            with open(file, 'a') as f:
+                f.write(result)
+
+        def timer_gpu_mon():
+            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
+            gpu_process.start()
+
+        schedule.every(self.rate).seconds.do(timer_gpu_mon)
+        while True:
+            schedule.run_pending()
+            time.sleep(5)
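The sampling pattern in `run()` is worth isolating: `schedule` fires `timer_gpu_mon` every `rate` seconds, and each tick forks a short-lived process so a hung `xpu_smi` call cannot stall the scheduler loop. A minimal standalone sketch of the same pattern, with a stub in place of the `xpu_smi` poll:

```python
import time
from multiprocessing import Process

import schedule


def sample():
    # stand-in for the xpu_smi poll done by gpu_mon()
    print("poll the device here")


def tick():
    # each sample runs in its own short-lived process, so a hung
    # poll cannot block the scheduler loop
    Process(target=sample).start()


schedule.every(5).seconds.do(tick)
while True:
    schedule.run_pending()
    time.sleep(1)
```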
+    def daemonize(self):
+        if self.verbose >= 1:
+            print('daemon process starting ...')
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #1 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        os.chdir(self.home_dir)
+        os.setsid()
+        os.umask(self.umask)
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #2 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        sys.stdout.flush()
+        sys.stderr.flush()
+        si = open(self.stdin, 'r')
+        so = open(self.stdout, 'a+')
+        if self.stderr:
+            se = open(self.stderr, 'a+')
+        else:
+            se = so
+        os.dup2(si.fileno(), sys.stdin.fileno())
+        os.dup2(so.fileno(), sys.stdout.fileno())
+        os.dup2(se.fileno(), sys.stderr.fileno())
+        atexit.register(self.del_pid)
+        pid = str(os.getpid())
+        with open(self.pidfile, 'w+') as f:
+            f.write('%s\n' % pid)
+
+    def start(self):
+        if not os.path.exists(self.logpath):
+            os.makedirs(self.logpath)
+        elif os.path.exists(self.gpufile):
+            os.remove(self.gpufile)
+        if self.verbose >= 1:
+            print('ready to start ......')
+        # check the pid file to see if the daemon is already running
+        pid = self.get_pid()
+        if pid:
+            msg = 'pid file %s already exists, is it already running?\n'
+            sys.stderr.write(msg % self.pidfile)
+            sys.exit(1)
+        # start the daemon
+        self.daemonize()
+        self.run()
+
+    def stop(self):
+        if self.verbose >= 1:
+            print('stopping ...')
+        pid = self.get_pid()
+        if not pid:
+            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
+            sys.stderr.write(msg)
+            if os.path.exists(self.pidfile):
+                os.remove(self.pidfile)
+            return
+        # try to kill the daemon process
+        try:
+            i = 0
+            while 1:
+                os.kill(pid, signal.SIGTERM)
+                time.sleep(1)
+                i = i + 1
+                if i % 10 == 0:
+                    os.kill(pid, signal.SIGHUP)
+        except OSError as err:
+            if 'No such process' in str(err):
+                if os.path.exists(self.pidfile):
+                    os.remove(self.pidfile)
+            else:
+                print(str(err))
+                sys.exit(1)
+        if self.verbose >= 1:
+            print('Stopped!')
+
+    def restart(self):
+        self.stop()
+        self.start()
+
+    def status(self):
+        pid = self.get_pid()
+        if pid:
+            if os.path.exists('/proc/%d' % pid):
+                return pid
+        return False
+
+
+def parse_args():
+    ''' Check script input parameters. '''
+    parse = argparse.ArgumentParser(description='Sys monitor script')
+    parse.add_argument('-o',
+                       type=str,
+                       metavar='[operation]',
+                       required=True,
+                       help='start|stop|restart|status')
+    parse.add_argument('-l',
+                       type=str,
+                       metavar='[log_path]',
+                       required=False,
+                       default='./logs/',
+                       help='log path')
+    args = parse.parse_args()
+    return args
+
+
+def main():
+    sample_rate1 = 5
+    args = parse_args()
+    operation = args.o
+    log_path = args.l
+    pid_fn = str('/tmp/xpu_monitor.pid')
+    log_fn = str(log_path + '/kunlunxin_monitor.log')
+    err_fn = str(log_path + '/kunlunxin_monitor.err')
+    # xpu samples are appended to the same monitor log
+    gpu_fn = str(log_path + '/kunlunxin_monitor.log')
+
+    subdaemon = Daemon(pid_fn,
+                       log_fn,
+                       err_fn,
+                       gpu_fn,
+                       log_path,
+                       verbose=1,
+                       rate=sample_rate1)
+    if operation == 'start':
+        subdaemon.start()
+    elif operation == 'stop':
+        subdaemon.stop()
+    elif operation == 'restart':
+        subdaemon.restart()
+    elif operation == 'status':
+        pid = subdaemon.status()
+        if pid:
+            print('process [%s] is running ......' % pid)
+        else:
+            print('daemon process [%s] stopped' % pid)
+    else:
+        print("invalid argument!")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
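End to end, the monitor is driven like the other vendor monitors in FlagPerf. A hedged smoke-test sketch, where paths and timing are illustrative rather than part of the harness:

```python
# Start the monitor, wait for at least one 5-second sampling tick,
# then stop it; ./logs/kunlunxin_monitor.log should contain a sample.
import subprocess
import time

subprocess.run(
    ["python3", "kunlunxin_monitor.py", "-o", "start", "-l", "./logs/"],
    check=True)
time.sleep(6)
subprocess.run(
    ["python3", "kunlunxin_monitor.py", "-o", "stop", "-l", "./logs/"],
    check=True)
```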
diff --git a/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile b/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
new file mode 100644
index 000000000..7227b9743
--- /dev/null
+++ b/inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
@@ -0,0 +1,77 @@
+FROM ubuntu:18.04
+
+RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    bash \
+    build-essential \
+    ca-certificates \
+    wget \
+    git \
+    locales \
+    locales-all \
+    python3.8-dev \
+    lsb-release && \
+    rm -rf /var/lib/apt/lists/*
+
+
+# Set timezone
+RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
+RUN echo 'Asia/Shanghai' >/etc/timezone
+
+# Install miniconda
+# Manually invoke bash on the miniconda script per https://github.com/conda/conda/issues/10431
+# RUN curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" && \
+RUN wget -O ~/miniconda.sh "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" && \
+    chmod +x ~/miniconda.sh && \
+    bash ~/miniconda.sh -b -p /root/miniconda && \
+    rm ~/miniconda.sh && \
+    /root/miniconda/bin/conda config --set show_channel_urls yes && \
+    /root/miniconda/bin/conda create --name python38 python=3.8 -y && \
+    /root/miniconda/bin/conda clean -ya
+
+# hyperparameter, typing_extensions, numpy, requests, etc.
+RUN /root/miniconda/envs/python38/bin/pip install \
+    --no-cache-dir \
+    -i https://pypi.tuna.tsinghua.edu.cn/simple \
+    hyperparameter \
+    typing_extensions \
+    numpy \
+    requests \
+    onnx \
+    onnxruntime \
+    attrs \
+    regex \
+    decorator \
+    loguru \
+    schedule \
+    munch \
+    pyyaml \
+    tqdm \
+    scipy
+
+RUN /root/miniconda/envs/python38/bin/pip install torch==1.13.0+cpu torchvision==0.14.0+cpu torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cpu
+
+# XTCL compiler stack and XRE runtime
+RUN cd /root && wget https://baidu-kunlun-public.su.bcebos.com/XTCL/XTCL-2.1/XTCL-ubuntu_x86_64.tar.gz && tar -xzf XTCL-ubuntu_x86_64.tar.gz
+
+RUN cd /root && wget https://klx-sdk-release-public.su.bcebos.com/xre/release/4.0.18.1/xre-ubuntu_2004_x86_64.tar.gz && tar -xzf xre-ubuntu_2004_x86_64.tar.gz
+
+ENV LD_LIBRARY_PATH=/root/XTCL-ubuntu_x86_64/3rdparty/lib:/root/XTCL-ubuntu_x86_64/runtime/shlib:/root/XTCL-ubuntu_x86_64/shlib
+ENV KERNEL_INCLUDE=/root/XTCL-ubuntu_x86_64/xpu/kernels
+ENV XTCL_L3_SIZE=67104768
+ENV XPU_PADDLE_L3_SIZE=67104768
+ENV RT_LIBRARY_PATH=/root/XTCL-ubuntu_x86_64/runtime/shlib
+ENV THIRDPARTY_LIB_DIR=/root/XTCL-ubuntu_x86_64/3rdparty/lib
+ENV XTCL_INSTALL_DIR=/root/XTCL-ubuntu_x86_64
+ENV XTCL_QUANTIZE_WEIGHT=1
+ENV XTCL_USE_FP16=1
+ENV PYTHONPATH=/root/XTCL-ubuntu_x86_64/python:/root/XTCL-ubuntu_x86_64/python/tvm:/root/XTCL-ubuntu_x86_64/python/topi
+ENV CLANG_PATH=/root/XTCL-ubuntu_x86_64
+ENV KERNEL_SEARCH_PATH=/root/XTCL-ubuntu_x86_64/xpu/kernels
+ENV XPUSIM_DEVICE_MODEL=KUNLUN2
+ENV XTCL_AUTO_ALLOC_L3=1
+ENV TVM_DIR=/root/XTCL-ubuntu_x86_64
+
+
+ENV PATH /root/xre-ubuntu_2004_x86_64/bin:$PATH
+ENV PATH /root/miniconda/envs/python38/bin:$PATH
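A quick way to sanity-check the resulting image is to import the TVM build that XTCL ships (made importable by the `PYTHONPATH` above) and probe for the `xpu` device. Note the `xpu` device type is an assumption about XTCL's TVM fork, not something stock TVM provides:

```python
# Hedged sanity check for the container environment.
import tvm

dev = tvm.device("xpu", 0)  # "xpu" is registered by XTCL's TVM build
print("tvm version:", tvm.__version__)
print("xpu 0 present:", dev.exist)
```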
diff --git a/inference/inference_engine/kunlunxin/xtcl.py b/inference/inference_engine/kunlunxin/xtcl.py
new file mode 100755
index 000000000..396cc3ae9
--- /dev/null
+++ b/inference/inference_engine/kunlunxin/xtcl.py
@@ -0,0 +1,70 @@
+import onnx
+import tvm
+import tvm.relay as relay
+from tvm.contrib import xpu_config
+import torch
+import os
+import time
+
+
+class InferModel:
+
+    def __init__(self, config, onnx_path, model):
+        # the torch model is unused; the exported ONNX graph is compiled instead
+        self.input_names = []
+        self.engine = self.build_engine(config, onnx_path)
+
+    def build_engine(self, config, onnx_path):
+        onnx_model = onnx.load(onnx_path)
+        shape_dict = {}
+        for input in onnx_model.graph.input:
+            input_shape = input.type.tensor_type.shape.dim
+            input_shape = [a.dim_value for a in input_shape]
+            input_shape[0] = config.batch_size
+            input_name = input.name  # e.g. 'inputs:0'
+            self.input_names.append(input_name)
+            shape_dict[input_name] = input_shape
+
+        mod, params = relay.frontend.from_onnx(onnx_model, shape_dict)
+
+        target_host = f'llvm -acc=xpu{os.environ.get("XPUSIM_DEVICE_MODEL", "KUNLUN1")[-1]}'
+        ctx = tvm.device("xpu", 0)
+        build_config = {}
+        if config.fp16:
+            os.environ["XTCL_USE_NEW_ALTER_PASS"] = '1'
+            input_fp16 = {name: "float16" for name in self.input_names}
+            build_config["XPUOutDtypeConfig"] = xpu_config.XPUOutDtypeConfig(
+                default_precision="float16",
+                config_last_node=True,
+                config_map={},
+                config_var_dtype_map=input_fp16,
+            ).value()
+        else:  # fp32
+            os.environ['XTCL_USE_FP16'] = '0'
+            os.environ['XTCL_QUANTIZE_WEIGHT'] = '0'
+
+        with tvm.transform.PassContext(opt_level=3, config=build_config):
+            vm_exec = relay.backend.vm.compile(mod,
+                                               target=target_host,
+                                               target_host=target_host,
+                                               params=params)
+        from tvm.runtime.vm import VirtualMachine
+        vm = VirtualMachine(vm_exec, ctx)
+        return vm
+
+    def __call__(self, model_inputs: list):
+        for index, input_name in enumerate(self.input_names):
+            self.engine.set_one_input("main", input_name,
+                                      tvm.nd.array(model_inputs[index]))
+        self.engine.run()
+        output_list = [
+            self.engine.get_output(i)
+            for i in range(self.engine.get_num_outputs())
+        ]
+        foo_time_start = time.time()
+        # device-to-host copy; reported as foo_time so it can be excluded
+        # from the measured model time
+        output_list = [torch.from_numpy(output.asnumpy()) for output in output_list]
+        foo_time = time.time() - foo_time_start
+        return output_list, foo_time
diff --git a/inference/tools/torch_sync.py b/inference/tools/torch_sync.py
index 6e5e8b09a..aa2e2110d 100644
--- a/inference/tools/torch_sync.py
+++ b/inference/tools/torch_sync.py
@@ -4,3 +4,7 @@ def torch_sync(config):
 
     if config.vendor == "nvidia":
         torch.cuda.synchronize()
+    if config.vendor == "kunlunxin":
+        # kunlunxin case:
+        # the xpu sync has already finished inside InferModel.__call__
+        pass
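Putting the pieces together, a hypothetical driver for `InferModel` might look as follows. The `config` object is a stand-in for the one FlagPerf passes in (only `batch_size` and `fp16` are consumed here), and the ONNX path mirrors `exist_onnx_path` from the vendor config above.

```python
import numpy as np
from types import SimpleNamespace

from inference_engine.kunlunxin.xtcl import InferModel

# stand-in config; FlagPerf normally supplies this object
config = SimpleNamespace(batch_size=128, fp16=False)
model = InferModel(config, "onnxs/resnet50_bs256_pytorch_fp16False.onnx", None)

batch = np.random.rand(128, 3, 224, 224).astype("float32")
outputs, d2h_seconds = model([batch])  # returns torch tensors plus copy time
print(outputs[0].shape, "d2h copy took", d2h_seconds, "s")
```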