Kunlunxin inference (#192)
* kunlunxin inference

* change docker version

* xtcl support fp16 onnx

* add kunlun monitor

* kunlunxin sync and remove d2h time

---------

Co-authored-by: zhaoyixuan02 <zhaoyixuan02@baidu.com>
Co-authored-by: zhoujiamin01 <zhoujiamin01@baidu.com>
3 people authored Aug 16, 2023
1 parent dc86b14 commit 9e4076b
Showing 7 changed files with 453 additions and 0 deletions.
19 changes: 19 additions & 0 deletions inference/benchmarks/resnet50/README.md
@@ -60,6 +60,24 @@ find ./val -name "*JPEG" | wc -l
- TensorRT 8.5.1.7
- torch_tensorrt 1.3.0

#### 2.2 Kunlunxin R200

- ##### Hardware environment
  - Machine / accelerator model: R200

- ##### Software environment
  - OS version: Ubuntu 20.04
  - OS kernel version: 5.15.0-56-generic
  - Accelerator driver version: 4.0
  - Docker version: 20.10.21
  - Dependency versions (see the version check below):
    - pytorch: 1.13.0+cpu
    - onnx: 1.14.0

- Inference toolkit

  - XTCL 2.1
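
A minimal sanity check, assuming it is run inside the Kunlunxin inference container, that the installed CPU-only PyTorch and ONNX builds match the versions listed above:

```python
# Verify the dependency versions listed above (expected: torch 1.13.0+cpu, onnx 1.14.0).
import torch
import onnx

print(torch.__version__)
print(onnx.__version__)
```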

### 3. Results

* Metric list
@@ -84,4 +102,5 @@ find ./val -name "*JPEG" | wc -l
| tensorrt | fp16 | 256 | 613.4 | 1358.9 | 4469.4 | 1391.4 | 12698.7 | 16.8% | 76.2/76.2 | 19.7/40.0 |
| tensorrt | fp32 | 256 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 16.1% | 76.2/76.2 | 28.86/40.0 |
| torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 |
| kunlunxin_xtcl | fp32 | 128 | 311.215 | / | / | 837.507 | 1234.727 | / | 76.2/76.2 | / |

@@ -0,0 +1,4 @@
fp16: false
compiler: xtcl
no_validation: true
exist_onnx_path: onnxs/resnet50_bs256_pytorch_fp16False.onnx
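
This vendor config is plain YAML. Below is a minimal sketch of reading it with PyYAML (installed in the Kunlunxin Dockerfile later in this commit); the file name is a placeholder, and the actual FlagPerf loader may merge these keys differently:

```python
import yaml

# Placeholder file name; the real path of this vendor config in the repo may differ.
with open("kunlunxin_configurations.yaml") as f:
    cfg = yaml.safe_load(f)

assert cfg["compiler"] == "xtcl"       # compile the model with XTCL
assert cfg["fp16"] is False            # fp32 run
assert cfg["no_validation"] is True    # skip accuracy validation
print(cfg["exist_onnx_path"])          # reuse the pre-exported ONNX file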
23 changes: 23 additions & 0 deletions inference/docker_images/kunlunxin/kunlunxin_analysis.py
@@ -0,0 +1,23 @@
def analysis_log(logpath):
    max_usage = 0.0  # peak "mem" value seen in the log (MiB)
    max_mem = 0.0    # peak "w_mem" value seen in the log (MiB)
    with open(logpath) as logfile:
        for line in logfile.readlines():
            # monitor samples look like: " xpu_smi <temp> <power> <mem> <w_mem> <use_rate>"
            if "xpu_smi" in line:
                line = line[:-1]
                usage = float(line.split(" ")[4])
                max_usage = max(max_usage, usage)
                mem = float(line.split(" ")[5])
                max_mem = max(max_mem, mem)

    # MiB -> GiB, plus two fixed peak-compute constants
    return round(max_usage / 1024.0, 2), round(max_mem / 1024.0, 2), 32e12, 128e12


if __name__ == "__main__":
    max1, max2, max3, max4 = analysis_log(
        "/home/zhoujiamin01/workspace/zjm_flag/FlagPerf/inference/result/run20230809192313/resnet50:pytorch_1.13/127.0.0.1_noderank0/kunlunxin_monitor.log"
    )
    print(max1, max2, max3, max4)
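
For reference, kunlunxin_monitor.py (below) appends one timestamp line plus one " xpu_smi ..." sample line every few seconds, which is what analysis_log() parses. A minimal usage sketch follows; the sample values in the comment and the log path are made up:

```python
# A monitor sample looks roughly like (values invented):
#   2023-08-09-19:23:13
#    xpu_smi 45 75 9650 32768 16
# i.e. " xpu_smi <temp> <power> <mem> <w_mem> <use_rate>", so split(" ")[4] is the
# "mem" field and split(" ")[5] the "w_mem" field, both in MiB.
from kunlunxin_analysis import analysis_log

mem_gib, w_mem_gib, peak1, peak2 = analysis_log("kunlunxin_monitor.log")  # hypothetical path
print(mem_gib, w_mem_gib)  # peak "mem" and "w_mem" values, converted to GiB
```
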
256 changes: 256 additions & 0 deletions inference/docker_images/kunlunxin/kunlunxin_monitor.py
@@ -0,0 +1,256 @@
#!/usr/bin/env python3
# encoding: utf-8
'''
Usage: python3 kunlunxin_monitor.py -o operation -l [log_path]
    -o, --operation  start|stop|restart|status
    -l, --log        log path, ./logs/ by default
'''

import os
import sys
import time
import signal
import atexit
import argparse
import datetime
from multiprocessing import Process
import subprocess
import schedule


class Daemon:
'''
    daemon subprocess class.
    usage: subclass this daemon and override the run() method.
    pid file: /tmp/xpu_monitor.pid, removed automatically on exit.
    verbose: debug mode, disabled by default.
'''

def __init__(self,
pid_file,
log_file,
err_file,
gpu_log,
log_path,
rate=5,
stdin=os.devnull,
stdout=os.devnull,
stderr=os.devnull,
home_dir='.',
umask=0o22,
verbose=0):
self.stdin = stdin
self.stdout = stdout
self.stderr = stderr
self.home_dir = home_dir
self.verbose = verbose
self.pidfile = pid_file
self.logfile = log_file
self.errfile = err_file
self.gpufile = gpu_log
self.logpath = log_path
self.rate = rate
self.umask = umask
self.verbose = verbose
self.daemon_alive = True

def get_pid(self):
try:
with open(self.pidfile, 'r') as pf:
pid = int(pf.read().strip())
except IOError:
pid = None
except SystemExit:
pid = None
return pid

def del_pid(self):
if os.path.exists(self.pidfile):
os.remove(self.pidfile)

def run(self):
'''
NOTE: override the method in subclass
'''

def gpu_mon(file):
TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
cmd = "xpu_smi |grep '/dev/xpu0'|awk '{print $29,$27,$22,$24,$14}'" ## temp power mem w_mem use_rate
process = subprocess.Popen(cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
encoding='utf-8')
try:
out = process.communicate(timeout=10)
except subprocess.TimeoutExpired:
process.kill()
out = process.communicate()

            # On failure, record an error line that analysis_log() will skip
            # (it only parses lines containing "xpu_smi").
            if process.returncode != 0:
                result = TIMESTAMP + "\n monitor error: " + out[0] + "\n"
            else:
                result = TIMESTAMP + "\n xpu_smi " + out[0] + "\n"
with open(file, 'a') as f:
f.write(result)

def timer_gpu_mon():
gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
gpu_process.start()

schedule.every(self.rate).seconds.do(timer_gpu_mon)
while True:
schedule.run_pending()
time.sleep(5)

def daemonize(self):
if self.verbose >= 1:
print('daemon process starting ...')
try:
pid = os.fork()
if pid > 0:
sys.exit(0)
except OSError as e:
sys.stderr.write('fork #1 failed: %d (%s)\n' %
(e.errno, e.strerror))
sys.exit(1)
os.chdir(self.home_dir)
os.setsid()
os.umask(self.umask)
try:
pid = os.fork()
if pid > 0:
sys.exit(0)
except OSError as e:
sys.stderr.write('fork #2 failed: %d (%s)\n' %
(e.errno, e.strerror))
sys.exit(1)
sys.stdout.flush()
sys.stderr.flush()
si = open(self.stdin, 'r')
so = open(self.stdout, 'a+')
if self.stderr:
se = open(self.stderr, 'a+')
else:
se = so
os.dup2(si.fileno(), sys.stdin.fileno())
os.dup2(so.fileno(), sys.stdout.fileno())
os.dup2(se.fileno(), sys.stderr.fileno())
atexit.register(self.del_pid)
pid = str(os.getpid())
with open(self.pidfile, 'w+') as f:
f.write('%s\n' % pid)

def start(self):
if not os.path.exists(self.logpath):
os.makedirs(self.logpath)
elif os.path.exists(self.gpufile):
os.remove(self.gpufile)
if self.verbose >= 1:
print('ready to start ......')
# check for a pid file to see if the daemon already runs
pid = self.get_pid()
if pid:
msg = 'pid file %s already exists, is it already running?\n'
sys.stderr.write(msg % self.pidfile)
sys.exit(1)
# start the daemon
self.daemonize()
self.run()

def stop(self):
if self.verbose >= 1:
print('stopping ...')
pid = self.get_pid()
if not pid:
msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
sys.stderr.write(msg)
if os.path.exists(self.pidfile):
os.remove(self.pidfile)
return
# try to kill the daemon process
try:
i = 0
while 1:
os.kill(pid, signal.SIGTERM)
time.sleep(1)
i = i + 1
if i % 10 == 0:
os.kill(pid, signal.SIGHUP)
except OSError as err:
err = str(err)
if err.find('No such process') > 0:
if os.path.exists(self.pidfile):
os.remove(self.pidfile)
else:
print(str(err))
sys.exit(1)
if self.verbose >= 1:
print('Stopped!')

def restart(self):
self.stop()
self.start()

def status(self):
pid = self.get_pid()
if pid:
if os.path.exists('/proc/%d' % pid):
return pid
return False


def parse_args():
''' Check script input parameter. '''
parse = argparse.ArgumentParser(description='Sys monitor script')
parse.add_argument('-o',
type=str,
metavar='[operation]',
required=True,
help='start|stop|restart|status')
parse.add_argument('-l',
type=str,
metavar='[log_path]',
required=False,
default='./logs/',
help='log path')
args = parse.parse_args()
return args


def main():
sample_rate1 = 5
args = parse_args()
operation = args.o
log_path = args.l
pid_fn = str('/tmp/xpu_monitor.pid')
log_fn = str(log_path + '/kunlunxin_monitor.log')
err_fn = str(log_path + '/kunlunxin_monitor.err')
# result for gpu
gpu_fn = str(log_path + '/kunlunxin_monitor.log')

subdaemon = Daemon(pid_fn,
log_fn,
err_fn,
gpu_fn,
log_path,
verbose=1,
rate=sample_rate1)
if operation == 'start':
subdaemon.start()
elif operation == 'stop':
subdaemon.stop()
elif operation == 'restart':
subdaemon.restart()
elif operation == 'status':
pid = subdaemon.status()
if pid:
print('process [%s] is running ......' % pid)
else:
print('daemon process [%s] stopped' % pid)
else:
print("invalid argument!")
sys.exit(1)


if __name__ == '__main__':
main()
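
A minimal sketch of driving this monitor from another process, following the usage string at the top of the script; the ./logs/ directory is just the argparse default, and FlagPerf's runner may pass its own per-run path:

```python
import subprocess

# Start sampling xpu_smi (every 5 seconds by default) into ./logs/kunlunxin_monitor.log.
subprocess.run(["python3", "kunlunxin_monitor.py", "-o", "start", "-l", "./logs/"], check=True)

# ... run the inference workload here ...

# Stop the daemon; the resulting log is summarized by kunlunxin_analysis.py.
subprocess.run(["python3", "kunlunxin_monitor.py", "-o", "stop", "-l", "./logs/"], check=True)
```
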
77 changes: 77 additions & 0 deletions inference/docker_images/kunlunxin/pytorch_1.13/Dockerfile
@@ -0,0 +1,77 @@
FROM ubuntu:18.04

RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
bash \
build-essential \
ca-certificates \
wget \
git \
locales \
locales-all \
python3.8-dev \
lsb-release && \
rm -rf /var/lib/apt/lists/*


# Set timezone
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
RUN echo 'Asia/Shanghai' >/etc/timezone

# Install miniconda
# Manually invoke bash on miniconda script per https://github.com/conda/conda/issues/10431
# RUN curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" && \
RUN wget -O ~/miniconda.sh "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" && \
chmod +x ~/miniconda.sh && \
bash ~/miniconda.sh -b -p /root/miniconda && \
rm ~/miniconda.sh && \
/root/miniconda/bin/conda config --set show_channel_urls yes && \
/root/miniconda/bin/conda create --name python38 python=3.8 -y && \
/root/miniconda/bin/conda clean -ya

# Python dependencies: hyperparameter, typing_extensions, numpy, requests, onnx, etc.
RUN /root/miniconda/envs/python38/bin/pip install \
--no-cache-dir \
-i https://pypi.tuna.tsinghua.edu.cn/simple \
hyperparameter \
typing_extensions \
numpy \
requests \
onnx \
onnxruntime \
attrs \
regex \
decorator \
loguru \
schedule \
munch \
pyyaml \
tqdm \
scipy

RUN /root/miniconda/envs/python38/bin/pip install torch==1.13.0+cpu torchvision==0.14.0+cpu torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cpu

RUN cd /root && wget https://baidu-kunlun-public.su.bcebos.com/XTCL/XTCL-2.1/XTCL-ubuntu_x86_64.tar.gz && tar -xzf XTCL-ubuntu_x86_64.tar.gz

RUN cd /root && wget https://klx-sdk-release-public.su.bcebos.com/xre/release/4.0.18.1/xre-ubuntu_2004_x86_64.tar.gz && tar -xzf xre-ubuntu_2004_x86_64.tar.gz

ENV LD_LIBRARY_PATH=/root/XTCL-ubuntu_x86_64/3rdparty/lib:/root/XTCL-ubuntu_x86_64/runtime/shlib:/root/XTCL-ubuntu_x86_64/shlib
ENV KERNEL_INCLUDE=/root/XTCL-ubuntu_x86_64/xpu/kernels
ENV XTCL_L3_SIZE=67104768
ENV XPU_PADDLE_L3_SIZE=67104768
ENV RT_LIBRARY_PATH=/root/XTCL-ubuntu_x86_64/runtime/shlib
ENV THIRDPARTY_LIB_DIR=/root/XTCL-ubuntu_x86_64/3rdparty/lib
ENV XTCL_INSTALL_DIR=/root/XTCL-ubuntu_x86_64
ENV XTCL_QUANTIZE_WEIGHT=1
ENV XTCL_USE_FP16=1
ENV PYTHONPATH=/root/XTCL-ubuntu_x86_64/python:/root/XTCL-ubuntu_x86_64/python/tvm:/root/XTCL-ubuntu_x86_64/python/topi
ENV CLANG_PATH=/root/XTCL-ubuntu_x86_64
ENV KERNEL_SEARCH_PATH=/root/XTCL-ubuntu_x86_64/xpu/kernels
ENV XPUSIM_DEVICE_MODEL=KUNLUN2
ENV XTCL_AUTO_ALLOC_L3=1
ENV TVM_DIR=/root/XTCL-ubuntu_x86_64



ENV PATH /root/xre-ubuntu_2004_x86_64/bin:$PATH
ENV PATH /root/miniconda/envs/python38/bin:$PATH
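
A minimal sketch, run inside a container built from this image, to spot-check that the XTCL environment configured above is visible to Python; only variable names taken from the ENV lines are used:

```python
import os

# Print a few of the XTCL-related settings defined in the Dockerfile above.
for var in ("XTCL_INSTALL_DIR", "XTCL_USE_FP16", "XPUSIM_DEVICE_MODEL", "LD_LIBRARY_PATH"):
    print(var, "=", os.environ.get(var))
```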
