FlagOpen · upvenly · Aug 18, 2023 · Aug 7, 2023 · Aug 7, 2023 · Aug 7, 2023
diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md
@@ -60,6 +60,25 @@ find ./val -name "*JPEG" | wc -l
    - TensorRT 8.5.1.7
    - torch_tensorrt 1.3.0
 
+#### 2.3 Nvidia A100
+
+- ##### 硬件环境
+    - 机器、加速卡型号: MR-V100
+
+- ##### 软件环境
+   - OS版本：Ubuntu 18.04
+   - OS kernel版本: 5.15.0-78-generic
+   - 加速卡驱动版本：3.2.0
+   - Docker 版本：24.0.4
+   - 训练框架版本：torch-1.13.1+corex.3.2.0
+   - 依赖软件版本：
+     - cuda: 10.2
+
+- 推理工具包
+
+   - TensorRT 8.5.1.7
+   - torch_tensorrt 1.3.0
+
 ### 3. 运行情况
 
 * 指标列表
@@ -84,4 +103,4 @@ find ./val -name "*JPEG" | wc -l
 | tensorrt | fp16      | 256  |613.4 | 1358.9   | 4469.4 | 1391.4   | 12698.7 | 16.8% | 76.2/76.2 | 19.7/40.0 |
 | tensorrt | fp32   | 256  | 474.4    | 1487.3      | 2653.2     | 1560.3        | 6091.6  | 16.1% | 76.2/76.2 | 28.86/40.0 |
 | torchtrt | fp16     | 256  | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 |
-
+| ixrt | fp16     | 256  | 275.6 |  |  | 276.8 | 1914.3 | 8.2% | 76.2/76.2 | 4.3/32.0 |
diff --git a/inference/configs/resnet50/vendor_config/iluvatar_configurations.yaml b/inference/configs/resnet50/vendor_config/iluvatar_configurations.yaml
@@ -0,0 +1,8 @@
+ixrt_tmp_path: iluvatar_tmp/resnet50-fp16.engine
+has_dynamic_axis: false
+repeat: 1
+image_size: 224
+exist_onnx_path: onnxs/resnet50_bs256_pytorch_fp16False.onnx
+# exist_compiler_path: resnet50-fp16.engine
+output_types: {"output":"float32"}
+input_types: {"input": "float32"}
diff --git a/inference/docker_images/iluvatar/iluvatar_analysis.py b/inference/docker_images/iluvatar/iluvatar_analysis.py
@@ -0,0 +1,15 @@
+def analysis_log(logpath):
+    logfile = open(logpath)
+
+    max_usage = 0.0
+    max_mem = 0.0
+    for line in logfile.readlines():
+        if "MiB" in line:
+            usage = line.split(" ")[2]
+            usage = float(usage[:-3])
+            max_usage = max(max_usage, usage)
+            max_mem = line.split(" ")[3]
+            max_mem = float(max_mem[:-3])
+
+    return round(max_usage / 1024.0,
+                 2), round(max_mem / 1024.0, 2), eval("24e12"), eval("96e12")
diff --git a/inference/docker_images/iluvatar/iluvatar_monitor.py b/inference/docker_images/iluvatar/iluvatar_monitor.py
@@ -0,0 +1,256 @@
+# ！/usr/bin/env python3
+# encoding: utf-8
+'''
+Usage:  python3 sys-monitor.py -o operation -l [log_path]
+            -o, --operation     start|stop|restart|status
+            -l, --log           log path , ./logs/ default
+'''
+
+import os
+import sys
+import time
+import signal
+import atexit
+import argparse
+import datetime
+from multiprocessing import Process
+import subprocess
+import schedule
+
+
+class Daemon:
+    '''
+    daemon subprocess class.
+    usage: subclass this daemon and override the run() method.
+    sys-monitor.pid: in the /tmp/, auto del when unexpected exit.
+    verbose: debug mode, disabled default.
+    '''
+
+    def __init__(self,
+                 pid_file,
+                 log_file,
+                 err_file,
+                 gpu_log,
+                 log_path,
+                 rate=5,
+                 stdin=os.devnull,
+                 stdout=os.devnull,
+                 stderr=os.devnull,
+                 home_dir='.',
+                 umask=0o22,
+                 verbose=0):
+        self.stdin = stdin
+        self.stdout = stdout
+        self.stderr = stderr
+        self.home_dir = home_dir
+        self.verbose = verbose
+        self.pidfile = pid_file
+        self.logfile = log_file
+        self.errfile = err_file
+        self.gpufile = gpu_log
+        self.logpath = log_path
+        self.rate = rate
+        self.umask = umask
+        self.verbose = verbose
+        self.daemon_alive = True
+
+    def get_pid(self):
+        try:
+            with open(self.pidfile, 'r') as pf:
+                pid = int(pf.read().strip())
+        except IOError:
+            pid = None
+        except SystemExit:
+            pid = None
+        return pid
+
+    def del_pid(self):
+        if os.path.exists(self.pidfile):
+            os.remove(self.pidfile)
+
+    def run(self):
+        '''
+        NOTE: override the method in subclass
+        '''
+
+        def gpu_mon(file):
+            TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
+            cmd = "ixsmi |grep 'Default'|awk '{print $3,$5,$9,$11,$13}'"
+            process = subprocess.Popen(cmd,
+                                       shell=True,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.STDOUT,
+                                       encoding='utf-8')
+            try:
+                out = process.communicate(timeout=10)
+            except subprocess.TimeoutExpired:
+                process.kill()
+                out = process.communicate()
+
+            if process.returncode != 0:
+                result = "error"
+            result = TIMESTAMP + "\n" + out[0] + "\n"
+            with open(file, 'a') as f:
+                f.write(result)
+
+        def timer_gpu_mon():
+            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
+            gpu_process.start()
+
+        schedule.every(self.rate).seconds.do(timer_gpu_mon)
+        while True:
+            schedule.run_pending()
+            time.sleep(5)
+
+    def daemonize(self):
+        if self.verbose >= 1:
+            print('daemon process starting ...')
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #1 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        os.chdir(self.home_dir)
+        os.setsid()
+        os.umask(self.umask)
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #2 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        sys.stdout.flush()
+        sys.stderr.flush()
+        si = open(self.stdin, 'r')
+        so = open(self.stdout, 'a+')
+        if self.stderr:
+            se = open(self.stderr, 'a+')
+        else:
+            se = so
+        os.dup2(si.fileno(), sys.stdin.fileno())
+        os.dup2(so.fileno(), sys.stdout.fileno())
+        os.dup2(se.fileno(), sys.stderr.fileno())
+        atexit.register(self.del_pid)
+        pid = str(os.getpid())
+        with open(self.pidfile, 'w+') as f:
+            f.write('%s\n' % pid)
+
+    def start(self):
+        if not os.path.exists(self.logpath):
+            os.makedirs(self.logpath)
+        elif os.path.exists(self.gpufile):
+            os.remove(self.gpufile)
+        if self.verbose >= 1:
+            print('ready to start ......')
+        # check for a pid file to see if the daemon already runs
+        pid = self.get_pid()
+        if pid:
+            msg = 'pid file %s already exists, is it already running?\n'
+            sys.stderr.write(msg % self.pidfile)
+            sys.exit(1)
+        # start the daemon
+        self.daemonize()
+        self.run()
+
+    def stop(self):
+        if self.verbose >= 1:
+            print('stopping ...')
+        pid = self.get_pid()
+        if not pid:
+            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
+            sys.stderr.write(msg)
+            if os.path.exists(self.pidfile):
+                os.remove(self.pidfile)
+            return
+        # try to kill the daemon process
+        try:
+            i = 0
+            while 1:
+                os.kill(pid, signal.SIGTERM)
+                time.sleep(1)
+                i = i + 1
+                if i % 10 == 0:
+                    os.kill(pid, signal.SIGHUP)
+        except OSError as err:
+            err = str(err)
+            if err.find('No such process') > 0:
+                if os.path.exists(self.pidfile):
+                    os.remove(self.pidfile)
+            else:
+                print(str(err))
+                sys.exit(1)
+            if self.verbose >= 1:
+                print('Stopped!')
+
+    def restart(self):
+        self.stop()
+        self.start()
+
+    def status(self):
+        pid = self.get_pid()
+        if pid:
+            if os.path.exists('/proc/%d' % pid):
+                return pid
+        return False
+
+
+def parse_args():
+    ''' Check script input parameter. '''
+    parse = argparse.ArgumentParser(description='Sys monitor script')
+    parse.add_argument('-o',
+                       type=str,
+                       metavar='[operation]',
+                       required=True,
+                       help='start|stop|restart|status')
+    parse.add_argument('-l',
+                       type=str,
+                       metavar='[log_path]',
+                       required=False,
+                       default='./logs/',
+                       help='log path')
+    args = parse.parse_args()
+    return args
+
+
+def main():
+    sample_rate1 = 5
+    args = parse_args()
+    operation = args.o
+    log_path = args.l
+    pid_fn = str('/tmp/gpu_monitor.pid')
+    log_fn = str(log_path + '/iluvatar_monitor.log')
+    err_fn = str(log_path + '/iluvatar_monitor.err')
+    # result for gpu
+    gpu_fn = str(log_path + '/iluvatar_monitor.log')
+
+    subdaemon = Daemon(pid_fn,
+                       log_fn,
+                       err_fn,
+                       gpu_fn,
+                       log_path,
+                       verbose=1,
+                       rate=sample_rate1)
+    if operation == 'start':
+        subdaemon.start()
+    elif operation == 'stop':
+        subdaemon.stop()
+    elif operation == 'restart':
+        subdaemon.restart()
+    elif operation == 'status':
+        pid = subdaemon.status()
+        if pid:
+            print('process [%s] is running ......' % pid)
+        else:
+            print('daemon process [%s] stopped' % pid)
+    else:
+        print("invalid argument!")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/inference/docker_images/iluvatar/pytorch/Dockerfile b/inference/docker_images/iluvatar/pytorch/Dockerfile
@@ -0,0 +1,61 @@
+FROM ubuntu:20.04
+
+RUN /bin/bash -c "source /root/.bashrc"
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PATH /root/miniconda/bin:$PATH
+
+RUN sed -i 's#http://archive.ubuntu.com/#http://mirrors.tuna.tsinghua.edu.cn/#' /etc/apt/sources.list
+RUN apt-get update -y
+RUN apt-get install -y --fix-missing \
+     apt-utils \
+     sudo \
+     openssh-server \
+     vim \
+     git \
+     curl \
+     wget \
+     tree \
+     perl \
+     kmod \
+     make \
+     pciutils \
+     build-essential \
+     python3.8-dev \
+     python3-pip \
+     libjpeg-dev \
+     zlib1g-dev \
+     unzip \
+     cmake \
+     bzip2 \
+     cabextract \
+     iputils-ping \
+     pbzip2 \
+     pv \
+     numactl \
+     ninja-build \
+     libgl1-mesa-dev 
+
+
+# Configure anaconda
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh && \
+    bash ./Miniconda3-py38_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \
+    /root/miniconda/bin/conda clean -tipsy && \
+    ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
+    echo ". /root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
+    echo "conda activate base" >> ~/.bashrc && \
+    conda config --set always_yes yes --set changeps1 no && \
+    echo 'LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"' >> ~/.bashrc && \
+    echo 'PATH="/usr/local/corex/bin:${PATH}"' >> ~/.bashrc 
+
+
+RUN /bin/bash -c "apt-get install -y linux-headers-`uname -r`"
+
+RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"
+
+ENV LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"
+ENV PATH="/usr/local/corex/bin:${PATH}"
+ENV NVCC_ARGUMENTS="-U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -ftemplate-depth=1024"
+
+RUN pip install loguru
+RUN pip install pyyaml
diff --git a/inference/docker_images/iluvatar/pytorch/packages/README.md b/inference/docker_images/iluvatar/pytorch/packages/README.md
@@ -0,0 +1,9 @@
+# 以下软件包需联系天数智芯获取
+
+>联系邮箱: contact-us@iluvatar.com
+
+ixrt-0.4.0+corex.3.2.0-cp38-cp38-linux_x86_64.whl
+
+torch-1.13.1+corex.3.2.0-cp38-cp38-linux_x86_64.whl
+
+torchvision-0.14.1+corex.3.2.0-cp38-cp38-linux_x86_64.whl