Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Iluvatar inference Resnet50 #195

Merged
merged 11 commits into from
Aug 18, 2023
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion inference/benchmarks/resnet50/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,25 @@ find ./val -name "*JPEG" | wc -l
- TensorRT 8.5.1.7
- torch_tensorrt 1.3.0

#### 2.2 Iluvatar MR-V100
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里请改为2.2,硬件类型改成Iluvatar MR-V100


- ##### 硬件环境
- 机器、加速卡型号: MR-V100

- ##### 软件环境
- OS版本:Ubuntu 18.04
- OS kernel版本: 5.15.0-78-generic
- 加速卡驱动版本:3.2.0
- Docker 版本:24.0.4
- 训练框架版本:torch-1.13.1+corex.3.2.0
- 依赖软件版本:
- cuda: 10.2

- 推理工具包

- TensorRT 8.5.1.7
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这两个编译器请改为ixrt+对应版本

- torch_tensorrt 1.3.0

### 3. 运行情况

* 指标列表
Expand All @@ -84,4 +103,4 @@ find ./val -name "*JPEG" | wc -l
| tensorrt | fp16 | 256 |613.4 | 1358.9 | 4469.4 | 1391.4 | 12698.7 | 16.8% | 76.2/76.2 | 19.7/40.0 |
| tensorrt | fp32 | 256 | 474.4 | 1487.3 | 2653.2 | 1560.3 | 6091.6 | 16.1% | 76.2/76.2 | 28.86/40.0 |
| torchtrt | fp16 | 256 | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 |

| ixrt | fp16 | 256 | 275.6 | | | 276.8 | 1914.3 | 8.2% | 76.2/76.2 | 4.3/32.0 |
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

请将后一个76.2删除。用户可以参考nvidia A100上面的验证结果,这里只需要填写厂商的推理acc

Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Settings for running the ResNet50 inference case with the ixrt backend.
# NOTE(review): the per-key notes below are inferred from key names only;
# confirm them against the code that loads this file.

# presumably where the generated ixrt engine file is written -- TODO confirm
ixrt_tmp_path: iluvatar_tmp/resnet50-fp16.engine
has_dynamic_axis: false
repeat: 1
image_size: 224
# presumably a pre-exported ONNX model to reuse instead of re-exporting -- TODO confirm
exist_onnx_path: onnxs/resnet50_bs256_pytorch_fp16False.onnx
# exist_compiler_path: resnet50-fp16.engine
output_types: {"output":"float32"}
input_types: {"input": "float32"}
15 changes: 15 additions & 0 deletions inference/docker_images/iluvatar/iluvatar_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
def analysis_log(logpath):
    """Summarise an accelerator monitor log.

    Every line containing ``"MiB"`` is split on single spaces; the third
    field (index 2) and the fourth field (index 3) are parsed as numbers
    after dropping their trailing three characters (the ``MiB`` suffix).
    Presumably these are used and total device memory -- confirm against
    the monitor that produces the log.

    Args:
        logpath: Path of the log file to read.

    Returns:
        A 4-tuple ``(max_usage_gib, mem_gib, 24e12, 96e12)`` where
        ``max_usage_gib`` is the largest index-2 value seen, ``mem_gib`` is
        the index-3 value of the *last* matching line (both divided by 1024
        and rounded to 2 decimals), followed by two fixed constants.
        NOTE(review): the constants look like theoretical peak throughput
        figures for the card -- confirm with the caller.

    Raises:
        OSError: If the file cannot be opened.
        IndexError, ValueError: If a matching line does not have the
            expected layout.
    """
    max_usage = 0.0
    max_mem = 0.0
    # Context manager guarantees the handle is closed even on a parse error.
    with open(logpath) as logfile:
        for line in logfile:
            if "MiB" not in line:
                continue
            fields = line.split(" ")
            max_usage = max(max_usage, float(fields[2][:-3]))
            # Not a running maximum: the last sample wins, as before.
            max_mem = float(fields[3][:-3])

    # Plain float literals replace the former eval("24e12") / eval("96e12").
    return (round(max_usage / 1024.0, 2), round(max_mem / 1024.0, 2), 24e12,
            96e12)
256 changes: 256 additions & 0 deletions inference/docker_images/iluvatar/iluvatar_monitor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
# !/usr/bin/env python3
# encoding: utf-8
'''
Usage: python3 sys-monitor.py -o operation -l [log_path]
-o, --operation start|stop|restart|status
-l, --log log path , ./logs/ default
'''

import os
import sys
import time
import signal
import atexit
import argparse
import datetime
from multiprocessing import Process
import subprocess
import schedule


class Daemon:
    '''
    Double-fork daemon that periodically appends ``ixsmi`` output to a log.

    The process id of the detached daemon is written to ``pid_file`` and the
    file is removed again at interpreter exit (``atexit``) or by ``stop()``.
    Relies on ``os.fork``/``os.setsid`` and ``/proc``, so it is POSIX/Linux
    only.

    Args:
        pid_file: path of the pid file.
        log_file: stored as ``self.logfile``; not read by this class.
        err_file: stored as ``self.errfile``; not read by this class.
        gpu_log: file that every sample is appended to.
        log_path: directory created by ``start()`` when missing.
        rate: sampling period in seconds.
        stdin, stdout, stderr: files the daemon's standard streams are
            redirected to.
        home_dir: working directory the daemon changes into.
        umask: umask applied to the daemon process.
        verbose: values >= 1 enable progress messages on stdout.
    '''

    def __init__(self,
                 pid_file,
                 log_file,
                 err_file,
                 gpu_log,
                 log_path,
                 rate=5,
                 stdin=os.devnull,
                 stdout=os.devnull,
                 stderr=os.devnull,
                 home_dir='.',
                 umask=0o22,
                 verbose=0):
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.home_dir = home_dir
        self.verbose = verbose
        self.pidfile = pid_file
        self.logfile = log_file
        self.errfile = err_file
        self.gpufile = gpu_log
        self.logpath = log_path
        self.rate = rate
        self.umask = umask
        self.daemon_alive = True

    def get_pid(self):
        '''
        Return the pid stored in the pid file, or None when the file is
        missing, unreadable or does not contain an integer.
        '''
        try:
            with open(self.pidfile, 'r') as pf:
                return int(pf.read().strip())
        except (IOError, ValueError):
            # ValueError: empty/corrupt pid file -- treat it like "no pid"
            # instead of crashing every start/stop/status call.
            return None

    def del_pid(self):
        ''' Remove the pid file if it exists. '''
        if os.path.exists(self.pidfile):
            os.remove(self.pidfile)

    def run(self):
        '''
        Sample the accelerator every ``self.rate`` seconds, forever.

        NOTE: override the method in subclass to change what is monitored.
        '''

        def gpu_mon(file):
            # One sample: timestamp line followed by the filtered ixsmi output.
            TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            cmd = "ixsmi |grep 'Default'|awk '{print $3,$5,$9,$11,$13}'"
            process = subprocess.Popen(cmd,
                                       shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       encoding='utf-8')
            try:
                out = process.communicate(timeout=10)
            except subprocess.TimeoutExpired:
                process.kill()
                out = process.communicate()

            # stderr is merged into stdout above, so whatever the pipeline
            # printed (including error text) is recorded as-is.
            result = TIMESTAMP + "\n" + out[0] + "\n"
            with open(file, 'a') as f:
                f.write(result)

        def timer_gpu_mon():
            # Sample in a child process so a slow ixsmi cannot stall the
            # scheduler loop.
            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
            gpu_process.start()

        schedule.every(self.rate).seconds.do(timer_gpu_mon)
        # Poll at least as often as the sampling period; the previous fixed
        # 5 s sleep silently dropped samples whenever rate < 5.
        poll_interval = min(self.rate, 5)
        while True:
            schedule.run_pending()
            time.sleep(poll_interval)

    def daemonize(self):
        '''
        Detach from the controlling terminal using the double-fork idiom,
        redirect the standard streams and write the pid file.
        '''
        if self.verbose >= 1:
            print('daemon process starting ...')
        try:
            pid = os.fork()
            if pid > 0:
                # First parent exits; the child carries on.
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #1 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        os.chdir(self.home_dir)
        os.setsid()
        os.umask(self.umask)
        try:
            pid = os.fork()
            if pid > 0:
                # Second parent (the session leader) exits.
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #2 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        sys.stdout.flush()
        sys.stderr.flush()
        si = open(self.stdin, 'r')
        so = open(self.stdout, 'a+')
        if self.stderr:
            se = open(self.stderr, 'a+')
        else:
            se = so
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())
        atexit.register(self.del_pid)
        pid = str(os.getpid())
        with open(self.pidfile, 'w+') as f:
            f.write('%s\n' % pid)

    def start(self):
        '''
        Start the daemon; exits with status 1 when a pid file already holds
        a pid. Does not return in the daemon process (``run()`` loops).
        '''
        if self.verbose >= 1:
            print('ready to start ......')
        # check for a pid file to see if the daemon already runs -- done
        # BEFORE touching the log so a running daemon's log is not deleted.
        pid = self.get_pid()
        if pid:
            msg = 'pid file %s already exists, is it already running?\n'
            sys.stderr.write(msg % self.pidfile)
            sys.exit(1)
        if not os.path.exists(self.logpath):
            os.makedirs(self.logpath)
        elif os.path.exists(self.gpufile):
            # Fresh run: drop the samples of the previous run.
            os.remove(self.gpufile)
        # start the daemon
        self.daemonize()
        self.run()

    def stop(self):
        '''
        Terminate the daemon named in the pid file and remove the pid file.

        SIGTERM is sent once per second (plus SIGHUP every 10th attempt)
        until the process is gone. Exits with status 1 on any other OS error
        (e.g. permission denied).
        '''
        if self.verbose >= 1:
            print('stopping ...')
        pid = self.get_pid()
        if not pid:
            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
            sys.stderr.write(msg)
            self.del_pid()
            return
        # try to kill the daemon process
        try:
            attempts = 0
            while True:
                os.kill(pid, signal.SIGTERM)
                time.sleep(1)
                attempts += 1
                if attempts % 10 == 0:
                    os.kill(pid, signal.SIGHUP)
        except ProcessLookupError:
            # ESRCH: the process is gone. Matching the exception type is
            # locale-independent, unlike searching the strerror text.
            self.del_pid()
        except OSError as err:
            print(str(err))
            sys.exit(1)
        if self.verbose >= 1:
            print('Stopped!')

    def restart(self):
        ''' Stop the daemon (if any) and start a new one. '''
        self.stop()
        self.start()

    def status(self):
        '''
        Return the daemon pid when the pid file names a process that has a
        ``/proc/<pid>`` entry, otherwise False.
        '''
        pid = self.get_pid()
        if pid:
            if os.path.exists('/proc/%d' % pid):
                return pid
        return False


def parse_args():
    ''' Build the command-line parser and return the parsed namespace. '''
    cli = argparse.ArgumentParser(description='Sys monitor script')

    # -o: mandatory action to perform on the monitor daemon.
    operation_opts = dict(type=str,
                          metavar='[operation]',
                          required=True,
                          help='start|stop|restart|status')
    # -l: optional directory that receives the log files.
    log_opts = dict(type=str,
                    metavar='[log_path]',
                    required=False,
                    default='./logs/',
                    help='log path')

    cli.add_argument('-o', **operation_opts)
    cli.add_argument('-l', **log_opts)
    return cli.parse_args()


def main():
    '''
    Command-line entry point: build the monitor daemon from the parsed
    arguments and run the requested operation.

    Prints "invalid argument!" and exits with status 1 for an unknown
    operation.
    '''
    cli = parse_args()
    action, log_dir = cli.o, cli.l
    sampling_seconds = 5

    # The daemon log and the gpu result file deliberately share one path.
    monitor_log = str(log_dir + '/iluvatar_monitor.log')
    monitor_err = str(log_dir + '/iluvatar_monitor.err')

    daemon = Daemon(str('/tmp/gpu_monitor.pid'),
                    monitor_log,
                    monitor_err,
                    monitor_log,
                    log_dir,
                    verbose=1,
                    rate=sampling_seconds)

    if action == 'status':
        running_pid = daemon.status()
        if running_pid:
            print('process [%s] is running ......' % running_pid)
        else:
            print('daemon process [%s] stopped' % running_pid)
        return

    # Remaining operations map one-to-one onto daemon methods.
    handlers = {
        'start': daemon.start,
        'stop': daemon.stop,
        'restart': daemon.restart,
    }
    handler = handlers.get(action)
    if handler is None:
        print("invalid argument!")
        sys.exit(1)
    handler()


# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
61 changes: 61 additions & 0 deletions inference/docker_images/iluvatar/pytorch/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Base image for running the inference benchmarks on Iluvatar accelerators.
FROM ubuntu:20.04

# NOTE: every RUN executes in its own shell, so sourcing .bashrc here does not
# carry over to later instructions; kept for parity with the original image.
RUN /bin/bash -c "source /root/.bashrc"

ENV DEBIAN_FRONTEND=noninteractive
ENV PATH=/root/miniconda/bin:$PATH

# Use the TUNA mirror for apt packages.
RUN sed -i 's#http://archive.ubuntu.com/#http://mirrors.tuna.tsinghua.edu.cn/#' /etc/apt/sources.list

# `apt-get update` and `apt-get install` share one layer so a cached, stale
# package index can never be paired with a changed package list.
RUN apt-get update -y && \
    apt-get install -y --fix-missing \
        apt-utils \
        sudo \
        openssh-server \
        vim \
        git \
        curl \
        wget \
        tree \
        perl \
        kmod \
        make \
        pciutils \
        build-essential \
        python3.8-dev \
        python3-pip \
        libjpeg-dev \
        zlib1g-dev \
        unzip \
        cmake \
        bzip2 \
        cabextract \
        iputils-ping \
        pbzip2 \
        pv \
        numactl \
        ninja-build \
        libgl1-mesa-dev


# Configure anaconda
RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh && \
    bash ./Miniconda3-py38_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \
    /root/miniconda/bin/conda clean -tipsy && \
    ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
    echo ". /root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
    echo "conda activate base" >> ~/.bashrc && \
    conda config --set always_yes yes --set changeps1 no && \
    echo 'LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"' >> ~/.bashrc && \
    echo 'PATH="/usr/local/corex/bin:${PATH}"' >> ~/.bashrc


# NOTE: `uname -r` is evaluated at build time, i.e. it is the kernel of the
# build host. The index is refreshed here so this step does not depend on a
# cached package list from an earlier layer.
RUN /bin/bash -c "apt-get update -y && apt-get install -y linux-headers-`uname -r`"

# Use the TUNA mirror for pip packages.
RUN /bin/bash -c "pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple"

# Make the corex SDK libraries and tools visible to every process.
ENV LD_LIBRARY_PATH="/usr/local/corex/lib:${LD_LIBRARY_PATH}"
ENV PATH="/usr/local/corex/bin:${PATH}"
ENV NVCC_ARGUMENTS="-U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -ftemplate-depth=1024"

RUN pip install loguru pyyaml
9 changes: 9 additions & 0 deletions inference/docker_images/iluvatar/pytorch/packages/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# 以下软件包需联系天数智芯获取

>联系邮箱: contact-us@iluvatar.com

ixrt-0.4.0+corex.3.2.0-cp38-cp38-linux_x86_64.whl

torch-1.13.1+corex.3.2.0-cp38-cp38-linux_x86_64.whl

torchvision-0.14.1+corex.3.2.0-cp38-cp38-linux_x86_64.whl
Loading