-
Notifications
You must be signed in to change notification settings - Fork 103
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* kunlunxin inference * change docker version * xtcl support fp16 onnx * add kunlun monitor * kunlunxin sync and remove d2h time --------- Co-authored-by: zhaoyixuan02 <zhaoyixuan02@baidu.com> Co-authored-by: zhoujiamin01 <zhoujiamin01@baidu.com>
- Loading branch information
1 parent
dc86b14
commit 9e4076b
Showing
7 changed files
with
453 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
4 changes: 4 additions & 0 deletions
4
inference/configs/resnet50/vendor_config/kunlunxin_configurations.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Kunlunxin vendor settings for the resnet50 inference case.
fp16: false
compiler: xtcl
no_validation: true
# NOTE(review): presumably a pre-exported ONNX model reused instead of a fresh
# export; the "fp16False" in the file name matches `fp16: false` above.
# TODO confirm which directory this relative path is resolved against.
exist_onnx_path: onnxs/resnet50_bs256_pytorch_fp16False.onnx
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
def analysis_log(logpath):
    '''
    Summarise a kunlunxin monitor log.

    Only lines containing "xpu_smi" are inspected. Such a line is expected
    to look like " xpu_smi temp power mem w_mem use_rate"; after splitting
    on single spaces, field 4 is the memory-usage sample and field 5 the
    memory size.

    Args:
        logpath: path of the monitor log to read.

    Returns:
        A 4-tuple (max_usage, max_mem, 32e12, 128e12):
        - max_usage: largest field-4 sample, divided by 1024, 2 decimals;
        - max_mem: field 5 of the LAST sample line, divided by 1024,
          2 decimals (0.0 when the log has no sample lines);
        - two fixed constants. NOTE(review): presumably theoretical peak
          throughput figures for the device; confirm against the caller.

    Raises:
        IndexError / ValueError: when a sample line has fewer than six
        space-separated fields or a non-numeric value.
    '''
    max_usage = 0.0  # usage_mem
    max_mem = 0.0
    # The context manager closes the log even if parsing raises.
    with open(logpath) as logfile:
        for line in logfile:
            if "xpu_smi" not in line:
                continue
            # Strip only the newline: slicing off the last character would
            # eat a digit when the final line has no trailing newline.
            fields = line.rstrip("\n").split(" ")
            max_usage = max(max_usage, float(fields[4]))
            max_mem = float(fields[5])

    # Plain float literals replace the former eval("32e12") / eval("128e12").
    return (round(max_usage / 1024.0, 2), round(max_mem / 1024.0, 2),
            32e12, 128e12)
|
||
|
||
if __name__ == "__main__":
    # Ad-hoc manual check: analyse the log named on the command line, or fall
    # back to the original developer's sample log when no path is given.
    import sys

    default_log = "/home/zhoujiamin01/workspace/zjm_flag/FlagPerf/inference/result/run20230809192313/resnet50:pytorch_1.13/127.0.0.1_noderank0/kunlunxin_monitor.log"
    log_file = sys.argv[1] if len(sys.argv) > 1 else default_log
    # Each returned value gets its own name; the earlier target list repeated
    # "max2", which silently discarded the second result.
    max1, max2, max3, max4 = analysis_log(log_file)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,256 @@ | ||
#!/usr/bin/env python3 | ||
# encoding: utf-8 | ||
''' | ||
Usage: python3 sys-monitor.py -o operation [-l log_path] | ||
-o start|stop|restart|status | ||
-l log path, ./logs/ default | ||
''' | ||
|
||
import os | ||
import sys | ||
import time | ||
import signal | ||
import atexit | ||
import argparse | ||
import datetime | ||
from multiprocessing import Process | ||
import subprocess | ||
import schedule | ||
|
||
|
||
class Daemon:
    '''
    Background process that periodically appends xpu_smi samples to a log.

    usage: build an instance and call start() / stop() / restart() / status();
           subclass and override run() to change what is sampled.
    pid file: path given as pid_file; written by daemonize() and removed
              again through an atexit hook.
    verbose: values >= 1 print progress messages, disabled by default.
    '''

    def __init__(self,
                 pid_file,
                 log_file,
                 err_file,
                 gpu_log,
                 log_path,
                 rate=5,
                 stdin=os.devnull,
                 stdout=os.devnull,
                 stderr=os.devnull,
                 home_dir='.',
                 umask=0o22,
                 verbose=0):
        '''
        pid_file: file holding the daemon's pid.
        log_file / err_file: stored on the instance; not used by this class.
        gpu_log: file that run() appends device samples to.
        log_path: directory created by start() when missing.
        rate: sampling period in seconds.
        stdin / stdout / stderr: files the daemon's standard streams are
            redirected to.
        home_dir: working directory of the daemon.
        umask: umask applied after the first fork.
        verbose: >= 1 enables progress messages.
        '''
        self.stdin = stdin
        self.stdout = stdout
        self.stderr = stderr
        self.home_dir = home_dir
        self.verbose = verbose
        self.pidfile = pid_file
        self.logfile = log_file
        self.errfile = err_file
        self.gpufile = gpu_log
        self.logpath = log_path
        self.rate = rate
        self.umask = umask
        self.daemon_alive = True

    def get_pid(self):
        '''Return the pid stored in the pid file, or None if unavailable.'''
        try:
            with open(self.pidfile, 'r') as pf:
                return int(pf.read().strip())
        except (IOError, ValueError):
            # Missing/unreadable file, or empty/corrupt contents: treat both
            # as "no daemon recorded" instead of crashing the caller.
            return None

    def del_pid(self):
        '''Remove the pid file if it exists.'''
        if os.path.exists(self.pidfile):
            os.remove(self.pidfile)

    def run(self):
        '''
        Sample the device every self.rate seconds; never returns.
        NOTE: override the method in subclass
        '''

        def gpu_mon(out_path):
            # One sample = a timestamp line followed by " xpu_smi <fields>".
            timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
            # awk prints: temp power mem w_mem use_rate
            cmd = "xpu_smi |grep '/dev/xpu0'|awk '{print $29,$27,$22,$24,$14}'"
            process = subprocess.Popen(cmd,
                                       shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT,
                                       encoding='utf-8')
            try:
                out = process.communicate(timeout=10)
            except subprocess.TimeoutExpired:
                # Do not let a hung xpu_smi pile up; collect what it printed.
                process.kill()
                out = process.communicate()

            # The captured output is logged whatever the exit status is (an
            # earlier "error" placeholder was always overwritten, so it has
            # been dropped without changing what gets written).
            result = timestamp + "\n xpu_smi " + out[0] + "\n"
            with open(out_path, 'a') as f:
                f.write(result)

        def timer_gpu_mon():
            # Sample in a child process so a slow xpu_smi cannot stall the
            # scheduling loop.
            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
            gpu_process.start()

        schedule.every(self.rate).seconds.do(timer_gpu_mon)
        while True:
            schedule.run_pending()
            # Poll once per second so sampling periods shorter than the
            # former fixed 5 s sleep are honoured as well.
            time.sleep(1)

    def daemonize(self):
        '''
        Detach from the caller with a double fork, redirect the standard
        streams and write the pid file. Both intermediate parents exit(0).
        '''
        if self.verbose >= 1:
            print('daemon process starting ...')
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #1 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        os.chdir(self.home_dir)
        os.setsid()
        os.umask(self.umask)
        try:
            pid = os.fork()
            if pid > 0:
                sys.exit(0)
        except OSError as e:
            sys.stderr.write('fork #2 failed: %d (%s)\n' %
                             (e.errno, e.strerror))
            sys.exit(1)
        sys.stdout.flush()
        sys.stderr.flush()
        si = open(self.stdin, 'r')
        so = open(self.stdout, 'a+')
        if self.stderr:
            se = open(self.stderr, 'a+')
        else:
            se = so
        os.dup2(si.fileno(), sys.stdin.fileno())
        os.dup2(so.fileno(), sys.stdout.fileno())
        os.dup2(se.fileno(), sys.stderr.fileno())
        atexit.register(self.del_pid)
        pid = str(os.getpid())
        with open(self.pidfile, 'w+') as f:
            f.write('%s\n' % pid)

    def start(self):
        '''
        Start the daemon; exits with status 1 if a pid file already records
        a pid. Does not return in the daemon process (run() loops forever).
        '''
        if self.verbose >= 1:
            print('ready to start ......')
        # check for a pid file to see if the daemon already runs; do this
        # BEFORE touching the log so a running daemon's samples are kept
        pid = self.get_pid()
        if pid:
            msg = 'pid file %s already exists, is it already running?\n'
            sys.stderr.write(msg % self.pidfile)
            sys.exit(1)
        if not os.path.exists(self.logpath):
            os.makedirs(self.logpath)
        elif os.path.exists(self.gpufile):
            # Each run starts with a fresh sample log.
            os.remove(self.gpufile)
        # start the daemon
        self.daemonize()
        self.run()

    def stop(self):
        '''
        Stop the daemon recorded in the pid file and remove the pid file.
        Exits with status 1 if signalling fails for any reason other than
        the process no longer existing.
        '''
        if self.verbose >= 1:
            print('stopping ...')
        pid = self.get_pid()
        if not pid:
            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
            sys.stderr.write(msg)
            self.del_pid()
            return
        # try to kill the daemon process: SIGTERM every second, plus a
        # SIGHUP on every tenth attempt, until the pid disappears
        try:
            i = 0
            while 1:
                os.kill(pid, signal.SIGTERM)
                time.sleep(1)
                i = i + 1
                if i % 10 == 0:
                    os.kill(pid, signal.SIGHUP)
        except ProcessLookupError:
            # The process is gone. Matching on the exception type rather
            # than on the 'No such process' text keeps this working under
            # any locale.
            self.del_pid()
        except OSError as err:
            print(str(err))
            sys.exit(1)
        if self.verbose >= 1:
            print('Stopped!')

    def restart(self):
        '''Stop the daemon (if any) and start it again.'''
        self.stop()
        self.start()

    def status(self):
        '''Return the daemon pid if /proc shows it alive, otherwise False.'''
        pid = self.get_pid()
        if pid and os.path.exists('/proc/%d' % pid):
            return pid
        return False
|
||
|
||
def parse_args():
    ''' Build the command-line parser and return the parsed arguments. '''
    parser = argparse.ArgumentParser(description='Sys monitor script')
    # (flag, keyword arguments) for every supported option, in help order.
    option_specs = (
        ('-o', {
            'type': str,
            'metavar': '[operation]',
            'required': True,
            'help': 'start|stop|restart|status',
        }),
        ('-l', {
            'type': str,
            'metavar': '[log_path]',
            'required': False,
            'default': './logs/',
            'help': 'log path',
        }),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    return parser.parse_args()
|
||
|
||
def main():
    ''' Build the monitor daemon and run the operation given with -o. '''
    cli = parse_args()
    requested = cli.o
    log_dir = cli.l
    sampling_interval = 5

    pid_path = '/tmp/xpu_monitor.pid'
    # The daemon log and the device sample log resolve to the same file.
    monitor_log = log_dir + '/kunlunxin_monitor.log'
    monitor_err = log_dir + '/kunlunxin_monitor.err'
    device_log = log_dir + '/kunlunxin_monitor.log'

    subdaemon = Daemon(pid_path,
                       monitor_log,
                       monitor_err,
                       device_log,
                       log_dir,
                       verbose=1,
                       rate=sampling_interval)

    # "status" only reports; every other operation maps to a daemon method.
    if requested == 'status':
        running_pid = subdaemon.status()
        if running_pid:
            template = 'process [%s] is running ......'
        else:
            template = 'daemon process [%s] stopped'
        print(template % running_pid)
        return

    handlers = {
        'start': subdaemon.start,
        'stop': subdaemon.stop,
        'restart': subdaemon.restart,
    }
    handler = handlers.get(requested)
    if handler is None:
        print("invalid argument!")
        sys.exit(1)
    handler()
|
||
|
||
# Run the command-line entry point only when this file is executed directly.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
FROM ubuntu:18.04 | ||
|
||
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ | ||
bash \ | ||
build-essential \ | ||
ca-certificates \ | ||
wget \ | ||
git \ | ||
locales \ | ||
locales-all \ | ||
python3.8-dev \ | ||
lsb-release && \ | ||
rm -rf /var/lib/apt/lists/* | ||
|
||
|
||
# Set timezone | ||
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime | ||
RUN echo 'Asia/Shanghai' >/etc/timezone | ||
|
||
# Install miniconda | ||
# Manually invoke bash on miniconda script per https://github.com/conda/conda/issues/10431 | ||
# RUN curl -fsSL -v -o ~/miniconda.sh -O "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" && \ | ||
RUN wget -O ~/miniconda.sh "https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh" && \ | ||
chmod +x ~/miniconda.sh && \ | ||
bash ~/miniconda.sh -b -p /root/miniconda && \ | ||
rm ~/miniconda.sh && \ | ||
/root/miniconda/bin/conda config --set show_channel_urls yes && \ | ||
/root/miniconda/bin/conda create --name python38 python=3.8 -y && \ | ||
/root/miniconda/bin/conda clean -ya | ||
|
||
# hyperparameter, typing_extensions, numpy, requests | ||
RUN /root/miniconda/envs/python38/bin/pip install \ | ||
--no-cache-dir \ | ||
-i https://pypi.tuna.tsinghua.edu.cn/simple \ | ||
hyperparameter \ | ||
typing_extensions \ | ||
numpy \ | ||
requests \ | ||
onnx \ | ||
onnxruntime \ | ||
attrs \ | ||
regex \ | ||
decorator \ | ||
loguru \ | ||
schedule \ | ||
munch \ | ||
pyyaml \ | ||
tqdm \ | ||
scipy | ||
|
||
RUN /root/miniconda/envs/python38/bin/pip install torch==1.13.0+cpu torchvision==0.14.0+cpu torchaudio==0.13.0 --extra-index-url https://download.pytorch.org/whl/cpu | ||
|
||
RUN cd /root && wget https://baidu-kunlun-public.su.bcebos.com/XTCL/XTCL-2.1/XTCL-ubuntu_x86_64.tar.gz && tar -xzf XTCL-ubuntu_x86_64.tar.gz | ||
|
||
RUN cd /root && wget https://klx-sdk-release-public.su.bcebos.com/xre/release/4.0.18.1/xre-ubuntu_2004_x86_64.tar.gz && tar -xzf xre-ubuntu_2004_x86_64.tar.gz | ||
|
||
ENV LD_LIBRARY_PATH=/root/XTCL-ubuntu_x86_64/3rdparty/lib:/root/XTCL-ubuntu_x86_64/runtime/shlib:/root/XTCL-ubuntu_x86_64/shlib | ||
ENV KERNEL_INCLUDE=/root/XTCL-ubuntu_x86_64/xpu/kernels | ||
ENV XTCL_L3_SIZE=67104768 | ||
ENV XPU_PADDLE_L3_SIZE=67104768 | ||
ENV RT_LIBRARY_PATH=/root/XTCL-ubuntu_x86_64/runtime/shlib | ||
ENV THIRDPARTY_LIB_DIR=/root/XTCL-ubuntu_x86_64/3rdparty/lib | ||
ENV XTCL_INSTALL_DIR=/root/XTCL-ubuntu_x86_64 | ||
ENV XTCL_QUANTIZE_WEIGHT=1 | ||
ENV XTCL_USE_FP16=1 | ||
ENV PYTHONPATH=/root/XTCL-ubuntu_x86_64/python:/root/XTCL-ubuntu_x86_64/python/tvm:/root/XTCL-ubuntu_x86_64/python/topi | ||
ENV CLANG_PATH=/root/XTCL-ubuntu_x86_64 | ||
ENV KERNEL_SEARCH_PATH=/root/XTCL-ubuntu_x86_64/xpu/kernels | ||
ENV XPUSIM_DEVICE_MODEL=KUNLUN2 | ||
ENV XTCL_AUTO_ALLOC_L3=1 | ||
ENV TVM_DIR=/root/XTCL-ubuntu_x86_64 | ||
|
||
|
||
|
||
ENV PATH /root/xre-ubuntu_2004_x86_64/bin:$PATH | ||
ENV PATH /root/miniconda/envs/python38/bin:$PATH | ||
|
Oops, something went wrong.