From 9419bde6a6107bea9f03bfc1448ab1fa5878b03c Mon Sep 17 00:00:00 2001
From: fred1912
Date: Thu, 25 Jan 2024 18:25:41 +0800
Subject: [PATCH] 【metax】First PR & faster_rcnn project (#402)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* update readme
* add company info
* faster_rcnn update & first PR
* fix readme
* add config 1x8 bs=16
* fix typo A100->C500
* remove torchvision in requirements.txt
* update readme
* update fasterrcnn readme
* update
* add 2x8 info & add bandwidth
* fix typo
* delete history
* update info
* update info
* update table
* delete history
* add info in test-conf
* fix typo
* delete history
* fix env bug & add mx tf32 env
* update requirements
* fix bug

---------

Co-authored-by: Shengchu Zhao
---
 training/benchmarks/driver/helper.py          |   5 +
 .../environment_variables.sh                  |   5 +
 .../faster_rcnn-pytorch/requirements.txt      |   6 +
 training/metax/README.md                      |  70 +++++
 .../metax/docker_image/pytorch_2.0/Dockerfile |   3 +
 .../metax/docker_image/pytorch_2.0/README.md  |   5 +
 .../pytorch_2.0/pytorch_install.sh            |   1 +
 training/metax/faster_rcnn-pytorch/README.md  |  59 ++++
 .../config/config_C500x1x1.py                 |   4 +
 .../config/config_C500x1x8.py                 |   3 +
 .../config/config_C500x2x8.py                 |   4 +
 .../config/requirements.txt                   |   3 +
 .../metax/faster_rcnn-pytorch/extern/.gitkeep |   0
 training/metax/metax_monitor.py               | 288 ++++++++++++++++++
 training/run_benchmarks/config/test_conf.py   |   6 +-
 training/run_benchmarks/run.py                |   5 +-
 16 files changed, 465 insertions(+), 2 deletions(-)
 create mode 100644 training/kunlunxin/faster_rcnn-pytorch/environment_variables.sh
 create mode 100644 training/kunlunxin/faster_rcnn-pytorch/requirements.txt
 create mode 100644 training/metax/README.md
 create mode 100644 training/metax/docker_image/pytorch_2.0/Dockerfile
 create mode 100755 training/metax/docker_image/pytorch_2.0/README.md
 create mode 100644 training/metax/docker_image/pytorch_2.0/pytorch_install.sh
 create mode 100644 training/metax/faster_rcnn-pytorch/README.md
 create mode 100644 training/metax/faster_rcnn-pytorch/config/config_C500x1x1.py
 create mode 100644 training/metax/faster_rcnn-pytorch/config/config_C500x1x8.py
 create mode 100644 training/metax/faster_rcnn-pytorch/config/config_C500x2x8.py
 create mode 100644 training/metax/faster_rcnn-pytorch/config/requirements.txt
 create mode 100644 training/metax/faster_rcnn-pytorch/extern/.gitkeep
 create mode 100644 training/metax/metax_monitor.py

diff --git a/training/benchmarks/driver/helper.py b/training/benchmarks/driver/helper.py
index de513901e..c616b4e92 100644
--- a/training/benchmarks/driver/helper.py
+++ b/training/benchmarks/driver/helper.py
@@ -83,3 +83,8 @@ def set_seed(self, seed: int, vendor: str = None):
         else:
             # TODO: other vendors set their seed here; extend as needed
             pass
+
+        if os.environ.get("METAX_USE_TF32"):
+            import torch
+            torch.backends.cuda.matmul.allow_tf32 = True
+            torch.backends.cudnn.allow_tf32 = True
\ No newline at end of file
diff --git a/training/kunlunxin/faster_rcnn-pytorch/environment_variables.sh b/training/kunlunxin/faster_rcnn-pytorch/environment_variables.sh
new file mode 100644
index 000000000..a7f429ac2
--- /dev/null
+++ b/training/kunlunxin/faster_rcnn-pytorch/environment_variables.sh
@@ -0,0 +1,5 @@
+# =================================================
+# Export variables
+# =================================================
+
+export METAX_USE_TF32=1
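As context for the helper.py change above: METAX_USE_TF32 simply gates PyTorch's standard TF32 switches. Below is a minimal sketch of how the flag behaves, assuming a torch build where the CUDA-compatible backend flags are available; the `tf32_enabled` helper is illustrative only and is not part of FlagPerf.

```python
import os
import torch

# Mirrors the logic added to helper.py above: any non-empty value of
# METAX_USE_TF32 turns on TF32 for matmul and cuDNN convolutions.
if os.environ.get("METAX_USE_TF32"):
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True


def tf32_enabled() -> bool:
    """Report whether TF32 matmul is currently allowed (hypothetical helper)."""
    return torch.backends.cuda.matmul.allow_tf32


print("TF32 matmul enabled:", tf32_enabled())
```

The variable itself is set by sourcing config/environment_variables.sh before launch, which run.py does whenever that file exists for the selected vendor/case.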
diff --git a/training/kunlunxin/faster_rcnn-pytorch/requirements.txt b/training/kunlunxin/faster_rcnn-pytorch/requirements.txt
new file mode 100644
index 000000000..45dd53af8
--- /dev/null
+++ b/training/kunlunxin/faster_rcnn-pytorch/requirements.txt
@@ -0,0 +1,6 @@
+/root/.cache/torch/hub/checkpoints/torchvision-0.15.1+mc2.19.0.2-cp38-cp38-linux_x86_64.whl
+/root/.cache/torch/hub/checkpoints/torch-2.0.0+gite544b36-cp38-cp38-linux_x86_64.whl
+pycocotools
+numpy
+tqdm
+schedule
\ No newline at end of file
diff --git a/training/metax/README.md b/training/metax/README.md
new file mode 100644
index 000000000..54fe64afe
--- /dev/null
+++ b/training/metax/README.md
@@ -0,0 +1,70 @@
+# Vendor Information
+
+Website: https://www.metax-tech.com/
+
+MetaX Integrated Circuits (Shanghai) Co., Ltd. was founded in Shanghai in September 2020 and has established wholly owned subsidiaries and R&D centers in Beijing, Nanjing, Chengdu, Hangzhou, Shenzhen, Wuhan, Changsha, and other cities. MetaX has a technically complete team with rich design and productization experience; its core members average nearly 20 years of end-to-end R&D experience on high-performance GPU products and have led the development and mass production of more than ten mainstream high-performance GPUs, covering GPU architecture definition, GPU IP design, GPU SoC design, and volume delivery of complete GPU system solutions.
+
+MetaX is committed to providing full-stack GPU chips and solutions for heterogeneous computing, applicable to intelligent computing, smart cities, cloud computing, autonomous driving, digital twins, the metaverse, and other frontier fields, supplying strong compute power for the digital economy.
+
+MetaX builds a full-stack GPU product line: the 曦思® N series for AI inference, the 曦云® C series for general-purpose computing, and the 曦彩® G series for graphics rendering, meeting demands for both high energy efficiency and high generality. All MetaX products use fully in-house GPU IP, with a proprietary instruction set and architecture, paired with a complete software stack (MXMACA®) that is compatible with the mainstream GPU ecosystem. This gives them natural advantages in energy efficiency and generality, lets MetaX build integrated hardware/software ecosystem solutions for customers, and makes them a compute foundation for digital and intelligent industrial transformation under the "dual-carbon" goals.
+
+
+
+# FlagPerf Adaptation and Verification Environment
+## Reference environment configuration
+- Hardware
+  - Machine model: 同泰怡 G658V3
+  - Accelerator model: 曦云® C500 64G
+  - Multi-node network type / bandwidth: InfiniBand, 2x200 Gb/s
+- Software
+  - OS version: Ubuntu 20.04.6
+  - OS kernel version: 5.4.0-26-generic
+  - Accelerator driver version: 2.18.0.8
+  - VBIOS: 1.0.102.0
+  - Docker version: 24.0.7
+
+
+## Container image information
+- Image build information
+  - Dockerfile path: metax/docker_image/pytorch_2.0/Dockerfile
+  - Post-build software installation script: metax/docker_image/pytorch_2.0/pytorch_install.sh
+
+- Core software information
+  - AI framework & related versions:
+    torch: pytorch-2.0-mc
+    torchvision: torchvision-0.15-mc
+    maca: 2.18.0.8
+
+
+## Accelerator monitoring
+- Accelerator usage collection command
+
+  ```shell
+  mx_smi
+  ```
+- Sample monitoring output:
+
++---------------------------------------------------------------------------------+
+| MX-SMI 2.0.12                     Kernel Mode Driver Version: 2.2.0             |
+| MACA Version: 2.0                 BIOS Version: 1.0.102.0                       |
+|-------------------------------------+----------------------+--------------------|
+| GPU    NAME                         | Bus-id               | GPU-Util           |
+| Temp   Power                        | Memory-Usage         |                    |
+|=====================================+======================+====================|
+| 0      MXC500                       | 0000:1b:00.0         | 0%                 |
+| 35C    56W                          | 914/65536 MiB        |                    |
++-------------------------------------+----------------------+--------------------+
+
+
+- Collected metrics
+
+| Metric | Log file | Format |
+|---|---|---|
+| Temperature | mx_monitor.log | xxx C |
+| Power | mx_monitor.log | xxx W |
+| Memory used | mx_monitor.log | xxx MiB |
+| Total memory | mx_monitor.log | xxx MiB |
+| Memory utilization | mx_monitor.log | xxx % |
+
+
+
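The table above lists the temperature, power, and memory figures that metax_monitor.py later scrapes from raw mx-smi dumps. A rough sketch of how those fields could be extracted from the sample layout shown above follows; the regular expressions are assumptions about the output format, which may differ across driver versions, so this is illustrative rather than a definitive parser.

```python
import re
import subprocess


def read_gpu_stats(raw: str) -> dict:
    """Pull temperature, power, and memory figures out of an mx-smi dump."""
    stats = {}
    temp_power = re.search(r"(\d+)C\s+(\d+)W", raw)        # e.g. "35C    56W"
    mem = re.search(r"(\d+)/(\d+)\s*MiB", raw)              # e.g. "914/65536 MiB"
    if temp_power:
        stats["temperature_c"] = int(temp_power.group(1))
        stats["power_w"] = int(temp_power.group(2))
    if mem:
        used, total = int(mem.group(1)), int(mem.group(2))
        stats["memory_used_mib"] = used
        stats["memory_total_mib"] = total
        stats["memory_util_pct"] = round(100 * used / total, 1)
    return stats


if __name__ == "__main__":
    out = subprocess.run("mx-smi", shell=True, capture_output=True, text=True).stdout
    print(read_gpu_stats(out))
```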
diff --git a/training/metax/docker_image/pytorch_2.0/Dockerfile b/training/metax/docker_image/pytorch_2.0/Dockerfile
new file mode 100644
index 000000000..476c1448a
--- /dev/null
+++ b/training/metax/docker_image/pytorch_2.0/Dockerfile
@@ -0,0 +1,3 @@
+FROM maca-2.18.0.8-ubuntu18.04-amd64:FlagPerf-base-v1
+ENV PATH="/opt/conda/bin:${PATH}"
+RUN /bin/bash -c "uname -a"
\ No newline at end of file
diff --git a/training/metax/docker_image/pytorch_2.0/README.md b/training/metax/docker_image/pytorch_2.0/README.md
new file mode 100755
index 000000000..5fcb6fa35
--- /dev/null
+++ b/training/metax/docker_image/pytorch_2.0/README.md
@@ -0,0 +1,5 @@
+# Contact MetaX to obtain the following software packages
+
+> Contact email: shengchu.zhao@metax-tech.com
+
+docker image: maca-2.18.0.8-ubuntu18.04-amd64
diff --git a/training/metax/docker_image/pytorch_2.0/pytorch_install.sh b/training/metax/docker_image/pytorch_2.0/pytorch_install.sh
new file mode 100644
index 000000000..a9bf588e2
--- /dev/null
+++ b/training/metax/docker_image/pytorch_2.0/pytorch_install.sh
@@ -0,0 +1 @@
+#!/bin/bash
diff --git a/training/metax/faster_rcnn-pytorch/README.md b/training/metax/faster_rcnn-pytorch/README.md
new file mode 100644
index 000000000..0e892eda5
--- /dev/null
+++ b/training/metax/faster_rcnn-pytorch/README.md
@@ -0,0 +1,59 @@
+### Model backbone weight download
+[Backbone weight download](../../benchmarks/faster_rcnn)
+
+The weight URL is set in FlagPerf/training/benchmarks/faster_rcnn/pytorch/model/\_\_init__.py:
+
+```python
+torchvision.models.resnet.ResNet50_Weights.IMAGENET1K_V1.value.url = 'https://download.pytorch.org/models/resnet50-0676ba61.pth'
+```
+By default, this case downloads the backbone weights automatically from the official URL (0676ba61). To specify them manually, download the weights to a path that is mounted into the container and change the URL here to "file://" + download_path.
+
+### Test dataset download
+
+[Test dataset download](https://cocodataset.org/)
+
+### MetaX C500 GPU configuration and run information
+#### Environment configuration
+- ##### Hardware environment
+  - Machine / accelerator model: 曦云® C500 64G
+  - Multi-node network type / bandwidth: InfiniBand, 2x200 Gb/s
+
+- ##### Software environment
+  - OS version: Ubuntu 20.04.6
+  - OS kernel version: 5.4.0-26-generic
+  - Accelerator driver version: 2.2.0
+  - Docker version: 24.0.7
+  - Training framework version: pytorch-2.0.0+mc2.18.0.8-cp38-cp38-linux_x86_64.whl
+  - Dependency software versions: none
+
+
+
+
+* General metrics
+
+| Metric | Value | Notes |
+| -------------- | ----------------------- | ------------------------------------------- |
+| Task category | Image object detection | |
+| Model | fasterRCNN | |
+| Dataset | coco2017 | |
+| Data precision | precision, see "Performance metrics" | fp32/amp/fp16 available |
+| Hyperparameter changes | fix_hp, see "Performance metrics" | special hyperparameters needed to saturate the device for throughput evaluation |
+| Hardware short name | MXC500 | |
+| Hardware memory usage | mem, see "Performance metrics" | usually called "device memory", in GiB |
+| End-to-end time | e2e_time, see "Performance metrics" | total time plus Perf initialization, etc. |
+| Overall throughput | p_whole, see "Performance metrics" | images actually trained divided by total time (performance_whole) |
+| Training throughput | p_train, see "Performance metrics" | excludes the evaluation time at the end of each epoch |
+| **Compute throughput** | **p_core, see "Performance metrics"** | excludes data I/O time (p3>p2>p1) |
+| Training result | map, see "Performance metrics" | mean average precision of object detection |
+| Additional changes | none | |
+
+
+* Performance metrics
+
+| Config | precision | fix_hp | e2e_time | p_whole | p_train | p_core | map | mem |
+| --------------------- | --------- | ------------ | -------- | ------- | ------- | ------ | --- | --- |
+| MXC500 single node, 8 GPUs (1x8) | fp32 | / | | | | | | 9.9/64 |
+| MXC500 single node, 8 GPUs (1x8) | fp32 | bs=16,lr=0.16 | | | | | 36.7% | 44.5/64 |
+| MXC500 single node, 1 GPU (1x1) | fp32 | / | / | | | | | 31.8/64 |
+| MXC500 two nodes, 8 GPUs each (2x8) | fp32 | / | / | | | | | 44.3/64 |
+
diff --git a/training/metax/faster_rcnn-pytorch/config/config_C500x1x1.py b/training/metax/faster_rcnn-pytorch/config/config_C500x1x1.py
new file mode 100644
index 000000000..c11690f00
--- /dev/null
+++ b/training/metax/faster_rcnn-pytorch/config/config_C500x1x1.py
@@ -0,0 +1,4 @@
+vendor: str = "metax"
+train_batch_size = 16
+eval_batch_size = 16
+lr = 0.16
\ No newline at end of file
diff --git a/training/metax/faster_rcnn-pytorch/config/config_C500x1x8.py b/training/metax/faster_rcnn-pytorch/config/config_C500x1x8.py
new file mode 100644
index 000000000..842eda4bc
--- /dev/null
+++ b/training/metax/faster_rcnn-pytorch/config/config_C500x1x8.py
@@ -0,0 +1,3 @@
+vendor: str = "metax"
+train_batch_size = 2
+eval_batch_size = 2
diff --git a/training/metax/faster_rcnn-pytorch/config/config_C500x2x8.py b/training/metax/faster_rcnn-pytorch/config/config_C500x2x8.py
new file mode 100644
index 000000000..07128ad72
--- /dev/null
+++ b/training/metax/faster_rcnn-pytorch/config/config_C500x2x8.py
@@ -0,0 +1,4 @@
+vendor: str = "metax"
+train_batch_size = 16
+eval_batch_size = 16
+lr = 0.08
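Following the backbone-weight note in the faster_rcnn README above, here is a minimal sketch of the local-weights option it describes; the download path below is a hypothetical mount point, not a FlagPerf default.

```python
import torchvision

# Hypothetical location of the pre-downloaded backbone weights inside the
# container; replace with whatever path is actually mounted.
download_path = "/raid/dataset/faster_rcnn/resnet50-0676ba61.pth"

# Same mechanism as the README shows: point the torchvision weight enum at a
# local file:// URL instead of the official download URL.
torchvision.models.resnet.ResNet50_Weights.IMAGENET1K_V1.value.url = "file://" + download_path
```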
diff --git a/training/metax/faster_rcnn-pytorch/config/requirements.txt b/training/metax/faster_rcnn-pytorch/config/requirements.txt
new file mode 100644
index 000000000..cc8d4dc61
--- /dev/null
+++ b/training/metax/faster_rcnn-pytorch/config/requirements.txt
@@ -0,0 +1,3 @@
+pycocotools
+numpy
+tqdm
\ No newline at end of file
diff --git a/training/metax/faster_rcnn-pytorch/extern/.gitkeep b/training/metax/faster_rcnn-pytorch/extern/.gitkeep
new file mode 100644
index 000000000..e69de29bb
diff --git a/training/metax/metax_monitor.py b/training/metax/metax_monitor.py
new file mode 100644
index 000000000..a8d745822
--- /dev/null
+++ b/training/metax/metax_monitor.py
@@ -0,0 +1,288 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+'''
+Usage: python3 metax_monitor.py -o operation -l [log_path]
+           -o, --operation start|stop|restart|status
+           -l, --log log path , ./logs/ default
+'''
+
+import os
+import sys
+import time
+import signal
+import atexit
+import argparse
+import datetime
+from multiprocessing import Process
+import subprocess
+import schedule
+
+
+class Daemon:
+    '''
+    daemon subprocess class.
+    usage: subclass this daemon and override the run() method.
+    pid file: kept in /tmp/, auto-deleted on unexpected exit.
+    verbose: debug mode, disabled by default.
+    '''
+
+    def __init__(self,
+                 pid_file,
+                 log_file,
+                 err_file,
+                 gpu_log,
+                 log_path,
+                 rate=5,
+                 stdin=os.devnull,
+                 stdout=os.devnull,
+                 stderr=os.devnull,
+                 home_dir='.',
+                 umask=0o22,
+                 verbose=0):
+        self.stdin = stdin
+        self.stdout = stdout
+        self.stderr = stderr
+        self.home_dir = home_dir
+        self.verbose = verbose
+        self.pidfile = pid_file
+        self.logfile = log_file
+        self.errfile = err_file
+        self.gpufile = gpu_log
+        self.logpath = log_path
+        self.rate = rate
+        self.umask = umask
+        self.verbose = verbose
+        self.daemon_alive = True
+
+    def get_pid(self):
+        try:
+            with open(self.pidfile, 'r') as pf:
+                pid = int(pf.read().strip())
+        except IOError:
+            pid = None
+        except SystemExit:
+            pid = None
+        return pid
+
+    def del_pid(self):
+        if os.path.exists(self.pidfile):
+            os.remove(self.pidfile)
+
+    def run(self):
+        '''
+        NOTE: override the method in subclass
+        '''
+
+        def gpu_mon(file):
+            TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
+            cmd = "mx-smi"  # |grep 'Default'|awk '{print $3,$5,$9,$11,$13}'
+            process = subprocess.Popen(cmd,
+                                       shell=True,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.STDOUT,
+                                       encoding='utf-8')
+            try:
+                out = process.communicate(timeout=10)
+            except subprocess.TimeoutExpired:
+                process.kill()
+                out = process.communicate()
+
+            if process.returncode != 0:
+                result = TIMESTAMP + "\n" + "error\n" + out[0] + "\n"
+            else:
+                result = TIMESTAMP + "\n" + out[0] + "\n"
+            with open(file, 'a') as f:
+                f.write(result)
+
+        def timer_gpu_mon():
+            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
+            gpu_process.start()
+
+        schedule.every(self.rate).seconds.do(timer_gpu_mon)
+        while True:
+            schedule.run_pending()
+            time.sleep(5)
+
+    def daemonize(self):
+        if self.verbose >= 1:
+            print('daemon process starting ...')
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #1 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        os.chdir(self.home_dir)
+        os.setsid()
+        os.umask(self.umask)
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #2 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        sys.stdout.flush()
+        sys.stderr.flush()
+        si = open(self.stdin, 'r')
+        so = open(self.stdout, 'a+')
+        if self.stderr:
+            se = open(self.stderr, 'a+')
+        else:
+            se = so
+        os.dup2(si.fileno(), sys.stdin.fileno())
+        os.dup2(so.fileno(), sys.stdout.fileno())
+        os.dup2(se.fileno(), sys.stderr.fileno())
+        atexit.register(self.del_pid)
+        pid = str(os.getpid())
+        with open(self.pidfile, 'w+') as f:
+            f.write('%s\n' % pid)
+
+    def start(self):
+        if not os.path.exists(self.logpath):
+            os.makedirs(self.logpath)
+        elif os.path.exists(self.gpufile):
+            os.remove(self.gpufile)
+        if self.verbose >= 1:
+            print('ready to start ......')
+        # check for a pid file to see if the daemon already runs
+        pid = self.get_pid()
+        if pid:
+            msg = 'pid file %s already exists, is it already running?\n'
+            sys.stderr.write(msg % self.pidfile)
+            sys.exit(1)
+        # start the daemon
+        self.daemonize()
+        self.run()
+
+    def stop(self):
+        if self.verbose >= 1:
+            print('stopping ...')
+        pid = self.get_pid()
+        if not pid:
+            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
+            sys.stderr.write(msg)
+            if os.path.exists(self.pidfile):
+                os.remove(self.pidfile)
+            return
+        # try to kill the daemon process
+        try:
+            i = 0
+            while 1:
+                os.kill(pid, signal.SIGTERM)
+                time.sleep(1)
+                i = i + 1
+                if i % 10 == 0:
+                    os.kill(pid, signal.SIGHUP)
+        except OSError as err:
+            err = str(err)
+            if err.find('No such process') > 0:
+                if os.path.exists(self.pidfile):
+                    os.remove(self.pidfile)
+            else:
+                print(str(err))
+                sys.exit(1)
+        if self.verbose >= 1:
+            print('Stopped!')
+
+    def restart(self):
+        self.stop()
+        self.start()
+
+    def status(self):
+        pid = self.get_pid()
+        if pid:
+            if os.path.exists('/proc/%d' % pid):
+                return pid
+        return False
+
+
+def parse_args():
+    ''' Check script input parameter. '''
+    parse = argparse.ArgumentParser(description='Sys monitor script')
+    parse.add_argument('-o',
+                       type=str,
+                       metavar='[operation]',
+                       required=True,
+                       help='start|stop|restart|status')
+    parse.add_argument('-l',
+                       type=str,
+                       metavar='[log_path]',
+                       required=False,
+                       default='./logs/',
+                       help='log path')
+    args = parse.parse_args()
+    return args
+
+
+def get_system_info():
+    cmd = r"echo OS version:;"
+    cmd = cmd + r"cat /etc/issue | head -n1 | awk '{print $1, $2, $3}';"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo OS Kernel version:;"
+    cmd = cmd + r"uname -r;"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Hardware Model:;"
+    cmd = cmd + r"sudo dmidecode | grep -A9 'System Information' | tail -n +2 | sed 's/^[ \t]*//';"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Accelerator Model:;"
+    cmd = cmd + r"mx-smi -L;"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Accelerator Driver version:;"
+    cmd = cmd + r"mx-smi | grep 'Driver Version' | awk '{print $3}';"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Docker version:;"
+    cmd = cmd + r"docker -v"
+
+    return cmd
+
+
+def main():
+    sample_rate1 = 5
+    args = parse_args()
+    operation = args.o
+    log_path = args.l
+    pid_fn = str('/tmp/gpu_monitor.pid')
+    log_fn = str(log_path + '/mx-smi_monitor.log')
+    err_fn = str(log_path + '/mx-smi_monitor.err')
+    # result for gpu
+    gpu_fn = str(log_path + '/mx-smi_monitor.log')
+
+    subdaemon = Daemon(pid_fn,
+                       log_fn,
+                       err_fn,
+                       gpu_fn,
+                       log_path,
+                       verbose=1,
+                       rate=sample_rate1)
+    if operation == 'start':
+        sys_fn = os.path.join(log_path, 'sys_info.log')
+        cmd = get_system_info()
+        with open(sys_fn, "w") as f:
+            p = subprocess.Popen(cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
+            p.wait()
+        subdaemon.start()
+    elif operation == 'stop':
+        subdaemon.stop()
+    elif operation == 'restart':
+        subdaemon.restart()
+    elif operation == 'status':
+        pid = subdaemon.status()
+        if pid:
+            print('process [%s] is running ......' % pid)
+        else:
+            print('daemon process [%s] stopped' % pid)
+    else:
+        print("invalid argument!")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py
index a2fca20d3..400119015 100644
--- a/training/run_benchmarks/config/test_conf.py
+++ b/training/run_benchmarks/config/test_conf.py
@@ -1,7 +1,7 @@
 '''Test Configs, including'''
 # -*-coding:utf-8 -*-
 
-# Set accelerator's vendor name, e.g. iluvatar, cambricon, kunlunxin, ascend and mthreads.
+# Set accelerator's vendor name, e.g. iluvatar, cambricon, kunlunxin, ascend, mthreads and metax.
 # We will run benchmarks in training/
 VENDOR = "nvidia"
 
@@ -21,6 +21,8 @@
 #     -v /usr/local/Ascend/driver -v /usr/local/dcmi -v /usr/local/bin/npu-smi"
 # mthreads:
 #     " --env MTHREADS_VISIBLE_DEVICES=all"
+# metax:
+#     " --device=/dev/dri --device=/dev/mxcd --group-add video"
 ACCE_CONTAINER_OPT = " --gpus all"
 # XXX_VISIBLE_DEVICE item name in env
 # possible value of ACCE_VISIBLE_DEVICE_ENV_NAME are:
@@ -148,4 +150,6 @@
     # "transformer:pytorch:BI-V100:1:8:1": "/raid/dataset/transformer/wmt14_en_de_joined_dict",
     # "bert_hf:pytorch:BI-V100:1:8:1": "/raid/dataset/bert_hf_train",
 
+    # metax cases
+    # "faster_rcnn:C500:pytorch_2.0:1:8:1": "/dataset/coco2017/",
 }
diff --git a/training/run_benchmarks/run.py b/training/run_benchmarks/run.py
index 96f1f508e..a846ce2b1 100644
--- a/training/run_benchmarks/run.py
+++ b/training/run_benchmarks/run.py
@@ -288,9 +288,12 @@ def start_tasks_in_cluster(dp_path, container_name, case_config,
                            base_args, count, curr_log_path):
     '''Start tasks in cluster, and NOT wait.'''
     nnodes = case_config["nnodes"]
+    framework_sub_path = case_config["framework"]
+    if "_" in framework_sub_path:
+        framework_sub_path = framework_sub_path.split("_")[0]
     env_file = os.path.join(
         tc.FLAGPERF_PATH, tc.VENDOR,
-        case_config["model"] + "-" + case_config["framework"],
+        case_config["model"] + "-" + framework_sub_path,
         "config/environment_variables.sh")
     framework = case_config["framework"].split("_")[0]
     if (os.path.isfile(env_file)):
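To make the run.py change above concrete, here is a small sketch of the new environment-file path derivation, assuming VENDOR = "metax" and an illustrative checkout path; with framework "pytorch_2.0" the lookup now lands in the model's `-pytorch` directory rather than a non-existent `-pytorch_2.0` one.

```python
import os

# Illustrative values only; FLAGPERF_PATH and the case definition come from
# test_conf.py in a real run.
FLAGPERF_PATH = "/path/to/FlagPerf/training"
VENDOR = "metax"
case_config = {"model": "faster_rcnn", "framework": "pytorch_2.0"}

# Same logic as the added lines in run.py: strip the version suffix so the
# framework maps onto the vendor's "<model>-pytorch" directory.
framework_sub_path = case_config["framework"]
if "_" in framework_sub_path:
    framework_sub_path = framework_sub_path.split("_")[0]  # "pytorch_2.0" -> "pytorch"

env_file = os.path.join(FLAGPERF_PATH, VENDOR,
                        case_config["model"] + "-" + framework_sub_path,
                        "config/environment_variables.sh")
print(env_file)  # .../metax/faster_rcnn-pytorch/config/environment_variables.sh
```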