Merge branch 'main' into feature/longformer

FlagOpen · Oct 8, 2023 · 8ea0b13 · 8ea0b13
2 parents 7e57189 + 84132ae
commit 8ea0b13
Show file tree

Hide file tree

Showing 126 changed files with 4,298 additions and 503 deletions.
diff --git a/README.md b/README.md
diff --git a/inference/benchmarks/resnet50/README.md b/inference/benchmarks/resnet50/README.md
@@ -76,7 +76,7 @@ find ./val -name "*JPEG" | wc -l
 
 - 推理工具包
 
-   - XTCL 2.1
+   - XTCL daily 2023.09.23
 
 #### 2.3 天数智芯 MR-V100
 
@@ -96,6 +96,24 @@ find ./val -name "*JPEG" | wc -l
 
    - IXRT: ixrt-0.4.0+corex.3.2.0
 
+#### 2.5 腾讯紫霄 C100
+
+- ##### 硬件环境
+    - 机器、加速卡型号: C100
+
+- ##### 软件环境
+   - OS版本：Ubuntu 20.04
+   - OS kernel版本: 5.15.0-78-generic
+   - 加速卡驱动版本：2.4.12
+   - Docker 版本：24.0.4
+   - 依赖软件版本：
+     - pytorch: 1.13.0+cpu
+     - onnx: 1.14.0
+
+- 推理工具包
+
+   - zxrt 2.4.12
+
 ### 3. 运行情况
 
 * 指标列表
@@ -120,6 +138,8 @@ find ./val -name "*JPEG" | wc -l
 | tensorrt | fp16      | 256  |613.4 | 1358.9   | 4469.4 | 1391.4   | 12698.7 | 16.8% | 76.2/76.2 | 19.7/40.0 |
 | tensorrt | fp32   | 256  | 474.4    | 1487.3      | 2653.2     | 1560.3        | 6091.6  | 16.1% | 76.2/76.2 | 28.86/40.0 |
 | torchtrt | fp16     | 256  | 716.4 | 1370.4 | 4282.6 | 1320.0 | 4723.0 | 6.3% | 76.2/76.2 | 9.42/40.0 |
-| ixrt     | fp16     | 256  | 136.4 | /      | /      | 1146.6 | 2679.9 | 11.5% | 76.2 | 4.3/32.0 |
-| kunlunxin_xtcl | fp32   | 128  | 311.215    | /      | /     |  837.507    | 1234.727  | / | 76.2/76.2 | / |
+| ixrt     | fp16  (W16A32)   | 256  | 261.467 | /      | /      | 1389.332  | 2721.402 | 11.7% | 76.2/76.2 | 8.02/32.0 |
+| kunlunxin_xtcl | fp32   | 128  | / | /  | / | /      | /      | 12.1% | 76.2/76.2 | 4.52/32.0 |
+| kunlunxin_xtcl | fp16   | 256  | 164.675 | /  | /   |  1566.407 | 3317.012  | 12.1% | 76.2/76.2 | 4.52/32.0 |
+| zixiao | fp16   | 32*6  | 261.103    | /      | /     |  193.151    | 6342.191  | / | 76.2/76.2 | / |
 
diff --git a/inference/benchmarks/sam_h/README.md b/inference/benchmarks/sam_h/README.md
@@ -36,6 +36,24 @@
 
   - TensorRT 8.6.1
 
+#### 2.2 昆仑芯R200
+
+- ##### 硬件环境
+    - 机器、加速卡型号: R200
+
+- ##### 软件环境
+   - OS版本：Ubuntu 20.04
+   - OS kernel版本: 5.15.0-56-generic
+   - 加速卡驱动版本：4.0
+   - Docker 版本：20.10.21
+   - 依赖软件版本：
+     - pytorch: 1.13.0+cpu
+     - onnx: 1.14.0
+
+- 推理工具包
+
+   - XTCL 2.0.0.67
+
 ### 3. 运行情况
 
 * 指标列表

diff --git a/inference/benchmarks/sam_h/pytorch/forward.py b/inference/benchmarks/sam_h/pytorch/forward.py
@@ -84,7 +84,6 @@ def engine_forward(model, dataloader, evaluator, config):
         for step, (x, y, osize, dsize) in enumerate(dataloader):
             if config.fp16:
                 x = x.to(torch.float16)
-                y = y.to(torch.float16)
             torch_sync(config)
             core_time_start = time.time()
 
@@ -101,7 +100,7 @@ def engine_forward(model, dataloader, evaluator, config):
                 torch_sync(config)
                 core_time += time.time() - core_time_start
 
-                pred = pred[0]
+                pred = pred[1]
                 pred = pred.reshape(config.batch_size, 1, 3, 256, 256).float()
                 pred = pred.cpu()
 

diff --git a/inference/benchmarks/stable_diffusion_v1_4/README.md b/inference/benchmarks/stable_diffusion_v1_4/README.md
@@ -56,5 +56,6 @@
 | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- |
 | tensorrt | fp16    | 2   |1674.9 | 11.4        | 45.2 | 10.6 | 60.6 | 13.2% | 17.1/25.2 | 13.3/40.0 |
 | tensorrt | fp32   | 2 | 1807.4 | 8.2 | 20.6 | 7.2  | 16.1 | 7.0% | 25.2/25.3 | 39.2/40.0 |
+| kunlunxin_xtcl | fp32   | 2 | 213.822 | / | / | 4.755  | 9.471 | 20.1% | 26.524/25.3 | 0.07/32.0 |
 | null | fp16 | 16 | / | 11.7 | 60.7 | /  | / | 13.2% | -/25.2 | 5.7/40.0 |
 | null | fp32 | 8 | / | 9.3 | 27.3 | /  | / | 11.9% | -/25.3 | 6.3/40.0 |
diff --git a/inference/benchmarks/swinTransformer/README.md b/inference/benchmarks/swinTransformer/README.md
@@ -84,4 +84,4 @@ find ./val -name "*JPEG" | wc -l
 | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- |
 | tensorrt | fp16      | 512 |1011.7 | 1347.5 | 1511.3 | 1231.7 | 1359.1 | 6.8% | 81.7/83.2 | 19.9/40.0 |
 | tensorrt | fp32   | 256 | 856.9 | 761.5 | 794.3 | 789.2 | 826.4 | 8.2% | 83.2/83.2 | 20.0/40.0 |
-
+| kunlunxin_xtcl| W32A16   | 256 | 543.745 | / | / | / | / | / | 0.832 | / |
diff --git a/inference/benchmarks/vit_l_16/README.md b/inference/benchmarks/vit_l_16/README.md
@@ -83,4 +83,5 @@ find ./val -name "*JPEG" | wc -l
 | ----------- | --------- | ---- | ---- | -------- | ----------- | ---------- | ------------- | ------------ | ----------- | ----------- |
 | tensorrt | fp16    | 64   |1009.7 | 777.8 | 796.7 | 825.8 | 1329.2 | 26.2% | 79.0/79.3 | 35.0/40.0 |
 | tensorrt | fp32   | 32 | 1275.9 | 482.4  | 491.1 | 555.5    | 590.5 | 23.3% | 79.3/79.3 | 35.0/40.0 |
+| kunlunxin_xtcl | W32A16   | 32 | 2118.307 | / | / | 130.006    | 144.914 | 27.9% | 79.3/79.3 | / |
 
diff --git a/inference/benchmarks/yolov5/README.md b/inference/benchmarks/yolov5/README.md
@@ -53,6 +53,25 @@ find ./val -name "*JPEG" | wc -l
    - TensorRT 8.5.1.7
    - torch_tensorrt 1.3.0
 
+#### 2.2 昆仑芯R200
+
+- ##### 硬件环境
+    - 机器、加速卡型号: R200
+
+- ##### 软件环境
+   - OS版本：Ubuntu 20.04
+   - OS kernel版本: 5.15.0-56-generic
+   - 加速卡驱动版本：4.0
+   - Docker 版本：20.10.21
+   - 依赖软件版本：
+     - pytorch: 1.13.0+cpu
+     - onnx: 1.14.0
+     - pycocotools: 2.0.7
+
+- 推理工具包
+
+   - XTCL 2.1
+
 ### 3. 运行情况
 
 * 指标列表
@@ -75,3 +94,4 @@ find ./val -name "*JPEG" | wc -l
 | ----------- | --------- | ---- | -------- | ----------- | ---------- | ------------- | ------------ |  ------------ |----------- | ---------- |
 | tensorrt | fp32   | 96  | 733.8    |    /   | /    | 53.8       | 361.4 |12.6%| 0.45 | 35.44/40.0 |
 | tensorrt | fp16   | 96  | 1665.8    |    /   | /    | 58.6     | 859 |15.0%| 0.45 | 26.15/40.0 |
+| kunlunxin_xtcl | fp32   | 96  | / |    /   | / | /   | / |18.9%| 0.451 | 26.42/32.0 |
diff --git a/inference/benchmarks/yolov5/pytorch/kunlunxin_requirements.txt b/inference/benchmarks/yolov5/pytorch/kunlunxin_requirements.txt
@@ -0,0 +1,2 @@
+pycocotools
+opencv-python-headless
diff --git a/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml b/inference/configs/bertLarge/vendor_config/kunlunxin_configurations.yaml
@@ -1,3 +1,4 @@
 compiler: xtcl
 no_validation: true
+vm_enable: false
 exist_onnx_path: onnxs/bertLarge/bertLarge_bs32_pytorch_fp16False.onnx
diff --git a/inference/configs/host.yaml b/inference/configs/host.yaml
@@ -13,4 +13,4 @@ PIP_SOURCE: "https://mirror.baidu.com/pypi/simple"
 CLEAR_CACHES: True
 ACCE_VISIBLE_DEVICE_ENV_NAME: "CUDA_VISIBLE_DEVICES"
 CASES: 
-    "resnet50:pytorch_1.13": "/raid/dataset/ImageNet/imagenet/val"
+    "resnet50:pytorch_1.13": "/raid/dataset/ImageNet/imagenet/val"
diff --git a/inference/configs/resnet50/vendor_config/iluvatar_configurations.yaml b/inference/configs/resnet50/vendor_config/iluvatar_configurations.yaml
@@ -2,7 +2,3 @@ ixrt_tmp_path: iluvatar_tmp/resnet50-fp16.engine
 has_dynamic_axis: false
 repeat: 1
 image_size: 224
-exist_onnx_path: onnxs/resnet50_bs256_pytorch_fp16False.onnx
-# exist_compiler_path: resnet50-fp16.engine
-output_types: {"output":"float32"}
-input_types: {"input": "float32"}
diff --git a/inference/configs/resnet50/vendor_config/kunlunxin_configurations.yaml b/inference/configs/resnet50/vendor_config/kunlunxin_configurations.yaml
@@ -2,3 +2,4 @@ fp16: false
 compiler: xtcl
 no_validation: true
 exist_onnx_path: onnxs/resnet50_bs256_pytorch_fp16False.onnx 
+resnet50_fuse: true
diff --git a/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml b/inference/configs/resnet50/vendor_config/zixiao_configurations.yaml
@@ -0,0 +1,6 @@
+compiler: zxrt
+no_validation: true
+batch_size: 50000
+exist_onnx_path: onnxs/resnet50_pytorch.onnx
+repeat: 1
+zixiao_test_batch_size: 32
diff --git a/inference/configs/sam_h/vendor_config/kunlunxin_configurations.yaml b/inference/configs/sam_h/vendor_config/kunlunxin_configurations.yaml
@@ -0,0 +1,10 @@
+compiler: xtcl
+no_validation: true
+build_config:
+  FuseWithoutPattern:
+    - FuseConv2dTransposeBiasAdd
+  pattern_match:
+    - fuse_attention_sam
+disabled_pass:
+  - xgraph_layout_opt
+exist_onnx_path: onnxs/sam_h_bs4_pytorch_fp16True.onnx
diff --git a/inference/configs/stable_diffusion_v1_4/vendor_config/kunlunxin_configurations.yaml b/inference/configs/stable_diffusion_v1_4/vendor_config/kunlunxin_configurations.yaml
@@ -0,0 +1,3 @@
+fp16: false
+compiler: xtcl
+no_validation: true
diff --git a/inference/configs/swinTransformer/configurations.yaml b/inference/configs/swinTransformer/configurations.yaml
@@ -13,4 +13,4 @@ no_validation: false
 # set a real onnx_path to use exist, or set it to anything but null to avoid export onnx manually(like torch-tensorrt)
 exist_onnx_path: null
 # set a exist path of engine file like resnet50.trt/resnet50.plan/resnet50.engine
-exist_compiler_path: null
+exist_compiler_path: null
diff --git a/inference/configs/swinTransformer/vendor_config/kunlunxin_configurations.yaml b/inference/configs/swinTransformer/vendor_config/kunlunxin_configurations.yaml
@@ -0,0 +1,16 @@
+batch_size: 256
+# 1 item(like 1 sequence, 1 image) flops
+# Attention! For a transformer decoder like bert, 1 token costs 2*params flops, so we need 2*length*params, like 2*512*0.33B here
+# format: a_1*a_2*...*a_n, where the last factor may carry an exponent (a_n e a_0), like 2*512*0.33e9(bert) or 4.12e9(resnet50)
+flops: 1.55e10
+fp16: false
+compiler: xtcl
+num_workers: 8
+log_freq: 30
+repeat: 5
+# skip validation(will also skip create_model, export onnx). Assert exist_onnx_path != null
+no_validation: true
+# set a real onnx_path to use an existing onnx file, or set it to anything but null to avoid exporting onnx manually (like torch-tensorrt)
+exist_onnx_path: /home/liuyu/flagperf/FlagPerf/inference/onnxs/kunlunxin_flagperf_swinTransformer/swinTransformer_bs256_pytorch_fp16False.onnx
+# set an existing path of an engine file, like resnet50.trt/resnet50.plan/resnet50.engine
+exist_compiler_path: null
diff --git a/inference/configs/vit_l_16/vendor_config/kunlunxin_configurations.yaml b/inference/configs/vit_l_16/vendor_config/kunlunxin_configurations.yaml
@@ -0,0 +1,5 @@
+compiler: xtcl
+# skip validation(will also skip create_model, export onnx). Assert exist_onnx_path != null
+no_validation: true
+# set a real onnx_path to use an existing onnx file, or set it to anything but null to avoid exporting onnx manually (like torch-tensorrt)
+exist_onnx_path: /home/FlagPerf/inference/onnxs/vit_l_16_bs32_pytorch_fp16False.onnx
diff --git a/inference/docker_images/iluvatar/pytorch/Dockerfile b/inference/docker_images/iluvatar/pytorch/Dockerfile
@@ -38,9 +38,9 @@ RUN apt-get install -y --fix-missing \
 
 
 # Configure anaconda
-RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py38_4.10.3-Linux-x86_64.sh && \
-    bash ./Miniconda3-py38_4.10.3-Linux-x86_64.sh -b -p /root/miniconda && \
-    /root/miniconda/bin/conda clean -tipsy && \
+RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh && \
+    bash ./Miniconda3-py310_23.5.2-0-Linux-x86_64.sh -b -p /root/miniconda && \
+    /root/miniconda/bin/conda clean -tip && \
     ln -s /root/miniconda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \
     echo ". /root/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
     echo "conda activate base" >> ~/.bashrc && \

diff --git a/inference/docker_images/iluvatar/pytorch/packages/README.md b/inference/docker_images/iluvatar/pytorch/packages/README.md
@@ -2,8 +2,10 @@
 
 >联系邮箱: contact-us@iluvatar.com
 
-ixrt-0.4.0+corex.3.2.0-cp38-cp38-linux_x86_64.whl
+ixrt-0.7.0+corex.latest.version-cp310-cp310-linux_x86_64.whl
 
-torch-1.13.1+corex.3.2.0-cp38-cp38-linux_x86_64.whl
+torchvision-0.14.1+corex.3.2.0.20230914.859-cp310-cp310-linux_x86_64.whl
 
-torchvision-0.14.1+corex.3.2.0-cp38-cp38-linux_x86_64.whl
+pycuda-2022.2.2+corex.3.2.0.20230914.859-cp310-cp310-linux_x86_64.whl
+
+torch-1.13.1+corex.3.2.0.20230914.859-cp310-cp310-linux_x86_64.whl
diff --git a/inference/docker_images/iluvatar/pytorch/pytorch_install.sh b/inference/docker_images/iluvatar/pytorch/pytorch_install.sh
@@ -14,7 +14,7 @@ done
 search_sdk_results=`find ${SDK_DIR} -name "corex*.run"`
 for installer in $search_sdk_results; do
     echo "Install ${installer}"
-    sh "${installer}" -- --silent --driver --toolkit
+    sh "${installer}" -- --silent --toolkit
 done
 
 search_packages_results=`find ${PKG_DIR} -name "*.whl"`

diff --git a/inference/docker_images/kunlunxin/kunlunxin_analysis.py b/inference/docker_images/kunlunxin/kunlunxin_analysis.py
@@ -1,23 +1,21 @@
-def analysis_log(logpath):
-    logfile = open(logpath)
-
-    max_usage = 0.0 ## usage_mem
-    max_mem = 0.0 
-    for line in logfile.readlines():
-        '''
-        xpu_smi temp power mem w_mem use_rate
-        '''
-        if "xpu_smi" in line:
-            line = line[:-1]
-            usage = line.split(" ")[4]
-            usage = float(usage)
-            max_usage = max(max_usage, usage)
-            max_mem = line.split(" ")[5]
-            max_mem = float(max_mem)
-
-    return round(max_usage / 1024.0,
-                 2), round(max_mem / 1024.0, 2), eval("32e12"), eval("128e12")
-
-
-if __name__ == "__main__":
-    max1, max2, max2,max4 = analysis_log("/home/zhoujiamin01/workspace/zjm_flag/FlagPerf/inference/result/run20230809192313/resnet50:pytorch_1.13/127.0.0.1_noderank0/kunlunxin_monitor.log")
+def analysis_log(logpath):
+    logfile = open(logpath)
+
+    max_usage = 0.0 ## usage_mem
+    max_mem = 0.0 
+    for line in logfile.readlines():
+        '''
+        xpu_smi temp power mem w_mem use_rate
+        '''
+        if "xpu_smi" in line:
+            line = line[:-1]
+            usage = line.split(" ")[4]
+            usage = float(usage)
+            max_usage = max(max_usage, usage)
+            max_mem = line.split(" ")[5]
+            max_mem = float(max_mem)
+
+    return round(max_usage / 1024.0,
+                 2), round(max_mem / 1024.0, 2), eval("32e12"), eval("128e12")
+
+