diff --git a/.drone.yml b/.drone.yml new file mode 100644 index 000000000..0b8f744db --- /dev/null +++ b/.drone.yml @@ -0,0 +1,81 @@ +--- +kind: pipeline +name: TengineRV64 +platform: + os: linux + arch: amd64 + +steps: + - name: build + image: ubuntu20.04:qemu + commands: + - PATH=$PATH:/home/riscv/bin cmake -DCMAKE_TOOLCHAIN_FILE=toolchains/rv64-c906.toolchain.cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=RELEASE -DTENGINE_BUILD_TESTS=ON -DTENGINE_COVERAGE=ON -B build + - PATH=$PATH:/home/riscv/bin cmake --build build -- -j`cat /proc/cpuinfo | grep 'processor' | wc -l` VERBOSE=1 + - name: test ops + image: ubuntu20.04:qemu + commands: + - cd build + - export QEMU_CMD='qemu-riscv64 -cpu rv64,v=true -E TG_DEBUG_TIME=1 -L /home/riscv/sysroot' + - ../tests/test_rv64_ops.sh + - name: test models + image: ubuntu20.04:qemu + environment: + DATA_SERVER_URL: + from_secret: DATA_SERVER_URL + commands: + - cd build + - wget -nv $${DATA_SERVER_URL}/tengine_model_zoo/ci_data/models.tar.gz + - wget -nv $${DATA_SERVER_URL}/tengine_model_zoo/ci_data/images.tar.gz + - wget -nv $${DATA_SERVER_URL}/tengine_model_zoo/ci_data/data_x86.tar.gz + - mkdir models images data + - tar zxvf models.tar.gz -C models + - tar zxvf images.tar.gz -C images + - tar zxvf data_x86.tar.gz -C data + - export QEMU_CMD='qemu-riscv64 -cpu rv64,v=true -E TG_DEBUG_TIME=1 -L /home/riscv/sysroot' + - ../tests/test_rv64_models.sh + when: + branch: + - master + - name: code coverage + image: ubuntu20.04:qemu + commands: + - cd build + - apt update && apt install lcov -y + - lcov --gcov-tool /home/riscv/bin/riscv64-unknown-linux-gnu-gcov --capture --directory . 
--output-file $${DRONE_REPO_NAME}.info + - genhtml --branch-coverage -o ../codecov $${DRONE_REPO_NAME}.info + - name: scp files + image: appleboy/drone-scp + settings: + host: conleylee.com + username: + from_secret: download_host_user + password: + from_secret: download_host_passwd + port: 38000 + target: /home/lee/codecov/${DRONE_REPO_NAME}/${DRONE_BUILD_NUMBER}/${DRONE_COMMIT_SHA} + strip_components: 1 + source: codecov/* + - name: upload_to_codecov + image: robertstettner/drone-codecov:latest + settings: + token: + from_secret: CODECOV_TOKEN + files: + - build/${DRONE_REPO_NAME}.info + flags: + - model_test + - name: notify + image: ubuntu20.04:drone_script + environment: + MATTERMOST_TOKEN: + from_secret: MATTERMOST_TOKEN + GITEA_API_TOKEN: + from_secret: gitea_api_token + commands: + - 'export DRONE_SCRIPT_DOWNLOAD_LINK=https://download.conleylee.com/scripts/drone_bot.py' + - 'export DRONE_CODECOV_LINK=https://codecov.conleylee.com/$${DRONE_REPO_NAME}/$${DRONE_BUILD_NUMBER}/$${DRONE_COMMIT_SHA}' + - 'wget $${DRONE_SCRIPT_DOWNLOAD_LINK}' + - pip3 install mattermostdriver + - python3 `basename $${DRONE_SCRIPT_DOWNLOAD_LINK}` + when: + status: [success, failure] diff --git a/CMakeLists.txt b/CMakeLists.txt index 32fae8481..42ac4eb43 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,18 +35,6 @@ ENDIF() # Enable the languages which in use ENABLE_LANGUAGE (C CXX) -IF (CMAKE_TOOLCHAIN_FILE) - SET (LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to") - - # get absolute path, but get_filename_component ABSOLUTE only refer with source dir, so find_file here :( - GET_FILENAME_COMPONENT (CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME) - FIND_FILE (CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH) - MESSAGE (STATUS "Using CMake tool chain file ${CMAKE_TOOLCHAIN_FILE}") -ENDIF() - -IF (NOT CMAKE_BUILD_TYPE) - SET (CMAKE_BUILD_TYPE 
release CACHE STRING "Choose the type of build" FORCE) -ENDIF() # Module options OPTION (TENGINE_BUILD_BENCHMARK "Build benchmark" ON) @@ -92,7 +80,23 @@ OPTION (TENGINE_ENABLE_ALL_SYMBOL "All symbol visible" OPTION (TENGINE_ENABLE_MODEL_CACHE "NPU kernel cache file option" OFF) # Online report -OPTION (TENGINE_ONLINE_REPORT "online report" ON) +OPTION (TENGINE_ONLINE_REPORT "online report" OFF) + +OPTION (TENGINE_RV64_RVV_C906 "build for c906" OFF) +OPTION (TENGINE_COVERAGE "build with coverage info" OFF) + +IF (CMAKE_TOOLCHAIN_FILE) + SET (LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to") + + # get absolute path, but get_filename_component ABSOLUTE only refer with source dir, so find_file here :( + GET_FILENAME_COMPONENT (CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME) + FIND_FILE (CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH) + MESSAGE (STATUS "Using CMake tool chain file ${CMAKE_TOOLCHAIN_FILE}") +ENDIF() + +IF (NOT CMAKE_BUILD_TYPE) + SET (CMAKE_BUILD_TYPE release CACHE STRING "Choose the type of build" FORCE) +ENDIF() # Do check list INCLUDE ("${CMAKE_CURRENT_SOURCE_DIR}/cmake/check.cmake") diff --git a/README.md b/README.md index 2b50777ef..73ad8af11 100644 --- a/README.md +++ b/README.md @@ -7,11 +7,9 @@ # Tengine -[![GitHub license](http://OAID.github.io/pics/apache_2.0.svg)](./LICENSE) -[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/OAID/Tengine/build-and-test.yml?branch=tengine-lite)](https://github.com/OAID/Tengine/actions) -[![Test Status](https://img.shields.io/travis/OAID/Tengine/tengine-lite?label=test)](https://travis-ci.org/OAID/Tengine) -[![codecov](https://codecov.io/gh/OAID/Tengine/branch/tengine-lite/graph/badge.svg?token=kz9NcQPRrk)](https://codecov.io/gh/OAID/Tengine) -[![Language grade: 
C/C++](https://img.shields.io/lgtm/grade/cpp/g/OAID/Tengine.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/OAID/Tengine/context:cpp) +[![License](https://img.shields.io/badge/license-Apache_2.0-blue)](./LICENSE) +[![Build Status](https://drone.conleylee.com/api/badges/conley/Tengine/status.svg?ref=refs/heads/master)](https://drone.conleylee.com/conley/Tengine) +[![codecov](https://codecov.io/gh/ComingToy/Tengine/graph/badge.svg?token=KVOX0LW1NJ)](https://codecov.io/gh/ComingToy/Tengine) ## 简介 diff --git a/README_EN.md b/README_EN.md index 5acaef03c..dfef60542 100644 --- a/README_EN.md +++ b/README_EN.md @@ -7,13 +7,9 @@ English | [简体中文](./README.md) # Tengine -[![GitHub license](http://OAID.github.io/pics/apache_2.0.svg)](./LICENSE) -[![Build Status](https://img.shields.io/github/workflow/status/OAID/Tengine/Tengine-Lite-Actions/tengine-lite)](https://github.com/OAID/Tengine/actions?query=workflow%3ATengine-Lite-Actions) -[![Build Status](https://img.shields.io/github/workflow/status/OAID/Tengine-Convert-Tools/Tengine-Convert-Tools-Actions?label=tools%20build)](https://github.com/OAID/Tengine-Convert-Tools/actions?query=workflow%3ATengine-Convert-Tools-Actions) -[![Test Status](https://img.shields.io/travis/OAID/Tengine/tengine-lite?label=test)](https://travis-ci.org/OAID/Tengine) -[![codecov](https://codecov.io/gh/OAID/Tengine/branch/tengine-lite/graph/badge.svg?token=kz9NcQPRrk)](https://codecov.io/gh/OAID/Tengine) -[![Language grade: C/C++](https://img.shields.io/lgtm/grade/cpp/g/OAID/Tengine.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/OAID/Tengine/context:cpp) - +[![License](https://img.shields.io/badge/license-Apache_2.0-blue)](./LICENSE) +[![Build Status](https://drone.conleylee.com/api/badges/conley/Tengine/status.svg?ref=refs/heads/master)](https://drone.conleylee.com/conley/Tengine) +[![codecov](https://codecov.io/gh/ComingToy/Tengine/graph/badge.svg?token=KVOX0LW1NJ)](https://codecov.io/gh/ComingToy/Tengine) ## Introduction diff 
--git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index f610c0ed2..1041fe6ab 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -62,7 +62,9 @@ TENGINE_EXAMPLE (tm_efficientdet_uint8 tm_efficientdet_uint8.c) TENGINE_EXAMPLE (tm_mobilenet_ssd tm_mobilenet_ssd.c) TENGINE_EXAMPLE (tm_mobilenet_ssd_uint8 tm_mobilenet_ssd_uint8.cpp) TENGINE_EXAMPLE (tm_retinaface tm_retinaface.cpp) +TENGINE_EXAMPLE (tm_retinaface_vulkan tm_retinaface_vulkan.cpp) TENGINE_EXAMPLE (tm_landmark tm_landmark.cpp) +TENGINE_EXAMPLE (tm_landmark_vulkan tm_landmark_vulkan.cpp) TENGINE_EXAMPLE (tm_landmark_uint8 tm_landmark_uint8.cpp) TENGINE_EXAMPLE (tm_mobilefacenet tm_mobilefacenet.cpp) TENGINE_EXAMPLE (tm_mobilefacenet_uint8 tm_mobilefacenet_uint8.cpp) diff --git a/examples/tm_landmark_vulkan.cpp b/examples/tm_landmark_vulkan.cpp new file mode 100644 index 000000000..76f35245d --- /dev/null +++ b/examples/tm_landmark_vulkan.cpp @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include +#include + +#include "common.h" +#include "tengine/c_api.h" +#include "tengine_operations.h" + +#define DEFAULT_REPEAT_COUNT 1 +#define DEFAULT_THREAD_COUNT 1 + +void get_input_fp32_data(const char* image_file, float* input_data, int img_h, int img_w, float* mean, float* scale) +{ + image img = imread_process(image_file, img_w, img_h, mean, scale); + + float* image_data = (float*)img.data; + + for (int i = 0; i < img_w * img_h * 3; i++) + input_data[i] = image_data[i]; + + free_image(img); +} + +void show_usage() +{ + fprintf(stderr, "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); +} + +int main(int argc, char* argv[]) +{ + int repeat_count = DEFAULT_REPEAT_COUNT; + int num_thread = DEFAULT_THREAD_COUNT; + char* model_file = nullptr; + char* image_file = nullptr; + int img_h = 144; + int img_w = 144; + float mean[3] = {128.f, 128.f, 128.f}; + float scale[3] = {0.0039, 0.0039, 0.0039}; + + int res; + while ((res = getopt(argc, argv, "m:i:r:t:h:")) != -1) + { + switch (res) + { + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; + } + } + + /* check files */ + if (model_file == nullptr) + { + fprintf(stderr, "Error: Tengine model file not specified!\n"); + show_usage(); + return -1; + } + + if (image_file == nullptr) + { + fprintf(stderr, "Error: Image file not specified!\n"); + show_usage(); + return -1; + } + + if (!check_file_exist(model_file) || !check_file_exist(image_file)) + return -1; + + /* set runtime options */ + struct options opt; + opt.num_thread = num_thread; + opt.cluster = TENGINE_CLUSTER_ALL; + opt.precision = TENGINE_MODE_FP32; + opt.affinity = 0; + + /* inital tengine */ + init_tengine(); + fprintf(stderr, 
"tengine-lite library version: %s\n", get_tengine_version()); + + /* create graph, load tengine model xxx.tmfile */ + context_t vk_context = create_context("VK", 1); + add_context_device(vk_context, "VK"); + graph_t graph = create_graph(vk_context, "tengine", model_file); + set_graph_device(graph, "VK"); + if (graph == nullptr) + { + std::cout << "Create graph0 failed\n"; + return -1; + } + + /* set the input shape to initial the graph, and prerun graph to infer shape */ + int img_size = img_h * img_w * 3; + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + if (input_tensor == nullptr) + { + fprintf(stderr, "Get input tensor failed\n"); + return -1; + } + + if (set_tensor_shape(input_tensor, dims, 4) < 0) + { + fprintf(stderr, "Set input tensor shape failed\n"); + return -1; + } + + if (set_tensor_buffer(input_tensor, input_data, img_size * sizeof(float)) < 0) + { + fprintf(stderr, "Set input tensor buffer failed\n"); + return -1; + } + + /* prerun graph, set work options(num_thread, cluster, precision) */ + if (prerun_graph_multithread(graph, opt) < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + + /* prepare process input data, set the data mem to input tensor */ + get_input_fp32_data(image_file, input_data, img_h, img_w, mean, scale); + + /* run graph */ + double min_time = DBL_MAX; + double max_time = DBL_MIN; + double total_time = 0.; + for (int i = 0; i < repeat_count; i++) + { + double start = get_current_time(); + if (run_graph(graph, 1) < 0) + { + fprintf(stderr, "Run graph failed\n"); + return -1; + } + double end = get_current_time(); + double cur = end - start; + total_time += cur; + if (min_time > cur) + min_time = cur; + if (max_time < cur) + max_time = cur; + } + printf("Repeat [%d] min %.3f ms, max %.3f ms, avg %.3f ms\n", repeat_count, min_time, max_time, + total_time / repeat_count); + + /* 
get output tensor */ + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); + + float* data = (float*)(get_tensor_buffer(output_tensor)); + int data_size = get_tensor_buffer_size(output_tensor) / sizeof(float); + + image img_out = imread(image_file); + for (int i = 0; i < data_size / 2; i++) + { + int x = (int)(data[2 * i] * (float)img_out.w / 144.f); + int y = (int)(data[2 * i + 1] * (float)img_out.h / 144.f); + draw_circle(img_out, x, y, 2, 0, 255, 0); + } + + save_image(img_out, "landmark_out"); + + postrun_graph(graph); + destroy_graph(graph); + release_tengine(); + + return 0; +} diff --git a/examples/tm_retinaface_vulkan.cpp b/examples/tm_retinaface_vulkan.cpp new file mode 100644 index 000000000..14f1936d8 --- /dev/null +++ b/examples/tm_retinaface_vulkan.cpp @@ -0,0 +1,606 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: jxyang@openailab.com + * + * original model: https://github.com/deepinsight/insightface/tree/master/RetinaFace#retinaface-pretrained-models + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/blob/master/examples/retinaface.cpp + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +#include +#include + +#ifdef _MSC_VER +#define NOMINMAX +#endif + +#include +#include +#include + +#include "common.h" + +#include "tengine/c_api.h" +#include "tengine_operations.h" + +#define DEFAULT_REPEAT_COUNT 1 +#define DEFAULT_THREAD_COUNT 1 + +#define MODEL_PATH "models/retinaface.tmfile" +#define IMAGE_PATH "images/selfie_960.jpg" + +const float CONF_THRESH = 0.8f; +const float NMS_THRESH = 0.4f; + +const char* input_name = "data"; + +const char* bbox_name[3] = {"face_rpn_bbox_pred_stride32", "face_rpn_bbox_pred_stride16", "face_rpn_bbox_pred_stride8"}; +const char* score_name[3] = {"face_rpn_cls_prob_reshape_stride32", "face_rpn_cls_prob_reshape_stride16", + "face_rpn_cls_prob_reshape_stride8"}; +const char* landmark_name[3] = {"face_rpn_landmark_pred_stride32", "face_rpn_landmark_pred_stride16", + "face_rpn_landmark_pred_stride8"}; + +const int stride[3] = {32, 16, 8}; + +const float g_scales[3][2] = {{32.f, 16.f}, {8.f, 4.f}, {2.f, 1.f}}; + +struct Size2i +{ + int width; + int height; +}; + +struct Point2f +{ + float x; + float y; +}; + +struct Box2f +{ + float x1; + float y1; + float x2; + float y2; +}; + +struct Rect2f +{ + float x; + float y; + float w; + float h; +}; + +struct Face2f +{ + float score; + Rect2f rect; 
+ Point2f landmark[5]; +}; + +void draw_target(const std::vector& all_pred_boxes, image img) +{ + const char* class_names[] = {"faces"}; + + fprintf(stdout, "detected face num: %zu\n", all_pred_boxes.size()); + for (int b = 0; b < (int)all_pred_boxes.size(); b++) + { + Face2f box = all_pred_boxes[b]; + + printf("BOX %.2f:( %g , %g ),( %g , %g )\n", box.score, box.rect.x, box.rect.y, box.rect.w, box.rect.h); + + draw_box(img, box.rect.x, box.rect.y, box.rect.x + box.rect.w, box.rect.y + box.rect.h, 2, 0, 255, 0); + + for (int l = 0; l < 5; l++) + { + draw_circle(img, box.landmark[l].x, box.landmark[l].y, 1, 0, 128, 128); + } + } + save_image(img, "retinaface_out"); +} + +float iou(const Face2f& a, const Face2f& b) +{ + float area_a = a.rect.w * a.rect.h; + float area_b = b.rect.w * b.rect.h; + + float xx1 = std::max(a.rect.x, b.rect.x); + float yy1 = std::max(a.rect.y, b.rect.y); + float xx2 = std::min(a.rect.x + a.rect.w, b.rect.x + b.rect.w); + float yy2 = std::min(a.rect.y + a.rect.h, b.rect.y + b.rect.h); + + float w = std::max(float(0), xx2 - xx1 + 1); + float h = std::max(float(0), yy2 - yy1 + 1); + + float inter = w * h; + float ovr = inter / (area_a + area_b - inter); + return ovr; +} + +void nms_sorted_boxes(const std::vector& face_objects, std::vector& picked, float nms_threshold) +{ + picked.clear(); + + const int n = face_objects.size(); + + std::vector areas(n); + for (int i = 0; i < n; i++) + { + areas[i] = face_objects[i].rect.w * face_objects[i].rect.h; + } + + for (int i = 0; i < n; i++) + { + const Face2f& a = face_objects[i]; + + int keep = 1; + for (int j = 0; j < (int)picked.size(); j++) + { + const Face2f& b = face_objects[picked[j]]; + + // intersection over union + float inter_area = iou(a, b); + if (inter_area > nms_threshold) + keep = 0; + } + + if (keep) + picked.push_back(i); + } +} + +void qsort_descent_inplace(std::vector& face_objects, const int& left, const int& right) +{ + int i = left; + int j = right; + + float p = 
face_objects[(left + right) / 2].score; + + while (i <= j) + { + while (face_objects[i].score > p) + i++; + + while (face_objects[j].score < p) + j--; + + if (i <= j) + { + // swap + std::swap(face_objects[i], face_objects[j]); + + i++; + j--; + } + } + + if (left < j) + qsort_descent_inplace(face_objects, left, j); + if (i < right) + qsort_descent_inplace(face_objects, i, right); +} + +void qsort_descent_inplace(std::vector& face_objects) +{ + if (face_objects.empty()) + return; + + qsort_descent_inplace(face_objects, 0, face_objects.size() - 1); +} + +std::vector generate_anchors(int base_size, const std::vector& ratios, const std::vector& scales) +{ + size_t num_ratio = ratios.size(); + size_t num_scale = scales.size(); + + std::vector anchors(num_ratio * num_scale); + + const float cx = (float)base_size * 0.5f; + const float cy = (float)base_size * 0.5f; + + for (int i = 0; i < num_ratio; i++) + { + float ar = ratios[i]; + + int r_w = (int)round((float)base_size / sqrt(ar)); + int r_h = (int)round((float)r_w * ar); // round(base_size * sqrt(ar)); + + for (int j = 0; j < num_scale; j++) + { + float scale = scales[j]; + + float rs_w = (float)r_w * scale; + float rs_h = (float)r_h * scale; + + Box2f& anchor = anchors[i * num_scale + j]; + + anchor.x1 = cx - rs_w * 0.5f; + anchor.y1 = cy - rs_h * 0.5f; + anchor.x2 = cx + rs_w * 0.5f; + anchor.y2 = cy + rs_h * 0.5f; + } + } + + return anchors; +} + +static void generate_proposals(std::vector& anchors, int feat_stride, const float* score_blob, + const int score_dims[], const float* bbox_blob, const int bbox_dims[], + const float* landmark_blob, const int landmark_dims[], const float& prob_threshold, + std::vector& faces) +{ + int w = bbox_dims[3]; + int h = bbox_dims[2]; + int offset = w * h; + + // generate face proposal from bbox deltas and shifted anchors + const int num_anchors = anchors.size(); + + for (int q = 0; q < num_anchors; q++) + { + const Box2f& anchor = anchors[q]; + + const float* score = score_blob + 
(q + num_anchors) * offset; + const float* bbox = bbox_blob + (q * 4) * offset; + const float* landmark = landmark_blob + (q * 10) * offset; + + // shifted anchor + float anchor_y = anchor.y1; + + float anchor_w = anchor.x2 - anchor.x1; + float anchor_h = anchor.y2 - anchor.y1; + + for (int i = 0; i < h; i++) + { + float anchor_x = anchor.x1; + + for (int j = 0; j < w; j++) + { + int index = i * w + j; + + float prob = score[index]; + + if (prob >= prob_threshold) + { + // apply center size + float dx = bbox[index + offset * 0]; + float dy = bbox[index + offset * 1]; + float dw = bbox[index + offset * 2]; + float dh = bbox[index + offset * 3]; + + float cx = anchor_x + anchor_w * 0.5f; + float cy = anchor_y + anchor_h * 0.5f; + + float pb_cx = cx + anchor_w * dx; + float pb_cy = cy + anchor_h * dy; + + float pb_w = anchor_w * exp(dw); + float pb_h = anchor_h * exp(dh); + + float x0 = pb_cx - pb_w * 0.5f; + float y0 = pb_cy - pb_h * 0.5f; + float x1 = pb_cx + pb_w * 0.5f; + float y1 = pb_cy + pb_h * 0.5f; + + Face2f obj{}; + obj.rect.x = x0; + obj.rect.y = y0; + obj.rect.w = x1 - x0 + 1; + obj.rect.h = y1 - y0 + 1; + + obj.landmark[0].x = cx + (anchor_w + 1) * landmark[index + offset * 0]; + obj.landmark[0].y = cy + (anchor_h + 1) * landmark[index + offset * 1]; + obj.landmark[1].x = cx + (anchor_w + 1) * landmark[index + offset * 2]; + obj.landmark[1].y = cy + (anchor_h + 1) * landmark[index + offset * 3]; + obj.landmark[2].x = cx + (anchor_w + 1) * landmark[index + offset * 4]; + obj.landmark[2].y = cy + (anchor_h + 1) * landmark[index + offset * 5]; + obj.landmark[3].x = cx + (anchor_w + 1) * landmark[index + offset * 6]; + obj.landmark[3].y = cy + (anchor_h + 1) * landmark[index + offset * 7]; + obj.landmark[4].x = cx + (anchor_w + 1) * landmark[index + offset * 8]; + obj.landmark[4].y = cy + (anchor_h + 1) * landmark[index + offset * 9]; + + obj.score = prob; + + faces.push_back(obj); + } + + anchor_x += (float)feat_stride; + } + + anchor_y += 
(float)feat_stride; + } + } +} + +int get_input_data(const char* image_file, std::vector& image_data, Size2i& size) +{ + image img = imread(image_file); + + size.width = img.w; + size.height = img.h; + + int img_size = img.w * img.h * img.c; + + img = image_permute(img); + + image_data.resize(img_size); + + memcpy(image_data.data(), img.data, img_size * sizeof(float)); + + free_image(img); + + return img_size; +} + +void show_usage() +{ + printf("[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count] [-n device_name]\n"); +} + +int main(int argc, char* argv[]) +{ + int repeat_count = DEFAULT_REPEAT_COUNT; + int num_thread = DEFAULT_THREAD_COUNT; + + const char* model_file = MODEL_PATH; + const char* image_file = IMAGE_PATH; + const char* device_name = ""; + + int res; + while ((res = getopt(argc, argv, "m:i:r:t:h:n:")) != -1) + { + switch (res) + { + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'n': + device_name = optarg; + break; + case 'h': + show_usage(); + return 0; + default: + break; + } + } + + /* check files */ + if (model_file == nullptr) + { + printf("Error: Tengine model file not specified!\n"); + show_usage(); + return -1; + } + + if (image_file == nullptr) + { + printf("Error: Image file not specified!\n"); + show_usage(); + return -1; + } + + if (!check_file_exist(model_file) || !check_file_exist(image_file)) + return -1; + + /* set runtime options */ + struct options opt; + opt.num_thread = num_thread; + opt.cluster = TENGINE_CLUSTER_ALL; + opt.precision = TENGINE_MODE_FP32; + opt.affinity = 0; + + /* inital tengine */ + int ret = init_tengine(); + if (0 != ret) + { + printf("Init tengine-lite failed.\n"); + return -1; + } + + printf("tengine-lite library version: %s\n", get_tengine_version()); + + /* create graph, load tengine model xxx.tmfile */ + context_t vk_context = 
create_context("VK", 1); + add_context_device(vk_context, "VK"); + graph_t graph = create_graph(vk_context, "tengine", model_file); + set_graph_device(graph, "VK"); + if (graph == nullptr) + { + printf("Load model to graph failed.\n"); + return -1; + } + + /* prepare process input data */ + int target_size = 1024; + int max_size = 1980; + + std::vector image_data; + + Size2i image_size; + // Size2i tensor_size; + + float im_scale; + + int img_size = get_input_data(image_file, image_data, image_size); + + /* set the input shape to initial the graph, and pre-run graph to infer shape */ + int dims[] = {1, 3, image_size.height, image_size.width}; + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + if (nullptr == input_tensor) + { + printf("Get input tensor failed\n"); + return -1; + } + + if (0 != set_tensor_shape(input_tensor, dims, 4)) + { + printf("Set input tensor shape failed\n"); + return -1; + } + + /* set the data mem to input tensor */ + if (set_tensor_buffer(input_tensor, image_data.data(), img_size * sizeof(float)) < 0) + { + printf("Set input tensor buffer failed\n"); + return -1; + } + + /* prerun graph, set work options(num_thread, cluster, precision) */ + if (0 != prerun_graph_multithread(graph, opt)) + { + printf("Pre-run graph failed\n"); + return -1; + } + + /* run graph */ + float min_time = FLT_MAX, max_time = 0, total_time = 0.f; + for (int i = 0; i < repeat_count; i++) + { + double start = get_current_time(); + if (run_graph(graph, 1) < 0) + { + printf("Run graph failed\n"); + return -1; + } + double end = get_current_time(); + + float cur = float(end - start); + + total_time += cur; + min_time = std::min(min_time, cur); + max_time = std::max(max_time, cur); + } + printf("img_h, img_w : %d, %d\n", image_size.height, image_size.width); + printf("Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, + num_thread, total_time / (float)repeat_count, max_time, min_time); + 
printf("--------------------------------------\n"); + + /* process the detection result */ + std::vector face_proposals; + + for (int stride_index = 0; stride_index < 3; stride_index++) + { + // ================================================================== + // ========== This part is to get tensor information ================ + // ================================================================== + tensor_t score_blob_tensor = get_graph_tensor(graph, score_name[stride_index]); + tensor_t bbox_blob_tensor = get_graph_tensor(graph, bbox_name[stride_index]); + tensor_t landmark_blob_tensor = get_graph_tensor(graph, landmark_name[stride_index]); + + int score_blob_dims[MAX_SHAPE_DIM_NUM] = {0}; + int bbox_blob_dims[MAX_SHAPE_DIM_NUM] = {0}; + int landmark_blob_dims[MAX_SHAPE_DIM_NUM] = {0}; + + get_tensor_shape(score_blob_tensor, score_blob_dims, MAX_SHAPE_DIM_NUM); + get_tensor_shape(bbox_blob_tensor, bbox_blob_dims, MAX_SHAPE_DIM_NUM); + get_tensor_shape(landmark_blob_tensor, landmark_blob_dims, MAX_SHAPE_DIM_NUM); + + float* score_blob = (float*)get_tensor_buffer(score_blob_tensor); + float* bbox_blob = (float*)get_tensor_buffer(bbox_blob_tensor); + float* landmark_blob = (float*)get_tensor_buffer(landmark_blob_tensor); + + const int base_size = 16; + const int feat_stride = stride[stride_index]; + + std::vector current_ratios(1); + current_ratios[0] = 1.f; + + std::vector current_scales(2); + current_scales[0] = g_scales[stride_index][0]; + current_scales[1] = g_scales[stride_index][1]; + + const float threshold = CONF_THRESH; + + std::vector anchors = generate_anchors(base_size, current_ratios, current_scales); + + std::vector face_objects; + generate_proposals(anchors, feat_stride, score_blob, score_blob_dims, bbox_blob, bbox_blob_dims, landmark_blob, + landmark_blob_dims, threshold, face_objects); + + face_proposals.insert(face_proposals.end(), face_objects.begin(), face_objects.end()); + } + + // sort all proposals by score from highest to lowest + 
qsort_descent_inplace(face_proposals); + + // apply nms with nms_threshold + std::vector picked; + nms_sorted_boxes(face_proposals, picked, NMS_THRESH); + + int face_count = picked.size(); + + std::vector face_objects(face_count); + for (int i = 0; i < face_count; i++) + { + face_objects[i] = face_proposals[picked[i]]; + + // clip to image size + float x0 = face_objects[i].rect.x; + float y0 = face_objects[i].rect.y; + float x1 = x0 + face_objects[i].rect.w; + float y1 = y0 + face_objects[i].rect.h; + + x0 = std::max(std::min(x0, (float)image_size.width - 1), 0.f); + y0 = std::max(std::min(y0, (float)image_size.height - 1), 0.f); + x1 = std::max(std::min(x1, (float)image_size.width - 1), 0.f); + y1 = std::max(std::min(y1, (float)image_size.height - 1), 0.f); + + face_objects[i].rect.x = x0; + face_objects[i].rect.y = y0; + face_objects[i].rect.w = x1 - x0; + face_objects[i].rect.h = y1 - y0; + } + + image img = imread(image_file); + draw_target(face_objects, img); + + postrun_graph(graph); + destroy_graph(graph); + release_tengine(); + + return 0; +} diff --git a/scripts/mm_bot.py b/scripts/mm_bot.py new file mode 100644 index 000000000..c4436d8b8 --- /dev/null +++ b/scripts/mm_bot.py @@ -0,0 +1,42 @@ +from mattermostdriver import Driver +import requests +import os + +bot_username = 'drone' +server_url = 'mm.conleylee.com' + +def main(): + status = os.environ['DRONE_STAGE_STATUS'] + bot_password = os.environ['MATTERMOST_TOKEN'] + repo = os.environ['DRONE_REPO_NAME'] + branch = os.environ['DRONE_SOURCE_BRANCH'] + repo_link = os.environ['DRONE_REPO_LINK'] + author = os.environ['DRONE_COMMIT_AUTHOR_NAME'] + build_number = os.environ['DRONE_BUILD_NUMBER'] + build_link = os.environ['DRONE_BUILD_LINK'] + + if status == 'success': + message = f'[{repo}/{branch}]({repo_link}/src/branch/{branch}) [build\#{build_number}]({build_link}) {status}. good job!' + else: + message = f'[{repo}/{branch}]({repo_link}/src/branch/{branch}) [build\#{build_number}]({build_link}) {status}. 
follow previous link for more details!' + + bot = Driver({ + 'url': server_url, # no firewall, proxy etc. + 'token': bot_password, + 'port': 443, + 'scheme': 'https', # no SSL issues + 'verify': False, + }) + + bot.login() + my_channel_id = bot.channels.get_channel_by_name_and_team_name( + 'stupidcode', + 'Tengine')['id'] + bot.posts.create_post(options={ + 'channel_id': my_channel_id, + 'message': message, + }) + + +if __name__ == '__main__': + main() diff --git a/source/device/cpu/CMakeLists.txt b/source/device/cpu/CMakeLists.txt index c975cdb66..7702e3b2d 100644 --- a/source/device/cpu/CMakeLists.txt +++ b/source/device/cpu/CMakeLists.txt @@ -279,9 +279,14 @@ IF (TENGINE_COMPILER_GCC OR TENGINE_COMPILER_CLANG) ENDIF() IF (${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv") - LIST (APPEND _CPU_COMPILER_OPTIONS "-march=rv64gcvxthead") + LIST (APPEND _CPU_COMPILER_OPTIONS "-march=rv64gcv") LIST (APPEND _CPU_COMPILER_OPTIONS "-mabi=lp64d") - LIST (APPEND _CPU_COMPILER_OPTIONS "-mfp16") + IF (CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "release" OR CMAKE_BUILD_TYPE STREQUAL "RELEASE") + LIST (APPEND _CPU_COMPILER_OPTIONS "-mtune=thead-c906") + ENDIF() + IF (TENGINE_RV64_RVV_C906) + LIST (APPEND _CPU_COMPILER_OPTIONS "-D__FIX_RVV_C906") + ENDIF() LIST (APPEND _CPU_COMPILER_OPTIONS "-lc") ENDIF() ENDIF() diff --git a/source/device/cpu/cpu_device.c b/source/device/cpu/cpu_device.c index b5bea801f..aecf9045d 100644 --- a/source/device/cpu/cpu_device.c +++ b/source/device/cpu/cpu_device.c @@ -45,6 +45,7 @@ #include "utility/utils.h" #include "utility/log.h" +#include #include int init_cpu(struct device* device) @@ -94,6 +95,17 @@ static int prerun(struct device* dev, struct subgraph* subgraph, void* option) return 0; } +static void fname_normalize(char* fname) +{ + for (char* pos = fname; *pos != '\0'; ++pos) + { + if (*pos == '/') + { + *pos = '_'; + } + } +} + static int run(struct device* dev, struct subgraph* subgraph) { struct exec_graph* exec_graph = 
(struct exec_graph*)subgraph->device_graph; @@ -214,6 +226,26 @@ static int run(struct device* dev, struct subgraph* subgraph) dump_float(fname, ir_tensor->data, ir_tensor->elem_num); } +#endif +#if 0 + struct node* ir_node = node->ir_node; + struct graph* ir_graph = ir_node->graph; + char fname[512]; + + const char* root = getenv("TENGINE_DEBUG_DIR"); + if (!root) root = "./"; + char* pname = fname + sprintf(fname, "%s/", root); + + for (int i = 0; i < ir_node->output_num; ++i) + { + struct tensor* ir_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[i]); + float mean = tensor_mean(ir_tensor); + + fprintf(stderr, "%s output %d, mean: %f\n", ir_node->name, i, mean); + sprintf(pname, "%s_out_%d", ir_node->name, i); + fname_normalize(pname); + save_tensor(fname, ir_tensor->data, ir_tensor->dims, ir_tensor->dim_num); + } #endif } diff --git a/source/device/cpu/cpu_node.h b/source/device/cpu/cpu_node.h index b0c2fa575..2a2c8bd9b 100644 --- a/source/device/cpu/cpu_node.h +++ b/source/device/cpu/cpu_node.h @@ -28,6 +28,7 @@ #include "cpu_define.h" #include +#include struct node; struct node_ops; diff --git a/source/device/cpu/op/absval/absval_ref.c b/source/device/cpu/op/absval/absval_ref.c index 973bbae6d..786a451f6 100644 --- a/source/device/cpu/op/absval/absval_ref.c +++ b/source/device/cpu/op/absval/absval_ref.c @@ -30,6 +30,7 @@ #include "device/cpu/cpu_node.h" #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" +#include #include @@ -85,13 +86,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int 
register_absval_ref_op() { diff --git a/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c b/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c index c01c37a0c..0ec31e0d5 100644 --- a/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c +++ b/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c @@ -109,13 +109,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_absval_hcl_arm_op() { @@ -125,4 +127,4 @@ int register_absval_hcl_arm_op() int unregister_absval_hcl_arm_op() { return unregister_builtin_node_ops(OP_ABSVAL, &hcl_node_ops); -} \ No newline at end of file +} diff --git a/source/device/cpu/op/absval/risc-v/lp64dv/absval_hcl_rv64.c b/source/device/cpu/op/absval/risc-v/lp64dv/absval_hcl_rv64.c new file mode 100644 index 000000000..c79e36103 --- /dev/null +++ b/source/device/cpu/op/absval/risc-v/lp64dv/absval_hcl_rv64.c @@ -0,0 +1,100 @@ +#include "api/c_api.h" +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "op/conv/risc-v/lp64dv/vsetvl_rvv.h" +#include "utility/sys_port.h" +#include "utility/log.h" +#include "device/cpu/cpu_node.h" +#include "device/cpu/cpu_graph.h" +#include "operator/op.h" +#include +#include "device/cpu/cpu_module.h" + +static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + return 0; +} + +static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + return 0; +} + +static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, 
struct exec_graph* exec_graph) +{ + return 0; +} + +static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct node* ir_node = exec_node->ir_node; + struct graph* ir_graph = ir_node->graph; + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); + + const float* input_data = input_tensor->data; + float* output_data = output_tensor->data; + + const int batch = input_tensor->dims[0]; + const int channel = input_tensor->dims[1]; + const int img_size = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + + vsetvl_e32_m2(); + + for (int b = 0; b < batch; ++b) + { + int i = 0; + for (; i < (img_size & -8); i += 8) + { + asm("vle32.v v0, (%0);\n" + "vfabs.v v2, v0;\n" + "vse32.v v2, (%1);\n" + : + : "r"(input_data), "r"(output_data) + : "memory"); + input_data += 8; + output_data += 8; + } + + for (; i < img_size; ++i) + { + *output_data = fabsf(*input_data); + output_data++; + input_data++; + } + } + + return 0; +} + +static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* ir_node) +{ + struct graph* graph = ir_node->graph; + struct tensor* input_tensor = get_ir_graph_tensor(graph, ir_node->input_tensors[0]); + if (input_tensor->data_type != TENGINE_MODE_FP32 || input_tensor->layout != TENGINE_LAYOUT_NCHW) + { + return 0; + } + + return OPS_SCORE_PREFER; +} + +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score}; + +int register_absval_hcl_rv64_op() +{ + return register_builtin_node_ops(OP_ABSVAL, &hcl_node_ops); +} + +int unregister_absval_hcl_rv64_op() +{ + return unregister_builtin_node_ops(OP_ABSVAL, &hcl_node_ops); +} diff --git a/source/device/cpu/op/add_n/add_n_ref.c 
b/source/device/cpu/op/add_n/add_n_ref.c index 559b6cc44..c242dd29d 100644 --- a/source/device/cpu/op/add_n/add_n_ref.c +++ b/source/device/cpu/op/add_n/add_n_ref.c @@ -117,16 +117,27 @@ static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struc static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) { - return OPS_SCORE_BEST; + struct node* ir_node = exec_node; + struct graph* ir_graph = ir_node->graph; + struct tensor* input_tensor; + + input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + + if (input_tensor->data_type != TENGINE_DT_FP32 || input_tensor->layout != TENGINE_LAYOUT_NCHW) + return 0; + + return OPS_SCORE_CANDO; } -static struct node_ops add_n_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops add_n_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_add_n_ref_op() { diff --git a/source/device/cpu/op/add_n/risc-v/lp64dv/add_n_hcl_rv64.c b/source/device/cpu/op/add_n/risc-v/lp64dv/add_n_hcl_rv64.c new file mode 100644 index 000000000..fc7780f6f --- /dev/null +++ b/source/device/cpu/op/add_n/risc-v/lp64dv/add_n_hcl_rv64.c @@ -0,0 +1,183 @@ +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "op/conv/risc-v/lp64dv/vsetvl_rvv.h" +#include "utility/sys_port.h" +#include "utility/log.h" +#include "device/cpu/cpu_node.h" +#include "device/cpu/cpu_graph.h" +#include "device/cpu/cpu_module.h" + +#include + +struct add_n_op_param +{ + int in_num; + void** input_data; +}; + +static int ref_add_n_fp32(const float** input, float* output, int size, const struct add_n_op_param* param) +{ + int in_num = param->in_num; + vsetvl_e32_m2(); + + float* output_data = output; + int i = 0; + for 
(; i < (size & -8); i += 8) + { + asm("vmv.v.x v0, x0;\n"); + int n = 0; + for (; n < (in_num & -8); n += 8) + { + const float** inputs = input + n; + const float* in0 = inputs[0] + i; + const float* in1 = inputs[1] + i; + const float* in2 = inputs[2] + i; + const float* in3 = inputs[3] + i; + const float* in4 = inputs[4] + i; + const float* in5 = inputs[5] + i; + const float* in6 = inputs[6] + i; + const float* in7 = inputs[7] + i; + + asm("vle32.v v2, (%0);\n" + "vle32.v v4, (%1);\n" + "vle32.v v6, (%2);\n" + "vle32.v v8, (%3);\n" + "vle32.v v10, (%4);\n" + "vle32.v v12, (%5);\n" + "vle32.v v14, (%6);\n" + "vle32.v v16, (%7);\n" + "vfadd.vv v0, v0, v2;\n" + "vfadd.vv v0, v0, v4;\n" + "vfadd.vv v0, v0, v6;\n" + "vfadd.vv v0, v0, v8;\n" + "vfadd.vv v0, v0, v10;\n" + "vfadd.vv v0, v0, v12;\n" + "vfadd.vv v0, v0, v14;\n" + "vfadd.vv v0, v0, v16;\n" + : + : "r"(in0), "r"(in1), "r"(in2), "r"(in3), "r"(in4), "r"(in5), "r"(in6), "r"(in7)); + } + + for (; n < in_num; n += 1) + { + const float* in0 = input[n] + i; + asm("vle32.v v2, (%0);\n" + "vfadd.vv v0, v0, v2;\n" + : + : "r"(in0)); + } + + asm("vse32.v v0, (%0);\n" + : + : "r"(output_data) + : "memory"); + output_data += 8; + } + + for (; i < size; i += 1) + { + output[i] = input[0][i]; + for (int n = 1; n < in_num; n++) + { + output[i] += input[n][i]; + } + } + + return 0; +} + +static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct add_n_op_param* add_n_op_param = (struct add_n_op_param*)sys_malloc(sizeof(struct add_n_op_param)); + exec_node->ops_priv = add_n_op_param; + return 0; +} + +static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + sys_free(exec_node->ops_priv); + return 0; +} + +static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct node* ir_node = exec_node->ir_node; + struct graph* ir_graph = ir_node->graph; + struct 
add_n_op_param* add_n_op_param = (struct add_n_op_param*)exec_node->ops_priv; + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + + int in_num = ir_node->input_num; + add_n_op_param->in_num = in_num; + add_n_op_param->input_data = (void**)sys_malloc(sizeof(void*) * in_num); + + return 0; +} + +static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct node* ir_node = exec_node->ir_node; + struct graph* ir_graph = ir_node->graph; + struct tensor* input_tensor_a = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); + + uint32_t elem_num = input_tensor_a->elem_num; + struct add_n_op_param* add_n_op_param = (struct add_n_op_param*)exec_node->ops_priv; + for (int i = 0; i < add_n_op_param->in_num; i++) + { + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[i]); + void* data = input_tensor->data; + add_n_op_param->input_data[i] = data; + } + const void** input = (const void**)add_n_op_param->input_data; + + float* output = (float*)output_tensor->data; + for (uint32_t i = 0; i < elem_num; i++) + { + output[i] = 0; + } + ref_add_n_fp32((const float**)input, output, elem_num, add_n_op_param); + return 0; +} + +static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct add_n_op_param* add_n_op_param = (struct add_n_op_param*)exec_node->ops_priv; + sys_free(add_n_op_param->input_data); + + return 0; +} + +static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) +{ + struct node* ir_node = exec_node; + struct graph* ir_graph = ir_node->graph; + struct tensor* input_tensor; + + input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + + if (input_tensor->data_type != TENGINE_DT_FP32 || input_tensor->layout != TENGINE_LAYOUT_NCHW) + return 
0; + + return OPS_SCORE_PREFER; +} + +static struct node_ops add_n_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; + +int register_add_n_hcl_rv64_op() +{ + return register_builtin_node_ops(OP_ADD_N, &add_n_node_ops); +} + +int unregister_add_n_hcl_rv64_op() +{ + return unregister_builtin_node_ops(OP_ADD_N, &add_n_node_ops); +} diff --git a/source/device/cpu/op/argmax/argmax_ref.c b/source/device/cpu/op/argmax/argmax_ref.c index ba8898a38..f3a810516 100644 --- a/source/device/cpu/op/argmax/argmax_ref.c +++ b/source/device/cpu/op/argmax/argmax_ref.c @@ -77,7 +77,7 @@ static int ref_argmax_fp32(float* input, int* output, const struct argmax_op_par return 0; } -static int ref_argmax_uint8(uint8_t* input, int* output, const struct argmax_op_param* param) +static int ref_argmax_uint8(uint8_t* input, uint8_t* output, const struct argmax_op_param* param) { uint8_t max_value; int max_value_index; @@ -175,13 +175,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct argmax_op_param* argmax_op_param = (struct argmax_op_param*)exec_node->ops_priv; - TLOG_ERR("output_tensor->elem_num:%d\n", output_tensor->elem_num); - TLOG_ERR("output_tensor->elem_size:%d\n", output_tensor->elem_size); - if (input_tensor->data_type == TENGINE_DT_FP32) ref_argmax_fp32((float*)in_data, (int*)out_data, argmax_op_param); else if (input_tensor->data_type == TENGINE_DT_UINT8) - ref_argmax_uint8((uint8_t*)in_data, (int*)out_data, argmax_op_param); + ref_argmax_uint8((uint8_t*)in_data, (uint8_t*)out_data, argmax_op_param); return 0; } @@ -196,13 +193,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops argmax_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = 
score}; +static struct node_ops argmax_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_argmax_ref_op() { diff --git a/source/device/cpu/op/argmin/argmin_ref.c b/source/device/cpu/op/argmin/argmin_ref.c index 58da946b0..ca4f23466 100644 --- a/source/device/cpu/op/argmin/argmin_ref.c +++ b/source/device/cpu/op/argmin/argmin_ref.c @@ -77,7 +77,7 @@ static int ref_argmin_fp32(float* input, int* output, const struct argmin_op_par return 0; } -static int ref_argmin_uint8(uint8_t* input, int* output, const struct argmin_op_param* param) +static int ref_argmin_uint8(uint8_t* input, uint8_t* output, const struct argmin_op_param* param) { uint8_t min_value; int min_value_index; @@ -175,13 +175,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct argmin_op_param* argmin_op_param = (struct argmin_op_param*)exec_node->ops_priv; - TLOG_ERR("output_tensor->elem_num:%d\n", output_tensor->elem_num); - TLOG_ERR("output_tensor->elem_size:%d\n", output_tensor->elem_size); - if (input_tensor->data_type == TENGINE_DT_FP32) ref_argmin_fp32((float*)in_data, (int*)out_data, argmin_op_param); else if (input_tensor->data_type == TENGINE_DT_UINT8) - ref_argmin_uint8((uint8_t*)in_data, (int*)out_data, argmin_op_param); + ref_argmin_uint8((uint8_t*)in_data, (uint8_t*)out_data, argmin_op_param); return 0; } @@ -196,13 +193,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops argmin_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops argmin_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int 
register_argmin_ref_op() { diff --git a/source/device/cpu/op/batchnorm/batchnorm_ref.c b/source/device/cpu/op/batchnorm/batchnorm_ref.c index 5c7c5f526..5c2818aad 100644 --- a/source/device/cpu/op/batchnorm/batchnorm_ref.c +++ b/source/device/cpu/op/batchnorm/batchnorm_ref.c @@ -164,13 +164,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_batchnorm_ref_op() { diff --git a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c index 359b14ee5..2db14b462 100644 --- a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c +++ b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c @@ -145,13 +145,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_batchnorm_hcl_arm_op() { diff --git a/source/device/cpu/op/batchtospacend/batchtospacend_ref.c b/source/device/cpu/op/batchtospacend/batchtospacend_ref.c index 9c9aa6044..a755b6614 100644 --- a/source/device/cpu/op/batchtospacend/batchtospacend_ref.c +++ b/source/device/cpu/op/batchtospacend/batchtospacend_ref.c @@ -116,13 +116,15 @@ static int 
score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_batchtospacend_ref_op() { diff --git a/source/device/cpu/op/bias/bias_ref.c b/source/device/cpu/op/bias/bias_ref.c index 2eb39c085..56c128394 100644 --- a/source/device/cpu/op/bias/bias_ref.c +++ b/source/device/cpu/op/bias/bias_ref.c @@ -101,13 +101,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_bias_ref_op() { diff --git a/source/device/cpu/op/broadmul/broadmul_ref.c b/source/device/cpu/op/broadmul/broadmul_ref.c index 92ed72a28..92bb49cd8 100644 --- a/source/device/cpu/op/broadmul/broadmul_ref.c +++ b/source/device/cpu/op/broadmul/broadmul_ref.c @@ -53,10 +53,6 @@ typedef struct __ref_broadmul_param int out_size; int on_size; int in_size; - float in0_scale; - float in1_scale; - int in0_zero; - int in1_zero; } ref_broadmul_param, *p_ref_broadmul_param; static int ref_broadmul_fp32(float* in0, float* in1, float* out, p_ref_broadmul_param param) @@ -64,6 +60,7 @@ static int ref_broadmul_fp32(float* in0, float* in1, float* out, p_ref_broadmul_ int out_size = param->out_size; int in_size = param->in_size; int on_size = param->on_size; + int 
last_i = 0; for (int o = 0; o < out_size; o++) { @@ -74,6 +71,7 @@ static int ref_broadmul_fp32(float* in0, float* in1, float* out, p_ref_broadmul_ { int index = (o * on_size + j) * in_size + i; out[index] = in0[index] * data1; + last_i = index; } } } @@ -133,13 +131,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_broadmul_ref_op() { diff --git a/source/device/cpu/op/cast/cast_ref.c b/source/device/cpu/op/cast/cast_ref.c index 9eb88fb16..791eb8a1f 100644 --- a/source/device/cpu/op/cast/cast_ref.c +++ b/source/device/cpu/op/cast/cast_ref.c @@ -191,13 +191,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops ref_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops ref_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_cast_ref_op() { diff --git a/source/device/cpu/op/ceil/ceil_ref.c b/source/device/cpu/op/ceil/ceil_ref.c index 95cc44f39..790bdbca1 100644 --- a/source/device/cpu/op/ceil/ceil_ref.c +++ b/source/device/cpu/op/ceil/ceil_ref.c @@ -34,51 +34,22 @@ #include "device/cpu/cpu_module.h" #include +#include int ref_ceil_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) { // dims size = 2 or 3 - if (input_tensor->dim_num < 4) - { - float* input_data 
= (float*)input_tensor->data; - float* out_data = (float*)output_tensor->data; - int total_size = input_tensor->elem_num; - - for (int i = 0; i < total_size; i++) - { - input_data[i] = ceilf(out_data[i]); - } - - return 0; - } - // dims size 3 - else if (input_tensor->dim_num == 4) - { - int w = input_tensor->dims[3]; - int h = output_tensor->dims[2]; - int channels = input_tensor->dims[1]; - int size = h * w; - int c_step = h * w; - - float* input_data = (float*)input_tensor->data; - float* out_data = (float*)output_tensor->data; + float* input_data = (float*)input_tensor->data; + float* out_data = (float*)output_tensor->data; + int total_size = input_tensor->elem_num; #pragma omp parallel for num_threads(num_thread) - for (int q = 0; q < channels; q++) - { - float* src = input_data + c_step * q; - float* dst = out_data + c_step * q; - - for (int i = 0; i < size; i++) - { - dst[i] = ceilf(src[i]); - } - } - - return 0; + for (int i = 0; i < total_size; i++) + { + out_data[i] = ceilf(input_data[i]); } - return -1; + return 0; } int ref_ceil_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) @@ -101,40 +72,12 @@ int ref_ceil_uint8(struct tensor* input_tensor, struct tensor* output_tensor, in input_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; } - // dims size = 2 or 3 - if (input_tensor->dim_num < 4) - { - int total_size = input_tensor->elem_num; - - for (int i = 0; i < total_size; i++) - { - input_data[i] = ceil(out_data[i]); - } - - // return 0; - } - // dims size 3 - else if (input_tensor->dim_num == 4) - { - int w = input_tensor->dims[3]; - int h = output_tensor->dims[2]; - int channels = input_tensor->dims[1]; - int size = h * w; - int c_step = h * w; + int total_size = input_tensor->elem_num; #pragma omp parallel for num_threads(num_thread) - for (int q = 0; q < channels; q++) - { - float* src = input_data + c_step * q; - float* dst = out_data + c_step * q; - - for (int i = 0; i < size; i++) - { - dst[i] = 
ceil(src[i]); - } - } - - // return 0; + for (int i = 0; i < total_size; i++) + { + out_data[i] = ceil(input_data[i]); } /* quant */ @@ -192,13 +135,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_ceil_ref_op() { diff --git a/source/device/cpu/op/clip/clip_ref.c b/source/device/cpu/op/clip/clip_ref.c index 2582ef334..288a04194 100644 --- a/source/device/cpu/op/clip/clip_ref.c +++ b/source/device/cpu/op/clip/clip_ref.c @@ -84,13 +84,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_clip_ref_op() { diff --git a/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c b/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c index bfa3e4b70..8fa3719c4 100644 --- a/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c +++ b/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c @@ -43,7 +43,7 @@ void comp_equal(int input_hw, int input_hw_1, int input_count4, int input1_count } else if (input_count4 == 1) { - for (int i = 0; i < input_count4; ++i) + for (int i = 0; i < input1_count4; ++i) { *output++ = (input0[0] == input1[i]); } @@ -107,7 +107,7 @@ 
void comp_nequal(int input_hw, int input_hw_1, int input_count4, int input1_coun } else if (input_count4 == 1) { - for (int i = 0; i < input_count4; ++i) + for (int i = 0; i < input1_count4; ++i) { *output++ = (input0[0] != input1[i]); } @@ -171,7 +171,7 @@ void comp_less(int input_hw, int input_hw_1, int input_count4, int input1_count4 } else if (input_count4 == 1) { - for (int i = 0; i < input_count4; ++i) + for (int i = 0; i < input1_count4; ++i) { *output++ = (input0[0] < input1[i]); } @@ -235,7 +235,7 @@ void comp_lesse(int input_hw, int input_hw_1, int input_count4, int input1_count } else if (input_count4 == 1) { - for (int i = 0; i < input_count4; ++i) + for (int i = 0; i < input1_count4; ++i) { *output++ = (input0[0] <= input1[i]); } @@ -299,7 +299,7 @@ void comp_greater(int input_hw, int input_hw_1, int input_count4, int input1_cou } else if (input_count4 == 1) { - for (int i = 0; i < input_count4; ++i) + for (int i = 0; i < input1_count4; ++i) { *output++ = (input0[0] > input1[i]); } @@ -363,7 +363,7 @@ void comp_greatere(int input_hw, int input_hw_1, int input_count4, int input1_co } else if (input_count4 == 1) { - for (int i = 0; i < input_count4; ++i) + for (int i = 0; i < input1_count4; ++i) { *output++ = (input0[0] >= input1[i]); } diff --git a/source/device/cpu/op/comparison/comparison_ref.c b/source/device/cpu/op/comparison/comparison_ref.c index 14405732c..fb7e211a4 100644 --- a/source/device/cpu/op/comparison/comparison_ref.c +++ b/source/device/cpu/op/comparison/comparison_ref.c @@ -69,17 +69,35 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex void* output = output_tensor->data; _comparison_param op_param; - int ii = 0; - op_param.shape1[0] = input_tensor1->dims[ii++]; - op_param.shape1[1] = input_tensor1->dims[ii++]; - op_param.shape1[2] = input_tensor1->dims[ii++]; - op_param.shape1[3] = input_tensor1->dims[ii++]; - - ii = 0; - op_param.shape0[0] = input_tensor->dims[ii++]; - op_param.shape0[1] = 
input_tensor->dims[ii++]; - op_param.shape0[2] = input_tensor->dims[ii++]; - op_param.shape0[3] = input_tensor->dims[ii++]; + if (input_tensor1->dim_num == 4) + { + op_param.shape1[0] = input_tensor1->dims[0]; + op_param.shape1[1] = input_tensor1->dims[1]; + op_param.shape1[2] = input_tensor1->dims[2]; + op_param.shape1[3] = input_tensor1->dims[3]; + } + else if (input_tensor1->dim_num == 1) + { + op_param.shape1[0] = 1; + op_param.shape1[1] = input_tensor1->dims[0]; + op_param.shape1[2] = 1; + op_param.shape1[3] = 1; + } + + if (input_tensor->dim_num == 4) + { + op_param.shape0[0] = input_tensor->dims[0]; + op_param.shape0[1] = input_tensor->dims[1]; + op_param.shape0[2] = input_tensor->dims[2]; + op_param.shape0[3] = input_tensor->dims[3]; + } + else if (input_tensor->dim_num == 1) + { + op_param.shape0[0] = 1; + op_param.shape0[1] = input_tensor->dims[0]; + op_param.shape0[2] = 1; + op_param.shape0[3] = 1; + } op_param.layout = input_tensor->layout; op_param.type = param->type; @@ -92,13 +110,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_comparison_ref_op() { diff --git a/source/device/cpu/op/concat/concat_ref.c b/source/device/cpu/op/concat/concat_ref.c index 854f3a8a1..6a7939ac2 100644 --- a/source/device/cpu/op/concat/concat_ref.c +++ b/source/device/cpu/op/concat/concat_ref.c @@ -86,7 +86,8 @@ static struct node_ops hcl_node_ops = { .postrun = NULL, .init_node = init_node, .release_node = release_node, - .score = score}; + .score = score, +}; int register_concat_ref_op() { diff --git 
a/source/device/cpu/op/conv/conv_ref.c b/source/device/cpu/op/conv/conv_ref.c index 8f655f580..ea29309b8 100644 --- a/source/device/cpu/op/conv/conv_ref.c +++ b/source/device/cpu/op/conv/conv_ref.c @@ -199,13 +199,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_conv_ref_op() { diff --git a/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c b/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c index 5958c7c38..f68d5e3d4 100644 --- a/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c +++ b/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c @@ -468,7 +468,8 @@ static struct node_ops hcl_node_ops = { .postrun = postrun, .init_node = init_node, .release_node = release_node, - .score = score}; + .score = score, +}; int register_conv_hcl_arm_op() { diff --git a/source/device/cpu/op/conv/cortex-m/conv_cmsis.c b/source/device/cpu/op/conv/cortex-m/conv_cmsis.c index f9057f0b6..150878790 100644 --- a/source/device/cpu/op/conv/cortex-m/conv_cmsis.c +++ b/source/device/cpu/op/conv/cortex-m/conv_cmsis.c @@ -134,13 +134,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops cmsis_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops cmsis_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int 
register_conv_cmsis_op() { diff --git a/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c b/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c index 095dc59f8..62d822a14 100644 --- a/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c +++ b/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c @@ -113,13 +113,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_conv_dw_hcl_mips_op() { diff --git a/source/device/cpu/op/conv/mips/conv_hcl_mips.c b/source/device/cpu/op/conv/mips/conv_hcl_mips.c index baa067b77..34b8619bd 100644 --- a/source/device/cpu/op/conv/mips/conv_hcl_mips.c +++ b/source/device/cpu/op/conv/mips/conv_hcl_mips.c @@ -241,13 +241,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_PREFER; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_conv_hcl_mips_op() { diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c index 338827acd..936f1457f 100644 --- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c @@ -113,20 +113,22 @@ static int score(struct node_ops* node_ops, struct 
exec_graph* exec_graph, struc return 0; if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) - return OPS_SCORE_BEST; + return OPS_SCORE_PREFER; else if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 5 && kernel_w == 5 && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) - return OPS_SCORE_BEST; + return OPS_SCORE_PREFER; else return 0; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_conv_dw_hcl_rv64_op() { diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_packn_hcl_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_packn_hcl_rv64.c new file mode 100644 index 000000000..398575aa1 --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_packn_hcl_rv64.c @@ -0,0 +1,146 @@ +#include "convolution_param.h" +#include "api/c_api.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "device/cpu/cpu_graph.h" +#include "device/cpu/cpu_node.h" +#include "device/cpu/cpu_module.h" +#include "utility/sys_port.h" +#include +#include + +extern int conv_dw_packn_kernel_run(const ir_node_t* ir_node, const ir_tensor_t* input_tensor, const ir_tensor_t* filter_tensor, const ir_tensor_t* bias_tensor, ir_tensor_t* output_tensor, const struct conv_priv_info* priv_info, const struct conv_param* params, const int num_thread, const int cpu_affinity); +extern int conv_dw_packn_kernel_prerun(const 
ir_node_t* ir_node, const ir_tensor_t* input_tensor, const ir_tensor_t* filter_tensor, struct conv_priv_info* info, struct conv_param* params); +extern int conv_dw_packn_kernel_postrun(const ir_node_t* ir_node, struct conv_priv_info* info); + +static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + const ir_node_t* ir_node = exec_node->ir_node; + ir_graph_t* ir_graph = ir_node->graph; + const ir_tensor_t* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + const ir_tensor_t* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); + const ir_tensor_t* bias_tensor = NULL; + ir_tensor_t* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); + const int num_thread = exec_graph->num_thread; + const int cpu_affinity = exec_graph->cpu_affinity; + + if (ir_node->input_num > 2) + { + bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); + } + + const struct conv_param* params = (const struct conv_param*)ir_node->op.param_mem; + const struct conv_priv_info* info = (const struct conv_priv_info*)exec_node->ops_priv; + + if (exec_graph->mode != TENGINE_MODE_FP32) + { + return -1; + } + + return conv_dw_packn_kernel_run(ir_node, input_tensor, filter_tensor, bias_tensor, output_tensor, info, params, num_thread, cpu_affinity); +} + +static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct conv_priv_info* info = sys_malloc(sizeof(struct conv_priv_info)); + if (!info) + { + return -1; + } + + memset(info, 0, sizeof(*info)); + exec_node->ops_priv = info; + + return 0; +} + +static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct conv_priv_info* info = exec_node->ops_priv; + sys_free(info); + exec_node->ops_priv = NULL; + return 0; +} + +static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* 
ir_node) +{ + struct conv_param* param = (struct conv_param*)ir_node->op.param_mem; + struct graph* ir_graph = ir_node->graph; + + struct tensor* input_tensor; + struct tensor* output_tensor; + + int group = param->group; + int kernel_h = param->kernel_h; + int kernel_w = param->kernel_w; + int stride_h = param->stride_h; + int stride_w = param->stride_w; + int dilation_h = param->dilation_h; + int dilation_w = param->dilation_w; + int pad_h0 = param->pad_h0; + int pad_w0 = param->pad_w0; + int pad_h1 = param->pad_h1; + int pad_w1 = param->pad_w1; + + input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); + + int in_c = input_tensor->dims[1] / group; + int out_c = output_tensor->dims[1] / group; + int outh = output_tensor->dims[2]; + int outw = output_tensor->dims[3]; + + if (!(input_tensor->data_type == TENGINE_DT_FP32)) + return 0; + + if (kernel_h != kernel_w || input_tensor->dims[0] > 1) + return 0; + + if (param->group > 1 + && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 + && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 + && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) + return OPS_SCORE_BEST; + else + return 0; +} + +static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + const ir_node_t* ir_node = exec_node->ir_node; + ir_graph_t* ir_graph = ir_node->graph; + const ir_tensor_t* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + const ir_tensor_t* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); + struct conv_priv_info* info = (struct conv_priv_info*)exec_node->ops_priv; + + struct conv_param* params = (struct conv_param*)ir_node->op.param_mem; + return conv_dw_packn_kernel_prerun(ir_node, input_tensor, filter_tensor, info, params); +} + +static int postrun(struct node_ops* node_ops, struct 
exec_node* exec_node, struct exec_graph* exec_graph) +{ + const ir_node_t* ir_node = exec_node->ir_node; + struct conv_priv_info* info = (struct conv_priv_info*)exec_node->ops_priv; + return conv_dw_packn_kernel_postrun(ir_node, info); +} + +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score}; + +int register_conv_dw_packn_hcl_rv64_op() +{ + return register_builtin_node_ops(OP_CONV, &hcl_node_ops); +} + +int unregister_conv_dw_packn_hcl_rv64_op() +{ + return unregister_builtin_node_ops(OP_CONV, &hcl_node_ops); +} diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_packn_kernel_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_packn_kernel_rv64.c new file mode 100644 index 000000000..0d0b83625 --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_packn_kernel_rv64.c @@ -0,0 +1,1747 @@ +#include "api/c_api.h" +#include +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "device/cpu/cpu_node.h" +#include "device/cpu/cpu_graph.h" +#include "device/cpu/cpu_module.h" +#include "op/conv/risc-v/lp64dv/vsetvl_rvv.h" +#include "utility/sys_port.h" +#include +#include "utility/sys_port.h" +#include "convolution_param.h" + +#define __likely(x) __builtin_expect(!!(x), 1) +#define __unlikely(x) __builtin_expect(!!(x), 0) +#define max(a, b) ((a) > (b) ? (a) : (b)) +#define min(a, b) ((a) < (b) ? 
(a) : (b)) + +// TODO: vectorize +static void pad(const float* input, float* output, const int in_h, const int in_w, const int out_h, const int out_w, const int top, const int left, const float v) +{ + float* ptr = input; + float* outptr = output; + + int y = 0; + // fill top + for (; y < top; y++) + { + int x = 0; + for (; x < out_w; x++) + { + outptr[x] = v; + } + outptr += out_w; + } + // fill center + for (; y < (top + in_h); y++) + { + int x = 0; + for (; x < left; x++) + { + outptr[x] = v; + } + if (in_w < 12) + { + for (; x < (left + in_w); x++) + { + outptr[x] = ptr[x - left]; + } + } + else + { + memcpy(outptr + left, ptr, in_w * sizeof(float)); + x += in_w; + } + for (; x < out_w; x++) + { + outptr[x] = v; + } + ptr += in_w; + outptr += out_w; + } + // fill bottom + for (; y < out_h; y++) + { + int x = 0; + for (; x < out_w; x++) + { + outptr[x] = v; + } + outptr += out_w; + } +} + +static void do_pack(const float* input, float* output, const int channels, const int feat_size, const int packn) +{ + const int channels_packed = (channels + packn - 1) / packn; + const int feat_size_packed = feat_size * packn; + const int input_num = channels * feat_size; + + int in = 0; + + for (int c = 0; c < channels_packed; ++c) + { + for (int i = 0; i < feat_size_packed; i += packn) + { + float* output_base = output + c * feat_size_packed + i; + for (int k = 0; k < packn; ++k) + { + in = c * feat_size_packed + i / packn + k * feat_size; + if (__likely(in < input_num)) + { + output_base[k] = input[in]; + } + else + { + output_base[k] = .0f; + } + } + } + } +} + +// channels: packed_channels, feat_size: packed_feat_size +static void do_unpack(const float* packed, float* unpacked, const int packed_channels, const int packed_feat_size, const int unpacked_channels, const int packn) +{ + const int feat_size = packed_feat_size / packn; + const int unpacked_num = unpacked_channels * packed_feat_size / packn; + + for (int c = 0; c < packed_channels; ++c) + { + for (int i = 0; i < 
packed_feat_size; i += packn) + { + const float* packed_base = packed + c * packed_feat_size + i; + for (int k = 0; k < packn; ++k) + { + int out = c * packed_feat_size + i / packn + k * feat_size; + if (__likely(out < unpacked_num)) + { + unpacked[out] = packed_base[k]; + } + } + } + } +} + +int conv_dw_packn_kernel_prerun(const ir_node_t* ir_node, const ir_tensor_t* input_tensor, const ir_tensor_t* filter_tensor, struct conv_priv_info* info, struct conv_param* params) +{ + const int inb = input_tensor->dims[0]; + const int inc = input_tensor->dims[1]; + const int inh = input_tensor->dims[2]; + const int inw = input_tensor->dims[3]; + + const int pad_w = params->pad_w0; + const int pad_h = params->pad_h0; + const int inh_pad = inh + pad_h + pad_h; + const int inw_pad = inw + pad_w + pad_w; + + if (inh_pad == inh && inw_pad == inw) + { + return 0; + } + + if (!info->input_pad) + { + info->input_pad = sys_malloc(inb * inh_pad * inw_pad * inc * sizeof(float)); + } + + return 0; +} + +int conv_dw_packn_kernel_postrun(const ir_node_t* ir_node, struct conv_priv_info* info) +{ + if (info->input_pad) + { + sys_free(info->input_pad); + } + + return 0; +} + +void convdw3x3s1_pack8_rvv(const float* input, const float* kernel, const float* bias, float* output, const int inc, const int inh, const int inw, const int outc, const int outh, const int outw, const int act, const struct conv_param* params, int num_thread) +{ + const int packn = 8; + vsetvl_e32_m2(); + +#pragma omp parallel for num_threads(num_thread) + for (int c = 0; c < inc; ++c) + { + const float* feat_map = input + c * inh * inw; + const float* kernel_base = kernel + c * 9; + const float* bias_base = bias ? 
bias + c : NULL; + + __asm__( + "vle32.v v18, (%0);\n" + + "vrgather.vi v0, v18, 0;\n" + "vrgather.vi v2, v18, 1;\n" + "vrgather.vi v4, v18, 2;\n" + "vrgather.vi v6, v18, 3;\n" + "vrgather.vi v8, v18, 4;\n" + "vrgather.vi v10, v18, 5;\n" + "vrgather.vi v12, v18, 6;\n" + "vrgather.vi v14, v18, 7;\n" + + "lw t0, 32(%0);" + "vmv.v.x v16, t0;\n" + : + : "r"(kernel_base) + : "t0"); + + float* output_base = output + c * outw * outh; + + int h = 0; + for (; h < (outh & -2); h += 2) + { + const float* row0 = feat_map + h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + const float* row3 = row2 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v18, t0;\n" + "vmv.v.x v20, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v18, x0;\n" + "vmv.v.x v20, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "vle32.v v22, (%1);\n" + "addi t0, %1, 4;\n" + "vle32.v v24, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv v18, v0, v22;\n" + "vfmacc.vv v18, v2, v24;\n" + "vfmacc.vv v18, v4, v26;\n" + + "vle32.v v22, (%2);\n" + "addi t0, %2, 4;\n" + "vle32.v v24, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv v18, v6, v22;\n" + "vfmacc.vv v18, v8, v24;\n" + "vfmacc.vv v18, v10, v26;\n" + + "vfmacc.vv v20, v0, v22;\n" + "vfmacc.vv v20, v2, v24;\n" + "vfmacc.vv v20, v4, v26;\n" + + "vle32.v v22, (%3);\n" + "addi t0, %3, 4;\n" + "vle32.v v24, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv v18, v12, v22;\n" + "vfmacc.vv v18, v14, v24;\n" + "vfmacc.vv v18, v16, v26;\n" + + "vfmacc.vv v20, v6, v22;\n" + "vfmacc.vv v20, v8, v24;\n" + "vfmacc.vv v20, v10, v26;\n" + + "vle32.v v22, (%4);\n" + "addi t0, %4, 4;\n" + "vle32.v v24, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv v20, v12, v22;\n" + "vfmacc.vv v20, v14, v24;\n" + "vfmacc.vv 
v20, v16, v26;\n" + : + : "r"(output_base), "r"(row0), "r"(row1), "r"(row2), "r"(row3) + : "t0"); + + if (act == 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vfmax.vv v18, v18, v22;\n" + "vfmax.vv v20, v20, v22;\n"); + } + else if (act > 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vmv.v.x v24, %0;\n" + "vfmax.vv v18, v18, v22;\n" + "vfmin.vv v18, v18, v24;\n" + "vfmax.vv v20, v20, v22;\n" + "vfmin.vv v20, v20, v24;\n" + : + : "r"(act)); + } + + __asm__("vse32.v v18, (%0);\n" ::"r"(output_base)); + __asm__("vse32.v v20, (%0);\n" ::"r"(output_base + outw)); + + row0 += packn; + row1 += packn; + row2 += packn; + row3 += packn; + output_base += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + float bias_value = bias_base ? 
bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + const float i30 = row3[0]; + const float i31 = row3[1]; + const float i32 = row3[2]; + + float out1 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + float out2 = (k00 * i10 + k01 * i11 + k02 * i12 + k10 * i20 + k11 * i21 + k12 * i22 + k20 * i30 + k21 * i31 + k22 * i32 + bias_value); + + if (act >= 0) + { + out1 = max(out1, .0f); + out2 = max(out2, .0f); + if (act > 0) + { + out1 = min(out1, (float)act); + out2 = min(out2, (float)act); + } + } + + *output_base = out1; + *(output_base + outw) = out2; + + output_base += 1; + row0 += 1; + row1 += 1; + row2 += 1; + row3 += 1; + } + + output_base += outw; + } + + for (; h < outh; ++h) + { + const float* row0 = feat_map + h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v18, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v18, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "vle32.v v22, (%0);\n" + "addi t0, %0, 4;\n" + "vle32.v v24, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv v18, v0, v22;\n" + "vfmacc.vv v18, v2, v24;\n" + "vfmacc.vv v18, v4, v26;\n" + + "vle32.v v22, (%1);\n" + "addi t0, %1, 4;\n" + "vle32.v v24, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv v18, v6, v22;\n" + "vfmacc.vv v18, v8, v24;\n" + "vfmacc.vv v18, v10, v26;\n" + + "vle32.v v22, (%2);\n" + "addi t0, %2, 4;\n" + "vle32.v v24, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv 
v18, v12, v22;\n" + "vfmacc.vv v18, v14, v24;\n" + "vfmacc.vv v18, v16, v26;\n" + : + : "r"(row0), "r"(row1), "r"(row2) + : "t0"); + + if (act == 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vfmax.vv v18, v18, v22;\n"); + } + else if (act > 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vmv.v.x v24, %0;\n" + "vfmax.vv v18, v18, v22;\n" + "vfmin.vv v18, v18, v24;\n" + : + : "r"(act)); + } + + __asm__("vse32.v v18, (%0);\n" ::"r"(output_base)); + + row0 += packn; + row1 += packn; + row2 += packn; + output_base += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + const float bias_value = bias_base ? bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + + float out1 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + + if (act >= 0) + { + out1 = max(out1, .0f); + if (act > 0) + { + out1 = min(out1, (float)act); + } + } + + *output_base = out1; + + output_base += 1; + row0 += 1; + row1 += 1; + row2 += 1; + } + + output_base += outw; + } + } +} + +void convdw3x3s1_pack4_rvv(const float* input, const float* kernel, const float* bias, float* output, const int inc, const int inh, const int inw, const int outc, const int outh, const int outw, const int act, const struct conv_param* params, int num_thread) +{ + const int packn = 4; + vsetvl_e32_m1(); + +#pragma omp parallel for num_threads(num_thread) + for (int c = 0; c < inc; ++c) + { + const float* feat_map = input + c * 
inh * inw; + const float* kernel_base = kernel + c * 9; + const float* bias_base = bias ? bias + c : NULL; + + __asm__( + "vle32.v v9, (%0);\n" + "addi t0, %0, 16;\n" + "vle32.v v10, (t0);\n" + + "vrgather.vi v0, v9, 0;\n" + "vrgather.vi v1, v9, 1;\n" + "vrgather.vi v2, v9, 2;\n" + "vrgather.vi v3, v9, 3;\n" + "vrgather.vi v4, v10, 0;\n" + "vrgather.vi v5, v10, 1;\n" + "vrgather.vi v6, v10, 2;\n" + "vrgather.vi v7, v10, 3;\n" + + "lw t0, 32(%0);" + "vmv.v.x v8, t0;\n" + : + : "r"(kernel_base) + : "t0"); + + float* out0 = output + c * outw * outh; + float* out1 = out0 + outw; + float* out2 = out1 + outw; + float* out3 = out2 + outw; + + int h = 0; + for (; h < (outh & -4); h += 4) + { + const float* row0 = feat_map + h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + const float* row3 = row2 + inw; + const float* row4 = row3 + inw; + const float* row5 = row4 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v28, t0;\n" + "vmv.v.x v29, t0;\n" + "vmv.v.x v30, t0;\n" + "vmv.v.x v31, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v28, x0;\n" + "vmv.v.x v29, x0;\n" + "vmv.v.x v30, x0;\n" + "vmv.v.x v31, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "vle32.v v9, (%0);\n" + "addi t0, %0, 4;\n" + "vle32.v v10, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v11, (t0);\n" + + "vfmacc.vv v28, v0, v9;\n" + "vfmacc.vv v28, v1, v10;\n" + "vfmacc.vv v28, v2, v11;\n" + + "vle32.v v12, (%1);\n" + "addi t0, %1, 4;\n" + "vle32.v v13, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v14, (t0);\n" + + "vfmacc.vv v28, v3, v12;\n" + "vfmacc.vv v28, v4, v13;\n" + "vfmacc.vv v28, v5, v14;\n" + + "vfmacc.vv v29, v0, v12;\n" + "vfmacc.vv v29, v1, v13;\n" + "vfmacc.vv v29, v2, v14;\n" + + "vle32.v v15, (%2);\n" + "addi t0, %2, 4;\n" + "vle32.v v16, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v17, (t0);\n" + + "vfmacc.vv v28, v6, 
v15;\n" + "vfmacc.vv v28, v7, v16;\n" + "vfmacc.vv v28, v8, v17;\n" + + "vfmacc.vv v29, v3, v15;\n" + "vfmacc.vv v29, v4, v16;\n" + "vfmacc.vv v29, v5, v17;\n" + + "vfmacc.vv v30, v0, v15;\n" + "vfmacc.vv v30, v1, v16;\n" + "vfmacc.vv v30, v2, v17;\n" + + "vle32.v v18, (%3);\n" + "addi t0, %3, 4;\n" + "vle32.v v19, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v20, (t0);\n" + + "vfmacc.vv v29, v6, v18;\n" + "vfmacc.vv v29, v7, v19;\n" + "vfmacc.vv v29, v8, v20;\n" + + "vfmacc.vv v30, v3, v18;\n" + "vfmacc.vv v30, v4, v19;\n" + "vfmacc.vv v30, v5, v20;\n" + + "vfmacc.vv v31, v0, v18;\n" + "vfmacc.vv v31, v1, v19;\n" + "vfmacc.vv v31, v2, v20;\n" + + "vle32.v v21, (%4);\n" + "addi t0, %4, 4;\n" + "vle32.v v22, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v23, (t0);\n" + + "vfmacc.vv v30, v6, v21;\n" + "vfmacc.vv v30, v7, v22;\n" + "vfmacc.vv v30, v8, v23;\n" + + "vfmacc.vv v31, v3, v21;\n" + "vfmacc.vv v31, v4, v22;\n" + "vfmacc.vv v31, v5, v23;\n" + + "vle32.v v24, (%5);\n" + "addi t0, %5, 4;\n" + "vle32.v v25, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv v31, v6, v24;\n" + "vfmacc.vv v31, v7, v25;\n" + "vfmacc.vv v31, v8, v26;\n" + : + : "r"(row0), "r"(row1), "r"(row2), "r"(row3), "r"(row4), "r"(row5) + : "t0"); + + if (act == 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vfmax.vv v28, v28, v22;\n" + "vfmax.vv v29, v29, v22;\n" + "vfmax.vv v30, v30, v22;\n" + "vfmax.vv v31, v31, v22;\n"); + } + else if (act > 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vmv.v.x v23, %0;\n" + "vfmax.vv v28, v28, v22;\n" + "vfmin.vv v28, v28, v23;\n" + "vfmax.vv v29, v29, v22;\n" + "vfmin.vv v29, v29, v23;\n" + "vfmax.vv v30, v30, v22;\n" + "vfmin.vv v30, v30, v23;\n" + "vfmax.vv v31, v31, v22;\n" + "vfmin.vv v31, v31, v23;\n" + : + : "r"(act)); + } + + __asm__("vse32.v v28, (%0);\n" + "vse32.v v29, (%1);\n" + "vse32.v v30, (%2);\n" + "vse32.v v31, (%3);\n" + : + : "r"(out0), "r"(out1), "r"(out2), "r"(out3)); + + row0 += packn; + row1 += packn; + row2 += packn; + row3 += 
packn; + row4 += packn; + row5 += packn; + + out0 += packn; + out1 += packn; + out2 += packn; + out3 += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + const float bias_value = bias_base ? bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + + const float i30 = row3[0]; + const float i31 = row3[1]; + const float i32 = row3[2]; + + const float i40 = row4[0]; + const float i41 = row4[1]; + const float i42 = row4[2]; + + const float i50 = row5[0]; + const float i51 = row5[1]; + const float i52 = row5[2]; + + float v0 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + float v1 = (k00 * i10 + k01 * i11 + k02 * i12 + k10 * i20 + k11 * i21 + k12 * i22 + k20 * i30 + k21 * i31 + k22 * i32 + bias_value); + float v2 = (k00 * i20 + k01 * i21 + k02 * i22 + k10 * i30 + k11 * i31 + k12 * i32 + k20 * i40 + k21 * i41 + k22 * i42 + bias_value); + float v3 = (k00 * i30 + k01 * i31 + k02 * i32 + k10 * i40 + k11 * i41 + k12 * i42 + k20 * i50 + k21 * i51 + k22 * i52 + bias_value); + + if (act >= 0) + { + v0 = max(v0, .0f); + v1 = max(v1, .0f); + v2 = max(v2, .0f); + v3 = max(v3, .0f); + + if (act > 0) + { + v0 = min(v0, (float)act); + v1 = min(v1, (float)act); + v2 = min(v2, (float)act); + v3 = min(v3, (float)act); + } + } + + *out0 = v0; + *out1 = v1; + *out2 = v2; + *out3 = v3; + + out0 += 1; + out1 += 1; + out2 += 1; + out3 += 1; + + row0 += 1; + row1 += 1; + row2 
+= 1; + row3 += 1; + row4 += 1; + row5 += 1; + } + + out0 += 3 * outw; + out1 += 3 * outw; + out2 += 3 * outw; + out3 += 3 * outw; + } + + for (; h < outh; ++h) + { + const float* row0 = feat_map + h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v28, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v28, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "vle32.v v9, (%0);\n" + "addi t0, %0, 4;\n" + "vle32.v v10, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v11, (t0);\n" + + "vfmacc.vv v28, v0, v9;\n" + "vfmacc.vv v28, v1, v10;\n" + "vfmacc.vv v28, v2, v11;\n" + + "vle32.v v9, (%1);\n" + "addi t0, %1, 4;\n" + "vle32.v v10, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v11, (t0);\n" + + "vfmacc.vv v28, v3, v9;\n" + "vfmacc.vv v28, v4, v10;\n" + "vfmacc.vv v28, v5, v11;\n" + + "vle32.v v9, (%2);\n" + "addi t0, %2, 4;\n" + "vle32.v v10, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v11, (t0);\n" + + "vfmacc.vv v28, v6, v9;\n" + "vfmacc.vv v28, v7, v10;\n" + "vfmacc.vv v28, v8, v11;\n" + : + : "r"(row0), "r"(row1), "r"(row2) + : "t0"); + + if (act == 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vfmax.vv v28, v28, v22;\n"); + } + else if (act > 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vmv.v.x v23, %0;\n" + "vfmax.vv v28, v28, v22;\n" + "vfmin.vv v28, v28, v23;\n" + : + : "r"(act)); + } + + __asm__("vse32.v v28, (%0);\n" + : + : "r"(out0)); + + row0 += packn; + row1 += packn; + row2 += packn; + + out0 += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + const float bias_value = bias_base 
? bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + + float v0 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + + if (act >= 0) + { + v0 = max(v0, .0f); + + if (act > 0) + { + v0 = min(v0, (float)act); + } + } + + *out0 = v0; + out0 += 1; + + row0 += 1; + row1 += 1; + row2 += 1; + } + } + } +} + +void convdw3x3s2_pack4_rvv(const float* input, const float* kernel, const float* bias, float* output, const int inc, const int inh, const int inw, const int outc, const int outh, const int outw, const int act, const struct conv_param* params, int num_thread) +{ + const int packn = 4; + vsetvl_e32_m1(); + +#pragma omp parallel for num_threads(num_thread) + for (int c = 0; c < inc; ++c) + { + const float* feat_map = input + c * inh * inw; + const float* kernel_base = kernel + c * 9; + const float* bias_base = bias ? 
bias + c : NULL; + __asm__( + "vle32.v v9, (%0);\n" + "addi t0, %0, 16;\n" + "vle32.v v10, (t0);\n" + + "vrgather.vi v0, v9, 0;\n" + "vrgather.vi v1, v9, 1;\n" + "vrgather.vi v2, v9, 2;\n" + "vrgather.vi v3, v9, 3;\n" + "vrgather.vi v4, v10, 0;\n" + "vrgather.vi v5, v10, 1;\n" + "vrgather.vi v6, v10, 2;\n" + "vrgather.vi v7, v10, 3;\n" + + "lw t0, 32(%0);" + "vmv.v.x v8, t0;\n" + : + : "r"(kernel_base) + : "t0"); + + float* out0 = output + c * outw * outh; + float* out1 = out0 + outw; + float* out2 = out1 + outw; + float* out3 = out2 + outw; + + int h = 0; + for (; h < (outh & -4); h += 4) + { + const float* row0 = feat_map + 2 * h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + const float* row3 = row2 + inw; + const float* row4 = row3 + inw; + const float* row5 = row4 + inw; + const float* row6 = row5 + inw; + const float* row7 = row6 + inw; + const float* row8 = row7 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v28, t0;\n" + "vmv.v.x v29, t0;\n" + "vmv.v.x v30, t0;\n" + "vmv.v.x v31, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v28, x0;\n" + "vmv.v.x v29, x0;\n" + "vmv.v.x v30, x0;\n" + "vmv.v.x v31, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "li t1, 8;\n" + "vlse32.v v9, (%0), t1;\n" + "addi t0, %0, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v28, v0, v9;\n" + "vfmacc.vv v28, v1, v10;\n" + "vfmacc.vv v28, v2, v11;\n" + + "vlse32.v v9, (%1), t1;\n" + "addi t0, %1, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v28, v3, v9;\n" + "vfmacc.vv v28, v4, v10;\n" + "vfmacc.vv v28, v5, v11;\n" + + "vlse32.v v9, (%2), t1;\n" + "addi t0, %2, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v28, v6, v9;\n" + "vfmacc.vv 
v28, v7, v10;\n" + "vfmacc.vv v28, v8, v11;\n" + + "vfmacc.vv v29, v0, v9;\n" + "vfmacc.vv v29, v1, v10;\n" + "vfmacc.vv v29, v2, v11;\n" + + "vlse32.v v9, (%3), t1;\n" + "addi t0, %3, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v29, v3, v9;\n" + "vfmacc.vv v29, v4, v10;\n" + "vfmacc.vv v29, v5, v11;\n" + + "vlse32.v v9, (%4), t1;\n" + "addi t0, %4, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v29, v6, v9;\n" + "vfmacc.vv v29, v7, v10;\n" + "vfmacc.vv v29, v8, v11;\n" + + "vfmacc.vv v30, v0, v9;\n" + "vfmacc.vv v30, v1, v10;\n" + "vfmacc.vv v30, v2, v11;\n" + + "vlse32.v v9, (%5), t1;\n" + "addi t0, %5, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v30, v3, v9;\n" + "vfmacc.vv v30, v4, v10;\n" + "vfmacc.vv v30, v5, v11;\n" + + "vlse32.v v9, (%6), t1;\n" + "addi t0, %6, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v30, v6, v9;\n" + "vfmacc.vv v30, v7, v10;\n" + "vfmacc.vv v30, v8, v11;\n" + + "vfmacc.vv v31, v0, v9;\n" + "vfmacc.vv v31, v1, v10;\n" + "vfmacc.vv v31, v2, v11;\n" + + "vlse32.v v9, (%7), t1;\n" + "addi t0, %7, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v31, v3, v9;\n" + "vfmacc.vv v31, v4, v10;\n" + "vfmacc.vv v31, v5, v11;\n" + + "vlse32.v v9, (%8), t1;\n" + "addi t0, %8, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v31, v6, v9;\n" + "vfmacc.vv v31, v7, v10;\n" + "vfmacc.vv v31, v8, v11;\n" + : + : "r"(row0), "r"(row1), "r"(row2), "r"(row3), "r"(row4), "r"(row5), "r"(row6), "r"(row7), "r"(row8) + : "t0", "t1"); + + if (act == 0) + { + __asm__("vmv.v.x v27, x0;\n" + "vfmax.vv v28, v28, v27;\n" + "vfmax.vv v29, v29, v27;\n" + "vfmax.vv v30, v30, v27;\n" + "vfmax.vv v31, v31, v27;\n"); + } + else if (act > 0) + 
{ + __asm__("vmv.v.x v26, x0;\n" + "vmv.v.x v27, %0;\n" + "vfmax.vv v28, v28, v26;\n" + "vfmin.vv v28, v28, v27;\n" + "vfmax.vv v29, v29, v26;\n" + "vfmin.vv v29, v29, v27;\n" + "vfmax.vv v30, v30, v26;\n" + "vfmin.vv v30, v30, v27;\n" + "vfmax.vv v31, v31, v26;\n" + "vfmin.vv v31, v31, v27;\n" + : + : "r"(act)); + } + + __asm__( + "vse32.v v28, (%0);\n" + "vse32.v v29, (%1);\n" + "vse32.v v30, (%2);\n" + "vse32.v v31, (%3);\n" + : + : "r"(out0), "r"(out1), "r"(out2), "r"(out3)); + + row0 += 2 * packn; + row1 += 2 * packn; + row2 += 2 * packn; + row3 += 2 * packn; + row4 += 2 * packn; + row5 += 2 * packn; + row6 += 2 * packn; + row7 += 2 * packn; + row8 += 2 * packn; + out0 += packn; + out1 += packn; + out2 += packn; + out3 += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + const float bias_value = bias_base ? 
bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + const float i30 = row3[0]; + const float i31 = row3[1]; + const float i32 = row3[2]; + const float i40 = row4[0]; + const float i41 = row4[1]; + const float i42 = row4[2]; + const float i50 = row5[0]; + const float i51 = row5[1]; + const float i52 = row5[2]; + const float i60 = row6[0]; + const float i61 = row6[1]; + const float i62 = row6[2]; + const float i70 = row7[0]; + const float i71 = row7[1]; + const float i72 = row7[2]; + const float i80 = row8[0]; + const float i81 = row8[1]; + const float i82 = row8[2]; + + float v0 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + float v1 = (k00 * i20 + k01 * i21 + k02 * i22 + k10 * i30 + k11 * i31 + k12 * i32 + k20 * i40 + k21 * i41 + k22 * i42 + bias_value); + float v2 = (k00 * i40 + k01 * i41 + k02 * i42 + k10 * i50 + k11 * i51 + k12 * i52 + k20 * i60 + k21 * i61 + k22 * i62 + bias_value); + float v3 = (k00 * i60 + k01 * i61 + k02 * i62 + k10 * i70 + k11 * i71 + k12 * i72 + k20 * i80 + k21 * i81 + k22 * i82 + bias_value); + + if (act >= 0) + { + v0 = max(v0, .0f); + v1 = max(v1, .0f); + v2 = max(v2, .0f); + v3 = max(v3, .0f); + if (act > 0) + { + v0 = min(v0, (float)act); + v1 = min(v1, (float)act); + v2 = min(v2, (float)act); + v3 = min(v3, (float)act); + } + } + + *out0 = v0; + *out1 = v1; + *out2 = v2; + *out3 = v3; + + out0 += 1; + out1 += 1; + out2 += 1; + out3 += 1; + + row0 += 2; + row1 += 2; + row2 += 2; + row3 += 2; + row4 += 2; + row5 += 2; + row6 += 2; + row7 += 2; + row8 += 2; + } + + out0 += 3 * outw; + out1 += 3 * outw; + out2 += 3 * outw; + out3 += 3 * outw; + } + + for (; h < outh; ++h) + { + const float* row0 = 
feat_map + 2 * h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v28, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v28, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "li t1, 8;\n" + "vlse32.v v9, (%0), t1;\n" + "addi t0, %0, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v28, v0, v9;\n" + "vfmacc.vv v28, v1, v10;\n" + "vfmacc.vv v28, v2, v11;\n" + + "vlse32.v v9, (%1), t1;\n" + "addi t0, %1, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v28, v3, v9;\n" + "vfmacc.vv v28, v4, v10;\n" + "vfmacc.vv v28, v5, v11;\n" + + "vlse32.v v9, (%2), t1;\n" + "addi t0, %2, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v28, v6, v9;\n" + "vfmacc.vv v28, v7, v10;\n" + "vfmacc.vv v28, v8, v11;\n" + : + : "r"(row0), "r"(row1), "r"(row2) + : "t0", "t1"); + + if (act == 0) + { + __asm__("vmv.v.x v27, x0;\n" + "vfmax.vv v28, v28, v27;\n"); + } + else if (act > 0) + { + __asm__("vmv.v.x v26, x0;\n" + "vmv.v.x v27, %0;\n" + "vfmax.vv v28, v28, v26;\n" + "vfmin.vv v28, v28, v27;\n" + : + : "r"(act)); + } + + __asm__( + "vse32.v v28, (%0);\n" + : + : "r"(out0)); + + row0 += 2 * packn; + row1 += 2 * packn; + row2 += 2 * packn; + out0 += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + const float bias_value = bias_base ? 
bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + + float v0 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + + if (act >= 0) + { + v0 = max(v0, .0f); + if (act > 0) + { + v0 = min(v0, (float)act); + } + } + + *out0 = v0; + + out0 += 1; + row0 += 2; + row1 += 2; + row2 += 2; + } + } + } +} + +void convdw3x3s2_pack8_rvv(const float* input, const float* kernel, const float* bias, float* output, const int inc, const int inh, const int inw, const int outc, const int outh, const int outw, const int act, const struct conv_param* params, int num_thread) +{ + const int packn = 8; + + vsetvl_e32_m2(); +#pragma omp parallel for num_threads(num_thread) + for (int c = 0; c < inc; ++c) + { + const float* feat_map = input + c * inh * inw; + const float* kernel_base = kernel + c * 9; + const float* bias_base = bias ? 
bias + c : NULL; + + __asm__( + "vle32.v v18, (%0);\n" + + "vrgather.vi v0, v18, 0;\n" + "vrgather.vi v2, v18, 1;\n" + "vrgather.vi v4, v18, 2;\n" + "vrgather.vi v6, v18, 3;\n" + "vrgather.vi v8, v18, 4;\n" + "vrgather.vi v10, v18, 5;\n" + "vrgather.vi v12, v18, 6;\n" + "vrgather.vi v14, v18, 7;\n" + + "lw t0, 32(%0);" + "vmv.v.x v16, t0;\n" + : + : "r"(kernel_base)); + + float* output_base = output + c * outw * outh; + + int h = 0; + for (; h < (outh & -2); h += 2) + { + const float* row0 = feat_map + 2 * h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + const float* row3 = row2 + inw; + const float* row4 = row3 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v18, t0;\n" + "vmv.v.x v20, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v18, x0;\n" + "vmv.v.x v20, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "li t1, 8;\n" + "vlse32.v v22, (%1), t1;\n" + "addi t0, %1, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v18, v0, v22;\n" + "vfmacc.vv v18, v2, v24;\n" + "vfmacc.vv v18, v4, v26;\n" + + "vlse32.v v22, (%2), t1;\n" + "addi t0, %2, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v18, v6, v22;\n" + "vfmacc.vv v18, v8, v24;\n" + "vfmacc.vv v18, v10, v26;\n" + + "vlse32.v v22, (%3), t1;\n" + "addi t0, %3, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v18, v12, v22;\n" + "vfmacc.vv v18, v14, v24;\n" + "vfmacc.vv v18, v16, v26;\n" + + "vfmacc.vv v20, v0, v22;\n" + "vfmacc.vv v20, v2, v24;\n" + "vfmacc.vv v20, v4, v26;\n" + + "vlse32.v v22, (%4), t1;\n" + "addi t0, %4, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v20, v6, v22;\n" + "vfmacc.vv v20, v8, v24;\n" + 
"vfmacc.vv v20, v10, v26;\n" + + "vlse32.v v22, (%5), t1;\n" + "addi t0, %5, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v20, v12, v22;\n" + "vfmacc.vv v20, v14, v24;\n" + "vfmacc.vv v20, v16, v26;\n" + : + : "r"(output_base), "r"(row0), "r"(row1), "r"(row2), "r"(row3), "r"(row4) + : "t0", "t1"); + + if (act == 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vfmax.vv v18, v18, v22;\n" + "vfmax.vv v20, v20, v22;\n"); + } + else if (act > 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vmv.v.x v24, %0;\n" + "vfmax.vv v18, v18, v22;\n" + "vfmin.vv v18, v18, v24;\n" + "vfmax.vv v20, v20, v22;\n" + "vfmin.vv v20, v20, v24;\n" + : + : "r"(act)); + } + + __asm__("vse32.v v18, (%0);\n" ::"r"(output_base)); + __asm__("vse32.v v20, (%0);\n" ::"r"(output_base + outw)); + + row0 += 2 * packn; + row1 += 2 * packn; + row2 += 2 * packn; + row3 += 2 * packn; + row4 += 2 * packn; + output_base += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + const float bias_value = bias_base ? 
bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + const float i30 = row3[0]; + const float i31 = row3[1]; + const float i32 = row3[2]; + const float i40 = row4[0]; + const float i41 = row4[1]; + const float i42 = row4[2]; + + float out1 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + float out2 = (k00 * i20 + k01 * i21 + k02 * i22 + k10 * i30 + k11 * i31 + k12 * i32 + k20 * i40 + k21 * i41 + k22 * i42 + bias_value); + + if (act >= 0) + { + out1 = max(out1, .0f); + out2 = max(out2, .0f); + if (act > 0) + { + out1 = min(out1, (float)act); + out2 = min(out2, (float)act); + } + } + + *output_base = out1; + *(output_base + outw) = out2; + + output_base += 1; + row0 += 2; + row1 += 2; + row2 += 2; + row3 += 2; + row4 += 2; + } + + output_base += outw; + } + + for (; h < outh; ++h) + { + const float* row0 = feat_map + 2 * h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v18, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v18, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "li t1, 8;\n" + "vlse32.v v22, (%0), t1;\n" + "addi t0, %0, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v18, v0, v22;\n" + "vfmacc.vv v18, v2, v24;\n" + "vfmacc.vv v18, v4, v26;\n" + + "vlse32.v v22, (%1), t1;\n" + "addi t0, %1, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v18, v6, v22;\n" + "vfmacc.vv v18, v8, v24;\n" + "vfmacc.vv 
v18, v10, v26;\n" + + "vlse32.v v22, (%2), t1;\n" + "addi t0, %2, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v18, v12, v22;\n" + "vfmacc.vv v18, v14, v24;\n" + "vfmacc.vv v18, v16, v26;\n" + : + : "r"(row0), "r"(row1), "r"(row2) + : "t0", "t1"); + + if (act == 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vfmax.vv v18, v18, v22;\n"); + } + else if (act > 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vfmax.vv v18, v18, v22;\n" + "vfmin.vv v18, v18, v24;\n" + : + : "r"(act)); + } + + __asm__("vse32.v v18, (%0);\n" ::"r"(output_base)); + + row0 += 2 * packn; + row1 += 2 * packn; + row2 += 2 * packn; + output_base += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + const float bias_value = bias_base ? 
bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + + float out1 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + + if (act >= 0) + { + out1 = max(out1, .0f); + if (act > 0) + { + out1 = min(out1, (float)act); + } + } + + *output_base = out1; + + output_base += 1; + row0 += 2; + row1 += 2; + row2 += 2; + } + output_base += outw; + } + } +} + +int conv_dw_packn_kernel_run(const ir_node_t* ir_node, const ir_tensor_t* input_tensor, const ir_tensor_t* filter_tensor, const ir_tensor_t* bias_tensor, ir_tensor_t* output_tensor, const struct conv_priv_info* priv_info, const struct conv_param* params, const int num_thread, const int cpu_affinity) +{ + float* input = (float*)input_tensor->data; + float* output = (float*)output_tensor->data; + const float* kernel = filter_tensor->data; + const float* bias = bias_tensor ? 
bias_tensor->data : NULL; + + const int inb = input_tensor->dims[0]; + const int inc = input_tensor->dims[1]; + const int inh = input_tensor->dims[2]; + const int inw = input_tensor->dims[3]; + + const int outb = output_tensor->dims[0]; + const int outc = output_tensor->dims[1]; + const int outh = output_tensor->dims[2]; + const int outw = output_tensor->dims[3]; + + const int ksize_h = params->kernel_h; + const int ksize_w = params->kernel_w; + const int pad_w = params->pad_w0; + const int pad_h = params->pad_h0; + const int stride_w = params->stride_w; + const int stride_h = params->stride_h; + + const int dilation_w = params->dilation_w; + const int dilation_h = params->dilation_h; + const int group = params->group; + const int act = params->activation; + + int inh_pad = inh + pad_h + pad_h; + int inw_pad = inw + pad_w + pad_w; + float* input_pad = NULL; + + if (inh_pad == inh && inw_pad == inw) + { + input_pad = input; + } + else + { + input_pad = priv_info->input_pad; + for (int b = 0; b < inb; ++b) + { + const float* input_batch_base = input + b * inc * inh * inw; + float* input_batch_padded_base = input_pad + b * inc * inh_pad * inw_pad; +#pragma omp parallel for num_threads(num_thread) + for (int g = 0; g < group; ++g) + { + const float* pad_in = input_batch_base + g * inh * inw; + float* pad_out = input_batch_padded_base + g * inh_pad * inw_pad; + pad(pad_in, pad_out, inh, inw, inh_pad, inw_pad, pad_h, pad_w, .0f); + } + } + } + + for (int b = 0; b < inb; ++b) + { + const float* input_batch_base = input_pad + b * inc * inh_pad * inw_pad; + float* output_batch_base = output + b * outc * outh * outw; + if (stride_h == 1) + { + convdw3x3s1_pack4_rvv(input_batch_base, kernel, bias, output_batch_base, inc, inh_pad, inw_pad, outc, outh, outw, act, params, num_thread); + } + else + { + convdw3x3s2_pack8_rvv(input_batch_base, kernel, bias, output_batch_base, inc, inh_pad, inw_pad, outc, outh, outw, act, params, num_thread); + } + } + + return 0; +} diff --git 
a/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c index ac7333ff0..420f4cadc 100644 --- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c @@ -1,98 +1,100 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2021, OPEN AI LAB - * Author: ddzhao@openailab.com - */ - #include "convolution_param.h" - #include "graph/tensor.h" #include "graph/node.h" #include "graph/graph.h" -#include "module/module.h" -#include "operator/op.h" -#include "utility/sys_port.h" -#include "utility/log.h" #include "device/cpu/cpu_node.h" #include "device/cpu/cpu_graph.h" +#include "operator/op.h" +#include "api/c_api.h" +#include "utility/log.h" +#include "utility/sys_port.h" #include "device/cpu/cpu_module.h" +#include +#include + +extern int conv_hcl_prerun_rv64(struct node* ir_node, struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param); +extern int conv_hcl_run_rv64(struct node* ir_node, struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param, int num_thread, int cpu_affinity); +extern int conv_hcl_get_shared_mem_size_rv64(struct tensor* input_tensor, struct tensor* output_tensor, struct conv_param* param); +extern int conv_hcl_postrun_rv64(struct node* ir_node, struct conv_priv_info* info); + +static int init_node(struct node_ops* ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct node* ir_node = exec_node->ir_node; + struct graph* ir_graph = ir_node->graph; + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + struct tensor* kernel_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); + struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); + struct conv_param* params = ir_node->op.param_mem; + struct conv_priv_info* info = sys_malloc(sizeof(struct conv_priv_info)); + if (!info) + { + return -1; + } + + memset(info, 0, sizeof(*info)); + exec_node->ops_priv = info; -#include "conv_kernel_rv64.h" + if (exec_graph->mode == TENGINE_MODE_FP32) + { + 
exec_node->shared_mem_size = conv_hcl_get_shared_mem_size_rv64(input_tensor, output_tensor, params); + exec_node->shared_pack4_mem_size = 0; + } + else + { + TLOG_ERR("Tengine work node %s not support %d\n", ir_node->name, exec_graph->mode); + return -1; + } -#include "string.h" + return 0; +} static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; + struct conv_param* param = ir_node->op.param_mem; + struct conv_priv_info* info = exec_node->ops_priv; - /* get cpu affinity */ - conv_priv_info->cpu_type = exec_graph->cpu_affinity; + info->cpu_type = exec_graph->cpu_affinity; - /* fp32 prerun */ if (exec_graph->mode == TENGINE_MODE_FP32) { - if (conv_hcl_set_shared_mem && exec_node->shared_mem_size < exec_graph->shared_mem_size) + if (exec_node->shared_mem_size < exec_graph->shared_mem_size) { - if (conv_hcl_set_shared_mem(conv_priv_info, exec_graph->shared_mem, exec_graph->shared_mem_size) < 0) - { - TLOG_ERR("hcl conv: set shared memory failed\n"); - return -1; - } + info->external_im2col_mem = 1; + info->im2col_buffer = exec_graph->shared_mem; + info->im2col_buffer_size = exec_graph->shared_mem_size; } - if (conv_hcl_set_shared_pack4_mem && exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size) + + if (exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size) { - if (conv_hcl_set_shared_pack4_mem(conv_priv_info, exec_graph->shared_pack4_mem, - exec_graph->shared_pack4_mem_size) - 
< 0) - { - TLOG_ERR("hcl conv: set shared pack4 memory failed\n"); - return -1; - } + info->external_im2col_pack4_mem = 0; + info->im2col_buffer_pack4 = NULL; + info->im2col_buffer_pack4_size = 0; } - int group = conv_param->group; - int kernel_h = conv_param->kernel_h; - int kernel_w = conv_param->kernel_w; - if (group > 1 && kernel_h == 7 && kernel_w == 7) - conv_priv_info->external_interleave_pack4_mem = 0; + if (param->group > 1 && param->kernel_h == 7 && param->kernel_w == 7) + { + info->external_interleave_pack4_mem = 0; + } else - conv_priv_info->external_interleave_pack4_mem = 1; + { + info->external_interleave_pack4_mem = 1; + } - /* do prerun */ - if (conv_hcl_prerun(input_tensor, filter_tensor, output_tensor, conv_priv_info, conv_param) < 0) + if (conv_hcl_prerun_rv64(ir_node, input_tensor, filter_tensor, output_tensor, info, param) < 0) { - TLOG_ERR("hcl conv prerun failed\n"); + TLOG_ERR("hcl conv prerun failed.\n"); return -1; } } else { - printf("Tengine work node not support %d\n", exec_graph->mode); return -1; } @@ -103,37 +105,32 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex { struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; - struct tensor* input_tensor; - struct tensor* weight_tensor; - struct tensor* output_tensor; + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); + struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); struct tensor* bias_tensor = NULL; - int num_thread = exec_graph->num_thread; - int cpu_affinity = exec_graph->cpu_affinity; - - /* set the input data and shape again, in case of reshape or dynamic shape */ - input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); - weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); - output_tensor = 
get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); if (ir_node->input_num > 2) + { bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); + } - struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; + struct conv_param* params = ir_node->op.param_mem; + struct conv_priv_info* info = exec_node->ops_priv; + int num_thread = exec_graph->num_thread; + int cpu_affinity = exec_graph->cpu_affinity; - /* fp32 run */ - if (exec_graph->mode == TENGINE_MODE_FP32) + if (exec_graph->mode == TENGINE_DT_FP32) { - if (conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread, - cpu_affinity) - < 0) + int ret = conv_hcl_run_rv64(ir_node, input_tensor, filter_tensor, bias_tensor, output_tensor, info, params, num_thread, cpu_affinity); + if (ret < 0) { - TLOG_ERR("hcl conv run failed\n"); - return -1; + TLOG_ERR("conv_hcl_run %s run failed: %d\n", ir_node->name, ret); + return ret; } } else { - printf("Tengine work node not support %d\n", exec_graph->mode); + TLOG_ERR("Tengine work node %s not support %d mode\n", ir_node->name, exec_graph->mode); return -1; } @@ -147,95 +144,46 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; - - /* fp32 postrun */ if (exec_graph->mode == TENGINE_MODE_FP32) { - if (conv_hcl_postrun(conv_priv_info) < 0) - { - TLOG_ERR("hcl conv postrun failed\n"); - return -1; - } + return conv_hcl_postrun_rv64(exec_node->ir_node, exec_node->ops_priv); } else { - printf("Tengine work node not support %d\n", exec_graph->mode); + TLOG_ERR("Tengine work node %s not support %d mode\n", exec_node->ir_node->name, exec_graph->mode); return -1; } - - return 0; -} - 
-static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) -{ - struct node* ir_node = exec_node->ir_node; - struct graph* ir_graph = ir_node->graph; - struct tensor* input_tensor; - struct tensor* filter_tensor; - struct tensor* output_tensor; - - input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); - filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); - output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - - struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; - - /* init the private info data of convolution op */ - struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)sys_malloc(sizeof(struct conv_priv_info)); - if (conv_priv_info == NULL) - { - return -1; - } - memset(conv_priv_info, 0, sizeof(struct conv_priv_info)); - exec_node->ops_priv = conv_priv_info; - - /* get shared memory size */ - if (exec_graph->mode == TENGINE_MODE_FP32) - { - exec_node->shared_mem_size = conv_hcl_get_shared_mem_size_rv64(input_tensor, output_tensor, conv_param); - exec_node->shared_pack4_mem_size = conv_hcl_get_shared_pack4_mem_size(filter_tensor, output_tensor, conv_param); - } - else - { - printf("Tengine work node not support %d\n", exec_graph->mode); - return -1; - } - - return 0; } static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; - sys_free(conv_priv_info); + struct conv_priv_info* info = exec_node->ops_priv; + sys_free(info); exec_node->ops_priv = NULL; return 0; } -static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) +static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* ir_node) { - struct node* ir_node = exec_node; struct graph* ir_graph = ir_node->graph; struct tensor* input_tensor = 
get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + struct tensor* kernel_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* param = (struct conv_param*)exec_node->op.param_mem; - int group = param->group; - int kernel_h = param->kernel_h; - int kernel_w = param->kernel_w; - int in_c = input_tensor->dims[1] / group; - int out_c = output_tensor->dims[1] / group; + struct conv_param* param = ir_node->op.param_mem; if (input_tensor->data_type != TENGINE_DT_FP32) + { return 0; + } - if (group != 1) + if (param->group != 1) + { return 0; + } return OPS_SCORE_PREFER; } - static struct node_ops hcl_node_ops = { .prerun = prerun, .run = run, @@ -243,7 +191,8 @@ static struct node_ops hcl_node_ops = { .postrun = postrun, .init_node = init_node, .release_node = release_node, - .score = score}; + .score = score, +}; int register_conv_hcl_rv64_op() { diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c index 999a49d4e..23e6dc5f7 100644 --- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c @@ -1,43 +1,19 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2021, OPEN AI LAB - * Author: ddzhao@openailab.com - */ - #include +#include #include -#include - -#include "conv_kernel_rv64.h" -// #include "wino_conv_kernel_arm.h" // FIXME: add wino support -// #include "wino_conv_kernel_1_arm.h" // FIXME: add wino support +#include "convolution_param.h" +#include "graph/tensor.h" +#include "op/conv/x86/conv_kernel_x86.h" +#include "utility/sys_port.h" +#include +#include -#define PER_OUT_CHAN 16 -void sgemm_4x16_rv64(float* biases, float* input, float* kernel, long kernel_size, float* output, long output_xy, - int activation, int layout); -void sgemm_4x4_rv64(float* biases, float* input, float* kernel, long kernel_size, float* output, long output_xy, - int activation, int layout); +#define PER_OUT_CHAN 8 +#define min(a, b) ((a) < (b) ? 
(a) : (b)) -void im2col_fp32_1x1(float* input, int input_xy, float* col, int col_cnt, int input_chan); -void im2col_fp32_3x3(float* input, int w, int h, int channel, float* cur_col, int stride); +extern void sgemm_8x8_rv64(const float* cur_col, const float* cur_kernel, const float* bias, const int act, float* cur_output, const int output_xy, const int kernel_size, const int n); +extern void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k_w, int k_h, int s_w, int s_h, int d_w, + int d_h, int pad_w0, int pad_w1, int pad_h0, int pad_h1, int out_w, int out_h, int num_thread); static void interleave_kernel(float* kernel, float* kernel_interleaved, int kernel_chan, int kernel_size) { @@ -56,29 +32,67 @@ static void interleave_kernel(float* kernel, float* kernel_interleaved, int kern *(cur_kernel_interleaved++) = cur_kernel[k][j]; } } - for (; i < (kernel_chan & -4); i += 4) + + // last 7 kernel + for (k = 0; i + k < kernel_chan; k++) + cur_kernel[k] = kernel + kernel_size * (i + k); + + if ((kernel_chan & 0x7) == 7) + { + for (j = 0; j < kernel_size; j++) + { + for (k = 0; k < 7; k++) + *(cur_kernel_interleaved++) = cur_kernel[k][j]; + *(cur_kernel_interleaved++) = 0.f; + } + } + else if ((kernel_chan & 0x7) == 6) + { + for (j = 0; j < kernel_size; j++) + { + for (k = 0; k < 6; k++) + *(cur_kernel_interleaved++) = cur_kernel[k][j]; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + } + } + else if ((kernel_chan & 0x7) == 5) + { + for (j = 0; j < kernel_size; j++) + { + for (k = 0; k < 5; k++) + *(cur_kernel_interleaved++) = cur_kernel[k][j]; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + } + } + else if ((kernel_chan & 0x7) == 4) { - for (k = 0; k < 4; k++) - cur_kernel[k] = kernel + kernel_size * (i + k); for (j = 0; j < kernel_size; j++) { for (k = 0; k < 4; k++) *(cur_kernel_interleaved++) = cur_kernel[k][j]; + *(cur_kernel_interleaved++) = 0.f; + 
*(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; } } - // last 4 kernel - for (k = 0; k < 3; k++) - cur_kernel[k] = kernel + kernel_size * (i + k); - if ((kernel_chan & 0x3) == 3) + else if ((kernel_chan & 0x7) == 3) { for (j = 0; j < kernel_size; j++) { for (k = 0; k < 3; k++) *(cur_kernel_interleaved++) = cur_kernel[k][j]; *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; } } - else if ((kernel_chan & 0x3) == 2) + else if ((kernel_chan & 0x7) == 2) { for (j = 0; j < kernel_size; j++) { @@ -86,9 +100,13 @@ static void interleave_kernel(float* kernel, float* kernel_interleaved, int kern *(cur_kernel_interleaved++) = cur_kernel[k][j]; *(cur_kernel_interleaved++) = 0.f; *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; } } - else if ((kernel_chan & 0x3) == 1) + else if ((kernel_chan & 0x7) == 1) { for (j = 0; j < kernel_size; j++) { @@ -96,6 +114,10 @@ static void interleave_kernel(float* kernel, float* kernel_interleaved, int kern *(cur_kernel_interleaved++) = 0.f; *(cur_kernel_interleaved++) = 0.f; *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; } } } @@ -104,14 +126,19 @@ static void interleave_kernel(float* kernel, float* kernel_interleaved, int kern static void interleave(struct tensor* filter, struct conv_priv_info* priv_info, struct conv_param* param) { int group = param->group; - int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3]; + int in_c = filter->dims[1]; + int kernel_h = filter->dims[2]; + int kernel_w = filter->dims[3]; + int kernel_size = in_c * kernel_h * kernel_w; + int out_chan 
= filter->dims[0] / group; - int out_chan_align4 = (out_chan + 3) / 4 * 4; + int out_chan_align8 = (out_chan + 7) / 8 * 8; - int kernel_size_algin = kernel_size * out_chan_align4; + int kernel_size_algin = kernel_size * out_chan_align8; int kernel_size_group = kernel_size * out_chan; float* kernel = filter->data; + float* interleave_buf = priv_info->interleave_buffer; for (int g = 0; g < group; g++) { @@ -121,520 +148,144 @@ static void interleave(struct tensor* filter, struct conv_priv_info* priv_info, } } -static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k_w, int k_h, int s_w, int s_h, int d_w, - int d_h, int pad_w0, int pad_w1, int pad_h0, int pad_h1, int out_w, int out_h, int num_thread) -{ - if (k_w == 1 && k_h == 1 && s_w == 1 && s_h == 1) - { - int kernel_size = k_w * k_h * in_c; - int in_xy = in_w * in_h; - int out_xy = out_w * out_h; - int col_end3 = out_xy & 3; -#pragma omp parallel for num_threads(num_thread) - for (int col_i = 0; col_i < out_xy - 3; col_i += 4) - { - float* cur_col = col + col_i * kernel_size; - - float* cur_input = input + col_i; - im2col_fp32_1x1(cur_input, in_xy, cur_col, 4, in_c); - } - int col_i = out_xy & -4; - float* cur_col; - // final 4 input - if (col_end3) - { - cur_col = col + col_i * kernel_size; - for (int col_j = 0; col_j < kernel_size; col_j++) - { - for (int i = 0; i < 4; i++) - { - if (i < col_end3) - *cur_col++ = *(input + col_j * in_xy + col_i + i); - else - *cur_col++ = 0; - } - } - } - } - else if (d_w == 1 && d_h == 1 && k_w == 3 && k_h == 3 && s_w == s_h) - { - int kernel_size = k_w * k_h * in_c; - int in_xy = in_w * in_h; - int out_xy = out_w * out_h; - int col_end3 = out_xy & 3; - int is_pad0 = (pad_w0 == 0) && (pad_h0 == 0) && (pad_w1 == 0) && (pad_h1 == 0); -#pragma omp parallel for num_threads(num_thread) - for (int col_i = 0; col_i < (out_xy & -4); col_i += 4) - { - float* cur_col = col + col_i * kernel_size; - int imy0 = col_i / out_w; - int imy3 = (col_i + 3) / out_w; - int 
imx0 = col_i - imy0 * out_w; - int imx3 = (col_i + 3) - imy3 * out_w; - if ((imy0 == imy3) && (is_pad0 || (imy0 != 0 && imx0 != 0 && imy0 != (out_h - 1) && imx3 != (out_w - 1)))) - { - float* l0 = input + (imy0 * s_h - pad_h0) * in_w + (imx0 * s_w - pad_w0); - { - im2col_fp32_3x3(l0, in_w, in_h, in_c, cur_col, s_w); // add im2col 3x3 - cur_col += 4 * kernel_size; - } - } - else - { - int cnt_y[4] = {imy0, (col_i + 1) / out_w, (col_i + 2) / out_w, imy3}; - int cnt_x[4] = {imx0, col_i - cnt_y[1] * out_w + 1, col_i - cnt_y[2] * out_w + 2, imx3}; - int imx_start[4] = {cnt_x[0] * s_w - pad_w0, cnt_x[1] * s_w - pad_w0, cnt_x[2] * s_w - pad_w0, - cnt_x[3] * s_w - pad_w0}; - int imy_start[4] = {cnt_y[0] * s_h - pad_h0, cnt_y[1] * s_h - pad_h0, cnt_y[2] * s_h - pad_h0, - cnt_y[3] * s_h - pad_h0}; - for (int kch = 0; kch < in_c; kch++) - for (int ky = 0; ky < 3; ky++) - for (int kx = 0; kx < 3; kx++) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for (int i = 0; i < 4; i++) - { - if (imx[i] >= 0 && imx[i] < in_w && imy[i] >= 0 && imy[i] < in_h) - *cur_col++ = *(input + in_xy * kch + in_w * imy[i] + imx[i]); - else - *cur_col++ = 0.f; - } - } - } - } - // final 4 input - int col_i = out_xy & -4; - if (col_end3) - { - float* cur_col = col + col_i * kernel_size; - int cnt_y[4] = {col_i / out_w, (col_i + 1) / out_w, (col_i + 2) / out_w, (col_i + 3) / out_w}; - int cnt_x[4] = {col_i - cnt_y[0] * out_w, col_i - cnt_y[1] * out_w + 1, col_i - cnt_y[2] * out_w + 2, - col_i - cnt_y[3] * out_w + 3}; - int imx_start[4] = {cnt_x[0] * s_w - pad_w0, cnt_x[1] * s_w - pad_w0, cnt_x[2] * s_w - pad_w0, - cnt_x[3] * s_w - pad_w0}; - int imy_start[4] = {cnt_y[0] * s_h - pad_h0, cnt_y[1] * s_h - pad_h0, cnt_y[2] * s_h - pad_h0, - cnt_y[3] * s_h - pad_h0}; - for (int kch = 0; kch < in_c; kch++) - { - for (int ky = 0; ky < 3; ky++) - { - for (int kx = 0; 
kx < 3; kx++) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for (int i = 0; i < 4; i++) - { - if (i < col_end3 && imx[i] >= 0 && imx[i] < in_w && imy[i] >= 0 && imy[i] < in_h) - *cur_col++ = *(input + in_xy * kch + in_w * imy[i] + imx[i]); - else - *cur_col++ = 0.f; - } - } - } - } - } - } - else - { - int out_xy = out_w * out_h; -#pragma omp parallel for num_threads(num_thread) - for (int col_i = 0; col_i < out_xy - 3; col_i += 4) - { - int kernel_size = k_w * k_h * in_c; - int in_xy = in_w * in_h; - int col_end3 = out_xy & 3; - float* cur_col = col + col_i * kernel_size; - int cnt_y[4] = {col_i / out_w, (col_i + 1) / out_w, (col_i + 2) / out_w, (col_i + 3) / out_w}; - int cnt_x[4] = {col_i - cnt_y[0] * out_w, col_i - cnt_y[1] * out_w + 1, col_i - cnt_y[2] * out_w + 2, - col_i - cnt_y[3] * out_w + 3}; - int imx_start[4] = {cnt_x[0] * s_w - pad_w0, cnt_x[1] * s_w - pad_w0, cnt_x[2] * s_w - pad_w0, - cnt_x[3] * s_w - pad_w0}; - int imy_start[4] = {cnt_y[0] * s_h - pad_h0, cnt_y[1] * s_h - pad_h0, cnt_y[2] * s_h - pad_h0, - cnt_y[3] * s_h - pad_h0}; - for (int kch = 0; kch < in_c; kch++) - for (int ky = 0; ky < (k_h * d_h); ky += d_h) - for (int kx = 0; kx < (k_w * d_w); kx += d_w) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for (int i = 0; i < 4; i++) - { - if (imx[i] >= 0 && imx[i] < in_w && imy[i] >= 0 && imy[i] < in_h) - *cur_col++ = *(input + in_xy * kch + in_w * imy[i] + imx[i]); - else - *cur_col++ = 0.f; - } - } - } - int col_i = out_xy & -4; - float* cur_col; - int kernel_size = k_w * k_h * in_c; - int in_xy = in_w * in_h; - int col_end3 = out_xy & 3; - if (col_end3) - { - cur_col = col + col_i * kernel_size; - int cnt_y[4] = {col_i / out_w, (col_i + 1) / out_w, (col_i 
+ 2) / out_w, (col_i + 3) / out_w}; - int cnt_x[4] = {col_i - cnt_y[0] * out_w, col_i - cnt_y[1] * out_w + 1, col_i - cnt_y[2] * out_w + 2, - col_i - cnt_y[3] * out_w + 3}; - int imx_start[4] = {cnt_x[0] * s_w - pad_w0, cnt_x[1] * s_w - pad_w0, cnt_x[2] * s_w - pad_w0, - cnt_x[3] * s_w - pad_w0}; - int imy_start[4] = {cnt_y[0] * s_h - pad_h0, cnt_y[1] * s_h - pad_h0, cnt_y[2] * s_h - pad_h0, - cnt_y[3] * s_h - pad_h0}; - for (int kch = 0; kch < in_c; kch++) - for (int ky = 0; ky < (k_h * d_h); ky += d_h) - for (int kx = 0; kx < (k_w * d_w); kx += d_w) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for (int i = 0; i < 4; i++) - { - if (i < col_end3 && imx[i] >= 0 && imx[i] < in_w && imy[i] >= 0 && imy[i] < in_h) - *cur_col++ = *(input + in_xy * kch + in_w * imy[i] + imx[i]); - else - *cur_col++ = 0.f; - } - } - } - } -} - -static void sgemm_set(float* col, float* kernel, float* biases, float* output, int kernel_size, int ch_start, - int ch_end, int output_xy, int activation, int num_thread, int cpu_affinity) -{ - int nn_outch = ch_end / PER_OUT_CHAN; - int col_end3 = output_xy & 0x3; - - if (col_end3) - { -#pragma omp parallel for num_threads(num_thread) - for (int pp = 0; pp < nn_outch; pp++) - { - int p = pp * PER_OUT_CHAN; - - float* biasptr = biases ? 
(float*)(biases + p) : NULL; - float* kernel_tmp = (float*)(kernel + p * kernel_size); - float* output_tmp = (float*)(output + p * output_xy); - - int col_line = 0; - for (col_line = 0; col_line + 3 < output_xy; col_line += 4) - { - float* col_tmp = (float*)(col + col_line * kernel_size); - sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0); // FIXME: replace with sgemm_4x16_rv64 - } - { - float result[64]; - float* col_tmp = (float*)(col + col_line * kernel_size); - sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, result, 4, activation, 0); // FIXME: replace with sgemm_4x16_rv64 - for (int i = 0; i < 16; i++) - { - for (int j = 0; j < (col_end3); j++) - *(output + (p + i) * output_xy + col_line + j) = result[(i << 2) + j]; - } - } - } - } - else - { -#pragma omp parallel for num_threads(num_thread) - for (int pp = 0; pp < nn_outch; pp++) - { - int p = pp * PER_OUT_CHAN; - - float* biasptr = biases ? (float*)(biases + p) : NULL; - float* kernel_tmp = (float*)(kernel + p * kernel_size); - float* output_tmp = (float*)(output + p * output_xy); - - for (int col_line = 0; col_line + 3 < output_xy; col_line += 4) - { - float* col_tmp = (float*)(col + col_line * kernel_size); - sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0); // FIXME: replace with sgemm_4x16_rv64 - } - } - } -} - -static void sgemm4x4(float* col, float* kernel, float* biases, float* output, int kernel_size, int ch_start, int ch_end, - int output_xy, int activation, int num_thread, int cpu_affinity) -{ - float result[16]; - int col_end3 = output_xy & 0x3; - int kernel_end3 = ch_end & 0x3; - -#pragma omp parallel for num_threads(num_thread) private(result) - for (int kernel_num = ch_start; kernel_num < ((ch_end & -4) - 3); kernel_num += 4) - { - float* cur_biases = NULL; - float *cur_col, *cur_kernel, *cur_output; - int col_line; - if (biases) - cur_biases = (float*)(biases + 
kernel_num); - cur_kernel = (float*)(kernel + kernel_num * kernel_size); - cur_output = (float*)(output + kernel_num * output_xy); - for (col_line = 0; col_line < (output_xy & -4); col_line += 4) - { - cur_col = (float*)(col + col_line * kernel_size); - sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, cur_output + col_line, output_xy, activation, 0); - } - if (col_end3) - { - cur_col = (float*)(col + col_line * kernel_size); - sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0); - for (int i = 0; i < 4; i++) - { - for (int j = 0; j < (col_end3); j++) - *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; - } - } - } - if (kernel_end3) - { - int kernel_num = (ch_end & -4); - float* cur_biases = NULL; - if (biases) - cur_biases = (float*)(biases + kernel_num); - float* cur_kernel = (float*)(kernel + kernel_num * kernel_size); -#pragma omp parallel for num_threads(num_thread) private(result) - for (int col_line = 0; col_line < (output_xy & -4); col_line += 4) - { - float* cur_col = (float*)(col + col_line * kernel_size); - sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0); - for (int i = 0; i < kernel_end3; i++) - for (int j = 0; j < 4; j++) - *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; - } - int col_line = output_xy & -4; - if (col_end3) - { - float* cur_col = (float*)(col + col_line * kernel_size); - sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0); - for (int i = 0; i < (kernel_end3); i++) - { - for (int j = 0; j < (col_end3); j++) - *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; - } - } - } -} - -/* check the conv wheather need to be using winograd */ -static int winograd_support(struct conv_param* param, int in_h, int in_w) +int conv_hcl_get_shared_mem_size_rv64(struct tensor* input_tensor, struct tensor* output_tensor, struct conv_param* param) { - 
int kernel_h = param->kernel_h; - int kernel_w = param->kernel_w; - int stride_h = param->stride_h; - int stride_w = param->stride_w; - int dilation_h = param->dilation_h; - int dilation_w = param->dilation_w; - int output_chan = param->output_channel; - int group = param->group; - - if (in_h < 7 && in_w < 7) - return 0; - if (in_h < 10 && in_w < 10 && output_chan < 16) - return 0; - if (group != 1 || kernel_h != 3 || kernel_w != 3) - return 0; - if (dilation_h != 1 || dilation_w != 1 || stride_h != 1 || stride_w != 1) - return 0; - - return 1; -} - -/* - * get the memory size for im2col of input tensor - */ -int conv_hcl_get_shared_mem_size_rv64(struct tensor* input, struct tensor* output, struct conv_param* param) -{ - int in_h = input->dims[2]; - int in_w = input->dims[3]; - int out_h = output->dims[2]; - int out_w = output->dims[3]; - int group = param->group; - int input_chan = param->input_channel / group; - int kernel_size = input_chan * param->kernel_h * param->kernel_w; - int out_cstep = out_h * out_w; // channel cstep, output_h * output_w - int elem_size = input->elem_size; // uint8/int8 is 1 byte, fp32 is 4 bytes - - out_cstep = (out_cstep + 3) / 4 * 4; - int mem_size = elem_size * kernel_size * out_cstep + 128; - - return mem_size; -} - -/* - * get the memory size for im2col + sgemm of kernel tensor interleave - */ -static int get_private_mem_size(struct tensor* filter, struct conv_param* param) -{ - int group = param->group; - int out_chan = filter->dims[0] / group; - int out_chan_align4 = (out_chan + 3) / 4 * 4; - int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3]; - int mem_size = kernel_size * filter->elem_size * out_chan_align4 * group + 128; // caution + int kernel_size = param->kernel_h * param->kernel_w * param->input_channel / param->group; + int cstep = output_tensor->dims[2] * output_tensor->dims[3]; + cstep = (cstep + 7) / 8 * 8; //align to 8 + int mem_size = input_tensor->elem_size * cstep * kernel_size + 128 * 
sizeof(float); return mem_size; } -int conv_hcl_set_shared_mem(struct conv_priv_info* priv_info, void* mem, int mem_size) -{ - priv_info->external_im2col_mem = 1; - priv_info->im2col_buffer = mem; - priv_info->im2col_buffer_size = mem_size; - - return 0; -} - -int conv_hcl_set_shared_pack4_mem(struct conv_priv_info* priv_info, void* mem, int mem_size) -{ - priv_info->external_im2col_pack4_mem = 0; - priv_info->im2col_buffer_pack4 = NULL; - priv_info->im2col_buffer_pack4_size = 0; - - return 0; -} - -int conv_hcl_get_shared_pack4_mem_size(struct tensor* filter, struct tensor* output, struct conv_param* param) -{ - return 0; -} - -int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, - struct conv_priv_info* priv_info, struct conv_param* param) +int conv_hcl_prerun_rv64(struct node* ir_node, struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param) { - int in_c = input_tensor->dims[1]; - int in_h = input_tensor->dims[2]; - int in_w = input_tensor->dims[3]; - - /* check winograd implement, only for conv3x3s1 */ - // priv_info->winograd = winograd_support(param, in_h, in_w); - // if (priv_info->winograd) - // { - // if(in_c >= 256) - // // return wino_conv_hcl_prerun_1(input_tensor, filter_tensor, output_tensor, priv_info, param); // FIXME: add wino support - // else - // // return wino_conv_hcl_prerun(input_tensor, filter_tensor, output_tensor, priv_info, param); // FIXME: add wino support - // } - - /* alloc mem of im2col */ - if (!priv_info->external_im2col_mem) + // alloc im2col buffer = kernel_size * out_xy + if (!info->external_im2col_mem) { int mem_size = conv_hcl_get_shared_mem_size_rv64(input_tensor, output_tensor, param); - void* mem = sys_malloc(mem_size); - priv_info->im2col_buffer = mem; - priv_info->im2col_buffer_size = mem_size; + info->im2col_buffer = sys_malloc(mem_size); + info->im2col_buffer_size = mem_size; } - 
/* alloc mem of kernel interleave */ - if (!priv_info->external_interleave_mem) + // alloc kernel interleave buffer + if (!info->external_interleave_mem) { - int mem_size = get_private_mem_size(filter_tensor, param); - void* mem = sys_malloc(mem_size); - priv_info->interleave_buffer = mem; - priv_info->interleave_buffer_size = mem_size; + int kernel_size = filter_tensor->dims[1] * filter_tensor->dims[2] * filter_tensor->dims[3]; + int out_chan = filter_tensor->dims[0] / param->group; + out_chan = (out_chan + 7) / 8 * 8; //align to 8 + int mem_size = out_chan * kernel_size * filter_tensor->elem_size * param->group; + info->interleave_buffer = sys_malloc(mem_size); + info->interleave_buffer_size = mem_size; } - /* kernel interleave */ - interleave(filter_tensor, priv_info, param); - + // interleave kernel + interleave(filter_tensor, info, param); return 0; } -int conv_hcl_postrun(struct conv_priv_info* priv_info) +int conv_hcl_postrun_rv64(struct node* ir_node, struct conv_priv_info* info) { - // if (priv_info->winograd) - // { - // wino_conv_hcl_postrun(priv_info); // FIXME: add wino support - // } - - if (!priv_info->external_interleave_mem && priv_info->interleave_buffer != NULL) + if (!info->external_interleave_mem && info->interleave_buffer) { - sys_free(priv_info->interleave_buffer); - priv_info->interleave_buffer = NULL; + sys_free(info->interleave_buffer); + info->interleave_buffer = NULL; } - if (!priv_info->external_im2col_mem && priv_info->im2col_buffer != NULL) + if (!info->external_im2col_mem && info->im2col_buffer) { - sys_free(priv_info->im2col_buffer); - priv_info->im2col_buffer = NULL; + sys_free(info->im2col_buffer); + info->im2col_buffer = NULL; } return 0; } -int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param, - int num_thread, int cpu_affinity) +int conv_hcl_run_rv64(struct node* ir_node, struct tensor* 
input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param, int num_thread, int cpu_affinity) { - /* param */ int group = param->group; - int kernel_h = param->kernel_h; - int kernel_w = param->kernel_w; - int stride_h = param->stride_h; - int stride_w = param->stride_w; - int dilation_h = param->dilation_h; - int dilation_w = param->dilation_w; - int pad_h0 = param->pad_h0; - int pad_h1 = param->pad_h1; - int pad_w0 = param->pad_w0; - int pad_w1 = param->pad_w1; - int act_type = param->activation; - int batch = input_tensor->dims[0]; - int in_c = input_tensor->dims[1] / group; + float* input = input_tensor->data; + float* output = output_tensor->data; + float* bias = NULL; + if (bias_tensor) + { + bias = bias_tensor->data; + } + + int in_c = input_tensor->dims[1]; + in_c /= group; int in_h = input_tensor->dims[2]; int in_w = input_tensor->dims[3]; int input_size = in_c * in_h * in_w; - int kernel_size = in_c * kernel_h * kernel_w; - int input_image_size = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; - // if (priv_info->winograd) - // { - // if(in_c >= 256) - // return wino_conv_hcl_run_1(input_tensor, filter_tensor, bias_tensor, output_tensor, priv_info, param, num_thread, cpu_affinity); // FIXME: add wino support - // else - // return wino_conv_hcl_run(input_tensor, filter_tensor, bias_tensor, output_tensor, priv_info, param, num_thread, cpu_affinity); // FIXME: add wino support - // } - - int out_c = output_tensor->dims[1] / group; + int k_h = param->kernel_h; + int k_w = param->kernel_w; + int s_w = param->stride_w; + int s_h = param->stride_h; + int d_h = param->dilation_h; + int d_w = param->dilation_w; + int p_h0 = param->pad_h0; + int p_w0 = param->pad_w0; + int p_h1 = param->pad_h1; + int p_w1 = param->pad_w1; + int act = param->activation; + int kernel_size = in_c * k_h * k_w; + + int out_c = param->output_channel / group; int out_h = 
output_tensor->dims[2]; int out_w = output_tensor->dims[3]; - int out_hw = out_h * out_w; + int out_xy = out_h * out_w; int output_size = out_c * out_h * out_w; - int out_c_align = ((out_c + 3) & -4); - int output_image_size = output_tensor->dims[1] * output_tensor->dims[2] * output_tensor->dims[3]; + int output_image_size = output_tensor->dims[1] * output_tensor->dims[2] * output_tensor->dims[3]; //不是8倍数怎么办 - /* buffer addr */ - float* input_buf = (float*)input_tensor->data; - float* output_buf = (float*)output_tensor->data; - float* biases_buf = NULL; - if (bias_tensor != NULL) - biases_buf = (float*)bias_tensor->data; - float* col_buf = (float*)priv_info->im2col_buffer; - float* interleave_buf = (float*)priv_info->interleave_buffer; + int out_c_align8 = (out_c + 7) / 8 * 8; + int input_image_size = in_c * in_h * in_w; + int input_group_size = input_image_size * group; - int sgemm_set_chan = out_c / PER_OUT_CHAN * PER_OUT_CHAN; - int sgemm_set_remain = out_c % PER_OUT_CHAN; + float* col = info->im2col_buffer; // FIXME: split by [batch, group] + float* interleaved_kernel = info->interleave_buffer; - for (int n = 0; n < batch; n++) // batch size + for (int n = 0; n < batch; ++n) { - for (int g = 0; g < group; g++) + for (int g = 0; g < group; ++g) { - /* im2col */ - float* cur_input = input_buf + n * input_image_size + g * input_size; - im2col(cur_input, col_buf, in_c, in_w, in_h, kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h, - pad_w0, pad_w1, pad_h0, pad_h1, out_w, out_h, num_thread); + float* cur_input = input + n * input_image_size + g * input_size; + //output shape: [batch, group, output_xy/8, ksize, 8] + im2col(cur_input, col, in_c, in_w, in_h, k_w, k_h, s_w, s_h, d_w, d_h, p_w0, p_w1, p_h0, p_h1, out_w, out_h, num_thread); + + float* output_base = output + n * output_image_size + g * output_size; + //FIXME: out_chan_ 可能不是8对齐的 + int out_chan_ = 0; + for (; out_chan_ < out_c_align8; out_chan_ += PER_OUT_CHAN) + { + float* cur_kernel = 
interleaved_kernel + g * out_c_align8 * kernel_size + out_chan_ * kernel_size; + float* cur_bias = bias ? bias + g * out_c + out_chan_ : NULL; + float* cur_output = output_base + out_chan_ * out_xy; + const int n = min(8, out_c - out_chan_); + + int col_i = 0; + for (; col_i + 7 < out_xy; col_i += 8) + { + float* cur_col = col + col_i * kernel_size; + sgemm_8x8_rv64(cur_col, cur_kernel, cur_bias, act, cur_output + col_i, out_xy, kernel_size, n); + } + if (col_i < out_xy) + { + float result[64]; + float* cur_col = (col + col_i * kernel_size); + sgemm_8x8_rv64(cur_col, cur_kernel, cur_bias, act, result, 8, kernel_size, n); - /* gemm */ - float* cur_kernel = interleave_buf + g * kernel_size * out_c_align; - float* cur_output = output_buf + n * output_image_size + g * output_size; - float* cur_bias = biases_buf ? (biases_buf + g * out_c) : NULL; - sgemm_set(col_buf, cur_kernel, cur_bias, cur_output, kernel_size, 0, sgemm_set_chan, out_hw, act_type, - num_thread, cpu_affinity); - if (sgemm_set_remain) - sgemm4x4(col_buf, cur_kernel, cur_bias, cur_output, kernel_size, sgemm_set_chan, out_c, out_hw, - act_type, num_thread, cpu_affinity); + int col_end3 = (out_xy & 7); + + for (int i = 0; i < n; i++) + { + int j = 0; + for (; j < (col_end3); j++) + *(cur_output + i * out_xy + col_i + j) = result[(i << 3) + j]; + } + } + } } } diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.h b/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.h deleted file mode 100644 index f2f9051a6..000000000 --- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2021, OPEN AI LAB - * Author: ddzhao@openailab.com - */ - -#ifndef _CONV_KERNEL_RV64_H_ -#define _CONV_KERNEL_RV64_H_ - -#include "convolution_param.h" - -#include "graph/tensor.h" -#include "graph/node.h" -#include "graph/graph.h" -#include "module/module.h" -#include "operator/op.h" -#include "utility/sys_port.h" -#include "utility/log.h" -#include "device/cpu/cpu_node.h" -#include "device/cpu/cpu_graph.h" -#include "device/cpu/cpu_module.h" - -/* float32 */ -int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, - struct conv_priv_info* info, struct conv_param* param) __attribute__((weak)); - -int conv_hcl_postrun(struct conv_priv_info* info) __attribute__((weak)); - -int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, - int num_thread, int cpu_affinity) __attribute__((weak)); - -int conv_hcl_get_shared_mem_size_rv64(struct tensor* input_tensor, struct tensor* output_tensor, - struct conv_param* param); -int conv_hcl_get_shared_pack4_mem_size(struct tensor* input_tensor, struct tensor* output_tensor, - struct conv_param* param) __attribute__((weak)); - -int conv_hcl_set_shared_mem(struct conv_priv_info* priv_info, void* mem, int mem_size) __attribute__((weak)); - 
-int conv_hcl_set_shared_pack4_mem(struct conv_priv_info* priv_info, void* mem, int mem_size) __attribute__((weak)); - -#endif diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_1x1.S b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_1x1.S index 700fe7e55..e69de29bb 100644 --- a/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_1x1.S +++ b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_1x1.S @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -/* - * Copyright (c) 2021, OPEN AI LAB - * Author: ddzhao@openailab.com - */ -// -// im2col for kernel 1x1 s1p0d1 -// -// input: -// x0 arg0 input address -// x1 arg1 input_xy -// x2 arg2 col address -// x3 arg3 col_cnt must be multiply of 4 -// x4 arg4 input channel -// -// register definition -// x0 input address -// x1 input_xy x 4 -// x2 col address -// x3 col_cnt -// x4 input channel -// x6 input start pointer t6 -// x7 input pointer -// x9 channel cnt -// x11 -// x12 = input_xy size * 2 // x12 -> t5 - - .section .text,"ax" - .align 5 - - .type im2col_fp32_1x1 STT_FUNC - .global im2col_fp32_1x1 - .hidden im2col_fp32_1x1 -im2col_fp32_1x1: - addi sp, sp, -56 - sd t0, 0(sp) - sd t1, 8(sp) - sd t2, 16(sp) - sd t3, 24(sp) - sd t4, 32(sp) - sd t5, 40(sp) - sd t6, 48(sp) - vsetvli t0, a0, e32 - li t0, 4 - blt a3, t0, col_end - - srli a3, a3, 2 - - slli a1, a1, 2 - - mv t6, a0 - - slli t5, a1, 1 - - add t4, a4, 1 // x10 -> t4 - - // col loop -col_loop: - mv t3, t6 - srli t2, a4, 1 - beqz t2, channel_last - add t1, t3, a1 - // kernel size loop -channel_loop2: - vlw.v v0,(t3) - vlw.v v1,(t1) - addi t2, t2, -1 - add t3, t3, t5 - add t1, t1, t5 - vsw.v v0, (a2) - addi a2, a2, 16 - vsw.v v1, (a2) - addi a2, a2, 16 - bnez t2, channel_loop2 - -channel_last: - beqz t4, channel_loop_end - vlw.v v0,(t3) - vsw.v v0, (a2) - addi a2, a2, 16 - -channel_loop_end: - addi t6, t6, 16 - addi a3, a3, -1 - bnez a3, col_loop - -col_end: - ld t0, 0(sp) - ld t1, 8(sp) - ld t2, 16(sp) - ld t3, 24(sp) - ld t4, 32(sp) - ld t5, 40(sp) - ld t6, 48(sp) - addi sp, sp, 56 - ret - .end diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_1x1.c b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_1x1.c new file mode 100644 index 000000000..a6ffb1ed7 --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_1x1.c @@ -0,0 +1,40 @@ +#include "vsetvl_rvv.h" + +// FIXME: optimize vectorize loop +void im2col_fp32_1x1(const float* input, const int input_xy, const int 
input_channels, float* col) +{ + vsetvl_e32_m2(); + + const float* c0 = input; + const float* c1 = input + input_xy; + const int input_xy_stride = 2 * input_xy; + + float* o0 = col; + float* o1 = col + 8; + + int c = 0; + for (; c < (input_channels & -2); c += 2) + { + __asm__( + "vle32.v v0, (%0); \n" + "vle32.v v2, (%1); \n" + "vse32.v v0, (%2); \n" + "vse32.v v2, (%3); \n" + : + : "r"(c0), "r"(c1), "r"(o0), "r"(o1) + : "memory"); + o0 += 16; + o1 += 16; + c0 += input_xy_stride; + c1 += input_xy_stride; + } + + if (c < input_channels) + { + __asm__("vle32.v v0, (%0);\n" + "vse32.v v0, (%1);\n" + : + : "r"(c0), "r"(o0) + : "memory"); + } +} diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_3x3.S b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_3x3.S deleted file mode 100644 index d928093c6..000000000 --- a/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_3x3.S +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: ddzhao@openailab.com - */ -// -// im2col fp16 for kernel 3x3 include 2 function stride 1 and stride 2 -// ABCDABCD -// -// input: -// x0 arg0 input address -// x1 arg1 input_x -// x2 arg2 input_y -// x3 arg3 input channel cnt -// x4 arg4 col address -// x5 arg5 stride_x -// -// register definition -// x0 cl0 address q0 q1 d16 d17 d18 -// x1 input_x x 4 -// x2 input_xy x 4 -// x3 input channel -// x4 col address -// x5 stride_x -// x11 cl1 address q2 q3 d19 d20 d21 -// x12 cl2 address q4 q5 d22 d23 d24 - - .section .text,"ax" - .align 5 - - .type im2col_fp32_3x3 STT_FUNC - .global im2col_fp32_3x3 - .hidden im2col_fp32_3x3 - -.balign 16 -mask_32b: - .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff - -im2col_fp32_3x3: - addi sp, sp, -56 - sd t0, 0(sp) - sd t1, 8(sp) - sd t2, 16(sp) - sd t3, 24(sp) - sd t4, 32(sp) - sd t5, 40(sp) - sd t6, 48(sp) - vsetvli t0, a0, e32 - // initial - beqz a3, finish - li t0, 2 - slli a1, a1, 2 - mul a2, a2, a1 - add t5, a0, a1 - slli t1, a1, 1 - add t6, a0, t1 - li t2, 8 - beq a5, t0, stride2_channel_loop - -stride1_channel_loop: - vlw.v v0, (a0) - addi t0, a0, 16 - vlw.v v1, (t0) - vlw.v v2, (t5) - addi t0, t5, 16 - vlw.v v3, (t0) - vlw.v v4, (t6) - addi t0, t6, 16 - vlw.v v5, (t0) - - addi a3, a3, -1 - - addi t0, a0, 4 - vlw.v v16, (t0) - addi t0, a0, 8 - vlw.v v17, (t0) - add a0, a0, a2 - - addi t0, t5, 4 - vlw.v v19, (t0) - - addi t0, t5, 8 - vlw.v v20, (t0) - add t5, t5, a2 - addi t0, t6, 4 - vlw.v v22, (t0) - addi t0, t6, 8 - vlw.v v23, (t0) - add t6, t6, a2 - vsw.v v0, (a4) - addi a4, a4, 16 - vsw.v v16, (a4) - addi a4, a4, 16 - vsw.v v17, (a4) - addi a4, a4, 16 - vsw.v v2, (a4) - addi a4, a4, 16 - vsw.v v19, (a4) - addi a4, a4, 16 - vsw.v v20, (a4) - addi a4, a4, 16 - vsw.v v4, (a4) - addi a4, a4, 16 - vsw.v v22, (a4) - addi a4, a4, 16 - vsw.v v23, (a4) - addi a4, a4, 16 - bnez a3, stride1_channel_loop - j finish - 
-stride2_channel_loop: - la t0, mask_32b - vlw.v v0, (t0) - addi t0, a0, 0 - vlsw.v v16, (t0), t2 - addi t0, a0, 0x4 - vlsw.v v17, (t0), t2 - addi t0, a0, 32 - vlw.v v18, (t0) - vslidedown.vi v1, v16, 1 - vslideup.vi v2, v18, 3 - vmerge.vvm v18, v1, v2, v0 - - addi t0, t5, 0 - vlsw.v v19, (t0), t2 - addi t0, t5, 0x4 - vlsw.v v20, (t0), t2 - addi t0, t5, 0x20 - vlw.v v21, (t0) - vslidedown.vi v1, v19, 1 - vslideup.vi v2, v21, 3 - vmerge.vvm v21, v1, v2, v0 - - addi t0, t6, 0 - vlsw.v v22, (t0), t2 - addi t0, t6, 0x4 - vlsw.v v23, (t0), t2 - addi t0, t6, 0x20 - vlw.v v24, (t0) - vslidedown.vi v1, v22, 1 - vslideup.vi v2, v24, 3 - vmerge.vvm v24, v1, v2, v0 - - addi a3, a3, -1 - - vsw.v v16, (a4) - addi a4, a4, 0x10 - vsw.v v17, (a4) - addi a4, a4, 0x10 - vsw.v v18, (a4) - addi a4, a4, 0x10 - vsw.v v19, (a4) - addi a4, a4, 0x10 - vsw.v v20, (a4) - addi a4, a4, 0x10 - vsw.v v21, (a4) - addi a4, a4, 0x10 - vsw.v v22, (a4) - addi a4, a4, 0x10 - vsw.v v23, (a4) - addi a4, a4, 0x10 - vsw.v v24, (a4) - addi a4, a4, 0x10 - - add a0, a0, a2 - add t5, t5, a2 - add t6, t6, a2 - - bnez a3, stride2_channel_loop -finish: - ld t0, 0(sp) - ld t1, 8(sp) - ld t2, 16(sp) - ld t3, 24(sp) - ld t4, 32(sp) - ld t5, 40(sp) - ld t6, 48(sp) - addi sp, sp, 56 - ret - .end diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_3x3.c b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_3x3.c new file mode 100644 index 000000000..74f574057 --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_3x3.c @@ -0,0 +1,117 @@ +#include "vsetvl_rvv.h" + +void im2col_fp32_3x3(const float* input, const int input_x, const int input_y, const int input_channels, float* col, const int stride) +{ + vsetvl_e32_m2(); + const int in_xy = input_x * input_y; + const float* row0 = input; + const float* row1 = row0 + input_x; + const float* row2 = row1 + input_x; + float* cur_col = col; + + if (stride == 1) + { + for (int c = 0; c < input_channels; ++c) + { + asm("vle32.v v0, (%0);\n" + 
"vle32.v v2, (%1);\n" + "vle32.v v4, (%2);\n" + + "addi t0, %0, 4;\n" + "addi t1, %0, 8;\n" + + "vle32.v v6, (t0);\n" + "vle32.v v8, (t1);\n" + + "addi t0, %1, 4;\n" + "addi t1, %1, 8;\n" + + "vle32.v v10, (t0);\n" + "vle32.v v12, (t1);\n" + + "addi t0, %2, 4;\n" + "addi t1, %2, 8;\n" + + "vle32.v v14, (t0);\n" + "vle32.v v16, (t1);\n" + + "vse32.v v0, (%3);\n" + "addi t0, %3, 32;\n" + "vse32.v v6, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v8, (t0);\n" + "addi t0, t0, 32;\n" + + "vse32.v v2, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v10, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v12, (t0);\n" + "addi t0, t0, 32;\n" + + "vse32.v v4, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v14, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v16, (t0);\n" + "addi t0, t0, 32;\n" + : + : "r"(row0), "r"(row1), "r"(row2), "r"(cur_col) + : "t0", "t1", "memory"); + + row0 += in_xy; + row1 += in_xy; + row2 += in_xy; + cur_col += 72; + } + } + else + { + for (int c = 0; c < input_channels; ++c) + { + asm("li t0, 8;\n" + "vlse32.v v0, (%0), t0;\n" + "add t1, %0, 0x4;\n" + "vlse32.v v2, (t1), t0;\n" + "add t1, t1, 0x4;\n" + "vlse32.v v4, (t1), t0;\n" + + "vlse32.v v6, (%1), t0;\n" + "add t1, %1, 0x4;\n" + "vlse32.v v8, (t1), t0;\n" + "add t1, t1, 0x4;\n" + "vlse32.v v10, (t1), t0;\n" + + "vlse32.v v12, (%2), t0;\n" + "add t1, %2, 0x4;\n" + "vlse32.v v14, (t1), t0;\n" + "add t1, t1, 0x4;\n" + "vlse32.v v16, (t1), t0;\n" + + "vse32.v v0, (%3);\n" + "addi t0, %3, 32;\n" + "vse32.v v2, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v4, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v6, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v8, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v10, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v12, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v14, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v16, (t0);\n" + : + : "r"(row0), "r"(row1), "r"(row2), "r"(cur_col) + : "t0", "t1", "memory"); + row0 += in_xy; + row1 += in_xy; + row2 += in_xy; + cur_col += 72; + } + } +} diff --git 
a/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_tile8.c b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_tile8.c new file mode 100644 index 000000000..295d16cbb --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_tile8.c @@ -0,0 +1,189 @@ +#include +extern void im2col_fp32_1x1(const float* input, const int input_xy, const int input_chan, float* col); +extern void im2col_fp32_3x3(const float* input, int w, int h, int channel, float* cur_col, int stride); + +static void trans_col(float* input, float* cur_col, int col_i, int in_c, int in_h, int in_w, int k_w, int k_h, int s_w, int s_h, int pad_w0, int pad_h0, int out_w, int out_h, int d_h, int d_w) +{ + const int in_xy = in_w * in_h; + int cnt_y[] = { + col_i / out_w, + (col_i + 1) / out_w, + (col_i + 2) / out_w, + (col_i + 3) / out_w, + (col_i + 4) / out_w, + (col_i + 5) / out_w, + (col_i + 6) / out_w, + (col_i + 7) / out_w, + }; + + int cnt_x[] = { + col_i - cnt_y[0] * out_w, + col_i - cnt_y[1] * out_w + 1, + col_i - cnt_y[2] * out_w + 2, + col_i - cnt_y[3] * out_w + 3, + col_i - cnt_y[4] * out_w + 4, + col_i - cnt_y[5] * out_w + 5, + col_i - cnt_y[6] * out_w + 6, + col_i - cnt_y[7] * out_w + 7, + }; + + int imx_start[] = { + cnt_x[0] * s_w - pad_w0, + cnt_x[1] * s_w - pad_w0, + cnt_x[2] * s_w - pad_w0, + cnt_x[3] * s_w - pad_w0, + cnt_x[4] * s_w - pad_w0, + cnt_x[5] * s_w - pad_w0, + cnt_x[6] * s_w - pad_w0, + cnt_x[7] * s_w - pad_w0, + }; + + int imy_start[] = { + cnt_y[0] * s_h - pad_h0, + cnt_y[1] * s_h - pad_h0, + cnt_y[2] * s_h - pad_h0, + cnt_y[3] * s_h - pad_h0, + cnt_y[4] * s_h - pad_h0, + cnt_y[5] * s_h - pad_h0, + cnt_y[6] * s_h - pad_h0, + cnt_y[7] * s_h - pad_h0, + }; + + for (int kch = 0; kch < in_c; kch++) + { + for (int ky = 0; ky < (k_h * d_h); ky += d_h) + { + for (int kx = 0; kx < (k_w * d_w); kx += d_w) + { + int imx[8] = { + imx_start[0] + kx, + imx_start[1] + kx, + imx_start[2] + kx, + imx_start[3] + kx, + imx_start[4] + kx, + imx_start[5] + kx, + imx_start[6] + 
kx, + imx_start[7] + kx, + }; + + int imy[8] = { + imy_start[0] + ky, + imy_start[1] + ky, + imy_start[2] + ky, + imy_start[3] + ky, + imy_start[4] + ky, + imy_start[5] + ky, + imy_start[6] + ky, + imy_start[7] + ky, + }; + + for (int i = 0; i < 8; ++i) + { + if (imx[i] >= 0 && imx[i] < in_w && imy[i] >= 0 && imy[i] < in_h) + { + *cur_col++ = *(input + in_xy * kch + in_w * imy[i] + imx[i]); + } + else + { + *cur_col++ = .0f; + } + } + } + } + } +} + +void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k_w, int k_h, int s_w, int s_h, int d_w, + int d_h, int pad_w0, int pad_w1, int pad_h0, int pad_h1, int out_w, int out_h, int num_thread) +{ + const int kernel_size = k_w * k_h * in_c; + const int in_xy = in_w * in_h; + const int out_xy = out_w * out_h; + const int col_end7 = out_xy & 7; + const int is_pad0 = !(pad_h0 || pad_w0 || pad_h1 || pad_w1); + + if (k_w == 1 && k_h == 1 && s_w == 1 && s_h == 1) + { +#pragma omp parallel for num_threads(num_thread) + int col_i = 0; + for (; col_i < (out_xy & -8); col_i += 8) + { + float* cur_col = col + col_i * kernel_size; + + int imy0 = col_i / out_w; + int imy7 = (col_i + 7) / out_w; + int imx0 = col_i - imy0 * out_w; + int imx7 = (col_i + 7) - imy7 * out_w; + + int imx_start = imx0 * s_w - pad_w0; + int imx_end = imx7 * s_w - pad_w0; + int imy_start = imy0 * s_h - pad_h0; + int imy_end = imy7 * s_h - pad_h0; + + // is pad ? 
+ if (imy0 == imy7 && (is_pad0 || (imx_start >= 0 && imx_end < in_w && imy_start >= 0 && imy_end < in_h))) + { + const float* cur_input = input + imy_start * in_w + imx_start; + im2col_fp32_1x1(cur_input, in_xy, in_c, cur_col); + } + else + { + trans_col(input, cur_col, col_i, in_c, in_h, in_w, k_w, k_h, s_w, s_h, pad_w0, pad_h0, out_w, out_h, d_h, d_w); + } + } + + if (col_end7) + { + float* cur_col = col + col_i * kernel_size; + trans_col(input, cur_col, col_i, in_c, in_h, in_w, k_w, k_h, s_w, s_h, pad_w0, pad_h0, out_w, out_h, d_h, d_w); + } + } + else if (d_w == 1 && d_h == 1 && k_w == 3 && k_h == 3 && s_w == s_h) + { + int col_i = 0; + for (; col_i < (out_xy & -8); col_i += 8) + { + float* cur_col = col + col_i * kernel_size; + int imy0 = col_i / out_w; + int imy7 = (col_i + 7) / out_w; + int imx0 = col_i - imy0 * out_w; + int imx7 = (col_i + 7) - imy7 * out_w; + + int imx_start = imx0 * s_w - pad_w0; + int imx_end = imx7 * s_w - pad_w0; + int imy_start = imy0 * s_h - pad_h0; + int imy_end = imy7 * s_h - pad_h0; + if ((imy0 == imy7) && (is_pad0 || (imx_start >= 0 && imx_end < in_w - 8 && imy_start >= 0 && imy_end + 2 < in_h))) + { + float* cur_input = input + imy_start * in_w + imx_start; + im2col_fp32_3x3(cur_input, in_w, in_h, in_c, cur_col, s_w); + cur_col += 8 * kernel_size; + } + else + { + trans_col(input, cur_col, col_i, in_c, in_h, in_w, k_w, k_h, s_w, s_h, pad_w0, pad_h0, out_w, out_h, d_h, d_w); + } + } + + if (col_end7) + { + float* cur_col = col + col_i * kernel_size; + trans_col(input, cur_col, col_i, in_c, in_h, in_w, k_w, k_h, s_w, s_h, pad_w0, pad_h0, out_w, out_h, d_h, d_w); + } + } + else + { + int col_i = 0; + for (; col_i < (out_xy & -8); col_i += 8) + { + float* cur_col = col + col_i * kernel_size; + trans_col(input, cur_col, col_i, in_c, in_h, in_w, k_w, k_h, s_w, s_h, pad_w0, pad_h0, out_w, out_h, d_h, d_w); + } + + if (col_end7) + { + float* cur_col = col + col_i * kernel_size; + trans_col(input, cur_col, col_i, in_c, in_h, in_w, k_w, 
k_h, s_w, s_h, pad_w0, pad_h0, out_w, out_h, d_h, d_w); + } + } +} diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_4x16.S b/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_4x16.S deleted file mode 100644 index b8b7431ea..000000000 --- a/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_4x16.S +++ /dev/null @@ -1,690 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: ddzhao@openailab.com -*/ -// -// 4*16 single precise floating point matric multiplication -// -// -- -- -- -- -- -- -- -- -// | i0 - - - - - - | | k0 k1 .. kf | | b0 b1 .. bf | | i0k0 i0k1 .. i0kf | -// | | | . . . . | | | | | -// | i1 - - - - - - | | . . . . | | b0 b1 . bf | | i1k0 i1k1 .. i1kf | -// | | x | . . . . | + | | = | | -// | i2 - - - - - - | | . . . . | | b0 b1 . bf | | i2k0 i2k1 .. i2kf | -// | | | . . . . | | | | | -// | i3 - - - - - - | | . . . . | | b0 b1 . bf | | i3k0 i3k1 .. 
i3kf | -// -- -- -- -- -- -- -- -- -// input 4 x p kernel p x 16 biases 4 x 16 output 4 x 16 p = kernel size -// -// -// load 4 more input and 8 more kernel to improve loop performance -// -// input: -// x0 arg0 biases address {b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,b11,b12,b13,b14,b15} nullptr means no biases -// x1 arg1 input address {i[0-3][0],i1[0-3][1],i[0-3][2],i[0-3][3],i[0-3][4],...} -// x2 arg2 kernel address {k[0-15][0],k[0-15][1],k[0-15][2],k[0-15][3],...} -// x3 arg3 kernel size -// x4 arg4 output address -// indirect save: output {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3],i[0-3]k[4]..} -// direct save: output : {i0k0 i1k0 i2k0 i3k0} -// output + ouput_xy : {i0k1 i1k1 i2k1 i3k1} -// output + ouput_xy * 2 : {i0k2 i1k2 i2k2 i3k2} -// ... -// output + ouput_xy * 15 : {i0k15 i1k15 i2k15 i3k15} -// x5 arg5 output xy -// x6 arg6 activation flag activation layers is integrated after convolution -// -// output: no -// -// register definition -// x0 biases start address -// x1 input start address -// x2 kernel start address -// x3 kernal size -// x4 output start address -// x5 output_x * output_y -// x6 activation flag -// x9 ~ x10 temp loop counter -// x11~ x13 temp output save address -// x14 output_xy * 4 -// x7~8 x15 not used -// x9 t1 -// x10 t2 -// x11 t3 -// x12 t4 -// x13 t5 -// x14 t6 -// -// v0~1 4S data of input0 {i3 i2 i1 i0} -// v2~3 not used -// v4 4S kernal data {k3 | k2 | k1 | k0} -// v5 4S kernal data {k7 | k6 | k5 | k4} -// v6 4S kernal data {kb | ka | k9 | k8} -// v7 4S kernal data {kf | ke | kd | kc} -// v8~15 not used -// v16 dot product for {i3k0, i2k0, i1k0, i0k0} -// v17 dot product for {i3k1, i2k1, i1k1, i0k1} -// v18 dot product for {i3k2, i2k2, i1k2, i0k2} -// v19 dot product for {i3k3, i2k3, i1k3, i0k3} -// v20 dot product for {i3k4, i2k4, i1k4, i0k4} -// v21 dot product for {i3k5, i2k5, i1k5, i0k5} -// v22 dot product for {i3k6, i2k6, i1k6, i0k6} -// v23 dot product for {i3k7, i2k7, i1k7, i0k7} -// v24 dot product for {i3k8, i2k8, i1k8, 
i0k8} -// v25 dot product for {i3k9, i2k9, i1k9, i0k9} -// v26 dot product for {i3ka, i2ka, i1ka, i0ka} -// v27 dot product for {i3kb, i2kb, i1kb, i0kb} -// v28 dot product for {i3kc, i2kc, i1kc, i0kc} -// v29 dot product for {i3kd, i2kd, i1kd, i0kd} -// v30 dot product for {i3ke, i2ke, i1ke, i0ke} -// v31 dot product for {i3kf, i2kf, i1kf, i0kf} - - .section .text,"ax" - .align 5 - - .type sgemm_4x16_rv64 STT_FUNC - .global sgemm_4x16_rv64 - .hidden sgemm_4x16_rv64 -sgemm_4x16_rv64: - addi sp, sp, -56 - sd t0, 0(sp) - sd t1, 8(sp) - sd t2, 16(sp) - sd t3, 24(sp) - sd t4, 32(sp) - sd t5, 40(sp) - sd t6, 48(sp) - vsetvli t0, t1, e32 -# // biases_initial - beqz a0, none_biases - vlw.v v0, (a0) - vrgather.vi v16, v0, 0 - vrgather.vi v17, v0, 1 - vrgather.vi v18, v0, 2 - vrgather.vi v19, v0, 3 - addi a0, a0, 0x10 - vlw.v v0, (a0) - vrgather.vi v20, v0, 0 - vrgather.vi v21, v0, 1 - vrgather.vi v22, v0, 2 - vrgather.vi v23, v0, 3 - addi a0, a0, 0x10 - vlw.v v0, (a0) - vrgather.vi v24, v0, 0 - vrgather.vi v25, v0, 1 - vrgather.vi v26, v0, 2 - vrgather.vi v27, v0, 3 - addi a0, a0, 0x10 - vlw.v v0, (a0) - vrgather.vi v28, v0, 0 - vrgather.vi v29, v0, 1 - vrgather.vi v30, v0, 2 - vrgather.vi v31, v0, 3 - - j convolution_start - -none_biases: - vmv.v.x v16, x0 - vmv.v.x v17, x0 - vmv.v.x v18, x0 - vmv.v.x v19, x0 - vmv.v.x v20, x0 - vmv.v.x v21, x0 - vmv.v.x v22, x0 - vmv.v.x v23, x0 - vmv.v.x v24, x0 - vmv.v.x v25, x0 - vmv.v.x v26, x0 - vmv.v.x v27, x0 - vmv.v.x v28, x0 - vmv.v.x v29, x0 - vmv.v.x v30, x0 - vmv.v.x v31, x0 - -convolution_start: - vlw.v v0, (a1) - addi t0, a2, 0 - vlw.v v4, (t0) - addi t0, a2, 0x10 - vlw.v v5, (t0) - - andi t2, a3, 0x3 - slli a5, a5, 0x2 - bltz t2, loop4_end - srli t1, a3, 0x2 - -// main loop each loop generate dot prodcut for 4x16x4SP -loop4: - addi t1, t1, -1 - addi t0, a2, 0x20 - vlw.v v6, (t0) - addi t0, a2, 0x30 - vlw.v v7, (t0) - - vrgather.vi v8, v4, 0 - vrgather.vi v9, v4, 1 - vrgather.vi v10, v4, 2 - vrgather.vi v11, v4, 3 - 
vfmacc.vv v16, v0, v8 - vfmacc.vv v17, v0, v9 - vfmacc.vv v18, v0, v10 - vfmacc.vv v19, v0, v11 - - addi t0, a1, 0x10 - vlw.v v1, (t0) - - vrgather.vi v8, v5, 0 - vrgather.vi v9, v5, 1 - vrgather.vi v10, v5, 2 - vrgather.vi v11, v5, 3 - vfmacc.vv v20, v0, v8 - vfmacc.vv v21, v0, v9 - vfmacc.vv v22, v0, v10 - vfmacc.vv v23, v0, v11 - - addi t0, a2, 0x40 - vlw.v v4, (t0) - addi t0, a2, 0x50 - vlw.v v5, (t0) - - vrgather.vi v8, v6, 0 - vrgather.vi v9, v6, 1 - vrgather.vi v10, v6, 2 - vrgather.vi v11, v6, 3 - vfmacc.vv v24, v0, v8 - vfmacc.vv v25, v0, v9 - vfmacc.vv v26, v0, v10 - vfmacc.vv v27, v0, v11 - - vrgather.vi v8, v7, 0 - vrgather.vi v9, v7, 1 - vrgather.vi v10, v7, 2 - vrgather.vi v11, v7, 3 - vfmacc.vv v28, v0, v8 - vfmacc.vv v29, v0, v9 - vfmacc.vv v30, v0, v10 - vfmacc.vv v31, v0, v11 - - addi t0, a2, 0x60 - vlw.v v6, (t0) - addi t0, a2, 0x70 - vlw.v v7, (t0) - - vrgather.vi v8, v4, 0 - vrgather.vi v9, v4, 1 - vrgather.vi v10, v4, 2 - vrgather.vi v11, v4, 3 - vfmacc.vv v16, v1, v8 - vfmacc.vv v17, v1, v9 - vfmacc.vv v18, v1, v10 - vfmacc.vv v19, v1, v11 - - addi t0, a1, 0x20 - vlw.v v0, (t0) - - vrgather.vi v8, v5, 0 - vrgather.vi v9, v5, 1 - vrgather.vi v10, v5, 2 - vrgather.vi v11, v5, 3 - vfmacc.vv v20, v1, v8 - vfmacc.vv v21, v1, v9 - vfmacc.vv v22, v1, v10 - vfmacc.vv v23, v1, v11 - - addi t0, a2, 0x80 - vlw.v v4, (t0) - addi t0, a2, 0x90 - vlw.v v5, (t0) - - vrgather.vi v8, v6, 0 - vrgather.vi v9, v6, 1 - vrgather.vi v10, v6, 2 - vrgather.vi v11, v6, 3 - vfmacc.vv v24, v1, v8 - vfmacc.vv v25, v1, v9 - vfmacc.vv v26, v1, v10 - vfmacc.vv v27, v1, v11 - - vrgather.vi v8, v7, 0 - vrgather.vi v9, v7, 1 - vrgather.vi v10, v7, 2 - vrgather.vi v11, v7, 3 - vfmacc.vv v28, v1, v8 - vfmacc.vv v29, v1, v9 - vfmacc.vv v30, v1, v10 - vfmacc.vv v31, v1, v11 - - addi t0, a2, 0xa0 - vlw.v v6, (t0) - addi t0, a2, 0xb0 - vlw.v v7, (t0) - - vrgather.vi v8, v4, 0 - vrgather.vi v9, v4, 1 - vrgather.vi v10, v4, 2 - vrgather.vi v11, v4, 3 - vfmacc.vv v16, v0, v8 - vfmacc.vv 
v17, v0, v9 - vfmacc.vv v18, v0, v10 - vfmacc.vv v19, v0, v11 - - addi t0, a1, 0x30 - vlw.v v1, (t0) - addi a1, a1, 0x40 - - vrgather.vi v8, v5, 0 - vrgather.vi v9, v5, 1 - vrgather.vi v10, v5, 2 - vrgather.vi v11, v5, 3 - vfmacc.vv v20, v0, v8 - vfmacc.vv v21, v0, v9 - vfmacc.vv v22, v0, v10 - vfmacc.vv v23, v0, v11 - - addi t0, a2, 0xc0 - vlw.v v4, (t0) - addi t0, a2, 0xd0 - vlw.v v5, (t0) - - vrgather.vi v8, v6, 0 - vrgather.vi v9, v6, 1 - vrgather.vi v10, v6, 2 - vrgather.vi v11, v6, 3 - vfmacc.vv v24, v0, v8 - vfmacc.vv v25, v0, v9 - vfmacc.vv v26, v0, v10 - vfmacc.vv v27, v0, v11 - - vrgather.vi v8, v7, 0 - vrgather.vi v9, v7, 1 - vrgather.vi v10, v7, 2 - vrgather.vi v11, v7, 3 - vfmacc.vv v28, v0, v8 - vfmacc.vv v29, v0, v9 - vfmacc.vv v30, v0, v10 - vfmacc.vv v31, v0, v11 - - addi t0, a2, 0xe0 - vlw.v v6, (t0) - addi t0, a2, 0xf0 - vlw.v v7, (t0) - addi a2, a2, 0x100 - vrgather.vi v8, v4, 0 - vrgather.vi v9, v4, 1 - vrgather.vi v10, v4, 2 - vrgather.vi v11, v4, 3 - vfmacc.vv v16, v1, v8 - vfmacc.vv v17, v1, v9 - vfmacc.vv v18, v1, v10 - vfmacc.vv v19, v1, v11 - - vlw.v v0, (a1) - - vrgather.vi v8, v5, 0 - vrgather.vi v9, v5, 1 - vrgather.vi v10, v5, 2 - vrgather.vi v11, v5, 3 - vfmacc.vv v20, v1, v8 - vfmacc.vv v21, v1, v9 - vfmacc.vv v22, v1, v10 - vfmacc.vv v23, v1, v11 - - addi t0, a2, 0x0 - vlw.v v4, (t0) - addi t0, a2, 0x10 - vlw.v v5, (t0) - - vrgather.vi v8, v6, 0 - vrgather.vi v9, v6, 1 - vrgather.vi v10, v6, 2 - vrgather.vi v11, v6, 3 - vfmacc.vv v24, v1, v8 - vfmacc.vv v25, v1, v9 - vfmacc.vv v26, v1, v10 - vfmacc.vv v27, v1, v11 - - vrgather.vi v8, v7, 0 - vrgather.vi v9, v7, 1 - vrgather.vi v10, v7, 2 - vrgather.vi v11, v7, 3 - vfmacc.vv v28, v1, v8 - vfmacc.vv v29, v1, v9 - vfmacc.vv v30, v1, v10 - vfmacc.vv v31, v1, v11 - bnez t1, loop4 - -loop4_end: - slli t6, a5, 2 - beqz t2, activation - -loop1: - addi t0, a2, 0x20 - vlw.v v6, (t0) - addi t0, a2, 0x30 - vlw.v v7, (t0) - addi a2, a2, 0x40 - vrgather.vi v8, v4, 0 - vrgather.vi v9, v4, 1 - 
vrgather.vi v10, v4, 2 - vrgather.vi v11, v4, 3 - vfmacc.vv v16, v0, v8 - vfmacc.vv v17, v0, v9 - vfmacc.vv v18, v0, v10 - vfmacc.vv v19, v0, v11 - addi a1, a1, 0x10 - addi t2, t2, -1 - vrgather.vi v8, v5, 0 - vrgather.vi v9, v5, 1 - vrgather.vi v10, v5, 2 - vrgather.vi v11, v5, 3 - vfmacc.vv v20, v0, v8 - vfmacc.vv v21, v0, v9 - vfmacc.vv v22, v0, v10 - vfmacc.vv v23, v0, v11 - addi t0, a2, 0x0 - vlw.v v4, (t0) - addi t0, a2, 0x10 - vlw.v v5, (t0) - vrgather.vi v8, v6, 0 - vrgather.vi v9, v6, 1 - vrgather.vi v10, v6, 2 - vrgather.vi v11, v6, 3 - vfmacc.vv v24, v0, v8 - vfmacc.vv v25, v0, v9 - vfmacc.vv v26, v0, v10 - vfmacc.vv v27, v0, v11 - vrgather.vi v8, v7, 0 - vrgather.vi v9, v7, 1 - vrgather.vi v10, v7, 2 - vrgather.vi v11, v7, 3 - vfmacc.vv v28, v0, v8 - vfmacc.vv v29, v0, v9 - vfmacc.vv v30, v0, v10 - vfmacc.vv v31, v0, v11 - - vlw.v v0, (a1) - bnez t2, loop1 - -activation: - add t3, a4, a5 - bltz a6, save_result - vmv.v.x v0, x0 - vmv.v.x v0, a6 // FIXME: change DataType - vfmax.vv v16, v16, v0 - vfmax.vv v17, v17, v0 - vfmax.vv v18, v18, v0 - vfmax.vv v19, v19, v0 - vfmax.vv v20, v20, v0 - vfmax.vv v21, v21, v0 - vfmax.vv v22, v22, v0 - vfmax.vv v23, v23, v0 - vfmax.vv v24, v24, v0 - vfmax.vv v25, v25, v0 - vfmax.vv v26, v26, v0 - vfmax.vv v27, v27, v0 - vfmax.vv v28, v28, v0 - vfmax.vv v29, v29, v0 - vfmax.vv v30, v30, v0 - vfmax.vv v31, v31, v0 - - beqz a6, save_result - vfmin.vv v16, v16, v1 - vfmin.vv v17, v17, v1 - vfmin.vv v18, v18, v1 - vfmin.vv v19, v19, v1 - vfmin.vv v20, v20, v1 - vfmin.vv v21, v21, v1 - vfmin.vv v22, v22, v1 - vfmin.vv v23, v23, v1 - vfmin.vv v24, v24, v1 - vfmin.vv v25, v25, v1 - vfmin.vv v26, v26, v1 - vfmin.vv v27, v27, v1 - vfmin.vv v28, v28, v1 - vfmin.vv v29, v29, v1 - vfmin.vv v30, v30, v1 - vfmin.vv v31, v31, v1 - -save_result: - slli t0, a5, 1 - add t4, a4, t0 - add t5, t3, t0 -# // store result - beqz a7, save_result_nchw - li t1, 0 - vext.x.v t0, v16, t1 - sw t0, 0(a4) - vext.x.v t0, v17, t1 - sw t0, 4(a4) - 
vext.x.v t0, v18, t1 - sw t0, 8(a4) - vext.x.v t0, v19, t1 - sw t0, 12(a4) - add a4, a4, 0x10 - - li t1, 1 - vext.x.v t0, v16, t1 - sw t0, 0(t3) - vext.x.v t0, v17, t1 - sw t0, 4(t3) - vext.x.v t0, v18, t1 - sw t0, 8(t3) - vext.x.v t0, v19, t1 - sw t0, 12(t3) - add t3, t3, 0x10 - - li t1, 2 - vext.x.v t0, v16, t1 - sw t0, 0(t4) - vext.x.v t0, v17, t1 - sw t0, 4(t4) - vext.x.v t0, v18, t1 - sw t0, 8(t4) - vext.x.v t0, v19, t1 - sw t0, 12(t4) - add t4, t4, 0x10 - - li t1, 3 - vext.x.v t0, v16, t1 - sw t0, 0(t5) - vext.x.v t0, v17, t1 - sw t0, 4(t5) - vext.x.v t0, v18, t1 - sw t0, 8(t5) - vext.x.v t0, v19, t1 - sw t0, 12(t5) - add t5, t5, 0x10 - - li t1, 0 - vext.x.v t0, v20, t1 - sw t0, 0(a4) - vext.x.v t0, v21, t1 - sw t0, 4(a4) - vext.x.v t0, v22, t1 - sw t0, 8(a4) - vext.x.v t0, v23, t1 - sw t0, 12(a4) - add a4, a4, 0x10 - - li t1, 1 - vext.x.v t0, v20, t1 - sw t0, 0(t3) - vext.x.v t0, v21, t1 - sw t0, 4(t3) - vext.x.v t0, v22, t1 - sw t0, 8(t3) - vext.x.v t0, v23, t1 - sw t0, 12(t3) - add t3, t3, 0x10 - - li t1, 2 - vext.x.v t0, v20, t1 - sw t0, 0(t4) - vext.x.v t0, v21, t1 - sw t0, 4(t4) - vext.x.v t0, v22, t1 - sw t0, 8(t4) - vext.x.v t0, v23, t1 - sw t0, 12(t4) - add t3, t3, 0x10 - - li t1, 3 - vext.x.v t0, v20, t1 - sw t0, 0(t5) - vext.x.v t0, v21, t1 - sw t0, 4(t5) - vext.x.v t0, v22, t1 - sw t0, 8(t5) - vext.x.v t0, v23, t1 - sw t0, 12(t5) - add t5, t5, 0x10 - - li t1, 0 - vext.x.v t0, v24, t1 - sw t0, 0(a4) - vext.x.v t0, v25, t1 - sw t0, 4(a4) - vext.x.v t0, v26, t1 - sw t0, 8(a4) - vext.x.v t0, v27, t1 - sw t0, 12(a4) - add a4, a4, 0x10 - - li t1, 1 - vext.x.v t0, v24, t1 - sw t0, 0(t3) - vext.x.v t0, v25, t1 - sw t0, 4(t3) - vext.x.v t0, v26, t1 - sw t0, 8(t3) - vext.x.v t0, v27, t1 - sw t0, 12(t3) - add t3, t3, 0x10 - - li t1, 2 - vext.x.v t0, v24, t1 - sw t0, 0(t4) - vext.x.v t0, v25, t1 - sw t0, 4(t4) - vext.x.v t0, v26, t1 - sw t0, 8(t4) - vext.x.v t0, v27, t1 - sw t0, 12(t4) - add t3, t3, 0x10 - - li t1, 3 - vext.x.v t0, v24, t1 - sw t0, 0(t5) - 
vext.x.v t0, v25, t1 - sw t0, 4(t5) - vext.x.v t0, v26, t1 - sw t0, 8(t5) - vext.x.v t0, v27, t1 - sw t0, 12(t5) - add t5, t5, 0x10 - - li t1, 0 - vext.x.v t0, v28, t1 - sw t0, 0(a4) - vext.x.v t0, v29, t1 - sw t0, 4(a4) - vext.x.v t0, v30, t1 - sw t0, 8(a4) - vext.x.v t0, v31, t1 - sw t0, 12(a4) - - li t1, 1 - vext.x.v t0, v28, t1 - sw t0, 0(t3) - vext.x.v t0, v29, t1 - sw t0, 4(t3) - vext.x.v t0, v30, t1 - sw t0, 8(t3) - vext.x.v t0, v31, t1 - sw t0, 12(t3) - - li t1, 2 - vext.x.v t0, v28, t1 - sw t0, 0(t4) - vext.x.v t0, v29, t1 - sw t0, 4(t4) - vext.x.v t0, v30, t1 - sw t0, 8(t4) - vext.x.v t0, v31, t1 - sw t0, 12(t4) - - li t1, 3 - vext.x.v t0, v28, t1 - sw t0, 0(t5) - vext.x.v t0, v29, t1 - sw t0, 4(t5) - vext.x.v t0, v30, t1 - sw t0, 8(t5) - vext.x.v t0, v31, t1 - sw t0, 12(t5) - - j end - -save_result_nchw: - vsw.v v16, (a4) - add a4, a4, t6 - vsw.v v17, (t3) - add t3, t3, t6 - vsw.v v18, (t4) - add t4, t4, t6 - vsw.v v19, (t5) - add t5, t5, t6 - - vsw.v v20, (a4) - add a4, a4, t6 - vsw.v v21, (t3) - add t3, t3, t6 - vsw.v v22, (t4) - add t4, t4, t6 - vsw.v v23, (t5) - add t5, t5, t6 - - vsw.v v24, (a4) - add a4, a4, t6 - vsw.v v25, (t3) - add t3, t3, t6 - vsw.v v26, (t4) - add t4, t4, t6 - vsw.v v27, (t5) - add t5, t5, t6 - - vsw.v v28, (a4) - vsw.v v29, (t3) - vsw.v v30, (t4) - vsw.v v31, (t5) - -end: - ld t0, 0(sp) - ld t1, 8(sp) - ld t2, 16(sp) - ld t3, 24(sp) - ld t4, 32(sp) - ld t5, 40(sp) - ld t6, 48(sp) - addi sp, sp, 56 - ret - .end \ No newline at end of file diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_4x4.S b/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_4x4.S deleted file mode 100644 index c9ce7b8c8..000000000 --- a/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_4x4.S +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: ddzhao@openailab.com - */ -// -// 4*4 single precise floating point matric multiplication -// -// -- -- -- -- -- -- -- -- -// | i0 - - - - - - | | k0 k1 k2 k3 | | b0 b1 b2 b3 | | i0k0 i0k1 i0k2 i0k3 | -// | | | . . . . | | | | | -// | i1 - - - - - - | | . . . . | | b0 b1 b2 b3 | | i1k0 i1k1 i1k2 i1k3 | -// | | x | . . . . | + | | = | | -// | i2 - - - - - - | | . . . . | | b0 b1 b2 b3 | | i2k0 i2k1 i2k2 i2k3 | -// | | | . . . . | | | | | -// | i3 - - - - - - | | . . . . 
| | b0 b1 b2 b3 | | i3k0 i3k1 i3k2 i3k3 | -// -- -- -- -- -- -- -- -- -// input 4 x p kernel p x 4 biases 4 x 4 output 4 x 4 p = kernel size -// -// -// -// input: -// x0 arg0 biases address {b0,b1,b2,b3} nullptr means no biases -// x1 arg1 input address {i[0-3][0],i1[0-3][1],i[0-3][2],i[0-3][3],i[0-3][4],...} -// x2 arg2 kernel address {k[0-3][0],k[0-3][1],k[0-3][2],k[0-3][3],...} -// x3 arg3 kernel size -// x4 arg4 output address -// indirect save: output {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3]} -// direct save: output : {i0k0 i1k0 i2k0 i3k0} -// output + ouput_xy : {i0k1 i1k1 i2k1 i3k1} -// output + ouput_xy * 2 : {i0k2 i1k2 i2k2 i3k2} -// output + ouput_xy * 3 : {i0k3 i1k3 i2k3 i3k3} -// x5 arg5 output xy -// x6 arg6 activation flag relu layers is integrated after convolution -// -// output: no -// -// register definition -// x0 biases start address -// x1 input start address -// x2 kernel start address -// x3 kernal size -// x4 output start address -// x5 output_x * output_y -// x6 fused relu flag -// x9 ~ x10 temp loop counter -// x11~ x13 temp output save address -// x7~8 14~15 not used - -// -// v0-3 4S data of input0 {i3 i2 i1 i0} -// v4-7 4S kernal data {k3 k2 k1 k0} -// v8~v15 not used -// v16 dot product for {i3k0, i2k0, i1k0, i0k0} -// v17 dot product for {i3k1, i2k1, i1k1, i0k1} -// v18 dot product for {i3k2, i2k2, i1k2, i0k2} -// v19 dot product for {i3k3, i2k3, i1k3, i0k3} -// v20~V31 not used - - .section .text,"ax" - .align 5 - - .type sgemm_4x4_rv64 STT_FUNC - .global sgemm_4x4_rv64 - .hidden sgemm_4x4_rv64 -sgemm_4x4_rv64: - slli a5, a5, 0x2 -# // initial biases - beqz a0, non_biases - vsetvli t0, a0, e32 - vlw.v v0, (a0) - vrgather.vi v16, v0, 0 - vrgather.vi v17, v0, 1 - vrgather.vi v18, v0, 2 - vrgather.vi v19, v0, 3 - - j convoluation_start - -non_biases: - vmv.v.x v16, x0 - vmv.v.x v17, x0 - vmv.v.x v18, x0 - vmv.v.x v19, x0 - -convoluation_start: - add t4, a4, a5 - - andi t3, a3, 0x3 - - li t0, 4 - blt a3, t0, loop4_end - srli t2, a3, 
0x2 - -// main loop: each loop generate dot prodcut for 4x4SFP -loop4: - addi t2, t2, -1 - - vlw.v v0, (a1) - addi a1, a1, 16 - vlw.v v1, (a1) - addi a1, a1, 16 - vlw.v v2, (a1) - addi a1, a1, 16 - vlw.v v3, (a1) - addi a1, a1, 16 - - vlw.v v4, (a2) - addi a2, a2, 16 - vlw.v v5, (a2) - addi a2, a2, 16 - vlw.v v6, (a2) - addi a2, a2, 16 - vlw.v v7, (a2) - addi a2, a2, 16 - - vrgather.vi v20, v4, 0 - vrgather.vi v21, v4, 1 - vrgather.vi v22, v4, 2 - vrgather.vi v23, v4, 3 - vfmacc.vv v16, v20, v0 - vfmacc.vv v17, v21, v0 - vfmacc.vv v18, v22, v0 - vfmacc.vv v19, v23, v0 - - vrgather.vi v20, v5, 0 - vrgather.vi v21, v5, 1 - vrgather.vi v22, v5, 2 - vrgather.vi v23, v5, 3 - vfmacc.vv v16, v20, v1 - vfmacc.vv v17, v21, v1 - vfmacc.vv v18, v22, v1 - vfmacc.vv v19, v23, v1 - - vrgather.vi v20, v6, 0 - vrgather.vi v21, v6, 1 - vrgather.vi v22, v6, 2 - vrgather.vi v23, v6, 3 - vfmacc.vv v16, v20, v2 - vfmacc.vv v17, v21, v2 - vfmacc.vv v18, v22, v2 - vfmacc.vv v19, v23, v2 - - vrgather.vi v20, v7, 0 - vrgather.vi v21, v7, 1 - vrgather.vi v22, v7, 2 - vrgather.vi v23, v7, 3 - vfmacc.vv v16, v20, v3 - vfmacc.vv v17, v21, v3 - vfmacc.vv v18, v22, v3 - vfmacc.vv v19, v23, v3 - - bnez t2, loop4 - -loop4_end: - slli t0, a5, 1 - add t5, a4, t0 - beqz t3, activation - -loop1: - addi t3, t3, -1 - - vlw.v v0, (a1) - addi a1, a1, 16 - - vlw.v v4, (a2) - addi a2, a2, 16 - - vrgather.vi v20, v4, 0 - vrgather.vi v21, v4, 1 - vrgather.vi v22, v4, 2 - vrgather.vi v23, v4, 3 - vfmacc.vv v16, v20, v0 - vfmacc.vv v17, v21, v0 - vfmacc.vv v18, v22, v0 - vfmacc.vv v19, v23, v0 - - bnez t3, loop1 - - -activation: - slli t0, a5, 1 - add t6, t4, t0 - - bltz a6, save_result - - vmv.v.i v0, 0 - vmv.v.x v1, a6 - - vfmax.vv v16, v16, v0 - vfmax.vv v17, v17, v0 - vfmax.vv v18, v18, v0 - vfmax.vv v19, v19, v0 - - beqz a6, save_result - vfmin.vv v16, v16, v1 - vfmin.vv v17, v17, v1 - vfmin.vv v18, v18, v1 - vfmin.vv v19, v19, v1 - -save_result: -# // store result - beqz a7, save_result_nchw - - li t1, 0 
- vext.x.v t0, v16, t1 - sw t0, 0(a4) - vext.x.v t0, v17, t1 - sw t0, 4(a4) - vext.x.v t0, v18, t1 - sw t0, 8(a4) - vext.x.v t0, v19, t1 - sw t0, 12(a4) - - li t1, 1 - vext.x.v t0, v16, t1 - sw t0, 0(t4) - vext.x.v t0, v17, t1 - sw t0, 4(t4) - vext.x.v t0, v18, t1 - sw t0, 8(t4) - vext.x.v t0, v19, t1 - sw t0, 12(t4) - - li t1, 2 - vext.x.v t0, v16, t1 - sw t0, 0(t5) - vext.x.v t0, v17, t1 - sw t0, 4(t5) - vext.x.v t0, v18, t1 - sw t0, 8(t5) - vext.x.v t0, v19, t1 - sw t0, 12(t5) - - li t1, 3 - vext.x.v t0, v16, t1 - sw t0, 0(t6) - vext.x.v t0, v17, t1 - sw t0, 4(t6) - vext.x.v t0, v18, t1 - sw t0, 8(t6) - vext.x.v t0, v19, t1 - sw t0, 12(t6) - j end - -save_result_nchw: - vsw.v v16, (a4) - vsw.v v17, (t4) - vsw.v v18, (t5) - vsw.v v19, (t6) - -end: - ret - .end - diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_8x8.c b/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_8x8.c new file mode 100644 index 000000000..832123b97 --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_8x8.c @@ -0,0 +1,308 @@ +#include "vsetvl_rvv.h" + +void sgemm_8x8_rv64(const float* cur_col, const float* cur_kernel, const float* bias, const int act, float* cur_output, const int output_xy, const int kernel_size, const int n) +{ + vsetvl_e32_m2(); + + // v16 ~ v30: result of c0 ~ v7 + if (bias) + { + asm("vle32.v v0, (%0);\n" + "vrgather.vi v16, v0, 0;\n" + "vrgather.vi v18, v0, 1;\n" + "vrgather.vi v20, v0, 2;\n" + "vrgather.vi v22, v0, 3;\n" + "vrgather.vi v24, v0, 4;\n" + "vrgather.vi v26, v0, 5;\n" + "vrgather.vi v28, v0, 6;\n" + "vrgather.vi v30, v0, 7;\n" + : + : "r"(bias)); + } + else + { + asm( + "vmv.v.x v16, x0;\n" + "vmv.v.x v18, x0;\n" + "vmv.v.x v20, x0;\n" + "vmv.v.x v22, x0;\n" + "vmv.v.x v24, x0;\n" + "vmv.v.x v26, x0;\n" + "vmv.v.x v28, x0;\n" + "vmv.v.x v30, x0;\n"); + } + + const float* k0 = cur_kernel; + const float* k1 = k0 + 8; + const float* k2 = k1 + 8; + const float* k3 = k2 + 8; + + const float* col0 = cur_col; + const float* col1 = col0 + 8; + 
const float* col2 = col1 + 8; + const float* col3 = col2 + 8; + + int k = 0; + for (; k < (kernel_size & -4); k += 4) + { + asm( + "vle32.v v0, (%0);\n" + "vle32.v v2, (%4);\n" + "vle32.v v4, (%1);\n" + "vle32.v v6, (%5);\n" + + "vrgather.vi v8, v2, 0;\n" + "vrgather.vi v10, v2, 1;\n" + "vrgather.vi v12, v2, 2;\n" + "vrgather.vi v14, v2, 3;\n" + + "vfmacc.vv v16, v0, v8;\n" + "vfmacc.vv v18, v0, v10;\n" + "vfmacc.vv v20, v0, v12;\n" + "vfmacc.vv v22, v0, v14;\n" + + "vrgather.vi v8, v2, 4;\n" + "vrgather.vi v10, v2, 5;\n" + "vrgather.vi v12, v2, 6;\n" + "vrgather.vi v14, v2, 7;\n" + + "vfmacc.vv v24, v0, v8;\n" + "vfmacc.vv v26, v0, v10;\n" + "vfmacc.vv v28, v0, v12;\n" + "vfmacc.vv v30, v0, v14;\n" + + "vrgather.vi v8, v6, 0;\n" + "vrgather.vi v10, v6, 1;\n" + "vrgather.vi v12, v6, 2;\n" + "vrgather.vi v14, v6, 3;\n" + + "vfmacc.vv v16, v4, v8;\n" + "vfmacc.vv v18, v4, v10;\n" + "vfmacc.vv v20, v4, v12;\n" + "vfmacc.vv v22, v4, v14;\n" + + "vrgather.vi v8, v6, 4;\n" + "vrgather.vi v10, v6, 5;\n" + "vrgather.vi v12, v6, 6;\n" + "vrgather.vi v14, v6, 7;\n" + + "vfmacc.vv v24, v4, v8;\n" + "vfmacc.vv v26, v4, v10;\n" + "vfmacc.vv v28, v4, v12;\n" + "vfmacc.vv v30, v4, v14;\n" + + "vle32.v v0, (%2); \n" + "vle32.v v2, (%6); \n" + "vle32.v v4, (%3); \n" + "vle32.v v6, (%7); \n" + + "vrgather.vi v8, v2, 0;\n" + "vrgather.vi v10, v2, 1;\n" + "vrgather.vi v12, v2, 2;\n" + "vrgather.vi v14, v2, 3;\n" + + "vfmacc.vv v16, v0, v8;\n" + "vfmacc.vv v18, v0, v10;\n" + "vfmacc.vv v20, v0, v12;\n" + "vfmacc.vv v22, v0, v14;\n" + + "vrgather.vi v8, v2, 4;\n" + "vrgather.vi v10, v2, 5;\n" + "vrgather.vi v12, v2, 6;\n" + "vrgather.vi v14, v2, 7;\n" + + "vfmacc.vv v24, v0, v8;\n" + "vfmacc.vv v26, v0, v10;\n" + "vfmacc.vv v28, v0, v12;\n" + "vfmacc.vv v30, v0, v14;\n" + + "vrgather.vi v8, v6, 0;\n" + "vrgather.vi v10, v6, 1;\n" + "vrgather.vi v12, v6, 2;\n" + "vrgather.vi v14, v6, 3;\n" + + "vfmacc.vv v16, v4, v8;\n" + "vfmacc.vv v18, v4, v10;\n" + "vfmacc.vv v20, v4, v12;\n" + 
"vfmacc.vv v22, v4, v14;\n" + + "vrgather.vi v8, v6, 4;\n" + "vrgather.vi v10, v6, 5;\n" + "vrgather.vi v12, v6, 6;\n" + "vrgather.vi v14, v6, 7;\n" + + "vfmacc.vv v24, v4, v8;\n" + "vfmacc.vv v26, v4, v10;\n" + "vfmacc.vv v28, v4, v12;\n" + "vfmacc.vv v30, v4, v14;\n" + : + : "r"(col0), "r"(col1), "r"(col2), "r"(col3), "r"(k0), "r"(k1), "r"(k2), "r"(k3)); + + col0 += 32; + col1 += 32; + col2 += 32; + col3 += 32; + + k0 += 32; + k1 += 32; + k2 += 32; + k3 += 32; + } + + for (; k < kernel_size; ++k) + { + asm("vle32.v v0, (%0);\n" + "vle32.v v2, (%1);\n" + + "vrgather.vi v8, v2, 0;\n" + "vrgather.vi v10, v2, 1;\n" + "vrgather.vi v12, v2, 2;\n" + "vrgather.vi v14, v2, 3;\n" + + "vfmacc.vv v16, v0, v8;\n" + "vfmacc.vv v18, v0, v10;\n" + "vfmacc.vv v20, v0, v12;\n" + "vfmacc.vv v22, v0, v14;\n" + + "vrgather.vi v8, v2, 4;\n" + "vrgather.vi v10, v2, 5;\n" + "vrgather.vi v12, v2, 6;\n" + "vrgather.vi v14, v2, 7;\n" + + "vfmacc.vv v24, v0, v8;\n" + "vfmacc.vv v26, v0, v10;\n" + "vfmacc.vv v28, v0, v12;\n" + "vfmacc.vv v30, v0, v14;\n" + : + : "r"(col0), "r"(k0)); + col0 += 8; + k0 += 8; + } + + if (act >= 0) + { + asm( + "vmv.v.x v0, x0;\n" + "vfmax.vv v16, v16, v0;\n" + "vfmax.vv v18, v18, v0;\n" + "vfmax.vv v20, v20, v0;\n" + "vfmax.vv v22, v22, v0;\n" + "vfmax.vv v24, v24, v0;\n" + "vfmax.vv v26, v26, v0;\n" + "vfmax.vv v28, v28, v0;\n" + "vfmax.vv v30, v30, v0;\n"); + + if (act > 0) + { + asm( + "vmv.v.x v2, %0;\n" + "vfmin.vv v16, v16, v2;\n" + "vfmin.vv v18, v18, v2;\n" + "vfmin.vv v20, v20, v2;\n" + "vfmin.vv v22, v22, v2;\n" + "vfmin.vv v24, v24, v2;\n" + "vfmin.vv v26, v26, v2;\n" + "vfmin.vv v28, v28, v2;\n" + "vfmin.vv v30, v30, v2;\n" + : + : "r"(act)); + } + } + + float* r0 = cur_output; + float* r1 = r0 + output_xy; + float* r2 = r1 + output_xy; + float* r3 = r2 + output_xy; + float* r4 = r3 + output_xy; + float* r5 = r4 + output_xy; + float* r6 = r5 + output_xy; + float* r7 = r6 + output_xy; + + switch (n) + { + case 8: + asm( + "vse32.v v16, (%0);\n" + 
"vse32.v v18, (%1);\n" + "vse32.v v20, (%2);\n" + "vse32.v v22, (%3);\n" + "vse32.v v24, (%4);\n" + "vse32.v v26, (%5);\n" + "vse32.v v28, (%6);\n" + "vse32.v v30, (%7);\n" + : + : "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7)); + break; + case 7: + asm( + "vse32.v v16, (%0);\n" + "vse32.v v18, (%1);\n" + "vse32.v v20, (%2);\n" + "vse32.v v22, (%3);\n" + "vse32.v v24, (%4);\n" + "vse32.v v26, (%5);\n" + "vse32.v v28, (%6);\n" + : + : "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r4), "r"(r5), "r"(r6)); + break; + + case 6: + asm( + "vse32.v v16, (%0);\n" + "vse32.v v18, (%1);\n" + "vse32.v v20, (%2);\n" + "vse32.v v22, (%3);\n" + "vse32.v v24, (%4);\n" + "vse32.v v26, (%5);\n" + : + : "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r4), "r"(r5)); + break; + + case 5: + asm( + "vse32.v v16, (%0);\n" + "vse32.v v18, (%1);\n" + "vse32.v v20, (%2);\n" + "vse32.v v22, (%3);\n" + "vse32.v v24, (%4);\n" + : + : "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r4)); + break; + + case 4: + asm( + "vse32.v v16, (%0);\n" + "vse32.v v18, (%1);\n" + "vse32.v v20, (%2);\n" + "vse32.v v22, (%3);\n" + : + : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); + break; + + case 3: + asm( + "vse32.v v16, (%0);\n" + "vse32.v v18, (%1);\n" + "vse32.v v20, (%2);\n" + : + : "r"(r0), "r"(r1), "r"(r2)); + break; + + case 2: + asm( + "vse32.v v16, (%0);\n" + "vse32.v v18, (%1);\n" + : + : "r"(r0), "r"(r1)); + break; + + case 1: + asm( + "vse32.v v16, (%0);\n" + : + : "r"(r0)); + break; + default: + break; + } +} diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/vsetvl_rvv.c b/source/device/cpu/op/conv/risc-v/lp64dv/vsetvl_rvv.c new file mode 100644 index 000000000..febf67f3e --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/vsetvl_rvv.c @@ -0,0 +1,38 @@ +#include "vsetvl_rvv.h" + +void vsetvl_e32_m1(void) +{ +#ifdef __FIX_RVV_C906 + __asm__("li t0, 8;\n" + "li t1, 4;\n" + "vsetvl t0, t1, t0;\n" + : + : + : "t0", "t1"); +#else + __asm__("li t0, 4; \n" + "vsetvli t1, t0, e32, m1;\n" + : + : + 
: "t0", "t1"); +#endif +} + +void vsetvl_e32_m2(void) +{ +#ifdef __FIX_RVV_C906 + __asm__("li t0, 9;\n" + "li t1, 8;\n" + "vsetvl t0, t1, t0;\n" + : + : + : "t0", "t1"); +#else + __asm__( + "li t1, 8;\n" + "vsetvli t0, t1, e32, m2;\n" + : + : + : "t0", "t1"); +#endif +} diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/vsetvl_rvv.h b/source/device/cpu/op/conv/risc-v/lp64dv/vsetvl_rvv.h new file mode 100644 index 000000000..1245479ff --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/vsetvl_rvv.h @@ -0,0 +1,7 @@ +#ifndef __VSETVL_RVV_H__ +#define __VSETVL_RVV_H__ + +extern void vsetvl_e32_m1(void); +extern void vsetvl_e32_m2(void); + +#endif diff --git a/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c b/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c index b94bcb363..6ab1b3f63 100644 --- a/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c +++ b/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c @@ -542,13 +542,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_conv_dw_hcl_x86_op() { diff --git a/source/device/cpu/op/conv/x86/conv_hcl_x86.c b/source/device/cpu/op/conv/x86/conv_hcl_x86.c index b1a3cf689..e4400df84 100644 --- a/source/device/cpu/op/conv/x86/conv_hcl_x86.c +++ b/source/device/cpu/op/conv/x86/conv_hcl_x86.c @@ -370,13 +370,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_PREFER; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = 
score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_conv_hcl_x86_op() { diff --git a/source/device/cpu/op/crop/crop_ref.c b/source/device/cpu/op/crop/crop_ref.c index f59650a39..a123ed839 100644 --- a/source/device/cpu/op/crop/crop_ref.c +++ b/source/device/cpu/op/crop/crop_ref.c @@ -284,13 +284,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_crop_ref_op() { diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c b/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c index 51dae78fe..3137ed19b 100644 --- a/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c +++ b/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c @@ -109,13 +109,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_deconv_dw_hcl_arm_op() { diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c b/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c index a81fa1e8c..df41df448 100644 --- 
a/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c +++ b/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c @@ -151,13 +151,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_PREFER; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_deconv_hcl_arm_op() { diff --git a/source/device/cpu/op/deconv/deconv_ref.c b/source/device/cpu/op/deconv/deconv_ref.c index 7bdfa4b76..59ca6c6d1 100644 --- a/source/device/cpu/op/deconv/deconv_ref.c +++ b/source/device/cpu/op/deconv/deconv_ref.c @@ -328,13 +328,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_deconv_ref_op() { diff --git a/source/device/cpu/op/depthtospace/depthtospace_ref.c b/source/device/cpu/op/depthtospace/depthtospace_ref.c index 94d0919ff..1eef8a71c 100644 --- a/source/device/cpu/op/depthtospace/depthtospace_ref.c +++ b/source/device/cpu/op/depthtospace/depthtospace_ref.c @@ -218,13 +218,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - 
.release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_depthtospace_ref_op() { diff --git a/source/device/cpu/op/detection_output/detection_output_ref.c b/source/device/cpu/op/detection_output/detection_output_ref.c index ed9409118..593d69b80 100644 --- a/source/device/cpu/op/detection_output/detection_output_ref.c +++ b/source/device/cpu/op/detection_output/detection_output_ref.c @@ -400,13 +400,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops detection_output_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops detection_output_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_detection_output_ref_op() { diff --git a/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c b/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c index 25b14171a..62c72f3b5 100644 --- a/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c +++ b/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c @@ -515,13 +515,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc { return OPS_SCORE_CANDO; } -static struct node_ops detection_postprocess_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops detection_postprocess_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + 
.release_node = release_node, + .score = score, +}; int register_detection_postprocess_ref_op() { diff --git a/source/device/cpu/op/dropout/dropout_ref.c b/source/device/cpu/op/dropout/dropout_ref.c index 144663971..99e8994c9 100644 --- a/source/device/cpu/op/dropout/dropout_ref.c +++ b/source/device/cpu/op/dropout/dropout_ref.c @@ -73,13 +73,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_dropout_ref_op() { diff --git a/source/device/cpu/op/eltwise/eltwise_ref.c b/source/device/cpu/op/eltwise/eltwise_ref.c index d42925360..29459b201 100644 --- a/source/device/cpu/op/eltwise/eltwise_ref.c +++ b/source/device/cpu/op/eltwise/eltwise_ref.c @@ -995,13 +995,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_eltwise_ref_op() { diff --git a/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c b/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c index 1f7a7aad5..b4e92c901 100644 --- a/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c +++ b/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c @@ -81,13 +81,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc 
return 0; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_elu_hcl_arm_op() { diff --git a/source/device/cpu/op/elu/elu_ref.c b/source/device/cpu/op/elu/elu_ref.c index 1d41d940d..51f5a63ea 100644 --- a/source/device/cpu/op/elu/elu_ref.c +++ b/source/device/cpu/op/elu/elu_ref.c @@ -159,13 +159,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_elu_ref_op() { diff --git a/source/device/cpu/op/embedding/embedding_ref.c b/source/device/cpu/op/embedding/embedding_ref.c index 5fe920a6a..b9e7a9da4 100644 --- a/source/device/cpu/op/embedding/embedding_ref.c +++ b/source/device/cpu/op/embedding/embedding_ref.c @@ -100,13 +100,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_embedding_ref_op() { diff --git a/source/device/cpu/op/expand/expand_ref.c 
b/source/device/cpu/op/expand/expand_ref.c index fc0bdcfe4..657316041 100644 --- a/source/device/cpu/op/expand/expand_ref.c +++ b/source/device/cpu/op/expand/expand_ref.c @@ -175,13 +175,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops expand_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops expand_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_expand_ref_op() { diff --git a/source/device/cpu/op/expanddims/expanddims_ref.c b/source/device/cpu/op/expanddims/expanddims_ref.c index 7cd37a4dd..59b387769 100644 --- a/source/device/cpu/op/expanddims/expanddims_ref.c +++ b/source/device/cpu/op/expanddims/expanddims_ref.c @@ -75,13 +75,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_expanddims_ref_op() { diff --git a/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c b/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c index d9322b864..eb37fb714 100644 --- a/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c +++ b/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c @@ -290,13 +290,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = postrun, 
- .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_fc_hcl_arm_op() { diff --git a/source/device/cpu/op/fc/cortex-m/fc_cmsis.c b/source/device/cpu/op/fc/cortex-m/fc_cmsis.c index e53be5c71..e37e3d2f2 100644 --- a/source/device/cpu/op/fc/cortex-m/fc_cmsis.c +++ b/source/device/cpu/op/fc/cortex-m/fc_cmsis.c @@ -133,13 +133,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops cmsis_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops cmsis_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_fc_cmsis_op() { diff --git a/source/device/cpu/op/fc/fc_ref.c b/source/device/cpu/op/fc/fc_ref.c index b0da933ea..ffb590835 100644 --- a/source/device/cpu/op/fc/fc_ref.c +++ b/source/device/cpu/op/fc/fc_ref.c @@ -475,13 +475,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_fc_ref_op() { diff --git a/source/device/cpu/op/fc/x86/fc_hcl_x86.c b/source/device/cpu/op/fc/x86/fc_hcl_x86.c index 86acbb992..d2ae6a73c 100644 --- a/source/device/cpu/op/fc/x86/fc_hcl_x86.c 
+++ b/source/device/cpu/op/fc/x86/fc_hcl_x86.c @@ -290,13 +290,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_fc_hcl_x86_op() { diff --git a/source/device/cpu/op/flatten/flatten_ref.c b/source/device/cpu/op/flatten/flatten_ref.c index 9b4476d28..337474184 100644 --- a/source/device/cpu/op/flatten/flatten_ref.c +++ b/source/device/cpu/op/flatten/flatten_ref.c @@ -93,13 +93,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops flatten_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops flatten_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_flatten_ref_op() { diff --git a/source/device/cpu/op/gather/gather_ref.c b/source/device/cpu/op/gather/gather_ref.c index 37ce59ddb..99b6d5169 100644 --- a/source/device/cpu/op/gather/gather_ref.c +++ b/source/device/cpu/op/gather/gather_ref.c @@ -282,13 +282,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops gather_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops gather_node_ops = { + .prerun = prerun, + .run = run, + .reshape 
= NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_gather_ref_op() { diff --git a/source/device/cpu/op/gelu/gelu_ref.c b/source/device/cpu/op/gelu/gelu_ref.c index 07cdec2df..da73913db 100644 --- a/source/device/cpu/op/gelu/gelu_ref.c +++ b/source/device/cpu/op/gelu/gelu_ref.c @@ -130,13 +130,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_gelu_ref_op() { diff --git a/source/device/cpu/op/gru/gru_ref.c b/source/device/cpu/op/gru/gru_ref.c index 056882f3c..76e3c04be 100644 --- a/source/device/cpu/op/gru/gru_ref.c +++ b/source/device/cpu/op/gru/gru_ref.c @@ -434,13 +434,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops gru_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops gru_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_gru_ref_op() { diff --git a/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c b/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c index adcb94298..9a84aba22 100644 --- a/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c +++ b/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c @@ -140,13 +140,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 
OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_hardsigmoid_ref_op() { diff --git a/source/device/cpu/op/hardswish/hardswish_ref.c b/source/device/cpu/op/hardswish/hardswish_ref.c index 3a1910c39..8621aea52 100644 --- a/source/device/cpu/op/hardswish/hardswish_ref.c +++ b/source/device/cpu/op/hardswish/hardswish_ref.c @@ -72,13 +72,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_hardswish_ref_op() { return register_builtin_node_ops(OP_HARDSWISH, &hcl_node_ops); diff --git a/source/device/cpu/op/input/input_ref.c b/source/device/cpu/op/input/input_ref.c index 4118be0da..fcf9273f5 100644 --- a/source/device/cpu/op/input/input_ref.c +++ b/source/device/cpu/op/input/input_ref.c @@ -70,13 +70,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, 
+}; int register_input_ref_op() { diff --git a/source/device/cpu/op/instancenorm/instancenorm_ref.c b/source/device/cpu/op/instancenorm/instancenorm_ref.c index 94d943afb..887acdac0 100644 --- a/source/device/cpu/op/instancenorm/instancenorm_ref.c +++ b/source/device/cpu/op/instancenorm/instancenorm_ref.c @@ -229,13 +229,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_instancenorm_ref_op() { diff --git a/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c b/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c index c7fc11e26..8c88fde8d 100644 --- a/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c +++ b/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c @@ -81,13 +81,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_interp_hcl_arm_op() { diff --git a/source/device/cpu/op/interp/interp_ref.c b/source/device/cpu/op/interp/interp_ref.c index fb3736057..ec0f46358 100644 --- a/source/device/cpu/op/interp/interp_ref.c +++ b/source/device/cpu/op/interp/interp_ref.c @@ -509,13 +509,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 
OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_interp_ref_op() { diff --git a/source/device/cpu/op/l2normalization/l2normalization_ref.c b/source/device/cpu/op/l2normalization/l2normalization_ref.c index b420e92dd..80790ec0b 100644 --- a/source/device/cpu/op/l2normalization/l2normalization_ref.c +++ b/source/device/cpu/op/l2normalization/l2normalization_ref.c @@ -141,13 +141,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_l2normalization_ref_op() { diff --git a/source/device/cpu/op/l2pool/l2pool_ref.c b/source/device/cpu/op/l2pool/l2pool_ref.c index 5cf027d70..d748f6786 100644 --- a/source/device/cpu/op/l2pool/l2pool_ref.c +++ b/source/device/cpu/op/l2pool/l2pool_ref.c @@ -202,13 +202,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = 
score, +}; int register_l2pool_ref_op() { diff --git a/source/device/cpu/op/layernorm/layernorm_ref.c b/source/device/cpu/op/layernorm/layernorm_ref.c index 1a90e705e..15a20d5e8 100644 --- a/source/device/cpu/op/layernorm/layernorm_ref.c +++ b/source/device/cpu/op/layernorm/layernorm_ref.c @@ -202,13 +202,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_layernorm_ref_op() { diff --git a/source/device/cpu/op/logical/logical_ref.c b/source/device/cpu/op/logical/logical_ref.c index aef2ad3f7..fe2778f05 100644 --- a/source/device/cpu/op/logical/logical_ref.c +++ b/source/device/cpu/op/logical/logical_ref.c @@ -214,13 +214,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_logical_ref_op() { diff --git a/source/device/cpu/op/logistic/logistic_ref.c b/source/device/cpu/op/logistic/logistic_ref.c index 807ff90d9..1a6a7ae54 100644 --- a/source/device/cpu/op/logistic/logistic_ref.c +++ b/source/device/cpu/op/logistic/logistic_ref.c @@ -108,13 +108,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops 
hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_logistic_ref_op() { diff --git a/source/device/cpu/op/logsoftmax/logsoftmax_ref.c b/source/device/cpu/op/logsoftmax/logsoftmax_ref.c index 2af74c63d..31b9ebf0e 100644 --- a/source/device/cpu/op/logsoftmax/logsoftmax_ref.c +++ b/source/device/cpu/op/logsoftmax/logsoftmax_ref.c @@ -177,13 +177,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_logsoftmax_ref_op() { diff --git a/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c b/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c index fc883f9f2..bcab4fc25 100644 --- a/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c +++ b/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c @@ -84,13 +84,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_lrn_hcl_arm_op() { diff --git 
a/source/device/cpu/op/lrn/lrn_ref.c b/source/device/cpu/op/lrn/lrn_ref.c index ff71d6903..878dd913c 100644 --- a/source/device/cpu/op/lrn/lrn_ref.c +++ b/source/device/cpu/op/lrn/lrn_ref.c @@ -141,13 +141,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_lrn_ref_op() { diff --git a/source/device/cpu/op/lstm/lstm_ref.c b/source/device/cpu/op/lstm/lstm_ref.c index 0367e9f56..7f7831e3f 100644 --- a/source/device/cpu/op/lstm/lstm_ref.c +++ b/source/device/cpu/op/lstm/lstm_ref.c @@ -777,13 +777,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops lstm_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops lstm_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_lstm_ref_op() { diff --git a/source/device/cpu/op/matmul/matmul_ref.c b/source/device/cpu/op/matmul/matmul_ref.c index e039f4bd1..0993521f1 100644 --- a/source/device/cpu/op/matmul/matmul_ref.c +++ b/source/device/cpu/op/matmul/matmul_ref.c @@ -161,13 +161,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops matmul_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - 
.score = score}; +static struct node_ops matmul_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_matmul_ref_op() { diff --git a/source/device/cpu/op/maximum/maximum_ref.c b/source/device/cpu/op/maximum/maximum_ref.c index ecb34f774..4e887d7be 100644 --- a/source/device/cpu/op/maximum/maximum_ref.c +++ b/source/device/cpu/op/maximum/maximum_ref.c @@ -123,13 +123,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops maximum_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops maximum_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_maximum_ref_op() { diff --git a/source/device/cpu/op/mean/mean_ref.c b/source/device/cpu/op/mean/mean_ref.c index 1ccd4697b..de259b0e9 100644 --- a/source/device/cpu/op/mean/mean_ref.c +++ b/source/device/cpu/op/mean/mean_ref.c @@ -121,13 +121,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops mean_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops mean_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_mean_ref_op() { diff --git a/source/device/cpu/op/minimum/minimum_ref.c b/source/device/cpu/op/minimum/minimum_ref.c index 19319eb2f..afe803aeb 100644 --- a/source/device/cpu/op/minimum/minimum_ref.c +++ 
b/source/device/cpu/op/minimum/minimum_ref.c @@ -122,13 +122,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops minimum_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops minimum_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_minimum_ref_op() { diff --git a/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c b/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c index 8e3581c24..6197e3235 100644 --- a/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c +++ b/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c @@ -83,13 +83,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_mish_hcl_arm_op() { diff --git a/source/device/cpu/op/mish/mish_ref.c b/source/device/cpu/op/mish/mish_ref.c index 91af5a417..b11e02035 100644 --- a/source/device/cpu/op/mish/mish_ref.c +++ b/source/device/cpu/op/mish/mish_ref.c @@ -82,13 +82,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + 
.reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_mish_ref_op() { diff --git a/source/device/cpu/op/mvn/mvn_ref.c b/source/device/cpu/op/mvn/mvn_ref.c index 306082d61..5af43ed65 100644 --- a/source/device/cpu/op/mvn/mvn_ref.c +++ b/source/device/cpu/op/mvn/mvn_ref.c @@ -243,13 +243,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_mvn_ref_op() { diff --git a/source/device/cpu/op/noop/noop_ref.c b/source/device/cpu/op/noop/noop_ref.c index 67722f5bb..c39e29a73 100644 --- a/source/device/cpu/op/noop/noop_ref.c +++ b/source/device/cpu/op/noop/noop_ref.c @@ -108,13 +108,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_noop_ref_op() { diff --git a/source/device/cpu/op/normalize/normalize_ref.c b/source/device/cpu/op/normalize/normalize_ref.c index 92990f780..96ca6f709 100644 --- a/source/device/cpu/op/normalize/normalize_ref.c +++ b/source/device/cpu/op/normalize/normalize_ref.c @@ -116,13 +116,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 
OPS_SCORE_BEST; } -static struct node_ops normalize_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops normalize_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_normalize_ref_op() { diff --git a/source/device/cpu/op/pad/pad_ref.c b/source/device/cpu/op/pad/pad_ref.c index 85365bc80..76fa79603 100644 --- a/source/device/cpu/op/pad/pad_ref.c +++ b/source/device/cpu/op/pad/pad_ref.c @@ -672,13 +672,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops pad_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops pad_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_pad_ref_op() { diff --git a/source/device/cpu/op/permute/permute_ref.c b/source/device/cpu/op/permute/permute_ref.c index 6e705ab31..2c0bd6e32 100644 --- a/source/device/cpu/op/permute/permute_ref.c +++ b/source/device/cpu/op/permute/permute_ref.c @@ -420,13 +420,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops permute_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops permute_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_permute_ref_op() { diff --git 
a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c index 4b6d3fe7a..59c944b75 100644 --- a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c +++ b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c @@ -159,13 +159,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_pooling_hcl_arm_op() { diff --git a/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c b/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c index e30c84c7e..1a176eb11 100644 --- a/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c +++ b/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c @@ -66,13 +66,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops cmsis_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = NULL, - .release_node = NULL, - .score = score}; +static struct node_ops cmsis_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = NULL, + .release_node = NULL, + .score = score, +}; int register_pooling_cmsis_op() { diff --git a/source/device/cpu/op/pooling/pooling_ref.c b/source/device/cpu/op/pooling/pooling_ref.c index df8ecb6a2..e06dc946d 100644 --- a/source/device/cpu/op/pooling/pooling_ref.c +++ b/source/device/cpu/op/pooling/pooling_ref.c @@ -159,13 +159,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops 
hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_pooling_ref_op() { diff --git a/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c b/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c index 9012a5686..48c76f590 100644 --- a/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c +++ b/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c @@ -90,13 +90,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = NULL, - .release_node = NULL, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = NULL, + .release_node = NULL, + .score = score, +}; int register_prelu_hcl_arm_op() { diff --git a/source/device/cpu/op/prelu/prelu_ref.c b/source/device/cpu/op/prelu/prelu_ref.c index da069d8bb..6e8822c2d 100644 --- a/source/device/cpu/op/prelu/prelu_ref.c +++ b/source/device/cpu/op/prelu/prelu_ref.c @@ -443,13 +443,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_prelu_ref_op() { diff --git 
a/source/device/cpu/op/priorbox/priorbox_ref.c b/source/device/cpu/op/priorbox/priorbox_ref.c index 39df5ec09..c3aa6aaa7 100644 --- a/source/device/cpu/op/priorbox/priorbox_ref.c +++ b/source/device/cpu/op/priorbox/priorbox_ref.c @@ -217,13 +217,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops priorbox_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops priorbox_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_priorbox_ref_op() { diff --git a/source/device/cpu/op/psroipooling/psroipooling_ref.c b/source/device/cpu/op/psroipooling/psroipooling_ref.c index 9039a3f8d..27152f52a 100644 --- a/source/device/cpu/op/psroipooling/psroipooling_ref.c +++ b/source/device/cpu/op/psroipooling/psroipooling_ref.c @@ -144,13 +144,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_psroipooling_ref_op() { diff --git a/source/device/cpu/op/reciprocal/reciprocal_ref.c b/source/device/cpu/op/reciprocal/reciprocal_ref.c index c770bb657..9d7ba443d 100644 --- a/source/device/cpu/op/reciprocal/reciprocal_ref.c +++ b/source/device/cpu/op/reciprocal/reciprocal_ref.c @@ -104,7 +104,8 @@ static struct node_ops hcl_node_ops = { .postrun = NULL, .init_node = init_node, .release_node = release_node, - .score = score}; + 
.score = score, +}; int register_reciprocal_ref_op() { diff --git a/source/device/cpu/op/reducel2/reducel2_ref.c b/source/device/cpu/op/reducel2/reducel2_ref.c index e92f98caf..9fff807d4 100644 --- a/source/device/cpu/op/reducel2/reducel2_ref.c +++ b/source/device/cpu/op/reducel2/reducel2_ref.c @@ -118,13 +118,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops reducel2_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops reducel2_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_reducel2_ref_op() { diff --git a/source/device/cpu/op/reduction/reduction_ref.c b/source/device/cpu/op/reduction/reduction_ref.c index fd92f23d9..57f7c632d 100644 --- a/source/device/cpu/op/reduction/reduction_ref.c +++ b/source/device/cpu/op/reduction/reduction_ref.c @@ -120,13 +120,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_reduction_ref_op() { diff --git a/source/device/cpu/op/region/region_ref.c b/source/device/cpu/op/region/region_ref.c index 3bb0b37a1..884eaf168 100644 --- a/source/device/cpu/op/region/region_ref.c +++ b/source/device/cpu/op/region/region_ref.c @@ -168,13 +168,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static 
struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_region_ref_op() { diff --git a/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c b/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c index 0f885ba8b..8980d051d 100644 --- a/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c +++ b/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c @@ -82,13 +82,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_relu_hcl_arm_op() { diff --git a/source/device/cpu/op/relu/cortex-m/relu_cmsis.c b/source/device/cpu/op/relu/cortex-m/relu_cmsis.c index 72d506512..1bf5b0e27 100644 --- a/source/device/cpu/op/relu/cortex-m/relu_cmsis.c +++ b/source/device/cpu/op/relu/cortex-m/relu_cmsis.c @@ -93,13 +93,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops cmsis_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops cmsis_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_relu_cmsis_op() { diff 
--git a/source/device/cpu/op/relu/relu_ref.c b/source/device/cpu/op/relu/relu_ref.c index 2b0372686..3ef1dc364 100644 --- a/source/device/cpu/op/relu/relu_ref.c +++ b/source/device/cpu/op/relu/relu_ref.c @@ -92,13 +92,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_relu_ref_op() { diff --git a/source/device/cpu/op/relu1/relu1_ref.c b/source/device/cpu/op/relu1/relu1_ref.c index 337bc5812..17e59f1d4 100644 --- a/source/device/cpu/op/relu1/relu1_ref.c +++ b/source/device/cpu/op/relu1/relu1_ref.c @@ -103,13 +103,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_relu1_ref_op() { diff --git a/source/device/cpu/op/relu6/relu6_ref.c b/source/device/cpu/op/relu6/relu6_ref.c index 98bfa2006..697634057 100644 --- a/source/device/cpu/op/relu6/relu6_ref.c +++ b/source/device/cpu/op/relu6/relu6_ref.c @@ -167,13 +167,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node 
= release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_relu6_ref_op() { diff --git a/source/device/cpu/op/reorg/reorg_ref.c b/source/device/cpu/op/reorg/reorg_ref.c index 3cff628a0..7d97fea57 100644 --- a/source/device/cpu/op/reorg/reorg_ref.c +++ b/source/device/cpu/op/reorg/reorg_ref.c @@ -111,13 +111,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_reorg_ref_op() { diff --git a/source/device/cpu/op/reshape/reshape_ref.c b/source/device/cpu/op/reshape/reshape_ref.c index 09ddd5f5b..0c071eb54 100644 --- a/source/device/cpu/op/reshape/reshape_ref.c +++ b/source/device/cpu/op/reshape/reshape_ref.c @@ -331,13 +331,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops reshape_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops reshape_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_reshape_ref_op() { diff --git a/source/device/cpu/op/resize/resize_ref.c b/source/device/cpu/op/resize/resize_ref.c index 3dda3b135..fc3425768 100644 --- a/source/device/cpu/op/resize/resize_ref.c +++ 
b/source/device/cpu/op/resize/resize_ref.c @@ -490,13 +490,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_resize_ref_op() { diff --git a/source/device/cpu/op/reverse/reverse_ref.c b/source/device/cpu/op/reverse/reverse_ref.c index 7ed7d36f5..7e5bcdff2 100644 --- a/source/device/cpu/op/reverse/reverse_ref.c +++ b/source/device/cpu/op/reverse/reverse_ref.c @@ -271,13 +271,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_reverse_ref_op() { diff --git a/source/device/cpu/op/rnn/rnn_ref.c b/source/device/cpu/op/rnn/rnn_ref.c index ee60e4247..fc2a3ebe6 100644 --- a/source/device/cpu/op/rnn/rnn_ref.c +++ b/source/device/cpu/op/rnn/rnn_ref.c @@ -268,13 +268,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node 
= init_node, + .release_node = release_node, + .score = score, +}; int register_rnn_ref_op() { diff --git a/source/device/cpu/op/roialign/roialign_ref.c b/source/device/cpu/op/roialign/roialign_ref.c index 61de55300..04531a160 100644 --- a/source/device/cpu/op/roialign/roialign_ref.c +++ b/source/device/cpu/op/roialign/roialign_ref.c @@ -189,13 +189,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_roialign_ref_op() { diff --git a/source/device/cpu/op/roipooling/roipooling_ref.c b/source/device/cpu/op/roipooling/roipooling_ref.c index cf554bbec..9a5b37c8e 100644 --- a/source/device/cpu/op/roipooling/roipooling_ref.c +++ b/source/device/cpu/op/roipooling/roipooling_ref.c @@ -174,13 +174,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_roipooling_ref_op() { diff --git a/source/device/cpu/op/round/round_ref.c b/source/device/cpu/op/round/round_ref.c index ca76ee7d6..75869afd5 100644 --- a/source/device/cpu/op/round/round_ref.c +++ b/source/device/cpu/op/round/round_ref.c @@ -130,13 +130,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc 
return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_round_ref_op() { diff --git a/source/device/cpu/op/rpn/rpn_ref.c b/source/device/cpu/op/rpn/rpn_ref.c index 6d9ba42b3..8923575bb 100644 --- a/source/device/cpu/op/rpn/rpn_ref.c +++ b/source/device/cpu/op/rpn/rpn_ref.c @@ -357,13 +357,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops rpn_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops rpn_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_rpn_ref_op() { diff --git a/source/device/cpu/op/scale/scale_ref.c b/source/device/cpu/op/scale/scale_ref.c index 426fcd2c8..13a717749 100644 --- a/source/device/cpu/op/scale/scale_ref.c +++ b/source/device/cpu/op/scale/scale_ref.c @@ -121,13 +121,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_scale_ref_op() { diff --git a/source/device/cpu/op/scatter/scatter_ref.c 
b/source/device/cpu/op/scatter/scatter_ref.c index 5aae5d8d0..299845260 100644 --- a/source/device/cpu/op/scatter/scatter_ref.c +++ b/source/device/cpu/op/scatter/scatter_ref.c @@ -406,13 +406,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_scatter_ref_op() { diff --git a/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c b/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c index 026625d71..bc1249023 100644 --- a/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c +++ b/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c @@ -81,13 +81,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_selu_hcl_arm_op() { diff --git a/source/device/cpu/op/selu/selu_ref.c b/source/device/cpu/op/selu/selu_ref.c index 557f8105d..afbecfb63 100644 --- a/source/device/cpu/op/selu/selu_ref.c +++ b/source/device/cpu/op/selu/selu_ref.c @@ -177,13 +177,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - 
.release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_selu_ref_op() { diff --git a/source/device/cpu/op/shape/shape_ref.c b/source/device/cpu/op/shape/shape_ref.c index ec27a9c41..d45d23b0a 100644 --- a/source/device/cpu/op/shape/shape_ref.c +++ b/source/device/cpu/op/shape/shape_ref.c @@ -80,13 +80,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_shape_ref_op() { diff --git a/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c b/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c index 545bf2fc0..794180f79 100644 --- a/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c +++ b/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c @@ -175,13 +175,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_shuffle_channel_ref_op() { diff --git a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c 
index 1b7b3fbaf..41870ffc5 100644 --- a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c +++ b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c @@ -71,13 +71,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_sigmoid_hcl_arm_op() { diff --git a/source/device/cpu/op/sigmoid/sigmoid_ref.c b/source/device/cpu/op/sigmoid/sigmoid_ref.c index 8e4ca0899..a72864ef7 100644 --- a/source/device/cpu/op/sigmoid/sigmoid_ref.c +++ b/source/device/cpu/op/sigmoid/sigmoid_ref.c @@ -226,13 +226,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops sigmoid_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape_node, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops sigmoid_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape_node, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_sigmoid_ref_op() { diff --git a/source/device/cpu/op/slice/slice_ref.c b/source/device/cpu/op/slice/slice_ref.c index 037c413b7..3c5714eaf 100644 --- a/source/device/cpu/op/slice/slice_ref.c +++ b/source/device/cpu/op/slice/slice_ref.c @@ -520,13 +520,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops slice_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - 
.score = score}; +static struct node_ops slice_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_slice_ref_op() { diff --git a/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c b/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c index 9ffe8e5c2..84cbe490b 100644 --- a/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c +++ b/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c @@ -257,13 +257,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_softmax_hcl_arm_op() { diff --git a/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c b/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c index 93678c225..0901b1c7a 100644 --- a/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c +++ b/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c @@ -82,13 +82,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops cmsis_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = NULL, - .release_node = NULL, - .score = score}; +static struct node_ops cmsis_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = NULL, + .release_node = NULL, + .score = score, +}; int register_softmax_cmsis_op() { diff --git a/source/device/cpu/op/softmax/softmax_ref.c b/source/device/cpu/op/softmax/softmax_ref.c index cb1a3b49d..e4a321979 
100644 --- a/source/device/cpu/op/softmax/softmax_ref.c +++ b/source/device/cpu/op/softmax/softmax_ref.c @@ -110,13 +110,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_softmax_ref_op() { diff --git a/source/device/cpu/op/softplus/softplus_ref.c b/source/device/cpu/op/softplus/softplus_ref.c index 6931ab047..b8c178b5a 100644 --- a/source/device/cpu/op/softplus/softplus_ref.c +++ b/source/device/cpu/op/softplus/softplus_ref.c @@ -118,7 +118,8 @@ static struct node_ops hcl_node_ops = { .postrun = NULL, .init_node = init_node, .release_node = release_node, - .score = score}; + .score = score, +}; int register_softplus_ref_op() { diff --git a/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c b/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c index 6a0aa26a4..2358f2cbf 100644 --- a/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c +++ b/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c @@ -249,13 +249,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_spacetobatchnd_ref_op() { diff --git a/source/device/cpu/op/spacetodepth/spacetodepth_ref.c 
b/source/device/cpu/op/spacetodepth/spacetodepth_ref.c index aa8217929..ce8e023ea 100644 --- a/source/device/cpu/op/spacetodepth/spacetodepth_ref.c +++ b/source/device/cpu/op/spacetodepth/spacetodepth_ref.c @@ -102,13 +102,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_spacetodepth_ref_op() { diff --git a/source/device/cpu/op/sparsetodense/sparsetodense_ref.c b/source/device/cpu/op/sparsetodense/sparsetodense_ref.c index 6179ad14c..75db4c907 100644 --- a/source/device/cpu/op/sparsetodense/sparsetodense_ref.c +++ b/source/device/cpu/op/sparsetodense/sparsetodense_ref.c @@ -180,13 +180,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_sparsetodense_ref_op() { diff --git a/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c b/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c index 2a6bc1435..ae0942b65 100644 --- a/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c +++ b/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c @@ -332,13 +332,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, 
struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_spatialtransformer_ref_op() { diff --git a/source/device/cpu/op/split/split_ref.c b/source/device/cpu/op/split/split_ref.c index bb0c23595..0d11730bf 100644 --- a/source/device/cpu/op/split/split_ref.c +++ b/source/device/cpu/op/split/split_ref.c @@ -197,13 +197,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_split_ref_op() { diff --git a/source/device/cpu/op/squareddifference/squareddifference_ref.c b/source/device/cpu/op/squareddifference/squareddifference_ref.c index 66a600291..2014293f9 100644 --- a/source/device/cpu/op/squareddifference/squareddifference_ref.c +++ b/source/device/cpu/op/squareddifference/squareddifference_ref.c @@ -211,13 +211,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = 
release_node, + .score = score, +}; int register_squareddifference_ref_op() { diff --git a/source/device/cpu/op/squeeze/squeeze_ref.c b/source/device/cpu/op/squeeze/squeeze_ref.c index 1928d299e..99a8495b0 100644 --- a/source/device/cpu/op/squeeze/squeeze_ref.c +++ b/source/device/cpu/op/squeeze/squeeze_ref.c @@ -93,13 +93,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops squeeze_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops squeeze_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_squeeze_ref_op() { diff --git a/source/device/cpu/op/strided_slice/strided_slice_ref.c b/source/device/cpu/op/strided_slice/strided_slice_ref.c index bb3cb9111..9647d3d09 100644 --- a/source/device/cpu/op/strided_slice/strided_slice_ref.c +++ b/source/device/cpu/op/strided_slice/strided_slice_ref.c @@ -153,13 +153,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops strided_slice_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops strided_slice_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_strided_slice_ref_op() { diff --git a/source/device/cpu/op/swap_axis/swap_axis_ref.c b/source/device/cpu/op/swap_axis/swap_axis_ref.c index 6aeef17bb..11fddd4d4 100644 --- a/source/device/cpu/op/swap_axis/swap_axis_ref.c +++ b/source/device/cpu/op/swap_axis/swap_axis_ref.c @@ -136,13 +136,15 @@ static int score(struct 
node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops swap_axis_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops swap_axis_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_swap_axis_ref_op() { diff --git a/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c b/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c index de5975df5..825208dca 100644 --- a/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c +++ b/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c @@ -83,13 +83,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_tanh_hcl_arm_op() { diff --git a/source/device/cpu/op/tanh/tanh_ref.c b/source/device/cpu/op/tanh/tanh_ref.c index 390f64332..98a048ab6 100644 --- a/source/device/cpu/op/tanh/tanh_ref.c +++ b/source/device/cpu/op/tanh/tanh_ref.c @@ -121,13 +121,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = 
score, +}; int register_tanh_ref_op() { diff --git a/source/device/cpu/op/threshold/threshold_ref.c b/source/device/cpu/op/threshold/threshold_ref.c index 4672086a5..bddbcdfc2 100644 --- a/source/device/cpu/op/threshold/threshold_ref.c +++ b/source/device/cpu/op/threshold/threshold_ref.c @@ -130,13 +130,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_threshold_ref_op() { diff --git a/source/device/cpu/op/tile/tile_ref.c b/source/device/cpu/op/tile/tile_ref.c index 0f51a5310..697136547 100644 --- a/source/device/cpu/op/tile/tile_ref.c +++ b/source/device/cpu/op/tile/tile_ref.c @@ -180,7 +180,8 @@ static struct node_ops hcl_node_ops = { .postrun = NULL, .init_node = init_node, .release_node = release_node, - .score = score}; + .score = score, +}; int register_tile_ref_op() { diff --git a/source/device/cpu/op/topkv2/topkv2_ref.c b/source/device/cpu/op/topkv2/topkv2_ref.c index b84cc2433..8f8722811 100644 --- a/source/device/cpu/op/topkv2/topkv2_ref.c +++ b/source/device/cpu/op/topkv2/topkv2_ref.c @@ -231,13 +231,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_topkv2_ref_op() 
{ diff --git a/source/device/cpu/op/transpose/transpose_ref.c b/source/device/cpu/op/transpose/transpose_ref.c index 31187f4f3..b216e2b46 100644 --- a/source/device/cpu/op/transpose/transpose_ref.c +++ b/source/device/cpu/op/transpose/transpose_ref.c @@ -477,13 +477,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_transpose_ref_op() { diff --git a/source/device/cpu/op/unary/unary_ref.c b/source/device/cpu/op/unary/unary_ref.c index 0f9610a2e..e3c430242 100644 --- a/source/device/cpu/op/unary/unary_ref.c +++ b/source/device/cpu/op/unary/unary_ref.c @@ -71,13 +71,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_unary_ref_op() { diff --git a/source/device/cpu/op/unsqueeze/unsqueeze_ref.c b/source/device/cpu/op/unsqueeze/unsqueeze_ref.c index 70847a7d9..066d2d1dc 100644 --- a/source/device/cpu/op/unsqueeze/unsqueeze_ref.c +++ b/source/device/cpu/op/unsqueeze/unsqueeze_ref.c @@ -93,13 +93,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops unsqueeze_node_ops = {.prerun = NULL, - .run = run, 
- .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops unsqueeze_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_unsqueeze_ref_op() { diff --git a/source/device/cpu/op/upsample/upsample_ref.c b/source/device/cpu/op/upsample/upsample_ref.c index 729b7f263..f3c0de300 100644 --- a/source/device/cpu/op/upsample/upsample_ref.c +++ b/source/device/cpu/op/upsample/upsample_ref.c @@ -172,13 +172,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_upsample_ref_op() { diff --git a/source/device/cpu/op/where/where_ref.c b/source/device/cpu/op/where/where_ref.c index 52a2fd778..f2fd9b931 100644 --- a/source/device/cpu/op/where/where_ref.c +++ b/source/device/cpu/op/where/where_ref.c @@ -99,13 +99,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_where_ref_op() { diff --git a/source/device/cpu/op/zeroslike/zeroslike_ref.c b/source/device/cpu/op/zeroslike/zeroslike_ref.c index 
47b83d417..f770ad6e5 100644 --- a/source/device/cpu/op/zeroslike/zeroslike_ref.c +++ b/source/device/cpu/op/zeroslike/zeroslike_ref.c @@ -167,13 +167,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_zeroslike_ref_op() { diff --git a/source/device/opencl/include/CL/cl_ext.h b/source/device/opencl/include/CL/cl_ext.h index ed0db6dfa..c58990ec4 100644 --- a/source/device/opencl/include/CL/cl_ext.h +++ b/source/device/opencl/include/CL/cl_ext.h @@ -72,7 +72,7 @@ extern "C" { */ #define cl_APPLE_SetMemObjectDestructor 1 cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE(cl_mem /* memobj */, - void (*/*pfn_notify*/)(cl_mem /* memobj */, void* /*user_data*/), + void (* /*pfn_notify*/)(cl_mem /* memobj */, void* /*user_data*/), void* /*user_data */) CL_EXT_SUFFIX__VERSION_1_0; /* Context Logging Functions diff --git a/source/device/vulkan/layer/concat_vulkan.cpp b/source/device/vulkan/layer/concat_vulkan.cpp index 99357ba52..8e69bb2bc 100644 --- a/source/device/vulkan/layer/concat_vulkan.cpp +++ b/source/device/vulkan/layer/concat_vulkan.cpp @@ -39,33 +39,14 @@ #include "concat_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Concat_vulkan::Concat_vulkan() +Concat_vulkan::Concat_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = false; - - pipeline_concat[0] = 0; - pipeline_concat[1] = 0; - pipeline_concat_pack4[0] = 0; - pipeline_concat_pack4[1] = 0; - pipeline_concat_pack4to1[0] = 0; - 
pipeline_concat_pack4to1[1] = 0; - pipeline_concat_pack8[0] = 0; - pipeline_concat_pack8[1] = 0; - pipeline_concat_pack8to4[0] = 0; - pipeline_concat_pack8to4[1] = 0; - pipeline_concat_pack8to1[0] = 0; - pipeline_concat_pack8to1[1] = 0; -} - -Concat_vulkan::Concat_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = false; - + one_blob_only = false; pipeline_concat[0] = 0; pipeline_concat[1] = 0; pipeline_concat_pack4[0] = 0; @@ -91,7 +72,7 @@ Concat_vulkan::Concat_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) for (int i = 0; i < ir_node->output_num; i++) { - struct tensor* output = get_ir_graph_tensor(graph, node->input_tensors[i]); + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[i]); std::string name = output->name; tops.push_back(name); } @@ -172,9 +153,7 @@ int Concat_vulkan::create_pipeline(const Option& _opt) if (out_shape.dims == 2) out_shape_unpacked = Tensor(out_shape.w, out_shape.h / elempack, (void*)0, elemsize, elempack); if (out_shape.dims == 3) out_shape_unpacked = Tensor(out_shape.w, out_shape.h, out_shape.c / elempack, (void*)0, elemsize, elempack); - // if (!vkdev->shape_support_image_storage(out_shape_unpacked)) { - support_image_storage = false; opt.use_image_storage = false; } @@ -794,4 +773,4 @@ int Concat_vulkan::record_pipeline(const std::vector& bottom_blobs, st return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/concat_vulkan.hpp b/source/device/vulkan/layer/concat_vulkan.hpp index b03d8efe6..7711c16f0 100644 --- a/source/device/vulkan/layer/concat_vulkan.hpp +++ b/source/device/vulkan/layer/concat_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class Concat_vulkan : public Layer { public: - Concat_vulkan(); - Concat_vulkan(ir_graph_t* graph, ir_node_t* ir_node); + Concat_vulkan(ir_graph_t* graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& 
opt); virtual int destroy_pipeline(const Option& opt); @@ -78,4 +77,4 @@ class Concat_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/convolution_vulkan.cpp b/source/device/vulkan/layer/convolution_vulkan.cpp index d1c7335b6..4a742b29d 100644 --- a/source/device/vulkan/layer/convolution_vulkan.cpp +++ b/source/device/vulkan/layer/convolution_vulkan.cpp @@ -39,18 +39,14 @@ #include "convolution_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Convolution_vulkan::Convolution_vulkan() +Convolution_vulkan::Convolution_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - pipeline_convolution = 0; -} - -Convolution_vulkan::Convolution_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; + one_blob_only = true; padding = 0; innerproduct = 0; @@ -206,18 +202,12 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) // bool is_conv1x1s1d1 = false; bool is_conv3x3s1d1 = false; - // if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16 && ((elempack == 4 && out_elempack == 4) || (elempack == 8 && out_elempack == 8))) - { - // TODO do nothing for wino fix me!!!!! 
- } - // else { - support_image_storage = false; opt.use_image_storage = false; } { - padding = new Padding_vulkan(); + padding = new Padding_vulkan(vkdev); padding->vkdev = vkdev; padding->top = pad_h0; @@ -443,12 +433,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) // ir_tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); // cmd.record_upload(weight_tensor, weight_data_gpu, opt); - if (support_image_storage && opt.use_image_storage) - { - TLOG_INFO("not record_upload weight_data_gpu_image, fix me\n"); - // cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); - } - else { cmd.record_upload(weight_data_packed, weight_data_gpu, opt); } @@ -464,11 +448,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) Tensor bias_data_packed; convert_packing(bias_data, bias_data_packed, out_elempack); - if (support_image_storage && opt.use_image_storage) - { - // cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); - } - else { cmd.record_upload(bias_data_packed, bias_data_gpu, opt); } @@ -615,4 +594,4 @@ int Convolution_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& t return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/convolution_vulkan.hpp b/source/device/vulkan/layer/convolution_vulkan.hpp index c0799f877..ff01f1bf2 100644 --- a/source/device/vulkan/layer/convolution_vulkan.hpp +++ b/source/device/vulkan/layer/convolution_vulkan.hpp @@ -52,9 +52,7 @@ namespace TEngine { class Convolution_vulkan : public Layer { public: - Convolution_vulkan(); - // Convolution_vulkan(ir_node* node); - Convolution_vulkan(ir_graph_t* graph, ir_node_t* node); + Convolution_vulkan(ir_graph_t* graph, ir_node_t* node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp 
b/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp index 51f83b773..88e3ebf9a 100644 --- a/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp +++ b/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp @@ -39,21 +39,15 @@ #include "convolutiondepthwise_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan() +ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - pipeline_convolutiondepthwise = 0; -} - -ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - + one_blob_only = true; padding = 0; - pipeline_convolutiondepthwise = 0; pipeline_convolutiondepthwise_pack4 = 0; pipeline_convolutiondepthwise_pack8 = 0; @@ -94,8 +88,7 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) Option opt = _opt; { - padding = new Padding_vulkan(); - padding->vkdev = vkdev; + padding = new Padding_vulkan(vkdev); padding->top = pad_h0; padding->bottom = pad_h1; @@ -299,4 +292,4 @@ int ConvolutionDepthWise_vulkan::record_pipeline(const VkTensor& bottom_blob, Vk return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp b/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp index 7b867529b..03a2c0688 100644 --- a/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp +++ b/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp @@ -51,8 +51,7 @@ namespace TEngine { class ConvolutionDepthWise_vulkan : public Layer { public: - ConvolutionDepthWise_vulkan(); - ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* node); + ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* node, const GPUDevice* vkdev); virtual int create_pipeline(const 
Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/source/device/vulkan/layer/crop_vulkan.cpp b/source/device/vulkan/layer/crop_vulkan.cpp index d00325e34..700930e04 100644 --- a/source/device/vulkan/layer/crop_vulkan.cpp +++ b/source/device/vulkan/layer/crop_vulkan.cpp @@ -39,30 +39,14 @@ #include "crop_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Crop_vulkan::Crop_vulkan() +Crop_vulkan::Crop_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = false; - - pipeline_crop = 0; - pipeline_crop_pack4 = 0; - pipeline_crop_pack1to4 = 0; - pipeline_crop_pack4to1 = 0; - pipeline_crop_pack8 = 0; - pipeline_crop_pack1to8 = 0; - pipeline_crop_pack4to8 = 0; - pipeline_crop_pack8to4 = 0; - pipeline_crop_pack8to1 = 0; -} - -Crop_vulkan::Crop_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = false; - + one_blob_only = true; pipeline_crop = 0; pipeline_crop_pack4 = 0; pipeline_crop_pack1to4 = 0; @@ -616,4 +600,4 @@ int Crop_vulkan::record_pipeline(const std::vector& bottom_blobs, std: return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/crop_vulkan.hpp b/source/device/vulkan/layer/crop_vulkan.hpp index 2316f07c0..8dab47750 100644 --- a/source/device/vulkan/layer/crop_vulkan.hpp +++ b/source/device/vulkan/layer/crop_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class Crop_vulkan : public Layer { public: - Crop_vulkan(); - Crop_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Crop_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -92,4 +91,4 @@ class Crop_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git 
a/source/device/vulkan/layer/dropout_vulkan.cpp b/source/device/vulkan/layer/dropout_vulkan.cpp index bf46fa34c..3e1f12739 100644 --- a/source/device/vulkan/layer/dropout_vulkan.cpp +++ b/source/device/vulkan/layer/dropout_vulkan.cpp @@ -39,24 +39,15 @@ #include "dropout_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Dropout_vulkan::Dropout_vulkan() +Dropout_vulkan::Dropout_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = false; - - pipeline_dropout = 0; - pipeline_dropout_pack4 = 0; - pipeline_dropout_pack8 = 0; -} - -Dropout_vulkan::Dropout_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = false; - + one_blob_only = true; + support_inplace = true; pipeline_dropout = 0; pipeline_dropout_pack4 = 0; pipeline_dropout_pack8 = 0; @@ -214,4 +205,4 @@ int Dropout_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, c return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/dropout_vulkan.hpp b/source/device/vulkan/layer/dropout_vulkan.hpp index 478345ca7..6cb66fb4e 100644 --- a/source/device/vulkan/layer/dropout_vulkan.hpp +++ b/source/device/vulkan/layer/dropout_vulkan.hpp @@ -48,8 +48,7 @@ namespace TEngine { class Dropout_vulkan : public Layer { public: - Dropout_vulkan(); - Dropout_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Dropout_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -74,4 +73,4 @@ class Dropout_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/eltwise_vulkan.cpp b/source/device/vulkan/layer/eltwise_vulkan.cpp index a8d112bf4..4cb8f2f77 100644 --- 
a/source/device/vulkan/layer/eltwise_vulkan.cpp +++ b/source/device/vulkan/layer/eltwise_vulkan.cpp @@ -39,27 +39,14 @@ #include "eltwise_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Eltwise_vulkan::Eltwise_vulkan() +Eltwise_vulkan::Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = false; - - pipeline_eltwise[0] = 0; - pipeline_eltwise[1] = 0; - pipeline_eltwise_pack4[0] = 0; - pipeline_eltwise_pack4[1] = 0; - pipeline_eltwise_pack8[0] = 0; - pipeline_eltwise_pack8[1] = 0; -} - -Eltwise_vulkan::Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = true; - + one_blob_only = false; pipeline_eltwise[0] = 0; pipeline_eltwise[1] = 0; pipeline_eltwise_pack4[0] = 0; @@ -77,12 +64,13 @@ Eltwise_vulkan::Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) bottoms.push_back(name); } - for (int i = 0; i < ir_node->output_num; i++) - { - struct tensor* output = get_ir_graph_tensor(graph, node->input_tensors[i]); - std::string name = output->name; - tops.push_back(name); - } + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); + std::string name = output->name; + tops.push_back(name); + + output_c = output->dims[1]; + output_h = output->dims[2]; + output_w = output->dims[3]; struct eltwise_param* param = (struct eltwise_param*)ir_node->op.param_mem; op_type = (param->type) / 2; @@ -266,4 +254,4 @@ int Eltwise_vulkan::record_pipeline(const std::vector& bottom_blobs, s return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/eltwise_vulkan.hpp b/source/device/vulkan/layer/eltwise_vulkan.hpp index 5830aea6a..d2fe76c7c 100644 --- a/source/device/vulkan/layer/eltwise_vulkan.hpp +++ b/source/device/vulkan/layer/eltwise_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class 
Eltwise_vulkan : public Layer { public: - Eltwise_vulkan(); - Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -86,9 +85,6 @@ class Eltwise_vulkan : public Layer }; int op_type; // Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 - int input_c; - int input_h; - int input_w; int output_c; int output_h; int output_w; @@ -96,4 +92,4 @@ class Eltwise_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/flatten_vulkan.cpp b/source/device/vulkan/layer/flatten_vulkan.cpp index 798402f2c..0c35079f6 100644 --- a/source/device/vulkan/layer/flatten_vulkan.cpp +++ b/source/device/vulkan/layer/flatten_vulkan.cpp @@ -39,14 +39,14 @@ #include "flatten_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { - -Flatten_vulkan::Flatten_vulkan() +Flatten_vulkan::Flatten_vulkan(const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = false; - + support_inplace = false; + one_blob_only = true; pipeline_flatten = 0; pipeline_flatten_pack4 = 0; pipeline_flatten_pack1to4 = 0; @@ -55,11 +55,10 @@ Flatten_vulkan::Flatten_vulkan() pipeline_flatten_pack4to8 = 0; } -Flatten_vulkan::Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +Flatten_vulkan::Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = true; - + one_blob_only = true; pipeline_flatten = 0; pipeline_flatten_pack4 = 0; pipeline_flatten_pack1to4 = 0; @@ -82,18 +81,15 @@ Flatten_vulkan::Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) input_c = input->dims[1]; // param->input_channel; input_h = input->dims[2]; input_w = input->dims[3]; - output_c = output->dims[1]; // 
param->output_channel; - output_h = output->dims[2]; - output_w = output->dims[3]; - output_size = output->dims[3] * output->dims[2] * output->dims[1]; + output_size = output->elem_num; } int Flatten_vulkan::create_pipeline(const Option& _opt) { Option opt = _opt; - const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + const Tensor shape(input_w, input_h, input_c, nullptr); // bottom_shapes.empty() ? Mat() : bottom_shapes[0]; // const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; - const Tensor& out_shape = Tensor(output_size, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; + const Tensor out_shape(output_size, nullptr); // top_shapes.empty() ? Mat() : top_shapes[0]; int elempack = 1; if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 @@ -133,9 +129,7 @@ int Flatten_vulkan::create_pipeline(const Option& _opt) Tensor out_shape_packed; if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); - // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) { - support_image_storage = false; opt.use_image_storage = false; } @@ -325,4 +319,4 @@ int Flatten_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/flatten_vulkan.hpp b/source/device/vulkan/layer/flatten_vulkan.hpp index cd364ddf2..d752b233d 100644 --- a/source/device/vulkan/layer/flatten_vulkan.hpp +++ b/source/device/vulkan/layer/flatten_vulkan.hpp @@ -50,8 +50,8 @@ namespace TEngine { class Flatten_vulkan : public Layer { public: - Flatten_vulkan(); - Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Flatten_vulkan(const GPUDevice* vkdev); + 
Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -78,4 +78,4 @@ class Flatten_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/innerproduct_vulkan.cpp b/source/device/vulkan/layer/innerproduct_vulkan.cpp index 8e1d66b8a..df8d44a1e 100644 --- a/source/device/vulkan/layer/innerproduct_vulkan.cpp +++ b/source/device/vulkan/layer/innerproduct_vulkan.cpp @@ -39,32 +39,14 @@ #include "innerproduct_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -InnerProduct_vulkan::InnerProduct_vulkan() +InnerProduct_vulkan::InnerProduct_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = true; - - flatten = 0; - - pipeline_innerproduct = 0; - pipeline_innerproduct_pack4 = 0; - pipeline_innerproduct_pack1to4 = 0; - pipeline_innerproduct_pack4to1 = 0; - pipeline_innerproduct_pack8 = 0; - pipeline_innerproduct_pack1to8 = 0; - pipeline_innerproduct_pack4to8 = 0; - pipeline_innerproduct_pack8to4 = 0; - pipeline_innerproduct_pack8to1 = 0; -} - -InnerProduct_vulkan::InnerProduct_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = false; - + one_blob_only = true; flatten = 0; pipeline_innerproduct = 0; @@ -148,13 +130,11 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt) if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); { - support_image_storage = false; opt.use_image_storage = false; } { - flatten = new Flatten_vulkan(); - flatten->vkdev = vkdev; + flatten = new Flatten_vulkan(vkdev); flatten->input_w = shape.w; flatten->input_h = shape.h; @@ -346,11 +326,6 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd, 
const Option& opt) } } - if (support_image_storage && opt.use_image_storage) - { - // cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); - } - else { cmd.record_upload(weight_data_packed, weight_data_gpu, opt); } @@ -362,11 +337,6 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd, const Option& opt) Tensor bias_data_packed; convert_packing(bias_data, bias_data_packed, out_elempack); - if (support_image_storage && opt.use_image_storage) - { - // cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); - } - else { cmd.record_upload(bias_data_packed, bias_data_gpu, opt); } @@ -464,4 +434,4 @@ int InnerProduct_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/innerproduct_vulkan.hpp b/source/device/vulkan/layer/innerproduct_vulkan.hpp index 0549e24f6..7641dd2c8 100644 --- a/source/device/vulkan/layer/innerproduct_vulkan.hpp +++ b/source/device/vulkan/layer/innerproduct_vulkan.hpp @@ -52,8 +52,7 @@ namespace TEngine { class InnerProduct_vulkan : public Layer { public: - InnerProduct_vulkan(); - InnerProduct_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + InnerProduct_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/source/device/vulkan/layer/interp_vulkan.cpp b/source/device/vulkan/layer/interp_vulkan.cpp index 81c8ae748..eaec37214 100644 --- a/source/device/vulkan/layer/interp_vulkan.cpp +++ b/source/device/vulkan/layer/interp_vulkan.cpp @@ -39,30 +39,14 @@ #include "interp_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Interp_vulkan::Interp_vulkan() +Interp_vulkan::Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = 
false; - - pipeline_interp = 0; - pipeline_interp_pack4 = 0; - pipeline_interp_pack8 = 0; - - pipeline_interp_bicubic_coeffs_x = 0; - pipeline_interp_bicubic_coeffs_y = 0; - pipeline_interp_bicubic = 0; - pipeline_interp_bicubic_pack4 = 0; - pipeline_interp_bicubic_pack8 = 0; -} - -Interp_vulkan::Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = false; - + one_blob_only = true; pipeline_interp = 0; pipeline_interp_pack4 = 0; pipeline_interp_pack8 = 0; @@ -158,9 +142,7 @@ int Interp_vulkan::create_pipeline(const Option& _opt) if (out_shape.dims == 3) out_shape_packed = Tensor(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); // check blob shape - // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) { - support_image_storage = false; opt.use_image_storage = false; } @@ -467,4 +449,4 @@ int Interp_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_bl return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/interp_vulkan.hpp b/source/device/vulkan/layer/interp_vulkan.hpp index 98574f499..b7b56945a 100644 --- a/source/device/vulkan/layer/interp_vulkan.hpp +++ b/source/device/vulkan/layer/interp_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class Interp_vulkan : public Layer { public: - Interp_vulkan(); - Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -87,4 +86,4 @@ class Interp_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/packing_vulkan.cpp b/source/device/vulkan/layer/packing_vulkan.cpp index 88a6de812..bea2692de 100644 --- 
a/source/device/vulkan/layer/packing_vulkan.cpp +++ b/source/device/vulkan/layer/packing_vulkan.cpp @@ -39,14 +39,14 @@ #include "packing_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Packing_vulkan::Packing_vulkan() +Packing_vulkan::Packing_vulkan(const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - // support_image_storage = true; - + one_blob_only = true; pipeline_packing = 0; pipeline_packing_pack4 = 0; pipeline_packing_pack8 = 0; @@ -90,9 +90,7 @@ int Packing_vulkan::create_pipeline(const Option& _opt) // if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); // check blob shape - // if (!vkdev->shape_support_image_storage(out_shape_packed)) { - // support_image_storage = false; opt.use_image_storage = false; } @@ -487,4 +485,4 @@ int Packing_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/packing_vulkan.hpp b/source/device/vulkan/layer/packing_vulkan.hpp index f528edf11..dc5cf0a4e 100644 --- a/source/device/vulkan/layer/packing_vulkan.hpp +++ b/source/device/vulkan/layer/packing_vulkan.hpp @@ -48,7 +48,7 @@ namespace TEngine { class Packing_vulkan : public Layer { public: - Packing_vulkan(); + Packing_vulkan(const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/source/device/vulkan/layer/padding_vulkan.cpp b/source/device/vulkan/layer/padding_vulkan.cpp index 27fa57853..fb4bfd583 100644 --- a/source/device/vulkan/layer/padding_vulkan.cpp +++ b/source/device/vulkan/layer/padding_vulkan.cpp @@ -39,12 +39,14 @@ #include "padding_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Padding_vulkan::Padding_vulkan() 
+Padding_vulkan::Padding_vulkan(const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; + one_blob_only = true; pipeline_padding = 0; pipeline_padding_pack4 = 0; pipeline_padding_pack8 = 0; @@ -169,4 +171,4 @@ int Padding_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/padding_vulkan.hpp b/source/device/vulkan/layer/padding_vulkan.hpp index 03bbce43d..c99e0d005 100644 --- a/source/device/vulkan/layer/padding_vulkan.hpp +++ b/source/device/vulkan/layer/padding_vulkan.hpp @@ -48,7 +48,7 @@ namespace TEngine { class Padding_vulkan : public Layer { public: - Padding_vulkan(); + Padding_vulkan(GPUDevice const* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/source/device/vulkan/layer/permute_vulkan.cpp b/source/device/vulkan/layer/permute_vulkan.cpp index 0bead6791..d83a04f43 100644 --- a/source/device/vulkan/layer/permute_vulkan.cpp +++ b/source/device/vulkan/layer/permute_vulkan.cpp @@ -39,30 +39,14 @@ #include "permute_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Permute_vulkan::Permute_vulkan() +Permute_vulkan::Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = true; - - pipeline_permute = 0; - pipeline_permute_pack4 = 0; - pipeline_permute_pack1to4 = 0; - pipeline_permute_pack4to1 = 0; - pipeline_permute_pack8 = 0; - pipeline_permute_pack1to8 = 0; - pipeline_permute_pack4to8 = 0; - pipeline_permute_pack8to4 = 0; - pipeline_permute_pack8to1 = 0; -} - -Permute_vulkan::Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = true; - + one_blob_only = true; pipeline_permute = 0; pipeline_permute_pack4 = 0; pipeline_permute_pack1to4 = 0; @@ 
-158,10 +142,7 @@ int Permute_vulkan::create_pipeline(const Option& _opt) if (out_shape.dims == 2) out_shape_packed = Tensor(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); if (out_shape.dims == 3) out_shape_packed = Tensor(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); - // check blob shape - // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) { - support_image_storage = false; opt.use_image_storage = false; } @@ -479,4 +460,4 @@ int Permute_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/permute_vulkan.hpp b/source/device/vulkan/layer/permute_vulkan.hpp index 2a6763c13..9be16d8eb 100644 --- a/source/device/vulkan/layer/permute_vulkan.hpp +++ b/source/device/vulkan/layer/permute_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class Permute_vulkan : public Layer { public: - Permute_vulkan(); - Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -81,4 +80,4 @@ class Permute_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/pooling_vulkan.cpp b/source/device/vulkan/layer/pooling_vulkan.cpp index 8f4234367..90e8c1574 100644 --- a/source/device/vulkan/layer/pooling_vulkan.cpp +++ b/source/device/vulkan/layer/pooling_vulkan.cpp @@ -39,23 +39,15 @@ #include "pooling_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Pooling_vulkan::Pooling_vulkan() +Pooling_vulkan::Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - 
support_vulkan = true; - pipeline_pooling = 0; - pipeline_pooling_pack4 = 0; - pipeline_pooling_pack8 = 0; - pipeline_pooling_global = 0; - pipeline_pooling_global_pack4 = 0; - pipeline_pooling_global_pack8 = 0; -} + one_blob_only = true; -Pooling_vulkan::Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; pipeline_pooling = 0; pipeline_pooling_pack4 = 0; pipeline_pooling_pack8 = 0; @@ -123,8 +115,7 @@ int Pooling_vulkan::create_pipeline(const Option& opt) } { - padding = new Padding_vulkan(); - padding->vkdev = vkdev; + padding = new Padding_vulkan(vkdev); padding->top = pad_h0; padding->bottom = pad_h1; diff --git a/source/device/vulkan/layer/pooling_vulkan.hpp b/source/device/vulkan/layer/pooling_vulkan.hpp index 33be747b2..c12858c9f 100644 --- a/source/device/vulkan/layer/pooling_vulkan.hpp +++ b/source/device/vulkan/layer/pooling_vulkan.hpp @@ -51,8 +51,7 @@ namespace TEngine { class Pooling_vulkan : public Layer { public: - Pooling_vulkan(); - Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/source/device/vulkan/layer/priorbox_vulkan.cpp b/source/device/vulkan/layer/priorbox_vulkan.cpp index 23198f4e8..efb6f36ca 100644 --- a/source/device/vulkan/layer/priorbox_vulkan.cpp +++ b/source/device/vulkan/layer/priorbox_vulkan.cpp @@ -42,18 +42,10 @@ namespace TEngine { -PriorBox_vulkan::PriorBox_vulkan() +PriorBox_vulkan::PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - - pipeline_priorbox = 0; - pipeline_priorbox_mxnet = 0; -} - -PriorBox_vulkan::PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - + one_blob_only = false; pipeline_priorbox = 0; pipeline_priorbox_mxnet = 0; @@ -351,4 +343,4 @@ int 
PriorBox_vulkan::record_pipeline(const std::vector& bottom_blobs, return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/priorbox_vulkan.hpp b/source/device/vulkan/layer/priorbox_vulkan.hpp index 3ae12f99e..8bf388b1c 100644 --- a/source/device/vulkan/layer/priorbox_vulkan.hpp +++ b/source/device/vulkan/layer/priorbox_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class PriorBox_vulkan : public Layer { public: - PriorBox_vulkan(); - PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -93,4 +92,4 @@ class PriorBox_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/relu_vulkan.cpp b/source/device/vulkan/layer/relu_vulkan.cpp index 510d4245b..101fe10ee 100644 --- a/source/device/vulkan/layer/relu_vulkan.cpp +++ b/source/device/vulkan/layer/relu_vulkan.cpp @@ -39,24 +39,15 @@ #include "relu_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -ReLU_vulkan::ReLU_vulkan() +ReLU_vulkan::ReLU_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = true; - - pipeline_relu = 0; - pipeline_relu_pack4 = 0; - pipeline_relu_pack8 = 0; -} - -ReLU_vulkan::ReLU_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = false; - + one_blob_only = true; + support_inplace = true; pipeline_relu = 0; pipeline_relu_pack4 = 0; pipeline_relu_pack8 = 0; @@ -213,4 +204,4 @@ int ReLU_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git 
a/source/device/vulkan/layer/relu_vulkan.hpp b/source/device/vulkan/layer/relu_vulkan.hpp index c707481c8..ed5170e3b 100644 --- a/source/device/vulkan/layer/relu_vulkan.hpp +++ b/source/device/vulkan/layer/relu_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class ReLU_vulkan : public Layer { public: - ReLU_vulkan(); - ReLU_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + ReLU_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -76,4 +75,4 @@ class ReLU_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/reshape_vulkan.cpp b/source/device/vulkan/layer/reshape_vulkan.cpp index 3f12e241f..4e7bac661 100644 --- a/source/device/vulkan/layer/reshape_vulkan.cpp +++ b/source/device/vulkan/layer/reshape_vulkan.cpp @@ -39,35 +39,13 @@ #include "reshape_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Reshape_vulkan::Reshape_vulkan() +Reshape_vulkan::Reshape_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = true; - - permute_hwc = 0; - permute_hc = 0; - permute_hw = 0; - permute_chw = 0; - - pipeline_reshape = 0; - pipeline_reshape_pack4 = 0; - pipeline_reshape_pack1to4 = 0; - pipeline_reshape_pack4to1 = 0; - pipeline_reshape_pack8 = 0; - pipeline_reshape_pack1to8 = 0; - pipeline_reshape_pack4to8 = 0; - pipeline_reshape_pack8to4 = 0; - pipeline_reshape_pack8to1 = 0; -} - -Reshape_vulkan::Reshape_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = true; - permute_hwc = 0; permute_hc = 0; permute_hw = 0; @@ -202,9 +180,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt) if (out_shape_permuted.dims == 3) out_shape_packed = Tensor(out_shape_permuted.w, out_shape_permuted.h, 
out_shape_permuted.c / out_elempack, (void*)0, out_elemsize, out_elempack); // check blob shape - // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) { - support_image_storage = false; opt.use_image_storage = false; } @@ -582,4 +558,4 @@ int Reshape_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/reshape_vulkan.hpp b/source/device/vulkan/layer/reshape_vulkan.hpp index 1d52e48a8..b1349dcd6 100644 --- a/source/device/vulkan/layer/reshape_vulkan.hpp +++ b/source/device/vulkan/layer/reshape_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class Reshape_vulkan : public Layer { public: - Reshape_vulkan(); - Reshape_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Reshape_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -94,4 +93,4 @@ class Reshape_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/softmax_vulkan.cpp b/source/device/vulkan/layer/softmax_vulkan.cpp index 8ee653505..1c4c565ce 100644 --- a/source/device/vulkan/layer/softmax_vulkan.cpp +++ b/source/device/vulkan/layer/softmax_vulkan.cpp @@ -39,35 +39,15 @@ #include "softmax_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Softmax_vulkan::Softmax_vulkan() +Softmax_vulkan::Softmax_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = true; - - pipeline_softmax_reduce_max = 0; - pipeline_softmax_exp_sub_max = 0; - pipeline_softmax_reduce_sum = 0; - pipeline_softmax_div_sum = 0; - - pipeline_softmax_reduce_max_pack4 = 0; - pipeline_softmax_exp_sub_max_pack4 = 0; - 
pipeline_softmax_reduce_sum_pack4 = 0; - pipeline_softmax_div_sum_pack4 = 0; - - pipeline_softmax_reduce_max_pack8 = 0; - pipeline_softmax_exp_sub_max_pack8 = 0; - pipeline_softmax_reduce_sum_pack8 = 0; - pipeline_softmax_div_sum_pack8 = 0; -} - -Softmax_vulkan::Softmax_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = true; - + one_blob_only = true; + support_inplace = true; pipeline_softmax_reduce_max = 0; pipeline_softmax_exp_sub_max = 0; pipeline_softmax_reduce_sum = 0; diff --git a/source/device/vulkan/layer/softmax_vulkan.hpp b/source/device/vulkan/layer/softmax_vulkan.hpp index 94c1be27c..a52eea16e 100644 --- a/source/device/vulkan/layer/softmax_vulkan.hpp +++ b/source/device/vulkan/layer/softmax_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class Softmax_vulkan : public Layer { public: - Softmax_vulkan(); - Softmax_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Softmax_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -86,4 +85,4 @@ class Softmax_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/shaders/concat.comp b/source/device/vulkan/shaders/concat.comp index 5c904b42e..6275ecca1 100644 --- a/source/device/vulkan/shaders/concat.comp +++ b/source/device/vulkan/shaders/concat.comp @@ -27,25 +27,19 @@ layout (constant_id = 0) const int axis = 0; layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; layout (constant_id = shape_constant_id_offset + 1) const int w = 0; layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; +layout (constant_id = shape_constant_id_offset + 3) const int d = 0; +layout (constant_id = 
shape_constant_id_offset + 4) const int c = 0; +layout (constant_id = shape_constant_id_offset + 5) const int cstep = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; - -layout (local_size_x_id = 233) in; -layout (local_size_y_id = 234) in; -layout (local_size_z_id = 235) in; +layout (constant_id = shape_constant_id_offset + 6) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outd = 0; +layout (constant_id = shape_constant_id_offset + 10) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 11) const int outcstep = 0; #if NCNN_image_shader -layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; -layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; -layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; -layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; #else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; @@ -57,12 +51,14 @@ layout (push_constant) uniform parameter int dims; int w; int h; + int d; int c; int cstep; int outdims; int outw; int outh; + int outd; int outc; int outcstep; @@ -75,32 +71,34 @@ void main() int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + if (gx >= psc(w) || gy >= psc(h) * psc(d) || gz >= psc(c)) return; -#if NCNN_image_shader - 
if (psc(dims) == 1) - { - image1d_cp1(top_blob_1d, gx + p.offset, bottom_blob_1d, gx); - } - else if (psc(dims) == 2) + int positive_axis = axis < 0 ? psc(dims) + axis : axis; + + ivec3 gxyz; + + if (psc(dims) == 4) { - if (axis == 0) image2d_cp1(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy)); - if (axis == 1) image2d_cp1(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy)); + int yd = gy / psc(h); + int yh = gy % psc(h); + + ivec4 gxydz = ivec4(gx, yh, yd, gz); + gxydz[psc(dims) - 1 - positive_axis] += p.offset; + + gxyz = ivec3(gxydz.r, gxydz.g + gxydz.b * psc(outh), gxydz.a); } - else // if (psc(dims) == 3) + else { - if (axis == 0) image3d_cp1(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz)); - if (axis == 1) image3d_cp1(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz)); - if (axis == 2) image3d_cp1(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + gxyz = ivec3(gx, gy, gz); + gxyz[psc(dims) - 1 - positive_axis] += p.offset; } + +#if NCNN_image_shader + image3d_cp1(top_blob_3d, gxyz, bottom_blob_3d, ivec3(gx, gy, gz)); #else const int gi = gz * psc(cstep) + gy * psc(w) + gx; - ivec3 gxyz = ivec3(gx, gy, gz); - - gxyz[psc(dims) - 1 - axis] += p.offset; - int v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; buffer_cp1(top_blob_data, v_offset, bottom_blob_data, gi); diff --git a/source/device/vulkan/vulkan_allocator.cpp b/source/device/vulkan/vulkan_allocator.cpp index b901923cd..be765183e 100644 --- a/source/device/vulkan/vulkan_allocator.cpp +++ b/source/device/vulkan/vulkan_allocator.cpp @@ -1428,7 +1428,6 @@ VkWeightStagingAllocator::~VkWeightStagingAllocator() VkBufferMemory* VkWeightStagingAllocator::fastMalloc(size_t size) { - printf("VkWeightStagingAllocator fastMalloc %lu\n", size); VkBufferMemory* ptr = new VkBufferMemory; ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | 
VK_BUFFER_USAGE_TRANSFER_DST_BIT); diff --git a/source/device/vulkan/vulkan_device.cc b/source/device/vulkan/vulkan_device.cc index 57067405b..df45ec145 100644 --- a/source/device/vulkan/vulkan_device.cc +++ b/source/device/vulkan/vulkan_device.cc @@ -27,8 +27,7 @@ #include "vulkan_limit.hpp" #include "vulkan_graph.hpp" -extern "C" -{ +extern "C" { #include "api/c_api.h" #include "device/device.h" #include "graph/tensor.h" @@ -44,7 +43,6 @@ extern "C" #include - int vulkan_describe(struct device* device, struct vector* allowed_ops, struct vector* blocked_ops, struct vector* precision) { (void)device; @@ -78,7 +76,6 @@ int vulkan_describe(struct device* device, struct vector* allowed_ops, struct ve return 0; } - int vulkan_evaluation(struct device* device, struct subgraph* sub_graph, struct vector* evolution_tensors, struct vector* evolution_nodes) { // nothing to do with vulkan @@ -90,7 +87,6 @@ int vulkan_evaluation(struct device* device, struct subgraph* sub_graph, struct return 0; } - int vulkan_allocate(struct device* device, struct subgraph* sub_graph) { if (nullptr == device) @@ -112,7 +108,6 @@ int vulkan_allocate(struct device* device, struct subgraph* sub_graph) return 0; } - int vulkan_release(struct device* device, struct subgraph* sub_graph) { (void)sub_graph; @@ -162,48 +157,41 @@ int vulkan_split_graph(struct graph* ir_graph) return 0; } - -extern "C" -{ +extern "C" { static struct interface vulkan_interface = { - .init = vulkan_dev_init, - .pre_run = vulkan_dev_prerun, - .run = vulkan_dev_run, - .post_run = vulkan_dev_postrun, - .async_run = nullptr, - .async_wait = nullptr, - .release_graph = nullptr, - .release_device = vulkan_dev_release, + .init = vulkan_dev_init, + .pre_run = vulkan_dev_prerun, + .run = vulkan_dev_run, + .post_run = vulkan_dev_postrun, + .async_run = nullptr, + .async_wait = nullptr, + .release_graph = nullptr, + .release_device = vulkan_dev_release, }; - static struct allocator vulkan_allocator = { - .describe = 
vulkan_describe, - .evaluation = vulkan_evaluation, - .allocate = vulkan_allocate, - .release = vulkan_release, + .describe = vulkan_describe, + .evaluation = vulkan_evaluation, + .allocate = vulkan_allocate, + .release = vulkan_release, }; - static struct optimizer vulkan_optimizer = { - .split_graph = vulkan_split_graph, - .optimize_graph = nullptr, + .split_graph = vulkan_split_graph, + .optimize_graph = nullptr, }; - - static struct vulkan_device vulkan_dev = { - .base = { - .name = VULKAN_DEV_NAME, - .interface = &vulkan_interface, - .allocator = &vulkan_allocator, - .optimizer = &vulkan_optimizer, - .scheduler = nullptr, - .privacy = nullptr, - }, + .base = { + .name = VULKAN_DEV_NAME, + .interface = &vulkan_interface, + .allocator = &vulkan_allocator, + .optimizer = &vulkan_optimizer, + .scheduler = nullptr, + .privacy = nullptr, + }, }; - int register_vulkan_device(void) { int ret = register_device(&vulkan_dev.base); @@ -217,7 +205,6 @@ int register_vulkan_device(void) return 0; } - int unregister_vulkan_device(void) { int ret = unregister_device(&vulkan_dev.base); diff --git a/source/device/vulkan/vulkan_executor.cc b/source/device/vulkan/vulkan_executor.cc index ca030e894..b2f0c1b41 100644 --- a/source/device/vulkan/vulkan_executor.cc +++ b/source/device/vulkan/vulkan_executor.cc @@ -45,7 +45,6 @@ bool VULKANEngine::init() int VULKANEngine::VULKANEnginePreRun(struct subgraph* subgraph) { // TLOG_INFO("==== vulkan prerun start ====\n"); - create_gpu_instance(); // struct device *vk_dev = (struct device *)dev; struct graph *orig_graph = subgraph->graph; // struct vk_dev_priv *priv = (struct vk_dev_priv *)orig_graph->dev_priv; @@ -93,6 +92,5 @@ int VULKANEngine::VULKANEngineRun(struct subgraph* subgraph) void VULKANEngine::VULKANEnginePostRun() { - destroy_gpu_instance(); return; -}; \ No newline at end of file +}; diff --git a/source/device/vulkan/vulkan_executor.hpp b/source/device/vulkan/vulkan_executor.hpp index c4cc99a6c..244b5e40e 100644 --- 
a/source/device/vulkan/vulkan_executor.hpp +++ b/source/device/vulkan/vulkan_executor.hpp @@ -49,16 +49,6 @@ extern "C" { // typedef std::map dict_uint2clmem; -struct VULKANqueue -{ - std::string name; - int dims; - // cl_kernel queue_kernel; - // cl_event enentPoint; - size_t* queue_global_work_size; - size_t* queue_local_work_size; -}; - class VULKANEngine { public: @@ -72,11 +62,6 @@ class VULKANEngine private: bool init(); -private: -public: - // dict_uint2clmem vulkan_tensor_map; - std::vector queue_list; - public: int bin_num; }; diff --git a/source/device/vulkan/vulkan_gpu.cpp b/source/device/vulkan/vulkan_gpu.cpp index fba68aa70..b42bd8a52 100644 --- a/source/device/vulkan/vulkan_gpu.cpp +++ b/source/device/vulkan/vulkan_gpu.cpp @@ -798,7 +798,7 @@ int create_gpu_instance() } if (gpu_info.support_VK_KHR_16bit_storage) { - gpu_info.support_fp16_storage = query16BitStorageFeatures.storageBuffer16BitAccess && query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess; + gpu_info.support_fp16_storage = query16BitStorageFeatures.storageBuffer16BitAccess && query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess && query16BitStorageFeatures.storageInputOutput16; } if (gpu_info.support_VK_KHR_shader_float16_int8) { @@ -1945,8 +1945,7 @@ int GPUDevice::create_utility_operator() opt.use_shader_pack8 = true; { // create packing layer - TEngine::Packing_vulkan* uop = new Packing_vulkan(); - uop->vkdev = this; + TEngine::Packing_vulkan* uop = new Packing_vulkan(this); uop->out_elempack = k == 0 ? 1 : k == 1 ? 
4 : 8; diff --git a/source/device/vulkan/vulkan_graph.cc b/source/device/vulkan/vulkan_graph.cc index 222477f80..84c9365ff 100644 --- a/source/device/vulkan/vulkan_graph.cc +++ b/source/device/vulkan/vulkan_graph.cc @@ -23,8 +23,13 @@ */ #include "vulkan_graph.hpp" +#include "api/c_api.h" #include "vulkan_executor.hpp" +#include "vulkan_gpu.hpp" +#include +#include +#include #include #include "vulkan_graph.hpp" #include "vulkan_pipeline.hpp" @@ -51,23 +56,46 @@ #include "layer/crop_vulkan.hpp" #include +#include -extern "C" -{ +extern "C" { #include "graph/tensor.h" #include "graph/node.h" #include "graph/graph.h" #include "graph/subgraph.h" } +#define VULKAN_DEBUG_TENSOR 0 + +static void save_tensor(const char* fname, const float* vals, std::vector const& dims) +{ + auto fout = fopen(fname, "w+"); + assert(fout); + int n = 1; + + for (auto const d : dims) + { + fprintf(fout, "%d ", d); + n *= d; + } + fprintf(fout, "\n"); + + for (int i = 0; i < n; ++i) + { + fprintf(fout, "%f ", vals[i]); + } + fprintf(fout, "\n"); + fflush(fout); + fclose(fout); +} int vulkan_dev_init(struct device* dev) { (void)dev; + TEngine::create_gpu_instance(); return 0; } - int vulkan_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options) { subgraph->device_graph = new VULKANEngine; @@ -76,14 +104,12 @@ int vulkan_dev_prerun(struct device* dev, struct subgraph* subgraph, void* optio return engine->VULKANEnginePreRun(subgraph); } - int vulkan_dev_run(struct device* dev, struct subgraph* subgraph) { auto engine = (VULKANEngine*)subgraph->device_graph; return engine->VULKANEngineRun(subgraph); } - int vulkan_dev_postrun(struct device* dev, struct subgraph* subgraph) { auto engine = (VULKANEngine*)subgraph->device_graph; @@ -93,15 +119,13 @@ int vulkan_dev_postrun(struct device* dev, struct subgraph* subgraph) return 0; } - int vulkan_dev_release(struct device* dev) { (void)dev; + TEngine::destroy_gpu_instance(); return 0; } - - namespace TEngine { static double 
get_cur_time(void) @@ -113,7 +137,6 @@ static double get_cur_time(void) return tv.tv_sec * 1000.0 + (tv.tv_usec / 1000.0); } - VulkanGraph::VulkanGraph(struct subgraph* graph) { vkdev = get_gpu_device(); @@ -123,13 +146,13 @@ VulkanGraph::VulkanGraph(struct subgraph* graph) // set graph options if (!vkdev->info.support_fp16_packed || !vkdev->info.support_fp16_storage) opt.use_fp16_packed = false; - if (!vkdev->info.support_fp16_storage) + if (!vkdev->info.support_fp16_storage) { opt.use_fp16_storage = false; opt.use_shader_pack8 = false; - } + } - if (!vkdev->info.support_fp16_arithmetic) + if (!vkdev->info.support_fp16_arithmetic) opt.use_fp16_arithmetic = false; TLOG_INFO("use_fp16_packed %d\n", opt.use_fp16_packed); @@ -137,169 +160,141 @@ VulkanGraph::VulkanGraph(struct subgraph* graph) TLOG_INFO("use_shader_pack8 %d\n", opt.use_shader_pack8); TLOG_INFO("use_fp16_arithmetic %d\n", opt.use_fp16_arithmetic); - struct subgraph *subgraph = (struct subgraph *)graph; - struct graph *ir_graph = subgraph->graph; + struct subgraph* subgraph = (struct subgraph*)graph; + struct graph* ir_graph = subgraph->graph; int node_num = subgraph->node_num; sgraph = graph; - for(int i = 0; i < node_num; i++) + for (int i = 0; i < node_num; i++) { - struct node *ir_node = get_ir_graph_node(ir_graph, subgraph->node_list[i]); + struct node* ir_node = get_ir_graph_node(ir_graph, subgraph->node_list[i]); + for (int i = 0; i < ir_node->input_num; ++i) + { + struct tensor* input = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[i]); + const auto name = input->name; + tensor_map_[name] = input; + tensor_map[name] = Tensor(input); + VkTensor vktensor; + vktensor_map_[name] = vktensor; + } + + for (int i = 0; i < ir_node->output_num; ++i) + { + struct tensor* output = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[i]); + const auto name = output->name; + tensor_map_[name] = output; + tensor_map[name] = Tensor(output); + } if (ir_node->op.type == OP_CONST || ir_node->op.type == 
OP_INPUT) continue; else if (ir_node->op.type == OP_CLIP) ir_node->op.type = OP_RELU6; - if(ir_node->op.type == OP_CONV) + if (ir_node->op.type == OP_CONV) { - struct conv_param *conv_param = (struct conv_param *)ir_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; if (conv_param->group == conv_param->output_channel && conv_param->group != 1 && ir_graph->graph_layout == TENGINE_LAYOUT_NCHW) // DW { - Layer* layer = new ConvolutionDepthWise_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "ConvolutionDepthWise"; + Layer* layer = new ConvolutionDepthWise_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } else { - Layer* layer = new Convolution_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Convolution"; + Layer* layer = new Convolution_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } } - if(ir_node->op.type == OP_POOL) + if (ir_node->op.type == OP_POOL) { - Layer* layer = new Pooling_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Pooling"; + Layer* layer = new Pooling_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_FC) + if (ir_node->op.type == OP_FC) { - Layer* layer = new InnerProduct_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "InnerProduct"; + Layer* layer = new InnerProduct_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_FLATTEN) + if (ir_node->op.type == OP_FLATTEN) { - Layer* layer = new Flatten_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Flatten"; + Layer* layer = new Flatten_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_SOFTMAX) + if (ir_node->op.type == OP_SOFTMAX) { - Layer* layer = new Softmax_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Softmax"; + Layer* layer = new Softmax_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - 
if(ir_node->op.type == OP_RELU) + if (ir_node->op.type == OP_RELU) { - Layer* layer = new ReLU_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "ReLU"; + Layer* layer = new ReLU_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_DROPOUT) + if (ir_node->op.type == OP_DROPOUT) { - Layer* layer = new Dropout_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Dropout"; + Layer* layer = new Dropout_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_ELTWISE) + if (ir_node->op.type == OP_ELTWISE) { - Layer* layer = new Eltwise_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Eltwise"; + Layer* layer = new Eltwise_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_PRIORBOX) + if (ir_node->op.type == OP_PRIORBOX) { - Layer* layer = new PriorBox_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "PriorBox"; + Layer* layer = new PriorBox_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_PERMUTE) + if (ir_node->op.type == OP_PERMUTE) { - Layer* layer = new Permute_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Permute"; + Layer* layer = new Permute_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_CONCAT) + if (ir_node->op.type == OP_CONCAT) { - Layer* layer = new Concat_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Concat"; + Layer* layer = new Concat_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_RESHAPE) + if (ir_node->op.type == OP_RESHAPE) { - Layer* layer = new Reshape_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Reshape"; + Layer* layer = new Reshape_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_INTERP || ir_node->op.type == OP_UPSAMPLE) + if (ir_node->op.type == 
OP_INTERP || ir_node->op.type == OP_UPSAMPLE) { - Layer* layer = new Interp_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Interp"; + Layer* layer = new Interp_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_CROP) + if (ir_node->op.type == OP_CROP) { - Layer* layer = new Crop_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Crop"; + Layer* layer = new Crop_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - - struct tensor *input = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); - std::string name = input->name; - tensor_map_[name] = input; - tensor_map[name] = Tensor(input); - - VkTensor vktensor; - vktensor_map_[name] = vktensor; - - struct tensor *output = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - name = output->name; - tensor_map_[name] = output; - tensor_map[name] = Tensor(output); } } VulkanGraph::~VulkanGraph() { - for(auto& ptr: mem_buf_vector_) - std::free(ptr); + for (auto& ptr : mem_buf_vector_) + std::free(ptr); } int VulkanGraph::upload_model() { - -// printf("run upload_model\n"); TEngine::VkTransfer cmd(vkdev); if (!weight_vkallocator) { @@ -309,31 +304,28 @@ int VulkanGraph::upload_model() { weight_staging_vkallocator = new VkWeightStagingAllocator(vkdev); } - + Option opt_upload = opt; opt_upload.blob_vkallocator = weight_vkallocator; opt_upload.workspace_vkallocator = weight_vkallocator; opt_upload.staging_vkallocator = weight_staging_vkallocator; int layer_size = layers.size(); - for(int i = 0; i < layer_size; i++) + for (int i = 0; i < layer_size; i++) { layers[i]->upload_model(cmd, opt_upload); - } - + } + cmd.submit_and_wait(); -// printf("run upload_model done\n"); return 0; } int VulkanGraph::create_pipeline() { - // printf("start to run create pipeline\n"); - for (size_t i=0; iname.c_str()); int cret = layer->create_pipeline(opt1); if (cret != 0) { @@ -341,14 +333,11 @@ int VulkanGraph::create_pipeline() return -1; } } 
-// printf("run create_pipeline done\n"); return 0; } int VulkanGraph::record_graph_pipeline() { - // printf("start to run record pipeline, layer size:%d\n", layers.size()); - TEngine::VkCompute cmd(vkdev); if (!opt.blob_vkallocator) @@ -365,63 +354,50 @@ int VulkanGraph::record_graph_pipeline() local_staging_vkallocator = vkdev->acquire_staging_allocator(); opt.staging_vkallocator = local_staging_vkallocator; } - std::string name; - Tensor input; - Tensor output; - - // printf("tensor_map size:%d ---------------------\n", tensor_map.size()); + // upload input tensor + for (int i = 0; i < sgraph->input_num; ++i) + { + auto input_tensor = get_ir_graph_tensor(sgraph->graph, sgraph->input_tensor_list[i]); + const auto name = get_tensor_name(input_tensor); + tensor_map_[name] = input_tensor; + cmd.record_upload(tensor_map_[name], vktensor_map_[name], opt); + } - for (size_t i=0; iname.c_str()); - - std::string in_name = layer->bottoms[0]; std::string out_name = layer->tops[0]; - name = out_name; - // upload Tensor data to VkTensor - if((i==0) && vktensor_map_[in_name].dims == 0) + int cret = 0; + if (layer->one_blob_only) { - cmd.record_upload(tensor_map_[in_name], vktensor_map_[in_name], opt); - // cmd.record_download(vktensor_map_[in_name], tensor_map[in_name], opt); - } - - int cret; - if(layer->name == "ReLU" || layer->name == "Dropout" || layer->name == "Softmax") // inplace - { - VkTensor bottom_tensor = vktensor_map_[in_name]; - cret = layer->record_pipeline(bottom_tensor, cmd, opt); - vktensor_map_[out_name] = bottom_tensor; + std::string const& in_name = layer->bottoms[0]; + auto& bottom_tensor = vktensor_map_[in_name]; + if (layer->support_inplace) + { + cret = layer->record_pipeline(bottom_tensor, cmd, opt); + //FIXME: chec and log here + vktensor_map_[out_name] = bottom_tensor; + } + else + { + VkTensor top_blob; + cret = layer->record_pipeline(bottom_tensor, top_blob, cmd, opt); + vktensor_map_[out_name] = top_blob; + } } - else if(layer->name == "Eltwise" 
|| layer->name == "Concat" || layer->name == "PriorBox" || layer->name == "Crop") // multi-in, one-out + else { std::vector bottom_blobs; - for(int i = 0; i < layer->bottoms.size(); i++) + for (auto const& inp : layer->bottoms) { - bottom_blobs.push_back(vktensor_map_[layer->bottoms[i]]); + bottom_blobs.push_back(vktensor_map_[inp]); } - VkTensor top_tensor; - std::vector top_blobs; - top_blobs.push_back(top_tensor); + std::vector top_blobs(1); cret = layer->record_pipeline(bottom_blobs, top_blobs, cmd, opt); - vktensor_map_[out_name] = top_blobs[0]; - } - else // original one-in one-out - { - VkTensor bottom_tensor = vktensor_map_[in_name]; - VkTensor top_tensor; - cret = layer->record_pipeline(bottom_tensor, top_tensor, cmd, opt); - vktensor_map_[out_name] = top_tensor; - } - - // download all nodes data - { - // Tensor tmp_tensor; - // cmd.record_download(vktensor_map_[out_name], tmp_tensor, opt); - // tensor_map[out_name] = tmp_tensor; + vktensor_map_[out_name] = top_blobs.front(); } if (cret != 0) @@ -431,108 +407,61 @@ int VulkanGraph::record_graph_pipeline() } } - cmd.record_download(vktensor_map_[name], output, opt); - - // // download output - // int byte_size=tensor_map_[name]->elem_size * tensor_map_[name]->elem_num; - // void* mem=std::malloc(byte_size); - // tensor_map_[name]->data = mem; - // cmd.record_download(vktensor_map_[name], tensor_map_[name], opt); + auto for_each_output = [this](std::function const& fn) { + auto output_num = sgraph->output_num; + for (int i = 0; i < output_num; ++i) + { + auto output_tensor = sgraph->graph->tensor_list[sgraph->output_tensor_list[i]]; + auto const* name = get_tensor_name(output_tensor); + fn(name); + } + }; -// double total_time, min_time, max_time; -// min_time = 999999999; -// max_time = 0; -// total_time = 0; -// double start_time = get_cur_time(); + for_each_output([this, &cmd](const char* name) { + auto vkoutput = vktensor_map_.find(name); + if (vkoutput == vktensor_map_.cend()) + { + fprintf(stderr, "%s 
output tensor is not found.\n", name); + return; + }; + cmd.record_download(vkoutput->second, tensor_map[name], opt); + }); cmd.submit_and_wait(); -// double end_time = get_cur_time(); -// double cur_time = end_time - start_time; -// total_time += cur_time; -// if (cur_time > max_time) -// max_time = cur_time; -// if (cur_time < min_time) -// min_time = cur_time; -// printf("vulkan Repeat [1] min %.3f ms, max %.3f ms, avg %.3f ms\n", min_time, max_time, total_time / 1); - - Tensor tmp_fp32; - if(output.elemsize == output.elempack * 2) - { - TEngine::cast_float16_to_float32(output, tmp_fp32, opt); - } - else - { - tmp_fp32 = output; - } - - Tensor blob_unpacked; - if (opt.use_packing_layout) - { - convert_packing(tmp_fp32, blob_unpacked, 1, opt); - } - else - { - blob_unpacked = tmp_fp32; - } - - tensor_map_[name]->data = blob_unpacked.data; - - -// #define DEBUG_OUTPUT -#ifdef DEBUG_OUTPUT - printf("run save tensor data\n"); - for (size_t j=0; jtops[0]; - // std::string in_name = layer->bottoms[0]; - printf("%s\n", in_name.c_str()); + for_each_output([this](const char* name) { + auto pos = tensor_map.find(name); + if (pos == tensor_map.cend()) + { + fprintf(stderr, "%s output tensor is not found.\n", name); + return; + } - std::string fname = std::to_string(j)+".data"; - FILE* fp = fopen(fname.c_str(), "w"); + auto& output = pos->second; - // float * data = (float*)get_tensor_buffer(tensor_map_[name]); - // float* data = (float*)vktensor_map_[in_name].mapped_ptr(); - // float* data = (float*)tensor_map_[in_name]->data; - // float* data = (float*)tensor_map[in_name].data; - Tensor tmp_fp16 = tensor_map[in_name]; Tensor tmp_fp32; - if(tmp_fp16.elemsize == tmp_fp16.elempack * 2) - TEngine::cast_float16_to_float32(tmp_fp16, tmp_fp32, opt); + if (output.elemsize == output.elempack * 2) + { + TEngine::cast_float16_to_float32(output, tmp_fp32, opt); + } else - tmp_fp32 = tmp_fp16; - + { + tmp_fp32 = output; + } + Tensor blob_unpacked; if (opt.use_packing_layout) + { 
convert_packing(tmp_fp32, blob_unpacked, 1, opt); + } else - blob_unpacked = tmp_fp32; - - int byte_size=tensor_map_[in_name]->elem_size * tensor_map_[name]->elem_num; - void* mem=std::malloc(byte_size); - memcpy(mem, blob_unpacked.data, byte_size); - tensor_map_[in_name]->data = mem; - // tensor_map_[in_name]->data = blob_unpacked.data; - - // float* data = (float*)tmp_fp32.data; - float* data = (float*)blob_unpacked.data; - printf("tensor shape:%d %d %d %d\n", tensor_map_[in_name]->dims[0], tensor_map_[in_name]->dims[1], tensor_map_[in_name]->dims[2], tensor_map_[in_name]->dims[3]); - byte_size=tensor_map_[in_name]->elem_size * tensor_map_[in_name]->elem_num; - for(int i = 0; i < byte_size/sizeof(float); i++) { - if(i % 16 == 0) - { - fprintf(fp, "\n%d:", i); - } - fprintf(fp, " %.6f", data[i]); + blob_unpacked = tmp_fp32; } - fprintf(fp, "\n"); - fclose(fp); - } -#endif + tensor_map[name] = blob_unpacked; // don't release blob_unpacked + tensor_map_[name]->data = blob_unpacked.data; + }); return 0; } @@ -542,4 +471,4 @@ int VulkanGraph::destory_pipeline() return 0; } -} +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_layer.cpp b/source/device/vulkan/vulkan_layer.cpp index 84f2b9de2..4b97cb4d1 100644 --- a/source/device/vulkan/vulkan_layer.cpp +++ b/source/device/vulkan/vulkan_layer.cpp @@ -41,9 +41,9 @@ namespace TEngine { -Layer::Layer() +Layer::Layer(const GPUDevice* vkdev) + : vkdev(vkdev), one_blob_only(true), support_inplace(false) { - support_vulkan = false; } Layer::~Layer() @@ -81,4 +81,4 @@ int Layer::record_pipeline(const std::vector& bottom_blobs, std::vecto return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_layer.hpp b/source/device/vulkan/vulkan_layer.hpp index 2c2be9710..624fd5072 100644 --- a/source/device/vulkan/vulkan_layer.hpp +++ b/source/device/vulkan/vulkan_layer.hpp @@ -64,7 +64,7 @@ class Layer { public: // empty - Layer(); + Layer(const 
GPUDevice* vkdev); // virtual destructor virtual ~Layer(); @@ -86,17 +86,14 @@ class Layer virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; public: - // support vulkan compute - bool support_vulkan; - // accept input blob with packed storage bool support_packing; // accept bf16 bool support_bf16_storage; - // shader image storage - bool support_image_storage; + bool one_blob_only; + bool support_inplace; public: const GPUDevice* vkdev; @@ -104,8 +101,6 @@ class Layer std::vector tops; public: - // layer name - std::string name; // Node* node; ir_graph_t* graph; ir_node_t* node; diff --git a/source/device/vulkan/vulkan_limit.hpp b/source/device/vulkan/vulkan_limit.hpp index fbb45e089..d77c1201e 100644 --- a/source/device/vulkan/vulkan_limit.hpp +++ b/source/device/vulkan/vulkan_limit.hpp @@ -64,7 +64,7 @@ const int vulkan_supported_ops[] = { //// OP_CONCAT, // OP_CONST, // OP_CONV, - //// OP_CROP, + OP_CROP, //// OP_DECONV, //// OP_DEPTHTOSPACE, //// OP_DETECTION_OUTPUT, @@ -84,7 +84,7 @@ const int vulkan_supported_ops[] = { //// OP_HARDSWISH, // OP_INPUT, //// OP_INSTANCENORM, - //// OP_INTERP, + OP_INTERP, //// OP_LOGICAL, //// OP_LOGISTIC, //// OP_LRN, diff --git a/source/graph/tensor.c b/source/graph/tensor.c index 5b065a458..fc92aee92 100644 --- a/source/graph/tensor.c +++ b/source/graph/tensor.c @@ -359,3 +359,36 @@ int set_ir_tensor_consumer(ir_tensor_t* ir_tensor, const int index) return 0; } + +float tensor_mean(ir_tensor_t* ir_tensor) +{ + float sum = .0; + float* p = ir_tensor->data; + for (int i = 0; i < ir_tensor->elem_num; ++i) + { + sum += p[i]; + } + + float mean = sum / (float)ir_tensor->elem_num; + return mean; +} + +void save_tensor(const char* fname, const float* data, const int* dims, const int dim_num) +{ + FILE* fout = fopen(fname, "w+"); + int n = 1; + for (int i = 0; i < dim_num; ++i) + { + n *= dims[i]; + fprintf(fout, "%d ", dims[i]); + } + fprintf(fout, "\n"); + 
+ for (int i = 0; i < n; ++i) + { + fprintf(fout, "%f ", data[i]); + } + fprintf(fout, "\n"); + fflush(fout); + fclose(fout); +} diff --git a/source/graph/tensor.h b/source/graph/tensor.h index 9d392f8b3..dd246c162 100644 --- a/source/graph/tensor.h +++ b/source/graph/tensor.h @@ -193,6 +193,8 @@ void dump_ir_tensor(struct graph* ir_graph, ir_tensor_t* ir_tensor); * @return statue value, 0 success, other value failure. */ int set_ir_tensor_consumer(ir_tensor_t* ir_tensor, const int index); +float tensor_mean(ir_tensor_t* tensor); +void save_tensor(const char* fname, const float* data, const int* dims, const int dim_num); #ifdef __cplusplus } diff --git a/source/serializer/tmfile/op/tm2_layernorm.c b/source/serializer/tmfile/op/tm2_layernorm.c index 4645e8405..4dbfa7e31 100644 --- a/source/serializer/tmfile/op/tm2_layernorm.c +++ b/source/serializer/tmfile/op/tm2_layernorm.c @@ -40,7 +40,7 @@ static int layernorm_op_map(int op) } static int tm2_load_layernorm(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, - const TM2_Operator* tm_op) + const TM2_Operator* tm_op) { struct layernorm_Param* gather_param = (struct layernorm_Param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ed7c12b41..6c7c8f522 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,7 +1,35 @@ -# generate tengine header file -FILE (MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tengine) -FILE (COPY ${CMAKE_SOURCE_DIR}/source/api/c_api.h DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tengine) -FILE (COPY ${CMAKE_SOURCE_DIR}/source/api/c_api_ex.h DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tengine) +#generate tengine header file +FILE(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tengine) +FILE(COPY ${CMAKE_SOURCE_DIR}/source/api/c_api.h DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tengine) +FILE(COPY ${CMAKE_SOURCE_DIR}/source/api/c_api_ex.h DESTINATION 
${CMAKE_CURRENT_BINARY_DIR}/tengine) + +function(tengine_op_test name) + file(GLOB TENGINE_UTIL_SOURCE_FILES ${PROJECT_SOURCE_DIR}/tests/common/util/*.c) + add_executable(${name} "${CMAKE_CURRENT_SOURCE_DIR}/op/${name}.c" "${TENGINE_UTIL_SOURCE_FILES}") + + target_link_libraries(${name} PUBLIC "${CMAKE_PROJECT_NAME}-static") + + target_include_directories (${name} PRIVATE "${PROJECT_SOURCE_DIR}/source") + target_include_directories (${name} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") + target_include_directories (${name} PRIVATE "${PROJECT_BINARY_DIR}") + target_include_directories (${name} PRIVATE "${PROJECT_BINARY_DIR}/source") + target_include_directories (${name} PRIVATE "${PROJECT_SOURCE_DIR}/tests/common") + target_include_directories (${name} PRIVATE "${PROJECT_SOURCE_DIR}/tests/common/util") + +endfunction() +tengine_op_test(test_op_absval) +tengine_op_test(test_op_add_n) +tengine_op_test(test_op_argmax) +tengine_op_test(test_op_argmin) +tengine_op_test(test_op_batchnorm) +tengine_op_test(test_op_batchtospacend) +tengine_op_test(test_op_bias) +tengine_op_test(test_op_broadmul) +tengine_op_test(test_op_cast) +tengine_op_test(test_op_ceil) +tengine_op_test(test_op_clip) +tengine_op_test(test_op_comparison) +tengine_op_test(test_op_conv) if (TENGINE_ENABLE_OPENDLA) function (tengine_opendla_op_test name file) diff --git a/tests/op/test_op.h b/tests/op/test_op.h index 91106e187..5a5aaac51 100644 --- a/tests/op/test_op.h +++ b/tests/op/test_op.h @@ -1,27 +1,423 @@ #ifndef __TEST_COMMON_H__ #define __TEST_COMMON_H__ -#include +#include #include #include #include #include #include +#include +#include //#include "float.h" -#include "compiler_fp16.h" +#include "api/c_api.h" #include "tengine/c_api.h" +#include "mathp.h" +#include "vector.h" #include "graph/graph.h" #include "graph/subgraph.h" #include "graph/node.h" #include "graph/tensor.h" +#include #define TENSOR_SHOW_LEADING_BLANK " " #define TENSOR_FLOAT_EPSILON 0.0001f +typedef union +{ + struct + { + uint16_t frac 
: 10; + uint16_t exp : 5; + uint16_t sign : 1; + } __attribute__((packed)) bits; + + uint16_t u16; +} __attribute__((packed)) __pack16_t; + +typedef union +{ + struct + { + uint32_t frac : 23; + uint32_t exp : 8; + uint32_t sign : 1; + } __attribute__((packed)) bits; + uint32_t u32; + float fp32; +} __attribute__((packed)) __pack32_t; + +static uint16_t __fp32_to_fp16(float fp32) +{ + const float fp32_abs = fabs(fp32); + __pack32_t pack32 = {.fp32 = fp32}; + __pack16_t pack16 = {.u16 = 0}; + + if (pack32.bits.exp == 0 && pack32.bits.frac == 0) + { + pack16.bits.sign = pack32.bits.sign; + pack16.bits.frac = 0; + pack16.bits.exp = 0; + return pack16.u16; + } + + // nan + if (isnan(fp32)) + { + pack16.bits.exp = 0x1f; + pack16.bits.frac = 1; + pack16.bits.sign = pack32.bits.sign; + return pack16.u16; + } + + // inf + if (isinf(fp32)) + { + pack16.bits.exp = 0x1f; + pack16.bits.frac = 0; + pack16.bits.sign = pack32.bits.sign; + return pack16.u16; + } + + // upper to fp16 max norm + if (fp32_abs > 65504.0f) + { + pack16.bits.sign = pack32.bits.sign; + pack16.bits.exp = 0x1e; + pack16.bits.frac = 1023; + return pack16.u16; + } + + // lower than min subnormalnorm + if (fp32_abs < 5.96046448e-8f) + { + return .0f; + } + + // lower than fp16 min norm: fp32 normalized to fp16 subnormal + if (fp32_abs < 6.103515625e-5) + { + pack16.bits.sign = pack32.bits.sign; + pack16.bits.exp = pack32.bits.exp - 127 + 15; + pack16.bits.frac = pack32.bits.frac >> 13; + return pack16.u16; + } + + // fp32 normalized to fp16 normalzied + if (pack32.bits.exp != 0 && pack32.bits.frac != 0) + { + pack16.bits.sign = pack32.bits.sign; + pack16.bits.exp = pack32.bits.exp - 127 + 15; + pack16.bits.frac = pack32.bits.frac >> 13; + return pack16.u16; + } + + return pack16.u16; +} + +static float __fp16_to_fp32(uint16_t const value) +{ + __pack16_t pack16 = {.u16 = value}; + __pack32_t pack32 = {.u32 = 0}; + + if (pack16.bits.exp == 0 && pack16.bits.frac == 0) + { + return pack16.bits.sign == 0 ? 
.0f : -.0f; + } + + // normalized case + if (pack16.bits.exp != 0xff && pack16.bits.exp != 0) + { + pack32.bits.sign = pack16.bits.sign; + pack32.bits.exp = pack16.bits.exp - 15 + 127; + pack32.bits.frac = pack16.bits.frac << 13; + return pack32.fp32; + } + + // subnormal case + // 5.96046448e-8f = 2**-14 * 1/1024.0 + if (pack16.bits.exp == 0 && pack16.bits.frac != 0) + { + const float alpha = pack16.bits.sign == 0 ? 5.96046448e-8f : -5.96046448e-8f; + return pack16.bits.frac * alpha; + } + + if (pack16.bits.exp == 0x1f && pack16.bits.frac == 0) + { + pack32.bits.sign = pack16.bits.sign; + pack32.bits.exp = 0xff; + pack32.bits.frac = 0; + return pack32.fp32; + } + + if (pack16.bits.exp == 0x1f && pack16.bits.frac != 0) + { + pack32.bits.sign = pack16.bits.sign; + pack32.bits.exp = 0xff; + pack32.bits.frac = 1; + return pack32.fp32; + } + + return pack32.fp32; +} +struct data_buffer +{ + void* data; + size_t size; + int dims[8]; + int dim_num; + int dtype; + float scale; + int32_t zero_point; +}; + +float random_float(float a, float b) +{ + float random = ((float)rand()) / (float)RAND_MAX; + float diff = b - a; + float r = random * diff; + float v = a + r; + // generate denormal as zero + if (v < 0.0001 && v > -0.0001) + v = 0.f; + return v; +} + +int rand_int(const int a, const int b) +{ + const int delta = b - a; + return a + rand() % delta; +} + +struct data_buffer* create_data_buffer_from_tensor(tensor_t tensor) +{ + struct data_buffer* buf = (struct data_buffer*)malloc(sizeof(struct data_buffer)); + buf->size = get_tensor_buffer_size(tensor); + buf->data = malloc(buf->size); + memcpy(buf->data, get_tensor_buffer(tensor), buf->size); + buf->dim_num = get_tensor_shape(tensor, buf->dims, 8); + buf->dtype = get_tensor_data_type(tensor); + get_tensor_quant_param(tensor, &buf->scale, &buf->zero_point, 1); + return buf; +} + +int dtype_to_size(const int dtype) +{ + switch (dtype) + { + case TENGINE_DT_FP32: + return sizeof(float); + case TENGINE_DT_INT8: + return 
sizeof(int8_t); + case TENGINE_DT_UINT8: + return sizeof(uint8_t); + case TENGINE_DT_FP16: + return sizeof(uint16_t); + case TENGINE_DT_INT16: + return sizeof(int16_t); + case TENGINE_DT_INT32: + return sizeof(int32_t); + default: + assert(0 && "Unsupported dtype"); + return -1; + } +} + +static int fill_random_data(void* p, size_t total_size, int dtype) +{ +#define __fill(__dtype) \ + do { \ + __dtype* data = p; \ + const int n = total_size / sizeof(__dtype); \ + for (int i = 0; i < n; ++i) \ + { \ + if (dtype == TENGINE_DT_UINT8) \ + { \ + data[i] = (__dtype)rand_int(0, 30); \ + } \ + else \ + { \ + data[i] = (__dtype)rand_int(-15, 15); \ + } \ + } \ + } while (0); + + if (dtype == TENGINE_DT_FP32) + { + float* data = p; + for (int i = 0; i < total_size / sizeof(float); ++i) + { + data[i] = random_float(-1.2, 1.2); + } + return 0; + } + else if (dtype == TENGINE_DT_FP16) + { + uint16_t* data = p; + for (int i = 0; i < total_size / sizeof(uint16_t); ++i) + { + data[i] = __fp32_to_fp16(random_float(-1.2, 1.2)); + } + return 0; + } + else if (dtype == TENGINE_DT_INT8) + { + __fill(int8_t); + return 0; + } + else if (dtype == TENGINE_DT_UINT8) + { + __fill(uint8_t); + return 0; + } + else if (dtype == TENGINE_DT_INT32) + { + __fill(int32_t); + return 0; + } + + assert(0 && "Unsupported dtype"); + return -1; +} + +struct data_buffer* create_data_buffer(const int* dims, const int dim_num, const int dtype) +{ + const int elem_size = dtype_to_size(dtype); + if (elem_size < 0) return NULL; + + struct data_buffer* buf = (struct data_buffer*)malloc(sizeof(struct data_buffer)); + if (!buf) return NULL; + buf->size = (int)(dim_num > 0); + buf->dim_num = dim_num; + + for (int i = 0; i < dim_num; ++i) + { + buf->size *= dims[i]; + buf->dims[i] = dims[i]; + } + + buf->size *= elem_size; + buf->dtype = dtype; + buf->data = malloc(buf->size); + if (!buf->data) + { + free(buf); + return NULL; + } + + buf->scale = random_float(0.1, 2.0) + 0.01; + buf->zero_point = rand_int(-5, 5); + 
+ int ret = fill_random_data(buf->data, buf->size, buf->dtype); + if (ret != 0) + { + free(buf->data); + free(buf); + return NULL; + } + return buf; +} + +struct data_buffer* create_data_buffer_fp32(const int* dims, const int dim_num) +{ + return create_data_buffer(dims, dim_num, TENGINE_DT_FP32); +} + +void free_data_buffer_in_vector(void* p) +{ + struct data_buffer* buf = *(struct data_buffer**)p; + free(buf->data); + free(buf); +} + +bool is_match_buffer(const struct data_buffer* lhs, const struct data_buffer* rhs, const float eps) +{ + if (lhs->dim_num != rhs->dim_num || lhs->size != rhs->size || lhs->dtype != rhs->dtype) return false; +#define __compare(__dtype) \ + do { \ + const __dtype* p1 = lhs->data; \ + const __dtype* p2 = rhs->data; \ + if (lhs->scale != rhs->scale || lhs->zero_point != rhs->zero_point) return false; \ + for (int i = 0; i < lhs->size / dtype_to_size(lhs->dtype); ++i) \ + { \ + const int a = (int)p1[i]; \ + const int b = (int)p2[i]; \ + if (abs(a - b) != 0) \ + { \ + fprintf(stderr, "buffer mismatch at %d, lhs = %d, rhs = %d, dims1 = {%d, %d, %d, %d}, dims2 = {%d, %d, %d, %d}\n", i, a, b, lhs->dims[0], lhs->dims[1], lhs->dims[2], lhs->dims[3], rhs->dims[0], rhs->dims[1], rhs->dims[2], rhs->dims[3]); \ + return false; \ + } \ + } \ + return true; \ + } while (0) + + for (int i = 0; i < lhs->dim_num; ++i) + { + if (lhs->dims[i] != rhs->dims[i]) return false; + } + + if (lhs->dtype == TENGINE_DT_FP32) + { + const float* p1 = lhs->data; + const float* p2 = rhs->data; + + for (int i = 0; i < lhs->size / sizeof(float); ++i) + { + if (fabs(p1[i] - p2[i]) > eps) + { + fprintf(stderr, "buffer mismatch at %d, lhs = %f, rhs = %f, dims1 = {%d, %d, %d, %d}, dims2 = {%d, %d, %d, %d}\n", i, p1[i], p2[i], lhs->dims[0], lhs->dims[1], lhs->dims[2], lhs->dims[3], rhs->dims[0], rhs->dims[1], rhs->dims[2], rhs->dims[3]); + return false; + } + } + + return true; + } + else if (lhs->dtype == TENGINE_DT_UINT8) + { + __compare(uint8_t); + } + else if (lhs->dtype 
== TENGINE_DT_INT8) + { + __compare(int8_t); + } + else if (lhs->dtype == TENGINE_DT_INT32) + { + __compare(int32_t); + } + else if (lhs->dtype == TENGINE_DT_INT16) + { + __compare(int16_t); + } + else if (lhs->dtype == TENGINE_DT_FP16) + { + const uint16_t* p1 = lhs->data; + const uint16_t* p2 = rhs->data; + + for (int i = 0; i < lhs->size / sizeof(uint16_t); ++i) + { + const uint16_t a = p1[i]; + const uint16_t b = p2[i]; + const float fpa = __fp16_to_fp32(a); + const float fpb = __fp16_to_fp32(b); + + if (fabs(fpa - fpb) > eps) + { + return false; + } + } + + return true; + } +#undef __compare + return false; +} + +typedef int (*node_setup_hook_fn)(graph_t graph, const char* test_node_name, const char* op, const char* input_name, int data_type, int input_num, int output_num); typedef int (*common_test)(graph_t, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w); +#if 0 void dump_tensor_line(void* data_ptr, int offset, int data_type, int w) { if (0 >= w) @@ -48,7 +444,7 @@ void dump_tensor_line(void* data_ptr, int offset, int data_type, int w) } case TENGINE_DT_FP16: { - __fp16* p = (__fp16*)data_ptr; + uint16_t* p = (uint16_t*)data_ptr; #ifdef __ARM_ARCH for (int i = 0; i < w - 1; i++) @@ -213,6 +609,7 @@ void dump_node_output(node_t test_node, int index) release_graph_tensor(tensor); } +#endif int create_node(graph_t graph, const char* node_name, int n, int c, int h, int w, int data_type, int layout) { @@ -252,7 +649,7 @@ int create_node(graph_t graph, const char* node_name, int n, int c, int h, int w return 0; } -int create_input_node(graph_t graph, const char* node_name, int data_type, int layout, int n, int c, int h, int w, int dims_count = 4) +int create_input_node_with_multi_inputs(graph_t graph, const char* node_name, int data_type, int input_num, int layout, int n, int c, int h, int w, int dims_count) { if (0 == n) dims_count = 3; if (0 == c) dims_count = 2; @@ -263,106 +660,110 @@ int
create_input_node(graph_t graph, const char* node_name, int data_type, int l return -1; } - node_t node = create_graph_node(graph, node_name, "InputOp"); + node_t node = create_graph_node(graph, node_name, OP_INPUT_NAME); if (NULL == node) { fprintf(stderr, "Create %d dims node(%s) failed. ", dims_count, node_name); return -1; } - tensor_t tensor = create_graph_tensor(graph, node_name, data_type); - if (NULL == tensor) - { - release_graph_node(node); - - fprintf(stderr, "Create %d dims tensor for node(%s) failed. ", dims_count, node_name); - - return -1; - } - - int ret = set_node_output_tensor(node, 0, tensor, TENSOR_TYPE_INPUT); - if (0 != ret) + for (int i = 0; i < input_num; ++i) { - release_graph_tensor(tensor); - release_graph_node(node); - - fprintf(stderr, "Set %d dims output tensor for node(%s) failed. ", dims_count, node_name); - - return -1; - } + char tensor_name[512]; + snprintf(tensor_name, sizeof(tensor_name), "%s_%d", node_name, i); + tensor_t tensor = create_graph_tensor(graph, tensor_name, data_type); - switch (dims_count) - { - case 1: - { - int dims_array[1] = {w}; - set_tensor_shape(tensor, dims_array, dims_count); - break; - } - case 2: - { - int dims_array[2] = {h, w}; - set_tensor_shape(tensor, dims_array, dims_count); - break; - } - case 3: - { - if (TENGINE_LAYOUT_NCHW == layout) + if (NULL == tensor) { - int dims_array[3] = {c, h, w}; - set_tensor_shape(tensor, dims_array, dims_count); - break; + release_graph_node(node); + fprintf(stderr, "Create %d dims tensor for node(%s) failed. ", dims_count, node_name); + return -1; } - if (TENGINE_LAYOUT_NHWC == layout) + int ret = set_node_output_tensor(node, i, tensor, TENSOR_TYPE_INPUT); + if (0 != ret) { - int dims_array[3] = {h, w, c}; - set_tensor_shape(tensor, dims_array, dims_count); - break; + release_graph_tensor(tensor); + release_graph_node(node); + fprintf(stderr, "Set %d dims output tensor for node(%s) failed. 
", dims_count, node_name); + return -1; } - } - case 4: - { - if (TENGINE_LAYOUT_NCHW == layout) + + switch (dims_count) + { + case 1: { - int dims_array[4] = {n, c, h, w}; + int dims_array[1] = {w}; set_tensor_shape(tensor, dims_array, dims_count); break; } - - if (TENGINE_LAYOUT_NHWC == layout) + case 2: { - int dims_array[4] = {n, h, w, c}; + int dims_array[2] = {h, w}; set_tensor_shape(tensor, dims_array, dims_count); break; } - } - case 5: - { - if (TENGINE_LAYOUT_NCHW == layout) + case 3: { - int dims_array[5] = {1, n, c, h, w}; - set_tensor_shape(tensor, dims_array, dims_count); - break; + if (TENGINE_LAYOUT_NCHW == layout) + { + int dims_array[3] = {c, h, w}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } + + if (TENGINE_LAYOUT_NHWC == layout) + { + int dims_array[3] = {h, w, c}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } } + case 4: + { + if (TENGINE_LAYOUT_NCHW == layout) + { + int dims_array[4] = {n, c, h, w}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } - if (TENGINE_LAYOUT_NHWC == layout) + if (TENGINE_LAYOUT_NHWC == layout) + { + int dims_array[4] = {n, h, w, c}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } + } + case 5: { - int dims_array[5] = {1, n, h, w, c}; - set_tensor_shape(tensor, dims_array, dims_count); - break; + if (TENGINE_LAYOUT_NCHW == layout) + { + int dims_array[5] = {1, n, c, h, w}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } + + if (TENGINE_LAYOUT_NHWC == layout) + { + int dims_array[5] = {1, n, h, w, c}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } + } + default: + fprintf(stderr, "Cannot support %d dims tensor.\n", dims_count); } } - default: - fprintf(stderr, "Cannot support %d dims tensor.\n", dims_count); - } - - release_graph_tensor(tensor); - release_graph_node(node); return 0; } +int create_input_node(graph_t graph, const char* node_name, int data_type, int layout, int n, int c, int h, int w, int 
dims_count) +{ + return create_input_node_with_multi_inputs(graph, node_name, data_type, 1, layout, n, c, h, w, dims_count); +} + int fill_fp32_tensor(tensor_t tensor, float value) { int dims[MAX_SHAPE_DIM_NUM]; @@ -457,6 +858,16 @@ int fill_uint8_tensor(tensor_t tensor, float value) return 0; } +void feed_input_tensor(graph_t graph, int input_node_idx, int input_tensor_idx, const float* values, int* dims, const int dim_num) +{ + tensor_t tensor = get_graph_input_tensor(graph, input_node_idx, input_tensor_idx); + if (!tensor) + { + fprintf(stderr, "Cannot find %dth tensor with node idex %d\n", input_tensor_idx, input_node_idx); + return; + } +} + void fill_input_float_tensor_by_index(graph_t graph, int input_node_index, int tensor_index, float value) { tensor_t tensor = get_graph_input_tensor(graph, input_node_index, tensor_index); @@ -585,9 +996,7 @@ int test_graph_init() { // now init tengine will mask critical filed and return an error // TODO: fix this fatal issue - init_tengine(); - - return 0; + return init_tengine(); } int test_graph_run(graph_t graph) @@ -598,7 +1007,7 @@ int test_graph_run(graph_t graph) return -1; } - dump_graph(graph); + // dump_graph(graph); if (0 != run_graph(graph, 1)) { @@ -613,10 +1022,41 @@ void test_graph_release(graph_t graph) { postrun_graph(graph); destroy_graph(graph); - release_tengine(); } -graph_t create_common_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) +static int craete_common_test_node(graph_t graph, const char* test_node_name, const char* op, const char* input_name, int data_type, int input_num, int output_num) +{ + node_t test_node = create_graph_node(graph, test_node_name, op); + if (NULL == test_node) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + node_t input_node = get_graph_node(graph, input_name); + for (int i = 0; i < get_node_output_number(input_node); ++i) + { + tensor_t input_tensor = 
get_node_output_tensor(input_node, i); + set_node_input_tensor(test_node, i, input_tensor); + } + + char tensor_name[512]; + for (int i = 0; i < output_num; ++i) + { + snprintf(tensor_name, sizeof(tensor_name), "%s_%d", test_node_name, i); + tensor_t output_tensor = create_graph_tensor(graph, tensor_name, data_type); + if (!output_tensor) + { + fprintf(stderr, "create graph output tensor failed.\n"); + return -1; + } + + set_node_output_tensor(test_node, i, output_tensor, TENSOR_TYPE_VAR); + } + return 0; +} + +graph_t create_common_test_graph(const char* op, const char* test_node_name, const void* params, const size_t param_size, vector_t* inputs, int output_num, int data_type, int layout) { graph_t graph = create_graph(NULL, NULL, NULL); if (NULL == graph) @@ -632,29 +1072,80 @@ graph_t create_common_test_graph(const char* test_node_name, int data_type, int } const char* input_name = "input_node"; - if (create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0) + node_t input_node = create_graph_node(graph, input_name, OP_INPUT_NAME); + node_t test_node = create_graph_node(graph, test_node_name, op); + if (!input_node || !test_node) { fprintf(stderr, "create input node failed.\n"); return NULL; } - if (test_func(graph, input_name, test_node_name, data_type, layout, n, c, h, w) < 0) + // setup input tensor + char tensor_name[512]; + float scale = 1.0; + int zero_point = 0.0; + + for (int i = 0; i < get_vector_num(inputs); ++i) { - fprintf(stderr, "create test node failed.\n"); - return NULL; + struct data_buffer* input = *(struct data_buffer**)get_vector_data(inputs, i); + snprintf(tensor_name, sizeof(tensor_name), "%s_%d", input_name, i); + tensor_t tensor = create_graph_tensor(graph, tensor_name, input->dtype); + if (!tensor) return NULL; + + set_tensor_shape(tensor, input->dims, input->dim_num); + set_tensor_buffer(tensor, input->data, input->size); + scale = input->scale; + zero_point = input->zero_point; + 
set_tensor_quant_param(tensor, &scale, &zero_point, 1); + + if (set_node_output_tensor(input_node, i, tensor, TENSOR_TYPE_VAR)) + { + return NULL; + } + + if (set_node_input_tensor(test_node, i, tensor)) + { + return NULL; + } + } + + // setup output tensor + for (int i = 0; i < output_num; ++i) + { + snprintf(tensor_name, sizeof(tensor_name), "%s_%d", test_node_name, i); + tensor_t output_tensor = create_graph_tensor(graph, tensor_name, data_type); + + if (data_type != TENGINE_DT_FP16 && data_type != TENGINE_DT_FP32) + { + set_tensor_quant_param(output_tensor, &scale, &zero_point, 1); + } + + if (set_node_output_tensor(test_node, i, output_tensor, TENSOR_TYPE_VAR)) + { + return NULL; + } + } + + // setup test node param + if (params) + { + struct node* ir_node = (struct node*)test_node; + memcpy(ir_node->op.param_mem, params, param_size); } + // setup test node end. + /* set input/output node */ - const char* inputs[] = {input_name}; - const char* outputs[] = {test_node_name}; + const char* input_nodes[] = {input_name}; + const char* output_nodes[] = {test_node_name}; - if (set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0) + if (set_graph_input_node(graph, input_nodes, sizeof(input_nodes) / sizeof(char*)) < 0) { fprintf(stderr, "set inputs failed.\n"); return NULL; } - if (set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0) + if (set_graph_output_node(graph, output_nodes, sizeof(output_nodes) / sizeof(char*)) < 0) { fprintf(stderr, "set outputs failed.\n"); return NULL; @@ -663,74 +1154,134 @@ graph_t create_common_test_graph(const char* test_node_name, int data_type, int return graph; } -graph_t create_opendla_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) +vector_t* create_and_forward_test_graph(const char* op, const void* params, const size_t param_size, vector_t* inputs, int output_num, int data_type, int layout) { - /* create 
OpenDLA backend */ - context_t odla_context = create_context("odla", 1); - int rtt = set_context_device(odla_context, "OPENDLA", NULL, 0); - if (0 > rtt) + int ret = 0; + vector_t* outputs_ref = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + graph_t graph_ref = create_common_test_graph(op, "test_node", params, param_size, inputs, output_num, data_type, layout); + + if (!outputs_ref) { - fprintf(stderr, " add_context_device VSI DEVICE failed.\n"); - return NULL; + ret = -1; + goto out; } - graph_t graph = create_graph(odla_context, NULL, NULL); - if (NULL == graph) + if (!graph_ref) { - fprintf(stderr, "get graph failed.\n"); - return NULL; + goto failed; } - if (set_graph_layout(graph, layout) < 0) + struct options opt; + opt.num_thread = 1; + opt.cluster = TENGINE_CLUSTER_ALL; + opt.precision = TENGINE_MODE_FP32; + opt.affinity = 255; + + if ((ret = prerun_graph_multithread(graph_ref, opt)) != 0) { - fprintf(stderr, "set layout failed.\n"); - return NULL; + fprintf(stderr, "prerun graph failed: %d\n", ret); + goto failed; } - const char* input_name = "input_node"; - if (create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0) + if ((ret = run_graph(graph_ref, 1)) < 0) { - fprintf(stderr, "create input node failed.\n"); - return NULL; + fprintf(stderr, "run graph failed: %d\n", ret); + goto out; } - if (test_func(graph, input_name, test_node_name, data_type, layout, n, c, h, w) < 0) + for (int i = 0; i < get_graph_output_node_number(graph_ref); ++i) { - fprintf(stderr, "create test node failed.\n"); - return NULL; + node_t output_node = get_graph_output_node(graph_ref, i); + for (int t = 0; t < get_node_output_number(output_node); ++t) + { + tensor_t output_tensor = get_graph_output_tensor(graph_ref, i, t); + struct data_buffer* data = create_data_buffer_from_tensor(output_tensor); + push_vector_data(outputs_ref, &data); + } } - /* set input/output node */ - const char* inputs[] = {input_name}; - const char* 
outputs[] = {test_node_name}; + if ((ret = postrun_graph(graph_ref))) + { + goto failed; + } - if (set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0) + goto out; + +failed: + release_vector(outputs_ref); + outputs_ref = NULL; + +out: + if (graph_ref) { - fprintf(stderr, "set inputs failed.\n"); - return NULL; + destroy_graph(graph_ref); } + return outputs_ref; +} - if (set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0) +//inputs: vector +int create_common_op_test_case(const char* op, const void* params, const size_t param_size, vector_t* inputs, int output_num, int data_type, int layout, const float eps) +{ + int ret = init_tengine(); + if (ret) { - fprintf(stderr, "set outputs failed.\n"); - return NULL; + fprintf(stderr, "init tengine failed: %d\n", ret); + return ret; } - return graph; + setenv("TG_DEBUG_REF", "1", 1); + vector_t* outputs_ref = create_and_forward_test_graph(op, params, param_size, inputs, output_num, data_type, layout); + if (!outputs_ref) + { + return -1; + } + + setenv("TG_DEBUG_REF", "0", 1); + vector_t* outputs = create_and_forward_test_graph(op, params, param_size, inputs, output_num, data_type, layout); + if (!outputs) + { + ret = -1; + goto out; + } + + if (get_vector_num(outputs) != get_vector_num(outputs_ref)) + { + fprintf(stderr, "output num is not equal to ref. 
test = %d, ref = %d\n", get_vector_num(outputs), get_vector_num(outputs_ref)); + ret = -1; goto out; + } + + for (int i = 0; i < get_vector_num(outputs_ref); ++i) + { + struct data_buffer* p1 = *(struct data_buffer**)get_vector_data(outputs_ref, i); + struct data_buffer* p2 = *(struct data_buffer**)get_vector_data(outputs, i); + + if (!is_match_buffer(p1, p2, eps)) + { + fprintf(stderr, "%dth output is mismatch\n", i); + ret = -1; + goto out; + } + } + +out: + if (outputs_ref) release_vector(outputs_ref); + if (outputs) release_vector(outputs); + release_tengine(); + return ret; } -graph_t create_timvx_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) +graph_t create_opendla_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num) { - /* create VeriSilicon TIM-VX backend */ - context_t timvx_context = create_context("timvx", 1); - int rtt = set_context_device(timvx_context, "TIMVX", NULL, 0); + /* create OpenDLA backend */ + context_t odla_context = create_context("odla", 1); + int rtt = set_context_device(odla_context, "OPENDLA", NULL, 0); if (0 > rtt) { fprintf(stderr, " add_context_device VSI DEVICE failed.\n"); return NULL; } - graph_t graph = create_graph(timvx_context, NULL, NULL); + graph_t graph = create_graph(odla_context, NULL, NULL); if (NULL == graph) { fprintf(stderr, "get graph failed.\n"); @@ -775,18 +1326,18 @@ graph_t create_timvx_test_graph(const char* test_node_name, int data_type, int l return graph; } -graph_t create_tensorrt_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) +graph_t create_timvx_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num) { - /* create TensorRT backend */ - context_t trt_context = create_context("tensorrt", 1); 
- int rtt = set_context_device(trt_context, "TensorRT", NULL, 0); + /* create VeriSilicon TIM-VX backend */ + context_t timvx_context = create_context("timvx", 1); + int rtt = set_context_device(timvx_context, "TIMVX", NULL, 0); if (0 > rtt) { fprintf(stderr, " add_context_device VSI DEVICE failed.\n"); return NULL; } - graph_t graph = create_graph(trt_context, NULL, NULL); + graph_t graph = create_graph(timvx_context, NULL, NULL); if (NULL == graph) { fprintf(stderr, "get graph failed.\n"); @@ -831,18 +1382,18 @@ graph_t create_tensorrt_test_graph(const char* test_node_name, int data_type, in return graph; } -graph_t create_torch_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) +graph_t create_tensorrt_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num) { - /* create libTorch backend */ - context_t torch_context = create_context("torch", 1); - int rtt = set_context_device(torch_context, "TORCH", NULL, 0); + /* create TensorRT backend */ + context_t trt_context = create_context("tensorrt", 1); + int rtt = set_context_device(trt_context, "TensorRT", NULL, 0); if (0 > rtt) { fprintf(stderr, " add_context_device VSI DEVICE failed.\n"); return NULL; } - graph_t graph = create_graph(torch_context, NULL, NULL); + graph_t graph = create_graph(trt_context, NULL, NULL); if (NULL == graph) { fprintf(stderr, "get graph failed.\n"); @@ -887,9 +1438,18 @@ graph_t create_torch_test_graph(const char* test_node_name, int data_type, int l return graph; } -graph_t create_cpu_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) +graph_t create_torch_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num) { - graph_t graph = create_graph(NULL, NULL, NULL); + /* create 
libTorch backend */ + context_t torch_context = create_context("torch", 1); + int rtt = set_context_device(torch_context, "TORCH", NULL, 0); + if (0 > rtt) + { + fprintf(stderr, " add_context_device VSI DEVICE failed.\n"); + return NULL; + } + + graph_t graph = create_graph(torch_context, NULL, NULL); if (NULL == graph) { fprintf(stderr, "get graph failed.\n"); @@ -934,105 +1494,6 @@ graph_t create_cpu_test_graph(const char* test_node_name, int data_type, int lay return graph; } -int compare_tensor(tensor_t a, tensor_t b) -{ - int a_dim[MAX_SHAPE_DIM_NUM], b_dim[MAX_SHAPE_DIM_NUM]; - int a_dim_count = get_tensor_shape(a, a_dim, MAX_SHAPE_DIM_NUM); - int b_dim_count = get_tensor_shape(b, b_dim, MAX_SHAPE_DIM_NUM); - - if (a_dim_count <= 0 || a_dim_count != b_dim_count) - return -1; - - for (int i = 0; i < a_dim_count; i++) - if (a_dim[i] != b_dim[i]) - return -1; - - int a_type = get_tensor_data_type(a); - int b_type = get_tensor_data_type(b); - - if (a_type != b_type) - return -1; - - int element_size = 1; - for (int i = 0; i < a_dim_count; i++) - element_size *= a_dim[i]; - - if (element_size <= 0) - { - fprintf(stderr, "One of dims is 0. 
Zero is not allowed.\n"); - return -1; - } - - switch (a_type) - { - case TENGINE_DT_FP32: - { - float* a_data_ptr = (float*)get_tensor_buffer(a); - float* b_data_ptr = (float*)get_tensor_buffer(b); - - for (int i = 0; i < element_size; i++) - if (fabsf(a_data_ptr[i] - b_data_ptr[i]) < TENSOR_FLOAT_EPSILON) - return -1; - - break; - } - case TENGINE_DT_FP16: - { - __fp16* a_data_ptr = (__fp16*)get_tensor_buffer(a); - __fp16* b_data_ptr = (__fp16*)get_tensor_buffer(b); - - for (int i = 0; i < element_size; i++) - { - if (fabsf((float)fp16_to_fp32(a_data_ptr[i]) - (float)fp16_to_fp32(b_data_ptr[i])) < TENSOR_FLOAT_EPSILON) - return -1; - } - - break; - } - case TENGINE_DT_INT32: - { - int32_t* a_data_ptr = (int32_t*)get_tensor_buffer(a); - int32_t* b_data_ptr = (int32_t*)get_tensor_buffer(b); - - for (int i = 0; i < element_size; i++) - if (a_data_ptr[i] != b_data_ptr[i]) - return -1; - - break; - } - case TENGINE_DT_INT16: - { - int16_t* a_data_ptr = (int16_t*)get_tensor_buffer(a); - int16_t* b_data_ptr = (int16_t*)get_tensor_buffer(b); - - for (int i = 0; i < element_size; i++) - if (a_data_ptr[i] != b_data_ptr[i]) - return -1; - - break; - } - case TENGINE_DT_UINT8: - case TENGINE_DT_INT8: - { - int8_t* a_data_ptr = (int8_t*)get_tensor_buffer(a); - int8_t* b_data_ptr = (int8_t*)get_tensor_buffer(b); - - for (int i = 0; i < element_size; i++) - if (a_data_ptr[i] != b_data_ptr[i]) - return -1; - - break; - } - default: - { - fprintf(stderr, "The type of tensor was not supported.\n"); - return -1; - } - } - - return 0; -} - static inline unsigned long get_current_time(void) { struct timespec tm; diff --git a/tests/op/test_op_absval.c b/tests/op/test_op_absval.c new file mode 100644 index 000000000..aa8ab2c66 --- /dev/null +++ b/tests/op/test_op_absval.c @@ -0,0 +1,41 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +#define 
define_test_case(__func, __layout, ...) \ + static int __func() \ + { \ + int data_type = TENGINE_DT_FP32; \ + int layout = __layout; \ + int dims[] = {__VA_ARGS__}; \ + int dims_num = sizeof(dims) / sizeof(dims[0]); \ + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + struct data_buffer* input = create_data_buffer_fp32(dims, sizeof(dims) / sizeof(int)); \ + push_vector_data(inputs, &input); \ + int ret = create_common_op_test_case(OP_ABSVAL_NAME, NULL, 0, inputs, 1, data_type, layout, 0.001); \ + release_vector(inputs); \ + return ret; \ + } + +define_test_case(absval_op_test_case_0, TENGINE_LAYOUT_NCHW, 1, 3, 64, 128); +define_test_case(absval_op_test_case_1, TENGINE_LAYOUT_NCHW, 1, 3, 128, 128); +define_test_case(absval_op_test_case_2, TENGINE_LAYOUT_NCHW, 1, 3, 128, 64); +define_test_case(absval_op_test_case_3, TENGINE_LAYOUT_NCHW, 1, 3, 111, 111); +define_test_case(absval_op_test_case_4, TENGINE_LAYOUT_NCHW, 1, 3, 65, 111); + +#define __NHWC_SUPPORTED__ 0 +#if __NHWC_SUPPORTED__ +#endif + +int main(void) +{ + return absval_op_test_case_0() || absval_op_test_case_1() || absval_op_test_case_2() || absval_op_test_case_3() || absval_op_test_case_4() +#if __NHWC_SUPPORTED__ +#endif + ; +} diff --git a/tests/op/test_op_add_n.c b/tests/op/test_op_add_n.c new file mode 100644 index 000000000..0f4118c02 --- /dev/null +++ b/tests/op/test_op_add_n.c @@ -0,0 +1,47 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +#define define_common_test_case(__op_name, __case_name, __layout, ...) 
\ + static int __case_name() \ + { \ + int data_type = TENGINE_DT_FP32; \ + int layout = __layout; \ + int dims[] = {__VA_ARGS__}; \ + int dims_num = sizeof(dims) / sizeof(dims[0]); \ + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + for (int i = 0; i < 64; ++i) \ + { \ + struct data_buffer* input = create_data_buffer_fp32(dims, sizeof(dims) / sizeof(int)); \ + push_vector_data(inputs, &input); \ + int ret = create_common_op_test_case(__op_name, NULL, 0, inputs, 1, data_type, layout, 0.001); \ + if (ret) return ret; \ + } \ + release_vector(inputs); \ + return 0; \ + } + +#define define_test_case(__case_name, __layout, ...) define_common_test_case(OP_ADD_N_NAME, __case_name, __layout, __VA_ARGS__) + +define_test_case(op_test_case_0, TENGINE_LAYOUT_NCHW, 1, 3, 64, 128); +define_test_case(op_test_case_1, TENGINE_LAYOUT_NCHW, 1, 3, 128, 128); +define_test_case(op_test_case_2, TENGINE_LAYOUT_NCHW, 1, 3, 128, 64); +define_test_case(op_test_case_3, TENGINE_LAYOUT_NCHW, 1, 3, 111, 111); +define_test_case(op_test_case_4, TENGINE_LAYOUT_NCHW, 1, 3, 65, 111); + +#define __NHWC_SUPPORTED__ 0 +#if __NHWC_SUPPORTED__ +#endif + +int main(void) +{ + return op_test_case_0() || op_test_case_1() || op_test_case_2() || op_test_case_3() || op_test_case_4() +#if __NHWC_SUPPORTED__ +#endif + ; +} diff --git a/tests/op/test_op_argmax.c b/tests/op/test_op_argmax.c new file mode 100644 index 000000000..8d6846519 --- /dev/null +++ b/tests/op/test_op_argmax.c @@ -0,0 +1,70 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "operator/prototype/argmax_param.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +#define define_common_test_case(__op_name, __case_name, __layout, __axis, __keepdims, ...) 
\ + static int __case_name() \ + { \ + int layout = __layout; \ + int dims[] = {__VA_ARGS__}; \ + int dims_num = sizeof(dims) / sizeof(dims[0]); \ + argmax_param_t param = {.axis = __axis, .keepdims = __keepdims}; \ + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + struct data_buffer* input = create_data_buffer_fp32(dims, sizeof(dims) / sizeof(int)); \ + push_vector_data(inputs, &input); \ + int ret = create_common_op_test_case(__op_name, ¶m, sizeof(param), inputs, 1, TENGINE_DT_FP32, layout, 0.001); \ + if (ret) \ + { \ + fprintf(stderr, "test argmax op failed: dims = [%d, %d, %d], dtype = fp32\n", dims[0], dims[1], dims[2]); \ + return ret; \ + } \ + release_vector(inputs); \ + inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + input = create_data_buffer(dims, sizeof(dims) / sizeof(int), TENGINE_DT_UINT8); \ + push_vector_data(inputs, &input); \ + ret = create_common_op_test_case(__op_name, ¶m, sizeof(param), inputs, 1, TENGINE_DT_UINT8, layout, 0.001); \ + if (ret) \ + { \ + fprintf(stderr, "test argmax op failed: dims = [%d, %d, %d], dtype = uint8\n", dims[0], dims[1], dims[2]); \ + return ret; \ + } \ + release_vector(inputs); \ + fprintf(stderr, "test case pass, axis=%d, keepdims: %d\n", __axis, __keepdims); \ + return 0; \ + } + +#define define_test_case(__case_name, __layout, ...) 
\ + define_common_test_case(OP_ARGMAX_NAME, __case_name##_00, __layout, 0, 0, __VA_ARGS__); \ + define_common_test_case(OP_ARGMAX_NAME, __case_name##_01, __layout, 1, 0, __VA_ARGS__); \ + define_common_test_case(OP_ARGMAX_NAME, __case_name##_02, __layout, 2, 0, __VA_ARGS__); \ + define_common_test_case(OP_ARGMAX_NAME, __case_name##_10, __layout, 0, 1, __VA_ARGS__); \ + define_common_test_case(OP_ARGMAX_NAME, __case_name##_11, __layout, 1, 1, __VA_ARGS__); \ + define_common_test_case(OP_ARGMAX_NAME, __case_name##_12, __layout, 2, 1, __VA_ARGS__); \ + static int __case_name() \ + { \ + return __case_name##_00() || __case_name##_01() || __case_name##_02() || __case_name##_10() || __case_name##_11() || __case_name##_12(); \ + } + +define_test_case(op_test_case_0, TENGINE_LAYOUT_NCHW, 3, 64, 128); +define_test_case(op_test_case_1, TENGINE_LAYOUT_NCHW, 3, 128, 128); +define_test_case(op_test_case_2, TENGINE_LAYOUT_NCHW, 3, 128, 64); +define_test_case(op_test_case_3, TENGINE_LAYOUT_NCHW, 3, 111, 111); +define_test_case(op_test_case_4, TENGINE_LAYOUT_NCHW, 3, 65, 111); + +#define __NHWC_SUPPORTED__ 0 +#if __NHWC_SUPPORTED__ +#endif + +int main(void) +{ + return op_test_case_0() || op_test_case_1() || op_test_case_2() || op_test_case_3() || op_test_case_4() +#if __NHWC_SUPPORTED__ +#endif + ; +} diff --git a/tests/op/test_op_argmin.c b/tests/op/test_op_argmin.c new file mode 100644 index 000000000..7b2f20bd1 --- /dev/null +++ b/tests/op/test_op_argmin.c @@ -0,0 +1,67 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "operator/prototype/argmax_param.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +#define define_common_test_case(__op_name, __case_name, __layout, __axis, __keepdims, ...) 
\ + static int __case_name() \ + { \ + int layout = __layout; \ + int dims[] = {__VA_ARGS__}; \ + int dims_num = sizeof(dims) / sizeof(dims[0]); \ + argmax_param_t param = {.axis = __axis, .keepdims = __keepdims}; \ + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + struct data_buffer* input = create_data_buffer_fp32(dims, sizeof(dims) / sizeof(int)); \ + push_vector_data(inputs, &input); \ + int ret = create_common_op_test_case(__op_name, ¶m, sizeof(param), inputs, 1, TENGINE_DT_FP32, layout, 0.001); \ + if (ret) \ + { \ + fprintf(stderr, "test argmin op failed: dims = [%d, %d, %d], dtype = fp32\n", dims[0], dims[1], dims[2]); \ + return ret; \ + } \ + release_vector(inputs); \ + inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + input = create_data_buffer(dims, sizeof(dims) / sizeof(int), TENGINE_DT_UINT8); \ + push_vector_data(inputs, &input); \ + ret = create_common_op_test_case(__op_name, ¶m, sizeof(param), inputs, 1, TENGINE_DT_UINT8, layout, 0.001); \ + if (ret) \ + { \ + fprintf(stderr, "test argmin op failed: dims = [%d, %d, %d], dtype = uint8\n", dims[0], dims[1], dims[2]); \ + return ret; \ + } \ + release_vector(inputs); \ + fprintf(stderr, "test case pass, axis=%d, keepdims: %d\n", __axis, __keepdims); \ + return 0; \ + } + +#define define_test_case(__case_name, __layout, ...) 
\ + define_common_test_case(OP_ARGMIN_NAME, __case_name##_00, __layout, 0, 0, __VA_ARGS__); \ + define_common_test_case(OP_ARGMIN_NAME, __case_name##_01, __layout, 1, 0, __VA_ARGS__); \ + define_common_test_case(OP_ARGMIN_NAME, __case_name##_02, __layout, 2, 0, __VA_ARGS__); \ + define_common_test_case(OP_ARGMIN_NAME, __case_name##_10, __layout, 0, 1, __VA_ARGS__); \ + define_common_test_case(OP_ARGMIN_NAME, __case_name##_11, __layout, 1, 1, __VA_ARGS__); \ + define_common_test_case(OP_ARGMIN_NAME, __case_name##_12, __layout, 2, 1, __VA_ARGS__); \ + static int __case_name() \ + { \ + return __case_name##_00() || __case_name##_01() || __case_name##_02() || __case_name##_10() || __case_name##_11() || __case_name##_12(); \ + } + +define_test_case(op_test_case_0, TENGINE_LAYOUT_NCHW, 3, 64, 128); +define_test_case(op_test_case_1, TENGINE_LAYOUT_NCHW, 3, 128, 128); +define_test_case(op_test_case_2, TENGINE_LAYOUT_NCHW, 3, 128, 64); +define_test_case(op_test_case_3, TENGINE_LAYOUT_NCHW, 3, 111, 111); +define_test_case(op_test_case_4, TENGINE_LAYOUT_NCHW, 3, 65, 111); + +#define __NHWC_SUPPORTED__ 0 +#if __NHWC_SUPPORTED__ +#endif + +int main(void) +{ + return op_test_case_0() || op_test_case_1() || op_test_case_2() || op_test_case_3() || op_test_case_4(); +} diff --git a/tests/op/test_op_batchnorm.c b/tests/op/test_op_batchnorm.c new file mode 100644 index 000000000..00361732c --- /dev/null +++ b/tests/op/test_op_batchnorm.c @@ -0,0 +1,98 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "operator/prototype/batchnorm_param.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +static void allocate_bn_inputs(vector_t* inputs, const int* dims, const int dim_num, const int dtype) +{ + struct data_buffer* input = create_data_buffer(dims, dim_num, dtype); + struct data_buffer *mean, *var, *gamma, *beta; + + int dim = dims[1]; + mean = create_data_buffer_fp32(&dim, 1); + var = 
create_data_buffer_fp32(&dim, 1); + gamma = create_data_buffer_fp32(&dim, 1); + beta = create_data_buffer_fp32(&dim, 1); + + push_vector_data(inputs, &input); + push_vector_data(inputs, &gamma); + push_vector_data(inputs, &beta); + push_vector_data(inputs, &mean); + push_vector_data(inputs, &var); +} + +static int __max(const int n, const int m) +{ + return n > m ? n : m; +} + +static void shuffle_array(int* arr, const int n) +{ + for (int i = 0; i < 20 * n; ++i) + { + int a = rand() % n; + int b = rand() % n; + int bak = arr[a]; + arr[a] = arr[b]; + arr[b] = bak; + } +} + +int op_test_case_0() +{ + int dims[4]; + for (int i = 0; i < 10; ++i) + { +#define __run_test_case(__dim_num, __caffe_flavor) \ + do { \ + dims[0] = __max(rand() % 10, 1); \ + dims[1] = __max(rand() % 128, 1); \ + dims[2] = __max(rand() % 128, 1); \ + dims[3] = __max(rand() % 128, 1); \ + shuffle_array(dims, 4); \ + float rescale_factor = random_float(-100.0f, 100.0f); \ + rescale_factor = rand() % 100 > 50 ? rescale_factor : .0; \ + batchnorm_param_t param = {.caffe_flavor = __caffe_flavor, .rescale_factor = rescale_factor, .eps = 0.001}; \ + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + allocate_bn_inputs(inputs, dims, __dim_num, TENGINE_DT_FP32); \ + int ret = create_common_op_test_case(OP_BATCHNORM_NAME, ¶m, sizeof(param), inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); \ + release_vector(inputs); \ + if (ret) \ + { \ + fprintf(stderr, "batchnorm op test failed. dim_num = %d, caffe_flavor = %d, dtype = fp32\n", __dim_num, __caffe_flavor); \ + return ret; \ + } \ + inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + allocate_bn_inputs(inputs, dims, __dim_num, TENGINE_DT_UINT8); \ + ret = create_common_op_test_case(OP_BATCHNORM_NAME, ¶m, sizeof(param), inputs, 1, TENGINE_DT_UINT8, TENGINE_LAYOUT_NCHW, 0.001); \ + release_vector(inputs); \ + if (ret) \ + { \ + fprintf(stderr, "batchnorm op test failed. 
dim_num = %d, caffe_flavor = %d, dtype = uint8\n", __dim_num, __caffe_flavor); \ + return ret; \ + } \ + fprintf(stderr, "batchnorm op test pass: dim_num = %d, caffe_flavor = %d\n", __dim_num, __caffe_flavor); \ + } while (0) + + __run_test_case(2, 0); + __run_test_case(3, 0); + __run_test_case(4, 0); + __run_test_case(2, 1); + __run_test_case(3, 1); + __run_test_case(4, 1); + } + + return 0; +} + +int main(void) +{ + time_t tim = time(NULL); + srand((unsigned int)tim); + return op_test_case_0(); +} diff --git a/tests/op/test_op_batchtospacend.c b/tests/op/test_op_batchtospacend.c new file mode 100644 index 000000000..c3081b81b --- /dev/null +++ b/tests/op/test_op_batchtospacend.c @@ -0,0 +1,72 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "operator/prototype/batchtospacend_param.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +static int __min(const int n, const int m) +{ + return n < m ? n : m; +} + +static void shuffle_array(int* arr, const int n) +{ + for (int i = 0; i < 20 * n; ++i) + { + int a = rand() % n; + int b = rand() % n; + int bak = arr[a]; + arr[a] = arr[b]; + arr[b] = bak; + } +} + +static int op_test_case(const int crop_left, const int crop_right, const int crop_bottom, const int crop_top, const int dilation_x, const int dilation_y) +{ + struct batchtospacend_param params = { + .crop_top = crop_top, + .crop_bottom = crop_bottom, + .crop_left = crop_left, + .crop_right = crop_right, + .dilation_x = dilation_x, + .dilation_y = dilation_y}; + + int dims[4] = {rand_int(1, 256) * params.dilation_x * params.dilation_y, rand_int(1, 16), rand_int(1, 16), rand_int(1, 32)}; + + const int expand = dims[0] / (params.dilation_x * params.dilation_y); + + int h = expand * dims[2]; + int w = expand * dims[3]; + + if (params.crop_right > h) + { + dims[2] = params.crop_right / expand + 1; + } + + if (params.crop_bottom > w) + { + dims[3] = params.crop_bottom / expand + 1; 
+ } + + struct data_buffer* input = create_data_buffer(dims, 4, TENGINE_DT_FP32); + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + + int ret = create_common_op_test_case(OP_BATCHTOSPACEND_NAME, ¶ms, sizeof(params), inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); + if (ret) + { + fprintf(stderr, "test op batchtospacend failed."); + return ret; + } + + return 0; +} + +int main(void) +{ + return op_test_case(0, 0, 0, 0, 1, 1) || op_test_case(1, 2, 1, 2, 1, 2) || op_test_case(1, 1, 1, 1, 2, 2); +} diff --git a/tests/op/test_op_bias.c b/tests/op/test_op_bias.c new file mode 100644 index 000000000..ff90e0ad6 --- /dev/null +++ b/tests/op/test_op_bias.c @@ -0,0 +1,39 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +#define define_common_test_case(__op_name, __case_name, __layout, ...) \ + static int __case_name() \ + { \ + int data_type = TENGINE_DT_FP32; \ + int layout = __layout; \ + int dims[] = {__VA_ARGS__}; \ + int dims_num = sizeof(dims) / sizeof(dims[0]); \ + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + struct data_buffer* input = create_data_buffer(dims, dims_num, data_type); \ + push_vector_data(inputs, &input); \ + struct data_buffer* bias = create_data_buffer(&dims[1], 1, data_type); \ + push_vector_data(inputs, &bias); \ + int ret = create_common_op_test_case(__op_name, NULL, 0, inputs, 1, data_type, layout, 0.001); \ + if (ret) { fprintf(stderr, "test op %s failed: ret = %d, dims = {%d, %d, %d, %d}\n", __op_name, ret, dims[0], dims[1], dims[2], dims[3]); } \ + release_vector(inputs); \ + return 0; \ + } + +#define define_test_case(__case_name, __layout, ...) 
define_common_test_case(OP_BIAS_NAME, __case_name, __layout, __VA_ARGS__) + +define_test_case(op_test_case_0, TENGINE_LAYOUT_NCHW, 1, 3, 64, 128); +define_test_case(op_test_case_1, TENGINE_LAYOUT_NCHW, 1, 3, 128, 128); +define_test_case(op_test_case_2, TENGINE_LAYOUT_NCHW, 1, 3, 128, 64); +define_test_case(op_test_case_3, TENGINE_LAYOUT_NCHW, 1, 3, 111, 111); +define_test_case(op_test_case_4, TENGINE_LAYOUT_NCHW, 1, 3, 65, 111); + +int main(void) +{ + return op_test_case_0() || op_test_case_1() || op_test_case_2() || op_test_case_3() || op_test_case_4(); +} diff --git a/tests/op/test_op_broadmul.c b/tests/op/test_op_broadmul.c new file mode 100644 index 000000000..3aa9b5014 --- /dev/null +++ b/tests/op/test_op_broadmul.c @@ -0,0 +1,53 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include +#include "util/vector.h" + +static int test_op_case() +{ + // broadmul 只支持一个维度的广播,例如[2, 2, 3] * [2, 2, 1]是支持的, 但是[2, 2, 3] * [2, 1, 1]不支持 + // broadmul 只支持input1向input0广播,例如[2, 2, 3] * [2, 2, 1]是支持的 但是[2, 2, 1] * [2, 2, 3]是不支持的, 当然 [2, 1, 2] * [1, 2, 1]也是不支持的 + // broadmul 要求input0 input1最后一维必须相等 + for (int loop = 0; loop < 10; ++loop) + { + int dims1[4] = {rand_int(10, 64), rand_int(10, 64), rand_int(10, 64), rand_int(10, 64)}; + + int i = rand() % 3; + int dims2[4] = {0}; + + memcpy(dims2, dims1, sizeof(dims1)); + dims2[i] = 1; + + struct data_buffer* input1 = create_data_buffer(dims1, 4, TENGINE_DT_FP32); + struct data_buffer* input2 = create_data_buffer(dims2, 4, TENGINE_DT_FP32); + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + + push_vector_data(inputs, &input1); + push_vector_data(inputs, &input2); + + int ret = create_common_op_test_case(OP_BROADMUL_NAME, NULL, 0, inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); + if (ret) + { + fprintf(stderr, "test op %s failed. 
ret = %d, dims1 = {%d, %d, %d, %d}, dims2 = {%d, %d, %d, %d}\n", OP_BROADMUL_NAME, ret, dims1[0], dims1[1], dims1[2], dims1[3], dims2[0], dims2[1], dims2[2], dims2[3]); + return ret; + } + else + { + fprintf(stderr, "test op %s pass. ret = %d, dims1 = {%d, %d, %d, %d}, dims2 = {%d, %d, %d, %d}\n", OP_BROADMUL_NAME, ret, dims1[0], dims1[1], dims1[2], dims1[3], dims2[0], dims2[1], dims2[2], dims2[3]); + } + + release_vector(inputs); + } +} + +int main(void) +{ + time_t tim = time(NULL); + srand((unsigned int)tim); + return test_op_case(); +} diff --git a/tests/op/test_op_cast.c b/tests/op/test_op_cast.c new file mode 100644 index 000000000..43cb48490 --- /dev/null +++ b/tests/op/test_op_cast.c @@ -0,0 +1,41 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "operator/prototype/cast_param.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +static int test_cast_op(const int from, const int to) +{ + int dims[4] = {rand_int(10, 64), rand_int(10, 64), rand_int(10, 64), rand_int(10, 64)}; + struct data_buffer* input = create_data_buffer(dims, 4, from); + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + + struct cast_param params = {.type_from = from, .type_to = to}; + + int ret = create_common_op_test_case(OP_CAST_NAME, ¶ms, sizeof(params), inputs, 1, to, TENGINE_LAYOUT_NCHW, 0.001); + if (ret) + { + fprintf(stderr, "test op %s failed. 
ret = %d, dims1 = {%d, %d, %d, %d}, from type = %d, to type = %d\n", OP_CAST_NAME, ret, dims[0], dims[1], dims[2], dims[3], from, to); + return ret; + } + + release_vector(inputs); + return 0; +} + +int main(void) +{ + time_t tim = time(NULL); + srand((unsigned int)tim); + return test_cast_op(TENGINE_DT_FP32, TENGINE_DT_FP16) + || test_cast_op(TENGINE_DT_FP16, TENGINE_DT_FP32) + || test_cast_op(TENGINE_DT_FP32, TENGINE_DT_UINT8) + || test_cast_op(TENGINE_DT_UINT8, TENGINE_DT_FP32) + || test_cast_op(TENGINE_DT_FP32, TENGINE_DT_FP32) + || test_cast_op(TENGINE_DT_UINT8, TENGINE_DT_UINT8); +} diff --git a/tests/op/test_op_ceil.c b/tests/op/test_op_ceil.c new file mode 100644 index 000000000..c24849732 --- /dev/null +++ b/tests/op/test_op_ceil.c @@ -0,0 +1,44 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +static int test_ceil_op() +{ + for (int i = 0; i < 10; ++i) + { + int dims[4] = {rand_int(10, 64), rand_int(10, 64), rand_int(10, 64), rand_int(10, 64)}; + struct data_buffer* input = create_data_buffer(dims, 4, TENGINE_DT_FP32); + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + + int ret = create_common_op_test_case(OP_CEIL_NAME, NULL, 0, inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); + if (ret) + { + return ret; + } + + release_vector(inputs); + input = create_data_buffer(dims, 4, TENGINE_DT_UINT8); + inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + + ret = create_common_op_test_case(OP_CEIL_NAME, NULL, 0, inputs, 1, TENGINE_DT_UINT8, TENGINE_LAYOUT_NCHW, 0.001); + + if (ret) { return ret; } + + release_vector(inputs); + } + return 0; +} + +int main(void) +{ + time_t tim = time(NULL); + srand((unsigned int)tim); + return test_ceil_op(); +} diff --git a/tests/op/test_op_clip.c 
b/tests/op/test_op_clip.c new file mode 100644 index 000000000..9108bd7e9 --- /dev/null +++ b/tests/op/test_op_clip.c @@ -0,0 +1,57 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include "operator/prototype/clip_param.h" +#include +#include +#include "util/vector.h" + +static int test_ceil_op() +{ + for (int i = 0; i < 10; ++i) + { + struct clip_param params = {.min = random_float(-1.0, 0.0), .max = random_float(0.0, 1.0)}; + int dims[4] = {rand_int(10, 64), rand_int(10, 64), rand_int(10, 64), rand_int(10, 64)}; + struct data_buffer* input = create_data_buffer(dims, 4, TENGINE_DT_FP32); + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + + int ret = create_common_op_test_case(OP_CLIP_NAME, ¶ms, sizeof(params), inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); + if (ret) + { + return ret; + } + + release_vector(inputs); + + input = create_data_buffer(dims, 4, TENGINE_DT_UINT8); + inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + + ret = create_common_op_test_case(OP_CLIP_NAME, ¶ms, sizeof(params), inputs, 1, TENGINE_DT_UINT8, TENGINE_LAYOUT_NCHW, 0.001); + + if (ret) { return ret; } + + release_vector(inputs); + + input = create_data_buffer(dims, 4, TENGINE_DT_INT8); + inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + + ret = create_common_op_test_case(OP_CLIP_NAME, ¶ms, sizeof(params), inputs, 1, TENGINE_DT_INT8, TENGINE_LAYOUT_NCHW, 0.001); + + if (ret) { return ret; } + + release_vector(inputs); + } + return 0; +} + +int main(void) +{ + time_t tim = time(NULL); + srand((unsigned int)tim); + return test_ceil_op(); +} diff --git a/tests/op/test_op_comparison.c b/tests/op/test_op_comparison.c new file mode 100644 index 000000000..2e5efc81d --- /dev/null +++ 
b/tests/op/test_op_comparison.c @@ -0,0 +1,99 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include +#include "util/vector.h" +#include "operator/prototype/comparison_param.h" + +static int get_total_size(const int* dims, const int n) +{ + int s = 1; + for (int i = 0; i < n; ++i) + { + s *= dims[i]; + } + return s; +} + +static void random_mask(float* data, const int size) +{ + int n = (int)(0.5f * size); + for (int i = 0; i < n; ++i) + { + int k = rand() % n; + data[k] = random_float(-1.2f, 1.2f); + } +} + +static int do_comparison_test(const int* dims1, const int* dims2, const int n1, const int n2) +{ + for (int i = 0; i <= 5; ++i) + { + struct comparison_param params = {.type = i}; + + struct data_buffer* input = create_data_buffer(dims1, n1, TENGINE_DT_FP32); + struct data_buffer* input1 = create_data_buffer(dims2, n2, TENGINE_DT_FP32); + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + push_vector_data(inputs, &input1); + + int ret = create_common_op_test_case(OP_COMPARISON_NAME, ¶ms, sizeof(params), inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); + if (ret) + { + fprintf(stderr, "test comparison op failed: %d, type = %d, dims1 = {%d, %d, %d, %d}, dims2 = {%d, %d, %d, %d}\n", ret, i, dims1[0], dims1[1], dims1[2], dims1[3], dims2[0], dims2[1], dims2[2], dims2[3]); + release_vector(inputs); + return ret; + } + + const int total_size1 = get_total_size(dims1, n1); + const int total_size2 = get_total_size(dims2, n2); + if (total_size1 > total_size2) + { + random_mask(input->data, total_size1); + } + else + { + random_mask(input1->data, total_size2); + } + + ret = create_common_op_test_case(OP_COMPARISON_NAME, ¶ms, sizeof(params), inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); + release_vector(inputs); + if (ret) + { + fprintf(stderr, "test comparison op after masked 
failed: %d, type = %d, dims1 = {%d, %d, %d, %d}, dims2 = {%d, %d, %d, %d}\n", ret, i, dims1[0], dims1[1], dims1[2], dims1[3], dims2[0], dims2[1], dims2[2], dims2[3]); + return ret; + } + } + + fprintf(stderr, "test comparison op pass\n"); + return 0; +} + +static int test_comparison_op() +{ + int dims1[] = {rand_int(2, 10), rand_int(10, 32), rand_int(10, 32), rand_int(10, 32)}; + int dims2[4] = {0}; + + memcpy(dims2, dims1, sizeof(dims1)); + int ret = do_comparison_test(dims1, dims2, 4, 4); + if (ret) { return ret; } + + dims2[0] = 1; + ret = do_comparison_test(dims1, dims2, 4, 1) || do_comparison_test(dims2, dims1, 1, 4); + if (ret) return ret; + + dims2[0] = dims1[1]; + + return do_comparison_test(dims1, dims2, 4, 1) || do_comparison_test(dims2, dims1, 1, 4); +} + +int main(void) +{ + time_t tim = time(NULL); + srand((unsigned int)tim); + return test_comparison_op(); +} diff --git a/tests/op/test_op_conv.c b/tests/op/test_op_conv.c new file mode 100644 index 000000000..fde13887a --- /dev/null +++ b/tests/op/test_op_conv.c @@ -0,0 +1,80 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include +#include "util/vector.h" +#include "operator/prototype/convolution_param.h" + +static int max(int lhs, int rhs) +{ + return lhs > rhs ? 
lhs : rhs; +} + +static int test_conv_op_case(int kernel_h, int kernel_w, int pad_h, int pad_w, int stride_h, int stride_w, int dilation_h, int dilation_w) +{ + const int real_h = (kernel_h - 1) * dilation_h + stride_h + 1; + const int real_w = (kernel_w - 1) * dilation_w + stride_w + 1; + + const int max_h = max(real_h + 1, 32); + const int max_w = max(real_w + 1, 32); + + for (int i = 0; i < 10; ++i) + { + int dims[4] = {rand_int(2, 8), rand_int(2, 12), rand_int(real_h, max_h), rand_int(real_w, max_w)}; + int kernel_shape[] = {rand_int(2, 32), dims[1], kernel_h, kernel_w}; + + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + + struct data_buffer* input = create_data_buffer(dims, 4, TENGINE_DT_FP32); + struct data_buffer* filter = create_data_buffer(kernel_shape, 4, TENGINE_DT_FP32); + push_vector_data(inputs, &input); + push_vector_data(inputs, &filter); + + struct conv_param params = {.kernel_h = kernel_shape[2], .kernel_w = kernel_shape[3], .stride_h = stride_h, .stride_w = stride_w, .pad_h0 = pad_h, .pad_h1 = pad_h, .pad_w0 = pad_w, .pad_w1 = pad_w, .dilation_h = dilation_h, .dilation_w = dilation_w, .input_channel = kernel_shape[1], .output_channel = kernel_shape[0], .group = 1, .activation = -1, .wino_off = 1}; + + int ret = create_common_op_test_case(OP_CONV_NAME, ¶ms, sizeof(params), inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); + release_vector(inputs); + + if (ret) + { + fprintf(stderr, "test conv op failed: %d, kernel_h = %d, kernel_w = %d, pad_h = %d, pad_w = %d, stride_h = %d, stride_w = %d, dilation_h = %d, dilation_w = %d, input dims = {%d, %d, %d, %d}, kernel dims = {%d, %d, %d, %d}\n", ret, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, dims[0], dims[1], dims[2], dims[3], kernel_shape[0], kernel_shape[1], kernel_shape[2], kernel_shape[3]); + return ret; + } + } + + fprintf(stderr, "test conv op pass, kernel_h = %d, kernel_w = %d, pad_h = %d, pad_w = %d, stride_h 
= %d, stride_w = %d, dilation_h = %d, dilation_w = %d\n", kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w); + return 0; +} + +#define __define_test_conv_op(kh, kw) \ + static int test_conv_op_##kh##x##kw() \ + { \ + return test_conv_op_case(kh, kw, 0, 0, 1, 1, 1, 1) \ + || test_conv_op_case(kh, kw, 1, 1, 1, 1, 1, 1) \ + || test_conv_op_case(kh, kw, 2, 2, 1, 1, 1, 1) \ + || test_conv_op_case(kh, kw, 3, 3, 1, 1, 1, 1) \ + || test_conv_op_case(kh, kw, 3, 1, 1, 1, 1, 1) \ + || test_conv_op_case(kh, kw, 1, 3, 1, 1, 1, 1) \ + || test_conv_op_case(kh, kw, 1, 3, 2, 2, 1, 1) \ + || test_conv_op_case(kh, kw, 1, 3, 3, 3, 1, 1) \ + || test_conv_op_case(kh, kw, 1, 3, 3, 1, 1, 1) \ + || test_conv_op_case(kh, kw, 1, 3, 1, 3, 1, 1) \ + || test_conv_op_case(kh, kw, 1, 3, 1, 3, 2, 2) \ + || test_conv_op_case(kh, kw, 1, 3, 1, 3, 3, 3) \ + || test_conv_op_case(kh, kw, 1, 3, 1, 3, 1, 3) \ + || test_conv_op_case(kh, kw, 1, 3, 1, 3, 3, 1); \ + } + +__define_test_conv_op(3, 3); +__define_test_conv_op(1, 1); + +int main(void) +{ + time_t tim = time(NULL); + srand((unsigned int)tim); + return test_conv_op_1x1() || test_conv_op_3x3(); +} diff --git a/tests/op/test_op_prelu.c b/tests/op/test_op_prelu.c deleted file mode 100644 index dd31e4b1e..000000000 --- a/tests/op/test_op_prelu.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - -#include "test_op.h" - -int create_test_prelu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; - (void)n; - (void)c; - (void)h; - (void)w; - - /* create the test node */ - node_t test_node = create_graph_node(graph, node_name, "PReLU"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if (NULL == input_tensor) - { - fprintf(stderr, "create test node failed. ERRNO: %d.\n", get_tengine_errno()); - return -1; - } - - /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. 
*/ - node_t slope_node = create_graph_node(graph, "slope", "Const"); - tensor_t slope_tensor = create_graph_tensor(graph, "slope", TENGINE_DT_FP32); - set_node_output_tensor(slope_node, 0, slope_tensor, TENSOR_TYPE_CONST); - - int dims[4]; - get_tensor_shape(input_tensor, dims, 4); - int slope_dims[1] = {dims[1]}; // channel num - set_tensor_shape(slope_tensor, slope_dims, 1); - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - set_node_input_tensor(test_node, 1, slope_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - return 0; -} - -float slope_value[3] = {0.1f, 0.2f, 0.3f}; -float result_value[3] = {-1.f, -2.f, -3.f}; - -int main(int argc, char* argv[]) -{ - int n = 1, c = 3, h = 6, w = 6; - const char* test_node_name = "prelu"; - int data_type = TENGINE_DT_FP32; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed. ERRNO: %d.", get_tengine_errno()); - - // create - graph_t graph = create_common_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_prelu_node); - if (NULL == graph) - return -1; - - // set input data - fill_input_float_tensor_by_index(graph, 0, 0, -10.0f); - - // set slope data - fill_input_float_buffer_tensor_by_name(graph, test_node_name, 1, (void*)slope_value, 3 * sizeof(float)); - - // graph run - ret = test_graph_run(graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. 
ERRNO: %d.\n", ret); - test_graph_release(graph); - return -1; - } - - // check the result - struct tensor* output_tensor = get_graph_output_tensor(graph, 0, 0); - int out_c = output_tensor->dims[1]; - int cstep = output_tensor->dims[2] * output_tensor->dims[3]; - - ret = 0; - for (int i = 0; i < out_c; i++) - { - float* output_data = (float*)output_tensor->data + i * cstep; - for (int j = 0; j < cstep; j++) - { - if (output_data[j] != result_value[i]) - { - fprintf(stderr, "Check result failed, current %f, expect %f\n", output_data[j], result_value[i]); - ret = -1; - break; - } - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(graph); - - return ret; -} diff --git a/tests/op/test_op_relu.c b/tests/op/test_op_relu.c deleted file mode 100644 index 730ab3260..000000000 --- a/tests/op/test_op_relu.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - -#include "test_op.h" - -int create_test_relu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; - (void)n; - (void)c; - (void)h; - (void)w; - - /* create the test node */ - node_t test_node = create_graph_node(graph, node_name, "ReLU"); - if (NULL == test_node) - { - fprintf(stderr, "create test node failed. ERRNO: %d.\n", get_tengine_errno()); - return -1; - } - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - if (NULL == input_tensor) - { - fprintf(stderr, "get graph input tensor failed. ERRNO: %d.\n", get_tengine_errno()); - return -1; - } - - /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */ - // None - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - if (NULL == output_tensor) - { - fprintf(stderr, "create graph output tensor failed. ERRNO: %d.\n", get_tengine_errno()); - return -1; - } - - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set the attr of test node */ - // None - - return 0; -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 3, h = 12, w = 12; - const char* test_node_name = "relu"; - int data_type = TENGINE_DT_FP32; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Engine init failed. 
ERRNO: %d.", get_tengine_errno()); - - // create - graph_t graph = create_common_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_relu_node); - if (NULL == graph) - return -1; - - // set input data - fill_input_float_tensor_by_index(graph, 0, 0, -10.0f); - - // graph run - ret = test_graph_run(graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(graph); - return -1; - } - - // dump input node - int input_node_count = get_graph_input_node_number(graph); - for (int i = 0; i < input_node_count; i++) - { - node_t input = get_graph_input_node(graph, i); - dump_node_output(input, 0); - } - - // dump output node - int output_node_count = get_graph_output_node_number(graph); - for (int i = 0; i < output_node_count; i++) - { - node_t output = get_graph_output_node(graph, i); - dump_node_output(output, 0); - } - - // exit - test_graph_release(graph); - - return 0; -} diff --git a/tests/op/test_op_relu6.c b/tests/op/test_op_relu6.c deleted file mode 100644 index 9315c6477..000000000 --- a/tests/op/test_op_relu6.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - -#include "test_op.h" - -int create_test_relu6_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; - (void)n; - (void)c; - (void)h; - (void)w; - - /* create the test node */ - node_t test_node = create_graph_node(graph, node_name, "ReLU6"); - if (NULL == test_node) - { - fprintf(stderr, "create test node failed. ERRNO: %d.\n", get_tengine_errno()); - return -1; - } - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - if (NULL == input_tensor) - { - fprintf(stderr, "get graph input tensor failed. ERRNO: %d.\n", get_tengine_errno()); - return -1; - } - - /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */ - // None - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - if (NULL == output_tensor) - { - fprintf(stderr, "create graph output tensor failed. ERRNO: %d.\n", get_tengine_errno()); - return -1; - } - - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set the attr of test node */ - // None - - return 0; -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 3, h = 12, w = 12; - const char* test_node_name = "relu6"; - int data_type = TENGINE_DT_FP32; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Engine init failed. 
ERRNO: %d.", get_tengine_errno()); - - // create - graph_t graph = create_common_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_relu6_node); - if (NULL == graph) - return -1; - - // set input data - fill_input_float_tensor_by_index(graph, 0, 0, -10.0f); - - // graph run - ret = test_graph_run(graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(graph); - return -1; - } - - // dump input node - int input_node_count = get_graph_input_node_number(graph); - for (int i = 0; i < input_node_count; i++) - { - node_t input = get_graph_input_node(graph, i); - dump_node_output(input, 0); - } - - // dump output node - int output_node_count = get_graph_output_node_number(graph); - for (int i = 0; i < output_node_count; i++) - { - node_t output = get_graph_output_node(graph, i); - dump_node_output(output, 0); - } - - // exit - test_graph_release(graph); - - return 0; -} diff --git a/tests/test_rv64_models.sh b/tests/test_rv64_models.sh new file mode 100755 index 000000000..6b3e926ef --- /dev/null +++ b/tests/test_rv64_models.sh @@ -0,0 +1,42 @@ +#!/bin/bash - + +if [ ! "${QEMU_CMD}" ]; then + echo '$QEMU_CMD is required.' 
+ exit -1 +fi + +test_models=( +"${QEMU_CMD} ./tests/test_model_classification -m squeezenet -i images/cat.jpg -g 227,227 -w 104.007,116.669,122.679 -s 1,1,1" +"${QEMU_CMD} ./tests/test_model_classification -m mobilenet -i images/cat.jpg -g 224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017" +"${QEMU_CMD} ./tests/test_model_classification -m mobilenet_v2 -i images/cat.jpg -g 224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017" +"${QEMU_CMD} ./tests/test_model_classification -m googlenet -i images/cat.jpg -g 224,224 -w 104.007,116.669,122.679 -s 1,1,1" +"${QEMU_CMD} ./tests/test_model_classification -m inception_v3 -i images/cat.jpg -g 395,395 -w 104.007,116.669,122.679 -s 0.0078,0.0078,0.0078" +"${QEMU_CMD} ./tests/test_model_classification -m inception_v4 -i images/cat.jpg -g 299,299 -w 104.007,116.669,122.679 -s 0.007843,0.007843,0.007843" +"${QEMU_CMD} ./tests/test_model_classification -m resnet50 -i images/bike.jpg -g 224,224 -w 104.007,116.669,122.679 -s 1,1,1" +"${QEMU_CMD} ./tests/test_model_classification -m mnasnet -i images/cat.jpg -g 224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017" +"${QEMU_CMD} ./tests/test_model_classification -m shufflenet_1xg3 -i images/cat.jpg -g 224,224 -w 103.940,116.780,123.680 -s 0.017,0.017,0.017" +"${QEMU_CMD} ./tests/test_model_classification -m shufflenet_v2 -i images/cat.jpg -g 224,224 -w 103.940,116.780,123.680 -s 0.00392156,0.00392156,0.00392156" +"${QEMU_CMD} ./tests/test_model_hrnet" +"${QEMU_CMD} ./tests/test_model_mobilefacenet" +"${QEMU_CMD} ./tests/test_model_mobilenet_ssd" +"${QEMU_CMD} ./tests/test_model_nanodet_m" +"${QEMU_CMD} ./tests/test_model_retinaface" +"${QEMU_CMD} ./tests/test_model_ultraface" +"${QEMU_CMD} ./tests/test_model_yolofastest" +"${QEMU_CMD} ./tests/test_model_yolov3" +"${QEMU_CMD} ./tests/test_model_yolov3_tiny" +"${QEMU_CMD} ./tests/test_model_yolov4" +"${QEMU_CMD} ./tests/test_model_yolov4_tiny" +"${QEMU_CMD} ./tests/test_model_yolov5s" +) + +for (( i = 0 ; i < 
${#test_models[@]} ; i++ )) +do + echo ${test_models[$i]} + echo ${test_models[$i]} | xargs -i sh -c "{}" + + if [ "$?" != 0 ]; then + echo "failed" + exit 1 + fi +done diff --git a/tests/test_rv64_ops.sh b/tests/test_rv64_ops.sh new file mode 100755 index 000000000..627161a48 --- /dev/null +++ b/tests/test_rv64_ops.sh @@ -0,0 +1,33 @@ +#!/bin/bash - + +if [ ! "${QEMU_CMD}" ]; then + echo '$QEMU_CMD is required.' + exit -1 +fi + +test_models=( +"${QEMU_CMD} ./tests/test_op_absval" +"${QEMU_CMD} ./tests/test_op_add_n" +"${QEMU_CMD} ./tests/test_op_argmax" +"${QEMU_CMD} ./tests/test_op_argmin" +"${QEMU_CMD} ./tests/test_op_batchnorm" +"${QEMU_CMD} ./tests/test_op_batchtospacend" +# "${QEMU_CMD} ./tests/test_op_broadmul" +"${QEMU_CMD} ./tests/test_op_bias" +"${QEMU_CMD} ./tests/test_op_cast" +"${QEMU_CMD} ./tests/test_op_ceil" +"${QEMU_CMD} ./tests/test_op_clip" +"${QEMU_CMD} ./tests/test_op_comparison" +"${QEMU_CMD} ./tests/test_op_conv" +) + +for (( i = 0 ; i < ${#test_models[@]} ; i++ )) +do + echo ${test_models[$i]} + echo ${test_models[$i]} | xargs -i sh -c "{}" + + if [ "$?" 
!= 0 ]; then + echo "failed" + exit 1 + fi +done diff --git a/toolchains/rv64-c906.toolchain.cmake b/toolchains/rv64-c906.toolchain.cmake index e8268106d..ec28012b0 100644 --- a/toolchains/rv64-c906.toolchain.cmake +++ b/toolchains/rv64-c906.toolchain.cmake @@ -12,7 +12,16 @@ SET (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) SET (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) # other needed options -SET (TENGINE_TOOLCHAIN_ASM_FLAG -march=rv64gcvxthead -mabi=lp64d -mtune=c906 -mfp16 -lc) +IF (CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "release" OR CMAKE_BUILD_TYPE STREQUAL "RELEASE") + SET (TENGINE_TOOLCHAIN_ASM_FLAG -march=rv64gcv -mabi=lp64d -mtune=thead-c906 -lc) +ELSE() + SET (TENGINE_TOOLCHAIN_ASM_FLAG -march=rv64gcv -mabi=lp64d -g -O0 -lc) +ENDIF() + +IF (TENGINE_RV64_RVV_C906) + SET(TENGINE_TOOLCHAIN_ASM_FLAG "-D__FIX_RVV_C906 ${TENGINE_TOOLCHAIN_ASM_FLAG}") +ENDIF() + #SET (TENGINE_TOOLCHAIN_FLAG -march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c906 -mfp16) #SET (TENGINE_TOOLCHAIN_FLAG -march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c910 -mfp16)