diff --git a/.drone.yml b/.drone.yml new file mode 100644 index 000000000..0b8f744db --- /dev/null +++ b/.drone.yml @@ -0,0 +1,81 @@ +--- +kind: pipeline +name: TengineRV64 +platform: + os: linux + arch: amd64 + +steps: + - name: build + image: ubuntu20.04:qemu + commands: + - PATH=$PATH:/home/riscv/bin cmake -DCMAKE_TOOLCHAIN_FILE=toolchains/rv64-c906.toolchain.cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_BUILD_TYPE=RELEASE -DTENGINE_BUILD_TESTS=ON -DTENGINE_COVERAGE=ON -B build + - PATH=$PATH:/home/riscv/bin cmake --build build -- -j`cat /proc/cpuinfo | grep 'processor' | wc -l` VERBOSE=1 + - name: test ops + image: ubuntu20.04:qemu + commands: + - cd build + - export QEMU_CMD='qemu-riscv64 -cpu rv64,v=true -E TG_DEBUG_TIME=1 -L /home/riscv/sysroot' + - ../tests/test_rv64_ops.sh + - name: test models + image: ubuntu20.04:qemu + environment: + DATA_SERVER_URL: + from_secret: DATA_SERVER_URL + commands: + - cd build + - wget -nv $${DATA_SERVER_URL}/tengine_model_zoo/ci_data/models.tar.gz + - wget -nv $${DATA_SERVER_URL}/tengine_model_zoo/ci_data/images.tar.gz + - wget -nv $${DATA_SERVER_URL}/tengine_model_zoo/ci_data/data_x86.tar.gz + - mkdir models images data + - tar zxvf models.tar.gz -C models + - tar zxvf images.tar.gz -C images + - tar zxvf data_x86.tar.gz -C data + - export QEMU_CMD='qemu-riscv64 -cpu rv64,v=true -E TG_DEBUG_TIME=1 -L /home/riscv/sysroot' + - ../tests/test_rv64_models.sh + when: + branch: + - master + - name: code coverage + image: ubuntu20.04:qemu + commands: + - cd build + - apt update && apt install lcov -y + - lcov --gcov-tool /home/riscv/bin/riscv64-unknown-linux-gnu-gcov --capture --directory . 
--output-file $${DRONE_REPO_NAME}.info + - genhtml --branch-coverage -o ../codecov $${DRONE_REPO_NAME}.info + - name: scp files + image: appleboy/drone-scp + settings: + host: conleylee.com + username: + from_secret: download_host_user + password: + from_secret: download_host_passwd + port: 38000 + target: /home/lee/codecov/${DRONE_REPO_NAME}/${DRONE_BUILD_NUMBER}/${DRONE_COMMIT_SHA} + strip_components: 1 + source: codecov/* + - name: upload_to_codecov + image: robertstettner/drone-codecov:latest + settings: + token: + from_secret: CODECOV_TOKEN + files: + - build/${DRONE_REPO_NAME}.info + flags: + - model_test + - name: notify + image: ubuntu20.04:drone_script + environment: + MATTERMOST_TOKEN: + from_secret: MATTERMOST_TOKEN + GITEA_API_TOKEN: + from_secret: gitea_api_token + commands: + - 'export DRONE_SCRIPT_DOWNLOAD_LINK=https://download.conleylee.com/scripts/drone_bot.py' + - 'export DRONE_CODECOV_LINK=https://codecov.conleylee.com/$${DRONE_REPO_NAME}/$${DRONE_BUILD_NUMBER}/$${DRONE_COMMIT_SHA}' + - 'wget $${DRONE_SCRIPT_DOWNLOAD_LINK}' + - pip3 install mattermostdriver + - python3 `basename $${DRONE_SCRIPT_DOWNLOAD_LINK}` + when: + status: [success, failure] diff --git a/CMakeLists.txt b/CMakeLists.txt index 32fae8481..42ac4eb43 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,18 +35,6 @@ ENDIF() # Enable the languages which in use ENABLE_LANGUAGE (C CXX) -IF (CMAKE_TOOLCHAIN_FILE) - SET (LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to") - - # get absolute path, but get_filename_component ABSOLUTE only refer with source dir, so find_file here :( - GET_FILENAME_COMPONENT (CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME) - FIND_FILE (CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH) - MESSAGE (STATUS "Using CMake tool chain file ${CMAKE_TOOLCHAIN_FILE}") -ENDIF() - -IF (NOT CMAKE_BUILD_TYPE) - SET (CMAKE_BUILD_TYPE 
release CACHE STRING "Choose the type of build" FORCE) -ENDIF() # Module options OPTION (TENGINE_BUILD_BENCHMARK "Build benchmark" ON) @@ -92,7 +80,23 @@ OPTION (TENGINE_ENABLE_ALL_SYMBOL "All symbol visible" OPTION (TENGINE_ENABLE_MODEL_CACHE "NPU kernel cache file option" OFF) # Online report -OPTION (TENGINE_ONLINE_REPORT "online report" ON) +OPTION (TENGINE_ONLINE_REPORT "online report" OFF) + +OPTION (TENGINE_RV64_RVV_C906 "build for c906" OFF) +OPTION (TENGINE_COVERAGE "build with coverage info" OFF) + +IF (CMAKE_TOOLCHAIN_FILE) + SET (LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to") + + # get absolute path, but get_filename_component ABSOLUTE only refer with source dir, so find_file here :( + GET_FILENAME_COMPONENT (CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME) + FIND_FILE (CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH) + MESSAGE (STATUS "Using CMake tool chain file ${CMAKE_TOOLCHAIN_FILE}") +ENDIF() + +IF (NOT CMAKE_BUILD_TYPE) + SET (CMAKE_BUILD_TYPE release CACHE STRING "Choose the type of build" FORCE) +ENDIF() # Do check list INCLUDE ("${CMAKE_CURRENT_SOURCE_DIR}/cmake/check.cmake") diff --git a/README.md b/README.md index 2b50777ef..73ad8af11 100644 --- a/README.md +++ b/README.md @@ -7,11 +7,9 @@ # Tengine -[![GitHub license](http://OAID.github.io/pics/apache_2.0.svg)](./LICENSE) -[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/OAID/Tengine/build-and-test.yml?branch=tengine-lite)](https://github.com/OAID/Tengine/actions) -[![Test Status](https://img.shields.io/travis/OAID/Tengine/tengine-lite?label=test)](https://travis-ci.org/OAID/Tengine) -[![codecov](https://codecov.io/gh/OAID/Tengine/branch/tengine-lite/graph/badge.svg?token=kz9NcQPRrk)](https://codecov.io/gh/OAID/Tengine) -[![Language grade: 
C/C++](https://img.shields.io/lgtm/grade/cpp/g/OAID/Tengine.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/OAID/Tengine/context:cpp) +[![License](https://img.shields.io/badge/license-Apache_2.0-blue)](./LICENSE) +[![Build Status](https://drone.conleylee.com/api/badges/conley/Tengine/status.svg?ref=refs/heads/master)](https://drone.conleylee.com/conley/Tengine) +[![codecov](https://codecov.io/gh/ComingToy/Tengine/graph/badge.svg?token=KVOX0LW1NJ)](https://codecov.io/gh/ComingToy/Tengine) ## 简介 diff --git a/README_EN.md b/README_EN.md index 5acaef03c..dfef60542 100644 --- a/README_EN.md +++ b/README_EN.md @@ -7,13 +7,9 @@ English | [简体中文](./README.md) # Tengine -[![GitHub license](http://OAID.github.io/pics/apache_2.0.svg)](./LICENSE) -[![Build Status](https://img.shields.io/github/workflow/status/OAID/Tengine/Tengine-Lite-Actions/tengine-lite)](https://github.com/OAID/Tengine/actions?query=workflow%3ATengine-Lite-Actions) -[![Build Status](https://img.shields.io/github/workflow/status/OAID/Tengine-Convert-Tools/Tengine-Convert-Tools-Actions?label=tools%20build)](https://github.com/OAID/Tengine-Convert-Tools/actions?query=workflow%3ATengine-Convert-Tools-Actions) -[![Test Status](https://img.shields.io/travis/OAID/Tengine/tengine-lite?label=test)](https://travis-ci.org/OAID/Tengine) -[![codecov](https://codecov.io/gh/OAID/Tengine/branch/tengine-lite/graph/badge.svg?token=kz9NcQPRrk)](https://codecov.io/gh/OAID/Tengine) -[![Language grade: C/C++](https://img.shields.io/lgtm/grade/cpp/g/OAID/Tengine.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/OAID/Tengine/context:cpp) - +[![License](https://img.shields.io/badge/license-Apache_2.0-blue)](./LICENSE) +[![Build Status](https://drone.conleylee.com/api/badges/conley/Tengine/status.svg?ref=refs/heads/master)](https://drone.conleylee.com/conley/Tengine) +[![codecov](https://codecov.io/gh/ComingToy/Tengine/graph/badge.svg?token=KVOX0LW1NJ)](https://codecov.io/gh/ComingToy/Tengine) ## Introduction diff 
--git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index f610c0ed2..1041fe6ab 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -62,7 +62,9 @@ TENGINE_EXAMPLE (tm_efficientdet_uint8 tm_efficientdet_uint8.c) TENGINE_EXAMPLE (tm_mobilenet_ssd tm_mobilenet_ssd.c) TENGINE_EXAMPLE (tm_mobilenet_ssd_uint8 tm_mobilenet_ssd_uint8.cpp) TENGINE_EXAMPLE (tm_retinaface tm_retinaface.cpp) +TENGINE_EXAMPLE (tm_retinaface_vulkan tm_retinaface_vulkan.cpp) TENGINE_EXAMPLE (tm_landmark tm_landmark.cpp) +TENGINE_EXAMPLE (tm_landmark_vulkan tm_landmark_vulkan.cpp) TENGINE_EXAMPLE (tm_landmark_uint8 tm_landmark_uint8.cpp) TENGINE_EXAMPLE (tm_mobilefacenet tm_mobilefacenet.cpp) TENGINE_EXAMPLE (tm_mobilefacenet_uint8 tm_mobilefacenet_uint8.cpp) diff --git a/examples/tm_landmark_vulkan.cpp b/examples/tm_landmark_vulkan.cpp new file mode 100644 index 000000000..76f35245d --- /dev/null +++ b/examples/tm_landmark_vulkan.cpp @@ -0,0 +1,206 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: qtang@openailab.com + */ + +#include +#include + +#include "common.h" +#include "tengine/c_api.h" +#include "tengine_operations.h" + +#define DEFAULT_REPEAT_COUNT 1 +#define DEFAULT_THREAD_COUNT 1 + +void get_input_fp32_data(const char* image_file, float* input_data, int img_h, int img_w, float* mean, float* scale) +{ + image img = imread_process(image_file, img_w, img_h, mean, scale); + + float* image_data = (float*)img.data; + + for (int i = 0; i < img_w * img_h * 3; i++) + input_data[i] = image_data[i]; + + free_image(img); +} + +void show_usage() +{ + fprintf(stderr, "[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count]\n"); +} + +int main(int argc, char* argv[]) +{ + int repeat_count = DEFAULT_REPEAT_COUNT; + int num_thread = DEFAULT_THREAD_COUNT; + char* model_file = nullptr; + char* image_file = nullptr; + int img_h = 144; + int img_w = 144; + float mean[3] = {128.f, 128.f, 128.f}; + float scale[3] = {0.0039, 0.0039, 0.0039}; + + int res; + while ((res = getopt(argc, argv, "m:i:r:t:h:")) != -1) + { + switch (res) + { + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'h': + show_usage(); + return 0; + default: + break; + } + } + + /* check files */ + if (model_file == nullptr) + { + fprintf(stderr, "Error: Tengine model file not specified!\n"); + show_usage(); + return -1; + } + + if (image_file == nullptr) + { + fprintf(stderr, "Error: Image file not specified!\n"); + show_usage(); + return -1; + } + + if (!check_file_exist(model_file) || !check_file_exist(image_file)) + return -1; + + /* set runtime options */ + struct options opt; + opt.num_thread = num_thread; + opt.cluster = TENGINE_CLUSTER_ALL; + opt.precision = TENGINE_MODE_FP32; + opt.affinity = 0; + + /* inital tengine */ + init_tengine(); + fprintf(stderr, 
"tengine-lite library version: %s\n", get_tengine_version()); + + /* create graph, load tengine model xxx.tmfile */ + context_t vk_context = create_context("VK", 1); + add_context_device(vk_context, "VK"); + graph_t graph = create_graph(vk_context, "tengine", model_file); + set_graph_device(graph, "VK"); + if (graph == nullptr) + { + std::cout << "Create graph0 failed\n"; + return -1; + } + + /* set the input shape to initial the graph, and prerun graph to infer shape */ + int img_size = img_h * img_w * 3; + int dims[] = {1, 3, img_h, img_w}; // nchw + float* input_data = (float*)malloc(img_size * sizeof(float)); + + tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0); + if (input_tensor == nullptr) + { + fprintf(stderr, "Get input tensor failed\n"); + return -1; + } + + if (set_tensor_shape(input_tensor, dims, 4) < 0) + { + fprintf(stderr, "Set input tensor shape failed\n"); + return -1; + } + + if (set_tensor_buffer(input_tensor, input_data, img_size * sizeof(float)) < 0) + { + fprintf(stderr, "Set input tensor buffer failed\n"); + return -1; + } + + /* prerun graph, set work options(num_thread, cluster, precision) */ + if (prerun_graph_multithread(graph, opt) < 0) + { + fprintf(stderr, "Prerun multithread graph failed.\n"); + return -1; + } + + /* prepare process input data, set the data mem to input tensor */ + get_input_fp32_data(image_file, input_data, img_h, img_w, mean, scale); + + /* run graph */ + double min_time = DBL_MAX; + double max_time = DBL_MIN; + double total_time = 0.; + for (int i = 0; i < repeat_count; i++) + { + double start = get_current_time(); + if (run_graph(graph, 1) < 0) + { + fprintf(stderr, "Run graph failed\n"); + return -1; + } + double end = get_current_time(); + double cur = end - start; + total_time += cur; + if (min_time > cur) + min_time = cur; + if (max_time < cur) + max_time = cur; + } + printf("Repeat [%d] min %.3f ms, max %.3f ms, avg %.3f ms\n", repeat_count, min_time, max_time, + total_time / repeat_count); + + /* 
get output tensor */ + tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0); + + float* data = (float*)(get_tensor_buffer(output_tensor)); + int data_size = get_tensor_buffer_size(output_tensor) / sizeof(float); + + image img_out = imread(image_file); + for (int i = 0; i < data_size / 2; i++) + { + int x = (int)(data[2 * i] * (float)img_out.w / 144.f); + int y = (int)(data[2 * i + 1] * (float)img_out.h / 144.f); + draw_circle(img_out, x, y, 2, 0, 255, 0); + } + + save_image(img_out, "landmark_out"); + + postrun_graph(graph); + destroy_graph(graph); + release_tengine(); + + return 0; +} diff --git a/examples/tm_retinaface_vulkan.cpp b/examples/tm_retinaface_vulkan.cpp new file mode 100644 index 000000000..14f1936d8 --- /dev/null +++ b/examples/tm_retinaface_vulkan.cpp @@ -0,0 +1,606 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: jxyang@openailab.com + * + * original model: https://github.com/deepinsight/insightface/tree/master/RetinaFace#retinaface-pretrained-models + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/blob/master/examples/retinaface.cpp + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +#include +#include + +#ifdef _MSC_VER +#define NOMINMAX +#endif + +#include +#include +#include + +#include "common.h" + +#include "tengine/c_api.h" +#include "tengine_operations.h" + +#define DEFAULT_REPEAT_COUNT 1 +#define DEFAULT_THREAD_COUNT 1 + +#define MODEL_PATH "models/retinaface.tmfile" +#define IMAGE_PATH "images/selfie_960.jpg" + +const float CONF_THRESH = 0.8f; +const float NMS_THRESH = 0.4f; + +const char* input_name = "data"; + +const char* bbox_name[3] = {"face_rpn_bbox_pred_stride32", "face_rpn_bbox_pred_stride16", "face_rpn_bbox_pred_stride8"}; +const char* score_name[3] = {"face_rpn_cls_prob_reshape_stride32", "face_rpn_cls_prob_reshape_stride16", + "face_rpn_cls_prob_reshape_stride8"}; +const char* landmark_name[3] = {"face_rpn_landmark_pred_stride32", "face_rpn_landmark_pred_stride16", + "face_rpn_landmark_pred_stride8"}; + +const int stride[3] = {32, 16, 8}; + +const float g_scales[3][2] = {{32.f, 16.f}, {8.f, 4.f}, {2.f, 1.f}}; + +struct Size2i +{ + int width; + int height; +}; + +struct Point2f +{ + float x; + float y; +}; + +struct Box2f +{ + float x1; + float y1; + float x2; + float y2; +}; + +struct Rect2f +{ + float x; + float y; + float w; + float h; +}; + +struct Face2f +{ + float score; + Rect2f rect; 
+ Point2f landmark[5]; +}; + +void draw_target(const std::vector& all_pred_boxes, image img) +{ + const char* class_names[] = {"faces"}; + + fprintf(stdout, "detected face num: %zu\n", all_pred_boxes.size()); + for (int b = 0; b < (int)all_pred_boxes.size(); b++) + { + Face2f box = all_pred_boxes[b]; + + printf("BOX %.2f:( %g , %g ),( %g , %g )\n", box.score, box.rect.x, box.rect.y, box.rect.w, box.rect.h); + + draw_box(img, box.rect.x, box.rect.y, box.rect.x + box.rect.w, box.rect.y + box.rect.h, 2, 0, 255, 0); + + for (int l = 0; l < 5; l++) + { + draw_circle(img, box.landmark[l].x, box.landmark[l].y, 1, 0, 128, 128); + } + } + save_image(img, "retinaface_out"); +} + +float iou(const Face2f& a, const Face2f& b) +{ + float area_a = a.rect.w * a.rect.h; + float area_b = b.rect.w * b.rect.h; + + float xx1 = std::max(a.rect.x, b.rect.x); + float yy1 = std::max(a.rect.y, b.rect.y); + float xx2 = std::min(a.rect.x + a.rect.w, b.rect.x + b.rect.w); + float yy2 = std::min(a.rect.y + a.rect.h, b.rect.y + b.rect.h); + + float w = std::max(float(0), xx2 - xx1 + 1); + float h = std::max(float(0), yy2 - yy1 + 1); + + float inter = w * h; + float ovr = inter / (area_a + area_b - inter); + return ovr; +} + +void nms_sorted_boxes(const std::vector& face_objects, std::vector& picked, float nms_threshold) +{ + picked.clear(); + + const int n = face_objects.size(); + + std::vector areas(n); + for (int i = 0; i < n; i++) + { + areas[i] = face_objects[i].rect.w * face_objects[i].rect.h; + } + + for (int i = 0; i < n; i++) + { + const Face2f& a = face_objects[i]; + + int keep = 1; + for (int j = 0; j < (int)picked.size(); j++) + { + const Face2f& b = face_objects[picked[j]]; + + // intersection over union + float inter_area = iou(a, b); + if (inter_area > nms_threshold) + keep = 0; + } + + if (keep) + picked.push_back(i); + } +} + +void qsort_descent_inplace(std::vector& face_objects, const int& left, const int& right) +{ + int i = left; + int j = right; + + float p = 
face_objects[(left + right) / 2].score; + + while (i <= j) + { + while (face_objects[i].score > p) + i++; + + while (face_objects[j].score < p) + j--; + + if (i <= j) + { + // swap + std::swap(face_objects[i], face_objects[j]); + + i++; + j--; + } + } + + if (left < j) + qsort_descent_inplace(face_objects, left, j); + if (i < right) + qsort_descent_inplace(face_objects, i, right); +} + +void qsort_descent_inplace(std::vector& face_objects) +{ + if (face_objects.empty()) + return; + + qsort_descent_inplace(face_objects, 0, face_objects.size() - 1); +} + +std::vector generate_anchors(int base_size, const std::vector& ratios, const std::vector& scales) +{ + size_t num_ratio = ratios.size(); + size_t num_scale = scales.size(); + + std::vector anchors(num_ratio * num_scale); + + const float cx = (float)base_size * 0.5f; + const float cy = (float)base_size * 0.5f; + + for (int i = 0; i < num_ratio; i++) + { + float ar = ratios[i]; + + int r_w = (int)round((float)base_size / sqrt(ar)); + int r_h = (int)round((float)r_w * ar); // round(base_size * sqrt(ar)); + + for (int j = 0; j < num_scale; j++) + { + float scale = scales[j]; + + float rs_w = (float)r_w * scale; + float rs_h = (float)r_h * scale; + + Box2f& anchor = anchors[i * num_scale + j]; + + anchor.x1 = cx - rs_w * 0.5f; + anchor.y1 = cy - rs_h * 0.5f; + anchor.x2 = cx + rs_w * 0.5f; + anchor.y2 = cy + rs_h * 0.5f; + } + } + + return anchors; +} + +static void generate_proposals(std::vector& anchors, int feat_stride, const float* score_blob, + const int score_dims[], const float* bbox_blob, const int bbox_dims[], + const float* landmark_blob, const int landmark_dims[], const float& prob_threshold, + std::vector& faces) +{ + int w = bbox_dims[3]; + int h = bbox_dims[2]; + int offset = w * h; + + // generate face proposal from bbox deltas and shifted anchors + const int num_anchors = anchors.size(); + + for (int q = 0; q < num_anchors; q++) + { + const Box2f& anchor = anchors[q]; + + const float* score = score_blob + 
(q + num_anchors) * offset; + const float* bbox = bbox_blob + (q * 4) * offset; + const float* landmark = landmark_blob + (q * 10) * offset; + + // shifted anchor + float anchor_y = anchor.y1; + + float anchor_w = anchor.x2 - anchor.x1; + float anchor_h = anchor.y2 - anchor.y1; + + for (int i = 0; i < h; i++) + { + float anchor_x = anchor.x1; + + for (int j = 0; j < w; j++) + { + int index = i * w + j; + + float prob = score[index]; + + if (prob >= prob_threshold) + { + // apply center size + float dx = bbox[index + offset * 0]; + float dy = bbox[index + offset * 1]; + float dw = bbox[index + offset * 2]; + float dh = bbox[index + offset * 3]; + + float cx = anchor_x + anchor_w * 0.5f; + float cy = anchor_y + anchor_h * 0.5f; + + float pb_cx = cx + anchor_w * dx; + float pb_cy = cy + anchor_h * dy; + + float pb_w = anchor_w * exp(dw); + float pb_h = anchor_h * exp(dh); + + float x0 = pb_cx - pb_w * 0.5f; + float y0 = pb_cy - pb_h * 0.5f; + float x1 = pb_cx + pb_w * 0.5f; + float y1 = pb_cy + pb_h * 0.5f; + + Face2f obj{}; + obj.rect.x = x0; + obj.rect.y = y0; + obj.rect.w = x1 - x0 + 1; + obj.rect.h = y1 - y0 + 1; + + obj.landmark[0].x = cx + (anchor_w + 1) * landmark[index + offset * 0]; + obj.landmark[0].y = cy + (anchor_h + 1) * landmark[index + offset * 1]; + obj.landmark[1].x = cx + (anchor_w + 1) * landmark[index + offset * 2]; + obj.landmark[1].y = cy + (anchor_h + 1) * landmark[index + offset * 3]; + obj.landmark[2].x = cx + (anchor_w + 1) * landmark[index + offset * 4]; + obj.landmark[2].y = cy + (anchor_h + 1) * landmark[index + offset * 5]; + obj.landmark[3].x = cx + (anchor_w + 1) * landmark[index + offset * 6]; + obj.landmark[3].y = cy + (anchor_h + 1) * landmark[index + offset * 7]; + obj.landmark[4].x = cx + (anchor_w + 1) * landmark[index + offset * 8]; + obj.landmark[4].y = cy + (anchor_h + 1) * landmark[index + offset * 9]; + + obj.score = prob; + + faces.push_back(obj); + } + + anchor_x += (float)feat_stride; + } + + anchor_y += 
(float)feat_stride; + } + } +} + +int get_input_data(const char* image_file, std::vector& image_data, Size2i& size) +{ + image img = imread(image_file); + + size.width = img.w; + size.height = img.h; + + int img_size = img.w * img.h * img.c; + + img = image_permute(img); + + image_data.resize(img_size); + + memcpy(image_data.data(), img.data, img_size * sizeof(float)); + + free_image(img); + + return img_size; +} + +void show_usage() +{ + printf("[Usage]: [-h]\n [-m model_file] [-i image_file] [-r repeat_count] [-t thread_count] [-n device_name]\n"); +} + +int main(int argc, char* argv[]) +{ + int repeat_count = DEFAULT_REPEAT_COUNT; + int num_thread = DEFAULT_THREAD_COUNT; + + const char* model_file = MODEL_PATH; + const char* image_file = IMAGE_PATH; + const char* device_name = ""; + + int res; + while ((res = getopt(argc, argv, "m:i:r:t:h:n:")) != -1) + { + switch (res) + { + case 'm': + model_file = optarg; + break; + case 'i': + image_file = optarg; + break; + case 'r': + repeat_count = atoi(optarg); + break; + case 't': + num_thread = atoi(optarg); + break; + case 'n': + device_name = optarg; + break; + case 'h': + show_usage(); + return 0; + default: + break; + } + } + + /* check files */ + if (model_file == nullptr) + { + printf("Error: Tengine model file not specified!\n"); + show_usage(); + return -1; + } + + if (image_file == nullptr) + { + printf("Error: Image file not specified!\n"); + show_usage(); + return -1; + } + + if (!check_file_exist(model_file) || !check_file_exist(image_file)) + return -1; + + /* set runtime options */ + struct options opt; + opt.num_thread = num_thread; + opt.cluster = TENGINE_CLUSTER_ALL; + opt.precision = TENGINE_MODE_FP32; + opt.affinity = 0; + + /* inital tengine */ + int ret = init_tengine(); + if (0 != ret) + { + printf("Init tengine-lite failed.\n"); + return -1; + } + + printf("tengine-lite library version: %s\n", get_tengine_version()); + + /* create graph, load tengine model xxx.tmfile */ + context_t vk_context = 
create_context("VK", 1); + add_context_device(vk_context, "VK"); + graph_t graph = create_graph(vk_context, "tengine", model_file); + set_graph_device(graph, "VK"); + if (graph == nullptr) + { + printf("Load model to graph failed.\n"); + return -1; + } + + /* prepare process input data */ + int target_size = 1024; + int max_size = 1980; + + std::vector image_data; + + Size2i image_size; + // Size2i tensor_size; + + float im_scale; + + int img_size = get_input_data(image_file, image_data, image_size); + + /* set the input shape to initial the graph, and pre-run graph to infer shape */ + int dims[] = {1, 3, image_size.height, image_size.width}; + + tensor_t input_tensor = get_graph_tensor(graph, input_name); + if (nullptr == input_tensor) + { + printf("Get input tensor failed\n"); + return -1; + } + + if (0 != set_tensor_shape(input_tensor, dims, 4)) + { + printf("Set input tensor shape failed\n"); + return -1; + } + + /* set the data mem to input tensor */ + if (set_tensor_buffer(input_tensor, image_data.data(), img_size * sizeof(float)) < 0) + { + printf("Set input tensor buffer failed\n"); + return -1; + } + + /* prerun graph, set work options(num_thread, cluster, precision) */ + if (0 != prerun_graph_multithread(graph, opt)) + { + printf("Pre-run graph failed\n"); + return -1; + } + + /* run graph */ + float min_time = FLT_MAX, max_time = 0, total_time = 0.f; + for (int i = 0; i < repeat_count; i++) + { + double start = get_current_time(); + if (run_graph(graph, 1) < 0) + { + printf("Run graph failed\n"); + return -1; + } + double end = get_current_time(); + + float cur = float(end - start); + + total_time += cur; + min_time = std::min(min_time, cur); + max_time = std::max(max_time, cur); + } + printf("img_h, img_w : %d, %d\n", image_size.height, image_size.width); + printf("Repeat %d times, thread %d, avg time %.2f ms, max_time %.2f ms, min_time %.2f ms\n", repeat_count, + num_thread, total_time / (float)repeat_count, max_time, min_time); + 
printf("--------------------------------------\n"); + + /* process the detection result */ + std::vector face_proposals; + + for (int stride_index = 0; stride_index < 3; stride_index++) + { + // ================================================================== + // ========== This part is to get tensor information ================ + // ================================================================== + tensor_t score_blob_tensor = get_graph_tensor(graph, score_name[stride_index]); + tensor_t bbox_blob_tensor = get_graph_tensor(graph, bbox_name[stride_index]); + tensor_t landmark_blob_tensor = get_graph_tensor(graph, landmark_name[stride_index]); + + int score_blob_dims[MAX_SHAPE_DIM_NUM] = {0}; + int bbox_blob_dims[MAX_SHAPE_DIM_NUM] = {0}; + int landmark_blob_dims[MAX_SHAPE_DIM_NUM] = {0}; + + get_tensor_shape(score_blob_tensor, score_blob_dims, MAX_SHAPE_DIM_NUM); + get_tensor_shape(bbox_blob_tensor, bbox_blob_dims, MAX_SHAPE_DIM_NUM); + get_tensor_shape(landmark_blob_tensor, landmark_blob_dims, MAX_SHAPE_DIM_NUM); + + float* score_blob = (float*)get_tensor_buffer(score_blob_tensor); + float* bbox_blob = (float*)get_tensor_buffer(bbox_blob_tensor); + float* landmark_blob = (float*)get_tensor_buffer(landmark_blob_tensor); + + const int base_size = 16; + const int feat_stride = stride[stride_index]; + + std::vector current_ratios(1); + current_ratios[0] = 1.f; + + std::vector current_scales(2); + current_scales[0] = g_scales[stride_index][0]; + current_scales[1] = g_scales[stride_index][1]; + + const float threshold = CONF_THRESH; + + std::vector anchors = generate_anchors(base_size, current_ratios, current_scales); + + std::vector face_objects; + generate_proposals(anchors, feat_stride, score_blob, score_blob_dims, bbox_blob, bbox_blob_dims, landmark_blob, + landmark_blob_dims, threshold, face_objects); + + face_proposals.insert(face_proposals.end(), face_objects.begin(), face_objects.end()); + } + + // sort all proposals by score from highest to lowest + 
qsort_descent_inplace(face_proposals); + + // apply nms with nms_threshold + std::vector picked; + nms_sorted_boxes(face_proposals, picked, NMS_THRESH); + + int face_count = picked.size(); + + std::vector face_objects(face_count); + for (int i = 0; i < face_count; i++) + { + face_objects[i] = face_proposals[picked[i]]; + + // clip to image size + float x0 = face_objects[i].rect.x; + float y0 = face_objects[i].rect.y; + float x1 = x0 + face_objects[i].rect.w; + float y1 = y0 + face_objects[i].rect.h; + + x0 = std::max(std::min(x0, (float)image_size.width - 1), 0.f); + y0 = std::max(std::min(y0, (float)image_size.height - 1), 0.f); + x1 = std::max(std::min(x1, (float)image_size.width - 1), 0.f); + y1 = std::max(std::min(y1, (float)image_size.height - 1), 0.f); + + face_objects[i].rect.x = x0; + face_objects[i].rect.y = y0; + face_objects[i].rect.w = x1 - x0; + face_objects[i].rect.h = y1 - y0; + } + + image img = imread(image_file); + draw_target(face_objects, img); + + postrun_graph(graph); + destroy_graph(graph); + release_tengine(); + + return 0; +} diff --git a/scripts/mm_bot.py b/scripts/mm_bot.py new file mode 100644 index 000000000..c4436d8b8 --- /dev/null +++ b/scripts/mm_bot.py @@ -0,0 +1,42 @@ +from mattermostdriver import Driver +import requests +import os + +bot_username = 'drone' +server_url = 'mm.conleylee.com' + +def main(): + status = os.environ['DRONE_STAGE_STATUS'] + bot_password = os.environ['MATTERMOST_TOKEN'] + repo = os.environ['DRONE_REPO_NAME'] + branch = os.environ['DRONE_SOURCE_BRANCH'] + repo_link = os.environ['DRONE_REPO_LINK'] + author = os.environ['DRONE_COMMIT_AUTHOR_NAME'] + build_number = os.environ['DRONE_BUILD_NUMBER'] + build_link = os.environ['DRONE_BUILD_LINK'] + + if status == 'success': + message = f'[{repo}/{branch}]({repo_link}/src/branch/{branch}) [build\#{build_number}]({build_link}) {status}. good job!' + else: + message = f'[{repo}/{branch}]({repo_link}/src/branch/{branch}) [build\#{build_number}]({build_link}) {status}. 
follow previous link for more details!' + + bot = Driver({ + 'url': server_url, # no firewall, proxy etc. + 'token': bot_password, + 'port': 443, + 'scheme': 'https', # no SSL issues + 'verify': False, + }) + + bot.login() + my_channel_id = bot.channels.get_channel_by_name_and_team_name( + 'stupidcode', + 'Tengine')['id'] + bot.posts.create_post(options={ + 'channel_id': my_channel_id, + 'message': message, + }) + + +if __name__ == '__main__': + main() diff --git a/source/device/cpu/CMakeLists.txt b/source/device/cpu/CMakeLists.txt index c975cdb66..7702e3b2d 100644 --- a/source/device/cpu/CMakeLists.txt +++ b/source/device/cpu/CMakeLists.txt @@ -279,9 +279,14 @@ IF (TENGINE_COMPILER_GCC OR TENGINE_COMPILER_CLANG) ENDIF() IF (${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv") - LIST (APPEND _CPU_COMPILER_OPTIONS "-march=rv64gcvxthead") + LIST (APPEND _CPU_COMPILER_OPTIONS "-march=rv64gcv") LIST (APPEND _CPU_COMPILER_OPTIONS "-mabi=lp64d") - LIST (APPEND _CPU_COMPILER_OPTIONS "-mfp16") + IF (CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "release" OR CMAKE_BUILD_TYPE STREQUAL "RELEASE") + LIST (APPEND _CPU_COMPILER_OPTIONS "-mtune=thead-c906") + ENDIF() + IF (TENGINE_RV64_RVV_C906) + LIST (APPEND _CPU_COMPILER_OPTIONS "-D__FIX_RVV_C906") + ENDIF() LIST (APPEND _CPU_COMPILER_OPTIONS "-lc") ENDIF() ENDIF() diff --git a/source/device/cpu/cpu_device.c b/source/device/cpu/cpu_device.c index b5bea801f..aecf9045d 100644 --- a/source/device/cpu/cpu_device.c +++ b/source/device/cpu/cpu_device.c @@ -45,6 +45,7 @@ #include "utility/utils.h" #include "utility/log.h" +#include #include int init_cpu(struct device* device) @@ -94,6 +95,17 @@ static int prerun(struct device* dev, struct subgraph* subgraph, void* option) return 0; } +static void fname_normalize(char* fname) +{ + for (char* pos = fname; *pos != '\0'; ++pos) + { + if (*pos == '/') + { + *pos = '_'; + } + } +} + static int run(struct device* dev, struct subgraph* subgraph) { struct exec_graph* exec_graph = 
(struct exec_graph*)subgraph->device_graph; @@ -214,6 +226,26 @@ static int run(struct device* dev, struct subgraph* subgraph) dump_float(fname, ir_tensor->data, ir_tensor->elem_num); } +#endif +#if 0 + struct node* ir_node = node->ir_node; + struct graph* ir_graph = ir_node->graph; + char fname[512]; + + const char* root = getenv("TENGINE_DEBUG_DIR"); + if (!root) root = "./"; + char* pname = fname + sprintf(fname, "%s/", root); + + for (int i = 0; i < ir_node->output_num; ++i) + { + struct tensor* ir_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[i]); + float mean = tensor_mean(ir_tensor); + + fprintf(stderr, "%s output %d, mean: %f\n", ir_node->name, i, mean); + sprintf(pname, "%s_out_%d", ir_node->name, i); + fname_normalize(pname); + save_tensor(fname, ir_tensor->data, ir_tensor->dims, ir_tensor->dim_num); + } #endif } diff --git a/source/device/cpu/cpu_node.h b/source/device/cpu/cpu_node.h index b0c2fa575..2a2c8bd9b 100644 --- a/source/device/cpu/cpu_node.h +++ b/source/device/cpu/cpu_node.h @@ -28,6 +28,7 @@ #include "cpu_define.h" #include +#include struct node; struct node_ops; diff --git a/source/device/cpu/op/absval/absval_ref.c b/source/device/cpu/op/absval/absval_ref.c index 973bbae6d..786a451f6 100644 --- a/source/device/cpu/op/absval/absval_ref.c +++ b/source/device/cpu/op/absval/absval_ref.c @@ -30,6 +30,7 @@ #include "device/cpu/cpu_node.h" #include "device/cpu/cpu_graph.h" #include "device/cpu/cpu_module.h" +#include #include @@ -85,13 +86,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int 
register_absval_ref_op() { diff --git a/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c b/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c index c01c37a0c..0ec31e0d5 100644 --- a/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c +++ b/source/device/cpu/op/absval/cortex-a/absval_hcl_arm.c @@ -109,13 +109,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_absval_hcl_arm_op() { @@ -125,4 +127,4 @@ int register_absval_hcl_arm_op() int unregister_absval_hcl_arm_op() { return unregister_builtin_node_ops(OP_ABSVAL, &hcl_node_ops); -} \ No newline at end of file +} diff --git a/source/device/cpu/op/absval/risc-v/lp64dv/absval_hcl_rv64.c b/source/device/cpu/op/absval/risc-v/lp64dv/absval_hcl_rv64.c new file mode 100644 index 000000000..c79e36103 --- /dev/null +++ b/source/device/cpu/op/absval/risc-v/lp64dv/absval_hcl_rv64.c @@ -0,0 +1,100 @@ +#include "api/c_api.h" +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "op/conv/risc-v/lp64dv/vsetvl_rvv.h" +#include "utility/sys_port.h" +#include "utility/log.h" +#include "device/cpu/cpu_node.h" +#include "device/cpu/cpu_graph.h" +#include "operator/op.h" +#include +#include "device/cpu/cpu_module.h" + +static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + return 0; +} + +static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + return 0; +} + +static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, 
struct exec_graph* exec_graph) +{ + return 0; +} + +static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct node* ir_node = exec_node->ir_node; + struct graph* ir_graph = ir_node->graph; + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); + + const float* input_data = input_tensor->data; + float* output_data = output_tensor->data; + + const int batch = input_tensor->dims[0]; + const int channel = input_tensor->dims[1]; + const int img_size = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; + + vsetvl_e32_m2(); + + for (int b = 0; b < batch; ++b) + { + int i = 0; + for (; i < (img_size & -8); i += 8) + { + asm("vle32.v v0, (%0);\n" + "vfabs.v v2, v0;\n" + "vse32.v v2, (%1);\n" + : + : "r"(input_data), "r"(output_data) + : "memory"); + input_data += 8; + output_data += 8; + } + + for (; i < img_size; ++i) + { + *output_data = fabsf(*input_data); + output_data++; + input_data++; + } + } + + return 0; +} + +static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* ir_node) +{ + struct graph* graph = ir_node->graph; + struct tensor* input_tensor = get_ir_graph_tensor(graph, ir_node->input_tensors[0]); + if (input_tensor->data_type != TENGINE_MODE_FP32 || input_tensor->layout != TENGINE_LAYOUT_NCHW) + { + return 0; + } + + return OPS_SCORE_PREFER; +} + +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score}; + +int register_absval_hcl_rv64_op() +{ + return register_builtin_node_ops(OP_ABSVAL, &hcl_node_ops); +} + +int unregister_absval_hcl_rv64_op() +{ + return unregister_builtin_node_ops(OP_ABSVAL, &hcl_node_ops); +} diff --git a/source/device/cpu/op/add_n/add_n_ref.c 
b/source/device/cpu/op/add_n/add_n_ref.c index 559b6cc44..c242dd29d 100644 --- a/source/device/cpu/op/add_n/add_n_ref.c +++ b/source/device/cpu/op/add_n/add_n_ref.c @@ -117,16 +117,27 @@ static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struc static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) { - return OPS_SCORE_BEST; + struct node* ir_node = exec_node; + struct graph* ir_graph = ir_node->graph; + struct tensor* input_tensor; + + input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + + if (input_tensor->data_type != TENGINE_DT_FP32 || input_tensor->layout != TENGINE_LAYOUT_NCHW) + return 0; + + return OPS_SCORE_CANDO; } -static struct node_ops add_n_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops add_n_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_add_n_ref_op() { diff --git a/source/device/cpu/op/add_n/risc-v/lp64dv/add_n_hcl_rv64.c b/source/device/cpu/op/add_n/risc-v/lp64dv/add_n_hcl_rv64.c new file mode 100644 index 000000000..fc7780f6f --- /dev/null +++ b/source/device/cpu/op/add_n/risc-v/lp64dv/add_n_hcl_rv64.c @@ -0,0 +1,183 @@ +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "op/conv/risc-v/lp64dv/vsetvl_rvv.h" +#include "utility/sys_port.h" +#include "utility/log.h" +#include "device/cpu/cpu_node.h" +#include "device/cpu/cpu_graph.h" +#include "device/cpu/cpu_module.h" + +#include + +struct add_n_op_param +{ + int in_num; + void** input_data; +}; + +static int ref_add_n_fp32(const float** input, float* output, int size, const struct add_n_op_param* param) +{ + int in_num = param->in_num; + vsetvl_e32_m2(); + + float* output_data = output; + int i = 0; + for 
(; i < (size & -8); i += 8) + { + asm("vmv.v.x v0, x0;\n"); + int n = 0; + for (; n < (in_num & -8); n += 8) + { + const float** inputs = input + n; + const float* in0 = inputs[0] + i; + const float* in1 = inputs[1] + i; + const float* in2 = inputs[2] + i; + const float* in3 = inputs[3] + i; + const float* in4 = inputs[4] + i; + const float* in5 = inputs[5] + i; + const float* in6 = inputs[6] + i; + const float* in7 = inputs[7] + i; + + asm("vle32.v v2, (%0);\n" + "vle32.v v4, (%1);\n" + "vle32.v v6, (%2);\n" + "vle32.v v8, (%3);\n" + "vle32.v v10, (%4);\n" + "vle32.v v12, (%5);\n" + "vle32.v v14, (%6);\n" + "vle32.v v16, (%7);\n" + "vfadd.vv v0, v0, v2;\n" + "vfadd.vv v0, v0, v4;\n" + "vfadd.vv v0, v0, v6;\n" + "vfadd.vv v0, v0, v8;\n" + "vfadd.vv v0, v0, v10;\n" + "vfadd.vv v0, v0, v12;\n" + "vfadd.vv v0, v0, v14;\n" + "vfadd.vv v0, v0, v16;\n" + : + : "r"(in0), "r"(in1), "r"(in2), "r"(in3), "r"(in4), "r"(in5), "r"(in6), "r"(in7)); + } + + for (; n < in_num; n += 1) + { + const float* in0 = input[n] + i; + asm("vle32.v v2, (%0);\n" + "vfadd.vv v0, v0, v2;\n" + : + : "r"(in0)); + } + + asm("vse32.v v0, (%0);\n" + : + : "r"(output_data) + : "memory"); + output_data += 8; + } + + for (; i < size; i += 1) + { + output[i] = input[0][i]; + for (int n = 1; n < in_num; n++) + { + output[i] += input[n][i]; + } + } + + return 0; +} + +static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct add_n_op_param* add_n_op_param = (struct add_n_op_param*)sys_malloc(sizeof(struct add_n_op_param)); + exec_node->ops_priv = add_n_op_param; + return 0; +} + +static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + sys_free(exec_node->ops_priv); + return 0; +} + +static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct node* ir_node = exec_node->ir_node; + struct graph* ir_graph = ir_node->graph; + struct 
add_n_op_param* add_n_op_param = (struct add_n_op_param*)exec_node->ops_priv; + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + + int in_num = ir_node->input_num; + add_n_op_param->in_num = in_num; + add_n_op_param->input_data = (void**)sys_malloc(sizeof(void*) * in_num); + + return 0; +} + +static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct node* ir_node = exec_node->ir_node; + struct graph* ir_graph = ir_node->graph; + struct tensor* input_tensor_a = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); + + uint32_t elem_num = input_tensor_a->elem_num; + struct add_n_op_param* add_n_op_param = (struct add_n_op_param*)exec_node->ops_priv; + for (int i = 0; i < add_n_op_param->in_num; i++) + { + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[i]); + void* data = input_tensor->data; + add_n_op_param->input_data[i] = data; + } + const void** input = (const void**)add_n_op_param->input_data; + + float* output = (float*)output_tensor->data; + for (uint32_t i = 0; i < elem_num; i++) + { + output[i] = 0; + } + ref_add_n_fp32((const float**)input, output, elem_num, add_n_op_param); + return 0; +} + +static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct add_n_op_param* add_n_op_param = (struct add_n_op_param*)exec_node->ops_priv; + sys_free(add_n_op_param->input_data); + + return 0; +} + +static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) +{ + struct node* ir_node = exec_node; + struct graph* ir_graph = ir_node->graph; + struct tensor* input_tensor; + + input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + + if (input_tensor->data_type != TENGINE_DT_FP32 || input_tensor->layout != TENGINE_LAYOUT_NCHW) + return 
0; + + return OPS_SCORE_PREFER; +} + +static struct node_ops add_n_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; + +int register_add_n_hcl_rv64_op() +{ + return register_builtin_node_ops(OP_ADD_N, &add_n_node_ops); +} + +int unregister_add_n_hcl_rv64_op() +{ + return unregister_builtin_node_ops(OP_ADD_N, &add_n_node_ops); +} diff --git a/source/device/cpu/op/argmax/argmax_ref.c b/source/device/cpu/op/argmax/argmax_ref.c index ba8898a38..f3a810516 100644 --- a/source/device/cpu/op/argmax/argmax_ref.c +++ b/source/device/cpu/op/argmax/argmax_ref.c @@ -77,7 +77,7 @@ static int ref_argmax_fp32(float* input, int* output, const struct argmax_op_par return 0; } -static int ref_argmax_uint8(uint8_t* input, int* output, const struct argmax_op_param* param) +static int ref_argmax_uint8(uint8_t* input, uint8_t* output, const struct argmax_op_param* param) { uint8_t max_value; int max_value_index; @@ -175,13 +175,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct argmax_op_param* argmax_op_param = (struct argmax_op_param*)exec_node->ops_priv; - TLOG_ERR("output_tensor->elem_num:%d\n", output_tensor->elem_num); - TLOG_ERR("output_tensor->elem_size:%d\n", output_tensor->elem_size); - if (input_tensor->data_type == TENGINE_DT_FP32) ref_argmax_fp32((float*)in_data, (int*)out_data, argmax_op_param); else if (input_tensor->data_type == TENGINE_DT_UINT8) - ref_argmax_uint8((uint8_t*)in_data, (int*)out_data, argmax_op_param); + ref_argmax_uint8((uint8_t*)in_data, (uint8_t*)out_data, argmax_op_param); return 0; } @@ -196,13 +193,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops argmax_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = 
score}; +static struct node_ops argmax_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_argmax_ref_op() { diff --git a/source/device/cpu/op/argmin/argmin_ref.c b/source/device/cpu/op/argmin/argmin_ref.c index 58da946b0..ca4f23466 100644 --- a/source/device/cpu/op/argmin/argmin_ref.c +++ b/source/device/cpu/op/argmin/argmin_ref.c @@ -77,7 +77,7 @@ static int ref_argmin_fp32(float* input, int* output, const struct argmin_op_par return 0; } -static int ref_argmin_uint8(uint8_t* input, int* output, const struct argmin_op_param* param) +static int ref_argmin_uint8(uint8_t* input, uint8_t* output, const struct argmin_op_param* param) { uint8_t min_value; int min_value_index; @@ -175,13 +175,10 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex struct argmin_op_param* argmin_op_param = (struct argmin_op_param*)exec_node->ops_priv; - TLOG_ERR("output_tensor->elem_num:%d\n", output_tensor->elem_num); - TLOG_ERR("output_tensor->elem_size:%d\n", output_tensor->elem_size); - if (input_tensor->data_type == TENGINE_DT_FP32) ref_argmin_fp32((float*)in_data, (int*)out_data, argmin_op_param); else if (input_tensor->data_type == TENGINE_DT_UINT8) - ref_argmin_uint8((uint8_t*)in_data, (int*)out_data, argmin_op_param); + ref_argmin_uint8((uint8_t*)in_data, (uint8_t*)out_data, argmin_op_param); return 0; } @@ -196,13 +193,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops argmin_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops argmin_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int 
register_argmin_ref_op() { diff --git a/source/device/cpu/op/batchnorm/batchnorm_ref.c b/source/device/cpu/op/batchnorm/batchnorm_ref.c index 5c7c5f526..5c2818aad 100644 --- a/source/device/cpu/op/batchnorm/batchnorm_ref.c +++ b/source/device/cpu/op/batchnorm/batchnorm_ref.c @@ -164,13 +164,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_batchnorm_ref_op() { diff --git a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c index 359b14ee5..2db14b462 100644 --- a/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c +++ b/source/device/cpu/op/batchnorm/cortex-a/batchnorm_hcl_arm.c @@ -145,13 +145,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_batchnorm_hcl_arm_op() { diff --git a/source/device/cpu/op/batchtospacend/batchtospacend_ref.c b/source/device/cpu/op/batchtospacend/batchtospacend_ref.c index 9c9aa6044..a755b6614 100644 --- a/source/device/cpu/op/batchtospacend/batchtospacend_ref.c +++ b/source/device/cpu/op/batchtospacend/batchtospacend_ref.c @@ -116,13 +116,15 @@ static int 
score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_batchtospacend_ref_op() { diff --git a/source/device/cpu/op/bias/bias_ref.c b/source/device/cpu/op/bias/bias_ref.c index 2eb39c085..56c128394 100644 --- a/source/device/cpu/op/bias/bias_ref.c +++ b/source/device/cpu/op/bias/bias_ref.c @@ -101,13 +101,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_bias_ref_op() { diff --git a/source/device/cpu/op/broadmul/broadmul_ref.c b/source/device/cpu/op/broadmul/broadmul_ref.c index 92ed72a28..92bb49cd8 100644 --- a/source/device/cpu/op/broadmul/broadmul_ref.c +++ b/source/device/cpu/op/broadmul/broadmul_ref.c @@ -53,10 +53,6 @@ typedef struct __ref_broadmul_param int out_size; int on_size; int in_size; - float in0_scale; - float in1_scale; - int in0_zero; - int in1_zero; } ref_broadmul_param, *p_ref_broadmul_param; static int ref_broadmul_fp32(float* in0, float* in1, float* out, p_ref_broadmul_param param) @@ -64,6 +60,7 @@ static int ref_broadmul_fp32(float* in0, float* in1, float* out, p_ref_broadmul_ int out_size = param->out_size; int in_size = param->in_size; int on_size = param->on_size; + int 
last_i = 0; for (int o = 0; o < out_size; o++) { @@ -74,6 +71,7 @@ static int ref_broadmul_fp32(float* in0, float* in1, float* out, p_ref_broadmul_ { int index = (o * on_size + j) * in_size + i; out[index] = in0[index] * data1; + last_i = index; } } } @@ -133,13 +131,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_broadmul_ref_op() { diff --git a/source/device/cpu/op/cast/cast_ref.c b/source/device/cpu/op/cast/cast_ref.c index 9eb88fb16..791eb8a1f 100644 --- a/source/device/cpu/op/cast/cast_ref.c +++ b/source/device/cpu/op/cast/cast_ref.c @@ -191,13 +191,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops ref_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops ref_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_cast_ref_op() { diff --git a/source/device/cpu/op/ceil/ceil_ref.c b/source/device/cpu/op/ceil/ceil_ref.c index 95cc44f39..790bdbca1 100644 --- a/source/device/cpu/op/ceil/ceil_ref.c +++ b/source/device/cpu/op/ceil/ceil_ref.c @@ -34,51 +34,22 @@ #include "device/cpu/cpu_module.h" #include +#include int ref_ceil_fp32(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) { // dims size = 2 or 3 - if (input_tensor->dim_num < 4) - { - float* input_data 
= (float*)input_tensor->data; - float* out_data = (float*)output_tensor->data; - int total_size = input_tensor->elem_num; - - for (int i = 0; i < total_size; i++) - { - input_data[i] = ceilf(out_data[i]); - } - - return 0; - } - // dims size 3 - else if (input_tensor->dim_num == 4) - { - int w = input_tensor->dims[3]; - int h = output_tensor->dims[2]; - int channels = input_tensor->dims[1]; - int size = h * w; - int c_step = h * w; - - float* input_data = (float*)input_tensor->data; - float* out_data = (float*)output_tensor->data; + float* input_data = (float*)input_tensor->data; + float* out_data = (float*)output_tensor->data; + int total_size = input_tensor->elem_num; #pragma omp parallel for num_threads(num_thread) - for (int q = 0; q < channels; q++) - { - float* src = input_data + c_step * q; - float* dst = out_data + c_step * q; - - for (int i = 0; i < size; i++) - { - dst[i] = ceilf(src[i]); - } - } - - return 0; + for (int i = 0; i < total_size; i++) + { + out_data[i] = ceilf(input_data[i]); } - return -1; + return 0; } int ref_ceil_uint8(struct tensor* input_tensor, struct tensor* output_tensor, int num_thread) @@ -101,40 +72,12 @@ int ref_ceil_uint8(struct tensor* input_tensor, struct tensor* output_tensor, in input_data[i] = ((float)input_uint8[i] - (float)input_zero) * input_scale; } - // dims size = 2 or 3 - if (input_tensor->dim_num < 4) - { - int total_size = input_tensor->elem_num; - - for (int i = 0; i < total_size; i++) - { - input_data[i] = ceil(out_data[i]); - } - - // return 0; - } - // dims size 3 - else if (input_tensor->dim_num == 4) - { - int w = input_tensor->dims[3]; - int h = output_tensor->dims[2]; - int channels = input_tensor->dims[1]; - int size = h * w; - int c_step = h * w; + int total_size = input_tensor->elem_num; #pragma omp parallel for num_threads(num_thread) - for (int q = 0; q < channels; q++) - { - float* src = input_data + c_step * q; - float* dst = out_data + c_step * q; - - for (int i = 0; i < size; i++) - { - dst[i] = 
ceil(src[i]); - } - } - - // return 0; + for (int i = 0; i < total_size; i++) + { + out_data[i] = ceil(input_data[i]); } /* quant */ @@ -192,13 +135,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_ceil_ref_op() { diff --git a/source/device/cpu/op/clip/clip_ref.c b/source/device/cpu/op/clip/clip_ref.c index 2582ef334..288a04194 100644 --- a/source/device/cpu/op/clip/clip_ref.c +++ b/source/device/cpu/op/clip/clip_ref.c @@ -84,13 +84,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_clip_ref_op() { diff --git a/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c b/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c index bfa3e4b70..8fa3719c4 100644 --- a/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c +++ b/source/device/cpu/op/comparison/comparison_kernel_ref_fp32.c @@ -43,7 +43,7 @@ void comp_equal(int input_hw, int input_hw_1, int input_count4, int input1_count } else if (input_count4 == 1) { - for (int i = 0; i < input_count4; ++i) + for (int i = 0; i < input1_count4; ++i) { *output++ = (input0[0] == input1[i]); } @@ -107,7 +107,7 @@ 
void comp_nequal(int input_hw, int input_hw_1, int input_count4, int input1_coun } else if (input_count4 == 1) { - for (int i = 0; i < input_count4; ++i) + for (int i = 0; i < input1_count4; ++i) { *output++ = (input0[0] != input1[i]); } @@ -171,7 +171,7 @@ void comp_less(int input_hw, int input_hw_1, int input_count4, int input1_count4 } else if (input_count4 == 1) { - for (int i = 0; i < input_count4; ++i) + for (int i = 0; i < input1_count4; ++i) { *output++ = (input0[0] < input1[i]); } @@ -235,7 +235,7 @@ void comp_lesse(int input_hw, int input_hw_1, int input_count4, int input1_count } else if (input_count4 == 1) { - for (int i = 0; i < input_count4; ++i) + for (int i = 0; i < input1_count4; ++i) { *output++ = (input0[0] <= input1[i]); } @@ -299,7 +299,7 @@ void comp_greater(int input_hw, int input_hw_1, int input_count4, int input1_cou } else if (input_count4 == 1) { - for (int i = 0; i < input_count4; ++i) + for (int i = 0; i < input1_count4; ++i) { *output++ = (input0[0] > input1[i]); } @@ -363,7 +363,7 @@ void comp_greatere(int input_hw, int input_hw_1, int input_count4, int input1_co } else if (input_count4 == 1) { - for (int i = 0; i < input_count4; ++i) + for (int i = 0; i < input1_count4; ++i) { *output++ = (input0[0] >= input1[i]); } diff --git a/source/device/cpu/op/comparison/comparison_ref.c b/source/device/cpu/op/comparison/comparison_ref.c index 14405732c..fb7e211a4 100644 --- a/source/device/cpu/op/comparison/comparison_ref.c +++ b/source/device/cpu/op/comparison/comparison_ref.c @@ -69,17 +69,35 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex void* output = output_tensor->data; _comparison_param op_param; - int ii = 0; - op_param.shape1[0] = input_tensor1->dims[ii++]; - op_param.shape1[1] = input_tensor1->dims[ii++]; - op_param.shape1[2] = input_tensor1->dims[ii++]; - op_param.shape1[3] = input_tensor1->dims[ii++]; - - ii = 0; - op_param.shape0[0] = input_tensor->dims[ii++]; - op_param.shape0[1] = 
input_tensor->dims[ii++]; - op_param.shape0[2] = input_tensor->dims[ii++]; - op_param.shape0[3] = input_tensor->dims[ii++]; + if (input_tensor1->dim_num == 4) + { + op_param.shape1[0] = input_tensor1->dims[0]; + op_param.shape1[1] = input_tensor1->dims[1]; + op_param.shape1[2] = input_tensor1->dims[2]; + op_param.shape1[3] = input_tensor1->dims[3]; + } + else if (input_tensor1->dim_num == 1) + { + op_param.shape1[0] = 1; + op_param.shape1[1] = input_tensor1->dims[0]; + op_param.shape1[2] = 1; + op_param.shape1[3] = 1; + } + + if (input_tensor->dim_num == 4) + { + op_param.shape0[0] = input_tensor->dims[0]; + op_param.shape0[1] = input_tensor->dims[1]; + op_param.shape0[2] = input_tensor->dims[2]; + op_param.shape0[3] = input_tensor->dims[3]; + } + else if (input_tensor->dim_num == 1) + { + op_param.shape0[0] = 1; + op_param.shape0[1] = input_tensor->dims[0]; + op_param.shape0[2] = 1; + op_param.shape0[3] = 1; + } op_param.layout = input_tensor->layout; op_param.type = param->type; @@ -92,13 +110,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_comparison_ref_op() { diff --git a/source/device/cpu/op/concat/concat_ref.c b/source/device/cpu/op/concat/concat_ref.c index 854f3a8a1..6a7939ac2 100644 --- a/source/device/cpu/op/concat/concat_ref.c +++ b/source/device/cpu/op/concat/concat_ref.c @@ -86,7 +86,8 @@ static struct node_ops hcl_node_ops = { .postrun = NULL, .init_node = init_node, .release_node = release_node, - .score = score}; + .score = score, +}; int register_concat_ref_op() { diff --git 
a/source/device/cpu/op/conv/conv_ref.c b/source/device/cpu/op/conv/conv_ref.c index 8f655f580..ea29309b8 100644 --- a/source/device/cpu/op/conv/conv_ref.c +++ b/source/device/cpu/op/conv/conv_ref.c @@ -199,13 +199,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_conv_ref_op() { diff --git a/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c b/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c index 5958c7c38..f68d5e3d4 100644 --- a/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c +++ b/source/device/cpu/op/conv/cortex-a/conv_hcl_arm.c @@ -468,7 +468,8 @@ static struct node_ops hcl_node_ops = { .postrun = postrun, .init_node = init_node, .release_node = release_node, - .score = score}; + .score = score, +}; int register_conv_hcl_arm_op() { diff --git a/source/device/cpu/op/conv/cortex-m/conv_cmsis.c b/source/device/cpu/op/conv/cortex-m/conv_cmsis.c index f9057f0b6..150878790 100644 --- a/source/device/cpu/op/conv/cortex-m/conv_cmsis.c +++ b/source/device/cpu/op/conv/cortex-m/conv_cmsis.c @@ -134,13 +134,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops cmsis_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops cmsis_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int 
register_conv_cmsis_op() { diff --git a/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c b/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c index 095dc59f8..62d822a14 100644 --- a/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c +++ b/source/device/cpu/op/conv/mips/conv_dw_hcl_mips.c @@ -113,13 +113,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_conv_dw_hcl_mips_op() { diff --git a/source/device/cpu/op/conv/mips/conv_hcl_mips.c b/source/device/cpu/op/conv/mips/conv_hcl_mips.c index baa067b77..34b8619bd 100644 --- a/source/device/cpu/op/conv/mips/conv_hcl_mips.c +++ b/source/device/cpu/op/conv/mips/conv_hcl_mips.c @@ -241,13 +241,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_PREFER; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_conv_hcl_mips_op() { diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c index 338827acd..936f1457f 100644 --- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_hcl_rv64.c @@ -113,20 +113,22 @@ static int score(struct node_ops* node_ops, struct 
exec_graph* exec_graph, struc return 0; if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) - return OPS_SCORE_BEST; + return OPS_SCORE_PREFER; else if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 && dilation_h == 1 && dilation_w == 1 && kernel_h == 5 && kernel_w == 5 && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) - return OPS_SCORE_BEST; + return OPS_SCORE_PREFER; else return 0; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_conv_dw_hcl_rv64_op() { diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_packn_hcl_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_packn_hcl_rv64.c new file mode 100644 index 000000000..398575aa1 --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_packn_hcl_rv64.c @@ -0,0 +1,146 @@ +#include "convolution_param.h" +#include "api/c_api.h" + +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "device/cpu/cpu_graph.h" +#include "device/cpu/cpu_node.h" +#include "device/cpu/cpu_module.h" +#include "utility/sys_port.h" +#include +#include + +extern int conv_dw_packn_kernel_run(const ir_node_t* ir_node, const ir_tensor_t* input_tensor, const ir_tensor_t* filter_tensor, const ir_tensor_t* bias_tensor, ir_tensor_t* output_tensor, const struct conv_priv_info* priv_info, const struct conv_param* params, const int num_thread, const int cpu_affinity); +extern int conv_dw_packn_kernel_prerun(const 
ir_node_t* ir_node, const ir_tensor_t* input_tensor, const ir_tensor_t* filter_tensor, struct conv_priv_info* info, struct conv_param* params); +extern int conv_dw_packn_kernel_postrun(const ir_node_t* ir_node, struct conv_priv_info* info); + +static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + const ir_node_t* ir_node = exec_node->ir_node; + ir_graph_t* ir_graph = ir_node->graph; + const ir_tensor_t* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + const ir_tensor_t* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); + const ir_tensor_t* bias_tensor = NULL; + ir_tensor_t* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); + const int num_thread = exec_graph->num_thread; + const int cpu_affinity = exec_graph->cpu_affinity; + + if (ir_node->input_num > 2) + { + bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); + } + + const struct conv_param* params = (const struct conv_param*)ir_node->op.param_mem; + const struct conv_priv_info* info = (const struct conv_priv_info*)exec_node->ops_priv; + + if (exec_graph->mode != TENGINE_MODE_FP32) + { + return -1; + } + + return conv_dw_packn_kernel_run(ir_node, input_tensor, filter_tensor, bias_tensor, output_tensor, info, params, num_thread, cpu_affinity); +} + +static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct conv_priv_info* info = sys_malloc(sizeof(struct conv_priv_info)); + if (!info) + { + return -1; + } + + memset(info, 0, sizeof(*info)); + exec_node->ops_priv = info; + + return 0; +} + +static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct conv_priv_info* info = exec_node->ops_priv; + sys_free(info); + exec_node->ops_priv = NULL; + return 0; +} + +static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* 
ir_node) +{ + struct conv_param* param = (struct conv_param*)ir_node->op.param_mem; + struct graph* ir_graph = ir_node->graph; + + struct tensor* input_tensor; + struct tensor* output_tensor; + + int group = param->group; + int kernel_h = param->kernel_h; + int kernel_w = param->kernel_w; + int stride_h = param->stride_h; + int stride_w = param->stride_w; + int dilation_h = param->dilation_h; + int dilation_w = param->dilation_w; + int pad_h0 = param->pad_h0; + int pad_w0 = param->pad_w0; + int pad_h1 = param->pad_h1; + int pad_w1 = param->pad_w1; + + input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); + + int in_c = input_tensor->dims[1] / group; + int out_c = output_tensor->dims[1] / group; + int outh = output_tensor->dims[2]; + int outw = output_tensor->dims[3]; + + if (!(input_tensor->data_type == TENGINE_DT_FP32)) + return 0; + + if (kernel_h != kernel_w || input_tensor->dims[0] > 1) + return 0; + + if (param->group > 1 + && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 + && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 + && ((stride_h == 1 && stride_w == 1) || (stride_h == 2 && stride_w == 2))) + return OPS_SCORE_BEST; + else + return 0; +} + +static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + const ir_node_t* ir_node = exec_node->ir_node; + ir_graph_t* ir_graph = ir_node->graph; + const ir_tensor_t* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + const ir_tensor_t* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); + struct conv_priv_info* info = (struct conv_priv_info*)exec_node->ops_priv; + + struct conv_param* params = (struct conv_param*)ir_node->op.param_mem; + return conv_dw_packn_kernel_prerun(ir_node, input_tensor, filter_tensor, info, params); +} + +static int postrun(struct node_ops* node_ops, struct 
exec_node* exec_node, struct exec_graph* exec_graph) +{ + const ir_node_t* ir_node = exec_node->ir_node; + struct conv_priv_info* info = (struct conv_priv_info*)exec_node->ops_priv; + return conv_dw_packn_kernel_postrun(ir_node, info); +} + +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score}; + +int register_conv_dw_packn_hcl_rv64_op() +{ + return register_builtin_node_ops(OP_CONV, &hcl_node_ops); +} + +int unregister_conv_dw_packn_hcl_rv64_op() +{ + return unregister_builtin_node_ops(OP_CONV, &hcl_node_ops); +} diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_packn_kernel_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_packn_kernel_rv64.c new file mode 100644 index 000000000..0d0b83625 --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_dw_packn_kernel_rv64.c @@ -0,0 +1,1747 @@ +#include "api/c_api.h" +#include +#include "graph/graph.h" +#include "graph/node.h" +#include "graph/tensor.h" +#include "device/cpu/cpu_node.h" +#include "device/cpu/cpu_graph.h" +#include "device/cpu/cpu_module.h" +#include "op/conv/risc-v/lp64dv/vsetvl_rvv.h" +#include "utility/sys_port.h" +#include +#include "utility/sys_port.h" +#include "convolution_param.h" + +#define __likely(x) __builtin_expect(!!(x), 1) +#define __unlikely(x) __builtin_expect(!!(x), 0) +#define max(a, b) ((a) > (b) ? (a) : (b)) +#define min(a, b) ((a) < (b) ? 
(a) : (b)) + +// TODO: vectorize +static void pad(const float* input, float* output, const int in_h, const int in_w, const int out_h, const int out_w, const int top, const int left, const float v) +{ + float* ptr = input; + float* outptr = output; + + int y = 0; + // fill top + for (; y < top; y++) + { + int x = 0; + for (; x < out_w; x++) + { + outptr[x] = v; + } + outptr += out_w; + } + // fill center + for (; y < (top + in_h); y++) + { + int x = 0; + for (; x < left; x++) + { + outptr[x] = v; + } + if (in_w < 12) + { + for (; x < (left + in_w); x++) + { + outptr[x] = ptr[x - left]; + } + } + else + { + memcpy(outptr + left, ptr, in_w * sizeof(float)); + x += in_w; + } + for (; x < out_w; x++) + { + outptr[x] = v; + } + ptr += in_w; + outptr += out_w; + } + // fill bottom + for (; y < out_h; y++) + { + int x = 0; + for (; x < out_w; x++) + { + outptr[x] = v; + } + outptr += out_w; + } +} + +static void do_pack(const float* input, float* output, const int channels, const int feat_size, const int packn) +{ + const int channels_packed = (channels + packn - 1) / packn; + const int feat_size_packed = feat_size * packn; + const int input_num = channels * feat_size; + + int in = 0; + + for (int c = 0; c < channels_packed; ++c) + { + for (int i = 0; i < feat_size_packed; i += packn) + { + float* output_base = output + c * feat_size_packed + i; + for (int k = 0; k < packn; ++k) + { + in = c * feat_size_packed + i / packn + k * feat_size; + if (__likely(in < input_num)) + { + output_base[k] = input[in]; + } + else + { + output_base[k] = .0f; + } + } + } + } +} + +// channels: packed_channels, feat_size: packed_feat_size +static void do_unpack(const float* packed, float* unpacked, const int packed_channels, const int packed_feat_size, const int unpacked_channels, const int packn) +{ + const int feat_size = packed_feat_size / packn; + const int unpacked_num = unpacked_channels * packed_feat_size / packn; + + for (int c = 0; c < packed_channels; ++c) + { + for (int i = 0; i < 
packed_feat_size; i += packn) + { + const float* packed_base = packed + c * packed_feat_size + i; + for (int k = 0; k < packn; ++k) + { + int out = c * packed_feat_size + i / packn + k * feat_size; + if (__likely(out < unpacked_num)) + { + unpacked[out] = packed_base[k]; + } + } + } + } +} + +int conv_dw_packn_kernel_prerun(const ir_node_t* ir_node, const ir_tensor_t* input_tensor, const ir_tensor_t* filter_tensor, struct conv_priv_info* info, struct conv_param* params) +{ + const int inb = input_tensor->dims[0]; + const int inc = input_tensor->dims[1]; + const int inh = input_tensor->dims[2]; + const int inw = input_tensor->dims[3]; + + const int pad_w = params->pad_w0; + const int pad_h = params->pad_h0; + const int inh_pad = inh + pad_h + pad_h; + const int inw_pad = inw + pad_w + pad_w; + + if (inh_pad == inh && inw_pad == inw) + { + return 0; + } + + if (!info->input_pad) + { + info->input_pad = sys_malloc(inb * inh_pad * inw_pad * inc * sizeof(float)); + } + + return 0; +} + +int conv_dw_packn_kernel_postrun(const ir_node_t* ir_node, struct conv_priv_info* info) +{ + if (info->input_pad) + { + sys_free(info->input_pad); + } + + return 0; +} + +void convdw3x3s1_pack8_rvv(const float* input, const float* kernel, const float* bias, float* output, const int inc, const int inh, const int inw, const int outc, const int outh, const int outw, const int act, const struct conv_param* params, int num_thread) +{ + const int packn = 8; + vsetvl_e32_m2(); + +#pragma omp parallel for num_threads(num_thread) + for (int c = 0; c < inc; ++c) + { + const float* feat_map = input + c * inh * inw; + const float* kernel_base = kernel + c * 9; + const float* bias_base = bias ? 
bias + c : NULL; + + __asm__( + "vle32.v v18, (%0);\n" + + "vrgather.vi v0, v18, 0;\n" + "vrgather.vi v2, v18, 1;\n" + "vrgather.vi v4, v18, 2;\n" + "vrgather.vi v6, v18, 3;\n" + "vrgather.vi v8, v18, 4;\n" + "vrgather.vi v10, v18, 5;\n" + "vrgather.vi v12, v18, 6;\n" + "vrgather.vi v14, v18, 7;\n" + + "lw t0, 32(%0);" + "vmv.v.x v16, t0;\n" + : + : "r"(kernel_base) + : "t0"); + + float* output_base = output + c * outw * outh; + + int h = 0; + for (; h < (outh & -2); h += 2) + { + const float* row0 = feat_map + h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + const float* row3 = row2 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v18, t0;\n" + "vmv.v.x v20, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v18, x0;\n" + "vmv.v.x v20, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "vle32.v v22, (%1);\n" + "addi t0, %1, 4;\n" + "vle32.v v24, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv v18, v0, v22;\n" + "vfmacc.vv v18, v2, v24;\n" + "vfmacc.vv v18, v4, v26;\n" + + "vle32.v v22, (%2);\n" + "addi t0, %2, 4;\n" + "vle32.v v24, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv v18, v6, v22;\n" + "vfmacc.vv v18, v8, v24;\n" + "vfmacc.vv v18, v10, v26;\n" + + "vfmacc.vv v20, v0, v22;\n" + "vfmacc.vv v20, v2, v24;\n" + "vfmacc.vv v20, v4, v26;\n" + + "vle32.v v22, (%3);\n" + "addi t0, %3, 4;\n" + "vle32.v v24, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv v18, v12, v22;\n" + "vfmacc.vv v18, v14, v24;\n" + "vfmacc.vv v18, v16, v26;\n" + + "vfmacc.vv v20, v6, v22;\n" + "vfmacc.vv v20, v8, v24;\n" + "vfmacc.vv v20, v10, v26;\n" + + "vle32.v v22, (%4);\n" + "addi t0, %4, 4;\n" + "vle32.v v24, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv v20, v12, v22;\n" + "vfmacc.vv v20, v14, v24;\n" + "vfmacc.vv 
v20, v16, v26;\n" + : + : "r"(output_base), "r"(row0), "r"(row1), "r"(row2), "r"(row3) + : "t0"); + + if (act == 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vfmax.vv v18, v18, v22;\n" + "vfmax.vv v20, v20, v22;\n"); + } + else if (act > 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vmv.v.x v24, %0;\n" + "vfmax.vv v18, v18, v22;\n" + "vfmin.vv v18, v18, v24;\n" + "vfmax.vv v20, v20, v22;\n" + "vfmin.vv v20, v20, v24;\n" + : + : "r"(act)); + } + + __asm__("vse32.v v18, (%0);\n" ::"r"(output_base)); + __asm__("vse32.v v20, (%0);\n" ::"r"(output_base + outw)); + + row0 += packn; + row1 += packn; + row2 += packn; + row3 += packn; + output_base += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + float bias_value = bias_base ? 
bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + const float i30 = row3[0]; + const float i31 = row3[1]; + const float i32 = row3[2]; + + float out1 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + float out2 = (k00 * i10 + k01 * i11 + k02 * i12 + k10 * i20 + k11 * i21 + k12 * i22 + k20 * i30 + k21 * i31 + k22 * i32 + bias_value); + + if (act >= 0) + { + out1 = max(out1, .0f); + out2 = max(out2, .0f); + if (act > 0) + { + out1 = min(out1, (float)act); + out2 = min(out2, (float)act); + } + } + + *output_base = out1; + *(output_base + outw) = out2; + + output_base += 1; + row0 += 1; + row1 += 1; + row2 += 1; + row3 += 1; + } + + output_base += outw; + } + + for (; h < outh; ++h) + { + const float* row0 = feat_map + h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v18, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v18, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "vle32.v v22, (%0);\n" + "addi t0, %0, 4;\n" + "vle32.v v24, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv v18, v0, v22;\n" + "vfmacc.vv v18, v2, v24;\n" + "vfmacc.vv v18, v4, v26;\n" + + "vle32.v v22, (%1);\n" + "addi t0, %1, 4;\n" + "vle32.v v24, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv v18, v6, v22;\n" + "vfmacc.vv v18, v8, v24;\n" + "vfmacc.vv v18, v10, v26;\n" + + "vle32.v v22, (%2);\n" + "addi t0, %2, 4;\n" + "vle32.v v24, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv 
v18, v12, v22;\n" + "vfmacc.vv v18, v14, v24;\n" + "vfmacc.vv v18, v16, v26;\n" + : + : "r"(row0), "r"(row1), "r"(row2) + : "t0"); + + if (act == 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vfmax.vv v18, v18, v22;\n"); + } + else if (act > 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vmv.v.x v24, %0;\n" + "vfmax.vv v18, v18, v22;\n" + "vfmin.vv v18, v18, v24;\n" + : + : "r"(act)); + } + + __asm__("vse32.v v18, (%0);\n" ::"r"(output_base)); + + row0 += packn; + row1 += packn; + row2 += packn; + output_base += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + const float bias_value = bias_base ? bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + + float out1 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + + if (act >= 0) + { + out1 = max(out1, .0f); + if (act > 0) + { + out1 = min(out1, (float)act); + } + } + + *output_base = out1; + + output_base += 1; + row0 += 1; + row1 += 1; + row2 += 1; + } + + output_base += outw; + } + } +} + +void convdw3x3s1_pack4_rvv(const float* input, const float* kernel, const float* bias, float* output, const int inc, const int inh, const int inw, const int outc, const int outh, const int outw, const int act, const struct conv_param* params, int num_thread) +{ + const int packn = 4; + vsetvl_e32_m1(); + +#pragma omp parallel for num_threads(num_thread) + for (int c = 0; c < inc; ++c) + { + const float* feat_map = input + c * 
inh * inw; + const float* kernel_base = kernel + c * 9; + const float* bias_base = bias ? bias + c : NULL; + + __asm__( + "vle32.v v9, (%0);\n" + "addi t0, %0, 16;\n" + "vle32.v v10, (t0);\n" + + "vrgather.vi v0, v9, 0;\n" + "vrgather.vi v1, v9, 1;\n" + "vrgather.vi v2, v9, 2;\n" + "vrgather.vi v3, v9, 3;\n" + "vrgather.vi v4, v10, 0;\n" + "vrgather.vi v5, v10, 1;\n" + "vrgather.vi v6, v10, 2;\n" + "vrgather.vi v7, v10, 3;\n" + + "lw t0, 32(%0);" + "vmv.v.x v8, t0;\n" + : + : "r"(kernel_base) + : "t0"); + + float* out0 = output + c * outw * outh; + float* out1 = out0 + outw; + float* out2 = out1 + outw; + float* out3 = out2 + outw; + + int h = 0; + for (; h < (outh & -4); h += 4) + { + const float* row0 = feat_map + h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + const float* row3 = row2 + inw; + const float* row4 = row3 + inw; + const float* row5 = row4 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v28, t0;\n" + "vmv.v.x v29, t0;\n" + "vmv.v.x v30, t0;\n" + "vmv.v.x v31, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v28, x0;\n" + "vmv.v.x v29, x0;\n" + "vmv.v.x v30, x0;\n" + "vmv.v.x v31, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "vle32.v v9, (%0);\n" + "addi t0, %0, 4;\n" + "vle32.v v10, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v11, (t0);\n" + + "vfmacc.vv v28, v0, v9;\n" + "vfmacc.vv v28, v1, v10;\n" + "vfmacc.vv v28, v2, v11;\n" + + "vle32.v v12, (%1);\n" + "addi t0, %1, 4;\n" + "vle32.v v13, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v14, (t0);\n" + + "vfmacc.vv v28, v3, v12;\n" + "vfmacc.vv v28, v4, v13;\n" + "vfmacc.vv v28, v5, v14;\n" + + "vfmacc.vv v29, v0, v12;\n" + "vfmacc.vv v29, v1, v13;\n" + "vfmacc.vv v29, v2, v14;\n" + + "vle32.v v15, (%2);\n" + "addi t0, %2, 4;\n" + "vle32.v v16, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v17, (t0);\n" + + "vfmacc.vv v28, v6, 
v15;\n" + "vfmacc.vv v28, v7, v16;\n" + "vfmacc.vv v28, v8, v17;\n" + + "vfmacc.vv v29, v3, v15;\n" + "vfmacc.vv v29, v4, v16;\n" + "vfmacc.vv v29, v5, v17;\n" + + "vfmacc.vv v30, v0, v15;\n" + "vfmacc.vv v30, v1, v16;\n" + "vfmacc.vv v30, v2, v17;\n" + + "vle32.v v18, (%3);\n" + "addi t0, %3, 4;\n" + "vle32.v v19, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v20, (t0);\n" + + "vfmacc.vv v29, v6, v18;\n" + "vfmacc.vv v29, v7, v19;\n" + "vfmacc.vv v29, v8, v20;\n" + + "vfmacc.vv v30, v3, v18;\n" + "vfmacc.vv v30, v4, v19;\n" + "vfmacc.vv v30, v5, v20;\n" + + "vfmacc.vv v31, v0, v18;\n" + "vfmacc.vv v31, v1, v19;\n" + "vfmacc.vv v31, v2, v20;\n" + + "vle32.v v21, (%4);\n" + "addi t0, %4, 4;\n" + "vle32.v v22, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v23, (t0);\n" + + "vfmacc.vv v30, v6, v21;\n" + "vfmacc.vv v30, v7, v22;\n" + "vfmacc.vv v30, v8, v23;\n" + + "vfmacc.vv v31, v3, v21;\n" + "vfmacc.vv v31, v4, v22;\n" + "vfmacc.vv v31, v5, v23;\n" + + "vle32.v v24, (%5);\n" + "addi t0, %5, 4;\n" + "vle32.v v25, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v26, (t0);\n" + + "vfmacc.vv v31, v6, v24;\n" + "vfmacc.vv v31, v7, v25;\n" + "vfmacc.vv v31, v8, v26;\n" + : + : "r"(row0), "r"(row1), "r"(row2), "r"(row3), "r"(row4), "r"(row5) + : "t0"); + + if (act == 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vfmax.vv v28, v28, v22;\n" + "vfmax.vv v29, v29, v22;\n" + "vfmax.vv v30, v30, v22;\n" + "vfmax.vv v31, v31, v22;\n"); + } + else if (act > 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vmv.v.x v23, %0;\n" + "vfmax.vv v28, v28, v22;\n" + "vfmin.vv v28, v28, v23;\n" + "vfmax.vv v29, v29, v22;\n" + "vfmin.vv v29, v29, v23;\n" + "vfmax.vv v30, v30, v22;\n" + "vfmin.vv v30, v30, v23;\n" + "vfmax.vv v31, v31, v22;\n" + "vfmin.vv v31, v31, v23;\n" + : + : "r"(act)); + } + + __asm__("vse32.v v28, (%0);\n" + "vse32.v v29, (%1);\n" + "vse32.v v30, (%2);\n" + "vse32.v v31, (%3);\n" + : + : "r"(out0), "r"(out1), "r"(out2), "r"(out3)); + + row0 += packn; + row1 += packn; + row2 += packn; + row3 += 
packn; + row4 += packn; + row5 += packn; + + out0 += packn; + out1 += packn; + out2 += packn; + out3 += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + const float bias_value = bias_base ? bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + + const float i30 = row3[0]; + const float i31 = row3[1]; + const float i32 = row3[2]; + + const float i40 = row4[0]; + const float i41 = row4[1]; + const float i42 = row4[2]; + + const float i50 = row5[0]; + const float i51 = row5[1]; + const float i52 = row5[2]; + + float v0 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + float v1 = (k00 * i10 + k01 * i11 + k02 * i12 + k10 * i20 + k11 * i21 + k12 * i22 + k20 * i30 + k21 * i31 + k22 * i32 + bias_value); + float v2 = (k00 * i20 + k01 * i21 + k02 * i22 + k10 * i30 + k11 * i31 + k12 * i32 + k20 * i40 + k21 * i41 + k22 * i42 + bias_value); + float v3 = (k00 * i30 + k01 * i31 + k02 * i32 + k10 * i40 + k11 * i41 + k12 * i42 + k20 * i50 + k21 * i51 + k22 * i52 + bias_value); + + if (act >= 0) + { + v0 = max(v0, .0f); + v1 = max(v1, .0f); + v2 = max(v2, .0f); + v3 = max(v3, .0f); + + if (act > 0) + { + v0 = min(v0, (float)act); + v1 = min(v1, (float)act); + v2 = min(v2, (float)act); + v3 = min(v3, (float)act); + } + } + + *out0 = v0; + *out1 = v1; + *out2 = v2; + *out3 = v3; + + out0 += 1; + out1 += 1; + out2 += 1; + out3 += 1; + + row0 += 1; + row1 += 1; + row2 
+= 1; + row3 += 1; + row4 += 1; + row5 += 1; + } + + out0 += 3 * outw; + out1 += 3 * outw; + out2 += 3 * outw; + out3 += 3 * outw; + } + + for (; h < outh; ++h) + { + const float* row0 = feat_map + h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v28, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v28, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "vle32.v v9, (%0);\n" + "addi t0, %0, 4;\n" + "vle32.v v10, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v11, (t0);\n" + + "vfmacc.vv v28, v0, v9;\n" + "vfmacc.vv v28, v1, v10;\n" + "vfmacc.vv v28, v2, v11;\n" + + "vle32.v v9, (%1);\n" + "addi t0, %1, 4;\n" + "vle32.v v10, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v11, (t0);\n" + + "vfmacc.vv v28, v3, v9;\n" + "vfmacc.vv v28, v4, v10;\n" + "vfmacc.vv v28, v5, v11;\n" + + "vle32.v v9, (%2);\n" + "addi t0, %2, 4;\n" + "vle32.v v10, (t0);\n" + "addi t0, t0, 4;\n" + "vle32.v v11, (t0);\n" + + "vfmacc.vv v28, v6, v9;\n" + "vfmacc.vv v28, v7, v10;\n" + "vfmacc.vv v28, v8, v11;\n" + : + : "r"(row0), "r"(row1), "r"(row2) + : "t0"); + + if (act == 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vfmax.vv v28, v28, v22;\n"); + } + else if (act > 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vmv.v.x v23, %0;\n" + "vfmax.vv v28, v28, v22;\n" + "vfmin.vv v28, v28, v23;\n" + : + : "r"(act)); + } + + __asm__("vse32.v v28, (%0);\n" + : + : "r"(out0)); + + row0 += packn; + row1 += packn; + row2 += packn; + + out0 += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + const float bias_value = bias_base 
? bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + + float v0 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + + if (act >= 0) + { + v0 = max(v0, .0f); + + if (act > 0) + { + v0 = min(v0, (float)act); + } + } + + *out0 = v0; + out0 += 1; + + row0 += 1; + row1 += 1; + row2 += 1; + } + } + } +} + +void convdw3x3s2_pack4_rvv(const float* input, const float* kernel, const float* bias, float* output, const int inc, const int inh, const int inw, const int outc, const int outh, const int outw, const int act, const struct conv_param* params, int num_thread) +{ + const int packn = 4; + vsetvl_e32_m1(); + +#pragma omp parallel for num_threads(num_thread) + for (int c = 0; c < inc; ++c) + { + const float* feat_map = input + c * inh * inw; + const float* kernel_base = kernel + c * 9; + const float* bias_base = bias ? 
bias + c : NULL; + __asm__( + "vle32.v v9, (%0);\n" + "addi t0, %0, 16;\n" + "vle32.v v10, (t0);\n" + + "vrgather.vi v0, v9, 0;\n" + "vrgather.vi v1, v9, 1;\n" + "vrgather.vi v2, v9, 2;\n" + "vrgather.vi v3, v9, 3;\n" + "vrgather.vi v4, v10, 0;\n" + "vrgather.vi v5, v10, 1;\n" + "vrgather.vi v6, v10, 2;\n" + "vrgather.vi v7, v10, 3;\n" + + "lw t0, 32(%0);" + "vmv.v.x v8, t0;\n" + : + : "r"(kernel_base) + : "t0"); + + float* out0 = output + c * outw * outh; + float* out1 = out0 + outw; + float* out2 = out1 + outw; + float* out3 = out2 + outw; + + int h = 0; + for (; h < (outh & -4); h += 4) + { + const float* row0 = feat_map + 2 * h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + const float* row3 = row2 + inw; + const float* row4 = row3 + inw; + const float* row5 = row4 + inw; + const float* row6 = row5 + inw; + const float* row7 = row6 + inw; + const float* row8 = row7 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v28, t0;\n" + "vmv.v.x v29, t0;\n" + "vmv.v.x v30, t0;\n" + "vmv.v.x v31, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v28, x0;\n" + "vmv.v.x v29, x0;\n" + "vmv.v.x v30, x0;\n" + "vmv.v.x v31, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "li t1, 8;\n" + "vlse32.v v9, (%0), t1;\n" + "addi t0, %0, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v28, v0, v9;\n" + "vfmacc.vv v28, v1, v10;\n" + "vfmacc.vv v28, v2, v11;\n" + + "vlse32.v v9, (%1), t1;\n" + "addi t0, %1, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v28, v3, v9;\n" + "vfmacc.vv v28, v4, v10;\n" + "vfmacc.vv v28, v5, v11;\n" + + "vlse32.v v9, (%2), t1;\n" + "addi t0, %2, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v28, v6, v9;\n" + "vfmacc.vv 
v28, v7, v10;\n" + "vfmacc.vv v28, v8, v11;\n" + + "vfmacc.vv v29, v0, v9;\n" + "vfmacc.vv v29, v1, v10;\n" + "vfmacc.vv v29, v2, v11;\n" + + "vlse32.v v9, (%3), t1;\n" + "addi t0, %3, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v29, v3, v9;\n" + "vfmacc.vv v29, v4, v10;\n" + "vfmacc.vv v29, v5, v11;\n" + + "vlse32.v v9, (%4), t1;\n" + "addi t0, %4, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v29, v6, v9;\n" + "vfmacc.vv v29, v7, v10;\n" + "vfmacc.vv v29, v8, v11;\n" + + "vfmacc.vv v30, v0, v9;\n" + "vfmacc.vv v30, v1, v10;\n" + "vfmacc.vv v30, v2, v11;\n" + + "vlse32.v v9, (%5), t1;\n" + "addi t0, %5, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v30, v3, v9;\n" + "vfmacc.vv v30, v4, v10;\n" + "vfmacc.vv v30, v5, v11;\n" + + "vlse32.v v9, (%6), t1;\n" + "addi t0, %6, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v30, v6, v9;\n" + "vfmacc.vv v30, v7, v10;\n" + "vfmacc.vv v30, v8, v11;\n" + + "vfmacc.vv v31, v0, v9;\n" + "vfmacc.vv v31, v1, v10;\n" + "vfmacc.vv v31, v2, v11;\n" + + "vlse32.v v9, (%7), t1;\n" + "addi t0, %7, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v31, v3, v9;\n" + "vfmacc.vv v31, v4, v10;\n" + "vfmacc.vv v31, v5, v11;\n" + + "vlse32.v v9, (%8), t1;\n" + "addi t0, %8, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v31, v6, v9;\n" + "vfmacc.vv v31, v7, v10;\n" + "vfmacc.vv v31, v8, v11;\n" + : + : "r"(row0), "r"(row1), "r"(row2), "r"(row3), "r"(row4), "r"(row5), "r"(row6), "r"(row7), "r"(row8) + : "t0", "t1"); + + if (act == 0) + { + __asm__("vmv.v.x v27, x0;\n" + "vfmax.vv v28, v28, v27;\n" + "vfmax.vv v29, v29, v27;\n" + "vfmax.vv v30, v30, v27;\n" + "vfmax.vv v31, v31, v27;\n"); + } + else if (act > 0) + 
{ + __asm__("vmv.v.x v26, x0;\n" + "vmv.v.x v27, %0;\n" + "vfmax.vv v28, v28, v26;\n" + "vfmin.vv v28, v28, v27;\n" + "vfmax.vv v29, v29, v26;\n" + "vfmin.vv v29, v29, v27;\n" + "vfmax.vv v30, v30, v26;\n" + "vfmin.vv v30, v30, v27;\n" + "vfmax.vv v31, v31, v26;\n" + "vfmin.vv v31, v31, v27;\n" + : + : "r"(act)); + } + + __asm__( + "vse32.v v28, (%0);\n" + "vse32.v v29, (%1);\n" + "vse32.v v30, (%2);\n" + "vse32.v v31, (%3);\n" + : + : "r"(out0), "r"(out1), "r"(out2), "r"(out3)); + + row0 += 2 * packn; + row1 += 2 * packn; + row2 += 2 * packn; + row3 += 2 * packn; + row4 += 2 * packn; + row5 += 2 * packn; + row6 += 2 * packn; + row7 += 2 * packn; + row8 += 2 * packn; + out0 += packn; + out1 += packn; + out2 += packn; + out3 += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + const float bias_value = bias_base ? 
bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + const float i30 = row3[0]; + const float i31 = row3[1]; + const float i32 = row3[2]; + const float i40 = row4[0]; + const float i41 = row4[1]; + const float i42 = row4[2]; + const float i50 = row5[0]; + const float i51 = row5[1]; + const float i52 = row5[2]; + const float i60 = row6[0]; + const float i61 = row6[1]; + const float i62 = row6[2]; + const float i70 = row7[0]; + const float i71 = row7[1]; + const float i72 = row7[2]; + const float i80 = row8[0]; + const float i81 = row8[1]; + const float i82 = row8[2]; + + float v0 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + float v1 = (k00 * i20 + k01 * i21 + k02 * i22 + k10 * i30 + k11 * i31 + k12 * i32 + k20 * i40 + k21 * i41 + k22 * i42 + bias_value); + float v2 = (k00 * i40 + k01 * i41 + k02 * i42 + k10 * i50 + k11 * i51 + k12 * i52 + k20 * i60 + k21 * i61 + k22 * i62 + bias_value); + float v3 = (k00 * i60 + k01 * i61 + k02 * i62 + k10 * i70 + k11 * i71 + k12 * i72 + k20 * i80 + k21 * i81 + k22 * i82 + bias_value); + + if (act >= 0) + { + v0 = max(v0, .0f); + v1 = max(v1, .0f); + v2 = max(v2, .0f); + v3 = max(v3, .0f); + if (act > 0) + { + v0 = min(v0, (float)act); + v1 = min(v1, (float)act); + v2 = min(v2, (float)act); + v3 = min(v3, (float)act); + } + } + + *out0 = v0; + *out1 = v1; + *out2 = v2; + *out3 = v3; + + out0 += 1; + out1 += 1; + out2 += 1; + out3 += 1; + + row0 += 2; + row1 += 2; + row2 += 2; + row3 += 2; + row4 += 2; + row5 += 2; + row6 += 2; + row7 += 2; + row8 += 2; + } + + out0 += 3 * outw; + out1 += 3 * outw; + out2 += 3 * outw; + out3 += 3 * outw; + } + + for (; h < outh; ++h) + { + const float* row0 = 
feat_map + 2 * h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v28, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v28, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "li t1, 8;\n" + "vlse32.v v9, (%0), t1;\n" + "addi t0, %0, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v28, v0, v9;\n" + "vfmacc.vv v28, v1, v10;\n" + "vfmacc.vv v28, v2, v11;\n" + + "vlse32.v v9, (%1), t1;\n" + "addi t0, %1, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v28, v3, v9;\n" + "vfmacc.vv v28, v4, v10;\n" + "vfmacc.vv v28, v5, v11;\n" + + "vlse32.v v9, (%2), t1;\n" + "addi t0, %2, 4;\n" + "vlse32.v v10, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v11, (t0), t1;\n" + + "vfmacc.vv v28, v6, v9;\n" + "vfmacc.vv v28, v7, v10;\n" + "vfmacc.vv v28, v8, v11;\n" + : + : "r"(row0), "r"(row1), "r"(row2) + : "t0", "t1"); + + if (act == 0) + { + __asm__("vmv.v.x v27, x0;\n" + "vfmax.vv v28, v28, v27;\n"); + } + else if (act > 0) + { + __asm__("vmv.v.x v26, x0;\n" + "vmv.v.x v27, %0;\n" + "vfmax.vv v28, v28, v26;\n" + "vfmin.vv v28, v28, v27;\n" + : + : "r"(act)); + } + + __asm__( + "vse32.v v28, (%0);\n" + : + : "r"(out0)); + + row0 += 2 * packn; + row1 += 2 * packn; + row2 += 2 * packn; + out0 += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + const float bias_value = bias_base ? 
bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + + float v0 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + + if (act >= 0) + { + v0 = max(v0, .0f); + if (act > 0) + { + v0 = min(v0, (float)act); + } + } + + *out0 = v0; + + out0 += 1; + row0 += 2; + row1 += 2; + row2 += 2; + } + } + } +} + +void convdw3x3s2_pack8_rvv(const float* input, const float* kernel, const float* bias, float* output, const int inc, const int inh, const int inw, const int outc, const int outh, const int outw, const int act, const struct conv_param* params, int num_thread) +{ + const int packn = 8; + + vsetvl_e32_m2(); +#pragma omp parallel for num_threads(num_thread) + for (int c = 0; c < inc; ++c) + { + const float* feat_map = input + c * inh * inw; + const float* kernel_base = kernel + c * 9; + const float* bias_base = bias ? 
bias + c : NULL; + + __asm__( + "vle32.v v18, (%0);\n" + + "vrgather.vi v0, v18, 0;\n" + "vrgather.vi v2, v18, 1;\n" + "vrgather.vi v4, v18, 2;\n" + "vrgather.vi v6, v18, 3;\n" + "vrgather.vi v8, v18, 4;\n" + "vrgather.vi v10, v18, 5;\n" + "vrgather.vi v12, v18, 6;\n" + "vrgather.vi v14, v18, 7;\n" + + "lw t0, 32(%0);" + "vmv.v.x v16, t0;\n" + : + : "r"(kernel_base)); + + float* output_base = output + c * outw * outh; + + int h = 0; + for (; h < (outh & -2); h += 2) + { + const float* row0 = feat_map + 2 * h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + const float* row3 = row2 + inw; + const float* row4 = row3 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v18, t0;\n" + "vmv.v.x v20, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v18, x0;\n" + "vmv.v.x v20, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "li t1, 8;\n" + "vlse32.v v22, (%1), t1;\n" + "addi t0, %1, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v18, v0, v22;\n" + "vfmacc.vv v18, v2, v24;\n" + "vfmacc.vv v18, v4, v26;\n" + + "vlse32.v v22, (%2), t1;\n" + "addi t0, %2, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v18, v6, v22;\n" + "vfmacc.vv v18, v8, v24;\n" + "vfmacc.vv v18, v10, v26;\n" + + "vlse32.v v22, (%3), t1;\n" + "addi t0, %3, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v18, v12, v22;\n" + "vfmacc.vv v18, v14, v24;\n" + "vfmacc.vv v18, v16, v26;\n" + + "vfmacc.vv v20, v0, v22;\n" + "vfmacc.vv v20, v2, v24;\n" + "vfmacc.vv v20, v4, v26;\n" + + "vlse32.v v22, (%4), t1;\n" + "addi t0, %4, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v20, v6, v22;\n" + "vfmacc.vv v20, v8, v24;\n" + 
"vfmacc.vv v20, v10, v26;\n" + + "vlse32.v v22, (%5), t1;\n" + "addi t0, %5, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v20, v12, v22;\n" + "vfmacc.vv v20, v14, v24;\n" + "vfmacc.vv v20, v16, v26;\n" + : + : "r"(output_base), "r"(row0), "r"(row1), "r"(row2), "r"(row3), "r"(row4) + : "t0", "t1"); + + if (act == 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vfmax.vv v18, v18, v22;\n" + "vfmax.vv v20, v20, v22;\n"); + } + else if (act > 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vmv.v.x v24, %0;\n" + "vfmax.vv v18, v18, v22;\n" + "vfmin.vv v18, v18, v24;\n" + "vfmax.vv v20, v20, v22;\n" + "vfmin.vv v20, v20, v24;\n" + : + : "r"(act)); + } + + __asm__("vse32.v v18, (%0);\n" ::"r"(output_base)); + __asm__("vse32.v v20, (%0);\n" ::"r"(output_base + outw)); + + row0 += 2 * packn; + row1 += 2 * packn; + row2 += 2 * packn; + row3 += 2 * packn; + row4 += 2 * packn; + output_base += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + const float bias_value = bias_base ? 
bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + const float i30 = row3[0]; + const float i31 = row3[1]; + const float i32 = row3[2]; + const float i40 = row4[0]; + const float i41 = row4[1]; + const float i42 = row4[2]; + + float out1 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + float out2 = (k00 * i20 + k01 * i21 + k02 * i22 + k10 * i30 + k11 * i31 + k12 * i32 + k20 * i40 + k21 * i41 + k22 * i42 + bias_value); + + if (act >= 0) + { + out1 = max(out1, .0f); + out2 = max(out2, .0f); + if (act > 0) + { + out1 = min(out1, (float)act); + out2 = min(out2, (float)act); + } + } + + *output_base = out1; + *(output_base + outw) = out2; + + output_base += 1; + row0 += 2; + row1 += 2; + row2 += 2; + row3 += 2; + row4 += 2; + } + + output_base += outw; + } + + for (; h < outh; ++h) + { + const float* row0 = feat_map + 2 * h * inw; + const float* row1 = row0 + inw; + const float* row2 = row1 + inw; + + int w = 0; + for (; w < (outw & -packn); w += packn) + { + // bias = v18 + if (bias_base) + { + __asm__("lw t0, (%0)\n" + "vmv.v.x v18, t0;\n" + : + : "r"(bias_base) + : "t0"); + } + else + { + __asm__("vmv.v.x v18, x0;\n"); + } + + // r00, r01, r02, ..., r22 = v9, v10, v11, ...v17 + __asm__( + "li t1, 8;\n" + "vlse32.v v22, (%0), t1;\n" + "addi t0, %0, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v18, v0, v22;\n" + "vfmacc.vv v18, v2, v24;\n" + "vfmacc.vv v18, v4, v26;\n" + + "vlse32.v v22, (%1), t1;\n" + "addi t0, %1, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v18, v6, v22;\n" + "vfmacc.vv v18, v8, v24;\n" + "vfmacc.vv 
v18, v10, v26;\n" + + "vlse32.v v22, (%2), t1;\n" + "addi t0, %2, 4;\n" + "vlse32.v v24, (t0), t1;\n" + "addi t0, t0, 4;\n" + "vlse32.v v26, (t0), t1;\n" + + "vfmacc.vv v18, v12, v22;\n" + "vfmacc.vv v18, v14, v24;\n" + "vfmacc.vv v18, v16, v26;\n" + : + : "r"(row0), "r"(row1), "r"(row2) + : "t0", "t1"); + + if (act == 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vfmax.vv v18, v18, v22;\n"); + } + else if (act > 0) + { + __asm__("vmv.v.x v22, x0;\n" + "vfmax.vv v18, v18, v22;\n" + "vfmin.vv v18, v18, v24;\n" + : + : "r"(act)); + } + + __asm__("vse32.v v18, (%0);\n" ::"r"(output_base)); + + row0 += 2 * packn; + row1 += 2 * packn; + row2 += 2 * packn; + output_base += packn; + } + + const float k00 = kernel_base[0]; + const float k01 = kernel_base[1]; + const float k02 = kernel_base[2]; + const float k10 = kernel_base[3]; + const float k11 = kernel_base[4]; + const float k12 = kernel_base[5]; + const float k20 = kernel_base[6]; + const float k21 = kernel_base[7]; + const float k22 = kernel_base[8]; + const float bias_value = bias_base ? 
bias_base[0] : .0f; + + for (; w < outw; ++w) + { + const float i00 = row0[0]; + const float i01 = row0[1]; + const float i02 = row0[2]; + const float i10 = row1[0]; + const float i11 = row1[1]; + const float i12 = row1[2]; + const float i20 = row2[0]; + const float i21 = row2[1]; + const float i22 = row2[2]; + + float out1 = (k00 * i00 + k01 * i01 + k02 * i02 + k10 * i10 + k11 * i11 + k12 * i12 + k20 * i20 + k21 * i21 + k22 * i22 + bias_value); + + if (act >= 0) + { + out1 = max(out1, .0f); + if (act > 0) + { + out1 = min(out1, (float)act); + } + } + + *output_base = out1; + + output_base += 1; + row0 += 2; + row1 += 2; + row2 += 2; + } + output_base += outw; + } + } +} + +int conv_dw_packn_kernel_run(const ir_node_t* ir_node, const ir_tensor_t* input_tensor, const ir_tensor_t* filter_tensor, const ir_tensor_t* bias_tensor, ir_tensor_t* output_tensor, const struct conv_priv_info* priv_info, const struct conv_param* params, const int num_thread, const int cpu_affinity) +{ + float* input = (float*)input_tensor->data; + float* output = (float*)output_tensor->data; + const float* kernel = filter_tensor->data; + const float* bias = bias_tensor ? 
bias_tensor->data : NULL; + + const int inb = input_tensor->dims[0]; + const int inc = input_tensor->dims[1]; + const int inh = input_tensor->dims[2]; + const int inw = input_tensor->dims[3]; + + const int outb = output_tensor->dims[0]; + const int outc = output_tensor->dims[1]; + const int outh = output_tensor->dims[2]; + const int outw = output_tensor->dims[3]; + + const int ksize_h = params->kernel_h; + const int ksize_w = params->kernel_w; + const int pad_w = params->pad_w0; + const int pad_h = params->pad_h0; + const int stride_w = params->stride_w; + const int stride_h = params->stride_h; + + const int dilation_w = params->dilation_w; + const int dilation_h = params->dilation_h; + const int group = params->group; + const int act = params->activation; + + int inh_pad = inh + pad_h + pad_h; + int inw_pad = inw + pad_w + pad_w; + float* input_pad = NULL; + + if (inh_pad == inh && inw_pad == inw) + { + input_pad = input; + } + else + { + input_pad = priv_info->input_pad; + for (int b = 0; b < inb; ++b) + { + const float* input_batch_base = input + b * inc * inh * inw; + float* input_batch_padded_base = input_pad + b * inc * inh_pad * inw_pad; +#pragma omp parallel for num_threads(num_thread) + for (int g = 0; g < group; ++g) + { + const float* pad_in = input_batch_base + g * inh * inw; + float* pad_out = input_batch_padded_base + g * inh_pad * inw_pad; + pad(pad_in, pad_out, inh, inw, inh_pad, inw_pad, pad_h, pad_w, .0f); + } + } + } + + for (int b = 0; b < inb; ++b) + { + const float* input_batch_base = input_pad + b * inc * inh_pad * inw_pad; + float* output_batch_base = output + b * outc * outh * outw; + if (stride_h == 1) + { + convdw3x3s1_pack4_rvv(input_batch_base, kernel, bias, output_batch_base, inc, inh_pad, inw_pad, outc, outh, outw, act, params, num_thread); + } + else + { + convdw3x3s2_pack8_rvv(input_batch_base, kernel, bias, output_batch_base, inc, inh_pad, inw_pad, outc, outh, outw, act, params, num_thread); + } + } + + return 0; +} diff --git 
a/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c index ac7333ff0..420f4cadc 100644 --- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64.c @@ -1,98 +1,100 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2021, OPEN AI LAB - * Author: ddzhao@openailab.com - */ - #include "convolution_param.h" - #include "graph/tensor.h" #include "graph/node.h" #include "graph/graph.h" -#include "module/module.h" -#include "operator/op.h" -#include "utility/sys_port.h" -#include "utility/log.h" #include "device/cpu/cpu_node.h" #include "device/cpu/cpu_graph.h" +#include "operator/op.h" +#include "api/c_api.h" +#include "utility/log.h" +#include "utility/sys_port.h" #include "device/cpu/cpu_module.h" +#include +#include + +extern int conv_hcl_prerun_rv64(struct node* ir_node, struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param); +extern int conv_hcl_run_rv64(struct node* ir_node, struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param, int num_thread, int cpu_affinity); +extern int conv_hcl_get_shared_mem_size_rv64(struct tensor* input_tensor, struct tensor* output_tensor, struct conv_param* param); +extern int conv_hcl_postrun_rv64(struct node* ir_node, struct conv_priv_info* info); + +static int init_node(struct node_ops* ops, struct exec_node* exec_node, struct exec_graph* exec_graph) +{ + struct node* ir_node = exec_node->ir_node; + struct graph* ir_graph = ir_node->graph; + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + struct tensor* kernel_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); + struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); + struct conv_param* params = ir_node->op.param_mem; + struct conv_priv_info* info = sys_malloc(sizeof(struct conv_priv_info)); + if (!info) + { + return -1; + } + + memset(info, 0, sizeof(*info)); + exec_node->ops_priv = info; -#include "conv_kernel_rv64.h" + if (exec_graph->mode == TENGINE_MODE_FP32) + { + 
exec_node->shared_mem_size = conv_hcl_get_shared_mem_size_rv64(input_tensor, output_tensor, params); + exec_node->shared_pack4_mem_size = 0; + } + else + { + TLOG_ERR("Tengine work node %s not support %d\n", ir_node->name, exec_graph->mode); + return -1; + } -#include "string.h" + return 0; +} static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; + struct conv_param* param = ir_node->op.param_mem; + struct conv_priv_info* info = exec_node->ops_priv; - /* get cpu affinity */ - conv_priv_info->cpu_type = exec_graph->cpu_affinity; + info->cpu_type = exec_graph->cpu_affinity; - /* fp32 prerun */ if (exec_graph->mode == TENGINE_MODE_FP32) { - if (conv_hcl_set_shared_mem && exec_node->shared_mem_size < exec_graph->shared_mem_size) + if (exec_node->shared_mem_size < exec_graph->shared_mem_size) { - if (conv_hcl_set_shared_mem(conv_priv_info, exec_graph->shared_mem, exec_graph->shared_mem_size) < 0) - { - TLOG_ERR("hcl conv: set shared memory failed\n"); - return -1; - } + info->external_im2col_mem = 1; + info->im2col_buffer = exec_graph->shared_mem; + info->im2col_buffer_size = exec_graph->shared_mem_size; } - if (conv_hcl_set_shared_pack4_mem && exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size) + + if (exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size) { - if (conv_hcl_set_shared_pack4_mem(conv_priv_info, exec_graph->shared_pack4_mem, - exec_graph->shared_pack4_mem_size) - 
< 0) - { - TLOG_ERR("hcl conv: set shared pack4 memory failed\n"); - return -1; - } + info->external_im2col_pack4_mem = 0; + info->im2col_buffer_pack4 = NULL; + info->im2col_buffer_pack4_size = 0; } - int group = conv_param->group; - int kernel_h = conv_param->kernel_h; - int kernel_w = conv_param->kernel_w; - if (group > 1 && kernel_h == 7 && kernel_w == 7) - conv_priv_info->external_interleave_pack4_mem = 0; + if (param->group > 1 && param->kernel_h == 7 && param->kernel_w == 7) + { + info->external_interleave_pack4_mem = 0; + } else - conv_priv_info->external_interleave_pack4_mem = 1; + { + info->external_interleave_pack4_mem = 1; + } - /* do prerun */ - if (conv_hcl_prerun(input_tensor, filter_tensor, output_tensor, conv_priv_info, conv_param) < 0) + if (conv_hcl_prerun_rv64(ir_node, input_tensor, filter_tensor, output_tensor, info, param) < 0) { - TLOG_ERR("hcl conv prerun failed\n"); + TLOG_ERR("hcl conv prerun failed.\n"); return -1; } } else { - printf("Tengine work node not support %d\n", exec_graph->mode); return -1; } @@ -103,37 +105,32 @@ static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct ex { struct node* ir_node = exec_node->ir_node; struct graph* ir_graph = ir_node->graph; - struct tensor* input_tensor; - struct tensor* weight_tensor; - struct tensor* output_tensor; + struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); + struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); struct tensor* bias_tensor = NULL; - int num_thread = exec_graph->num_thread; - int cpu_affinity = exec_graph->cpu_affinity; - - /* set the input data and shape again, in case of reshape or dynamic shape */ - input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); - weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); - output_tensor = 
get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); if (ir_node->input_num > 2) + { bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); + } - struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; - struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; + struct conv_param* params = ir_node->op.param_mem; + struct conv_priv_info* info = exec_node->ops_priv; + int num_thread = exec_graph->num_thread; + int cpu_affinity = exec_graph->cpu_affinity; - /* fp32 run */ - if (exec_graph->mode == TENGINE_MODE_FP32) + if (exec_graph->mode == TENGINE_DT_FP32) { - if (conv_hcl_run(input_tensor, weight_tensor, bias_tensor, output_tensor, conv_priv_info, conv_param, num_thread, - cpu_affinity) - < 0) + int ret = conv_hcl_run_rv64(ir_node, input_tensor, filter_tensor, bias_tensor, output_tensor, info, params, num_thread, cpu_affinity); + if (ret < 0) { - TLOG_ERR("hcl conv run failed\n"); - return -1; + TLOG_ERR("conv_hcl_run %s run failed: %d\n", ir_node->name, ret); + return ret; } } else { - printf("Tengine work node not support %d\n", exec_graph->mode); + TLOG_ERR("Tengine work node %s not support %d mode\n", ir_node->name, exec_graph->mode); return -1; } @@ -147,95 +144,46 @@ static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struc static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; - - /* fp32 postrun */ if (exec_graph->mode == TENGINE_MODE_FP32) { - if (conv_hcl_postrun(conv_priv_info) < 0) - { - TLOG_ERR("hcl conv postrun failed\n"); - return -1; - } + return conv_hcl_postrun_rv64(exec_node->ir_node, exec_node->ops_priv); } else { - printf("Tengine work node not support %d\n", exec_graph->mode); + TLOG_ERR("Tengine work node %s not support %d mode\n", exec_node->ir_node->name, exec_graph->mode); return -1; } - - return 0; -} - 
-static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) -{ - struct node* ir_node = exec_node->ir_node; - struct graph* ir_graph = ir_node->graph; - struct tensor* input_tensor; - struct tensor* filter_tensor; - struct tensor* output_tensor; - - input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); - filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); - output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - - struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; - - /* init the private info data of convolution op */ - struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)sys_malloc(sizeof(struct conv_priv_info)); - if (conv_priv_info == NULL) - { - return -1; - } - memset(conv_priv_info, 0, sizeof(struct conv_priv_info)); - exec_node->ops_priv = conv_priv_info; - - /* get shared memory size */ - if (exec_graph->mode == TENGINE_MODE_FP32) - { - exec_node->shared_mem_size = conv_hcl_get_shared_mem_size_rv64(input_tensor, output_tensor, conv_param); - exec_node->shared_pack4_mem_size = conv_hcl_get_shared_pack4_mem_size(filter_tensor, output_tensor, conv_param); - } - else - { - printf("Tengine work node not support %d\n", exec_graph->mode); - return -1; - } - - return 0; } static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) { - struct conv_priv_info* conv_priv_info = (struct conv_priv_info*)exec_node->ops_priv; - sys_free(conv_priv_info); + struct conv_priv_info* info = exec_node->ops_priv; + sys_free(info); exec_node->ops_priv = NULL; return 0; } -static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) +static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* ir_node) { - struct node* ir_node = exec_node; struct graph* ir_graph = ir_node->graph; struct tensor* input_tensor = 
get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + struct tensor* kernel_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - struct conv_param* param = (struct conv_param*)exec_node->op.param_mem; - int group = param->group; - int kernel_h = param->kernel_h; - int kernel_w = param->kernel_w; - int in_c = input_tensor->dims[1] / group; - int out_c = output_tensor->dims[1] / group; + struct conv_param* param = ir_node->op.param_mem; if (input_tensor->data_type != TENGINE_DT_FP32) + { return 0; + } - if (group != 1) + if (param->group != 1) + { return 0; + } return OPS_SCORE_PREFER; } - static struct node_ops hcl_node_ops = { .prerun = prerun, .run = run, @@ -243,7 +191,8 @@ static struct node_ops hcl_node_ops = { .postrun = postrun, .init_node = init_node, .release_node = release_node, - .score = score}; + .score = score, +}; int register_conv_hcl_rv64_op() { diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c b/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c index 999a49d4e..23e6dc5f7 100644 --- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c +++ b/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.c @@ -1,43 +1,19 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2021, OPEN AI LAB - * Author: ddzhao@openailab.com - */ - #include +#include #include -#include - -#include "conv_kernel_rv64.h" -// #include "wino_conv_kernel_arm.h" // FIXME: add wino support -// #include "wino_conv_kernel_1_arm.h" // FIXME: add wino support +#include "convolution_param.h" +#include "graph/tensor.h" +#include "op/conv/x86/conv_kernel_x86.h" +#include "utility/sys_port.h" +#include +#include -#define PER_OUT_CHAN 16 -void sgemm_4x16_rv64(float* biases, float* input, float* kernel, long kernel_size, float* output, long output_xy, - int activation, int layout); -void sgemm_4x4_rv64(float* biases, float* input, float* kernel, long kernel_size, float* output, long output_xy, - int activation, int layout); +#define PER_OUT_CHAN 8 +#define min(a, b) ((a) < (b) ? 
(a) : (b)) -void im2col_fp32_1x1(float* input, int input_xy, float* col, int col_cnt, int input_chan); -void im2col_fp32_3x3(float* input, int w, int h, int channel, float* cur_col, int stride); +extern void sgemm_8x8_rv64(const float* cur_col, const float* cur_kernel, const float* bias, const int act, float* cur_output, const int output_xy, const int kernel_size, const int n); +extern void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k_w, int k_h, int s_w, int s_h, int d_w, + int d_h, int pad_w0, int pad_w1, int pad_h0, int pad_h1, int out_w, int out_h, int num_thread); static void interleave_kernel(float* kernel, float* kernel_interleaved, int kernel_chan, int kernel_size) { @@ -56,29 +32,67 @@ static void interleave_kernel(float* kernel, float* kernel_interleaved, int kern *(cur_kernel_interleaved++) = cur_kernel[k][j]; } } - for (; i < (kernel_chan & -4); i += 4) + + // last 7 kernel + for (k = 0; i + k < kernel_chan; k++) + cur_kernel[k] = kernel + kernel_size * (i + k); + + if ((kernel_chan & 0x7) == 7) + { + for (j = 0; j < kernel_size; j++) + { + for (k = 0; k < 7; k++) + *(cur_kernel_interleaved++) = cur_kernel[k][j]; + *(cur_kernel_interleaved++) = 0.f; + } + } + else if ((kernel_chan & 0x7) == 6) + { + for (j = 0; j < kernel_size; j++) + { + for (k = 0; k < 6; k++) + *(cur_kernel_interleaved++) = cur_kernel[k][j]; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + } + } + else if ((kernel_chan & 0x7) == 5) + { + for (j = 0; j < kernel_size; j++) + { + for (k = 0; k < 5; k++) + *(cur_kernel_interleaved++) = cur_kernel[k][j]; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + } + } + else if ((kernel_chan & 0x7) == 4) { - for (k = 0; k < 4; k++) - cur_kernel[k] = kernel + kernel_size * (i + k); for (j = 0; j < kernel_size; j++) { for (k = 0; k < 4; k++) *(cur_kernel_interleaved++) = cur_kernel[k][j]; + *(cur_kernel_interleaved++) = 0.f; + 
*(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; } } - // last 4 kernel - for (k = 0; k < 3; k++) - cur_kernel[k] = kernel + kernel_size * (i + k); - if ((kernel_chan & 0x3) == 3) + else if ((kernel_chan & 0x7) == 3) { for (j = 0; j < kernel_size; j++) { for (k = 0; k < 3; k++) *(cur_kernel_interleaved++) = cur_kernel[k][j]; *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; } } - else if ((kernel_chan & 0x3) == 2) + else if ((kernel_chan & 0x7) == 2) { for (j = 0; j < kernel_size; j++) { @@ -86,9 +100,13 @@ static void interleave_kernel(float* kernel, float* kernel_interleaved, int kern *(cur_kernel_interleaved++) = cur_kernel[k][j]; *(cur_kernel_interleaved++) = 0.f; *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; } } - else if ((kernel_chan & 0x3) == 1) + else if ((kernel_chan & 0x7) == 1) { for (j = 0; j < kernel_size; j++) { @@ -96,6 +114,10 @@ static void interleave_kernel(float* kernel, float* kernel_interleaved, int kern *(cur_kernel_interleaved++) = 0.f; *(cur_kernel_interleaved++) = 0.f; *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; + *(cur_kernel_interleaved++) = 0.f; } } } @@ -104,14 +126,19 @@ static void interleave_kernel(float* kernel, float* kernel_interleaved, int kern static void interleave(struct tensor* filter, struct conv_priv_info* priv_info, struct conv_param* param) { int group = param->group; - int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3]; + int in_c = filter->dims[1]; + int kernel_h = filter->dims[2]; + int kernel_w = filter->dims[3]; + int kernel_size = in_c * kernel_h * kernel_w; + int out_chan 
= filter->dims[0] / group; - int out_chan_align4 = (out_chan + 3) / 4 * 4; + int out_chan_align8 = (out_chan + 7) / 8 * 8; - int kernel_size_algin = kernel_size * out_chan_align4; + int kernel_size_algin = kernel_size * out_chan_align8; int kernel_size_group = kernel_size * out_chan; float* kernel = filter->data; + float* interleave_buf = priv_info->interleave_buffer; for (int g = 0; g < group; g++) { @@ -121,520 +148,144 @@ static void interleave(struct tensor* filter, struct conv_priv_info* priv_info, } } -static void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k_w, int k_h, int s_w, int s_h, int d_w, - int d_h, int pad_w0, int pad_w1, int pad_h0, int pad_h1, int out_w, int out_h, int num_thread) -{ - if (k_w == 1 && k_h == 1 && s_w == 1 && s_h == 1) - { - int kernel_size = k_w * k_h * in_c; - int in_xy = in_w * in_h; - int out_xy = out_w * out_h; - int col_end3 = out_xy & 3; -#pragma omp parallel for num_threads(num_thread) - for (int col_i = 0; col_i < out_xy - 3; col_i += 4) - { - float* cur_col = col + col_i * kernel_size; - - float* cur_input = input + col_i; - im2col_fp32_1x1(cur_input, in_xy, cur_col, 4, in_c); - } - int col_i = out_xy & -4; - float* cur_col; - // final 4 input - if (col_end3) - { - cur_col = col + col_i * kernel_size; - for (int col_j = 0; col_j < kernel_size; col_j++) - { - for (int i = 0; i < 4; i++) - { - if (i < col_end3) - *cur_col++ = *(input + col_j * in_xy + col_i + i); - else - *cur_col++ = 0; - } - } - } - } - else if (d_w == 1 && d_h == 1 && k_w == 3 && k_h == 3 && s_w == s_h) - { - int kernel_size = k_w * k_h * in_c; - int in_xy = in_w * in_h; - int out_xy = out_w * out_h; - int col_end3 = out_xy & 3; - int is_pad0 = (pad_w0 == 0) && (pad_h0 == 0) && (pad_w1 == 0) && (pad_h1 == 0); -#pragma omp parallel for num_threads(num_thread) - for (int col_i = 0; col_i < (out_xy & -4); col_i += 4) - { - float* cur_col = col + col_i * kernel_size; - int imy0 = col_i / out_w; - int imy3 = (col_i + 3) / out_w; - int 
imx0 = col_i - imy0 * out_w; - int imx3 = (col_i + 3) - imy3 * out_w; - if ((imy0 == imy3) && (is_pad0 || (imy0 != 0 && imx0 != 0 && imy0 != (out_h - 1) && imx3 != (out_w - 1)))) - { - float* l0 = input + (imy0 * s_h - pad_h0) * in_w + (imx0 * s_w - pad_w0); - { - im2col_fp32_3x3(l0, in_w, in_h, in_c, cur_col, s_w); // add im2col 3x3 - cur_col += 4 * kernel_size; - } - } - else - { - int cnt_y[4] = {imy0, (col_i + 1) / out_w, (col_i + 2) / out_w, imy3}; - int cnt_x[4] = {imx0, col_i - cnt_y[1] * out_w + 1, col_i - cnt_y[2] * out_w + 2, imx3}; - int imx_start[4] = {cnt_x[0] * s_w - pad_w0, cnt_x[1] * s_w - pad_w0, cnt_x[2] * s_w - pad_w0, - cnt_x[3] * s_w - pad_w0}; - int imy_start[4] = {cnt_y[0] * s_h - pad_h0, cnt_y[1] * s_h - pad_h0, cnt_y[2] * s_h - pad_h0, - cnt_y[3] * s_h - pad_h0}; - for (int kch = 0; kch < in_c; kch++) - for (int ky = 0; ky < 3; ky++) - for (int kx = 0; kx < 3; kx++) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for (int i = 0; i < 4; i++) - { - if (imx[i] >= 0 && imx[i] < in_w && imy[i] >= 0 && imy[i] < in_h) - *cur_col++ = *(input + in_xy * kch + in_w * imy[i] + imx[i]); - else - *cur_col++ = 0.f; - } - } - } - } - // final 4 input - int col_i = out_xy & -4; - if (col_end3) - { - float* cur_col = col + col_i * kernel_size; - int cnt_y[4] = {col_i / out_w, (col_i + 1) / out_w, (col_i + 2) / out_w, (col_i + 3) / out_w}; - int cnt_x[4] = {col_i - cnt_y[0] * out_w, col_i - cnt_y[1] * out_w + 1, col_i - cnt_y[2] * out_w + 2, - col_i - cnt_y[3] * out_w + 3}; - int imx_start[4] = {cnt_x[0] * s_w - pad_w0, cnt_x[1] * s_w - pad_w0, cnt_x[2] * s_w - pad_w0, - cnt_x[3] * s_w - pad_w0}; - int imy_start[4] = {cnt_y[0] * s_h - pad_h0, cnt_y[1] * s_h - pad_h0, cnt_y[2] * s_h - pad_h0, - cnt_y[3] * s_h - pad_h0}; - for (int kch = 0; kch < in_c; kch++) - { - for (int ky = 0; ky < 3; ky++) - { - for (int kx = 0; 
kx < 3; kx++) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for (int i = 0; i < 4; i++) - { - if (i < col_end3 && imx[i] >= 0 && imx[i] < in_w && imy[i] >= 0 && imy[i] < in_h) - *cur_col++ = *(input + in_xy * kch + in_w * imy[i] + imx[i]); - else - *cur_col++ = 0.f; - } - } - } - } - } - } - else - { - int out_xy = out_w * out_h; -#pragma omp parallel for num_threads(num_thread) - for (int col_i = 0; col_i < out_xy - 3; col_i += 4) - { - int kernel_size = k_w * k_h * in_c; - int in_xy = in_w * in_h; - int col_end3 = out_xy & 3; - float* cur_col = col + col_i * kernel_size; - int cnt_y[4] = {col_i / out_w, (col_i + 1) / out_w, (col_i + 2) / out_w, (col_i + 3) / out_w}; - int cnt_x[4] = {col_i - cnt_y[0] * out_w, col_i - cnt_y[1] * out_w + 1, col_i - cnt_y[2] * out_w + 2, - col_i - cnt_y[3] * out_w + 3}; - int imx_start[4] = {cnt_x[0] * s_w - pad_w0, cnt_x[1] * s_w - pad_w0, cnt_x[2] * s_w - pad_w0, - cnt_x[3] * s_w - pad_w0}; - int imy_start[4] = {cnt_y[0] * s_h - pad_h0, cnt_y[1] * s_h - pad_h0, cnt_y[2] * s_h - pad_h0, - cnt_y[3] * s_h - pad_h0}; - for (int kch = 0; kch < in_c; kch++) - for (int ky = 0; ky < (k_h * d_h); ky += d_h) - for (int kx = 0; kx < (k_w * d_w); kx += d_w) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for (int i = 0; i < 4; i++) - { - if (imx[i] >= 0 && imx[i] < in_w && imy[i] >= 0 && imy[i] < in_h) - *cur_col++ = *(input + in_xy * kch + in_w * imy[i] + imx[i]); - else - *cur_col++ = 0.f; - } - } - } - int col_i = out_xy & -4; - float* cur_col; - int kernel_size = k_w * k_h * in_c; - int in_xy = in_w * in_h; - int col_end3 = out_xy & 3; - if (col_end3) - { - cur_col = col + col_i * kernel_size; - int cnt_y[4] = {col_i / out_w, (col_i + 1) / out_w, (col_i 
+ 2) / out_w, (col_i + 3) / out_w}; - int cnt_x[4] = {col_i - cnt_y[0] * out_w, col_i - cnt_y[1] * out_w + 1, col_i - cnt_y[2] * out_w + 2, - col_i - cnt_y[3] * out_w + 3}; - int imx_start[4] = {cnt_x[0] * s_w - pad_w0, cnt_x[1] * s_w - pad_w0, cnt_x[2] * s_w - pad_w0, - cnt_x[3] * s_w - pad_w0}; - int imy_start[4] = {cnt_y[0] * s_h - pad_h0, cnt_y[1] * s_h - pad_h0, cnt_y[2] * s_h - pad_h0, - cnt_y[3] * s_h - pad_h0}; - for (int kch = 0; kch < in_c; kch++) - for (int ky = 0; ky < (k_h * d_h); ky += d_h) - for (int kx = 0; kx < (k_w * d_w); kx += d_w) - { - int imx[4] = {imx_start[0] + kx, imx_start[1] + kx, imx_start[2] + kx, imx_start[3] + kx}; - int imy[4] = {imy_start[0] + ky, imy_start[1] + ky, imy_start[2] + ky, imy_start[3] + ky}; - for (int i = 0; i < 4; i++) - { - if (i < col_end3 && imx[i] >= 0 && imx[i] < in_w && imy[i] >= 0 && imy[i] < in_h) - *cur_col++ = *(input + in_xy * kch + in_w * imy[i] + imx[i]); - else - *cur_col++ = 0.f; - } - } - } - } -} - -static void sgemm_set(float* col, float* kernel, float* biases, float* output, int kernel_size, int ch_start, - int ch_end, int output_xy, int activation, int num_thread, int cpu_affinity) -{ - int nn_outch = ch_end / PER_OUT_CHAN; - int col_end3 = output_xy & 0x3; - - if (col_end3) - { -#pragma omp parallel for num_threads(num_thread) - for (int pp = 0; pp < nn_outch; pp++) - { - int p = pp * PER_OUT_CHAN; - - float* biasptr = biases ? 
(float*)(biases + p) : NULL; - float* kernel_tmp = (float*)(kernel + p * kernel_size); - float* output_tmp = (float*)(output + p * output_xy); - - int col_line = 0; - for (col_line = 0; col_line + 3 < output_xy; col_line += 4) - { - float* col_tmp = (float*)(col + col_line * kernel_size); - sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0); // FIXME: replace with sgemm_4x16_rv64 - } - { - float result[64]; - float* col_tmp = (float*)(col + col_line * kernel_size); - sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, result, 4, activation, 0); // FIXME: replace with sgemm_4x16_rv64 - for (int i = 0; i < 16; i++) - { - for (int j = 0; j < (col_end3); j++) - *(output + (p + i) * output_xy + col_line + j) = result[(i << 2) + j]; - } - } - } - } - else - { -#pragma omp parallel for num_threads(num_thread) - for (int pp = 0; pp < nn_outch; pp++) - { - int p = pp * PER_OUT_CHAN; - - float* biasptr = biases ? (float*)(biases + p) : NULL; - float* kernel_tmp = (float*)(kernel + p * kernel_size); - float* output_tmp = (float*)(output + p * output_xy); - - for (int col_line = 0; col_line + 3 < output_xy; col_line += 4) - { - float* col_tmp = (float*)(col + col_line * kernel_size); - sgemm_4x16_rv64(biasptr, col_tmp, kernel_tmp, kernel_size, output_tmp + col_line, output_xy, activation, 0); // FIXME: replace with sgemm_4x16_rv64 - } - } - } -} - -static void sgemm4x4(float* col, float* kernel, float* biases, float* output, int kernel_size, int ch_start, int ch_end, - int output_xy, int activation, int num_thread, int cpu_affinity) -{ - float result[16]; - int col_end3 = output_xy & 0x3; - int kernel_end3 = ch_end & 0x3; - -#pragma omp parallel for num_threads(num_thread) private(result) - for (int kernel_num = ch_start; kernel_num < ((ch_end & -4) - 3); kernel_num += 4) - { - float* cur_biases = NULL; - float *cur_col, *cur_kernel, *cur_output; - int col_line; - if (biases) - cur_biases = (float*)(biases + 
kernel_num); - cur_kernel = (float*)(kernel + kernel_num * kernel_size); - cur_output = (float*)(output + kernel_num * output_xy); - for (col_line = 0; col_line < (output_xy & -4); col_line += 4) - { - cur_col = (float*)(col + col_line * kernel_size); - sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, cur_output + col_line, output_xy, activation, 0); - } - if (col_end3) - { - cur_col = (float*)(col + col_line * kernel_size); - sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0); - for (int i = 0; i < 4; i++) - { - for (int j = 0; j < (col_end3); j++) - *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; - } - } - } - if (kernel_end3) - { - int kernel_num = (ch_end & -4); - float* cur_biases = NULL; - if (biases) - cur_biases = (float*)(biases + kernel_num); - float* cur_kernel = (float*)(kernel + kernel_num * kernel_size); -#pragma omp parallel for num_threads(num_thread) private(result) - for (int col_line = 0; col_line < (output_xy & -4); col_line += 4) - { - float* cur_col = (float*)(col + col_line * kernel_size); - sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0); - for (int i = 0; i < kernel_end3; i++) - for (int j = 0; j < 4; j++) - *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; - } - int col_line = output_xy & -4; - if (col_end3) - { - float* cur_col = (float*)(col + col_line * kernel_size); - sgemm_4x4_rv64(cur_biases, cur_col, cur_kernel, kernel_size, result, 4, activation, 0); - for (int i = 0; i < (kernel_end3); i++) - { - for (int j = 0; j < (col_end3); j++) - *(output + (kernel_num + i) * output_xy + col_line + j) = result[(i << 2) + j]; - } - } - } -} - -/* check the conv wheather need to be using winograd */ -static int winograd_support(struct conv_param* param, int in_h, int in_w) +int conv_hcl_get_shared_mem_size_rv64(struct tensor* input_tensor, struct tensor* output_tensor, struct conv_param* param) { - 
int kernel_h = param->kernel_h; - int kernel_w = param->kernel_w; - int stride_h = param->stride_h; - int stride_w = param->stride_w; - int dilation_h = param->dilation_h; - int dilation_w = param->dilation_w; - int output_chan = param->output_channel; - int group = param->group; - - if (in_h < 7 && in_w < 7) - return 0; - if (in_h < 10 && in_w < 10 && output_chan < 16) - return 0; - if (group != 1 || kernel_h != 3 || kernel_w != 3) - return 0; - if (dilation_h != 1 || dilation_w != 1 || stride_h != 1 || stride_w != 1) - return 0; - - return 1; -} - -/* - * get the memory size for im2col of input tensor - */ -int conv_hcl_get_shared_mem_size_rv64(struct tensor* input, struct tensor* output, struct conv_param* param) -{ - int in_h = input->dims[2]; - int in_w = input->dims[3]; - int out_h = output->dims[2]; - int out_w = output->dims[3]; - int group = param->group; - int input_chan = param->input_channel / group; - int kernel_size = input_chan * param->kernel_h * param->kernel_w; - int out_cstep = out_h * out_w; // channel cstep, output_h * output_w - int elem_size = input->elem_size; // uint8/int8 is 1 byte, fp32 is 4 bytes - - out_cstep = (out_cstep + 3) / 4 * 4; - int mem_size = elem_size * kernel_size * out_cstep + 128; - - return mem_size; -} - -/* - * get the memory size for im2col + sgemm of kernel tensor interleave - */ -static int get_private_mem_size(struct tensor* filter, struct conv_param* param) -{ - int group = param->group; - int out_chan = filter->dims[0] / group; - int out_chan_align4 = (out_chan + 3) / 4 * 4; - int kernel_size = filter->dims[1] * filter->dims[2] * filter->dims[3]; - int mem_size = kernel_size * filter->elem_size * out_chan_align4 * group + 128; // caution + int kernel_size = param->kernel_h * param->kernel_w * param->input_channel / param->group; + int cstep = output_tensor->dims[2] * output_tensor->dims[3]; + cstep = (cstep + 7) / 8 * 8; //align to 8 + int mem_size = input_tensor->elem_size * cstep * kernel_size + 128 * 
sizeof(float); return mem_size; } -int conv_hcl_set_shared_mem(struct conv_priv_info* priv_info, void* mem, int mem_size) -{ - priv_info->external_im2col_mem = 1; - priv_info->im2col_buffer = mem; - priv_info->im2col_buffer_size = mem_size; - - return 0; -} - -int conv_hcl_set_shared_pack4_mem(struct conv_priv_info* priv_info, void* mem, int mem_size) -{ - priv_info->external_im2col_pack4_mem = 0; - priv_info->im2col_buffer_pack4 = NULL; - priv_info->im2col_buffer_pack4_size = 0; - - return 0; -} - -int conv_hcl_get_shared_pack4_mem_size(struct tensor* filter, struct tensor* output, struct conv_param* param) -{ - return 0; -} - -int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, - struct conv_priv_info* priv_info, struct conv_param* param) +int conv_hcl_prerun_rv64(struct node* ir_node, struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param) { - int in_c = input_tensor->dims[1]; - int in_h = input_tensor->dims[2]; - int in_w = input_tensor->dims[3]; - - /* check winograd implement, only for conv3x3s1 */ - // priv_info->winograd = winograd_support(param, in_h, in_w); - // if (priv_info->winograd) - // { - // if(in_c >= 256) - // // return wino_conv_hcl_prerun_1(input_tensor, filter_tensor, output_tensor, priv_info, param); // FIXME: add wino support - // else - // // return wino_conv_hcl_prerun(input_tensor, filter_tensor, output_tensor, priv_info, param); // FIXME: add wino support - // } - - /* alloc mem of im2col */ - if (!priv_info->external_im2col_mem) + // alloc im2col buffer = kernel_size * out_xy + if (!info->external_im2col_mem) { int mem_size = conv_hcl_get_shared_mem_size_rv64(input_tensor, output_tensor, param); - void* mem = sys_malloc(mem_size); - priv_info->im2col_buffer = mem; - priv_info->im2col_buffer_size = mem_size; + info->im2col_buffer = sys_malloc(mem_size); + info->im2col_buffer_size = mem_size; } - 
/* alloc mem of kernel interleave */ - if (!priv_info->external_interleave_mem) + // alloc kernel interleave buffer + if (!info->external_interleave_mem) { - int mem_size = get_private_mem_size(filter_tensor, param); - void* mem = sys_malloc(mem_size); - priv_info->interleave_buffer = mem; - priv_info->interleave_buffer_size = mem_size; + int kernel_size = filter_tensor->dims[1] * filter_tensor->dims[2] * filter_tensor->dims[3]; + int out_chan = filter_tensor->dims[0] / param->group; + out_chan = (out_chan + 7) / 8 * 8; //align to 8 + int mem_size = out_chan * kernel_size * filter_tensor->elem_size * param->group; + info->interleave_buffer = sys_malloc(mem_size); + info->interleave_buffer_size = mem_size; } - /* kernel interleave */ - interleave(filter_tensor, priv_info, param); - + // interleave kernel + interleave(filter_tensor, info, param); return 0; } -int conv_hcl_postrun(struct conv_priv_info* priv_info) +int conv_hcl_postrun_rv64(struct node* ir_node, struct conv_priv_info* info) { - // if (priv_info->winograd) - // { - // wino_conv_hcl_postrun(priv_info); // FIXME: add wino support - // } - - if (!priv_info->external_interleave_mem && priv_info->interleave_buffer != NULL) + if (!info->external_interleave_mem && info->interleave_buffer) { - sys_free(priv_info->interleave_buffer); - priv_info->interleave_buffer = NULL; + sys_free(info->interleave_buffer); + info->interleave_buffer = NULL; } - if (!priv_info->external_im2col_mem && priv_info->im2col_buffer != NULL) + if (!info->external_im2col_mem && info->im2col_buffer) { - sys_free(priv_info->im2col_buffer); - priv_info->im2col_buffer = NULL; + sys_free(info->im2col_buffer); + info->im2col_buffer = NULL; } return 0; } -int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_priv_info* priv_info, struct conv_param* param, - int num_thread, int cpu_affinity) +int conv_hcl_run_rv64(struct node* ir_node, struct tensor* 
input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param, int num_thread, int cpu_affinity) { - /* param */ int group = param->group; - int kernel_h = param->kernel_h; - int kernel_w = param->kernel_w; - int stride_h = param->stride_h; - int stride_w = param->stride_w; - int dilation_h = param->dilation_h; - int dilation_w = param->dilation_w; - int pad_h0 = param->pad_h0; - int pad_h1 = param->pad_h1; - int pad_w0 = param->pad_w0; - int pad_w1 = param->pad_w1; - int act_type = param->activation; - int batch = input_tensor->dims[0]; - int in_c = input_tensor->dims[1] / group; + float* input = input_tensor->data; + float* output = output_tensor->data; + float* bias = NULL; + if (bias_tensor) + { + bias = bias_tensor->data; + } + + int in_c = input_tensor->dims[1]; + in_c /= group; int in_h = input_tensor->dims[2]; int in_w = input_tensor->dims[3]; int input_size = in_c * in_h * in_w; - int kernel_size = in_c * kernel_h * kernel_w; - int input_image_size = input_tensor->dims[1] * input_tensor->dims[2] * input_tensor->dims[3]; - // if (priv_info->winograd) - // { - // if(in_c >= 256) - // return wino_conv_hcl_run_1(input_tensor, filter_tensor, bias_tensor, output_tensor, priv_info, param, num_thread, cpu_affinity); // FIXME: add wino support - // else - // return wino_conv_hcl_run(input_tensor, filter_tensor, bias_tensor, output_tensor, priv_info, param, num_thread, cpu_affinity); // FIXME: add wino support - // } - - int out_c = output_tensor->dims[1] / group; + int k_h = param->kernel_h; + int k_w = param->kernel_w; + int s_w = param->stride_w; + int s_h = param->stride_h; + int d_h = param->dilation_h; + int d_w = param->dilation_w; + int p_h0 = param->pad_h0; + int p_w0 = param->pad_w0; + int p_h1 = param->pad_h1; + int p_w1 = param->pad_w1; + int act = param->activation; + int kernel_size = in_c * k_h * k_w; + + int out_c = param->output_channel / group; int out_h = 
output_tensor->dims[2]; int out_w = output_tensor->dims[3]; - int out_hw = out_h * out_w; + int out_xy = out_h * out_w; int output_size = out_c * out_h * out_w; - int out_c_align = ((out_c + 3) & -4); - int output_image_size = output_tensor->dims[1] * output_tensor->dims[2] * output_tensor->dims[3]; + int output_image_size = output_tensor->dims[1] * output_tensor->dims[2] * output_tensor->dims[3]; //不是8倍数怎么办 - /* buffer addr */ - float* input_buf = (float*)input_tensor->data; - float* output_buf = (float*)output_tensor->data; - float* biases_buf = NULL; - if (bias_tensor != NULL) - biases_buf = (float*)bias_tensor->data; - float* col_buf = (float*)priv_info->im2col_buffer; - float* interleave_buf = (float*)priv_info->interleave_buffer; + int out_c_align8 = (out_c + 7) / 8 * 8; + int input_image_size = in_c * in_h * in_w; + int input_group_size = input_image_size * group; - int sgemm_set_chan = out_c / PER_OUT_CHAN * PER_OUT_CHAN; - int sgemm_set_remain = out_c % PER_OUT_CHAN; + float* col = info->im2col_buffer; // FIXME: split by [batch, group] + float* interleaved_kernel = info->interleave_buffer; - for (int n = 0; n < batch; n++) // batch size + for (int n = 0; n < batch; ++n) { - for (int g = 0; g < group; g++) + for (int g = 0; g < group; ++g) { - /* im2col */ - float* cur_input = input_buf + n * input_image_size + g * input_size; - im2col(cur_input, col_buf, in_c, in_w, in_h, kernel_w, kernel_h, stride_w, stride_h, dilation_w, dilation_h, - pad_w0, pad_w1, pad_h0, pad_h1, out_w, out_h, num_thread); + float* cur_input = input + n * input_image_size + g * input_size; + //output shape: [batch, group, output_xy/8, ksize, 8] + im2col(cur_input, col, in_c, in_w, in_h, k_w, k_h, s_w, s_h, d_w, d_h, p_w0, p_w1, p_h0, p_h1, out_w, out_h, num_thread); + + float* output_base = output + n * output_image_size + g * output_size; + //FIXME: out_chan_ 可能不是8对齐的 + int out_chan_ = 0; + for (; out_chan_ < out_c_align8; out_chan_ += PER_OUT_CHAN) + { + float* cur_kernel = 
interleaved_kernel + g * out_c_align8 * kernel_size + out_chan_ * kernel_size; + float* cur_bias = bias ? bias + g * out_c + out_chan_ : NULL; + float* cur_output = output_base + out_chan_ * out_xy; + const int n = min(8, out_c - out_chan_); + + int col_i = 0; + for (; col_i + 7 < out_xy; col_i += 8) + { + float* cur_col = col + col_i * kernel_size; + sgemm_8x8_rv64(cur_col, cur_kernel, cur_bias, act, cur_output + col_i, out_xy, kernel_size, n); + } + if (col_i < out_xy) + { + float result[64]; + float* cur_col = (col + col_i * kernel_size); + sgemm_8x8_rv64(cur_col, cur_kernel, cur_bias, act, result, 8, kernel_size, n); - /* gemm */ - float* cur_kernel = interleave_buf + g * kernel_size * out_c_align; - float* cur_output = output_buf + n * output_image_size + g * output_size; - float* cur_bias = biases_buf ? (biases_buf + g * out_c) : NULL; - sgemm_set(col_buf, cur_kernel, cur_bias, cur_output, kernel_size, 0, sgemm_set_chan, out_hw, act_type, - num_thread, cpu_affinity); - if (sgemm_set_remain) - sgemm4x4(col_buf, cur_kernel, cur_bias, cur_output, kernel_size, sgemm_set_chan, out_c, out_hw, - act_type, num_thread, cpu_affinity); + int col_end3 = (out_xy & 7); + + for (int i = 0; i < n; i++) + { + int j = 0; + for (; j < (col_end3); j++) + *(cur_output + i * out_xy + col_i + j) = result[(i << 3) + j]; + } + } + } } } diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.h b/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.h deleted file mode 100644 index f2f9051a6..000000000 --- a/source/device/cpu/op/conv/risc-v/lp64dv/conv_kernel_rv64.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2021, OPEN AI LAB - * Author: ddzhao@openailab.com - */ - -#ifndef _CONV_KERNEL_RV64_H_ -#define _CONV_KERNEL_RV64_H_ - -#include "convolution_param.h" - -#include "graph/tensor.h" -#include "graph/node.h" -#include "graph/graph.h" -#include "module/module.h" -#include "operator/op.h" -#include "utility/sys_port.h" -#include "utility/log.h" -#include "device/cpu/cpu_node.h" -#include "device/cpu/cpu_graph.h" -#include "device/cpu/cpu_module.h" - -/* float32 */ -int conv_hcl_prerun(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, - struct conv_priv_info* info, struct conv_param* param) __attribute__((weak)); - -int conv_hcl_postrun(struct conv_priv_info* info) __attribute__((weak)); - -int conv_hcl_run(struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, - struct tensor* output_tensor, struct conv_priv_info* conv_info, struct conv_param* param, - int num_thread, int cpu_affinity) __attribute__((weak)); - -int conv_hcl_get_shared_mem_size_rv64(struct tensor* input_tensor, struct tensor* output_tensor, - struct conv_param* param); -int conv_hcl_get_shared_pack4_mem_size(struct tensor* input_tensor, struct tensor* output_tensor, - struct conv_param* param) __attribute__((weak)); - -int conv_hcl_set_shared_mem(struct conv_priv_info* priv_info, void* mem, int mem_size) __attribute__((weak)); - 
-int conv_hcl_set_shared_pack4_mem(struct conv_priv_info* priv_info, void* mem, int mem_size) __attribute__((weak)); - -#endif diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_1x1.S b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_1x1.S index 700fe7e55..e69de29bb 100644 --- a/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_1x1.S +++ b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_1x1.S @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -/* - * Copyright (c) 2021, OPEN AI LAB - * Author: ddzhao@openailab.com - */ -// -// im2col for kernel 1x1 s1p0d1 -// -// input: -// x0 arg0 input address -// x1 arg1 input_xy -// x2 arg2 col address -// x3 arg3 col_cnt must be multiply of 4 -// x4 arg4 input channel -// -// register definition -// x0 input address -// x1 input_xy x 4 -// x2 col address -// x3 col_cnt -// x4 input channel -// x6 input start pointer t6 -// x7 input pointer -// x9 channel cnt -// x11 -// x12 = input_xy size * 2 // x12 -> t5 - - .section .text,"ax" - .align 5 - - .type im2col_fp32_1x1 STT_FUNC - .global im2col_fp32_1x1 - .hidden im2col_fp32_1x1 -im2col_fp32_1x1: - addi sp, sp, -56 - sd t0, 0(sp) - sd t1, 8(sp) - sd t2, 16(sp) - sd t3, 24(sp) - sd t4, 32(sp) - sd t5, 40(sp) - sd t6, 48(sp) - vsetvli t0, a0, e32 - li t0, 4 - blt a3, t0, col_end - - srli a3, a3, 2 - - slli a1, a1, 2 - - mv t6, a0 - - slli t5, a1, 1 - - add t4, a4, 1 // x10 -> t4 - - // col loop -col_loop: - mv t3, t6 - srli t2, a4, 1 - beqz t2, channel_last - add t1, t3, a1 - // kernel size loop -channel_loop2: - vlw.v v0,(t3) - vlw.v v1,(t1) - addi t2, t2, -1 - add t3, t3, t5 - add t1, t1, t5 - vsw.v v0, (a2) - addi a2, a2, 16 - vsw.v v1, (a2) - addi a2, a2, 16 - bnez t2, channel_loop2 - -channel_last: - beqz t4, channel_loop_end - vlw.v v0,(t3) - vsw.v v0, (a2) - addi a2, a2, 16 - -channel_loop_end: - addi t6, t6, 16 - addi a3, a3, -1 - bnez a3, col_loop - -col_end: - ld t0, 0(sp) - ld t1, 8(sp) - ld t2, 16(sp) - ld t3, 24(sp) - ld t4, 32(sp) - ld t5, 40(sp) - ld t6, 48(sp) - addi sp, sp, 56 - ret - .end diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_1x1.c b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_1x1.c new file mode 100644 index 000000000..a6ffb1ed7 --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_1x1.c @@ -0,0 +1,40 @@ +#include "vsetvl_rvv.h" + +// FIXME: optimize vectorize loop +void im2col_fp32_1x1(const float* input, const int input_xy, const int 
input_channels, float* col) +{ + vsetvl_e32_m2(); + + const float* c0 = input; + const float* c1 = input + input_xy; + const int input_xy_stride = 2 * input_xy; + + float* o0 = col; + float* o1 = col + 8; + + int c = 0; + for (; c < (input_channels & -2); c += 2) + { + __asm__( + "vle32.v v0, (%0); \n" + "vle32.v v2, (%1); \n" + "vse32.v v0, (%2); \n" + "vse32.v v2, (%3); \n" + : + : "r"(c0), "r"(c1), "r"(o0), "r"(o1) + : "memory"); + o0 += 16; + o1 += 16; + c0 += input_xy_stride; + c1 += input_xy_stride; + } + + if (c < input_channels) + { + __asm__("vle32.v v0, (%0);\n" + "vse32.v v0, (%1);\n" + : + : "r"(c0), "r"(o0) + : "memory"); + } +} diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_3x3.S b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_3x3.S deleted file mode 100644 index d928093c6..000000000 --- a/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_3x3.S +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: ddzhao@openailab.com - */ -// -// im2col fp16 for kernel 3x3 include 2 function stride 1 and stride 2 -// ABCDABCD -// -// input: -// x0 arg0 input address -// x1 arg1 input_x -// x2 arg2 input_y -// x3 arg3 input channel cnt -// x4 arg4 col address -// x5 arg5 stride_x -// -// register definition -// x0 cl0 address q0 q1 d16 d17 d18 -// x1 input_x x 4 -// x2 input_xy x 4 -// x3 input channel -// x4 col address -// x5 stride_x -// x11 cl1 address q2 q3 d19 d20 d21 -// x12 cl2 address q4 q5 d22 d23 d24 - - .section .text,"ax" - .align 5 - - .type im2col_fp32_3x3 STT_FUNC - .global im2col_fp32_3x3 - .hidden im2col_fp32_3x3 - -.balign 16 -mask_32b: - .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \ - 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff - -im2col_fp32_3x3: - addi sp, sp, -56 - sd t0, 0(sp) - sd t1, 8(sp) - sd t2, 16(sp) - sd t3, 24(sp) - sd t4, 32(sp) - sd t5, 40(sp) - sd t6, 48(sp) - vsetvli t0, a0, e32 - // initial - beqz a3, finish - li t0, 2 - slli a1, a1, 2 - mul a2, a2, a1 - add t5, a0, a1 - slli t1, a1, 1 - add t6, a0, t1 - li t2, 8 - beq a5, t0, stride2_channel_loop - -stride1_channel_loop: - vlw.v v0, (a0) - addi t0, a0, 16 - vlw.v v1, (t0) - vlw.v v2, (t5) - addi t0, t5, 16 - vlw.v v3, (t0) - vlw.v v4, (t6) - addi t0, t6, 16 - vlw.v v5, (t0) - - addi a3, a3, -1 - - addi t0, a0, 4 - vlw.v v16, (t0) - addi t0, a0, 8 - vlw.v v17, (t0) - add a0, a0, a2 - - addi t0, t5, 4 - vlw.v v19, (t0) - - addi t0, t5, 8 - vlw.v v20, (t0) - add t5, t5, a2 - addi t0, t6, 4 - vlw.v v22, (t0) - addi t0, t6, 8 - vlw.v v23, (t0) - add t6, t6, a2 - vsw.v v0, (a4) - addi a4, a4, 16 - vsw.v v16, (a4) - addi a4, a4, 16 - vsw.v v17, (a4) - addi a4, a4, 16 - vsw.v v2, (a4) - addi a4, a4, 16 - vsw.v v19, (a4) - addi a4, a4, 16 - vsw.v v20, (a4) - addi a4, a4, 16 - vsw.v v4, (a4) - addi a4, a4, 16 - vsw.v v22, (a4) - addi a4, a4, 16 - vsw.v v23, (a4) - addi a4, a4, 16 - bnez a3, stride1_channel_loop - j finish - 
-stride2_channel_loop: - la t0, mask_32b - vlw.v v0, (t0) - addi t0, a0, 0 - vlsw.v v16, (t0), t2 - addi t0, a0, 0x4 - vlsw.v v17, (t0), t2 - addi t0, a0, 32 - vlw.v v18, (t0) - vslidedown.vi v1, v16, 1 - vslideup.vi v2, v18, 3 - vmerge.vvm v18, v1, v2, v0 - - addi t0, t5, 0 - vlsw.v v19, (t0), t2 - addi t0, t5, 0x4 - vlsw.v v20, (t0), t2 - addi t0, t5, 0x20 - vlw.v v21, (t0) - vslidedown.vi v1, v19, 1 - vslideup.vi v2, v21, 3 - vmerge.vvm v21, v1, v2, v0 - - addi t0, t6, 0 - vlsw.v v22, (t0), t2 - addi t0, t6, 0x4 - vlsw.v v23, (t0), t2 - addi t0, t6, 0x20 - vlw.v v24, (t0) - vslidedown.vi v1, v22, 1 - vslideup.vi v2, v24, 3 - vmerge.vvm v24, v1, v2, v0 - - addi a3, a3, -1 - - vsw.v v16, (a4) - addi a4, a4, 0x10 - vsw.v v17, (a4) - addi a4, a4, 0x10 - vsw.v v18, (a4) - addi a4, a4, 0x10 - vsw.v v19, (a4) - addi a4, a4, 0x10 - vsw.v v20, (a4) - addi a4, a4, 0x10 - vsw.v v21, (a4) - addi a4, a4, 0x10 - vsw.v v22, (a4) - addi a4, a4, 0x10 - vsw.v v23, (a4) - addi a4, a4, 0x10 - vsw.v v24, (a4) - addi a4, a4, 0x10 - - add a0, a0, a2 - add t5, t5, a2 - add t6, t6, a2 - - bnez a3, stride2_channel_loop -finish: - ld t0, 0(sp) - ld t1, 8(sp) - ld t2, 16(sp) - ld t3, 24(sp) - ld t4, 32(sp) - ld t5, 40(sp) - ld t6, 48(sp) - addi sp, sp, 56 - ret - .end diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_3x3.c b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_3x3.c new file mode 100644 index 000000000..74f574057 --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_3x3.c @@ -0,0 +1,117 @@ +#include "vsetvl_rvv.h" + +void im2col_fp32_3x3(const float* input, const int input_x, const int input_y, const int input_channels, float* col, const int stride) +{ + vsetvl_e32_m2(); + const int in_xy = input_x * input_y; + const float* row0 = input; + const float* row1 = row0 + input_x; + const float* row2 = row1 + input_x; + float* cur_col = col; + + if (stride == 1) + { + for (int c = 0; c < input_channels; ++c) + { + asm("vle32.v v0, (%0);\n" + 
"vle32.v v2, (%1);\n" + "vle32.v v4, (%2);\n" + + "addi t0, %0, 4;\n" + "addi t1, %0, 8;\n" + + "vle32.v v6, (t0);\n" + "vle32.v v8, (t1);\n" + + "addi t0, %1, 4;\n" + "addi t1, %1, 8;\n" + + "vle32.v v10, (t0);\n" + "vle32.v v12, (t1);\n" + + "addi t0, %2, 4;\n" + "addi t1, %2, 8;\n" + + "vle32.v v14, (t0);\n" + "vle32.v v16, (t1);\n" + + "vse32.v v0, (%3);\n" + "addi t0, %3, 32;\n" + "vse32.v v6, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v8, (t0);\n" + "addi t0, t0, 32;\n" + + "vse32.v v2, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v10, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v12, (t0);\n" + "addi t0, t0, 32;\n" + + "vse32.v v4, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v14, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v16, (t0);\n" + "addi t0, t0, 32;\n" + : + : "r"(row0), "r"(row1), "r"(row2), "r"(cur_col) + : "t0", "t1", "memory"); + + row0 += in_xy; + row1 += in_xy; + row2 += in_xy; + cur_col += 72; + } + } + else + { + for (int c = 0; c < input_channels; ++c) + { + asm("li t0, 8;\n" + "vlse32.v v0, (%0), t0;\n" + "add t1, %0, 0x4;\n" + "vlse32.v v2, (t1), t0;\n" + "add t1, t1, 0x4;\n" + "vlse32.v v4, (t1), t0;\n" + + "vlse32.v v6, (%1), t0;\n" + "add t1, %1, 0x4;\n" + "vlse32.v v8, (t1), t0;\n" + "add t1, t1, 0x4;\n" + "vlse32.v v10, (t1), t0;\n" + + "vlse32.v v12, (%2), t0;\n" + "add t1, %2, 0x4;\n" + "vlse32.v v14, (t1), t0;\n" + "add t1, t1, 0x4;\n" + "vlse32.v v16, (t1), t0;\n" + + "vse32.v v0, (%3);\n" + "addi t0, %3, 32;\n" + "vse32.v v2, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v4, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v6, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v8, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v10, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v12, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v14, (t0);\n" + "addi t0, t0, 32;\n" + "vse32.v v16, (t0);\n" + : + : "r"(row0), "r"(row1), "r"(row2), "r"(cur_col) + : "t0", "t1", "memory"); + row0 += in_xy; + row1 += in_xy; + row2 += in_xy; + cur_col += 72; + } + } +} diff --git 
a/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_tile8.c b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_tile8.c new file mode 100644 index 000000000..295d16cbb --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/im2col_fp32_tile8.c @@ -0,0 +1,189 @@ +#include +extern void im2col_fp32_1x1(const float* input, const int input_xy, const int input_chan, float* col); +extern void im2col_fp32_3x3(const float* input, int w, int h, int channel, float* cur_col, int stride); + +static void trans_col(float* input, float* cur_col, int col_i, int in_c, int in_h, int in_w, int k_w, int k_h, int s_w, int s_h, int pad_w0, int pad_h0, int out_w, int out_h, int d_h, int d_w) +{ + const int in_xy = in_w * in_h; + int cnt_y[] = { + col_i / out_w, + (col_i + 1) / out_w, + (col_i + 2) / out_w, + (col_i + 3) / out_w, + (col_i + 4) / out_w, + (col_i + 5) / out_w, + (col_i + 6) / out_w, + (col_i + 7) / out_w, + }; + + int cnt_x[] = { + col_i - cnt_y[0] * out_w, + col_i - cnt_y[1] * out_w + 1, + col_i - cnt_y[2] * out_w + 2, + col_i - cnt_y[3] * out_w + 3, + col_i - cnt_y[4] * out_w + 4, + col_i - cnt_y[5] * out_w + 5, + col_i - cnt_y[6] * out_w + 6, + col_i - cnt_y[7] * out_w + 7, + }; + + int imx_start[] = { + cnt_x[0] * s_w - pad_w0, + cnt_x[1] * s_w - pad_w0, + cnt_x[2] * s_w - pad_w0, + cnt_x[3] * s_w - pad_w0, + cnt_x[4] * s_w - pad_w0, + cnt_x[5] * s_w - pad_w0, + cnt_x[6] * s_w - pad_w0, + cnt_x[7] * s_w - pad_w0, + }; + + int imy_start[] = { + cnt_y[0] * s_h - pad_h0, + cnt_y[1] * s_h - pad_h0, + cnt_y[2] * s_h - pad_h0, + cnt_y[3] * s_h - pad_h0, + cnt_y[4] * s_h - pad_h0, + cnt_y[5] * s_h - pad_h0, + cnt_y[6] * s_h - pad_h0, + cnt_y[7] * s_h - pad_h0, + }; + + for (int kch = 0; kch < in_c; kch++) + { + for (int ky = 0; ky < (k_h * d_h); ky += d_h) + { + for (int kx = 0; kx < (k_w * d_w); kx += d_w) + { + int imx[8] = { + imx_start[0] + kx, + imx_start[1] + kx, + imx_start[2] + kx, + imx_start[3] + kx, + imx_start[4] + kx, + imx_start[5] + kx, + imx_start[6] + 
kx, + imx_start[7] + kx, + }; + + int imy[8] = { + imy_start[0] + ky, + imy_start[1] + ky, + imy_start[2] + ky, + imy_start[3] + ky, + imy_start[4] + ky, + imy_start[5] + ky, + imy_start[6] + ky, + imy_start[7] + ky, + }; + + for (int i = 0; i < 8; ++i) + { + if (imx[i] >= 0 && imx[i] < in_w && imy[i] >= 0 && imy[i] < in_h) + { + *cur_col++ = *(input + in_xy * kch + in_w * imy[i] + imx[i]); + } + else + { + *cur_col++ = .0f; + } + } + } + } + } +} + +void im2col(float* input, float* col, int in_c, int in_w, int in_h, int k_w, int k_h, int s_w, int s_h, int d_w, + int d_h, int pad_w0, int pad_w1, int pad_h0, int pad_h1, int out_w, int out_h, int num_thread) +{ + const int kernel_size = k_w * k_h * in_c; + const int in_xy = in_w * in_h; + const int out_xy = out_w * out_h; + const int col_end7 = out_xy & 7; + const int is_pad0 = !(pad_h0 || pad_w0 || pad_h1 || pad_w1); + + if (k_w == 1 && k_h == 1 && s_w == 1 && s_h == 1) + { +#pragma omp parallel for num_threads(num_thread) + int col_i = 0; + for (; col_i < (out_xy & -8); col_i += 8) + { + float* cur_col = col + col_i * kernel_size; + + int imy0 = col_i / out_w; + int imy7 = (col_i + 7) / out_w; + int imx0 = col_i - imy0 * out_w; + int imx7 = (col_i + 7) - imy7 * out_w; + + int imx_start = imx0 * s_w - pad_w0; + int imx_end = imx7 * s_w - pad_w0; + int imy_start = imy0 * s_h - pad_h0; + int imy_end = imy7 * s_h - pad_h0; + + // is pad ? 
+ if (imy0 == imy7 && (is_pad0 || (imx_start >= 0 && imx_end < in_w && imy_start >= 0 && imy_end < in_h))) + { + const float* cur_input = input + imy_start * in_w + imx_start; + im2col_fp32_1x1(cur_input, in_xy, in_c, cur_col); + } + else + { + trans_col(input, cur_col, col_i, in_c, in_h, in_w, k_w, k_h, s_w, s_h, pad_w0, pad_h0, out_w, out_h, d_h, d_w); + } + } + + if (col_end7) + { + float* cur_col = col + col_i * kernel_size; + trans_col(input, cur_col, col_i, in_c, in_h, in_w, k_w, k_h, s_w, s_h, pad_w0, pad_h0, out_w, out_h, d_h, d_w); + } + } + else if (d_w == 1 && d_h == 1 && k_w == 3 && k_h == 3 && s_w == s_h) + { + int col_i = 0; + for (; col_i < (out_xy & -8); col_i += 8) + { + float* cur_col = col + col_i * kernel_size; + int imy0 = col_i / out_w; + int imy7 = (col_i + 7) / out_w; + int imx0 = col_i - imy0 * out_w; + int imx7 = (col_i + 7) - imy7 * out_w; + + int imx_start = imx0 * s_w - pad_w0; + int imx_end = imx7 * s_w - pad_w0; + int imy_start = imy0 * s_h - pad_h0; + int imy_end = imy7 * s_h - pad_h0; + if ((imy0 == imy7) && (is_pad0 || (imx_start >= 0 && imx_end < in_w - 8 && imy_start >= 0 && imy_end + 2 < in_h))) + { + float* cur_input = input + imy_start * in_w + imx_start; + im2col_fp32_3x3(cur_input, in_w, in_h, in_c, cur_col, s_w); + cur_col += 8 * kernel_size; + } + else + { + trans_col(input, cur_col, col_i, in_c, in_h, in_w, k_w, k_h, s_w, s_h, pad_w0, pad_h0, out_w, out_h, d_h, d_w); + } + } + + if (col_end7) + { + float* cur_col = col + col_i * kernel_size; + trans_col(input, cur_col, col_i, in_c, in_h, in_w, k_w, k_h, s_w, s_h, pad_w0, pad_h0, out_w, out_h, d_h, d_w); + } + } + else + { + int col_i = 0; + for (; col_i < (out_xy & -8); col_i += 8) + { + float* cur_col = col + col_i * kernel_size; + trans_col(input, cur_col, col_i, in_c, in_h, in_w, k_w, k_h, s_w, s_h, pad_w0, pad_h0, out_w, out_h, d_h, d_w); + } + + if (col_end7) + { + float* cur_col = col + col_i * kernel_size; + trans_col(input, cur_col, col_i, in_c, in_h, in_w, k_w, 
k_h, s_w, s_h, pad_w0, pad_h0, out_w, out_h, d_h, d_w); + } + } +} diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_4x16.S b/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_4x16.S deleted file mode 100644 index b8b7431ea..000000000 --- a/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_4x16.S +++ /dev/null @@ -1,690 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: ddzhao@openailab.com -*/ -// -// 4*16 single precise floating point matric multiplication -// -// -- -- -- -- -- -- -- -- -// | i0 - - - - - - | | k0 k1 .. kf | | b0 b1 .. bf | | i0k0 i0k1 .. i0kf | -// | | | . . . . | | | | | -// | i1 - - - - - - | | . . . . | | b0 b1 . bf | | i1k0 i1k1 .. i1kf | -// | | x | . . . . | + | | = | | -// | i2 - - - - - - | | . . . . | | b0 b1 . bf | | i2k0 i2k1 .. i2kf | -// | | | . . . . | | | | | -// | i3 - - - - - - | | . . . . | | b0 b1 . bf | | i3k0 i3k1 .. 
i3kf | -// -- -- -- -- -- -- -- -- -// input 4 x p kernel p x 16 biases 4 x 16 output 4 x 16 p = kernel size -// -// -// load 4 more input and 8 more kernel to improve loop performance -// -// input: -// x0 arg0 biases address {b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,b10,b11,b12,b13,b14,b15} nullptr means no biases -// x1 arg1 input address {i[0-3][0],i1[0-3][1],i[0-3][2],i[0-3][3],i[0-3][4],...} -// x2 arg2 kernel address {k[0-15][0],k[0-15][1],k[0-15][2],k[0-15][3],...} -// x3 arg3 kernel size -// x4 arg4 output address -// indirect save: output {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3],i[0-3]k[4]..} -// direct save: output : {i0k0 i1k0 i2k0 i3k0} -// output + ouput_xy : {i0k1 i1k1 i2k1 i3k1} -// output + ouput_xy * 2 : {i0k2 i1k2 i2k2 i3k2} -// ... -// output + ouput_xy * 15 : {i0k15 i1k15 i2k15 i3k15} -// x5 arg5 output xy -// x6 arg6 activation flag activation layers is integrated after convolution -// -// output: no -// -// register definition -// x0 biases start address -// x1 input start address -// x2 kernel start address -// x3 kernal size -// x4 output start address -// x5 output_x * output_y -// x6 activation flag -// x9 ~ x10 temp loop counter -// x11~ x13 temp output save address -// x14 output_xy * 4 -// x7~8 x15 not used -// x9 t1 -// x10 t2 -// x11 t3 -// x12 t4 -// x13 t5 -// x14 t6 -// -// v0~1 4S data of input0 {i3 i2 i1 i0} -// v2~3 not used -// v4 4S kernal data {k3 | k2 | k1 | k0} -// v5 4S kernal data {k7 | k6 | k5 | k4} -// v6 4S kernal data {kb | ka | k9 | k8} -// v7 4S kernal data {kf | ke | kd | kc} -// v8~15 not used -// v16 dot product for {i3k0, i2k0, i1k0, i0k0} -// v17 dot product for {i3k1, i2k1, i1k1, i0k1} -// v18 dot product for {i3k2, i2k2, i1k2, i0k2} -// v19 dot product for {i3k3, i2k3, i1k3, i0k3} -// v20 dot product for {i3k4, i2k4, i1k4, i0k4} -// v21 dot product for {i3k5, i2k5, i1k5, i0k5} -// v22 dot product for {i3k6, i2k6, i1k6, i0k6} -// v23 dot product for {i3k7, i2k7, i1k7, i0k7} -// v24 dot product for {i3k8, i2k8, i1k8, 
i0k8} -// v25 dot product for {i3k9, i2k9, i1k9, i0k9} -// v26 dot product for {i3ka, i2ka, i1ka, i0ka} -// v27 dot product for {i3kb, i2kb, i1kb, i0kb} -// v28 dot product for {i3kc, i2kc, i1kc, i0kc} -// v29 dot product for {i3kd, i2kd, i1kd, i0kd} -// v30 dot product for {i3ke, i2ke, i1ke, i0ke} -// v31 dot product for {i3kf, i2kf, i1kf, i0kf} - - .section .text,"ax" - .align 5 - - .type sgemm_4x16_rv64 STT_FUNC - .global sgemm_4x16_rv64 - .hidden sgemm_4x16_rv64 -sgemm_4x16_rv64: - addi sp, sp, -56 - sd t0, 0(sp) - sd t1, 8(sp) - sd t2, 16(sp) - sd t3, 24(sp) - sd t4, 32(sp) - sd t5, 40(sp) - sd t6, 48(sp) - vsetvli t0, t1, e32 -# // biases_initial - beqz a0, none_biases - vlw.v v0, (a0) - vrgather.vi v16, v0, 0 - vrgather.vi v17, v0, 1 - vrgather.vi v18, v0, 2 - vrgather.vi v19, v0, 3 - addi a0, a0, 0x10 - vlw.v v0, (a0) - vrgather.vi v20, v0, 0 - vrgather.vi v21, v0, 1 - vrgather.vi v22, v0, 2 - vrgather.vi v23, v0, 3 - addi a0, a0, 0x10 - vlw.v v0, (a0) - vrgather.vi v24, v0, 0 - vrgather.vi v25, v0, 1 - vrgather.vi v26, v0, 2 - vrgather.vi v27, v0, 3 - addi a0, a0, 0x10 - vlw.v v0, (a0) - vrgather.vi v28, v0, 0 - vrgather.vi v29, v0, 1 - vrgather.vi v30, v0, 2 - vrgather.vi v31, v0, 3 - - j convolution_start - -none_biases: - vmv.v.x v16, x0 - vmv.v.x v17, x0 - vmv.v.x v18, x0 - vmv.v.x v19, x0 - vmv.v.x v20, x0 - vmv.v.x v21, x0 - vmv.v.x v22, x0 - vmv.v.x v23, x0 - vmv.v.x v24, x0 - vmv.v.x v25, x0 - vmv.v.x v26, x0 - vmv.v.x v27, x0 - vmv.v.x v28, x0 - vmv.v.x v29, x0 - vmv.v.x v30, x0 - vmv.v.x v31, x0 - -convolution_start: - vlw.v v0, (a1) - addi t0, a2, 0 - vlw.v v4, (t0) - addi t0, a2, 0x10 - vlw.v v5, (t0) - - andi t2, a3, 0x3 - slli a5, a5, 0x2 - bltz t2, loop4_end - srli t1, a3, 0x2 - -// main loop each loop generate dot prodcut for 4x16x4SP -loop4: - addi t1, t1, -1 - addi t0, a2, 0x20 - vlw.v v6, (t0) - addi t0, a2, 0x30 - vlw.v v7, (t0) - - vrgather.vi v8, v4, 0 - vrgather.vi v9, v4, 1 - vrgather.vi v10, v4, 2 - vrgather.vi v11, v4, 3 - 
vfmacc.vv v16, v0, v8 - vfmacc.vv v17, v0, v9 - vfmacc.vv v18, v0, v10 - vfmacc.vv v19, v0, v11 - - addi t0, a1, 0x10 - vlw.v v1, (t0) - - vrgather.vi v8, v5, 0 - vrgather.vi v9, v5, 1 - vrgather.vi v10, v5, 2 - vrgather.vi v11, v5, 3 - vfmacc.vv v20, v0, v8 - vfmacc.vv v21, v0, v9 - vfmacc.vv v22, v0, v10 - vfmacc.vv v23, v0, v11 - - addi t0, a2, 0x40 - vlw.v v4, (t0) - addi t0, a2, 0x50 - vlw.v v5, (t0) - - vrgather.vi v8, v6, 0 - vrgather.vi v9, v6, 1 - vrgather.vi v10, v6, 2 - vrgather.vi v11, v6, 3 - vfmacc.vv v24, v0, v8 - vfmacc.vv v25, v0, v9 - vfmacc.vv v26, v0, v10 - vfmacc.vv v27, v0, v11 - - vrgather.vi v8, v7, 0 - vrgather.vi v9, v7, 1 - vrgather.vi v10, v7, 2 - vrgather.vi v11, v7, 3 - vfmacc.vv v28, v0, v8 - vfmacc.vv v29, v0, v9 - vfmacc.vv v30, v0, v10 - vfmacc.vv v31, v0, v11 - - addi t0, a2, 0x60 - vlw.v v6, (t0) - addi t0, a2, 0x70 - vlw.v v7, (t0) - - vrgather.vi v8, v4, 0 - vrgather.vi v9, v4, 1 - vrgather.vi v10, v4, 2 - vrgather.vi v11, v4, 3 - vfmacc.vv v16, v1, v8 - vfmacc.vv v17, v1, v9 - vfmacc.vv v18, v1, v10 - vfmacc.vv v19, v1, v11 - - addi t0, a1, 0x20 - vlw.v v0, (t0) - - vrgather.vi v8, v5, 0 - vrgather.vi v9, v5, 1 - vrgather.vi v10, v5, 2 - vrgather.vi v11, v5, 3 - vfmacc.vv v20, v1, v8 - vfmacc.vv v21, v1, v9 - vfmacc.vv v22, v1, v10 - vfmacc.vv v23, v1, v11 - - addi t0, a2, 0x80 - vlw.v v4, (t0) - addi t0, a2, 0x90 - vlw.v v5, (t0) - - vrgather.vi v8, v6, 0 - vrgather.vi v9, v6, 1 - vrgather.vi v10, v6, 2 - vrgather.vi v11, v6, 3 - vfmacc.vv v24, v1, v8 - vfmacc.vv v25, v1, v9 - vfmacc.vv v26, v1, v10 - vfmacc.vv v27, v1, v11 - - vrgather.vi v8, v7, 0 - vrgather.vi v9, v7, 1 - vrgather.vi v10, v7, 2 - vrgather.vi v11, v7, 3 - vfmacc.vv v28, v1, v8 - vfmacc.vv v29, v1, v9 - vfmacc.vv v30, v1, v10 - vfmacc.vv v31, v1, v11 - - addi t0, a2, 0xa0 - vlw.v v6, (t0) - addi t0, a2, 0xb0 - vlw.v v7, (t0) - - vrgather.vi v8, v4, 0 - vrgather.vi v9, v4, 1 - vrgather.vi v10, v4, 2 - vrgather.vi v11, v4, 3 - vfmacc.vv v16, v0, v8 - vfmacc.vv 
v17, v0, v9 - vfmacc.vv v18, v0, v10 - vfmacc.vv v19, v0, v11 - - addi t0, a1, 0x30 - vlw.v v1, (t0) - addi a1, a1, 0x40 - - vrgather.vi v8, v5, 0 - vrgather.vi v9, v5, 1 - vrgather.vi v10, v5, 2 - vrgather.vi v11, v5, 3 - vfmacc.vv v20, v0, v8 - vfmacc.vv v21, v0, v9 - vfmacc.vv v22, v0, v10 - vfmacc.vv v23, v0, v11 - - addi t0, a2, 0xc0 - vlw.v v4, (t0) - addi t0, a2, 0xd0 - vlw.v v5, (t0) - - vrgather.vi v8, v6, 0 - vrgather.vi v9, v6, 1 - vrgather.vi v10, v6, 2 - vrgather.vi v11, v6, 3 - vfmacc.vv v24, v0, v8 - vfmacc.vv v25, v0, v9 - vfmacc.vv v26, v0, v10 - vfmacc.vv v27, v0, v11 - - vrgather.vi v8, v7, 0 - vrgather.vi v9, v7, 1 - vrgather.vi v10, v7, 2 - vrgather.vi v11, v7, 3 - vfmacc.vv v28, v0, v8 - vfmacc.vv v29, v0, v9 - vfmacc.vv v30, v0, v10 - vfmacc.vv v31, v0, v11 - - addi t0, a2, 0xe0 - vlw.v v6, (t0) - addi t0, a2, 0xf0 - vlw.v v7, (t0) - addi a2, a2, 0x100 - vrgather.vi v8, v4, 0 - vrgather.vi v9, v4, 1 - vrgather.vi v10, v4, 2 - vrgather.vi v11, v4, 3 - vfmacc.vv v16, v1, v8 - vfmacc.vv v17, v1, v9 - vfmacc.vv v18, v1, v10 - vfmacc.vv v19, v1, v11 - - vlw.v v0, (a1) - - vrgather.vi v8, v5, 0 - vrgather.vi v9, v5, 1 - vrgather.vi v10, v5, 2 - vrgather.vi v11, v5, 3 - vfmacc.vv v20, v1, v8 - vfmacc.vv v21, v1, v9 - vfmacc.vv v22, v1, v10 - vfmacc.vv v23, v1, v11 - - addi t0, a2, 0x0 - vlw.v v4, (t0) - addi t0, a2, 0x10 - vlw.v v5, (t0) - - vrgather.vi v8, v6, 0 - vrgather.vi v9, v6, 1 - vrgather.vi v10, v6, 2 - vrgather.vi v11, v6, 3 - vfmacc.vv v24, v1, v8 - vfmacc.vv v25, v1, v9 - vfmacc.vv v26, v1, v10 - vfmacc.vv v27, v1, v11 - - vrgather.vi v8, v7, 0 - vrgather.vi v9, v7, 1 - vrgather.vi v10, v7, 2 - vrgather.vi v11, v7, 3 - vfmacc.vv v28, v1, v8 - vfmacc.vv v29, v1, v9 - vfmacc.vv v30, v1, v10 - vfmacc.vv v31, v1, v11 - bnez t1, loop4 - -loop4_end: - slli t6, a5, 2 - beqz t2, activation - -loop1: - addi t0, a2, 0x20 - vlw.v v6, (t0) - addi t0, a2, 0x30 - vlw.v v7, (t0) - addi a2, a2, 0x40 - vrgather.vi v8, v4, 0 - vrgather.vi v9, v4, 1 - 
vrgather.vi v10, v4, 2 - vrgather.vi v11, v4, 3 - vfmacc.vv v16, v0, v8 - vfmacc.vv v17, v0, v9 - vfmacc.vv v18, v0, v10 - vfmacc.vv v19, v0, v11 - addi a1, a1, 0x10 - addi t2, t2, -1 - vrgather.vi v8, v5, 0 - vrgather.vi v9, v5, 1 - vrgather.vi v10, v5, 2 - vrgather.vi v11, v5, 3 - vfmacc.vv v20, v0, v8 - vfmacc.vv v21, v0, v9 - vfmacc.vv v22, v0, v10 - vfmacc.vv v23, v0, v11 - addi t0, a2, 0x0 - vlw.v v4, (t0) - addi t0, a2, 0x10 - vlw.v v5, (t0) - vrgather.vi v8, v6, 0 - vrgather.vi v9, v6, 1 - vrgather.vi v10, v6, 2 - vrgather.vi v11, v6, 3 - vfmacc.vv v24, v0, v8 - vfmacc.vv v25, v0, v9 - vfmacc.vv v26, v0, v10 - vfmacc.vv v27, v0, v11 - vrgather.vi v8, v7, 0 - vrgather.vi v9, v7, 1 - vrgather.vi v10, v7, 2 - vrgather.vi v11, v7, 3 - vfmacc.vv v28, v0, v8 - vfmacc.vv v29, v0, v9 - vfmacc.vv v30, v0, v10 - vfmacc.vv v31, v0, v11 - - vlw.v v0, (a1) - bnez t2, loop1 - -activation: - add t3, a4, a5 - bltz a6, save_result - vmv.v.x v0, x0 - vmv.v.x v0, a6 // FIXME: change DataType - vfmax.vv v16, v16, v0 - vfmax.vv v17, v17, v0 - vfmax.vv v18, v18, v0 - vfmax.vv v19, v19, v0 - vfmax.vv v20, v20, v0 - vfmax.vv v21, v21, v0 - vfmax.vv v22, v22, v0 - vfmax.vv v23, v23, v0 - vfmax.vv v24, v24, v0 - vfmax.vv v25, v25, v0 - vfmax.vv v26, v26, v0 - vfmax.vv v27, v27, v0 - vfmax.vv v28, v28, v0 - vfmax.vv v29, v29, v0 - vfmax.vv v30, v30, v0 - vfmax.vv v31, v31, v0 - - beqz a6, save_result - vfmin.vv v16, v16, v1 - vfmin.vv v17, v17, v1 - vfmin.vv v18, v18, v1 - vfmin.vv v19, v19, v1 - vfmin.vv v20, v20, v1 - vfmin.vv v21, v21, v1 - vfmin.vv v22, v22, v1 - vfmin.vv v23, v23, v1 - vfmin.vv v24, v24, v1 - vfmin.vv v25, v25, v1 - vfmin.vv v26, v26, v1 - vfmin.vv v27, v27, v1 - vfmin.vv v28, v28, v1 - vfmin.vv v29, v29, v1 - vfmin.vv v30, v30, v1 - vfmin.vv v31, v31, v1 - -save_result: - slli t0, a5, 1 - add t4, a4, t0 - add t5, t3, t0 -# // store result - beqz a7, save_result_nchw - li t1, 0 - vext.x.v t0, v16, t1 - sw t0, 0(a4) - vext.x.v t0, v17, t1 - sw t0, 4(a4) - 
vext.x.v t0, v18, t1 - sw t0, 8(a4) - vext.x.v t0, v19, t1 - sw t0, 12(a4) - add a4, a4, 0x10 - - li t1, 1 - vext.x.v t0, v16, t1 - sw t0, 0(t3) - vext.x.v t0, v17, t1 - sw t0, 4(t3) - vext.x.v t0, v18, t1 - sw t0, 8(t3) - vext.x.v t0, v19, t1 - sw t0, 12(t3) - add t3, t3, 0x10 - - li t1, 2 - vext.x.v t0, v16, t1 - sw t0, 0(t4) - vext.x.v t0, v17, t1 - sw t0, 4(t4) - vext.x.v t0, v18, t1 - sw t0, 8(t4) - vext.x.v t0, v19, t1 - sw t0, 12(t4) - add t4, t4, 0x10 - - li t1, 3 - vext.x.v t0, v16, t1 - sw t0, 0(t5) - vext.x.v t0, v17, t1 - sw t0, 4(t5) - vext.x.v t0, v18, t1 - sw t0, 8(t5) - vext.x.v t0, v19, t1 - sw t0, 12(t5) - add t5, t5, 0x10 - - li t1, 0 - vext.x.v t0, v20, t1 - sw t0, 0(a4) - vext.x.v t0, v21, t1 - sw t0, 4(a4) - vext.x.v t0, v22, t1 - sw t0, 8(a4) - vext.x.v t0, v23, t1 - sw t0, 12(a4) - add a4, a4, 0x10 - - li t1, 1 - vext.x.v t0, v20, t1 - sw t0, 0(t3) - vext.x.v t0, v21, t1 - sw t0, 4(t3) - vext.x.v t0, v22, t1 - sw t0, 8(t3) - vext.x.v t0, v23, t1 - sw t0, 12(t3) - add t3, t3, 0x10 - - li t1, 2 - vext.x.v t0, v20, t1 - sw t0, 0(t4) - vext.x.v t0, v21, t1 - sw t0, 4(t4) - vext.x.v t0, v22, t1 - sw t0, 8(t4) - vext.x.v t0, v23, t1 - sw t0, 12(t4) - add t3, t3, 0x10 - - li t1, 3 - vext.x.v t0, v20, t1 - sw t0, 0(t5) - vext.x.v t0, v21, t1 - sw t0, 4(t5) - vext.x.v t0, v22, t1 - sw t0, 8(t5) - vext.x.v t0, v23, t1 - sw t0, 12(t5) - add t5, t5, 0x10 - - li t1, 0 - vext.x.v t0, v24, t1 - sw t0, 0(a4) - vext.x.v t0, v25, t1 - sw t0, 4(a4) - vext.x.v t0, v26, t1 - sw t0, 8(a4) - vext.x.v t0, v27, t1 - sw t0, 12(a4) - add a4, a4, 0x10 - - li t1, 1 - vext.x.v t0, v24, t1 - sw t0, 0(t3) - vext.x.v t0, v25, t1 - sw t0, 4(t3) - vext.x.v t0, v26, t1 - sw t0, 8(t3) - vext.x.v t0, v27, t1 - sw t0, 12(t3) - add t3, t3, 0x10 - - li t1, 2 - vext.x.v t0, v24, t1 - sw t0, 0(t4) - vext.x.v t0, v25, t1 - sw t0, 4(t4) - vext.x.v t0, v26, t1 - sw t0, 8(t4) - vext.x.v t0, v27, t1 - sw t0, 12(t4) - add t3, t3, 0x10 - - li t1, 3 - vext.x.v t0, v24, t1 - sw t0, 0(t5) - 
vext.x.v t0, v25, t1 - sw t0, 4(t5) - vext.x.v t0, v26, t1 - sw t0, 8(t5) - vext.x.v t0, v27, t1 - sw t0, 12(t5) - add t5, t5, 0x10 - - li t1, 0 - vext.x.v t0, v28, t1 - sw t0, 0(a4) - vext.x.v t0, v29, t1 - sw t0, 4(a4) - vext.x.v t0, v30, t1 - sw t0, 8(a4) - vext.x.v t0, v31, t1 - sw t0, 12(a4) - - li t1, 1 - vext.x.v t0, v28, t1 - sw t0, 0(t3) - vext.x.v t0, v29, t1 - sw t0, 4(t3) - vext.x.v t0, v30, t1 - sw t0, 8(t3) - vext.x.v t0, v31, t1 - sw t0, 12(t3) - - li t1, 2 - vext.x.v t0, v28, t1 - sw t0, 0(t4) - vext.x.v t0, v29, t1 - sw t0, 4(t4) - vext.x.v t0, v30, t1 - sw t0, 8(t4) - vext.x.v t0, v31, t1 - sw t0, 12(t4) - - li t1, 3 - vext.x.v t0, v28, t1 - sw t0, 0(t5) - vext.x.v t0, v29, t1 - sw t0, 4(t5) - vext.x.v t0, v30, t1 - sw t0, 8(t5) - vext.x.v t0, v31, t1 - sw t0, 12(t5) - - j end - -save_result_nchw: - vsw.v v16, (a4) - add a4, a4, t6 - vsw.v v17, (t3) - add t3, t3, t6 - vsw.v v18, (t4) - add t4, t4, t6 - vsw.v v19, (t5) - add t5, t5, t6 - - vsw.v v20, (a4) - add a4, a4, t6 - vsw.v v21, (t3) - add t3, t3, t6 - vsw.v v22, (t4) - add t4, t4, t6 - vsw.v v23, (t5) - add t5, t5, t6 - - vsw.v v24, (a4) - add a4, a4, t6 - vsw.v v25, (t3) - add t3, t3, t6 - vsw.v v26, (t4) - add t4, t4, t6 - vsw.v v27, (t5) - add t5, t5, t6 - - vsw.v v28, (a4) - vsw.v v29, (t3) - vsw.v v30, (t4) - vsw.v v31, (t5) - -end: - ld t0, 0(sp) - ld t1, 8(sp) - ld t2, 16(sp) - ld t3, 24(sp) - ld t4, 32(sp) - ld t5, 40(sp) - ld t6, 48(sp) - addi sp, sp, 56 - ret - .end \ No newline at end of file diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_4x4.S b/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_4x4.S deleted file mode 100644 index c9ce7b8c8..000000000 --- a/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_4x4.S +++ /dev/null @@ -1,272 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: ddzhao@openailab.com - */ -// -// 4*4 single precise floating point matric multiplication -// -// -- -- -- -- -- -- -- -- -// | i0 - - - - - - | | k0 k1 k2 k3 | | b0 b1 b2 b3 | | i0k0 i0k1 i0k2 i0k3 | -// | | | . . . . | | | | | -// | i1 - - - - - - | | . . . . | | b0 b1 b2 b3 | | i1k0 i1k1 i1k2 i1k3 | -// | | x | . . . . | + | | = | | -// | i2 - - - - - - | | . . . . | | b0 b1 b2 b3 | | i2k0 i2k1 i2k2 i2k3 | -// | | | . . . . | | | | | -// | i3 - - - - - - | | . . . . 
| | b0 b1 b2 b3 | | i3k0 i3k1 i3k2 i3k3 | -// -- -- -- -- -- -- -- -- -// input 4 x p kernel p x 4 biases 4 x 4 output 4 x 4 p = kernel size -// -// -// -// input: -// x0 arg0 biases address {b0,b1,b2,b3} nullptr means no biases -// x1 arg1 input address {i[0-3][0],i1[0-3][1],i[0-3][2],i[0-3][3],i[0-3][4],...} -// x2 arg2 kernel address {k[0-3][0],k[0-3][1],k[0-3][2],k[0-3][3],...} -// x3 arg3 kernel size -// x4 arg4 output address -// indirect save: output {i[0-3]k[0],i[0-3]k[1],i[0-3]k[2],i[0-3]k[3]} -// direct save: output : {i0k0 i1k0 i2k0 i3k0} -// output + ouput_xy : {i0k1 i1k1 i2k1 i3k1} -// output + ouput_xy * 2 : {i0k2 i1k2 i2k2 i3k2} -// output + ouput_xy * 3 : {i0k3 i1k3 i2k3 i3k3} -// x5 arg5 output xy -// x6 arg6 activation flag relu layers is integrated after convolution -// -// output: no -// -// register definition -// x0 biases start address -// x1 input start address -// x2 kernel start address -// x3 kernal size -// x4 output start address -// x5 output_x * output_y -// x6 fused relu flag -// x9 ~ x10 temp loop counter -// x11~ x13 temp output save address -// x7~8 14~15 not used - -// -// v0-3 4S data of input0 {i3 i2 i1 i0} -// v4-7 4S kernal data {k3 k2 k1 k0} -// v8~v15 not used -// v16 dot product for {i3k0, i2k0, i1k0, i0k0} -// v17 dot product for {i3k1, i2k1, i1k1, i0k1} -// v18 dot product for {i3k2, i2k2, i1k2, i0k2} -// v19 dot product for {i3k3, i2k3, i1k3, i0k3} -// v20~V31 not used - - .section .text,"ax" - .align 5 - - .type sgemm_4x4_rv64 STT_FUNC - .global sgemm_4x4_rv64 - .hidden sgemm_4x4_rv64 -sgemm_4x4_rv64: - slli a5, a5, 0x2 -# // initial biases - beqz a0, non_biases - vsetvli t0, a0, e32 - vlw.v v0, (a0) - vrgather.vi v16, v0, 0 - vrgather.vi v17, v0, 1 - vrgather.vi v18, v0, 2 - vrgather.vi v19, v0, 3 - - j convoluation_start - -non_biases: - vmv.v.x v16, x0 - vmv.v.x v17, x0 - vmv.v.x v18, x0 - vmv.v.x v19, x0 - -convoluation_start: - add t4, a4, a5 - - andi t3, a3, 0x3 - - li t0, 4 - blt a3, t0, loop4_end - srli t2, a3, 
0x2 - -// main loop: each loop generate dot prodcut for 4x4SFP -loop4: - addi t2, t2, -1 - - vlw.v v0, (a1) - addi a1, a1, 16 - vlw.v v1, (a1) - addi a1, a1, 16 - vlw.v v2, (a1) - addi a1, a1, 16 - vlw.v v3, (a1) - addi a1, a1, 16 - - vlw.v v4, (a2) - addi a2, a2, 16 - vlw.v v5, (a2) - addi a2, a2, 16 - vlw.v v6, (a2) - addi a2, a2, 16 - vlw.v v7, (a2) - addi a2, a2, 16 - - vrgather.vi v20, v4, 0 - vrgather.vi v21, v4, 1 - vrgather.vi v22, v4, 2 - vrgather.vi v23, v4, 3 - vfmacc.vv v16, v20, v0 - vfmacc.vv v17, v21, v0 - vfmacc.vv v18, v22, v0 - vfmacc.vv v19, v23, v0 - - vrgather.vi v20, v5, 0 - vrgather.vi v21, v5, 1 - vrgather.vi v22, v5, 2 - vrgather.vi v23, v5, 3 - vfmacc.vv v16, v20, v1 - vfmacc.vv v17, v21, v1 - vfmacc.vv v18, v22, v1 - vfmacc.vv v19, v23, v1 - - vrgather.vi v20, v6, 0 - vrgather.vi v21, v6, 1 - vrgather.vi v22, v6, 2 - vrgather.vi v23, v6, 3 - vfmacc.vv v16, v20, v2 - vfmacc.vv v17, v21, v2 - vfmacc.vv v18, v22, v2 - vfmacc.vv v19, v23, v2 - - vrgather.vi v20, v7, 0 - vrgather.vi v21, v7, 1 - vrgather.vi v22, v7, 2 - vrgather.vi v23, v7, 3 - vfmacc.vv v16, v20, v3 - vfmacc.vv v17, v21, v3 - vfmacc.vv v18, v22, v3 - vfmacc.vv v19, v23, v3 - - bnez t2, loop4 - -loop4_end: - slli t0, a5, 1 - add t5, a4, t0 - beqz t3, activation - -loop1: - addi t3, t3, -1 - - vlw.v v0, (a1) - addi a1, a1, 16 - - vlw.v v4, (a2) - addi a2, a2, 16 - - vrgather.vi v20, v4, 0 - vrgather.vi v21, v4, 1 - vrgather.vi v22, v4, 2 - vrgather.vi v23, v4, 3 - vfmacc.vv v16, v20, v0 - vfmacc.vv v17, v21, v0 - vfmacc.vv v18, v22, v0 - vfmacc.vv v19, v23, v0 - - bnez t3, loop1 - - -activation: - slli t0, a5, 1 - add t6, t4, t0 - - bltz a6, save_result - - vmv.v.i v0, 0 - vmv.v.x v1, a6 - - vfmax.vv v16, v16, v0 - vfmax.vv v17, v17, v0 - vfmax.vv v18, v18, v0 - vfmax.vv v19, v19, v0 - - beqz a6, save_result - vfmin.vv v16, v16, v1 - vfmin.vv v17, v17, v1 - vfmin.vv v18, v18, v1 - vfmin.vv v19, v19, v1 - -save_result: -# // store result - beqz a7, save_result_nchw - - li t1, 0 
- vext.x.v t0, v16, t1 - sw t0, 0(a4) - vext.x.v t0, v17, t1 - sw t0, 4(a4) - vext.x.v t0, v18, t1 - sw t0, 8(a4) - vext.x.v t0, v19, t1 - sw t0, 12(a4) - - li t1, 1 - vext.x.v t0, v16, t1 - sw t0, 0(t4) - vext.x.v t0, v17, t1 - sw t0, 4(t4) - vext.x.v t0, v18, t1 - sw t0, 8(t4) - vext.x.v t0, v19, t1 - sw t0, 12(t4) - - li t1, 2 - vext.x.v t0, v16, t1 - sw t0, 0(t5) - vext.x.v t0, v17, t1 - sw t0, 4(t5) - vext.x.v t0, v18, t1 - sw t0, 8(t5) - vext.x.v t0, v19, t1 - sw t0, 12(t5) - - li t1, 3 - vext.x.v t0, v16, t1 - sw t0, 0(t6) - vext.x.v t0, v17, t1 - sw t0, 4(t6) - vext.x.v t0, v18, t1 - sw t0, 8(t6) - vext.x.v t0, v19, t1 - sw t0, 12(t6) - j end - -save_result_nchw: - vsw.v v16, (a4) - vsw.v v17, (t4) - vsw.v v18, (t5) - vsw.v v19, (t6) - -end: - ret - .end - diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_8x8.c b/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_8x8.c new file mode 100644 index 000000000..832123b97 --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/sgemm_8x8.c @@ -0,0 +1,308 @@ +#include "vsetvl_rvv.h" + +void sgemm_8x8_rv64(const float* cur_col, const float* cur_kernel, const float* bias, const int act, float* cur_output, const int output_xy, const int kernel_size, const int n) +{ + vsetvl_e32_m2(); + + // v16 ~ v30: result of c0 ~ v7 + if (bias) + { + asm("vle32.v v0, (%0);\n" + "vrgather.vi v16, v0, 0;\n" + "vrgather.vi v18, v0, 1;\n" + "vrgather.vi v20, v0, 2;\n" + "vrgather.vi v22, v0, 3;\n" + "vrgather.vi v24, v0, 4;\n" + "vrgather.vi v26, v0, 5;\n" + "vrgather.vi v28, v0, 6;\n" + "vrgather.vi v30, v0, 7;\n" + : + : "r"(bias)); + } + else + { + asm( + "vmv.v.x v16, x0;\n" + "vmv.v.x v18, x0;\n" + "vmv.v.x v20, x0;\n" + "vmv.v.x v22, x0;\n" + "vmv.v.x v24, x0;\n" + "vmv.v.x v26, x0;\n" + "vmv.v.x v28, x0;\n" + "vmv.v.x v30, x0;\n"); + } + + const float* k0 = cur_kernel; + const float* k1 = k0 + 8; + const float* k2 = k1 + 8; + const float* k3 = k2 + 8; + + const float* col0 = cur_col; + const float* col1 = col0 + 8; + 
const float* col2 = col1 + 8; + const float* col3 = col2 + 8; + + int k = 0; + for (; k < (kernel_size & -4); k += 4) + { + asm( + "vle32.v v0, (%0);\n" + "vle32.v v2, (%4);\n" + "vle32.v v4, (%1);\n" + "vle32.v v6, (%5);\n" + + "vrgather.vi v8, v2, 0;\n" + "vrgather.vi v10, v2, 1;\n" + "vrgather.vi v12, v2, 2;\n" + "vrgather.vi v14, v2, 3;\n" + + "vfmacc.vv v16, v0, v8;\n" + "vfmacc.vv v18, v0, v10;\n" + "vfmacc.vv v20, v0, v12;\n" + "vfmacc.vv v22, v0, v14;\n" + + "vrgather.vi v8, v2, 4;\n" + "vrgather.vi v10, v2, 5;\n" + "vrgather.vi v12, v2, 6;\n" + "vrgather.vi v14, v2, 7;\n" + + "vfmacc.vv v24, v0, v8;\n" + "vfmacc.vv v26, v0, v10;\n" + "vfmacc.vv v28, v0, v12;\n" + "vfmacc.vv v30, v0, v14;\n" + + "vrgather.vi v8, v6, 0;\n" + "vrgather.vi v10, v6, 1;\n" + "vrgather.vi v12, v6, 2;\n" + "vrgather.vi v14, v6, 3;\n" + + "vfmacc.vv v16, v4, v8;\n" + "vfmacc.vv v18, v4, v10;\n" + "vfmacc.vv v20, v4, v12;\n" + "vfmacc.vv v22, v4, v14;\n" + + "vrgather.vi v8, v6, 4;\n" + "vrgather.vi v10, v6, 5;\n" + "vrgather.vi v12, v6, 6;\n" + "vrgather.vi v14, v6, 7;\n" + + "vfmacc.vv v24, v4, v8;\n" + "vfmacc.vv v26, v4, v10;\n" + "vfmacc.vv v28, v4, v12;\n" + "vfmacc.vv v30, v4, v14;\n" + + "vle32.v v0, (%2); \n" + "vle32.v v2, (%6); \n" + "vle32.v v4, (%3); \n" + "vle32.v v6, (%7); \n" + + "vrgather.vi v8, v2, 0;\n" + "vrgather.vi v10, v2, 1;\n" + "vrgather.vi v12, v2, 2;\n" + "vrgather.vi v14, v2, 3;\n" + + "vfmacc.vv v16, v0, v8;\n" + "vfmacc.vv v18, v0, v10;\n" + "vfmacc.vv v20, v0, v12;\n" + "vfmacc.vv v22, v0, v14;\n" + + "vrgather.vi v8, v2, 4;\n" + "vrgather.vi v10, v2, 5;\n" + "vrgather.vi v12, v2, 6;\n" + "vrgather.vi v14, v2, 7;\n" + + "vfmacc.vv v24, v0, v8;\n" + "vfmacc.vv v26, v0, v10;\n" + "vfmacc.vv v28, v0, v12;\n" + "vfmacc.vv v30, v0, v14;\n" + + "vrgather.vi v8, v6, 0;\n" + "vrgather.vi v10, v6, 1;\n" + "vrgather.vi v12, v6, 2;\n" + "vrgather.vi v14, v6, 3;\n" + + "vfmacc.vv v16, v4, v8;\n" + "vfmacc.vv v18, v4, v10;\n" + "vfmacc.vv v20, v4, v12;\n" + 
"vfmacc.vv v22, v4, v14;\n" + + "vrgather.vi v8, v6, 4;\n" + "vrgather.vi v10, v6, 5;\n" + "vrgather.vi v12, v6, 6;\n" + "vrgather.vi v14, v6, 7;\n" + + "vfmacc.vv v24, v4, v8;\n" + "vfmacc.vv v26, v4, v10;\n" + "vfmacc.vv v28, v4, v12;\n" + "vfmacc.vv v30, v4, v14;\n" + : + : "r"(col0), "r"(col1), "r"(col2), "r"(col3), "r"(k0), "r"(k1), "r"(k2), "r"(k3)); + + col0 += 32; + col1 += 32; + col2 += 32; + col3 += 32; + + k0 += 32; + k1 += 32; + k2 += 32; + k3 += 32; + } + + for (; k < kernel_size; ++k) + { + asm("vle32.v v0, (%0);\n" + "vle32.v v2, (%1);\n" + + "vrgather.vi v8, v2, 0;\n" + "vrgather.vi v10, v2, 1;\n" + "vrgather.vi v12, v2, 2;\n" + "vrgather.vi v14, v2, 3;\n" + + "vfmacc.vv v16, v0, v8;\n" + "vfmacc.vv v18, v0, v10;\n" + "vfmacc.vv v20, v0, v12;\n" + "vfmacc.vv v22, v0, v14;\n" + + "vrgather.vi v8, v2, 4;\n" + "vrgather.vi v10, v2, 5;\n" + "vrgather.vi v12, v2, 6;\n" + "vrgather.vi v14, v2, 7;\n" + + "vfmacc.vv v24, v0, v8;\n" + "vfmacc.vv v26, v0, v10;\n" + "vfmacc.vv v28, v0, v12;\n" + "vfmacc.vv v30, v0, v14;\n" + : + : "r"(col0), "r"(k0)); + col0 += 8; + k0 += 8; + } + + if (act >= 0) + { + asm( + "vmv.v.x v0, x0;\n" + "vfmax.vv v16, v16, v0;\n" + "vfmax.vv v18, v18, v0;\n" + "vfmax.vv v20, v20, v0;\n" + "vfmax.vv v22, v22, v0;\n" + "vfmax.vv v24, v24, v0;\n" + "vfmax.vv v26, v26, v0;\n" + "vfmax.vv v28, v28, v0;\n" + "vfmax.vv v30, v30, v0;\n"); + + if (act > 0) + { + asm( + "vmv.v.x v2, %0;\n" + "vfmin.vv v16, v16, v2;\n" + "vfmin.vv v18, v18, v2;\n" + "vfmin.vv v20, v20, v2;\n" + "vfmin.vv v22, v22, v2;\n" + "vfmin.vv v24, v24, v2;\n" + "vfmin.vv v26, v26, v2;\n" + "vfmin.vv v28, v28, v2;\n" + "vfmin.vv v30, v30, v2;\n" + : + : "r"(act)); + } + } + + float* r0 = cur_output; + float* r1 = r0 + output_xy; + float* r2 = r1 + output_xy; + float* r3 = r2 + output_xy; + float* r4 = r3 + output_xy; + float* r5 = r4 + output_xy; + float* r6 = r5 + output_xy; + float* r7 = r6 + output_xy; + + switch (n) + { + case 8: + asm( + "vse32.v v16, (%0);\n" + 
"vse32.v v18, (%1);\n" + "vse32.v v20, (%2);\n" + "vse32.v v22, (%3);\n" + "vse32.v v24, (%4);\n" + "vse32.v v26, (%5);\n" + "vse32.v v28, (%6);\n" + "vse32.v v30, (%7);\n" + : + : "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r4), "r"(r5), "r"(r6), "r"(r7)); + break; + case 7: + asm( + "vse32.v v16, (%0);\n" + "vse32.v v18, (%1);\n" + "vse32.v v20, (%2);\n" + "vse32.v v22, (%3);\n" + "vse32.v v24, (%4);\n" + "vse32.v v26, (%5);\n" + "vse32.v v28, (%6);\n" + : + : "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r4), "r"(r5), "r"(r6)); + break; + + case 6: + asm( + "vse32.v v16, (%0);\n" + "vse32.v v18, (%1);\n" + "vse32.v v20, (%2);\n" + "vse32.v v22, (%3);\n" + "vse32.v v24, (%4);\n" + "vse32.v v26, (%5);\n" + : + : "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r4), "r"(r5)); + break; + + case 5: + asm( + "vse32.v v16, (%0);\n" + "vse32.v v18, (%1);\n" + "vse32.v v20, (%2);\n" + "vse32.v v22, (%3);\n" + "vse32.v v24, (%4);\n" + : + : "r"(r0), "r"(r1), "r"(r2), "r"(r3), "r"(r4)); + break; + + case 4: + asm( + "vse32.v v16, (%0);\n" + "vse32.v v18, (%1);\n" + "vse32.v v20, (%2);\n" + "vse32.v v22, (%3);\n" + : + : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); + break; + + case 3: + asm( + "vse32.v v16, (%0);\n" + "vse32.v v18, (%1);\n" + "vse32.v v20, (%2);\n" + : + : "r"(r0), "r"(r1), "r"(r2)); + break; + + case 2: + asm( + "vse32.v v16, (%0);\n" + "vse32.v v18, (%1);\n" + : + : "r"(r0), "r"(r1)); + break; + + case 1: + asm( + "vse32.v v16, (%0);\n" + : + : "r"(r0)); + break; + default: + break; + } +} diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/vsetvl_rvv.c b/source/device/cpu/op/conv/risc-v/lp64dv/vsetvl_rvv.c new file mode 100644 index 000000000..febf67f3e --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/vsetvl_rvv.c @@ -0,0 +1,38 @@ +#include "vsetvl_rvv.h" + +void vsetvl_e32_m1(void) +{ +#ifdef __FIX_RVV_C906 + __asm__("li t0, 8;\n" + "li t1, 4;\n" + "vsetvl t0, t1, t0;\n" + : + : + : "t0", "t1"); +#else + __asm__("li t0, 4; \n" + "vsetvli t1, t0, e32, m1;\n" + : + : + 
: "t0", "t1"); +#endif +} + +void vsetvl_e32_m2(void) +{ +#ifdef __FIX_RVV_C906 + __asm__("li t0, 9;\n" + "li t1, 8;\n" + "vsetvl t0, t1, t0;\n" + : + : + : "t0", "t1"); +#else + __asm__( + "li t1, 8;\n" + "vsetvli t0, t1, e32, m2;\n" + : + : + : "t0", "t1"); +#endif +} diff --git a/source/device/cpu/op/conv/risc-v/lp64dv/vsetvl_rvv.h b/source/device/cpu/op/conv/risc-v/lp64dv/vsetvl_rvv.h new file mode 100644 index 000000000..1245479ff --- /dev/null +++ b/source/device/cpu/op/conv/risc-v/lp64dv/vsetvl_rvv.h @@ -0,0 +1,7 @@ +#ifndef __VSETVL_RVV_H__ +#define __VSETVL_RVV_H__ + +extern void vsetvl_e32_m1(void); +extern void vsetvl_e32_m2(void); + +#endif diff --git a/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c b/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c index b94bcb363..6ab1b3f63 100644 --- a/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c +++ b/source/device/cpu/op/conv/x86/conv_dw_hcl_x86.c @@ -542,13 +542,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_conv_dw_hcl_x86_op() { diff --git a/source/device/cpu/op/conv/x86/conv_hcl_x86.c b/source/device/cpu/op/conv/x86/conv_hcl_x86.c index b1a3cf689..e4400df84 100644 --- a/source/device/cpu/op/conv/x86/conv_hcl_x86.c +++ b/source/device/cpu/op/conv/x86/conv_hcl_x86.c @@ -370,13 +370,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_PREFER; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = 
score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_conv_hcl_x86_op() { diff --git a/source/device/cpu/op/crop/crop_ref.c b/source/device/cpu/op/crop/crop_ref.c index f59650a39..a123ed839 100644 --- a/source/device/cpu/op/crop/crop_ref.c +++ b/source/device/cpu/op/crop/crop_ref.c @@ -284,13 +284,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_crop_ref_op() { diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c b/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c index 51dae78fe..3137ed19b 100644 --- a/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c +++ b/source/device/cpu/op/deconv/cortex_a/deconv_dw_hcl_arm.c @@ -109,13 +109,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_deconv_dw_hcl_arm_op() { diff --git a/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c b/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c index a81fa1e8c..df41df448 100644 --- 
a/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c +++ b/source/device/cpu/op/deconv/cortex_a/deconv_hcl_arm.c @@ -151,13 +151,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_PREFER; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_deconv_hcl_arm_op() { diff --git a/source/device/cpu/op/deconv/deconv_ref.c b/source/device/cpu/op/deconv/deconv_ref.c index 7bdfa4b76..59ca6c6d1 100644 --- a/source/device/cpu/op/deconv/deconv_ref.c +++ b/source/device/cpu/op/deconv/deconv_ref.c @@ -328,13 +328,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_deconv_ref_op() { diff --git a/source/device/cpu/op/depthtospace/depthtospace_ref.c b/source/device/cpu/op/depthtospace/depthtospace_ref.c index 94d0919ff..1eef8a71c 100644 --- a/source/device/cpu/op/depthtospace/depthtospace_ref.c +++ b/source/device/cpu/op/depthtospace/depthtospace_ref.c @@ -218,13 +218,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - 
.release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_depthtospace_ref_op() { diff --git a/source/device/cpu/op/detection_output/detection_output_ref.c b/source/device/cpu/op/detection_output/detection_output_ref.c index ed9409118..593d69b80 100644 --- a/source/device/cpu/op/detection_output/detection_output_ref.c +++ b/source/device/cpu/op/detection_output/detection_output_ref.c @@ -400,13 +400,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops detection_output_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops detection_output_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_detection_output_ref_op() { diff --git a/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c b/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c index 25b14171a..62c72f3b5 100644 --- a/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c +++ b/source/device/cpu/op/detection_postprocess/detection_postprocess_ref.c @@ -515,13 +515,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc { return OPS_SCORE_CANDO; } -static struct node_ops detection_postprocess_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops detection_postprocess_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + 
.release_node = release_node, + .score = score, +}; int register_detection_postprocess_ref_op() { diff --git a/source/device/cpu/op/dropout/dropout_ref.c b/source/device/cpu/op/dropout/dropout_ref.c index 144663971..99e8994c9 100644 --- a/source/device/cpu/op/dropout/dropout_ref.c +++ b/source/device/cpu/op/dropout/dropout_ref.c @@ -73,13 +73,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_dropout_ref_op() { diff --git a/source/device/cpu/op/eltwise/eltwise_ref.c b/source/device/cpu/op/eltwise/eltwise_ref.c index d42925360..29459b201 100644 --- a/source/device/cpu/op/eltwise/eltwise_ref.c +++ b/source/device/cpu/op/eltwise/eltwise_ref.c @@ -995,13 +995,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_eltwise_ref_op() { diff --git a/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c b/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c index 1f7a7aad5..b4e92c901 100644 --- a/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c +++ b/source/device/cpu/op/elu/cortex-a/elu_hcl_arm.c @@ -81,13 +81,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc 
return 0; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_elu_hcl_arm_op() { diff --git a/source/device/cpu/op/elu/elu_ref.c b/source/device/cpu/op/elu/elu_ref.c index 1d41d940d..51f5a63ea 100644 --- a/source/device/cpu/op/elu/elu_ref.c +++ b/source/device/cpu/op/elu/elu_ref.c @@ -159,13 +159,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_elu_ref_op() { diff --git a/source/device/cpu/op/embedding/embedding_ref.c b/source/device/cpu/op/embedding/embedding_ref.c index 5fe920a6a..b9e7a9da4 100644 --- a/source/device/cpu/op/embedding/embedding_ref.c +++ b/source/device/cpu/op/embedding/embedding_ref.c @@ -100,13 +100,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_embedding_ref_op() { diff --git a/source/device/cpu/op/expand/expand_ref.c 
b/source/device/cpu/op/expand/expand_ref.c index fc0bdcfe4..657316041 100644 --- a/source/device/cpu/op/expand/expand_ref.c +++ b/source/device/cpu/op/expand/expand_ref.c @@ -175,13 +175,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops expand_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops expand_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_expand_ref_op() { diff --git a/source/device/cpu/op/expanddims/expanddims_ref.c b/source/device/cpu/op/expanddims/expanddims_ref.c index 7cd37a4dd..59b387769 100644 --- a/source/device/cpu/op/expanddims/expanddims_ref.c +++ b/source/device/cpu/op/expanddims/expanddims_ref.c @@ -75,13 +75,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_expanddims_ref_op() { diff --git a/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c b/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c index d9322b864..eb37fb714 100644 --- a/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c +++ b/source/device/cpu/op/fc/cortex-a/fc_hcl_arm.c @@ -290,13 +290,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = postrun, 
- .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_fc_hcl_arm_op() { diff --git a/source/device/cpu/op/fc/cortex-m/fc_cmsis.c b/source/device/cpu/op/fc/cortex-m/fc_cmsis.c index e53be5c71..e37e3d2f2 100644 --- a/source/device/cpu/op/fc/cortex-m/fc_cmsis.c +++ b/source/device/cpu/op/fc/cortex-m/fc_cmsis.c @@ -133,13 +133,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops cmsis_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops cmsis_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_fc_cmsis_op() { diff --git a/source/device/cpu/op/fc/fc_ref.c b/source/device/cpu/op/fc/fc_ref.c index b0da933ea..ffb590835 100644 --- a/source/device/cpu/op/fc/fc_ref.c +++ b/source/device/cpu/op/fc/fc_ref.c @@ -475,13 +475,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_fc_ref_op() { diff --git a/source/device/cpu/op/fc/x86/fc_hcl_x86.c b/source/device/cpu/op/fc/x86/fc_hcl_x86.c index 86acbb992..d2ae6a73c 100644 --- a/source/device/cpu/op/fc/x86/fc_hcl_x86.c 
+++ b/source/device/cpu/op/fc/x86/fc_hcl_x86.c @@ -290,13 +290,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_fc_hcl_x86_op() { diff --git a/source/device/cpu/op/flatten/flatten_ref.c b/source/device/cpu/op/flatten/flatten_ref.c index 9b4476d28..337474184 100644 --- a/source/device/cpu/op/flatten/flatten_ref.c +++ b/source/device/cpu/op/flatten/flatten_ref.c @@ -93,13 +93,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops flatten_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops flatten_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_flatten_ref_op() { diff --git a/source/device/cpu/op/gather/gather_ref.c b/source/device/cpu/op/gather/gather_ref.c index 37ce59ddb..99b6d5169 100644 --- a/source/device/cpu/op/gather/gather_ref.c +++ b/source/device/cpu/op/gather/gather_ref.c @@ -282,13 +282,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops gather_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops gather_node_ops = { + .prerun = prerun, + .run = run, + .reshape 
= NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_gather_ref_op() { diff --git a/source/device/cpu/op/gelu/gelu_ref.c b/source/device/cpu/op/gelu/gelu_ref.c index 07cdec2df..da73913db 100644 --- a/source/device/cpu/op/gelu/gelu_ref.c +++ b/source/device/cpu/op/gelu/gelu_ref.c @@ -130,13 +130,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_gelu_ref_op() { diff --git a/source/device/cpu/op/gru/gru_ref.c b/source/device/cpu/op/gru/gru_ref.c index 056882f3c..76e3c04be 100644 --- a/source/device/cpu/op/gru/gru_ref.c +++ b/source/device/cpu/op/gru/gru_ref.c @@ -434,13 +434,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops gru_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops gru_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_gru_ref_op() { diff --git a/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c b/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c index adcb94298..9a84aba22 100644 --- a/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c +++ b/source/device/cpu/op/hardsigmoid/hardsigmoid_ref.c @@ -140,13 +140,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 
OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_hardsigmoid_ref_op() { diff --git a/source/device/cpu/op/hardswish/hardswish_ref.c b/source/device/cpu/op/hardswish/hardswish_ref.c index 3a1910c39..8621aea52 100644 --- a/source/device/cpu/op/hardswish/hardswish_ref.c +++ b/source/device/cpu/op/hardswish/hardswish_ref.c @@ -72,13 +72,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_hardswish_ref_op() { return register_builtin_node_ops(OP_HARDSWISH, &hcl_node_ops); diff --git a/source/device/cpu/op/input/input_ref.c b/source/device/cpu/op/input/input_ref.c index 4118be0da..fcf9273f5 100644 --- a/source/device/cpu/op/input/input_ref.c +++ b/source/device/cpu/op/input/input_ref.c @@ -70,13 +70,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, 
+}; int register_input_ref_op() { diff --git a/source/device/cpu/op/instancenorm/instancenorm_ref.c b/source/device/cpu/op/instancenorm/instancenorm_ref.c index 94d943afb..887acdac0 100644 --- a/source/device/cpu/op/instancenorm/instancenorm_ref.c +++ b/source/device/cpu/op/instancenorm/instancenorm_ref.c @@ -229,13 +229,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_instancenorm_ref_op() { diff --git a/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c b/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c index c7fc11e26..8c88fde8d 100644 --- a/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c +++ b/source/device/cpu/op/interp/cortex-a/interp_hcl_arm.c @@ -81,13 +81,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_interp_hcl_arm_op() { diff --git a/source/device/cpu/op/interp/interp_ref.c b/source/device/cpu/op/interp/interp_ref.c index fb3736057..ec0f46358 100644 --- a/source/device/cpu/op/interp/interp_ref.c +++ b/source/device/cpu/op/interp/interp_ref.c @@ -509,13 +509,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 
OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_interp_ref_op() { diff --git a/source/device/cpu/op/l2normalization/l2normalization_ref.c b/source/device/cpu/op/l2normalization/l2normalization_ref.c index b420e92dd..80790ec0b 100644 --- a/source/device/cpu/op/l2normalization/l2normalization_ref.c +++ b/source/device/cpu/op/l2normalization/l2normalization_ref.c @@ -141,13 +141,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_l2normalization_ref_op() { diff --git a/source/device/cpu/op/l2pool/l2pool_ref.c b/source/device/cpu/op/l2pool/l2pool_ref.c index 5cf027d70..d748f6786 100644 --- a/source/device/cpu/op/l2pool/l2pool_ref.c +++ b/source/device/cpu/op/l2pool/l2pool_ref.c @@ -202,13 +202,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = 
score, +}; int register_l2pool_ref_op() { diff --git a/source/device/cpu/op/layernorm/layernorm_ref.c b/source/device/cpu/op/layernorm/layernorm_ref.c index 1a90e705e..15a20d5e8 100644 --- a/source/device/cpu/op/layernorm/layernorm_ref.c +++ b/source/device/cpu/op/layernorm/layernorm_ref.c @@ -202,13 +202,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_layernorm_ref_op() { diff --git a/source/device/cpu/op/logical/logical_ref.c b/source/device/cpu/op/logical/logical_ref.c index aef2ad3f7..fe2778f05 100644 --- a/source/device/cpu/op/logical/logical_ref.c +++ b/source/device/cpu/op/logical/logical_ref.c @@ -214,13 +214,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_logical_ref_op() { diff --git a/source/device/cpu/op/logistic/logistic_ref.c b/source/device/cpu/op/logistic/logistic_ref.c index 807ff90d9..1a6a7ae54 100644 --- a/source/device/cpu/op/logistic/logistic_ref.c +++ b/source/device/cpu/op/logistic/logistic_ref.c @@ -108,13 +108,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops 
hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_logistic_ref_op() { diff --git a/source/device/cpu/op/logsoftmax/logsoftmax_ref.c b/source/device/cpu/op/logsoftmax/logsoftmax_ref.c index 2af74c63d..31b9ebf0e 100644 --- a/source/device/cpu/op/logsoftmax/logsoftmax_ref.c +++ b/source/device/cpu/op/logsoftmax/logsoftmax_ref.c @@ -177,13 +177,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_logsoftmax_ref_op() { diff --git a/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c b/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c index fc883f9f2..bcab4fc25 100644 --- a/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c +++ b/source/device/cpu/op/lrn/cortex-a/lrn_hcl_arm.c @@ -84,13 +84,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_lrn_hcl_arm_op() { diff --git 
a/source/device/cpu/op/lrn/lrn_ref.c b/source/device/cpu/op/lrn/lrn_ref.c index ff71d6903..878dd913c 100644 --- a/source/device/cpu/op/lrn/lrn_ref.c +++ b/source/device/cpu/op/lrn/lrn_ref.c @@ -141,13 +141,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_lrn_ref_op() { diff --git a/source/device/cpu/op/lstm/lstm_ref.c b/source/device/cpu/op/lstm/lstm_ref.c index 0367e9f56..7f7831e3f 100644 --- a/source/device/cpu/op/lstm/lstm_ref.c +++ b/source/device/cpu/op/lstm/lstm_ref.c @@ -777,13 +777,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops lstm_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops lstm_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_lstm_ref_op() { diff --git a/source/device/cpu/op/matmul/matmul_ref.c b/source/device/cpu/op/matmul/matmul_ref.c index e039f4bd1..0993521f1 100644 --- a/source/device/cpu/op/matmul/matmul_ref.c +++ b/source/device/cpu/op/matmul/matmul_ref.c @@ -161,13 +161,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops matmul_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - 
.score = score}; +static struct node_ops matmul_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_matmul_ref_op() { diff --git a/source/device/cpu/op/maximum/maximum_ref.c b/source/device/cpu/op/maximum/maximum_ref.c index ecb34f774..4e887d7be 100644 --- a/source/device/cpu/op/maximum/maximum_ref.c +++ b/source/device/cpu/op/maximum/maximum_ref.c @@ -123,13 +123,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops maximum_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops maximum_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_maximum_ref_op() { diff --git a/source/device/cpu/op/mean/mean_ref.c b/source/device/cpu/op/mean/mean_ref.c index 1ccd4697b..de259b0e9 100644 --- a/source/device/cpu/op/mean/mean_ref.c +++ b/source/device/cpu/op/mean/mean_ref.c @@ -121,13 +121,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops mean_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops mean_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_mean_ref_op() { diff --git a/source/device/cpu/op/minimum/minimum_ref.c b/source/device/cpu/op/minimum/minimum_ref.c index 19319eb2f..afe803aeb 100644 --- a/source/device/cpu/op/minimum/minimum_ref.c +++ 
b/source/device/cpu/op/minimum/minimum_ref.c @@ -122,13 +122,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops minimum_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops minimum_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_minimum_ref_op() { diff --git a/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c b/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c index 8e3581c24..6197e3235 100644 --- a/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c +++ b/source/device/cpu/op/mish/cortex-a/mish_hcl_arm.c @@ -83,13 +83,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_mish_hcl_arm_op() { diff --git a/source/device/cpu/op/mish/mish_ref.c b/source/device/cpu/op/mish/mish_ref.c index 91af5a417..b11e02035 100644 --- a/source/device/cpu/op/mish/mish_ref.c +++ b/source/device/cpu/op/mish/mish_ref.c @@ -82,13 +82,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + 
.reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_mish_ref_op() { diff --git a/source/device/cpu/op/mvn/mvn_ref.c b/source/device/cpu/op/mvn/mvn_ref.c index 306082d61..5af43ed65 100644 --- a/source/device/cpu/op/mvn/mvn_ref.c +++ b/source/device/cpu/op/mvn/mvn_ref.c @@ -243,13 +243,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_mvn_ref_op() { diff --git a/source/device/cpu/op/noop/noop_ref.c b/source/device/cpu/op/noop/noop_ref.c index 67722f5bb..c39e29a73 100644 --- a/source/device/cpu/op/noop/noop_ref.c +++ b/source/device/cpu/op/noop/noop_ref.c @@ -108,13 +108,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_noop_ref_op() { diff --git a/source/device/cpu/op/normalize/normalize_ref.c b/source/device/cpu/op/normalize/normalize_ref.c index 92990f780..96ca6f709 100644 --- a/source/device/cpu/op/normalize/normalize_ref.c +++ b/source/device/cpu/op/normalize/normalize_ref.c @@ -116,13 +116,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 
OPS_SCORE_BEST; } -static struct node_ops normalize_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops normalize_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_normalize_ref_op() { diff --git a/source/device/cpu/op/pad/pad_ref.c b/source/device/cpu/op/pad/pad_ref.c index 85365bc80..76fa79603 100644 --- a/source/device/cpu/op/pad/pad_ref.c +++ b/source/device/cpu/op/pad/pad_ref.c @@ -672,13 +672,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops pad_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops pad_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_pad_ref_op() { diff --git a/source/device/cpu/op/permute/permute_ref.c b/source/device/cpu/op/permute/permute_ref.c index 6e705ab31..2c0bd6e32 100644 --- a/source/device/cpu/op/permute/permute_ref.c +++ b/source/device/cpu/op/permute/permute_ref.c @@ -420,13 +420,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops permute_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops permute_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_permute_ref_op() { diff --git 
a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c index 4b6d3fe7a..59c944b75 100644 --- a/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c +++ b/source/device/cpu/op/pooling/cortex-a/pooling_hcl_arm.c @@ -159,13 +159,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_pooling_hcl_arm_op() { diff --git a/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c b/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c index e30c84c7e..1a176eb11 100644 --- a/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c +++ b/source/device/cpu/op/pooling/cortex-m/pooling_cmsis.c @@ -66,13 +66,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops cmsis_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = NULL, - .release_node = NULL, - .score = score}; +static struct node_ops cmsis_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = NULL, + .release_node = NULL, + .score = score, +}; int register_pooling_cmsis_op() { diff --git a/source/device/cpu/op/pooling/pooling_ref.c b/source/device/cpu/op/pooling/pooling_ref.c index df8ecb6a2..e06dc946d 100644 --- a/source/device/cpu/op/pooling/pooling_ref.c +++ b/source/device/cpu/op/pooling/pooling_ref.c @@ -159,13 +159,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops 
hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_pooling_ref_op() { diff --git a/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c b/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c index 9012a5686..48c76f590 100644 --- a/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c +++ b/source/device/cpu/op/prelu/cortex_a/prelu_hcl_arm.c @@ -90,13 +90,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = NULL, - .release_node = NULL, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = NULL, + .release_node = NULL, + .score = score, +}; int register_prelu_hcl_arm_op() { diff --git a/source/device/cpu/op/prelu/prelu_ref.c b/source/device/cpu/op/prelu/prelu_ref.c index da069d8bb..6e8822c2d 100644 --- a/source/device/cpu/op/prelu/prelu_ref.c +++ b/source/device/cpu/op/prelu/prelu_ref.c @@ -443,13 +443,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_prelu_ref_op() { diff --git 
a/source/device/cpu/op/priorbox/priorbox_ref.c b/source/device/cpu/op/priorbox/priorbox_ref.c index 39df5ec09..c3aa6aaa7 100644 --- a/source/device/cpu/op/priorbox/priorbox_ref.c +++ b/source/device/cpu/op/priorbox/priorbox_ref.c @@ -217,13 +217,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops priorbox_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops priorbox_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_priorbox_ref_op() { diff --git a/source/device/cpu/op/psroipooling/psroipooling_ref.c b/source/device/cpu/op/psroipooling/psroipooling_ref.c index 9039a3f8d..27152f52a 100644 --- a/source/device/cpu/op/psroipooling/psroipooling_ref.c +++ b/source/device/cpu/op/psroipooling/psroipooling_ref.c @@ -144,13 +144,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_psroipooling_ref_op() { diff --git a/source/device/cpu/op/reciprocal/reciprocal_ref.c b/source/device/cpu/op/reciprocal/reciprocal_ref.c index c770bb657..9d7ba443d 100644 --- a/source/device/cpu/op/reciprocal/reciprocal_ref.c +++ b/source/device/cpu/op/reciprocal/reciprocal_ref.c @@ -104,7 +104,8 @@ static struct node_ops hcl_node_ops = { .postrun = NULL, .init_node = init_node, .release_node = release_node, - .score = score}; + 
.score = score, +}; int register_reciprocal_ref_op() { diff --git a/source/device/cpu/op/reducel2/reducel2_ref.c b/source/device/cpu/op/reducel2/reducel2_ref.c index e92f98caf..9fff807d4 100644 --- a/source/device/cpu/op/reducel2/reducel2_ref.c +++ b/source/device/cpu/op/reducel2/reducel2_ref.c @@ -118,13 +118,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops reducel2_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops reducel2_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_reducel2_ref_op() { diff --git a/source/device/cpu/op/reduction/reduction_ref.c b/source/device/cpu/op/reduction/reduction_ref.c index fd92f23d9..57f7c632d 100644 --- a/source/device/cpu/op/reduction/reduction_ref.c +++ b/source/device/cpu/op/reduction/reduction_ref.c @@ -120,13 +120,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_reduction_ref_op() { diff --git a/source/device/cpu/op/region/region_ref.c b/source/device/cpu/op/region/region_ref.c index 3bb0b37a1..884eaf168 100644 --- a/source/device/cpu/op/region/region_ref.c +++ b/source/device/cpu/op/region/region_ref.c @@ -168,13 +168,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static 
struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_region_ref_op() { diff --git a/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c b/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c index 0f885ba8b..8980d051d 100644 --- a/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c +++ b/source/device/cpu/op/relu/cortex-a/relu_hcl_arm.c @@ -82,13 +82,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_relu_hcl_arm_op() { diff --git a/source/device/cpu/op/relu/cortex-m/relu_cmsis.c b/source/device/cpu/op/relu/cortex-m/relu_cmsis.c index 72d506512..1bf5b0e27 100644 --- a/source/device/cpu/op/relu/cortex-m/relu_cmsis.c +++ b/source/device/cpu/op/relu/cortex-m/relu_cmsis.c @@ -93,13 +93,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops cmsis_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops cmsis_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_relu_cmsis_op() { diff 
--git a/source/device/cpu/op/relu/relu_ref.c b/source/device/cpu/op/relu/relu_ref.c index 2b0372686..3ef1dc364 100644 --- a/source/device/cpu/op/relu/relu_ref.c +++ b/source/device/cpu/op/relu/relu_ref.c @@ -92,13 +92,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_relu_ref_op() { diff --git a/source/device/cpu/op/relu1/relu1_ref.c b/source/device/cpu/op/relu1/relu1_ref.c index 337bc5812..17e59f1d4 100644 --- a/source/device/cpu/op/relu1/relu1_ref.c +++ b/source/device/cpu/op/relu1/relu1_ref.c @@ -103,13 +103,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_relu1_ref_op() { diff --git a/source/device/cpu/op/relu6/relu6_ref.c b/source/device/cpu/op/relu6/relu6_ref.c index 98bfa2006..697634057 100644 --- a/source/device/cpu/op/relu6/relu6_ref.c +++ b/source/device/cpu/op/relu6/relu6_ref.c @@ -167,13 +167,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node 
= release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_relu6_ref_op() { diff --git a/source/device/cpu/op/reorg/reorg_ref.c b/source/device/cpu/op/reorg/reorg_ref.c index 3cff628a0..7d97fea57 100644 --- a/source/device/cpu/op/reorg/reorg_ref.c +++ b/source/device/cpu/op/reorg/reorg_ref.c @@ -111,13 +111,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_reorg_ref_op() { diff --git a/source/device/cpu/op/reshape/reshape_ref.c b/source/device/cpu/op/reshape/reshape_ref.c index 09ddd5f5b..0c071eb54 100644 --- a/source/device/cpu/op/reshape/reshape_ref.c +++ b/source/device/cpu/op/reshape/reshape_ref.c @@ -331,13 +331,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops reshape_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops reshape_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_reshape_ref_op() { diff --git a/source/device/cpu/op/resize/resize_ref.c b/source/device/cpu/op/resize/resize_ref.c index 3dda3b135..fc3425768 100644 --- a/source/device/cpu/op/resize/resize_ref.c +++ 
b/source/device/cpu/op/resize/resize_ref.c @@ -490,13 +490,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_resize_ref_op() { diff --git a/source/device/cpu/op/reverse/reverse_ref.c b/source/device/cpu/op/reverse/reverse_ref.c index 7ed7d36f5..7e5bcdff2 100644 --- a/source/device/cpu/op/reverse/reverse_ref.c +++ b/source/device/cpu/op/reverse/reverse_ref.c @@ -271,13 +271,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_reverse_ref_op() { diff --git a/source/device/cpu/op/rnn/rnn_ref.c b/source/device/cpu/op/rnn/rnn_ref.c index ee60e4247..fc2a3ebe6 100644 --- a/source/device/cpu/op/rnn/rnn_ref.c +++ b/source/device/cpu/op/rnn/rnn_ref.c @@ -268,13 +268,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node 
= init_node, + .release_node = release_node, + .score = score, +}; int register_rnn_ref_op() { diff --git a/source/device/cpu/op/roialign/roialign_ref.c b/source/device/cpu/op/roialign/roialign_ref.c index 61de55300..04531a160 100644 --- a/source/device/cpu/op/roialign/roialign_ref.c +++ b/source/device/cpu/op/roialign/roialign_ref.c @@ -189,13 +189,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_roialign_ref_op() { diff --git a/source/device/cpu/op/roipooling/roipooling_ref.c b/source/device/cpu/op/roipooling/roipooling_ref.c index cf554bbec..9a5b37c8e 100644 --- a/source/device/cpu/op/roipooling/roipooling_ref.c +++ b/source/device/cpu/op/roipooling/roipooling_ref.c @@ -174,13 +174,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_roipooling_ref_op() { diff --git a/source/device/cpu/op/round/round_ref.c b/source/device/cpu/op/round/round_ref.c index ca76ee7d6..75869afd5 100644 --- a/source/device/cpu/op/round/round_ref.c +++ b/source/device/cpu/op/round/round_ref.c @@ -130,13 +130,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc 
return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_round_ref_op() { diff --git a/source/device/cpu/op/rpn/rpn_ref.c b/source/device/cpu/op/rpn/rpn_ref.c index 6d9ba42b3..8923575bb 100644 --- a/source/device/cpu/op/rpn/rpn_ref.c +++ b/source/device/cpu/op/rpn/rpn_ref.c @@ -357,13 +357,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops rpn_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops rpn_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_rpn_ref_op() { diff --git a/source/device/cpu/op/scale/scale_ref.c b/source/device/cpu/op/scale/scale_ref.c index 426fcd2c8..13a717749 100644 --- a/source/device/cpu/op/scale/scale_ref.c +++ b/source/device/cpu/op/scale/scale_ref.c @@ -121,13 +121,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_scale_ref_op() { diff --git a/source/device/cpu/op/scatter/scatter_ref.c 
b/source/device/cpu/op/scatter/scatter_ref.c index 5aae5d8d0..299845260 100644 --- a/source/device/cpu/op/scatter/scatter_ref.c +++ b/source/device/cpu/op/scatter/scatter_ref.c @@ -406,13 +406,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_scatter_ref_op() { diff --git a/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c b/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c index 026625d71..bc1249023 100644 --- a/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c +++ b/source/device/cpu/op/selu/cortex-a/selu_hcl_arm.c @@ -81,13 +81,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_selu_hcl_arm_op() { diff --git a/source/device/cpu/op/selu/selu_ref.c b/source/device/cpu/op/selu/selu_ref.c index 557f8105d..afbecfb63 100644 --- a/source/device/cpu/op/selu/selu_ref.c +++ b/source/device/cpu/op/selu/selu_ref.c @@ -177,13 +177,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - 
.release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_selu_ref_op() { diff --git a/source/device/cpu/op/shape/shape_ref.c b/source/device/cpu/op/shape/shape_ref.c index ec27a9c41..d45d23b0a 100644 --- a/source/device/cpu/op/shape/shape_ref.c +++ b/source/device/cpu/op/shape/shape_ref.c @@ -80,13 +80,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_shape_ref_op() { diff --git a/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c b/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c index 545bf2fc0..794180f79 100644 --- a/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c +++ b/source/device/cpu/op/shuffle_channel/shuffle_channel_ref.c @@ -175,13 +175,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_shuffle_channel_ref_op() { diff --git a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c 
index 1b7b3fbaf..41870ffc5 100644 --- a/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c +++ b/source/device/cpu/op/sigmoid/cortex-a/sigmoid_hcl_arm.c @@ -71,13 +71,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_sigmoid_hcl_arm_op() { diff --git a/source/device/cpu/op/sigmoid/sigmoid_ref.c b/source/device/cpu/op/sigmoid/sigmoid_ref.c index 8e4ca0899..a72864ef7 100644 --- a/source/device/cpu/op/sigmoid/sigmoid_ref.c +++ b/source/device/cpu/op/sigmoid/sigmoid_ref.c @@ -226,13 +226,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops sigmoid_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape_node, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops sigmoid_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape_node, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_sigmoid_ref_op() { diff --git a/source/device/cpu/op/slice/slice_ref.c b/source/device/cpu/op/slice/slice_ref.c index 037c413b7..3c5714eaf 100644 --- a/source/device/cpu/op/slice/slice_ref.c +++ b/source/device/cpu/op/slice/slice_ref.c @@ -520,13 +520,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops slice_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - 
.score = score}; +static struct node_ops slice_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_slice_ref_op() { diff --git a/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c b/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c index 9ffe8e5c2..84cbe490b 100644 --- a/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c +++ b/source/device/cpu/op/softmax/cortex-a/softmax_hcl_arm.c @@ -257,13 +257,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_softmax_hcl_arm_op() { diff --git a/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c b/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c index 93678c225..0901b1c7a 100644 --- a/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c +++ b/source/device/cpu/op/softmax/cortex-m/softmax_cmsis.c @@ -82,13 +82,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops cmsis_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = NULL, - .release_node = NULL, - .score = score}; +static struct node_ops cmsis_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = NULL, + .release_node = NULL, + .score = score, +}; int register_softmax_cmsis_op() { diff --git a/source/device/cpu/op/softmax/softmax_ref.c b/source/device/cpu/op/softmax/softmax_ref.c index cb1a3b49d..e4a321979 
100644 --- a/source/device/cpu/op/softmax/softmax_ref.c +++ b/source/device/cpu/op/softmax/softmax_ref.c @@ -110,13 +110,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_softmax_ref_op() { diff --git a/source/device/cpu/op/softplus/softplus_ref.c b/source/device/cpu/op/softplus/softplus_ref.c index 6931ab047..b8c178b5a 100644 --- a/source/device/cpu/op/softplus/softplus_ref.c +++ b/source/device/cpu/op/softplus/softplus_ref.c @@ -118,7 +118,8 @@ static struct node_ops hcl_node_ops = { .postrun = NULL, .init_node = init_node, .release_node = release_node, - .score = score}; + .score = score, +}; int register_softplus_ref_op() { diff --git a/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c b/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c index 6a0aa26a4..2358f2cbf 100644 --- a/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c +++ b/source/device/cpu/op/spacetobatchnd/spacetobatchnd_ref.c @@ -249,13 +249,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_spacetobatchnd_ref_op() { diff --git a/source/device/cpu/op/spacetodepth/spacetodepth_ref.c 
b/source/device/cpu/op/spacetodepth/spacetodepth_ref.c index aa8217929..ce8e023ea 100644 --- a/source/device/cpu/op/spacetodepth/spacetodepth_ref.c +++ b/source/device/cpu/op/spacetodepth/spacetodepth_ref.c @@ -102,13 +102,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_spacetodepth_ref_op() { diff --git a/source/device/cpu/op/sparsetodense/sparsetodense_ref.c b/source/device/cpu/op/sparsetodense/sparsetodense_ref.c index 6179ad14c..75db4c907 100644 --- a/source/device/cpu/op/sparsetodense/sparsetodense_ref.c +++ b/source/device/cpu/op/sparsetodense/sparsetodense_ref.c @@ -180,13 +180,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_sparsetodense_ref_op() { diff --git a/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c b/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c index 2a6bc1435..ae0942b65 100644 --- a/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c +++ b/source/device/cpu/op/spatialtransformer/spatialtransformer_ref.c @@ -332,13 +332,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, 
struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_spatialtransformer_ref_op() { diff --git a/source/device/cpu/op/split/split_ref.c b/source/device/cpu/op/split/split_ref.c index bb0c23595..0d11730bf 100644 --- a/source/device/cpu/op/split/split_ref.c +++ b/source/device/cpu/op/split/split_ref.c @@ -197,13 +197,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_split_ref_op() { diff --git a/source/device/cpu/op/squareddifference/squareddifference_ref.c b/source/device/cpu/op/squareddifference/squareddifference_ref.c index 66a600291..2014293f9 100644 --- a/source/device/cpu/op/squareddifference/squareddifference_ref.c +++ b/source/device/cpu/op/squareddifference/squareddifference_ref.c @@ -211,13 +211,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = 
release_node, + .score = score, +}; int register_squareddifference_ref_op() { diff --git a/source/device/cpu/op/squeeze/squeeze_ref.c b/source/device/cpu/op/squeeze/squeeze_ref.c index 1928d299e..99a8495b0 100644 --- a/source/device/cpu/op/squeeze/squeeze_ref.c +++ b/source/device/cpu/op/squeeze/squeeze_ref.c @@ -93,13 +93,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops squeeze_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops squeeze_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_squeeze_ref_op() { diff --git a/source/device/cpu/op/strided_slice/strided_slice_ref.c b/source/device/cpu/op/strided_slice/strided_slice_ref.c index bb3cb9111..9647d3d09 100644 --- a/source/device/cpu/op/strided_slice/strided_slice_ref.c +++ b/source/device/cpu/op/strided_slice/strided_slice_ref.c @@ -153,13 +153,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops strided_slice_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops strided_slice_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_strided_slice_ref_op() { diff --git a/source/device/cpu/op/swap_axis/swap_axis_ref.c b/source/device/cpu/op/swap_axis/swap_axis_ref.c index 6aeef17bb..11fddd4d4 100644 --- a/source/device/cpu/op/swap_axis/swap_axis_ref.c +++ b/source/device/cpu/op/swap_axis/swap_axis_ref.c @@ -136,13 +136,15 @@ static int score(struct 
node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops swap_axis_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops swap_axis_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_swap_axis_ref_op() { diff --git a/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c b/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c index de5975df5..825208dca 100644 --- a/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c +++ b/source/device/cpu/op/tanh/cortex-a/tanh_hcl_arm.c @@ -83,13 +83,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return 0; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_tanh_hcl_arm_op() { diff --git a/source/device/cpu/op/tanh/tanh_ref.c b/source/device/cpu/op/tanh/tanh_ref.c index 390f64332..98a048ab6 100644 --- a/source/device/cpu/op/tanh/tanh_ref.c +++ b/source/device/cpu/op/tanh/tanh_ref.c @@ -121,13 +121,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = 
score, +}; int register_tanh_ref_op() { diff --git a/source/device/cpu/op/threshold/threshold_ref.c b/source/device/cpu/op/threshold/threshold_ref.c index 4672086a5..bddbcdfc2 100644 --- a/source/device/cpu/op/threshold/threshold_ref.c +++ b/source/device/cpu/op/threshold/threshold_ref.c @@ -130,13 +130,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_threshold_ref_op() { diff --git a/source/device/cpu/op/tile/tile_ref.c b/source/device/cpu/op/tile/tile_ref.c index 0f51a5310..697136547 100644 --- a/source/device/cpu/op/tile/tile_ref.c +++ b/source/device/cpu/op/tile/tile_ref.c @@ -180,7 +180,8 @@ static struct node_ops hcl_node_ops = { .postrun = NULL, .init_node = init_node, .release_node = release_node, - .score = score}; + .score = score, +}; int register_tile_ref_op() { diff --git a/source/device/cpu/op/topkv2/topkv2_ref.c b/source/device/cpu/op/topkv2/topkv2_ref.c index b84cc2433..8f8722811 100644 --- a/source/device/cpu/op/topkv2/topkv2_ref.c +++ b/source/device/cpu/op/topkv2/topkv2_ref.c @@ -231,13 +231,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_topkv2_ref_op() 
{ diff --git a/source/device/cpu/op/transpose/transpose_ref.c b/source/device/cpu/op/transpose/transpose_ref.c index 31187f4f3..b216e2b46 100644 --- a/source/device/cpu/op/transpose/transpose_ref.c +++ b/source/device/cpu/op/transpose/transpose_ref.c @@ -477,13 +477,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = postrun, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = postrun, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_transpose_ref_op() { diff --git a/source/device/cpu/op/unary/unary_ref.c b/source/device/cpu/op/unary/unary_ref.c index 0f9610a2e..e3c430242 100644 --- a/source/device/cpu/op/unary/unary_ref.c +++ b/source/device/cpu/op/unary/unary_ref.c @@ -71,13 +71,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_unary_ref_op() { diff --git a/source/device/cpu/op/unsqueeze/unsqueeze_ref.c b/source/device/cpu/op/unsqueeze/unsqueeze_ref.c index 70847a7d9..066d2d1dc 100644 --- a/source/device/cpu/op/unsqueeze/unsqueeze_ref.c +++ b/source/device/cpu/op/unsqueeze/unsqueeze_ref.c @@ -93,13 +93,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops unsqueeze_node_ops = {.prerun = NULL, - .run = run, 
- .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops unsqueeze_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_unsqueeze_ref_op() { diff --git a/source/device/cpu/op/upsample/upsample_ref.c b/source/device/cpu/op/upsample/upsample_ref.c index 729b7f263..f3c0de300 100644 --- a/source/device/cpu/op/upsample/upsample_ref.c +++ b/source/device/cpu/op/upsample/upsample_ref.c @@ -172,13 +172,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_BEST; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_upsample_ref_op() { diff --git a/source/device/cpu/op/where/where_ref.c b/source/device/cpu/op/where/where_ref.c index 52a2fd778..f2fd9b931 100644 --- a/source/device/cpu/op/where/where_ref.c +++ b/source/device/cpu/op/where/where_ref.c @@ -99,13 +99,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = NULL, - .run = run, - .reshape = reshape, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = NULL, + .run = run, + .reshape = reshape, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_where_ref_op() { diff --git a/source/device/cpu/op/zeroslike/zeroslike_ref.c b/source/device/cpu/op/zeroslike/zeroslike_ref.c index 
47b83d417..f770ad6e5 100644 --- a/source/device/cpu/op/zeroslike/zeroslike_ref.c +++ b/source/device/cpu/op/zeroslike/zeroslike_ref.c @@ -167,13 +167,15 @@ static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struc return OPS_SCORE_CANDO; } -static struct node_ops hcl_node_ops = {.prerun = prerun, - .run = run, - .reshape = NULL, - .postrun = NULL, - .init_node = init_node, - .release_node = release_node, - .score = score}; +static struct node_ops hcl_node_ops = { + .prerun = prerun, + .run = run, + .reshape = NULL, + .postrun = NULL, + .init_node = init_node, + .release_node = release_node, + .score = score, +}; int register_zeroslike_ref_op() { diff --git a/source/device/opencl/include/CL/cl_ext.h b/source/device/opencl/include/CL/cl_ext.h index ed0db6dfa..c58990ec4 100644 --- a/source/device/opencl/include/CL/cl_ext.h +++ b/source/device/opencl/include/CL/cl_ext.h @@ -72,7 +72,7 @@ extern "C" { */ #define cl_APPLE_SetMemObjectDestructor 1 cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE(cl_mem /* memobj */, - void (*/*pfn_notify*/)(cl_mem /* memobj */, void* /*user_data*/), + void (* /*pfn_notify*/)(cl_mem /* memobj */, void* /*user_data*/), void* /*user_data */) CL_EXT_SUFFIX__VERSION_1_0; /* Context Logging Functions diff --git a/source/device/vulkan/layer/concat_vulkan.cpp b/source/device/vulkan/layer/concat_vulkan.cpp index 99357ba52..8e69bb2bc 100644 --- a/source/device/vulkan/layer/concat_vulkan.cpp +++ b/source/device/vulkan/layer/concat_vulkan.cpp @@ -39,33 +39,14 @@ #include "concat_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Concat_vulkan::Concat_vulkan() +Concat_vulkan::Concat_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = false; - - pipeline_concat[0] = 0; - pipeline_concat[1] = 0; - pipeline_concat_pack4[0] = 0; - pipeline_concat_pack4[1] = 0; - pipeline_concat_pack4to1[0] = 0; - 
pipeline_concat_pack4to1[1] = 0; - pipeline_concat_pack8[0] = 0; - pipeline_concat_pack8[1] = 0; - pipeline_concat_pack8to4[0] = 0; - pipeline_concat_pack8to4[1] = 0; - pipeline_concat_pack8to1[0] = 0; - pipeline_concat_pack8to1[1] = 0; -} - -Concat_vulkan::Concat_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = false; - + one_blob_only = false; pipeline_concat[0] = 0; pipeline_concat[1] = 0; pipeline_concat_pack4[0] = 0; @@ -91,7 +72,7 @@ Concat_vulkan::Concat_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) for (int i = 0; i < ir_node->output_num; i++) { - struct tensor* output = get_ir_graph_tensor(graph, node->input_tensors[i]); + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[i]); std::string name = output->name; tops.push_back(name); } @@ -172,9 +153,7 @@ int Concat_vulkan::create_pipeline(const Option& _opt) if (out_shape.dims == 2) out_shape_unpacked = Tensor(out_shape.w, out_shape.h / elempack, (void*)0, elemsize, elempack); if (out_shape.dims == 3) out_shape_unpacked = Tensor(out_shape.w, out_shape.h, out_shape.c / elempack, (void*)0, elemsize, elempack); - // if (!vkdev->shape_support_image_storage(out_shape_unpacked)) { - support_image_storage = false; opt.use_image_storage = false; } @@ -794,4 +773,4 @@ int Concat_vulkan::record_pipeline(const std::vector& bottom_blobs, st return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/concat_vulkan.hpp b/source/device/vulkan/layer/concat_vulkan.hpp index b03d8efe6..7711c16f0 100644 --- a/source/device/vulkan/layer/concat_vulkan.hpp +++ b/source/device/vulkan/layer/concat_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class Concat_vulkan : public Layer { public: - Concat_vulkan(); - Concat_vulkan(ir_graph_t* graph, ir_node_t* ir_node); + Concat_vulkan(ir_graph_t* graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& 
opt); virtual int destroy_pipeline(const Option& opt); @@ -78,4 +77,4 @@ class Concat_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/convolution_vulkan.cpp b/source/device/vulkan/layer/convolution_vulkan.cpp index d1c7335b6..4a742b29d 100644 --- a/source/device/vulkan/layer/convolution_vulkan.cpp +++ b/source/device/vulkan/layer/convolution_vulkan.cpp @@ -39,18 +39,14 @@ #include "convolution_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Convolution_vulkan::Convolution_vulkan() +Convolution_vulkan::Convolution_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - pipeline_convolution = 0; -} - -Convolution_vulkan::Convolution_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; + one_blob_only = true; padding = 0; innerproduct = 0; @@ -206,18 +202,12 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) // bool is_conv1x1s1d1 = false; bool is_conv3x3s1d1 = false; - // if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16 && ((elempack == 4 && out_elempack == 4) || (elempack == 8 && out_elempack == 8))) - { - // TODO do nothing for wino fix me!!!!! 
- } - // else { - support_image_storage = false; opt.use_image_storage = false; } { - padding = new Padding_vulkan(); + padding = new Padding_vulkan(vkdev); padding->vkdev = vkdev; padding->top = pad_h0; @@ -443,12 +433,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) // ir_tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); // cmd.record_upload(weight_tensor, weight_data_gpu, opt); - if (support_image_storage && opt.use_image_storage) - { - TLOG_INFO("not record_upload weight_data_gpu_image, fix me\n"); - // cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); - } - else { cmd.record_upload(weight_data_packed, weight_data_gpu, opt); } @@ -464,11 +448,6 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) Tensor bias_data_packed; convert_packing(bias_data, bias_data_packed, out_elempack); - if (support_image_storage && opt.use_image_storage) - { - // cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); - } - else { cmd.record_upload(bias_data_packed, bias_data_gpu, opt); } @@ -615,4 +594,4 @@ int Convolution_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& t return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/convolution_vulkan.hpp b/source/device/vulkan/layer/convolution_vulkan.hpp index c0799f877..ff01f1bf2 100644 --- a/source/device/vulkan/layer/convolution_vulkan.hpp +++ b/source/device/vulkan/layer/convolution_vulkan.hpp @@ -52,9 +52,7 @@ namespace TEngine { class Convolution_vulkan : public Layer { public: - Convolution_vulkan(); - // Convolution_vulkan(ir_node* node); - Convolution_vulkan(ir_graph_t* graph, ir_node_t* node); + Convolution_vulkan(ir_graph_t* graph, ir_node_t* node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp 
b/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp index 51f83b773..88e3ebf9a 100644 --- a/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp +++ b/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp @@ -39,21 +39,15 @@ #include "convolutiondepthwise_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan() +ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - pipeline_convolutiondepthwise = 0; -} - -ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - + one_blob_only = true; padding = 0; - pipeline_convolutiondepthwise = 0; pipeline_convolutiondepthwise_pack4 = 0; pipeline_convolutiondepthwise_pack8 = 0; @@ -94,8 +88,7 @@ int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) Option opt = _opt; { - padding = new Padding_vulkan(); - padding->vkdev = vkdev; + padding = new Padding_vulkan(vkdev); padding->top = pad_h0; padding->bottom = pad_h1; @@ -299,4 +292,4 @@ int ConvolutionDepthWise_vulkan::record_pipeline(const VkTensor& bottom_blob, Vk return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp b/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp index 7b867529b..03a2c0688 100644 --- a/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp +++ b/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp @@ -51,8 +51,7 @@ namespace TEngine { class ConvolutionDepthWise_vulkan : public Layer { public: - ConvolutionDepthWise_vulkan(); - ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* node); + ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* node, const GPUDevice* vkdev); virtual int create_pipeline(const 
Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/source/device/vulkan/layer/crop_vulkan.cpp b/source/device/vulkan/layer/crop_vulkan.cpp index d00325e34..700930e04 100644 --- a/source/device/vulkan/layer/crop_vulkan.cpp +++ b/source/device/vulkan/layer/crop_vulkan.cpp @@ -39,30 +39,14 @@ #include "crop_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Crop_vulkan::Crop_vulkan() +Crop_vulkan::Crop_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = false; - - pipeline_crop = 0; - pipeline_crop_pack4 = 0; - pipeline_crop_pack1to4 = 0; - pipeline_crop_pack4to1 = 0; - pipeline_crop_pack8 = 0; - pipeline_crop_pack1to8 = 0; - pipeline_crop_pack4to8 = 0; - pipeline_crop_pack8to4 = 0; - pipeline_crop_pack8to1 = 0; -} - -Crop_vulkan::Crop_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = false; - + one_blob_only = true; pipeline_crop = 0; pipeline_crop_pack4 = 0; pipeline_crop_pack1to4 = 0; @@ -616,4 +600,4 @@ int Crop_vulkan::record_pipeline(const std::vector& bottom_blobs, std: return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/crop_vulkan.hpp b/source/device/vulkan/layer/crop_vulkan.hpp index 2316f07c0..8dab47750 100644 --- a/source/device/vulkan/layer/crop_vulkan.hpp +++ b/source/device/vulkan/layer/crop_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class Crop_vulkan : public Layer { public: - Crop_vulkan(); - Crop_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Crop_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -92,4 +91,4 @@ class Crop_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git 
a/source/device/vulkan/layer/dropout_vulkan.cpp b/source/device/vulkan/layer/dropout_vulkan.cpp index bf46fa34c..3e1f12739 100644 --- a/source/device/vulkan/layer/dropout_vulkan.cpp +++ b/source/device/vulkan/layer/dropout_vulkan.cpp @@ -39,24 +39,15 @@ #include "dropout_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Dropout_vulkan::Dropout_vulkan() +Dropout_vulkan::Dropout_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = false; - - pipeline_dropout = 0; - pipeline_dropout_pack4 = 0; - pipeline_dropout_pack8 = 0; -} - -Dropout_vulkan::Dropout_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = false; - + one_blob_only = true; + support_inplace = true; pipeline_dropout = 0; pipeline_dropout_pack4 = 0; pipeline_dropout_pack8 = 0; @@ -214,4 +205,4 @@ int Dropout_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, c return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/dropout_vulkan.hpp b/source/device/vulkan/layer/dropout_vulkan.hpp index 478345ca7..6cb66fb4e 100644 --- a/source/device/vulkan/layer/dropout_vulkan.hpp +++ b/source/device/vulkan/layer/dropout_vulkan.hpp @@ -48,8 +48,7 @@ namespace TEngine { class Dropout_vulkan : public Layer { public: - Dropout_vulkan(); - Dropout_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Dropout_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -74,4 +73,4 @@ class Dropout_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/eltwise_vulkan.cpp b/source/device/vulkan/layer/eltwise_vulkan.cpp index a8d112bf4..4cb8f2f77 100644 --- 
a/source/device/vulkan/layer/eltwise_vulkan.cpp +++ b/source/device/vulkan/layer/eltwise_vulkan.cpp @@ -39,27 +39,14 @@ #include "eltwise_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Eltwise_vulkan::Eltwise_vulkan() +Eltwise_vulkan::Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = false; - - pipeline_eltwise[0] = 0; - pipeline_eltwise[1] = 0; - pipeline_eltwise_pack4[0] = 0; - pipeline_eltwise_pack4[1] = 0; - pipeline_eltwise_pack8[0] = 0; - pipeline_eltwise_pack8[1] = 0; -} - -Eltwise_vulkan::Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = true; - + one_blob_only = false; pipeline_eltwise[0] = 0; pipeline_eltwise[1] = 0; pipeline_eltwise_pack4[0] = 0; @@ -77,12 +64,13 @@ Eltwise_vulkan::Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) bottoms.push_back(name); } - for (int i = 0; i < ir_node->output_num; i++) - { - struct tensor* output = get_ir_graph_tensor(graph, node->input_tensors[i]); - std::string name = output->name; - tops.push_back(name); - } + struct tensor* output = get_ir_graph_tensor(graph, node->output_tensors[0]); + std::string name = output->name; + tops.push_back(name); + + output_c = output->dims[1]; + output_h = output->dims[2]; + output_w = output->dims[3]; struct eltwise_param* param = (struct eltwise_param*)ir_node->op.param_mem; op_type = (param->type) / 2; @@ -266,4 +254,4 @@ int Eltwise_vulkan::record_pipeline(const std::vector& bottom_blobs, s return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/eltwise_vulkan.hpp b/source/device/vulkan/layer/eltwise_vulkan.hpp index 5830aea6a..d2fe76c7c 100644 --- a/source/device/vulkan/layer/eltwise_vulkan.hpp +++ b/source/device/vulkan/layer/eltwise_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class 
Eltwise_vulkan : public Layer { public: - Eltwise_vulkan(); - Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -86,9 +85,6 @@ class Eltwise_vulkan : public Layer }; int op_type; // Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 - int input_c; - int input_h; - int input_w; int output_c; int output_h; int output_w; @@ -96,4 +92,4 @@ class Eltwise_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/flatten_vulkan.cpp b/source/device/vulkan/layer/flatten_vulkan.cpp index 798402f2c..0c35079f6 100644 --- a/source/device/vulkan/layer/flatten_vulkan.cpp +++ b/source/device/vulkan/layer/flatten_vulkan.cpp @@ -39,14 +39,14 @@ #include "flatten_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { - -Flatten_vulkan::Flatten_vulkan() +Flatten_vulkan::Flatten_vulkan(const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = false; - + support_inplace = false; + one_blob_only = true; pipeline_flatten = 0; pipeline_flatten_pack4 = 0; pipeline_flatten_pack1to4 = 0; @@ -55,11 +55,10 @@ Flatten_vulkan::Flatten_vulkan() pipeline_flatten_pack4to8 = 0; } -Flatten_vulkan::Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +Flatten_vulkan::Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = true; - + one_blob_only = true; pipeline_flatten = 0; pipeline_flatten_pack4 = 0; pipeline_flatten_pack1to4 = 0; @@ -82,18 +81,15 @@ Flatten_vulkan::Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) input_c = input->dims[1]; // param->input_channel; input_h = input->dims[2]; input_w = input->dims[3]; - output_c = output->dims[1]; // 
param->output_channel; - output_h = output->dims[2]; - output_w = output->dims[3]; - output_size = output->dims[3] * output->dims[2] * output->dims[1]; + output_size = output->elem_num; } int Flatten_vulkan::create_pipeline(const Option& _opt) { Option opt = _opt; - const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + const Tensor shape(input_w, input_h, input_c, nullptr); // bottom_shapes.empty() ? Mat() : bottom_shapes[0]; // const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; - const Tensor& out_shape = Tensor(output_size, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; + const Tensor out_shape(output_size, nullptr); // top_shapes.empty() ? Mat() : top_shapes[0]; int elempack = 1; if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 @@ -133,9 +129,7 @@ int Flatten_vulkan::create_pipeline(const Option& _opt) Tensor out_shape_packed; if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); - // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) { - support_image_storage = false; opt.use_image_storage = false; } @@ -325,4 +319,4 @@ int Flatten_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/flatten_vulkan.hpp b/source/device/vulkan/layer/flatten_vulkan.hpp index cd364ddf2..d752b233d 100644 --- a/source/device/vulkan/layer/flatten_vulkan.hpp +++ b/source/device/vulkan/layer/flatten_vulkan.hpp @@ -50,8 +50,8 @@ namespace TEngine { class Flatten_vulkan : public Layer { public: - Flatten_vulkan(); - Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Flatten_vulkan(const GPUDevice* vkdev); + 
Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -78,4 +78,4 @@ class Flatten_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/innerproduct_vulkan.cpp b/source/device/vulkan/layer/innerproduct_vulkan.cpp index 8e1d66b8a..df8d44a1e 100644 --- a/source/device/vulkan/layer/innerproduct_vulkan.cpp +++ b/source/device/vulkan/layer/innerproduct_vulkan.cpp @@ -39,32 +39,14 @@ #include "innerproduct_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -InnerProduct_vulkan::InnerProduct_vulkan() +InnerProduct_vulkan::InnerProduct_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = true; - - flatten = 0; - - pipeline_innerproduct = 0; - pipeline_innerproduct_pack4 = 0; - pipeline_innerproduct_pack1to4 = 0; - pipeline_innerproduct_pack4to1 = 0; - pipeline_innerproduct_pack8 = 0; - pipeline_innerproduct_pack1to8 = 0; - pipeline_innerproduct_pack4to8 = 0; - pipeline_innerproduct_pack8to4 = 0; - pipeline_innerproduct_pack8to1 = 0; -} - -InnerProduct_vulkan::InnerProduct_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = false; - + one_blob_only = true; flatten = 0; pipeline_innerproduct = 0; @@ -148,13 +130,11 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt) if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); { - support_image_storage = false; opt.use_image_storage = false; } { - flatten = new Flatten_vulkan(); - flatten->vkdev = vkdev; + flatten = new Flatten_vulkan(vkdev); flatten->input_w = shape.w; flatten->input_h = shape.h; @@ -346,11 +326,6 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd, 
const Option& opt) } } - if (support_image_storage && opt.use_image_storage) - { - // cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); - } - else { cmd.record_upload(weight_data_packed, weight_data_gpu, opt); } @@ -362,11 +337,6 @@ int InnerProduct_vulkan::upload_model(VkTransfer& cmd, const Option& opt) Tensor bias_data_packed; convert_packing(bias_data, bias_data_packed, out_elempack); - if (support_image_storage && opt.use_image_storage) - { - // cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); - } - else { cmd.record_upload(bias_data_packed, bias_data_gpu, opt); } @@ -464,4 +434,4 @@ int InnerProduct_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/innerproduct_vulkan.hpp b/source/device/vulkan/layer/innerproduct_vulkan.hpp index 0549e24f6..7641dd2c8 100644 --- a/source/device/vulkan/layer/innerproduct_vulkan.hpp +++ b/source/device/vulkan/layer/innerproduct_vulkan.hpp @@ -52,8 +52,7 @@ namespace TEngine { class InnerProduct_vulkan : public Layer { public: - InnerProduct_vulkan(); - InnerProduct_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + InnerProduct_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/source/device/vulkan/layer/interp_vulkan.cpp b/source/device/vulkan/layer/interp_vulkan.cpp index 81c8ae748..eaec37214 100644 --- a/source/device/vulkan/layer/interp_vulkan.cpp +++ b/source/device/vulkan/layer/interp_vulkan.cpp @@ -39,30 +39,14 @@ #include "interp_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Interp_vulkan::Interp_vulkan() +Interp_vulkan::Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = 
false; - - pipeline_interp = 0; - pipeline_interp_pack4 = 0; - pipeline_interp_pack8 = 0; - - pipeline_interp_bicubic_coeffs_x = 0; - pipeline_interp_bicubic_coeffs_y = 0; - pipeline_interp_bicubic = 0; - pipeline_interp_bicubic_pack4 = 0; - pipeline_interp_bicubic_pack8 = 0; -} - -Interp_vulkan::Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = false; - + one_blob_only = true; pipeline_interp = 0; pipeline_interp_pack4 = 0; pipeline_interp_pack8 = 0; @@ -158,9 +142,7 @@ int Interp_vulkan::create_pipeline(const Option& _opt) if (out_shape.dims == 3) out_shape_packed = Tensor(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); // check blob shape - // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) { - support_image_storage = false; opt.use_image_storage = false; } @@ -467,4 +449,4 @@ int Interp_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_bl return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/interp_vulkan.hpp b/source/device/vulkan/layer/interp_vulkan.hpp index 98574f499..b7b56945a 100644 --- a/source/device/vulkan/layer/interp_vulkan.hpp +++ b/source/device/vulkan/layer/interp_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class Interp_vulkan : public Layer { public: - Interp_vulkan(); - Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -87,4 +86,4 @@ class Interp_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/packing_vulkan.cpp b/source/device/vulkan/layer/packing_vulkan.cpp index 88a6de812..bea2692de 100644 --- 
a/source/device/vulkan/layer/packing_vulkan.cpp +++ b/source/device/vulkan/layer/packing_vulkan.cpp @@ -39,14 +39,14 @@ #include "packing_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Packing_vulkan::Packing_vulkan() +Packing_vulkan::Packing_vulkan(const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - // support_image_storage = true; - + one_blob_only = true; pipeline_packing = 0; pipeline_packing_pack4 = 0; pipeline_packing_pack8 = 0; @@ -90,9 +90,7 @@ int Packing_vulkan::create_pipeline(const Option& _opt) // if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); // check blob shape - // if (!vkdev->shape_support_image_storage(out_shape_packed)) { - // support_image_storage = false; opt.use_image_storage = false; } @@ -487,4 +485,4 @@ int Packing_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/packing_vulkan.hpp b/source/device/vulkan/layer/packing_vulkan.hpp index f528edf11..dc5cf0a4e 100644 --- a/source/device/vulkan/layer/packing_vulkan.hpp +++ b/source/device/vulkan/layer/packing_vulkan.hpp @@ -48,7 +48,7 @@ namespace TEngine { class Packing_vulkan : public Layer { public: - Packing_vulkan(); + Packing_vulkan(const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/source/device/vulkan/layer/padding_vulkan.cpp b/source/device/vulkan/layer/padding_vulkan.cpp index 27fa57853..fb4bfd583 100644 --- a/source/device/vulkan/layer/padding_vulkan.cpp +++ b/source/device/vulkan/layer/padding_vulkan.cpp @@ -39,12 +39,14 @@ #include "padding_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Padding_vulkan::Padding_vulkan() 
+Padding_vulkan::Padding_vulkan(const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; + one_blob_only = true; pipeline_padding = 0; pipeline_padding_pack4 = 0; pipeline_padding_pack8 = 0; @@ -169,4 +171,4 @@ int Padding_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/padding_vulkan.hpp b/source/device/vulkan/layer/padding_vulkan.hpp index 03bbce43d..c99e0d005 100644 --- a/source/device/vulkan/layer/padding_vulkan.hpp +++ b/source/device/vulkan/layer/padding_vulkan.hpp @@ -48,7 +48,7 @@ namespace TEngine { class Padding_vulkan : public Layer { public: - Padding_vulkan(); + Padding_vulkan(GPUDevice const* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/source/device/vulkan/layer/permute_vulkan.cpp b/source/device/vulkan/layer/permute_vulkan.cpp index 0bead6791..d83a04f43 100644 --- a/source/device/vulkan/layer/permute_vulkan.cpp +++ b/source/device/vulkan/layer/permute_vulkan.cpp @@ -39,30 +39,14 @@ #include "permute_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Permute_vulkan::Permute_vulkan() +Permute_vulkan::Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = true; - - pipeline_permute = 0; - pipeline_permute_pack4 = 0; - pipeline_permute_pack1to4 = 0; - pipeline_permute_pack4to1 = 0; - pipeline_permute_pack8 = 0; - pipeline_permute_pack1to8 = 0; - pipeline_permute_pack4to8 = 0; - pipeline_permute_pack8to4 = 0; - pipeline_permute_pack8to1 = 0; -} - -Permute_vulkan::Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = true; - + one_blob_only = true; pipeline_permute = 0; pipeline_permute_pack4 = 0; pipeline_permute_pack1to4 = 0; @@ 
-158,10 +142,7 @@ int Permute_vulkan::create_pipeline(const Option& _opt) if (out_shape.dims == 2) out_shape_packed = Tensor(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); if (out_shape.dims == 3) out_shape_packed = Tensor(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); - // check blob shape - // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) { - support_image_storage = false; opt.use_image_storage = false; } @@ -479,4 +460,4 @@ int Permute_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/permute_vulkan.hpp b/source/device/vulkan/layer/permute_vulkan.hpp index 2a6763c13..9be16d8eb 100644 --- a/source/device/vulkan/layer/permute_vulkan.hpp +++ b/source/device/vulkan/layer/permute_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class Permute_vulkan : public Layer { public: - Permute_vulkan(); - Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -81,4 +80,4 @@ class Permute_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/pooling_vulkan.cpp b/source/device/vulkan/layer/pooling_vulkan.cpp index 8f4234367..90e8c1574 100644 --- a/source/device/vulkan/layer/pooling_vulkan.cpp +++ b/source/device/vulkan/layer/pooling_vulkan.cpp @@ -39,23 +39,15 @@ #include "pooling_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Pooling_vulkan::Pooling_vulkan() +Pooling_vulkan::Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - 
support_vulkan = true; - pipeline_pooling = 0; - pipeline_pooling_pack4 = 0; - pipeline_pooling_pack8 = 0; - pipeline_pooling_global = 0; - pipeline_pooling_global_pack4 = 0; - pipeline_pooling_global_pack8 = 0; -} + one_blob_only = true; -Pooling_vulkan::Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; pipeline_pooling = 0; pipeline_pooling_pack4 = 0; pipeline_pooling_pack8 = 0; @@ -123,8 +115,7 @@ int Pooling_vulkan::create_pipeline(const Option& opt) } { - padding = new Padding_vulkan(); - padding->vkdev = vkdev; + padding = new Padding_vulkan(vkdev); padding->top = pad_h0; padding->bottom = pad_h1; diff --git a/source/device/vulkan/layer/pooling_vulkan.hpp b/source/device/vulkan/layer/pooling_vulkan.hpp index 33be747b2..c12858c9f 100644 --- a/source/device/vulkan/layer/pooling_vulkan.hpp +++ b/source/device/vulkan/layer/pooling_vulkan.hpp @@ -51,8 +51,7 @@ namespace TEngine { class Pooling_vulkan : public Layer { public: - Pooling_vulkan(); - Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); diff --git a/source/device/vulkan/layer/priorbox_vulkan.cpp b/source/device/vulkan/layer/priorbox_vulkan.cpp index 23198f4e8..efb6f36ca 100644 --- a/source/device/vulkan/layer/priorbox_vulkan.cpp +++ b/source/device/vulkan/layer/priorbox_vulkan.cpp @@ -42,18 +42,10 @@ namespace TEngine { -PriorBox_vulkan::PriorBox_vulkan() +PriorBox_vulkan::PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - - pipeline_priorbox = 0; - pipeline_priorbox_mxnet = 0; -} - -PriorBox_vulkan::PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - + one_blob_only = false; pipeline_priorbox = 0; pipeline_priorbox_mxnet = 0; @@ -351,4 +343,4 @@ int 
PriorBox_vulkan::record_pipeline(const std::vector& bottom_blobs, return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/priorbox_vulkan.hpp b/source/device/vulkan/layer/priorbox_vulkan.hpp index 3ae12f99e..8bf388b1c 100644 --- a/source/device/vulkan/layer/priorbox_vulkan.hpp +++ b/source/device/vulkan/layer/priorbox_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class PriorBox_vulkan : public Layer { public: - PriorBox_vulkan(); - PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -93,4 +92,4 @@ class PriorBox_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/relu_vulkan.cpp b/source/device/vulkan/layer/relu_vulkan.cpp index 510d4245b..101fe10ee 100644 --- a/source/device/vulkan/layer/relu_vulkan.cpp +++ b/source/device/vulkan/layer/relu_vulkan.cpp @@ -39,24 +39,15 @@ #include "relu_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -ReLU_vulkan::ReLU_vulkan() +ReLU_vulkan::ReLU_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = true; - - pipeline_relu = 0; - pipeline_relu_pack4 = 0; - pipeline_relu_pack8 = 0; -} - -ReLU_vulkan::ReLU_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = false; - + one_blob_only = true; + support_inplace = true; pipeline_relu = 0; pipeline_relu_pack4 = 0; pipeline_relu_pack8 = 0; @@ -213,4 +204,4 @@ int ReLU_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git 
a/source/device/vulkan/layer/relu_vulkan.hpp b/source/device/vulkan/layer/relu_vulkan.hpp index c707481c8..ed5170e3b 100644 --- a/source/device/vulkan/layer/relu_vulkan.hpp +++ b/source/device/vulkan/layer/relu_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class ReLU_vulkan : public Layer { public: - ReLU_vulkan(); - ReLU_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + ReLU_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -76,4 +75,4 @@ class ReLU_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/reshape_vulkan.cpp b/source/device/vulkan/layer/reshape_vulkan.cpp index 3f12e241f..4e7bac661 100644 --- a/source/device/vulkan/layer/reshape_vulkan.cpp +++ b/source/device/vulkan/layer/reshape_vulkan.cpp @@ -39,35 +39,13 @@ #include "reshape_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Reshape_vulkan::Reshape_vulkan() +Reshape_vulkan::Reshape_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = true; - - permute_hwc = 0; - permute_hc = 0; - permute_hw = 0; - permute_chw = 0; - - pipeline_reshape = 0; - pipeline_reshape_pack4 = 0; - pipeline_reshape_pack1to4 = 0; - pipeline_reshape_pack4to1 = 0; - pipeline_reshape_pack8 = 0; - pipeline_reshape_pack1to8 = 0; - pipeline_reshape_pack4to8 = 0; - pipeline_reshape_pack8to4 = 0; - pipeline_reshape_pack8to1 = 0; -} - -Reshape_vulkan::Reshape_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = true; - permute_hwc = 0; permute_hc = 0; permute_hw = 0; @@ -202,9 +180,7 @@ int Reshape_vulkan::create_pipeline(const Option& _opt) if (out_shape_permuted.dims == 3) out_shape_packed = Tensor(out_shape_permuted.w, out_shape_permuted.h, 
out_shape_permuted.c / out_elempack, (void*)0, out_elemsize, out_elempack); // check blob shape - // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) { - support_image_storage = false; opt.use_image_storage = false; } @@ -582,4 +558,4 @@ int Reshape_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_b return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/layer/reshape_vulkan.hpp b/source/device/vulkan/layer/reshape_vulkan.hpp index 1d52e48a8..b1349dcd6 100644 --- a/source/device/vulkan/layer/reshape_vulkan.hpp +++ b/source/device/vulkan/layer/reshape_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class Reshape_vulkan : public Layer { public: - Reshape_vulkan(); - Reshape_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Reshape_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -94,4 +93,4 @@ class Reshape_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/layer/softmax_vulkan.cpp b/source/device/vulkan/layer/softmax_vulkan.cpp index 8ee653505..1c4c565ce 100644 --- a/source/device/vulkan/layer/softmax_vulkan.cpp +++ b/source/device/vulkan/layer/softmax_vulkan.cpp @@ -39,35 +39,15 @@ #include "softmax_vulkan.hpp" #include "../layer_shader_type.h" +#include "vulkan_layer.hpp" namespace TEngine { -Softmax_vulkan::Softmax_vulkan() +Softmax_vulkan::Softmax_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev) + : Layer(vkdev) { - support_vulkan = true; - support_image_storage = true; - - pipeline_softmax_reduce_max = 0; - pipeline_softmax_exp_sub_max = 0; - pipeline_softmax_reduce_sum = 0; - pipeline_softmax_div_sum = 0; - - pipeline_softmax_reduce_max_pack4 = 0; - pipeline_softmax_exp_sub_max_pack4 = 0; - 
pipeline_softmax_reduce_sum_pack4 = 0; - pipeline_softmax_div_sum_pack4 = 0; - - pipeline_softmax_reduce_max_pack8 = 0; - pipeline_softmax_exp_sub_max_pack8 = 0; - pipeline_softmax_reduce_sum_pack8 = 0; - pipeline_softmax_div_sum_pack8 = 0; -} - -Softmax_vulkan::Softmax_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) -{ - support_vulkan = true; - support_image_storage = true; - + one_blob_only = true; + support_inplace = true; pipeline_softmax_reduce_max = 0; pipeline_softmax_exp_sub_max = 0; pipeline_softmax_reduce_sum = 0; diff --git a/source/device/vulkan/layer/softmax_vulkan.hpp b/source/device/vulkan/layer/softmax_vulkan.hpp index 94c1be27c..a52eea16e 100644 --- a/source/device/vulkan/layer/softmax_vulkan.hpp +++ b/source/device/vulkan/layer/softmax_vulkan.hpp @@ -50,8 +50,7 @@ namespace TEngine { class Softmax_vulkan : public Layer { public: - Softmax_vulkan(); - Softmax_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + Softmax_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node, const GPUDevice* vkdev); virtual int create_pipeline(const Option& opt); virtual int destroy_pipeline(const Option& opt); @@ -86,4 +85,4 @@ class Softmax_vulkan : public Layer } // namespace TEngine -#endif \ No newline at end of file +#endif diff --git a/source/device/vulkan/shaders/concat.comp b/source/device/vulkan/shaders/concat.comp index 5c904b42e..6275ecca1 100644 --- a/source/device/vulkan/shaders/concat.comp +++ b/source/device/vulkan/shaders/concat.comp @@ -27,25 +27,19 @@ layout (constant_id = 0) const int axis = 0; layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; layout (constant_id = shape_constant_id_offset + 1) const int w = 0; layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; +layout (constant_id = shape_constant_id_offset + 3) const int d = 0; +layout (constant_id = 
shape_constant_id_offset + 4) const int c = 0; +layout (constant_id = shape_constant_id_offset + 5) const int cstep = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; - -layout (local_size_x_id = 233) in; -layout (local_size_y_id = 234) in; -layout (local_size_z_id = 235) in; +layout (constant_id = shape_constant_id_offset + 6) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outd = 0; +layout (constant_id = shape_constant_id_offset + 10) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 11) const int outcstep = 0; #if NCNN_image_shader -layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; -layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; -layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; -layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; #else layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; @@ -57,12 +51,14 @@ layout (push_constant) uniform parameter int dims; int w; int h; + int d; int c; int cstep; int outdims; int outw; int outh; + int outd; int outc; int outcstep; @@ -75,32 +71,34 @@ void main() int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + if (gx >= psc(w) || gy >= psc(h) * psc(d) || gz >= psc(c)) return; -#if NCNN_image_shader - 
if (psc(dims) == 1) - { - image1d_cp1(top_blob_1d, gx + p.offset, bottom_blob_1d, gx); - } - else if (psc(dims) == 2) + int positive_axis = axis < 0 ? psc(dims) + axis : axis; + + ivec3 gxyz; + + if (psc(dims) == 4) { - if (axis == 0) image2d_cp1(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy)); - if (axis == 1) image2d_cp1(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy)); + int yd = gy / psc(h); + int yh = gy % psc(h); + + ivec4 gxydz = ivec4(gx, yh, yd, gz); + gxydz[psc(dims) - 1 - positive_axis] += p.offset; + + gxyz = ivec3(gxydz.r, gxydz.g + gxydz.b * psc(outh), gxydz.a); } - else // if (psc(dims) == 3) + else { - if (axis == 0) image3d_cp1(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz)); - if (axis == 1) image3d_cp1(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz)); - if (axis == 2) image3d_cp1(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + gxyz = ivec3(gx, gy, gz); + gxyz[psc(dims) - 1 - positive_axis] += p.offset; } + +#if NCNN_image_shader + image3d_cp1(top_blob_3d, gxyz, bottom_blob_3d, ivec3(gx, gy, gz)); #else const int gi = gz * psc(cstep) + gy * psc(w) + gx; - ivec3 gxyz = ivec3(gx, gy, gz); - - gxyz[psc(dims) - 1 - axis] += p.offset; - int v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; buffer_cp1(top_blob_data, v_offset, bottom_blob_data, gi); diff --git a/source/device/vulkan/vulkan_allocator.cpp b/source/device/vulkan/vulkan_allocator.cpp index b901923cd..be765183e 100644 --- a/source/device/vulkan/vulkan_allocator.cpp +++ b/source/device/vulkan/vulkan_allocator.cpp @@ -1428,7 +1428,6 @@ VkWeightStagingAllocator::~VkWeightStagingAllocator() VkBufferMemory* VkWeightStagingAllocator::fastMalloc(size_t size) { - printf("VkWeightStagingAllocator fastMalloc %lu\n", size); VkBufferMemory* ptr = new VkBufferMemory; ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | 
VK_BUFFER_USAGE_TRANSFER_DST_BIT); diff --git a/source/device/vulkan/vulkan_device.cc b/source/device/vulkan/vulkan_device.cc index 57067405b..df45ec145 100644 --- a/source/device/vulkan/vulkan_device.cc +++ b/source/device/vulkan/vulkan_device.cc @@ -27,8 +27,7 @@ #include "vulkan_limit.hpp" #include "vulkan_graph.hpp" -extern "C" -{ +extern "C" { #include "api/c_api.h" #include "device/device.h" #include "graph/tensor.h" @@ -44,7 +43,6 @@ extern "C" #include - int vulkan_describe(struct device* device, struct vector* allowed_ops, struct vector* blocked_ops, struct vector* precision) { (void)device; @@ -78,7 +76,6 @@ int vulkan_describe(struct device* device, struct vector* allowed_ops, struct ve return 0; } - int vulkan_evaluation(struct device* device, struct subgraph* sub_graph, struct vector* evolution_tensors, struct vector* evolution_nodes) { // nothing to do with vulkan @@ -90,7 +87,6 @@ int vulkan_evaluation(struct device* device, struct subgraph* sub_graph, struct return 0; } - int vulkan_allocate(struct device* device, struct subgraph* sub_graph) { if (nullptr == device) @@ -112,7 +108,6 @@ int vulkan_allocate(struct device* device, struct subgraph* sub_graph) return 0; } - int vulkan_release(struct device* device, struct subgraph* sub_graph) { (void)sub_graph; @@ -162,48 +157,41 @@ int vulkan_split_graph(struct graph* ir_graph) return 0; } - -extern "C" -{ +extern "C" { static struct interface vulkan_interface = { - .init = vulkan_dev_init, - .pre_run = vulkan_dev_prerun, - .run = vulkan_dev_run, - .post_run = vulkan_dev_postrun, - .async_run = nullptr, - .async_wait = nullptr, - .release_graph = nullptr, - .release_device = vulkan_dev_release, + .init = vulkan_dev_init, + .pre_run = vulkan_dev_prerun, + .run = vulkan_dev_run, + .post_run = vulkan_dev_postrun, + .async_run = nullptr, + .async_wait = nullptr, + .release_graph = nullptr, + .release_device = vulkan_dev_release, }; - static struct allocator vulkan_allocator = { - .describe = 
vulkan_describe, - .evaluation = vulkan_evaluation, - .allocate = vulkan_allocate, - .release = vulkan_release, + .describe = vulkan_describe, + .evaluation = vulkan_evaluation, + .allocate = vulkan_allocate, + .release = vulkan_release, }; - static struct optimizer vulkan_optimizer = { - .split_graph = vulkan_split_graph, - .optimize_graph = nullptr, + .split_graph = vulkan_split_graph, + .optimize_graph = nullptr, }; - - static struct vulkan_device vulkan_dev = { - .base = { - .name = VULKAN_DEV_NAME, - .interface = &vulkan_interface, - .allocator = &vulkan_allocator, - .optimizer = &vulkan_optimizer, - .scheduler = nullptr, - .privacy = nullptr, - }, + .base = { + .name = VULKAN_DEV_NAME, + .interface = &vulkan_interface, + .allocator = &vulkan_allocator, + .optimizer = &vulkan_optimizer, + .scheduler = nullptr, + .privacy = nullptr, + }, }; - int register_vulkan_device(void) { int ret = register_device(&vulkan_dev.base); @@ -217,7 +205,6 @@ int register_vulkan_device(void) return 0; } - int unregister_vulkan_device(void) { int ret = unregister_device(&vulkan_dev.base); diff --git a/source/device/vulkan/vulkan_executor.cc b/source/device/vulkan/vulkan_executor.cc index ca030e894..b2f0c1b41 100644 --- a/source/device/vulkan/vulkan_executor.cc +++ b/source/device/vulkan/vulkan_executor.cc @@ -45,7 +45,6 @@ bool VULKANEngine::init() int VULKANEngine::VULKANEnginePreRun(struct subgraph* subgraph) { // TLOG_INFO("==== vulkan prerun start ====\n"); - create_gpu_instance(); // struct device *vk_dev = (struct device *)dev; struct graph *orig_graph = subgraph->graph; // struct vk_dev_priv *priv = (struct vk_dev_priv *)orig_graph->dev_priv; @@ -93,6 +92,5 @@ int VULKANEngine::VULKANEngineRun(struct subgraph* subgraph) void VULKANEngine::VULKANEnginePostRun() { - destroy_gpu_instance(); return; -}; \ No newline at end of file +}; diff --git a/source/device/vulkan/vulkan_executor.hpp b/source/device/vulkan/vulkan_executor.hpp index c4cc99a6c..244b5e40e 100644 --- 
a/source/device/vulkan/vulkan_executor.hpp +++ b/source/device/vulkan/vulkan_executor.hpp @@ -49,16 +49,6 @@ extern "C" { // typedef std::map dict_uint2clmem; -struct VULKANqueue -{ - std::string name; - int dims; - // cl_kernel queue_kernel; - // cl_event enentPoint; - size_t* queue_global_work_size; - size_t* queue_local_work_size; -}; - class VULKANEngine { public: @@ -72,11 +62,6 @@ class VULKANEngine private: bool init(); -private: -public: - // dict_uint2clmem vulkan_tensor_map; - std::vector queue_list; - public: int bin_num; }; diff --git a/source/device/vulkan/vulkan_gpu.cpp b/source/device/vulkan/vulkan_gpu.cpp index fba68aa70..b42bd8a52 100644 --- a/source/device/vulkan/vulkan_gpu.cpp +++ b/source/device/vulkan/vulkan_gpu.cpp @@ -798,7 +798,7 @@ int create_gpu_instance() } if (gpu_info.support_VK_KHR_16bit_storage) { - gpu_info.support_fp16_storage = query16BitStorageFeatures.storageBuffer16BitAccess && query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess; + gpu_info.support_fp16_storage = query16BitStorageFeatures.storageBuffer16BitAccess && query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess && query16BitStorageFeatures.storageInputOutput16; } if (gpu_info.support_VK_KHR_shader_float16_int8) { @@ -1945,8 +1945,7 @@ int GPUDevice::create_utility_operator() opt.use_shader_pack8 = true; { // create packing layer - TEngine::Packing_vulkan* uop = new Packing_vulkan(); - uop->vkdev = this; + TEngine::Packing_vulkan* uop = new Packing_vulkan(this); uop->out_elempack = k == 0 ? 1 : k == 1 ? 
4 : 8; diff --git a/source/device/vulkan/vulkan_graph.cc b/source/device/vulkan/vulkan_graph.cc index 222477f80..84c9365ff 100644 --- a/source/device/vulkan/vulkan_graph.cc +++ b/source/device/vulkan/vulkan_graph.cc @@ -23,8 +23,13 @@ */ #include "vulkan_graph.hpp" +#include "api/c_api.h" #include "vulkan_executor.hpp" +#include "vulkan_gpu.hpp" +#include +#include +#include #include #include "vulkan_graph.hpp" #include "vulkan_pipeline.hpp" @@ -51,23 +56,46 @@ #include "layer/crop_vulkan.hpp" #include +#include -extern "C" -{ +extern "C" { #include "graph/tensor.h" #include "graph/node.h" #include "graph/graph.h" #include "graph/subgraph.h" } +#define VULKAN_DEBUG_TENSOR 0 + +static void save_tensor(const char* fname, const float* vals, std::vector const& dims) +{ + auto fout = fopen(fname, "w+"); + assert(fout); + int n = 1; + + for (auto const d : dims) + { + fprintf(fout, "%d ", d); + n *= d; + } + fprintf(fout, "\n"); + + for (int i = 0; i < n; ++i) + { + fprintf(fout, "%f ", vals[i]); + } + fprintf(fout, "\n"); + fflush(fout); + fclose(fout); +} int vulkan_dev_init(struct device* dev) { (void)dev; + TEngine::create_gpu_instance(); return 0; } - int vulkan_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options) { subgraph->device_graph = new VULKANEngine; @@ -76,14 +104,12 @@ int vulkan_dev_prerun(struct device* dev, struct subgraph* subgraph, void* optio return engine->VULKANEnginePreRun(subgraph); } - int vulkan_dev_run(struct device* dev, struct subgraph* subgraph) { auto engine = (VULKANEngine*)subgraph->device_graph; return engine->VULKANEngineRun(subgraph); } - int vulkan_dev_postrun(struct device* dev, struct subgraph* subgraph) { auto engine = (VULKANEngine*)subgraph->device_graph; @@ -93,15 +119,13 @@ int vulkan_dev_postrun(struct device* dev, struct subgraph* subgraph) return 0; } - int vulkan_dev_release(struct device* dev) { (void)dev; + TEngine::destroy_gpu_instance(); return 0; } - - namespace TEngine { static double 
get_cur_time(void) @@ -113,7 +137,6 @@ static double get_cur_time(void) return tv.tv_sec * 1000.0 + (tv.tv_usec / 1000.0); } - VulkanGraph::VulkanGraph(struct subgraph* graph) { vkdev = get_gpu_device(); @@ -123,13 +146,13 @@ VulkanGraph::VulkanGraph(struct subgraph* graph) // set graph options if (!vkdev->info.support_fp16_packed || !vkdev->info.support_fp16_storage) opt.use_fp16_packed = false; - if (!vkdev->info.support_fp16_storage) + if (!vkdev->info.support_fp16_storage) { opt.use_fp16_storage = false; opt.use_shader_pack8 = false; - } + } - if (!vkdev->info.support_fp16_arithmetic) + if (!vkdev->info.support_fp16_arithmetic) opt.use_fp16_arithmetic = false; TLOG_INFO("use_fp16_packed %d\n", opt.use_fp16_packed); @@ -137,169 +160,141 @@ VulkanGraph::VulkanGraph(struct subgraph* graph) TLOG_INFO("use_shader_pack8 %d\n", opt.use_shader_pack8); TLOG_INFO("use_fp16_arithmetic %d\n", opt.use_fp16_arithmetic); - struct subgraph *subgraph = (struct subgraph *)graph; - struct graph *ir_graph = subgraph->graph; + struct subgraph* subgraph = (struct subgraph*)graph; + struct graph* ir_graph = subgraph->graph; int node_num = subgraph->node_num; sgraph = graph; - for(int i = 0; i < node_num; i++) + for (int i = 0; i < node_num; i++) { - struct node *ir_node = get_ir_graph_node(ir_graph, subgraph->node_list[i]); + struct node* ir_node = get_ir_graph_node(ir_graph, subgraph->node_list[i]); + for (int i = 0; i < ir_node->input_num; ++i) + { + struct tensor* input = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[i]); + const auto name = input->name; + tensor_map_[name] = input; + tensor_map[name] = Tensor(input); + VkTensor vktensor; + vktensor_map_[name] = vktensor; + } + + for (int i = 0; i < ir_node->output_num; ++i) + { + struct tensor* output = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[i]); + const auto name = output->name; + tensor_map_[name] = output; + tensor_map[name] = Tensor(output); + } if (ir_node->op.type == OP_CONST || ir_node->op.type == 
OP_INPUT) continue; else if (ir_node->op.type == OP_CLIP) ir_node->op.type = OP_RELU6; - if(ir_node->op.type == OP_CONV) + if (ir_node->op.type == OP_CONV) { - struct conv_param *conv_param = (struct conv_param *)ir_node->op.param_mem; + struct conv_param* conv_param = (struct conv_param*)ir_node->op.param_mem; if (conv_param->group == conv_param->output_channel && conv_param->group != 1 && ir_graph->graph_layout == TENGINE_LAYOUT_NCHW) // DW { - Layer* layer = new ConvolutionDepthWise_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "ConvolutionDepthWise"; + Layer* layer = new ConvolutionDepthWise_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } else { - Layer* layer = new Convolution_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Convolution"; + Layer* layer = new Convolution_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } } - if(ir_node->op.type == OP_POOL) + if (ir_node->op.type == OP_POOL) { - Layer* layer = new Pooling_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Pooling"; + Layer* layer = new Pooling_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_FC) + if (ir_node->op.type == OP_FC) { - Layer* layer = new InnerProduct_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "InnerProduct"; + Layer* layer = new InnerProduct_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_FLATTEN) + if (ir_node->op.type == OP_FLATTEN) { - Layer* layer = new Flatten_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Flatten"; + Layer* layer = new Flatten_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_SOFTMAX) + if (ir_node->op.type == OP_SOFTMAX) { - Layer* layer = new Softmax_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Softmax"; + Layer* layer = new Softmax_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - 
if(ir_node->op.type == OP_RELU) + if (ir_node->op.type == OP_RELU) { - Layer* layer = new ReLU_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "ReLU"; + Layer* layer = new ReLU_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_DROPOUT) + if (ir_node->op.type == OP_DROPOUT) { - Layer* layer = new Dropout_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Dropout"; + Layer* layer = new Dropout_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_ELTWISE) + if (ir_node->op.type == OP_ELTWISE) { - Layer* layer = new Eltwise_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Eltwise"; + Layer* layer = new Eltwise_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_PRIORBOX) + if (ir_node->op.type == OP_PRIORBOX) { - Layer* layer = new PriorBox_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "PriorBox"; + Layer* layer = new PriorBox_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_PERMUTE) + if (ir_node->op.type == OP_PERMUTE) { - Layer* layer = new Permute_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Permute"; + Layer* layer = new Permute_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_CONCAT) + if (ir_node->op.type == OP_CONCAT) { - Layer* layer = new Concat_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Concat"; + Layer* layer = new Concat_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_RESHAPE) + if (ir_node->op.type == OP_RESHAPE) { - Layer* layer = new Reshape_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Reshape"; + Layer* layer = new Reshape_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_INTERP || ir_node->op.type == OP_UPSAMPLE) + if (ir_node->op.type == 
OP_INTERP || ir_node->op.type == OP_UPSAMPLE) { - Layer* layer = new Interp_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Interp"; + Layer* layer = new Interp_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - if(ir_node->op.type == OP_CROP) + if (ir_node->op.type == OP_CROP) { - Layer* layer = new Crop_vulkan(ir_graph, ir_node); - layer->vkdev = vkdev; - layer->name = "Crop"; + Layer* layer = new Crop_vulkan(ir_graph, ir_node, vkdev); layers.push_back(layer); } - - struct tensor *input = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); - std::string name = input->name; - tensor_map_[name] = input; - tensor_map[name] = Tensor(input); - - VkTensor vktensor; - vktensor_map_[name] = vktensor; - - struct tensor *output = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); - name = output->name; - tensor_map_[name] = output; - tensor_map[name] = Tensor(output); } } VulkanGraph::~VulkanGraph() { - for(auto& ptr: mem_buf_vector_) - std::free(ptr); + for (auto& ptr : mem_buf_vector_) + std::free(ptr); } int VulkanGraph::upload_model() { - -// printf("run upload_model\n"); TEngine::VkTransfer cmd(vkdev); if (!weight_vkallocator) { @@ -309,31 +304,28 @@ int VulkanGraph::upload_model() { weight_staging_vkallocator = new VkWeightStagingAllocator(vkdev); } - + Option opt_upload = opt; opt_upload.blob_vkallocator = weight_vkallocator; opt_upload.workspace_vkallocator = weight_vkallocator; opt_upload.staging_vkallocator = weight_staging_vkallocator; int layer_size = layers.size(); - for(int i = 0; i < layer_size; i++) + for (int i = 0; i < layer_size; i++) { layers[i]->upload_model(cmd, opt_upload); - } - + } + cmd.submit_and_wait(); -// printf("run upload_model done\n"); return 0; } int VulkanGraph::create_pipeline() { - // printf("start to run create pipeline\n"); - for (size_t i=0; iname.c_str()); int cret = layer->create_pipeline(opt1); if (cret != 0) { @@ -341,14 +333,11 @@ int VulkanGraph::create_pipeline() return -1; } } 
-// printf("run create_pipeline done\n"); return 0; } int VulkanGraph::record_graph_pipeline() { - // printf("start to run record pipeline, layer size:%d\n", layers.size()); - TEngine::VkCompute cmd(vkdev); if (!opt.blob_vkallocator) @@ -365,63 +354,50 @@ int VulkanGraph::record_graph_pipeline() local_staging_vkallocator = vkdev->acquire_staging_allocator(); opt.staging_vkallocator = local_staging_vkallocator; } - std::string name; - Tensor input; - Tensor output; - - // printf("tensor_map size:%d ---------------------\n", tensor_map.size()); + // upload input tensor + for (int i = 0; i < sgraph->input_num; ++i) + { + auto input_tensor = get_ir_graph_tensor(sgraph->graph, sgraph->input_tensor_list[i]); + const auto name = get_tensor_name(input_tensor); + tensor_map_[name] = input_tensor; + cmd.record_upload(tensor_map_[name], vktensor_map_[name], opt); + } - for (size_t i=0; iname.c_str()); - - std::string in_name = layer->bottoms[0]; std::string out_name = layer->tops[0]; - name = out_name; - // upload Tensor data to VkTensor - if((i==0) && vktensor_map_[in_name].dims == 0) + int cret = 0; + if (layer->one_blob_only) { - cmd.record_upload(tensor_map_[in_name], vktensor_map_[in_name], opt); - // cmd.record_download(vktensor_map_[in_name], tensor_map[in_name], opt); - } - - int cret; - if(layer->name == "ReLU" || layer->name == "Dropout" || layer->name == "Softmax") // inplace - { - VkTensor bottom_tensor = vktensor_map_[in_name]; - cret = layer->record_pipeline(bottom_tensor, cmd, opt); - vktensor_map_[out_name] = bottom_tensor; + std::string const& in_name = layer->bottoms[0]; + auto& bottom_tensor = vktensor_map_[in_name]; + if (layer->support_inplace) + { + cret = layer->record_pipeline(bottom_tensor, cmd, opt); + //FIXME: chec and log here + vktensor_map_[out_name] = bottom_tensor; + } + else + { + VkTensor top_blob; + cret = layer->record_pipeline(bottom_tensor, top_blob, cmd, opt); + vktensor_map_[out_name] = top_blob; + } } - else if(layer->name == "Eltwise" 
|| layer->name == "Concat" || layer->name == "PriorBox" || layer->name == "Crop") // multi-in, one-out + else { std::vector bottom_blobs; - for(int i = 0; i < layer->bottoms.size(); i++) + for (auto const& inp : layer->bottoms) { - bottom_blobs.push_back(vktensor_map_[layer->bottoms[i]]); + bottom_blobs.push_back(vktensor_map_[inp]); } - VkTensor top_tensor; - std::vector top_blobs; - top_blobs.push_back(top_tensor); + std::vector top_blobs(1); cret = layer->record_pipeline(bottom_blobs, top_blobs, cmd, opt); - vktensor_map_[out_name] = top_blobs[0]; - } - else // original one-in one-out - { - VkTensor bottom_tensor = vktensor_map_[in_name]; - VkTensor top_tensor; - cret = layer->record_pipeline(bottom_tensor, top_tensor, cmd, opt); - vktensor_map_[out_name] = top_tensor; - } - - // download all nodes data - { - // Tensor tmp_tensor; - // cmd.record_download(vktensor_map_[out_name], tmp_tensor, opt); - // tensor_map[out_name] = tmp_tensor; + vktensor_map_[out_name] = top_blobs.front(); } if (cret != 0) @@ -431,108 +407,61 @@ int VulkanGraph::record_graph_pipeline() } } - cmd.record_download(vktensor_map_[name], output, opt); - - // // download output - // int byte_size=tensor_map_[name]->elem_size * tensor_map_[name]->elem_num; - // void* mem=std::malloc(byte_size); - // tensor_map_[name]->data = mem; - // cmd.record_download(vktensor_map_[name], tensor_map_[name], opt); + auto for_each_output = [this](std::function const& fn) { + auto output_num = sgraph->output_num; + for (int i = 0; i < output_num; ++i) + { + auto output_tensor = sgraph->graph->tensor_list[sgraph->output_tensor_list[i]]; + auto const* name = get_tensor_name(output_tensor); + fn(name); + } + }; -// double total_time, min_time, max_time; -// min_time = 999999999; -// max_time = 0; -// total_time = 0; -// double start_time = get_cur_time(); + for_each_output([this, &cmd](const char* name) { + auto vkoutput = vktensor_map_.find(name); + if (vkoutput == vktensor_map_.cend()) + { + fprintf(stderr, "%s 
output tensor is not found.\n", name); + return; + }; + cmd.record_download(vkoutput->second, tensor_map[name], opt); + }); cmd.submit_and_wait(); -// double end_time = get_cur_time(); -// double cur_time = end_time - start_time; -// total_time += cur_time; -// if (cur_time > max_time) -// max_time = cur_time; -// if (cur_time < min_time) -// min_time = cur_time; -// printf("vulkan Repeat [1] min %.3f ms, max %.3f ms, avg %.3f ms\n", min_time, max_time, total_time / 1); - - Tensor tmp_fp32; - if(output.elemsize == output.elempack * 2) - { - TEngine::cast_float16_to_float32(output, tmp_fp32, opt); - } - else - { - tmp_fp32 = output; - } - - Tensor blob_unpacked; - if (opt.use_packing_layout) - { - convert_packing(tmp_fp32, blob_unpacked, 1, opt); - } - else - { - blob_unpacked = tmp_fp32; - } - - tensor_map_[name]->data = blob_unpacked.data; - - -// #define DEBUG_OUTPUT -#ifdef DEBUG_OUTPUT - printf("run save tensor data\n"); - for (size_t j=0; jtops[0]; - // std::string in_name = layer->bottoms[0]; - printf("%s\n", in_name.c_str()); + for_each_output([this](const char* name) { + auto pos = tensor_map.find(name); + if (pos == tensor_map.cend()) + { + fprintf(stderr, "%s output tensor is not found.\n", name); + return; + } - std::string fname = std::to_string(j)+".data"; - FILE* fp = fopen(fname.c_str(), "w"); + auto& output = pos->second; - // float * data = (float*)get_tensor_buffer(tensor_map_[name]); - // float* data = (float*)vktensor_map_[in_name].mapped_ptr(); - // float* data = (float*)tensor_map_[in_name]->data; - // float* data = (float*)tensor_map[in_name].data; - Tensor tmp_fp16 = tensor_map[in_name]; Tensor tmp_fp32; - if(tmp_fp16.elemsize == tmp_fp16.elempack * 2) - TEngine::cast_float16_to_float32(tmp_fp16, tmp_fp32, opt); + if (output.elemsize == output.elempack * 2) + { + TEngine::cast_float16_to_float32(output, tmp_fp32, opt); + } else - tmp_fp32 = tmp_fp16; - + { + tmp_fp32 = output; + } + Tensor blob_unpacked; if (opt.use_packing_layout) + { 
convert_packing(tmp_fp32, blob_unpacked, 1, opt); + } else - blob_unpacked = tmp_fp32; - - int byte_size=tensor_map_[in_name]->elem_size * tensor_map_[name]->elem_num; - void* mem=std::malloc(byte_size); - memcpy(mem, blob_unpacked.data, byte_size); - tensor_map_[in_name]->data = mem; - // tensor_map_[in_name]->data = blob_unpacked.data; - - // float* data = (float*)tmp_fp32.data; - float* data = (float*)blob_unpacked.data; - printf("tensor shape:%d %d %d %d\n", tensor_map_[in_name]->dims[0], tensor_map_[in_name]->dims[1], tensor_map_[in_name]->dims[2], tensor_map_[in_name]->dims[3]); - byte_size=tensor_map_[in_name]->elem_size * tensor_map_[in_name]->elem_num; - for(int i = 0; i < byte_size/sizeof(float); i++) { - if(i % 16 == 0) - { - fprintf(fp, "\n%d:", i); - } - fprintf(fp, " %.6f", data[i]); + blob_unpacked = tmp_fp32; } - fprintf(fp, "\n"); - fclose(fp); - } -#endif + tensor_map[name] = blob_unpacked; // don't release blob_unpacked + tensor_map_[name]->data = blob_unpacked.data; + }); return 0; } @@ -542,4 +471,4 @@ int VulkanGraph::destory_pipeline() return 0; } -} +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_layer.cpp b/source/device/vulkan/vulkan_layer.cpp index 84f2b9de2..4b97cb4d1 100644 --- a/source/device/vulkan/vulkan_layer.cpp +++ b/source/device/vulkan/vulkan_layer.cpp @@ -41,9 +41,9 @@ namespace TEngine { -Layer::Layer() +Layer::Layer(const GPUDevice* vkdev) + : vkdev(vkdev), one_blob_only(true), support_inplace(false) { - support_vulkan = false; } Layer::~Layer() @@ -81,4 +81,4 @@ int Layer::record_pipeline(const std::vector& bottom_blobs, std::vecto return 0; } -} // namespace TEngine \ No newline at end of file +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_layer.hpp b/source/device/vulkan/vulkan_layer.hpp index 2c2be9710..624fd5072 100644 --- a/source/device/vulkan/vulkan_layer.hpp +++ b/source/device/vulkan/vulkan_layer.hpp @@ -64,7 +64,7 @@ class Layer { public: // empty - Layer(); + Layer(const 
GPUDevice* vkdev); // virtual destructor virtual ~Layer(); @@ -86,17 +86,14 @@ class Layer virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; public: - // support vulkan compute - bool support_vulkan; - // accept input blob with packed storage bool support_packing; // accept bf16 bool support_bf16_storage; - // shader image storage - bool support_image_storage; + bool one_blob_only; + bool support_inplace; public: const GPUDevice* vkdev; @@ -104,8 +101,6 @@ class Layer std::vector tops; public: - // layer name - std::string name; // Node* node; ir_graph_t* graph; ir_node_t* node; diff --git a/source/device/vulkan/vulkan_limit.hpp b/source/device/vulkan/vulkan_limit.hpp index fbb45e089..d77c1201e 100644 --- a/source/device/vulkan/vulkan_limit.hpp +++ b/source/device/vulkan/vulkan_limit.hpp @@ -64,7 +64,7 @@ const int vulkan_supported_ops[] = { //// OP_CONCAT, // OP_CONST, // OP_CONV, - //// OP_CROP, + OP_CROP, //// OP_DECONV, //// OP_DEPTHTOSPACE, //// OP_DETECTION_OUTPUT, @@ -84,7 +84,7 @@ const int vulkan_supported_ops[] = { //// OP_HARDSWISH, // OP_INPUT, //// OP_INSTANCENORM, - //// OP_INTERP, + OP_INTERP, //// OP_LOGICAL, //// OP_LOGISTIC, //// OP_LRN, diff --git a/source/graph/tensor.c b/source/graph/tensor.c index 5b065a458..fc92aee92 100644 --- a/source/graph/tensor.c +++ b/source/graph/tensor.c @@ -359,3 +359,36 @@ int set_ir_tensor_consumer(ir_tensor_t* ir_tensor, const int index) return 0; } + +float tensor_mean(ir_tensor_t* ir_tensor) +{ + float sum = .0; + float* p = ir_tensor->data; + for (int i = 0; i < ir_tensor->elem_num; ++i) + { + sum += p[i]; + } + + float mean = sum / (float)ir_tensor->elem_num; + return mean; +} + +void save_tensor(const char* fname, const float* data, const int* dims, const int dim_num) +{ + FILE* fout = fopen(fname, "w+"); + int n = 1; + for (int i = 0; i < dim_num; ++i) + { + n *= dims[i]; + fprintf(fout, "%d ", dims[i]); + } + fprintf(fout, "\n"); + 
+ for (int i = 0; i < n; ++i) + { + fprintf(fout, "%f ", data[i]); + } + fprintf(fout, "\n"); + fflush(fout); + fclose(fout); +} diff --git a/source/graph/tensor.h b/source/graph/tensor.h index 9d392f8b3..dd246c162 100644 --- a/source/graph/tensor.h +++ b/source/graph/tensor.h @@ -193,6 +193,8 @@ void dump_ir_tensor(struct graph* ir_graph, ir_tensor_t* ir_tensor); * @return statue value, 0 success, other value failure. */ int set_ir_tensor_consumer(ir_tensor_t* ir_tensor, const int index); +float tensor_mean(ir_tensor_t* tensor); +void save_tensor(const char* fname, const float* data, const int* dims, const int dim_num); #ifdef __cplusplus } diff --git a/source/serializer/tmfile/op/tm2_layernorm.c b/source/serializer/tmfile/op/tm2_layernorm.c index 4645e8405..4dbfa7e31 100644 --- a/source/serializer/tmfile/op/tm2_layernorm.c +++ b/source/serializer/tmfile/op/tm2_layernorm.c @@ -40,7 +40,7 @@ static int layernorm_op_map(int op) } static int tm2_load_layernorm(struct graph* ir_graph, struct node* ir_node, const TM2_Node* tm_node, - const TM2_Operator* tm_op) + const TM2_Operator* tm_op) { struct layernorm_Param* gather_param = (struct layernorm_Param*)ir_node->op.param_mem; const struct tm2_priv* tm2_priv = (struct tm2_priv*)ir_graph->serializer_privacy; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ed7c12b41..6c7c8f522 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,7 +1,35 @@ -# generate tengine header file -FILE (MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tengine) -FILE (COPY ${CMAKE_SOURCE_DIR}/source/api/c_api.h DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tengine) -FILE (COPY ${CMAKE_SOURCE_DIR}/source/api/c_api_ex.h DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tengine) +#generate tengine header file +FILE(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/tengine) +FILE(COPY ${CMAKE_SOURCE_DIR}/source/api/c_api.h DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tengine) +FILE(COPY ${CMAKE_SOURCE_DIR}/source/api/c_api_ex.h DESTINATION 
${CMAKE_CURRENT_BINARY_DIR}/tengine) + +function(tengine_op_test name) + file(GLOB TENGINE_UTIL_SOURCE_FILES ${PROJECT_SOURCE_DIR}/tests/common/util/*.c) + add_executable(${name} "${CMAKE_CURRENT_SOURCE_DIR}/op/${name}.c" "${TENGINE_UTIL_SOURCE_FILES}") + + target_link_libraries(${name} PUBLIC "${CMAKE_PROJECT_NAME}-static") + + target_include_directories (${name} PRIVATE "${PROJECT_SOURCE_DIR}/source") + target_include_directories (${name} PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") + target_include_directories (${name} PRIVATE "${PROJECT_BINARY_DIR}") + target_include_directories (${name} PRIVATE "${PROJECT_BINARY_DIR}/source") + target_include_directories (${name} PRIVATE "${PROJECT_SOURCE_DIR}/tests/common") + target_include_directories (${name} PRIVATE "${PROJECT_SOURCE_DIR}/tests/common/util") + +endfunction() +tengine_op_test(test_op_absval) +tengine_op_test(test_op_add_n) +tengine_op_test(test_op_argmax) +tengine_op_test(test_op_argmin) +tengine_op_test(test_op_batchnorm) +tengine_op_test(test_op_batchtospacend) +tengine_op_test(test_op_bias) +tengine_op_test(test_op_broadmul) +tengine_op_test(test_op_cast) +tengine_op_test(test_op_ceil) +tengine_op_test(test_op_clip) +tengine_op_test(test_op_comparison) +tengine_op_test(test_op_conv) if (TENGINE_ENABLE_OPENDLA) function (tengine_opendla_op_test name file) diff --git a/tests/op/test_op.h b/tests/op/test_op.h index 91106e187..5a5aaac51 100644 --- a/tests/op/test_op.h +++ b/tests/op/test_op.h @@ -1,27 +1,423 @@ #ifndef __TEST_COMMON_H__ #define __TEST_COMMON_H__ -#include +#include #include #include #include #include #include +#include +#include //#include "float.h" -#include "compiler_fp16.h" +#include "api/c_api.h" #include "tengine/c_api.h" +#include "mathp.h" +#include "vector.h" #include "graph/graph.h" #include "graph/subgraph.h" #include "graph/node.h" #include "graph/tensor.h" +#include #define TENSOR_SHOW_LEADING_BLANK " " #define TENSOR_FLOAT_EPSILON 0.0001f +typedef union +{ + struct + { + uint16_t frac 
: 10; + uint16_t exp : 5; + uint16_t sign : 1; + } __attribute__((packed)) bits; + + uint16_t u16; +} __attribute__((packed)) __pack16_t; + +typedef union +{ + struct + { + uint32_t frac : 23; + uint32_t exp : 8; + uint32_t sign : 1; + } __attribute__((packed)) bits; + uint32_t u32; + float fp32; +} __attribute__((packed)) __pack32_t; + +static uint16_t __fp32_to_fp16(float fp32) +{ + const float fp32_abs = fabs(fp32); + __pack32_t pack32 = {.fp32 = fp32}; + __pack16_t pack16 = {.u16 = 0}; + + if (pack32.bits.exp == 0 && pack32.bits.frac == 0) + { + pack16.bits.sign = pack32.bits.sign; + pack16.bits.frac = 0; + pack16.bits.exp = 0; + return pack16.u16; + } + + // nan + if (isnan(fp32)) + { + pack16.bits.exp = 0x1f; + pack16.bits.frac = 1; + pack16.bits.sign = pack32.bits.sign; + return pack16.u16; + } + + // inf + if (isinf(fp32)) + { + pack16.bits.exp = 0x1f; + pack16.bits.frac = 0; + pack16.bits.sign = pack32.bits.sign; + return pack16.u16; + } + + // upper to fp16 max norm + if (fp32_abs > 65504.0f) + { + pack16.bits.sign = pack32.bits.sign; + pack16.bits.exp = 0x1e; + pack16.bits.frac = 1023; + return pack16.u16; + } + + // lower than min subnormalnorm + if (fp32_abs < 5.96046448e-8f) + { + return .0f; + } + + // lower than fp16 min norm: fp32 normalized to fp16 subnormal + if (fp32_abs < 6.103515625e-5) + { + pack16.bits.sign = pack32.bits.sign; + pack16.bits.exp = pack32.bits.exp - 127 + 15; + pack16.bits.frac = pack32.bits.frac >> 13; + return pack16.u16; + } + + // fp32 normalized to fp16 normalzied + if (pack32.bits.exp != 0 && pack32.bits.frac != 0) + { + pack16.bits.sign = pack32.bits.sign; + pack16.bits.exp = pack32.bits.exp - 127 + 15; + pack16.bits.frac = pack32.bits.frac >> 13; + return pack16.u16; + } + + return pack16.u16; +} + +static float __fp16_to_fp32(uint16_t const value) +{ + __pack16_t pack16 = {.u16 = value}; + __pack32_t pack32 = {.u32 = 0}; + + if (pack16.bits.exp == 0 && pack16.bits.frac == 0) + { + return pack16.bits.sign == 0 ? 
.0f : -.0f; + } + + // normalized case + if (pack16.bits.exp != 0xff && pack16.bits.exp != 0) + { + pack32.bits.sign = pack16.bits.sign; + pack32.bits.exp = pack16.bits.exp - 15 + 127; + pack32.bits.frac = pack16.bits.frac << 13; + return pack32.fp32; + } + + // subnormal case + // 5.96046448e-8f = 2**-14 * 1/1024.0 + if (pack16.bits.exp == 0 && pack16.bits.frac != 0) + { + const float alpha = pack16.bits.sign == 0 ? 5.96046448e-8f : -5.96046448e-8f; + return pack16.bits.frac * alpha; + } + + if (pack16.bits.exp == 0x1f && pack16.bits.frac == 0) + { + pack32.bits.sign = pack16.bits.sign; + pack32.bits.exp = 0xff; + pack32.bits.frac = 0; + return pack32.fp32; + } + + if (pack16.bits.exp == 0x1f && pack16.bits.frac != 0) + { + pack32.bits.sign = pack16.bits.sign; + pack32.bits.exp = 0xff; + pack32.bits.frac = 1; + return pack32.fp32; + } + + return pack32.fp32; +} +struct data_buffer +{ + void* data; + size_t size; + int dims[8]; + int dim_num; + int dtype; + float scale; + int32_t zero_point; +}; + +float random_float(float a, float b) +{ + float random = ((float)rand()) / (float)RAND_MAX; + float diff = b - a; + float r = random * diff; + float v = a + r; + // generate denormal as zero + if (v < 0.0001 && v > -0.0001) + v = 0.f; + return v; +} + +int rand_int(const int a, const int b) +{ + const int delta = b - a; + return a + rand() % delta; +} + +struct data_buffer* create_data_buffer_from_tensor(tensor_t tensor) +{ + struct data_buffer* buf = (struct data_buffer*)malloc(sizeof(struct data_buffer)); + buf->size = get_tensor_buffer_size(tensor); + buf->data = malloc(buf->size); + memcpy(buf->data, get_tensor_buffer(tensor), buf->size); + buf->dim_num = get_tensor_shape(tensor, buf->dims, 8); + buf->dtype = get_tensor_data_type(tensor); + get_tensor_quant_param(tensor, &buf->scale, &buf->zero_point, 1); + return buf; +} + +int dtype_to_size(const int dtype) +{ + switch (dtype) + { + case TENGINE_DT_FP32: + return sizeof(float); + case TENGINE_DT_INT8: + return 
sizeof(int8_t); + case TENGINE_DT_UINT8: + return sizeof(uint8_t); + case TENGINE_DT_FP16: + return sizeof(uint16_t); + case TENGINE_DT_INT16: + return sizeof(int16_t); + case TENGINE_DT_INT32: + return sizeof(int32_t); + default: + assert(0 && "Unsupported dtype"); + return -1; + } +} + +static int fill_random_data(void* p, size_t total_size, int dtype) +{ +#define __fill(__dtype) \ + do { \ + __dtype* data = p; \ + const int n = total_size / sizeof(__dtype); \ + for (int i = 0; i < n; ++i) \ + { \ + if (dtype == TENGINE_DT_UINT8) \ + { \ + data[i] = (__dtype)rand_int(0, 30); \ + } \ + else \ + { \ + data[i] = (__dtype)rand_int(-15, 15); \ + } \ + } \ + } while (0); + + if (dtype == TENGINE_DT_FP32) + { + float* data = p; + for (int i = 0; i < total_size / sizeof(float); ++i) + { + data[i] = random_float(-1.2, 1.2); + } + return 0; + } + else if (dtype == TENGINE_DT_FP16) + { + uint16_t* data = p; + for (int i = 0; i < total_size / sizeof(uint16_t); ++i) + { + data[i] = __fp32_to_fp16(random_float(-1.2, 1.2)); + } + return 0; + } + else if (dtype == TENGINE_DT_INT8) + { + __fill(int8_t); + return 0; + } + else if (dtype == TENGINE_DT_UINT8) + { + __fill(uint8_t); + return 0; + } + else if (dtype == TENGINE_DT_INT32) + { + __fill(int32_t); + return 0; + } + + assert(0 && "Unsupported dtype"); + return -1; +} + +struct data_buffer* create_data_buffer(const int* dims, const int dim_num, const int dtype) +{ + const int elem_size = dtype_to_size(dtype); + if (elem_size < 0) return NULL; + + struct data_buffer* buf = (struct data_buffer*)malloc(sizeof(struct data_buffer)); + if (!buf) return NULL; + buf->size = (int)(dim_num > 0); + buf->dim_num = dim_num; + + for (int i = 0; i < dim_num; ++i) + { + buf->size *= dims[i]; + buf->dims[i] = dims[i]; + } + + buf->size *= elem_size; + buf->dtype = dtype; + buf->data = malloc(buf->size); + if (!buf->data) + { + free(buf); + return NULL; + } + + buf->scale = random_float(0.1, 2.0) + 0.01; + buf->zero_point = rand_int(-5, 5); + 
+ int ret = fill_random_data(buf->data, buf->size, buf->dtype); + if (ret != 0) + { + free(buf->data); + free(buf); + return NULL; + } + return buf; +} + +struct data_buffer* create_data_buffer_fp32(const int* dims, const int dim_num) +{ + return create_data_buffer(dims, dim_num, TENGINE_DT_FP32); +} + +void free_data_buffer_in_vector(void* p) +{ + struct data_buffer* buf = *(struct data_buffer**)p; + free(buf->data); + free(buf); +} + +bool is_match_buffer(const struct data_buffer* lhs, const struct data_buffer* rhs, const float eps) +{ + if (lhs->dim_num != rhs->dim_num || lhs->size != rhs->size || lhs->dtype != rhs->dtype) return false; +#define __compare(__dtype) \ + do { \ + const __dtype* p1 = lhs->data; \ + const __dtype* p2 = rhs->data; \ + if (lhs->scale != rhs->scale || lhs->zero_point != rhs->zero_point) return false; \ + for (int i = 0; i < lhs->size / dtype_to_size(lhs->dtype); ++i) \ + { \ + const int a = (int)p1[i]; \ + const int b = (int)p2[i]; \ + if (abs(a - b) != 0) \ + { \ + fprintf(stderr, "buffer mismatch at %d, lhs = %d, rhs = %d, dims1 = {%d, %d, %d, %d}, dims2 = {%d, %d, %d, %d}\n", i, a, b, lhs->dims[0], lhs->dims[1], lhs->dims[2], lhs->dims[3], rhs->dims[0], rhs->dims[1], rhs->dims[2], rhs->dims[3]); \ + return false; \ + } \ + } \ + return true; \ + } while (0) + + for (int i = 0; i < lhs->dim_num; ++i) + { + if (lhs->dims[i] != rhs->dims[i]) return false; + } + + if (lhs->dtype == TENGINE_DT_FP32) + { + const float* p1 = lhs->data; + const float* p2 = rhs->data; + + for (int i = 0; i < lhs->size / sizeof(float); ++i) + { + if (fabs(p1[i] - p2[i]) > eps) + { + fprintf(stderr, "buffer mismatch at %d, lhs = %f, rhs = %f, dims1 = {%d, %d, %d, %d}, dims2 = {%d, %d, %d, %d}\n", i, p1[i], p2[i], lhs->dims[0], lhs->dims[1], lhs->dims[2], lhs->dims[3], rhs->dims[0], rhs->dims[1], rhs->dims[2], rhs->dims[3]); + return false; + } + } + + return true; + } + else if (lhs->dtype == TENGINE_DT_UINT8) + { + __compare(uint8_t); + } + else if (lhs->dtype 
== TENGINE_DT_INT8) + { + __compare(int8_t); + } + else if (lhs->dtype == TENGINE_DT_INT32) + { + __compare(int32_t); + } + else if (lhs->dtype == TENGINE_DT_INT16) + { + __compare(int16_t); + } + else if (lhs->dtype == TENGINE_DT_FP16) + { + const uint16_t* p1 = lhs->data; + const uint16_t* p2 = rhs->data; + + for (int i = 0; i < lhs->size / sizeof(uint16_t); ++i) + { + const uint16_t a = p1[i]; + const uint16_t b = p2[i]; + const float fpa = __fp16_to_fp32(a); + const float fpb = __fp16_to_fp32(b); + + if (fabs(fpa - fpb) > eps) + { + return false; + } + } + + return true; + } +#undef __compare + return false; +} + +typedef int (*node_setup_hook_fn)(graph_t graph, const char* test_node_name, const char* op, const char* input_name, int data_type, int input_num, int output_num); typedef int (*common_test)(graph_t, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w); +#if 0 void dump_tensor_line(void* data_ptr, int offset, int data_type, int w) { if (0 >= w) @@ -48,7 +444,7 @@ void dump_tensor_line(void* data_ptr, int offset, int data_type, int w) } case TENGINE_DT_FP16: { - __fp16* p = (__fp16*)data_ptr; + uint16_t* p = (uint16_t*)data_ptr; #ifdef __ARM_ARCH for (int i = 0; i < w - 1; i++) @@ -213,6 +609,7 @@ void dump_node_output(node_t test_node, int index) release_graph_tensor(tensor); } +#endif int create_node(graph_t graph, const char* node_name, int n, int c, int h, int w, int data_type, int layout) { @@ -252,7 +649,7 @@ int create_node(graph_t graph, const char* node_name, int n, int c, int h, int w return 0; } -int create_input_node(graph_t graph, const char* node_name, int data_type, int layout, int n, int c, int h, int w, int dims_count = 4) +int create_input_node_with_multi_inputs(graph_t graph, const char* node_name, int data_type, int input_num, int layout, int n, int c, int h, int w, int dims_count) { if (0 == n) dims_count = 3; if (0 == c) dims_count = 2; @@ -263,106 +660,110 @@ int
create_input_node(graph_t graph, const char* node_name, int data_type, int l return -1; } - node_t node = create_graph_node(graph, node_name, "InputOp"); + node_t node = create_graph_node(graph, node_name, OP_INPUT_NAME); if (NULL == node) { fprintf(stderr, "Create %d dims node(%s) failed. ", dims_count, node_name); return -1; } - tensor_t tensor = create_graph_tensor(graph, node_name, data_type); - if (NULL == tensor) - { - release_graph_node(node); - - fprintf(stderr, "Create %d dims tensor for node(%s) failed. ", dims_count, node_name); - - return -1; - } - - int ret = set_node_output_tensor(node, 0, tensor, TENSOR_TYPE_INPUT); - if (0 != ret) + for (int i = 0; i < input_num; ++i) { - release_graph_tensor(tensor); - release_graph_node(node); - - fprintf(stderr, "Set %d dims output tensor for node(%s) failed. ", dims_count, node_name); - - return -1; - } + char tensor_name[512]; + snprintf(tensor_name, sizeof(tensor_name), "%s_%d", node_name, i); + tensor_t tensor = create_graph_tensor(graph, tensor_name, data_type); - switch (dims_count) - { - case 1: - { - int dims_array[1] = {w}; - set_tensor_shape(tensor, dims_array, dims_count); - break; - } - case 2: - { - int dims_array[2] = {h, w}; - set_tensor_shape(tensor, dims_array, dims_count); - break; - } - case 3: - { - if (TENGINE_LAYOUT_NCHW == layout) + if (NULL == tensor) { - int dims_array[3] = {c, h, w}; - set_tensor_shape(tensor, dims_array, dims_count); - break; + release_graph_node(node); + fprintf(stderr, "Create %d dims tensor for node(%s) failed. ", dims_count, node_name); + return -1; } - if (TENGINE_LAYOUT_NHWC == layout) + int ret = set_node_output_tensor(node, i, tensor, TENSOR_TYPE_INPUT); + if (0 != ret) { - int dims_array[3] = {h, w, c}; - set_tensor_shape(tensor, dims_array, dims_count); - break; + release_graph_tensor(tensor); + release_graph_node(node); + fprintf(stderr, "Set %d dims output tensor for node(%s) failed. 
", dims_count, node_name); + return -1; } - } - case 4: - { - if (TENGINE_LAYOUT_NCHW == layout) + + switch (dims_count) + { + case 1: { - int dims_array[4] = {n, c, h, w}; + int dims_array[1] = {w}; set_tensor_shape(tensor, dims_array, dims_count); break; } - - if (TENGINE_LAYOUT_NHWC == layout) + case 2: { - int dims_array[4] = {n, h, w, c}; + int dims_array[2] = {h, w}; set_tensor_shape(tensor, dims_array, dims_count); break; } - } - case 5: - { - if (TENGINE_LAYOUT_NCHW == layout) + case 3: { - int dims_array[5] = {1, n, c, h, w}; - set_tensor_shape(tensor, dims_array, dims_count); - break; + if (TENGINE_LAYOUT_NCHW == layout) + { + int dims_array[3] = {c, h, w}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } + + if (TENGINE_LAYOUT_NHWC == layout) + { + int dims_array[3] = {h, w, c}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } } + case 4: + { + if (TENGINE_LAYOUT_NCHW == layout) + { + int dims_array[4] = {n, c, h, w}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } - if (TENGINE_LAYOUT_NHWC == layout) + if (TENGINE_LAYOUT_NHWC == layout) + { + int dims_array[4] = {n, h, w, c}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } + } + case 5: { - int dims_array[5] = {1, n, h, w, c}; - set_tensor_shape(tensor, dims_array, dims_count); - break; + if (TENGINE_LAYOUT_NCHW == layout) + { + int dims_array[5] = {1, n, c, h, w}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } + + if (TENGINE_LAYOUT_NHWC == layout) + { + int dims_array[5] = {1, n, h, w, c}; + set_tensor_shape(tensor, dims_array, dims_count); + break; + } + } + default: + fprintf(stderr, "Cannot support %d dims tensor.\n", dims_count); } } - default: - fprintf(stderr, "Cannot support %d dims tensor.\n", dims_count); - } - - release_graph_tensor(tensor); - release_graph_node(node); return 0; } +int create_input_node(graph_t graph, const char* node_name, int data_type, int layout, int n, int c, int h, int w, int 
dims_count) +{ + return create_input_node_with_multi_inputs(graph, node_name, data_type, 1, layout, n, c, h, w, dims_count); +} + int fill_fp32_tensor(tensor_t tensor, float value) { int dims[MAX_SHAPE_DIM_NUM]; @@ -457,6 +858,16 @@ int fill_uint8_tensor(tensor_t tensor, float value) return 0; } +void feed_input_tensor(graph_t graph, int input_node_idx, int input_tensor_idx, const float* values, int* dims, const int dim_num) +{ + tensor_t tensor = get_graph_input_tensor(graph, input_node_idx, input_tensor_idx); + if (!tensor) + { + fprintf(stderr, "Cannot find %dth tensor with node idex %d\n", input_tensor_idx, input_node_idx); + return; + } +} + void fill_input_float_tensor_by_index(graph_t graph, int input_node_index, int tensor_index, float value) { tensor_t tensor = get_graph_input_tensor(graph, input_node_index, tensor_index); @@ -585,9 +996,7 @@ int test_graph_init() { // now init tengine will mask critical filed and return an error // TODO: fix this fatal issue - init_tengine(); - - return 0; + return init_tengine(); } int test_graph_run(graph_t graph) @@ -598,7 +1007,7 @@ int test_graph_run(graph_t graph) return -1; } - dump_graph(graph); + // dump_graph(graph); if (0 != run_graph(graph, 1)) { @@ -613,10 +1022,41 @@ void test_graph_release(graph_t graph) { postrun_graph(graph); destroy_graph(graph); - release_tengine(); } -graph_t create_common_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) +static int craete_common_test_node(graph_t graph, const char* test_node_name, const char* op, const char* input_name, int data_type, int input_num, int output_num) +{ + node_t test_node = create_graph_node(graph, test_node_name, op); + if (NULL == test_node) + { + fprintf(stderr, "create test node failed.\n"); + return -1; + } + + node_t input_node = get_graph_node(graph, input_name); + for (int i = 0; i < get_node_output_number(input_node); ++i) + { + tensor_t input_tensor = 
get_node_output_tensor(input_node, i); + set_node_input_tensor(test_node, i, input_tensor); + } + + char tensor_name[512]; + for (int i = 0; i < output_num; ++i) + { + snprintf(tensor_name, sizeof(tensor_name), "%s_%d", test_node_name, i); + tensor_t output_tensor = create_graph_tensor(graph, tensor_name, data_type); + if (!output_tensor) + { + fprintf(stderr, "create graph output tensor failed.\n"); + return -1; + } + + set_node_output_tensor(test_node, i, output_tensor, TENSOR_TYPE_VAR); + } + return 0; +} + +graph_t create_common_test_graph(const char* op, const char* test_node_name, const void* params, const size_t param_size, vector_t* inputs, int output_num, int data_type, int layout) { graph_t graph = create_graph(NULL, NULL, NULL); if (NULL == graph) @@ -632,29 +1072,80 @@ graph_t create_common_test_graph(const char* test_node_name, int data_type, int } const char* input_name = "input_node"; - if (create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0) + node_t input_node = create_graph_node(graph, input_name, OP_INPUT_NAME); + node_t test_node = create_graph_node(graph, test_node_name, op); + if (!input_node || !test_node) { fprintf(stderr, "create input node failed.\n"); return NULL; } - if (test_func(graph, input_name, test_node_name, data_type, layout, n, c, h, w) < 0) + // setup input tensor + char tensor_name[512]; + float scale = 1.0; + int zero_point = 0.0; + + for (int i = 0; i < get_vector_num(inputs); ++i) { - fprintf(stderr, "create test node failed.\n"); - return NULL; + struct data_buffer* input = *(struct data_buffer**)get_vector_data(inputs, i); + snprintf(tensor_name, sizeof(tensor_name), "%s_%d", input_name, i); + tensor_t tensor = create_graph_tensor(graph, tensor_name, input->dtype); + if (!tensor) return NULL; + + set_tensor_shape(tensor, input->dims, input->dim_num); + set_tensor_buffer(tensor, input->data, input->size); + scale = input->scale; + zero_point = input->zero_point; + 
set_tensor_quant_param(tensor, &scale, &zero_point, 1); + + if (set_node_output_tensor(input_node, i, tensor, TENSOR_TYPE_VAR)) + { + return NULL; + } + + if (set_node_input_tensor(test_node, i, tensor)) + { + return NULL; + } + } + + // setup output tensor + for (int i = 0; i < output_num; ++i) + { + snprintf(tensor_name, sizeof(tensor_name), "%s_%d", test_node_name, i); + tensor_t output_tensor = create_graph_tensor(graph, tensor_name, data_type); + + if (data_type != TENGINE_DT_FP16 && data_type != TENGINE_DT_FP32) + { + set_tensor_quant_param(output_tensor, &scale, &zero_point, 1); + } + + if (set_node_output_tensor(test_node, i, output_tensor, TENSOR_TYPE_VAR)) + { + return NULL; + } + } + + // setup test node param + if (params) + { + struct node* ir_node = (struct node*)test_node; + memcpy(ir_node->op.param_mem, params, param_size); } + // setup test node end. + /* set input/output node */ - const char* inputs[] = {input_name}; - const char* outputs[] = {test_node_name}; + const char* input_nodes[] = {input_name}; + const char* output_nodes[] = {test_node_name}; - if (set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0) + if (set_graph_input_node(graph, input_nodes, sizeof(input_nodes) / sizeof(char*)) < 0) { fprintf(stderr, "set inputs failed.\n"); return NULL; } - if (set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0) + if (set_graph_output_node(graph, output_nodes, sizeof(output_nodes) / sizeof(char*)) < 0) { fprintf(stderr, "set outputs failed.\n"); return NULL; @@ -663,74 +1154,134 @@ graph_t create_common_test_graph(const char* test_node_name, int data_type, int return graph; } -graph_t create_opendla_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) +vector_t* create_and_forward_test_graph(const char* op, const void* params, const size_t param_size, vector_t* inputs, int output_num, int data_type, int layout) { - /* create 
OpenDLA backend */ - context_t odla_context = create_context("odla", 1); - int rtt = set_context_device(odla_context, "OPENDLA", NULL, 0); - if (0 > rtt) + int ret = 0; + vector_t* outputs_ref = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + graph_t graph_ref = create_common_test_graph(op, "test_node", params, param_size, inputs, output_num, data_type, layout); + + if (!outputs_ref) { - fprintf(stderr, " add_context_device VSI DEVICE failed.\n"); - return NULL; + ret = -1; + goto out; } - graph_t graph = create_graph(odla_context, NULL, NULL); - if (NULL == graph) + if (!graph_ref) { - fprintf(stderr, "get graph failed.\n"); - return NULL; + goto failed; } - if (set_graph_layout(graph, layout) < 0) + struct options opt; + opt.num_thread = 1; + opt.cluster = TENGINE_CLUSTER_ALL; + opt.precision = TENGINE_MODE_FP32; + opt.affinity = 255; + + if ((ret = prerun_graph_multithread(graph_ref, opt)) != 0) { - fprintf(stderr, "set layout failed.\n"); - return NULL; + fprintf(stderr, "prerun graph failed: %d\n", ret); + goto failed; } - const char* input_name = "input_node"; - if (create_input_node(graph, input_name, data_type, layout, n, c, h, w, dims_num) < 0) + if ((ret = run_graph(graph_ref, 1)) < 0) { - fprintf(stderr, "create input node failed.\n"); - return NULL; + fprintf(stderr, "run graph failed: %d\n", ret); + goto out; } - if (test_func(graph, input_name, test_node_name, data_type, layout, n, c, h, w) < 0) + for (int i = 0; i < get_graph_output_node_number(graph_ref); ++i) { - fprintf(stderr, "create test node failed.\n"); - return NULL; + node_t output_node = get_graph_output_node(graph_ref, i); + for (int t = 0; t < get_node_output_number(output_node); ++t) + { + tensor_t output_tensor = get_graph_output_tensor(graph_ref, i, t); + struct data_buffer* data = create_data_buffer_from_tensor(output_tensor); + push_vector_data(outputs_ref, &data); + } } - /* set input/output node */ - const char* inputs[] = {input_name}; - const char* 
outputs[] = {test_node_name}; + if ((ret = postrun_graph(graph_ref))) + { + goto failed; + } - if (set_graph_input_node(graph, inputs, sizeof(inputs) / sizeof(char*)) < 0) + goto out; + +failed: + release_vector(outputs_ref); + outputs_ref = NULL; + +out: + if (graph_ref) { - fprintf(stderr, "set inputs failed.\n"); - return NULL; + destroy_graph(graph_ref); } + return outputs_ref; +} - if (set_graph_output_node(graph, outputs, sizeof(outputs) / sizeof(char*)) < 0) +//inputs: vector +int create_common_op_test_case(const char* op, const void* params, const size_t param_size, vector_t* inputs, int output_num, int data_type, int layout, const float eps) +{ + int ret = init_tengine(); + if (ret) { - fprintf(stderr, "set outputs failed.\n"); - return NULL; + fprintf(stderr, "init tengine failed: %d\n", ret); + return ret; } - return graph; + setenv("TG_DEBUG_REF", "1", 1); + vector_t* outputs_ref = create_and_forward_test_graph(op, params, param_size, inputs, output_num, data_type, layout); + if (!outputs_ref) + { + return -1; + } + + setenv("TG_DEBUG_REF", "0", 1); + vector_t* outputs = create_and_forward_test_graph(op, params, param_size, inputs, output_num, data_type, layout); + if (!outputs) + { + ret = -1; + goto out; + } + + if (get_vector_num(outputs) != get_vector_num(outputs_ref)) + { + fprintf(stderr, "output num is not equal to ref. 
test = %d, ref = %d\n", get_vector_num(outputs), get_vector_num(outputs_ref)); + ret = -1; goto out; + } + + for (int i = 0; i < get_vector_num(outputs_ref); ++i) + { + struct data_buffer* p1 = *(struct data_buffer**)get_vector_data(outputs_ref, i); + struct data_buffer* p2 = *(struct data_buffer**)get_vector_data(outputs, i); + + if (!is_match_buffer(p1, p2, eps)) + { + fprintf(stderr, "%dth output is mismatch\n", i); + ret = -1; + goto out; + } + } + +out: + if (outputs_ref) release_vector(outputs_ref); + if (outputs) release_vector(outputs); + release_tengine(); + return ret; } -graph_t create_timvx_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) +graph_t create_opendla_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num) { - /* create VeriSilicon TIM-VX backend */ - context_t timvx_context = create_context("timvx", 1); - int rtt = set_context_device(timvx_context, "TIMVX", NULL, 0); + /* create OpenDLA backend */ + context_t odla_context = create_context("odla", 1); + int rtt = set_context_device(odla_context, "OPENDLA", NULL, 0); if (0 > rtt) { fprintf(stderr, " add_context_device VSI DEVICE failed.\n"); return NULL; } - graph_t graph = create_graph(timvx_context, NULL, NULL); + graph_t graph = create_graph(odla_context, NULL, NULL); if (NULL == graph) { fprintf(stderr, "get graph failed.\n"); @@ -775,18 +1326,18 @@ graph_t create_timvx_test_graph(const char* test_node_name, int data_type, int l return graph; } -graph_t create_tensorrt_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) +graph_t create_timvx_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num) { - /* create TensorRT backend */ - context_t trt_context = create_context("tensorrt", 1); 
- int rtt = set_context_device(trt_context, "TensorRT", NULL, 0); + /* create VeriSilicon TIM-VX backend */ + context_t timvx_context = create_context("timvx", 1); + int rtt = set_context_device(timvx_context, "TIMVX", NULL, 0); if (0 > rtt) { fprintf(stderr, " add_context_device VSI DEVICE failed.\n"); return NULL; } - graph_t graph = create_graph(trt_context, NULL, NULL); + graph_t graph = create_graph(timvx_context, NULL, NULL); if (NULL == graph) { fprintf(stderr, "get graph failed.\n"); @@ -831,18 +1382,18 @@ graph_t create_tensorrt_test_graph(const char* test_node_name, int data_type, in return graph; } -graph_t create_torch_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) +graph_t create_tensorrt_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num) { - /* create libTorch backend */ - context_t torch_context = create_context("torch", 1); - int rtt = set_context_device(torch_context, "TORCH", NULL, 0); + /* create TensorRT backend */ + context_t trt_context = create_context("tensorrt", 1); + int rtt = set_context_device(trt_context, "TensorRT", NULL, 0); if (0 > rtt) { fprintf(stderr, " add_context_device VSI DEVICE failed.\n"); return NULL; } - graph_t graph = create_graph(torch_context, NULL, NULL); + graph_t graph = create_graph(trt_context, NULL, NULL); if (NULL == graph) { fprintf(stderr, "get graph failed.\n"); @@ -887,9 +1438,18 @@ graph_t create_torch_test_graph(const char* test_node_name, int data_type, int l return graph; } -graph_t create_cpu_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num = 4) +graph_t create_torch_test_graph(const char* test_node_name, int data_type, int layout, int n, int c, int h, int w, common_test test_func, int dims_num) { - graph_t graph = create_graph(NULL, NULL, NULL); + /* create 
libTorch backend */ + context_t torch_context = create_context("torch", 1); + int rtt = set_context_device(torch_context, "TORCH", NULL, 0); + if (0 > rtt) + { + fprintf(stderr, " add_context_device VSI DEVICE failed.\n"); + return NULL; + } + + graph_t graph = create_graph(torch_context, NULL, NULL); if (NULL == graph) { fprintf(stderr, "get graph failed.\n"); @@ -934,105 +1494,6 @@ graph_t create_cpu_test_graph(const char* test_node_name, int data_type, int lay return graph; } -int compare_tensor(tensor_t a, tensor_t b) -{ - int a_dim[MAX_SHAPE_DIM_NUM], b_dim[MAX_SHAPE_DIM_NUM]; - int a_dim_count = get_tensor_shape(a, a_dim, MAX_SHAPE_DIM_NUM); - int b_dim_count = get_tensor_shape(b, b_dim, MAX_SHAPE_DIM_NUM); - - if (a_dim_count <= 0 || a_dim_count != b_dim_count) - return -1; - - for (int i = 0; i < a_dim_count; i++) - if (a_dim[i] != b_dim[i]) - return -1; - - int a_type = get_tensor_data_type(a); - int b_type = get_tensor_data_type(b); - - if (a_type != b_type) - return -1; - - int element_size = 1; - for (int i = 0; i < a_dim_count; i++) - element_size *= a_dim[i]; - - if (element_size <= 0) - { - fprintf(stderr, "One of dims is 0. 
Zero is not allowed.\n"); - return -1; - } - - switch (a_type) - { - case TENGINE_DT_FP32: - { - float* a_data_ptr = (float*)get_tensor_buffer(a); - float* b_data_ptr = (float*)get_tensor_buffer(b); - - for (int i = 0; i < element_size; i++) - if (fabsf(a_data_ptr[i] - b_data_ptr[i]) < TENSOR_FLOAT_EPSILON) - return -1; - - break; - } - case TENGINE_DT_FP16: - { - __fp16* a_data_ptr = (__fp16*)get_tensor_buffer(a); - __fp16* b_data_ptr = (__fp16*)get_tensor_buffer(b); - - for (int i = 0; i < element_size; i++) - { - if (fabsf((float)fp16_to_fp32(a_data_ptr[i]) - (float)fp16_to_fp32(b_data_ptr[i])) < TENSOR_FLOAT_EPSILON) - return -1; - } - - break; - } - case TENGINE_DT_INT32: - { - int32_t* a_data_ptr = (int32_t*)get_tensor_buffer(a); - int32_t* b_data_ptr = (int32_t*)get_tensor_buffer(b); - - for (int i = 0; i < element_size; i++) - if (a_data_ptr[i] != b_data_ptr[i]) - return -1; - - break; - } - case TENGINE_DT_INT16: - { - int16_t* a_data_ptr = (int16_t*)get_tensor_buffer(a); - int16_t* b_data_ptr = (int16_t*)get_tensor_buffer(b); - - for (int i = 0; i < element_size; i++) - if (a_data_ptr[i] != b_data_ptr[i]) - return -1; - - break; - } - case TENGINE_DT_UINT8: - case TENGINE_DT_INT8: - { - int8_t* a_data_ptr = (int8_t*)get_tensor_buffer(a); - int8_t* b_data_ptr = (int8_t*)get_tensor_buffer(b); - - for (int i = 0; i < element_size; i++) - if (a_data_ptr[i] != b_data_ptr[i]) - return -1; - - break; - } - default: - { - fprintf(stderr, "The type of tensor was not supported.\n"); - return -1; - } - } - - return 0; -} - static inline unsigned long get_current_time(void) { struct timespec tm; diff --git a/tests/op/test_op_absval.c b/tests/op/test_op_absval.c new file mode 100644 index 000000000..aa8ab2c66 --- /dev/null +++ b/tests/op/test_op_absval.c @@ -0,0 +1,41 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +#define 
define_test_case(__func, __layout, ...) \ + static int __func() \ + { \ + int data_type = TENGINE_DT_FP32; \ + int layout = __layout; \ + int dims[] = {__VA_ARGS__}; \ + int dims_num = sizeof(dims) / sizeof(dims[0]); \ + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + struct data_buffer* input = create_data_buffer_fp32(dims, sizeof(dims) / sizeof(int)); \ + push_vector_data(inputs, &input); \ + int ret = create_common_op_test_case(OP_ABSVAL_NAME, NULL, 0, inputs, 1, data_type, layout, 0.001); \ + release_vector(inputs); \ + return ret; \ + } + +define_test_case(absval_op_test_case_0, TENGINE_LAYOUT_NCHW, 1, 3, 64, 128); +define_test_case(absval_op_test_case_1, TENGINE_LAYOUT_NCHW, 1, 3, 128, 128); +define_test_case(absval_op_test_case_2, TENGINE_LAYOUT_NCHW, 1, 3, 128, 64); +define_test_case(absval_op_test_case_3, TENGINE_LAYOUT_NCHW, 1, 3, 111, 111); +define_test_case(absval_op_test_case_4, TENGINE_LAYOUT_NCHW, 1, 3, 65, 111); + +#define __NHWC_SUPPORTED__ 0 +#if __NHWC_SUPPORTED__ +#endif + +int main(void) +{ + return absval_op_test_case_0() || absval_op_test_case_1() || absval_op_test_case_2() || absval_op_test_case_3() || absval_op_test_case_4() +#if __NHWC_SUPPORTED__ +#endif + ; +} diff --git a/tests/op/test_op_add_n.c b/tests/op/test_op_add_n.c new file mode 100644 index 000000000..0f4118c02 --- /dev/null +++ b/tests/op/test_op_add_n.c @@ -0,0 +1,47 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +#define define_common_test_case(__op_name, __case_name, __layout, ...) 
\ + static int __case_name() \ + { \ + int data_type = TENGINE_DT_FP32; \ + int layout = __layout; \ + int dims[] = {__VA_ARGS__}; \ + int dims_num = sizeof(dims) / sizeof(dims[0]); \ + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + for (int i = 0; i < 64; ++i) \ + { \ + struct data_buffer* input = create_data_buffer_fp32(dims, sizeof(dims) / sizeof(int)); \ + push_vector_data(inputs, &input); \ + int ret = create_common_op_test_case(__op_name, NULL, 0, inputs, 1, data_type, layout, 0.001); \ + if (ret) return ret; \ + } \ + release_vector(inputs); \ + return 0; \ + } + +#define define_test_case(__case_name, __layout, ...) define_common_test_case(OP_ADD_N_NAME, __case_name, __layout, __VA_ARGS__) + +define_test_case(op_test_case_0, TENGINE_LAYOUT_NCHW, 1, 3, 64, 128); +define_test_case(op_test_case_1, TENGINE_LAYOUT_NCHW, 1, 3, 128, 128); +define_test_case(op_test_case_2, TENGINE_LAYOUT_NCHW, 1, 3, 128, 64); +define_test_case(op_test_case_3, TENGINE_LAYOUT_NCHW, 1, 3, 111, 111); +define_test_case(op_test_case_4, TENGINE_LAYOUT_NCHW, 1, 3, 65, 111); + +#define __NHWC_SUPPORTED__ 0 +#if __NHWC_SUPPORTED__ +#endif + +int main(void) +{ + return op_test_case_0() || op_test_case_1() || op_test_case_2() || op_test_case_3() || op_test_case_4() +#if __NHWC_SUPPORTED__ +#endif + ; +} diff --git a/tests/op/test_op_argmax.c b/tests/op/test_op_argmax.c new file mode 100644 index 000000000..8d6846519 --- /dev/null +++ b/tests/op/test_op_argmax.c @@ -0,0 +1,70 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "operator/prototype/argmax_param.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +#define define_common_test_case(__op_name, __case_name, __layout, __axis, __keepdims, ...) 
\ + static int __case_name() \ + { \ + int layout = __layout; \ + int dims[] = {__VA_ARGS__}; \ + int dims_num = sizeof(dims) / sizeof(dims[0]); \ + argmax_param_t param = {.axis = __axis, .keepdims = __keepdims}; \ + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + struct data_buffer* input = create_data_buffer_fp32(dims, sizeof(dims) / sizeof(int)); \ + push_vector_data(inputs, &input); \ + int ret = create_common_op_test_case(__op_name, ¶m, sizeof(param), inputs, 1, TENGINE_DT_FP32, layout, 0.001); \ + if (ret) \ + { \ + fprintf(stderr, "test argmax op failed: dims = [%d, %d, %d], dtype = fp32\n", dims[0], dims[1], dims[2]); \ + return ret; \ + } \ + release_vector(inputs); \ + inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + input = create_data_buffer(dims, sizeof(dims) / sizeof(int), TENGINE_DT_UINT8); \ + push_vector_data(inputs, &input); \ + ret = create_common_op_test_case(__op_name, ¶m, sizeof(param), inputs, 1, TENGINE_DT_UINT8, layout, 0.001); \ + if (ret) \ + { \ + fprintf(stderr, "test argmax op failed: dims = [%d, %d, %d], dtype = uint8\n", dims[0], dims[1], dims[2]); \ + return ret; \ + } \ + release_vector(inputs); \ + fprintf(stderr, "test case pass, axis=%d, keepdims: %d\n", __axis, __keepdims); \ + return 0; \ + } + +#define define_test_case(__case_name, __layout, ...) 
\ + define_common_test_case(OP_ARGMAX_NAME, __case_name##_00, __layout, 0, 0, __VA_ARGS__); \ + define_common_test_case(OP_ARGMAX_NAME, __case_name##_01, __layout, 1, 0, __VA_ARGS__); \ + define_common_test_case(OP_ARGMAX_NAME, __case_name##_02, __layout, 2, 0, __VA_ARGS__); \ + define_common_test_case(OP_ARGMAX_NAME, __case_name##_10, __layout, 0, 1, __VA_ARGS__); \ + define_common_test_case(OP_ARGMAX_NAME, __case_name##_11, __layout, 1, 1, __VA_ARGS__); \ + define_common_test_case(OP_ARGMAX_NAME, __case_name##_12, __layout, 2, 1, __VA_ARGS__); \ + static int __case_name() \ + { \ + return __case_name##_00() || __case_name##_01() || __case_name##_02() || __case_name##_10() || __case_name##_11() || __case_name##_12(); \ + } + +define_test_case(op_test_case_0, TENGINE_LAYOUT_NCHW, 3, 64, 128); +define_test_case(op_test_case_1, TENGINE_LAYOUT_NCHW, 3, 128, 128); +define_test_case(op_test_case_2, TENGINE_LAYOUT_NCHW, 3, 128, 64); +define_test_case(op_test_case_3, TENGINE_LAYOUT_NCHW, 3, 111, 111); +define_test_case(op_test_case_4, TENGINE_LAYOUT_NCHW, 3, 65, 111); + +#define __NHWC_SUPPORTED__ 0 +#if __NHWC_SUPPORTED__ +#endif + +int main(void) +{ + return op_test_case_0() || op_test_case_1() || op_test_case_2() || op_test_case_3() || op_test_case_4() +#if __NHWC_SUPPORTED__ +#endif + ; +} diff --git a/tests/op/test_op_argmin.c b/tests/op/test_op_argmin.c new file mode 100644 index 000000000..7b2f20bd1 --- /dev/null +++ b/tests/op/test_op_argmin.c @@ -0,0 +1,67 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "operator/prototype/argmax_param.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +#define define_common_test_case(__op_name, __case_name, __layout, __axis, __keepdims, ...) 
\ + static int __case_name() \ + { \ + int layout = __layout; \ + int dims[] = {__VA_ARGS__}; \ + int dims_num = sizeof(dims) / sizeof(dims[0]); \ + argmax_param_t param = {.axis = __axis, .keepdims = __keepdims}; \ + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + struct data_buffer* input = create_data_buffer_fp32(dims, sizeof(dims) / sizeof(int)); \ + push_vector_data(inputs, &input); \ + int ret = create_common_op_test_case(__op_name, ¶m, sizeof(param), inputs, 1, TENGINE_DT_FP32, layout, 0.001); \ + if (ret) \ + { \ + fprintf(stderr, "test argmin op failed: dims = [%d, %d, %d], dtype = fp32\n", dims[0], dims[1], dims[2]); \ + return ret; \ + } \ + release_vector(inputs); \ + inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + input = create_data_buffer(dims, sizeof(dims) / sizeof(int), TENGINE_DT_UINT8); \ + push_vector_data(inputs, &input); \ + ret = create_common_op_test_case(__op_name, ¶m, sizeof(param), inputs, 1, TENGINE_DT_UINT8, layout, 0.001); \ + if (ret) \ + { \ + fprintf(stderr, "test argmin op failed: dims = [%d, %d, %d], dtype = uint8\n", dims[0], dims[1], dims[2]); \ + return ret; \ + } \ + release_vector(inputs); \ + fprintf(stderr, "test case pass, axis=%d, keepdims: %d\n", __axis, __keepdims); \ + return 0; \ + } + +#define define_test_case(__case_name, __layout, ...) 
\ + define_common_test_case(OP_ARGMIN_NAME, __case_name##_00, __layout, 0, 0, __VA_ARGS__); \ + define_common_test_case(OP_ARGMIN_NAME, __case_name##_01, __layout, 1, 0, __VA_ARGS__); \ + define_common_test_case(OP_ARGMIN_NAME, __case_name##_02, __layout, 2, 0, __VA_ARGS__); \ + define_common_test_case(OP_ARGMIN_NAME, __case_name##_10, __layout, 0, 1, __VA_ARGS__); \ + define_common_test_case(OP_ARGMIN_NAME, __case_name##_11, __layout, 1, 1, __VA_ARGS__); \ + define_common_test_case(OP_ARGMIN_NAME, __case_name##_12, __layout, 2, 1, __VA_ARGS__); \ + static int __case_name() \ + { \ + return __case_name##_00() || __case_name##_01() || __case_name##_02() || __case_name##_10() || __case_name##_11() || __case_name##_12(); \ + } + +define_test_case(op_test_case_0, TENGINE_LAYOUT_NCHW, 3, 64, 128); +define_test_case(op_test_case_1, TENGINE_LAYOUT_NCHW, 3, 128, 128); +define_test_case(op_test_case_2, TENGINE_LAYOUT_NCHW, 3, 128, 64); +define_test_case(op_test_case_3, TENGINE_LAYOUT_NCHW, 3, 111, 111); +define_test_case(op_test_case_4, TENGINE_LAYOUT_NCHW, 3, 65, 111); + +#define __NHWC_SUPPORTED__ 0 +#if __NHWC_SUPPORTED__ +#endif + +int main(void) +{ + return op_test_case_0() || op_test_case_1() || op_test_case_2() || op_test_case_3() || op_test_case_4(); +} diff --git a/tests/op/test_op_batchnorm.c b/tests/op/test_op_batchnorm.c new file mode 100644 index 000000000..00361732c --- /dev/null +++ b/tests/op/test_op_batchnorm.c @@ -0,0 +1,98 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "operator/prototype/batchnorm_param.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +static void allocate_bn_inputs(vector_t* inputs, const int* dims, const int dim_num, const int dtype) +{ + struct data_buffer* input = create_data_buffer(dims, dim_num, dtype); + struct data_buffer *mean, *var, *gamma, *beta; + + int dim = dims[1]; + mean = create_data_buffer_fp32(&dim, 1); + var = 
create_data_buffer_fp32(&dim, 1); + gamma = create_data_buffer_fp32(&dim, 1); + beta = create_data_buffer_fp32(&dim, 1); + + push_vector_data(inputs, &input); + push_vector_data(inputs, &gamma); + push_vector_data(inputs, &beta); + push_vector_data(inputs, &mean); + push_vector_data(inputs, &var); +} + +static int __max(const int n, const int m) +{ + return n > m ? n : m; +} + +static void shuffle_array(int* arr, const int n) +{ + for (int i = 0; i < 20 * n; ++i) + { + int a = rand() % n; + int b = rand() % n; + int bak = arr[a]; + arr[a] = arr[b]; + arr[b] = bak; + } +} + +int op_test_case_0() +{ + int dims[4]; + for (int i = 0; i < 10; ++i) + { +#define __run_test_case(__dim_num, __caffe_flavor) \ + do { \ + dims[0] = __max(rand() % 10, 1); \ + dims[1] = __max(rand() % 128, 1); \ + dims[2] = __max(rand() % 128, 1); \ + dims[3] = __max(rand() % 128, 1); \ + shuffle_array(dims, 4); \ + float rescale_factor = random_float(-100.0f, 100.0f); \ + rescale_factor = rand() % 100 > 50 ? rescale_factor : .0; \ + batchnorm_param_t param = {.caffe_flavor = __caffe_flavor, .rescale_factor = rescale_factor, .eps = 0.001}; \ + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + allocate_bn_inputs(inputs, dims, __dim_num, TENGINE_DT_FP32); \ + int ret = create_common_op_test_case(OP_BATCHNORM_NAME, ¶m, sizeof(param), inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); \ + release_vector(inputs); \ + if (ret) \ + { \ + fprintf(stderr, "batchnorm op test failed. dim_num = %d, caffe_flavor = %d, dtype = fp32\n", __dim_num, __caffe_flavor); \ + return ret; \ + } \ + inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + allocate_bn_inputs(inputs, dims, __dim_num, TENGINE_DT_UINT8); \ + ret = create_common_op_test_case(OP_BATCHNORM_NAME, ¶m, sizeof(param), inputs, 1, TENGINE_DT_UINT8, TENGINE_LAYOUT_NCHW, 0.001); \ + release_vector(inputs); \ + if (ret) \ + { \ + fprintf(stderr, "batchnorm op test failed. 
dim_num = %d, caffe_flavor = %d, dtype = uint8\n", __dim_num, __caffe_flavor); \ + return ret; \ + } \ + fprintf(stderr, "batchnorm op test pass: dim_num = %d, caffe_flavor = %d\n", __dim_num, __caffe_flavor); \ + } while (0) + + __run_test_case(2, 0); + __run_test_case(3, 0); + __run_test_case(4, 0); + __run_test_case(2, 1); + __run_test_case(3, 1); + __run_test_case(4, 1); + } + + return 0; +} + +int main(void) +{ + time_t tim = time(NULL); + srand((unsigned int)tim); + return op_test_case_0(); +} diff --git a/tests/op/test_op_batchtospacend.c b/tests/op/test_op_batchtospacend.c new file mode 100644 index 000000000..c3081b81b --- /dev/null +++ b/tests/op/test_op_batchtospacend.c @@ -0,0 +1,72 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "operator/prototype/batchtospacend_param.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +static int __min(const int n, const int m) +{ + return n < m ? n : m; +} + +static void shuffle_array(int* arr, const int n) +{ + for (int i = 0; i < 20 * n; ++i) + { + int a = rand() % n; + int b = rand() % n; + int bak = arr[a]; + arr[a] = arr[b]; + arr[b] = bak; + } +} + +static int op_test_case(const int crop_left, const int crop_right, const int crop_bottom, const int crop_top, const int dilation_x, const int dilation_y) +{ + struct batchtospacend_param params = { + .crop_top = crop_top, + .crop_bottom = crop_bottom, + .crop_left = crop_left, + .crop_right = crop_right, + .dilation_x = dilation_x, + .dilation_y = dilation_y}; + + int dims[4] = {rand_int(1, 256) * params.dilation_x * params.dilation_y, rand_int(1, 16), rand_int(1, 16), rand_int(1, 32)}; + + const int expand = dims[0] / (params.dilation_x * params.dilation_y); + + int h = expand * dims[2]; + int w = expand * dims[3]; + + if (params.crop_right > h) + { + dims[2] = params.crop_right / expand + 1; + } + + if (params.crop_bottom > w) + { + dims[3] = params.crop_bottom / expand + 1; 
+ } + + struct data_buffer* input = create_data_buffer(dims, 4, TENGINE_DT_FP32); + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + + int ret = create_common_op_test_case(OP_BATCHTOSPACEND_NAME, ¶ms, sizeof(params), inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); + if (ret) + { + fprintf(stderr, "test op batchtospacend failed."); + return ret; + } + + return 0; +} + +int main(void) +{ + return op_test_case(0, 0, 0, 0, 1, 1) || op_test_case(1, 2, 1, 2, 1, 2) || op_test_case(1, 1, 1, 1, 2, 2); +} diff --git a/tests/op/test_op_bias.c b/tests/op/test_op_bias.c new file mode 100644 index 000000000..ff90e0ad6 --- /dev/null +++ b/tests/op/test_op_bias.c @@ -0,0 +1,39 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +#define define_common_test_case(__op_name, __case_name, __layout, ...) \ + static int __case_name() \ + { \ + int data_type = TENGINE_DT_FP32; \ + int layout = __layout; \ + int dims[] = {__VA_ARGS__}; \ + int dims_num = sizeof(dims) / sizeof(dims[0]); \ + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); \ + struct data_buffer* input = create_data_buffer(dims, dims_num, data_type); \ + push_vector_data(inputs, &input); \ + struct data_buffer* bias = create_data_buffer(&dims[1], 1, data_type); \ + push_vector_data(inputs, &bias); \ + int ret = create_common_op_test_case(__op_name, NULL, 0, inputs, 1, data_type, layout, 0.001); \ + if (ret) { fprintf(stderr, "test op %s failed: ret = %d, dims = {%d, %d, %d, %d}\n", __op_name, ret, dims[0], dims[1], dims[2], dims[3]); } \ + release_vector(inputs); \ + return 0; \ + } + +#define define_test_case(__case_name, __layout, ...) 
define_common_test_case(OP_BIAS_NAME, __case_name, __layout, __VA_ARGS__) + +define_test_case(op_test_case_0, TENGINE_LAYOUT_NCHW, 1, 3, 64, 128); +define_test_case(op_test_case_1, TENGINE_LAYOUT_NCHW, 1, 3, 128, 128); +define_test_case(op_test_case_2, TENGINE_LAYOUT_NCHW, 1, 3, 128, 64); +define_test_case(op_test_case_3, TENGINE_LAYOUT_NCHW, 1, 3, 111, 111); +define_test_case(op_test_case_4, TENGINE_LAYOUT_NCHW, 1, 3, 65, 111); + +int main(void) +{ + return op_test_case_0() || op_test_case_1() || op_test_case_2() || op_test_case_3() || op_test_case_4(); +} diff --git a/tests/op/test_op_broadmul.c b/tests/op/test_op_broadmul.c new file mode 100644 index 000000000..3aa9b5014 --- /dev/null +++ b/tests/op/test_op_broadmul.c @@ -0,0 +1,53 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include +#include "util/vector.h" + +static int test_op_case() +{ + // broadmul 只支持一个维度的广播,例如[2, 2, 3] * [2, 2, 1]是支持的, 但是[2, 2, 3] * [2, 1, 1]不支持 + // broadmul 只支持input1向input0广播,例如[2, 2, 3] * [2, 2, 1]是支持的 但是[2, 2, 1] * [2, 2, 3]是不支持的, 当然 [2, 1, 2] * [1, 2, 1]也是不支持的 + // broadmul 要求input0 input1最后一维必须相等 + for (int loop = 0; loop < 10; ++loop) + { + int dims1[4] = {rand_int(10, 64), rand_int(10, 64), rand_int(10, 64), rand_int(10, 64)}; + + int i = rand() % 3; + int dims2[4] = {0}; + + memcpy(dims2, dims1, sizeof(dims1)); + dims2[i] = 1; + + struct data_buffer* input1 = create_data_buffer(dims1, 4, TENGINE_DT_FP32); + struct data_buffer* input2 = create_data_buffer(dims2, 4, TENGINE_DT_FP32); + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + + push_vector_data(inputs, &input1); + push_vector_data(inputs, &input2); + + int ret = create_common_op_test_case(OP_BROADMUL_NAME, NULL, 0, inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); + if (ret) + { + fprintf(stderr, "test op %s failed. 
ret = %d, dims1 = {%d, %d, %d, %d}, dims2 = {%d, %d, %d, %d}\n", OP_BROADMUL_NAME, ret, dims1[0], dims1[1], dims1[2], dims1[3], dims2[0], dims2[1], dims2[2], dims2[3]); + return ret; + } + else + { + fprintf(stderr, "test op %s pass. ret = %d, dims1 = {%d, %d, %d, %d}, dims2 = {%d, %d, %d, %d}\n", OP_BROADMUL_NAME, ret, dims1[0], dims1[1], dims1[2], dims1[3], dims2[0], dims2[1], dims2[2], dims2[3]); + } + + release_vector(inputs); + } +} + +int main(void) +{ + time_t tim = time(NULL); + srand((unsigned int)tim); + return test_op_case(); +} diff --git a/tests/op/test_op_cast.c b/tests/op/test_op_cast.c new file mode 100644 index 000000000..43cb48490 --- /dev/null +++ b/tests/op/test_op_cast.c @@ -0,0 +1,41 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "operator/prototype/cast_param.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +static int test_cast_op(const int from, const int to) +{ + int dims[4] = {rand_int(10, 64), rand_int(10, 64), rand_int(10, 64), rand_int(10, 64)}; + struct data_buffer* input = create_data_buffer(dims, 4, from); + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + + struct cast_param params = {.type_from = from, .type_to = to}; + + int ret = create_common_op_test_case(OP_CAST_NAME, ¶ms, sizeof(params), inputs, 1, to, TENGINE_LAYOUT_NCHW, 0.001); + if (ret) + { + fprintf(stderr, "test op %s failed. 
ret = %d, dims1 = {%d, %d, %d, %d}, from type = %d, to type = %d\n", OP_CAST_NAME, ret, dims[0], dims[1], dims[2], dims[3], from, to); + return ret; + } + + release_vector(inputs); + return 0; +} + +int main(void) +{ + time_t tim = time(NULL); + srand((unsigned int)tim); + return test_cast_op(TENGINE_DT_FP32, TENGINE_DT_FP16) + || test_cast_op(TENGINE_DT_FP16, TENGINE_DT_FP32) + || test_cast_op(TENGINE_DT_FP32, TENGINE_DT_UINT8) + || test_cast_op(TENGINE_DT_UINT8, TENGINE_DT_FP32) + || test_cast_op(TENGINE_DT_FP32, TENGINE_DT_FP32) + || test_cast_op(TENGINE_DT_UINT8, TENGINE_DT_UINT8); +} diff --git a/tests/op/test_op_ceil.c b/tests/op/test_op_ceil.c new file mode 100644 index 000000000..c24849732 --- /dev/null +++ b/tests/op/test_op_ceil.c @@ -0,0 +1,44 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include "util/vector.h" + +static int test_ceil_op() +{ + for (int i = 0; i < 10; ++i) + { + int dims[4] = {rand_int(10, 64), rand_int(10, 64), rand_int(10, 64), rand_int(10, 64)}; + struct data_buffer* input = create_data_buffer(dims, 4, TENGINE_DT_FP32); + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + + int ret = create_common_op_test_case(OP_CEIL_NAME, NULL, 0, inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); + if (ret) + { + return ret; + } + + release_vector(inputs); + input = create_data_buffer(dims, 4, TENGINE_DT_UINT8); + inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + + ret = create_common_op_test_case(OP_CEIL_NAME, NULL, 0, inputs, 1, TENGINE_DT_UINT8, TENGINE_LAYOUT_NCHW, 0.001); + + if (ret) { return ret; } + + release_vector(inputs); + } + return 0; +} + +int main(void) +{ + time_t tim = time(NULL); + srand((unsigned int)tim); + return test_ceil_op(); +} diff --git a/tests/op/test_op_clip.c 
b/tests/op/test_op_clip.c new file mode 100644 index 000000000..9108bd7e9 --- /dev/null +++ b/tests/op/test_op_clip.c @@ -0,0 +1,57 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include "operator/prototype/clip_param.h" +#include +#include +#include "util/vector.h" + +static int test_ceil_op() +{ + for (int i = 0; i < 10; ++i) + { + struct clip_param params = {.min = random_float(-1.0, 0.0), .max = random_float(0.0, 1.0)}; + int dims[4] = {rand_int(10, 64), rand_int(10, 64), rand_int(10, 64), rand_int(10, 64)}; + struct data_buffer* input = create_data_buffer(dims, 4, TENGINE_DT_FP32); + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + + int ret = create_common_op_test_case(OP_CLIP_NAME, ¶ms, sizeof(params), inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); + if (ret) + { + return ret; + } + + release_vector(inputs); + + input = create_data_buffer(dims, 4, TENGINE_DT_UINT8); + inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + + ret = create_common_op_test_case(OP_CLIP_NAME, ¶ms, sizeof(params), inputs, 1, TENGINE_DT_UINT8, TENGINE_LAYOUT_NCHW, 0.001); + + if (ret) { return ret; } + + release_vector(inputs); + + input = create_data_buffer(dims, 4, TENGINE_DT_INT8); + inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + + ret = create_common_op_test_case(OP_CLIP_NAME, ¶ms, sizeof(params), inputs, 1, TENGINE_DT_INT8, TENGINE_LAYOUT_NCHW, 0.001); + + if (ret) { return ret; } + + release_vector(inputs); + } + return 0; +} + +int main(void) +{ + time_t tim = time(NULL); + srand((unsigned int)tim); + return test_ceil_op(); +} diff --git a/tests/op/test_op_comparison.c b/tests/op/test_op_comparison.c new file mode 100644 index 000000000..2e5efc81d --- /dev/null +++ 
b/tests/op/test_op_comparison.c @@ -0,0 +1,99 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include +#include "util/vector.h" +#include "operator/prototype/comparison_param.h" + +static int get_total_size(const int* dims, const int n) +{ + int s = 1; + for (int i = 0; i < n; ++i) + { + s *= dims[i]; + } + return s; +} + +static void random_mask(float* data, const int size) +{ + int n = (int)(0.5f * size); + for (int i = 0; i < n; ++i) + { + int k = rand() % n; + data[k] = random_float(-1.2f, 1.2f); + } +} + +static int do_comparison_test(const int* dims1, const int* dims2, const int n1, const int n2) +{ + for (int i = 0; i <= 5; ++i) + { + struct comparison_param params = {.type = i}; + + struct data_buffer* input = create_data_buffer(dims1, n1, TENGINE_DT_FP32); + struct data_buffer* input1 = create_data_buffer(dims2, n2, TENGINE_DT_FP32); + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + push_vector_data(inputs, &input); + push_vector_data(inputs, &input1); + + int ret = create_common_op_test_case(OP_COMPARISON_NAME, ¶ms, sizeof(params), inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); + if (ret) + { + fprintf(stderr, "test comparison op failed: %d, type = %d, dims1 = {%d, %d, %d, %d}, dims2 = {%d, %d, %d, %d}\n", ret, i, dims1[0], dims1[1], dims1[2], dims1[3], dims2[0], dims2[1], dims2[2], dims2[3]); + release_vector(inputs); + return ret; + } + + const int total_size1 = get_total_size(dims1, n1); + const int total_size2 = get_total_size(dims2, n2); + if (total_size1 > total_size2) + { + random_mask(input->data, total_size1); + } + else + { + random_mask(input1->data, total_size2); + } + + ret = create_common_op_test_case(OP_COMPARISON_NAME, ¶ms, sizeof(params), inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); + release_vector(inputs); + if (ret) + { + fprintf(stderr, "test comparison op after masked 
failed: %d, type = %d, dims1 = {%d, %d, %d, %d}, dims2 = {%d, %d, %d, %d}\n", ret, i, dims1[0], dims1[1], dims1[2], dims1[3], dims2[0], dims2[1], dims2[2], dims2[3]); + return ret; + } + } + + fprintf(stderr, "test comparison op pass\n"); + return 0; +} + +static int test_comparison_op() +{ + int dims1[] = {rand_int(2, 10), rand_int(10, 32), rand_int(10, 32), rand_int(10, 32)}; + int dims2[4] = {0}; + + memcpy(dims2, dims1, sizeof(dims1)); + int ret = do_comparison_test(dims1, dims2, 4, 4); + if (ret) { return ret; } + + dims2[0] = 1; + ret = do_comparison_test(dims1, dims2, 4, 1) || do_comparison_test(dims2, dims1, 1, 4); + if (ret) return ret; + + dims2[0] = dims1[1]; + + return do_comparison_test(dims1, dims2, 4, 1) || do_comparison_test(dims2, dims1, 1, 4); +} + +int main(void) +{ + time_t tim = time(NULL); + srand((unsigned int)tim); + return test_comparison_op(); +} diff --git a/tests/op/test_op_conv.c b/tests/op/test_op_conv.c new file mode 100644 index 000000000..fde13887a --- /dev/null +++ b/tests/op/test_op_conv.c @@ -0,0 +1,80 @@ +#include "api/c_api.h" +#include "graph/graph.h" +#include "graph/node.h" +#include "test_op.h" +#include "tengine/c_api.h" +#include +#include +#include +#include "util/vector.h" +#include "operator/prototype/convolution_param.h" + +static int max(int lhs, int rhs) +{ + return lhs > rhs ? 
lhs : rhs; +} + +static int test_conv_op_case(int kernel_h, int kernel_w, int pad_h, int pad_w, int stride_h, int stride_w, int dilation_h, int dilation_w) +{ + const int real_h = (kernel_h - 1) * dilation_h + stride_h + 1; + const int real_w = (kernel_w - 1) * dilation_w + stride_w + 1; + + const int max_h = max(real_h + 1, 32); + const int max_w = max(real_w + 1, 32); + + for (int i = 0; i < 10; ++i) + { + int dims[4] = {rand_int(2, 8), rand_int(2, 12), rand_int(real_h, max_h), rand_int(real_w, max_w)}; + int kernel_shape[] = {rand_int(2, 32), dims[1], kernel_h, kernel_w}; + + vector_t* inputs = create_vector(sizeof(struct data_buffer*), free_data_buffer_in_vector); + + struct data_buffer* input = create_data_buffer(dims, 4, TENGINE_DT_FP32); + struct data_buffer* filter = create_data_buffer(kernel_shape, 4, TENGINE_DT_FP32); + push_vector_data(inputs, &input); + push_vector_data(inputs, &filter); + + struct conv_param params = {.kernel_h = kernel_shape[2], .kernel_w = kernel_shape[3], .stride_h = stride_h, .stride_w = stride_w, .pad_h0 = pad_h, .pad_h1 = pad_h, .pad_w0 = pad_w, .pad_w1 = pad_w, .dilation_h = dilation_h, .dilation_w = dilation_w, .input_channel = kernel_shape[1], .output_channel = kernel_shape[0], .group = 1, .activation = -1, .wino_off = 1}; + + int ret = create_common_op_test_case(OP_CONV_NAME, ¶ms, sizeof(params), inputs, 1, TENGINE_DT_FP32, TENGINE_LAYOUT_NCHW, 0.001); + release_vector(inputs); + + if (ret) + { + fprintf(stderr, "test conv op failed: %d, kernel_h = %d, kernel_w = %d, pad_h = %d, pad_w = %d, stride_h = %d, stride_w = %d, dilation_h = %d, dilation_w = %d, input dims = {%d, %d, %d, %d}, kernel dims = {%d, %d, %d, %d}\n", ret, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, dims[0], dims[1], dims[2], dims[3], kernel_shape[0], kernel_shape[1], kernel_shape[2], kernel_shape[3]); + return ret; + } + } + + fprintf(stderr, "test conv op pass, kernel_h = %d, kernel_w = %d, pad_h = %d, pad_w = %d, stride_h 
= %d, stride_w = %d, dilation_h = %d, dilation_w = %d\n", kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w); + return 0; +} + +#define __define_test_conv_op(kh, kw) \ + static int test_conv_op_##kh##x##kw() \ + { \ + return test_conv_op_case(kh, kw, 0, 0, 1, 1, 1, 1) \ + || test_conv_op_case(kh, kw, 1, 1, 1, 1, 1, 1) \ + || test_conv_op_case(kh, kw, 2, 2, 1, 1, 1, 1) \ + || test_conv_op_case(kh, kw, 3, 3, 1, 1, 1, 1) \ + || test_conv_op_case(kh, kw, 3, 1, 1, 1, 1, 1) \ + || test_conv_op_case(kh, kw, 1, 3, 1, 1, 1, 1) \ + || test_conv_op_case(kh, kw, 1, 3, 2, 2, 1, 1) \ + || test_conv_op_case(kh, kw, 1, 3, 3, 3, 1, 1) \ + || test_conv_op_case(kh, kw, 1, 3, 3, 1, 1, 1) \ + || test_conv_op_case(kh, kw, 1, 3, 1, 3, 1, 1) \ + || test_conv_op_case(kh, kw, 1, 3, 1, 3, 2, 2) \ + || test_conv_op_case(kh, kw, 1, 3, 1, 3, 3, 3) \ + || test_conv_op_case(kh, kw, 1, 3, 1, 3, 1, 3) \ + || test_conv_op_case(kh, kw, 1, 3, 1, 3, 3, 1); \ + } + +__define_test_conv_op(3, 3); +__define_test_conv_op(1, 1); + +int main(void) +{ + time_t tim = time(NULL); + srand((unsigned int)tim); + return test_conv_op_1x1() || test_conv_op_3x3(); +} diff --git a/tests/op/test_op_prelu.c b/tests/op/test_op_prelu.c deleted file mode 100644 index dd31e4b1e..000000000 --- a/tests/op/test_op_prelu.c +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - -#include "test_op.h" - -int create_test_prelu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; - (void)n; - (void)c; - (void)h; - (void)w; - - /* create the test node */ - node_t test_node = create_graph_node(graph, node_name, "PReLU"); - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - - if (NULL == input_tensor) - { - fprintf(stderr, "create test node failed. ERRNO: %d.\n", get_tengine_errno()); - return -1; - } - - /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. 
*/ - node_t slope_node = create_graph_node(graph, "slope", "Const"); - tensor_t slope_tensor = create_graph_tensor(graph, "slope", TENGINE_DT_FP32); - set_node_output_tensor(slope_node, 0, slope_tensor, TENSOR_TYPE_CONST); - - int dims[4]; - get_tensor_shape(input_tensor, dims, 4); - int slope_dims[1] = {dims[1]}; // channel num - set_tensor_shape(slope_tensor, slope_dims, 1); - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - set_node_input_tensor(test_node, 1, slope_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - return 0; -} - -float slope_value[3] = {0.1f, 0.2f, 0.3f}; -float result_value[3] = {-1.f, -2.f, -3.f}; - -int main(int argc, char* argv[]) -{ - int n = 1, c = 3, h = 6, w = 6; - const char* test_node_name = "prelu"; - int data_type = TENGINE_DT_FP32; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Tengine init failed. ERRNO: %d.", get_tengine_errno()); - - // create - graph_t graph = create_common_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_prelu_node); - if (NULL == graph) - return -1; - - // set input data - fill_input_float_tensor_by_index(graph, 0, 0, -10.0f); - - // set slope data - fill_input_float_buffer_tensor_by_name(graph, test_node_name, 1, (void*)slope_value, 3 * sizeof(float)); - - // graph run - ret = test_graph_run(graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. 
ERRNO: %d.\n", ret); - test_graph_release(graph); - return -1; - } - - // check the result - struct tensor* output_tensor = get_graph_output_tensor(graph, 0, 0); - int out_c = output_tensor->dims[1]; - int cstep = output_tensor->dims[2] * output_tensor->dims[3]; - - ret = 0; - for (int i = 0; i < out_c; i++) - { - float* output_data = (float*)output_tensor->data + i * cstep; - for (int j = 0; j < cstep; j++) - { - if (output_data[j] != result_value[i]) - { - fprintf(stderr, "Check result failed, current %f, expect %f\n", output_data[j], result_value[i]); - ret = -1; - break; - } - } - } - - if (ret == 0) - fprintf(stderr, "test pass.\n"); - else - fprintf(stderr, "test failed.\n"); - - // exit - test_graph_release(graph); - - return ret; -} diff --git a/tests/op/test_op_relu.c b/tests/op/test_op_relu.c deleted file mode 100644 index 730ab3260..000000000 --- a/tests/op/test_op_relu.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - -#include "test_op.h" - -int create_test_relu_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; - (void)n; - (void)c; - (void)h; - (void)w; - - /* create the test node */ - node_t test_node = create_graph_node(graph, node_name, "ReLU"); - if (NULL == test_node) - { - fprintf(stderr, "create test node failed. ERRNO: %d.\n", get_tengine_errno()); - return -1; - } - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - if (NULL == input_tensor) - { - fprintf(stderr, "get graph input tensor failed. ERRNO: %d.\n", get_tengine_errno()); - return -1; - } - - /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */ - // None - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - if (NULL == output_tensor) - { - fprintf(stderr, "create graph output tensor failed. ERRNO: %d.\n", get_tengine_errno()); - return -1; - } - - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set the attr of test node */ - // None - - return 0; -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 3, h = 12, w = 12; - const char* test_node_name = "relu"; - int data_type = TENGINE_DT_FP32; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Engine init failed. 
ERRNO: %d.", get_tengine_errno()); - - // create - graph_t graph = create_common_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_relu_node); - if (NULL == graph) - return -1; - - // set input data - fill_input_float_tensor_by_index(graph, 0, 0, -10.0f); - - // graph run - ret = test_graph_run(graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(graph); - return -1; - } - - // dump input node - int input_node_count = get_graph_input_node_number(graph); - for (int i = 0; i < input_node_count; i++) - { - node_t input = get_graph_input_node(graph, i); - dump_node_output(input, 0); - } - - // dump output node - int output_node_count = get_graph_output_node_number(graph); - for (int i = 0; i < output_node_count; i++) - { - node_t output = get_graph_output_node(graph, i); - dump_node_output(output, 0); - } - - // exit - test_graph_release(graph); - - return 0; -} diff --git a/tests/op/test_op_relu6.c b/tests/op/test_op_relu6.c deleted file mode 100644 index 9315c6477..000000000 --- a/tests/op/test_op_relu6.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -/* - * Copyright (c) 2020, OPEN AI LAB - * Author: qtang@openailab.com - */ - -#include "test_op.h" - -int create_test_relu6_node(graph_t graph, const char* input_name, const char* node_name, int data_type, int layout, int n, int c, int h, int w) -{ - (void)layout; - (void)n; - (void)c; - (void)h; - (void)w; - - /* create the test node */ - node_t test_node = create_graph_node(graph, node_name, "ReLU6"); - if (NULL == test_node) - { - fprintf(stderr, "create test node failed. ERRNO: %d.\n", get_tengine_errno()); - return -1; - } - - tensor_t input_tensor = get_graph_tensor(graph, input_name); - if (NULL == input_tensor) - { - fprintf(stderr, "get graph input tensor failed. ERRNO: %d.\n", get_tengine_errno()); - return -1; - } - - /* create the sub node to product another input tensors which the test node is needed, such as weight/bias/slope tensor. */ - // None - - /* input tensors of test node */ - set_node_input_tensor(test_node, 0, input_tensor); - - /* output tensors of test node */ - tensor_t output_tensor = create_graph_tensor(graph, node_name, data_type); - if (NULL == output_tensor) - { - fprintf(stderr, "create graph output tensor failed. ERRNO: %d.\n", get_tengine_errno()); - return -1; - } - - set_node_output_tensor(test_node, 0, output_tensor, TENSOR_TYPE_VAR); - - /* set the attr of test node */ - // None - - return 0; -} - -int main(int argc, char* argv[]) -{ - int n = 1, c = 3, h = 12, w = 12; - const char* test_node_name = "relu6"; - int data_type = TENGINE_DT_FP32; - int layout = TENGINE_LAYOUT_NCHW; - - // init - int ret = test_graph_init(); - if (0 != ret) - fprintf(stderr, "Engine init failed. 
ERRNO: %d.", get_tengine_errno()); - - // create - graph_t graph = create_common_test_graph(test_node_name, data_type, layout, n, c, h, w, &create_test_relu6_node); - if (NULL == graph) - return -1; - - // set input data - fill_input_float_tensor_by_index(graph, 0, 0, -10.0f); - - // graph run - ret = test_graph_run(graph); - if (0 != ret) - { - fprintf(stderr, "Run graph error. ERRNO: %d.\n", ret); - test_graph_release(graph); - return -1; - } - - // dump input node - int input_node_count = get_graph_input_node_number(graph); - for (int i = 0; i < input_node_count; i++) - { - node_t input = get_graph_input_node(graph, i); - dump_node_output(input, 0); - } - - // dump output node - int output_node_count = get_graph_output_node_number(graph); - for (int i = 0; i < output_node_count; i++) - { - node_t output = get_graph_output_node(graph, i); - dump_node_output(output, 0); - } - - // exit - test_graph_release(graph); - - return 0; -} diff --git a/tests/test_rv64_models.sh b/tests/test_rv64_models.sh new file mode 100755 index 000000000..6b3e926ef --- /dev/null +++ b/tests/test_rv64_models.sh @@ -0,0 +1,42 @@ +#!/bin/bash - + +if [ ! "${QEMU_CMD}" ]; then + echo '$QEMU_CMD is required.' 
+ exit -1 +fi + +test_models=( +"${QEMU_CMD} ./tests/test_model_classification -m squeezenet -i images/cat.jpg -g 227,227 -w 104.007,116.669,122.679 -s 1,1,1" +"${QEMU_CMD} ./tests/test_model_classification -m mobilenet -i images/cat.jpg -g 224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017" +"${QEMU_CMD} ./tests/test_model_classification -m mobilenet_v2 -i images/cat.jpg -g 224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017" +"${QEMU_CMD} ./tests/test_model_classification -m googlenet -i images/cat.jpg -g 224,224 -w 104.007,116.669,122.679 -s 1,1,1" +"${QEMU_CMD} ./tests/test_model_classification -m inception_v3 -i images/cat.jpg -g 395,395 -w 104.007,116.669,122.679 -s 0.0078,0.0078,0.0078" +"${QEMU_CMD} ./tests/test_model_classification -m inception_v4 -i images/cat.jpg -g 299,299 -w 104.007,116.669,122.679 -s 0.007843,0.007843,0.007843" +"${QEMU_CMD} ./tests/test_model_classification -m resnet50 -i images/bike.jpg -g 224,224 -w 104.007,116.669,122.679 -s 1,1,1" +"${QEMU_CMD} ./tests/test_model_classification -m mnasnet -i images/cat.jpg -g 224,224 -w 104.007,116.669,122.679 -s 0.017,0.017,0.017" +"${QEMU_CMD} ./tests/test_model_classification -m shufflenet_1xg3 -i images/cat.jpg -g 224,224 -w 103.940,116.780,123.680 -s 0.017,0.017,0.017" +"${QEMU_CMD} ./tests/test_model_classification -m shufflenet_v2 -i images/cat.jpg -g 224,224 -w 103.940,116.780,123.680 -s 0.00392156,0.00392156,0.00392156" +"${QEMU_CMD} ./tests/test_model_hrnet" +"${QEMU_CMD} ./tests/test_model_mobilefacenet" +"${QEMU_CMD} ./tests/test_model_mobilenet_ssd" +"${QEMU_CMD} ./tests/test_model_nanodet_m" +"${QEMU_CMD} ./tests/test_model_retinaface" +"${QEMU_CMD} ./tests/test_model_ultraface" +"${QEMU_CMD} ./tests/test_model_yolofastest" +"${QEMU_CMD} ./tests/test_model_yolov3" +"${QEMU_CMD} ./tests/test_model_yolov3_tiny" +"${QEMU_CMD} ./tests/test_model_yolov4" +"${QEMU_CMD} ./tests/test_model_yolov4_tiny" +"${QEMU_CMD} ./tests/test_model_yolov5s" +) + +for (( i = 0 ; i < 
${#test_models[@]} ; i++ )) +do + echo ${test_models[$i]} + echo ${test_models[$i]} | xargs -i sh -c "{}" + + if [ "$?" != 0 ]; then + echo "failed" + exit 1 + fi +done diff --git a/tests/test_rv64_ops.sh b/tests/test_rv64_ops.sh new file mode 100755 index 000000000..627161a48 --- /dev/null +++ b/tests/test_rv64_ops.sh @@ -0,0 +1,33 @@ +#!/bin/bash - + +if [ ! "${QEMU_CMD}" ]; then + echo '$QEMU_CMD is required.' + exit -1 +fi + +test_models=( +"${QEMU_CMD} ./tests/test_op_absval" +"${QEMU_CMD} ./tests/test_op_add_n" +"${QEMU_CMD} ./tests/test_op_argmax" +"${QEMU_CMD} ./tests/test_op_argmin" +"${QEMU_CMD} ./tests/test_op_batchnorm" +"${QEMU_CMD} ./tests/test_op_batchtospacend" +# "${QEMU_CMD} ./tests/test_op_broadmul" +"${QEMU_CMD} ./tests/test_op_bias" +"${QEMU_CMD} ./tests/test_op_cast" +"${QEMU_CMD} ./tests/test_op_ceil" +"${QEMU_CMD} ./tests/test_op_clip" +"${QEMU_CMD} ./tests/test_op_comparison" +"${QEMU_CMD} ./tests/test_op_conv" +) + +for (( i = 0 ; i < ${#test_models[@]} ; i++ )) +do + echo ${test_models[$i]} + echo ${test_models[$i]} | xargs -i sh -c "{}" + + if [ "$?" 
!= 0 ]; then + echo "failed" + exit 1 + fi +done diff --git a/toolchains/rv64-c906.toolchain.cmake b/toolchains/rv64-c906.toolchain.cmake index e8268106d..ec28012b0 100644 --- a/toolchains/rv64-c906.toolchain.cmake +++ b/toolchains/rv64-c906.toolchain.cmake @@ -12,7 +12,16 @@ SET (CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) SET (CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) # other needed options -SET (TENGINE_TOOLCHAIN_ASM_FLAG -march=rv64gcvxthead -mabi=lp64d -mtune=c906 -mfp16 -lc) +IF (CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "release" OR CMAKE_BUILD_TYPE STREQUAL "RELEASE") + SET (TENGINE_TOOLCHAIN_ASM_FLAG -march=rv64gcv -mabi=lp64d -mtune=thead-c906 -lc) +ELSE() + SET (TENGINE_TOOLCHAIN_ASM_FLAG -march=rv64gcv -mabi=lp64d -g -O0 -lc) +ENDIF() + +IF (TENGINE_RV64_RVV_C906) + SET(TENGINE_TOOLCHAIN_ASM_FLAG "-D__FIX_RVV_C906 ${TENGINE_TOOLCHAIN_ASM_FLAG}") +ENDIF() + #SET (TENGINE_TOOLCHAIN_FLAG -march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c906 -mfp16) #SET (TENGINE_TOOLCHAIN_FLAG -march=rv64imafdcvxtheadc -mabi=lp64dv -mtune=c910 -mfp16)