diff --git a/c++/custom-operator/CMakeLists.txt b/c++/custom-operator/CMakeLists.txt
new file mode 100755
index 0000000000000..f734267daa5d4
--- /dev/null
+++ b/c++/custom-operator/CMakeLists.txt
@@ -0,0 +1,204 @@
+cmake_minimum_required(VERSION 3.0)
+project(cpp_inference_demo CXX C)
+option(WITH_MKL     "Compile demo with MKL/OpenBlas support, default use MKL." ON)
+option(WITH_GPU     "Compile demo with GPU/CPU, default use CPU."              ON)
+option(USE_TENSORRT "Compile demo with TensorRT."                              ON)
+# CUSTOM_OPERATOR_FILES is a list, not a boolean, so declare it as a cache string rather than an option().
+set(CUSTOM_OPERATOR_FILES "" CACHE STRING "List of file names for custom operators")
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+include(external/boost)
+
+if(WITH_GPU)
+  find_package(CUDA REQUIRED)
+  add_definitions("-DPADDLE_WITH_CUDA")
+endif()
+
+if(NOT WITH_STATIC_LIB)
+  add_definitions("-DPADDLE_WITH_SHARED_LIB")
+else()
+  # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode.
+  # Set it to empty in static library mode to avoid compilation issues.
+  add_definitions("/DPD_INFER_DECL=")
+endif()
+
+macro(safe_set_static_flag)
+  foreach(flag_var
+      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+    if(${flag_var} MATCHES "/MD")
+      string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+    endif(${flag_var} MATCHES "/MD")
+  endforeach(flag_var)
+endmacro()
+
+if(NOT DEFINED PADDLE_LIB)
+  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
+endif()
+if(NOT DEFINED DEMO_NAME)
+  message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
+endif()
+
+include_directories("${PADDLE_LIB}/")
+set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/")
+include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include")
+include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include")
+include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include")
+include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include")
+
+link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib")
+link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib")
+link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
+link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
+link_directories("${PADDLE_LIB}/paddle/lib")
+
+if (WIN32)
+  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
+  option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
+  if (MSVC_STATIC_CRT)
+    if (WITH_MKL)
+      set(FLAG_OPENMP "/openmp")
+    endif()
+    set(CMAKE_C_FLAGS_DEBUG     "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
+    set(CMAKE_C_FLAGS_RELEASE   "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
+    set(CMAKE_CXX_FLAGS_DEBUG   "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
+    safe_set_static_flag()
+    if (WITH_STATIC_LIB)
+      add_definitions(-DSTATIC_LIB)
+    endif()
+  endif()
+else()
+  if(WITH_MKL)
+    set(FLAG_OPENMP "-fopenmp")
+  endif()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ${FLAG_OPENMP}")
+endif()
+
+if(WITH_GPU)
+  if(NOT WIN32)
+    set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
+  else()
+    if(CUDA_LIB STREQUAL "")
+      set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
+    endif()
+  endif(NOT WIN32)
+endif()
+
+if (USE_TENSORRT AND WITH_GPU)
+  set(TENSORRT_ROOT "" CACHE STRING "The root directory of TensorRT library")
+  if("${TENSORRT_ROOT}" STREQUAL "")
+    message(FATAL_ERROR "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH ")
+  endif()
+  set(TENSORRT_INCLUDE_DIR ${TENSORRT_ROOT}/include)
+  set(TENSORRT_LIB_DIR ${TENSORRT_ROOT}/lib)
+endif()
+
+if (NOT WIN32)
+  if (USE_TENSORRT AND WITH_GPU)
+    include_directories("${TENSORRT_INCLUDE_DIR}")
+    link_directories("${TENSORRT_LIB_DIR}")
+  endif()
+endif(NOT WIN32)
+
+if(WITH_MKL)
+  set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml")
+  include_directories("${MATH_LIB_PATH}/include")
+  if(WIN32)
+    set(MATH_LIB ${MATH_LIB_PATH}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
+                 ${MATH_LIB_PATH}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
+  else()
+    set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
+                 ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
+  endif()
+  set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn")
+  if(EXISTS ${MKLDNN_PATH})
+    include_directories("${MKLDNN_PATH}/include")
+    if(WIN32)
+      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib)
+    else(WIN32)
+      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
+    endif(WIN32)
+  endif()
+else()
+  set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas")
+  include_directories("${OPENBLAS_LIB_PATH}/include/openblas")
+  if(WIN32)
+    set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX})
+  else()
+    set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
+  endif()
+endif()
+
+if(WITH_STATIC_LIB)
+  set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
+else()
+  if(WIN32)
+    set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
+  else()
+    set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX})
+  endif()
+endif()
+
+if (NOT WIN32)
+  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+  set(DEPS ${DEPS}
+      ${MATH_LIB} ${MKLDNN_LIB}
+      glog gflags protobuf xxhash
+      ${EXTERNAL_LIB})
+else()
+  set(DEPS ${DEPS}
+      ${MATH_LIB} ${MKLDNN_LIB}
+      glog gflags_static libprotobuf xxhash ${EXTERNAL_LIB})
+  set(DEPS ${DEPS} shlwapi.lib)
+endif(NOT WIN32)
+
+if(WITH_GPU)
+  if(NOT WIN32)
+    if (USE_TENSORRT)
+      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX})
+      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX})
+    endif()
+    set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
+  else()
+    if(USE_TENSORRT)
+      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
+      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
+    endif()
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX})
+  endif()
+endif()
+
+# Build the custom operators into a shared library that is linked into the demo.
+cuda_add_library(pd_infer_custom_op ${CUSTOM_OPERATOR_FILES} SHARED)
+add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
+set(DEPS ${DEPS} boost pd_infer_custom_op)
+
+if(WIN32)
+  if(USE_TENSORRT)
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}
+            ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+        COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}
+            ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+    )
+  endif()
+  if(WITH_MKL)
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release
+        COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release
+        COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/Release
+    )
+  else()
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release
+    )
+  endif()
+  if(NOT WITH_STATIC_LIB)
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+    )
+  endif()
+endif()
+
+target_link_libraries(${DEMO_NAME} ${DEPS})
\ No newline at end of file
diff --git a/c++/custom-operator/README.md b/c++/custom-operator/README.md
new file mode 100644
index 0000000000000..cd50fc7b36923
--- /dev/null
+++ b/c++/custom-operator/README.md
@@ -0,0 +1,53 @@
+## Build-and-Run Example for a Custom-Operator Model
+
+### Step 1: Get the custom-operator model used in this example
+Download it from: https://paddle-inference-dist.bj.bcebos.com/inference_demo/custom_operator/custom_relu_infer_model.tgz
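+
+For example, the archive can be fetched and unpacked with standard tools; a minimal sketch, assuming `wget` and `tar` are available and that you run it from the demo directory:
+
+```shell
+wget https://paddle-inference-dist.bj.bcebos.com/inference_demo/custom_operator/custom_relu_infer_model.tgz
+tar xzf custom_relu_infer_model.tgz   # unpacks to ./custom_relu_infer_model/
+```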
+
+### Step 2: Build the sample
+
+The files `custom_relu_op.cc` and `custom_relu_op.cu` are the source files of the custom operator; for how to write a custom operator, see the [PaddlePaddle documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html).
+Note: custom operators are currently an experimental feature. They depend on boost and must be built together with the Paddle Inference library `libpaddle_inference.so`.
+
+The file `custom_op_test.cc` is the sample inference program.
+The file `CMakeLists.txt` is the build script.
+The script `run.sh` holds the configuration of the third-party and prebuilt libraries.
+
+First, we need to edit the configuration in `run.sh`.
+
+1) **Edit `run.sh`**
+
+Open `run.sh` and adjust the following settings:
+
+```shell
+# Check version.txt in the prebuilt inference library to decide whether to enable these three flags
+WITH_MKL=ON
+WITH_GPU=ON
+USE_TENSORRT=OFF
+
+# Root directory of the inference library
+LIB_DIR=${YOUR_LIB_DIR}/paddle_inference_install_dir
+
+# If WITH_GPU or USE_TENSORRT is set to ON above, set the corresponding CUDA, CUDNN and TensorRT paths.
+CUDNN_LIB=/paddle/nvidia-downloads/cudnn_v7.5_cuda10.1/lib64
+CUDA_LIB=/paddle/nvidia-downloads/cuda-10.1/lib64
+# TENSORRT_ROOT=/paddle/nvidia-downloads/TensorRT-6.0.1.5
+```
+
+Run `sh run.sh`; this creates a `build` directory and builds the sample inside it, as sketched below.
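+
+The script boils down to a plain CMake configure-and-build. Roughly (abridged; the variables refer to the settings above, and `run.sh` has the full list of flags):
+
+```shell
+mkdir -p build && cd build
+cmake .. -DPADDLE_LIB=${LIB_DIR} \
+  -DDEMO_NAME=custom_op_test \
+  -DWITH_MKL=${WITH_MKL} \
+  -DWITH_GPU=${WITH_GPU} \
+  -DUSE_TENSORRT=${USE_TENSORRT} \
+  -DCUSTOM_OPERATOR_FILES="custom_relu_op.cc;custom_relu_op.cu"
+make -j
+```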
+
+2) **Run the sample**
+
+```shell
+# enter the build directory
+cd build
+# run the sample
+./custom_op_test
+```
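+
+If the dynamic loader cannot find `libpaddle_inference.so` or the MKL runtime libraries at startup, one common fix is to put their directories on `LD_LIBRARY_PATH` first; a sketch, assuming the default layout of the prebuilt package:
+
+```shell
+export LD_LIBRARY_PATH=${LIB_DIR}/paddle/lib:${LIB_DIR}/third_party/install/mklml/lib:${LD_LIBRARY_PATH}
+./custom_op_test
+```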
+
+When the run finishes, the program prints the model output to the screen, which indicates a successful run.
+
+### More links
+- [Paddle Inference Quick Start](https://paddle-inference.readthedocs.io/en/latest/introduction/quick_start.html)
+- [Paddle Inference C++ API](https://paddle-inference.readthedocs.io/en/latest/api_reference/cxx_api_index.html)
+- [Paddle Inference Python API](https://paddle-inference.readthedocs.io/en/latest/api_reference/python_api_index.html)
diff --git a/c++/custom-operator/cmake/external/boost.cmake b/c++/custom-operator/cmake/external/boost.cmake
new file mode 100644
index 0000000000000..8b4e084af2802
--- /dev/null
+++ b/c++/custom-operator/cmake/external/boost.cmake
@@ -0,0 +1,60 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(ExternalProject)
+
+set(BOOST_PROJECT "extern_boost")
+# To release PaddlePaddle as a pip package, we have to follow the
+# manylinux1 standard, which features as old Linux kernels and
+# compilers as possible and recommends CentOS 5. Indeed, the earliest
+# CentOS version that works with NVIDIA CUDA is CentOS 6. And a new
+# version of boost, say, 1.66.0, doesn't build on CentOS 6. We
+# checked that the devtools package of CentOS 6 installs boost 1.41.0.
+# So we use 1.41.0 here.
+set(BOOST_VER "1.41.0")
+set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
+set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
+
+MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
+
+set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
+set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
+
+set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}" CACHE PATH "boost include directory." FORCE)
+set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
+include_directories(${BOOST_INCLUDE_DIR})
+
+ExternalProject_Add(
+    ${BOOST_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    DOWNLOAD_DIR          ${BOOST_DOWNLOAD_DIR}
+    URL                   ${BOOST_URL}
+    DOWNLOAD_NO_PROGRESS  1
+    PREFIX                ${BOOST_SOURCES_DIR}
+    CONFIGURE_COMMAND     ""
+    BUILD_COMMAND         ""
+    INSTALL_COMMAND       ""
+    UPDATE_COMMAND        ""
+    )
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
+    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+    add_library(boost STATIC ${dummyfile})
+else()
+    add_library(boost INTERFACE)
+endif()
+
+add_dependencies(boost ${BOOST_PROJECT})
+set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
\ No newline at end of file
diff --git a/c++/custom-operator/custom_op_test.cc b/c++/custom-operator/custom_op_test.cc
new file mode 100644
index 0000000000000..7ca5bd6261120
--- /dev/null
+++ b/c++/custom-operator/custom_op_test.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+
+#include <numeric>
+#include <vector>
+
+#include "paddle/include/paddle_inference_api.h"
+
+using paddle_infer::Config;
+using paddle_infer::Predictor;
+using paddle_infer::CreatePredictor;
+
+void run(Predictor *predictor, const std::vector<float> &input,
+         const std::vector<int> &input_shape, std::vector<float> *out_data) {
+  auto input_names = predictor->GetInputNames();
+  auto input_t = predictor->GetInputHandle(input_names[0]);
+  input_t->Reshape(input_shape);
+  input_t->CopyFromCpu(input.data());
+
+  CHECK(predictor->Run());
+
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputHandle(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  // The number of output elements is the product of the output dims.
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+
+  out_data->resize(out_num);
+  output_t->CopyToCpu(out_data->data());
+}
+
+int main() {
+  Config config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel("custom_relu_infer_model/custom_relu.pdmodel",
+                  "custom_relu_infer_model/custom_relu.pdiparams");
+  auto predictor{CreatePredictor(config)};
+  std::vector<int> input_shape = {1, 1, 28, 28};
+  // All-ones input of shape [1, 1, 28, 28].
+  std::vector<float> input_data(1 * 1 * 28 * 28, 1);
+  std::vector<float> out_data;
+  run(predictor.get(), input_data, input_shape, &out_data);
+  for (auto e : out_data) {
+    LOG(INFO) << e << '\n';
+  }
+  return 0;
+}
diff --git a/c++/custom-operator/custom_relu_op.cc b/c++/custom-operator/custom_relu_op.cc
new file mode 100755
index 0000000000000..6947d0bf1f89a
--- /dev/null
+++ b/c++/custom-operator/custom_relu_op.cc
@@ -0,0 +1,107 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <vector>
+
+#include "paddle/include/experimental/ext_all.h"
+
+template <typename data_t>
+void relu_cpu_forward_kernel(const data_t* x_data,
+                             data_t* out_data,
+                             int64_t x_numel) {
+  for (int i = 0; i < x_numel; ++i) {
+    out_data[i] = std::max(static_cast<data_t>(0.), x_data[i]);
+  }
+}
+
+template <typename data_t>
+void relu_cpu_backward_kernel(const data_t* grad_out_data,
+                              const data_t* out_data,
+                              data_t* grad_x_data,
+                              int64_t out_numel) {
+  for (int i = 0; i < out_numel; ++i) {
+    grad_x_data[i] =
+        grad_out_data[i] * (out_data[i] > static_cast<data_t>(0) ? 1. : 0.);
+  }
+}
+
+std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+
+  out.reshape(x.shape());
+  PD_DISPATCH_FLOATING_TYPES(
+      x.type(), "relu_cpu_forward", ([&] {
+        relu_cpu_forward_kernel<data_t>(
+            x.data<data_t>(), out.mutable_data<data_t>(x.place()), x.size());
+      }));
+
+  return {out};
+}
+
+std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& x,
+                                              const paddle::Tensor& out,
+                                              const paddle::Tensor& grad_out) {
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU);
+  grad_x.reshape(x.shape());
+
+  PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
+                               relu_cpu_backward_kernel<data_t>(
+                                   grad_out.data<data_t>(),
+                                   out.data<data_t>(),
+                                   grad_x.mutable_data<data_t>(x.place()),
+                                   out.size());
+                             }));
+
+  return {grad_x};
+}
+
+std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x);
+std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
+                                               const paddle::Tensor& out,
+                                               const paddle::Tensor& grad_out);
+
+std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
+  // TODO(chenweihang): Check Input
+  if (x.place() == paddle::PlaceType::kCPU) {
+    return relu_cpu_forward(x);
+  } else if (x.place() == paddle::PlaceType::kGPU) {
+    return relu_cuda_forward(x);
+  } else {
+    throw std::runtime_error("Not implemented.");
+  }
+}
+
+std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
+                                         const paddle::Tensor& out,
+                                         const paddle::Tensor& grad_out) {
+  // TODO(chenweihang): Check Input
+  if (x.place() == paddle::PlaceType::kCPU) {
+    return relu_cpu_backward(x, out, grad_out);
+  } else if (x.place() == paddle::PlaceType::kGPU) {
+    return relu_cuda_backward(x, out, grad_out);
+  } else {
+    throw std::runtime_error("Not implemented.");
+  }
+}
+
+PD_BUILD_OP(custom_relu)
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(ReluForward));
+
+PD_BUILD_GRAD_OP(custom_relu)
+    .Inputs({"X", "Out", paddle::Grad("Out")})
+    .Outputs({paddle::Grad("X")})
+    .SetKernelFn(PD_KERNEL(ReluBackward));
diff --git a/c++/custom-operator/custom_relu_op.cu b/c++/custom-operator/custom_relu_op.cu
new file mode 100755
index 0000000000000..df3e8d133d162
--- /dev/null
+++ b/c++/custom-operator/custom_relu_op.cu
@@ -0,0 +1,73 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/include/experimental/ext_all.h"
+
+template <typename data_t>
+__global__ void relu_cuda_forward_kernel(const data_t* x,
+                                         data_t* y,
+                                         const int num) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+    y[i] = max(x[i], static_cast<data_t>(0.));
+  }
+}
+
+template <typename data_t>
+__global__ void relu_cuda_backward_kernel(const data_t* dy,
+                                          const data_t* y,
+                                          data_t* dx,
+                                          const int num) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+    dx[i] = dy[i] * (y[i] > 0 ? 1. : 0.);
+  }
+}
+
+std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kGPU);
+
+  out.reshape(x.shape());
+  int numel = x.size();
+  int block = 512;
+  int grid = (numel + block - 1) / block;
+  // Launch the kernel on the tensor's CUDA stream.
+  PD_DISPATCH_FLOATING_TYPES(
+      x.type(), "relu_cuda_forward_kernel", ([&] {
+        relu_cuda_forward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
+            x.data<data_t>(), out.mutable_data<data_t>(x.place()), numel);
+      }));
+
+  return {out};
+}
+
+std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
+                                               const paddle::Tensor& out,
+                                               const paddle::Tensor& grad_out) {
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU);
+  grad_x.reshape(x.shape());
+
+  int numel = out.size();
+  int block = 512;
+  int grid = (numel + block - 1) / block;
+  PD_DISPATCH_FLOATING_TYPES(
+      out.type(), "relu_cuda_backward_kernel", ([&] {
+        relu_cuda_backward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
+            grad_out.data<data_t>(),
+            out.data<data_t>(),
+            grad_x.mutable_data<data_t>(x.place()),
+            numel);
+      }));
+
+  return {grad_x};
+}
diff --git a/c++/custom-operator/run.sh b/c++/custom-operator/run.sh
new file mode 100755
index 0000000000000..6b7a8642828ba
--- /dev/null
+++ b/c++/custom-operator/run.sh
@@ -0,0 +1,29 @@
+mkdir -p build
+cd build
+rm -rf *
+
+DEMO_NAME=custom_op_test
+
+WITH_MKL=ON
+WITH_GPU=ON
+USE_TENSORRT=OFF
+
+LIB_DIR=/shixiaowei02/Paddle-custom-op-src/Paddle/build/paddle_inference_install_dir
+CUDNN_LIB=/usr/local/cudnn/lib64
+CUDA_LIB=/usr/local/cuda/lib64
+TENSORRT_ROOT=/root/work/nvidia/TensorRT-6.0.1.5.cuda-10.1.cudnn7.6-OSS7.2.1
+CUSTOM_OPERATOR_FILES="custom_relu_op.cc;custom_relu_op.cu"
+
+
+cmake .. -DPADDLE_LIB=${LIB_DIR} \
+  -DWITH_MKL=${WITH_MKL} \
+  -DDEMO_NAME=${DEMO_NAME} \
+  -DWITH_GPU=${WITH_GPU} \
+  -DWITH_STATIC_LIB=OFF \
+  -DUSE_TENSORRT=${USE_TENSORRT} \
+  -DCUDNN_LIB=${CUDNN_LIB} \
+  -DCUDA_LIB=${CUDA_LIB} \
+  -DTENSORRT_ROOT=${TENSORRT_ROOT} \
+  -DCUSTOM_OPERATOR_FILES=${CUSTOM_OPERATOR_FILES}
+
+make -j