diff --git a/c++/custom-operator/CMakeLists.txt b/c++/custom-operator/CMakeLists.txt
new file mode 100755
index 0000000000000..f734267daa5d4
--- /dev/null
+++ b/c++/custom-operator/CMakeLists.txt
@@ -0,0 +1,204 @@
+cmake_minimum_required(VERSION 3.0)
+project(cpp_inference_demo CXX C)
+option(WITH_MKL     "Compile demo with MKL/OpenBlas support, default use MKL." ON)
+option(WITH_GPU     "Compile demo with GPU/CPU, default use CPU."              ON)
+option(USE_TENSORRT "Compile demo with TensorRT."                              ON)
+# CUSTOM_OPERATOR_FILES is a list, not a boolean, so declare it as a cache string rather than an option().
+set(CUSTOM_OPERATOR_FILES "" CACHE STRING "List of file names for custom operators")
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+include(external/boost)
+
+if(WITH_GPU)
+  find_package(CUDA REQUIRED)
+  add_definitions("-DPADDLE_WITH_CUDA")
+endif()
+
+if(NOT WITH_STATIC_LIB)
+  add_definitions("-DPADDLE_WITH_SHARED_LIB")
+else()
+  # PD_INFER_DECL is mainly used to set the dllimport/dllexport attribute in dynamic library mode.
+  # Set it to empty in static library mode to avoid compilation issues.
+  add_definitions("/DPD_INFER_DECL=")
+endif()
+
+macro(safe_set_static_flag)
+  foreach(flag_var
+      CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+      CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+    if(${flag_var} MATCHES "/MD")
+      string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+    endif(${flag_var} MATCHES "/MD")
+  endforeach(flag_var)
+endmacro()
+
+if(NOT DEFINED PADDLE_LIB)
+  message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib")
+endif()
+if(NOT DEFINED DEMO_NAME)
+  message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name")
+endif()
+
+include_directories("${PADDLE_LIB}/")
+set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/")
+include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include")
+include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include")
+include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include")
+include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include")
+
+link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib")
+link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib")
+link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib")
+link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib")
+link_directories("${PADDLE_LIB}/paddle/lib")
+
+if (WIN32)
+  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
+  option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
+  if (MSVC_STATIC_CRT)
+    if (WITH_MKL)
+      set(FLAG_OPENMP "/openmp")
+    endif()
+    set(CMAKE_C_FLAGS_DEBUG     "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
+    set(CMAKE_C_FLAGS_RELEASE   "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
+    set(CMAKE_CXX_FLAGS_DEBUG   "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd ${FLAG_OPENMP}")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT ${FLAG_OPENMP}")
+    safe_set_static_flag()
+    if (WITH_STATIC_LIB)
+      add_definitions(-DSTATIC_LIB)
+    endif()
+  endif()
+else()
+  if(WITH_MKL)
+    set(FLAG_OPENMP "-fopenmp")
+  endif()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ${FLAG_OPENMP}")
+endif()
+
+if(WITH_GPU)
+  if(NOT WIN32)
+    set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library")
+  else()
+    if(CUDA_LIB STREQUAL "")
+      set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64")
+    endif()
+  endif(NOT WIN32)
+endif()
+
+if (USE_TENSORRT AND WITH_GPU)
+  set(TENSORRT_ROOT "" CACHE STRING "The root directory of TensorRT library")
+  if("${TENSORRT_ROOT}" STREQUAL "")
+    message(FATAL_ERROR "The TENSORRT_ROOT is empty, you must assign it a value with CMake command. Such as: -DTENSORRT_ROOT=TENSORRT_ROOT_PATH ")
+  endif()
+  set(TENSORRT_INCLUDE_DIR ${TENSORRT_ROOT}/include)
+  set(TENSORRT_LIB_DIR ${TENSORRT_ROOT}/lib)
+endif()
+
+if (NOT WIN32)
+  if (USE_TENSORRT AND WITH_GPU)
+    include_directories("${TENSORRT_INCLUDE_DIR}")
+    link_directories("${TENSORRT_LIB_DIR}")
+  endif()
+endif(NOT WIN32)
+
+if(WITH_MKL)
+  set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml")
+  include_directories("${MATH_LIB_PATH}/include")
+  if(WIN32)
+    set(MATH_LIB ${MATH_LIB_PATH}/lib/mklml${CMAKE_STATIC_LIBRARY_SUFFIX}
+                 ${MATH_LIB_PATH}/lib/libiomp5md${CMAKE_STATIC_LIBRARY_SUFFIX})
+  else()
+    set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX}
+                 ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX})
+  endif()
+  set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn")
+  if(EXISTS ${MKLDNN_PATH})
+    include_directories("${MKLDNN_PATH}/include")
+    if(WIN32)
+      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib)
+    else(WIN32)
+      set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
+    endif(WIN32)
+  endif()
+else()
+  set(OPENBLAS_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}openblas")
+  include_directories("${OPENBLAS_LIB_PATH}/include/openblas")
+  if(WIN32)
+    set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX})
+  else()
+    set(MATH_LIB ${OPENBLAS_LIB_PATH}/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX})
+  endif()
+endif()
+
+if(WITH_STATIC_LIB)
+  set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
+else()
+  if(WIN32)
+    set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX})
+  else()
+    set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX})
+  endif()
+endif()
+
+if (NOT WIN32)
+  set(EXTERNAL_LIB "-lrt -ldl -lpthread")
+  set(DEPS ${DEPS}
+      ${MATH_LIB} ${MKLDNN_LIB}
+      glog gflags protobuf xxhash
+      ${EXTERNAL_LIB})
+else()
+  set(DEPS ${DEPS}
+      ${MATH_LIB} ${MKLDNN_LIB}
+      glog gflags_static libprotobuf xxhash ${EXTERNAL_LIB})
+  set(DEPS ${DEPS} shlwapi.lib)
+endif(NOT WIN32)
+
+if(WITH_GPU)
+  if(NOT WIN32)
+    if (USE_TENSORRT)
+      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX})
+      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX})
+    endif()
+    set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX})
+  else()
+    if(USE_TENSORRT)
+      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX})
+      set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX})
+    endif()
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX})
+    set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX})
+  endif()
+endif()
+
+# Build the custom operators into a shared library that is linked into the demo.
+cuda_add_library(pd_infer_custom_op ${CUSTOM_OPERATOR_FILES} SHARED)
+add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
+set(DEPS ${DEPS} boost pd_infer_custom_op)
+
+if(WIN32)
+  if(USE_TENSORRT)
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}
+            ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+        COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}
+            ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+    )
+  endif()
+  if(WITH_MKL)
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release
+        COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release
+        COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/Release
+    )
+  else()
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release
+    )
+  endif()
+  if(NOT WITH_STATIC_LIB)
+    add_custom_command(TARGET ${DEMO_NAME} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE}
+    )
+  endif()
+endif()
+
+target_link_libraries(${DEMO_NAME} ${DEPS})
\ No newline at end of file
diff --git a/c++/custom-operator/README.md b/c++/custom-operator/README.md
new file mode 100644
index 0000000000000..cd50fc7b36923
--- /dev/null
+++ b/c++/custom-operator/README.md
@@ -0,0 +1,53 @@
+## Build-and-Run Example for a Custom-Operator Model
+
+### Step 1: Get the custom-operator model used in this example
+Download it from: https://paddle-inference-dist.bj.bcebos.com/inference_demo/custom_operator/custom_relu_infer_model.tgz
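+
+For example, the archive can be fetched and unpacked with standard tools; a minimal sketch, assuming `wget` and `tar` are available and that you run it from the demo directory:
+
+```shell
+wget https://paddle-inference-dist.bj.bcebos.com/inference_demo/custom_operator/custom_relu_infer_model.tgz
+tar xzf custom_relu_infer_model.tgz   # unpacks to ./custom_relu_infer_model/
+```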
+
+### Step 2: Build the sample
+
+The files `custom_relu_op.cc` and `custom_relu_op.cu` are the source files of the custom operator; for how to write a custom operator, see the [PaddlePaddle documentation](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html).
+Note: custom operators are currently an experimental feature. They depend on boost and must be built together with the Paddle Inference library `libpaddle_inference.so`.
+
+The file `custom_op_test.cc` is the sample inference program.
+The file `CMakeLists.txt` is the build script.
+The script `run.sh` holds the configuration of the third-party and prebuilt libraries.
+
+First, we need to edit the configuration in `run.sh`.
+
+1) **Edit `run.sh`**
+
+Open `run.sh` and adjust the following settings:
+
+```shell
+# Check version.txt in the prebuilt inference library to decide whether to enable these three flags
+WITH_MKL=ON
+WITH_GPU=ON
+USE_TENSORRT=OFF
+
+# Root directory of the inference library
+LIB_DIR=${YOUR_LIB_DIR}/paddle_inference_install_dir
+
+# If WITH_GPU or USE_TENSORRT is set to ON above, set the corresponding CUDA, CUDNN and TensorRT paths.
+CUDNN_LIB=/paddle/nvidia-downloads/cudnn_v7.5_cuda10.1/lib64
+CUDA_LIB=/paddle/nvidia-downloads/cuda-10.1/lib64
+# TENSORRT_ROOT=/paddle/nvidia-downloads/TensorRT-6.0.1.5
+```
+
+Run `sh run.sh`; this creates a `build` directory and builds the sample inside it, as sketched below.
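+
+The script boils down to a plain CMake configure-and-build. Roughly (abridged; the variables refer to the settings above, and `run.sh` has the full list of flags):
+
+```shell
+mkdir -p build && cd build
+cmake .. -DPADDLE_LIB=${LIB_DIR} \
+  -DDEMO_NAME=custom_op_test \
+  -DWITH_MKL=${WITH_MKL} \
+  -DWITH_GPU=${WITH_GPU} \
+  -DUSE_TENSORRT=${USE_TENSORRT} \
+  -DCUSTOM_OPERATOR_FILES="custom_relu_op.cc;custom_relu_op.cu"
+make -j
+```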
+
+2) **Run the sample**
+
+```shell
+# enter the build directory
+cd build
+# run the sample
+./custom_op_test
+```
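+
+If the dynamic loader cannot find `libpaddle_inference.so` or the MKL runtime libraries at startup, one common fix is to put their directories on `LD_LIBRARY_PATH` first; a sketch, assuming the default layout of the prebuilt package:
+
+```shell
+export LD_LIBRARY_PATH=${LIB_DIR}/paddle/lib:${LIB_DIR}/third_party/install/mklml/lib:${LD_LIBRARY_PATH}
+./custom_op_test
+```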
+
+When the run finishes, the program prints the model output to the screen, which indicates a successful run.
+
+### More links
+- [Paddle Inference Quick Start](https://paddle-inference.readthedocs.io/en/latest/introduction/quick_start.html)
+- [Paddle Inference C++ API](https://paddle-inference.readthedocs.io/en/latest/api_reference/cxx_api_index.html)
+- [Paddle Inference Python API](https://paddle-inference.readthedocs.io/en/latest/api_reference/python_api_index.html)
diff --git a/c++/custom-operator/cmake/external/boost.cmake b/c++/custom-operator/cmake/external/boost.cmake
new file mode 100644
index 0000000000000..8b4e084af2802
--- /dev/null
+++ b/c++/custom-operator/cmake/external/boost.cmake
@@ -0,0 +1,60 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(ExternalProject)
+
+set(BOOST_PROJECT "extern_boost")
+# To release PaddlePaddle as a pip package, we have to follow the
+# manylinux1 standard, which features as old Linux kernels and
+# compilers as possible and recommends CentOS 5. Indeed, the earliest
+# CentOS version that works with NVIDIA CUDA is CentOS 6. And a new
+# version of boost, say, 1.66.0, doesn't build on CentOS 6. We
+# checked that the devtools package of CentOS 6 installs boost 1.41.0.
+# So we use 1.41.0 here.
+set(BOOST_VER "1.41.0")
+set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
+set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
+
+MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
+
+set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
+set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
+
+set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}" CACHE PATH "boost include directory." FORCE)
+set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
+include_directories(${BOOST_INCLUDE_DIR})
+
+ExternalProject_Add(
+    ${BOOST_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    DOWNLOAD_DIR          ${BOOST_DOWNLOAD_DIR}
+    URL                   ${BOOST_URL}
+    DOWNLOAD_NO_PROGRESS  1
+    PREFIX                ${BOOST_SOURCES_DIR}
+    CONFIGURE_COMMAND     ""
+    BUILD_COMMAND         ""
+    INSTALL_COMMAND       ""
+    UPDATE_COMMAND        ""
+    )
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
+    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+    add_library(boost STATIC ${dummyfile})
+else()
+    add_library(boost INTERFACE)
+endif()
+
+add_dependencies(boost ${BOOST_PROJECT})
+set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
\ No newline at end of file
diff --git a/c++/custom-operator/custom_op_test.cc b/c++/custom-operator/custom_op_test.cc
new file mode 100644
index 0000000000000..7ca5bd6261120
--- /dev/null
+++ b/c++/custom-operator/custom_op_test.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+
+#include <numeric>
+#include <vector>
+
+#include "paddle/include/paddle_inference_api.h"
+
+using paddle_infer::Config;
+using paddle_infer::Predictor;
+using paddle_infer::CreatePredictor;
+
+void run(Predictor *predictor, const std::vector<float> &input,
+         const std::vector<int> &input_shape, std::vector<float> *out_data) {
+  auto input_names = predictor->GetInputNames();
+  auto input_t = predictor->GetInputHandle(input_names[0]);
+  input_t->Reshape(input_shape);
+  input_t->CopyFromCpu(input.data());
+
+  CHECK(predictor->Run());
+
+  auto output_names = predictor->GetOutputNames();
+  auto output_t = predictor->GetOutputHandle(output_names[0]);
+  std::vector<int> output_shape = output_t->shape();
+  // The number of output elements is the product of the output dims.
+  int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
+                                std::multiplies<int>());
+
+  out_data->resize(out_num);
+  output_t->CopyToCpu(out_data->data());
+}
+
+int main() {
+  Config config;
+  config.EnableUseGpu(100, 0);
+  config.SetModel("custom_relu_infer_model/custom_relu.pdmodel",
+                  "custom_relu_infer_model/custom_relu.pdiparams");
+  auto predictor{CreatePredictor(config)};
+  std::vector<int> input_shape = {1, 1, 28, 28};
+  // All-ones input of shape [1, 1, 28, 28].
+  std::vector<float> input_data(1 * 1 * 28 * 28, 1);
+  std::vector<float> out_data;
+  run(predictor.get(), input_data, input_shape, &out_data);
+  for (auto e : out_data) {
+    LOG(INFO) << e << '\n';
+  }
+  return 0;
+}
diff --git a/c++/custom-operator/custom_relu_op.cc b/c++/custom-operator/custom_relu_op.cc
new file mode 100755
index 0000000000000..6947d0bf1f89a
--- /dev/null
+++ b/c++/custom-operator/custom_relu_op.cc
@@ -0,0 +1,107 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <vector>
+
+#include "paddle/include/experimental/ext_all.h"
+
+template <typename data_t>
+void relu_cpu_forward_kernel(const data_t* x_data,
+                             data_t* out_data,
+                             int64_t x_numel) {
+  for (int i = 0; i < x_numel; ++i) {
+    out_data[i] = std::max(static_cast<data_t>(0.), x_data[i]);
+  }
+}
+
+template <typename data_t>
+void relu_cpu_backward_kernel(const data_t* grad_out_data,
+                              const data_t* out_data,
+                              data_t* grad_x_data,
+                              int64_t out_numel) {
+  for (int i = 0; i < out_numel; ++i) {
+    grad_x_data[i] =
+        grad_out_data[i] * (out_data[i] > static_cast<data_t>(0) ? 1. : 0.);
+  }
+}
+
+std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+
+  out.reshape(x.shape());
+  PD_DISPATCH_FLOATING_TYPES(
+      x.type(), "relu_cpu_forward", ([&] {
+        relu_cpu_forward_kernel<data_t>(
+            x.data<data_t>(), out.mutable_data<data_t>(x.place()), x.size());
+      }));
+
+  return {out};
+}
+
+std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& x,
+                                              const paddle::Tensor& out,
+                                              const paddle::Tensor& grad_out) {
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU);
+  grad_x.reshape(x.shape());
+
+  PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
+                               relu_cpu_backward_kernel<data_t>(
+                                   grad_out.data<data_t>(),
+                                   out.data<data_t>(),
+                                   grad_x.mutable_data<data_t>(x.place()),
+                                   out.size());
+                             }));
+
+  return {grad_x};
+}
+
+std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x);
+std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
+                                               const paddle::Tensor& out,
+                                               const paddle::Tensor& grad_out);
+
+std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
+  // TODO(chenweihang): Check Input
+  if (x.place() == paddle::PlaceType::kCPU) {
+    return relu_cpu_forward(x);
+  } else if (x.place() == paddle::PlaceType::kGPU) {
+    return relu_cuda_forward(x);
+  } else {
+    throw std::runtime_error("Not implemented.");
+  }
+}
+
+std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
+                                         const paddle::Tensor& out,
+                                         const paddle::Tensor& grad_out) {
+  // TODO(chenweihang): Check Input
+  if (x.place() == paddle::PlaceType::kCPU) {
+    return relu_cpu_backward(x, out, grad_out);
+  } else if (x.place() == paddle::PlaceType::kGPU) {
+    return relu_cuda_backward(x, out, grad_out);
+  } else {
+    throw std::runtime_error("Not implemented.");
+  }
+}
+
+PD_BUILD_OP(custom_relu)
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(ReluForward));
+
+PD_BUILD_GRAD_OP(custom_relu)
+    .Inputs({"X", "Out", paddle::Grad("Out")})
+    .Outputs({paddle::Grad("X")})
+    .SetKernelFn(PD_KERNEL(ReluBackward));
diff --git a/c++/custom-operator/custom_relu_op.cu b/c++/custom-operator/custom_relu_op.cu
new file mode 100755
index 0000000000000..df3e8d133d162
--- /dev/null
+++ b/c++/custom-operator/custom_relu_op.cu
@@ -0,0 +1,73 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/include/experimental/ext_all.h"
+
+template <typename data_t>
+__global__ void relu_cuda_forward_kernel(const data_t* x,
+                                         data_t* y,
+                                         const int num) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+    y[i] = max(x[i], static_cast<data_t>(0.));
+  }
+}
+
+template <typename data_t>
+__global__ void relu_cuda_backward_kernel(const data_t* dy,
+                                          const data_t* y,
+                                          data_t* dx,
+                                          const int num) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+    dx[i] = dy[i] * (y[i] > 0 ? 1. : 0.);
+  }
+}
+
+std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kGPU);
+
+  out.reshape(x.shape());
+  int numel = x.size();
+  int block = 512;
+  int grid = (numel + block - 1) / block;
+  // Launch the kernel on the tensor's CUDA stream.
+  PD_DISPATCH_FLOATING_TYPES(
+      x.type(), "relu_cuda_forward_kernel", ([&] {
+        relu_cuda_forward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
+            x.data<data_t>(), out.mutable_data<data_t>(x.place()), numel);
+      }));
+
+  return {out};
+}
+
+std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
+                                               const paddle::Tensor& out,
+                                               const paddle::Tensor& grad_out) {
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU);
+  grad_x.reshape(x.shape());
+
+  int numel = out.size();
+  int block = 512;
+  int grid = (numel + block - 1) / block;
+  PD_DISPATCH_FLOATING_TYPES(
+      out.type(), "relu_cuda_backward_kernel", ([&] {
+        relu_cuda_backward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
+            grad_out.data<data_t>(),
+            out.data<data_t>(),
+            grad_x.mutable_data<data_t>(x.place()),
+            numel);
+      }));
+
+  return {grad_x};
+}
diff --git a/c++/custom-operator/run.sh b/c++/custom-operator/run.sh
new file mode 100755
index 0000000000000..6b7a8642828ba
--- /dev/null
+++ b/c++/custom-operator/run.sh
@@ -0,0 +1,29 @@
+mkdir -p build
+cd build
+rm -rf *
+
+DEMO_NAME=custom_op_test
+
+WITH_MKL=ON
+WITH_GPU=ON
+USE_TENSORRT=OFF
+
+LIB_DIR=/shixiaowei02/Paddle-custom-op-src/Paddle/build/paddle_inference_install_dir
+CUDNN_LIB=/usr/local/cudnn/lib64
+CUDA_LIB=/usr/local/cuda/lib64
+TENSORRT_ROOT=/root/work/nvidia/TensorRT-6.0.1.5.cuda-10.1.cudnn7.6-OSS7.2.1
+CUSTOM_OPERATOR_FILES="custom_relu_op.cc;custom_relu_op.cu"
+
+
+cmake .. -DPADDLE_LIB=${LIB_DIR} \
+  -DWITH_MKL=${WITH_MKL} \
+  -DDEMO_NAME=${DEMO_NAME} \
+  -DWITH_GPU=${WITH_GPU} \
+  -DWITH_STATIC_LIB=OFF \
+  -DUSE_TENSORRT=${USE_TENSORRT} \
+  -DCUDNN_LIB=${CUDNN_LIB} \
+  -DCUDA_LIB=${CUDA_LIB} \
+  -DTENSORRT_ROOT=${TENSORRT_ROOT} \
+  -DCUSTOM_OPERATOR_FILES=${CUSTOM_OPERATOR_FILES}
+
+make -j