diff --git a/benchmark/fluid/mnist.py b/benchmark/fluid/mnist.py
index 43866da9cb113..dc10ac2ec195a 100644
--- a/benchmark/fluid/mnist.py
+++ b/benchmark/fluid/mnist.py
@@ -139,9 +139,6 @@ def run_benchmark(model, args):
 
     # inference program
     inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            target_vars=[batch_acc, batch_size_tensor])
 
     # Optimization
     opt = fluid.optimizer.AdamOptimizer(
@@ -161,7 +158,7 @@ def run_benchmark(model, args):
     train_reader = paddle.batch(
         paddle.dataset.mnist.train(), batch_size=args.batch_size)
 
-    accuracy = fluid.average.WeightedAverage()
+    accuracy = fluid.metrics.Accuracy()
     iters, num_samples, start_time = 0, 0, time.time()
     for pass_id in range(args.pass_num):
         accuracy.reset()
@@ -184,7 +181,7 @@ def run_benchmark(model, args):
                       "label": y_data},
                 fetch_list=[avg_cost, batch_acc, batch_size_tensor]
             )  # The accuracy is the accumulation of batches, but not the current batch.
-            accuracy.add(value=outs[1], weight=outs[2])
+            accuracy.update(value=outs[1], weight=outs[2])
             iters += 1
             num_samples += len(y_data)
             loss = np.array(outs[0])
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 6320b17520a68..52a22c1fbf477 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -62,29 +62,33 @@ endif()
 
 
 ## Then find the reference-cblas.  www.netlib.org/blas/
-
-
 set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
   "Folder contains reference-cblas")
-set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
-  ${REFERENCE_CBLAS_ROOT}/include
-  /usr/include
-  /usr/include/cblas
-)
-
-set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
-  ${REFERENCE_CBLAS_ROOT}/lib
-  /usr/lib
-  /usr/lib/blas/reference/
-  /usr/lib/reference/
-)
+if(NOT CMAKE_CROSSCOMPILING)
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
+    ${REFERENCE_CBLAS_ROOT}/include
+    /usr/include
+    /usr/include/cblas
+  )
+
+  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
+    ${REFERENCE_CBLAS_ROOT}/lib
+    /usr/lib
+    /usr/lib/blas/reference/
+    /usr/lib/reference/
+  )
+else()
+  # Diable the finding of reference cblas under host's system path
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
+  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
+endif()
 
 find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
         ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
 find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
         ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
 
-if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
+if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
   set(CBLAS_FOUND ON)
   set(CBLAS_PROVIDER REFERENCE)
   set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index 0853b981813c5..aa24915947077 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -24,16 +24,16 @@ SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
 SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
 SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
 IF(APPLE)
-  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
 ELSE()
-  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin)
 ENDIF()
 
 ExternalProject_Add(
     extern_grpc
     DEPENDS protobuf zlib
     GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.8.x"
+    GIT_TAG "v1.11.x"
     PREFIX          ${GRPC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake
deleted file mode 100644
index af5c689c35247..0000000000000
--- a/cmake/external/nccl.cmake
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-if(NOT WITH_GPU)
-  return()
-endif()
-
-include(ExternalProject)
-
-set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
-
-include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
-
-if(WITH_DSO)
-  # If we use DSO, we do not build nccl, just download the dependencies
-  set(NCCL_BUILD_COMMAND "")
-  set(NCCL_INSTALL_COMMAND "")
-  set(NCCL_INSTALL_DIR "")
-else()
-  # otherwise, we build nccl and link it.
-  set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
-  # Note: cuda 8.0 is needed to make nccl
-  # When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root
-  set(NCCL_BUILD_COMMAND "make -j 8")
-  set(NCCL_INSTALL_COMMAND  "make install PREFIX=${NCCL_INSTALL_DIR}")
-endif()
-
-ExternalProject_Add(
-    extern_nccl
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/NVIDIA/nccl.git"
-    GIT_TAG         "v1.3.4-1"
-    PREFIX          "${NCCL_SOURCE_DIR}"
-    UPDATE_COMMAND  ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     "${NCCL_BUILD_COMMAND}"
-    INSTALL_COMMAND   "${NCCL_INSTALL_COMMAND}"
-    INSTALL_DIR       "${NCCL_INSTALL_DIR}"
-    TEST_COMMAND      ""
-)
-
-if(WITH_DSO)
-  if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
-    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";")
-    add_library(nccl STATIC ${dummyfile})
-  else()
-    add_library(nccl INTERFACE)
-  endif()
-else()
-  add_library(nccl STATIC IMPORTED GLOBAL)
-  set_property(TARGET nccl PROPERTY IMPORTED_LOCATION
-               ${NCCL_INSTALL_DIR}/lib/libnccl_static.a)
-endif()
-
-add_dependencies(nccl extern_nccl)
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
index 71f54c425d4c3..80282329c6ac6 100644
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -11,19 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
 
-IF(MOBILE_INFERENCE)
+if(MOBILE_INFERENCE OR RPI)
     return()
-ENDIF()
+endif()
 
 include (ExternalProject)
 
 # NOTE: snappy is needed when linking with recordio
 
-SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
-SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
-SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE)
+set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
+set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
+set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
+
+set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
 
 ExternalProject_Add(
     extern_snappy
@@ -51,8 +52,7 @@ ExternalProject_Add(
 )
 
 add_library(snappy STATIC IMPORTED GLOBAL)
-set_property(TARGET snappy PROPERTY IMPORTED_LOCATION
-             "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
 
 include_directories(${SNAPPY_INCLUDE_DIR})
 add_dependencies(snappy extern_snappy)
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
index 8f7a3bf8eeaef..20a96430823d0 100644
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -11,9 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
 
-IF(MOBILE_INFERENCE)
+IF(MOBILE_INFERENCE OR RPI)
     return()
 ENDIF()
 
@@ -21,9 +20,11 @@ include (ExternalProject)
 
 # NOTE: snappy is needed when linking with recordio
 
-SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
-SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
-SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE)
+set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
+set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
+set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
+
+set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
 
 ExternalProject_Add(
         extern_snappystream
@@ -51,8 +52,7 @@ ExternalProject_Add(
 )
 
 add_library(snappystream STATIC IMPORTED GLOBAL)
-set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
-        "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
+set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
 
 include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers.
 include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index c4c9f77df8d57..1d3e2ade6d393 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -195,14 +195,7 @@ function(cc_library TARGET_NAME)
         list(REMOVE_ITEM cc_library_DEPS warpctc)
         add_dependencies(${TARGET_NAME} warpctc)
       endif()
-      if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
-        # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
-        # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
-        target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-        list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
-      else()
-        target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-      endif()
+      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
       add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
     endif()
     
@@ -243,11 +236,7 @@ function(cc_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
-    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
-    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
-      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
-    endif()
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 0323cd9698cba..cc758019827b9 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -1,7 +1,22 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 set_property(GLOBAL PROPERTY FLUID_MODULES "")
 # find all fluid modules is used for paddle fluid static library
 function(find_fluid_modules TARGET_NAME)
   get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
   string(FIND "${__target_path}" "fluid" pos)
   if(pos GREATER 1)
     get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
@@ -77,6 +92,23 @@ elseif (WITH_MKLML)
     )
 endif()
 
+if(NOT MOBILE_INFERENCE AND NOT RPI)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
+  copy(snappy_lib
+    SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream")
+  copy(snappystream_lib
+    SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib")
+  copy(zlib_lib
+    SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+endif()
+
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
diff --git a/doc/fluid/design/motivation/fluid.md b/doc/fluid/design/motivation/fluid.md
index 5e147f8263e68..4b7696cc1bbf5 100644
--- a/doc/fluid/design/motivation/fluid.md
+++ b/doc/fluid/design/motivation/fluid.md
@@ -119,7 +119,7 @@ An actual Fluid example is described  [here](https://github.com/PaddlePaddle/Pad
 
 From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
 
-We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/execution/if_else_op.md) structure of Fluid.
 
 ## Turing Completeness
 
diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst
index 23615f8830e99..4231f2bb5cd80 100644
--- a/doc/v2/dev/write_docs_cn.rst
+++ b/doc/v2/dev/write_docs_cn.rst
@@ -65,39 +65,55 @@ PaddlePaddle.org工具可以配合Docker使用，需要在系统里先安装好D
 不使用PaddlePaddle.org工具
 --------------------------
 
-使用Docker构建PaddlePaddle的文档，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后可以使用源码目录下的脚本构建文档，即
+使用Docker构建PaddlePaddle的文档，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。该方法与 `从源码编译PaddlePaddle <http://paddlepaddle.org/docs/develop/documentation/zh/build_and_install/build_from_source_cn.html>`_ 相似，通过从源码中构建可用于编译PaddlePaddle文档的Docker镜像并运行，在进入Docker容器后使用源码中的脚本构建PaddlePaddle文档，具体步骤如下：
 
-[TBD]
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+
+   # 从源码中构建可用于编译PaddlePaddle文档的Docker镜像
+   docker build -t paddle:dev .
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+   # 进入Docker容器后使用build.sh脚本构建PaddlePaddle文档
+   bash -x /paddle/paddle/scripts/docker/build.sh
+
+注：上述命令把当前目录（源码根目录）映射为 container 里的 :code:`/paddle` 目录。
+
+编译完成后，会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录，在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 、 ``api/en/html`` 共三个子目录，分别进入这些目录下，执行以下命令：
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。
 
 如果不想使用Docker，也可以使用以下命令直接构建PaddlePaddle文档，即
 
 .. code-block:: bash
 
-   mkdir paddle
-   cd paddle
    git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
    mkdir -p build
    cd build
    cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 
    # 如果只需要构建使用文档，则执行以下命令
-   make -j $processors gen_proto_py
-   make -j $processors paddle_docs paddle_docs_cn
+   make -j $processors paddle_docs
 
    # 如果只需要构建API，则执行以下命令
-   make -j $processors gen_proto_py framework_py_proto
-   make -j $processors copy_paddle_pybind
-   make -j $processors paddle_api_docs
+   make -j $processors paddle_apis
 
 其中$processors代表启动和CPU核一样多的进程来并行编译，可以根据本机的CPU核数设置相应的值。
 
-编译完成后，进入 ``doc/v2`` 目录，如果选择构建文档则会在该目录下生成 ``cn/html/`` 、 ``en/html`` 两个子目录，选择构建API则会生成 ``api/en/html`` 目录，分别进入这些目录下，执行以下命令：
+编译完成后，同样会产生 ``doc/v2`` 和 ``doc/fluid`` 两个目录，如果选择构建文档则会在这两个目录下分别都生成 ``cn/html/`` 、 ``en/html`` 两个子目录，选择构建API则会在这两个目录下分别生成 ``api/en/html`` 目录，分别进入这些子目录下，执行以下命令：
 
 .. code-block:: bash
 
    python -m SimpleHTTPServer 8088
 
-在浏览器中输入 http://localhost:8088 就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意，示例中由于使用了sphinx的原始主题，所以页面的风格与官网并不一致，但这并不影响开发者进行调试。
+在浏览器中输入 http://localhost:8088 就可以看到编译生成的 ``v2`` 和 ``fluid`` 两种版本的中/英文的文档页面和英文的API页面。下图为生成的 ``v2`` 英文文档首页示例。注意，示例中由于使用了sphinx的原始主题，所以页面的风格与官网并不一致，但这并不影响开发者进行调试。
 
 ..  image:: src/doc_en.png
     :align: center
diff --git a/doc/v2/dev/write_docs_en.rst b/doc/v2/dev/write_docs_en.rst
index 15ff0d34ad622..6105455e202e4 100644
--- a/doc/v2/dev/write_docs_en.rst
+++ b/doc/v2/dev/write_docs_en.rst
@@ -68,39 +68,56 @@ Please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develo
 Manually Building the Documentation
 -------------------------------------
 
-Build PaddlePaddle's documentation with Docker，you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. After Docker is installed, you could use the scripts in the source directory to build the documentation.
+Build PaddlePaddle's documentation with Docker，you need to install Docker first. Please refer to `Docker's official website <https://docs.docker.com/>`_ on how to install Docker. This method is quite similar to ` Build From Sources <http://paddlepaddle.org/docs/develop/documentation/en/build_and_install/build_from_source_en.html>`_ , by constructing, from source code, a docker image that can be used to build PaddlePaddle documentation. Enter the Docker container and use the script ``build.sh`` in the source directory to build the PaddlePaddle documentation. The specific steps are as follows:
 
-[TBD]
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+
+   # Construct a docker image from source code
+   docker build -t paddle:dev .
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" -e "WITH_DOC=ON" paddle:dev /bin/bash
+
+   # Use build.sh to build PaddlePaddle documentation
+   bash -x /paddle/paddle/scripts/docker/build.sh
+
+Note: The above commands maps the current directory (source root directory) to the :code:`/paddle` directory in the container.
+
+After compiling, there should be two generated directories: ``doc/v2`` and ``doc/fluid``, where three subdirectories ``cn/html/``, ``en/html`` and ``api/en/html`` are generated. Please enter these directories respectively and execute the following commands:
+
+.. code-block:: bash
+
+   python -m SimpleHTTPServer 8088
+
+Use a web browser and navigate to http://localhost:8000, you could see the compiled  ``v2`` 's and ``fluid`` 's Chinese/English documents page and English APIs page.
 
 If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation.
 
 .. code-block:: bash
 
-   mkdir paddle
-   cd paddle
+
    git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
    mkdir -p build
    cd build
    cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 
    # If you only need to build documents, use the following commands
-   make -j $processors gen_proto_py
-   make -j $processors paddle_docs paddle_docs_cn
+   make -j $processors paddle_docs
 
    # If you only need to build APIs, use the following commands
-   make -j $processors gen_proto_py framework_py_proto
-   make -j $processors copy_paddle_pybind
-   make -j $processors paddle_api_docs
+   make -j $processors paddle_apis
 
 $processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine.
 
-After the compilation is complete, enter the ``doc/v2`` directory. If you chose to build documents, it will generate ``cn/html/`` and ``en/html`` subdirectories under this directory. If you chose to build APIs，it will generate``api/en/html`` subdirectory. Please enter these directories respectively and execute the following commands:
+After compiling, there also should be two generated directories: ``doc/v2`` and ``doc/fluid`` . If you chose to build documents, two subdirectories ``cn/html/`` and ``en/html``  will be generated in both two directories. If you chose to build APIs，a subdirectory ``api/en/html`` will be generated. Please enter these directories respectively and execute the following commands:
 
 .. code-block:: bash
 
    python -m SimpleHTTPServer 8088
 
-Use a web browser and navigate to http://localhost:8000, you could see the compiled Chinese/English documents page and the English APIs page. The following figure is an example of the built English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
+Use a web browser and navigate to http://localhost:8000, you could see the compiled  ``v2`` 's and ``fluid`` 's Chinese/English documents page and English APIs page. The following figure is an example of the built ``v2`` 's English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging.
 
 ..  image:: src/doc_en.png
     :align: center
diff --git a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
index bc3d50b3ffd3b..dee1b7554f97a 100644
--- a/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
+++ b/doc/v2/howto/cluster/multi_cluster/k8s_distributed_en.md
@@ -1,3 +1,372 @@
-# Kubernetes Distributed
+# Distributed Training on Kubernetes
 
-TBD
+We introduced how to create a PaddlePaddle Job with a single node on Kuberentes in the
+previous document.
+In this article, we will introduce how to create a PaddlePaddle job with multiple nodes
+on Kubernetes cluster.
+
+## Overall Architecture
+
+Before creating a training job, the users need to slice the training data and deploy
+the Python scripts along with it into the distributed file system
+(We can use the different type of Kuberentes Volumes to mount different distributed
+file systems). Before training starts, The program will copy the training data into the
+Container and also save the models at the same path during training. The global architecture
+is as follows:
+
+![PaddlePaddle on Kubernetes Architecture](src/k8s-paddle-arch.png)
+
+The above figure describes a distributed training architecture which contains 3 nodes, each 
+Pod mounts a folder of the distributed file system to save training data and models
+by Kubernetes Volume. Kubernetes created 3 Pods for this training phase and scheduled these on
+3 nodes, each Pod has a PaddlePaddle container. After the containers car created,
+PaddlePaddle starts up the communication between PServer and Trainer and read training
+data for this training job.
+
+As the description above, we can start up a PaddlePaddle distributed training job on a 
+Kubernetes ready cluster with the following steps:
+
+1. [Build PaddlePaddle Docker Image](#Build a Docker Image)
+1. [Split training data and upload to the distributed file system](#Upload Training Data)
+1. [Edit a YAML file and create a Kubernetes Job](#Create a Job)
+1. [Check the output](#Check The Output)
+
+We will introduce these steps as follows:
+
+### Build a Docker Image
+
+Training docker image needs to package the paddle pserver and paddle trainer runtimes, as well as two more processes before we can kick off the training:
+
+- Copying the training data into container.
+- Generating the initialization arguments for `Paddle PServer` and `Paddle Training` processes.
+
+Since the paddlepaddle official docker image already has the runtimes we need, we'll take it as the base image and pack some additional scripts for the processes mentioned above to build our training image. for more detail, please find from the following link:
+- https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile
+
+
+```bash
+$ cd doc/howto/usage/k8s/src/k8s_train
+$ docker build -t [YOUR_REPO]/paddle:mypaddle .
+```
+
+And then upload the new Docker Image to a Docker hub:
+
+```bash
+docker push  [YOUR_REPO]/paddle:mypaddle
+```
+
+**[NOTE]**, in the above command arguments, `[YOUR_REPO]` represents your Docker repository,
+you need to use your repository instead of it. We will replace it with your respository name to
+represent the Docker Image which built in this step.
+
+### Prepare Training Data
+
+We can download and split the training job by creating a Kubernetes Job, or custom your image
+by editing [k8s_train](./src/k8s_train/).
+
+Before creating a Job, we need to bind a [persistenVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes) by the different type of
+the different file system, the generated dataset would be saved on this volume.
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      hostNetwork: true
+      containers:
+      - name: paddle-data
+        image: paddlepaddle/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/mnt"
+          name: nfs
+        env:
+        - name: OUT_DIR
+          value: /home/work/mfs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: nfs
+          persistentVolumeClaim:
+            claimName: mfs
+      restartPolicy: Never
+```
+
+Create the Job with the following command:
+
+```bash
+> kubectl create -f xxx.yaml
+```
+
+If created successfully, you can see some information like this:
+
+```base
+[root@paddle-kubernetes-node0 nfsdir]$ tree -d
+.
+`-- paddle-cluster-job
+    |-- 0
+    |   `-- data
+    |-- 1
+    |   `-- data
+    |-- 2
+    |   `-- data
+    |-- output
+    |-- quick_start
+```
+
+The `paddle-cluster-job` above is the job name for this training job; we need 3
+PaddlePaddle training nodes and save the split training data in `paddle-cluster-job` path,
+the folder `0`, `1` and `2` represents the `training_id` on each node, `quick_start` folder is used to store training data, `output` folder is used to store the models and logs.
+
+
+### Create a Job
+
+Kubernetes allow users to create objects with YAML files, and we can use a command-line tool
+to create it.
+
+The Job YAML file describes that which Docker Image would be used in this training job, how much nodes would be created, what's the startup arguments of `Paddle PServer/Trainer` process and what's the type of Volumes. You can find the details of the YAML filed in
+[Kubernetes Job API](http://kubernetes.io/docs/api-reference/batch/v1/definitions/#_v1_job).
+The following is an example for this training job:
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: jobpath
+        hostPath:
+          path: /home/work/mfs
+      containers:
+      - name: trainer
+        image: [YOUR_REPO]/paddle:mypaddle
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: recommendation
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        volumeMounts:
+        - name: jobpath
+          mountPath: /home/jobpath
+      restartPolicy: Never
+```
+
+In the above YAML file:
+- `metadata.name`, The job name.
+- `parallelism`, Whether the Kubernetes Job would create `parallelism` Pods at the same time.
+- `completions`, The Job would become the success status only when the number of successful Pod(the exit code is 0)
+  is equal to `completions`.
+- `volumeMounts`, the name field `jobpath` is a key, the `mountPath` field represents
+  the path in the container, and we can define the `jobpath` in `volumes` filed, use `hostPath`
+  to configure the host path we want to mount.
+- `env`, the environment variables in the Container, we pass some startup arguments by
+  this approach, some details are as following:
+  - JOB_PATH：the mount path in the container
+  - JOB_NAME：the job name
+  - TRAIN_CONFIG_DIR：the job path in the container, we can find the training data path by
+    combine with JOB_NAME.
+  - CONF_PADDLE_NIC: the argument `--nics` of `Paddle PServer` process, the network
+    device name.
+  - CONF_PADDLE_PORT: the argument `--port` of `Paddle PServer` process.
+  - CONF_PADDLE_PORTS_NUM: the argument `--ports_num` of `Paddle PServer`, the port number
+    for dense prameter update. 
+  - CONF_PADDLE_PORTS_NUM_SPARSE：the argument `--ports_num_for_sparse` of `Paddle PServer`,
+    the port number for sparse parameter update.
+  - CONF_PADDLE_GRADIENT_NUM：the number of training node, the argument 
+  `--num_gradient_servers` of `Paddle PServer` and `Paddle Trainer`.
+
+You can find some details information at [here]
+(http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html)。
+
+We can use the command-line tool of Kubernetes to create a Job when we finish the YAML file:
+
+```bash
+kubectl create -f job.yaml
+```
+
+Upon successful creation, Kubernetes would create 3 Pods as PaddlePaddle training node,
+pull the Docker image and begin to train.
+
+
+### Checkout the Output
+
+At the process of training, we can check the logs and the output models which is stored in
+the `output` folder.
+
+**NOTE**, `node_0`, `node_1` and `node_2` represent the
+`trainer_id` of the PaddlePaddle training job rather than the node id of Kubernetes.
+
+```bash
+[root@paddle-kubernetes-node0 output]# tree -d
+.
+├── node_0
+│   ├── server.log
+│   └── train.log
+├── node_1
+│   ├── server.log
+│   └── train.log
+├── node_2
+......
+├── pass-00002
+│   ├── done
+│   ├── ___embedding_0__.w0
+│   ├── ___embedding_1__.w0
+......
+```
+
+We can checkout the status of each training Pod by viewing the logs:
+
+```bash
+[root@paddle-kubernetes-node0 node_0]# cat train.log
+I1116 09:10:17.123121    50 Util.cpp:155] commandline:
+ /usr/local/bin/../opt/paddle/bin/paddle_trainer
+    --nics=eth0 --port=7164
+    --ports_num=2 --comment=paddle_process_by_paddle
+    --pservers=192.168.129.66,192.168.223.143,192.168.129.71
+    --ports_num_for_sparse=2 --config=./trainer_config.py
+    --trainer_count=4 --num_passes=10 --use_gpu=0
+    --log_period=50 --dot_period=10 --saving_period=1
+    --local=0 --trainer_id=0
+    --save_dir=/home/jobpath/paddle-cluster-job/output
+I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
+I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
+[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
+[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__]
+I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
+I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.681543    50 GradientMachine.cpp:134] Initing parameters..
+I1116 09:10:18.012390    50 GradientMachine.cpp:141] Init parameters done.
+I1116 09:10:18.018641    50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
+I1116 09:10:18.018950    50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
+I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
+I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
+I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
+I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
+```
+
+## Some Additional Details
+
+### Using Environment Variables
+
+Usually we use the environment varialbes to configurate the PaddlePaddle Job which runs in
+Kubernetes, `start_paddle.py` provides a start up script to convert the environment variable
+to the start up arguments of PaddlePaddle process:
+
+```bash
+API = "/api/v1/namespaces/"
+JOBSELECTOR = "labelSelector=job-name="
+JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
+JOB_PATH_OUTPUT = JOB_PATH + "/output"
+JOBNAME = os.getenv("JOB_NAME")
+NAMESPACE = os.getenv("JOB_NAMESPACE")
+PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
+PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
+PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
+PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
+PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
+```
+
+### Communication between Pods
+
+At the begin of `start_paddle.py`, it would initializes and parses the arguments.
+
+```python
+parser = argparse.ArgumentParser(prog="start_paddle.py",
+                                     description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+```
+
+And then query the status of all the other Pods of this Job by the function `getPodList()`, and fetch `triner_id` by the function `getIdMap(podlist)` if all the Pods status is `RUNNING`.
+
+```python
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+```
+
+**NOTE**: `getPodList()` would prefetch all the Pods in the current namespace, if some 
+Pods are alreay running, it may cause some error. We will use [statfulesets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets) instead of
+Kubernetes Pod or Replicaset in the future.
+
+The function `getIdMap(podlist)` fetches IPs addresses of `podlist` and then sort them
+to generate `trainer_id`.
+
+```python
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+
+After getting the `idMap`, we can generate the arguments of `Paddle PServer` and `Paddle Trainer`
+so that we can start up them by `startPaddle(idMap, train_args_dict)`.
+
+### Create Job
+
+The main goal of `startPaddle` is generating the arguments of `Paddle PServer` and
+`Paddle Trainer` processes. Take `Paddle Trainer` as an example, we parse the
+environment variable and then get `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM` and etc...,
+finally find `trainerId` from `idMap` according to its IP address.
+
+```python
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index c44f8a8a8ecc1..8b1ca5e165483 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -24,6 +24,6 @@ if(NOT WITH_FLUID_ONLY)
 endif()
 
 add_subdirectory(testing)
-if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS)
+if(NOT MOBILE_INFERENCE AND NOT RPI)
   add_subdirectory(fluid)
 endif()
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index d725763b01d59..d274d96c29bdb 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -3,6 +3,7 @@ add_subdirectory(platform)
 add_subdirectory(framework)
 add_subdirectory(operators)
 add_subdirectory(pybind)
-add_subdirectory(inference)
 add_subdirectory(string)
 add_subdirectory(recordio)
+# NOTE: please add subdirectory inference at last.
+add_subdirectory(inference)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 3840bbe83b68d..1f3ca24df16cf 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -79,14 +79,12 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
     COMMENT "Copy generated python proto into directory paddle/fluid/proto."
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
-cc_library(backward SRCS backward.cc DEPS net_op)
-cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
 
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-framework_proto backward glog lod_rank_table feed_fetch_method)
+framework_proto glog lod_rank_table feed_fetch_method)
 
 
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)
diff --git a/paddle/fluid/framework/backward.cc b/paddle/fluid/framework/backward.cc
deleted file mode 100644
index 1314af2b3dab2..0000000000000
--- a/paddle/fluid/framework/backward.cc
+++ /dev/null
@@ -1,585 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/backward.h"
-#include "paddle/fluid/operators/net_op.h"
-
-#include <deque>
-#include <list>
-#include <memory>
-#include <unordered_set>
-
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/net_op.h"
-
-namespace paddle {
-namespace framework {
-
-static std::unordered_set<std::string>* g_ctrl_flow_ops_ = nullptr;
-// Control Flow operators's backward is significantly different from
-// computational operators. Hack Code here.
-// We should design a better way to backward CtrlFlowOps.
-static std::unordered_set<std::string>& CtrlFlowOps() {
-  if (g_ctrl_flow_ops_ == nullptr) {
-    g_ctrl_flow_ops_ = new std::unordered_set<std::string>{
-        "increment", "lod_rank_table", "less_than"};
-  }
-  return *g_ctrl_flow_ops_;
-}
-
-static inline std::unique_ptr<OperatorBase> CreateGradOp(
-    const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
-    std::unordered_map<std::string, std::string>* grad_to_var) {
-  OpDesc op_desc;
-  op_desc.SetInputMap(op.Inputs());
-  op_desc.SetOutputMap(op.Outputs());
-  op_desc.SetType(op.Type());
-  op_desc.SetAttrMap(op.Attrs());
-  auto& info = OpInfoMap::Instance().Get(op.Type());
-  auto grad_descs = info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, {});
-  std::vector<std::unique_ptr<OperatorBase>> grad_ops;
-  grad_ops.reserve(grad_descs.size());
-  std::transform(grad_descs.begin(), grad_descs.end(),
-                 std::back_inserter(grad_ops),
-                 [](const std::unique_ptr<OpDesc>& grad_desc) {
-                   return OpRegistry::CreateOp(*grad_desc);
-                 });
-  PADDLE_ENFORCE(!grad_ops.empty());
-  if (grad_ops.size() == 1) {
-    return std::move(grad_ops[0]);
-  } else {
-    auto net_op = new operators::NetOp();
-    for (auto& grad_op : grad_ops) {
-      net_op->AppendOp(std::move(grad_op));
-    }
-    net_op->CompleteAddOp();
-    return std::unique_ptr<OperatorBase>(net_op);
-  }
-}
-
-template <typename Map, typename T>
-static void ForEachVarName(const Map& names, T callback) {
-  for (auto& name : names) {
-    for (auto& n : name.second) {
-      if (callback(n)) return;
-    }
-  }
-}
-
-// return whether all the names + suffixes in the set
-static bool AllInSet(
-    const std::map<std::string, std::vector<std::string>>& names,
-    const std::string& suffix, const std::unordered_set<std::string>& set) {
-  bool all_in_set = true;
-  ForEachVarName(names, [&all_in_set, &set, &suffix](const std::string& n) {
-    all_in_set = set.find(n + suffix) != set.end();
-    return !all_in_set;
-  });
-  return all_in_set;
-}
-
-static std::unique_ptr<OperatorBase> NOP() {
-  auto net_op = new operators::NetOp();
-  net_op->SetType("@NOP@");
-  net_op->CompleteAddOp();
-  return std::unique_ptr<OperatorBase>(net_op);
-}
-
-//  Get backward operator from a forward operator, a recursive implementation.
-//
-//  no_grad_names the gradient variable names without gradient calculating.
-//
-//  uniq_id is a unique index used inside recursively calling
-//  BackwardRecursive. use `uid = uniq_id++;` to get the unique index, and
-//  pass `uniq_id` through recursive calling.
-//
-//  returns The backward operator. In a simple situation, it may be a simple
-//  operator, in a complex situation, it maybe a NetOp.
-//
-//  See Backward.h for details
-static std::unique_ptr<OperatorBase> BackwardRecursive(
-    const OperatorBase& forwardOp,
-    std::unordered_set<std::string>& no_grad_names,
-    std::unordered_map<std::string, std::string>* grad_to_var,
-    size_t& uniq_id) {
-  //  If all input gradients of forwarding operator do not need to calculate,
-  //  just return an NOP. Not return null ptr because NOP does not take
-  //  too much time for calculation, but it is useful for simplifying logic.
-  if (AllInSet(forwardOp.Inputs() /*names*/, kGradVarSuffix /*suffix*/,
-               no_grad_names /*set*/)) {
-    return NOP();
-  }
-
-  //  All output gradients of forwarding operator do not need to calculate.
-  //  Then all input gradients cannot be computed at all, and we put them into
-  //  `no_grad_names` set. Return an NOP.
-  if (AllInSet(forwardOp.Outputs() /*names*/, kGradVarSuffix /*suffix*/,
-               no_grad_names /*set*/)) {
-    ForEachVarName(forwardOp.Inputs(),
-                   [&no_grad_names](const std::string& name) -> bool {
-                     no_grad_names.insert(GradVarName(name));
-                     return false;
-                   });
-    return NOP();
-  }
-
-  // Returned gradient network
-  auto net = std::unique_ptr<operators::NetOp>(new operators::NetOp());
-
-  if (forwardOp.IsNetOp()) {
-    // Because forwardOp is a net op, it can static_cast.
-    auto& forwardNet = static_cast<const operators::NetOp&>(forwardOp);
-
-    // Map from output gradient variable name to operator's indices in
-    // backward net's ops_. That operator generates that variable.
-    std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;
-
-    size_t local_op_id = 0;
-    // reversely travel forwardNet and collect all duplicate outputs.
-    for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
-         ++it, ++local_op_id) {
-      auto& fwd = *it;
-      auto bwd = BackwardRecursive(*fwd, no_grad_names, grad_to_var, uniq_id);
-      ForEachVarName(bwd->Outputs(),
-                     [&dup_output_ops, local_op_id](const std::string& out) {
-                       dup_output_ops[out].emplace_back(local_op_id);
-                       return false;
-                     });
-      net->AppendOp(std::move(bwd));
-    }
-    // Get unique ID for this method.
-    auto uid = uniq_id++;
-    // TODO(dzh): more comment
-    // multiple operators which have the same output (y for example) may
-    // overwrite the same y variable when backward, special operations are token
-    // to handle this case. For each duplicate output, rename it to an alias
-    // (original name with a offset), append an `add` op for its operator,
-    // and finally sum all the alias variable to the final output variable y.
-    using Pos = std::pair<size_t, std::unique_ptr<OperatorBase>>;
-    std::list<Pos> insert_position;
-    for (auto& dup_output_op : dup_output_ops) {
-      const std::string& name = dup_output_op.first;
-      // duplicate @Empty@ don't need to be added
-      if (name == kEmptyVarName) continue;
-
-      auto& dup_op = dup_output_op.second;
-      // no duplicate output
-      if (dup_op.size() == 1) continue;
-
-      // process the duplicate outputs
-      std::vector<std::string> dup_outputs;
-      for (size_t i = 0; i < dup_op.size(); ++i) {
-        // rename each duplicate output to an alias
-        auto op_offset = dup_op[i];
-        dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" +
-                              std::to_string(i));
-        net->ops_[op_offset]->Rename(name, dup_outputs.back());
-      }
-      // collect all the offset for each alias,
-      // insert a sum operator to add all aliases to output
-      insert_position.push_back(
-          {dup_op.back(),
-           OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}},
-                                AttributeMap{})});
-    }
-
-    // make sure the inserted `sum` ops follow the BFS order.
-    insert_position.sort(
-        [](const Pos& l, const Pos& r) { return l.first > r.first; });
-
-    for (auto& pos : insert_position) {
-      net->InsertOp(pos.first + 1, std::move(pos.second));
-    }
-  } else {
-    std::unique_ptr<OperatorBase> grad_op(
-        CreateGradOp(forwardOp, no_grad_names, grad_to_var));
-
-    ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op](
-                                          const std::string& grad_input) {
-      if (no_grad_names.count(grad_input)) {
-        // +1 for \0
-        std::string prefix = grad_input.substr(
-            0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
-        grad_op->Rename(grad_input, prefix + kZeroVarSuffix);
-
-        // If part of input gradient of that operator is not calculated, fill
-        // zero variables to that input gradient.
-        net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}},
-                                           {{"Out", {grad_input}}},
-                                           AttributeMap{}));
-      }
-      return false;
-    });
-
-    ForEachVarName(grad_op->Outputs(),
-                   [&no_grad_names, &grad_op](const std::string& grad_output) {
-                     if (no_grad_names.count(grad_output)) {
-                       grad_op->Rename(grad_output, kEmptyVarName);
-                     }
-                     return false;
-                   });
-
-    if (net->ops_.empty()) {  // Current no aux op is added to network
-      return grad_op;
-    }
-    net->AppendOp(std::move(grad_op));
-  }
-  net->SetType("@GENERATED_BACKWARD@");
-  net->CompleteAddOp();
-  return std::unique_ptr<OperatorBase>(
-      static_cast<OperatorBase*>(net.release()));
-}
-
-// See header for comments
-std::unique_ptr<OperatorBase> Backward(
-    const OperatorBase& forwardOp,
-    const std::unordered_set<std::string>& no_grad_vars) {
-  std::unordered_set<std::string> no_grad_names;
-  no_grad_names.reserve(no_grad_vars.size() + 1);
-
-  no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
-
-  for (auto& name : no_grad_vars) {
-    no_grad_names.insert(name + kGradVarSuffix);
-  }
-  size_t uid = 0;
-  std::unordered_map<std::string, std::string> grad_to_var;
-  return BackwardRecursive(forwardOp, no_grad_names, &grad_to_var, uid);
-}
-
-// ====================================  //
-
-static bool AllGradInSet(const std::vector<std::string>& names,
-                         const std::unordered_set<std::string>& set) {
-  for (const std::string& name : names) {
-    if (!set.count(GradVarName(name))) {
-      return false;
-    }
-  }
-  if (VLOG_IS_ON(10)) {
-    std::ostringstream sout;
-    sout << "All input {";
-    for (auto& name : names) {
-      sout << name << ",";
-    }
-    sout << "} is in {";
-    for (auto& name : set) {
-      sout << name << ",";
-    }
-    sout << "}";
-    VLOG(10) << sout.str();
-  }
-  return true;
-}
-
-static std::string FwdName(const std::string& grad_name) {
-  auto pos = grad_name.find("@GRAD");
-  if (pos == std::string::npos) {
-    return "";
-  } else {
-    return grad_name.substr(0, pos);
-  }
-}
-
-static void CreateGradVarInBlock(
-    size_t grad_op_start_index,
-    const std::unordered_map<std::string, std::string>& param_name_map,
-    BlockDesc* block_desc,
-    std::unordered_map<std::string, GradVarInfo>* grad_var_record) {
-  auto ops = block_desc->AllOps();
-  for (size_t op_index = grad_op_start_index; op_index < ops.size();
-       ++op_index) {
-    std::unordered_set<std::string> new_vars;
-    auto& ctrl_flow_ops = CtrlFlowOps();
-    ForEachVarName(ops[op_index]->Outputs(),
-                   [&](const std::string& grad_var_name) {
-                     if (ctrl_flow_ops.find(ops[op_index]->Type()) !=
-                         ctrl_flow_ops.end()) {
-                       if (block_desc->HasVarRecursive(grad_var_name)) {
-                         return false;
-                       }
-                     } else {
-                       if (block_desc->HasVar(grad_var_name)) {
-                         return false;
-                       }
-                     }
-                     if (grad_var_name == framework::kEmptyVarName) {
-                       return false;
-                     }
-                     auto var = block_desc->Var(grad_var_name);
-                     VLOG(10) << "Creating Variable " << grad_var_name;
-                     new_vars.insert(var->Name());
-                     auto it = param_name_map.find(grad_var_name);
-                     if (it == param_name_map.end()) {
-                       return false;
-                     }
-                     auto param_var_name = it->second;
-                     auto& grad_record = (*grad_var_record)[param_var_name];
-                     grad_record.name_ = grad_var_name;
-                     grad_record.block_idx_ = block_desc->ID();
-                     grad_record.op_idx_ = static_cast<int>(op_index);
-                     return false; /* not break */
-                   });
-    ops[op_index]->InferVarType(block_desc);
-    for (auto& arg : ops[op_index]->OutputArgumentNames()) {
-      if (new_vars.find(arg) == new_vars.end()) {
-        continue;
-      }
-      auto pname = FwdName(arg);
-      auto* param = block_desc->FindVarRecursive(pname);
-      auto* grad = block_desc->FindVar(arg);
-      if (param == nullptr) {
-        grad->SetDataType(proto::VarType::FP32);
-      } else {
-        grad->SetDataType(param->GetDataType());
-      }
-    }
-    ops[op_index]->InferShape(*block_desc);
-  }
-}
-
-std::vector<std::unique_ptr<OpDesc>> MakeOpGrad(
-    const OpDesc* op_desc, std::unordered_set<std::string>* no_grad_vars,
-    std::unordered_map<std::string, std::string>* grad_to_var,
-    const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>()) {
-  std::vector<std::unique_ptr<OpDesc>> grad_op_descs;
-  // All input gradients of forwarding operator do not need to calculate.
-  const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
-  if (AllGradInSet(inputs, *no_grad_vars)) {
-    VLOG(10) << "Drop operator  " << op_desc->Type();
-    return grad_op_descs;  // empty vector
-  }
-
-  // All output gradients of forwarding operator do not need to calculate.
-  const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
-
-  if (AllGradInSet(outputs, *no_grad_vars)) {
-    VLOG(10) << "Drop operator " << op_desc->Type();
-    // FIXME: Hack code here
-    auto& ctrl_flow_ops = CtrlFlowOps();
-    if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) {
-      // Only computational op need drop input's gradient.
-      for (const std::string& name : inputs) {
-        no_grad_vars->insert(GradVarName(name));
-        VLOG(10) << " Also drop " << GradVarName(name);
-      }
-    }
-
-    return grad_op_descs;  // empty vector
-  }
-
-  grad_op_descs =
-      OpInfoMap::Instance()
-          .Get(op_desc->Type())
-          .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block);
-
-  std::list<std::unique_ptr<OpDesc>> pending_fill_zeros_ops;
-  for (auto& desc : grad_op_descs) {
-    for (const std::string& in_name : desc->InputArgumentNames()) {
-      if (no_grad_vars->count(in_name)) {
-        std::string prefix = in_name.substr(
-            0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
-        std::string new_name = prefix + kZeroVarSuffix;
-        desc->Rename(in_name, new_name);
-        std::unique_ptr<OpDesc> fill_zeros_op(
-            new OpDesc("fill_zeros_like", {{"X", {prefix}}},
-                       {{"Out", {new_name}}}, AttributeMap{}));
-        pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
-      }
-    }
-  }
-
-  for (auto& p : pending_fill_zeros_ops) {
-    grad_op_descs.insert(grad_op_descs.begin(), std::move(p));
-  }
-  return grad_op_descs;
-}
-
-static BlockDesc* CreateStepBlock(
-    ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
-    std::unordered_map<std::string, std::string>* grad_to_var,
-    int step_block_idx);
-
-std::vector<std::unique_ptr<OpDesc>> MakeBlockBackward(
-    ProgramDesc& program_desc, int block_idx,
-    std::unordered_set<std::string>* no_grad_vars,
-    std::unordered_map<std::string, std::string>* grad_to_var) {
-  VLOG(5) << "MakeBlockBackward";
-  BlockDesc* cur_block = program_desc.MutableBlock(block_idx);
-  std::vector<OpDesc*> op_descs = cur_block->AllOps();
-  std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
-  size_t grad_desc_idx = 0;
-  std::vector<std::unique_ptr<OpDesc>> backward_descs;
-
-  for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
-    VLOG(5) << "Making backward " << (*it)->Type() << " op";
-    std::vector<std::unique_ptr<OpDesc>> op_grads;
-
-    if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" ||
-        (*it)->Type() == "parallel_do") {
-      int step_block_idx = (*it)->GetBlockAttr("sub_block");
-      BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars,
-                                                  grad_to_var, step_block_idx);
-      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
-    } else if ((*it)->Type() == "conditional_block") {
-      BlockDesc* backward_block =
-          CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
-                          (*it)->GetBlockAttr("sub_block"));
-      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
-    } else {
-      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
-    }
-
-    if (VLOG_IS_ON(10)) {
-      std::ostringstream sout;
-      sout << "Made ";
-      for (auto& op_grad : op_grads) {
-        sout << op_grad->Type() << " ";
-      }
-      VLOG(10) << sout.str();
-    }
-
-    for (const auto& desc : op_grads) {
-      for (const std::string& out_name : desc->OutputArgumentNames()) {
-        if (out_name.find("@GRAD") == std::string::npos) {
-          // Not all outputs of a backward operator is a gradient. Only gradient
-          // need to be sum. Skip variables are not gradient.
-          continue;
-        }
-        dup_out_ops[out_name].emplace_back(grad_desc_idx);
-      }
-      ++grad_desc_idx;
-    }
-    std::transform(op_grads.begin(), op_grads.end(),
-                   std::back_inserter(backward_descs),
-                   [](std::unique_ptr<OpDesc>& ptr) { return std::move(ptr); });
-  }
-
-  VLOG(5) << "Appending Sums";
-  // Check whether some variables are written more than once
-  std::list<std::pair<size_t, std::unique_ptr<OpDesc>>> pending_sum_ops;
-  for (const auto& dup : dup_out_ops) {
-    const std::string& out_name = dup.first;
-    const std::vector<size_t> dup_op = dup.second;
-    if (out_name != kEmptyVarName && dup_op.size() > 1) {
-      std::vector<std::string> sum_op_inputs;
-      std::string next_g_name = out_name;
-      for (size_t i = 0; i < dup_op.size(); ++i) {
-        VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name
-                 << " duplicated";
-        std::string new_name = out_name + "@RENAME@" + std::to_string(i);
-        backward_descs[dup_op[i]]->RenameOutput(out_name, new_name);
-        backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name);
-        sum_op_inputs.emplace_back(new_name);
-        next_g_name = sum_op_inputs.back();
-      }
-      std::unique_ptr<OpDesc> sum_op(new OpDesc("sum", {{"X", sum_op_inputs}},
-                                                {{"Out", {out_name}}},
-                                                AttributeMap{}));
-      pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
-    }
-  }
-
-  pending_sum_ops.sort([](const std::pair<size_t, std::unique_ptr<OpDesc>>& a,
-                          const std::pair<size_t, std::unique_ptr<OpDesc>>& b) {
-    return a.first > b.first;
-  });
-  for (auto& p : pending_sum_ops) {
-    backward_descs.insert(backward_descs.begin() + p.first + 1,
-                          std::move(p.second));
-  }
-
-  VLOG(5) << "MakeBlockBackward Finished";
-
-  return backward_descs;
-}
-
-static BlockDesc* CreateStepBlock(
-    ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
-    std::unordered_map<std::string, std::string>* grad_to_var,
-    int step_block_idx) {
-  auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
-                                                   no_grad_vars, grad_to_var);
-  BlockDesc* backward_block =
-      program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
-  for (auto& ptr : backward_block_op_descs) {
-    backward_block->AppendAllocatedOp(move(ptr));
-  }
-  return backward_block;
-}
-
-ParamGradInfoMap AppendBackward(
-    ProgramDesc& program_desc, const VarDesc& target,
-    const std::unordered_set<std::string>& no_grad_vars) {
-  std::unordered_set<std::string> no_grad_var_names;
-  no_grad_var_names.reserve(no_grad_vars.size() + 1);
-  no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
-  for (auto& name : no_grad_vars) {
-    no_grad_var_names.insert(GradVarName(name));
-  }
-
-  const int root_block_idx = 0;
-  auto root_block = program_desc.MutableBlock(root_block_idx);
-
-  std::string fill_one_op_out = GradVarName(target.Name());
-  bool is_scalar = target.GetShape() == std::vector<int64_t>{1};
-  PADDLE_ENFORCE(is_scalar, "target should be scalar");
-  VLOG(3) << "backward from loss=" << target.Name()
-          << " data_type=" << target.GetDataType();
-  std::unique_ptr<OpDesc> fill_one_op(
-      new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}},
-                 {{"shape", std::vector<int>{1}},
-                  {"value", static_cast<float>(1.0)},
-                  {"dtype", target.GetDataType()}}));
-  // infer var type of fill_one_op
-  fill_one_op->InferVarType(root_block);
-
-  root_block->AppendAllocatedOp(std::move(fill_one_op));
-  size_t forward_op_num = root_block->OpSize();
-  size_t forward_block_num = program_desc.Size();
-
-  // Insert backward operators
-  std::unordered_map<std::string, std::string> grad_to_var;
-  auto backward_op_descs = MakeBlockBackward(program_desc, root_block_idx,
-                                             &no_grad_var_names, &grad_to_var);
-
-  for (auto& ptr : backward_op_descs) {
-    root_block->AppendAllocatedOp(std::move(ptr));
-  }
-  // Create Variable
-
-  // Create target gradient variable
-  std::unordered_map<std::string, GradVarInfo> retv;
-
-  auto var = root_block->Var(fill_one_op_out);
-  var->SetDataType(target.GetDataType());
-  var->SetShape(target.GetShape());
-  auto& target_grad = retv[target.Name()];
-  target_grad.name_ = fill_one_op_out;
-  target_grad.block_idx_ = root_block_idx;
-  target_grad.op_idx_ = static_cast<int>(forward_op_num);
-
-  // create grad_var for all blocks in this program
-  CreateGradVarInBlock(forward_op_num, grad_to_var, root_block, &retv);
-  for (size_t block_index = forward_block_num;
-       block_index < program_desc.Size(); ++block_index) {
-    CreateGradVarInBlock(0, grad_to_var, program_desc.MutableBlock(block_index),
-                         &retv);
-  }
-  return retv;
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/backward.h b/paddle/fluid/framework/backward.h
deleted file mode 100644
index 3a971090c25c8..0000000000000
--- a/paddle/fluid/framework/backward.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-
-namespace paddle {
-namespace framework {
-
-// Create the backward operator from a forward operator.
-// TODO(yuyang18): Add more API reference comment.
-extern std::unique_ptr<OperatorBase> Backward(
-    const OperatorBase& forwardOp,
-    const std::unordered_set<std::string>& no_grad_vars);
-
-struct GradVarInfo {
-  GradVarInfo() {}
-  GradVarInfo(const std::string& name, int block_idx, int op_idx)
-      : name_(name), block_idx_(block_idx), op_idx_(op_idx) {}
-
-  bool operator==(const GradVarInfo& b) const {
-    return name_ == b.name_ && block_idx_ == b.block_idx_ &&
-           op_idx_ == b.op_idx_;
-  }
-
-  std::string name_;
-  int block_idx_;
-  int op_idx_;
-};
-
-using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/,
-                                            GradVarInfo /*grad_var_info*/>;
-
-ParamGradInfoMap AppendBackward(
-    ProgramDesc& program_desc, const VarDesc& target,
-    const std::unordered_set<std::string>& no_grad_vars);
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/fluid/framework/backward_test.cc b/paddle/fluid/framework/backward_test.cc
deleted file mode 100644
index cc1f871360ed3..0000000000000
--- a/paddle/fluid/framework/backward_test.cc
+++ /dev/null
@@ -1,918 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/framework/backward.h"
-
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/op_desc.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_desc.h"
-#include "paddle/fluid/operators/net_op.h"
-
-USE_NO_KERNEL_OP(fill_constant);
-
-namespace paddle {
-namespace framework {
-
-using DeviceContext = platform::DeviceContext;
-
-class NoneOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {}
-};
-
-template <typename Place, typename T>
-class NoneKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {}
-};
-
-class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
- public:
-  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input X of Add");
-    AddInput("b", "Bias of Add");
-    AddOutput("Out", "Out of Add");
-    AddComment("Add Op");
-  }
-};
-
-class RowWiseAddGradMaker : public SingleGradOpDescMaker {
- public:
-  using SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<OpDesc> Apply() const override {
-    auto grad_op = new OpDesc();
-    grad_op->SetInput(GradVarName("Out"), OutputGrad("Out"));
-    grad_op->SetOutput(GradVarName("X"), InputGrad("X"));
-    grad_op->SetOutput(GradVarName("b"), InputGrad("b"));
-    grad_op->SetType("rowwise_add_grad");
-    return std::unique_ptr<OpDesc>(grad_op);
-  }
-};
-
-class MulOpMaker : public OpProtoAndCheckerMaker {
- public:
-  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "A");
-    AddInput("Y", "B");
-    AddOutput("Out", "Out");
-    AddAttr<int>("x_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
-    AddAttr<int>("y_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
-    AddComment("Mul");
-  }
-};
-
-class SigmoidOpMaker : public OpProtoAndCheckerMaker {
- public:
-  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "X");
-    AddOutput("Out", "Y");
-    AddComment("Sigmoid");
-  }
-};
-
-class NoGradOpMaker : public OpProtoAndCheckerMaker {
- public:
-  NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "X input");
-    AddOutput("Out", "Y output");
-    AddComment("NoGradOp, same input output. no Grad");
-  }
-};
-
-class FcOp : public operators::NetOp {
- public:
-  FcOp(const std::string &type, const VariableNameMap &inputs,
-       const VariableNameMap &outputs, const AttributeMap &attrs)
-      : NetOp(type, inputs, outputs, attrs) {
-    AppendOp(OpRegistry::CreateOp(
-        "mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}},
-        {{"Out", {Output("mul_result")}}}, AttributeMap{}));
-    auto input_b = Inputs("b");
-    std::string before_act = "mul_result";
-    if (input_b.size() != 0) {
-      AppendOp(OpRegistry::CreateOp(
-          "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}},
-          {{"Out", {Output("add_result")}}}, AttributeMap{}));
-      before_act = "add_result";
-    } else {
-      auto out_varname = Output("add_result");
-      if (out_varname != kEmptyVarName) {
-        this->Rename(out_varname, kEmptyVarName);
-      }
-    }
-
-    AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
-                                  {{"Out", {Output("Out")}}}, AttributeMap{}));
-    CompleteAddOp(false);
-  }
-};
-
-class FcOpMaker : public OpProtoAndCheckerMaker {
- public:
-  FcOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "x");
-    AddInput("W", "w");
-    AddInput("b", "b");
-    AddOutput("mul_result", "").AsIntermediate();
-    AddOutput("add_result", "").AsIntermediate();
-    AddOutput("Out", "");
-    AddComment("");
-  }
-};
-
-class ManyOutputOpMaker : public OpProtoAndCheckerMaker {
- public:
-  ManyOutputOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("x", "x");
-    AddOutput("y", "y");
-    AddOutput("z", "z");
-    AddComment("");
-  }
-};
-
-class FillZeroOpMaker : public OpProtoAndCheckerMaker {
- public:
-  FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "x");
-    AddOutput("Out", "out");
-    AddComment("");
-  }
-};
-
-class SumOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  SumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "the input tensors of sum operator.").AsDuplicable();
-    AddOutput("Out", "the output tensor of sum operator.");
-    AddComment("");
-  }
-};
-
-class MultInOutOpMaker : public OpProtoAndCheckerMaker {
- public:
-  MultInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "x");
-    AddInput("H", "h");
-    AddOutput("Y", "y");
-    AddOutput("Z", "z");
-    AddComment("");
-  }
-};
-
-class MinusGradOpDescMaker : public GradOpDescMakerBase {
- public:
-  using GradOpDescMakerBase::GradOpDescMakerBase;
-
-  std::vector<std::unique_ptr<OpDesc>> operator()() const override {
-    std::vector<std::unique_ptr<OpDesc>> retv;
-    auto x_g = InputGrad("X");
-    if (!x_g.empty()) {
-      auto *op_desc = new OpDesc();
-      op_desc->SetType("scale");
-      op_desc->SetInput("X", OutputGrad("Out"));
-      op_desc->SetOutput("Out", x_g);
-      op_desc->SetAttr("scale", 1.0f);
-      retv.emplace_back(op_desc);
-    }
-
-    auto y_g = InputGrad("Y");
-    if (!y_g.empty()) {
-      auto *op_desc = new OpDesc();
-      op_desc->SetType("scale");
-      op_desc->SetInput("X", OutputGrad("Out"));
-      op_desc->SetOutput("Out", y_g);
-      op_desc->SetAttr("scale", -1.0f);
-      retv.emplace_back(op_desc);
-    }
-    return retv;
-  }
-};
-
-class MinusOpMaker : public OpProtoAndCheckerMaker {
- public:
-  MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "");
-    AddInput("Y", "");
-    AddOutput("Out", "");
-    AddComment("minus for unittest");
-  }
-};
-}  // namespace framework
-}  // namespace paddle
-
-namespace f = paddle::framework;
-namespace ops = paddle::operators;
-using EnforceNotMet = paddle::platform::EnforceNotMet;
-// rowwise_add
-REGISTER_OPERATOR(rowwise_add, f::NoneOp, f::RowWiseAddOpMaker,
-                  f::RowWiseAddGradMaker);
-REGISTER_OP_CPU_KERNEL(rowwise_add,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OPERATOR(rowwise_add_grad, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(rowwise_add_grad,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// mul
-REGISTER_OP(mul, f::NoneOp, f::MulOpMaker, mul_grad, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(mul, f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(mul_grad,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// sigmoid
-REGISTER_OP(sigmoid, f::NoneOp, f::SigmoidOpMaker, sigmoid_grad, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(sigmoid,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NoneOp, f::NoGradOpMaker);
-// fill_zeros_like
-REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NoneOp, f::FillZeroOpMaker);
-REGISTER_OP_CPU_KERNEL(fill_zeros_like,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// sum
-REGISTER_OP(sum, f::NoneOp, f::SumOpMaker, sum_grad, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(sum, f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(sum_grad,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// fc
-REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
-// many_output_op
-REGISTER_OP(many_output_op, f::NoneOp, f::ManyOutputOpMaker,
-            many_output_op_grad, f::NoneOp);
-// mult_in_out
-REGISTER_OP(mult_in_out, f::NoneOp, f::MultInOutOpMaker, mult_in_out_grad,
-            f::NoneOp);
-REGISTER_OP_CPU_KERNEL(mult_in_out,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(mult_in_out_grad,
-                       f::NoneKernel<paddle::platform::CPUPlace, float>);
-// minus
-REGISTER_OPERATOR(minus, f::NoneOp, f::MinusOpMaker, f::MinusGradOpDescMaker);
-REGISTER_OP_CPU_KERNEL(minus, f::NoneKernel<paddle::platform::CPUPlace, float>);
-// scale
-REGISTER_OPERATOR(scale, f::NoneOp);
-REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel<paddle::platform::CPUPlace, float>);
-
-TEST(Backward, simple_op_not_need_grad) {
-  auto fwd =
-      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
-                              {{"Out", {"out"}}}, f::AttributeMap{});
-  ASSERT_NE(fwd, nullptr);
-  auto gop = f::Backward(*fwd, {"x"});
-  ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName);
-
-  auto no_input_gop = f::Backward(*fwd, {"x", "b"});
-  ASSERT_NE(no_input_gop, nullptr);
-  ASSERT_TRUE(no_input_gop->IsNetOp());
-  ASSERT_EQ(0UL, static_cast<ops::NetOp *>(no_input_gop.get())->ops_.size());
-}
-
-TEST(Backward, net_fc_backward_normal) {
-  std::shared_ptr<f::OperatorBase> fwd =
-      f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}},
-                              {{"mul_result", {"mul_res"}},
-                               {"add_result", {"add_re"}},
-                               {"Out", {"out"}}},
-                              f::AttributeMap{});
-  ASSERT_NE(fwd, nullptr);
-  std::shared_ptr<f::OperatorBase> gop =
-      f::Backward(*fwd, std::unordered_set<std::string>{});
-  ASSERT_TRUE(gop->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(gop.get());
-
-  ASSERT_NO_THROW(net->DebugString());
-
-  ASSERT_EQ(3UL, net->ops_.size());
-
-  f::OperatorBase &d_sigmoid = *net->ops_[0];
-  ASSERT_EQ("sigmoid_grad", d_sigmoid.Type());
-
-  f::OperatorBase &d_add = *net->ops_[1];
-  ASSERT_EQ("rowwise_add_grad", d_add.Type());
-
-  f::OperatorBase &d_mul = *net->ops_[2];
-  ASSERT_EQ("mul_grad", d_mul.Type());
-}
-
-TEST(Backward, net_fc_backward_not_have_b) {
-  std::shared_ptr<f::OperatorBase> fwd =
-      f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {}}},
-                              {{"mul_result", {"mul_res"}},
-                               {"add_result", {"add_res"}},
-                               {"Out", {"tmp"}}},
-                              f::AttributeMap{});
-  ASSERT_NE(fwd, nullptr);
-  std::shared_ptr<f::OperatorBase> gop =
-      f::Backward(*fwd, std::unordered_set<std::string>{});
-  ASSERT_TRUE(gop->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(gop.get());
-
-  ASSERT_NO_THROW(net->DebugString());
-
-  ASSERT_EQ(2UL, net->ops_.size());
-
-  f::OperatorBase &d_sigmoid = *net->ops_[0];
-  ASSERT_EQ("sigmoid_grad", d_sigmoid.Type());
-
-  f::OperatorBase &d_mul = *net->ops_[1];
-  ASSERT_EQ("mul_grad", d_mul.Type());
-}
-
-TEST(Backward, net_input_of_network_not_need_grad) {
-  ops::NetOp net;
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}},
-      {{"mul_result", {"mul_tmp_0"}},
-       {"add_result", {"add_tmp_0"}},
-       {"Out", {"hidden0"}}},
-      f::AttributeMap{}));
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}},
-      {{"mul_result", {"mul_tmp_1"}},
-       {"add_result", {"add_tmp_1"}},
-       {"Out", {"hidden1"}}},
-      f::AttributeMap{}));
-  net.CompleteAddOp();
-  auto bwd = Backward(net, {"x"});  // x@GRAD is not need.
-  ASSERT_TRUE(bwd->IsNetOp());
-  auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
-
-  auto output_vars = bwd_net->OutputVars(true);
-  std::unordered_set<std::string> all_outputs =
-      std::unordered_set<std::string>(output_vars.begin(), output_vars.end());
-  all_outputs.erase(f::kEmptyVarName);
-
-  for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) {
-    ASSERT_NE(all_outputs.find(f::GradVarName(out)), all_outputs.end());
-  }
-
-  // Not Generated X
-  ASSERT_EQ(all_outputs.find(f::GradVarName("X")), all_outputs.end());
-
-  ASSERT_EQ(2UL, bwd_net->ops_.size());
-  ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
-  auto first_fc_grad = static_cast<ops::NetOp *>(bwd_net->ops_[1].get());
-  ASSERT_EQ(3UL, first_fc_grad->ops_.size());
-  ASSERT_EQ(f::kEmptyVarName,
-            first_fc_grad->ops_[2]->Output(f::GradVarName("X")));
-}
-
-TEST(Backward, net_shared_weight) {
-  ops::NetOp net;
-  net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
-                                       {{"Out", {"out"}}}, f::AttributeMap{}));
-  net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
-                                       {{"Out", {"FinalOut"}}},
-                                       f::AttributeMap{}));
-  net.CompleteAddOp();
-
-  auto bwd = f::Backward(net, std::unordered_set<std::string>{});
-  ASSERT_TRUE(bwd->IsNetOp());
-  auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
-  ASSERT_EQ(3UL, bwd_net->ops_.size());
-  ASSERT_EQ("sum", bwd_net->ops_[2]->Type());
-}
-
-TEST(Backward, op_all_input_are_not_need) {
-  auto fwd =
-      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
-                              {{"Out", {"out"}}}, f::AttributeMap{});
-  auto backward = f::Backward(*fwd, {"x", "b"});
-  ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(backward.get());
-  ASSERT_TRUE(net->ops_.empty());
-}
-
-TEST(Backward, op_all_output_are_not_need) {
-  auto fwd =
-      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
-                              {{"Out", {"out"}}}, f::AttributeMap{});
-  auto backward = f::Backward(*fwd, {"out"});
-  ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(backward.get());
-  ASSERT_TRUE(net->ops_.empty());
-}
-
-TEST(Backward, op_part_of_output_are_not_need) {
-  auto fwd =
-      f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}},
-                              {{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{});
-  auto backward = f::Backward(*fwd, {"Z"});
-  ASSERT_TRUE(backward->IsNetOp());
-  auto net = static_cast<ops::NetOp *>(backward.get());
-  ASSERT_EQ(net->ops_.size(), 2UL);
-
-  auto &fill_zero = *net->ops_[0];
-  ASSERT_EQ("fill_zeros_like", fill_zero.Type());
-  ASSERT_EQ(1UL, fill_zero.Inputs("X").size());
-  ASSERT_EQ("Z", fill_zero.Input("X"));
-  ASSERT_EQ(1UL, fill_zero.Outputs("Out").size());
-  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out"));
-
-  auto &d_many_out = *net->ops_[1];
-  ASSERT_EQ("many_output_op_grad", d_many_out.Type());
-  ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.Inputs().size());  // I/O/OG
-  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix,
-            d_many_out.Input(f::GradVarName("z")));
-  ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y")));
-  ASSERT_EQ(f::GradVarName("X"), d_many_out.Output(f::GradVarName("x")));
-}
-
-TEST(Backward, op_part_of_input_are_not_need) {
-  auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}},
-                                     {{"Out", {"out"}}}, f::AttributeMap{});
-  auto backward = f::Backward(*fwd, {"a"});
-  auto &grad_mul = *backward;
-  ASSERT_EQ(grad_mul.Type(), "mul_grad");
-  ASSERT_EQ(grad_mul.Inputs().size(), 2UL + 1UL + 1UL);
-  ASSERT_EQ(grad_mul.Outputs().size(), 2UL);
-  ASSERT_EQ(grad_mul.Output(f::GradVarName("X")), f::kEmptyVarName);
-  ASSERT_EQ(grad_mul.Output(f::GradVarName("Y")), f::GradVarName("b"));
-  ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out"));
-  ASSERT_EQ(grad_mul.Input("X"), "a");
-  ASSERT_EQ(grad_mul.Input("Y"), "b");
-  ASSERT_EQ(grad_mul.Input("Out"), "out");
-}
-
-TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
-  ops::NetOp net;
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-      {{"mul_result", {"mul_out1"}},
-       {"add_result", {"add_out1"}},
-       {"Out", {"out1"}}},
-      f::AttributeMap{}));
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}},
-      {{"mul_result", {"mul_out2"}},
-       {"add_result", {"tmp_out2"}},
-       {"Out", {"out2"}}},
-      f::AttributeMap{}));
-  net.AppendOp(f::OpRegistry::CreateOp(
-      "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}},
-      {{"mul_result", {"mul_out3"}},
-       {"add_result", {"tmp_out3"}},
-       {"Out", {"out3"}}},
-      f::AttributeMap{}));
-  net.CompleteAddOp();
-
-  auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"});
-  ASSERT_TRUE(backward->IsNetOp());
-  auto bwd_net = static_cast<ops::NetOp *>(backward.get());
-  ASSERT_EQ(bwd_net->ops_.size(), 3UL);
-  auto &grad_fc = *bwd_net->ops_[0];
-
-  const char *all = paddle::operators::NetOp::kAll;
-  EXPECT_EQ(grad_fc.Inputs(all).size(),
-            2UL       /* external input number */
-                + 1UL /* external output number*/
-                + 1UL /* number of gradient of external output*/
-                + 2UL /* internal variable number*/
-            );
-  EXPECT_EQ(grad_fc.Outputs(all).size(),
-            2UL       /* input number of mul*/
-                + 2UL /* input number of rowwise_add*/
-                + 1UL /* input number of sigmod */
-                - 1UL /* out2 is not needed*/);
-  EXPECT_EQ(bwd_net->ops_[1]->Inputs(all).size(), 0UL);
-  EXPECT_EQ(bwd_net->ops_[1]->Outputs(all).size(), 0UL);
-  EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL);
-  EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL);
-}
-
-TEST(Backward, simple_single_op) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-
-  f::OpDesc *op = block->AppendOp();
-  op->SetType("rowwise_add");
-  op->SetInput("X", {"x"});
-  op->SetInput("b", {"b"});
-  op->SetOutput("Out", {"out"});
-
-  auto target = f::VarDesc("out");
-  target.SetShape({1});
-  auto var_to_grad =
-      AppendBackward(program, target, std::unordered_set<std::string>{});
-
-  ASSERT_EQ(block->AllOps().size(), 3UL);
-  f::OpDesc *fill_op = block->AllOps()[1];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op = block->AllOps()[2];
-  EXPECT_EQ(grad_op->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out")}));
-  EXPECT_EQ(grad_op->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x")}));
-  EXPECT_EQ(grad_op->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b")}));
-
-  EXPECT_EQ(var_to_grad.size(), 3UL);
-  EXPECT_EQ(var_to_grad.at("b"), f::GradVarInfo(f::GradVarName("b"), 0, 2));
-  EXPECT_EQ(var_to_grad.at("x"), f::GradVarInfo(f::GradVarName("x"), 0, 2));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x")));
-}
-
-TEST(Backward, default_attribute) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op = block->AppendOp();
-  op->SetType("mul");
-  op->SetInput("X", {"x"});
-  op->SetInput("Y", {"y"});
-  op->SetOutput("Out", {"out"});
-  op->CheckAttrs();
-
-  auto target = f::VarDesc("out");
-  target.SetShape({1});
-  AppendBackward(program, target, std::unordered_set<std::string>{});
-
-  ASSERT_EQ(block->AllOps().size(), 3UL);
-  EXPECT_EQ(boost::get<int>(op->GetAttr("x_num_col_dims")), 1);
-  EXPECT_EQ(boost::get<int>(op->GetAttr("y_num_col_dims")), 1);
-
-  f::OpDesc *fill_op = block->AllOps()[1];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op = block->AllOps()[2];
-  ASSERT_EQ(grad_op->Type(), "mul_grad");
-  EXPECT_EQ(boost::get<int>(grad_op->GetAttr("x_num_col_dims")), 1);
-  EXPECT_EQ(boost::get<int>(grad_op->GetAttr("y_num_col_dims")), 1);
-}
-
-TEST(Backward, simple_mult_op) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op1 = block->AppendOp();
-  op1->SetType("rowwise_add");
-  op1->SetInput("X", {"x1"});
-  op1->SetInput("b", {"b1"});
-  op1->SetOutput("Out", {"out1"});
-
-  f::OpDesc *op2 = block->AppendOp();
-  op2->SetType("mul");
-  op2->SetInput("X", {"out1"});
-  op2->SetInput("Y", {"y2"});
-  op2->SetOutput("Out", {"out2"});
-
-  f::OpDesc *op3 = block->AppendOp();
-  op3->SetType("rowwise_add");
-  op3->SetInput("X", {"out2"});
-  op3->SetInput("b", {"b3"});
-  op3->SetOutput("Out", {"out3"});
-
-  auto target = f::VarDesc("out3");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad =
-      AppendBackward(program, target, std::unordered_set<std::string>{});
-
-  ASSERT_EQ(block->AllOps().size(), 6UL + 1);
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op1 = block->AllOps()[6];
-  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b1")}));
-
-  f::OpDesc *grad_op2 = block->AllOps()[5];
-  EXPECT_EQ(grad_op2->Type(), "mul_grad");
-  ASSERT_EQ(grad_op2->InputNames().size(), 4UL);
-  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"out1"}));
-  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
-  EXPECT_EQ(grad_op2->Input("Out"), std::vector<std::string>({"out2"}));
-  EXPECT_EQ(grad_op2->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out2")}));
-  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")),
-            std::vector<std::string>({f::GradVarName("y2")}));
-
-  f::OpDesc *grad_op3 = block->AllOps()[4];
-  EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out3")}));
-  EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out2")}));
-  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b3")}));
-
-  EXPECT_EQ(var_to_grad.size(), 7UL);
-  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("out1"),
-            f::GradVarInfo(f::GradVarName("out1"), 0, 5));
-  EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
-  EXPECT_EQ(var_to_grad.at("out2"),
-            f::GradVarInfo(f::GradVarName("out2"), 0, 4));
-  EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("y2")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("out2")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b3")));
-}
-
-TEST(Backward, intermedia_var_no_grad) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op1 = block->AppendOp();
-  op1->SetType("rowwise_add");
-  op1->SetInput("X", {"x1"});
-  op1->SetInput("b", {"b1"});
-  op1->SetOutput("Out", {"out1"});
-
-  f::OpDesc *op2 = block->AppendOp();
-  op2->SetType("mul");
-  op2->SetInput("X", {"x2"});
-  op2->SetInput("Y", {"y2"});
-  op2->SetOutput("Out", {"out2"});
-
-  f::OpDesc *op3 = block->AppendOp();
-  op3->SetType("rowwise_add");
-  op3->SetInput("X", {"out2"});
-  op3->SetInput("b", {"b3"});
-  op3->SetOutput("Out", {"out3"});
-
-  f::OpDesc *op4 = block->AppendOp();
-  op4->SetType("mul");
-  op4->SetInput("X", {"out1"});
-  op4->SetInput("Y", {"out3"});
-  op4->SetOutput("Out", {"out4"});
-
-  auto target = f::VarDesc("out4");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad = AppendBackward(program, target, {"out3"});
-
-  ASSERT_EQ(block->AllOps().size(), 7UL);
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op1 = block->AllOps()[6];
-  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b1")}));
-
-  f::OpDesc *grad_op4 = block->AllOps()[5];
-  EXPECT_EQ(grad_op4->Type(), "mul_grad");
-  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
-  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
-  EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"out3"}));
-  EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out4"}));
-  EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out4")}));
-  EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector<std::string>());
-
-  EXPECT_EQ(var_to_grad.size(), 4UL);
-  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("out1"),
-            f::GradVarInfo(f::GradVarName("out1"), 0, 5));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
-}
-
-TEST(Backward, var_no_grad) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op1 = block->AppendOp();
-  op1->SetType("mult_in_out");
-  op1->SetInput("X", {"x1"});
-  op1->SetInput("H", {"h1"});
-  op1->SetOutput("Y", {"y1"});
-  op1->SetOutput("Z", {"z1"});
-
-  f::OpDesc *op2 = block->AppendOp();
-  op2->SetType("mult_in_out");
-  op2->SetInput("X", {"y1"});
-  op2->SetInput("H", {"z1"});
-  op2->SetOutput("Y", {"y2"});
-  op2->SetOutput("Z", {"z2"});
-
-  auto target = f::VarDesc("z2");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad = AppendBackward(program, target, {"z1"});
-
-  ASSERT_EQ(block->AllOps().size(), 6UL);
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op2 = block->AllOps()[3];
-  ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad");
-  ASSERT_EQ(grad_op2->InputNames().size(), 6UL);
-  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"y1"}));
-  EXPECT_EQ(grad_op2->Input("H"), std::vector<std::string>({"z1"}));
-  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
-  EXPECT_EQ(grad_op2->Input("Z"), std::vector<std::string>({"z2"}));
-  EXPECT_EQ(grad_op2->Input(f::GradVarName("Y")),
-            std::vector<std::string>({f::GradVarName("y2")}));
-  EXPECT_EQ(grad_op2->Input(f::GradVarName("Z")),
-            std::vector<std::string>({f::GradVarName("z2")}));
-  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("y1")}));
-  EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector<std::string>());
-
-  f::OpDesc *fill_zero_op = block->AllOps()[4];
-  ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like");
-  ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL);
-  ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL);
-  EXPECT_EQ(fill_zero_op->Input("X"), std::vector<std::string>({"z1"}));
-  EXPECT_EQ(fill_zero_op->Output("Out"),
-            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
-
-  f::OpDesc *grad_op1 = block->AllOps()[5];
-  ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad");
-  ASSERT_EQ(grad_op1->InputNames().size(), 6UL);
-  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op1->Input("X"), std::vector<std::string>({"x1"}));
-  EXPECT_EQ(grad_op1->Input("H"), std::vector<std::string>({"h1"}));
-  EXPECT_EQ(grad_op1->Input("Y"), std::vector<std::string>({"y1"}));
-  EXPECT_EQ(grad_op1->Input("Z"), std::vector<std::string>({"z1"}));
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Y")),
-            std::vector<std::string>({f::GradVarName("y1")}));
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Z")),
-            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("H")),
-            std::vector<std::string>({f::GradVarName("h1")}));
-
-  EXPECT_EQ(var_to_grad.size(), 4UL);
-  EXPECT_EQ(var_to_grad.at("y1"), f::GradVarInfo(f::GradVarName("y1"), 0, 3));
-  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 5));
-  EXPECT_EQ(var_to_grad.at("h1"), f::GradVarInfo(f::GradVarName("h1"), 0, 5));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("y1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("h1")));
-}
-
-TEST(Backward, shared_var) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  f::OpDesc *op1 = block->AppendOp();
-  op1->SetType("rowwise_add");
-  op1->SetInput("X", {"x1"});
-  op1->SetInput("b", {"b1"});
-  op1->SetOutput("Out", {"out1"});
-
-  f::OpDesc *op2 = block->AppendOp();
-  op2->SetType("mul");
-  op2->SetInput("X", {"out1"});
-  op2->SetInput("Y", {"y2"});
-  op2->SetOutput("Out", {"out2"});
-
-  f::OpDesc *op3 = block->AppendOp();
-  op3->SetType("rowwise_add");
-  op3->SetInput("X", {"out1"});
-  op3->SetInput("b", {"b3"});
-  op3->SetOutput("Out", {"out3"});
-
-  auto target = f::VarDesc("out3");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad =
-      AppendBackward(program, target, std::unordered_set<std::string>{});
-
-  ASSERT_EQ(block->AllOps().size(), 8UL);
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-
-  f::OpDesc *grad_op3 = block->AllOps()[4];
-  ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out3")}));
-  EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0"}));
-  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b3")}));
-
-  f::OpDesc *grad_op4 = block->AllOps()[5];
-  ASSERT_EQ(grad_op4->Type(), "mul_grad");
-  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
-  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
-  EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"y2"}));
-  EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out2"}));
-  EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out2")}));
-  EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@1"}));
-  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
-            std::vector<std::string>({f::GradVarName("y2")}));
-
-  f::OpDesc *sum_op = block->AllOps()[6];
-  ASSERT_EQ(sum_op->Type(), "sum");
-  ASSERT_EQ(sum_op->InputNames().size(), 1UL);
-  ASSERT_EQ(sum_op->OutputNames().size(), 1UL);
-  EXPECT_EQ(sum_op->Input("X"),
-            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0",
-                                      f::GradVarName("out1") + "@RENAME@1"}));
-  EXPECT_EQ(sum_op->Output("Out"),
-            std::vector<std::string>({f::GradVarName("out1")}));
-
-  f::OpDesc *grad_op1 = block->AllOps()[7];
-  ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad");
-  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
-  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
-  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
-            std::vector<std::string>({f::GradVarName("out1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
-            std::vector<std::string>({f::GradVarName("x1")}));
-  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
-            std::vector<std::string>({f::GradVarName("b1")}));
-
-  EXPECT_EQ(var_to_grad.size(), 6UL);
-  EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
-  EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
-  EXPECT_EQ(var_to_grad.at("out1"),
-            f::GradVarInfo(f::GradVarName("out1"), 0, 6));
-  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 7));
-  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 7));
-
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b3")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("y2")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
-  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
-}
-
-TEST(Backward, half_backward) {
-  f::ProgramDesc program;
-  f::BlockDesc *block = program.MutableBlock(0);
-  auto *op1 = block->AppendOp();
-  op1->SetType("minus");
-  op1->SetInput("X", {"a"});
-  op1->SetInput("Y", {"b"});
-  op1->SetOutput("Out", {"out"});
-
-  auto target = f::VarDesc("out");
-  target.SetShape({1});
-  size_t forward_len = block->AllOps().size();
-  auto var_to_grad = AppendBackward(program, target, {"b"});
-  f::OpDesc *fill_op = block->AllOps()[forward_len];
-  EXPECT_EQ(fill_op->Type(), "fill_constant");
-  auto ops = block->AllOps();
-  ASSERT_EQ(3UL, ops.size());
-
-  EXPECT_EQ(var_to_grad.size(), 2UL);
-  EXPECT_EQ(var_to_grad.at("a"),
-            f::GradVarInfo(f::GradVarName("a"), 0, forward_len + 1));
-}
diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc
index fbe08349c37c4..b8847e4b909cb 100644
--- a/paddle/fluid/framework/block_desc.cc
+++ b/paddle/fluid/framework/block_desc.cc
@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/block_desc.h"
+#include <queue>
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 
-#include <queue>
-
 namespace paddle {
 namespace framework {
 
@@ -147,52 +146,7 @@ void BlockDesc::RemoveOp(size_t s, size_t e) {
   if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
     return;
   }
-  auto get_vars = [](std::deque<std::unique_ptr<OpDesc>>::iterator &op,
-                     std::vector<std::string> &v) {
-    auto in_names = (*op)->InputArgumentNames();
-    v.insert(v.end(), in_names.begin(), in_names.end());
-    auto out_names = (*op)->OutputArgumentNames();
-    v.insert(v.end(), out_names.begin(), out_names.end());
-    std::sort(v.begin(), v.end());
-    auto last = std::unique(v.begin(), v.end());
-    v.erase(last, v.end());
-  };
-  need_update_ = true;
-
-  for (size_t i = s; i < e; i++) {
-    // since remove op one by one, every time remove the first op.
-    auto op = ops_.begin() + s;
-
-    // collect input and output variables from current delete op
-    std::vector<std::string> cur_vars;
-    get_vars(op, cur_vars);
-
-    // remove current op
-    ops_.erase(ops_.begin() + s);
-
-    // collect input and output variables from other ops
-    std::vector<std::string> other_vars;
-    for (auto it = ops_.begin(); it != ops_.end(); it++) {
-      get_vars(it, other_vars);
-    }
-
-    // variables should be deleted
-    std::vector<std::string> delete_vars;
-    // delete_vars = cur_vars -  cur_vars ^ other_input_vars
-    std::set_difference(cur_vars.begin(), cur_vars.end(), other_vars.begin(),
-                        other_vars.end(),
-                        std::inserter(delete_vars, delete_vars.end()));
-    // remove variables
-    for (size_t i = 0; i < delete_vars.size(); i++) {
-      auto name = delete_vars[i];
-      auto it = vars_.find(name);
-      PADDLE_ENFORCE(it != vars_.end(),
-                     "%s is not in variable list, it should not be deleted",
-                     name);
-      vars_.erase(it);
-      VLOG(3) << "deleting variable " << name;
-    }
-  }
+  ops_.erase(ops_.begin() + s, ops_.begin() + e);
 }
 
 std::vector<OpDesc *> BlockDesc::AllOps() const {
diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h
index 873969b2a884f..eef19c4f09c60 100644
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -92,7 +92,7 @@ class BlockDesc {
 
   /*
    * Remove Op and its input/output variables.
-   * Note that for either input or ouput variable, if it is also an input or
+   * Note that for either input or output variable, if it is also an input or
    * output variable of other ops, we should remain it.
    */
   void RemoveOp(size_t s, size_t e);
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 89b5c6847f15b..85b649b2937f6 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -5,6 +5,7 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod
 nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
         dynload_cuda)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
+cc_library(send_op_handle SRCS send_op_handle.cc DEPS framework_proto scope place operator op_registry)
 
 cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
 cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
@@ -15,7 +16,7 @@ else()
     set(multi_devices_graph_builder_deps)
 endif()
 cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
-            scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
+            scale_loss_grad_op_handle send_op_handle ${multi_devices_graph_builder_deps})
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
         simple_threadpool device_context)
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index 7a1b40c0b60a7..e3f8bbb72f2a1 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 
+#include <string>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -33,7 +35,7 @@ void ComputationOpHandle::RunImpl() {
     }
   }
 
-  op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get<Scope *>(), place_);
+  op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
 }
 
 std::string ComputationOpHandle::Name() const { return op_->Type(); }
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 9180903b864d0..e3e7c55d153ae 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -14,6 +14,9 @@
 
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 
+#include <string>
+#include <vector>
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -57,7 +60,10 @@ void FetchOpHandle::RunImpl() {
 
   for (size_t i = 0; i < scopes.size(); ++i) {
     auto &scope = scopes[i];
-    auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
+    auto &t = scope->FindVar(kLocalExecScopeName)
+                  ->Get<Scope *>()
+                  ->FindVar(var_name)
+                  ->Get<framework::LoDTensor>();
     if (platform::is_gpu_place(var->place_)) {
 #ifdef PADDLE_WITH_CUDA
       TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index e7a0cb678ebfd..e0dd9e6068174 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
 #include "paddle/fluid/framework/details/computation_op_handle.h"
 #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
+#include "paddle/fluid/framework/details/send_op_handle.h"
 #include "paddle/fluid/framework/scope.h"
 
 #ifdef PADDLE_WITH_CUDA
@@ -54,6 +55,27 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
   }
 }
 
+void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, OpDesc *op,
+                                                const platform::Place &p,
+                                                const size_t &i) const {
+  auto *op_handle = result->ops_.back().get();
+  op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
+      platform::DeviceContextPool::Instance().Get(p));
+
+  auto var_names = op->InputArgumentNames();
+
+  for (auto &each_var_name : var_names) {
+    VarHandle *var = CreateOrGetLatestVarHandle(result, each_var_name, p, i);
+    op_handle->AddInput(var);
+  }
+
+  var_names = op->OutputArgumentNames();
+
+  for (auto &each_var_name : var_names) {
+    CreateOpOutput(result, op_handle, each_var_name, p, i);
+  }
+}
+
 std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     const ProgramDesc &program) const {
   auto graph = new SSAGraph();
@@ -76,27 +98,28 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
       }
     }
 
+    // append send op if program is distributed trainer main program.
+    // always use the first device
+    if (!is_forwarding && op->Type() == "send") {
+      auto &p = places_[0];
+      auto *s = local_scopes_[0];
+      // FIXME(wuyi): send op always copy from GPU 0
+      result.ops_.emplace_back(new SendOpHandle(*op, s, p));
+      // Create inputs for output on original place and no ssa output
+      // is created for send op.
+      CreateOpHandleIOs(&result, op, p, 0);
+      continue;
+    }
+
     for (size_t i = 0; i < places_.size(); ++i) {
       auto &p = places_[i];
       auto *s = local_scopes_[i];
 
       result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
       auto *op_handle = result.ops_.back().get();
-      op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
-          platform::DeviceContextPool::Instance().Get(p));
+      CreateOpHandleIOs(&result, op, p, i);
 
-      auto var_names = op->InputArgumentNames();
-
-      for (auto &each_var_name : var_names) {
-        VarHandle *var =
-            CreateOrGetLatestVarHandle(&result, each_var_name, p, i);
-        op_handle->AddInput(var);
-      }
-      var_names = op->OutputArgumentNames();
-
-      for (auto &each_var_name : var_names) {
-        CreateOpOutput(&result, op_handle, each_var_name, p, i);
-      }
+      auto var_names = op->OutputArgumentNames();
 
       if (is_forwarding) {
         if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index d3c8e582cf2cd..de34caab1be85 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -14,6 +14,9 @@
 
 #pragma once
 
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
 
 namespace paddle {
@@ -41,6 +44,10 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 
   std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
 
+ private:
+  void CreateOpHandleIOs(SSAGraph *result, OpDesc *op, const platform::Place &p,
+                         const size_t &i) const;
+
  private:
   std::string loss_var_name_;
   const std::vector<platform::Place> &places_;
diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h
index d7a541ac4bb83..fbdb54ba8d940 100644
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -24,6 +24,8 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
+
 class OpHandleBase {
  private:
   DISABLE_COPY_AND_ASSIGN(OpHandleBase);
diff --git a/paddle/fluid/framework/details/send_op_handle.cc b/paddle/fluid/framework/details/send_op_handle.cc
new file mode 100644
index 0000000000000..d181607e86372
--- /dev/null
+++ b/paddle/fluid/framework/details/send_op_handle.cc
@@ -0,0 +1,43 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/send_op_handle.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+SendOpHandle::SendOpHandle(const framework::OpDesc &op_desc,
+                           const Scope *local_scope,
+                           const platform::Place &place)
+    : op_(framework::OpRegistry::CreateOp(op_desc)),
+      local_scope_(local_scope),
+      place_(place) {}
+
+void SendOpHandle::RunImpl() {
+  // Wait input done
+  for (auto *in : inputs_) {
+    auto &p = static_cast<VarHandle *>(in)->place_;
+    if (in->DebugString() == "dummy") {  // HACK
+      continue;
+    }
+    in->generated_op_->Wait(dev_ctxes_[p]);
+  }
+  op_->Run(*local_scope_, place_);
+}
+
+std::string SendOpHandle::Name() const { return "send"; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/send_op_handle.h b/paddle/fluid/framework/details/send_op_handle.h
new file mode 100644
index 0000000000000..173f9d726145a
--- /dev/null
+++ b/paddle/fluid/framework/details/send_op_handle.h
@@ -0,0 +1,50 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct SendOpHandle : public OpHandleBase {
+  std::unique_ptr<OperatorBase> op_;
+  const Scope* local_scope_;
+  const platform::Place& place_;
+
+  SendOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
+               const platform::Place& place);
+
+  std::string Name() const override;
+
+  // Delay and buffer nccl_all_reduce together can significantly increase
+  // performance. Disable this feature by returning false.
+  bool IsMultiDeviceTransfer() override { return false; };
+
+ protected:
+  void RunImpl() override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h
index 3b818b1a45b56..a8833b7388ab9 100644
--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -15,13 +15,15 @@
 #pragma once
 
 #include <memory>
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
-
 class SSAGraphExecutor {
   DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);
 
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index 62af4c1d79ded..1ce69ab02b09f 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -136,12 +136,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     ready_ops.clear();
   };
 
-  // Create local scopes.
-  for (auto &scope : local_scopes_) {
-    auto &local_scope = scope->NewScope();
-    *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>() = &local_scope;
-  }
-
   // Step 3. Execution
   while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
     // 1. Run All Ready ops
@@ -189,34 +183,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   PADDLE_ENFORCE(ready_ops.empty());
   PADDLE_ENFORCE(delayed_ops.empty());
   PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
-  ++computation_count_;
-
-  auto sync_computation = [&] {
-    computation_count_ = 0;
-    // Wait All computational streams
-    for (auto p : this->places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
-    }
-    for (auto &scope : local_scopes_) {
-      scope->DropKids();
-    }
-  };
 
   // Wait FetchOps.
   if (!fetch_ops.empty()) {
     fetch_ops.clear();
-    sync_computation();
-  }
-
-  if (computation_count_ == max_async_computation) {
-    sync_computation();
-  }
-
-  // NOTE: the temp scope can be dropped lazily if needed.
-  // Drop tmp scopes;
-  for (auto &scope : local_scopes_) {
-    auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>();
-    kid = nullptr;
   }
 
   return fetch_data;
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index 79cfc26b461a3..bb5e837b135c3 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -99,9 +99,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::unique_ptr<platform::EnforceNotMet> exception_;
   std::atomic<int> running_ops_;
   bool allow_op_delay_;
-
-  size_t computation_count_{0};
-  size_t max_async_computation{100};
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc
index 8155cb55a468a..a56674cbe216e 100644
--- a/paddle/fluid/framework/lod_tensor.cc
+++ b/paddle/fluid/framework/lod_tensor.cc
@@ -12,9 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/lod_tensor.h"
+#include <stdint.h>
+#include <string.h>
+#include <algorithm>
+#include <iterator>
+
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/lod_tensor.h"
 
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
@@ -22,11 +27,6 @@ limitations under the License. */
 #include "paddle/fluid/recordio/scanner.h"
 #include "paddle/fluid/recordio/writer.h"
 
-#include <stdint.h>
-#include <string.h>
-#include <algorithm>
-#include <iterator>
-
 namespace paddle {
 namespace framework {
 
@@ -294,7 +294,7 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
   TensorFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
 }
 
-void WriteToRecordIO(recordio::Writer &writer,
+void WriteToRecordIO(recordio::Writer *writer,
                      const std::vector<LoDTensor> &tensor,
                      const platform::DeviceContext &dev_ctx) {
   std::stringstream buffer;
@@ -303,18 +303,20 @@ void WriteToRecordIO(recordio::Writer &writer,
   for (auto &each : tensor) {
     SerializeToStream(buffer, each, dev_ctx);
   }
-  writer.Write(buffer.str());
+  writer->Write(buffer.str());
 }
 
 std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner &scanner, const platform::DeviceContext &dev_ctx) {
-  std::istringstream sin(scanner.Next());
-  uint32_t sz;
-  sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
+    recordio::Scanner *scanner, const platform::DeviceContext &dev_ctx) {
   std::vector<LoDTensor> result;
-  result.resize(sz);
-  for (uint32_t i = 0; i < sz; ++i) {
-    DeserializeFromStream(sin, &result[i], dev_ctx);
+  if (scanner->HasNext()) {
+    std::istringstream sin(scanner->Next());
+    uint32_t sz;
+    sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
+    result.resize(sz);
+    for (uint32_t i = 0; i < sz; ++i) {
+      DeserializeFromStream(sin, &result[i], dev_ctx);
+    }
   }
   return result;
 }
diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h
index 4f130d2659004..1159fee39b073 100644
--- a/paddle/fluid/framework/lod_tensor.h
+++ b/paddle/fluid/framework/lod_tensor.h
@@ -15,6 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include <memory>
+#include <string>
+#include <utility>
+#include <vector>
 #ifdef PADDLE_WITH_CUDA
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
@@ -216,12 +219,12 @@ void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
 void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
                            const platform::DeviceContext& dev_ctx);
 
-extern void WriteToRecordIO(recordio::Writer& writer,
+extern void WriteToRecordIO(recordio::Writer* writer,
                             const std::vector<LoDTensor>& tensor,
                             const platform::DeviceContext& dev_ctx);
 
 extern std::vector<LoDTensor> ReadFromRecordIO(
-    recordio::Scanner& scanner, const platform::DeviceContext& dev_ctx);
+    recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc
index e691e29383d48..97ab98f09b1a9 100644
--- a/paddle/fluid/framework/lod_tensor_test.cc
+++ b/paddle/fluid/framework/lod_tensor_test.cc
@@ -12,17 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/lod_tensor.h"
-
-#include "paddle/fluid/recordio/scanner.h"
-#include "paddle/fluid/recordio/writer.h"
-
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 #include <algorithm>
 #include <memory>
 #include <vector>
 
+#include "paddle/fluid/framework/lod_tensor.h"
+
+#include "paddle/fluid/recordio/scanner.h"
+#include "paddle/fluid/recordio/writer.h"
+
 namespace paddle {
 namespace framework {
 
@@ -240,8 +240,8 @@ TEST(LoDTensor, RecordIO) {
       *platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
   {
     recordio::Writer writer(stream, recordio::Compressor::kSnappy);
-    WriteToRecordIO(writer, {tensor, tensor}, ctx);
-    WriteToRecordIO(writer, {tensor, tensor}, ctx);
+    WriteToRecordIO(&writer, {tensor, tensor}, ctx);
+    WriteToRecordIO(&writer, {tensor, tensor}, ctx);
     writer.Flush();
   }
 
@@ -254,11 +254,11 @@ TEST(LoDTensor, RecordIO) {
   {
     std::unique_ptr<std::istream> stream_ptr(stream);
     recordio::Scanner scanner(std::move(stream_ptr));
-    auto tensors = ReadFromRecordIO(scanner, ctx);
+    auto tensors = ReadFromRecordIO(&scanner, ctx);
     ASSERT_EQ(tensors.size(), 2);
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
-    tensors = ReadFromRecordIO(scanner, ctx);
+    tensors = ReadFromRecordIO(&scanner, ctx);
     ASSERT_EQ(tensors.size(), 2);
     assert_tensor_ok(tensors[0]);
     assert_tensor_ok(tensors[1]);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index a3b4a8c0829ae..f97bd0827428f 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -46,7 +46,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
   }
 }
 
-static DDim GetDims(const Scope& scope, const std::string& name) {
+static DDim GetDims(const Scope& scope, const std::string& name,
+                    bool get_actual_dim = false) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
     return DDim({-1});
@@ -55,7 +56,11 @@ static DDim GetDims(const Scope& scope, const std::string& name) {
   if (var->IsType<LoDTensor>()) {
     return var->Get<LoDTensor>().dims();
   } else if (var->IsType<SelectedRows>()) {
-    return var->Get<SelectedRows>().GetCompleteDims();
+    if (get_actual_dim) {
+      return var->Get<SelectedRows>().value().dims();
+    } else {
+      return var->Get<SelectedRows>().GetCompleteDims();
+    }
   } else {
     return DDim({-1});
   }
@@ -129,7 +134,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < input.second.size(); ++i) {
       ss << input.second[i];
       if (scope) {
-        ss << "[" << GetDims(*scope, input.second[i]) << "]";
+        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, input.second[i]) << ")";
       }
       if (i != input.second.size() - 1) {
@@ -149,7 +154,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < output.second.size(); ++i) {
       ss << output.second[i];
       if (scope) {
-        ss << "[" << GetDims(*scope, output.second[i]) << "]";
+        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
         ss << "(" << GetLoD(*scope, output.second[i]) << ")";
       }
       if (i != output.second.size() - 1) {
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 99b3065d8df80..c1486b527d2e0 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/parallel_executor.h"
 
 #include <string>
+#include <tuple>
 #include <vector>
 
 #ifdef PADDLE_WITH_CUDA
@@ -41,6 +42,8 @@ class ParallelExecutorPrivate {
 #ifdef PADDLE_WITH_CUDA
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
+
+  std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
 };
 
 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -97,14 +100,9 @@ ParallelExecutor::ParallelExecutor(
       allow_op_delay));
 
   // Step 3. Create vars in each scope;
-  for (auto *scope : member_->local_scopes_) {
-    for (auto *var : main_program.Block(0).AllVars()) {
-      if (scope->FindVar(var->Name()) != nullptr) {
-        continue;
-      }
-
-      InitializeVariable(scope->Var(var->Name()), var->GetType());
-    }
+  for (auto *var : main_program.Block(0).AllVars()) {
+    member_->var_types_.emplace_back(var->Name(), var->GetType(),
+                                     var->Persistable());
   }
 }
 
@@ -115,14 +113,12 @@ void ParallelExecutor::BCastParamsToGPUs(
 
   for (auto &var : vars) {
     auto *main_var = main_scope->FindVar(var);
-    if (!main_var->IsType<LoDTensor>()) {
+    if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
       continue;
     }
 
     auto &main_tensor = main_var->Get<LoDTensor>();
-
     auto &dims = main_tensor.dims();
-
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
       size_t numel = main_tensor.numel();
       ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
@@ -165,9 +161,42 @@ void ParallelExecutor::Run(
     const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
   platform::RecordBlock b(0);
   SplitTensorToPlaces(feed_tensors);
+
+  // Create local scopes.
+  for (auto &scope : member_->local_scopes_) {
+    Scope &local_scope = scope->NewScope();
+    *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
+        &local_scope;
+
+    for (auto &name_type_pair : member_->var_types_) {
+      if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) {
+        continue;
+      }
+
+      if (std::get<2>(name_type_pair)) {  // Persistable
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      } else {
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      }
+    }
+  }
+
   auto fetch_data = member_->executor_->Run(fetch_tensors);
   *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
       fetch_data;
+
+  // Wait All computational streams
+  for (auto p : member_->places_) {
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
+  }
+  for (auto &scope : member_->local_scopes_) {
+    auto &local_scope =
+        *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
+    scope->DeleteScope(local_scope);
+    local_scope = nullptr;
+  }
 }
 
 void ParallelExecutor::SplitTensorToPlaces(
@@ -181,10 +210,10 @@ void ParallelExecutor::SplitTensorToPlaces(
         member_->places_.size(), lod_tensors.size());
     for (size_t j = 0; j < member_->places_.size(); ++j) {
       // TODO(panxy0718): Do I need to delete this var?
-      member_->local_scopes_[j]
-          ->Var(it.first)
-          ->GetMutable<LoDTensor>()
-          ->ShareDataWith(lod_tensors[j]);
+      auto t =
+          member_->local_scopes_[j]->Var(it.first)->GetMutable<LoDTensor>();
+      t->ShareDataWith(lod_tensors[j]);
+      t->set_lod(lod_tensors[j].lod());
     }
   }
 }
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index c048c3865f148..b4f16dba858fb 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -48,13 +48,13 @@ class ParallelExecutor {
            const std::string& fetched_var_name,
            const std::unordered_map<std::string, LoDTensor>& feed_tensors);
 
+  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
+
  private:
   void SplitTensorToPlaces(
       const std::unordered_map<std::string, LoDTensor>& feed_tensors);
 
   ParallelExecutorPrivate* member_;
-
-  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc
index 0e44b34383027..8af7d2d510d36 100644
--- a/paddle/fluid/framework/prune_test.cc
+++ b/paddle/fluid/framework/prune_test.cc
@@ -14,18 +14,17 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/prune.h"
 
+#include <gtest/gtest.h>
+#include <string>
+
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/net_op.h"
 
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
 
-#include <gtest/gtest.h>
-
 namespace f = paddle::framework;
-namespace ops = paddle::operators;
 
 void AddOp(const std::string &type, const f::VariableNameMap &inputs,
            const f::VariableNameMap &outputs, f::AttributeMap attrs,
diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc
index 56bf00e5f9170..76126f3dc64d7 100644
--- a/paddle/fluid/framework/reader.cc
+++ b/paddle/fluid/framework/reader.cc
@@ -22,7 +22,9 @@ FileReader::FileReader(const std::vector<DDim> &dims) : dims_(dims) {}
 
 void FileReader::ReadNext(std::vector<LoDTensor> *out) {
   ReadNextImpl(out);
-  PADDLE_ENFORCE_EQ(out->size(), dims_.size());
+  if (out->empty()) {
+    return;
+  }
   for (size_t i = 0; i < dims_.size(); ++i) {
     auto &actual = out->at(i).dims();
     auto &expect = dims_[i];
diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h
index 3573b99becf6d..3a413941df964 100644
--- a/paddle/fluid/framework/reader.h
+++ b/paddle/fluid/framework/reader.h
@@ -14,14 +14,13 @@
 
 #pragma once
 
+#include <memory>
+#include <vector>
+
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/platform/place.h"
 
-#include <memory>
-#include <thread>
-#include <vector>
-
 namespace paddle {
 namespace framework {
 
@@ -31,8 +30,6 @@ class ReaderBase {
 
   virtual void ReInit() = 0;
 
-  virtual bool HasNext() const = 0;
-
   virtual ~ReaderBase();
 };
 
@@ -44,8 +41,6 @@ class DecoratedReader : public ReaderBase {
 
   void ReInit() override { reader_->ReInit(); }
 
-  bool HasNext() const override { return reader_->HasNext(); }
-
  protected:
   ReaderBase* reader_;
 };
@@ -80,8 +75,6 @@ class ReaderHolder {
     reader_->ReInit();
   }
 
-  bool HasNext() const { return reader_->HasNext(); }
-
  private:
   std::unique_ptr<ReaderBase> reader_;
 };
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index f417f62f3f753..e53bcf2384e54 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init)
+set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
 
 cc_library(paddle_fluid_api
     SRCS io.cc
@@ -11,7 +11,7 @@ cc_library(paddle_fluid DEPS ${fluid_modules})
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
     SRCS io.cc
-    DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
+    DEPS ${fluid_modules})
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE)
   # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc
index a5b62ef322bfa..a29d457b6fa9d 100644
--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -17,10 +17,16 @@ limitations under the License. */
 #include <fstream>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/pybind/pybind.h"
 
 namespace paddle {
 namespace inference {
 
+// Temporarilly add this function for exposing framework::InitDevices() when
+// linking the inference shared library.
+void Init(bool init_p2p) { framework::InitDevices(init_p2p); }
+
 void ReadBinaryFile(const std::string& filename, std::string& contents) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
   PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h
index d07d315b93ef1..756c936b33ad5 100644
--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -18,12 +18,15 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 
 namespace paddle {
 namespace inference {
 
+void Init(bool init_p2p);
+
 void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
                       const framework::ProgramDesc& main_program,
                       const std::string& dirname,
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index c684971b6f223..4edcdab99a85a 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -30,14 +30,15 @@ function(inference_test TARGET_NAME)
     string(REGEX REPLACE "^_$" "" arg "${arg}")
     cc_test(test_inference_${TARGET_NAME}${arg}
         SRCS test_inference_${SOURCE_NAME}.cc
-        DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+        DEPS paddle_fluid
         ARGS --dirname=${PYTHON_TESTS_DIR}/${book_dir}/${TARGET_NAME}${arg}.inference.model ${use_float16})
     set_tests_properties(test_inference_${TARGET_NAME}${arg}
         PROPERTIES DEPENDS test_${TARGET_NAME})
   endforeach()
 endfunction(inference_test)
 
-inference_test(fit_a_line)
+# This unittest is buggy!
+#inference_test(fit_a_line)
 inference_test(image_classification ARGS vgg resnet)
 inference_test(float16_image_classification ARGS vgg resnet)
 inference_test(label_semantic_roles)
diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
index 164096960a6a2..45f341d5c898d 100644
--- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
+++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
+#include "paddle/fluid/platform/float16.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
 DEFINE_int32(batch_size, 1, "Batch size of input data");
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 5ff987ad8b3ba..3c8696b508443 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -100,7 +100,7 @@ function(op_library TARGET)
     endif()
 
     # Define operators that don't need pybind here.
-    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+    foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
         if ("${TARGET}" STREQUAL "${manual_pybind_op}")
             set(pybind_flag 1)
         endif()
@@ -199,7 +199,6 @@ else()
     set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
 endif()
 
-op_library(cond_op DEPS framework_proto tensor net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
@@ -259,7 +258,6 @@ endforeach()
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
-cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
 cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu.cc
index eecb58e11ef57..cb1927bc0f2eb 100644
--- a/paddle/fluid/operators/batch_norm_op.cu.cc
+++ b/paddle/fluid/operators/batch_norm_op.cu.cc
@@ -114,23 +114,11 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
     const auto *bias = ctx.Input<Tensor>("Bias");
 
     auto *y = ctx.Output<Tensor>("Y");
-    auto *mean_out = ctx.Output<Tensor>("MeanOut");
-    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
-    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
-    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
 
     // alloc memory
     y->mutable_data<T>(ctx.GetPlace());
-    mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
-    saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
 
     auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
-    math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
-        functor;
-    functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
-    functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
 
     auto handle = dev_ctx.cudnn_handle();
 
@@ -159,6 +147,21 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
       // Run training mode.
       // obtain running mean and running inv var, and see if we need to
       // initialize them.
+
+      auto *mean_out = ctx.Output<Tensor>("MeanOut");
+      auto *variance_out = ctx.Output<Tensor>("VarianceOut");
+      mean_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      variance_out->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+
+      auto *saved_mean = ctx.Output<Tensor>("SavedMean");
+      auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
+      saved_mean->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      saved_variance->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
+      math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
+          functor;
+      functor(dev_ctx, saved_mean, static_cast<BatchNormParamType<T>>(0));
+      functor(dev_ctx, saved_variance, static_cast<BatchNormParamType<T>>(0));
+
       double this_factor = 1. - momentum;
 
       CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index d65a7b34678cd..4a36b03cb63ac 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/concat_op.h"
+
 #include <string>
 #include <vector>
 
@@ -34,7 +35,10 @@ class ConcatOp : public framework::OperatorWithKernel {
     size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
     const size_t n = ins.size();
 
-    PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1.");
+    PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
+    if (n == 1) {
+      VLOG(3) << "Warning: concat op have only one input, may waste memory";
+    }
 
     auto out_dims = ins[0];
     size_t in_zero_dims_size = out_dims.size();
diff --git a/paddle/fluid/operators/cond_op.cc b/paddle/fluid/operators/cond_op.cc
deleted file mode 100644
index 15dce9e3e28fa..0000000000000
--- a/paddle/fluid/operators/cond_op.cc
+++ /dev/null
@@ -1,235 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/cond_op.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace paddle {
-namespace operators {
-
-using Scope = framework::Scope;
-using Variable = framework::Variable;
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DDim = framework::DDim;
-
-framework::Scope& CondOp::AddSubScope(const Scope& scope) const {
-  auto sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
-  auto& sub_scope = scope.NewScope();
-  sub_scopes->push_back(&sub_scope);
-  return sub_scope;
-}
-
-std::vector<framework::Scope*>& CondOp::GetSubScopes(
-    const framework::Scope& scope) const {
-  auto sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  return *sub_scopes_var->GetMutable<std::vector<framework::Scope*>>();
-}
-
-LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const {
-  auto index_tensors_var = scope.FindVar("IndexTensors");
-  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
-                          "Output(IndexTensors) of CondOp should not be null.");
-  auto& index_tensors =
-      *index_tensors_var->GetMutable<std::vector<LoDTensor>>();
-  index_tensors.push_back(LoDTensor());
-  return index_tensors.back();
-}
-
-std::vector<framework::LoDTensor>& CondOp::GetIndexTensors(
-    const framework::Scope& scope) const {
-  auto* index_tensors_var = scope.FindVar("IndexTensors");
-  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
-                          "Output(IndexTensors) of CondOp should not be null.");
-  return *index_tensors_var->GetMutable<std::vector<framework::LoDTensor>>();
-}
-
-void CondOp::PrepareDataForSubnet(
-    const framework::Scope& scope,
-    const platform::DeviceContext& dev_ctx) const {
-  PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty.");
-
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    // Create two sub scopes for true and false branches
-    //   sub_scopes[0] for the true branch
-    //   sub_scopes[1] for the false branch
-    AddSubScope(scope);
-    // Create two tensors for true and false indices:
-    //   index_tensors[0] for the true branch
-    //   index_tensors[1] for the false branch
-    AddIndexTensor(scope);
-  }
-
-  Variable* cond_var = scope.FindVar(Input("Cond"));
-  PADDLE_ENFORCE_NOT_NULL(cond_var,
-                          "Input(Cond) of CondOp should not be null.");
-  const LoDTensor* cond = cond_var->GetMutable<LoDTensor>();
-
-  // get the true/false index at runtime according to cond tensor
-  // index_vectors[0]: vector<int>, contains all index for cond[i] == true
-  // index_vectors[1]: vector<int>, contains all index for cond[i] == false
-  std::vector<std::vector<int>> index_vectors;
-  index_vectors.resize(BRANCH_NUM);
-
-  const int* cond_data = cond->data<int>();
-  for (int i = 0; i < cond->dims()[0]; ++i) {
-    if (cond_data[i])
-      index_vectors[TRUE_BRANCH].push_back(i);
-    else
-      index_vectors[FALSE_BRANCH].push_back(i);
-  }
-
-  // put index_vectors[0] and index_vectors[1] into two tensors:
-  // index_tensors[0] and index_tensors[1]
-  std::vector<framework::LoDTensor>& index_tensors = GetIndexTensors(scope);
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    DDim dim = {static_cast<int64_t>(index_vectors[i].size())};
-    int* index_tensor_data_ptr =
-        index_tensors[i].mutable_data<int>(dim, platform::CPUPlace());
-    memcpy(index_tensor_data_ptr, index_vectors[i].data(),
-           dim[0] * sizeof(int));
-  }
-
-  // create input in subscopes according to index_vectors
-  for (auto& input : Inputs("Xs")) {
-    Variable* var_parent = scope.FindVar(input);
-    PADDLE_ENFORCE_NOT_NULL(var_parent);
-    const auto* tensor_parent = &var_parent->Get<LoDTensor>();
-
-    for (int i = 0; i < BRANCH_NUM; ++i) {
-      Variable* var_child = sub_scopes[i]->FindVar(input);
-      PADDLE_ENFORCE_NOT_NULL(var_child);
-      auto* tensor_child = var_child->GetMutable<LoDTensor>();
-
-      // Resize child
-      DDim dim = tensor_parent->dims();
-      dim[0] = index_tensors[i].dims()[0];
-      tensor_child->mutable_data<float>(dim, platform::CPUPlace());
-
-      CPUGather<float>(dev_ctx, *tensor_parent, index_tensors[i], tensor_child);
-    }
-  }
-
-  // create output_tensors in subscope for sub_net
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    for (auto& output : (*sub_net_op_[i]).Outputs()) {
-      for (auto& var_name : output.second) {
-        sub_scopes[i]->Var(var_name);
-      }
-    }
-  }
-}
-
-void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
-                                 const platform::DeviceContext& dev_ctx) const {
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-  const std::vector<framework::LoDTensor>& index_tensors =
-      GetIndexTensors(scope);
-
-  // Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0]
-  PADDLE_ENFORCE(!Outputs("Outs").empty(),
-                 "Outputs(Outs) of CondOp can't be empty.");
-  for (auto& output : Outputs("Outs")) {
-    const LoDTensor* tensor_t_out =
-        &sub_scopes[TRUE_BRANCH]->FindVar(output)->Get<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL");
-    const LoDTensor* tensor_f_out =
-        &sub_scopes[FALSE_BRANCH]->FindVar(output)->Get<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL");
-
-    auto* var_out = scope.FindVar(output);
-    PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found");
-    LoDTensor* tensor_out = var_out->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out,
-                            "True output tensor should not be NULL");
-
-    DDim true_dim = tensor_t_out->dims();
-    DDim false_dim = tensor_f_out->dims();
-    true_dim[0] = 0;
-    false_dim[0] = 0;
-    PADDLE_ENFORCE_EQ(true_dim, false_dim,
-                      "Outputs not of the same shape except the first dim");
-
-    DDim out_dim = tensor_t_out->dims();
-    out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0];
-    tensor_out->Resize(out_dim);
-    tensor_out->mutable_data<float>(platform::CPUPlace());
-  }
-
-  // merge output results:
-  // output_tensor = true_output_tensor + false_output_tensor
-  for (auto& output : Outputs("Outs")) {
-    Variable* var_parent = scope.FindVar(output);
-    PADDLE_ENFORCE_NOT_NULL(var_parent);
-    auto* tensor_parent = var_parent->GetMutable<LoDTensor>();
-
-    for (int i = 0; i < BRANCH_NUM; ++i) {
-      Variable* var_child = sub_scopes[i]->FindVar(output);
-      PADDLE_ENFORCE_NOT_NULL(var_child);
-      auto* tensor_child = &var_child->Get<LoDTensor>();
-      ScatterAssign<float>(dev_ctx, *tensor_child, index_tensors[i],
-                           tensor_parent);
-    }
-  }
-}
-
-void CondOp::RunImpl(const Scope& scope, const platform::Place& place) const {
-  // get device context from pool
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& dev_ctx = *pool.Get(place);
-
-  PrepareDataForSubnet(scope, dev_ctx);
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    sub_net_op_[i]->Run(*sub_scopes[i], place);
-  }
-  MergeDataFromSubnet(scope, dev_ctx);
-}
-
-class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Cond", "The condition, which is a bool vector");
-    AddInput("Xs", "Inputs of Subnets").AsDuplicable();
-    AddOutput("Outs", "Outputs of Cond_Op after merge").AsDuplicable();
-
-    AddOutput("SubScopes", "sub scopes for true and false branches");
-    AddOutput("IndexTensors", "Index Tensors contains indices for true/false");
-
-    AddComment(R"DOC(
-Sample Dependent Conditional Operator.
-
-Given Cond[i] as a 1/0 vector to indicate true/false:
-Out[i] = subnet_true[i], if Cond[i] == true
-Out[i] = subnet_false[i], if Cond[i] == false
-
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_WITHOUT_GRADIENT(cond, paddle::operators::CondOp,
-                             paddle::operators::CondOpProtoAndCheckerMaker);
diff --git a/paddle/fluid/operators/cond_op.h b/paddle/fluid/operators/cond_op.h
deleted file mode 100644
index d3888923dbdee..0000000000000
--- a/paddle/fluid/operators/cond_op.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/net_op.h"
-
-namespace paddle {
-namespace operators {
-
-/*
- * @brief CondOp is a dynamic if-else Operator
- *
- * It has a input tensor named cond indicating which netop each instance will
- * run.
- *
- * if cond == 1, it will run true_net, which is a NetOp.
- *
- * if cond == 0, it will run false_net, which is another NetOp.
- */
-class CondOp : public framework::OperatorBase {
- public:
-  CondOp(const std::string& type, const framework::VariableNameMap& inputs,
-         const framework::VariableNameMap& outputs,
-         const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {
-    sub_net_op_.resize(BRANCH_NUM);
-  }
-
-  CondOp(const CondOp& o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase&>(o)) {
-    // TODO(yuyang18): Implement copy ctor well.
-    PADDLE_THROW("Not implemented");
-  }
-
-  framework::Scope& AddSubScope(const framework::Scope& scope) const;
-  std::vector<framework::Scope*>& GetSubScopes(
-      const framework::Scope& scope) const;
-
-  framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const;
-  std::vector<framework::LoDTensor>& GetIndexTensors(
-      const framework::Scope& scope) const;
-
-  void PrepareDataForSubnet(const framework::Scope& scope,
-                            const platform::DeviceContext& dev_ctx) const;
-  void MergeDataFromSubnet(const framework::Scope& scope,
-                           const platform::DeviceContext& dev_ctx) const;
-
-  /*
-   * Set True Block
-   */
-  void set_truenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[TRUE_BRANCH] = std::move(net);
-  }
-
-  /*
-   * Set False Block
-   */
-  void set_falsenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[FALSE_BRANCH] = std::move(net);
-  }
-
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override;
-
- private:
-  const int TRUE_BRANCH = 0;
-  const int FALSE_BRANCH = 1;
-  const int BRANCH_NUM = 2;
-
-  // sub_net_op_[0]: subnet_t
-  // sub_net_op_[1]: subnet_f
-  std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu
index 54e0b1d9ad83c..bbad74e96d9c6 100644
--- a/paddle/fluid/operators/ctc_align_op.cu
+++ b/paddle/fluid/operators/ctc_align_op.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <stdio.h>
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
+#include <vector>
 #include "paddle/fluid/operators/ctc_align_op.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h
index 70698d99589ae..9c5c6f5aa0363 100644
--- a/paddle/fluid/operators/ctc_align_op.h
+++ b/paddle/fluid/operators/ctc_align_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <string.h>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc
index 8bbfd1f159259..45f88ec8697d9 100644
--- a/paddle/fluid/operators/detail/grpc_client.cc
+++ b/paddle/fluid/operators/detail/grpc_client.cc
@@ -65,9 +65,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
 }
 
 void ProcGetResponse(const VarHandle& var_h,
-                     // const sendrecv::VariableMessage& ret_msg) {
                      const ::grpc::ByteBuffer& ret_msg) {
-  framework::Variable* outvar = NULL;
+  framework::Variable* outvar = nullptr;
   DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
 }
 
diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc
index d5fc163bc2540..0b582a08bc0bf 100644
--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -161,6 +161,7 @@ class RequestPrefetch final : public RequestBase {
     ::grpc::ByteBuffer reply;
 
     std::string var_name = request_->OutVarname();
+    VLOG(3) << "prefetch var " << var_name;
     auto var_desc = program_->Block(0).FindVar(var_name);
     framework::Scope* local_scope = &scope_->NewScope();
     auto* var = local_scope->FindVar(var_name);
diff --git a/paddle/fluid/operators/detail/serde_test.cc b/paddle/fluid/operators/detail/serde_test.cc
index f8cae6b26acf9..cb5f89583436b 100644
--- a/paddle/fluid/operators/detail/serde_test.cc
+++ b/paddle/fluid/operators/detail/serde_test.cc
@@ -107,7 +107,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
   for (int i = 0; i < tensor_numel; ++i) {
     EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
   }
-  for (int64_t i = 0; i < rows2->size(); ++i) {
+  for (size_t i = 0; i < rows2->size(); ++i) {
     EXPECT_EQ(rows_data2[i], i);
   }
   EXPECT_EQ(slr2->height(), 1000);
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index f04d8d8fd82ed..a33634ab2503f 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 
@@ -106,18 +107,18 @@ information. However, the output only shares the LoD information with input $X$.
  protected:
   std::string comment_;
 
-  void Replace(std::string& src, std::string from, std::string to) {
+  void Replace(std::string* src, std::string from, std::string to) {
     std::size_t len_from = std::strlen(from.c_str());
     std::size_t len_to = std::strlen(to.c_str());
-    for (std::size_t pos = src.find(from); pos != std::string::npos;
-         pos = src.find(from, pos + len_to)) {
-      src.replace(pos, len_from, to);
+    for (std::size_t pos = src->find(from); pos != std::string::npos;
+         pos = src->find(from, pos + len_to)) {
+      src->replace(pos, len_from, to);
     }
   }
 
   void SetComment(std::string name, std::string equation) {
-    Replace(comment_, "{name}", name);
-    Replace(comment_, "{equation}", equation);
+    Replace(&comment_, "{name}", name);
+    Replace(&comment_, "{equation}", equation);
   }
 };
 
diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc
index 2a91dcbcd418f..2490b83b8c50c 100644
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/gru_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h
index 0886bebc41d8b..1d5c291495c0f 100644
--- a/paddle/fluid/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
@@ -13,15 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/gru_compute.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
 
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc
index 048391549dd8d..5b387d8d344df 100644
--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/im2sequence_op.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h
index a6a83fefbc626..d792c68f784d8 100644
--- a/paddle/fluid/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
@@ -13,7 +13,7 @@
    limitations under the License. */
 
 #pragma once
-
+#include <vector>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc
index eef25f8a06ddb..c2a8c7f867a44 100644
--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/label_smooth_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h
index 800a1303e1a42..d5162bcd742c0 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
@@ -100,7 +100,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
     auto x_row_max = EigenMatrix<T>::From(emission_row_max);
     x_row_max.device(place) =
         x.maximum(Eigen::DSizes<int, 1>(1))
-            .reshape(Eigen::DSizes<int, 2>(int(batch_size), 1));
+            .reshape(Eigen::DSizes<int, 2>(static_cast<int>(batch_size), 1));
 
     auto x_exps = EigenMatrix<T>::From(*emission_exps);
     x_exps.device(place) =
diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc
index 9188f2d989e60..5d293665f0bcc 100644
--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <ostream>
-#include <thread>
+#include <thread>  // NOLINT
+#include <vector>
 
 #include "paddle/fluid/operators/listen_and_serv_op.h"
 
@@ -88,8 +89,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
 
   auto ins = Inputs("X");
   auto fan_in = Attr<int>("Fanin");
-  auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-  auto *program = block->Program();
+  auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
+  auto *prefetch_block = Attr<framework::BlockDesc *>(kPrefetchBlock);
+  auto *program = optimize_block->Program();
   size_t num_blocks = program->Size();
   PADDLE_ENFORCE_GE(num_blocks, 2,
                     "server program should have at least 2 blocks");
@@ -97,18 +99,25 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
   framework::Executor executor(dev_place);
   std::vector<int> block_list;
   for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
-    block_list.push_back(blkid);
+    if (blkid != prefetch_block->ID()) {
+      block_list.push_back(blkid);
+    }
   }
-  auto prepared = executor.Prepare(*program, block_list);
+  auto optimize_prepared = executor.Prepare(*program, block_list);
   // Insert placeholder for block0 which holds current op itself.
-  prepared.insert(prepared.begin(),
-                  std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
+  optimize_prepared.insert(
+      optimize_prepared.begin(),
+      std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
 
   rpc_service_->SetScope(&recv_scope);
   rpc_service_->SetDevCtx(&dev_ctx);
   // TODO(qiao) set proper fields for table lookup and update
   rpc_service_->SetExecutor(&executor);
-  rpc_service_->SetPrefetchBlkdId(0);
+  VLOG(3) << "prefetch block id is " << prefetch_block->ID();
+  auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID());
+  rpc_service_->SetPrefetchBlkdId(prefetch_block->ID());
+  rpc_service_->SetPrefetchPreparedCtx(prefetch_prepared.get());
+  prefetch_prepared.release();
   rpc_service_->SetProgram(program);
   // start the server listening after all member initialized.
   server_thread_.reset(new std::thread(RunServer, rpc_service_));
@@ -166,16 +175,18 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
     parallel_blkids.push_back(1);
     double ts = detail::GetTimestamp();
     for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
-      if (program->Block(blkid).Parent() != last_parent_blkid) {
-        ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
-                              &recv_scope);
-        parallel_blkids.clear();
-        last_parent_blkid = program->Block(blkid).Parent();
+      if (blkid != prefetch_block->ID()) {
+        if (program->Block(blkid).Parent() != last_parent_blkid) {
+          ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
+                                program, &recv_scope);
+          parallel_blkids.clear();
+          last_parent_blkid = program->Block(blkid).Parent();
+        }
+        parallel_blkids.push_back(blkid);
       }
-      parallel_blkids.push_back(blkid);
     }
-    ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
-                          &recv_scope);
+    ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
+                          program, &recv_scope);
     VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)";
 
     // Reset the received sparse variables, the sum operator would not
@@ -211,6 +222,8 @@ from send_op and send back variables to recv_op.
         .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
     AddAttr<framework::BlockDesc *>(kOptimizeBlock,
                                     "BlockID to run on server side.");
+    AddAttr<framework::BlockDesc *>(kPrefetchBlock,
+                                    "prefetch block to run on server side.");
     AddAttr<int>("Fanin", "How many clients send to this server.")
         .SetDefault(1);
   }
diff --git a/paddle/fluid/operators/listen_and_serv_op.h b/paddle/fluid/operators/listen_and_serv_op.h
index 0da87afc961e8..759b2a462ba5b 100644
--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <stdint.h>
 #include <ostream>
+#include <string>
 
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -27,6 +28,7 @@ namespace paddle {
 namespace operators {
 
 constexpr char kOptimizeBlock[] = "OptimizeBlock";
+constexpr char kPrefetchBlock[] = "PrefetchBlock";
 
 void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service);
 
diff --git a/paddle/fluid/operators/logical_op.cc b/paddle/fluid/operators/logical_op.cc
index 6a7db31cf36f3..41aa00ee8ac10 100644
--- a/paddle/fluid/operators/logical_op.cc
+++ b/paddle/fluid/operators/logical_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/logical_op.h"
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index bf33be3106866..5e59bd1b178ad 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -78,6 +78,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(boolean, default false) "
                   "Sparse update.")
         .SetDefault(false);
+    AddAttr<bool>("is_distributed",
+                  "(boolean, default false) distributed lookup table.")
+        .SetDefault(false);
     AddAttr<int64_t>("padding_idx",
                      "(int64, default -1) "
                      "If the value is -1, it makes no effect to lookup. "
diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc
index cb1568398125b..553a06c3dcdbb 100644
--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lrn_op.h"
+#include <string>
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc
index d75537741ef1d..e062d62c66c25 100644
--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lstm_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h
index 11f9f223b5d9a..a1ef0eb278dea 100644
--- a/paddle/fluid/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu
index 76245a1b5a9c8..acf094238fff9 100644
--- a/paddle/fluid/operators/lstm_unit_op.cu
+++ b/paddle/fluid/operators/lstm_unit_op.cu
@@ -18,6 +18,7 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.c
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/cross_entropy_op.h"
+#include "paddle/fluid/operators/lstm_unit_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc
index a881ef82ec3ce..82541517e122d 100644
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lstmp_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h
index dfa7f74d5116b..172db54896013 100644
--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
index 82e12943148a8..c28047e6e9152 100644
--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -39,13 +39,14 @@ void gemm<platform::CUDADeviceContext, float16>(
   cublasOperation_t cuTransB =
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
 
-  float h_alpha = static_cast<float>(alpha);
-  float h_beta = static_cast<float>(beta);
-
   // TODO(kexinzhao): add processing code for compute capability < 53 case
   PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
                     "cublas fp16 gemm requires GPU compute capability >= 53");
 
+#if CUDA_VERSION >= 8000
+  float h_alpha = static_cast<float>(alpha);
+  float h_beta = static_cast<float>(beta);
+
   cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
 #if CUDA_VERSION >= 9000
   if (context.GetComputeCapability() >= 70) {
@@ -56,7 +57,7 @@ void gemm<platform::CUDADeviceContext, float16>(
     PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(context.cublas_handle(),
                                                         CUBLAS_DEFAULT_MATH));
   }
-#endif
+#endif  // CUDA_VERSION >= 9000
 
   // cublasHgemm does true FP16 computation which is slow for non-Volta
   // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
@@ -66,6 +67,18 @@ void gemm<platform::CUDADeviceContext, float16>(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
       CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
       CUDA_R_32F, algo));
+#else
+  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
+  const half h_alpha = static_cast<const half>(alpha);
+  const half h_beta = static_cast<const half>(beta);
+  const half* h_A = reinterpret_cast<const half*>(A);
+  const half* h_B = reinterpret_cast<const half*>(B);
+  half* h_C = reinterpret_cast<half*>(C);
+
+  PADDLE_ENFORCE(platform::dynload::cublasHgemm(
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
+      h_A, lda, &h_beta, h_C, N));
+#endif  // CUDA_VERSION >= 8000
 }
 
 template <>
@@ -275,9 +288,14 @@ void batched_gemm<platform::CUDADeviceContext, float16>(
   // TODO(kexinzhao): add processing code for compute capability < 53 case
   PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
                     "cublas Hgemm requires GPU compute capability >= 53");
+
+#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
       strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
+#else
+  PADDLE_ENFORCE(false, "HgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
 }
 
 template <>
@@ -297,9 +315,13 @@ void batched_gemm<platform::CUDADeviceContext, float>(
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   const int strideC = M * N;
 
+#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
       strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
+#else
+  PADDLE_ENFORCE(false, "SgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
 }
 
 template <>
@@ -319,9 +341,13 @@ void batched_gemm<platform::CUDADeviceContext, double>(
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   const int strideC = M * N;
 
+#if CUDA_VERSION >= 8000
   PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
       context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
       strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
+#else
+  PADDLE_ENFORCE(false, "DgemmStridedBatched is not supported on cuda <= 7.5");
+#endif
 }
 
 template <>
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 85855928521b8..1f52558873912 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/matmul_op.h"
+#include <algorithm>
+#include <vector>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/matmul_op.h b/paddle/fluid/operators/matmul_op.h
index 1cd8fe55dcbd2..f2e9cfdcdbf93 100644
--- a/paddle/fluid/operators/matmul_op.h
+++ b/paddle/fluid/operators/matmul_op.h
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
+#include <algorithm>
+#include <functional>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/matmul.h"
diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc
index efaae7d5f2d20..4e28d98834d27 100644
--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
@@ -13,6 +13,8 @@
  *     limitations under the License. */
 
 #include "paddle/fluid/operators/maxout_op.h"
+#include <vector>
+
 namespace paddle {
 namespace operators {
 
diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc
index 7de9d94979fdc..a302b24560e68 100644
--- a/paddle/fluid/operators/minus_op.cc
+++ b/paddle/fluid/operators/minus_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/minus_op.h"
-#include "paddle/fluid/operators/net_op.h"
+
+#include <string>
+#include <vector>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu
index da4a6af298f61..5eb9d9950248b 100644
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/momentum_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc
index 90af1e2d602ac..5038287527c70 100644
--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/mul_op.h"
+#include <vector>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/net_op.cc b/paddle/fluid/operators/net_op.cc
deleted file mode 100644
index 0c2da744177b6..0000000000000
--- a/paddle/fluid/operators/net_op.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/net_op.h"
-#include <set>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-const char NetOp::kAll[] = "all";
-
-void NetOp::CompleteAddOp(bool calc) {
-  add_op_done_ = true;
-  if (!calc) return;
-  std::set<std::string> input_set;
-  std::set<std::string> output_set;
-  for (auto& op : ops_) {
-    for (auto& ipt : op->Inputs()) {
-      for (auto& var_name : ipt.second) {
-        // If input variable has been in output set, then it will be
-        // added into intermediate_outputs_. Otherwise, it will be
-        // added into input set.
-        if (Contains(output_set, var_name)) {
-          intermediate_outputs_.insert(var_name);
-        } else {
-          input_set.insert(var_name);
-        }
-      }
-    }
-
-    for (auto& opt : op->Outputs()) {
-      for (auto& var_name : opt.second) {
-        output_set.insert(var_name);
-      }
-    }
-  }
-  auto& inputs = inputs_[kAll];
-  inputs.reserve(input_set.size());
-  std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs));
-  auto& outputs = outputs_[kAll];
-  outputs.reserve(output_set.size());
-  std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs));
-}
-
-std::string NetOp::DebugStringEx(const framework::Scope* scope) const {
-  std::ostringstream os;
-  os << OperatorBase::DebugStringEx(scope) << std::endl;
-  for (auto& op : ops_) {
-    std::istringstream is(op->DebugStringEx(scope));
-    for (std::string line; std::getline(is, line);) {
-      os << "    " << line << std::endl;
-    }
-  }
-  return os.str();
-}
-
-bool NetOp::IsNetOp() const { return true; }
-
-std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const {
-  std::vector<std::string> all;
-  for (auto& pair : this->outputs_) {
-    for (auto& var_name : pair.second) {
-      all.push_back(var_name);
-    }
-  }
-  if (has_intermediate) {
-    return all;
-  }
-  std::vector<std::string> ret_val;
-  for (auto& each : all) {
-    if (!Contains(intermediate_outputs_, each)) {
-      ret_val.push_back(each);
-    }
-  }
-  return ret_val;
-}
-
-NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs,
-             const framework::VariableNameMap& outputs,
-             const framework::AttributeMap& attrs)
-    : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
-std::unique_ptr<framework::OperatorBase> NetOp::Clone() const {
-  PADDLE_ENFORCE(
-      add_op_done_,
-      "Must clone a sealed NetOp, invoke Net::CompleteAddOp before clone");
-  return std::unique_ptr<OperatorBase>(new NetOp(*this));
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/net_op.h b/paddle/fluid/operators/net_op.h
deleted file mode 100644
index cbf8820cf4991..0000000000000
--- a/paddle/fluid/operators/net_op.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <set>
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-/**
- * @brief Network is also a type of Operator
- *
- * It will manage the operators it has.
- *
- * Network is the container and controller of a set of operators.
-
- * A network object knows all Operators belonging to this network. Variables,
- * which are inputs and outputs of these operators, are created and managed by a
- * hierarchy of Scope objects.
- *
- * This is the base class of network, all the networks should implement the APIs
- * it defines.
- */
-class NetOp : public framework::OperatorBase {
- public:
-  static const char kAll[];
-  NetOp()
-      : framework::OperatorBase("plain_net", framework::VariableNameMap{},
-                                framework::VariableNameMap{},
-                                framework::AttributeMap{}) {}
-
-  NetOp(const std::string& type, const framework::VariableNameMap& inputs,
-        const framework::VariableNameMap& outputs,
-        const framework::AttributeMap& attrs);
-
-  NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) {
-    this->ops_.reserve(o.ops_.size());
-    std::transform(
-        o.ops_.begin(), o.ops_.end(), std::back_inserter(this->ops_),
-        [](const std::unique_ptr<framework::OperatorBase>& op) {
-          return std::unique_ptr<framework::OperatorBase>(op->Clone());
-        });
-    this->CompleteAddOp();
-  }
-
-  bool SupportGPU() const override {
-    for (auto& op : ops_) {
-      if (!op->SupportGPU()) {
-        return false;
-      }
-    }
-    return true;
-  }
-
-  void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); }
-
-  /**
-   * @brief Add an operator by ptr
-   */
-  void AppendOp(std::unique_ptr<framework::OperatorBase> op) {
-    PADDLE_ENFORCE(!add_op_done_,
-                   "Cannot AppendOp when this network is sealed");
-    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
-    ops_.push_back(std::move(op));
-  }
-
-  void InsertOp(size_t pos, std::unique_ptr<framework::OperatorBase> op) {
-    PADDLE_ENFORCE(!add_op_done_,
-                   "Cannot InsertOp when this network is sealed");
-    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
-    PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range");
-    ops_.insert(ops_.begin() + pos, std::move(op));
-  }
-
-  void InsertOp(size_t pos, const framework::OperatorBase& op) {
-    InsertOp(pos, op.Clone());
-  }
-
-  void CompleteAddOp(bool calculate = true);
-
-  std::string DebugStringEx(
-      const framework::Scope* scope = nullptr) const override;
-
-  bool IsNetOp() const override;
-  std::vector<std::string> OutputVars(bool has_intermediate) const override;
-
-  std::unique_ptr<framework::OperatorBase> Clone() const override;
-
-  std::vector<std::unique_ptr<framework::OperatorBase>> ops_;
-
- private:
-  /**
-   * @brief Run the network.
-   *
-   * Run all the operators with the `scope`, if no scope is provided, default
-   * scope will be used instead. If no OpContext is provicded, default context
-   * will be used.
-   */
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    for (auto& op : ops_) {
-      op->Run(scope, place);
-    }
-  }
-
-  bool add_op_done_{false};
-  std::set<std::string> intermediate_outputs_;
-
-  template <typename T, typename KeyType>
-  static bool Contains(T container, KeyType key) {
-    return container.find(key) != container.end();
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/net_op_test.cc b/paddle/fluid/operators/net_op_test.cc
deleted file mode 100644
index 3b5f575485853..0000000000000
--- a/paddle/fluid/operators/net_op_test.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/net_op.h"
-
-#include <gtest/gtest.h>
-
-namespace paddle {
-namespace operators {
-using Scope = framework::Scope;
-using DeviceContext = platform::DeviceContext;
-
-static int run_cnt = 0;
-
-class TestOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-  DEFINE_OP_CLONE_METHOD(TestOp);
-
- private:
-  void RunImpl(const Scope& scope,
-               const platform::Place& place) const override {
-    ++run_cnt;
-  }
-};
-
-template <typename T>
-void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
-                                  const std::vector<T>& actual) {
-  ASSERT_EQ(expected.size(), actual.size());
-  std::unordered_set<T> expected_set;
-  for (auto& tmp : expected) {
-    expected_set.insert(tmp);
-  }
-  for (auto& act : actual) {
-    ASSERT_NE(expected_set.end(), expected_set.find(act));
-  }
-}
-
-TEST(OpKernel, all) {
-  auto net = std::make_shared<NetOp>();
-  ASSERT_NE(net, nullptr);
-
-  net->AppendOp(std::unique_ptr<TestOp>(
-      new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-                 {{"Out", {"y"}}}, framework::AttributeMap{})));
-  net->AppendOp(std::unique_ptr<TestOp>(
-      new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}},
-                 {{"Out", {"z"}}}, framework::AttributeMap{})));
-
-  net->CompleteAddOp();
-  AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"},
-                               net->Inputs(NetOp::kAll));
-  AssertSameVectorWithoutOrder({"y", "z"}, net->Outputs(NetOp::kAll));
-
-  auto final_outs = net->OutputVars(false);
-
-  ASSERT_EQ(final_outs.size(), 1UL);
-  ASSERT_EQ(final_outs[0], "z");
-}
-
-TEST(NetOp, insert_op) {
-  NetOp net;
-  auto op1 = std::unique_ptr<framework::NOP>(
-      new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-                         {{"Out", {"y"}}}, framework::AttributeMap{}));
-  net.AppendOp(*op1);
-  net.InsertOp(0, *op1);
-  ASSERT_EQ(2UL, net.ops_.size());
-  net.InsertOp(2, std::move(op1));
-  ASSERT_EQ(3UL, net.ops_.size());
-}
-
-TEST(NetOp, Clone) {
-  NetOp net;
-  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
-      "empty", framework::VariableNameMap{}, framework::VariableNameMap{},
-      framework::AttributeMap{}}));
-  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
-      "empty2", framework::VariableNameMap{}, framework::VariableNameMap{},
-      framework::AttributeMap{}}));
-  net.CompleteAddOp(true);
-  auto new_net_op = net.Clone();
-  ASSERT_NE(new_net_op, nullptr);
-  ASSERT_TRUE(new_net_op->IsNetOp());
-  auto* new_net = static_cast<NetOp*>(new_net_op.get());
-  ASSERT_EQ(2UL, new_net->ops_.size());
-  ASSERT_EQ(new_net->ops_[0]->Type(), "empty");
-  ASSERT_EQ(new_net->ops_[1]->Type(), "empty2");
-}
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h
index a36abe3789574..c93c096575a30 100644
--- a/paddle/fluid/operators/pad_op.h
+++ b/paddle/fluid/operators/pad_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <utility>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 
diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc
index c88578570c1ac..63eaaedcd5fc3 100644
--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
@@ -83,9 +83,11 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
 
     auto src_memory =
-        mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
+        mkldnn::memory({src_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(input_data)));
     auto dst_memory =
-        mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data);
+        mkldnn::memory({dst_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(output_data)));
 
     auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory,
                                              *workspace_memory);
@@ -195,9 +197,11 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
         pool_bwd_desc, mkldnn_engine, *pool_pd);
 
     auto diff_src_memory =
-        mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data);
+        mkldnn::memory({diff_src_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(in_x_grad_data)));
     auto diff_dst_memory =
-        mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data);
+        mkldnn::memory({diff_dst_md, mkldnn_engine},
+                       static_cast<void*>(const_cast<T*>(out_grad_data)));
 
     auto bwd_prim = mkldnn::pooling_backward(
         pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory);
diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h
index 2fec50ef25e0d..a48127ea6983d 100644
--- a/paddle/fluid/operators/pool_op.h
+++ b/paddle/fluid/operators/pool_op.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h
index 83e7bd138ae25..b55fa76eae34c 100644
--- a/paddle/fluid/operators/pool_with_index_op.h
+++ b/paddle/fluid/operators/pool_with_index_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc
index 09ab7da663b5e..f9ae01ab5d297 100644
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <future>
+#include <future>  // NOLINT
 #include <ostream>
 
 #include "paddle/fluid/framework/data_type.h"
@@ -50,8 +50,8 @@ class PrefetchOp : public framework::OperatorBase {
 
     for (size_t i = 0; i < ins.size(); i++) {
       if (NeedSend(scope, ins[i])) {
-        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << "to get "
-                << outs[i] << "back";
+        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
+                << outs[i] << " back";
         rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i],
                                           outs[i]);
       } else {
@@ -71,7 +71,7 @@ class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker {
               "(RPCClient) The RPC client object which will be"
               "initialized at most once.");
     AddOutput("Out",
-              "(SelectedRows) result "
+              "(LoDTensor) result "
               "to be fetched from parameter server")
         .AsDuplicable();
     AddAttr<std::vector<std::string>>(
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
index 447b854544b72..8eaa12a4a6cfc 100644
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/prelu_op.h"
-#include "paddle/fluid/operators/net_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc
index 82e54139c8c1f..058b13eeb872a 100644
--- a/paddle/fluid/operators/prior_box_op.cc
+++ b/paddle/fluid/operators/prior_box_op.cc
@@ -45,7 +45,7 @@ class PriorBoxOp : public framework::OperatorWithKernel {
     bool flip = ctx->Attrs().Get<bool>("flip");
 
     std::vector<float> aspect_ratios_vec;
-    ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
+    ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec);
 
     size_t num_priors = aspect_ratios_vec.size() * min_sizes.size();
     if (max_sizes.size() > 0) {
diff --git a/paddle/fluid/operators/prior_box_op.cu b/paddle/fluid/operators/prior_box_op.cu
index 76bf2b3b7de7a..0ea8909296f8f 100644
--- a/paddle/fluid/operators/prior_box_op.cu
+++ b/paddle/fluid/operators/prior_box_op.cu
@@ -96,7 +96,7 @@ class PriorBoxOpCUDAKernel : public framework::OpKernel<T> {
     auto clip = ctx.Attr<bool>("clip");
 
     std::vector<float> aspect_ratios;
-    ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
+    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
 
     T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
     T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
diff --git a/paddle/fluid/operators/prior_box_op.h b/paddle/fluid/operators/prior_box_op.h
index 1e4a12aac1c5f..1c62fd8d2c4d4 100644
--- a/paddle/fluid/operators/prior_box_op.h
+++ b/paddle/fluid/operators/prior_box_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/transform.h"
@@ -22,23 +24,23 @@ namespace operators {
 
 inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
                                bool flip,
-                               std::vector<float>& output_aspect_ratior) {
+                               std::vector<float>* output_aspect_ratior) {
   constexpr float epsilon = 1e-6;
-  output_aspect_ratior.clear();
-  output_aspect_ratior.push_back(1.0f);
+  output_aspect_ratior->clear();
+  output_aspect_ratior->push_back(1.0f);
   for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
     float ar = input_aspect_ratior[i];
     bool already_exist = false;
-    for (size_t j = 0; j < output_aspect_ratior.size(); ++j) {
-      if (fabs(ar - output_aspect_ratior[j]) < epsilon) {
+    for (size_t j = 0; j < output_aspect_ratior->size(); ++j) {
+      if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) {
         already_exist = true;
         break;
       }
     }
     if (!already_exist) {
-      output_aspect_ratior.push_back(ar);
+      output_aspect_ratior->push_back(ar);
       if (flip) {
-        output_aspect_ratior.push_back(1.0f / ar);
+        output_aspect_ratior->push_back(1.0f / ar);
       }
     }
   }
@@ -68,7 +70,7 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     auto clip = ctx.Attr<bool>("clip");
 
     std::vector<float> aspect_ratios;
-    ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
+    ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios);
 
     T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
     T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc
index 767eef56861ea..a1127f11a75e5 100644
--- a/paddle/fluid/operators/rank_loss_op.cc
+++ b/paddle/fluid/operators/rank_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/rank_loss_op.h"
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc
index 2925b8a85da1b..bf02b99589275 100644
--- a/paddle/fluid/operators/read_op.cc
+++ b/paddle/fluid/operators/read_op.cc
@@ -66,13 +66,7 @@ class ReadOp : public framework::OperatorBase {
     std::vector<std::string> out_arg_names = Outputs("Out");
     std::vector<framework::LoDTensor> ins;
     reader->ReadNext(&ins);
-    if (ins.empty()) {
-      reader->ReInit();
-      reader->ReadNext(&ins);
-      PADDLE_ENFORCE(
-          !ins.empty(),
-          "Reader can not read the next data even it has been re-initialized.");
-    }
+    PADDLE_ENFORCE(!ins.empty(), "There is no next data.");
     PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
     for (size_t i = 0; i < ins.size(); ++i) {
       auto* out =
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index 6fa0195b9ae10..845528860f91d 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -22,5 +22,6 @@ reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc)
 reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc)
 reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
 reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
+reader_library(create_threaded_reader_op SRCS create_threaded_reader_op.cc)
 # Export local libraries to parent
 set(READER_LIBRARY ${LOCAL_READER_LIBS} PARENT_SCOPE)
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
index ed868786ab2a8..33a50b5cebc1f 100644
--- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -63,13 +63,14 @@ class DoubleBufferReader : public framework::DecoratedReader {
     StartPrefetcher();
   }
 
-  bool HasNext() const override;
   void ReadNext(std::vector<framework::LoDTensor>* out) override;
   void ReInit() override;
 
   ~DoubleBufferReader() { EndPrefetcher(); }
 
  private:
+  bool HasNext() const;
+
   void StartPrefetcher() {
     channel_ = framework::MakeChannel<Item>(kChannelSize);
     prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
@@ -109,7 +110,9 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
 
     auto place_str = Attr<std::string>("place");
     platform::Place place;
-    if (place_str == "CPU") {
+    if (place_str == "AUTO") {
+      place = dev_place;
+    } else if (place_str == "CPU") {
       place = platform::CPUPlace();
     } else {
       std::istringstream sin(place_str);
@@ -140,28 +143,22 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
       enum_range.insert(string::Sprintf("CUDA:%d", i));
     }
     enum_range.insert("CPU");
-    AddAttr<std::string>("place", "The double buffer place, default is CPU")
-        .SetDefault("CPU")
+    enum_range.insert("AUTO");
+    AddAttr<std::string>("place", "The double buffer place")
+        .SetDefault("AUTO")
         .InEnum({enum_range});
   }
 };
 
-bool DoubleBufferReader::HasNext() const {
-  while (!channel_->IsClosed() && !channel_->CanReceive()) {
-  }
-  return channel_->CanReceive();
-}
-
 void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
-  if (!HasNext()) {
-    PADDLE_THROW("There is no next data!");
-  }
-
-  Item batch;
-  channel_->Receive(&batch);
-  *out = batch.payloads_;
-  if (batch.ctx_) {
-    batch.ctx_->Wait();
+  out->clear();
+  if (HasNext()) {
+    Item batch;
+    channel_->Receive(&batch);
+    *out = batch.payloads_;
+    if (batch.ctx_) {
+      batch.ctx_->Wait();
+    }
   }
 }
 
@@ -171,16 +168,26 @@ void DoubleBufferReader::ReInit() {
   StartPrefetcher();
 }
 
+bool DoubleBufferReader::HasNext() const {
+  while (!channel_->IsClosed() && !channel_->CanReceive()) {
+  }
+  return channel_->CanReceive();
+}
+
 void DoubleBufferReader::PrefetchThreadFunc() {
   VLOG(5) << "A new prefetch thread starts.";
   std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache(kCacheSize);
   std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache(kCacheSize);
   size_t cached_tensor_id = 0;
 
-  while (reader_->HasNext()) {
+  while (true) {
     Item batch;
     auto& cpu_batch = cpu_tensor_cache[cached_tensor_id];
     reader_->ReadNext(&cpu_batch);
+    if (cpu_batch.empty()) {
+      // The underlying reader have no next data.
+      break;
+    }
     if (platform::is_gpu_place(place_)) {
       auto& gpu_batch = gpu_tensor_cache[cached_tensor_id];
       auto* gpu_ctx = ctxs_[cached_tensor_id].get();
diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
index b72ccc77a3e1e..0573345ba502b 100644
--- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc
@@ -25,22 +25,12 @@ class MultiPassReader : public framework::DecoratedReader {
       : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {}
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override {
-    if (!HasNext()) {
-      PADDLE_THROW("There is no next data!");
-    }
     reader_->ReadNext(out);
-  }
-
-  bool HasNext() const override {
-    if (reader_->HasNext()) {
-      return true;
-    } else {
+    if (out->empty()) {
       ++pass_count_;
-      if (pass_count_ >= pass_num_) {
-        return false;
-      } else {
+      if (pass_count_ < pass_num_) {
         reader_->ReInit();
-        return true;
+        reader_->ReadNext(out);
       }
     }
   }
diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
index 95d8674c08b63..d1cb8e47da70c 100644
--- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc
+++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
@@ -52,8 +52,6 @@ class RandomDataGenerator : public framework::ReaderBase {
 
   void ReInit() override { return; }
 
-  bool HasNext() const override { return true; }
-
  private:
   float min_;
   float max_;
@@ -74,7 +72,7 @@ class CreateRandomDataGeneratorOp : public framework::OperatorBase {
     const auto& ranks = Attr<std::vector<int>>("ranks");
     PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
     PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
-                      int(shape_concat.size()),
+                      static_cast<int>(shape_concat.size()),
                       "The accumulate of all ranks should be equal to the "
                       "shape concat's length.");
     std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
index adaa0b9e5f1ff..2ae2972556176 100644
--- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc
@@ -12,8 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <mutex>
-#include <thread>
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 #include "paddle/fluid/recordio/scanner.h"
 
@@ -35,17 +33,15 @@ class RecordIOFileReader : public framework::FileReader {
     LOG(INFO) << "Creating file reader" << filename;
   }
 
-  bool HasNext() const override { return scanner_.HasNext(); }
-
   void ReInit() override { scanner_.Reset(); }
 
  protected:
   void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
     if (ThreadSafe) {
       std::lock_guard<std::mutex> guard(*mutex_);
-      *out = framework::ReadFromRecordIO(scanner_, dev_ctx_);
+      *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
     } else {
-      *out = framework::ReadFromRecordIO(scanner_, dev_ctx_);
+      *out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
     }
   }
 
@@ -66,7 +62,7 @@ class CreateRecordIOReaderOp : public framework::OperatorBase {
     const auto& ranks = Attr<std::vector<int>>("ranks");
     PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
     PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
-                      int(shape_concat.size()),
+                      static_cast<int>(shape_concat.size()),
                       "The accumulate of all ranks should be equal to the "
                       "shape concat's length.");
     std::string filename = Attr<std::string>("filename");
diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
index b164ce232d6be..13825d65913be 100644
--- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -30,35 +30,33 @@ class ShuffleReader : public framework::DecoratedReader {
       std::random_device device;
       seed_ = device();
     }
-    ReadIntoBuffers();
+    ReloadBuffer();
   }
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override {
-    if (!HasNext()) {
-      PADDLE_THROW("There is no next data!");
-    }
+    out->clear();
     if (iteration_pos_ >= buffer_.size()) {
       VLOG(10) << "Resetting shuffle buffer";
-      ReadIntoBuffers();
+      ReloadBuffer();
+      if (buffer_.empty()) {
+        return;
+      }
     }
     *out = buffer_[iteration_pos_++];
   }
 
-  bool HasNext() const override {
-    return iteration_pos_ < buffer_.size() || reader_->HasNext();
-  }
-
  private:
-  void ReadIntoBuffers() {
+  void ReloadBuffer() {
     buffer_.clear();
     buffer_.reserve(buffer_size_);
     iteration_pos_ = 0;
     for (size_t i = 0; i < buffer_size_; ++i) {
-      if (!reader_->HasNext()) {
+      std::vector<framework::LoDTensor> ins;
+      reader_->ReadNext(&ins);
+      if (ins.empty()) {
         break;
       }
-      buffer_.emplace_back();
-      reader_->ReadNext(&buffer_.back());
+      buffer_.emplace_back(ins);
     }
     std::mt19937 g(seed_);
     std::shuffle(buffer_.begin(), buffer_.end(), g);
diff --git a/paddle/fluid/operators/reader/create_threaded_reader_op.cc b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
new file mode 100644
index 0000000000000..cbf709d5e734c
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_threaded_reader_op.cc
@@ -0,0 +1,94 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/detail/safe_ref.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class ThreadedReader : public framework::DecoratedReader {
+ public:
+  ThreadedReader(ReaderBase* reader, bool safe_mode)
+      : DecoratedReader(reader), safe_mode_(safe_mode) {}
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    std::lock_guard<std::mutex> lock(mutex_);
+    reader_->ReadNext(out);
+  }
+
+  void ReInit() override {
+    if (safe_mode_) {
+      PADDLE_THROW(
+          "ThreadedReader::ReInit() is disabled when 'safe_mode' is true.");
+    }
+    VLOG(5) << "ThreadedReader::ReInit() is invoked! It might be buggy in "
+               "multi-thread environment.";
+    reader_->ReInit();
+  }
+
+ private:
+  bool safe_mode_;
+  std::mutex mutex_;
+};
+
+class CreateThreadedReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    auto* out = detail::Ref(scope.FindVar(Output("Out")))
+                    .GetMutable<framework::ReaderHolder>();
+    if (out->Get() != nullptr) {
+      return;
+    }
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    bool safe_mode = Attr<bool>("safe_mode");
+    out->Reset(new ThreadedReader(underlying_reader.Get(), safe_mode));
+  }
+};
+
+class CreateThreadedReaderOpMaker : public DecoratedReaderMakerBase {
+ public:
+  CreateThreadedReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : DecoratedReaderMakerBase(op_proto, op_checker) {
+    AddAttr<bool>("safe_mode",
+                  "When 'safe_mode' is true, 'ReInit()' is disabled to avoid "
+                  "unexpected bugs in multi-thread environment.")
+        .SetDefault(true);
+    AddComment(R"DOC(
+      CreateThreadedReader Operator
+
+      This operator creates a threaded reader. A threaded reader's 
+      'ReadNext()' can be invoked by several threads at the same 
+      time. 
+      When the attribute 'safe_mode' is true, the threaded reader's 
+      'ReInit()' is disabled to avoid unexpected bugs in multi-thread 
+      environment.
+    )DOC");
+  }
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace reader = paddle::operators::reader;
+REGISTER_DECORATED_READER_OPERATOR(create_threaded_reader,
+                                   reader::CreateThreadedReaderOp,
+                                   reader::CreateThreadedReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc
index eacedeea8835d..779dc8a6a0deb 100644
--- a/paddle/fluid/operators/reader/open_files_op.cc
+++ b/paddle/fluid/operators/reader/open_files_op.cc
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <thread>  // NOLINT
+
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/operators/reader/reader_op_registry.h"
 
@@ -19,38 +21,23 @@ namespace paddle {
 namespace operators {
 namespace reader {
 
-class MultipleReader : public framework::ReaderBase {
+class MultiFileReader : public framework::ReaderBase {
  public:
-  class ThreadBufferMap {
-   public:
-    std::vector<framework::LoDTensor>& operator[](
-        const std::thread::id& thread_id) {
-      std::lock_guard<std::mutex> lock(mutex_);
-      return buffer_[thread_id];
-    }
-
-    void Clear() { buffer_.clear(); }
-
-   private:
-    std::mutex mutex_;
-    std::unordered_map<std::thread::id, std::vector<framework::LoDTensor>>
-        buffer_;
-  };
-
-  MultipleReader(const std::vector<std::string>& file_names,
-                 const std::vector<framework::DDim>& dims, size_t thread_num)
-      : file_names_(file_names), dims_(dims) {
+  MultiFileReader(const std::vector<std::string>& file_names,
+                  const std::vector<framework::DDim>& dims, size_t thread_num,
+                  size_t buffer_size)
+      : file_names_(file_names), dims_(dims), buffer_size_(buffer_size) {
     prefetchers_.resize(thread_num);
     StartNewScheduler();
   }
 
   void ReadNext(std::vector<framework::LoDTensor>* out) override;
-  bool HasNext() const override;
   void ReInit() override;
 
-  ~MultipleReader() { EndScheduler(); }
+  ~MultiFileReader() { EndScheduler(); }
 
  private:
+  bool HasNext();
   void StartNewScheduler();
   void EndScheduler();
   void ScheduleThreadFunc();
@@ -60,39 +47,36 @@ class MultipleReader : public framework::ReaderBase {
   std::vector<framework::DDim> dims_;
   std::thread scheduler_;
   std::vector<std::thread> prefetchers_;
+  size_t buffer_size_;
   framework::Channel<size_t>* waiting_file_idx_;
   framework::Channel<size_t>* available_thread_idx_;
   framework::Channel<std::vector<framework::LoDTensor>>* buffer_;
-  mutable ThreadBufferMap thread_buffer_map_;
 };
 
-void MultipleReader::ReadNext(std::vector<framework::LoDTensor>* out) {
-  if (!HasNext()) {
-    PADDLE_THROW("There is no next data!");
+void MultiFileReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  out->clear();
+  if (HasNext()) {
+    buffer_->Receive(out);
   }
-  auto& thread_local_buffer = thread_buffer_map_[std::this_thread::get_id()];
-  *out = thread_local_buffer;
-  thread_local_buffer.clear();
-}
-
-bool MultipleReader::HasNext() const {
-  auto& thread_local_buffer = thread_buffer_map_[std::this_thread::get_id()];
-  return thread_local_buffer.empty() ? buffer_->Receive(&thread_local_buffer)
-                                     : true;
 }
 
-void MultipleReader::ReInit() {
+void MultiFileReader::ReInit() {
   EndScheduler();
-  thread_buffer_map_.Clear();
   StartNewScheduler();
 }
 
-void MultipleReader::StartNewScheduler() {
+bool MultiFileReader::HasNext() {
+  while (!buffer_->IsClosed() && !buffer_->CanReceive()) {
+  }
+  return buffer_->CanReceive();
+}
+
+void MultiFileReader::StartNewScheduler() {
   size_t thread_num = prefetchers_.size();
   waiting_file_idx_ = framework::MakeChannel<size_t>(file_names_.size());
   available_thread_idx_ = framework::MakeChannel<size_t>(thread_num);
   buffer_ =
-      framework::MakeChannel<std::vector<framework::LoDTensor>>(thread_num);
+      framework::MakeChannel<std::vector<framework::LoDTensor>>(buffer_size_);
 
   for (size_t i = 0; i < file_names_.size(); ++i) {
     waiting_file_idx_->Send(&i);
@@ -105,7 +89,7 @@ void MultipleReader::StartNewScheduler() {
   scheduler_ = std::thread([this] { ScheduleThreadFunc(); });
 }
 
-void MultipleReader::EndScheduler() {
+void MultiFileReader::EndScheduler() {
   available_thread_idx_->Close();
   buffer_->Close();
   waiting_file_idx_->Close();
@@ -117,8 +101,8 @@ void MultipleReader::EndScheduler() {
   delete waiting_file_idx_;
 }
 
-void MultipleReader::ScheduleThreadFunc() {
-  VLOG(5) << "MultipleReader schedule thread starts.";
+void MultiFileReader::ScheduleThreadFunc() {
+  VLOG(5) << "MultiFileReader schedule thread starts.";
   size_t completed_thread_num = 0;
   size_t thread_idx;
   while (available_thread_idx_->Receive(&thread_idx)) {
@@ -150,17 +134,20 @@ void MultipleReader::ScheduleThreadFunc() {
       p.join();
     }
   }
-  VLOG(5) << "MultipleReader schedule thread terminates.";
+  VLOG(5) << "MultiFileReader schedule thread terminates.";
 }
 
-void MultipleReader::PrefetchThreadFunc(std::string file_name,
-                                        size_t thread_idx) {
+void MultiFileReader::PrefetchThreadFunc(std::string file_name,
+                                         size_t thread_idx) {
   VLOG(5) << "The prefetch thread of file '" << file_name << "' starts.";
   std::unique_ptr<framework::ReaderBase> reader =
       CreateReaderByFileName(file_name, dims_);
-  while (reader->HasNext()) {
+  while (true) {
     std::vector<framework::LoDTensor> ins;
     reader->ReadNext(&ins);
+    if (ins.empty()) {
+      break;
+    }
     try {
       buffer_->Send(&ins);
     } catch (paddle::platform::EnforceNotMet e) {
@@ -197,11 +184,13 @@ class OpenFilesOp : public framework::OperatorBase {
     const auto& file_names = Attr<std::vector<std::string>>("file_names");
     PADDLE_ENFORCE(!file_names.empty(), "No file to be read!");
     const size_t thread_num = Attr<int>("thread_num");
+    const size_t buffer_size = Attr<int>("buffer_size");
 
     auto* out = scope.FindVar(Output("Out"))
                     ->template GetMutable<framework::ReaderHolder>();
-    out->Reset(new MultipleReader(
-        file_names, RestoreShapes(shape_concat, ranks), thread_num));
+    out->Reset(new MultiFileReader(file_names,
+                                   RestoreShapes(shape_concat, ranks),
+                                   thread_num, buffer_size));
   }
 };
 
@@ -212,11 +201,12 @@ class OpenFilesOpMaker : public FileReaderMakerBase {
     AddAttr<std::vector<std::string>>("file_names", "Files to be read.");
     AddAttr<int>("thread_num", "The maximal concurrent prefetch thread number.")
         .GreaterThan(0);
+    AddAttr<int>("buffer_size", "The size of prefetch buffer.").GreaterThan(0);
 
     AddComment(R"DOC(
       OpenFiles Operator
 
-      An OpenFilesOp creates a MultipleReader, which is able to 
+      An OpenFilesOp creates a MultiFileReader, which is able to 
       read data multi-threaded from multiple files.
     )DOC");
   }
diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc
index 083c1fae5e201..a4dcf704a63ae 100644
--- a/paddle/fluid/operators/recv_op.cc
+++ b/paddle/fluid/operators/recv_op.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <future>  // NOLINT
 #include <ostream>
 
 #include "paddle/fluid/framework/data_type.h"
@@ -19,7 +20,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 
-#include <future>
 #include "paddle/fluid/operators/detail/grpc_client.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h
index f38c5a3c0c995..54e07490319cf 100644
--- a/paddle/fluid/operators/roi_pool_op.h
+++ b/paddle/fluid/operators/roi_pool_op.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <algorithm>
+#include <limits>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index b16d06df8d0f7..7ca7639fdb9b4 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/scale_op.h"
-#include "paddle/fluid/operators/net_op.h"
+
+#include <string>
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc
index 542bc3fde2a36..3bf5d57809019 100644
--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include <unistd.h>
 #include <string>
-#include <thread>
+#include <thread>  // NOLINT
 
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -37,11 +37,11 @@ namespace m = paddle::operators::math;
 std::unique_ptr<f::OperatorBase> listen_and_serv_op;
 int selected_port;
 
-void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
+void InitTensorsInScope(const p::CPUPlace &place, f::Scope *scope) {
   p::CPUDeviceContext ctx(place);
   for (int i = 0; i < 2; ++i) {
     auto var_name = paddle::string::Sprintf("x%d", i);
-    auto var = scope.Var(var_name);
+    auto var = scope->Var(var_name);
     auto tensor = var->GetMutable<f::LoDTensor>();
     tensor->Resize({10, 10});
     float *expect = tensor->mutable_data<float>(place);
@@ -50,20 +50,20 @@ void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
     }
   }
 
-  auto out_var = scope.Var("Out");
+  auto out_var = scope->Var("Out");
   auto out_tensor = out_var->GetMutable<f::LoDTensor>();
   out_tensor->Resize({10, 10});
   out_tensor->mutable_data<float>(place);  // allocate
 }
 
-void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
+void InitSelectedRowsInScope(const p::CPUPlace &place, f::Scope *scope) {
   p::CPUDeviceContext ctx(place);
   int64_t height = 10;
   int64_t row_numel = 10;
   m::SetConstant<p::CPUDeviceContext, float> set_one;
   // init x0
   std::vector<int64_t> rows0{0, 4, 7};
-  auto x0_var = scope.Var("x0");
+  auto x0_var = scope->Var("x0");
   auto x0 = x0_var->GetMutable<f::SelectedRows>();
   x0->set_rows(rows0);
   x0->set_height(height);
@@ -74,7 +74,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
 
   // init x1
   std::vector<int64_t> rows1{2, 9};
-  auto x1_var = scope.Var("x1");
+  auto x1_var = scope->Var("x1");
   auto x1 = x1_var->GetMutable<f::SelectedRows>();
   x1->set_rows(rows1);
   x1->set_height(height);
@@ -83,7 +83,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
       f::make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), place);
   set_one(ctx, x1_value, 1.0);
 
-  auto out_var = scope.Var("Out");
+  auto out_var = scope->Var("Out");
   auto out = out_var->GetMutable<f::SelectedRows>();
   auto out_value = out->mutable_value();
   out->set_height(height);
@@ -117,15 +117,16 @@ void StartServerNet(bool is_sparse) {
   f::Scope scope;
   p::CPUPlace place;
   if (is_sparse) {
-    InitSelectedRowsInScope(scope, place);
+    InitSelectedRowsInScope(place, &scope);
   } else {
-    InitTensorsInScope(scope, place);
+    InitTensorsInScope(place, &scope);
   }
 
   // sub program run in listen_and_serv_op, for simple test we use sum
   f::ProgramDesc program;
   const auto &root_block = program.Block(0);
   auto *optimize_block = program.AppendBlock(root_block);
+  auto *prefetch_block = program.AppendBlock(root_block);
   // X for server side tensors, RX for received tensers, must be of same shape.
   AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
 
@@ -135,6 +136,7 @@ void StartServerNet(bool is_sparse) {
   attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
   attrs.insert({"GradList", std::vector<std::string>({"x1"})});
   attrs.insert({"OptimizeBlock", optimize_block});
+  attrs.insert({"PrefetchBlock", prefetch_block});
   listen_and_serv_op =
       f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
   LOG(INFO) << "selected port before run " << selected_port;
@@ -148,7 +150,7 @@ TEST(SendRecvOp, CPUDense) {
   // local net
   f::Scope scope;
   p::CPUPlace place;
-  InitTensorsInScope(scope, place);
+  InitTensorsInScope(place, &scope);
   // create rpc client var
   scope.Var("RPC_CLIENT_VAR");
 
@@ -191,7 +193,7 @@ TEST(SendRecvOp, CPUSparse) {
   f::Scope scope;
   p::CPUPlace place;
   p::CPUDeviceContext ctx(place);
-  InitSelectedRowsInScope(scope, place);
+  InitSelectedRowsInScope(place, &scope);
   scope.Var("RPC_CLIENT_VAR");
   f::AttributeMap attrs;
   selected_port = static_cast<paddle::operators::ListenAndServOp *>(
diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc
index 2cbd9e2394800..56b3713d6af28 100644
--- a/paddle/fluid/operators/send_vars_op.cc
+++ b/paddle/fluid/operators/send_vars_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <future>
+#include <future>  // NOLINT
 #include <ostream>
 
 #include "paddle/fluid/framework/data_type.h"
@@ -36,7 +36,7 @@ class SendVarsOp : public framework::OperatorBase {
     auto ins = Inputs("X");
 
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
-    int sync_send = Attr<int>("sync_sent");
+    int sync_send = Attr<int>("sync_send");
 
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
     auto& ctx = *pool.Get(place);
diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc
index 074fa9e00f2ec..06cb0550ad7d4 100644
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
@@ -35,8 +35,8 @@ class SGDOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
                       "Learning rate should have 1 element");
     auto param_dim = ctx->GetInputDim("Param");
-    // TODO(qijun): check dimensions of Param and Grad at complie
-    // and run time.
+    // TODO(qijun): check dimensions of Param and Grad at compile
+    // and runtime.
     ctx->SetOutputDim("ParamOut", param_dim);
   }
 
diff --git a/paddle/fluid/operators/split_ids_op.cc b/paddle/fluid/operators/split_ids_op.cc
index a54f8a2878c86..a53cbc8ac5199 100644
--- a/paddle/fluid/operators/split_ids_op.cc
+++ b/paddle/fluid/operators/split_ids_op.cc
@@ -48,11 +48,11 @@ class SplitIdsOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out.");
 
     auto ids_var_type = ctx->GetInputsVarType("Ids").front();
-    PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR);
-
     auto ids_dims = ctx->GetInputDim("Ids");
-    PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
-    PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
+      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    }
   }
 };
 
@@ -60,8 +60,9 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference {
  public:
   void operator()(const framework::OpDesc &op_desc,
                   framework::BlockDesc *block) const override {
+    auto *input_var = block->Var(op_desc.Input("Ids")[0]);
     for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR);
+      block->Var(out_var)->SetType(input_var->GetType());
     }
   }
 };
@@ -73,4 +74,5 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
                   ops::SplitIdsOpInferVarType);
 REGISTER_OP_CPU_KERNEL(
-    split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>);
+    split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>,
+    ops::SplitIdsOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h
index d36ed398ebce6..ba1e903dbb6da 100644
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -24,35 +24,63 @@ namespace operators {
 template <typename DeviceContext, typename T>
 class SplitIdsOpKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext &ctx) const override {
     auto place = ctx.GetPlace();
     if (!platform::is_cpu_place(place)) {
       PADDLE_THROW("SplitIds do not support GPU kernel");
     }
 
-    auto& ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
-    const T* ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
-    auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
-    const size_t shard_num = outs.size();
+    const auto *ids_var = ctx.InputVar("Ids");
+    if (ids_var->IsType<framework::LoDTensor>()) {
+      const auto &ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
+      const T *ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
+      auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
+      const size_t shard_num = outs.size();
 
-    std::vector<std::vector<T>> out_ids;
-    out_ids.resize(outs.size());
+      std::vector<std::vector<T>> out_ids;
+      out_ids.resize(outs.size());
 
-    // split id by their shard_num.
-    for (int i = 0; i < ids_dims[0]; ++i) {
-      T id = ids[i];
-      size_t shard_id = static_cast<size_t>(id) % shard_num;
-      out_ids[shard_id].push_back(id);
-    }
+      // split id by their shard_num.
+      for (int i = 0; i < ids_dims[0]; ++i) {
+        T id = ids[i];
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        out_ids[shard_id].push_back(id);
+      }
+
+      // create tensor for each shard and send to parameter server
+      for (size_t i = 0; i < out_ids.size(); ++i) {
+        auto *shard_t = outs[i];
+        std::vector<T> ids = out_ids[i];
+        auto *shard_data = shard_t->mutable_data<T>(
+            framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
+        for (size_t i = 0; i < ids.size(); ++i) {
+          shard_data[i] = ids[i];
+        }
+      }
+    } else if (ids_var->IsType<framework::SelectedRows>()) {
+      const auto *ids_selected_rows = ctx.Input<framework::SelectedRows>("Ids");
+      auto &ids_dims = ids_selected_rows->value().dims();
+      PADDLE_ENFORCE_EQ(ids_dims[0], ids_selected_rows->rows().size(), "");
+      const T *ids = ids_selected_rows->value().data<T>();
+      const auto &ids_rows = ids_selected_rows->rows();
+      auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
+      const size_t shard_num = outs.size();
+      // get rows for outputs
+      for (auto &id : ids_rows) {
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        outs[shard_id]->mutable_rows()->push_back(id);
+      }
 
-    // create tensor for each shard and send to parameter server
-    for (size_t i = 0; i < out_ids.size(); ++i) {
-      auto* shard_t = outs[i];
-      std::vector<T> ids = out_ids[i];
-      auto* shard_data = shard_t->mutable_data<T>(
-          framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
-      for (size_t i = 0; i < ids.size(); ++i) {
-        shard_data[i] = ids[i];
+      int64_t row_width = ids_dims[1];
+      for (auto &out : outs) {
+        out->set_height(ids_selected_rows->height());
+        framework::DDim ddim = framework::make_ddim(
+            {static_cast<int64_t>(out->rows().size()), row_width});
+        T *output = out->mutable_value()->mutable_data<T>(ddim, place);
+        for (size_t i = 0; i < ddim[0]; ++i) {
+          memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
+                 row_width * sizeof(T));
+        }
       }
     }
   }
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index dffac772f11be..e745509ec8c1f 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/split_op.h"
-#include "paddle/fluid/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h
index 22c1db82e9f5a..7a10218e15566 100644
--- a/paddle/fluid/operators/strided_memcpy.h
+++ b/paddle/fluid/operators/strided_memcpy.h
@@ -37,8 +37,8 @@ inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
                           const framework::DDim& src_stride,
                           const framework::DDim& dst_dim,
                           const framework::DDim& dst_stride, T* dst) {
-  using namespace detail;
-  StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
+  paddle::operators::detail::StridedCopyDimVisitor<T> func(
+      dev_ctx, src, src_stride, dst_stride, dst);
   boost::apply_visitor(func, dst_dim);
 }
 
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index 9061e137bd1c7..108f26fafe7af 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -10,9 +10,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/sum_op.h"
+
 #include <algorithm>
 #include <string>
 #include <vector>
+
 #include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
 
@@ -37,7 +39,10 @@ class SumOp : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputsDim("X");
     size_t N = x_dims.size();
-    PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
+    PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
+    if (N == 1) {
+      VLOG(3) << "Warning: sum have only one input, may waste memory";
+    }
 
     framework::DDim in_dim({0});
     for (auto& x_dim : x_dims) {
diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu
index bfd26c2f2294f..d7f4d383ce0d9 100644
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/top_k_op.h"
 #include "paddle/fluid/platform/assert.h"
 
 namespace paddle {
@@ -133,71 +134,71 @@ __device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
 }
 
 template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
                                               int beam_size, const T* src,
-                                              bool& firstStep, bool& is_empty,
-                                              Pair<T>& max, int dim,
+                                              bool* firstStep, bool* is_empty,
+                                              Pair<T>* max, int dim,
                                               const int tid) {
-  if (beam > 0) {
-    int length = beam < beam_size ? beam : beam_size;
-    if (firstStep) {
-      firstStep = false;
+  if (*beam > 0) {
+    int length = (*beam) < beam_size ? *beam : beam_size;
+    if (*firstStep) {
+      *firstStep = false;
       GetTopK<T, BlockSize>(topk, src, tid, dim, length);
     } else {
       for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - beam) {
-          topk[k] = topk[k + beam];
+        if (k < MaxLength - (*beam)) {
+          topk[k] = topk[k + *beam];
         } else {
           topk[k].set(-INFINITY, -1);
         }
       }
-      if (!is_empty) {
-        GetTopK<T, BlockSize>(topk + MaxLength - beam, src, tid, dim, max,
+      if (!(*is_empty)) {
+        GetTopK<T, BlockSize>(topk + MaxLength - *beam, src, tid, dim, *max,
                               length);
       }
     }
 
-    max = topk[MaxLength - 1];
-    if (max.v == -1) is_empty = true;
-    beam = 0;
+    *max = topk[MaxLength - 1];
+    if ((*max).v == -1) *is_empty = true;
+    *beam = 0;
   }
 }
 
 template <typename T, int MaxLength, int BlockSize>
-__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
                                               int beam_size, const T* val,
-                                              int* col, bool& firstStep,
-                                              bool& is_empty, Pair<T>& max,
+                                              int* col, bool* firstStep,
+                                              bool* is_empty, Pair<T>* max,
                                               int dim, const int tid) {
-  if (beam > 0) {
-    int length = beam < beam_size ? beam : beam_size;
-    if (firstStep) {
-      firstStep = false;
+  if (*beam > 0) {
+    int length = (*beam) < beam_size ? *beam : beam_size;
+    if (*firstStep) {
+      *firstStep = false;
       GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
     } else {
       for (int k = 0; k < MaxLength; k++) {
-        if (k < MaxLength - beam) {
-          topk[k] = topk[k + beam];
+        if (k < MaxLength - *beam) {
+          topk[k] = topk[k + *beam];
         } else {
           topk[k].set(-INFINITY, -1);
         }
       }
-      if (!is_empty) {
-        GetTopK<T, BlockSize>(topk + MaxLength - beam, val, col, tid, dim, max,
+      if (!(*is_empty)) {
+        GetTopK<T, BlockSize>(topk + MaxLength - *beam, val, col, tid, dim, max,
                               length);
       }
     }
 
-    max = topk[MaxLength - 1];
-    if (max.v == -1) is_empty = true;
-    beam = 0;
+    *max = topk[MaxLength - 1];
+    if ((*max).v == -1) *is_empty = true;
+    *beam = 0;
   }
 }
 
 template <typename T, int MaxLength, int BlockSize>
 __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
                                             Pair<T> topk[], T** topVal,
-                                            int64_t** topIds, int& beam, int& k,
+                                            int64_t** topIds, int* beam, int* k,
                                             const int tid, const int warp) {
   while (true) {
     __syncthreads();
@@ -225,17 +226,17 @@ __device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
       (*topVal)++;
       (*topIds)++;
     }
-    if (tid == maxid[0]) beam++;
-    if (--k == 0) break;
+    if (tid == maxid[0]) (*beam)++;
+    if (--(*k) == 0) break;
     __syncthreads();
 
     if (tid == maxid[0]) {
-      if (beam < MaxLength) {
-        sh_topk[tid] = topk[beam];
+      if (*beam < MaxLength) {
+        sh_topk[tid] = topk[*beam];
       }
     }
     if (maxid[0] / 32 == warp) {
-      if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break;
+      if (__shfl(*beam, (maxid[0]) % 32, 32) == MaxLength) break;
     }
   }
 }
@@ -268,13 +269,13 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
     topk[k].set(-INFINITY, -1);
   }
   while (k) {
-    ThreadGetTopK<T, MaxLength, BlockSize>(topk, beam, k,
-                                           src + blockIdx.x * lds, firststep,
-                                           is_empty, max, dim, tid);
+    ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
+                                           src + blockIdx.x * lds, &firststep,
+                                           &is_empty, &max, dim, tid);
 
     sh_topk[tid] = topk[0];
     BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
-                                         &indices, beam, k, tid, warp);
+                                         &indices, &beam, &k, tid, warp);
   }
 }
 
@@ -308,9 +309,9 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
     KeMatrixTopK<T, 5, 256><<<
         grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
                               ctx.device_context())
-                              .stream()>>>(output_data, output->dims()[1],
-                                           indices_data, input_data,
-                                           input_width, input_width, int(k));
+                              .stream()>>>(
+        output_data, output->dims()[1], indices_data, input_data, input_width,
+        input_width, static_cast<int>(k));
   }
 };
 
diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h
index 881d611d4ac26..8758af0804ae0 100644
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@@ -33,22 +33,26 @@ constexpr int PADDLE_CUDA_NUM_THREADS = 512;
 USE_CUDA_ATOMIC(Add, float);
 USE_CUDA_ATOMIC(Add, int);
 USE_CUDA_ATOMIC(Add, unsigned int);
-USE_CUDA_ATOMIC(Add, unsigned long long int);
+// CUDA API uses unsigned long long int, we cannot use uint64_t here.
+// It because unsigned long long int is not necessarily uint64_t
+USE_CUDA_ATOMIC(Add, unsigned long long int);  // NOLINT
 
 CUDA_ATOMIC_WRAPPER(Add, int64_t) {
-  static_assert(sizeof(int64_t) == sizeof(long long int),
+  // Here, we check long long int must be int64_t.
+  static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
                 "long long should be int64");
-  return CudaAtomicAdd(reinterpret_cast<unsigned long long int*>(address),
-                       static_cast<unsigned long long int>(val));
+  return CudaAtomicAdd(
+      reinterpret_cast<unsigned long long int*>(address),  // NOLINT
+      static_cast<unsigned long long int>(val));           // NOLINT
 }
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
 USE_CUDA_ATOMIC(Add, double);
 #else
 CUDA_ATOMIC_WRAPPER(Add, double) {
-  unsigned long long int* address_as_ull =
-      reinterpret_cast<unsigned long long int*>(address);
-  unsigned long long int old = *address_as_ull, assumed;
+  unsigned long long int* address_as_ull =                 // NOLINT
+      reinterpret_cast<unsigned long long int*>(address);  // NOLINT
+  unsigned long long int old = *address_as_ull, assumed;   // NOLINT
 
   do {
     assumed = old;
diff --git a/paddle/fluid/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc
index eb541579a136d..361d3439b844e 100644
--- a/paddle/fluid/platform/dynload/cublas.cc
+++ b/paddle/fluid/platform/dynload/cublas.cc
@@ -28,6 +28,10 @@ CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
 CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP);
 #endif
 
+#ifdef CUBLAS_BLAS_ROUTINE_EACH_R3
+CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP);
+#endif
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index a41018d350e89..1ab55d6b9bf8f 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -71,7 +71,6 @@ extern void *cublas_dso_handle;
   __macro(cublasDgemm_v2);                \
   __macro(cublasHgemm);                   \
   __macro(cublasSgemmEx);                 \
-  __macro(cublasGemmEx);                  \
   __macro(cublasSgeam_v2);                \
   __macro(cublasDgeam_v2);                \
   __macro(cublasCreate_v2);               \
@@ -83,11 +82,6 @@ extern void *cublas_dso_handle;
   __macro(cublasDgemmBatched);            \
   __macro(cublasCgemmBatched);            \
   __macro(cublasZgemmBatched);            \
-  __macro(cublasSgemmStridedBatched);     \
-  __macro(cublasDgemmStridedBatched);     \
-  __macro(cublasCgemmStridedBatched);     \
-  __macro(cublasZgemmStridedBatched);     \
-  __macro(cublasHgemmStridedBatched);     \
   __macro(cublasSgetrfBatched);           \
   __macro(cublasSgetriBatched);           \
   __macro(cublasDgetrfBatched);           \
@@ -95,10 +89,24 @@ extern void *cublas_dso_handle;
 
 CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
 
+// APIs available after CUDA 8.0
+#if CUDA_VERSION >= 8000
+#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \
+  __macro(cublasGemmEx);                     \
+  __macro(cublasSgemmStridedBatched);        \
+  __macro(cublasDgemmStridedBatched);        \
+  __macro(cublasCgemmStridedBatched);        \
+  __macro(cublasZgemmStridedBatched);        \
+  __macro(cublasHgemmStridedBatched);
+
+CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+#endif
+
 // APIs available after CUDA 9.0
 #if CUDA_VERSION >= 9000
-#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) __macro(cublasSetMathMode);
-CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
+#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode);
+
+CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP)
 #endif
 
 #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 2999004320650..ca9ab2c7aecff 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -14,8 +14,9 @@
 
 #pragma once
 
-#include <thread>
+#include <thread>  // NOLINT
 #include <typeindex>
+#include <vector>
 #include "paddle/fluid/platform/dynload/nccl.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -29,6 +30,8 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
     return ncclDouble;
   } else if (type == typeid(int)) {  // NOLINT
     return ncclInt;
+  } else if (type == typeid(int64_t)) {  // NOLINT
+    return ncclInt64;
   } else {
     PADDLE_THROW("Not supported");
   }
@@ -58,7 +61,7 @@ struct NCCLContext {
   ncclComm_t comm_;
 
   explicit NCCLContext(int dev_id)
-      : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))) {}
+      : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))), comm_{nullptr} {}
 
   cudaStream_t stream() const { return ctx_->stream(); }
 
@@ -66,23 +69,23 @@ struct NCCLContext {
     return boost::get<platform::CUDAPlace>(ctx_->GetPlace()).device;
   }
 
-  static void InitNCCLContext(std::unordered_map<int, NCCLContext> &contexts,
+  static void InitNCCLContext(std::unordered_map<int, NCCLContext> *contexts,
                               const std::vector<platform::Place> &places) {
     std::vector<ncclComm_t> comms;
     std::vector<int> devs;
-    comms.resize(contexts.size());
-    devs.reserve(contexts.size());
+    comms.resize(contexts->size());
+    devs.reserve(contexts->size());
 
     for (auto &p : places) {
       devs.push_back(boost::get<platform::CUDAPlace>(p).device);
     }
 
     PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
-        &comms[0], static_cast<int>(contexts.size()), &devs[0]));
+        &comms[0], static_cast<int>(contexts->size()), &devs[0]));
 
     int i = 0;
     for (auto &dev_id : devs) {
-      contexts.at(dev_id).comm_ = comms[i++];
+      contexts->at(dev_id).comm_ = comms[i++];
     }
   }
 };
@@ -91,7 +94,8 @@ struct NCCLContextMap {
   std::unordered_map<int, NCCLContext> contexts_;
   std::vector<int> order_;
 
-  NCCLContextMap(const std::vector<platform::Place> &places) {
+  explicit NCCLContextMap(const std::vector<platform::Place> &places) {
+    PADDLE_ENFORCE(!places.empty());
     order_.reserve(places.size());
     for (auto &p : places) {
       int dev_id = boost::get<CUDAPlace>(p).device;
@@ -102,15 +106,17 @@ struct NCCLContextMap {
         order_.size(), contexts_.size(),
         "NCCL Context Map does not support contain two or more same device");
 
-    std::vector<ncclComm_t> comms;
-    comms.resize(order_.size());
+    if (places.size() > 1) {
+      std::vector<ncclComm_t> comms;
+      comms.resize(order_.size());
 
-    PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
-        &comms[0], static_cast<int>(order_.size()), &order_[0]));
+      PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
+          &comms[0], static_cast<int>(order_.size()), &order_[0]));
 
-    int i = 0;
-    for (auto &dev_id : order_) {
-      contexts_.at(dev_id).comm_ = comms[i++];
+      int i = 0;
+      for (auto &dev_id : order_) {
+        contexts_.at(dev_id).comm_ = comms[i++];
+      }
     }
   }
 
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 884289a7fda65..4fef351c2118e 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -2,13 +2,13 @@ if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
       SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-      DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
+      DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
            parallel_executor
       ${GLOB_OP_LIB})
   else()
     cc_library(paddle_pybind SHARED
       SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-      DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
+      DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
            parallel_executor
       ${GLOB_OP_LIB})
     if(NOT APPLE AND NOT ANDROID)
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index 2fe8290363860..93533e5c9d88a 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <string>
 #include <tuple>
 
-#include "paddle/fluid/framework/backward.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -125,23 +124,6 @@ void BindProgramDesc(pybind11::module *m) {
            })
       .def("append_block", &pd::ProgramDesc::AppendBlock,
            pybind11::return_value_policy::reference)
-      .def("append_backward",
-           [](pd::ProgramDesc &program_desc, const pd::VarDesc &target,
-              const std::unordered_set<std::string> &no_grad_vars) {
-             pd::ParamGradInfoMap param_grad_map =
-                 AppendBackward(program_desc, target, no_grad_vars);
-             std::unordered_map<
-                 std::string, std::tuple<std::string /* grad_var_name */,
-                                         int /* block_idx */, int /* op_idx */>>
-                 retv;
-             for (auto it = param_grad_map.begin(); it != param_grad_map.end();
-                  ++it) {
-               const auto &grad_info = it->second;
-               retv[it->first] = std::make_tuple(
-                   grad_info.name_, grad_info.block_idx_, grad_info.op_idx_);
-             }
-             return retv;
-           })
       .def("block", &pd::ProgramDesc::MutableBlock,
            pybind11::return_value_policy::reference)
       .def("num_blocks", &pd::ProgramDesc::Size)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 3924040455784..a1e8ff6399f08 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -20,9 +20,6 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/fluid/pybind/protobuf.h"
-
-#include "paddle/fluid/framework/backward.h"
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
@@ -31,18 +28,18 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/cond_op.h"
-#include "paddle/fluid/operators/net_op.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
-#include "paddle/fluid/pybind/pybind.h"
+#include "paddle/fluid/pybind/protobuf.h"
+#include "paddle/fluid/pybind/pybind.h"  // NOLINT
 #include "paddle/fluid/pybind/recordio.h"
 #include "paddle/fluid/pybind/tensor_py.h"
 
@@ -239,11 +236,6 @@ All parameter, weight, gradient are variables in Paddle.
            },
            py::return_value_policy::reference)
 #endif
-      .def("get_net",
-           [](Variable &self) -> operators::NetOp * {
-             return self.GetMutable<operators::NetOp>();
-           },
-           py::return_value_policy::reference)
       .def("get_reader",
            [](Variable &self) -> framework::ReaderHolder * {
              PADDLE_ENFORCE(self.IsType<framework::ReaderHolder>());
@@ -252,7 +244,6 @@ All parameter, weight, gradient are variables in Paddle.
            py::return_value_policy::reference);
 
   py::class_<framework::ReaderHolder>(m, "Reader", "")
-      .def("has_next", &framework::ReaderHolder::HasNext)
       .def("reset", &framework::ReaderHolder::ReInit);
 
   py::class_<Scope>(m, "Scope", "")
@@ -389,11 +380,6 @@ All parameter, weight, gradient are variables in Paddle.
                                    desc.InitializationErrorString());
                     return OpRegistry::CreateOp(desc);
                   })
-      .def("backward",
-           [](const OperatorBase &forwardOp,
-              const std::unordered_set<std::string> &no_grad_vars) {
-             return Backward(forwardOp, no_grad_vars).release();
-           })
       .def("run",
            [](OperatorBase &self, const Scope &scope,
               const platform::CPUPlace &place) { self.Run(scope, place); })
@@ -421,42 +407,6 @@ All parameter, weight, gradient are variables in Paddle.
            [](const OperatorBase &op) { return op.OutputVars(false); })
       .def("support_gpu", &OperatorBase::SupportGPU);
 
-  py::class_<operators::NetOp, OperatorBase>(m, "Net")
-      .def_static("create",
-                  []() -> operators::NetOp * {
-                    auto *retv = new operators::NetOp;
-                    retv->SetType("plain_net");
-                    return retv;
-                  })
-      .def("append_op", [](operators::NetOp &self,
-                           const OperatorBase &op) { self.AppendOp(op); })
-      .def("complete_add_op", &operators::NetOp::CompleteAddOp)
-      .def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) {
-        self->CompleteAddOp();
-      });
-
-  // cond_op
-  py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
-      .def_static("create",
-                  [](py::bytes protobin) -> operators::CondOp * {
-                    proto::OpDesc desc;
-                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
-                                   "Cannot parse user input to OpDesc");
-                    PADDLE_ENFORCE(desc.IsInitialized(),
-                                   "User OpDesc is not initialized, reason %s",
-                                   desc.InitializationErrorString());
-                    auto cond_op = OpRegistry::CreateOp(desc);
-                    return static_cast<operators::CondOp *>(cond_op.release());
-                  })
-      .def("set_truenet",
-           [](operators::CondOp &self, const operators::NetOp &net) -> void {
-             self.set_truenet(net.Clone());
-           })
-      .def("set_falsenet",
-           [](operators::CondOp &self, const operators::NetOp &net) -> void {
-             self.set_falsenet(net.Clone());
-           });
-
   py::class_<framework::Executor>(m, "Executor")
       .def(py::init<const platform::Place &>())
       .def("run",
@@ -554,6 +504,7 @@ All parameter, weight, gradient are variables in Paddle.
                                   bcast_vars, main_program, loss_var_name,
                                   scope, local_scopes, allow_op_delay);
            })
+      .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
       .def("local_scopes",
            [](ParallelExecutor &self) -> std::vector<Scope *> * {
              return &self.GetLocalScopes();
diff --git a/paddle/fluid/pybind/recordio.cc b/paddle/fluid/pybind/recordio.cc
index 0644d91425af1..330d104e0a774 100644
--- a/paddle/fluid/pybind/recordio.cc
+++ b/paddle/fluid/pybind/recordio.cc
@@ -39,7 +39,7 @@ class RecordIOWriter {
   void CompleteAppendTensor() {
     auto& ctx =
         *platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
-    framework::WriteToRecordIO(writer_, tensors_, ctx);
+    framework::WriteToRecordIO(&writer_, tensors_, ctx);
     tensors_.clear();
   }
 
diff --git a/paddle/fluid/recordio/chunk.cc b/paddle/fluid/recordio/chunk.cc
index e7ebbba452c5c..82d9aa601cf45 100644
--- a/paddle/fluid/recordio/chunk.cc
+++ b/paddle/fluid/recordio/chunk.cc
@@ -14,13 +14,13 @@
 
 #include "paddle/fluid/recordio/chunk.h"
 
+#include <zlib.h>
 #include <algorithm>
 #include <memory>
 #include <sstream>
 
 #include "paddle/fluid/platform/enforce.h"
-#include "snappy_stream/include/snappystream.hpp"
-#include "zlib/include/zlib.h"
+#include "snappystream.hpp"
 
 namespace paddle {
 namespace recordio {
diff --git a/paddle/fluid/recordio/header.cc b/paddle/fluid/recordio/header.cc
index ed09d58f6a3e2..c4822329a43a7 100644
--- a/paddle/fluid/recordio/header.cc
+++ b/paddle/fluid/recordio/header.cc
@@ -13,6 +13,9 @@
 // limitations under the License.
 
 #include "paddle/fluid/recordio/header.h"
+
+#include <string>
+
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 4885b74e6c664..be1565ab53303 100755
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -231,7 +231,7 @@ function gen_fluid_inference_lib() {
     Deploying fluid inference library ...
     ========================================
 EOF
-        make inference_lib_dist
+        make -j `nproc` inference_lib_dist
     fi
 }
 
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index a5a3884750cce..f757411b853ba 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -29,6 +29,7 @@
 import backward
 import regularizer
 import average
+import metrics
 from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py
index ded6eb0859683..6abe8233b07c4 100644
--- a/python/paddle/fluid/average.py
+++ b/python/paddle/fluid/average.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import numpy as np
+import warnings
 """
     Class of all kinds of Average.
 
@@ -22,6 +23,8 @@
     wrappers of Python functions.
 """
 
+__all__ = ["WeightedAverage"]
+
 
 def _is_number_(var):
     return isinstance(var, int) or isinstance(var, float) or (isinstance(
@@ -34,6 +37,9 @@ def _is_number_or_matrix_(var):
 
 class WeightedAverage(object):
     def __init__(self):
+        warnings.warn(
+            "The %s is deprecated, please use fluid.metrics.Accuracy instead." %
+            (self.__class__.__name__), Warning)
         self.reset()
 
     def reset(self):
diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py
index 0ec3ebc7e3dba..b0522b49f44d8 100644
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
@@ -13,14 +13,17 @@
 # limitations under the License.
 
 from __future__ import print_function
-import framework
-from framework import Program, default_main_program, default_startup_program, Parameter, Variable
-import optimizer
-from layer_helper import LayerHelper
-import distributed_splitter as splitter
+
 import math
+
+import distributed_splitter as splitter
+import framework
+from framework import Program, default_main_program, Variable
 from . import core
-import debuger
+
+LOOKUP_TABLE_TYPE = "lookup_table"
+LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad"
+RPC_CLIENT_VAR_NAME = "RPC_CLIENT_VAR"
 
 
 class VarBlock:
@@ -35,9 +38,9 @@ def __str__(self):
 
 
 class UnionFind(object):
-    """ Union-find data struct.
+    """ Union-find data structure.
 
-    Union-find is a data struct that keeps track of a set of elements partitioned
+    Union-find is a data structure that keeps track of a set of elements partitioned
     into a number of disjoint (non-overlapping) subsets.
 
     Reference:
@@ -185,19 +188,66 @@ def transpile(self,
         assert (callable(split_method))
         if program is None:
             program = default_main_program()
-        self.program = program
-        self.trainers = trainers
+        self.origin_program = program
+        self.trainer_num = trainers
         self.optimize_ops = optimize_ops
         # TODO(typhoonzero): currently trainer_id is fetched from cluster system
         # like Kubernetes, we should port this to use etcd later when developing
         # fluid distributed training with fault-tolerance.
         self.trainer_id = trainer_id
         pserver_endpoints = pservers.split(",")
+        self.pserver_endpoints = pserver_endpoints
+
+        # process lookup_table_op
+        # 1. check all lookup_table_op is distributed
+        # 2. check all lookup_table_op share the same table.
+        distributed_lookup_table_ops = []
+        # support only one distributed_lookup_table now
+        self.table_name = None
+        for op in program.global_block().ops:
+            if op.type == LOOKUP_TABLE_TYPE:
+                if op.attrs['is_distributed'] is True:
+                    if self.table_name is None:
+                        self.table_name = op.input("W")[0]
+                    if self.table_name != op.input("W")[0]:
+                        raise RuntimeError("all distributed lookup_table_ops"
+                                           " should have only one table")
+                    distributed_lookup_table_ops.append(op)
+                else:
+                    if self.table_name is not None:
+                        assert op.input("W")[0] != self.table_name
+
+        self.has_distributed_lookup_table = len(
+            distributed_lookup_table_ops) > 0
 
         # step1: For large parameters and gradients, split them into smaller
         # blocks.
         param_list = [pg[0] for pg in params_grads]
         grad_list = [pg[1] for pg in params_grads]
+
+        if self.has_distributed_lookup_table:
+            param_list = [
+                param for param in param_list if param.name != self.table_name
+            ]
+            grad_list = [
+                grad for grad in grad_list
+                if grad.name != framework.grad_var_name(self.table_name)
+            ]
+            self.table_param_grad = [
+                param_grad for param_grad in params_grads
+                if param_grad[0].name == self.table_name
+            ][0]
+            table_grad_var = self.table_param_grad[1]
+            self.table_grad_list = [
+                program.global_block().create_var(
+                    name="%s.trainer_%d.pserver_%d" %
+                    (table_grad_var.name, trainer_id, index),
+                    type=table_grad_var.type,
+                    shape=table_grad_var.shape,
+                    dtype=table_grad_var.dtype)
+                for index in range(len(self.pserver_endpoints))
+            ]
+
         grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints))
         param_blocks = split_dense_variable(param_list, len(pserver_endpoints))
         # step2: Create new vars for the parameters and gradients blocks and
@@ -229,7 +279,7 @@ def transpile(self,
             self.param_grad_ep_mapping[ep]["grads"].append(grad)
 
         rpc_client_var = program.global_block().create_var(
-            name="RPC_CLIENT_VAR",
+            name=RPC_CLIENT_VAR_NAME,
             persistable=True,
             type=core.VarDesc.VarType.RAW)
 
@@ -252,12 +302,19 @@ def transpile(self,
                 outputs={"Out": [orig_param]},
                 attrs={"axis": 0})
 
+        if self.has_distributed_lookup_table:
+            self._replace_lookup_table_op_with_prefetch(program, rpc_client_var,
+                                                        eplist)
+            self._split_table_grad_and_add_send_vars(program, rpc_client_var,
+                                                     pserver_endpoints)
+
     def get_trainer_program(self):
         # remove optimize ops and add a send op to main_program
-        self.program.global_block().delete_ops(self.optimize_ops)
+        self.origin_program.global_block().delete_ops(self.optimize_ops)
+        self.origin_program.sync_with_cpp()
         # FIXME(typhoonzero): serialize once will fix error occurs when clone.
-        self.program.__str__()
-        return self.program
+        self.origin_program.__str__()
+        return self.origin_program
 
     def get_pserver_program(self, endpoint):
         """
@@ -293,8 +350,8 @@ def get_pserver_program(self, endpoint):
                     type=v.type,
                     dtype=v.dtype,
                     shape=v.shape)
-            if self.trainers > 1:
-                for trainer_id in xrange(self.trainers):
+            if self.trainer_num > 1:
+                for trainer_id in xrange(self.trainer_num):
                     var = pserver_program.global_block().create_var(
                         name="%s.trainer_%d" % (orig_var_name, trainer_id),
                         persistable=False,
@@ -308,7 +365,7 @@ def get_pserver_program(self, endpoint):
         # step3
         optimize_block = pserver_program.create_block(0)
         # step 4
-        # Create a union-find data struct from optimize ops,
+        # Create a union-find data structure from optimize ops,
         # If two ops are connected, we could add these two ops
         # into one set.
         ufind = self._create_ufind(self.optimize_ops)
@@ -383,6 +440,23 @@ def __append_optimize_op__(op, block):
         #             __append_optimize_op__(glb_op, optimize_block)
         #             break
 
+        # process distributed lookup_table
+        prefetch_block = None
+        if self.has_distributed_lookup_table:
+            pserver_index = self.pserver_endpoints.index(endpoint)
+            self._create_table_optimize_block(pserver_index, pserver_program,
+                                              append_block)
+            prefetch_block = self._create_prefetch_block(
+                pserver_index, pserver_program, optimize_block)
+
+        # NOTE: if has_distributed_lookup_table is False, then prefetch_block will
+        # not be executed, so it's safe to use optimize_block to hold the place
+        if self.has_distributed_lookup_table:
+            assert prefetch_block is not None
+        else:
+            assert prefetch_block is None
+            prefetch_block = pserver_program.global_block()
+
         # step5 append the listen_and_serv op
         pserver_program.global_block().append_op(
             type="listen_and_serv",
@@ -391,8 +465,10 @@ def __append_optimize_op__(op, block):
             attrs={
                 "OptimizeBlock": optimize_block,
                 "endpoint": endpoint,
-                "Fanin": self.trainers
+                "Fanin": self.trainer_num,
+                "PrefetchBlock": prefetch_block
             })
+
         pserver_program.sync_with_cpp()
         return pserver_program
 
@@ -450,6 +526,197 @@ def _get_splited_name_and_shape(varname):
                     attrs=op.attrs)
         return s_prog
 
+    # transpiler function for dis lookup_table
+    def _replace_lookup_table_op_with_prefetch(self, program, rpc_client_var,
+                                               eplist):
+        # 1. replace lookup_table_op with split_ids_op -> prefetch_op -> sum_op
+        self.prefetch_input_vars = None
+        self.prefetch_output_vars = None
+
+        continue_search_lookup_table_op = True
+        while continue_search_lookup_table_op:
+            continue_search_lookup_table_op = False
+            all_ops = program.global_block().ops
+            for op in all_ops:
+                if op.type == LOOKUP_TABLE_TYPE:
+                    continue_search_lookup_table_op = True
+
+                    op_index = list(all_ops).index(op)
+                    ids_name = op.input("Ids")
+                    out_name = op.output("Out")
+
+                    if self.prefetch_input_vars is None:
+                        ids_var = program.global_block().vars[ids_name[0]]
+                        self.prefetch_input_vars = self.create_splited_vars(
+                            source_var=ids_var,
+                            block=program.global_block(),
+                            tag="_prefetch_in_")
+                    if self.prefetch_output_vars is None:
+                        out_var = program.global_block().vars[out_name[0]]
+                        self.prefetch_output_vars = self.create_splited_vars(
+                            source_var=out_var,
+                            block=program.global_block(),
+                            tag="_prefetch_out_")
+
+                    # insert split_ids_op
+                    program.global_block().insert_op(
+                        index=op_index,
+                        type="split_ids",
+                        inputs={
+                            'Ids': [
+                                program.global_block().vars[varname]
+                                for varname in ids_name
+                            ]
+                        },
+                        outputs={"Out": self.prefetch_input_vars})
+
+                    # insert prefetch_op
+                    program.global_block().insert_op(
+                        index=op_index + 1,
+                        type="prefetch",
+                        inputs={'X': self.prefetch_input_vars},
+                        outputs={
+                            "Out": self.prefetch_output_vars,
+                            "RPCClient": rpc_client_var
+                        },
+                        attrs={"epmap": eplist})
+
+                    # insert concat_op
+                    program.global_block().insert_op(
+                        index=op_index + 2,
+                        type="concat",
+                        inputs={'X': self.prefetch_output_vars},
+                        outputs={
+                            "Out": [
+                                program.global_block().vars[varname]
+                                for varname in out_name
+                            ]
+                        },
+                        attrs={"axis": 0})
+
+                    # delete lookup_table_op
+                    program.global_block().delete_ops([op])
+                    program.sync_with_cpp()
+                    # break for loop
+                    break
+
+    def _split_table_grad_and_add_send_vars(self, program, rpc_client_var,
+                                            pserver_endpoints):
+        # 2. add split_ids_op and send_vars_op to send gradient to pservers
+        # there should only be one table_name
+        all_ops = program.global_block().ops
+        table_grad_name = framework.grad_var_name(self.table_name)
+        for op in all_ops:
+            if table_grad_name in op.output_arg_names:
+                op_index = list(all_ops).index(op)
+                # insert split_ids_op
+                program.global_block().insert_op(
+                    index=op_index + 1,
+                    type="split_ids",
+                    inputs={
+                        'Ids': [program.global_block().vars[table_grad_name]]
+                    },
+                    outputs={"Out": self.table_grad_list})
+                program.global_block().insert_op(
+                    index=op_index + 2,
+                    type="send_vars",
+                    inputs={'X': self.table_grad_list},
+                    outputs={"RPCClient": rpc_client_var},
+                    attrs={"sync_send": True,
+                           "epmap": pserver_endpoints})
+                break
+
+    def _create_prefetch_block(self, pserver_index, pserver_program,
+                               optimize_block):
+        # STEP: create prefetch block
+        table_var = pserver_program.global_block().vars[self.table_name]
+        prefetch_block = pserver_program.create_block(optimize_block.idx)
+        trainer_ids = self.prefetch_input_vars[pserver_index]
+        pserver_ids = pserver_program.global_block().create_var(
+            name=trainer_ids.name,
+            type=trainer_ids.type,
+            shape=trainer_ids.shape,
+            dtype=trainer_ids.dtype)
+        trainer_out = self.prefetch_output_vars[pserver_index]
+        pserver_out = pserver_program.global_block().create_var(
+            name=trainer_out.name,
+            type=trainer_out.type,
+            shape=trainer_out.shape,
+            dtype=trainer_out.dtype)
+        prefetch_block.append_op(
+            type=LOOKUP_TABLE_TYPE,
+            inputs={'Ids': pserver_ids,
+                    "W": table_var},
+            outputs={"Out": pserver_out},
+            attrs={
+                "is_sparse": True,  # has no effect on lookup_table op
+                "is_distributed": True,
+                "padding_idx": -1
+            })
+        return prefetch_block
+
+    def _create_table_optimize_block(self, pserver_index, pserver_program,
+                                     append_block):
+        def _clone_var(block, var, persistable=True):
+            assert isinstance(var, Variable)
+            return block.create_var(
+                name=var.name,
+                shape=var.shape,
+                dtype=var.dtype,
+                type=var.type,
+                persistable=persistable)
+
+        # STEP: create table optimize block
+        # create table param and grad var in pserver program
+        param_var = _clone_var(
+            pserver_program.global_block(),
+            self.origin_program.global_block().vars[self.table_name])
+        grad_var = _clone_var(
+            pserver_program.global_block(),
+            self.origin_program.global_block().vars[framework.grad_var_name(
+                self.table_name)],
+            persistable=False)
+
+        # create grad vars in pserver program
+        table_grad_var = self.table_param_grad[1]
+        table_grad_list = [
+            pserver_program.global_block().create_var(
+                name="%s.trainer_%d.pserver_%d" %
+                (table_grad_var.name, index, pserver_index),
+                type=table_grad_var.type,
+                shape=table_grad_var.shape,
+                dtype=table_grad_var.dtype) for index in range(self.trainer_num)
+        ]
+
+        # create table optimize block in pserver program
+        table_opt_op = [
+            op for op in self.optimize_ops
+            if op.input("Param")[0] == self.table_name
+        ][0]
+        table_opt_block = pserver_program.create_block(append_block.idx)
+        # only support sgd now
+        assert table_opt_op.type == "sgd"
+
+        # append sum op for table_grad_list
+        table_opt_block.append_op(
+            type="sum",
+            inputs={"X": table_grad_list},
+            outputs={"Out": [grad_var]})
+
+        lr_var = pserver_program.global_block().vars[table_opt_op.input(
+            "LearningRate")[0]]
+        inputs = {
+            "Param": [param_var],
+            "Grad": [grad_var],
+            "LearningRate": [lr_var]
+        }
+        outputs = {"ParamOut": [param_var]}
+        table_opt_block.append_op(
+            type=table_opt_op.type,
+            inputs=inputs,
+            outputs=outputs,
+            attrs=table_opt_op.attrs)
+
     # ====================== private transpiler functions =====================
     def _create_vars_from_blocklist(self,
                                     program,
@@ -511,7 +778,17 @@ def _create_vars_from_blocklist(self,
             program.global_block().sync_with_cpp()
         return var_mapping
 
-    def _clone_var(self, block, var):
+    def create_splited_vars(self, source_var, block, tag):
+        return [
+            block.create_var(
+                name=str(source_var.name + tag + str(index)),
+                type=source_var.type,
+                shape=source_var.shape,
+                dtype=source_var.dtype)
+            for index in range(len(self.pserver_endpoints))
+        ]
+
+    def _clone_var(self, block, var, persistable=True):
         assert isinstance(var, Variable)
         return block.create_var(
             name=var.name,
@@ -519,12 +796,12 @@ def _clone_var(self, block, var):
             dtype=var.dtype,
             type=var.type,
             lod_level=var.lod_level,
-            persistable=True)
+            persistable=persistable)
 
     def _append_split_op(self, program, gradblocks):
         # Split variables that need to be split and append respective ops
         add_suffix = False
-        if self.trainers > 1:
+        if self.trainer_num > 1:
             add_suffix = True
         var_mapping = self._create_vars_from_blocklist(
             program, gradblocks, add_trainer_suffix=add_suffix)
@@ -615,9 +892,9 @@ def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
                     return
                 merged_var = \
                     pserver_block.vars[self._orig_varname(grad_block.name)]
-                if self.trainers > 1:
+                if self.trainer_num > 1:
                     vars2merge = []
-                    for i in xrange(self.trainers):
+                    for i in xrange(self.trainer_num):
                         per_trainer_name = "%s.trainer_%d" % \
                         (self._orig_varname(grad_block.name), i)
                         vars2merge.append(pserver_block.vars[per_trainer_name])
@@ -632,7 +909,7 @@ def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
                             type="scale",
                             inputs={"X": merged_var},
                             outputs={"Out": merged_var},
-                            attrs={"scale": 1.0 / float(self.trainers)})
+                            attrs={"scale": 1.0 / float(self.trainer_num)})
                 new_inputs[key] = merged_var
             elif key == "Param":
                 # param is already created on global program
@@ -668,7 +945,7 @@ def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
             new_shape = None
             if key in ["Param", "Grad", "LearningRate"]:
                 continue
-            var = self.program.global_block().vars[opt_op.input(key)[0]]
+            var = self.origin_program.global_block().vars[opt_op.input(key)[0]]
             # update accumulator variable shape
             param_shape = new_inputs["Param"].shape
             new_shape = self._get_optimizer_input_shape(opt_op.type, key,
@@ -681,8 +958,8 @@ def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
             new_inputs[key] = tmpvar
 
         # change output's ParamOut variable
-        outputs = self._get_output_map_from_op(self.program.global_block().vars,
-                                               opt_op)
+        outputs = self._get_output_map_from_op(
+            self.origin_program.global_block().vars, opt_op)
         outputs["ParamOut"] = new_inputs["Param"]
 
         optimize_block.append_op(
@@ -694,8 +971,8 @@ def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
     def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
         program = optimize_block.program
         # Append the ops for parameters that do not need to be optimized/updated
-        inputs = self._get_input_map_from_op(self.program.global_block().vars,
-                                             opt_op)
+        inputs = self._get_input_map_from_op(
+            self.origin_program.global_block().vars, opt_op)
         for varlist in inputs.itervalues():
             if not isinstance(varlist, list):
                 varlist = [varlist]
@@ -708,8 +985,8 @@ def _append_pserver_non_opt_ops(self, optimize_block, opt_op):
                         dtype=var.dtype,
                         shape=var.shape)
 
-        outputs = self._get_output_map_from_op(self.program.global_block().vars,
-                                               opt_op)
+        outputs = self._get_output_map_from_op(
+            self.origin_program.global_block().vars, opt_op)
 
         for varlist in outputs.itervalues():
             if not isinstance(varlist, list):
@@ -782,7 +1059,6 @@ def _is_opt_op_on_pserver(self, endpoint, op):
                 if same_or_split_var(n, param) and n != param:
                     return True
             return False
-        return False
 
     def _get_input_map_from_op(self, varmap, op):
         """Returns a dict from op input name to the vars in varmap."""
@@ -820,7 +1096,7 @@ def _get_lr_ops(self):
 
         find_ops = []
         # find ops which output is lr var
-        block = self.program.global_block()
+        block = self.origin_program.global_block()
         for op in block.ops:
             if set(op.output_arg_names) & lr_vars:
                 find_ops.append(op)
diff --git a/python/paddle/fluid/evaluator.py b/python/paddle/fluid/evaluator.py
index 19e5b61b0b32a..13475025b5c2a 100644
--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import warnings
 import numpy as np
 
 import layers
@@ -59,6 +60,9 @@ class Evaluator(object):
     """
 
     def __init__(self, name, **kwargs):
+        warnings.warn(
+            "The %s is deprecated, because maintain a modified program inside evaluator cause bug easily, please use fluid.metrics.%s instead."
+            % (self.__class__.__name__, self.__class__.__name__), Warning)
         self.states = []
         self.metrics = []
         self.helper = LayerHelper(name, **kwargs)
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 7622c4947c6d2..57f0724e701dd 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -833,6 +833,11 @@ def rename_var(self, name, new_name):
         del self.vars[name]
         self.sync_with_cpp()
 
+    def remove_var(self, name):
+        self.sync_with_cpp()
+        self.desc.remove_var(name)
+        del self.vars[name]
+
     def create_parameter(self, *args, **kwargs):
         global_block = self.program.global_block()
         param = Parameter(global_block, *args, **kwargs)
@@ -853,6 +858,11 @@ def insert_op(self, index, *args, **kwargs):
         self.ops.insert(index, op)
         return op
 
+    def remove_op(self, index):
+        self.sync_with_cpp()
+        self.desc.remove_op(index, index + 1)
+        del self.ops[index]
+
     def delete_ops(self, ops):
         # remove from cpp
         # FIXME(typhoonzero): remove only the first occurrence.
@@ -861,6 +871,7 @@ def delete_ops(self, ops):
             end = list(self.ops).index(ops[-1])
         except Exception, e:
             raise e
+
         self.desc.remove_op(start, end + 1)
 
     def slice_ops(self, start, end):
@@ -1123,24 +1134,6 @@ def block(self, index):
     def current_block(self):
         return self.blocks[self.current_block_idx]
 
-    def append_backward(self, target, no_grad_set=None):
-        """
-        return map(param_name -> (grad_name, block_index, op_index))
-        """
-        assert isinstance(target, Variable)
-        if no_grad_set is None:
-            no_grad_set = set()
-        try:
-            param_to_grad_info = self.desc.append_backward(target.desc,
-                                                           no_grad_set)
-        except Exception as e:
-            raise core.EnforceNotMet(
-                str(e) + "\nCurrent protobuf is\n{0}".format(
-                    self.to_string(False)))
-
-        self.sync_with_cpp()
-        return param_to_grad_info
-
     def create_block(self, parent_idx=None):
         new_block_idx = len(self.blocks)
         parent = self.current_block() if parent_idx is None else self.block(
@@ -1205,6 +1198,8 @@ def __init__(self, block, shape, dtype, **kwargs):
 
         self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
 
+        self.do_model_average = kwargs.get('do_model_average', None)
+
     def __str__(self):
         return self.to_string(True)
 
@@ -1225,7 +1220,7 @@ def to_string(self, throw_on_error, with_details=False):
         if with_details:
             res_str = Variable.to_string(self, throw_on_error, True)
             additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "gradient_clip_attr")
+                               "gradient_clip_attr", "do_model_average")
             for attr_name in additional_attr:
                 res_str += "%s: %s\n" % (attr_name,
                                          str(getattr(self, attr_name)))
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 927f1e625a579..4e132ed26183e 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -18,7 +18,8 @@
 
 __all__ = [
     'Constant', 'Uniform', 'Normal', 'Xavier', 'force_init_on_cpu',
-    'init_on_cpu'
+    'init_on_cpu', 'ConstantInitializer', 'UniformInitializer',
+    'NormalInitializer', 'XavierInitializer'
 ]
 
 _force_init_on_cpu_ = False
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index 969398bda4cfd..e7d6c4e2521be 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -21,8 +21,7 @@
 
 __all__ = [
     'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file',
-    'open_files', 'read_file', 'create_shuffle_reader',
-    'create_double_buffer_reader', 'create_multi_pass_reader'
+    'open_files', 'read_file', 'shuffle', 'double_buffer'
 ]
 
 
@@ -237,13 +236,9 @@ def __get_reader__():
         var = scope.find_var(reader.name)
         return var.get_reader()
 
-    def eof():
-        return not __get_reader__().has_next()
-
     def reset():
         return __get_reader__().reset()
 
-    reader.eof = eof
     reader.reset = reset
     reader.stop_gradient = True
     reader.persistable = True
@@ -283,7 +278,42 @@ def _copy_reader_create_op_(block, op):
     return new_op
 
 
-def open_recordio_file(filename, shapes, lod_levels, dtypes):
+def open_recordio_file(filename,
+                       shapes,
+                       lod_levels,
+                       dtypes,
+                       pass_num=1,
+                       for_parallel=False):
+    """
+    Open a RecordIO file
+
+    This layer takes a RecordIO file to read from and returns a Reader Variable.
+    Via the Reader Variable, we can get data from the given RecordIO file.
+
+    Args:
+       filename(str): The RecordIO file's name.
+       shapes(list): List of tuples which declaring data shapes.
+       lod_levels(list): List of ints which declaring data lod_level.
+       dtypes(list): List of strs which declaring data type.
+       pass_num(int): Number of passes to run.
+       for_parallel(Bool): Set it as True if you are going to run
+            subsequent operators in parallel.
+
+    Returns:
+       Variable: A Reader Variable via which we can get RecordIO file data.
+
+    Examples:
+       .. code-block:: python
+
+         reader = fluid.layers.io.open_recordio_file(
+                                          filename='./data.recordio',
+                                          shapes=[(3,224,224), (1)],
+                                          lod_levels=[0, 0],
+                                          dtypes=['float32', 'int64'])
+
+         # Via the reader, we can use 'read_file' layer to get data:
+         image, label = fluid.layers.read_file(reader)
+    """
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
     shape_concat = []
     ranks = []
@@ -310,10 +340,63 @@ def open_recordio_file(filename, shapes, lod_levels, dtypes):
     startup_var.persistable = True
     main_prog_var = _copy_reader_var_(default_main_program().current_block(),
                                       startup_var)
+
+    if pass_num > 1:
+        main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
+
+    if for_parallel:
+        main_prog_var = parallel(reader=main_prog_var)
+
     return monkey_patch_reader_methods(main_prog_var)
 
 
-def open_files(filenames, thread_num, shapes, lod_levels, dtypes):
+def open_files(filenames,
+               shapes,
+               lod_levels,
+               dtypes,
+               thread_num,
+               buffer_size=None,
+               pass_num=1,
+               for_parallel=False):
+    """
+    Open files
+
+    This layer takes a list of files to read from and returns a Reader Variable. 
+    Via the Reader Variable, we can get data from given files. All files must 
+    have name suffixs to indicate their formats, e.g., '*.recordio'. 
+
+    Args:
+       filenames(list): The list of file names.
+       shapes(list): List of tuples which declaring data shapes.
+       lod_levels(list): List of ints which declaring data lod_level.
+       dtypes(list): List of strs which declaring data type.
+       thread_num(int): The maximal concurrent prefetch thread number.
+       buffer_size(int): The size of prefetch buffer.
+       pass_num(int): Number of passes to run.
+       for_parallel(Bool): Set it as True if you are going to run 
+            subsequent operators in parallel.
+
+    Returns:
+       Variable: A Reader Variable via which we can get file data.
+
+    Examples:
+       .. code-block:: python
+
+         reader = fluid.layers.io.open_files(filenames=['./data1.recordio',
+                                                     './data2.recordio'],
+                                             shapes=[(3,224,224), (1)],
+                                             lod_levels=[0, 0],
+                                             dtypes=['float32', 'int64'],
+                                             thread_num=2,
+                                             buffer_size=2)
+
+         # Via the reader, we can use 'read_file' layer to get data:
+         image, label = fluid.layers.io.read_file(reader)
+    """
+    if buffer_size is None:
+        buffer_size = thread_num
+    if isinstance(filenames, basestring):
+        filenames = [filenames]
     dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
     shape_concat = []
     ranks = []
@@ -322,29 +405,36 @@ def open_files(filenames, thread_num, shapes, lod_levels, dtypes):
         shape_concat.extend(shape)
         ranks.append(len(shape))
 
-    var_name = unique_name('multiple_reader')
-
+    multi_file_reader_name = unique_name('multi_file_reader')
     startup_blk = default_startup_program().current_block()
-    startup_var = startup_blk.create_var(name=var_name)
+    startup_reader = startup_blk.create_var(name=multi_file_reader_name)
     startup_blk.append_op(
         type='open_files',
-        outputs={'Out': [startup_var]},
+        outputs={'Out': [startup_reader]},
         attrs={
             'shape_concat': shape_concat,
             'lod_levels': lod_levels,
             'ranks': ranks,
             'file_names': filenames,
-            'thread_num': thread_num
+            'thread_num': thread_num,
+            'buffer_size': buffer_size
         })
 
-    startup_var.desc.set_dtypes(dtypes)
-    startup_var.persistable = True
-    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
-                                      startup_var)
-    return monkey_patch_reader_methods(main_prog_var)
+    startup_reader.desc.set_dtypes(dtypes)
+    startup_reader.persistable = True
+    main_prog_reader = _copy_reader_var_(default_main_program().current_block(),
+                                         startup_reader)
+    if pass_num > 1:
+        main_prog_reader = multi_pass(
+            reader=main_prog_reader, pass_num=pass_num)
+
+    if for_parallel:
+        main_prog_reader = parallel(reader=main_prog_reader)
+
+    return monkey_patch_reader_methods(main_prog_reader)
 
 
-def __create_decorated_reader__(op_type, reader, attrs):
+def __create_shared_decorated_reader__(op_type, reader, attrs):
     var_name = unique_name(op_type)
     startup_blk = default_startup_program().current_block()
     startup_var = startup_blk.create_var(name=var_name)
@@ -360,22 +450,41 @@ def __create_decorated_reader__(op_type, reader, attrs):
     return monkey_patch_reader_methods(main_prog_var)
 
 
-def create_shuffle_reader(reader, buffer_size):
-    return __create_decorated_reader__('create_shuffle_reader', reader,
-                                       {'buffer_size': int(buffer_size)})
+def __create_unshared_decorated_reader__(op_type, reader, attrs):
+    new_reader_name = unique_name(op_type)
+    main_blk = default_main_program().current_block()
+    new_reader = main_blk.create_var(name=new_reader_name)
+    main_blk.append_op(
+        type=op_type,
+        inputs={'UnderlyingReader': reader},
+        outputs={'Out': [new_reader]},
+        attrs=attrs)
+    new_reader.persistable = True
+    new_reader.stop_gradient = True
+    return monkey_patch_reader_methods(new_reader)
+
+
+def shuffle(reader, buffer_size):
+    return __create_unshared_decorated_reader__(
+        'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
 
 
-def create_double_buffer_reader(reader, place=None):
+def double_buffer(reader, place=None):
     attrs = dict()
     if place is not None:
         attrs['place'] = str(place).upper()
-    return __create_decorated_reader__('create_double_buffer_reader', reader,
-                                       attrs)
+    return __create_unshared_decorated_reader__('create_double_buffer_reader',
+                                                reader, attrs)
+
+
+def multi_pass(reader, pass_num):
+    return __create_shared_decorated_reader__(
+        'create_multi_pass_reader', reader, {'pass_num': int(pass_num)})
 
 
-def create_multi_pass_reader(reader, pass_num):
-    return __create_decorated_reader__('create_multi_pass_reader', reader,
-                                       {'pass_num': int(pass_num)})
+def parallel(reader):
+    return __create_shared_decorated_reader__('create_threaded_reader', reader,
+                                              {})
 
 
 def read_file(file_obj):
diff --git a/python/paddle/fluid/layers/metric.py b/python/paddle/fluid/layers/metric.py
index 3d9157ad4ef93..f66dccfa2d040 100644
--- a/python/paddle/fluid/layers/metric.py
+++ b/python/paddle/fluid/layers/metric.py
@@ -15,12 +15,13 @@
 All layers just related to metric.
 """
 
+import warnings
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
 from ..param_attr import ParamAttr
 
-__all__ = ['accuracy']
+__all__ = ['accuracy', 'auc']
 
 
 def accuracy(input, label, k=1, correct=None, total=None):
@@ -55,3 +56,37 @@ def accuracy(input, label, k=1, correct=None, total=None):
             "Total": [total],
         })
     return acc_out
+
+
+def auc(input, label, curve='ROC', num_thresholds=200):
+    warnings.warn(
+        "This interface not recommended, fluid.layers.auc compute the auc at every minibatch, \
+        but can not aggregate them and get the pass AUC, because pass \
+        auc can not be averaged with weighted from the minibatch auc value. \
+        Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \
+        which can get every minibatch and every pass auc value.", Warning)
+    helper = LayerHelper("auc", **locals())
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
+    topk_indices = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="top_k",
+        inputs={"X": [input]},
+        outputs={"Out": [topk_out],
+                 "Indices": [topk_indices]},
+        attrs={"k": k})
+    auc_out = helper.create_tmp_variable(dtype="float32")
+    if correct is None:
+        correct = helper.create_tmp_variable(dtype="int64")
+    if total is None:
+        total = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="accuracy",
+        inputs={
+            "Out": [topk_out],
+            "Indices": [topk_indices],
+            "Label": [label]
+        },
+        attrs={"curve": curve,
+               "num_thresholds": num_thresholds},
+        outputs={"AUC": [auc_out], })
+    return auc_out
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index d2e7d58524bfb..5c2c2dd7abebf 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -218,6 +218,7 @@ def fc(input,
 def embedding(input,
               size,
               is_sparse=False,
+              is_distributed=False,
               padding_idx=None,
               param_attr=None,
               dtype='float32'):
@@ -268,8 +269,11 @@ def embedding(input,
         inputs={'Ids': input,
                 'W': w},
         outputs={'Out': tmp},
-        attrs={'is_sparse': is_sparse,
-               'padding_idx': padding_idx})
+        attrs={
+            'is_sparse': is_sparse,
+            'is_distributed': is_distributed,
+            'padding_idx': padding_idx
+        })
     return tmp
 
 
@@ -1516,7 +1520,8 @@ def batch_norm(input,
                in_place=False,
                name=None,
                moving_mean_name=None,
-               moving_variance_name=None):
+               moving_variance_name=None,
+               do_model_average_for_mean_and_var=False):
     """
     This function helps create an operator to implement
     the BatchNorm layer using the configurations from the input parameters.
@@ -1547,7 +1552,10 @@ def batch_norm(input,
 
     mean = helper.create_parameter(
         attr=ParamAttr(
-            name=moving_mean_name, initializer=Constant(0.0), trainable=False),
+            name=moving_mean_name,
+            initializer=Constant(0.0),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
         shape=param_shape,
         dtype=input.dtype)
     mean.stop_gradient = True
@@ -1556,7 +1564,8 @@ def batch_norm(input,
         attr=ParamAttr(
             name=moving_variance_name,
             initializer=Constant(1.0),
-            trainable=False),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
         shape=param_shape,
         dtype=input.dtype)
     variance.stop_gradient = True
@@ -3374,14 +3383,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
     Here are some examples to explain it.
 
     1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
-    is [6, 8], the reshape operator will transform x into a 2-D tensor with 
+    is [6, 8], the reshape operator will transform x into a 2-D tensor with
     shape [6, 8] and leaving x's data unchanged.
 
     2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
     specified is [2, 3, -1, 2], the reshape operator will transform x into a
     4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
-    case, one dimension of the target shape is set to -1, the value of this 
-    dimension is inferred from the total element number of x and remaining 
+    case, one dimension of the target shape is set to -1, the value of this
+    dimension is inferred from the total element number of x and remaining
     dimensions.
 
     3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
@@ -3615,7 +3624,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
 def pad(x, paddings, pad_value=0., name=None):
     """
     Pads a tensor with a constant value given by :attr:`pad_value`, and the
-    padded width is specified by :attr:`paddings`. 
+    padded width is specified by :attr:`paddings`.
 
     Specifically, the number of values padded before the contents of :attr:`x`
     in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number
@@ -3643,7 +3652,7 @@ def pad(x, paddings, pad_value=0., name=None):
         x (Variable): The input tensor variable.
         paddings (list): A list of integers. Its elements specify the padded
                          width before and after for each dimension in turn.
-                         The length of :attr:paddings must be 
+                         The length of :attr:paddings must be
                          :math:`rank(x) \\times 2`.
         pad_value (float): The constant value used to pad.
         name(str|None): A name for this layer(optional). If set None, the layer
diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py
new file mode 100644
index 0000000000000..99a81c1d4244b
--- /dev/null
+++ b/python/paddle/fluid/metrics.py
@@ -0,0 +1,378 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fluid Metrics
+
+The metrics are accomplished via Python natively. 
+"""
+import numpy as np
+import copy
+import warnings
+
+__all__ = [
+    'MetricBase',
+    'CompositeMetric',
+    'Accuracy',
+    'ChunkEvaluator',
+    'EditDistance',
+    'DetectionMAP',
+    'Auc',
+]
+
+
+def _is_numpy_(var):
+    return isinstance(var, (np.ndarray, np.generic))
+
+
+def _is_number_(var):
+    return isinstance(var, int) or isinstance(var, float) or (isinstance(
+        var, np.ndarray) and var.shape == (1, ))
+
+
+def _is_number_or_matrix_(var):
+    return _is_number_(var) or isinstance(var, np.ndarray)
+
+
+class MetricBase(object):
+    """
+    Base Class for all evaluators
+
+    Args:
+        name(str): The name of evaluator. such as, "accuracy". Used for generate
+            temporary variable name.
+    Interface:
+        Note(*) : the states is the attributes who not has _ prefix.
+
+        get_config(): print current states and configuration
+        reset(): clear the states. If the Metrics states type is not (int, float, np.ndarray),
+                Please override this method.
+        update(): update states at every minibatch
+        eval(): get metric evaluation in numpy type.
+    """
+
+    def __init__(self, name, **kwargs):
+        self._name = str(name) if name != None else self.__class__.__name__
+        self._kwargs = kwargs if kwargs != None else dict()
+        self.reset()
+
+    def __str__(self):
+        return self._name
+
+    def reset(self):
+        """
+        states is the attributes who not has _ prefix.
+        reset the states of metrics.
+        """
+        states = {
+            attr: value
+            for attr, value in self.__dict__.iteritems()
+            if not attr.startswith("_")
+        }
+        for attr, value in states.iteritems():
+            if isinstance(value, int):
+                setattr(self, attr, 0)
+            elif isinstance(value, float):
+                setattr(self, attr, .0)
+            elif isinstance(value, (np.ndarray, np.generic)):
+                setattr(self, attr, np.zeros_like(value))
+            else:
+                setattr(self, attr, None)
+
+    def get_config(self):
+        states = {
+            attr: value
+            for attr, value in self.__dict__.iteritems()
+            if not attr.startswith("_")
+        }
+        config = copy.deepcopy(self._kwargs)
+        config.update({"name": self._name, "states": copy.deepcopy(states)})
+        return config
+
+    def update(self):
+        raise NotImplementedError()
+
+    def eval(self):
+        raise NotImplementedError()
+
+
+class CompositeMetric(MetricBase):
+    """
+    Compute multiple metrics in each minibatch.
+    for example, merge F1, accuracy, recall into one Metric.
+    """
+
+    def __init__(self, name=None, **kwargs):
+        super(CompositeMetric, self).__init__(name, kwargs)
+        self._metrics = []
+
+    def add_metric(self, metric):
+        if not isinstance(metric, MetricBase):
+            raise ValueError("SubMetric should be inherit from MetricBase.")
+        self._metrics.append(metric)
+
+    def eval(self):
+        ans = []
+        for m in self._metrics:
+            ans.append(m.eval())
+        return ans
+
+
+class Accuracy(MetricBase):
+    """
+    Accumulate the accuracy from minibatches and compute the average accuracy
+    for every pass.
+
+    Args:
+       name: the metrics name
+
+    Example:
+        minibatch_accuracy = fluid.layers.accuracy(pred, label)
+        accuracy_evaluator = fluid.metrics.Accuracy()
+        for epoch in PASS_NUM:
+            accuracy_evaluator.reset()
+            for data in batches:
+                loss = exe.run(fetch_list=[cost, minibatch_accuracy])
+            accuracy_evaluator.update(value=minibatch_accuracy, weight=batches)
+            accuracy = accuracy_evaluator.eval()
+    """
+
+    def __init__(self, name=None):
+        super(Accuracy, self).__init__(name)
+        self.value = .0
+        self.weight = .0
+
+    def update(self, value, weight):
+        if not _is_number_or_matrix_(value):
+            raise ValueError(
+                "The 'value' must be a number(int, float) or a numpy ndarray.")
+        if not _is_number_(weight):
+            raise ValueError("The 'weight' must be a number(int, float).")
+        self.value += value * weight
+        self.weight += weight
+
+    def eval(self):
+        if self.weight == 0:
+            raise ValueError(
+                "There is no data in Accuracy Metrics. Please check layers.accuracy output has added to Accuracy."
+            )
+        return self.value / self.weight
+
+
+class ChunkEvalutor(MetricBase):
+    """
+    Accumulate counter numbers output by chunk_eval from mini-batches and
+    compute the precision recall and F1-score using the accumulated counter
+    numbers.
+    """
+
+    def __init__(self, name=None):
+        super(ChunkEvalutor, self).__init__(name)
+        self.num_infer_chunks = 0
+        self.num_label_chunks = 0
+        self.num_correct_chunks = 0
+
+    def update(self, num_infer_chunks, num_label_chunks, num_correct_chunks):
+        if not _is_number_or_matrix_(num_infer_chunks):
+            raise ValueError(
+                "The 'num_infer_chunks' must be a number(int, float) or a numpy ndarray."
+            )
+        if not _is_number_or_matrix_(num_label_chunks):
+            raise ValueError(
+                "The 'num_label_chunks' must be a number(int, float) or a numpy ndarray."
+            )
+        if not _is_number_or_matrix_(num_correct_chunks):
+            raise ValueError(
+                "The 'num_correct_chunks' must be a number(int, float) or a numpy ndarray."
+            )
+        self.num_infer_chunks += num_infer_chunks
+        self.num_label_chunks += num_label_chunks
+        self.num_correct_chunks += num_correct_chunks
+
+    def eval(self):
+        precision = float(
+            self.num_correct_chunks
+        ) / self.num_infer_chunks if self.num_infer_chunks else 0
+        recall = float(self.num_correct_chunks
+                       ) / self.num_label_chunks if self.num_label_chunks else 0
+        f1_score = float(2 * precision * recall) / (
+            precision + recall) if self.num_correct_chunks else 0
+        return precision, recall, f1_score
+
+
+class EditDistance(MetricBase):
+    """
+    Accumulate edit distance sum and sequence number from mini-batches and
+    compute the average edit_distance and instance error of all batches.
+
+    Args:
+        name: the metrics name
+
+    Example:
+        edit_distance_metrics = fluid.layers.edit_distance(input, label)
+        distance_evaluator = fluid.metrics.EditDistance()
+        for epoch in PASS_NUM:
+            distance_evaluator.reset()
+            for data in batches:
+                loss = exe.run(fetch_list=[cost] + list(edit_distance_metrics))
+            distance_evaluator.update(*edit_distance_metrics)
+            distance, instance_error = distance_evaluator.eval()
+
+        In the above example:
+        'distance' is the average of the edit distance in a pass.
+        'instance_error' is the instance error rate in a pass.
+
+    """
+
+    def __init__(self, name):
+        super(EditDistance, self).__init__(name)
+        self.total_distance = .0
+        self.seq_num = 0
+        self.instance_error = 0
+
+    def update(self, distances, seq_num):
+        if not _is_numpy_(distances):
+            raise ValueError("The 'distances' must be a numpy ndarray.")
+        if not _is_number_(seq_num):
+            raise ValueError("The 'seq_num' must be a number(int, float).")
+        seq_right_count = np.sum(distances == 0)
+        total_distance = np.sum(distances)
+        self.seq_num += seq_num
+        self.instance_error += seq_num - seq_right_count
+        self.total_distance += total_distance
+
+    def eval():
+        if self.seq_num == 0:
+            raise ValueError(
+                "There is no data in EditDistance Metric. Please check layers.edit_distance output has been added to EditDistance."
+            )
+        avg_distance = self.total_distance / self.seq_num
+        avg_instance_error = self.instance_error / self.seq_num
+        return avg_distance, avg_instance_error
+
+
+class DetectionMAP(MetricBase):
+    """
+    Calculate the detection mean average precision (mAP).
+
+    TODO (Dang Qingqing): update the following doc.
+    The general steps are as follows:
+    1. calculate the true positive and false positive according to the input
+        of detection and labels.
+    2. calculate mAP value, support two versions: '11 point' and 'integral'.
+
+    Please get more information from the following articles:
+      https://sanchom.wordpress.com/tag/average-precision/
+      https://arxiv.org/abs/1512.02325
+    """
+
+    def __init__(self, name=None):
+        super(DetectionMAP, self).__init__(name)
+        # the current map value
+        self.value = .0
+
+    def update(self, value, weight):
+        if not _is_number_or_matrix_(value):
+            raise ValueError(
+                "The 'value' must be a number(int, float) or a numpy ndarray.")
+        if not _is_number_(weight):
+            raise ValueError("The 'weight' must be a number(int, float).")
+        self.value += value
+        self.weight += weight
+
+    def eval(self):
+        if self.weight == 0:
+            raise ValueError(
+                "There is no data in DetectionMAP Metrics. "
+                "Please check layers.detection_map output has added to DetectionMAP."
+            )
+        return self.value / self.weight
+
+
+class Auc(MetricBase):
+    """
+    Auc Metrics which adapts to binary classification.
+    Need to note that auc metrics compute the value via Python natively.
+    If you concern the speed, please use the fluid.layers.auc instead.
+
+    The `auc` function creates four local variables, `true_positives`,
+      `true_negatives`, `false_positives` and `false_negatives` that are used to
+      compute the AUC. To discretize the AUC curve, a linearly spaced set of
+      thresholds is used to compute pairs of recall and precision values. The area
+      under the ROC-curve is therefore computed using the height of the recall
+      values by the false positive rate, while the area under the PR-curve is the
+      computed using the height of the precision values by the recall.
+
+    Args:
+        name: metric name
+        curve: Specifies the name of the curve to be computed, 'ROC' [default] or
+          'PR' for the Precision-Recall-curve.
+        num_thresholds: The number of thresholds to use when discretizing the roc
+            curve.
+
+    "NOTE: only implement the ROC curve type via Python now."
+    """
+
+    def __init__(self, name, curve='ROC', num_thresholds=200):
+        super(MetricBase, self).__init__(name, curve, num_thresholds)
+        self._curve = curve
+        self._num_thresholds = num_thresholds
+        self._epsilon = 1e-6
+        self.tp_list = np.ndarray((num_thresholds, ))
+        self.fn_list = np.ndarray((num_thresholds, ))
+        self.tn_list = np.ndarray((num_thresholds, ))
+        self.fp_list = np.ndarray((num_thresholds, ))
+
+    def update(self, labels, predictions, axis=1):
+        if not _is_numpy_(labels):
+            raise ValueError("The 'labels' must be a numpy ndarray.")
+        if not _is_numpy_(predictions):
+            raise ValueError("The 'predictions' must be a numpy ndarray.")
+
+        kepsilon = 1e-7  # to account for floating point imprecisions
+        thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
+                      for i in range(num_thresholds - 2)]
+        thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
+
+        # caculate TP, FN, TN, FP count
+        for idx_thresh, thresh in enumerate(thresholds):
+            tp, fn, tn, fp = 0, 0, 0, 0
+            for i, lbl in enumerate(labels):
+                if lbl:
+                    if predictions[i, 0] >= thresh:
+                        tp += 1
+                    else:
+                        fn += 1
+                else:
+                    if predictions[i, 0] >= thresh:
+                        fp += 1
+                    else:
+                        tn += 1
+            tp_list[idx_thresh] += tp
+            fn_list[idx_thresh] += fn
+            tn_list[idx_thresh] += tn
+            fp_list[idx_thresh] += fp
+
+    def eval(self):
+        epsilon = self._epsilon
+        num_thresholds = self._num_thresholds
+        tpr = (tp_list.astype("float32") + epsilon) / (
+            tp_list + fn_list + epsilon)
+        fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon)
+        rec = (tp_list.astype("float32") + epsilon) / (
+            tp_list + fp_list + epsilon)
+
+        x = fpr[:num_thresholds - 1] - fpr[1:]
+        y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
+        auc_value = np.sum(x * y)
+        return auc_value
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 180575c35dc6e..36503cac6d539 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import re
 from collections import defaultdict
 from paddle.fluid.framework import Program
 import framework
@@ -818,8 +818,8 @@ class ModelAverage(Optimizer):
     min_average_window, max_average_window and current update times.
 
     Args:
-        params_grads: A list of parameter-grad variable pairs.
         average_window_rate: The rate of average window.
+        params_grads: A list of parameter-grad variable pairs.
         min_average_window: The minimum size of average window.
         max_average_window: The maximum size of average window.
 
@@ -840,8 +840,8 @@ class ModelAverage(Optimizer):
     """
 
     def __init__(self,
-                 params_grads,
                  average_window_rate,
+                 params_grads=None,
                  min_average_window=10000,
                  max_average_window=10000,
                  **kwargs):
@@ -849,24 +849,37 @@ def __init__(self,
         self.average_window = average_window_rate
         self.min_average_window = min_average_window
         self.max_average_window = max_average_window
-        self.params_grads = params_grads
+
+        self.params_grads = [] if params_grads is None else params_grads
+        params = {}
+        for param, grad in self.params_grads:
+            if param.do_model_average != False:
+                params[param.name] = (param, grad)
+        for param in framework.default_main_program().global_block(
+        ).all_parameters():
+            if param.name not in params and param.do_model_average != False:
+                grad = param.block.create_var(
+                    name=unique_name.generate(".".join([param.name, 'tmp'])),
+                    dtype=param.dtype,
+                    persistable=False,
+                    stop_gradient=True)
+                params[param.name] = (param, grad)
+        self.params_grads = params.values()
+
         for param, grad in self.params_grads:
-            if grad is not None:
-                self._append_average_accumulate_op(param)
+            self._append_average_accumulate_op(param)
 
         self.apply_program = Program()
         block = self.apply_program.global_block()
         with program_guard(main_program=self.apply_program):
             for param_grad in self.params_grads:
-                if param_grad[1] is not None:
-                    self._add_average_apply_op(block, param_grad)
+                self._add_average_apply_op(block, param_grad)
 
         self.restore_program = Program()
         block = self.restore_program.global_block()
         with program_guard(main_program=self.restore_program):
             for param_grad in self.params_grads:
-                if param_grad[1] is not None:
-                    self._add_average_restore_op(block, param_grad)
+                self._add_average_restore_op(block, param_grad)
 
     def _add_average_apply_op(self, block, param_grad):
         param = block.clone_variable(param_grad[0])
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 24dfa6144ae95..5ce2aa1fc4d0b 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -100,9 +100,11 @@ def __init__(self,
         local_scopes = share_vars_from.executor.local_scopes(
         ) if share_vars_from else []
 
-        persistable_vars = [
+        self.persistable_vars = [
             v.name
-            for v in filter(lambda var: var.persistable, main.list_vars())
+            for v in filter(lambda var: \
+                var.persistable and var.type != core.VarDesc.VarType.RAW,
+                main.list_vars())
         ]
 
         self.executor = core.ParallelExecutor(
@@ -113,7 +115,7 @@ def __init__(self,
                 p.name for p in main.global_block().iter_parameters()
                 if not p.stop_gradient
             ]),
-            set(persistable_vars),
+            set(self.persistable_vars),
             main.desc,
             loss_name if loss_name else '',
             scope,
@@ -143,3 +145,6 @@ def run(self, fetch_list, feed_dict={}):
         self.executor.run(fetch_list, fetch_var_name, feed_tensor_dict)
         arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()
         return [arr[i] for i in range(len(arr))]
+
+    def bcast_params(self):
+        self.executor.bcast_params(set(self.persistable_vars))
diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py
index 255cd2104325a..1c6970441bccd 100644
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -28,13 +28,15 @@ def __init__(self,
                  learning_rate=1.0,
                  regularizer=None,
                  trainable=True,
-                 gradient_clip=None):
+                 gradient_clip=None,
+                 do_model_average=None):
         self.name = name
         self.initializer = initializer
         self.learning_rate = learning_rate
         self.regularizer = regularizer
         self.trainable = trainable
         self.gradient_clip = gradient_clip
+        self.model_average = do_model_average
 
     def set_default_initializer(self, initializer):
         if initializer is None:
@@ -80,7 +82,8 @@ def to_kwargs(self, with_initializer=False):
             },
             'regularizer': self.regularizer,
             'trainable': self.trainable,
-            'gradient_clip_attr': self.gradient_clip
+            'gradient_clip_attr': self.gradient_clip,
+            'model_average': self.model_average
         }
         if with_initializer:
             kwargs['initializer'] = self.initializer
@@ -90,7 +93,7 @@ def to_kwargs(self, with_initializer=False):
 class WeightNormParamAttr(ParamAttr):
     """
     Used for weight normalization. Any field in ParamAttr can also be set here.
-    Besides, an extra field dim can be set to indicate the dimension except 
+    Besides, an extra field dim can be set to indicate the dimension except
     which to normalize.
     """
     # List to record the parameters reparameterized by weight normalization.
diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
index c0a6df831acbf..4d8bca4d2430a 100644
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -37,7 +37,7 @@
 mix_hidden_lr = 1e-3
 
 IS_SPARSE = True
-PASS_NUM = 10
+PASS_NUM = 100
 BATCH_SIZE = 10
 
 embedding_name = 'emb'
@@ -77,7 +77,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
     emb_layers.append(mark_embedding)
 
     hidden_0_layers = [
-        fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
+        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
+        for emb in emb_layers
     ]
 
     hidden_0 = fluid.layers.sums(input=hidden_0_layers)
@@ -94,8 +95,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
 
     for i in range(1, depth):
         mix_hidden = fluid.layers.sums(input=[
-            fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
-            fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
+            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
+            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
         ])
 
         lstm = fluid.layers.dynamic_lstm(
@@ -109,8 +110,8 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
         input_tmp = [mix_hidden, lstm]
 
     feature_out = fluid.layers.sums(input=[
-        fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
-        fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
+        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
+        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
     ])
 
     return feature_out
@@ -171,7 +172,7 @@ def train(use_cuda, save_dirname=None, is_local=True):
     # check other optimizers and check why out will be NAN
     sgd_optimizer = fluid.optimizer.SGD(
         learning_rate=fluid.layers.exponential_decay(
-            learning_rate=0.0001,
+            learning_rate=0.01,
             decay_steps=100000,
             decay_rate=0.5,
             staircase=True))
@@ -233,7 +234,7 @@ def train_loop(main_program):
                         print("second per batch: " + str((time.time(
                         ) - start_time) / batch_id))
                     # Set the threshold low to speed up the CI test
-                    if float(pass_precision) > 0.05:
+                    if float(pass_precision) > 0.01:
                         if save_dirname is not None:
                             # TODO(liuyiqun): Change the target to crf_decode
                             fluid.io.save_inference_model(save_dirname, [
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
index 10aa63e18a6ee..7ecf9a1459ffc 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -14,23 +14,13 @@
 
 import unittest
 import numpy as np
-from op_test import OpTest
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest
 from paddle.fluid.framework import grad_var_name
 
 
-def get_backward_op(scope, op, no_grad_set):
-    backward_op = core.Operator.backward(op, no_grad_set)
-    for input in backward_op.input_vars():
-        var = scope.var(input)
-        var.get_tensor()
-    for output in backward_op.output_vars():
-        var = scope.var(output)
-        var.get_tensor()
-    return backward_op
-
-
 def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
     x_shape = x.shape
     if len(x_shape) == 2:
@@ -64,11 +54,6 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
 
 def _reference_training(x, scale, offset, epsilon, data_format):
     x_shape = x.shape
-    if len(x_shape) == 2:
-        if data_format == "NCHW":
-            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
-        else:
-            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
 
     if data_format == "NCHW":
         n, c, h, w = x.shape
@@ -88,8 +73,6 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         offset_tile = np.reshape(offset, (1, c, 1, 1))
         offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
         y = normalized * scale_tile + offset_tile
-        if len(x_shape) == 2:
-            y = np.reshape(y, (y.shape[0], y.shape[1]))
         return y, mean, var
     elif data_format == "NHWC":
         x_square = x * x
@@ -100,59 +83,42 @@ def _reference_training(x, scale, offset, epsilon, data_format):
         var = x_square_sum / element_count - mean * mean
         normalized = (x - mean) / np.sqrt(var + epsilon)
         y = normalized * scale + offset
-        if len(x_shape) == 2:
-            y = np.reshape(y, x_shape)
         return y, mean, var
     else:
         raise ValueError("Unknown data order.")
 
 
-def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
+def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format):
     # Use the following formulas to calculate gradients:
     # grad_scale =
     #   sum(grad_y * (x - mean)) * rsqrt(var + epsilon)
     #
     # grad_offset = sum(output_y)
     #
-    # grad_x =
+    # x_grad =
     #   1/N * scale * rsqrt(var + epsilon) * (N * grad_y - sum(grad_y) -
     #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
 
     # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
-    x_shape = x.shape
-
-    if len(x_shape) == 2:
-        if data_format == "NCHW":
-            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
-            grad_y = np.reshape(grad_y,
-                                (grad_y.shape[0], grad_y.shape[1], 1, 1))
-        else:
-            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
-            grad_y = np.reshape(grad_y,
-                                (grad_y.shape[0], 1, 1, grad_y.shape[1]))
-
     if data_format == "NCHW":
         x = np.transpose(x, (0, 2, 3, 1))
-        grad_y = np.transpose(grad_y, (0, 2, 3, 1))
+        y_grad = np.transpose(y_grad, (0, 2, 3, 1))
 
-        # raise ValueError("data_format must be NHWC, got %s." % data_format)
-    grad_x = scale * (grad_y - np.mean(
-        grad_y, axis=(0, 1, 2)) - (x - mean) * np.mean(
-            grad_y * (x - mean), axis=(0, 1, 2)) /
+    x_grad = scale * (y_grad - np.mean(
+        y_grad, axis=(0, 1, 2)) - (x - mean) * np.mean(
+            y_grad * (x - mean), axis=(0, 1, 2)) /
                       (var + epsilon)) / np.sqrt(var + epsilon)
-    grad_scale = np.sum(grad_y * (x - mean) / np.sqrt(var + epsilon),
+    grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon),
                         axis=(0, 1, 2))
-    grad_offset = np.sum(grad_y, axis=(0, 1, 2))
+    grad_offset = np.sum(y_grad, axis=(0, 1, 2))
 
     # transfer back to N, C, H, W
     if data_format == "NCHW":
-        grad_x = np.transpose(grad_x, (0, 3, 1, 2))
+        x_grad = np.transpose(x_grad, (0, 3, 1, 2))
         x = np.transpose(x, (0, 3, 1, 2))
-        grad_y = np.transpose(grad_y, (0, 3, 1, 2))
+        y_grad = np.transpose(y_grad, (0, 3, 1, 2))
 
-    if len(x_shape) == 2:
-        grad_x = np.reshape(grad_x, x_shape)
-    return grad_x, grad_scale, grad_offset
+    return x_grad, grad_scale, grad_offset
 
 
 def create_or_get_tensor(scope, var_name, var, place):
@@ -186,7 +152,7 @@ def __set_tensor__(name, data=None):
         __set_tensor__(output, data)
 
 
-class TestBatchNormOpInference(OpTest):
+class TestBatchNormOpInference(unittest.TestCase):
     def setUp(self):
         self.dtype = np.float32
 
@@ -304,231 +270,121 @@ def test_check_output(self):
                 self.check_with_place(place, data_format, self.dtype, [2, 3])
 
 
-class TestBatchNormOpTraining(OpTest):
+class TestBatchNormOpTraining(unittest.TestCase):
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        if not np.allclose(np.array(tensor), np_array, atol=atol):
+            import pdb
+            pdb.set_trace()
         self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
 
-    def test_python_testing(self):
-        data_format = "NHWC"
-        epsilon = 0.00001
-
-        n, h, w, c = 2, 3, 4, 5
-        x_shape = [n, h, w, c]
-        scale_shape = [c]
-
-        x_val = np.random.random_sample(x_shape).astype(np.float32)
-        scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-        bias_val = np.random.random_sample(scale_shape).astype(np.float32)
-
-        mean = np.zeros(scale_shape).astype(np.float32)
-        variance = np.ones(scale_shape).astype(np.float32)
-
-        y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance,
-                                   epsilon, "NHWC")
-
-        # running N, C, H, W case
-        # should produce the same results
-        x_shape2 = [n, c, h, w]
-        x_val2 = np.transpose(x_val, (0, 3, 1, 2))
-        y_out2 = _reference_testing(x_val2, scale_val, bias_val, mean, variance,
-                                    epsilon, "NCHW")
-
-        # transfer (N, C, H, W) back to (N, H, W, C)
-        y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
-        self.__assert_close(y_out, y_out2_trans, "inference output")
-        print 'python: NHWC, NCHW, inference checking passed'
-
-    def test_python_training(self):
-        data_format = "NHWC"
-        epsilon = 0.00001
-        momentum = 0.9
-
-        # N, H, W, C: 2, 3, 4, 2
-        n, h, w, c = 2, 3, 4, 5
-        x_shape = [n, h, w, c]
-        scale_shape = [c]
-
-        x_val = np.random.random_sample(x_shape).astype(np.float32)
-        scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-        bias_val = np.random.random_sample(scale_shape).astype(np.float32)
-
-        mean = np.zeros(scale_shape).astype(np.float32)
-        variance = np.ones(scale_shape).astype(np.float32)
-
-        # run forward
-        y_out, saved_mean, var_ref = _reference_training(
-            x_val, scale_val, bias_val, epsilon, "NHWC")
-
-        #
-        mean_out = saved_mean * (1. - momentum) + momentum * mean
-        variance_out = var_ref * (1. - momentum) + momentum * variance
-        saved_variance = 1. / np.sqrt(var_ref + epsilon)
-
-        # running N, C, H, W case
-        # should produce the same results
-        x_shape2 = [n, c, h, w]
-        x_val2 = np.transpose(x_val, (0, 3, 1, 2))
-        y_out2, saved_mean2, var_ref2 = _reference_training(
-            x_val2, scale_val, bias_val, epsilon, "NCHW")
-
-        self.__assert_close(saved_mean, saved_mean2, "batch mean")
-        self.__assert_close(var_ref, var_ref2, "batch variance")
-
-        # transfer (N, C, H, W) back to (N, H, W, C)
-        y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
-        self.__assert_close(y_out, y_out2_trans, "batch output")
-        print 'python: NHWC, NCHW, forward checking passed'
-
-        # test backward now
-        # NHWC
-        self.y_grad = np.random.random_sample(x_shape).astype(np.float32)
-        y_grad = self.y_grad
-        # y_grad = np.ones(x_shape).astype(np.float32)
-        x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
-            x_val, y_grad, scale_val, saved_mean, var_ref, epsilon, "NHWC")
-
-        # NCHW
-        y_grad2 = np.transpose(y_grad, (0, 3, 1, 2))
-        # y_grad2 = np.ones(x_shape2).astype(np.float32)
-        x_grad_ref2, scale_grad_ref2, bias_grad_ref2 = _reference_grad(
-            x_val2, y_grad2, scale_val, saved_mean2, var_ref2, epsilon, "NCHW")
-
-        self.__assert_close(scale_grad_ref, scale_grad_ref2, "scale gradient")
-        self.__assert_close(bias_grad_ref, bias_grad_ref2, "bias gradient")
-
-        x_grad_transpose = np.transpose(x_grad_ref2, (0, 2, 3, 1))
-        self.__assert_close(x_grad_ref, x_grad_transpose, "x gradient")
-        print 'python: NHWC, NCHW, backward checking passed'
-
     def test_forward_backward(self):
         def test_with_place(place, data_layout, shape):
             # attr
             epsilon = 0.00001
             momentum = 0.9
-
-            if len(shape) == 2:
-                x_shape = shape
-                c = shape[1]
+            if data_layout == "NCHW":
+                n, c, h, w = shape[0], shape[1], shape[2], shape[3]
             else:
-                # n, h, w, c = 2, 3, 4, 2
                 n, h, w, c = shape[0], shape[1], shape[2], shape[3]
-                if data_format == "NHWC":
-                    x_shape = [n, h, w, c]
-                elif data_format == "NCHW":
-                    x_shape = [n, c, h, w]
-                else:
-                    raise ValueError("Unknown data type.")
             scale_shape = [c]
 
-            x_val = np.random.random_sample(x_shape).astype(np.float32)
-            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
-
+            np.random.seed(123)
+            x = np.random.random_sample(shape).astype(np.float32)
+            scale = np.random.random_sample(scale_shape).astype(np.float32)
+            bias = np.random.random_sample(scale_shape).astype(np.float32)
             mean = np.zeros(scale_shape).astype(np.float32)
             variance = np.ones(scale_shape).astype(np.float32)
 
             # run forward
-            y_out, saved_mean, var_ref = _reference_training(
-                x_val, scale_val, bias_val, epsilon, data_format)
-
-            # update moving mean and variance
+            y, saved_mean, var_ref = _reference_training(x, scale, bias,
+                                                         epsilon, data_layout)
             mean_out = saved_mean * (1. - momentum) + momentum * mean
             variance_out = var_ref * (1. - momentum) + momentum * variance
             saved_variance = 1. / np.sqrt(var_ref + epsilon)
-
-            #  for gradient test
-            # y_grad = np.ones(x_shape).astype(np.float32)
-            y_grad = np.zeros(x_shape).astype(np.float32)
-            if len(y_grad.shape) == 2:
-                y_grad[0, 0] = 1.
-            else:
-                y_grad[0, 0, 0, 0] = 1.
-            # y_grad = np.random.random_sample(x_shape).astype(np.float32)
-            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
-                x_val, y_grad, scale_val, saved_mean, var_ref, epsilon,
-                data_format)
-
-            scope = core.Scope()
-
-            # create input
-            x_tensor = create_or_get_tensor(scope, "x_val", x_val, place)
-            scale_tensor = create_or_get_tensor(scope, "scale_val", scale_val,
-                                                place)
-            bias_tensor = create_or_get_tensor(scope, "bias_val", bias_val,
-                                               place)
-            mean_tensor = create_or_get_tensor(scope, "mean", mean, place)
-            variance_tensor = create_or_get_tensor(scope, "variance", variance,
-                                                   place)
-
-            # create output
-            y_tensor = create_or_get_tensor(scope, "y_out", None, place)
-            saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None,
-                                                     place)
-            saved_variance_tensor = create_or_get_tensor(
-                scope, "saved_variance", None, place)
-            mean_out_tensor = mean_tensor
-            variance_out_tensor = variance_tensor
-
-            batch_norm_op = Operator(
-                "batch_norm",
-                # inputs
-                X="x_val",
-                Scale="scale_val",
-                Bias="bias_val",
-                Mean="mean",
-                Variance="variance",
-                # outputs
-                Y="y_out",
-                MeanOut="mean",
-                VarianceOut="variance",
-                SavedMean="saved_mean",
-                SavedVariance="saved_variance",
-                # attrs
-                is_test=False,
-                data_layout=data_layout,
-                momentum=momentum,
-                epsilon=epsilon)
-
-            batch_norm_op.run(scope, place)
-
-            # check forward result
-            self.__assert_close(y_tensor, y_out, "y_out")
-            self.__assert_close(saved_mean_tensor, saved_mean, "saved_mean")
-            self.__assert_close(saved_variance_tensor, saved_variance,
-                                "saved_variance")
-            self.__assert_close(mean_out_tensor, mean_out, "mean_out")
-            if isinstance(place, core.CUDAPlace):
-                atol = 5e-2
-            else:
-                atol = 1e-4
-            self.__assert_close(variance_out_tensor, variance_out,
-                                "variance_out", atol)
-            print "op test forward passed: ", str(place), data_layout
-
             # run backward
-            batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set())
-            set_output_grad(
-                scope,
-                ["y_out", "mean", "variance", "saved_mean", "saved_variance"],
-                place,
-                feed_dict={"y_out": y_grad})
-            batch_norm_op_grad.run(scope, place)
-
-            x_grad_tensor = create_or_get_tensor(scope,
-                                                 grad_var_name("x_val"), None,
-                                                 place)
-            scale_grad_tensor = create_or_get_tensor(scope,
-                                                     grad_var_name("scale_val"),
-                                                     None, place)
-            bias_grad_tensor = create_or_get_tensor(scope,
-                                                    grad_var_name("bias_val"),
-                                                    None, place)
+            y_grad = np.random.random_sample(shape).astype(np.float32)
+            x_grad, scale_grad, bias_grad = _reference_grad(
+                x, y_grad, scale, saved_mean, var_ref, epsilon, data_format)
+
+            var_dict = locals()
+            var_dict['y@GRAD'] = y_grad
+
+            var_names = [
+                'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
+                'saved_variance'
+            ]
+            ground_truth = {name: var_dict[name] for name in var_names}
+
+            program = fluid.Program()
+            with fluid.program_guard(program):
+                block = program.global_block()
+                for name in ground_truth:
+                    block.create_var(
+                        name=name,
+                        dtype='float32',
+                        shape=ground_truth[name].shape)
+                bn_op = block.append_op(
+                    type="batch_norm",
+                    inputs={
+                        "X": block.var('x'),
+                        "Scale": block.var('scale'),
+                        "Bias": block.var('bias'),
+                        "Mean": block.var('mean'),
+                        "Variance": block.var('variance')
+                    },
+                    outputs={
+                        "Y": block.var('y'),
+                        "MeanOut": block.var('mean'),  # share the same memory
+                        "VarianceOut":
+                        block.var('variance'),  # share the same memory
+                        "SavedMean": block.var('saved_mean'),
+                        "SavedVariance": block.var('saved_variance')
+                    },
+                    attrs={
+                        "momentum": momentum,
+                        "epsilon": epsilon,
+                        "is_test": False,
+                        "data_layout": data_layout
+                    })
+                block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
+
+                # generate backward op_desc
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    bn_op.desc, set(), [])
+                grad_op_desc = grad_op_desc_list[0]
+                new_op_desc = block.desc.append_op()
+                new_op_desc.copy_from(grad_op_desc)
+                for var_name in grad_op_desc.output_arg_names():
+                    block.desc.var(var_name.encode("ascii"))
+                grad_op_desc.infer_var_type(block.desc)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+                exe = fluid.Executor(place)
+                out = exe.run(
+                    program,
+                    feed={
+                        name: var_dict[name]
+                        for name in
+                        ['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD']
+                    },
+                    fetch_list=[
+                        'y', 'mean', 'variance', 'saved_mean', 'saved_variance',
+                        'x@GRAD', 'scale@GRAD', 'bias@GRAD'
+                    ])
+
+            self.__assert_close(y, out[0], "y")
+            self.__assert_close(mean_out, out[1], "mean")
+            self.__assert_close(variance_out, out[2], "variance", 1e-3)
+            self.__assert_close(saved_mean, out[3], "saved_mean")
+            self.__assert_close(saved_variance, out[4], "saved_variance", 1e-3)
+            self.__assert_close(x_grad, out[5], "x_grad")
+            self.__assert_close(scale_grad, out[6], "scale_grad")
+            self.__assert_close(bias_grad, out[7], "bias_grad")
 
-            # check gradient output
-            self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad")
-            self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad")
-            self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad")
-            print "op test backward passed: ", str(place), data_layout
+            print "op test forward passed: ", str(place), data_layout
 
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
@@ -537,7 +393,6 @@ def test_with_place(place, data_layout, shape):
         for place in places:
             for data_format in ["NCHW", "NHWC"]:
                 test_with_place(place, data_format, [2, 3, 4, 5])
-                test_with_place(place, data_format, [2, 3])
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_cond_op.py b/python/paddle/fluid/tests/unittests/test_cond_op.py
deleted file mode 100644
index 66fbae961a270..0000000000000
--- a/python/paddle/fluid/tests/unittests/test_cond_op.py
+++ /dev/null
@@ -1,128 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import paddle.fluid.core as core
-import unittest
-import numpy as np
-from paddle.fluid.op import Operator, CondOp
-
-
-class PySimpleCond(object):
-    '''
-    A simple implementation of dynamic if-else based on numpy
-    '''
-
-    def __init__(self):
-        array = [1] * 10
-        for i in range(1, 10, 2):
-            array[i] = 0
-        self.cond = np.array(array)
-        self.x = np.ones(shape=(10, 1)).astype("float32")
-
-    def forward(self):
-        self.index_t = np.where(self.cond == 1)
-        self.index_f = np.where(self.cond == 0)
-        y_t = self.x[self.index_t]
-        y_f = self.x[self.index_f]
-        y_t = y_t * 2.
-        y_f = y_f * (-2.)
-        output = np.zeros(shape=(10, 1))
-        output[self.index_t] = y_t
-        output[self.index_f] = y_f
-        return output
-
-
-class PySimpleCondTest(unittest.TestCase):
-    def setUp(self):
-        self.condnn = PySimpleCond()
-
-    def test_forward(self):
-        output = self.condnn.forward()
-
-
-def create_tensor(scope, name, shape, np_data):
-    tensor = scope.var(name).get_tensor()
-    tensor.set_dims(shape)
-    tensor.set(np_data, core.CPUPlace())
-    return tensor
-
-
-class TestCondOp(unittest.TestCase):
-    '''
-    Test CondOp
-
-    equation:
-        cond = [True, False, True, False, ...]
-        y[index_t] = x[index_t] * 2.
-        y[index_f] = x[index_f] * -2.
-    outputs:
-        y
-    '''
-
-    def setUp(self):
-        self.py_cond = PySimpleCond()
-
-    def forward(self):
-        self.scope = core.Scope()
-        self.create_global_variables()
-        self.create_cond_op()
-        self.create_sub_net()
-        self.condop.run(self.scope, core.CPUPlace())
-        return np.array(self.scope.find_var("Out").get_tensor())
-
-    def create_global_variables(self):
-        x_np_data = self.py_cond.x
-        create_tensor(self.scope, "X", [10, 1], x_np_data)
-        cond_np_data = self.py_cond.cond.astype("int32")
-        create_tensor(self.scope, "cond", [10, 1], cond_np_data)
-        self.scope.var("SubScopes")
-        self.scope.var("IndexTensors")
-        self.scope.var("Out")
-
-    def create_cond_op(self):
-        self.condop = CondOp(
-            Cond="cond",
-            Xs=["X"],
-            Outs=["Out"],
-            SubScopes="SubScopes",
-            IndexTensors="IndexTensors")
-
-    def create_sub_net(self):
-        truenet = core.Net.create()
-        scale_op_t = Operator("scale", X='X', Out='Out', scale=2.)
-        truenet.append_op(scale_op_t)
-        truenet.complete_add_op(True)
-        self.condop.set_truenet(truenet)
-
-        falsenet = core.Net.create()
-        scale_op_t = Operator("scale", X='X', Out='Out', scale=-2.)
-        falsenet.append_op(scale_op_t)
-        falsenet.complete_add_op(True)
-        self.condop.set_falsenet(falsenet)
-
-    def test_forward(self):
-        print 'test cond op forward'
-        pd_output = self.forward()
-        py_output = self.py_cond.forward()
-        print 'pd_output', pd_output
-        print
-        print 'py_output', py_output
-        self.assertEqual(pd_output.shape, py_output.shape)
-        print 'test passed'
-        return 0
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
index 8c67e45b7fc99..69365db4d104a 100644
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
@@ -15,10 +15,8 @@
 import numpy as np
 
 from operator import mul
-from op_test import OpTest
 import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from paddle.fluid.framework import grad_var_name
+import paddle.fluid as fluid
 
 np.random.random(123)
 
@@ -70,161 +68,93 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
     return grad_x, d_scale, d_bias
 
 
-def get_backward_op(scope, op, no_grad_set):
-    backward_op = core.Operator.backward(op, no_grad_set)
-    for input in backward_op.input_vars():
-        var = scope.var(input)
-        var.get_tensor()
-    for output in backward_op.output_vars():
-        var = scope.var(output)
-        var.get_tensor()
-    return backward_op
-
-
-def create_or_get_tensor(scope, var_name, var, place):
-    tensor = scope.var(var_name).get_tensor()
-    if var is not None:
-        assert isinstance(var, np.ndarray)
-        tensor.set_lod([[]])
-        tensor.set_dims(var.shape)
-        tensor.set(var, place)
-    return tensor
-
-
-def set_output_grad(scope, outputs, place, feed_dict=None):
-    def __set_tensor__(name, data=None):
-        out_tensor = scope.find_var(name).get_tensor()
-        grad_tensor = scope.var(grad_var_name(name)).get_tensor()
-        out_dtype = out_tensor.dtype()
-        if data is None:
-            if out_dtype == core.VarDesc.VarType.FP64:
-                data = np.ones(out_tensor.shape(), dtype=np.float64)
-            elif out_dtype == core.VarDesc.VarType.FP32:
-                data = np.ones(out_tensor.shape(), dtype=np.float32)
-            else:
-                raise ValueError("Not supported data type " + str(out_dtype))
-        grad_tensor.set(data, place)
-
-    for output in outputs:
-        data = None
-        if output in feed_dict:
-            data = feed_dict[output]
-        __set_tensor__(output, data)
-
-
-class TestLayerNormdOp(OpTest):
+class TestLayerNormdOp(unittest.TestCase):
     def __assert_close(self, tensor, np_array, msg, atol=1e-4):
         self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
 
-    def __assert_grad_close(self,
-                            tensor,
-                            np_array,
-                            name,
-                            place,
-                            max_relative_error=0.02):
-        a = np.array(tensor)
-        b = np_array
-        abs_a = np.abs(a)
-        abs_a[abs_a < 1e-5] = 1
-
-        diff_mat = np.abs(a - b) / abs_a
-        max_diff = np.max(diff_mat)
-
-        def err_msg():
-            offset = np.argmax(diff_mat > max_relative_error)
-            return ("%s Variable %s max gradient diff %f over limit %f, "
-                    "the first error element is %d, %f, %f") % (
-                        "Gradient Check On %s" % str(place), name, max_diff,
-                        max_relative_error, offset, a.flatten()[offset],
-                        b.flatten()[offset])
-
-        self.assertLessEqual(max_diff, max_relative_error, err_msg())
-
     def check_forward_backward(self, shape, begin_norm_axis):
-        def test_with_place(place, shape, begin_norm_axis=1):
-            # setUp
-            assert begin_norm_axis > 0 and begin_norm_axis < len(
-                shape), 'begin_norm_axis must be between 0 and len(shape)-1.'
+        def test_with_place(place, shape, begin_norm_axis):
             # attr
             epsilon = 0.00001
             x_shape = shape
             D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
             scale_shape = [D]
 
-            x_val = np.random.random_sample(x_shape).astype(np.float32)
-            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+            np.random.seed(123)
+            x = np.random.random_sample(x_shape).astype(np.float32)
+            scale = np.random.random_sample(scale_shape).astype(np.float32)
+            bias = np.random.random_sample(scale_shape).astype(np.float32)
             y_grad = np.random.random_sample(x_shape).astype(np.float32)
 
-            # run forward
-            y_out, saved_mean, var_ref = _reference_layer_norm_naive(
-                x_val, scale_val, bias_val, epsilon, begin_norm_axis)
-            naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref}
-
-            # get gradient
-            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad(
-                x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis)
-            naive_grad = {
-                "X": x_grad_ref,
-                "Scale": scale_grad_ref,
-                "Bias": bias_grad_ref
-            }
-
-            scope = core.Scope()
-
-            # create input
-            input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val}
-            for i_name in input_map:
-                create_or_get_tensor(scope, i_name, input_map[i_name], place)
-
-            # create output
-            output_map = {"Y": None, "Mean": None, "Variance": None}
-            output_tensor = {}
-            for o_name in output_map:
-                output_tensor[o_name] = create_or_get_tensor(
-                    scope, o_name, output_map[o_name], place)
-
-            layer_norm_op = Operator(
-                "layer_norm",
-                # inputs
-                X="X",
-                Scale="Scale",
-                Bias="Bias",
-                # outputs
-                Y="Y",
-                Mean="Mean",
-                Variance="Variance",
-                # attrs
-                epsilon=epsilon,
-                begin_norm_axis=begin_norm_axis)
-
-            layer_norm_op.run(scope, place)
-
-            # check forward result
-            atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4
-            for o_tensor in output_tensor:
-                self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor],
-                                    o_tensor, atol)
-
-            # run backward
-            layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set())
-            set_output_grad(
-                scope, ["Y", "Mean", "Variance"],
-                place,
-                feed_dict={"Y": y_grad})
-            layer_norm_op_grad.run(scope, place)
-
-            # get output
-            grad_tensor = {}
-            for o_name in naive_grad:
-                grad_tensor[o_name] = x_ = create_or_get_tensor(
-                    scope, grad_var_name(o_name), None, place)
-
-            # check gradient output
-            for o_grad in naive_grad:
-                self.__assert_grad_close(grad_tensor[o_grad],
-                                         naive_grad[o_grad], o_grad + "@GRAD",
-                                         place)
+            # reference forward & backward
+            y, mean, variance = _reference_layer_norm_naive(
+                x, scale, bias, epsilon, begin_norm_axis)
+            x_grad, scale_grad, bias_grad = _reference_layer_norm_grad(
+                x, y_grad, scale, mean, variance, begin_norm_axis)
+
+            var_dict = locals()
+            var_dict['y@GRAD'] = y_grad
+            var_names = [
+                'x', 'scale', 'bias', 'mean', 'variance', 'y', 'y@GRAD'
+            ]
+            ground_truth = {name: var_dict[name] for name in var_names}
+
+            program = fluid.Program()
+            with fluid.program_guard(program):
+                block = program.global_block()
+                for name in ground_truth:
+                    block.create_var(
+                        name=name,
+                        dtype='float32',
+                        shape=ground_truth[name].shape)
+                layer_norm_op = block.append_op(
+                    type="layer_norm",
+                    inputs={
+                        "X": block.var('x'),
+                        "Scale": block.var('scale'),
+                        "Bias": block.var('bias'),
+                    },
+                    outputs={
+                        "Y": block.var('y'),
+                        "Mean": block.var('mean'),  # share the same memory
+                        "Variance":
+                        block.var('variance'),  # share the same memory
+                    },
+                    attrs={
+                        "epsilon": epsilon,
+                        "begin_norm_axis": begin_norm_axis
+                    })
+
+                # generate backward op_desc
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    layer_norm_op.desc, set(), [])
+                grad_op_desc = grad_op_desc_list[0]
+                new_op_desc = block.desc.append_op()
+                new_op_desc.copy_from(grad_op_desc)
+                for var_name in grad_op_desc.output_arg_names():
+                    block.desc.var(var_name.encode("ascii"))
+                grad_op_desc.infer_var_type(block.desc)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
+
+                exe = fluid.Executor(place)
+                out = exe.run(program,
+                              feed={
+                                  name: var_dict[name]
+                                  for name in ['x', 'scale', 'bias', 'y@GRAD']
+                              },
+                              fetch_list=[
+                                  'y', 'mean', 'variance', 'x@GRAD',
+                                  'scale@GRAD', 'bias@GRAD'
+                              ])
+                self.__assert_close(y, out[0], "y")
+                self.__assert_close(mean, out[1], "mean")
+                self.__assert_close(variance, out[2], "variance", 1e-3)
+                self.__assert_close(x_grad, out[3], "x_grad")
+                self.__assert_close(scale_grad, out[4], "scale_grad", 1e-3)
+                self.__assert_close(bias_grad, out[5], "bias_grad")
 
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
@@ -237,15 +167,6 @@ def test_check_forward_backward_with_scale_and_bias(self):
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
         self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
 
-    def test_check_forward_backward_with_scale(self):
-        pass  # TODO(zcd)
-
-    def test_check_forward_backward_with_bias(self):
-        pass  # TODO(zcd)
-
-    def test_check_forward_backward(self):
-        pass  # TODO(zcd)
-
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 2179826d81f71..f88a6f1ce6e95 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -32,7 +32,6 @@ def test_fit_a_line(self):
             cost = layers.square_error_cost(input=y_predict, label=y)
             avg_cost = layers.mean(cost)
             self.assertIsNotNone(avg_cost)
-            program.append_backward(avg_cost)
 
         print(str(program))
 
@@ -94,8 +93,6 @@ def test_recognize_digits_conv(self):
             cost = layers.cross_entropy(input=predict, label=label)
             avg_cost = layers.mean(cost)
 
-            program.append_backward(avg_cost)
-
         print(str(program))
 
     def test_word_embedding(self):
diff --git a/python/paddle/fluid/tests/unittests/test_multiple_reader.py b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
similarity index 91%
rename from python/paddle/fluid/tests/unittests/test_multiple_reader.py
rename to python/paddle/fluid/tests/unittests/test_multi_file_reader.py
index a60a5d6c4af2b..5dc41e54d6158 100644
--- a/python/paddle/fluid/tests/unittests/test_multiple_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
@@ -61,8 +61,12 @@ def main(self, thread_num):
             exe.run(fluid.default_startup_program())
 
             batch_count = 0
-            while not data_files.eof():
-                img_val, = exe.run(fetch_list=[img])
+            while True:
+                try:
+                    img_val, = exe.run(fetch_list=[img])
+                except fluid.core.EnforceNotMet as ex:
+                    self.assertIn("There is no next data.", ex.message)
+                    break
                 batch_count += 1
                 self.assertLessEqual(img_val.shape[0], self.batch_size)
             data_files.reset()
diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
index 0b7a29075939a..1471843ded7a4 100644
--- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
@@ -44,7 +44,7 @@ def test_main(self):
                 shapes=[(-1, 784), (-1, 1)],
                 lod_levels=[0, 0],
                 dtypes=['float32', 'int64'])
-            data_file = fluid.layers.create_multi_pass_reader(
+            data_file = fluid.layers.io.multi_pass(
                 reader=data_file, pass_num=self.pass_num)
             img, label = fluid.layers.read_file(data_file)
 
@@ -57,8 +57,12 @@ def test_main(self):
             exe.run(fluid.default_startup_program())
 
             batch_count = 0
-            while not data_file.eof():
-                img_val, = exe.run(fetch_list=[img])
+            while True:
+                try:
+                    img_val, = exe.run(fetch_list=[img])
+                except fluid.core.EnforceNotMet as ex:
+                    self.assertIn("There is no next data.", ex.message)
+                    break
                 batch_count += 1
                 self.assertLessEqual(img_val.shape[0], self.batch_size)
             data_file.reset()
diff --git a/python/paddle/fluid/tests/unittests/test_net.py b/python/paddle/fluid/tests/unittests/test_net.py
deleted file mode 100644
index ae1699d647d7c..0000000000000
--- a/python/paddle/fluid/tests/unittests/test_net.py
+++ /dev/null
@@ -1,53 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-import unittest
-
-
-def fc(X, W, Y):
-    ret_v = core.Net.create()
-
-    ret_v.append_op(Operator("mul", X="X", Y="W", Out="pre_activation"))
-    ret_v.append_op(Operator("sigmoid", X="pre_activation", Out=Y))
-    ret_v.complete_add_op(True)
-    return ret_v
-
-
-class TestNet(unittest.TestCase):
-    def test_net_all(self):
-        net = core.Net.create()
-        op1 = Operator("sum", X=["X", "Y"], Out="Out")
-        net.append_op(op1)
-
-        net2 = core.Net.create()
-        net2.append_op(fc(X="X", W="w", Y="fc.out"))
-        net2.complete_add_op(True)
-        net.append_op(net2)
-        net.complete_add_op(True)
-
-        expected = '''
-Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}.
-    Op(sum), inputs:{X[X, Y]}, outputs:{Out[Out]}.
-    Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
-        Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
-            Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
-            Op(sigmoid), inputs:{X[pre_activation]}, outputs:{Out[fc.out]}.
-'''
-        self.assertEqual(expected, "\n" + str(net))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index 8401716db88ef..95845ea4de54a 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -26,11 +26,14 @@ def simple_fc_net(use_feed):
         img = fluid.layers.data(name='image', shape=[784], dtype='float32')
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
     else:
-        reader = fluid.layers.open_recordio_file(
-            filename='./mnist.recordio',
+        reader = fluid.layers.open_files(
+            filenames=['./mnist.recordio'],
             shapes=[[-1, 784], [-1, 1]],
             lod_levels=[0, 0],
-            dtypes=['float32', 'int64'])
+            dtypes=['float32', 'int64'],
+            thread_num=1,
+            for_parallel=True)
+        reader = fluid.layers.io.double_buffer(reader)
         img, label = fluid.layers.read_file(reader)
     hidden = img
     for _ in xrange(4):
@@ -51,11 +54,14 @@ def fc_with_batchnorm(use_feed):
         img = fluid.layers.data(name='image', shape=[784], dtype='float32')
         label = fluid.layers.data(name='label', shape=[1], dtype='int64')
     else:
-        reader = fluid.layers.open_recordio_file(
-            filename='./mnist.recordio',
+        reader = fluid.layers.open_files(
+            filenames=['mnist.recordio'],
             shapes=[[-1, 784], [-1, 1]],
             lod_levels=[0, 0],
-            dtypes=['float32', 'int64'])
+            dtypes=['float32', 'int64'],
+            thread_num=1,
+            for_parallel=True)
+        reader = fluid.layers.io.double_buffer(reader)
         img, label = fluid.layers.read_file(reader)
 
     hidden = img
@@ -467,7 +473,7 @@ def test_parallel_testing(self):
             loss = simple_fc_net(True)
             test_program = main.clone(for_test=True)
 
-            opt = fluid.optimizer.SGD(learning_rate=0.0001)
+            opt = fluid.optimizer.SGD(learning_rate=0.001)
             opt.minimize(loss)
 
             batch_size = 32
@@ -494,4 +500,8 @@ def test_parallel_testing(self):
 
                 train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
                 train_loss = numpy.array(train_loss)
-                self.assertTrue(numpy.allclose(train_loss, test_loss))
+                self.assertTrue(
+                    numpy.allclose(
+                        train_loss, test_loss, atol=1e-8),
+                    "Train loss: " + str(train_loss) + "\n Test loss:" +
+                    str(test_loss))
diff --git a/python/paddle/fluid/tests/unittests/test_program.py b/python/paddle/fluid/tests/unittests/test_program.py
index 87a2195f0d5c7..c51a482393306 100644
--- a/python/paddle/fluid/tests/unittests/test_program.py
+++ b/python/paddle/fluid/tests/unittests/test_program.py
@@ -87,57 +87,6 @@ def test_parse_program_from_string(self):
         print(prog)
         print(prog_restored)
 
-    def test_append_backward(self):
-        prog = Program()
-        block = prog.global_block()
-
-        mul_x = block.create_var(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        mul_op = block.append_op(
-            type="mul",
-            inputs={"X": [mul_x],
-                    "Y": mul_y},
-            outputs={"Out": [mul_out]},
-            attrs={"x_num_col_dims": 1})
-
-        add_y = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="add.y")
-        add_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="add.out")
-        add_op = block.append_op(
-            type="elementwise_add",
-            inputs={"X": mul_out,
-                    "Y": add_y},
-            outputs={"Out": add_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": add_out}, outputs={"Out": mean_out})
-
-        self.assertEqual(mul_op.idx, 0)
-        self.assertEqual(add_op.idx, 1)
-        param_to_grad = prog.append_backward(mean_out, set())
-
-        for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out",
-                         "mean.out"):
-            self.assertEqual(param_to_grad[var_name][0],
-                             grad_var_name(var_name))
-            self.assertEqual(param_to_grad[var_name][1], 0)
-
-        expect_ops = [
-            "mul", "elementwise_add", "mean", "fill_constant", "mean_grad",
-            "elementwise_add_grad", "mul_grad"
-        ]
-        actual_ops = []
-        for op in block.ops:
-            actual_ops.append(op.type)
-        self.assertEqual(actual_ops, expect_ops)
-
     def test_program_clone_with_parameter(self):
         main_program = Program()
         startup_program = Program()
diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
index f98a8bbc68a43..3f9059fb5b31c 100644
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
@@ -201,24 +201,6 @@ def test_remove_op(self):
         op1.set_type("test")
         op2.set_type("test")
 
-        var0 = block.var("var0")
-        var1 = block.var("var1")
-        var2 = block.var("var2")
-        var3 = block.var("var3")
-        var4 = block.var("var4")
-        var5 = block.var("var5")
-
-        op0.set_input("X", ["var0"])
-        op0.set_output("Y", ["var0"])
-        op1.set_input("X", ["var1", "var2"])
-        op1.set_output("Y", ["var3", "var4"])
-        op2.set_input("X", ["var1"])
-        op2.set_output("Y", ["var4", "var5"])
-
-        program.sync_with_cpp()
-
-        # remove op1, its input var2 and output var3 will be removed at the same time,
-        # but its input var1 and output var4 will not be removed since they are used for op2.
         block.remove_op(1, 2)
         program.sync_with_cpp()
 
@@ -226,8 +208,6 @@ def test_remove_op(self):
         for idx in xrange(0, block.op_size()):
             all_ops.append(block.op(idx))
         self.assertEqual(all_ops, [op0, op2])
-        all_vars = block.all_vars()
-        self.assertEqual(set(all_vars), {var0, var1, var4, var5})
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
index 24a0074d9b962..7c8e7f634fdd3 100644
--- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
@@ -65,8 +65,13 @@ def test_main(self, decorator_callback=None):
 
             # train a pass
             batch_id = 0
-            while not data_file.eof():
-                tmp, = exe.run(fetch_list=[avg_loss])
+            while True:
+                try:
+                    tmp, = exe.run(fetch_list=[avg_loss])
+                except fluid.core.EnforceNotMet as ex:
+                    self.assertIn("There is no next data.", ex.message)
+                    break
+
                 avg_loss_np.append(tmp)
                 batch_id += 1
             data_file.reset()
@@ -74,8 +79,8 @@ def test_main(self, decorator_callback=None):
             self.assertLess(avg_loss_np[-1], avg_loss_np[0])
 
     def test_shuffle_reader(self):
-        self.test_main(decorator_callback=lambda reader: fluid.layers.create_shuffle_reader(reader, buffer_size=200))
+        self.test_main(decorator_callback=lambda reader: fluid.layers.io.shuffle(reader, buffer_size=200))
 
     def test_double_buffer_reader(self):
-        self.test_main(decorator_callback=lambda reader: fluid.layers.create_double_buffer_reader(reader,
+        self.test_main(decorator_callback=lambda reader: fluid.layers.io.double_buffer(reader,
                                                                                                   place='cuda:0' if fluid.core.is_compiled_with_cuda() else 'cpu'))