diff --git a/.github/ISSUE_TEMPLATE/---document-issue-.md b/.github/ISSUE_TEMPLATE/---document-issue-.md index 7c464ac584bc87..ffc2fcd7817b64 100644 --- a/.github/ISSUE_TEMPLATE/---document-issue-.md +++ b/.github/ISSUE_TEMPLATE/---document-issue-.md @@ -56,4 +56,4 @@ For example: no sample code; The sample code is not helpful; The sample code not For example:Chinese API in this doc is inconsistent with English API, including params, description, sample code, formula, etc. #### Other -For example: The doc link is broken; The doc page is missing; Dead link in docs. \ No newline at end of file +For example: The doc link is broken; The doc page is missing; Dead link in docs. diff --git a/CMakeLists.txt b/CMakeLists.txt index 765d8fc1578565..30f9e3a3dcdd2c 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License -cmake_minimum_required(VERSION 3.10) +cmake_minimum_required(VERSION 3.15) +cmake_policy(VERSION 3.10) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) @@ -32,16 +33,19 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) +option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON +# to develop some acl related functionality on x86 +option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) +option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() -if (WITH_GPU AND WITH_ASCEND) +if (WITH_GPU AND WITH_ASCEND) message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") endif() -# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them. -if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15)) - message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. " - "You can use cmake 3.16 (recommended), 3.10, 3.11, 3.15 or 3.17. 
Please refer to the install document: https://cmake.org/install/") +if (WITH_GPU AND WITH_ROCM) + message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") endif() if(WITH_GPU AND NOT APPLE) @@ -61,6 +65,9 @@ if(WITH_MUSL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") endif() +if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") +endif() if(WIN32) option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) @@ -165,8 +172,6 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization and inference-lib generation" OFF) ################################ Internal Configurations ####################################### -option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) -option(WITH_RCCL "Compile PaddlePaddle with RCCL support" OFF) option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) @@ -179,12 +184,14 @@ option(WITH_XBYAK "Compile with xbyak support" ON) option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) +option(WITH_HETERPS "Compile with heterps" OFF}) option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) +option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) @@ -302,9 +309,9 @@ endif(WITH_ROCM) if (NOT WITH_ROCM AND WITH_RCCL) MESSAGE(WARNING - "Disable RCCL when compiling without GPU. Force WITH_RCCL=OFF.") - set(WITH_NCCL OFF CACHE STRING - "Disable RCCL when compiling without GPU" FORCE) + "Disable RCCL when compiling without ROCM. 
Force WITH_RCCL=OFF.") + set(WITH_RCCL OFF CACHE STRING + "Disable RCCL when compiling without ROCM" FORCE) endif() if(WITH_RCCL) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 9c1bd52e7fb7df..bf1352d4e11479 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -82,6 +82,10 @@ if(WITH_ASCEND) add_definitions(-DPADDLE_WITH_ASCEND) endif() +if(WITH_ASCEND_CL) + add_definitions(-DPADDLE_WITH_ASCEND_CL) +endif() + if(WITH_XPU) message(STATUS "Compile with XPU!") add_definitions(-DPADDLE_WITH_XPU) @@ -93,13 +97,18 @@ if(WITH_GPU) FIND_PACKAGE(CUDA REQUIRED) - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 7) - message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.1) + message(FATAL_ERROR "Paddle needs CUDA >= 10.1 to compile") endif() if(NOT CUDNN_FOUND) message(FATAL_ERROR "Paddle needs cudnn to compile") endif() + + if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + message(FATAL_ERROR "Paddle needs CUDNN >= 7.0 to compile") + endif() + if(CUPTI_FOUND) include_directories(${CUPTI_INCLUDE_DIR}) add_definitions(-DPADDLE_WITH_CUPTI) @@ -164,6 +173,9 @@ if(WITH_PSCORE) add_definitions(-DPADDLE_WITH_PSCORE) endif() +if(WITH_HETERPS) + add_definitions(-DPADDLE_WITH_HETERPS) +endif() if(WITH_GRPC) add_definitions(-DPADDLE_WITH_GRPC) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index c4d1384312e3c9..7f2addb02d36dd 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -6,15 +6,9 @@ endif() if (WITH_NV_JETSON) add_definitions(-DWITH_NV_JETSON) set(paddle_known_gpu_archs "53 62 72") - set(paddle_known_gpu_archs7 "53") - set(paddle_known_gpu_archs8 "53 62") - set(paddle_known_gpu_archs9 "53 62") set(paddle_known_gpu_archs10 "53 62 72") else() - set(paddle_known_gpu_archs "30 35 50 52 60 61 70") - set(paddle_known_gpu_archs7 "30 35 50 52") - set(paddle_known_gpu_archs8 "30 35 50 52 60 61") - set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70") + set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80") set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75") set(paddle_known_gpu_archs11 "52 60 61 70 75 80") endif() @@ -74,7 +68,7 @@ endfunction() # select_nvcc_arch_flags(out_variable) function(select_nvcc_arch_flags out_variable) # List of arch names - set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") + set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "All" "Manual") set(archs_name_default "Auto") list(APPEND archs_names "Auto") @@ -108,6 +102,8 @@ function(select_nvcc_arch_flags out_variable) set(cuda_arch_bin "70") elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") set(cuda_arch_bin "75") + elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere") + set(cuda_arch_bin "80") elseif(${CUDA_ARCH_NAME} STREQUAL "All") set(cuda_arch_bin ${paddle_known_gpu_archs}) elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") @@ -158,25 +154,7 @@ function(select_nvcc_arch_flags out_variable) endfunction() message(STATUS "CUDA detected: " ${CMAKE_CUDA_COMPILER_VERSION}) -if (${CMAKE_CUDA_COMPILER_VERSION} LESS 7.0) - set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 8.0) # CUDA 7.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 9.0) # CUDA 8.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") - set(CMAKE_CUDA_FLAGS 
"${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") - # CUDA 8 may complain that sm_20 is no longer supported. Suppress the - # warning for now. - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) # CUDA 9.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs9}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x +if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") @@ -206,14 +184,11 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}") message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}") -# Set C++11 support +# Set C++14 support set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. -if (NOT WIN32) # windows msvc2015 support c++11 natively. - # -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake. - set(CMAKE_CUDA_STANDARD 11) -endif(NOT WIN32) +set(CMAKE_CUDA_STANDARD 14) # (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflic with -w # So replace /W[1-4] with /W0 diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index d8d8f634e76b6b..c82847100abefa 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -94,7 +94,7 @@ macro(find_cudnn_version cudnn_header_file) "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") message(STATUS "Current cuDNN header is ${cudnn_header_file} " - "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ") + "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. ") endif() endif() endmacro() diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index bcf0c0a0646fc3..bddd2023b437b1 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -12,50 +12,69 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-INCLUDE(ExternalProject) - -SET(ASCEND_PROJECT "extern_ascend") -IF((NOT DEFINED ASCEND_VER) OR (NOT DEFINED ASCEND_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(ASCEND_VER "0.1.1" CACHE STRING "" FORCE) - SET(ASCEND_NAME "ascend" CACHE STRING "" FORCE) - SET(ASCEND_URL "http://paddle-ascend.bj.bcebos.com/ascend.tar.gz" CACHE STRING "" FORCE) -ENDIF() -MESSAGE(STATUS "ASCEND_NAME: ${ASCEND_NAME}, ASCEND_URL: ${ASCEND_URL}") -SET(ASCEND_SOURCE_DIR "${THIRD_PARTY_PATH}/ascend") -SET(ASCEND_DOWNLOAD_DIR "${ASCEND_SOURCE_DIR}/src/${ASCEND_PROJECT}") -SET(ASCEND_DST_DIR "ascend") -SET(ASCEND_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") -SET(ASCEND_INSTALL_DIR ${ASCEND_INSTALL_ROOT}/${ASCEND_DST_DIR}) -SET(ASCEND_ROOT ${ASCEND_INSTALL_DIR}) -SET(ASCEND_INC_DIR ${ASCEND_ROOT}/include) -SET(ASCEND_LIB_DIR ${ASCEND_ROOT}/lib) -SET(ASCEND_LIB ${ASCEND_LIB_DIR}/libge_runner.so) -SET(ASCEND_GRAPH_LIB ${ASCEND_LIB_DIR}/libgraph.so) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ASCEND_ROOT}/lib") - -INCLUDE_DIRECTORIES(${ASCEND_INC_DIR}) -FILE(WRITE ${ASCEND_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(ASCEND)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${ASCEND_NAME}/include ${ASCEND_NAME}/lib \n" - " DESTINATION ${ASCEND_DST_DIR})\n") -ExternalProject_Add( - ${ASCEND_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${ASCEND_SOURCE_DIR} - DOWNLOAD_DIR ${ASCEND_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${ASCEND_URL} -c -q -O ${ASCEND_NAME}.tar.gz - && tar zxvf ${ASCEND_NAME}.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ASCEND_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ASCEND_INSTALL_ROOT} -) -ADD_LIBRARY(ascend SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ascend PROPERTY IMPORTED_LOCATION ${ASCEND_LIB}) - -ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${ASCEND_GRAPH_LIB}) -ADD_DEPENDENCIES(ascend ascend_graph ${ASCEND_PROJECT}) +#NOTE: Logic is from +# https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt +if(DEFINED ENV{ASCEND_CUSTOM_PATH}) + set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH}) +else() + set(ASCEND_DIR /usr/local/Ascend) +endif() + +if(WITH_ASCEND) + set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) + set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) + set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) + set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64) + set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64) + set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64) + set(STATIC_ACL_LIB ${ASCEND_ACL_DIR}) + + set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR}) + set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR}) + set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64) + set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64) + set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR}) + + set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so) + set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so) + set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) + INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) + + if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h) + add_definitions(-DPADDLE_WITH_ASCEND_STRING) + endif() + + 
ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) + + ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib}) + + ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib}) + + add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl) +endif() + +if(WITH_ASCEND_CL) + set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + + set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so) + set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so) + set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + + message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}") + message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}") + INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR}) + + ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib}) + + ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib}) + add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler) + +endif() diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 0eb590c42d0cb7..2d72b6eb56deaa 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -39,9 +39,9 @@ set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/ ExternalProject_Add( extern_brpc ${EXTERNAL_PROJECT_LOG_ARGS} - # TODO(gongwb): change to de newst repo when they changed. + # TODO(gongwb): change to de newst repo when they changed GIT_REPOSITORY "https://github.com/wangjiawei04/brpc" - GIT_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47" + GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e" PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 5a755a816c332a..4619f9f7b7e34c 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -14,11 +14,11 @@ include(ExternalProject) -# update eigen to the commit id 4da2c6b1 on 03/19/2020 +# update eigen to the commit id f612df27 on 03/16/2021 set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3) set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3) set(EIGEN_REPOSITORY https://gitlab.com/libeigen/eigen.git) -set(EIGEN_TAG 4da2c6b1974827b1999bab652a3d4703e1992d26) +set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee) cache_third_party(extern_eigen3 REPOSITORY ${EIGEN_REPOSITORY} @@ -27,47 +27,13 @@ cache_third_party(extern_eigen3 if(WIN32) add_definitions(-DEIGEN_STRONG_INLINE=inline) - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst) - # For Windows - # which will cause a compilation error in Tensor:74: - # "can not open file 'unistd.h'" - # so use following patch to solve compilation error On Windows. - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Tensor native_src2) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/Tensor native_dst2) - # For VS2015 - # which will cause a compilation error in TensorBlock.h:1028: - # "syntax error" - # so use following patch to solve compilation error On Windows. 
- file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorBlock.h native_src3) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h native_dst3) - set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y && copy ${native_src2} ${native_dst2} /Y && copy ${native_src3} ${native_dst3} /Y) elseif(LINUX) - # For gxx=4.8, __GXX_ABI_VERSION is less than 1004 - # which will cause a compilation error in Geometry_SSE.h:38: - # "no matching function for call to 'pmul(Eigen::internal::Packet4f&, __m128)" - # refer to: https://gitlab.com/libeigen/eigen/-/blob/4da2c6b1974827b1999bab652a3d4703e1992d26/Eigen/src/Core/arch/SSE/PacketMath.h#L33-60 - # add -fabi-version=4 could avoid above error, but will cause "double free corruption" when compile with gcc8 - # so use following patch to solve compilation error with different version of gcc. - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Geometry_SSE.h native_src1) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Geometry/arch/Geometry_SSE.h native_dst1) - # The compiler fully support const expressions since c++14, - # but Eigen use some const expressions such as std::max and std::min, which are not supported in c++11 - # add patch to avoid compilation error in c++11 - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/MathFunctions.h native_src2) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/MathFunctions.h native_dst2) if(WITH_ROCM) # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC # which will cause compiler error of using __host__ funciont in __host__ __device__ - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src3) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst3) - # For HIPCC Eigen::internal::scalar_sum_op is not EIGEN_DEVICE_FUNC - # which will cause compiler error of using __host__ funciont in __host__ __device__ - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/BinaryFunctors.h native_src4) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/functors/BinaryFunctors.h native_dst4) - set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2} && cp ${native_src3} ${native_dst3} && cp ${native_src4} ${native_dst4}) - else() - set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2}) + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src) + file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst) + set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst}) endif() endif() @@ -82,7 +48,7 @@ ExternalProject_Add( PREFIX ${EIGEN_PREFIX_DIR} SOURCE_DIR ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" - PATCH_COMMAND ${EIGEN_PATCH_COMMAND} + PATCH_COMMAND ${EIGEN_PATCH_COMMAND} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index ea7af315e1a690..e8db13a694f557 100644 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -32,21 +32,39 @@ cache_third_party(extern_gloo TAG ${GLOO_TAG} DIR GLOO_SOURCE_DIR) -ExternalProject_Add( - extern_gloo - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${GLOO_DOWNLOAD_CMD}" - PREFIX "${GLOO_PREFIX_DIR}" - SOURCE_DIR "${GLOO_SOURCE_DIR}" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build - && cd ${GLOO_SOURCE_DIR}/build && cmake .. 
&& make - && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} - COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" -) + if(WITH_ASCEND OR WITH_ASCEND_CL) + ExternalProject_Add( + extern_gloo + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + ) +else() + ExternalProject_Add( + extern_gloo + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. && make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + ) +endif() ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake deleted file mode 100644 index 536e95c1dc2a4f..00000000000000 --- a/cmake/external/grpc.cmake +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -include (ExternalProject) - -SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) -SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) -SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." 
FORCE) -SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) - -include(ProcessorCount) -ProcessorCount(NUM_OF_PROCESSOR) - -IF(APPLE) - SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh) - SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install) -ELSE() - SET(GRPC_CFLAGS "-Wno-error -std=c11 ${CLFAGS}") - SET(GRPC_CXXFLAGS "-Wno-error -std=c++11 ${CXXFLAGS}") - SET(BUILD_CMD make CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS} HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin) - SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS}) -ENDIF() - -# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them -ExternalProject_Add( - extern_grpc - DEPENDS protobuf zlib - # NOTE(wuyi): - # this package is generated by following steps: - # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git - # 2. git submodule update --init - # 3. keep only zlib, cares, protobuf, boringssl under "third_party", - # checkout and clean other dirs under third_party - # 4. remove .git, and package the directory. - URL http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x_paddle.tar.gz - URL_MD5 f5442d137ddccee252e194b1bc90f98c - PREFIX ${GRPC_SOURCES_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - # NOTE(yuyang18): - # Disable -Werror, otherwise the compile will fail in MacOS. - # It seems that we cannot configure that by make command. - # Just dry run make command and remove `-Werror`, then use a shell to run make commands - BUILD_COMMAND ${BUILD_CMD} - INSTALL_COMMAND ${GRPC_INSTALL_CMD} -) - -ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") - -ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc++.a") -ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgpr.a") - -ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a") - -include_directories(${GRPC_INCLUDE_DIR}) -ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 884219d8dd81f3..fb1d4d9d56dcc6 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." 
FORCE) SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG 72efa005effb49595933e033cc732f215ef0445a) +SET(MKLDNN_TAG f58682cd8bd0615f41d879f8afc8f1511ab42d24) # Introduce variables: # * CMAKE_INSTALL_LIBDIR diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 40a27f506f3077..c108c05368c915 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -198,8 +198,16 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() +if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) +elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) +else() SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) +endif() cache_third_party(${TARGET_NAME} REPOSITORY ${PROTOBUF_REPOSITORY} @@ -234,7 +242,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -SET(PROTOBUF_VERSION 3.1.0) +if(WITH_ASCEND OR WITH_ASCEND_CL) + SET(PROTOBUF_VERSION 3.8.0) +else() + SET(PROTOBUF_VERSION 3.1.0) +endif() IF(NOT PROTOBUF_FOUND) build_protobuf(extern_protobuf FALSE) diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index 205e8d26d93ca1..f9cb3a9075a821 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -16,7 +16,11 @@ INCLUDE(ExternalProject) SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) -SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) +if(WITH_ASCEND OR WITH_ASCEND_CL) + SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git) +else() + SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) +endif() SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040) cache_third_party(extern_threadpool diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index b0ef575f643238..100b9153394690 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -14,11 +14,17 @@ INCLUDE(ExternalProject) +IF(WITH_ROCM) + add_definitions(-DWARPCTC_WITH_HIP) +ENDIF() + SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) +# in case of low internet speed +#set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git) set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) -set(WARPCTC_TAG cd828e5b6c3b953b82af73f7f44cddc393a20efa) +set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) @@ -37,38 +43,77 @@ cache_third_party(extern_warpctc TAG ${WARPCTC_TAG} DIR WARPCTC_SOURCE_DIR) -ExternalProject_Add( - extern_warpctc - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${WARPCTC_DOWNLOAD_CMD}" - PREFIX ${WARPCTC_PREFIX_DIR} - SOURCE_DIR ${WARPCTC_SOURCE_DIR} - #UPDATE_COMMAND "" - PATCH_COMMAND "" - BUILD_ALWAYS 1 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=$ - -DCMAKE_C_FLAGS_DEBUG=$ - -DCMAKE_C_FLAGS_RELEASE=$ - -DCMAKE_CXX_FLAGS=$ - -DCMAKE_CXX_FLAGS_RELEASE=$ - -DCMAKE_CXX_FLAGS_DEBUG=$ - 
-DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} - -DWITH_GPU=${WITH_GPU} - -DWITH_OMP=${USE_OMP} - -DWITH_TORCH=OFF - -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - -DBUILD_SHARED=ON - -DBUILD_TESTS=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} -) +if(WITH_ASCEND OR WITH_ASCEND_CL) + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${WARPCTC_DOWNLOAD_CMD}" + PREFIX ${WARPCTC_PREFIX_DIR} + SOURCE_DIR ${WARPCTC_SOURCE_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + ) +else() + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${WARPCTC_DOWNLOAD_CMD}" + PREFIX ${WARPCTC_PREFIX_DIR} + SOURCE_DIR ${WARPCTC_SOURCE_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=$ + -DCMAKE_C_FLAGS_DEBUG=$ + -DCMAKE_C_FLAGS_RELEASE=$ + -DCMAKE_CXX_FLAGS=$ + -DCMAKE_CXX_FLAGS_RELEASE=$ + -DCMAKE_CXX_FLAGS_DEBUG=$ + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + ) +endif() + + IF(WIN32) SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index b5a3f0154745b9..f846623602ed79 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -13,7 +13,7 @@ if(NOT XPU_SDK_ROOT) elseif(WITH_SUNWAY) SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) else() - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_02_27.tar.gz" CACHE STRING "" FORCE) + SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_04_09.tar.gz" CACHE STRING "" FORCE) endif() SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") diff --git a/cmake/flags.cmake b/cmake/flags.cmake index e110524dd1abb8..a2ddad557c2956 100644 --- 
a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -4,10 +4,10 @@ include(CheckCCompilerFlag) include(CheckCXXSymbolExists) include(CheckTypeSize) -function(CheckCompilerCXX11Flag) +function(CheckCompilerCXX14Flag) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) - message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4) + message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.") elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2) message(WARNING "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2") endif() @@ -20,23 +20,15 @@ function(CheckCompilerCXX11Flag) message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.") endif() else() - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) - message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4) + message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.4 required.") endif() endif() endif() endfunction() -CheckCompilerCXX11Flag() -if (WITH_GPU) - if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - endif() -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -endif() +CheckCompilerCXX14Flag() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") # safe_set_flag # # Set a compile flag only if compiler is support diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 0343ff3cc292d9..7dac91e531e4cf 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -11,6 +11,7 @@ function(op_library TARGET) set(cu_cc_srcs) set(hip_cc_srcs) set(xpu_cc_srcs) + set(npu_cc_srcs) set(cudnn_cu_cc_srcs) set(miopen_cu_cc_srcs) set(cudnn_cu_srcs) @@ -20,6 +21,9 @@ function(op_library TARGET) set(mkldnn_cc_srcs) set(MKLDNN_FILE) set(op_common_deps operator op_registry math_function layer common_infer_shape_functions) + if (WITH_ASCEND_CL) + set(op_common_deps ${op_common_deps} npu_op_runner) + endif() # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build. set(options UNITY) set(oneValueArgs "") @@ -85,6 +89,12 @@ function(op_library TARGET) list(APPEND xpu_cc_srcs ${XPU_FILE}.cc) endif() endif() + if(WITH_ASCEND_CL) + string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc) + list(APPEND npu_cc_srcs ${NPU_FILE}.cc) + endif() + endif() else() foreach(src ${op_library_SRCS}) if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$") @@ -107,6 +117,8 @@ function(op_library TARGET) list(APPEND cu_cc_srcs ${src}) elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$") list(APPEND xpu_cc_srcs ${src}) + elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$") + list(APPEND npu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) else() @@ -176,7 +188,7 @@ function(op_library TARGET) # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) # Combine the cc source files. - compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}) + compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs}) if(TARGET ${UNITY_TARGET}) # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. 
target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources}) @@ -187,7 +199,7 @@ function(op_library TARGET) # Add alias library to handle dependencies. add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() - cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS} + cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) endif() endif() @@ -207,6 +219,7 @@ function(op_library TARGET) # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. # Note that it's enough to just adding one operator to pybind in a *_op.cc file. # And for detail pybind information, please see generated paddle/pybind/pybind.h. + set(ORIGINAL_TARGET ${TARGET}) file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") # [ \t\r\n]* is used for blank characters @@ -239,8 +252,9 @@ function(op_library TARGET) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len) + list(LENGTH npu_cc_srcs npu_cc_srcs_len) if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND - ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0) + ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) endif() @@ -280,6 +294,26 @@ function(op_library TARGET) if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") endif() + + if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0) + file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT) + # It is different from the logic above, becareful + string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\(.*" multi_npu_register "${TARGET_NPU_CONTENT}") + # [ \t\r\n]* is used for blank characters + string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_npu_register "${multi_npu_register}") + + if (one_npu_register STREQUAL "") + string(REPLACE "_op" "" NPU_TARGET "${TARGET}") + else () + string(REPLACE "REGISTER_OP_NPU_KERNEL(" "" NPU_TARGET "${one_npu_register}") + string(REPLACE "," "" NPU_TARGET "${NPU_TARGET}") + # [ \t\r\n]+ is used for blank characters. + # Here we use '+' instead of '*' since it is a REPLACE operation. 
+ string(REGEX REPLACE "[ \t\r\n]+" "" NPU_TARGET "${NPU_TARGET}") + endif() + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${NPU_TARGET}, NPU);\n") + endif() + # pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) # Append first implemented MKLDNN activation operator @@ -330,6 +364,7 @@ function(register_operators) file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") string(REPLACE "_mkldnn" "" OPS "${OPS}") string(REPLACE "_xpu" "" OPS "${OPS}") + string(REPLACE "_npu" "" OPS "${OPS}") string(REPLACE ".cc" "" OPS "${OPS}") list(REMOVE_DUPLICATES OPS) list(LENGTH register_operators_DEPS register_operators_DEPS_len) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 6488d29afc5f7f..81fa7d0dfa98f0 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -274,10 +274,15 @@ if(WITH_BOX_PS) list(APPEND third_party_deps extern_box_ps) endif(WITH_BOX_PS) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) include(external/ascend) - list(APPEND third_party_deps extern_ascend) -endif (WITH_ASCEND) + if(WITH_ASCEND) + list(APPEND third_party_deps extern_ascend) + endif() + if(WITH_ASCEND_CL) + list(APPEND third_party_deps extern_ascend_cl) + endif() +endif () if (WITH_PSCORE) include(external/snappy) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index c18332d3b87316..dcff02a662e273 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -9,4 +9,3 @@ add_subdirectory(pybind) # NOTE: please add subdirectory inference at last. add_subdirectory(inference) -add_subdirectory(train) diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 5a2d7a06201ba4..a2062d82c8130b 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -14,6 +14,7 @@ endif() add_subdirectory(table) add_subdirectory(service) add_subdirectory(test) +add_subdirectory(index_dataset) get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc index b638af49730dd4..9aafdd769ed4a0 100644 --- a/paddle/fluid/distributed/fleet.cc +++ b/paddle/fluid/distributed/fleet.cc @@ -146,41 +146,6 @@ void FleetWrapper::CreateClient2ClientConnection() { client2client_max_retry_); } -std::future FleetWrapper::PullSparseVarsAsync( - const Scope& scope, const uint64_t table_id, - const std::vector& var_names, std::vector* fea_keys, - std::vector>* fea_values, int fea_value_dim) { - fea_keys->clear(); - fea_keys->resize(0); - fea_keys->reserve(MAX_FEASIGN_NUM); - for (auto name : var_names) { - Variable* var = scope.FindVar(name); - if (var == nullptr) { - continue; - } - LoDTensor* tensor = var->GetMutable(); - CHECK(tensor != nullptr) << "tensor of var " << name << " is null"; - int64_t* ids = tensor->data(); - size_t len = tensor->numel(); - for (auto i = 0u; i < len; ++i) { - if (ids[i] == 0u) { - continue; - } - fea_keys->push_back(static_cast(ids[i])); - } - } - fea_values->resize(fea_keys->size() + 1); - for (auto& t : *fea_values) { - t.resize(fea_value_dim); - } - std::vector pull_result_ptr; - for (auto& t : *fea_values) { - pull_result_ptr.push_back(t.data()); - } - return pserver_ptr_->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); -} - void FleetWrapper::PullSparseVarsSync( const Scope& scope, const uint64_t table_id, const std::vector& var_names, std::vector* fea_keys, @@ -224,8 +189,10 @@ void 
FleetWrapper::PullSparseVarsSync( for (auto& t : *fea_values) { pull_result_ptr.push_back(t.data()); } + bool training = true; auto status = pserver_ptr_->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); + pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size(), + training); pull_sparse_status.push_back(std::move(status)); for (auto& t : pull_sparse_status) { t.wait(); @@ -238,9 +205,13 @@ void FleetWrapper::PullSparseVarsSync( } } +// is_training is true means training, false means inference, the behavior is +// different on pserver + void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim, uint64_t padding_id, platform::Place place, + bool is_training, std::vector* inputs, std::vector* outputs) { std::vector fea_keys; @@ -279,7 +250,8 @@ void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim, } auto* communicator = Communicator::GetInstance(); auto status = communicator->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size()); + pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size(), + is_training); status.wait(); auto ret = status.get(); if (ret != 0) { diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h index ac566606ddcb40..863440180a808d 100644 --- a/paddle/fluid/distributed/fleet.h +++ b/paddle/fluid/distributed/fleet.h @@ -84,19 +84,14 @@ class FleetWrapper { int fea_dim, const std::vector& var_emb_names); - // Pull sparse variables from server in async mode - // Param: scope, table_id, var_names, fea_keys, fea_dim - // Param: fea_values std::future - std::future PullSparseVarsAsync( - const Scope& scope, const uint64_t table_id, - const std::vector& var_names, - std::vector* fea_keys, - std::vector>* fea_values, int fea_dim); - // Pull sparse variables from server in sync mode // pull immediately to tensors + // is_training is true means training, false means inference, the behavior is + // different on pserver + void PullSparseToTensorSync(const uint64_t table_id, int fea_dim, uint64_t padding_id, platform::Place place, + bool is_training, std::vector* inputs, // NOLINT std::vector* outputs); // NOLINT diff --git a/paddle/fluid/distributed/index_dataset/CMakeLists.txt b/paddle/fluid/distributed/index_dataset/CMakeLists.txt new file mode 100644 index 00000000000000..a30488494a52bc --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/CMakeLists.txt @@ -0,0 +1,7 @@ +proto_library(index_dataset_proto SRCS index_dataset.proto) +cc_library(index_wrapper SRCS index_wrapper.cc DEPS index_dataset_proto fs) +cc_library(index_sampler SRCS index_sampler.cc DEPS index_wrapper) + +if(WITH_PYTHON) + py_proto_compile(index_dataset_py_proto SRCS index_dataset.proto) +endif() diff --git a/paddle/fluid/operators/distributed/distributed_pb.h b/paddle/fluid/distributed/index_dataset/index_dataset.proto similarity index 57% rename from paddle/fluid/operators/distributed/distributed_pb.h rename to paddle/fluid/distributed/index_dataset/index_dataset.proto index f1c662be9af67b..1b4ee313671ad5 100644 --- a/paddle/fluid/operators/distributed/distributed_pb.h +++ b/paddle/fluid/distributed/index_dataset/index_dataset.proto @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -12,19 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once +syntax = "proto2"; +package paddle.distributed; -#ifdef PADDLE_WITH_DISTRIBUTE +message IndexNode { + required uint64 id = 1; + required bool is_leaf = 2; + required float probability = 3; +} -#ifdef PADDLE_WITH_GRPC +message TreeMeta { + required int32 height = 1; + required int32 branch = 2; +} -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -#else // PADDLE_WITH_GRPC - -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -#endif // PADDLE_WITH_GRPC - -#endif // PADDLE_WITH_DISTRIBUTE +message KVItem { + required bytes key = 1; + required bytes value = 2; +} \ No newline at end of file diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.cc b/paddle/fluid/distributed/index_dataset/index_sampler.cc new file mode 100644 index 00000000000000..58f85d98fb09c6 --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_sampler.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/index_dataset/index_sampler.h" +#include "paddle/fluid/operators/math/sampler.h" + +namespace paddle { +namespace distributed { + +using Sampler = paddle::operators::math::Sampler; + +std::vector> LayerWiseSampler::sample( + const std::vector>& user_inputs, + const std::vector& target_ids, bool with_hierarchy) { + auto input_num = target_ids.size(); + auto user_feature_num = user_inputs[0].size(); + std::vector> outputs( + input_num * layer_counts_sum_, + std::vector(user_feature_num + 2)); + + auto max_layer = tree_->Height(); + std::vector sampler_vec(max_layer - start_sample_layer_); + std::vector> layer_ids(max_layer - + start_sample_layer_); + + auto layer_index = max_layer - 1; + size_t idx = 0; + while (layer_index >= start_sample_layer_) { + auto layer_codes = tree_->GetLayerCodes(layer_index); + layer_ids[idx] = tree_->GetNodes(layer_codes); + sampler_vec[idx] = new paddle::operators::math::UniformSampler( + layer_ids[idx].size() - 1, seed_); + layer_index--; + idx++; + } + + idx = 0; + for (size_t i = 0; i < input_num; i++) { + auto travel_codes = + tree_->GetTravelCodes(target_ids[i], start_sample_layer_); + auto travel_path = tree_->GetNodes(travel_codes); + for (size_t j = 0; j < travel_path.size(); j++) { + // user + if (j > 0 && with_hierarchy) { + auto ancestor_codes = + tree_->GetAncestorCodes(user_inputs[i], max_layer - j - 1); + auto hierarchical_user = tree_->GetNodes(ancestor_codes); + for (int idx_offset = 0; idx_offset <= layer_counts_[j]; idx_offset++) { + for (size_t k = 0; k < user_feature_num; k++) { + outputs[idx + idx_offset][k] = hierarchical_user[k].id(); + } + } + } else { + for (int idx_offset = 0; idx_offset <= layer_counts_[j]; idx_offset++) { + for (size_t k = 0; k < user_feature_num; k++) { + outputs[idx + 
idx_offset][k] = user_inputs[i][k]; + } + } + } + + // sampler ++ + outputs[idx][user_feature_num] = travel_path[j].id(); + outputs[idx][user_feature_num + 1] = 1.0; + idx += 1; + for (int idx_offset = 0; idx_offset < layer_counts_[j]; idx_offset++) { + int sample_res = 0; + do { + sample_res = sampler_vec[j]->Sample(); + } while (layer_ids[j][sample_res].id() == travel_path[j].id()); + outputs[idx + idx_offset][user_feature_num] = + layer_ids[j][sample_res].id(); + outputs[idx + idx_offset][user_feature_num + 1] = 0; + } + idx += layer_counts_[j]; + } + } + for (size_t i = 0; i < sampler_vec.size(); i++) { + delete sampler_vec[i]; + } + return outputs; +} + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.h b/paddle/fluid/distributed/index_dataset/index_sampler.h new file mode 100644 index 00000000000000..66882bedc9b765 --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_sampler.h @@ -0,0 +1,100 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/distributed/index_dataset/index_wrapper.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class IndexSampler { + public: + virtual ~IndexSampler() {} + IndexSampler() {} + + template + static std::shared_ptr Init(const std::string& name) { + std::shared_ptr instance = nullptr; + instance.reset(new T(name)); + return instance; + } + + virtual void init_layerwise_conf(const std::vector& layer_sample_counts, + int start_sample_layer = 1, int seed = 0) {} + virtual void init_beamsearch_conf(const int64_t k) {} + virtual std::vector> sample( + const std::vector>& user_inputs, + const std::vector& input_targets, + bool with_hierarchy = false) = 0; +}; + +class LayerWiseSampler : public IndexSampler { + public: + virtual ~LayerWiseSampler() {} + explicit LayerWiseSampler(const std::string& name) { + tree_ = IndexWrapper::GetInstance()->get_tree_index(name); + } + + void init_layerwise_conf(const std::vector& layer_sample_counts, + int start_sample_layer, int seed) override { + seed_ = seed; + start_sample_layer_ = start_sample_layer; + + PADDLE_ENFORCE_GT( + start_sample_layer_, 0, + paddle::platform::errors::InvalidArgument( + "start sampler layer = [%d], it should greater than 0.", + start_sample_layer_)); + PADDLE_ENFORCE_LT(start_sample_layer_, tree_->Height(), + paddle::platform::errors::InvalidArgument( + "start sampler layer = [%d], it should less than " + "max_layer, which is [%d].", + start_sample_layer_, tree_->Height())); + + size_t i = 0; + layer_counts_sum_ = 0; + layer_counts_.clear(); + int cur_layer = start_sample_layer_; + while (cur_layer < tree_->Height()) { + int layer_sample_num = 1; + if (i < layer_sample_counts.size()) { + layer_sample_num = layer_sample_counts[i]; + } + layer_counts_sum_ += layer_sample_num + 1; + 
layer_counts_.push_back(layer_sample_num); + VLOG(3) << "[INFO] level " << cur_layer + << " sample_layer_counts.push_back: " << layer_sample_num; + cur_layer += 1; + i += 1; + } + reverse(layer_counts_.begin(), layer_counts_.end()); + VLOG(3) << "sample counts sum: " << layer_counts_sum_; + } + std::vector> sample( + const std::vector>& user_inputs, + const std::vector& target_ids, bool with_hierarchy) override; + + private: + std::vector layer_counts_; + int64_t layer_counts_sum_{0}; + std::shared_ptr tree_{nullptr}; + int seed_{0}; + int start_sample_layer_{1}; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.cc b/paddle/fluid/distributed/index_dataset/index_wrapper.cc new file mode 100644 index 00000000000000..99fe4ca0c6d043 --- /dev/null +++ b/paddle/fluid/distributed/index_dataset/index_wrapper.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/io/fs.h" + +#include +#include +#include "paddle/fluid/distributed/index_dataset/index_wrapper.h" + +namespace paddle { +namespace distributed { + +std::shared_ptr IndexWrapper::s_instance_(nullptr); + +int TreeIndex::Load(const std::string filename) { + int err_no; + auto fp = paddle::framework::fs_open_read(filename, &err_no, ""); + PADDLE_ENFORCE_NE( + fp, nullptr, + platform::errors::InvalidArgument( + "Open file %s failed. Please check whether the file exists.", + filename)); + + int num = 0; + max_id_ = 0; + fake_node_.set_id(0); + fake_node_.set_is_leaf(false); + fake_node_.set_probability(0.0); + max_code_ = 0; + size_t ret = fread(&num, sizeof(num), 1, fp.get()); + while (ret == 1 && num > 0) { + std::string content(num, '\0'); + size_t read_num = + fread(const_cast(content.data()), 1, num, fp.get()); + PADDLE_ENFORCE_EQ( + read_num, static_cast(num), + platform::errors::InvalidArgument( + "Read from file: %s failed. Valid Format is " + "an integer representing the length of the following string, " + "and the string itself.We got an iteger[% d], " + "but the following string's length is [%d].", + filename, num, read_num)); + + KVItem item; + PADDLE_ENFORCE_EQ( + item.ParseFromString(content), true, + platform::errors::InvalidArgument("Parse from file: %s failed. 
It's " + "content can't be parsed by KVItem.", + filename)); + + if (item.key() == ".tree_meta") { + meta_.ParseFromString(item.value()); + } else { + auto code = boost::lexical_cast(item.key()); + IndexNode node; + node.ParseFromString(item.value()); + PADDLE_ENFORCE_NE(node.id(), 0, + platform::errors::InvalidArgument( + "Node'id should not be equel to zero.")); + if (node.is_leaf()) { + id_codes_map_[node.id()] = code; + } + data_[code] = node; + if (node.id() > max_id_) { + max_id_ = node.id(); + } + if (code > max_code_) { + max_code_ = code; + } + } + ret = fread(&num, sizeof(num), 1, fp.get()); + } + total_nodes_num_ = data_.size(); + max_code_ += 1; + return 0; +} + +std::vector TreeIndex::GetNodes(const std::vector& codes) { + std::vector nodes; + nodes.reserve(codes.size()); + for (size_t i = 0; i < codes.size(); i++) { + if (CheckIsValid(codes[i])) { + nodes.push_back(data_.at(codes[i])); + } else { + nodes.push_back(fake_node_); + } + } + return nodes; +} + +std::vector TreeIndex::GetLayerCodes(int level) { + uint64_t level_num = static_cast(std::pow(meta_.branch(), level)); + uint64_t level_offset = level_num - 1; + + std::vector res; + res.reserve(level_num); + for (uint64_t i = 0; i < level_num; i++) { + auto code = level_offset + i; + if (CheckIsValid(code)) { + res.push_back(code); + } + } + return res; +} + +std::vector TreeIndex::GetAncestorCodes( + const std::vector& ids, int level) { + std::vector res; + res.reserve(ids.size()); + + int cur_level; + for (size_t i = 0; i < ids.size(); i++) { + if (id_codes_map_.find(ids[i]) == id_codes_map_.end()) { + res.push_back(max_code_); + } else { + auto code = id_codes_map_.at(ids[i]); + cur_level = meta_.height() - 1; + + while (level >= 0 && cur_level > level) { + code = (code - 1) / meta_.branch(); + cur_level--; + } + res.push_back(code); + } + } + return res; +} + +std::vector TreeIndex::GetChildrenCodes(uint64_t ancestor, + int level) { + auto level_code_num = static_cast(std::pow(meta_.branch(), level)); + auto code_min = level_code_num - 1; + auto code_max = meta_.branch() * level_code_num - 1; + + std::vector parent; + parent.push_back(ancestor); + std::vector res; + size_t p_idx = 0; + while (true) { + size_t p_size = parent.size(); + for (; p_idx < p_size; p_idx++) { + for (int i = 0; i < meta_.branch(); i++) { + auto code = parent[p_idx] * meta_.branch() + i + 1; + if (data_.find(code) != data_.end()) parent.push_back(code); + } + } + if ((code_min <= parent[p_idx]) && (parent[p_idx] < code_max)) { + break; + } + } + + return std::vector(parent.begin() + p_idx, parent.end()); +} + +std::vector TreeIndex::GetTravelCodes(uint64_t id, int start_level) { + std::vector res; + PADDLE_ENFORCE_NE(id_codes_map_.find(id), id_codes_map_.end(), + paddle::platform::errors::InvalidArgument( + "id = %d doesn't exist in Tree.", id)); + auto code = id_codes_map_.at(id); + int level = meta_.height() - 1; + + while (level >= start_level) { + res.push_back(code); + code = (code - 1) / meta_.branch(); + level--; + } + return res; +} + +std::vector TreeIndex::GetAllLeafs() { + std::vector res; + res.reserve(id_codes_map_.size()); + for (auto& ite : id_codes_map_) { + auto code = ite.second; + res.push_back(data_.at(code)); + } + return res; +} + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.h b/paddle/fluid/distributed/index_dataset/index_wrapper.h new file mode 100644 index 00000000000000..8fb8faf6c84a2d --- /dev/null +++ 
b/paddle/fluid/distributed/index_dataset/index_wrapper.h @@ -0,0 +1,120 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/distributed/index_dataset/index_dataset.pb.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +class Index { + public: + Index() {} + ~Index() {} +}; + +class TreeIndex : public Index { + public: + TreeIndex() {} + ~TreeIndex() {} + + int Height() { return meta_.height(); } + int Branch() { return meta_.branch(); } + uint64_t TotalNodeNums() { return total_nodes_num_; } + uint64_t EmbSize() { return max_id_ + 1; } + int Load(const std::string path); + + inline bool CheckIsValid(int code) { + if (data_.find(code) != data_.end()) { + return true; + } else { + return false; + } + } + + std::vector GetNodes(const std::vector& codes); + std::vector GetLayerCodes(int level); + std::vector GetAncestorCodes(const std::vector& ids, + int level); + std::vector GetChildrenCodes(uint64_t ancestor, int level); + std::vector GetTravelCodes(uint64_t id, int start_level); + std::vector GetAllLeafs(); + + std::unordered_map data_; + std::unordered_map id_codes_map_; + uint64_t total_nodes_num_; + TreeMeta meta_; + uint64_t max_id_; + uint64_t max_code_; + IndexNode fake_node_; +}; + +using TreePtr = std::shared_ptr; + +class IndexWrapper { + public: + virtual ~IndexWrapper() {} + IndexWrapper() {} + + void clear_tree() { tree_map.clear(); } + + TreePtr get_tree_index(const std::string name) { + PADDLE_ENFORCE_NE(tree_map.find(name), tree_map.end(), + paddle::platform::errors::InvalidArgument( + "tree [%s] doesn't exist. Please insert it firstly " + "by API[\' insert_tree_index \'].", + name)); + return tree_map[name]; + } + + void insert_tree_index(const std::string name, const std::string tree_path) { + if (tree_map.find(name) != tree_map.end()) { + VLOG(0) << "Tree " << name << " has already existed."; + return; + } + TreePtr tree = std::make_shared(); + int ret = tree->Load(tree_path); + PADDLE_ENFORCE_EQ(ret, 0, paddle::platform::errors::InvalidArgument( + "Load tree[%s] from path[%s] failed. 
Please " + "check whether the file exists.", + name, tree_path)); + tree_map.insert(std::pair{name, tree}); + } + + static std::shared_ptr GetInstancePtr() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::distributed::IndexWrapper()); + } + return s_instance_; + } + + static IndexWrapper* GetInstance() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::distributed::IndexWrapper()); + } + return s_instance_.get(); + } + + private: + static std::shared_ptr s_instance_; + std::unordered_map tree_map; +}; + +} // end namespace distributed +} // end namespace paddle diff --git a/paddle/fluid/distributed/service/CMakeLists.txt b/paddle/fluid/distributed/service/CMakeLists.txt index bb3f6f1174da9d..d1f04e26ade728 100644 --- a/paddle/fluid/distributed/service/CMakeLists.txt +++ b/paddle/fluid/distributed/service/CMakeLists.txt @@ -16,6 +16,7 @@ set_source_files_properties(communicator.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT set_source_files_properties(service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ps_local_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(brpc_utils.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(heter_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -24,11 +25,13 @@ set_source_files_properties(heter_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT set_source_files_properties(client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - +set_source_files_properties(graph_brpc_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(graph_brpc_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(brpc_utils SRCS brpc_utils.cc DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS}) -cc_library(downpour_server SRCS brpc_ps_server.cc DEPS boost eigen3 table brpc_utils ${RPC_DEPS}) -cc_library(downpour_client SRCS brpc_ps_client.cc DEPS boost eigen3 table brpc_utils ${RPC_DEPS}) +cc_library(downpour_server SRCS graph_brpc_server.cc brpc_ps_server.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) +cc_library(downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc +ps_local_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) cc_library(client SRCS ps_client.cc DEPS downpour_client boost ${RPC_DEPS}) cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS}) @@ -38,3 +41,6 @@ cc_library(ps_service SRCS service.cc DEPS communicator client server boost ${RP cc_library(heter_server SRCS heter_server.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) + +set_source_files_properties(graph_py_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_py_service SRCS graph_py_service.cc DEPS ps_service) diff --git a/paddle/fluid/distributed/service/brpc_ps_client.cc b/paddle/fluid/distributed/service/brpc_ps_client.cc index 163526fe3b28c9..a6ad9d08f52fda 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.cc +++ 
b/paddle/fluid/distributed/service/brpc_ps_client.cc @@ -768,8 +768,8 @@ std::future BrpcPsClient::push_global_step(int table_id, std::future BrpcPsClient::pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, - size_t num) { + const uint64_t *keys, size_t num, + bool is_training) { size_t request_call_num = _server_channels.size(); auto shard_sorted_kvs = std::make_shared< @@ -837,16 +837,27 @@ std::future BrpcPsClient::pull_sparse(float **select_values, uint32_t kv_request_count = 0; size_t sorted_kv_size = sorted_kvs.size(); auto &request_buffer = closure->cntl(i)->request_attachment(); + + request_buffer.append((void *)&is_training, sizeof(bool)); + std::vector keys_counter; + keys_counter.reserve(sorted_kv_size); + for (size_t kv_idx = 0; kv_idx < sorted_kv_size; ++kv_idx) { ++kv_request_count; + uint32_t keys = 1; last_key = sorted_kvs[kv_idx].first; request_buffer.append((void *)&last_key, sizeof(uint64_t)); while (kv_idx < sorted_kv_size - 1 && last_key == sorted_kvs[kv_idx + 1].first) { ++kv_idx; + ++keys; } + keys_counter.push_back(keys); } + request_buffer.append((void *)keys_counter.data(), + sizeof(uint32_t) * keys_counter.size()); + if (kv_request_count == 0) { closure->Run(); } else { @@ -869,8 +880,8 @@ std::future BrpcPsClient::send_client2client_msg( auto promise = std::make_shared>(); std::future fut = promise->get_future(); if (to_client_id >= _client_channels.size()) { - LOG(FATAL) << "to_client_id is out of range clients, which size is " - << _client_channels.size(); + VLOG(0) << "to_client_id is out of range clients, which size is " + << _client_channels.size(); promise->set_value(-1); return fut; } @@ -956,7 +967,7 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id, } auto status = pull_sparse((float **)save_vec.data(), table_id, - save_key.data(), save_key.size()); + save_key.data(), save_key.size(), true); status.wait(); // create lod tensor diff --git a/paddle/fluid/distributed/service/brpc_ps_client.h b/paddle/fluid/distributed/service/brpc_ps_client.h index 8f9d2653864d1c..5192356e4b5e57 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/service/brpc_ps_client.h @@ -148,7 +148,8 @@ class BrpcPsClient : public PSClient { virtual std::future pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, size_t num); + const uint64_t *keys, size_t num, + bool is_training); virtual std::future print_table_stat(uint32_t table_id); @@ -170,9 +171,22 @@ class BrpcPsClient : public PSClient { virtual int32_t recv_and_save_table(const uint64_t table_id, const std::string &path); - private: + protected: + virtual size_t get_server_nums() { return _server_channels.size(); } + inline brpc::Channel *get_sparse_channel(size_t server_id) { + return _server_channels[server_id][0].get(); + } + inline brpc::Channel *get_dense_channel(size_t server_id) { + return _server_channels[server_id][1].get(); + } + inline brpc::Channel *get_cmd_channel(size_t server_id) { + return _server_channels[server_id][2].get(); + } virtual int32_t initialize() override; + private: + // virtual int32_t initialize() override; + inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total, uint32_t shard_num) { return dense_dim_total / shard_num + 1; @@ -184,16 +198,6 @@ class BrpcPsClient : public PSClient { std::future send_save_cmd(uint32_t table_id, int cmd_id, const std::vector ¶m); - inline brpc::Channel *get_sparse_channel(size_t server_id) { - return _server_channels[server_id][0].get(); - } - 
inline brpc::Channel *get_dense_channel(size_t server_id) { - return _server_channels[server_id][1].get(); - } - inline brpc::Channel *get_cmd_channel(size_t server_id) { - return _server_channels[server_id][2].get(); - } - bool _running = false; bool _flushing = false; std::atomic _async_call_num; //异步请求计数 @@ -220,8 +224,6 @@ class BrpcPsClient : public PSClient { size_t num, void *done) override; - virtual size_t get_server_nums() { return _server_channels.size(); } - private: int32_t start_client_service(); diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc index 8400e669182d67..a9370561a540be 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/service/brpc_ps_server.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/distributed/service/brpc_ps_server.h" #include // NOLINT +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/table.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" @@ -60,7 +61,8 @@ uint64_t BrpcPsServer::start(const std::string &ip, uint32_t port) { std::unique_lock lock(mutex_); std::string ip_port = ip + ":" + std::to_string(port); - VLOG(3) << "server of rank " << _rank << " starts at " << ip_port; + VLOG(0) << "running server with rank id: " << _rank + << ", endpoint: " << ip_port; brpc::ServerOptions options; int num_threads = std::thread::hardware_concurrency(); @@ -336,33 +338,39 @@ int32_t BrpcPsService::pull_sparse(Table *table, brpc::Controller *cntl) { platform::RecordEvent record_event("PsService->pull_sparse"); CHECK_TABLE_EXIST(table, request, response) - thread_local std::string push_sparse_request_buffer; + auto &req_io_buffer = cntl->request_attachment(); auto req_buffer_size = req_io_buffer.size(); + if (req_buffer_size < 1) { set_response_code(response, -1, "req attachment is empty"); return 0; } + if (request.params_size() < 1) { set_response_code(response, -1, "PsRequestMessage.params is requeired at " "least 1 for num of sparse_key"); return 0; } + uint32_t num = *(uint32_t *)(request.params(0).c_str()); - push_sparse_request_buffer.resize(0); - push_sparse_request_buffer.reserve(req_buffer_size); - const char *data = (const char *)cntl->request_attachment().fetch( - const_cast(push_sparse_request_buffer.data()), req_buffer_size); - /* - Attachment Content: - |---keysData---| - |---8*{num}B---| - */ - const uint64_t *keys = (const uint64_t *)data; + auto dim = table->value_accesor()->select_dim(); + + thread_local std::string req_buffer; + req_buffer.reserve(req_buffer_size); + + const void *data = cntl->request_attachment().fetch( + const_cast(req_buffer.data()), req_buffer_size); + + auto value = PullSparseValue(num, dim); + + value.DeserializeFromBytes(const_cast(data)); + std::vector res_data; - res_data.resize(num * table->value_accesor()->select_size() / sizeof(float)); - table->pull_sparse(res_data.data(), keys, num); + res_data.resize(num * dim); + table->pull_sparse(res_data.data(), value); + cntl->response_attachment().append((char *)res_data.data(), res_data.size() * sizeof(float)); return 0; @@ -538,7 +546,7 @@ int32_t BrpcPsService::stop_server(Table *table, auto *p_server = _server; std::thread t_stop([p_server]() { p_server->stop(); - LOG(INFO) << "Server Stoped"; + VLOG(3) << "Server Stoped"; }); t_stop.detach(); return 0; diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index 
096718768149c5..a356b77e73733e 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -324,7 +324,7 @@ std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port) { while (hp->h_addr_list[i] != NULL) { int_ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); - VLOG(0) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip; + VLOG(3) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip; break; } diff --git a/paddle/fluid/distributed/service/communicator.cc b/paddle/fluid/distributed/service/communicator.cc index 8699719e5cdcc8..3d5ab8e16d9020 100644 --- a/paddle/fluid/distributed/service/communicator.cc +++ b/paddle/fluid/distributed/service/communicator.cc @@ -320,9 +320,11 @@ void Communicator::RpcRecvSparse(const std::string &varname, int table_id, push_g_vec.push_back(tensor->data() + i * dim); } + bool training = true; + auto status = _worker_ptr->pull_sparse( (float **)push_g_vec.data(), table_id, // NOLINT - sparse_push_keys.data(), sparse_push_keys.size()); + sparse_push_keys.data(), sparse_push_keys.size(), training); status.wait(); return; } diff --git a/paddle/fluid/distributed/service/communicator.h b/paddle/fluid/distributed/service/communicator.h index 043fe9d83dfc53..fa60cab2b58779 100644 --- a/paddle/fluid/distributed/service/communicator.h +++ b/paddle/fluid/distributed/service/communicator.h @@ -310,6 +310,8 @@ class Communicator { return _worker_ptr; } + RecvCtxMap &GetRecvCtxMap() { return recv_varname_to_ctx_; } + std::shared_ptr _worker_ptr; // pointer to worker protected: diff --git a/paddle/fluid/distributed/service/env.h b/paddle/fluid/distributed/service/env.h index 901aba0ad90c49..ca395a776afd4e 100644 --- a/paddle/fluid/distributed/service/env.h +++ b/paddle/fluid/distributed/service/env.h @@ -39,7 +39,7 @@ struct PSHost { // |---ip---|---port---|--rank--| // |-32bit--|--20bit---|--12bit-| - // for pslib + uint64_t serialize_to_uint64() { uint64_t host_label = 0; host_label = inet_addr(ip.c_str()); @@ -175,14 +175,12 @@ class PSEnvironment { host.ip = ip; host.port = port; host.rank = rank; - if (sign_set.count(rank) > 0) { - LOG(WARNING) << "ps-host :" << host.ip << ":" << host.port - << ", rank:" << host.rank - << " already register, ignore register"; - } else { + + if (sign_set.count(rank) == 0) { host_list.push_back(host); sign_set.insert(rank); } + return 0; } diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc new file mode 100644 index 00000000000000..a6271cac83c9a9 --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -0,0 +1,331 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
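#include <arpa/inet.h>
#include <cstdint>
#include <string>

// A minimal sketch of packing a host label following the layout comment in
// env.h above (|---ip---|---port---|--rank--| = 32/20/12 bits). The field
// order and shift amounts here are assumptions for illustration only; they
// are not copied from PSHost::serialize_to_uint64.
inline uint64_t pack_host_label(const std::string& ip, uint32_t port,
                                uint32_t rank) {
  uint64_t label = inet_addr(ip.c_str());     // 32-bit IPv4 address
  label = (label << 20) | (port & 0xFFFFFu);  // next 20 bits: port
  label = (label << 12) | (rank & 0xFFFu);    // lowest 12 bits: rank
  return label;
}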
+ +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include +#include +#include +#include +#include +#include +#include "Eigen/Dense" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { + +void GraphPsService_Stub::service( + ::google::protobuf::RpcController *controller, + const ::paddle::distributed::PsRequestMessage *request, + ::paddle::distributed::PsResponseMessage *response, + ::google::protobuf::Closure *done) { + if (graph_service != NULL && local_channel == channel()) { + // VLOG(0)<<"use local"; + task_pool->enqueue([this, controller, request, response, done]() -> int { + this->graph_service->service(controller, request, response, done); + return 0; + }); + } else { + // VLOG(0)<<"use server"; + PsService_Stub::service(controller, request, response, done); + } +} + +int GraphBrpcClient::get_server_index_by_id(uint64_t id) { + int shard_num = get_shard_num(); + int shard_per_server = shard_num % server_size == 0 + ? shard_num / server_size + : shard_num / server_size + 1; + return id % shard_num / shard_per_server; +} + +std::future GraphBrpcClient::get_node_feat( + const uint32_t &table_id, const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res) { + std::vector request2server; + std::vector server2request(server_size, -1); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + } + size_t request_call_num = request2server.size(); + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_ids[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + } + + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, + [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (int request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + ++fail_num; + } else { + auto &res_io_buffer = + closure->cntl(request_idx)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + for (size_t feat_idx = 0; feat_idx < feature_names.size(); + ++feat_idx) { + for (size_t node_idx = 0; + node_idx < query_idx_buckets.at(request_idx).size(); + ++node_idx) { + int query_idx = query_idx_buckets.at(request_idx).at(node_idx); + size_t feat_len = *(size_t *)(buffer); + buffer += sizeof(size_t); + auto feature = std::string(buffer, feat_len); + res[feat_idx][query_idx] = feature; + buffer += feat_len; + } + } + } + if (fail_num == request_call_num) { + ret = -1; + } + } + closure->set_promise_value(ret); + }); + + auto 
promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_GET_NODE_FEAT); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + std::string joint_feature_name = + paddle::string::join_strings(feature_names, '\t'); + closure->request(request_idx) + ->add_params(joint_feature_name.c_str(), joint_feature_name.size()); + + PsService_Stub rpc_stub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + + return fut; +} +// char* &buffer,int &actual_size +std::future GraphBrpcClient::batch_sample_neighboors( + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>> &res) { + std::vector request2server; + std::vector server2request(server_size, -1); + res.clear(); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + res.push_back(std::vector>()); + } + size_t request_call_num = request2server.size(); + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_ids[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + } + + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, + [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (int request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + ++fail_num; + } else { + auto &res_io_buffer = + closure->cntl(request_idx)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + size_t node_num = *(size_t *)buffer; + int *actual_sizes = (int *)(buffer + sizeof(size_t)); + char *node_buffer = + buffer + sizeof(size_t) + sizeof(int) * node_num; + + int offset = 0; + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + int query_idx = query_idx_buckets.at(request_idx).at(node_idx); + int actual_size = actual_sizes[node_idx]; + int start = 0; + while (start < actual_size) { + res[query_idx].push_back( + {*(uint64_t *)(node_buffer + offset + start), + *(float *)(node_buffer + offset + start + + GraphNode::id_size)}); + start += GraphNode::id_size + GraphNode::weight_size; + } + offset += actual_size; + } + } + if (fail_num == 
request_call_num) { + ret = -1; + } + } + closure->set_promise_value(ret); + }); + + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBOORS); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + closure->request(request_idx) + ->add_params((char *)&sample_size, sizeof(int)); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + + return fut; +} +std::future GraphBrpcClient::random_sample_nodes( + uint32_t table_id, int server_index, int sample_size, + std::vector &ids) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_GRAPH_SAMPLE_NODES) != 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + char buffer[bytes_size]; + auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + int index = 0; + while (index < bytes_size) { + ids.push_back(*(uint64_t *)(buffer + index)); + index += GraphNode::id_size; + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + ; + closure->request(0)->set_cmd_id(PS_GRAPH_SAMPLE_NODES); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)&sample_size, sizeof(int)); + ; + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0), + closure); + return fut; +} +std::future GraphBrpcClient::pull_graph_list( + uint32_t table_id, int server_index, int start, int size, int step, + std::vector &res) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_PULL_GRAPH_LIST) != 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + char buffer[bytes_size]; + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + int index = 0; + while (index < bytes_size) { + FeatureNode node; + node.recover_from_buffer(buffer + index); + index += node.get_size(false); + res.push_back(node); + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = 
promise->get_future(); + closure->request(0)->set_cmd_id(PS_PULL_GRAPH_LIST); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)&start, sizeof(int)); + closure->request(0)->add_params((char *)&size, sizeof(int)); + closure->request(0)->add_params((char *)&step, sizeof(int)); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0), + closure); + return fut; +} +int32_t GraphBrpcClient::initialize() { + // set_shard_num(_config.shard_num()); + BrpcPsClient::initialize(); + server_size = get_server_nums(); + graph_service = NULL; + local_channel = NULL; + return 0; +} +} +} diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h new file mode 100644 index 00000000000000..4e6775a4bedaf1 --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -0,0 +1,105 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
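#include <cstdint>
#include <utility>
#include <vector>
#include "paddle/fluid/distributed/service/graph_brpc_client.h"

// A minimal caller-side sketch for the sampling API declared below in
// GraphBrpcClient, assuming an already configured client instance
// (construction, configure() and set_shard_num() are omitted). The table id,
// node ids and sample size are placeholder values.
void sample_neighbors_example(paddle::distributed::GraphBrpcClient& client) {
  uint32_t table_id = 0;                       // assumed edge-table id
  std::vector<uint64_t> node_ids = {1, 2, 3};  // nodes to sample from
  int sample_size = 10;                        // neighbors requested per node
  // res[i] receives (neighbor_id, edge_weight) pairs for node_ids[i].
  std::vector<std::vector<std::pair<uint64_t, float>>> res;
  auto status =
      client.batch_sample_neighboors(table_id, node_ids, sample_size, res);
  status.wait();  // the returned future resolves to 0 on success, -1 on failure
}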
+ +#pragma once + +#include +#include +#include +#include + +#include +#include "ThreadPool.h" +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace distributed { + +class GraphPsService_Stub : public PsService_Stub { + public: + GraphPsService_Stub(::google::protobuf::RpcChannel* channel, + ::google::protobuf::RpcChannel* local_channel = NULL, + GraphBrpcService* service = NULL, int thread_num = 1) + : PsService_Stub(channel) { + this->local_channel = local_channel; + this->graph_service = service; + task_pool.reset(new ::ThreadPool(thread_num)); + } + virtual ~GraphPsService_Stub() {} + + // implements PsService ------------------------------------------ + GraphBrpcService* graph_service; + std::shared_ptr<::ThreadPool> task_pool; + ::google::protobuf::RpcChannel* local_channel; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(GraphPsService_Stub); + void service(::google::protobuf::RpcController* controller, + const ::paddle::distributed::PsRequestMessage* request, + ::paddle::distributed::PsResponseMessage* response, + ::google::protobuf::Closure* done); +}; +class GraphBrpcClient : public BrpcPsClient { + public: + GraphBrpcClient() {} + virtual ~GraphBrpcClient() {} + // given a batch of nodes, sample graph_neighboors for each of them + virtual std::future batch_sample_neighboors( + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>>& res); + + virtual std::future pull_graph_list(uint32_t table_id, + int server_index, int start, + int size, int step, + std::vector& res); + virtual std::future random_sample_nodes(uint32_t table_id, + int server_index, + int sample_size, + std::vector& ids); + virtual std::future get_node_feat( + const uint32_t& table_id, const std::vector& node_ids, + const std::vector& feature_names, + std::vector>& res); + virtual int32_t initialize(); + int get_shard_num() { return shard_num; } + void set_shard_num(int shard_num) { this->shard_num = shard_num; } + int get_server_index_by_id(uint64_t id); + void set_local_channel(int index) { + this->local_channel = get_cmd_channel(index); + } + void set_local_graph_service(GraphBrpcService* graph_service) { + this->graph_service = graph_service; + } + GraphPsService_Stub getServiceStub(::google::protobuf::RpcChannel* channel, + int thread_num = 1) { + return GraphPsService_Stub(channel, local_channel, graph_service, + thread_num); + } + + private: + int shard_num; + size_t server_size; + ::google::protobuf::RpcChannel* local_channel; + GraphBrpcService* graph_service; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc new file mode 100644 index 00000000000000..bdd926278b624b --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -0,0 +1,348 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" + +#include // NOLINT +#include "butil/endpoint.h" +#include "iomanip" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace distributed { + +int32_t GraphBrpcServer::initialize() { + auto &service_config = _config.downpour_server_param().service_param(); + if (!service_config.has_service_class()) { + LOG(ERROR) << "miss service_class in ServerServiceParameter"; + return -1; + } + auto *service = + CREATE_PSCORE_CLASS(PsBaseService, service_config.service_class()); + if (service == NULL) { + LOG(ERROR) << "service is unregistered, service_name:" + << service_config.service_class(); + return -1; + } + + _service.reset(service); + if (service->configure(this) != 0 || service->initialize() != 0) { + LOG(ERROR) << "service initialize failed, service_name:" + << service_config.service_class(); + return -1; + } + if (_server.AddService(service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { + LOG(ERROR) << "service add to brpc failed, service:" + << service_config.service_class(); + return -1; + } + return 0; +} + +uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { + std::unique_lock lock(mutex_); + + std::string ip_port = ip + ":" + std::to_string(port); + VLOG(3) << "server of rank " << _rank << " starts at " << ip_port; + brpc::ServerOptions options; + + int num_threads = std::thread::hardware_concurrency(); + auto trainers = _environment->get_trainers(); + options.num_threads = trainers > num_threads ? 
trainers : num_threads; + + if (_server.Start(ip_port.c_str(), &options) != 0) { + LOG(ERROR) << "GraphBrpcServer start failed, ip_port=" << ip_port; + return 0; + } + _environment->registe_ps_server(ip, port, _rank); + return 0; +} + +int32_t GraphBrpcServer::port() { return _server.listen_address().port; } + +int32_t GraphBrpcService::initialize() { + _is_initialize_shard_info = false; + _service_handler_map[PS_STOP_SERVER] = &GraphBrpcService::stop_server; + _service_handler_map[PS_LOAD_ONE_TABLE] = &GraphBrpcService::load_one_table; + _service_handler_map[PS_LOAD_ALL_TABLE] = &GraphBrpcService::load_all_table; + + _service_handler_map[PS_PRINT_TABLE_STAT] = + &GraphBrpcService::print_table_stat; + _service_handler_map[PS_BARRIER] = &GraphBrpcService::barrier; + _service_handler_map[PS_START_PROFILER] = &GraphBrpcService::start_profiler; + _service_handler_map[PS_STOP_PROFILER] = &GraphBrpcService::stop_profiler; + + _service_handler_map[PS_PULL_GRAPH_LIST] = &GraphBrpcService::pull_graph_list; + _service_handler_map[PS_GRAPH_SAMPLE_NEIGHBOORS] = + &GraphBrpcService::graph_random_sample_neighboors; + _service_handler_map[PS_GRAPH_SAMPLE_NODES] = + &GraphBrpcService::graph_random_sample_nodes; + _service_handler_map[PS_GRAPH_GET_NODE_FEAT] = + &GraphBrpcService::graph_get_node_feat; + + // shard初始化,server启动后才可从env获取到server_list的shard信息 + initialize_shard_info(); + + return 0; +} + +#define CHECK_TABLE_EXIST(table, request, response) \ + if (table == NULL) { \ + std::string err_msg("table not found with table_id:"); \ + err_msg.append(std::to_string(request.table_id())); \ + set_response_code(response, -1, err_msg.c_str()); \ + return -1; \ + } + +int32_t GraphBrpcService::initialize_shard_info() { + if (!_is_initialize_shard_info) { + std::lock_guard guard(_initialize_shard_mutex); + if (_is_initialize_shard_info) { + return 0; + } + size_t shard_num = _server->environment()->get_ps_servers().size(); + auto &table_map = *(_server->table()); + for (auto itr : table_map) { + itr.second->set_shard(_rank, shard_num); + } + _is_initialize_shard_info = true; + } + return 0; +} + +void GraphBrpcService::service(google::protobuf::RpcController *cntl_base, + const PsRequestMessage *request, + PsResponseMessage *response, + google::protobuf::Closure *done) { + brpc::ClosureGuard done_guard(done); + std::string log_label("ReceiveCmd-"); + if (!request->has_table_id()) { + set_response_code(*response, -1, "PsRequestMessage.tabel_id is required"); + return; + } + + response->set_err_code(0); + response->set_err_msg(""); + auto *table = _server->table(request->table_id()); + brpc::Controller *cntl = static_cast(cntl_base); + auto itr = _service_handler_map.find(request->cmd_id()); + if (itr == _service_handler_map.end()) { + std::string err_msg( + "undefined cmd_id, should match PsCmdID in ps.proto, cmd_id:"); + err_msg.append(std::to_string(request->cmd_id())); + set_response_code(*response, -1, err_msg.c_str()); + return; + } + serviceFunc handler_func = itr->second; + int service_ret = (this->*handler_func)(table, *request, *response, cntl); + if (service_ret != 0) { + response->set_err_code(service_ret); + response->set_err_msg("server internal error"); + } +} + +int32_t GraphBrpcService::barrier(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + + if (request.params_size() < 1) { + set_response_code(response, -1, + "PsRequestMessage.params is requeired at " + "least 1 for num of sparse_key"); + 
return 0; + } + + auto trainer_id = request.client_id(); + auto barrier_type = request.params(0); + table->barrier(trainer_id, barrier_type); + return 0; +} + +int32_t GraphBrpcService::print_table_stat(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + std::pair ret = table->print_table_stat(); + paddle::framework::BinaryArchive ar; + ar << ret.first << ret.second; + std::string table_info(ar.Buffer(), ar.Length()); + response.set_data(table_info); + + return 0; +} + +int32_t GraphBrpcService::load_one_table(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "PsRequestMessage.datas is requeired at least 2 for path & load_param"); + return -1; + } + if (table->load(request.params(0), request.params(1)) != 0) { + set_response_code(response, -1, "table load failed"); + return -1; + } + return 0; +} + +int32_t GraphBrpcService::load_all_table(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + auto &table_map = *(_server->table()); + for (auto &itr : table_map) { + if (load_one_table(itr.second.get(), request, response, cntl) != 0) { + LOG(ERROR) << "load table[" << itr.first << "] failed"; + return -1; + } + } + return 0; +} + +int32_t GraphBrpcService::stop_server(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + GraphBrpcServer *p_server = (GraphBrpcServer *)_server; + std::thread t_stop([p_server]() { + p_server->stop(); + LOG(INFO) << "Server Stoped"; + }); + p_server->export_cv()->notify_all(); + t_stop.detach(); + return 0; +} + +int32_t GraphBrpcService::stop_profiler(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + platform::DisableProfiler(platform::EventSortingKey::kDefault, + string::Sprintf("server_%s_profile", _rank)); + return 0; +} + +int32_t GraphBrpcService::start_profiler(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + platform::EnableProfiler(platform::ProfilerState::kCPU); + return 0; +} + +int32_t GraphBrpcService::pull_graph_list(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 3) { + set_response_code(response, -1, + "pull_graph_list request requires at least 3 arguments"); + return 0; + } + int start = *(int *)(request.params(0).c_str()); + int size = *(int *)(request.params(1).c_str()); + int step = *(int *)(request.params(2).c_str()); + std::unique_ptr buffer; + int actual_size; + ((GraphTable *)table) + ->pull_graph_list(start, size, buffer, actual_size, false, step); + cntl->response_attachment().append(buffer.get(), actual_size); + return 0; +} +int32_t GraphBrpcService::graph_random_sample_neighboors( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "graph_random_sample request requires at least 2 arguments"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t 
*)(request.params(0).c_str()); + int sample_size = *(uint64_t *)(request.params(1).c_str()); + std::vector> buffers(node_num); + std::vector actual_sizes(node_num, 0); + ((GraphTable *)table) + ->random_sample_neighboors(node_data, sample_size, buffers, actual_sizes); + + cntl->response_attachment().append(&node_num, sizeof(size_t)); + cntl->response_attachment().append(actual_sizes.data(), + sizeof(int) * node_num); + for (size_t idx = 0; idx < node_num; ++idx) { + cntl->response_attachment().append(buffers[idx].get(), actual_sizes[idx]); + } + return 0; +} +int32_t GraphBrpcService::graph_random_sample_nodes( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl) { + size_t size = *(uint64_t *)(request.params(0).c_str()); + std::unique_ptr buffer; + int actual_size; + if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) == + 0) { + cntl->response_attachment().append(buffer.get(), actual_size); + } else + cntl->response_attachment().append(NULL, 0); + + return 0; +} + +int32_t GraphBrpcService::graph_get_node_feat(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "graph_get_node_feat request requires at least 2 arguments"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); + + std::vector feature_names = + paddle::string::split_string(request.params(1), "\t"); + + std::vector> feature( + feature_names.size(), std::vector(node_num)); + + ((GraphTable *)table)->get_node_feat(node_ids, feature_names, feature); + + for (size_t feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + size_t feat_len = feature[feat_idx][node_idx].size(); + cntl->response_attachment().append(&feat_len, sizeof(size_t)); + cntl->response_attachment().append(feature[feat_idx][node_idx].data(), + feat_len); + } + } + + return 0; +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h new file mode 100644 index 00000000000000..32c572f9e6c2bf --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -0,0 +1,114 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
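#include <cstddef>
#include <vector>

// A sketch of the response-attachment layout produced by
// graph_random_sample_neighboors above: a size_t node count, then one int
// actual_size per node, then each node's packed neighbor records (each record
// is a node id followed by a float weight, i.e. GraphNode::id_size +
// GraphNode::weight_size bytes). The struct and function names here are
// illustrative only; they are not part of the service code.
struct SampledNeighborsView {
  size_t node_num = 0;
  std::vector<int> actual_sizes;          // bytes of neighbor data per node
  std::vector<const char*> node_buffers;  // start of each node's records
};

inline SampledNeighborsView DecodeSampleResponse(const char* buffer) {
  SampledNeighborsView out;
  out.node_num = *reinterpret_cast<const size_t*>(buffer);
  const int* sizes = reinterpret_cast<const int*>(buffer + sizeof(size_t));
  out.actual_sizes.assign(sizes, sizes + out.node_num);
  const char* cursor = buffer + sizeof(size_t) + sizeof(int) * out.node_num;
  for (size_t i = 0; i < out.node_num; ++i) {
    out.node_buffers.push_back(cursor);
    cursor += out.actual_sizes[i];
  }
  return out;
}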
+ +#pragma once + +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" + +#include +#include +#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/server.h" +#include "paddle/fluid/distributed/table/common_graph_table.h" +#include "paddle/fluid/distributed/table/table.h" +namespace paddle { +namespace distributed { +class GraphBrpcServer : public PSServer { + public: + GraphBrpcServer() {} + virtual ~GraphBrpcServer() {} + PsBaseService *get_service() { return _service.get(); } + virtual uint64_t start(const std::string &ip, uint32_t port); + virtual int32_t stop() { + std::unique_lock lock(mutex_); + if (stoped_) return 0; + stoped_ = true; + // cv_.notify_all(); + _server.Stop(1000); + _server.Join(); + return 0; + } + virtual int32_t port(); + + std::condition_variable *export_cv() { return &cv_; } + + private: + virtual int32_t initialize(); + mutable std::mutex mutex_; + std::condition_variable cv_; + bool stoped_ = false; + brpc::Server _server; + std::shared_ptr _service; + std::vector> _pserver_channels; +}; + +class GraphBrpcService; + +typedef int32_t (GraphBrpcService::*serviceFunc)( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl); + +class GraphBrpcService : public PsBaseService { + public: + virtual int32_t initialize() override; + + virtual void service(::google::protobuf::RpcController *controller, + const PsRequestMessage *request, + PsResponseMessage *response, + ::google::protobuf::Closure *done) override; + + protected: + std::unordered_map _service_handler_map; + int32_t initialize_shard_info(); + int32_t pull_graph_list(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t graph_random_sample_neighboors(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t graph_random_sample_nodes(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t graph_get_node_feat(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t barrier(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t load_one_table(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t load_all_table(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t stop_server(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t start_profiler(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t stop_profiler(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + + int32_t print_table_stat(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + + private: + bool _is_initialize_shard_info; + std::mutex _initialize_shard_mutex; + std::unordered_map _msg_handler_map; + std::vector _ori_values; + const int sample_nodes_ranges = 23; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/service/graph_py_service.cc new file mode 100644 index 00000000000000..61e4e0cf7bb915 --- 
/dev/null +++ b/paddle/fluid/distributed/service/graph_py_service.cc @@ -0,0 +1,325 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/graph_py_service.h" +#include // NOLINT +#include "butil/endpoint.h" +#include "iomanip" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace distributed { +std::vector GraphPyService::split(std::string& str, + const char pattern) { + std::vector res; + std::stringstream input(str); + std::string temp; + while (std::getline(input, temp, pattern)) { + res.push_back(temp); + } + return res; +} + +void GraphPyService::add_table_feat_conf(std::string table_name, + std::string feat_name, + std::string feat_dtype, + int32_t feat_shape) { + if (this->table_id_map.count(table_name)) { + this->table_feat_conf_table_name.push_back(table_name); + this->table_feat_conf_feat_name.push_back(feat_name); + this->table_feat_conf_feat_dtype.push_back(feat_dtype); + this->table_feat_conf_feat_shape.push_back(feat_shape); + } +} + +void GraphPyService::set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types) { + set_shard_num(shard_num); + set_num_node_types(node_types.size()); + + for (size_t table_id = 0; table_id < node_types.size(); table_id++) { + this->table_id_map[node_types[table_id]] = this->table_id_map.size(); + } + for (size_t table_id = 0; table_id < edge_types.size(); table_id++) { + this->table_id_map[edge_types[table_id]] = this->table_id_map.size(); + } + std::istringstream stream(ips_str); + std::string ip; + server_size = 0; + std::vector ips_list = split(ips_str, ';'); + int index = 0; + for (auto ips : ips_list) { + auto ip_and_port = split(ips, ':'); + server_list.push_back(ip_and_port[0]); + port_list.push_back(ip_and_port[1]); + uint32_t port = stoul(ip_and_port[1]); + auto ph_host = paddle::distributed::PSHost(ip_and_port[0], port, index); + host_sign_list.push_back(ph_host.serialize_to_string()); + index++; + } +} +void GraphPyClient::start_client() { + std::map> dense_regions; + dense_regions.insert( + std::pair>(0, {})); + auto regions = dense_regions[0]; + ::paddle::distributed::PSParameter worker_proto = GetWorkerProto(); + paddle::distributed::PaddlePSEnvironment _ps_env; + auto servers_ = host_sign_list.size(); + _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list, servers_); + worker_ptr = std::shared_ptr( + (paddle::distributed::GraphBrpcClient*) + paddle::distributed::PSClientFactory::create(worker_proto)); + worker_ptr->configure(worker_proto, dense_regions, _ps_env, client_id); + worker_ptr->set_shard_num(get_shard_num()); +} +void GraphPyServer::start_server(bool block) { + std::string ip = server_list[rank]; + uint32_t port = std::stoul(port_list[rank]); + ::paddle::distributed::PSParameter server_proto = 
this->GetServerProto(); + + auto _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&this->host_sign_list, + this->host_sign_list.size()); // test + pserver_ptr = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto)); + VLOG(0) << "pserver-ptr created "; + std::vector empty_vec; + framework::ProgramDesc empty_prog; + empty_vec.push_back(empty_prog); + pserver_ptr->configure(server_proto, _ps_env, rank, empty_vec); + pserver_ptr->start(ip, port); + std::condition_variable* cv_ = pserver_ptr->export_cv(); + if (block) { + std::mutex mutex_; + std::unique_lock lock(mutex_); + cv_->wait(lock); + } +} +::paddle::distributed::PSParameter GraphPyServer::GetServerProto() { + // Generate server proto desc + ::paddle::distributed::PSParameter server_fleet_desc; + ::paddle::distributed::ServerParameter* server_proto = + server_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + GetDownpourSparseTableProto(sparse_table_proto, tuple.second, tuple.first, + table_type, feat_name, feat_dtype, feat_shape); + } + + return server_fleet_desc; +} + +::paddle::distributed::PSParameter GraphPyClient::GetWorkerProto() { + ::paddle::distributed::PSParameter worker_fleet_desc; + ::paddle::distributed::WorkerParameter* worker_proto = + worker_fleet_desc.mutable_worker_param(); + + ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto = + worker_proto->mutable_downpour_worker_param(); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* worker_sparse_table_proto = + downpour_worker_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + GetDownpourSparseTableProto(worker_sparse_table_proto, tuple.second, + tuple.first, table_type, 
feat_name, feat_dtype, + feat_shape); + } + + ::paddle::distributed::ServerParameter* server_proto = + worker_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + GetDownpourSparseTableProto(sparse_table_proto, tuple.second, tuple.first, + table_type, feat_name, feat_dtype, feat_shape); + } + + return worker_fleet_desc; +} +void GraphPyClient::load_edge_file(std::string name, std::string filepath, + bool reverse) { + // 'e' means load edge + std::string params = "e"; + if (reverse) { + // 'e<' means load edges from $2 to $1 + params += "<"; + } else { + // 'e>' means load edges from $1 to $2 + params += ">"; + } + if (this->table_id_map.count(name)) { + VLOG(0) << "loadding data with type " << name << " from " << filepath; + uint32_t table_id = this->table_id_map[name]; + auto status = + get_ps_client()->load(table_id, std::string(filepath), params); + status.wait(); + } +} + +void GraphPyClient::load_node_file(std::string name, std::string filepath) { + // 'n' means load nodes and 'node_type' follows + std::string params = "n" + name; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + get_ps_client()->load(table_id, std::string(filepath), params); + status.wait(); + } +} +std::vector>> +GraphPyClient::batch_sample_neighboors(std::string name, + std::vector node_ids, + int sample_size) { + std::vector>> v; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + worker_ptr->batch_sample_neighboors(table_id, node_ids, sample_size, v); + status.wait(); + } + return v; +} + +std::vector GraphPyClient::random_sample_nodes(std::string name, + int server_index, + int sample_size) { + std::vector v; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + worker_ptr->random_sample_nodes(table_id, server_index, sample_size, v); + status.wait(); + } + return v; +} + +// (name, dtype, ndarray) +std::vector> GraphPyClient::get_node_feat( + std::string node_type, std::vector node_ids, + std::vector feature_names) { + std::vector> v( + feature_names.size(), std::vector(node_ids.size())); + if (this->table_id_map.count(node_type)) { + uint32_t table_id = this->table_id_map[node_type]; + auto status = + worker_ptr->get_node_feat(table_id, 
node_ids, feature_names, v); + status.wait(); + } + return v; +} + +std::vector GraphPyClient::pull_graph_list(std::string name, + int server_index, + int start, int size, + int step) { + std::vector res; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = worker_ptr->pull_graph_list(table_id, server_index, start, + size, step, res); + status.wait(); + } + return res; +} + +void GraphPyClient::stop_server() { + VLOG(0) << "going to stop server"; + std::unique_lock lock(mutex_); + if (stoped_) return; + auto status = this->worker_ptr->stop_server(); + if (status.get() == 0) stoped_ = true; +} +void GraphPyClient::finalize_worker() { this->worker_ptr->finalize_worker(); } +} +} diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h new file mode 100644 index 00000000000000..e185f23e3d240f --- /dev/null +++ b/paddle/fluid/distributed/service/graph_py_service.h @@ -0,0 +1,178 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" + +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" +namespace paddle { +namespace distributed { +class GraphPyService { + protected: + std::vector server_list, port_list, host_sign_list; + int server_size, shard_num; + int num_node_types; + std::unordered_map table_id_map; + std::vector table_feat_conf_table_name; + std::vector table_feat_conf_feat_name; + std::vector table_feat_conf_feat_dtype; + std::vector table_feat_conf_feat_shape; + + // std::thread *server_thread, *client_thread; + + // std::shared_ptr pserver_ptr; + + // std::shared_ptr worker_ptr; + + public: + // std::shared_ptr get_ps_server() { + // return pserver_ptr; + // } + // std::shared_ptr get_ps_client() { + // return worker_ptr; + // } + int get_shard_num() { return shard_num; } + void set_shard_num(int shard_num) { this->shard_num = shard_num; } + void GetDownpourSparseTableProto( + ::paddle::distributed::TableParameter* sparse_table_proto, + uint32_t table_id, std::string table_name, std::string table_type, + std::vector feat_name, std::vector feat_dtype, + 
std::vector feat_shape) { + sparse_table_proto->set_table_id(table_id); + sparse_table_proto->set_table_class("GraphTable"); + sparse_table_proto->set_shard_num(shard_num); + sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE); + ::paddle::distributed::TableAccessorParameter* accessor_proto = + sparse_table_proto->mutable_accessor(); + + ::paddle::distributed::CommonAccessorParameter* common_proto = + sparse_table_proto->mutable_common(); + + // Set GraphTable Parameter + common_proto->set_table_name(table_name); + common_proto->set_name(table_type); + for (size_t i = 0; i < feat_name.size(); i++) { + common_proto->add_params(feat_dtype[i]); + common_proto->add_dims(feat_shape[i]); + common_proto->add_attributes(feat_name[i]); + } + + accessor_proto->set_accessor_class("CommMergeAccessor"); + } + + void set_server_size(int server_size) { this->server_size = server_size; } + void set_num_node_types(int num_node_types) { + this->num_node_types = num_node_types; + } + int get_server_size(int server_size) { return server_size; } + std::vector split(std::string& str, const char pattern); + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types); + + void add_table_feat_conf(std::string node_type, std::string feat_name, + std::string feat_dtype, int32_t feat_shape); +}; +class GraphPyServer : public GraphPyService { + public: + GraphPyServer() {} + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types, int rank) { + set_rank(rank); + GraphPyService::set_up(ips_str, shard_num, node_types, edge_types); + } + int get_rank() { return rank; } + void set_rank(int rank) { this->rank = rank; } + + void start_server(bool block = true); + ::paddle::distributed::PSParameter GetServerProto(); + std::shared_ptr get_ps_server() { + return pserver_ptr; + } + + protected: + int rank; + std::shared_ptr pserver_ptr; + std::thread* server_thread; +}; +class GraphPyClient : public GraphPyService { + public: + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types, int client_id) { + set_client_id(client_id); + GraphPyService::set_up(ips_str, shard_num, node_types, edge_types); + } + std::shared_ptr get_ps_client() { + return worker_ptr; + } + void bind_local_server(int local_channel_index, GraphPyServer& server) { + worker_ptr->set_local_channel(local_channel_index); + worker_ptr->set_local_graph_service( + (paddle::distributed::GraphBrpcService*)server.get_ps_server() + ->get_service()); + } + void stop_server(); + void finalize_worker(); + void load_edge_file(std::string name, std::string filepath, bool reverse); + void load_node_file(std::string name, std::string filepath); + int get_client_id() { return client_id; } + void set_client_id(int client_id) { this->client_id = client_id; } + void start_client(); + std::vector>> batch_sample_neighboors( + std::string name, std::vector node_ids, int sample_size); + std::vector random_sample_nodes(std::string name, int server_index, + int sample_size); + std::vector> get_node_feat( + std::string node_type, std::vector node_ids, + std::vector feature_names); + std::vector pull_graph_list(std::string name, int server_index, + int start, int size, int step = 1); + ::paddle::distributed::PSParameter GetWorkerProto(); + + protected: + mutable std::mutex mutex_; + int client_id; + std::shared_ptr worker_ptr; + std::thread* client_thread; + bool stoped_ = false; +}; +} +} diff --git a/paddle/fluid/distributed/service/ps_client.cc 
b/paddle/fluid/distributed/service/ps_client.cc index 095b5dee0b28e4..d45f41a0f58de3 100644 --- a/paddle/fluid/distributed/service/ps_client.cc +++ b/paddle/fluid/distributed/service/ps_client.cc @@ -15,11 +15,15 @@ #include "paddle/fluid/distributed/service/ps_client.h" #include "glog/logging.h" #include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/ps_local_client.h" #include "paddle/fluid/distributed/table/table.h" namespace paddle { namespace distributed { REGISTER_PSCORE_CLASS(PSClient, BrpcPsClient); +REGISTER_PSCORE_CLASS(PSClient, PsLocalClient); +REGISTER_PSCORE_CLASS(PSClient, GraphBrpcClient); int32_t PSClient::configure( const PSParameter &config, @@ -78,8 +82,7 @@ PSClient *PSClientFactory::create(const PSParameter &ps_config) { } TableManager::instance().initialize(); - LOG(INFO) << "Create PSClient[" << service_param.client_class() - << "] success"; + VLOG(3) << "Create PSClient[" << service_param.client_class() << "] success"; return client; } } // namespace distributed diff --git a/paddle/fluid/distributed/service/ps_client.h b/paddle/fluid/distributed/service/ps_client.h index 50f5802c63a253..74a1e0dde71fc4 100644 --- a/paddle/fluid/distributed/service/ps_client.h +++ b/paddle/fluid/distributed/service/ps_client.h @@ -24,16 +24,11 @@ #include "paddle/fluid/distributed/service/env.h" #include "paddle/fluid/distributed/service/sendrecv.pb.h" #include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" namespace paddle { namespace distributed { -class PSEnvironment; -class PsRequestMessage; -class PsResponseMessage; -class ValueAccessor; -struct Region; - using paddle::distributed::PsRequestMessage; using paddle::distributed::PsResponseMessage; @@ -117,10 +112,22 @@ class PSClient { // The keys/values buffers must not be reused before the returned future completes. // Keys requested by multiple threads are merged, then gathered and scattered to the servers. // After the results come back, traverse the buffer and assign the values. + // is_training distinguishes training requests from inference requests; the server handles features and admission differently for each. virtual std::future<int32_t> pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, - size_t num) = 0; + const uint64_t *keys, size_t num, + bool is_training) = 0; + + virtual ::std::future<int32_t> pull_sparse_ptr(char **select_values, + size_t table_id, + const uint64_t *keys, + size_t num) { + VLOG(0) << "Did not implement"; + std::promise<int32_t> promise; + std::future<int32_t> fut = promise.get_future(); + promise.set_value(-1); + return fut; + } virtual std::future<int32_t> print_table_stat(uint32_t table_id) = 0; @@ -154,12 +161,13 @@ class PSClient { virtual std::future<int32_t> send_client2client_msg(int msg_type, int to_client_id, const std::string &msg) { - LOG(FATAL) << "Did not implement"; + VLOG(0) << "Did not implement"; std::promise<int32_t> promise; std::future<int32_t> fut = promise.get_future(); promise.set_value(-1); return fut; } + // client2client message handler, std::function ret (msg_type, from_client_id, msg) typedef std::function<int32_t(int, int, const std::string &)> MsgHandlerFunc; diff --git a/paddle/fluid/distributed/service/ps_local_client.cc b/paddle/fluid/distributed/service/ps_local_client.cc new file mode 100644 index 00000000000000..2acc845a50890b --- /dev/null +++ b/paddle/fluid/distributed/service/ps_local_client.cc @@ -0,0 +1,269 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/ps_local_client.h" +#include "paddle/fluid/distributed/table/table.h" + +//#define pslib_debug_dense_compress + +namespace paddle { +namespace distributed { +int32_t PsLocalClient::initialize() { + const auto& downpour_param = _config.server_param().downpour_server_param(); + TableManager::instance().initialize(); + for (size_t i = 0; i < downpour_param.downpour_table_param_size(); ++i) { + auto* table = CREATE_PSCORE_CLASS( + Table, downpour_param.downpour_table_param(i).table_class()); + table->initialize(downpour_param.downpour_table_param(i), + _config.fs_client_param()); + table->set_shard(0, 1); + _table_map[downpour_param.downpour_table_param(i).table_id()].reset(table); + } + return 0; +} + +::std::future PsLocalClient::shrink(uint32_t table_id, + const std::string threshold) { + // TODO + return done(); +} + +::std::future PsLocalClient::load(const std::string& epoch, + const std::string& mode) { + // TODO + // for (auto& it : _table_map) { + // load(it.first, epoch, mode); + //} + return done(); +} +::std::future PsLocalClient::load(uint32_t table_id, + const std::string& epoch, + const std::string& mode) { + // TODO + // auto* table_ptr = table(table_id); + // table_ptr->load(epoch, mode); + return done(); +} + +::std::future PsLocalClient::save(const std::string& epoch, + const std::string& mode) { + // TODO + for (auto& it : _table_map) { + save(it.first, epoch, mode); + } + return done(); +} +::std::future PsLocalClient::save(uint32_t table_id, + const std::string& epoch, + const std::string& mode) { + // TODO + auto* table_ptr = table(table_id); + table_ptr->flush(); + table_ptr->save(epoch, mode); + return done(); +} + +::std::future PsLocalClient::clear() { + // TODO + return done(); +} +::std::future PsLocalClient::clear(uint32_t table_id) { + // TODO + return done(); +} + +::std::future PsLocalClient::flush() { + // no need + return done(); +} + +::std::future PsLocalClient::stop_server() { + // no need + return done(); +} + +::std::future PsLocalClient::pull_dense(Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + uint32_t num_per_shard = dense_dim_per_shard(accessor->fea_dim(), 1); + std::vector region_buffer; + region_buffer.resize(num_per_shard); + table_ptr->pull_dense(region_buffer.data(), region_buffer.size()); + + size_t region_idx = 0; + size_t region_data_idx = 0; + size_t shard_data_size = num_per_shard; + size_t shard_buffer_remain = shard_data_size * sizeof(float); + PADDLE_ENFORCE_EQ( + shard_buffer_remain, region_buffer.size() * sizeof(float), + platform::errors::PreconditionNotMet("pull dense size error.")); + size_t index = 0; + while (shard_buffer_remain > 0 && region_idx < region_num) { + auto& region = regions[region_idx]; + if (region.size - region_data_idx >= shard_buffer_remain) { + memcpy((void*)(region.data + region_data_idx), + (uint8_t*)(void*)(region_buffer.data()) + index, + shard_buffer_remain); + region_data_idx += shard_buffer_remain; + shard_buffer_remain = 0; + } else if (region.size 
- region_data_idx == 0) { + ++region_idx; + region_data_idx = 0; + } else { + memcpy((void*)(region.data + region_data_idx), + (uint8_t*)(void*)(region_buffer.data()) + index, + region.size - region_data_idx); + shard_buffer_remain -= (region.size - region_data_idx); + index += (region.size - region_data_idx); + ++region_idx; + region_data_idx = 0; + } + } + + return done(); +} + +::std::future PsLocalClient::push_dense_param(const Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + std::vector region_buffer; + region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1), 0); + for (size_t i = 0, offset = 0; i < region_num; ++i) { + uint32_t data_num = regions[i].size / sizeof(float); + memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size); + offset += data_num; + } + + // table_ptr->push_dense_param(region_buffer.data(), region_buffer.size()); + + return done(); +} + +::std::future PsLocalClient::push_dense_raw_gradient( + int table_id, float* total_send_data, size_t total_send_data_size, + void* callback) { + VLOG(1) << "wxx push_dense_raw_gradient"; + + PSClientClosure* closure = reinterpret_cast(callback); + + auto* table_ptr = table(table_id); + + table_ptr->push_dense(total_send_data, total_send_data_size); + delete closure; + return done(); +} + +::std::future PsLocalClient::push_dense(const Region* regions, + size_t region_num, + size_t table_id) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + std::vector region_buffer; + region_buffer.resize(dense_dim_per_shard(accessor->fea_dim(), 1)); + size_t data_size = region_buffer.size(); + for (size_t i = 0, offset = 0; i < region_num; ++i) { + uint32_t data_num = regions[i].size / sizeof(float); + PADDLE_ENFORCE_LE( + offset + data_num, data_size, + platform::errors::PreconditionNotMet( + "invalid dense size, cur pos[%d] data_num[%d] size[%d]", offset, + data_num, data_size)); + memcpy(region_buffer.data() + offset, regions[i].data, regions[i].size); + offset += data_num; + } + + table_ptr->push_dense(region_buffer.data(), region_buffer.size()); + + return done(); +} + +//::std::future PsLocalClient::pull_sparse(float** select_values, +// size_t table_id, +// const uint64_t* keys, +// size_t num) { +// // FIXME +// // auto timer = +// // std::make_shared("pslib_downpour_client_pull_sparse"); +// // auto local_timer = +// // std::make_shared("pslib_downpour_client_pull_sparse_local"); +// //将key拆分到各shard请求,并记录原始对应value指针 +// auto* accessor = table_accessor(table_id); +// auto* table_ptr = table(table_id); +// size_t value_size = accessor->select_size(); +// +// // table_ptr->pull_sparse(keys, num); +// std::vector res_data; +// res_data.resize(num * value_size / sizeof(float)); +// table_ptr->pull_sparse(res_data.data(), keys, num); +// // memcpy(select_values[0], res_data->data(), res_data->size() * +// // sizeof(float)); +// size_t offset = 0; +// for (int i = 0; i < num; ++i) { +// memcpy(select_values[i], (char*)res_data.data() + offset, value_size); +// offset += value_size; +// } +// +// // return fut; +// return done(); +//} + +::std::future PsLocalClient::pull_sparse_ptr(char** select_values, + size_t table_id, + const uint64_t* keys, + size_t num) { + // FIXME + // auto timer = + // std::make_shared("pslib_downpour_client_pull_sparse"); + // auto local_timer = + // std::make_shared("pslib_downpour_client_pull_sparse_local"); + //将key拆分到各shard请求,并记录原始对应value指针 + auto* table_ptr = 
table(table_id); + + table_ptr->pull_sparse_ptr(select_values, keys, num); + + return done(); +} + +::std::future PsLocalClient::push_sparse_raw_gradient( + size_t table_id, const uint64_t* keys, const float** update_values, + size_t num, void* callback) { + VLOG(1) << "wxx push_sparse_raw_gradient"; + PSClientClosure* closure = reinterpret_cast(callback); + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + table_ptr->push_sparse(keys, update_values, num); + delete closure; + return done(); +} + +::std::future PsLocalClient::push_sparse(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num) { + auto* accessor = table_accessor(table_id); + auto* table_ptr = table(table_id); + + table_ptr->push_sparse(keys, update_values, num); + return done(); +} +} +} diff --git a/paddle/fluid/distributed/service/ps_local_client.h b/paddle/fluid/distributed/service/ps_local_client.h new file mode 100644 index 00000000000000..9d2b01a45fe929 --- /dev/null +++ b/paddle/fluid/distributed/service/ps_local_client.h @@ -0,0 +1,226 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License 0// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" +#include "paddle/fluid/distributed/service/ps_client.h" + +namespace paddle { +namespace distributed { + +class Table; + +class PsLocalClient : public PSClient { + public: + PsLocalClient() {} + virtual ~PsLocalClient() { _running = false; } + virtual int32_t create_client2client_connection(int pslib_timeout_ms, + int pslib_connect_timeout_ms, + int max_retry) { + return 0; + } + + virtual ::std::future shrink(uint32_t table_id, + const std::string threshold) override; + virtual ::std::future load(const std::string& epoch, + const std::string& mode) override; + virtual ::std::future load(uint32_t table_id, + const std::string& epoch, + const std::string& mode) override; + + virtual ::std::future save(const std::string& epoch, + const std::string& mode) override; + virtual ::std::future save(uint32_t table_id, + const std::string& epoch, + const std::string& mode) override; + + virtual ::std::future clear() override; + virtual ::std::future clear(uint32_t table_id) override; + + virtual ::std::future stop_server() override; + + virtual void finalize_worker() override {} + virtual ::std::future pull_dense(Region* regions, size_t region_num, + size_t table_id); + + virtual ::std::future push_dense(const Region* regions, + size_t region_num, size_t table_id); + + virtual ::std::future push_dense_param(const Region* regions, + size_t region_num, + size_t table_id); + + virtual ::std::future pull_sparse(float** select_values, + size_t table_id, + const uint64_t* keys, size_t num, + bool is_training) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual ::std::future pull_sparse_ptr(char** select_values, + size_t table_id, + const uint64_t* keys, + 
size_t num); + + virtual ::std::future print_table_stat(uint32_t table_id) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + virtual ::std::future push_sparse(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num); + + virtual ::std::future flush(); + // server profilera + virtual std::future start_profiler() { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + }; + + virtual std::future stop_profiler() { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future barrier(size_t table_id, uint32_t barrier_type) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future pull_geo_param(size_t table_id, + std::vector* values, + std::vector* keys, + int pserver_idx) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future push_global_step(int table_id, + int64_t* total_send_data, + void* done) { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + // recv table from server and save it in LodTensor + virtual int32_t recv_and_save_table(const uint64_t table_id, + const std::string& path) { + return 0; + } + + virtual ::std::future send_client2client_msg( + int msg_type, int to_client_id, const std::string& msg) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + virtual size_t get_server_nums() { return 1; } + + virtual std::future push_dense_raw_gradient( + int table_id, float* total_send_data, size_t total_send_data_size, + void* callback) override; + + virtual std::future push_sparse_raw_gradient( + size_t table_id, const uint64_t* keys, const float** update_values, + size_t num, void* callback) override; + + virtual std::future push_sparse_raw_gradient_partial( + size_t table_id, const uint64_t* keys, const float** update_values, + uint32_t num, void* done, int pserver_idx) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + virtual std::future push_sparse_param(size_t table_id, + const uint64_t* keys, + const float** update_values, + size_t num, + void* done) override { + std::promise prom; + std::future fut = prom.get_future(); + prom.set_value(0); + + return fut; + } + + private: + virtual int32_t initialize() override; + + std::future done() { + std::shared_ptr> prom = + std::make_shared>(); + std::future fut = prom->get_future(); + prom->set_value(0); + return fut; + } + + inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total, + uint32_t shard_num) { + return dense_dim_total / shard_num + 1; + } + + inline std::unordered_map>* table() { + return &_table_map; + } + + inline Table* table(size_t table_id) { + auto itr = _table_map.find(table_id); + if (itr != _table_map.end()) { + return itr->second.get(); + } + LOG(ERROR) << "table not found " << table_id; + return NULL; + } + + std::unordered_map> _table_map; + + bool _running = false; + bool _flushing = false; + + private: + float _mae = 0; + float _mse = 0; + uint16_t _push_times = 0; +}; +} +} diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/distributed/service/ps_local_server.h similarity index 56% rename from paddle/fluid/operators/distributed/parameter_send.h rename to 
paddle/fluid/distributed/service/ps_local_server.h index 4335ef8c73cc0a..dfbccc70900e3c 100644 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ b/paddle/fluid/distributed/service/ps_local_server.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,22 +14,24 @@ #pragma once -#include +#include #include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" +#include "paddle/fluid/distributed/service/server.h" namespace paddle { -namespace operators { namespace distributed { -template -struct ParameterSend { - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope, - bool sync, int multi_parts); -}; +class PsLocalServer : public PSServer { + public: + PsLocalServer() {} + virtual ~PsLocalServer() {} + virtual uint64_t start() { return 0; } + virtual uint64_t start(const std::string& ip, uint32_t port) { return 0; } + virtual int32_t stop() { return 0; } + virtual int32_t port() { return 0; } -}; // namespace distributed -}; // namespace operators -}; // namespace paddle + private: + virtual int32_t initialize() { return 0; } +}; +} +} diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index 6250f84c98754d..d908c26da9870a 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -48,6 +48,10 @@ enum PsCmdID { PS_START_PROFILER = 27; PS_STOP_PROFILER = 28; PS_PUSH_GLOBAL_STEP = 29; + PS_PULL_GRAPH_LIST = 30; + PS_GRAPH_SAMPLE_NEIGHBOORS = 31; + PS_GRAPH_SAMPLE_NODES = 32; + PS_GRAPH_GET_NODE_FEAT = 33; } message PsRequestMessage { @@ -111,4 +115,4 @@ message MultiVariableMessage { service PsService { rpc service(PsRequestMessage) returns (PsResponseMessage); rpc SendAndRecvVariable(MultiVariableMessage) returns (MultiVariableMessage); -}; \ No newline at end of file +}; diff --git a/paddle/fluid/distributed/service/server.cc b/paddle/fluid/distributed/service/server.cc index fc230a0b9c92e6..e44876e3d2b789 100644 --- a/paddle/fluid/distributed/service/server.cc +++ b/paddle/fluid/distributed/service/server.cc @@ -16,13 +16,18 @@ #include "glog/logging.h" #include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/ps_local_server.h" #include "paddle/fluid/distributed/table/table.h" namespace paddle { namespace distributed { REGISTER_PSCORE_CLASS(PSServer, BrpcPsServer); +REGISTER_PSCORE_CLASS(PSServer, PsLocalServer); REGISTER_PSCORE_CLASS(PsBaseService, BrpcPsService); +REGISTER_PSCORE_CLASS(PSServer, GraphBrpcServer); +REGISTER_PSCORE_CLASS(PsBaseService, GraphBrpcService); PSServer *PSServerFactory::create(const PSParameter &ps_config) { const auto &config = ps_config.server_param(); diff --git a/paddle/fluid/distributed/service/service.cc b/paddle/fluid/distributed/service/service.cc index 3d0f94fac27750..2759e4614e66e1 100644 --- a/paddle/fluid/distributed/service/service.cc +++ b/paddle/fluid/distributed/service/service.cc @@ -47,7 +47,7 @@ paddle::distributed::PSParameter load_from_prototxt( } void PSCore::init_gflag(const std::string& gflags) { - LOG(INFO) << "Init With Gflags:" << gflags; + VLOG(3) << "Init With Gflags:" << gflags; 
std::vector flags = paddle::string::split_string(gflags); if (flags.size() < 1) { flags.push_back("-max_body_size=314217728"); diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index 1e98e193d54ae6..dde1f5ae8ee3a1 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -1,13 +1,19 @@ set_property(GLOBAL PROPERTY TABLE_DEPS string_helper) - +set(graphDir graph) get_property(TABLE_DEPS GLOBAL PROPERTY TABLE_DEPS) - +set_source_files_properties(${graphDir}/graph_edge.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_edge SRCS ${graphDir}/graph_edge.cc) +set_source_files_properties(${graphDir}/graph_weighted_sampler.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(WeightedSampler SRCS ${graphDir}/graph_weighted_sampler.cc DEPS graph_edge) +set_source_files_properties(${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_node SRCS ${graphDir}/graph_node.cc DEPS WeightedSampler) set_source_files_properties(common_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(common_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc DEPS ${TABLE_DEPS} device_context string_helper simple_threadpool xxhash generator) +cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator) set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc new file mode 100644 index 00000000000000..020bcdcc52ef4b --- /dev/null +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -0,0 +1,506 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/distributed/table/common_graph_table.h" +#include +#include +#include +#include +#include "paddle/fluid/distributed/common/utils.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { + +std::vector GraphShard::get_batch(int start, int end, int step) { + if (start < 0) start = 0; + std::vector res; + for (int pos = start; pos < std::min(end, (int)bucket.size()); pos += step) { + res.push_back(bucket[pos]); + } + return res; +} + +size_t GraphShard::get_size() { return bucket.size(); } + +GraphNode *GraphShard::add_graph_node(uint64_t id) { + if (node_location.find(id) == node_location.end()) { + node_location[id] = bucket.size(); + bucket.push_back(new GraphNode(id)); + } + return (GraphNode *)bucket[node_location[id]]; +} + +FeatureNode *GraphShard::add_feature_node(uint64_t id) { + if (node_location.find(id) == node_location.end()) { + node_location[id] = bucket.size(); + bucket.push_back(new FeatureNode(id)); + } + return (FeatureNode *)bucket[node_location[id]]; +} + +void GraphShard::add_neighboor(uint64_t id, uint64_t dst_id, float weight) { + find_node(id)->add_edge(dst_id, weight); +} + +Node *GraphShard::find_node(uint64_t id) { + auto iter = node_location.find(id); + return iter == node_location.end() ? nullptr : bucket[iter->second]; +} + +int32_t GraphTable::load(const std::string &path, const std::string ¶m) { + bool load_edge = (param[0] == 'e'); + bool load_node = (param[0] == 'n'); + if (load_edge) { + bool reverse_edge = (param[1] == '<'); + return this->load_edges(path, reverse_edge); + } + if (load_node) { + std::string node_type = param.substr(1); + return this->load_nodes(path, node_type); + } + return 0; +} + +int32_t GraphTable::get_nodes_ids_by_ranges( + std::vector> ranges, std::vector &res) { + int start = 0, end, index = 0, total_size = 0; + res.clear(); + std::vector>> tasks; + // std::string temp = ""; + // for(int i = 0;i < shards.size();i++) + // temp+= std::to_string((int)shards[i].get_size()) + " "; + // VLOG(0)<<"range distribution "<= end) { + break; + } else { + int first = std::max(ranges[index].first, start); + int second = std::min(ranges[index].second, end); + start = second; + first -= total_size; + second -= total_size; + // VLOG(0)<<" FIND RANGE "<enqueue( + [this, first, second, i]() -> std::vector { + return shards[i].get_ids_by_range(first, second); + })); + } + } + total_size += shards[i].get_size(); + } + for (int i = 0; i < tasks.size(); i++) { + auto vec = tasks[i].get(); + for (auto &id : vec) { + res.push_back(id); + std::swap(res[rand() % res.size()], res[(int)res.size() - 1]); + } + } + return 0; +} + +int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { + auto paths = paddle::string::split_string(path, ";"); + int64_t count = 0; + int64_t valid_count = 0; + for (auto path : paths) { + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + count++; + auto values = paddle::string::split_string(line, "\t"); + if (values.size() < 2) continue; + auto id = std::stoull(values[1]); + + size_t shard_id = id % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) { + VLOG(4) << "will not load " << id << " from " << path + << ", please check id distribution"; + continue; + } + + if (count % 1000000 == 0) { + VLOG(0) << count << " nodes are loaded from filepath"; + } + + std::string nt = values[0]; + if (nt != 
node_type) { + continue; + } + + size_t index = shard_id - shard_start; + + auto node = shards[index].add_feature_node(id); + + node->set_feature_size(feat_name.size()); + + for (size_t slice = 2; slice < values.size(); slice++) { + auto feat = this->parse_feature(values[slice]); + if (feat.first >= 0) { + node->set_feature(feat.first, feat.second); + } else { + VLOG(4) << "Node feature: " << values[slice] + << " not in feature_map."; + } + } + valid_count++; + } + } + + VLOG(0) << valid_count << "/" << count << " nodes in type " << node_type + << " are loaded successfully in " << path; + return 0; +} + +int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { + auto paths = paddle::string::split_string(path, ";"); + int count = 0; + std::string sample_type = "random"; + bool is_weighted = false; + int valid_count = 0; + + for (auto path : paths) { + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + count++; + if (values.size() < 2) continue; + auto src_id = std::stoull(values[0]); + auto dst_id = std::stoull(values[1]); + if (reverse_edge) { + std::swap(src_id, dst_id); + } + float weight = 1; + if (values.size() == 3) { + weight = std::stof(values[2]); + sample_type = "weighted"; + is_weighted = true; + } + + size_t src_shard_id = src_id % shard_num; + + if (src_shard_id >= shard_end || src_shard_id < shard_start) { + VLOG(4) << "will not load " << src_id << " from " << path + << ", please check id distribution"; + continue; + } + if (count % 1000000 == 0) { + VLOG(0) << count << " edges are loaded from filepath"; + } + + size_t index = src_shard_id - shard_start; + shards[index].add_graph_node(src_id)->build_edges(is_weighted); + shards[index].add_neighboor(src_id, dst_id, weight); + valid_count++; + } + } + VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in " + << path; + + // Build Sampler j + + for (auto &shard : shards) { + auto bucket = shard.get_bucket(); + for (int i = 0; i < bucket.size(); i++) { + bucket[i]->build_sampler(sample_type); + } + } + return 0; +} + +Node *GraphTable::find_node(uint64_t id) { + size_t shard_id = id % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) { + return nullptr; + } + size_t index = shard_id - shard_start; + Node *node = shards[index].find_node(id); + return node; +} +uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { + return node_id % shard_num % shard_num_per_table % task_pool_size_; +} +int32_t GraphTable::random_sample_nodes(int sample_size, + std::unique_ptr &buffer, + int &actual_size) { + bool need_feature = false; + int total_size = 0; + for (int i = 0; i < shards.size(); i++) { + total_size += shards[i].get_size(); + } + if (sample_size > total_size) sample_size = total_size; + int range_num = random_sample_nodes_ranges; + if (range_num > sample_size) range_num = sample_size; + if (sample_size == 0 || range_num == 0) return 0; + std::vector ranges_len, ranges_pos; + int remain = sample_size, last_pos = -1, num; + std::set separator_set; + for (int i = 0; i < range_num - 1; i++) { + while (separator_set.find(num = rand() % (sample_size - 1)) != + separator_set.end()) + ; + separator_set.insert(num); + } + for (auto p : separator_set) { + ranges_len.push_back(p - last_pos); + last_pos = p; + } + ranges_len.push_back(sample_size - 1 - last_pos); + remain = total_size - sample_size + range_num; + separator_set.clear(); + for (int i = 0; i < range_num; i++) { + while 
(separator_set.find(num = rand() % remain) != separator_set.end()) + ; + separator_set.insert(num); + } + int used = 0, index = 0; + last_pos = -1; + for (auto p : separator_set) { + used += p - last_pos - 1; + last_pos = p; + ranges_pos.push_back(used); + used += ranges_len[index++]; + } + std::vector> first_half, second_half; + int start_index = rand() % total_size; + for (int i = 0; i < ranges_len.size() && i < ranges_pos.size(); i++) { + if (ranges_pos[i] + ranges_len[i] - 1 + start_index < total_size) + first_half.push_back({ranges_pos[i] + start_index, + ranges_pos[i] + ranges_len[i] + start_index}); + else if (ranges_pos[i] + start_index >= total_size) { + second_half.push_back( + {ranges_pos[i] + start_index - total_size, + ranges_pos[i] + ranges_len[i] + start_index - total_size}); + } else { + first_half.push_back({ranges_pos[i] + start_index, total_size}); + second_half.push_back( + {0, ranges_pos[i] + ranges_len[i] + start_index - total_size}); + } + } + for (auto &pair : first_half) second_half.push_back(pair); + std::vector res; + get_nodes_ids_by_ranges(second_half, res); + actual_size = res.size() * sizeof(uint64_t); + buffer.reset(new char[actual_size]); + char *pointer = buffer.get(); + memcpy(pointer, res.data(), actual_size); + return 0; +} +int32_t GraphTable::random_sample_neighboors( + uint64_t *node_ids, int sample_size, + std::vector> &buffers, + std::vector &actual_sizes) { + size_t node_num = buffers.size(); + std::vector> tasks; + for (size_t idx = 0; idx < node_num; ++idx) { + uint64_t &node_id = node_ids[idx]; + std::unique_ptr &buffer = buffers[idx]; + int &actual_size = actual_sizes[idx]; + tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( + [&]() -> int { + Node *node = find_node(node_id); + + if (node == nullptr) { + actual_size = 0; + return 0; + } + std::vector res = node->sample_k(sample_size); + actual_size = res.size() * (Node::id_size + Node::weight_size); + int offset = 0; + uint64_t id; + float weight; + char *buffer_addr = new char[actual_size]; + buffer.reset(buffer_addr); + for (int &x : res) { + id = node->get_neighbor_id(x); + weight = node->get_neighbor_weight(x); + memcpy(buffer_addr + offset, &id, Node::id_size); + offset += Node::id_size; + memcpy(buffer_addr + offset, &weight, Node::weight_size); + offset += Node::weight_size; + } + return 0; + })); + } + for (size_t idx = 0; idx < node_num; ++idx) { + tasks[idx].get(); + } + return 0; +} + +int32_t GraphTable::get_node_feat(const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res) { + size_t node_num = node_ids.size(); + std::vector> tasks; + for (size_t idx = 0; idx < node_num; ++idx) { + uint64_t node_id = node_ids[idx]; + tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( + [&, idx, node_id]() -> int { + Node *node = find_node(node_id); + + if (node == nullptr) { + return 0; + } + for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + const std::string &feature_name = feature_names[feat_idx]; + if (feat_id_map.find(feature_name) != feat_id_map.end()) { + // res[feat_idx][idx] = + // node->get_feature(feat_id_map[feature_name]); + auto feat = node->get_feature(feat_id_map[feature_name]); + res[feat_idx][idx] = feat; + } + } + return 0; + })); + } + for (size_t idx = 0; idx < node_num; ++idx) { + tasks[idx].get(); + } + return 0; +} + +std::pair GraphTable::parse_feature( + std::string feat_str) { + // Return (feat_id, btyes) if name are in this->feat_name, else return (-1, + // "") + auto 
fields = paddle::string::split_string(feat_str, " "); + if (this->feat_id_map.count(fields[0])) { + int32_t id = this->feat_id_map[fields[0]]; + std::string dtype = this->feat_dtype[id]; + int32_t shape = this->feat_shape[id]; + std::vector values(fields.begin() + 1, fields.end()); + if (dtype == "feasign") { + return std::make_pair( + int32_t(id), paddle::string::join_strings(values, ' ')); + } else if (dtype == "string") { + return std::make_pair( + int32_t(id), paddle::string::join_strings(values, ' ')); + } else if (dtype == "float32") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "float64") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "int32") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "int64") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } + } + return std::make_pair(-1, ""); +} + +int32_t GraphTable::pull_graph_list(int start, int total_size, + std::unique_ptr &buffer, + int &actual_size, bool need_feature, + int step) { + if (start < 0) start = 0; + int size = 0, cur_size; + std::vector>> tasks; + for (size_t i = 0; i < shards.size() && total_size > 0; i++) { + cur_size = shards[i].get_size(); + if (size + cur_size <= start) { + size += cur_size; + continue; + } + int count = std::min(1 + (size + cur_size - start - 1) / step, total_size); + int end = start + (count - 1) * step + 1; + tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( + [this, i, start, end, step, size]() -> std::vector { + + return this->shards[i].get_batch(start - size, end - size, step); + })); + start += count * step; + total_size -= count; + size += cur_size; + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + size = 0; + std::vector> res; + for (size_t i = 0; i < tasks.size(); i++) { + res.push_back(tasks[i].get()); + for (size_t j = 0; j < res.back().size(); j++) { + size += res.back()[j]->get_size(need_feature); + } + } + char *buffer_addr = new char[size]; + buffer.reset(buffer_addr); + int index = 0; + for (size_t i = 0; i < res.size(); i++) { + for (size_t j = 0; j < res[i].size(); j++) { + res[i][j]->to_buffer(buffer_addr + index, need_feature); + index += res[i][j]->get_size(need_feature); + } + } + actual_size = size; + return 0; +} +int32_t GraphTable::initialize() { + _shards_task_pool.resize(task_pool_size_); + for (size_t i = 0; i < _shards_task_pool.size(); ++i) { + _shards_task_pool[i].reset(new ::ThreadPool(1)); + } + server_num = _shard_num; + // VLOG(0) << "in init graph table server num = " << server_num; + /* + _shard_num is actually server number here + when a server initialize its tables, it sets tables' _shard_num to server_num, + and _shard_idx to server + rank + */ + auto common = _config.common(); + + this->table_name = common.table_name(); + this->table_type = common.name(); + VLOG(0) << " init graph table type " << this->table_type << " table name " + << this->table_name; + int feat_conf_size = static_cast(common.attributes().size()); + for (int i = 0; i < feat_conf_size; i++) { + auto &f_name = common.attributes()[i]; + auto &f_shape = common.dims()[i]; + auto &f_dtype = common.params()[i]; + this->feat_name.push_back(f_name); + this->feat_shape.push_back(f_shape); + this->feat_dtype.push_back(f_dtype); + this->feat_id_map[f_name] = i; + VLOG(0) << "init graph table feat conf name:" << f_name + << " shape:" << f_shape 
<< " dtype:" << f_dtype; + } + + shard_num = _config.shard_num(); + VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx" + << _shard_idx; + shard_num_per_table = sparse_local_shard_num(shard_num, server_num); + shard_start = _shard_idx * shard_num_per_table; + shard_end = shard_start + shard_num_per_table; + VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start " + << shard_start << " shard_end " << shard_end; + // shards.resize(shard_num_per_table); + shards = std::vector(shard_num_per_table, GraphShard(shard_num)); + return 0; +} +} +}; diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h new file mode 100644 index 00000000000000..8ddf3c8f904a6c --- /dev/null +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -0,0 +1,147 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/common_table.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { +class GraphShard { + public: + // static int bucket_low_bound; + // static int gcd(int s, int t) { + // if (s % t == 0) return t; + // return gcd(t, s % t); + // } + size_t get_size(); + GraphShard() {} + GraphShard(int shard_num) { + this->shard_num = shard_num; + // bucket_size = init_bucket_size(shard_num); + // bucket.resize(bucket_size); + } + std::vector &get_bucket() { return bucket; } + std::vector get_batch(int start, int end, int step); + // int init_bucket_size(int shard_num) { + // for (int i = bucket_low_bound;; i++) { + // if (gcd(i, shard_num) == 1) return i; + // } + // return -1; + // } + std::vector get_ids_by_range(int start, int end) { + std::vector res; + for (int i = start; i < end && i < bucket.size(); i++) { + res.push_back(bucket[i]->get_id()); + } + return res; + } + GraphNode *add_graph_node(uint64_t id); + FeatureNode *add_feature_node(uint64_t id); + Node *find_node(uint64_t id); + void add_neighboor(uint64_t id, uint64_t dst_id, float weight); + // std::unordered_map::iterator> + std::unordered_map get_node_location() { + return node_location; + } + + private: + std::unordered_map node_location; + int shard_num; + std::vector bucket; +}; +class GraphTable : public SparseTable { + public: + GraphTable() {} + virtual ~GraphTable() {} + virtual int32_t pull_graph_list(int start, int size, + std::unique_ptr &buffer, + int &actual_size, bool need_feature, + int step); + + virtual int32_t random_sample_neighboors( + uint64_t *node_ids, int sample_size, + std::vector> &buffers, + std::vector &actual_sizes); + + int32_t random_sample_nodes(int sample_size, std::unique_ptr 
&buffers, + int &actual_sizes); + + virtual int32_t get_nodes_ids_by_ranges( + std::vector> ranges, std::vector &res); + virtual int32_t initialize(); + + int32_t load(const std::string &path, const std::string ¶m); + + int32_t load_edges(const std::string &path, bool reverse); + + int32_t load_nodes(const std::string &path, std::string node_type); + + Node *find_node(uint64_t id); + + virtual int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) { + return 0; + } + + virtual int32_t push_sparse(const uint64_t *keys, const float *values, + size_t num) { + return 0; + } + + virtual void clear() {} + virtual int32_t flush() { return 0; } + virtual int32_t shrink(const std::string ¶m) { return 0; } + //指定保存路径 + virtual int32_t save(const std::string &path, const std::string &converter) { + return 0; + } + virtual int32_t initialize_shard() { return 0; } + virtual uint32_t get_thread_pool_index(uint64_t node_id); + virtual std::pair parse_feature(std::string feat_str); + + virtual int32_t get_node_feat(const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res); + + protected: + std::vector shards; + size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; + const int task_pool_size_ = 11; + const int random_sample_nodes_ranges = 3; + + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + std::unordered_map feat_id_map; + std::string table_name; + std::string table_type; + + std::vector> _shards_task_pool; +}; +} // namespace distributed +}; // namespace paddle diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index ffedbea14a0290..2e8c257b6aad47 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -254,7 +254,6 @@ int32_t CommonSparseTable::initialize_value() { } auto accessor = _config.accessor(); - std::vector feasigns; for (size_t x = 0; x < accessor.fea_dim(); ++x) { @@ -271,9 +270,14 @@ int32_t CommonSparseTable::initialize_value() { std::vector ids(bucket_feasigns); std::copy(feasigns.begin() + buckets[x], feasigns.begin() + buckets[x + 1], ids.begin()); + + std::vector fres; + fres.resize(ids.size(), 1); + + auto pull_value = PullSparseValue(ids, fres, param_dim_); std::vector pulls; pulls.resize(bucket_feasigns * param_dim_); - pull_sparse(pulls.data(), ids.data(), bucket_feasigns); + pull_sparse(pulls.data(), pull_value); } return 0; @@ -399,10 +403,51 @@ int32_t CommonSparseTable::pour() { return 0; } -int32_t CommonSparseTable::pull_sparse(float* pull_values, const uint64_t* keys, - size_t num) { +int32_t CommonSparseTable::pull_sparse(float* pull_values, + const PullSparseValue& pull_value) { rwlock_->RDLock(); + auto shard_num = task_pool_size_; + std::vector> tasks(shard_num); + + for (int shard_id = 0; shard_id < shard_num; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, shard_num, &pull_value, &pull_values]() -> int { + auto& block = shard_values_[shard_id]; + + std::vector offsets; + pull_value.Fission(shard_id, shard_num, &offsets); + + if (pull_value.is_training_) { + for (auto& offset : offsets) { + auto feasign = pull_value.feasigns_[offset]; + auto frequencie = pull_value.frequencies_[offset]; + auto* value = block->Init(feasign, true, frequencie); + std::copy_n(value + param_offset_, param_dim_, + pull_values + param_dim_ * offset); + } + } else { + for (auto& offset : offsets) { + auto feasign = 
pull_value.feasigns_[offset]; + auto* value = block->Init(feasign, false); + std::copy_n(value + param_offset_, param_dim_, + pull_values + param_dim_ * offset); + } + } + + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + rwlock_->UNLock(); + return 0; +} + +int32_t CommonSparseTable::pull_sparse_ptr(char** pull_values, + const uint64_t* keys, size_t num) { std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -422,9 +467,10 @@ int32_t CommonSparseTable::pull_sparse(float* pull_values, const uint64_t* keys, for (int i = 0; i < offsets.size(); ++i) { auto offset = offsets[i]; auto id = keys[offset]; - auto* value = block->Init(id); - std::copy_n(value + param_offset_, param_dim_, - pull_values + param_dim_ * offset); + auto* value = block->InitGet(id); + // std::copy_n(value + param_offset_, param_dim_, + // pull_values + param_dim_ * offset); + pull_values[offset] = (char*)value; } return 0; @@ -434,7 +480,6 @@ int32_t CommonSparseTable::pull_sparse(float* pull_values, const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } @@ -494,6 +539,45 @@ int32_t CommonSparseTable::push_sparse(const uint64_t* keys, return 0; } +int32_t CommonSparseTable::push_sparse(const uint64_t* keys, + const float** values, size_t num) { + _push_sparse(keys, values, num); + return 0; +} + +int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, + const float** values, size_t num) { + rwlock_->RDLock(); + std::vector> offset_bucket; + offset_bucket.resize(task_pool_size_); + + for (int x = 0; x < num; ++x) { + auto y = keys[x] % task_pool_size_; + offset_bucket[y].push_back(x); + } + + std::vector> tasks(task_pool_size_); + + for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, &keys, &values, num, &offset_bucket]() -> int { + auto& offsets = offset_bucket[shard_id]; + for (size_t i = 0; i < offsets.size(); ++i) { + std::vector tmp_off = {0}; + optimizer_->update(keys + offsets[i], values[offsets[i]], num, + tmp_off, shard_values_[shard_id].get()); + } + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + rwlock_->UNLock(); + return 0; +} + int32_t CommonSparseTable::push_sparse_param(const uint64_t* keys, const float* values, size_t num) { rwlock_->RDLock(); diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/table/common_sparse_table.h index 98cbf2b4a21057..50c295da53464c 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.h +++ b/paddle/fluid/distributed/table/common_sparse_table.h @@ -61,12 +61,17 @@ class CommonSparseTable : public SparseTable { int32_t save(const std::string& path, const std::string& param); virtual std::pair print_table_stat(); - virtual int32_t pull_sparse(float* pull_values, const uint64_t* keys, - size_t num); + virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); + + virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys, + size_t num); virtual int32_t push_sparse(const uint64_t* keys, const float* values, size_t num); + virtual int32_t push_sparse(const uint64_t* keys, const float** values, + size_t num); + // only for sparse geo table virtual int32_t push_sparse_param(const uint64_t* keys, const float* values, size_t num); @@ -81,6 +86,8 @@ class 
CommonSparseTable : public SparseTable { protected: virtual int32_t _push_sparse(const uint64_t* keys, const float* values, size_t num); + virtual int32_t _push_sparse(const uint64_t* keys, const float** values, + size_t num); private: const int task_pool_size_ = 11; diff --git a/paddle/fluid/distributed/table/common_table.h b/paddle/fluid/distributed/table/common_table.h index dc3cfa75ff6898..bc7f17f5f24579 100644 --- a/paddle/fluid/distributed/table/common_table.h +++ b/paddle/fluid/distributed/table/common_table.h @@ -98,8 +98,8 @@ class DenseTable : public Table { virtual ~DenseTable() {} virtual void *get_shard(size_t shard_idx) { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -123,8 +123,8 @@ class BarrierTable : public Table { int32_t push_dense(const float *values, size_t num) override { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, diff --git a/paddle/fluid/distributed/table/depends/dense.h b/paddle/fluid/distributed/table/depends/dense.h index a2acdfd20148ac..8079003d1bf8f6 100644 --- a/paddle/fluid/distributed/table/depends/dense.h +++ b/paddle/fluid/distributed/table/depends/dense.h @@ -89,7 +89,6 @@ class DSGD : public DenseOptimizer { auto blas = GetBlas(); float lr = *(global_learning_rate_) * (*learning_rate); - VLOG(4) << "DSGD LearningRate: " << lr; blas.VCOPY(update_numel, update_values + begin, grads.data()); blas.SCAL(update_numel, lr, grads.data()); blas.VSUB(update_numel, param + begin, grads.data(), param + begin); @@ -157,7 +156,6 @@ class DAdam : public DenseOptimizer { beta2_pow[0] = beta2_pow[0] * beta2; float lr_ = *(global_learning_rate_)*learning_rate[0]; - VLOG(4) << "DAdam LearningRate: " << lr_; lr_ *= sqrt(1 - beta2_pow[0]) / (1 - beta1_pow[0]); float* tmp_ = tmp.data(); diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index ba79a381a6d881..68d252661edd53 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -87,7 +87,7 @@ class ValueBlock { value_dims_(value_dims), value_offsets_(value_offsets), value_idx_(value_idx) { - for (int x = 0; x < value_dims.size(); ++x) { + for (size_t x = 0; x < value_dims.size(); ++x) { value_length_ += value_dims[x]; } @@ -96,13 +96,15 @@ class ValueBlock { auto slices = string::split_string(entry_attr, ":"); if (slices[0] == "none") { entry_func_ = std::bind(&count_entry, std::placeholders::_1, 0); + threshold_ = 0; } else if (slices[0] == "count_filter_entry") { - int threshold = std::stoi(slices[1]); - entry_func_ = std::bind(&count_entry, std::placeholders::_1, threshold); + threshold_ = std::stoi(slices[1]); + entry_func_ = + std::bind(&count_entry, std::placeholders::_1, threshold_); } else if (slices[0] == "probability_entry") { - float threshold = std::stof(slices[1]); + threshold_ = std::stof(slices[1]); entry_func_ = - std::bind(&probility_entry, std::placeholders::_1, threshold); + std::bind(&probility_entry, std::placeholders::_1, threshold_); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Not supported Entry Type : %s, Only support 
[CountFilterEntry, " @@ -155,7 +157,8 @@ class ValueBlock { } // pull - float *Init(const uint64_t &id, const bool with_update = true) { + float *Init(const uint64_t &id, const bool with_update = true, + const int counter = 1) { if (!Has(id)) { values_[id] = std::make_shared(value_length_); } @@ -163,22 +166,37 @@ class ValueBlock { auto &value = values_.at(id); if (with_update) { - AttrUpdate(value); + AttrUpdate(value, counter); } return value->data_.data(); } - void AttrUpdate(std::shared_ptr value) { + VALUE *InitGet(const uint64_t &id, const bool with_update = true, + const int counter = 1) { + if (!Has(id)) { + values_[id] = std::make_shared(value_length_); + } + + auto &value = values_.at(id); + + if (with_update) { + AttrUpdate(value, counter); + } + + return value.get(); + } + + void AttrUpdate(std::shared_ptr value, const int counter) { // update state value->unseen_days_ = 0; - ++value->count_; + value->count_ += counter; if (!value->is_entry_) { value->is_entry_ = entry_func_(value); if (value->is_entry_) { // initialize - for (int x = 0; x < value_names_.size(); ++x) { + for (size_t x = 0; x < value_names_.size(); ++x) { initializers_[x]->GetValue(value->data_.data() + value_offsets_[x], value_dims_[x]); } @@ -223,6 +241,8 @@ class ValueBlock { return; } + float GetThreshold() { return threshold_; } + private: bool Has(const uint64_t id) { auto got = values_.find(id); @@ -245,6 +265,7 @@ class ValueBlock { std::function)> entry_func_; std::vector> initializers_; + float threshold_; }; } // namespace distributed diff --git a/paddle/fluid/distributed/table/depends/sparse.h b/paddle/fluid/distributed/table/depends/sparse.h index 672d6e7d396874..0e1d7ef03c129c 100644 --- a/paddle/fluid/distributed/table/depends/sparse.h +++ b/paddle/fluid/distributed/table/depends/sparse.h @@ -110,7 +110,6 @@ class SSGD : public SparseOptimizer { auto* value = block->Get(id); float learning_rate = *(global_learning_rate_) * (value + lr_offset)[0]; - VLOG(4) << "SSGD LearningRate: " << learning_rate; float* param = value + param_offset; std::vector grads; @@ -166,7 +165,6 @@ class SAdam : public SparseOptimizer { if (!block->GetEntry(id)) continue; auto* values = block->Get(id); float lr_ = *(global_learning_rate_) * (values + lr_offset)[0]; - VLOG(4) << "SAdam LearningRate: " << lr_; float* param = values + param_offset; float* moment1 = values + m1_offset; float* moment2 = values + m2_offset; diff --git a/paddle/fluid/distributed/table/depends/sparse_utils.h b/paddle/fluid/distributed/table/depends/sparse_utils.h new file mode 100644 index 00000000000000..c185dd17d792e4 --- /dev/null +++ b/paddle/fluid/distributed/table/depends/sparse_utils.h @@ -0,0 +1,74 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
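Note: the reworked CommonSparseTable::pull_sparse earlier in this diff fans each request out over the shard thread pool and asks PullSparseValue::Fission (declared in the new sparse_utils.h below) for the offsets that each shard owns, so that the per-shard lambdas can fill pull_values without overlapping writes. The following standalone sketch is illustrative only and is not part of the patch; the names Fission, feasigns and the key % shard_num rule are taken from the diff, everything else is assumed for the example.

// Hedged sketch: the feasign -> shard bucketing that PullSparseValue::Fission
// performs in this patch, re-implemented as a free function for illustration.
#include <cstdint>
#include <iostream>
#include <vector>

// Collect the positions (offsets) of the keys that belong to one shard,
// mirroring pull_value.Fission(shard_id, shard_num, &offsets) above.
std::vector<int> FissionSketch(const std::vector<uint64_t>& feasigns,
                               int shard_id, int shard_num) {
  std::vector<int> offsets;
  for (int x = 0; x < static_cast<int>(feasigns.size()); ++x) {
    if (feasigns[x] % static_cast<uint64_t>(shard_num) ==
        static_cast<uint64_t>(shard_id)) {
      offsets.push_back(x);
    }
  }
  return offsets;
}

int main() {
  std::vector<uint64_t> keys = {0, 1, 2, 3, 4, 14, 25};
  const int shard_num = 11;  // matches task_pool_size_ in the diff
  // Each shard worker only touches its own offsets, so one pull_values
  // buffer can be filled concurrently without races.
  for (int shard = 0; shard < shard_num; ++shard) {
    auto offsets = FissionSketch(keys, shard, shard_num);
    if (offsets.empty()) continue;
    std::cout << "shard " << shard << " owns offsets:";
    for (int off : offsets) std::cout << ' ' << off;
    std::cout << '\n';
  }
  return 0;
}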
+ +#pragma once + +#include +#include +#include +#include +#include + +namespace paddle { +namespace distributed { + +struct PullSparseValue { + explicit PullSparseValue(int numel, int dim) + : numel_(numel), + dim_(dim), + is_training_(true), + feasigns_(nullptr), + frequencies_(nullptr) {} + + explicit PullSparseValue(std::vector feasigns, + std::vector frequencies, int dim) { + numel_ = feasigns.size(); + dim_ = dim; + is_training_ = true; + feasigns_ = feasigns.data(); + frequencies_ = frequencies.data(); + } + + void DeserializeFromBytes(void* bytes) { + /* + |---isTraining--------------| + |---8*{num}B(keysData)------| + |---4*{num}B(Frequencies)---| + */ + auto* begin = reinterpret_cast(bytes); + is_training_ = reinterpret_cast(begin)[0]; + feasigns_ = reinterpret_cast(begin + sizeof(bool)); + frequencies_ = reinterpret_cast(begin + sizeof(bool) + + sizeof(uint64_t) * numel_); + } + + void Fission(const int shard_id, const int shard_num, + std::vector* offset_shard) const { + offset_shard->reserve(numel_ / shard_num + 1); + for (int x = 0; x < numel_; ++x) { + if (feasigns_[x] % shard_num == shard_id) { + offset_shard->push_back(x); + } + } + } + + int numel_; + int dim_; + bool is_training_; + uint64_t* feasigns_; + uint32_t* frequencies_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc b/paddle/fluid/distributed/table/graph/graph_edge.cc similarity index 59% rename from paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc rename to paddle/fluid/distributed/table/graph/graph_edge.cc index 3f3b6b959e3019..0ab0d5a76d6715 100644 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc +++ b/paddle/fluid/distributed/table/graph/graph_edge.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,16 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" - +#include "paddle/fluid/distributed/table/graph/graph_edge.h" +#include namespace paddle { -namespace operators { namespace distributed { -std::once_flag AsyncSparseParamUpdateRecorder::init_flag_; -std::unique_ptr - AsyncSparseParamUpdateRecorder::recorder_(nullptr); +void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); +} -} // namespace distributed -} // namespace operators -} // namespace paddle +void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); + weight_arr.push_back(weight); +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_edge.h b/paddle/fluid/distributed/table/graph/graph_edge.h new file mode 100644 index 00000000000000..3dfe5a6f357a7c --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_edge.h @@ -0,0 +1,46 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +namespace paddle { +namespace distributed { + +class GraphEdgeBlob { + public: + GraphEdgeBlob() {} + virtual ~GraphEdgeBlob() {} + size_t size() { return id_arr.size(); } + virtual void add_edge(uint64_t id, float weight); + uint64_t get_id(int idx) { return id_arr[idx]; } + virtual float get_weight(int idx) { return 1; } + + protected: + std::vector id_arr; +}; + +class WeightedGraphEdgeBlob : public GraphEdgeBlob { + public: + WeightedGraphEdgeBlob() {} + virtual ~WeightedGraphEdgeBlob() {} + virtual void add_edge(uint64_t id, float weight); + virtual float get_weight(int idx) { return weight_arr[idx]; } + + protected: + std::vector weight_arr; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_node.cc b/paddle/fluid/distributed/table/graph/graph_node.cc new file mode 100644 index 00000000000000..816d31b979072c --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_node.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
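Note: GraphEdgeBlob above ignores the weight argument and reports a constant weight of 1, while WeightedGraphEdgeBlob keeps a parallel weight array; GraphNode::build_edges later in this diff picks one of the two depending on is_weighted. The self-contained sketch below shows that split in miniature; it is illustrative only and not the patch code.

// Hedged sketch: trimmed-down analogue of GraphEdgeBlob / WeightedGraphEdgeBlob,
// showing why get_weight() on the unweighted blob is always 1.
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

class EdgeBlobSketch {
 public:
  virtual ~EdgeBlobSketch() = default;
  virtual void add_edge(uint64_t id, float /*weight*/) { ids_.push_back(id); }
  virtual float get_weight(int /*idx*/) const { return 1.0f; }
  uint64_t get_id(int idx) const { return ids_[idx]; }
  size_t size() const { return ids_.size(); }

 protected:
  std::vector<uint64_t> ids_;
};

class WeightedEdgeBlobSketch : public EdgeBlobSketch {
 public:
  void add_edge(uint64_t id, float weight) override {
    ids_.push_back(id);
    weights_.push_back(weight);
  }
  float get_weight(int idx) const override { return weights_[idx]; }

 private:
  std::vector<float> weights_;
};

int main() {
  // build_edges(is_weighted) would choose one of the two blobs; samplers then
  // read weights through the common interface.
  std::unique_ptr<EdgeBlobSketch> plain(new EdgeBlobSketch);
  std::unique_ptr<EdgeBlobSketch> weighted(new WeightedEdgeBlobSketch);
  plain->add_edge(45, 0.34f);
  weighted->add_edge(45, 0.34f);
  std::cout << plain->get_weight(0) << " vs " << weighted->get_weight(0) << "\n";
  return 0;
}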
+ +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include +namespace paddle { +namespace distributed { + +GraphNode::~GraphNode() { + if (sampler != nullptr) { + delete sampler; + sampler = nullptr; + } + if (edges != nullptr) { + delete edges; + edges = nullptr; + } +} + +int Node::weight_size = sizeof(float); +int Node::id_size = sizeof(uint64_t); +int Node::int_size = sizeof(int); + +int Node::get_size(bool need_feature) { return id_size + int_size; } + +void Node::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + memcpy(buffer, &feat_num, sizeof(int)); +} + +void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); } + +int FeatureNode::get_size(bool need_feature) { + int size = id_size + int_size; // id, feat_num + if (need_feature) { + size += feature.size() * int_size; + for (const std::string& fea : feature) { + size += fea.size(); + } + } + return size; +} + +void GraphNode::build_edges(bool is_weighted) { + if (edges == nullptr) { + if (is_weighted == true) { + edges = new WeightedGraphEdgeBlob(); + } else { + edges = new GraphEdgeBlob(); + } + } +} +void GraphNode::build_sampler(std::string sample_type) { + if (sample_type == "random") { + sampler = new RandomSampler(); + } else if (sample_type == "weighted") { + sampler = new WeightedSampler(); + } + sampler->build(edges); +} +void FeatureNode::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + int feat_len; + if (need_feature) { + feat_num += feature.size(); + memcpy(buffer, &feat_num, sizeof(int)); + buffer += sizeof(int); + for (int i = 0; i < feat_num; ++i) { + feat_len = feature[i].size(); + memcpy(buffer, &feat_len, sizeof(int)); + buffer += sizeof(int); + memcpy(buffer, feature[i].c_str(), feature[i].size()); + buffer += feature[i].size(); + } + } else { + memcpy(buffer, &feat_num, sizeof(int)); + } +} +void FeatureNode::recover_from_buffer(char* buffer) { + int feat_num, feat_len; + memcpy(&id, buffer, id_size); + buffer += id_size; + + memcpy(&feat_num, buffer, sizeof(int)); + buffer += sizeof(int); + + feature.clear(); + for (int i = 0; i < feat_num; ++i) { + memcpy(&feat_len, buffer, sizeof(int)); + buffer += sizeof(int); + + char str[feat_len + 1]; + memcpy(str, buffer, feat_len); + buffer += feat_len; + str[feat_len] = '\0'; + feature.push_back(std::string(str)); + } +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_node.h b/paddle/fluid/distributed/table/graph/graph_node.h new file mode 100644 index 00000000000000..8ad795ac97b549 --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_node.h @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" +namespace paddle { +namespace distributed { + +class Node { + public: + Node() {} + Node(uint64_t id) : id(id) {} + virtual ~Node() {} + static int id_size, int_size, weight_size; + uint64_t get_id() { return id; } + void set_id(uint64_t id) { this->id = id; } + + virtual void build_edges(bool is_weighted) {} + virtual void build_sampler(std::string sample_type) {} + virtual void add_edge(uint64_t id, float weight) {} + virtual std::vector sample_k(int k) { return std::vector(); } + virtual uint64_t get_neighbor_id(int idx) { return 0; } + virtual float get_neighbor_weight(int idx) { return 1.; } + + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { return std::string(""); } + virtual void set_feature(int idx, std::string str) {} + virtual void set_feature_size(int size) {} + virtual int get_feature_size() { return 0; } + + protected: + uint64_t id; +}; + +class GraphNode : public Node { + public: + GraphNode() : Node(), sampler(nullptr), edges(nullptr) {} + GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {} + virtual ~GraphNode(); + virtual void build_edges(bool is_weighted); + virtual void build_sampler(std::string sample_type); + virtual void add_edge(uint64_t id, float weight) { + edges->add_edge(id, weight); + } + virtual std::vector sample_k(int k) { return sampler->sample_k(k); } + virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } + virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + + protected: + Sampler *sampler; + GraphEdgeBlob *edges; +}; + +class FeatureNode : public Node { + public: + FeatureNode() : Node() {} + FeatureNode(uint64_t id) : Node(id) {} + virtual ~FeatureNode() {} + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { + if (idx < (int)this->feature.size()) { + return this->feature[idx]; + } else { + return std::string(""); + } + } + + virtual void set_feature(int idx, std::string str) { + if (idx >= (int)this->feature.size()) { + this->feature.resize(idx + 1); + } + this->feature[idx] = str; + } + virtual void set_feature_size(int size) { this->feature.resize(size); } + virtual int get_feature_size() { return this->feature.size(); } + + template + static std::string parse_value_to_bytes(std::vector feat_str) { + T v; + size_t Tsize = sizeof(T) * feat_str.size(); + char buffer[Tsize]; + for (size_t i = 0; i < feat_str.size(); i++) { + std::stringstream ss(feat_str[i]); + ss >> v; + std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); + } + return std::string(buffer, Tsize); + } + + template + static std::vector parse_bytes_to_array(std::string feat_str) { + T v; + std::vector out; + size_t start = 0; + const char *buffer = feat_str.data(); + while (start < feat_str.size()) { + std::memcpy((char *)&v, buffer + start, sizeof(T)); + start += sizeof(T); + out.push_back(v); + } + return out; + } + + protected: + std::vector feature; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc new file mode 100644 index 00000000000000..3a680875e3df4a --- /dev/null +++ 
b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" +#include +#include +namespace paddle { +namespace distributed { + +void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; } + +std::vector RandomSampler::sample_k(int k) { + int n = edges->size(); + if (k > n) { + k = n; + } + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + std::vector sample_result; + std::unordered_map replace_map; + while (k--) { + int rand_int = rand() % n; + auto iter = replace_map.find(rand_int); + if (iter == replace_map.end()) { + sample_result.push_back(rand_int); + } else { + sample_result.push_back(iter->second); + } + + iter = replace_map.find(n - 1); + if (iter == replace_map.end()) { + replace_map[rand_int] = n - 1; + } else { + replace_map[rand_int] = iter->second; + } + --n; + } + return sample_result; +} + +WeightedSampler::WeightedSampler() { + left = nullptr; + right = nullptr; + edges = nullptr; +} + +WeightedSampler::~WeightedSampler() { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } +} + +void WeightedSampler::build(GraphEdgeBlob *edges) { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } + return build_one((WeightedGraphEdgeBlob *)edges, 0, edges->size()); +} + +void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start, + int end) { + count = 0; + this->edges = edges; + if (start + 1 == end) { + left = right = nullptr; + idx = start; + count = 1; + weight = edges->get_weight(idx); + + } else { + left = new WeightedSampler(); + right = new WeightedSampler(); + left->build_one(edges, start, start + (end - start) / 2); + right->build_one(edges, start + (end - start) / 2, end); + weight = left->weight + right->weight; + count = left->count + right->count; + } +} +std::vector WeightedSampler::sample_k(int k) { + if (k > count) { + k = count; + } + std::vector sample_result; + float subtract; + std::unordered_map subtract_weight_map; + std::unordered_map subtract_count_map; + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + while (k--) { + float query_weight = rand() % 100000 / 100000.0; + query_weight *= weight - subtract_weight_map[this]; + sample_result.push_back(sample(query_weight, subtract_weight_map, + subtract_count_map, subtract)); + } + return sample_result; +} + +int WeightedSampler::sample( + float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract) { + if (left == nullptr) { + subtract_weight_map[this] = weight; + subtract = weight; + subtract_count_map[this] = 1; + return idx; + } + int left_count = left->count - subtract_count_map[left]; + int 
right_count = right->count - subtract_count_map[right]; + float left_subtract = subtract_weight_map[left]; + int return_idx; + if (right_count == 0 || + left_count > 0 && left->weight - left_subtract >= query_weight) { + return_idx = left->sample(query_weight, subtract_weight_map, + subtract_count_map, subtract); + } else { + return_idx = + right->sample(query_weight - (left->weight - left_subtract), + subtract_weight_map, subtract_count_map, subtract); + } + subtract_weight_map[this] += subtract; + subtract_count_map[this]++; + return return_idx; +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h new file mode 100644 index 00000000000000..1787ab23b04316 --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/distributed/table/graph/graph_edge.h" +namespace paddle { +namespace distributed { + +class Sampler { + public: + virtual ~Sampler() {} + virtual void build(GraphEdgeBlob *edges) = 0; + virtual std::vector sample_k(int k) = 0; +}; + +class RandomSampler : public Sampler { + public: + virtual ~RandomSampler() {} + virtual void build(GraphEdgeBlob *edges); + virtual std::vector sample_k(int k); + GraphEdgeBlob *edges; +}; + +class WeightedSampler : public Sampler { + public: + WeightedSampler(); + virtual ~WeightedSampler(); + WeightedSampler *left, *right; + float weight; + int count; + int idx; + GraphEdgeBlob *edges; + virtual void build(GraphEdgeBlob *edges); + virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end); + virtual std::vector sample_k(int k); + + private: + int sample(float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract); +}; +} +} diff --git a/paddle/fluid/operators/distributed/large_scale_kv.cc b/paddle/fluid/distributed/table/graph_edge.cc similarity index 62% rename from paddle/fluid/operators/distributed/large_scale_kv.cc rename to paddle/fluid/distributed/table/graph_edge.cc index d2673ed6ffb366..cc90f4c6516c18 100644 --- a/paddle/fluid/operators/distributed/large_scale_kv.cc +++ b/paddle/fluid/distributed/table/graph_edge.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,15 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. 
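Note: WeightedSampler::build_one above builds a binary tree over the edge list (each leaf holds one edge, each inner node the sum of its children's weights), and sample_k draws k distinct edges by descending that tree while subtracting the weight and count of edges already drawn. The sketch below shows the core weight-proportional pick on a prefix-sum array, which is the same idea without the without-replacement bookkeeping; it is illustrative only and not the patch code.

// Hedged sketch: weight-proportional selection over an edge list, the idea
// behind WeightedSampler::sample (the real code additionally removes picked
// edges via the subtract_* maps so repeated draws never return the same edge).
#include <algorithm>
#include <iostream>
#include <numeric>
#include <random>
#include <vector>

int WeightedPick(const std::vector<float>& weights, std::mt19937* rng) {
  std::vector<float> prefix(weights.size());
  std::partial_sum(weights.begin(), weights.end(), prefix.begin());
  std::uniform_real_distribution<float> dist(0.0f, prefix.back());
  float q = dist(*rng);
  // First position whose cumulative weight exceeds the query point,
  // analogous to descending left/right in the weight tree.
  int idx = static_cast<int>(
      std::upper_bound(prefix.begin(), prefix.end(), q) - prefix.begin());
  return std::min(idx, static_cast<int>(weights.size()) - 1);  // guard rounding edge
}

int main() {
  std::vector<float> edge_weights = {0.34f, 0.31f, 0.21f};  // e.g. node 37's edges
  std::mt19937 rng(2021);
  std::vector<int> histogram(edge_weights.size(), 0);
  for (int i = 0; i < 10000; ++i) {
    ++histogram[WeightedPick(edge_weights, &rng)];
  }
  for (size_t i = 0; i < histogram.size(); ++i) {
    std::cout << "edge " << i << " picked " << histogram[i] << " times\n";
  }
  return 0;
}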
-#include "paddle/fluid/operators/distributed/large_scale_kv.h" - +#include "paddle/fluid/distributed/table/graph_edge.h" +#include namespace paddle { -namespace operators { namespace distributed { -std::once_flag LargeScaleKV::init_flag_; -std::shared_ptr LargeScaleKV::scale_kv_(nullptr); +void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); +} -} // namespace distributed -} // namespace operators -} // namespace paddle +void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); + weight_arr.push_back(weight); +} +} +} diff --git a/paddle/fluid/distributed/table/graph_edge.h b/paddle/fluid/distributed/table/graph_edge.h new file mode 100644 index 00000000000000..3dfe5a6f357a7c --- /dev/null +++ b/paddle/fluid/distributed/table/graph_edge.h @@ -0,0 +1,46 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +namespace paddle { +namespace distributed { + +class GraphEdgeBlob { + public: + GraphEdgeBlob() {} + virtual ~GraphEdgeBlob() {} + size_t size() { return id_arr.size(); } + virtual void add_edge(uint64_t id, float weight); + uint64_t get_id(int idx) { return id_arr[idx]; } + virtual float get_weight(int idx) { return 1; } + + protected: + std::vector id_arr; +}; + +class WeightedGraphEdgeBlob : public GraphEdgeBlob { + public: + WeightedGraphEdgeBlob() {} + virtual ~WeightedGraphEdgeBlob() {} + virtual void add_edge(uint64_t id, float weight); + virtual float get_weight(int idx) { return weight_arr[idx]; } + + protected: + std::vector weight_arr; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph_node.cc b/paddle/fluid/distributed/table/graph_node.cc new file mode 100644 index 00000000000000..27a2cafaf4f0fe --- /dev/null +++ b/paddle/fluid/distributed/table/graph_node.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/distributed/table/graph_node.h" +#include +namespace paddle { +namespace distributed { + +GraphNode::~GraphNode() { + if (sampler != nullptr) { + delete sampler; + sampler = nullptr; + } + if (edges != nullptr) { + delete edges; + edges = nullptr; + } +} + +int Node::weight_size = sizeof(float); +int Node::id_size = sizeof(uint64_t); +int Node::int_size = sizeof(int); + +int Node::get_size(bool need_feature) { return id_size + int_size; } + +void Node::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + memcpy(buffer, &feat_num, sizeof(int)); +} + +void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); } + +int FeatureNode::get_size(bool need_feature) { + int size = id_size + int_size; // id, feat_num + if (need_feature) { + size += feature.size() * int_size; + for (const std::string& fea : feature) { + size += fea.size(); + } + } + return size; +} + +void GraphNode::build_edges(bool is_weighted) { + if (edges == nullptr) { + if (is_weighted == true) { + edges = new WeightedGraphEdgeBlob(); + } else { + edges = new GraphEdgeBlob(); + } + } +} +void GraphNode::build_sampler(std::string sample_type) { + if (sample_type == "random") { + sampler = new RandomSampler(); + } else if (sample_type == "weighted") { + sampler = new WeightedSampler(); + } + sampler->build(edges); +} +void FeatureNode::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + int feat_len; + if (need_feature) { + feat_num += feature.size(); + memcpy(buffer, &feat_num, sizeof(int)); + buffer += sizeof(int); + for (int i = 0; i < feat_num; ++i) { + feat_len = feature[i].size(); + memcpy(buffer, &feat_len, sizeof(int)); + buffer += sizeof(int); + memcpy(buffer, feature[i].c_str(), feature[i].size()); + buffer += feature[i].size(); + } + } else { + memcpy(buffer, &feat_num, sizeof(int)); + } +} +void FeatureNode::recover_from_buffer(char* buffer) { + int feat_num, feat_len; + memcpy(&id, buffer, id_size); + buffer += id_size; + + memcpy(&feat_num, buffer, sizeof(int)); + buffer += sizeof(int); + + feature.clear(); + for (int i = 0; i < feat_num; ++i) { + memcpy(&feat_len, buffer, sizeof(int)); + buffer += sizeof(int); + + char str[feat_len + 1]; + memcpy(str, buffer, feat_len); + buffer += feat_len; + str[feat_len] = '\0'; + feature.push_back(std::string(str)); + } +} +} +} diff --git a/paddle/fluid/distributed/table/graph_node.h b/paddle/fluid/distributed/table/graph_node.h new file mode 100644 index 00000000000000..c3e8e3ce5b50d0 --- /dev/null +++ b/paddle/fluid/distributed/table/graph_node.h @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" +namespace paddle { +namespace distributed { + +class Node { + public: + Node() {} + Node(uint64_t id) : id(id) {} + virtual ~Node() {} + static int id_size, int_size, weight_size; + uint64_t get_id() { return id; } + void set_id(uint64_t id) { this->id = id; } + + virtual void build_edges(bool is_weighted) {} + virtual void build_sampler(std::string sample_type) {} + virtual void add_edge(uint64_t id, float weight) {} + virtual std::vector sample_k(int k) { return std::vector(); } + virtual uint64_t get_neighbor_id(int idx) { return 0; } + virtual float get_neighbor_weight(int idx) { return 1.; } + + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { return std::string(""); } + virtual void set_feature(int idx, std::string str) {} + virtual void set_feature_size(int size) {} + virtual int get_feature_size() { return 0; } + + protected: + uint64_t id; +}; + +class GraphNode : public Node { + public: + GraphNode() : Node(), sampler(nullptr), edges(nullptr) {} + GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {} + virtual ~GraphNode(); + virtual void build_edges(bool is_weighted); + virtual void build_sampler(std::string sample_type); + virtual void add_edge(uint64_t id, float weight) { + edges->add_edge(id, weight); + } + virtual std::vector sample_k(int k) { return sampler->sample_k(k); } + virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } + virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + + protected: + Sampler *sampler; + GraphEdgeBlob *edges; +}; + +class FeatureNode : public Node { + public: + FeatureNode() : Node() {} + FeatureNode(uint64_t id) : Node(id) {} + virtual ~FeatureNode() {} + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { + if (idx < (int)this->feature.size()) { + return this->feature[idx]; + } else { + return std::string(""); + } + } + + virtual void set_feature(int idx, std::string str) { + if (idx >= (int)this->feature.size()) { + this->feature.resize(idx + 1); + } + this->feature[idx] = str; + } + virtual void set_feature_size(int size) { this->feature.resize(size); } + virtual int get_feature_size() { return this->feature.size(); } + + template + static std::string parse_value_to_bytes(std::vector feat_str) { + T v; + size_t Tsize = sizeof(T) * feat_str.size(); + char buffer[Tsize]; + for (size_t i = 0; i < feat_str.size(); i++) { + std::stringstream ss(feat_str[i]); + ss >> v; + std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); + } + return std::string(buffer, Tsize); + } + + template + static std::vector parse_bytes_to_array(std::string feat_str) { + T v; + std::vector out; + size_t start = 0; + const char *buffer = feat_str.data(); + while (start < feat_str.size()) { + std::memcpy((char *)&v, buffer + start, sizeof(T)); + start += sizeof(T); + out.push_back(v); + } + return out; + } + + protected: + std::vector feature; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph_weighted_sampler.cc new file mode 100644 index 00000000000000..059a1d64bc392d --- /dev/null +++ 
b/paddle/fluid/distributed/table/graph_weighted_sampler.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" +#include +#include +namespace paddle { +namespace distributed { + +void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; } + +std::vector RandomSampler::sample_k(int k) { + int n = edges->size(); + if (k > n) { + k = n; + } + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + std::vector sample_result; + std::unordered_map replace_map; + while (k--) { + int rand_int = rand() % n; + auto iter = replace_map.find(rand_int); + if (iter == replace_map.end()) { + sample_result.push_back(rand_int); + } else { + sample_result.push_back(iter->second); + } + + iter = replace_map.find(n - 1); + if (iter == replace_map.end()) { + replace_map[rand_int] = n - 1; + } else { + replace_map[rand_int] = iter->second; + } + --n; + } + return sample_result; +} + +WeightedSampler::WeightedSampler() { + left = nullptr; + right = nullptr; + edges = nullptr; +} + +WeightedSampler::~WeightedSampler() { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } +} + +void WeightedSampler::build(GraphEdgeBlob *edges) { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } + return build_one((WeightedGraphEdgeBlob *)edges, 0, edges->size()); +} + +void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start, + int end) { + count = 0; + this->edges = edges; + if (start + 1 == end) { + left = right = nullptr; + idx = start; + count = 1; + weight = edges->get_weight(idx); + + } else { + left = new WeightedSampler(); + right = new WeightedSampler(); + left->build_one(edges, start, start + (end - start) / 2); + right->build_one(edges, start + (end - start) / 2, end); + weight = left->weight + right->weight; + count = left->count + right->count; + } +} +std::vector WeightedSampler::sample_k(int k) { + if (k > count) { + k = count; + } + std::vector sample_result; + float subtract; + std::unordered_map subtract_weight_map; + std::unordered_map subtract_count_map; + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + while (k--) { + float query_weight = rand() % 100000 / 100000.0; + query_weight *= weight - subtract_weight_map[this]; + sample_result.push_back(sample(query_weight, subtract_weight_map, + subtract_count_map, subtract)); + } + return sample_result; +} + +int WeightedSampler::sample( + float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract) { + if (left == nullptr) { + subtract_weight_map[this] = weight; + subtract = weight; + subtract_count_map[this] = 1; + return idx; + } + int left_count = left->count - subtract_count_map[left]; + int right_count = 
right->count - subtract_count_map[right]; + float left_subtract = subtract_weight_map[left]; + int return_idx; + if (right_count == 0 || + left_count > 0 && left->weight - left_subtract >= query_weight) { + return_idx = left->sample(query_weight, subtract_weight_map, + subtract_count_map, subtract); + } else { + return_idx = + right->sample(query_weight - (left->weight - left_subtract), + subtract_weight_map, subtract_count_map, subtract); + } + subtract_weight_map[this] += subtract; + subtract_count_map[this]++; + return return_idx; +} +} +} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph_weighted_sampler.h new file mode 100644 index 00000000000000..cfc341d27c6b76 --- /dev/null +++ b/paddle/fluid/distributed/table/graph_weighted_sampler.h @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/distributed/table/graph_edge.h" +namespace paddle { +namespace distributed { + +class Sampler { + public: + virtual ~Sampler() {} + virtual void build(GraphEdgeBlob *edges) = 0; + virtual std::vector sample_k(int k) = 0; +}; + +class RandomSampler : public Sampler { + public: + virtual ~RandomSampler() {} + virtual void build(GraphEdgeBlob *edges); + virtual std::vector sample_k(int k); + GraphEdgeBlob *edges; +}; + +class WeightedSampler : public Sampler { + public: + WeightedSampler(); + virtual ~WeightedSampler(); + WeightedSampler *left, *right; + float weight; + int count; + int idx; + GraphEdgeBlob *edges; + virtual void build(GraphEdgeBlob *edges); + virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end); + virtual std::vector sample_k(int k); + + private: + int sample(float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract); +}; +} +} diff --git a/paddle/fluid/distributed/table/sparse_geo_table.cc b/paddle/fluid/distributed/table/sparse_geo_table.cc index 9b276e7de5c92d..04cd1136382a4e 100644 --- a/paddle/fluid/distributed/table/sparse_geo_table.cc +++ b/paddle/fluid/distributed/table/sparse_geo_table.cc @@ -22,8 +22,17 @@ int32_t SparseGeoTable::pull_geo_param(const uint32_t trainer_id, std::vector* ids) { geo_recorder->GetAndClear(trainer_id, ids); auto dim = _config.common().dims()[0]; + + std::vector frequencies; + frequencies.resize(ids->size(), 1); + + auto pull_value = PullSparseValue(ids->size(), dim); + pull_value.is_training_ = true; + pull_value.feasigns_ = ids->data(); + pull_value.frequencies_ = frequencies.data(); + values->resize(ids->size() * dim); - CommonSparseTable::pull_sparse(values->data(), ids->data(), ids->size()); + CommonSparseTable::pull_sparse(values->data(), pull_value); return 0; } diff --git a/paddle/fluid/distributed/table/table.cc b/paddle/fluid/distributed/table/table.cc index dfaaa6ffc12c2b..600be954cb5966 100644 --- a/paddle/fluid/distributed/table/table.cc +++ 
b/paddle/fluid/distributed/table/table.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/table/common_dense_table.h" +#include "paddle/fluid/distributed/table/common_graph_table.h" #include "paddle/fluid/distributed/table/common_sparse_table.h" #include "paddle/fluid/distributed/table/sparse_geo_table.h" #include "paddle/fluid/distributed/table/tensor_accessor.h" @@ -25,7 +26,7 @@ namespace paddle { namespace distributed { - +REGISTER_PSCORE_CLASS(Table, GraphTable); REGISTER_PSCORE_CLASS(Table, CommonDenseTable); REGISTER_PSCORE_CLASS(Table, CommonSparseTable); REGISTER_PSCORE_CLASS(Table, SparseGeoTable); @@ -75,5 +76,6 @@ int32_t Table::initialize_accessor() { _value_accesor.reset(accessor); return 0; } + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h index 65c99d2bbd40d4..81a1ff5eced2bb 100644 --- a/paddle/fluid/distributed/table/table.h +++ b/paddle/fluid/distributed/table/table.h @@ -21,6 +21,8 @@ #include #include #include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" @@ -46,10 +48,17 @@ class Table { return 0; } - virtual int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) = 0; + virtual int32_t pull_sparse_ptr(char **pull_values, const uint64_t *keys, + size_t num) { + VLOG(0) << "NOT IMPLEMENT"; + return 0; + } + virtual int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) = 0; virtual int32_t push_sparse(const uint64_t *keys, const float *values, size_t num) = 0; + virtual int32_t push_sparse(const uint64_t *keys, const float **values, + size_t num){}; virtual int32_t push_sparse_param(const uint64_t *keys, const float *values, size_t num) { return 0; @@ -141,5 +150,6 @@ class TableManager { TableManager() {} ~TableManager() {} }; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/table/tensor_table.h b/paddle/fluid/distributed/table/tensor_table.h index 1a8f1a9cd9adb8..080682d131420b 100644 --- a/paddle/fluid/distributed/table/tensor_table.h +++ b/paddle/fluid/distributed/table/tensor_table.h @@ -52,8 +52,8 @@ class TensorTable : public Table { int32_t push_dense(const float *values, size_t num) override { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -102,8 +102,8 @@ class DenseTensorTable : public TensorTable { DenseTensorTable() {} virtual ~DenseTensorTable() {} - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -158,8 +158,8 @@ class GlobalStepTable : public DenseTensorTable { GlobalStepTable() {} virtual ~GlobalStepTable() {} - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, diff --git 
a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index adedd049023daa..b756c740ac764c 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -15,3 +15,6 @@ cc_test(brpc_service_sparse_sgd_test SRCS brpc_service_sparse_sgd_test.cc DEPS s set_source_files_properties(brpc_utils_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_function ${COMMON_DEPS} ${RPC_DEPS}) + +set_source_files_properties(graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc index fbd236012f5237..8fb3434af6e281 100644 --- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc @@ -212,8 +212,8 @@ void RunBrpcPushSparse() { /*-----------------------Test Server Init----------------------------------*/ LOG(INFO) << "Run pull_sparse_param"; - auto pull_status = worker_ptr_->pull_sparse(fea_value_ptr.data(), 0, - fea_keys.data(), fea_keys.size()); + auto pull_status = worker_ptr_->pull_sparse( + fea_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { fea_values.data()[idx] *= 2.0; @@ -241,7 +241,7 @@ void RunBrpcPushSparse() { push_status.wait(); auto pull_param_status = worker_ptr_->pull_sparse( - fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size()); + fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_param_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { @@ -275,7 +275,7 @@ void RunBrpcPushSparse() { push_grad_status.wait(); auto pull_update_status = worker_ptr_->pull_sparse( - fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size()); + fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_update_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { diff --git a/paddle/fluid/distributed/test/geo_table_test.cc b/paddle/fluid/distributed/test/geo_table_test.cc index 22e11acf6584ee..c9f15db3f788e1 100644 --- a/paddle/fluid/distributed/test/geo_table_test.cc +++ b/paddle/fluid/distributed/test/geo_table_test.cc @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/table/common_dense_table.h" #include "paddle/fluid/distributed/table/common_sparse_table.h" +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/sparse_geo_table.h" #include "paddle/fluid/distributed/table/table.h" @@ -53,14 +54,18 @@ TEST(SparseGeoTable, SSUM) { // test push_sparse_param, and create params std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; std::vector init_values; for (size_t i = 0; i < init_keys.size() * emb_dim; i++) { init_values.push_back(0.0); } table->push_sparse_param(init_keys.data(), init_values.data(), init_keys.size()); + std::vector pull_values(init_values.size()); - table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size()); + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(pull_values.data(), value); + for (size_t i = 0; i < init_keys.size() * emb_dim; i++) { ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-5); } diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc new file mode 100644 index 00000000000000..b268bb449e1461 --- /dev/null +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -0,0 +1,556 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/graph_py_service.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +void testSampleNodes( + std::shared_ptr& worker_ptr_) { + std::vector ids; + auto pull_status = worker_ptr_->random_sample_nodes(0, 0, 6, ids); + std::unordered_set s; + std::unordered_set s1 = {37, 59}; + pull_status.wait(); + for (auto id : ids) s.insert(id); + ASSERT_EQ(true, s.size() == s1.size()); + for (auto id : s) { + ASSERT_EQ(true, s1.find(id) != s1.end()); + } +} + +void testFeatureNodeSerializeInt() { + std::string out = + distributed::FeatureNode::parse_value_to_bytes({"123", "345"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + ASSERT_EQ(out2[0], 123); + ASSERT_EQ(out2[1], 345); +} + +void testFeatureNodeSerializeInt64() { + std::string out = + distributed::FeatureNode::parse_value_to_bytes({"123", "345"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + ASSERT_EQ(out2[0], 123); + ASSERT_EQ(out2[1], 345); +} + +void testFeatureNodeSerializeFloat32() { + std::string out = distributed::FeatureNode::parse_value_to_bytes( + {"123.123", "345.123"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + float eps; + std::cout << "Float " << out2[0] << " " << 123.123 << std::endl; + eps = out2[0] - 123.123; + ASSERT_LE(eps * eps, 1e-5); + eps = out2[1] - 345.123; + ASSERT_LE(eps * eps, 1e-5); +} + +void testFeatureNodeSerializeFloat64() { + std::string out = distributed::FeatureNode::parse_value_to_bytes( + {"123.123", "345.123"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + float eps; + eps = out2[0] - 123.123; + std::cout << "Float64 " << out2[0] << " " << 123.123 << std::endl; + ASSERT_LE(eps * eps, 1e-5); + eps = out2[1] - 345.123; + ASSERT_LE(eps * eps, 1e-5); +} + +void testSingleSampleNeighboor( + std::shared_ptr& worker_ptr_) { + std::vector>> vs; + auto pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 37), 4, vs); + pull_status.wait(); + + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + 
ASSERT_EQ(true, s1.find(g) != s1.end()); + } + VLOG(0) << "test single done"; + s.clear(); + s1.clear(); + vs.clear(); + pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 96), 4, vs); + pull_status.wait(); + s1 = {111, 48, 247}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } +} + +void testBatchSampleNeighboor( + std::shared_ptr& worker_ptr_) { + std::vector>> vs; + std::vector v = {37, 96}; + auto pull_status = worker_ptr_->batch_sample_neighboors(0, v, 4, vs); + pull_status.wait(); + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } + s.clear(); + s1.clear(); + s1 = {111, 48, 247}; + for (auto g : vs[1]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } +} + +void testGraphToBuffer(); +// std::string nodes[] = {std::string("37\taa\t45;0.34\t145;0.31\t112;0.21"), +// std::string("96\tfeature\t48;1.4\t247;0.31\t111;1.21"), +// std::string("59\ttreat\t45;0.34\t145;0.31\t112;0.21"), +// std::string("97\tfood\t48;1.4\t247;0.31\t111;1.21")}; + +std::string edges[] = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +char edge_file_name[] = "edges.txt"; + +std::string nodes[] = { + std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + +void prepare_file(char file_name[], bool load_edge) { + std::ofstream ofile; + ofile.open(file_name); + if (load_edge) { + for (auto x : edges) { + ofile << x << std::endl; + } + } else { + for (auto x : nodes) { + ofile << x << std::endl; + } + } + ofile.close(); +} +void GetDownpourSparseTableProto( + ::paddle::distributed::TableParameter* sparse_table_proto) { + sparse_table_proto->set_table_id(0); + sparse_table_proto->set_table_class("GraphTable"); + sparse_table_proto->set_shard_num(127); + sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE); + ::paddle::distributed::TableAccessorParameter* accessor_proto = + sparse_table_proto->mutable_accessor(); + accessor_proto->set_accessor_class("CommMergeAccessor"); +} + +::paddle::distributed::PSParameter GetServerProto() { + // Generate server proto desc + ::paddle::distributed::PSParameter server_fleet_desc; + ::paddle::distributed::ServerParameter* server_proto = + server_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + 
::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(sparse_table_proto); + return server_fleet_desc; +} + +::paddle::distributed::PSParameter GetWorkerProto() { + ::paddle::distributed::PSParameter worker_fleet_desc; + ::paddle::distributed::WorkerParameter* worker_proto = + worker_fleet_desc.mutable_worker_param(); + + ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto = + worker_proto->mutable_downpour_worker_param(); + + ::paddle::distributed::TableParameter* worker_sparse_table_proto = + downpour_worker_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(worker_sparse_table_proto); + + ::paddle::distributed::ServerParameter* server_proto = + worker_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* server_sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(server_sparse_table_proto); + + return worker_fleet_desc; +} + +/*-------------------------------------------------------------------------*/ + +std::string ip_ = "127.0.0.1", ip2 = "127.0.0.1"; +uint32_t port_ = 5209, port2 = 5210; + +std::vector host_sign_list_; + +std::shared_ptr pserver_ptr_, + pserver_ptr2; + +std::shared_ptr worker_ptr_; + +void RunServer() { + LOG(INFO) << "init first server"; + ::paddle::distributed::PSParameter server_proto = GetServerProto(); + + auto _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto)); + std::vector empty_vec; + framework::ProgramDesc empty_prog; + empty_vec.push_back(empty_prog); + pserver_ptr_->configure(server_proto, _ps_env, 0, empty_vec); + LOG(INFO) << "first server, run start(ip,port)"; + pserver_ptr_->start(ip_, port_); + LOG(INFO) << "init first server Done"; +} + +void RunServer2() { + LOG(INFO) << "init second server"; + ::paddle::distributed::PSParameter server_proto2 = GetServerProto(); + + auto _ps_env2 = paddle::distributed::PaddlePSEnvironment(); + _ps_env2.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr2 = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto2)); + std::vector empty_vec2; + framework::ProgramDesc empty_prog2; + empty_vec2.push_back(empty_prog2); + pserver_ptr2->configure(server_proto2, _ps_env2, 1, empty_vec2); + pserver_ptr2->start(ip2, port2); +} + +void RunClient( + 
std::map>& dense_regions,
+ int index, paddle::distributed::PsBaseService* service) {
+ ::paddle::distributed::PSParameter worker_proto = GetWorkerProto();
+ paddle::distributed::PaddlePSEnvironment _ps_env;
+ auto servers_ = host_sign_list_.size();
+ _ps_env = paddle::distributed::PaddlePSEnvironment();
+ _ps_env.set_ps_servers(&host_sign_list_, servers_);
+ worker_ptr_ = std::shared_ptr(
+ (paddle::distributed::GraphBrpcClient*)
+ paddle::distributed::PSClientFactory::create(worker_proto));
+ worker_ptr_->configure(worker_proto, dense_regions, _ps_env, 0);
+ worker_ptr_->set_shard_num(127);
+ worker_ptr_->set_local_channel(index);
+ worker_ptr_->set_local_graph_service(
+ (paddle::distributed::GraphBrpcService*)service);
+}
+
+void RunBrpcPushSparse() {
+ setenv("http_proxy", "", 1);
+ setenv("https_proxy", "", 1);
+ prepare_file(edge_file_name, 1);
+ prepare_file(node_file_name, 0);
+ auto ph_host = paddle::distributed::PSHost(ip_, port_, 0);
+ host_sign_list_.push_back(ph_host.serialize_to_string());
+
+ // test-start
+ auto ph_host2 = paddle::distributed::PSHost(ip2, port2, 1);
+ host_sign_list_.push_back(ph_host2.serialize_to_string());
+ // test-end
+ // Start Server
+ std::thread* server_thread = new std::thread(RunServer);
+ std::thread* server_thread2 = new std::thread(RunServer2);
+ sleep(1);
+
+ std::map> dense_regions;
+ dense_regions.insert(
+ std::pair>(0, {}));
+ auto regions = dense_regions[0];
+
+ RunClient(dense_regions, 0, pserver_ptr_->get_service());
+
+ /*-----------------------Test Server Init----------------------------------*/
+ auto pull_status =
+ worker_ptr_->load(0, std::string(edge_file_name), std::string("e>"));
+ srand(time(0));
+ pull_status.wait();
+ std::vector>> vs;
+ testSampleNodes(worker_ptr_);
+ sleep(5);
+ testSingleSampleNeighboor(worker_ptr_);
+ testBatchSampleNeighboor(worker_ptr_);
+ pull_status = worker_ptr_->batch_sample_neighboors(
+ 0, std::vector(1, 10240001024), 4, vs);
+ pull_status.wait();
+ ASSERT_EQ(0, vs[0].size());
+
+ std::vector nodes;
+ pull_status = worker_ptr_->pull_graph_list(0, 0, 0, 1, 1, nodes);
+ pull_status.wait();
+ ASSERT_EQ(nodes.size(), 1);
+ ASSERT_EQ(nodes[0].get_id(), 37);
+ nodes.clear();
+ pull_status = worker_ptr_->pull_graph_list(0, 0, 1, 4, 1, nodes);
+ pull_status.wait();
+ ASSERT_EQ(nodes.size(), 1);
+ ASSERT_EQ(nodes[0].get_id(), 59);
+ for (auto g : nodes) {
+ std::cout << g.get_id() << std::endl;
+ }
+ distributed::GraphPyServer server1, server2;
+ distributed::GraphPyClient client1, client2;
+ std::string ips_str = "127.0.0.1:5211;127.0.0.1:5212";
+ std::vector edge_types = {std::string("user2item")};
+ std::vector node_types = {std::string("user"),
+ std::string("item")};
+ VLOG(0) << "make 2 servers";
+ server1.set_up(ips_str, 127, node_types, edge_types, 0);
+ server2.set_up(ips_str, 127, node_types, edge_types, 1);
+
+ server1.add_table_feat_conf("user", "a", "float32", 1);
+ server1.add_table_feat_conf("user", "b", "int32", 2);
+ server1.add_table_feat_conf("user", "c", "string", 1);
+ server1.add_table_feat_conf("user", "d", "string", 1);
+ server1.add_table_feat_conf("item", "a", "float32", 1);
+
+ server2.add_table_feat_conf("user", "a", "float32", 1);
+ server2.add_table_feat_conf("user", "b", "int32", 2);
+ server2.add_table_feat_conf("user", "c", "string", 1);
+ server2.add_table_feat_conf("user", "d", "string", 1);
+ server2.add_table_feat_conf("item", "a", "float32", 1);
+
+ client1.set_up(ips_str, 127, node_types, edge_types, 0);
+
+ client1.add_table_feat_conf("user", "a", "float32", 1);
+ client1.add_table_feat_conf("user", "b", "int32", 2);
+ client1.add_table_feat_conf("user", "c", "string", 1);
+ client1.add_table_feat_conf("user", "d", "string", 1);
+ client1.add_table_feat_conf("item", "a", "float32", 1);
+
+ client2.set_up(ips_str, 127, node_types, edge_types, 1);
+
+ client2.add_table_feat_conf("user", "a", "float32", 1);
+ client2.add_table_feat_conf("user", "b", "int32", 2);
+ client2.add_table_feat_conf("user", "c", "string", 1);
+ client2.add_table_feat_conf("user", "d", "string", 1);
+ client2.add_table_feat_conf("item", "a", "float32", 1);
+
+ server1.start_server(false);
+ std::cout << "first server done" << std::endl;
+ server2.start_server(false);
+ std::cout << "second server done" << std::endl;
+ client1.start_client();
+ std::cout << "first client done" << std::endl;
+ client2.start_client();
+ std::cout << "second client done" << std::endl;
+ std::cout << "started" << std::endl;
+ VLOG(0) << "come to set local server";
+ client1.bind_local_server(0, server1);
+ VLOG(0) << "first bound";
+ client2.bind_local_server(1, server2);
+ VLOG(0) << "second bound";
+ client1.load_node_file(std::string("user"), std::string(node_file_name));
+ client1.load_node_file(std::string("item"), std::string(node_file_name));
+ client1.load_edge_file(std::string("user2item"), std::string(edge_file_name),
+ 0);
+ nodes.clear();
+
+ nodes = client1.pull_graph_list(std::string("user"), 0, 1, 4, 1);
+
+ ASSERT_EQ(nodes[0].get_id(), 59);
+ nodes.clear();
+
+ // Test Pull by step
+
+ std::unordered_set count_item_nodes;
+ // pull by step 1, 2, 3
+ for (int test_step = 1; test_step < 4; test_step++) {
+ count_item_nodes.clear();
+ std::cout << "check pull graph list by step " << test_step << std::endl;
+ for (int server_id = 0; server_id < 2; server_id++) {
+ for (int start_step = 0; start_step < test_step; start_step++) {
+ nodes = client1.pull_graph_list(std::string("item"), server_id,
+ start_step, 12, test_step);
+ for (auto g : nodes) {
+ count_item_nodes.insert(g.get_id());
+ }
+ nodes.clear();
+ }
+ }
+ ASSERT_EQ(count_item_nodes.size(), 12);
+ }
+
+ vs = client1.batch_sample_neighboors(std::string("user2item"),
+ std::vector(1, 96), 4);
+ ASSERT_EQ(vs[0].size(), 3);
+ std::vector node_ids;
+ node_ids.push_back(96);
+ node_ids.push_back(37);
+ vs = client1.batch_sample_neighboors(std::string("user2item"), node_ids, 4);
+
+ ASSERT_EQ(vs.size(), 2);
+ std::vector nodes_ids = client2.random_sample_nodes("user", 0, 6);
+ ASSERT_EQ(nodes_ids.size(), 2);
+ ASSERT_EQ(true, (nodes_ids[0] == 59 && nodes_ids[1] == 37) ||
+ (nodes_ids[0] == 37 && nodes_ids[1] == 59));
+
+ // Test get node feat
+ node_ids.clear();
+ node_ids.push_back(37);
+ node_ids.push_back(96);
+ std::vector feature_names;
+ feature_names.push_back(std::string("c"));
+ feature_names.push_back(std::string("d"));
+ auto node_feat =
+ client1.get_node_feat(std::string("user"), node_ids, feature_names);
+ ASSERT_EQ(node_feat.size(), 2);
+ ASSERT_EQ(node_feat[0].size(), 2);
+ VLOG(0) << "get_node_feat: " << node_feat[0][0];
+ VLOG(0) << "get_node_feat: " << node_feat[0][1];
+ VLOG(0) << "get_node_feat: " << node_feat[1][0];
+ VLOG(0) << "get_node_feat: " << node_feat[1][1];
+
+ // Test string
+ node_ids.clear();
+ node_ids.push_back(37);
+ node_ids.push_back(96);
+ // std::vector feature_names;
+ feature_names.clear();
+ feature_names.push_back(std::string("a"));
+ feature_names.push_back(std::string("b"));
+ node_feat =
+ client1.get_node_feat(std::string("user"), node_ids, feature_names);
+ ASSERT_EQ(node_feat.size(), 2);
+
ASSERT_EQ(node_feat[0].size(), 2); + VLOG(0) << "get_node_feat: " << node_feat[0][0].size(); + VLOG(0) << "get_node_feat: " << node_feat[0][1].size(); + VLOG(0) << "get_node_feat: " << node_feat[1][0].size(); + VLOG(0) << "get_node_feat: " << node_feat[1][1].size(); + + std::remove(edge_file_name); + std::remove(node_file_name); + LOG(INFO) << "Run stop_server"; + worker_ptr_->stop_server(); + LOG(INFO) << "Run finalize_worker"; + worker_ptr_->finalize_worker(); + testFeatureNodeSerializeInt(); + testFeatureNodeSerializeInt64(); + testFeatureNodeSerializeFloat32(); + testFeatureNodeSerializeFloat64(); + testGraphToBuffer(); + client1.stop_server(); +} + +void testGraphToBuffer() { + ::paddle::distributed::GraphNode s, s1; + s.set_feature_size(1); + s.set_feature(0, std::string("hhhh")); + s.set_id(65); + int size = s.get_size(true); + char str[size]; + s.to_buffer(str, true); + s1.recover_from_buffer(str); + ASSERT_EQ(s.get_id(), s1.get_id()); + VLOG(0) << s.get_feature(0); + VLOG(0) << s1.get_feature(0); +} + +TEST(RunBrpcPushSparse, Run) { RunBrpcPushSparse(); } diff --git a/paddle/fluid/distributed/test/sparse_table_test.cc b/paddle/fluid/distributed/test/sparse_table_test.cc index 6db95c5fac211b..26bede392d6fad 100644 --- a/paddle/fluid/distributed/test/sparse_table_test.cc +++ b/paddle/fluid/distributed/test/sparse_table_test.cc @@ -55,9 +55,14 @@ TEST(CommonSparseTable, SGD) { // pull parameters for create and check std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; + std::vector init_values; init_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(init_values.data(), init_keys.data(), init_keys.size()); + + std::vector pull_values(init_values.size()); + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(init_values.data(), value); // for check std::vector total_gradients; @@ -100,7 +105,8 @@ TEST(CommonSparseTable, SGD) { std::vector pull_values; pull_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size()); + table->pull_sparse(init_values.data(), value); + for (size_t i = 0; i < init_values.size(); ++i) { auto update_val = init_values[i] - 1.0 * total_gradients[i]; ASSERT_TRUE(abs(update_val - pull_values[i]) < 1e-5); @@ -148,9 +154,13 @@ TEST(CommonSparseTable, Adam) { // pull parameters for create and check std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; + std::vector init_values; init_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(init_values.data(), init_keys.data(), init_keys.size()); + + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(init_values.data(), value); // push gradient std::vector> trainer_keys; diff --git a/paddle/fluid/extension/include/ext_tensor.h b/paddle/fluid/extension/include/ext_tensor.h index 52606b2a7f59e0..fa91490e6cd8af 100644 --- a/paddle/fluid/extension/include/ext_tensor.h +++ b/paddle/fluid/extension/include/ext_tensor.h @@ -113,6 +113,9 @@ class PD_DLL_DECL Tensor { /// \brief Cast datatype from one to another Tensor cast(const DataType& target_type) const; + /// \brief Check Tensor is initialized + bool is_initialized() const; + #ifdef PADDLE_WITH_CUDA /// \bref Get current stream of Tensor cudaStream_t stream() const; diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc index e9705e2101cc3c..8b2f7cc5bf13c9 100644 --- a/paddle/fluid/extension/src/ext_tensor.cc +++ 
b/paddle/fluid/extension/src/ext_tensor.cc @@ -103,15 +103,6 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc, void Tensor::reshape(const std::vector &shape) { GET_CASTED_TENSOR auto new_dim = framework::make_ddim(shape); - if (tensor->numel() != framework::product(new_dim)) { - LOG(WARNING) << "Custom Op: Calling reshape to a new shape which is bigger " - "or smaller" - << "than original shape will not change your tensor's memory " - "Please call" - << "paddle::Tensor::mutable_data() after to reallocate " - "your tensor's size." - << std::endl; - } tensor->Resize(new_dim); } @@ -393,6 +384,15 @@ int64_t Tensor::size() const { return tensor->numel(); } +bool Tensor::is_initialized() const { + GET_CASTED_TENSOR; + if (tensor->IsInitialized()) { + return true; + } else { + return false; + } +} + #ifdef PADDLE_WITH_CUDA cudaStream_t Tensor::stream() const { if (!stream_.IsStreamSet()) { diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 1fa4ce9b573a09..24bed277280839 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -191,13 +191,15 @@ if(WITH_PYTHON) py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. + add_custom_target(fleet_proto_init ALL + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py + ) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) - add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto) + add_dependencies(framework_py_proto framework_py_proto_init trainer_py_proto distributed_strategy_py_proto fleet_proto_init) if (NOT WIN32) add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." @@ -207,8 +209,6 @@ if(WITH_PYTHON) string(REPLACE "/" "\\" fleet_proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/") add_custom_command(TARGET framework_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto/__init__.py COMMAND copy /Y *.py ${proto_dstpath} COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath} COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
@@ -217,6 +217,12 @@ if(WITH_PYTHON) endif(NOT WIN32) endif() +if (WITH_PSCORE) + add_custom_target(index_dataset_proto_init ALL DEPENDS fleet_proto_init index_dataset_py_proto + COMMAND cp ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/index_dataset/index_dataset_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto.") +endif(WITH_PSCORE) + cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) @@ -360,71 +366,30 @@ set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_prot cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) -# Old custom op extension mechanism related, will be removed in 2.1.0 -cc_library(paddle_framework_shared - SHARED SRCS executor.cc operator.cc - ${CMAKE_CURRENT_SOURCE_DIR}/c/c_api.cc - ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc - DEPS ${FLUID_FRAMEWORK_MODULES}) -get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) -set_target_properties(paddle_framework_shared PROPERTIES OUTPUT_NAME paddle_framework) -target_link_libraries(paddle_framework_shared ${os_dependency_modules}) - -if (LINUX) - set(FLUID_FRAMEWORK_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.so - CACHE INTERNAL "Fluid framework lib") -endif() - -if (WIN32) - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR}) - else() - set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}) - endif() - set(FLUID_FRAMEWORK_IMPORT_LIB - ${paddle_framework_lib_path}/paddle_framework.lib - CACHE INTERNAL "Fluid framework lib") - set(FLUID_FRAMEWORK_SHARED_LIB - ${paddle_framework_lib_path}/paddle_framework.dll - CACHE INTERNAL "Fluid framework dll") -endif() - -if(APPLE) - set(FLUID_FRAMEWORK_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.dylib - CACHE INTERNAL "Fluid framework lib") -endif() if(WITH_TESTING AND TEST selected_rows_test) set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120) endif() -# New custom op extension mechanism related +##### 2.0 New custom op extension mechanism related ##### # if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_ -set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer) - -set(PADDLE_CUSTOM_OP_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc - ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc) -set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE) +if (WIN32) + set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer) -cc_library(paddle_custom_op_shared - SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS ${PADDLE_CUSTOM_OP_MODULES}) + set(PADDLE_CUSTOM_OP_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc + ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc) + set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE) -get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) -set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op) -target_link_libraries(paddle_custom_op_shared 
${os_dependency_modules}) + cc_library(paddle_custom_op_shared + SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS ${PADDLE_CUSTOM_OP_MODULES}) -if (LINUX) - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_custom_op.so - CACHE INTERNAL "Paddle custom op lib") -endif() + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op) + target_link_libraries(paddle_custom_op_shared ${os_dependency_modules}) -if (WIN32) if("${CMAKE_GENERATOR}" STREQUAL "Ninja") set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}) else() @@ -437,9 +402,3 @@ if (WIN32) ${paddle_custom_op_lib_path}/paddle_custom_op.dll CACHE INTERNAL "Paddle custom op dll") endif() - -if(APPLE) - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/paddle_custom_op.dylib - CACHE INTERNAL "Paddle custom op lib") -endif() diff --git a/paddle/fluid/framework/c/c_api.cc b/paddle/fluid/framework/c/c_api.cc deleted file mode 100644 index 5e73c5cc23afa4..00000000000000 --- a/paddle/fluid/framework/c/c_api.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/c/c_api.h" - -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" - -extern "C" { - -paddle::framework::OpInfoMap &PD_GetOpInfoMap() { - return paddle::framework::OpInfoMap::Instance(); -} - -void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool) { - paddle::platform::DeviceContextPool::SetPool(pool); -} - -std::vector PD_GetGradOpDescStrs( - const paddle::framework::OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block) { - auto &op_info = PD_GetOpInfoMap().Get(op_desc.Type()); - std::vector ret; - if (op_info.grad_op_maker_) { - auto grad_op_descs = - op_info.grad_op_maker_(op_desc, no_grad_set, grad_to_var, grad_block); - size_t op_num = grad_op_descs.size(); - ret.resize(op_num); - for (size_t i = 0; i < op_num; ++i) { - PADDLE_ENFORCE_EQ( - grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true, - paddle::platform::errors::Unavailable( - "Cannot serialize operator desc message.")); - } - } - return ret; -} - -} // end extern "C" diff --git a/paddle/fluid/framework/c/c_api.h b/paddle/fluid/framework/c/c_api.h deleted file mode 100644 index a9ec402f381e43..00000000000000 --- a/paddle/fluid/framework/c/c_api.h +++ /dev/null @@ -1,55 +0,0 @@ -/* copyright (c) 2019 paddlepaddle authors. all rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace framework { -class OpInfoMap; -} // namespace framework -namespace platform { -class DeviceContextPool; -} // namespace platform -} // namespace paddle - -#ifdef __cplusplus -extern "C" { -#endif - -// C-API to get global OpInfo map. -paddle::framework::OpInfoMap &PD_GetOpInfoMap(); - -// C-API to init global DeviceContextPool from outside. -void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool); - -// C-API to serialize the grad op protocol message to a binary string. -std::vector PD_GetGradOpDescStrs( - const paddle::framework::OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block); - -#ifdef __cplusplus -} -#endif diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc index 8d6fd4efd5ae3d..a65dcbd55f9463 100644 --- a/paddle/fluid/framework/custom_tensor_test.cc +++ b/paddle/fluid/framework/custom_tensor_test.cc @@ -220,6 +220,21 @@ void GroupTestDtypeConvert() { paddle::DataType::FLOAT16); } +void TestInitilized() { + paddle::Tensor test_tensor(paddle::PlaceType::kCPU); + CHECK(test_tensor.is_initialized() == false); + test_tensor.reshape({1, 1}); + test_tensor.mutable_data(); + CHECK(test_tensor.is_initialized() == true); + float* tensor_data = test_tensor.data(); + for (int i = 0; i < test_tensor.size(); i++) { + tensor_data[i] = 0.5; + } + for (int i = 0; i < test_tensor.size(); i++) { + CHECK(tensor_data[i] == 0.5); + } +} + TEST(CustomTensor, copyTest) { VLOG(2) << "TestCopy"; GroupTestCopy(); @@ -233,4 +248,6 @@ TEST(CustomTensor, copyTest) { GroupTestCast(); VLOG(2) << "TestDtypeConvert"; GroupTestDtypeConvert(); + VLOG(2) << "TestInitilized"; + TestInitilized(); } diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 34c87b8388975a..5636e3ed1b63f9 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -161,9 +161,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); -#else - LOG(WARNING) << "fusion_group is not enabled for Windows/MacOS now, and " - "only effective when running with CUDA GPU."; #endif AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, "fuse_elewise_add_act_pass"); @@ -265,12 +262,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (FLAGS_use_mkldnn) { AppendPass(pass_name); } else if (!strategy_.mkldnn_enabled_op_types_.empty()) { - LOG(WARNING) - << "mkldnn_enabled_op_types specify the operator type list to " - "use MKLDNN acceleration. 
It is null in default, means " - "that all the operators supported by MKLDNN will be " - "accelerated. And it should not be set when " - "FLAGS_use_mkldnn=false."; + VLOG(1) << "mkldnn_enabled_op_types specify the operator type list to " + "use MKLDNN acceleration. It is null in default, means " + "that all the operators supported by MKLDNN will be " + "accelerated. And it should not be set when " + "FLAGS_use_mkldnn=false."; } #else PADDLE_ENFORCE_NE(FLAGS_use_mkldnn, true, @@ -403,26 +399,26 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, << ", num_trainers:" << num_trainers_; } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_relu_depthwise_conv_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "fusion_group_pass") { pass->Set("use_gpu", new bool((use_device == p::kCUDA))); if (use_device != p::kCUDA) { - LOG(WARNING) << "fusion_group_pass is only supported on GPU, skipped."; + VLOG(1) << "fusion_group_pass is only supported on GPU, skipped."; continue; } } else if (pass->Type() == "fuse_bn_act_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_bn_act_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_bn_act_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "fuse_bn_add_act_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_bn_add_act_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_bn_add_act_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "mkldnn_placement_pass") { diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 05c54a90f7eb02..628b9f0d70f598 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -205,7 +205,7 @@ class DeviceWorker { Scope* root_scope_ = nullptr; Scope* thread_scope_; paddle::platform::Place place_; - int64_t batch_num_; + int64_t batch_num_ = 0; FetchConfig fetch_config_; bool use_cvm_; bool no_cvm_; @@ -562,7 +562,6 @@ class PSGPUWorker : public HogwildWorker { void ResetStat(); protected: - std::shared_ptr fleet_ptr_; void PushGradients(); void DumpParam(); void CopySparseTable(); diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto old mode 100644 new mode 100755 index b36793507f54bf..e6a7d74cc43433 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -29,9 +29,24 @@ message RecomputeConfig { } message ShardingConfig { - optional float fuse_broadcast_MB = 1 [ default = 32.0 ]; - optional bool hybrid_dp = 2 [ default = false ]; - optional int32 sharding_group_size = 3 [ default = 8 ]; + optional string sharding_segment_strategy = 1 + [ default = 'segment_broadcast_MB' ]; + optional float segment_broadcast_MB = 2 [ default = 32.0 ]; + repeated string segment_anchors = 3; + optional int32 sharding_degree = 4 [ default = 8 ]; + optional int32 mp_degree = 5 [ default = 1 ]; + optional int32 dp_degree = 6 [ default = 1 ]; + optional bool hybrid_dp = 7 [ default = false ]; + optional int32 gradient_merge_acc_step = 8 [ default = 1 ]; + optional bool optimize_offload = 9 [ default = false ]; + optional bool pp_allreduce_in_optimize = 10 [ default = false ]; + optional int32 pp_degree = 11 [ default = 1 ]; +} + +message HybridConfig { + 
optional int32 dp_degree = 1 [ default = -1 ]; + optional int32 mp_degree = 2 [ default = 1 ]; + optional int32 pp_degree = 3 [ default = 1 ]; } message AMPConfig { @@ -115,6 +130,7 @@ message AsyncConfig { optional bool launch_barrier = 9 [ default = true ]; optional string heter_worker_device_guard = 10 [ default = 'cpu' ]; optional int32 lr_decay_steps = 11 [ default = 10 ]; + optional int32 use_ps_gpu = 12 [ default = 0 ]; } message PipelineConfig { @@ -152,6 +168,7 @@ message DistributedStrategy { optional bool fp16_allreduce = 25 [ default = false ]; optional bool sharding = 26 [ default = false ]; optional float last_comm_group_size_MB = 27 [ default = 1 ]; + optional bool find_unused_parameters = 28 [ default = true ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; @@ -164,6 +181,7 @@ message DistributedStrategy { optional LambConfig lamb_configs = 109; optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110; optional ShardingConfig sharding_configs = 111; + optional HybridConfig hybrid_configs = 112; optional BuildStrategy build_strategy = 201; optional ExecutionStrategy execution_strategy = 202; } diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index a3fbb008fe4f44..b99ab6b5a7ff19 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -82,6 +82,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { platform::errors::Unimplemented("platform::XPUPlace is not supported")); } + inline ::DLContext operator()(const platform::NPUPlace &place) const { + PADDLE_THROW( + platform::errors::Unimplemented("platform::NPUPlace is not supported")); + } + inline ::DLContext operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLContext ctx; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 0acc8a55fa9f8a..101991d2c1ba00 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -453,6 +453,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #else PADDLE_THROW( platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); +#endif + } else if (platform::is_npu_place(place_)) { +#ifdef PADDLE_WITH_ASCEND_CL + // TODO(ascendrc): Support garbage collector on NPUPlace + VLOG(4) << "Skip NPU gc because it is not implemented now."; +#else + PADDLE_THROW(platform::errors::Unimplemented( + "No NPU gc found in CPU/GPU/XPU paddle")); #endif } } diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 61f3c026f1facc..c8517b9503741b 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -1,5 +1,10 @@ if(WITH_PSLIB) cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope pslib_brpc pslib) +else() + cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) +endif(WITH_PSLIB) + +if(WITH_HETERPS) if(WITH_NCCL) nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps) @@ -8,13 +13,10 @@ if(WITH_PSLIB) hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps) add_subdirectory(heter_ps) - else() - cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc) endif(WITH_NCCL) else() - cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) 
cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc)
-endif(WITH_PSLIB)
+endif(WITH_HETERPS)
if(WITH_NCCL OR WITH_RCCL)
cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
@@ -42,5 +44,5 @@ cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_conte
cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
if(WITH_ASCEND)
- cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend ascend_graph)
+ cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph)
endif(WITH_ASCEND)
diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h
index da79fccb8ca69f..baa2fd126a4b77 100644
--- a/paddle/fluid/framework/fleet/ascend_wrapper.h
+++ b/paddle/fluid/framework/fleet/ascend_wrapper.h
@@ -37,25 +37,50 @@ limitations under the License. */
namespace paddle {
namespace framework {
-// typedef std::vector AscendGraphDesc;
typedef ge::Graph AscendGraphDesc;
+#ifdef PADDLE_WITH_ASCEND_STRING
+using AscendString = ge::AscendString;
+#else
+using AscendString = std::string;
+#endif
+
class AscendInstance {
public:
virtual ~AscendInstance() {}
AscendInstance() {}
- std::map GetDefaultInitSessionOptions() {
- std::map init_options;
- init_options["a"] = "b";
- init_options["ge.trainFlag"] = "1";
+ std::map _GetDefaultInitOptions() {
+ std::map init_options;
+ init_options["ge.exec.deviceId"] = "0";
+ init_options["ge.graphRunMode"] = "1";
+ return init_options;
+ }
+
+ std::map _GetDefaultInitSessionOptions() {
+ std::map init_options;
+ // init_options["a"] = "b";
+ // init_options["ge.trainFlag"] = "1";
return init_options;
}
- // add other parameters here to init
+ ge::Status InitGEForUT() {
+ return ge::GEInitialize(_GetDefaultInitOptions());
+ }
+
void InitGlobalResouces() {
- session_.reset(new ge::Session(GetDefaultInitSessionOptions()));
- VLOG(1) << "InitGlobalResouces Done";
+ LOG(INFO) << "Begin ascend InitGlobalResouces";
+ session_.reset(new ge::Session(_GetDefaultInitSessionOptions()));
+ if (session_ == nullptr) {
+ PADDLE_THROW(platform::errors::Fatal("new session error: nullptr"));
+ }
+ LOG(INFO) << "End ascend InitGlobalResouces";
+ }
+
+ void DestroyGlobalResouces() {
+ LOG(INFO) << "Begin ascend DestroyGlobalResouces";
+ session_ = nullptr;
+ LOG(INFO) << "End ascend DestroyGlobalResouces";
+ }
+
static std::shared_ptr GetInstance() {
@@ -178,6 +203,6 @@ class AscendInstance {
private:
static std::shared_ptr ascend_instance_;
};
-} // end namespace framework
-} // end namespace paddle
+} // namespace framework
+} // namespace paddle
#endif
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h
index e584fb5e2b9ca7..613b2803637d2d 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.h
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.h
@@ -34,6 +34,9 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
+#ifdef PADDLE_WITH_HETERPS
+#include "paddle/fluid/platform/type_defs.h"
+#endif
namespace paddle {
namespace framework {
diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h
index a02931b3f5c28a..6f063e830c2da7 100644
--- a/paddle/fluid/framework/fleet/heter_context.h
+++ b/paddle/fluid/framework/fleet/heter_context.h
@@ -14,15 +14,21 @@ limitations under the License. */
#pragma once
-#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \
- (defined PADDLE_WITH_PSLIB)
+#ifdef PADDLE_WITH_HETERPS
#include
#include
#include
#include
+#ifdef PADDLE_WITH_PSLIB
#include "common_value.h" // NOLINT
+#endif
+
+#ifdef PADDLE_WITH_PSCORE
+#include "paddle/fluid/distributed/table/depends/large_scale_kv.h"
+#endif
+
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/scope.h"
@@ -39,7 +45,12 @@ class HeterContext {
}
Scope* scope_{nullptr};
std::vector> feature_keys_;
+#ifdef PADDLE_WITH_PSLIB
std::vector> value_ptr_;
+#endif
+#ifdef PADDLE_WITH_PSCORE
+ std::vector> value_ptr_;
+#endif
std::vector> device_values_;
std::vector> device_keys_;
std::vector mutex_;
diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h
index 698ece09de6c50..c3bf33b32c2daf 100644
--- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h
+++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h
@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once
-#ifdef PADDLE_WITH_PSLIB
+#ifdef PADDLE_WITH_HETERPS
#include
diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h
index e5c0972763bede..089130f6da8c73 100644
--- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h
@@ -17,11 +17,17 @@ limitations under the License. */
#include
#include
#include
+#ifdef PADDLE_WITH_PSLIB
#include "common_value.h" // NOLINT
+#endif
+#ifdef PADDLE_WITH_PSCORE
+#endif
#include "thrust/pair.h"
//#include "cudf/concurrent_unordered_map.cuh.h"
#include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h"
-#ifdef PADDLE_WITH_PSLIB
+#ifdef PADDLE_WITH_HETERPS
+#include "paddle/fluid/distributed/table/depends/large_scale_kv.h"
+#include "paddle/fluid/platform/type_defs.h"
namespace paddle {
namespace framework {
diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
index 871f9c7857af46..098c795fc7e1f9 100644
--- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
+++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -119,6 +119,7 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { continue; } ValType& gpu_val = kv[i].second; +#ifdef PADDLE_WITH_PSLIB auto* downpour_value = (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); int downpour_value_size = downpour_value->size(); @@ -138,6 +139,14 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { cpu_val[x + 7] = gpu_val.mf[x]; } } +#endif +#ifdef PADDLE_WITH_PSCORE + auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); + downpour_value->count_ = gpu_val.show; + for (int x = 0; x < gpu_val.mf_size; x++) { + downpour_value->data_[x] = gpu_val.mf[x]; + } +#endif } container_->prefetch(devid, stream); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 0e38ebbd7f4e72..2ec2a8a1f1e223 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "thrust/pair.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -182,7 +182,7 @@ class HeterComm { std::vector> path_; std::vector storage_; int feanum_{1800 * 2048}; - int multi_node_{1}; + int multi_node_{0}; std::vector nccl_inner_comms_; std::vector nccl_inter_comms_; int node_size_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 2f1c809c01eaad..1b4205e3c38fe2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#ifdef PADDLE_WITH_HETERPS #include -#ifdef PADDLE_WITH_PSLIB namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index f2e129ded9fefc..581b0d511c23ee 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { @@ -54,8 +54,8 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } void HeterPs::push_sparse(int num, FeatureKey* d_keys, FeaturePushValue* d_grads, size_t len) { - // comm_->push_sparse(num, d_keys, d_grads, len, opt_); - comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); + comm_->push_sparse(num, d_keys, d_grads, len, opt_); + // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); } void HeterPs::set_nccl_comm_and_size(const std::vector& inner_comms, diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 142f4a93b93a29..d78b6b492074de 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h" #include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 7980220eab9b9b..05b3ecf9c3c12c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc index f65b664f83ba0d..0f2af2a522e287 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include "heter_resource.h" #include "paddle/fluid/platform/cuda_device_guard.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h index ad7649a8a33cb7..7b23379994c735 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_resource.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_resource.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/enforce.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index b3ec9e752e62bb..7e82a8e014fd3c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "optimizer_conf.h" #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 4274876c9975e5..b7bb5110744649 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -26,8 +26,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include @@ -58,7 +57,12 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto& device_mutex = gpu_task->mutex_; std::vector threads; +#ifdef PADDLE_WITH_PSLIB auto fleet_ptr = FleetWrapper::GetInstance(); +#endif +#ifdef PADDLE_WITH_PSCORE + auto fleet_ptr = paddle::distributed::Communicator::GetInstance(); +#endif // data should be in input channel thread_keys_.resize(thread_keys_thread_num_); @@ -124,9 +128,16 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto ptl_func = [this, &local_keys, &local_ptr, &table_id, &fleet_ptr](int i) { size_t key_size = local_keys[i].size(); +#ifdef PADDLE_WITH_PSLIB auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( reinterpret_cast(local_ptr[i].data()), table_id, local_keys[i].data(), key_size); +#endif +#ifdef PADDLE_WITH_PSCORE + auto tt = fleet_ptr->_worker_ptr->pull_sparse_ptr( + reinterpret_cast(local_ptr[i].data()), table_id, + local_keys[i].data(), key_size); +#endif tt.wait(); auto status = tt.get(); // auto status = 0; @@ -153,8 +164,14 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, auto build_func = [device_num, &local_keys, &local_ptr, &device_keys, &device_vals, &device_mutex](int i) { std::vector> task_keys(device_num); +#ifdef PADDLE_WITH_PSLIB std::vector> task_ptrs( device_num); +#endif + +#ifdef PADDLE_WITH_PSCORE + std::vector> task_ptrs(device_num); +#endif for (size_t j = 0; j < local_keys[i].size(); j++) { int shard = local_keys[i][j] % device_num; @@ -169,7 +186,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, int cur = device_keys[dev].size(); device_keys[dev].resize(device_keys[dev].size() + len); device_vals[dev].resize(device_vals[dev].size() + len); - +#ifdef PADDLE_WITH_PSLIB for (int j = 0; j < len; ++j) { device_keys[dev][cur + j] = task_keys[dev][j]; float* ptr_val = task_ptrs[dev][j]->data(); @@ -196,6 +213,35 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, } } } +#endif +#ifdef PADDLE_WITH_PSCORE + for (int j = 0; j < len; ++j) { + device_keys[dev][cur + j] = task_keys[dev][j]; + distributed::VALUE* ptr_val = task_ptrs[dev][j]; + FeatureValue& val = device_vals[dev][cur + j]; + bool has_mf = 1; + val.delta_score = 0; + val.show = ptr_val->count_; + val.clk = 0; + val.slot = 0; + val.lr = 0; + val.lr_g2sum = 0; + val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]); + + if (has_mf) { + val.mf_size = MF_DIM + 1; + for (int x = 0; x < val.mf_size; x++) { + val.mf[x] = ptr_val->data_[x]; + } + } else { + val.mf_size = 0; + for (int x = 0; x < MF_DIM + 1; x++) { + val.mf[x] = 0; + } + } + } +#endif + VLOG(1) << "GpuPs build hbmps done"; device_mutex[dev]->unlock(); } diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 2eedcd5f1c7005..2bf564d3f76d5a 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_WITH_PSLIB +#ifdef PADDLE_WITH_HETERPS #include #include #include diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index ef586b41fe05d2..cfb23d1be2acfe 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -14,8 +14,7 @@ limitations under the License. */ #pragma once -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) +#ifdef PADDLE_WITH_HETERPS #include #include @@ -26,7 +25,6 @@ limitations under the License. */ #include #include #include - #ifdef PADDLE_WITH_GLOO #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" @@ -42,6 +40,9 @@ limitations under the License. */ #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/service/communicator.h" +#endif namespace paddle { namespace framework { @@ -219,7 +220,7 @@ class PSGPUWrapper { std::shared_ptr resource_; int32_t sleep_seconds_before_fail_exit_; std::vector slot_vector_; - int multi_node_{1}; + int multi_node_{0}; int node_size_; std::vector inner_comms_; std::vector inter_comms_; diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index c8b6c764255175..8dfbd3c268b866 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -86,8 +86,9 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_)); + callback_manager_.reset( + new platform::StreamCallbackManager(stream_)); #endif - callback_manager_.reset(new platform::StreamCallbackManager(stream_)); } StreamGarbageCollector::~StreamGarbageCollector() { diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 97800865af861f..572c79d21a045b 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -117,7 +117,8 @@ class StreamGarbageCollector : public GarbageCollector { private: gpuStream_t stream_; - std::unique_ptr callback_manager_; + std::unique_ptr> + callback_manager_; }; class CUDAPinnedGarbageCollector : public GarbageCollector { diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h index 8f52235c962445..3f65eaf3aa1216 100644 --- a/paddle/fluid/framework/heter_service.h +++ b/paddle/fluid/framework/heter_service.h @@ -30,10 +30,12 @@ limitations under the License. 
*/ #include "brpc/controller.h" #include "brpc/server.h" #include "paddle/fluid/platform/timer.h" +#endif namespace paddle { namespace framework { +#ifdef PADDLE_WITH_PSLIB typedef std::function HeterServiceHandler; class DataFeed; @@ -142,7 +144,7 @@ class HeterTask { double cpu_2_gpu_time{0}; platform::Timer timeline; }; - +#endif template class HeterObjectPool { public: @@ -153,7 +155,7 @@ class HeterObjectPool { if (pool_.empty()) { num_ += 1; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - VLOG(0) << "pool construct size: " << num_; + VLOG(3) << "pool construct size: " << num_; #endif return std::make_shared(); } else { @@ -178,6 +180,7 @@ class HeterObjectPool { int num_{0}; }; +#ifdef PADDLE_WITH_PSLIB struct BthreadMutextGuard { BthreadMutextGuard(bthread_mutex_t* rho) { mutex_ = rho; @@ -258,7 +261,6 @@ class HeterList { std::unique_lock lock(mutex_); cond_.wait(lock, [this] { return size < cap_; }); if (task_map_.find(key) != task_map_.end()) { - // std::cout << "try put key=" << key << " false" << std::endl; task_map_.erase(key); return false; } else { @@ -267,7 +269,6 @@ class HeterList { node->value = value; map_[node->key] = node; attach(node); - // std::cout << "try put key=" << key << " true" << std::endl; return true; } } @@ -276,7 +277,6 @@ class HeterList { std::unique_lock lock(mutex_); cond_.wait(lock, [this] { return size < cap_; }); HeterNode* node = new HeterNode; - // std::cout << "put key=" << key << " true" << std::endl; node->key = key; node->value = value; map_[node->key] = node; @@ -288,7 +288,6 @@ class HeterList { std::lock_guard lock(mutex_); auto iter = map_.find(key); if (iter != map_.end()) { - // std::cout << "try get key=" << key << " true" << std::endl; HeterNode* node = iter->second; detach(node); cond_.notify_one(); @@ -298,7 +297,6 @@ class HeterList { return ret; } task_map_.insert(key); - // std::cout << "try get key=" << key << " false" << std::endl; return nullptr; } @@ -306,7 +304,6 @@ class HeterList { std::lock_guard lock(mutex_); auto iter = map_.find(key); if (iter != map_.end()) { - // std::cout << "get key=" << key << " true" << std::endl; HeterNode* node = iter->second; detach(node); cond_.notify_one(); @@ -315,7 +312,6 @@ class HeterList { delete node; return ret; } - // std::cout << "get key=" << key << " false" << std::endl; return nullptr; } @@ -323,14 +319,12 @@ class HeterList { std::lock_guard lock(mutex_); HeterNode* node = head_->next; if (node == tail_) { - // std::cout << "get2 false" << std::endl; return nullptr; } else { detach(node); cond_.notify_one(); T ret = std::move(node->value); map_.erase(node->key); - // std::cout << "get2 key=" << node->key << " true" << std::endl; delete node; return ret; } @@ -371,7 +365,7 @@ class HeterList { int cap_; int size; }; +#endif } // namespace framework } // namespace paddle -#endif diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index d8639643f2c8a7..89dc5c7d3ea932 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" @@ -226,14 +227,32 @@ void HogwildWorker::PrintFetchVars() { // call count batch_num_++; int batch_per_print = fetch_config_.print_period(); - if (thread_id_ == 0) { - if (batch_num_ % batch_per_print == 0) { - int fetch_var_num = fetch_config_.fetch_var_names_size(); - for (int i = 0; i < fetch_var_num; ++i) { - platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i), - fetch_config_.fetch_var_str_format(i)); + int fetch_var_num = fetch_config_.fetch_var_names_size(); + + if (fetch_var_num == 0) { + return; + } + + if (thread_id_ == 0 && batch_num_ % batch_per_print == 0) { + time_t curtime; + time(&curtime); + char mbstr[80]; + std::strftime(mbstr, sizeof(mbstr), "%Y-%m-%d %H:%M:%S", + std::localtime(&curtime)); + + std::stringstream ss; + ss << "time: [" << mbstr << "], "; + ss << "batch: [" << batch_num_ << "], "; + + for (int i = 0; i < fetch_var_num; ++i) { + platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i), + fetch_config_.fetch_var_str_format(i), &ss); + if (i < fetch_var_num - 1) { + ss << ", "; } } + + std::cout << ss.str() << std::endl; } } diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 84c6b03e76bc1e..59d071e1034590 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -34,15 +34,19 @@ namespace patterns { static PDNode* create_emb_vars(PDPattern* pattern, const std::string& name, const std::string& arg, bool is_persist = false) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; PDNode* node = - pattern->NewNode(name)->assert_is_op_input("lookup_table", arg); + pattern->NewNode(name)->assert_is_ops_input(embedding_ops, arg); if (is_persist) return node->assert_is_persistable_var(); return node; } static PDNode* create_emb_out_vars(PDPattern* pattern, const std::string& name, const std::string& arg) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; PDNode* node = pattern->NewNode(name) - ->assert_is_only_output_of_op("lookup_table") + ->assert_is_only_output_of_ops(embedding_ops) ->assert_is_op_input("elementwise_add", arg) ->AsIntermediate(); return node; @@ -56,10 +60,12 @@ void Embedding2Eltwise1Pattern::operator()() { create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); auto* lookup_table2_w = create_emb_vars(pattern, lookup_table2_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; auto* lookup_table1 = - pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); auto* lookup_table2 = - pattern->NewNode(lookup_table2_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table2_repr())->assert_is_ops(embedding_ops); auto* lookup_table1_out = create_emb_out_vars(pattern, lookup_table1_out_repr(), "X"); auto* lookup_table2_out = @@ -80,8 +86,10 @@ void Embedding1Eltwise1Pattern::operator()() { create_emb_vars(pattern, lookup_table1_x_repr(), "Ids"); auto* lookup_table1_w = create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; auto* lookup_table1 = - 
pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); auto* lookup_table1_out = create_emb_out_vars(pattern, lookup_table1_out_repr(), "Y"); auto* eltwise_add = @@ -291,6 +299,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()}); new_op_desc.SetAttr("epsilon", end_patter_layernorms[k]->Op()->GetAttr("epsilon")); + + if (end_patter_layernorms[k]->Op()->HasAttr("out_threshold")) { + new_op_desc.SetAttr("enable_int8", true); + } + auto* embedding_eltwise_layernorm = graph->CreateOpNode(&new_op_desc); for (size_t iter = 0; iter < start_pattern_in_nodes[i].size(); ++iter) { @@ -347,4 +360,5 @@ REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .EQ("lookup_table", 0) + .LE("lookup_table_v2", 1) .EQ("elementweise_add", 0)); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index deb182c0fbe19c..d74e8e5f65cd20 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -652,6 +652,36 @@ PDNode *PDNode::assert_is_ops_input( return this; } +PDNode *PDNode::assert_is_only_input_of_ops( + const std::unordered_set &op_types) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { + if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) && + op->inputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_only_output_of_ops( + const std::unordered_set &op_types) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { + if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) && + op->outputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} + bool VarLinksToOp(Node *node, const std::string &op_type) { for (auto *out : node->outputs) { if (out->IsOp() && out->Op()->Type() == op_type) { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index b6c1074d90dd2a..cfac01ec9dedc8 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -145,6 +145,11 @@ struct PDNode { const std::unordered_set& op_types, const std::string& argument, int nth); + PDNode* assert_is_only_input_of_ops( + const std::unordered_set& op_types); + PDNode* assert_is_only_output_of_ops( + const std::unordered_set& op_types); + PDNode* assert_has_n_inputs(size_t n); PDNode* assert_has_n_outputs(size_t n); diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc index a2443c86986ec8..c36123f65f6644 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc @@ -57,7 +57,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { std::vector y_shape = matmul_in_y->Var()->GetShape(); size_t x_rank = x_shape.size(); size_t y_rank = y_shape.size(); - flag = flag && x_rank == 2 && y_rank == 2; + flag = flag && (x_rank == 2 || x_rank == 3) && y_rank == 2; std::vector& next_ops = matmul_out->outputs; flag = flag && next_ops.size() == 1 && @@ -69,7 +69,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { 
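Note (illustrative, not part of the patch): MapMatmul2MulPass above now also accepts a rank-3 X input, and the hunk that follows sets x_num_col_dims to x_rank - 1 accordingly. The sketch below shows why that is equivalent: mul flattens the leading input dimensions, so a [B, S, H] input with x_num_col_dims = 2 multiplies a [H, N] weight exactly like a [B*S, H] matrix. Sizes are toy values.

#include <cstdio>
#include <vector>

int main() {
  const int B = 2, S = 3, H = 4, N = 5;
  std::vector<float> x(B * S * H, 1.0f);    // rank-3 input, row-major storage
  std::vector<float> w(H * N, 0.5f);        // rank-2 weight
  std::vector<float> out(B * S * N, 0.0f);  // result shape [B, S, N]
  for (int r = 0; r < B * S; ++r)           // rows of the flattened input
    for (int c = 0; c < N; ++c)
      for (int k = 0; k < H; ++k)
        out[r * N + c] += x[r * H + k] * w[k * N + c];
  std::printf("out[0] = %.1f (expect H * 1.0 * 0.5 = 2.0)\n", out[0]);
  return 0;
}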
desc.SetInput("X", {matmul_in_x->Name()}); desc.SetInput("Y", {matmul_in_y->Name()}); desc.SetOutput("Out", {matmul_out->Name()}); - desc.SetAttr("x_num_col_dims", 1); + desc.SetAttr("x_num_col_dims", static_cast(x_rank - 1)); desc.SetAttr("y_num_col_dims", 1); if (matmul_op->Op()->HasAttr("enable_int8")) { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index e20c0667ec3bc2..1e8349e878781d 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -535,6 +535,38 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope, multihead_op_desc.SetAttr("alpha", scale_attr); multihead_op_desc.SetAttr("head_number", head_number); + auto* mul0_op_desc = mul0->Op(); + auto* mul1_op_desc = mul1->Op(); + auto* mul2_op_desc = mul2->Op(); + if (mul0_op_desc->HasAttr("enable_int8")) { + multihead_op_desc.SetAttr("enable_int8", + mul0_op_desc->GetAttr("enable_int8")); + // all mul op has same input. + multihead_op_desc.SetAttr("Input_scale", + mul0_op_desc->GetAttr("X_scale")); + auto weight_scale0 = BOOST_GET_CONST( + std::vector, mul0_op_desc->GetAttr("weight_scale")); + auto weight_scale1 = BOOST_GET_CONST( + std::vector, mul1_op_desc->GetAttr("weight_scale")); + auto weight_scale2 = BOOST_GET_CONST( + std::vector, mul2_op_desc->GetAttr("weight_scale")); + auto weight_max = std::max(weight_scale0, weight_scale1); + weight_max = std::max(weight_max, weight_scale2); + multihead_op_desc.SetAttr("weight_scale", weight_max); + + if (mul0_op_desc->HasAttr("out_threshold")) { + auto out_scale0 = + BOOST_GET_CONST(float, mul0_op_desc->GetAttr("out_threshold")); + auto out_scale1 = + BOOST_GET_CONST(float, mul1_op_desc->GetAttr("out_threshold")); + auto out_scale2 = + BOOST_GET_CONST(float, mul2_op_desc->GetAttr("out_threshold")); + auto out_scale_max = std::max(out_scale0, out_scale1); + out_scale_max = std::max(out_scale_max, out_scale2); + multihead_op_desc.SetAttr("out_threshold", out_scale_max); + } + } + auto* multihead = graph->CreateOpNode(&multihead_op_desc); IR_NODE_LINK_TO(input0, multihead); @@ -682,6 +714,447 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope, return fusion_count; } +PDNode* MultiHeadMatmulV3Pattern::operator()() { + std::unordered_set matmul_ops{"matmul", "matmul_v2"}; + auto* input0 = pattern->NewNode(input0_repr()); + input0->assert_is_op_input("matmul"); + + // First path with scale + auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matmul"); + auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul0_out_var = + pattern->NewNode(mul0_out_repr())->assert_is_op_output("matmul"); + + decltype(mul0) eltadd0; + decltype(mul0) eltadd0_b_var; + decltype(mul0) eltadd0_out_var; + + mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + eltadd0 = pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd0_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_0 = + pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + + auto* reshape2_0_out_var = + 
pattern->NewNode(reshape2_0_out_repr())->assert_is_op_output("reshape2"); + reshape2_0_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matmul"); + + auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk_out_var = + pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto* eltadd_qk = + pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); + auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd_qk_out_var->AsIntermediate()->assert_is_op_input("softmax"); + + auto* softmax_qk = + pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = + pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax"); + softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); + + auto* matmul_qkv = + pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops); + auto* matmul_qkv_out_var = + pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops); + matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_qkv = + pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2"); + reshape2_qkv_out_var->assert_is_op_input("matmul"); + + // Second path to matmul + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matmul"); + auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul1_out_var = + pattern->NewNode(mul1_out_repr())->assert_is_op_output("matmul"); + + decltype(mul1) eltadd1; + decltype(mul1) eltadd1_b_var; + decltype(mul1) eltadd1_out_var; + + mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); + eltadd1_b_var = pattern->NewNode(eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd1_out_var = pattern->NewNode(eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd1_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_1 = + pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2"); + + auto* reshape2_1_out_var = + pattern->NewNode(reshape2_1_out_repr())->assert_is_op_output("reshape2"); + reshape2_1_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_1 = + pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); + auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_1_out_var->AsIntermediate()->assert_is_op_input( + "matmul"); // link 
to matmul qk + + // Third path to matmul + auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matmul"); + auto* mul2_w_var = pattern->NewNode(mul2_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul2_out_var = + pattern->NewNode(mul2_out_repr())->assert_is_op_output("matmul"); + + decltype(mul2) eltadd2; + decltype(mul2) eltadd2_b_var; + decltype(mul2) eltadd2_out_var; + + mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add"); + eltadd2_b_var = pattern->NewNode(eltadd2_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd2_out_var = pattern->NewNode(eltadd2_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd2_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_2 = + pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2"); + + auto* reshape2_2_out_var = + pattern->NewNode(reshape2_2_out_repr())->assert_is_op_output("reshape2"); + reshape2_2_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_2 = + pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); + auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_2_out_var->AsIntermediate()->assert_is_ops_input( + matmul_ops); // link to matmul qkv + + // Q path + mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var}); + eltadd0->LinksFrom({mul0_out_var, eltadd0_b_var}).LinksTo({eltadd0_out_var}); + + reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); + transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); + // K path + mul1->LinksFrom({input0, mul1_w_var}).LinksTo({mul1_out_var}); + eltadd1->LinksFrom({mul1_out_var, eltadd1_b_var}).LinksTo({eltadd1_out_var}); + reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var}); + transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var}); + // compute q*k + matmul_qk->LinksFrom({transpose2_0_out_var, transpose2_1_out_var}) + .LinksTo({matmul_qk_out_var}); + eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) + .LinksTo({eltadd_qk_out_var}); + softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); + // V path + mul2->LinksFrom({input0, mul2_w_var}).LinksTo({mul2_out_var}); + eltadd2->LinksFrom({mul2_out_var, eltadd2_b_var}).LinksTo({eltadd2_out_var}); + reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var}); + transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var}); + // compute q*k*v + matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var}) + .LinksTo({matmul_qkv_out_var}); + transpose2_qkv->LinksFrom({matmul_qkv_out_var}) + .LinksTo({transpose2_qkv_out_var}); + reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) + .LinksTo({reshape2_qkv_out_var}); + + return transpose2_2_out_var; +} + +static int BuildFusionV3(Graph* graph, const std::string& name_scope, + Scope* scope) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. 
+ MultiHeadMatmulV3Pattern multihead_pattern(pattern, name_scope); + + multihead_pattern(); + // Create New OpDesc + auto fuse_creater = [&]( + Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out, + Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b, + Node* reshape2, Node* reshape2_qkv_out, Node* matmul_qk) { + auto scale_attr = BOOST_GET_CONST(float, matmul_qk->Op()->GetAttr("alpha")); + + // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H) + // bias (B * S * 3 * N * H) + bias (3 * N * H) + // Transpose (B * S * 3 * N * H) -> (3 * B * N * S * H) + auto* wq_tensor = scope->FindVar(mul0_w->Name())->GetMutable(); + auto* wk_tensor = scope->FindVar(mul1_w->Name())->GetMutable(); + auto* wv_tensor = scope->FindVar(mul2_w->Name())->GetMutable(); + + auto* bq_tensor = + scope->FindVar(eltadd0_b->Name())->GetMutable(); + auto* bk_tensor = + scope->FindVar(eltadd1_b->Name())->GetMutable(); + auto* bv_tensor = + scope->FindVar(eltadd2_b->Name())->GetMutable(); + + auto* wq_data = wq_tensor->mutable_data(platform::CPUPlace()); + auto* wk_data = wk_tensor->mutable_data(platform::CPUPlace()); + auto* wv_data = wv_tensor->mutable_data(platform::CPUPlace()); + auto* bq_data = bq_tensor->mutable_data(platform::CPUPlace()); + auto* bk_data = bk_tensor->mutable_data(platform::CPUPlace()); + auto* bv_data = bv_tensor->mutable_data(platform::CPUPlace()); + + auto combined_w_dims = + framework::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + auto combined_bias_dims = framework::make_ddim({3, bq_tensor->dims()[0]}); + + // reuse the mul0_w and eltadd_0_b nodes for the combined nodes. + auto* combined_w_desc = mul0_w->Var(); + combined_w_desc->SetShape({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + combined_w_desc->SetPersistable(true); + + auto* combined_bias_desc = eltadd0_b->Var(); + combined_bias_desc->SetShape({3, bq_tensor->dims()[0]}); + combined_bias_desc->SetPersistable(true); + + framework::LoDTensor tmp_combined_w_tensor; + tmp_combined_w_tensor.Resize(combined_w_dims); + auto* tmp_combined_w_data = + tmp_combined_w_tensor.mutable_data(platform::CPUPlace()); + + std::vector w_vec = {wq_data, wk_data, wv_data}; + int dims_h = combined_w_dims[0], dims_w = combined_w_dims[2]; + // Combine the three fc weights together. 
+ for (int i = 0; i < dims_h; i++) { + for (int j = 0; j < 3; j++) { + for (int k = 0; k < dims_w; k++) { + int out_index = i * (3 * dims_w) + j * dims_w + k; + int in_index = i * dims_w + k; + tmp_combined_w_data[out_index] = w_vec[j][in_index]; + } + } + } + + wq_tensor->Resize(combined_w_dims); + auto* new_combined_w_data = + wq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_w_data, tmp_combined_w_data, + sizeof(float) * wq_tensor->numel()); + + scope->EraseVars({mul1_w->Name(), mul2_w->Name()}); + + framework::LoDTensor tmp_combined_bias_tensor; + tmp_combined_bias_tensor.Resize(combined_bias_dims); + auto* tmp_combined_bias_data = + tmp_combined_bias_tensor.mutable_data(platform::CPUPlace()); + + size_t bias_size = bq_tensor->numel(); + memcpy(tmp_combined_bias_data, bq_data, sizeof(float) * bias_size); + memcpy(tmp_combined_bias_data + bias_size, bk_data, + sizeof(float) * bias_size); + memcpy(tmp_combined_bias_data + 2 * bias_size, bv_data, + sizeof(float) * bias_size); + + bq_tensor->Resize(combined_bias_dims); + auto* new_combined_bias_data = + bq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_bias_data, tmp_combined_bias_data, + sizeof(float) * bq_tensor->numel()); + + scope->EraseVars({eltadd1_b->Name(), eltadd2_b->Name()}); + + auto reshape_desc = reshape2->Op(); + int head_number = + BOOST_GET_CONST(std::vector, reshape_desc->GetAttr("shape")).at(2); + + OpDesc multihead_op_desc; + multihead_op_desc.SetType("multihead_matmul"); + + multihead_op_desc.SetInput("Input", {input0->Name()}); + multihead_op_desc.SetInput("W", {mul0_w->Name()}); + multihead_op_desc.SetInput("Bias", {eltadd0_b->Name()}); + multihead_op_desc.SetInput("BiasQK", {eltadd_qk_b->Name()}); + + multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()}); + multihead_op_desc.SetAttr("alpha", scale_attr); + multihead_op_desc.SetAttr("head_number", head_number); + + auto* multihead = graph->CreateOpNode(&multihead_op_desc); + + IR_NODE_LINK_TO(input0, multihead); + IR_NODE_LINK_TO(mul0_w, multihead); + IR_NODE_LINK_TO(eltadd0_b, multihead); + IR_NODE_LINK_TO(eltadd_qk_b, multihead); + + IR_NODE_LINK_TO(multihead, reshape2_qkv_out); + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_w, mul0_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0, reshape2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, reshape2_0_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0, transpose2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, transpose2_0_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul1, mul1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_out, mul1_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_w, mul1_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1, reshape2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1_out, reshape2_1_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1, transpose2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1_out, transpose2_1_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul2, mul2, multihead_pattern); + 
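Note (illustrative, not part of the patch): fuse_creater above packs the three Q/K/V fc weights, each of shape [hidden, W], into one [hidden, 3, W] tensor (and concatenates the three biases) so the fused multihead_matmul op reads them from a single buffer. A toy version of that packing loop, with tiny sizes:

#include <cstdio>
#include <vector>

int main() {
  const int H = 2, W = 3;  // stand-ins for hidden size and per-head width
  std::vector<float> wq(H * W, 1.f), wk(H * W, 2.f), wv(H * W, 3.f);
  std::vector<float> combined(H * 3 * W);
  const float* w_vec[3] = {wq.data(), wk.data(), wv.data()};
  for (int i = 0; i < H; ++i)
    for (int j = 0; j < 3; ++j)
      for (int k = 0; k < W; ++k)
        combined[i * (3 * W) + j * W + k] = w_vec[j][i * W + k];
  // Each packed row is [q_row_i | k_row_i | v_row_i].
  std::printf("%.0f %.0f %.0f\n", combined[0], combined[W], combined[2 * W]);
  return 0;
}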
GET_IR_NODE_FROM_SUBGRAPH(mul2_out, mul2_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul2_w, mul2_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2, reshape2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2_out, reshape2_2_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2, transpose2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2_out, transpose2_2_out, + multihead_pattern); + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH(eltadd0, eltadd0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_b, eltadd0_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_out, eltadd0_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd1, eltadd1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_b, eltadd1_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_out, eltadd1_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd2, eltadd2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_b, eltadd2_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_out, eltadd2_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk, matmul_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk_out, matmul_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk, eltadd_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_b, eltadd_qk_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_out, eltadd_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk, softmax_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv, matmul_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, matmul_qkv_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv, reshape2_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, reshape2_qkv_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, transpose2_qkv, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out, + multihead_pattern); + + // If weights or biases in qkv's fc are shared by multiple multihead_matmul + // patterns, we do not support this kind of fusion, this pass will not take + // effect. + bool is_fc_params_shared = + mul0_w->outputs.size() > 1 || mul1_w->outputs.size() > 1 || + mul2_w->outputs.size() > 1 || eltadd0_b->outputs.size() > 1 || + eltadd1_b->outputs.size() > 1 || eltadd2_b->outputs.size() > 1; + if (is_fc_params_shared) { + return; + } + fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w, + mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b, + reshape2_0, reshape2_qkv_out, matmul_qk); + + std::unordered_set marked_nodes({eltadd0, + eltadd1, + eltadd2, + eltadd1_b, + eltadd2_b, + eltadd0_out, + eltadd1_out, + eltadd2_out, + reshape2_0, + reshape2_1, + reshape2_2, + reshape2_0_out, + reshape2_1_out, + reshape2_2_out, + transpose2_0, + transpose2_1, + transpose2_2, + transpose2_0_out, + transpose2_1_out, + transpose2_2_out, + matmul_qk, + matmul_qk_out, + eltadd_qk, + eltadd_qk_out, + softmax_qk, + softmax_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + mul0, + mul1, + mul2, + mul0_out, + mul1_out, + mul2_out, + mul1_w, + mul2_w, + reshape2_qkv}); + // Remove unneeded nodes. 
+ GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + gpd(graph, handler); + + return fusion_count; +} + } // namespace patterns void MultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const { @@ -706,6 +1179,21 @@ void MultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const { AddStatis(fusion_count); } +void MultiHeadMatmulV3FusePass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal( + "During the multiheadMatmul pass, The scope should not be null.")); + + int fusion_count = patterns::BuildFusionV3(graph, name_scope_, scope); + if (fusion_count > 0) { + graph->Set(kMultiheadMatmulPass, new bool(true)); + } + AddStatis(fusion_count); +} + } // namespace ir } // namespace framework } // namespace paddle @@ -715,6 +1203,8 @@ REGISTER_PASS(multihead_matmul_fuse_pass, REGISTER_PASS(multihead_matmul_fuse_pass_v2, paddle::framework::ir::MultiHeadMatmulV2FusePass); +REGISTER_PASS(multihead_matmul_fuse_pass_v3, + paddle::framework::ir::MultiHeadMatmulV3FusePass); REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() @@ -725,3 +1215,13 @@ REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2) .EQ("scale", 0) .LE("matmul", 1) .EQ("softmax", 0)); +REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v3) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .LE("matmul", 1) + .EQ("matmul_v2", 0) + .EQ("softmax", 0)); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h index f5327dc71080be..c7f1336211d346 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h @@ -89,9 +89,63 @@ struct MultiHeadMatmulPattern : public PatternBase { PATTERN_DECL_NODE(matmul_qkv); PATTERN_DECL_NODE(matmul_qkv_out); }; + +struct MultiHeadMatmulV3Pattern : public PatternBase { + MultiHeadMatmulV3Pattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "multihead_matmul_v3") {} + + PDNode* operator()(); + + // declare operator node's name + PATTERN_DECL_NODE(input0); + PATTERN_DECL_NODE(mul0); + PATTERN_DECL_NODE(mul1); + PATTERN_DECL_NODE(mul2); + PATTERN_DECL_NODE(mul0_w); + PATTERN_DECL_NODE(mul1_w); + PATTERN_DECL_NODE(mul2_w); + PATTERN_DECL_NODE(mul0_out); + PATTERN_DECL_NODE(mul1_out); + PATTERN_DECL_NODE(mul2_out); + PATTERN_DECL_NODE(eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_out); + PATTERN_DECL_NODE(eltadd1_out); + PATTERN_DECL_NODE(eltadd2_out); + PATTERN_DECL_NODE(reshape2_0); + PATTERN_DECL_NODE(reshape2_1); + PATTERN_DECL_NODE(reshape2_2); + PATTERN_DECL_NODE(reshape2_qkv); + PATTERN_DECL_NODE(reshape2_0_out); + PATTERN_DECL_NODE(reshape2_1_out); + PATTERN_DECL_NODE(reshape2_2_out); + PATTERN_DECL_NODE(reshape2_qkv_out); + PATTERN_DECL_NODE(transpose2_0); + PATTERN_DECL_NODE(transpose2_1); + PATTERN_DECL_NODE(transpose2_2); + PATTERN_DECL_NODE(transpose2_qkv); + PATTERN_DECL_NODE(transpose2_0_out); + PATTERN_DECL_NODE(transpose2_1_out); 
+ PATTERN_DECL_NODE(transpose2_2_out); + PATTERN_DECL_NODE(transpose2_qkv_out); + PATTERN_DECL_NODE(matmul_qk); + PATTERN_DECL_NODE(matmul_qk_out); + PATTERN_DECL_NODE(eltadd_qk); + PATTERN_DECL_NODE(eltadd_qk_b); + PATTERN_DECL_NODE(eltadd_qk_out); + PATTERN_DECL_NODE(softmax_qk); + PATTERN_DECL_NODE(softmax_qk_out); + + PATTERN_DECL_NODE(matmul_qkv); + PATTERN_DECL_NODE(matmul_qkv_out); +}; + } // namespace patterns -// The MulGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op. class MultiHeadMatmulFusePass : public FusePassBase { public: virtual ~MultiHeadMatmulFusePass() {} @@ -112,6 +166,16 @@ class MultiHeadMatmulV2FusePass : public FusePassBase { const std::string name_scope_{"multihead_matmul_fuse_v2"}; }; +class MultiHeadMatmulV3FusePass : public FusePassBase { + public: + virtual ~MultiHeadMatmulV3FusePass() {} + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{"multihead_matmul_fuse_v3"}; +}; + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index ada20113077c18..232e1d8da4ded3 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -141,14 +141,6 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, fused_pattern); - // check if is in ernie or not - if (!graph->Has(kEmbEltwiseLayernormPass) || - !graph->Has(kMultiheadMatmulPass)) { - LOG(INFO) << "The skip_layernorm_fuse_pass is only supported in " - << "Ernie/Bert model. Just skip this pass."; - return; - } - std::unordered_set del_node_set; // Create an SkipLayerNorm op node @@ -161,6 +153,10 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { new_desc.SetInput("Scale", {layer_norm_scale->Name()}); new_desc.SetInput("Bias", {layer_norm_bias->Name()}); + if (elementwise->Op()->HasAttr("out_threshold")) { + new_desc.SetAttr("enable_int8", true); + } + // outputs new_desc.SetOutput("Out", {layer_norm_out->Name()}); diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h index 4307e51862df57..8fe314cf5f18c5 100644 --- a/paddle/fluid/framework/library_type.h +++ b/paddle/fluid/framework/library_type.h @@ -61,6 +61,8 @@ inline LibraryType StringToLibraryType(const char* ctype) { return LibraryType::kPlain; } else if (s == std::string("XPU")) { return LibraryType::kPlain; + } else if (s == std::string("NPU")) { + return LibraryType::kPlain; } else if (s == std::string("CUDA")) { return LibraryType::kPlain; } else { diff --git a/paddle/fluid/framework/load_op_lib.h b/paddle/fluid/framework/load_op_lib.h deleted file mode 100644 index 16cffe119d63e0..00000000000000 --- a/paddle/fluid/framework/load_op_lib.h +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" - -namespace paddle { -namespace framework { - -template -T *DynLoad(void *handle, std::string name) { - T *func = reinterpret_cast(dlsym(handle, name.c_str())); -#if !defined(_WIN32) - auto errorno = dlerror(); -#else - auto errorno = GetLastError(); -#endif // !_WIN32 - PADDLE_ENFORCE_NOT_NULL( - func, - platform::errors::NotFound( - "Failed to load dynamic operator library, error code(%s).", errorno)); - return func; -} - -void LoadOpLib(const std::string &dso_name) { - void *handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); - - typedef OpInfoMap &get_op_info_t(); - get_op_info_t *get_op_info = - DynLoad(handle, "PD_GetOpInfoMap"); - auto &op_info = get_op_info(); - auto *dyn_info_map = op_info.mutable_map(); - - typedef std::vector grad_op_desc_maker_t( - const OpDesc &, const std::unordered_set &, - std::unordered_map *, - const std::vector &); - - grad_op_desc_maker_t *grad_op_desc_maker = - DynLoad(handle, "PD_GetGradOpDescStrs"); - - auto &info_map = OpInfoMap::Instance(); - for (const auto &n : *(dyn_info_map)) { - auto type = n.first; - if (type == "recurrent" || type == "recurrent_grad" || - type == "conditional_block" || type == "conditional_block_grad") { - continue; - } - PADDLE_ENFORCE_NE(info_map.Has(n.first), true, - platform::errors::AlreadyExists( - "Operator (%s) has been registered.", type)); - OpInfo info; - info.creator_ = n.second.creator_; - - // If get the protocol buffer from dynamic library directly, there - // will be deconstruction error - // ** Error in `python`: free(): invalid pointer: - // ... paddle::framework::proto::OpDesc::SharedDtor() - // It seems a bug in protobuf, see - // https://github.com/protocolbuffers/protobuf/issues/435 - // So, get the serialized binary string from dynamic library, - // then deserialize to protocol buffer. 
- info.grad_op_maker_ = [grad_op_desc_maker]( - const OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block) { - std::vector strs = - grad_op_desc_maker(op_desc, no_grad_set, grad_to_var, grad_block); - std::vector> ret; - for (auto &str : strs) { - proto::OpDesc proto_desc; - PADDLE_ENFORCE_EQ(proto_desc.ParseFromString(str), true, - platform::errors::InvalidArgument( - "Failed to parse OpDesc from string.")); - ret.emplace_back(new OpDesc(proto_desc, nullptr)); - } - return ret; - }; - info.proto_ = n.second.proto_; - info.checker_ = n.second.checker_; - info.infer_var_type_ = n.second.infer_var_type_; - info.infer_shape_ = n.second.infer_shape_; - info.infer_inplace_ = n.second.infer_inplace_; - info.infer_no_need_buffer_vars_ = n.second.infer_no_need_buffer_vars_; - info.use_default_grad_op_desc_maker_ = - n.second.use_default_grad_op_desc_maker_; - info.use_empty_grad_op_desc_maker_ = n.second.use_empty_grad_op_desc_maker_; - - info_map.Insert(type, info); - } - - typedef void init_device_t(platform::DeviceContextPool *); - init_device_t *init_dev = - DynLoad(handle, "PD_InitDevicesPool"); - init_dev(&(platform::DeviceContextPool::Instance())); -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index ff8e71b92e0ac5..198bb65863bb6a 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -38,6 +38,13 @@ void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, need_merge_var_names_.push_back( trainer_desc.downpour_param().stat_var_names(i)); } +#ifdef PADDLE_WITH_HETERPS + for (int i = 0; i < thread_num_; ++i) { + int num = trainer_desc.worker_places(i); + platform::CUDAPlace place = platform::CUDAPlace(num); + places_.push_back(place); + } +#endif // get filelist from trainer_desc here const std::vector readers = dataset->GetReaders(); @@ -102,13 +109,42 @@ void MultiTrainer::InitDumpEnv() { void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, const platform::Place& place) { for (int i = 0; i < thread_num_; ++i) { +#ifdef PADDLE_WITH_HETERPS + workers_[i]->SetPlace(places_[i]); + workers_[i]->SetReaderPlace(places_[i]); +#else workers_[i]->SetPlace(place); workers_[i]->SetReaderPlace(place); +#endif workers_[i]->SetRootScope(root_scope_); workers_[i]->CreateDeviceResource(main_program); // Program workers_[i]->BindingDataFeedMemory(); workers_[i]->CacheProgram(main_program); } +#ifdef PADDLE_WITH_HETERPS + for (int num = 0; num < thread_num_; ++num) { + auto place = places_[num]; + Scope* scope = workers_[num]->GetThreadScope(); + auto& block = main_program.Block(0); + for (auto& var : block.AllVars()) { + if (var->Persistable()) { + auto name = var->Name(); + Variable* root_var = root_scope_->FindVar(name); + if (!root_var) { + continue; + } + if (root_var->IsType()) { + continue; + } + LoDTensor* root_tensor = root_var->GetMutable(); + auto* ptr = scope->Var(name); + InitializeVariable(ptr, proto::VarType::LOD_TENSOR); + LoDTensor* thread_tensor = ptr->GetMutable(); + TensorCopy(*root_tensor, place, thread_tensor); + } + } + } +#endif } void MultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { @@ -138,10 +174,77 @@ void MultiTrainer::Run() { } } +#ifdef PADDLE_WITH_HETERPS +void MultiTrainer::MergeDenseParam() { + auto communicator = paddle::distributed::Communicator::GetInstance(); + auto& recv_ctx = communicator->GetRecvCtxMap(); + Scope* 
thread_scope = workers_[0]->GetThreadScope(); + for (auto& iter : recv_ctx) { + auto& varnames = iter.second; + for (auto& name : varnames) { + Variable* root_var = root_scope_->FindVar(name); + LoDTensor* root_tensor = root_var->GetMutable(); + Variable* var = thread_scope->FindVar(name); + LoDTensor* tensor = var->GetMutable(); + TensorCopy((*tensor), root_tensor->place(), root_tensor); + } + } +} +#endif + +template +void MultiTrainer::MergeToRootScope(LoDTensor* root_tensor, LoDTensor* tensor) { + LoDTensor tmp_root; + TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root); + T* tmp_root_data = tmp_root.data(); + LoDTensor tmp_tensor; + TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor); + T* data = tmp_tensor.data(); + for (int i = 0; i < tmp_tensor.numel(); i++) { + tmp_root_data[i] += data[i]; + } + TensorCopy(tmp_root, platform::CPUPlace(), root_tensor); +} + void MultiTrainer::Finalize() { if (need_dump_field_ || need_dump_param_) { FinalizeDumpEnv(); } +#ifdef PADDLE_WITH_HETERPS + for (size_t i = 0; i < need_merge_var_names_.size(); i++) { + Variable* root_var = root_scope_->FindVar(need_merge_var_names_[i]); + if (root_var == nullptr) { + continue; + } + LoDTensor* root_tensor = root_var->GetMutable(); + + for (size_t j = 0; j < places_.size(); j++) { + Scope* cur_thread_scope = workers_[j]->GetThreadScope(); + Variable* thread_var = + cur_thread_scope->FindVar(need_merge_var_names_[i]); + if (thread_var == nullptr) { + continue; + } + LoDTensor* thread_tensor = thread_var->GetMutable(); +#define MergeCallback(cpp_type, proto_type) \ + do { \ + if (root_tensor->type() == proto_type) { \ + if (thread_tensor->type() != proto_type) { \ + VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \ + << "] " << need_merge_var_names_[i] \ + << ", root tensor type=" << root_tensor->type() \ + << ", thread tensor type=" << thread_tensor->type(); \ + exit(-1); \ + } \ + MergeToRootScope(root_tensor, thread_tensor); \ + } \ + } while (0) + _ForEachDataType_(MergeCallback); + } + } + MergeDenseParam(); + +#endif root_scope_->DropKids(); } diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 472c6f408266af..4c529329761227 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -304,6 +304,9 @@ struct OpKernelRegistrarFunctorEx &places, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { + PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]), + platform::errors::Unavailable( + "NPU is not supported in ParallelExecutor")); InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index a97fc2e75aab14..5968df548dfb0f 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -71,37 +71,16 @@ void PipelineTrainer::CopyParameters(int microbatch_id, const ProgramDesc& program, const platform::Place& place) { auto& global_block = program.Block(0); - std::map param_map; - for (auto& var : global_block.AllVars()) { - if (var->Persistable()) { - param_map[var->Name()] = 1; - } - } for (auto& var : global_block.AllVars()) { - bool is_param_grad = false; - size_t pos = 0; - // A magic suffix to indicate the merged gradient - std::string magicSuffix = std::string(kGradVarSuffix) + "@MERGED"; - if ((pos = var->Name().find(magicSuffix)) != 
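Note (illustrative, not part of the patch): under PADDLE_WITH_HETERPS, MultiTrainer::Finalize above copies each worker's persistable tensor to CPU and accumulates it element-wise into the root-scope tensor via MergeToRootScope, then calls MergeDenseParam. The same accumulation with plain vectors:

#include <cstdio>
#include <vector>

int main() {
  std::vector<float> root{1.f, 1.f, 1.f};  // root-scope copy of a variable
  std::vector<std::vector<float>> thread_copies{{0.50f, 0.50f, 0.50f},
                                                {0.25f, 0.25f, 0.25f}};
  for (const auto& t : thread_copies)      // one copy per worker scope
    for (size_t i = 0; i < root.size(); ++i) root[i] += t[i];
  std::printf("%.2f %.2f %.2f\n", root[0], root[1], root[2]);  // 1.75 each
  return 0;
}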
std::string::npos) { - auto prefix_name = var->Name().substr(0, pos); - if (param_map.find(prefix_name) != param_map.end()) { - is_param_grad = true; - } - } if (var->Persistable() && microbatch_id == 0) { auto* ptr = root_scope_->Var(var->Name()); InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create persistable var: " << var->Name() - << ", which pointer is " << ptr; - } else if (is_param_grad && microbatch_id == 0) { - auto* ptr = minibatch_scope_->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create grad for persistable var: " << var->Name() + VLOG(5) << "Create persistable var: " << var->Name() << ", which pointer is " << ptr; - } else if (!var->Persistable() && !is_param_grad) { + } else if (!var->Persistable()) { auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name()); - VLOG(3) << "Create variable " << var->Name() << " for microbatch " + VLOG(5) << "Create variable " << var->Name() << " for microbatch " << microbatch_id << ", which pointer is " << ptr; InitializeVariable(ptr, var->GetType()); } diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index e77932fa5f2265..39bc3f040639bf 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -19,10 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_context.h" -#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) @@ -64,7 +60,6 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, pull_dense_worker_ = PullDenseWorker::GetInstance(); pull_dense_worker_->Initialize(trainer_desc); SetDebug(trainer_desc.debug()); - fleet_ptr_ = FleetWrapper::GetInstance(); trainer_desc_ = trainer_desc; workers_.resize(place_num); for (int i = 0; i < place_num; ++i) { diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index 2597901d91f36b..d178c4e556ca57 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -14,7 +14,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/fleet/heter_wrapper.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/string_helper.h" diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 54f77981306336..101463756c0a51 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -125,25 +125,54 @@ TEST(Tensor, MutableData) { float* p2 = nullptr; // initialization p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); auto p1_holder = src_tensor.Holder(); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated p2 = src_tensor.mutable_data(framework::make_ddim({3, 1024}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); auto p2_holder = src_tensor.Holder(); EXPECT_NE(p2, nullptr); EXPECT_NE(p1_holder.get(), p2_holder.get()); // set src_tensor a new dim with same size // momery block is supposed to be unchanged p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); EXPECT_EQ(p1, p2); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + EXPECT_EQ(p1, p2); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::NPUPlace(0)); + auto p1_holder = src_tensor.Holder(); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(framework::make_ddim({3, 1024}), + platform::NPUPlace(0)); + auto p2_holder = src_tensor.Holder(); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1_holder.get(), p2_holder.get()); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::NPUPlace(0)); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::NPUPlace(0)); EXPECT_EQ(p1, p2); } #endif @@ -179,7 +208,17 @@ TEST(Tensor, ShareDataWith) { framework::Tensor src_tensor; framework::Tensor dst_tensor; src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + framework::Tensor dst_tensor; + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::NPUPlace(0)); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } @@ -216,7 +255,34 @@ TEST(Tensor, Slice) { { framework::Tensor src_tensor; src_tensor.mutable_data(framework::make_ddim({6, 9}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + framework::Tensor slice_tensor = src_tensor.Slice(2, 6); + framework::DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 2); + EXPECT_EQ(slice_dims[0], 4); + EXPECT_EQ(slice_dims[1], 9); + + 
uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = + reinterpret_cast(src_tensor.mutable_data( + src_tensor.dims(), platform::CUDAPlace(0))); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CUDAPlace(0))); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); + } +#endif + +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 9}), + platform::NPUPlace(0)); framework::Tensor slice_tensor = src_tensor.Slice(2, 6); framework::DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); @@ -227,12 +293,12 @@ TEST(Tensor, Slice) { reinterpret_cast(src_tensor.data()); uintptr_t src_mutable_data_address = reinterpret_cast(src_tensor.mutable_data( - src_tensor.dims(), platform::CUDAPlace())); + src_tensor.dims(), platform::NPUPlace(0))); uintptr_t slice_data_address = reinterpret_cast(slice_tensor.data()); uintptr_t slice_mutable_data_address = reinterpret_cast(slice_tensor.mutable_data( - slice_tensor.dims(), platform::CUDAPlace())); + slice_tensor.dims(), platform::NPUPlace(0))); EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index c6ac30a369859d..d6882b25d22588 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -97,6 +97,42 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + // TODO(zhiqiu): handle different condition like CUDA code below + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + stream); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + stream); + } + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + stream); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { @@ -304,6 +340,35 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", 
src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { /* npu -> cpu*/ + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + nullptr); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { /* cpu -> npu*/ + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + nullptr); + } + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { /* npu -> npu*/ + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + nullptr); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { @@ -431,6 +496,13 @@ class AnyVisitor : public boost::static_visitor { return GetResultHelper(out, gpu); } + bool GetResult(const framework::Tensor& out, + const platform::NPUPlace& npu) const { + PADDLE_THROW( + platform::errors::Unimplemented("Not supported on place (%s) ", npu)); + // return GetResultHelper(out, npu); + } + bool GetResult(const framework::Tensor& out, const platform::CPUPlace& cpu) const { return *out.data(); @@ -633,6 +705,10 @@ struct BothFalseVisitor : public boost::static_visitor<> { #endif } + void VisitorImpl(const platform::NPUPlace& npu) const { + // TODO(zhiqiu) + } + void VisitorImpl(const platform::CPUPlace& cpu) const { int num = in_.numel(); const bool* in_ptr = in_.data(); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index fd0f98784ceb0a..85af9e50087024 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -135,6 +135,7 @@ void TensorFromArray(const T* src, const size_t& array_size, } #endif } + template void TensorFromVector(const std::vector& src, const platform::DeviceContext& ctx, Tensor* dst) { @@ -157,6 +158,57 @@ void TensorFromVector(const std::vector& src, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(dst_place)) { // NOLINT + memory::Copy( + BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +// The fully specialized function should be inline to avoid +// multi-definition. +template <> +inline void TensorFromVector(const std::vector& src, + const platform::DeviceContext& ctx, Tensor* dst) { + // vector has no data() member, use array instead. 
+ // See details: + // https://stackoverflow.com/questions/46115669/why-does-stdvectorbool-have-no-data/46115714 + bool* array = new bool[src.size()]; + for (unsigned int i = 0; i < src.size(); i++) { + array[i] = static_cast(src[i]); + } + + auto dst_place = ctx.GetPlace(); + auto src_ptr = static_cast(array); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(bool); + + if (platform::is_cpu_place(dst_place)) { + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(dst_place)) { // NOLINT + memory::Copy( + BOOST_GET_CONST(platform::CUDAPlace, dst_place), dst_ptr, src_place, + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(dst_place)) { // NOLINT + memory::Copy( + BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif + delete[] array; } template @@ -171,6 +223,23 @@ void TensorFromVector(const std::vector& src, Tensor* dst) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } +template <> +inline void TensorFromVector(const std::vector& src, Tensor* dst) { + bool* array = new bool[src.size()]; + for (unsigned int i = 0; i < src.size(); i++) { + array[i] = static_cast(src[i]); + } + platform::CPUPlace dst_place = platform::CPUPlace(); + auto src_ptr = static_cast(array); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(bool); + + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); + delete[] array; +} + template void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, std::vector* dst) { @@ -194,6 +263,54 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +template <> +inline void TensorToVector(const Tensor& src, + const platform::DeviceContext& ctx, + std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(bool); + + bool* array = new bool[src.numel()]; + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(array); + + if (platform::is_cpu_place(src.place())) { + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, + size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::CUDAPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif + for (unsigned int i = 0; i < src.numel(); i++) { + (*dst)[i] = static_cast(array[i]); + } + delete[] array; } template @@ -215,6 +332,32 @@ void TensorToVector(const Tensor& src, std::vector* dst) { BOOST_GET_CONST(platform::CPUPlace, 
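Note (illustrative, not part of the patch): the bool specializations above exist because std::vector<bool> is bit-packed and has no data() member, so the elements are first expanded into a plain bool buffer that a raw memory copy can read (and, on the read side, copied back element by element). A minimal demonstration of that staging step:

#include <cstring>
#include <vector>

int main() {
  std::vector<bool> src{true, false, true};
  bool* array = new bool[src.size()];  // temporary contiguous buffer
  for (size_t i = 0; i < src.size(); ++i) array[i] = src[i];
  bool dst[3];
  std::memcpy(dst, array, src.size() * sizeof(bool));  // raw copy now works
  delete[] array;
  return (dst[0] && !dst[1] && dst[2]) ? 0 : 1;
}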
src.place()), src_ptr, size); } +template <> +inline void TensorToVector(const Tensor& src, std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(bool); + + bool* array = new bool[src.numel()]; + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(array); + + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(src.place()), true, + platform::errors::InvalidArgument( + "The input tensor should be CPU device, but actually it is in %s.", + src.place())); + + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src.place()), src_ptr, size); + + for (unsigned int i = 0; i < src.numel(); i++) { + (*dst)[i] = static_cast(array[i]); + } + delete[] array; +} + std::ostream& operator<<(std::ostream& os, const Tensor& t); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index c32efd0a470be2..8587ee8d1e9196 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -242,6 +242,61 @@ TEST(TensorToVector, Tensor) { #endif } +TEST(TensorToVector, Tensor_bool) { + { + paddle::framework::Tensor src; + bool* src_ptr = + src.mutable_data({3, 3}, paddle::platform::CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = static_cast(i % 2); + } + + paddle::platform::CPUPlace place; + std::vector dst; + paddle::framework::TensorToVector(src, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } + } +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor gpu_tensor; + paddle::platform::CUDAPlace place; + paddle::platform::CUDADeviceContext gpu_ctx(place); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + std::vector src_vec = { + false, true, false, true, false, true, false, true, false, + }; + paddle::framework::Tensor npu_tensor; + paddle::platform::NPUPlace place(0); + paddle::platform::NPUDeviceContext npu_ctx(place); + paddle::framework::TensorFromVector(src_vec, npu_ctx, &npu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(npu_tensor, npu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +} + TEST(TensorFromDLPack, Tensor) { { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index ca290a50b42fe0..7efb89ad7d9d9c 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -109,13 +109,22 @@ class MultiTrainer : public TrainerBase { virtual Scope* GetWorkerScope(int thread_id); virtual std::string GetDumpPath(int tid); + template + void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); +#ifdef PADDLE_WITH_HETERPS + + void MergeDenseParam(); +#endif + protected: int thread_num_; std::vector threads_; std::vector readers_; std::vector> workers_; std::vector need_merge_var_names_; - +#ifdef PADDLE_WITH_HETERPS + std::vector places_; +#endif int mpi_rank_; int mpi_size_; int dump_file_num_; @@ -313,7 +322,6 @@ class PSGPUTrainer : public TrainerBase { float scale_datanorm_; paddle::platform::Place place_; ProgramDesc program_; - 
std::shared_ptr fleet_ptr_; std::shared_ptr pull_dense_worker_; std::vector> workers_; std::vector places_; diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index a2b5a98401e236..e43cccfe648165 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -45,6 +45,17 @@ using Attribute = boost::variant< using AttributeMap = std::unordered_map; +#ifdef PADDLE_WITH_ASCEND_CL +using NPUAttribute = + boost::variant, + std::vector, std::vector, bool, + std::vector, BlockDesc*, int64_t, + std::vector, std::vector, + std::vector, std::vector>>; + +using NPUAttributeMap = std::unordered_map; +#endif + using OpCreator = std::function; diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 29ba54986801f1..d5350744e4c553 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -36,48 +36,73 @@ DECLARE_bool(sort_sum_gradient); namespace paddle { namespace imperative { -void BasicEngine::Init(VarBase* var, bool retain_graph) { +void BasicEngine::Init( + const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph) { retain_graph_ = retain_graph; - init_node_ = var->GradVarBase()->GradNode(); - PADDLE_ENFORCE_EQ(var->GradVarBase()->GraphIsFreed(), false, - platform::errors::Unavailable( - "%s trying to backward through the same graph a second " - "time, but this graph have already been freed. Please " - "specify Tensor.backward(retain_graph=True) when " - "calling backward at the first time.", - var->Name())); - - if (!retain_graph) { - VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() - << " because of retain_graph=False when calling backward"; - var->GradVarBase()->SetGraphIsFreed(true); - var->GradVarBase()->ClearGradNode(); - } - if (init_node_ == nullptr || var->OverridedStopGradient()) { - VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " - "stop_gradient=True: " - << var->Name(); - return; - } + PADDLE_ENFORCE_EQ( + tensors.size(), grad_tensors.size(), + platform::errors::Unavailable( + "The size of tensors do not equal the size of grad_tensors," + "the size of tensors is %s, but the size of grad_tensors is %s.", + tensors.size(), grad_tensors.size())); + + for (size_t i = 0; i < tensors.size(); ++i) { + auto var = tensors[i]; + auto grad_tensor = grad_tensors[i]; + + auto init_node = var->GradVarBase()->GradNode(); + PADDLE_ENFORCE_EQ( + var->GradVarBase()->GraphIsFreed(), false, + platform::errors::Unavailable( + "%s trying to backward through the same graph a second " + "time, but this graph have already been freed. 
Please " + "specify Tensor.backward(retain_graph=True) when " + "calling backward at the first time.", + var->Name())); + + if (!retain_graph) { + VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() + << " because of retain_graph=False when calling backward"; + var->GradVarBase()->SetGraphIsFreed(true); + var->GradVarBase()->ClearGradNode(); + } + + if (init_node == nullptr || var->OverridedStopGradient()) { + VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " + "stop_gradient=True: " + << var->Name(); + continue; + } - VLOG(3) << "Init first node of backward"; + VLOG(3) << "Init node of backward"; + + PADDLE_ENFORCE_EQ( + var->HasGradVar(), true, + platform::errors::NotFound("Tensor %s has no gradient", var->Name())); + + auto& fwd_var = var->Var().Get(); + auto* grad_var = + var->GradVarBase()->MutableVar()->GetMutable(); + VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() + << " as stop_gradient false"; + var->GradVarBase()->InnerSetOverridedStopGradient(false); + auto* dev_ctx = + platform::DeviceContextPool::Instance().Get(fwd_var.place()); + if (grad_tensor == nullptr) { + grad_var->Resize(fwd_var.dims()); + grad_var->mutable_data(fwd_var.place(), fwd_var.type()); + operators::math::set_constant(*dev_ctx, grad_var, 1.0); + } else { + paddle::framework::TensorCopy( + grad_tensor->Var().Get(), fwd_var.place(), + *dev_ctx, grad_var); + } - PADDLE_ENFORCE_EQ( - var->HasGradVar(), true, - platform::errors::NotFound("Grad variable not exist for variable %s", - var->Name())); - - auto& fwd_var = var->Var().Get(); - auto* grad_var = - var->GradVarBase()->MutableVar()->GetMutable(); - VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() - << " as stop_gradient false"; - var->GradVarBase()->InnerSetOverridedStopGradient(false); - auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); - grad_var->Resize(fwd_var.dims()); - grad_var->mutable_data(fwd_var.place(), fwd_var.type()); - operators::math::set_constant(*dev_ctx, grad_var, 1.0); + init_nodes_.push_back(init_node); + } } void BasicEngine::CheckBackwardInputs(const OpBase& op) { @@ -141,17 +166,6 @@ void BasicEngine::PrepareGradAccumulators( << var.get() << ") that don't have grad node with reference count " << accumulator->RefCnt(); - - if (var->HasLeafHooks()) { - VLOG(3) << "Grad variable wrapper (" << var->Name() - << ") has leaf grad hooks."; - PADDLE_ENFORCE_NE( - var->HasGradNode(), true, - platform::errors::PermissionDenied( - "Only leaf Tensor's gradient can append hook to " - "Gradientaccumulator.")); - accumulator->SetPostHooks(var->GetLeafHooks()); - } } else { // Because Inplace op overwrites the grad_node of the input grad_var. 
So // only the information of grad_pending_node can be used to find the @@ -235,8 +249,10 @@ void BasicEngine::PrepareDeps() { std::queue q; std::unordered_set visited; - q.push(init_node_.get()); - visited.insert(init_node_.get()); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(init_nodes_[i].get()); + visited.insert(init_nodes_[i].get()); + } while (!q.empty()) { auto* cur_node = q.front(); @@ -262,15 +278,41 @@ void BasicEngine::PrepareDeps() { } } +static std::shared_ptr> CallGradientHooks( + const NameVarMap& bwd_ins, const std::string& op_type) { + std::shared_ptr> tmp_ins_ptr = nullptr; + for (const auto& pair : bwd_ins) { + for (size_t i = 0; i < pair.second.size(); ++i) { + auto& var = pair.second[i]; + if (var->HasVariableWrapperHook()) { + if (tmp_ins_ptr == nullptr) { + tmp_ins_ptr = std::make_shared>(bwd_ins); + } + VLOG(3) << "Call " << var->GetVariableWrapperHooks().size() + << " hooks of " << op_type << "'s input `" << pair.first + << "`'s var `" << var->Name() << "`."; + auto tmp_var = var; + for (const auto& hook_pair : var->GetVariableWrapperHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + (*tmp_ins_ptr)[pair.first][i] = tmp_var; + } + } + } + return tmp_ins_ptr; +} + void BasicEngine::Execute() { - if (init_node_ == nullptr) { + if (init_nodes_.empty()) { return; } PrepareDeps(); // Start execute Computation graph std::queue> q; - q.push(std::move(init_node_)); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(std::move(init_nodes_[i])); + } size_t op_num = 0; @@ -292,10 +334,15 @@ void BasicEngine::Execute() { auto& bwd_ins = cur_op.GetInsMap(); auto& bwd_outs = cur_op.GetOutsMap(); + /** + * [ Why need temporary outputs here? ] + * + * - construct the temp output map, avoid to disrupt graph + * - replace the element in the map by temp var, because a + * var may be coresponding to several grad var in one op + */ NameVarMap tmp_outs(bwd_outs); - // 1. construct the temp output map, avoid to disrupt graph - // 2. replace the element in the map by temp var, because a - // var may be coresponding to several grad var in one op + for (auto& pair : tmp_outs) { if (!pair.second.IsGrad()) { continue; @@ -408,10 +455,28 @@ void BasicEngine::Execute() { } } + /** + * [ Why need temporary inputs here? ] + * + * - Hook execution should not change original input tensor. + * User can register hook for Tensor's gradient, It is expected + * that the hook only affects the gradient of the backward + * propagation, and does not affect the gradient value input + * as the hook. + * - use `tmp_ins_ptr`, only copy bwd_ins when the var in bwd_ins + * hold hooks + */ + auto tmp_ins_ptr = CallGradientHooks(bwd_ins, cur_op.Type()); + { VLOG(3) << "Start to execute grad op " << cur_op.Type(); - OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), - cur_op.place()); + if (tmp_ins_ptr == nullptr) { + OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), + cur_op.place()); + } else { + OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs, cur_op.Attrs(), + cur_op.place()); + } } for (auto& pair : inplace_output_grad_var_list_) { @@ -428,15 +493,14 @@ void BasicEngine::Execute() { if (!accumulator->SumGradCompleted()) { continue; } - // 1. Call Hooks for **inner_var_** + // 1. Call Hooks for `inner_var_` + accumulator->CallGradientHooks(); - // 2. Sum Gradient with Previous Graph + // 2. Sum Gradient `inner_var_` to `var_` of Current or Previous Graph accumulator->AccumulateGrad(); - // 3. 
Call backward Hooks for **var_** - if (accumulator->HasPostHooks()) { - accumulator->CallBackwardPostHooks(); - } + // 3. Call backward Hooks for `var_` + accumulator->CallReduceHooks(); } need_accu_var_list_.clear(); @@ -470,7 +534,7 @@ void BasicEngine::Execute() { } void BasicEngine::Clear() { - init_node_.reset(); + init_nodes_.clear(); node_deps_.clear(); accumulators_.clear(); accumulators_with_grad_node_.clear(); diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index a2ad8b5f8aa61e..49761a8df0b6b1 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -30,7 +30,9 @@ class OpBase; class BasicEngine : public Engine { public: - void Init(VarBase* var, bool retain_graph = false); + void Init(const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph = false); void Execute() override; @@ -46,7 +48,7 @@ class BasicEngine : public Engine { void Clear(); private: - std::shared_ptr init_node_; + std::vector> init_nodes_; std::unordered_map node_deps_; // The input and output of Inplace op are the same. If only `var` is used // as the key, then the input and output of inplace op must be gradient diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 873068a0d310dc..16f9454e9376e4 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -19,12 +19,11 @@ #include #include +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/bkcl_helper.h" #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/gen_comm_id_helper.h" - -#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" #include "paddle/fluid/string/string_helper.h" @@ -77,7 +76,7 @@ void BKCLParallelContext::Init() { bkcl_ids.resize(strategy_.nrings_); if (strategy_.local_rank_ == 0) { - // generate the unique ncclid on the root worker + // generate the unique bkclid on the root worker for (size_t i = 0; i < bkcl_ids.size(); ++i) { auto ret = bkcl_get_unique_id(&bkcl_ids[i]); PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret, @@ -99,6 +98,28 @@ void BKCLParallelContext::Init() { } } +void BKCLParallelContext::InitWithRingID(int ring_id) { + std::vector bkcl_ids; + bkcl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique bkclid on the root worker + auto ret = bkcl_get_unique_id(&bkcl_ids[0]); + PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret, + platform::errors::PreconditionNotMet( + "BKCL get unique id failed [%d]", ret)); + } + BcastBKCLId(bkcl_ids, 0); + + int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; + VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id + << " ring id: " << ring_id; + // it will assign bkcl_comm in XPUDeviceContext within ring_id + platform::BKCLCommContext::Instance().CreateBKCLComm( + &bkcl_ids[0], strategy_.nranks_, strategy_.local_rank_, xpu_id, ring_id); +} + void BKCLParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { @@ -146,8 +167,6 @@ void BKCLParallelContext::WaitCompute(int ring_id) { platform::errors::OutOfRange("Ring id expected < nrings," "but got ring id = %d, nrings = %d", ring_id, strategy_.nrings_)); - 
// TODO(wangxi16): [Performance optimize] Maybe need to put Wait and - // bkcl_allreduce to comm thread, for bkcl_allreduce is blocking now. auto compute_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place_)); compute_dev_ctx->Wait(); @@ -167,6 +186,12 @@ void BKCLParallelContext::WaitComm(int ring_id) { comm_dev_ctx->Wait(); } +void BKCLParallelContext::SynchronizeCompute() { + auto compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + } // namespace imperative } // namespace paddle #endif diff --git a/paddle/fluid/imperative/bkcl_context.h b/paddle/fluid/imperative/bkcl_context.h index d7d917f20082ac..652b7689666c6c 100644 --- a/paddle/fluid/imperative/bkcl_context.h +++ b/paddle/fluid/imperative/bkcl_context.h @@ -36,6 +36,8 @@ class BKCLParallelContext : public ParallelContext { void Init() override; + void InitWithRingID(int ring_id) override; + void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) override; @@ -45,6 +47,8 @@ class BKCLParallelContext : public ParallelContext { void WaitCompute(int ring_id) override; void WaitComm(int ring_id) override; + + void SynchronizeCompute() override; }; } // namespace imperative diff --git a/paddle/fluid/imperative/dygraph_grad_maker.h b/paddle/fluid/imperative/dygraph_grad_maker.h index a3678404728275..7fefc9ccc67b52 100644 --- a/paddle/fluid/imperative/dygraph_grad_maker.h +++ b/paddle/fluid/imperative/dygraph_grad_maker.h @@ -279,6 +279,8 @@ class TracedGradOp { void SetType(const std::string& type) { op_->SetType(type); } + const framework::OperatorBase& InnerOp() const { return op_->InnerOp(); } + void SetAttrMap(const framework::AttributeMap& attrs) { return op_->SetAttrMap(attrs); } diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index b9df88b1f1eeaa..43546cf99c69ff 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -115,6 +115,23 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif +#ifdef PADDLE_WITH_ASCEND_CL + void operator()(const platform::NPUPlace& place) { + // TODO(zhiqiu): SUPPORT it + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } +#else + void operator()(const platform::NPUPlace& place) { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } +#endif + // there is NO blas in CUDAPinnedPlace void operator()(const platform::CUDAPinnedPlace& place) { PADDLE_THROW(platform::errors::PermissionDenied( @@ -384,8 +401,8 @@ static platform::Place GetPlaceOfVar( void GradientAccumulator::AccumulateGrad() { /** - * If the gradient has been calculated by previous graph, - * it should be added to the previous graph result. + * If the leaf gradient has been calculated done, the inner_var_ + * should be added to the var_. 
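 *
 * A rough sketch (illustrative variable names) of the order in which a leaf
 * gradient is processed in one backward pass, as driven by
 * BasicEngine::Execute:
 *
 *   accumulator->SumGrad(partial_grad, trace_id);  // sum into inner_var_
 *   if (accumulator->SumGradCompleted()) {
 *     accumulator->CallGradientHooks();  // user hooks run on inner_var_
 *     accumulator->AccumulateGrad();     // merge inner_var_ into var_
 *     accumulator->CallReduceHooks();    // e.g. DataParallel reduce hook
 *   }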
*/ if (!var_->IsLeafGrad() || !SumGradCompleted() || !HasInnerVar()) { return; @@ -396,7 +413,7 @@ void GradientAccumulator::AccumulateGrad() { "this auto-grad")); PADDLE_ENFORCE_EQ(inner_var_->Var().IsInitialized(), true, platform::errors::InvalidArgument( - "Interior var of Leaf tensor should be initialized.")); + "Interior var of Leaf tensor should be initialized.")); auto* src = inner_var_->MutableVar(); auto* dst = var_->MutableVar(); if (!var_->IsEmpty()) { @@ -427,10 +444,65 @@ void GradientAccumulator::AccumulateGrad() { *(dst) = std::move(*src); var_->SetType(inner_var_->Type()); var_->SetDataType(inner_var_->DataType()); + var_->SetIsEmpty(false); } inner_var_.reset(); } +void GradientAccumulator::CallGradientHooks() { + PADDLE_ENFORCE_EQ(var_->IsLeafGrad(), true, + platform::errors::Unavailable( + "Only leaf gradient Tensor can deal with by gradient " + "hook in gradient accumulator.")); + PADDLE_ENFORCE_EQ( + SumGradCompleted(), true, + platform::errors::PreconditionNotMet( + "Only can call gradient hooks after sum gradient completed.")); + PADDLE_ENFORCE_EQ( + HasInnerVar(), true, + platform::errors::PreconditionNotMet( + "Leaf Tensor's inner var is nullptr when call gradient hook.")); + PADDLE_ENFORCE_EQ( + inner_var_->Var().IsInitialized(), true, + platform::errors::PreconditionNotMet("Leaf Tensor's inner var " + "is not initialized when " + "call gradient hook.")); + if (var_->HasVariableWrapperHook()) { + VLOG(3) << "Call " << var_->GetVariableWrapperHooks().size() + << " hooks of leaf gradient accumulator's inner var `" + << var_->Name() << "`."; + auto tmp_var = inner_var_; + VLOG(3) << "Input var " << var_->Name() << "'s hook size - " + << var_->GetVariableWrapperHooks().size(); + for (const auto& hook_pair : var_->GetVariableWrapperHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + inner_var_ = tmp_var; + } +} + +void GradientAccumulator::CallReduceHooks() { + PADDLE_ENFORCE_EQ( + var_->IsLeafGrad(), true, + platform::errors::Unavailable("Only leaf gradient Tensor can deal with " + "by reduce hook in gradient accumulator.")); + PADDLE_ENFORCE_EQ(SumGradCompleted(), true, + platform::errors::PreconditionNotMet( + "Only can call reduce hooks after the gradient " + "summation is completed in current batch.")); + PADDLE_ENFORCE_EQ(HasInnerVar(), false, + platform::errors::PreconditionNotMet( + "Only can call reduce hooks after the " + "gradient accumulation is completed in " + "current batch or across batchs.")); + if (var_->HasVoidHook()) { + for (const auto& hook : var_->GetVoidHooks()) { + VLOG(3) << "call gradient accumulator backward hooks."; + (*hook)(); + } + } +} + void EagerGradientAccumulator::SumGrad(std::shared_ptr var, size_t trace_id, bool unchange_input) { /** diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index e2dabc06a7dae6..6411dce4405c11 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -40,8 +40,8 @@ class GradientAccumulator { } // inner_var_ record the grad of this auto-grad. - // Only need to generate inner var for non-empty leaf-tensor. - if (var->IsLeafGrad() && !var->IsEmpty()) { + // Only need to generate inner var for leaf-tensor. 
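    // Interior (non-leaf) gradients do not need inner_var_: their hooks are
    // applied to the grad op inputs by the engine (see CallGradientHooks in
    // basic_engine.cc) and their value is consumed by the next grad op
    // directly. A leaf gradient is first summed into inner_var_ so that
    // same-batch accumulation and VariableWrapperHooks can run before the
    // result is merged into var_.
    //
    // A minimal usage sketch, assuming a leaf VarBase `x` and a hook functor
    // `DoubleHook` (both illustrative; see imperative/tests/test_hooks.cc):
    //
    //   x->GradVarBase()->AddVariableWrapperHook(
    //       std::make_shared<CppVariableWrapperHook>(DoubleHook));
    //   x->GradVarBase()->AddVoidHook(
    //       std::make_shared<std::function<void()>>([] { /* reduce-style work */ }));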
+ if (var->IsLeafGrad()) { inner_var_ = std::make_shared(var->Name()); inner_var_->SetType(var->Type()); inner_var_->SetDataType(var->DataType()); @@ -52,9 +52,6 @@ class GradientAccumulator { << ") to store result of this Graph"; } - // TODO(zhouwei): fix Tensor.clear_gradient() bug, remove this hard flag - var->SetIsEmpty(false); - // var_ is the final grad, processed by hooks and grad accumulation var_ = var; } @@ -93,42 +90,38 @@ class GradientAccumulator { inline bool HasInnerVar() const { return inner_var_ != nullptr; } - /* Hook related methods */ - inline bool HasPostHooks() const { return !post_hooks_.expired(); } - - void SetPostHooks(const std::shared_ptr& hooks) { - PADDLE_ENFORCE_NOT_NULL( - hooks, platform::errors::InvalidArgument( - "The hook set to GradientAccumulator is nullptr.")); - - auto shared_hooks = post_hooks_.lock(); - if (shared_hooks != hooks) { - PADDLE_ENFORCE_EQ( - shared_hooks, nullptr, - platform::errors::PermissionDenied( - "Cannot set post hooks twice to GradientAccumulator.")); - post_hooks_ = hooks; - } - } - // void CallHooks(){} - // ** inner_var_ ** - // function that Sum Gradient with Previous Graph void AccumulateGrad(); - // call backward post hooks, such as reduce hook - void CallBackwardPostHooks() { - PADDLE_ENFORCE_NE( - post_hooks_.expired(), true, - platform::errors::NotFound( - "The post hooks of GradientAccumulator for Tensor `%s` expired.", - var_->Name())); - auto shared_hooks = post_hooks_.lock(); - for (const auto& hook : shared_hooks->backward_hooks()) { - VLOG(3) << "call gradient accumulator backward hooks."; - (*hook)(var_); - } - } + /** [ Hook related methods ] + * + * [Why need two types of VariableWrapperHook? ] + * + * There are two types of gradient accumulation: + * 1. Gradient accumulation in same batch + * 2. Gradient accumulation across batchs + * The order of execution between Hooks and gradient accumulation: + + * [ Gradient accumulation in same batch] + * | + * [ leaf GradVarBase hooks ] + * | + * [ Gradient accumulation across batchs ] + * | + * [ Gradient reduce / allreduce hooks ] + + * Because we currently intend to accumulate these two gradient + * accumulation in one GradientAccumulator, We must distinguish between + * two types of hooks. + + * And the InplaceVariableWrapperHook does not allow users to register + * directly, and is currently only used to support the reduce strategy of + * parallel multi-card training. + */ + + void CallGradientHooks(); + + void CallReduceHooks(); protected: VariableWrapper* var_; @@ -137,7 +130,6 @@ class GradientAccumulator { std::shared_ptr inner_var_; size_t ref_cnt_{0}; size_t cur_cnt_{0}; - std::weak_ptr post_hooks_; }; class EagerGradientAccumulator : public GradientAccumulator { diff --git a/paddle/fluid/imperative/hooks.h b/paddle/fluid/imperative/hooks.h index 1211ec6ae6c7bd..fa929b7c7a51c7 100644 --- a/paddle/fluid/imperative/hooks.h +++ b/paddle/fluid/imperative/hooks.h @@ -18,215 +18,63 @@ #include #include #include - -#include "paddle/fluid/imperative/type_defs.h" -#include "paddle/fluid/platform/macros.h" - namespace paddle { namespace imperative { class VariableWrapper; -/** [ Basic hook classes ] - * s - * @brief OpBasePreHook is executed before the grad OpBase is executed, - * taking the input of the current grad OpBase as input, and - * executing python hooks (user-defined) or C++ hooks (developer-defined) - * to achieve the purpose of custom operations on the interior VarBase - * gradient. 
+/** [ VariableWrapper Hook ] * - * @note OpBasePreHook will not change the input gradient VarBase. + * @brief This hook functor is executed before the grad OpBase is executed or + * after gradient accumulation completed in current batch. + * 1. For interior var, VariableWrapper Hook take the input of the + * current grad OpBase as input. + * 2. For leaf var, VariableWrapper Hook take the inner_var_ of + * GradientAccumulator as input. * - * @note [Why need to be OpBase `PreHook`, why not `PostHook`?] + * @note This hook functor will not change the input gradient VariableWrapper, + * but if you copy the input VariableWrapper and change the value of + * Variable in VariableWrapper, the value of input will also be changed, + * because they shared same PlaceHolder. * - * If set OpBase post hook, when the op executed end, the op's output - * gradient may not be the final state, because it may need other op's - * gradient output to accumulated to it. But before op can be executed, - * the gradient output must have been accumulated to final value. + * @note [ Why need to be OpBase `PreHook`, why not `PostHook`? ] * - * @note [Why only can be used for interior VarBase?] + * We expect If set OpBase post hook, when the op executed end, the + * op's output gradient may not be the final state, because it may need + * other op's gradient output to accumulated to it. But before op can + * be executed, the gradient output must have been accumulated to final + * value. + * + * @note [ Why Leaf gradient is special? ] * * Because the leaf VarBase's GradVarBase has no GradOpNode, so leaf * GradVarBase has no next OpBase to executed, so if need to deal with - * the leaf GradVarBase, cannot use OpBasePreHook. For this case, we - * deal with by GradAccumulatorPostHook. + * the leaf GradVarBase, we should call hooks after gradient accumulation + * completed. */ -class OpBasePreHook { +class VariableWrapperHook { public: - virtual ~OpBasePreHook() = default; - virtual VariableWrapperList operator()( - const VariableWrapperList& grad_inputs) = 0; + virtual ~VariableWrapperHook() = default; + virtual std::shared_ptr operator()( + const std::shared_ptr& var) = 0; }; -/** - * @brief GradAccumulatorPostHook is the Hook that operates on the current - * gradientafter the GradientAccumulator has accumulated the gradient. - * Leaf GradVarBase has no next OpBase, if we want to register hook - * for it, we also need to wait until the leaf GradVarBase accumulation - * is completed, so we can add post hook to GradientAccumulator. - * - * @note GradAccumulatorPostHook will change the grad VarBase value. - * - * @note Only allow leaf VarBase hold GradientAccumulatorPostHook. - */ -class GradAccumulatorPostHook { - public: - virtual ~GradAccumulatorPostHook() = default; - virtual void operator()(VariableWrapper* var) = 0; -}; - -/** [ Hook for cpp functions ] - * - * Here we design three C++ hooks; - * 1. CppOpBasePreHook (Implement later): - * - used for developer-defined C++ interior VarBase hooks - * 2. CppGradAccumulatorPostHook (Implement later): - * - used for developer-defined C++ leaf VarBase hooks - * 3. LambdaGradAccumulatorPostHook: - * - used for VarBase reduce in parallel training - * - * @note [Why need two types of GradAccumulatorPostHook? ] - * - * There are two types of gradient accumulation: - * 1. Gradient accumulation in same batch - * 2. 
Gradient accumulation across batchs - * The order of execution between Hooks and gradient accumulation: - * - * [ Gradient accumulation in same batch] - * | - * [ leaf GradVarBase hooks ] - * | - * [ Gradient accumulation across batchs ] - * | - * [ Gradient reduce / allreduce] - * - * Because we currently intend to accumulate these two gradient - * accumulation in one GradientAccumulator, We must distinguish between - * two types of hooks. - * - * And the LambdaGradAccumulatorPostHook does not allow users to register - * directly, and is currently only used to support the reduce strategy of - * parallel multi-card training. - */ -class LambdaGradAccumulatorPostHook : public GradAccumulatorPostHook { +class CppVariableWrapperHook : public VariableWrapperHook { public: - explicit LambdaGradAccumulatorPostHook( - std::function fn) + explicit CppVariableWrapperHook( + std::function( + const std::shared_ptr&)>&& fn) : fn_(std::move(fn)) {} - void operator()(VariableWrapper* var) override { fn_(var); } - - private: - std::function fn_; -}; - -/* Hooks for python function: in pybind/imperative.cc */ - -/** Add Python Hooks later: - * - PyOpBasePreHook (Implement later): used for user-defined interior python - * VarBase hooks - * - PyGradAccumulatorPostHook (Implement later): used for user-defined leaf - * python VarBase hooks - */ - -/** [ Hook Pipeline classes ] - * - * @note [Why need hook pipeline classes?] - * - * There are 2 purposes for adding Hook pipeline here: - * - * 1. Make the code implementation cleaner. - * - * If there are no Hook pipeline, we need to add 3 hook vector into - * VariableWrapper, 1 hook vector into OpBase, 2 hook vector into - * GradientAccumulator, like: - * - * - VariableWrapper: - * std::vector> - * interior_var_hooks_; - * std::vector> - * leaf_var_hooks_; - * std::vector> - * backward_hooks_; - * - * - OpBase: - * std::vector> - * interior_var_hooks_; - * - * - GradientAccumulator: - * std::vector> - * leaf_var_hooks_; - * std::vector> - * backward_hooks_; - * - * This seems more complicated, and std::vector> - * is not easy to destruct. - * - * 2. Make the code easier to understand. - * - * From these two packages, we can clearly understand that we - * have two types of Hooks, respectively for the interior - * gradient var and leaf gradient var inside the backward - * calculation graph. - */ - -class InteriorVarHookPipeline { - public: - InteriorVarHookPipeline() = default; - - void add_hook(std::unique_ptr&& hook) { - hooks_.emplace_back(std::move(hook)); + std::shared_ptr operator()( + const std::shared_ptr& var) override { + return fn_(var); } - const std::vector>& hooks() const { - return hooks_; - } - - std::vector>& hooks() { return hooks_; } - private: - std::vector> hooks_; - - DISABLE_COPY_AND_ASSIGN(InteriorVarHookPipeline); -}; - -class LeafVarHookPipeline { - public: - LeafVarHookPipeline() = default; - - void add_hook(std::unique_ptr&& hook) { - hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& hooks() const { - return hooks_; - } - - std::vector>& hooks() { - return hooks_; - } - - void add_backward_hook(std::unique_ptr&& hook) { - backward_hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& backward_hooks() - const { - return backward_hooks_; - } - - std::vector>& backward_hooks() { - return backward_hooks_; - } - - private: - std::vector> hooks_; - // NOTE: the `backward` here means the `whole backward process`, - // the `backward_hooks_` need to be executed after the `whole backward - // process`. 
- std::vector> backward_hooks_; - - DISABLE_COPY_AND_ASSIGN(LeafVarHookPipeline); + std::function( + const std::shared_ptr&)> + fn_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 062f04c6b7052f..70359dc3fd25bf 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -406,7 +406,7 @@ void OpBase::Run(const framework::OperatorBase& op, OpBaseRunImpl(op, ins, outs, attrs, place); } -static void ClearNoNeedBufferInputs(OpBase* op) { +void ClearNoNeedBufferInputs(OpBase* op) { auto& inferer = op->Info().NoNeedBufferVarsInferer(); if (!inferer) return; auto* ins = op->GetMutableInsMap(); diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index ff5a780a5f9dbf..bbede47e364298 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -30,6 +30,7 @@ #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/flags.h" +#include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/saved_variable_wrapper_list.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/variable_wrapper.h" @@ -107,6 +108,10 @@ class VarBase { void ClearGradVarBase() { grad_var_ = nullptr; } + void SetGradVarBase(VarBase& grad_var) { + MutableGradVarBase()->CopyFrom(grad_var, true); + } + const std::shared_ptr& MutableGradVarBase() { if (grad_var_ == nullptr) { if (auto grad_var_wrapper = var_->GetGradVar()) { @@ -220,6 +225,28 @@ class VarBase { void BumpInplaceVersion(); + /* Hook related method: now only used for GradVarBase */ + bool HasVariableWrapperHook() const { return var_->HasVariableWrapperHook(); } + + int64_t AddVariableWrapperHook(std::shared_ptr&& hook) { + return var_->AddVariableWrapperHook( + std::forward>(hook)); + } + + bool RemoveVariableWrapperHook(const int64_t& hook_id) { + return var_->RemoveVariableWrapperHook(hook_id); + } + + const std::map>& + GetVariableWrapperHooks() const { + return var_->GetVariableWrapperHooks(); + } + + void AddVoidHook(std::shared_ptr>&& hook) { + var_->AddVoidHook( + std::forward>>(hook)); + } + private: /** * NOTE(zengjinle): never remove the const qualifier of `var_` if you are @@ -259,5 +286,7 @@ std::shared_ptr CreateGradOpNode( const platform::Place& place, const std::map& inplace_map); +void ClearNoNeedBufferInputs(OpBase* op); + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index eb0135d15e0743..b91fc460781c79 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -79,6 +79,30 @@ void NCCLParallelContext::Init() { } } +void NCCLParallelContext::InitWithRingID(int ring_id) { + std::vector nccl_ids; + nccl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique ncclid on the root worker + platform::dynload::ncclGetUniqueId(&nccl_ids[0]); + } + BcastNCCLId(nccl_ids, 0); + + int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; + VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id + << " ring id: " << ring_id; + // it will assign nccl_comm in CUDADeviceContext within ring_id + platform::NCCLCommContext::Instance().CreateNCCLComm( + &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id); + + 
compute_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( + BOOST_GET_CONST(platform::CUDAPlace, place_).device)); + comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( + BOOST_GET_CONST(platform::CUDAPlace, place_).device)); +} + void NCCLParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { @@ -149,6 +173,12 @@ void NCCLParallelContext::WaitComm(int ring_id) { #endif } +void NCCLParallelContext::SynchronizeCompute() { + auto *compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + #endif } // namespace imperative diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index 51e5743aebdc3d..bcaeb811b108c5 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -53,6 +53,8 @@ class NCCLParallelContext : public ParallelContext { void Init() override; + void InitWithRingID(int ring_id) override; + void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) override; @@ -63,6 +65,8 @@ class NCCLParallelContext : public ParallelContext { void WaitComm(int ring_id) override; + void SynchronizeCompute() override; + private: // used for comm wait compute, compute_stream-->event-->comm_stream[ring_id] std::vector> compute_events_; diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index 2b7642ae7cfd92..0164ff9313cdfe 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -177,8 +177,6 @@ class OpBase { std::unique_ptr op_; platform::Place place_; size_t id_{-1UL}; - - std::weak_ptr pre_hooks_; }; class GradOpNode { diff --git a/paddle/fluid/imperative/parallel_context.h b/paddle/fluid/imperative/parallel_context.h index ef0a9604092151..f537a316014d60 100644 --- a/paddle/fluid/imperative/parallel_context.h +++ b/paddle/fluid/imperative/parallel_context.h @@ -50,6 +50,8 @@ class ParallelContext { virtual void Init() = 0; + virtual void InitWithRingID(int ring_id) = 0; + virtual void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) = 0; @@ -64,6 +66,9 @@ class ParallelContext { // if CPU, should do nothing. virtual void WaitComm(int ring_id) = 0; + // synchorize compute stream + virtual void SynchronizeCompute() = 0; + inline int GetNRings() const { return strategy_.nrings_; } inline int64_t GetNRanks() const { return strategy_.nranks_; } diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 8dd8cafc835ab1..3da3a05ed1071c 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -369,6 +369,10 @@ class GradientAccumulationInfo { *is_finished = (cur_ref_cnt_ == total_ref_cnt_); accumulator_->SumGrad(grad_var_partial, trace_id, unchange_input); + if (*is_finished && accumulator_->HasInnerVar()) { + accumulator_->AccumulateGrad(); + } + if (create_graph_) { VLOG(10) << "Store partial grad grad for double grad " << mapped_grad_var_->Name(); diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h new file mode 100644 index 00000000000000..bd132f2576fec1 --- /dev/null +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -0,0 +1,172 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/tracer.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/operators/py_layer_op.h" + +namespace paddle { +namespace imperative { + +namespace py = ::pybind11; + +bool RequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs) { + for (const auto& name_pair : ins) { + for (const auto& var_base : name_pair.second) { + if (!var_base->OverridedStopGradient()) { + PassStopGradient(outs, var_base->OverridedStopGradient()); + return true; + } + } + } + return false; +} + +std::shared_ptr CreateGradOpNode( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, const framework::AttributeMap& attrs, + const platform::Place& place, + const std::map& inplace_map, + const std::shared_ptr& py_context) { + operators::PyLayerGradOpMaker maker( + type, ins, outs, attrs, inplace_map); + + maker.SetPyLayerContext(py_context); + auto grad_node = maker(); + if (grad_node && !grad_node->empty()) { + for (auto& grad_op : *grad_node) { + grad_op.SetId(OpBase::GenerateUniqueId()); + grad_op.SetPlace(place); + ClearNoNeedBufferInputs(&grad_op); + } + return grad_node; + } else { + return nullptr; + } +} + +py::object PyLayerApply(const platform::Place& place, const py::object& cls, + const py::args args, const py::kwargs kwargs) { + auto bk_function = cls.attr("_backward_function"); + auto context = bk_function(); + auto forward = cls.attr("forward"); + + auto result_forward = forward(context, *args, **kwargs); + std::shared_ptr py_layer_ctx = + std::make_shared(context.release().ptr()); + // make inputs to varbase + std::vector> input_vars; + // process args,`input_vars` only collect `imperative::VarBase` + if (!args.empty()) { + for (auto ptr = args.begin(); ptr != args.end(); ptr++) { + try { + if (Py_None != ptr->ptr()) { + auto a = ptr->cast>(); + input_vars.push_back(a); + } + } catch (py::cast_error& err) { + // Only collect Tensor type in 'args' and pass them to backward. Ignore + // other types of input temporarily. + } + } + } + // process kwargs, only collect `imperative::VarBase` + if (!kwargs.empty()) { + for (auto ptr = kwargs.begin(); ptr != kwargs.end(); ptr++) { + try { + if (Py_None != ptr->second.ptr()) { + auto a = ptr->second.cast>(); + input_vars.push_back(a); + } + } catch (py::cast_error&) { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. 
+ } + } + } + NameVarBaseMap ins = {{"X", input_vars}}; + + std::vector> output_vars; + if (PyTuple_Check(result_forward.ptr()) || + PyList_Check(result_forward.ptr())) { + auto tuple_result = result_forward.cast(); + for (size_t i = 0; i < tuple_result.size(); i++) { + if (Py_None != tuple_result[i].ptr()) { + try { + auto temp_out = + tuple_result[i].cast>(); + output_vars.push_back(temp_out); + } catch (py::cast_error&) { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` should be `Tensor`.")); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` can not be `None`.")); + } + } + } else { + if (Py_None != result_forward.ptr()) { + try { + auto temp_out = + result_forward.cast>(); + output_vars.push_back(temp_out); + } catch (py::cast_error&) { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` should be `Tensor`.")); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "The output of `PyLayer.forward` can not be `None`.")); + } + } + + NameVarBaseMap outs = {{"Out", output_vars}}; + + if (RequiredGrad(ins, outs)) { + std::map inplace_map{}; + bool if_inplace = false; + for (auto temp_ins : input_vars) { + if (if_inplace) { + break; + } + for (auto temp_outs : output_vars) { + if (temp_ins->Name() == temp_outs->Name()) { + if_inplace = true; + break; + } + } + } + if (if_inplace) { + inplace_map["X"] = "Out"; + } + + CreateGradOpNode("py_layer", ins, outs, {{}}, place, inplace_map, + py_layer_ctx); + } else { + VLOG(3) << "No Grad to track for Op: py_layer_op"; + } + + return result_forward; +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index e8b531d35cabfc..a92704ce447dc1 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -310,13 +310,16 @@ Reducer::Reducer(const std::vector> &vars, for (size_t global_var_index = 0; global_var_index < vars_.size(); ++global_var_index) { auto var = vars_[global_var_index]; - var->SharedVar()->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper *grad) { - this->AddDistHook(global_var_index); - }))); + var->GradVarBase()->AddVoidHook(std::make_shared>( + [=]() { this->AddDistHook(global_var_index); })); var_index_map_[var->GradVarBase()->SharedVar().get()] = global_var_index; } + + // for checking var is ready once + vars_marked_ready_.resize(vars_.size(), false); + + // Initialize local used vars + local_used_vars_.resize(vars_.size(), 0); } void Reducer::InitializeDenseGroups( @@ -325,7 +328,7 @@ void Reducer::InitializeDenseGroups( for (size_t index = 0; index < variable_indices_.size(); ++index) { const auto variable_index = variable_indices_[index]; const auto &var = vars_[variable_index]; - const auto var_name = var->Name(); + const auto &var_name = var->Name(); PADDLE_ENFORCE_EQ(is_sparse_gradient_[variable_index], false, platform::errors::PreconditionNotMet( "Tensor %s's GRAD must be LoDTensor, but received " @@ -336,7 +339,7 @@ void Reducer::InitializeDenseGroups( PADDLE_ENFORCE_EQ(lod_tensor->IsInitialized(), true, platform::errors::PreconditionNotMet( "Tensor %s is not initialized.", var_name)); - auto size = lod_tensor->numel(); + const auto size = lod_tensor->numel(); PADDLE_ENFORCE_GT( size, 0, platform::errors::PreconditionNotMet( "The number of tensor %s's elements is 0.", var_name)); @@ -348,8 +351,8 @@ void 
Reducer::InitializeDenseGroups( p_group->dense_tensors_.push_back(framework::Tensor()); // check the dtype and place, it must be same. - auto dtype = var->DataType(); - auto place = var->Place(); + const auto &dtype = var->DataType(); + const auto &place = var->Place(); if (index > 0) { PADDLE_ENFORCE_EQ( dtype, p_group->dtype_, @@ -419,8 +422,7 @@ void Reducer::InitializeGroups( group.variable_indices_ = std::move(variable_indices_); groups_.emplace_back(std::move(group)); // Debug Message For Reducer - VLOG(3) << "The Group[" << group_index << "]:"; - VLOG(3) << groups_.back(); + VLOG(3) << "The Group[" << group_index << "]:" << groups_.back(); } } @@ -463,34 +465,38 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { // and allreudce sequence counter(next_group_) will be cleaned up again. void Reducer::PrepareForBackward( const std::vector> &outputs) { - VLOG(3) << "start reseting count.."; + VLOG(3) << "after forward, then reset count for backward."; next_group_ = 0; std::for_each(groups_.begin(), groups_.end(), [](Group &group) { group.pending_ = group.variable_indices_.size(); group.sparse_contents_ = nullptr; }); + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(vars_.size(), false); + PADDLE_ENFORCE_EQ( - all_group_ready_, false, + groups_need_finalize_, false, platform::errors::PreconditionNotMet( - "Please note that all forward outputs derived from the module " + "A serious error has occurred here. There may be several reasons: " + "1) Please note that all forward outputs derived from the module " "parameters must participate in the calculation of losses and " "subsequent gradient calculations. If not, the wrapper will hang, " "waiting for autograd to generate gradients for these parameters. " "you can use detach or stop_gradient to make the unused parameters " - "detached from the autograd graph.")); + "detached from the autograd graph. " + "2) Used multiple forwards and one backward. You may be able to wrap " + "multiple forwards in a model.")); // The first var to trigger the unused parameter has_marked_unused_vars_ = false; + unused_vars_.clear(); + if (!find_unused_vars_) { return; } - // TODO(shenliang03) "find_unused_vars" interface will be exposed in the - // future to handle control flow to process unused parameters - find_unused_vars_ = false; - - unused_vars_.clear(); node_deps_.clear(); std::queue> q; std::unordered_set var_visited; @@ -553,6 +559,23 @@ void Reducer::PrepareForBackward( << "] is not used"; } } + + if (unused_vars_.empty()) { + LOG_FIRST_N(WARNING, 1) + << "All parameters are involved in the backward pass. " + "It is recommended to set find_unused_parameters to False " + "to improve performance. However, if unused parameters " + "appear in subsequent iterative training, then an error " + "will occur. Please make it clear that in the subsequent " + "training, there will be no parameters that are not used " + "in the backward pass, and then set find_unused_parameters"; + } else if (unused_vars_.size() == vars_.size()) { + LOG_FIRST_N(WARNING, 1) + << "There is no parameter in the device involved " + "in the backward calculation. If there are " + "parameters on other devices involved in the " + "backward, then a serious error will occur here."; + } } // Add hook function to each leaf node. When the gradient of a leaf node is @@ -565,67 +588,133 @@ void Reducer::PrepareForBackward( // concat + allreduce + split is emitted in turn according to next_group_. 
// 3, FinalizeBackward: after the end, synchronize each stream. void Reducer::AddDistHook(size_t var_index) { + PADDLE_ENFORCE_LT(var_index, variable_locators_.size(), + platform::errors::OutOfRange( + "Out of bounds variable index. it must be less" + "than %d, but it is %d", + variable_locators_.size(), var_index)); + VLOG(3) << "Var[" << var_index << "] [" << vars_[var_index]->GradVarBase()->Name() << "] arrived and triggered disthook"; - if (!has_marked_unused_vars_) { - has_marked_unused_vars_ = true; - for (auto unused_index : unused_vars_) { - if (NeedRebuildGroup()) { - rebuild_vars_.push_back(vars_[unused_index]); - rebuild_var_indices_.push_back(unused_index); - } - MarkVarReady(unused_index, false); - } - } + local_used_vars_[var_index] = 1; + + // rebuild group when find_unused_vars_ is false if (NeedRebuildGroup()) { rebuild_vars_.push_back(vars_[var_index]); rebuild_var_indices_.push_back(var_index); } + + if (!has_marked_unused_vars_ && find_unused_vars_) { + has_marked_unused_vars_ = true; + for (const auto &unused_index : unused_vars_) { + MarkVarReady(unused_index, false); + } + } + MarkVarReady(var_index, true); } void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { - all_group_ready_ = true; + groups_need_finalize_ = true; + const auto &var_locator = variable_locators_[var_index]; - auto group_index = var_locator.group_index; + const auto group_index = var_locator.group_index; auto &group = groups_[group_index]; + // error happened, if the var is ready before. + if (vars_marked_ready_[var_index]) { + auto error_info = string::Sprintf( + "Error happened, when parameter[%d][%s] has been ready before. " + "There may be several reasons for this error: " + "1) In multiple reentrant backward phase, some parameters are reused." + "2) Using model parameters outside of forward function. Please " + "make sure that model parameters are not shared in concurrent " + "forward-backward passes.", + var_index, vars_[var_index]->GradVarBase()->Name()); + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, false, + platform::errors::PreconditionNotMet(error_info)); + + error_info += + "3) Unused parameters retrieval is incorrect. " + "The return value of forward will be used to retrieve" + " the unused parameters of the entire model. These " + "gradients of unused parameters will not be synchronized " + "between multiple cards. However, if the unused " + "parameters participate in the backward calculation " + "again at a later time (e.g. 
after the forward function, " + "the loss calculation uses the unused " + "paramters of the forward and trigger backward), " + "its gradient will be wrong."; + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, true, + platform::errors::PreconditionNotMet(error_info)); + } else { + vars_marked_ready_[var_index] = true; + } + if (!group.is_sparse_) { // process dense group - auto inside_group_index = var_locator.inside_group_index; - auto length = group.length_[inside_group_index]; + const auto inside_group_index = var_locator.inside_group_index; + const auto length = group.length_[inside_group_index]; auto &group_tensor = group.dense_tensors_[inside_group_index]; + if (is_used_var) { - auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar(); - auto tensor = - var_warpper->MutableVar()->GetMutable(); + auto var_base = vars_[var_index]->GradVarBase(); + auto tensor = var_base->MutableVar()->GetMutable(); group_tensor.ShareDataWith(*tensor).Resize( {static_cast(length)}); } else { + // TODO(shenliang03): maybe save the memory + // by avoiding tensor construction if (!group_tensor.IsInitialized()) { group_tensor.Resize({static_cast(length)}); group_tensor.mutable_data(place_, group.dtype_); + } + #ifdef PADDLE_WITH_XPU_BKCL - if (platform::is_xpu_place(group_tensor.place())) { - // TODO(liuyuhui) support XPU set constant - VLOG(3) << "XPU doesn't support set_constant"; - } + if (platform::is_xpu_place(group_tensor.place())) { + // TODO(liuyuhui) support XPU set constant + VLOG(3) << "XPU doesn't support set_constant"; + } #else - auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + if (HasGrad(var_index)) { + auto var_base = vars_[var_index]->GradVarBase(); + auto tensor = + var_base->MutableVar()->GetMutable(); + TensorCopy(*tensor, place_, *dev_ctx, &group_tensor); + group_tensor.Resize({static_cast(length)}); + } else { + group_tensor.Resize({static_cast(length)}); operators::math::set_constant(*dev_ctx, &group_tensor, 0.0); -#endif } +#endif } } else { // process sparse group - if (is_used_var) { - auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar(); - group.sparse_contents_ = var_warpper->MutableVar(); - } else { - group.sparse_contents_ = nullptr; - } + PADDLE_ENFORCE_EQ(HasGrad(var_index), true, + platform::errors::PreconditionNotMet( + "The sparse parameter[%d][%s] must have a gradient", + var_index, vars_[var_index]->Name())); + auto var_base = vars_[var_index]->GradVarBase(); + // need to check tensor type + PADDLE_ENFORCE_EQ( + var_base->Var().IsType(), true, + platform::errors::PreconditionNotMet( + "The sparse parameter[%d][%s] must have a selectedrows gradient. " + "Before forward pass, the parameter type is inferred to be " + "SelectedRows, but after backward pass, its actual type becomes " + "LodTensor. It is currently not supported by DataParallel. " + "For example, if sparse embedding is used, and the weight of " + "embedding is shared with subsequent dense parameters, then " + "the parameter gradient of the embedding will be converted " + "to dense parameters.", + var_index, vars_[var_index]->Name())); + + group.sparse_contents_ = var_base->MutableVar(); } if (--group.pending_ == 0) { @@ -641,6 +730,14 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { // TODO(liuyuhui): If BKCL support non-blocking communication, it should be // fixed as same as multi gpus card trainging. 
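// A rough sketch of one backward pass through the Reducer (all names below
// are members and methods of this file; error handling omitted):
//
//   PrepareForBackward(outputs);   // reset next_group_, pending_, vars_marked_ready_
//   // autograd runs; every ready leaf gradient fires AddDistHook(var_index)
//   MarkVarReady(var_index, is_used_var);   // share/copy grad into group.dense_tensors_
//   if (--group.pending_ == 0) MarkGroupReady(group_index);
//   FusedAllReduceSchedule(run_order, group, curr_group_index);
//   //   concat -> DivNRanks -> AllReduceByStream -> split
//   FinalizeBackward();   // WaitComm on all rings, then ProcessUnusedDenseVars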
void Reducer::MarkGroupReady(size_t group_index) { + PADDLE_ENFORCE_GE( + group_index, next_group_, + platform::errors::PreconditionNotMet( + "The index of the incoming group must be greater " + "than or equal to the previously synchronized group index, " + "expect it to greater than or equal to %d, but got %d.", + next_group_, group_index)); + if (group_index > next_group_) { VLOG(3) << "It will adjust the order of group in next batch automatically"; return; @@ -649,7 +746,7 @@ void Reducer::MarkGroupReady(size_t group_index) { for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0; ++next_group_) { auto &group = groups_[next_group_]; - int run_order = next_group_ % nrings_; + const int run_order = next_group_ % nrings_; // For CUDA or XPU, compute_stream --> comm_stream. // For CPU, do nothing. @@ -668,7 +765,7 @@ void Reducer::MarkGroupReady(size_t group_index) { comm_pool_->enqueue([&] { auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; platform::SetXPUDeviceId(dev_id); - FusedAllReduceSchedule(run_order, group); + FusedAllReduceSchedule(run_order, group, next_group_); { std::lock_guard lock(mutex_); comm_op_count_ -= 1; // lock @@ -676,7 +773,7 @@ void Reducer::MarkGroupReady(size_t group_index) { } }); #elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) - FusedAllReduceSchedule(run_order, group); + FusedAllReduceSchedule(run_order, group, next_group_); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Not compiled with BKCL or NCCL.")); @@ -684,24 +781,23 @@ void Reducer::MarkGroupReady(size_t group_index) { } } -void Reducer::FusedAllReduceSchedule(int run_order, Group &group) { +void Reducer::FusedAllReduceSchedule(const int run_order, Group &group, + const int curr_group_index) { + // The overall timeline: concat > div_nranks > allreduce > split + // dev_context is used to select different stream + const auto &dev_context = *parallel_ctx_->GetDeviceContext(run_order); if (group.is_sparse_) { - if (group.sparse_contents_ != nullptr) { - VLOG(3) << "sparse group [" << next_group_ << "] start allreduce in ring[" - << run_order << "]"; - group.DivNRanks(*parallel_ctx_->GetDeviceContext(run_order), nranks_); - parallel_ctx_->AllReduceByStream( - *group.sparse_contents_, group.sparse_contents_, run_order, false); - } else { - VLOG(3) << "The sparse group[" << next_group_ - << "] has no var to allreduce"; - } + VLOG(3) << "sparse group [" << curr_group_index + << "] start allreduce in ring[" << run_order << "]"; + group.DivNRanks(dev_context, nranks_); + parallel_ctx_->AllReduceByStream(*group.sparse_contents_, + group.sparse_contents_, run_order, false); } else { - VLOG(3) << "dense group [" << next_group_ << "] start allreduce in ring[" - << run_order << "]"; + VLOG(3) << "dense group [" << curr_group_index + << "] start allreduce in ring[" << run_order << "]"; // Select common commstream to concat tensors // group.dense_tensors ---> group.dense_contents_ - group.ConcatTensors(*parallel_ctx_->GetDeviceContext(run_order)); + group.ConcatTensors(dev_context); // NOTE(liuyuhui): ConcatTensors use communication stream, but BKCL only support // default stream for communicating, so there exist some problems in @@ -713,15 +809,15 @@ void Reducer::FusedAllReduceSchedule(int run_order, Group &group) { parallel_ctx_->WaitComm(run_order); } #endif - group.DivNRanks(*parallel_ctx_->GetDeviceContext(run_order), nranks_); + group.DivNRanks(dev_context, nranks_); // Start allreduce parallel_ctx_->AllReduceByStream( group.dense_contents_, 
&(group.dense_contents_), run_order, false); - // Select common commstream to split tensors + // Select communication stream to split tensors // group.dense_contents_ ---> group.dense_tensors - group.SplitTensors(*parallel_ctx_->GetDeviceContext(run_order)); + group.SplitTensors(dev_context); } } @@ -747,14 +843,98 @@ std::vector> Reducer::RebuildGruops() { return rebuild_group_indices; } +void Reducer::ProcessUnusedDenseVars() { + // The calculation stream must be used here to + // avoid conflicts with communication. + VLOG(3) << "Local used vars : " + << string::join_strings(local_used_vars_, ','); + const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + // H2D is to allreduce the local_used_vars_ + auto *global_used_tensor = + global_used_vars_.GetMutable(); + framework::TensorFromVector(local_used_vars_, *dev_ctx, + global_used_tensor); + parallel_ctx_->AllReduceByStream(global_used_vars_, &global_used_vars_, 0, + true); + framework::TensorToVector(*global_used_tensor, *dev_ctx, + &local_used_vars_); + + // sync compute stream to get global used var message, + // but maybe affect speed performance + parallel_ctx_->SynchronizeCompute(); + VLOG(3) << "Global used vars : " + << string::join_strings(local_used_vars_, ','); + + for (const auto var_index : unused_vars_) { + const bool global_unused = (local_used_vars_[var_index] == 0); + + // global used but local unused, set grad + VLOG(3) << "Var [" << var_index << "] [" << vars_[var_index]->Name() + << "] global_unused:" << global_unused + << " has grad: " << HasGrad(var_index); + + if (!global_unused) { + VLOG(3) << "Start process unused Var"; + // 1. source var base + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto &group = groups_[group_index]; + const auto inside_group_index = var_locator.inside_group_index; + const auto &src_tensor = group.dense_tensors_[inside_group_index]; + // sparse no need to check and no support find_unused_parameters + if (group.is_sparse_) { + continue; + } + // 2. destination var base + auto dest_var_base = vars_[var_index]; + auto *dest_tensor = + dest_var_base->MutableVar()->GetMutable(); + const auto &dest_dims = dest_tensor->dims(); + + // 3. create grad var base or get grad var base + auto grad_var_base_tmp = dest_var_base->MutableGradVarBase(); + + // 4. 
set grad tensor + auto *dest_grad_tensor = + grad_var_base_tmp->MutableVar()->GetMutable(); + const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + TensorCopy(src_tensor, place_, *dev_ctx, dest_grad_tensor); + dest_grad_tensor->Resize(dest_dims); + } + } +} + +bool Reducer::HasGrad(size_t var_index) { + const auto grad_var = vars_[var_index]->GradVarBase(); + if (!grad_var || !grad_var->Var().IsInitialized()) { + return false; + } + + const auto &var = grad_var->Var(); + if (var.IsType()) { + if (var.Get().IsInitialized()) { + return true; + } + } else if (var.IsType()) { + if (var.Get().value().IsInitialized()) { + return true; + } + } else { + PADDLE_THROW(platform::errors::PermissionDenied( + "Only support LoDTensor and SelectedRows for gradient var")); + } + return false; +} + void Reducer::FinalizeBackward() { - all_group_ready_ = false; + groups_need_finalize_ = false; #ifdef PADDLE_WITH_XPU_BKCL { std::unique_lock lock(mutex_); cv_.wait(lock, [&] { return comm_op_count_ == 0; }); } #endif + // Must prevent compute_stream_ starting until all comm streams have finished for (int i = 0; i < nrings_; ++i) { parallel_ctx_->WaitComm(i); @@ -767,7 +947,18 @@ void Reducer::FinalizeBackward() { InitializeGroups(group_indices_); } - VLOG(3) << "In the batch, Reducer is finished..."; + if (find_unused_vars_) { +// TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + ProcessUnusedDenseVars(); +#endif + // Initialize local used vars + local_used_vars_.clear(); + local_used_vars_.resize(vars_.size(), 0); + VLOG(3) << "ProcessUnusedDenseVars is finished."; + } + + VLOG(3) << "In the batch, Reducer is finished."; } // According to the size of each parameter, it is allocated to different groups. diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index b2680d0dea71aa..0d613dbea89633 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -27,6 +27,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" @@ -153,13 +154,20 @@ class Reducer { void MarkGroupReady(size_t group_index); - void FusedAllReduceSchedule(int run_order, Group& group); // NOLINT + void FusedAllReduceSchedule(const int run_order, Group& group, // NOLINT + const int curr_group_index); void FinalizeBackward(); std::vector> RebuildGruops(); - inline bool NeedRebuildGroup() { return !has_rebuilt_group_; } + inline bool NeedRebuildGroup() { + return !has_rebuilt_group_ && !find_unused_vars_; + } + + void ProcessUnusedDenseVars(); + + bool HasGrad(size_t var_index); private: std::vector> vars_; @@ -188,7 +196,7 @@ class Reducer { std::vector unused_vars_; bool has_marked_unused_vars_{false}; bool find_unused_vars_{false}; - bool all_group_ready_{false}; + bool groups_need_finalize_{false}; #ifdef PADDLE_WITH_XPU_BKCL // comm_pool_ is used for scheduling allreduce in multi Kunlun cards training. std::unique_ptr<::ThreadPool> comm_pool_{nullptr}; @@ -196,6 +204,19 @@ class Reducer { std::mutex mutex_; std::condition_variable cv_; #endif + + // it just for checking hook, each parameter can only trigger one hook + std::vector vars_marked_ready_; + + // Following variables are to help control flow. 
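
The unused-parameter handling added here reduces to a vector allreduce followed by a per-variable decision. A minimal, self-contained sketch of that control flow, using plain STL containers in place of the real Reducer and ParallelContext types (an element-wise sum across ranks stands in for AllReduceByStream; names are illustrative only):

#include <cstdint>
#include <vector>

// Element-wise sum across ranks stands in for the real allreduce of
// local_used_vars_: after it, entry i is the number of ranks that
// produced a gradient for variable i.
std::vector<int32_t> AllReduceUsage(
    const std::vector<std::vector<int32_t>>& per_rank_usage) {
  std::vector<int32_t> global(per_rank_usage.front().size(), 0);
  for (const auto& local : per_rank_usage) {
    for (size_t i = 0; i < local.size(); ++i) global[i] += local[i];
  }
  return global;
}

// A variable is globally unused only if no rank used it. Otherwise a rank
// that did not produce the gradient locally copies the corresponding slice
// of the fused, allreduced buffer into its local gradient tensor.
bool NeedFillLocalGrad(const std::vector<int32_t>& global_usage,
                       size_t var_index, bool has_local_grad) {
  const bool global_unused = (global_usage[var_index] == 0);
  return !global_unused && !has_local_grad;
}
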
+ // local_used_vars_ uses 0/1 to indicate whether the + // var is used in iteration. After the end of the + // iteration, global_used_vars_ is obtained synchronously + // globally. Choose whether to update the local + // gradient according to the global_used_vars_. + std::vector local_used_vars_; + // global_used_vars_ is used in comm stream to avoid wait + framework::Variable global_used_vars_; }; std::vector> AssignGroupBySize( diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 7bf5f876681bab..5c4e1538cf0538 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -37,6 +37,30 @@ namespace imperative { using vb_vector = std::vector>; using var_pair = std::pair; +std::shared_ptr DoubleHook( + const std::shared_ptr& var) { + // 1. create out var + auto out_var = std::make_shared(var->Name()); + out_var->SetType(var->Type()); + out_var->SetDataType(var->DataType()); + out_var->SetForwardDataType(var->ForwardDataType()); + out_var->InnerSetOverridedStopGradient(var->InnerOverridedStopGradient()); + + // 2. get input and output var's tensor + auto* out_tensor = out_var->MutableVar()->GetMutable(); + auto& tensor = var->Var().Get(); + out_tensor->Resize(tensor.dims()); + + // 3. double calc + auto* data = tensor.data(); + auto* out_data = out_tensor->mutable_data(platform::CPUPlace()); + for (int64_t i = 0; i < out_tensor->numel(); ++i) { + out_data[i] = data[i] * 2.0; + } + + return out_var; +} + TEST(TestHooks, TestGradVarLeafBackwardHook) { // 1. prepare Tracer tracer; @@ -73,17 +97,14 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { framework::AttributeMap mul_attr_map; mul_attr_map["use_mkldnn"] = false; - // add GradAccumulatorPostHook - auto x_var_wrapper = x->SharedVar(); - x_var_wrapper->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) { - auto* grad_tensor = - grad->MutableVar()->GetMutable(); - for (int i = 0; i < grad_tensor->numel(); ++i) { - grad_tensor->mutable_data(place)[i] *= 2.0; - } - }))); + // add VariableWrapper hook + x->GradVarBase()->AddVariableWrapperHook( + std::make_shared(DoubleHook)); + + // add Void hook + int64_t hook_value = 0; + x->GradVarBase()->AddVoidHook( + std::make_shared>([&]() { hook_value = 10; })); // 2. forward tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); @@ -93,16 +114,21 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. 
backward + std::vector> tensors{out}; + std::vector> grad_tensors{nullptr}; BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); + // verify VariableWrapper hook result framework::LoDTensor x_grad; framework::TensorCopySync(x->GradVar().Get(), place, &x_grad); for (int i = 0; i < x_grad.numel(); ++i) { ASSERT_EQ(x_grad.data()[i], 8.0); } + // verify Void hook result + ASSERT_EQ(hook_value, 10); framework::LoDTensor y_grad; framework::TensorCopySync(y->GradVar().Get(), place, @@ -151,17 +177,14 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { memory::Copy(place, mutable_z, place, src_data.data(), sizeof(float) * src_data.size()); - // add GradAccumulatorPostHook - auto x_var_wrapper = x->SharedVar(); - x_var_wrapper->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) { - auto* grad_tensor = - grad->MutableVar()->GetMutable(); - for (int i = 0; i < grad_tensor->numel(); ++i) { - grad_tensor->mutable_data(place)[i] *= 2.0; - } - }))); + // add VariableWrapper hook + x->GradVarBase()->AddVariableWrapperHook( + std::make_shared(DoubleHook)); + + // add Void hook + int64_t hook_value = 0; + x->GradVarBase()->AddVoidHook( + std::make_shared>([&]() { hook_value = 100; })); // 2. forward var_pair x_pair = var_pair("X", vb_vector(1, x)); @@ -193,16 +216,21 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. backward + std::vector> tensors{out}; + std::vector> grad_tensors{nullptr}; BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); + // verify VariableWrapper hook result framework::LoDTensor x_grad; framework::TensorCopySync(x->GradVar().Get(), place, &x_grad); for (int i = 0; i < x_grad.numel(); ++i) { ASSERT_EQ(x_grad.data()[i], 16.0); } + // verify Void hook result + ASSERT_EQ(hook_value, 100); framework::LoDTensor y_grad; framework::TensorCopySync(y->GradVar().Get(), place, diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 9e3b0ea5df6838..76de413b3e6033 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -250,7 +250,10 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) { tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map, gpu_place, true); imperative::BasicEngine engine; - engine.Init(reduce_sum_out.get()); + + std::vector> tensors{reduce_sum_out}; + std::vector> grad_tensors{nullptr}; + engine.Init(tensors, grad_tensors); engine.Execute(); framework::LoDTensor rlt; @@ -376,8 +379,10 @@ TEST(test_tracer, test_var_without_grad_var) { ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL); + std::vector> tensors{vout}; + std::vector> grad_tensors{nullptr}; imperative::BasicEngine engine; - engine.Init(vout.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); // check the grad diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 608cc407d5b776..777cb10e0754c3 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -38,7 +38,7 @@ void SetCurrentTracer(const std::shared_ptr& tracer) { VLOG(6) << "Set current tracer: " << g_current_tracer; } -static void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { +void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { for (const auto& 
pair : outs) { for (const auto& var : pair.second) { // NOTE(zhiqiu): this happends when None output are passed from python diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index b10d1b2d0b49da..8f50550878262f 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -130,5 +130,7 @@ void IncreaseVarbaseReferenceCountUntilCopyComplete( const std::shared_ptr& var, const platform::Place& place); +void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad); + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index b42f25dcc88001..5fa8b89a396d9b 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -27,8 +27,8 @@ namespace paddle { namespace imperative { -class InteriorVarHookPipeline; -class LeafVarHookPipeline; +class VariableWrapperHook; +class InplaceVariableWrapperHook; class VarBase; class GradOpNode; @@ -38,6 +38,9 @@ class VariableWrapper { explicit VariableWrapper(const std::string& name) : name_(name) {} + VariableWrapper(const std::string& name, const framework::Variable& variable) + : var_(variable), name_(name) {} + ~VariableWrapper() { VLOG(10) << "Destruct VariableWrapper: " << Name(); } const framework::Variable& Var() const { return var_; } @@ -193,42 +196,6 @@ class VariableWrapper { } } - /* Hook related method: only can be call by GradVarBase */ - - bool HasInteriorHooks() const { return interior_hooks_ != nullptr; } - - bool HasLeafHooks() const { return leaf_hooks_ != nullptr; } - - void AddGradVarInteriorHook(std::unique_ptr&& hook) { - auto interior_hooks = GetGradVarInteriorHooksSafely(); - interior_hooks->add_hook(std::move(hook)); - } - - void AddGradVarLeafHook(std::unique_ptr&& hook) { - auto leaf_hooks = GetGradVarLeafHooksSafely(); - leaf_hooks->add_hook(std::move(hook)); - } - - void AddGradVarLeafBackwardHook( - std::unique_ptr&& hook) { - auto leaf_hooks = GetGradVarLeafHooksSafely(); - leaf_hooks->add_backward_hook(std::move(hook)); - } - - const std::shared_ptr& GetInteriorHooks() const { - return interior_hooks_; - } - - std::shared_ptr& GetInteriorHooks() { - return interior_hooks_; - } - - const std::shared_ptr& GetLeafHooks() const { - return leaf_hooks_; - } - - std::shared_ptr& GetLeafHooks() { return leaf_hooks_; } - uint32_t InplaceVersionSnapshot() const { return inplace_version_snapshot_; } void ResetInplaceVersion() { @@ -255,6 +222,38 @@ class VariableWrapper { return; } + /* Hook related methods */ + bool HasVariableWrapperHook() const { return !var_hooks_.empty(); } + + int64_t AddVariableWrapperHook(std::shared_ptr&& hook) { + var_hooks_.emplace(next_hook_id_, std::move(hook)); + return next_hook_id_++; + } + + bool RemoveVariableWrapperHook(const int64_t& hook_id) { + auto remove_cnt = var_hooks_.erase(hook_id); + if (remove_cnt == 0) { + return false; + } + return true; + } + + const std::map>& + GetVariableWrapperHooks() const { + return var_hooks_; + } + + bool HasVoidHook() const { return !void_hooks_.empty(); } + + void AddVoidHook(std::shared_ptr>&& hook) { + void_hooks_.emplace_back(std::move(hook)); + } + + const std::vector>>& GetVoidHooks() + const { + return void_hooks_; + } + private: void SetGradVar(const std::shared_ptr& var) { auto shared_var = grad_var_.lock(); @@ -289,41 +288,6 @@ class VariableWrapper { } } - /* Hook related private methods */ - std::shared_ptr GetGradVarSafely() const { - auto 
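
The replacement hook storage introduced in variable_wrapper.h is an id-keyed map plus an append-only list of void callbacks. A small stand-alone sketch of that pattern (generic types, not the actual VariableWrapper or VariableWrapperHook classes), showing why returning the incremented id makes a hook individually removable:

#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <vector>

template <typename Hook>
class HookRegistry {
 public:
  // Each registration gets a unique, monotonically increasing id;
  // that id is the handle later used to remove exactly this hook.
  int64_t Add(std::shared_ptr<Hook> hook) {
    hooks_.emplace(next_id_, std::move(hook));
    return next_id_++;
  }
  bool Remove(int64_t id) { return hooks_.erase(id) > 0; }

  // Void hooks run once after the whole backward pass and are never
  // removed individually, so a plain vector suffices.
  void AddVoid(std::function<void()> fn) {
    void_hooks_.push_back(std::move(fn));
  }
  void RunVoidHooks() const {
    for (const auto& fn : void_hooks_) fn();
  }

 private:
  int64_t next_id_{0};
  std::map<int64_t, std::shared_ptr<Hook>> hooks_;
  std::vector<std::function<void()>> void_hooks_;
};
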
shared_grad_var = grad_var_.lock(); - PADDLE_ENFORCE_NOT_NULL( - shared_grad_var, - platform::errors::PermissionDenied( - "Cannot add gradient hook on Tensor without gradient.")); - return shared_grad_var; - } - - std::shared_ptr& GetGradVarInteriorHooksSafely() { - auto shared_grad_var = GetGradVarSafely(); - PADDLE_ENFORCE_EQ(HasGradNode(), true, - platform::errors::PermissionDenied( - "Only interior Tensor in backward can register " - "interior gradient hook.")); - if (shared_grad_var->interior_hooks_ == nullptr) { - shared_grad_var->interior_hooks_ = - std::make_shared(); - } - return shared_grad_var->interior_hooks_; - } - - std::shared_ptr& GetGradVarLeafHooksSafely() { - auto shared_grad_var = GetGradVarSafely(); - PADDLE_ENFORCE_EQ( - HasGradNode(), false, - platform::errors::PermissionDenied( - "Only leaf Tensor in backward can register leaf gradient hook.")); - if (shared_grad_var->leaf_hooks_ == nullptr) { - shared_grad_var->leaf_hooks_ = std::make_shared(); - } - return shared_grad_var->leaf_hooks_; - } - private: framework::Variable var_; std::string name_; @@ -358,11 +322,19 @@ class VariableWrapper { // isn't need bool is_empty_{false}; - // NOTE: only grad var can hold hooks now - // only interior var can hold interior hooks - std::shared_ptr interior_hooks_; - // only leaf var can hold leaf hooks - std::shared_ptr leaf_hooks_; + // NOTE(chenweihang): only grad var will hold hooks now + int64_t next_hook_id_{0}; + // [ Hooks with VariableWrapper as input and output ] + // NOTE: Now registered for grad var, support adding and removing, + // key is the accumulated int64_t value + // NOTE: Var hook need to support removing, so need hook id + std::map> var_hooks_; + // [ Hooks without input and output ] + // NOTE: Now registered after the execution of the entire backward + // process is over, currently only used for reducing in distributed + // training + // NOTE: Now no need to support remove void hook + std::vector>> void_hooks_; }; } // namespace imperative diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 9a4637306bb359..03f86cc7ba6de6 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -57,11 +57,9 @@ if(WITH_TESTING) if (NOT APPLE AND NOT WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_inference_shared ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) - set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) - set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) endif() endif() diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 7bb092d0e3c1c0..4b6c746d57525a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1192,6 +1192,8 @@ USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); +USE_TRT_CONVERTER(anchor_generator); +USE_TRT_CONVERTER(yolo_box); USE_TRT_CONVERTER(roi_align); USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh index 
0d9f3d2aa237ac..c265721db57752 100755 --- a/paddle/fluid/inference/api/demo_ci/clean.sh +++ b/paddle/fluid/inference/api/demo_ci/clean.sh @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -x cd `dirname $0` rm -rf build/ data/ diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 61fcdb7a90830d..1d77ddaf73ef70 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -86,6 +86,7 @@ const std::vector kTRTSubgraphPasses({ "simplify_with_basic_ops_pass", // "embedding_eltwise_layernorm_fuse_pass", // "multihead_matmul_fuse_pass_v2", // + "multihead_matmul_fuse_pass_v3", // "skip_layernorm_fuse_pass", // "conv_bn_fuse_pass", // "unsqueeze2_eltwise_fuse_pass", // @@ -235,8 +236,8 @@ void CpuPassStrategy::EnableMKLDNN() { "reshape_transpose_matmul_mkldnn_fuse_pass", // "matmul_transpose_reshape_fuse_pass", // // Disabled due to topology-dependent speed-up - //"fc_mkldnn_pass", - //"fc_act_mkldnn_fuse_pass", + // "fc_mkldnn_pass", + // "fc_act_mkldnn_fuse_pass", "batch_norm_act_fuse_pass", // TODO(intel): Please fix the bug on windows. // https://github.com/PaddlePaddle/Paddle/issues/29710 diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index bc7b7355ea1922..3820ac5d7cc246 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -6,6 +6,8 @@ nv_library(tensorrt_converter shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc + anchor_generator_op.cc + yolo_box_op.cc roi_align_op.cc affine_channel_op.cc multiclass_nms_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc b/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc new file mode 100644 index 00000000000000..56aab9785c90f3 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* Anchor Generator Op */ +class AnchorGeneratorOpConverter : public OpConverter { + public: + void operator()(const paddle::framework::proto::OpDesc& op, + const paddle::framework::Scope& scope, + bool test_mode) override { + VLOG(3) << "convert a fluid anchor generator op to tensorrt plugin"; + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("Input").front(); + std::string anchor_name = op_desc.Output("Anchors").front(); + std::string variance_name = op_desc.Output("Variances").front(); + + auto* input = engine_->GetITensor(input_name); + const auto input_dims = input->getDimensions(); // C, H, W + std::vector output_names{anchor_name, variance_name}; + + const auto anchor_sizes = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("anchor_sizes")); + const auto aspect_ratios = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("aspect_ratios")); + const auto stride = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("stride")); + const auto variances = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("variances")); + const auto offset = BOOST_GET_CONST(float, op_desc.GetAttr("offset")); + const int num_anchors = aspect_ratios.size() * anchor_sizes.size(); + bool is_dynamic = engine_->with_dynamic_shape(); + const auto height = input_dims.d[1]; + const auto width = input_dims.d[2]; + const int box_num = width * height * num_anchors; + const nvinfer1::DataType data_type = nvinfer1::DataType::kFLOAT; + + nvinfer1::IPluginV2* anchor_generator_plugin = nullptr; + if (is_dynamic) { + anchor_generator_plugin = new plugin::AnchorGeneratorPluginDynamic( + data_type, anchor_sizes, aspect_ratios, stride, variances, offset, + num_anchors); + } else { + anchor_generator_plugin = new plugin::AnchorGeneratorPlugin( + data_type, anchor_sizes, aspect_ratios, stride, variances, offset, + height, width, num_anchors, box_num); + } + + std::vector anchor_generator_inputs{input}; + auto* anchor_generator_layer = engine_->network()->addPluginV2( + anchor_generator_inputs.data(), anchor_generator_inputs.size(), + *anchor_generator_plugin); + + RreplenishLayerAndOutput(anchor_generator_layer, "anchor_generator", + output_names, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(anchor_generator, AnchorGeneratorOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 26cd7b22d2baaa..a6484a13557052 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -158,17 +158,49 @@ class BatchNormOpConverter : public OpConverter { TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; - nvinfer1::IScaleLayer* layer = - TRT_ENGINE_ADD_LAYER(engine_, Scale, *const_cast(X), - nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), - scale_weights.get(), power_weights.get()); + int dynamic_shape_offset = engine_->with_dynamic_shape() ? 
1 : 0; + nvinfer1::ILayer* layer = nullptr; + nvinfer1::IShuffleLayer* expand_layer = nullptr; + nvinfer1::IShuffleLayer* squeeze_layer = nullptr; + + auto x_dim = X->getDimensions(); + if (x_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims expand_shape; + expand_shape.nbDims = 3 + dynamic_shape_offset; + for (int i = 0; i < 3 + dynamic_shape_offset; i++) { + if (i < x_dim.nbDims) { + expand_shape.d[i] = x_dim.d[i] < 0 ? 0 : x_dim.d[i]; + } else { + expand_shape.d[i] = 1; + } + } + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + expand_layer->setReshapeDimensions(expand_shape); + X = expand_layer->getOutput(0); + } + + layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *X, nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), + scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Y").front(); engine_->SetWeights(op_desc.Input("Bias").front(), std::move(combile_bias_tensor)); engine_->SetWeights(op_desc.Input("Scale").front(), std::move(combile_scale_tensor)); - RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); + if (x_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims squeeze_shape; + squeeze_shape.nbDims = x_dim.nbDims; + for (int i = 0; i < squeeze_shape.nbDims; i++) { + squeeze_shape.d[i] = x_dim.d[i] < 0 ? 0 : x_dim.d[i]; + } + squeeze_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); + squeeze_layer->setReshapeDimensions(squeeze_shape); + layer = static_cast(squeeze_layer); + } + RreplenishLayerAndOutput(layer, "batchnorm_add_scale", {output_name}, + test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index dfadb28a6520f9..74057addecd1f9 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -251,7 +251,7 @@ class ElementwiseTensorOpConverter : public OpConverter { #if IS_TRT_VERSION_GE(6000) plugin::ElementwisePluginDynamic* plugin = new plugin::ElementwisePluginDynamic(op_type_, axis); - layer = engine_->AddPluginV2(itensors.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(itensors.data(), 2, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 7f8843a3f67d05..f13f1724541239 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -31,7 +31,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { #if IS_TRT_VERSION_GE(6000) - VLOG(4) << "convert fluid swish op to tensorrt layer"; + VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); auto id_names = op_desc.Input("Ids"); @@ -89,10 +89,14 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { int64_t bias_size = framework::product(bias_dims); int64_t scale_size = framework::product(scale_dims); nvinfer1::ILayer* layer = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); if (engine_->with_dynamic_shape()) { if (engine_->use_oss()) { int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 
1 : 0); + if (enable_int8) { + output_fp16 = 1; + } PADDLE_ENFORCE_EQ( output_fp16, 1, platform::errors::InvalidArgument( @@ -169,7 +173,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { plugin = new plugin::EmbEltwiseLayernormPluginDynamic( input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, eps, with_fp16); - layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin); + layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 527d0ee208578a..194d76c737c7f9 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -106,8 +106,22 @@ class FcOpConverter : public OpConverter { auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output, TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) { - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, - n_output, weight.get(), bias.get()); + nvinfer1::ILayer* fc_layer = nullptr; + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in fc layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output, + nv_ksize, weight.get(), bias.get()); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, + n_output, weight.get(), bias.get()); + } auto output_name = op_desc.Output("Out").front(); if (activation_type == "relu") { @@ -229,13 +243,24 @@ class FcOpConverter : public OpConverter { "dims equals to 4, the last dim of input must be 1, but got %d", input_d[3])); } - for (int i = 0; i < 3; i++) { - if (i < input_dims) { - reshape_dim3[i] = input_d[i]; - } else { - reshape_dim3[i] = 1; + if (enable_int8) { + reshape_dim3[0] = 1; + for (int i = 0; i < 3; i++) { + reshape_dim3[0] *= input_d[i]; + if (i > 0) { + reshape_dim3[i] = 1; + } + } + } else { + for (int i = 0; i < 3; i++) { + if (i < input_dims) { + reshape_dim3[i] = input_d[i]; + } else { + reshape_dim3[i] = 1; + } } } + nvinfer1::Dims3 reshape_dim(reshape_dim3[0], reshape_dim3[1], reshape_dim3[2]); auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); @@ -249,11 +274,25 @@ class FcOpConverter : public OpConverter { platform::errors::InvalidArgument( "Invalid dimensions. 
When x_num_col_dims equals to " "2, input_dims should not be 1")); - for (int i = 0; i < 4; i++) { - if (i < input_dims) { - reshape_dim4[i] = input_d[i]; - } else { - reshape_dim4[i] = 1; + + if (enable_int8) { + for (int i = 0; i < 4; i++) { + if (i == 0) { + reshape_dim4[i] = input_d[i]; + } else { + reshape_dim4[i] = 1; + if (i < input_dims) { + reshape_dim4[1] *= input_d[i]; + } + } + } + } else { + for (int i = 0; i < 4; i++) { + if (i < input_dims) { + reshape_dim4[i] = input_d[i]; + } else { + reshape_dim4[i] = 1; + } } } nvinfer1::Dims4 reshape_dim(reshape_dim4[0], reshape_dim4[1], diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index 4c9996ca02cad4..ca5b6a8b52e797 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -64,7 +64,7 @@ class GeluOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::GeluPluginDynamic* plugin = new plugin::GeluPluginDynamic(with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index 8ce46a19d4b06e..f2f45c694ab44f 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -40,8 +40,25 @@ class MultiheadMatMulOpConverter : public OpConverter { auto* bias_v = scope.FindVar(bias_name); auto* bias_t = bias_v->GetMutable(); - float* weight_data = - engine_->GetWeightCPUData(weight_name, weight_t, false); + float* weight_data = nullptr; + bool enable_int8 = op_desc.HasAttr("enable_int8"); + float in_scale = 0.; + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("Input_scale"), true, + platform::errors::InvalidArgument( + "must have input scale in multihead layers in int8 mode")); + in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; + auto weight_scale = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("weight_scale")); + weight_data = + engine_->GetWeightCPUData(weight_name, weight_t, true, weight_scale); + engine_->SetTensorDynamicRange(input, in_scale); + } else { + weight_data = engine_->GetWeightCPUData(weight_name, weight_t, false); + } + float* bias_data = engine_->GetWeightCPUData(bias_name, bias_t, false); std::vector weight_data_tmp; weight_data_tmp.reserve(weight_t->numel()); @@ -117,8 +134,27 @@ class MultiheadMatMulOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_t->numel())}; - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, - n, weight, bias); + nvinfer1::ILayer* fc_layer = nullptr; + float dp_probs = 1.0 / 127.0; + if (enable_int8) { + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *input, n, + nv_ksize, weight, bias); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n, + weight, bias); + } + + if (enable_int8) { + PADDLE_ENFORCE_EQ( + op_desc.HasAttr("out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + 
engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + dp_probs = out_scale / 127.0; + } auto mask_tensor = engine_->GetITensor("qkv_plugin_mask"); @@ -128,6 +164,9 @@ class MultiheadMatMulOpConverter : public OpConverter { int type = static_cast((engine_->WithFp16() == 1) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT); + if (enable_int8) { + type = static_cast(nvinfer1::DataType::kHALF); + } bool has_mask = true; int var_seqlen = 1; const std::vector fields{ @@ -136,7 +175,7 @@ class MultiheadMatMulOpConverter : public OpConverter { {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1}, {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1}, {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}, - }; + { "dq_probs", &dp_probs, nvinfer1::PluginFieldType::kFLOAT32, 1 }}; nvinfer1::PluginFieldCollection* plugin_collection = static_cast( malloc(sizeof(*plugin_collection) + @@ -227,7 +266,7 @@ class MultiheadMatMulOpConverter : public OpConverter { plugin::DynamicPluginTensorRT* plugin = new plugin::QkvToContextPluginDynamic(hidden_in, head_number, head_size, scale, with_fp16); - layer = engine_->AddPluginV2(plugin_inputs.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin); } } else { PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index aa4e54b5845722..c10072602d7c51 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -147,7 +147,7 @@ class Pool2dOpConverter : public OpConverter { plugin::PoolPluginDynamic *plugin = new plugin::PoolPluginDynamic(ceil_mode, pool_type, adaptive, ksize, strides, paddings, global_pooling); - layer = engine_->AddPluginV2(&input1, 1, plugin); + layer = engine_->AddDynamicPlugin(&input1, 1, plugin); #endif } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 5e881ecbbc4e2c..74d77d8be44937 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -65,7 +65,7 @@ class PReluOpConverter : public OpConverter { #if IS_TRT_VERSION_GE(6000) plugin::PReluPluginDynamic* plugin = new plugin::PReluPluginDynamic( alpha_data, alpha_tensor_temp->numel(), mode); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 2e4a4e6120d2d8..b44bdcef7123c2 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -49,6 +49,7 @@ class SkipLayerNormOpConverter : public OpConverter { auto* scale = get_persistable_data("Scale", &scale_dims); int bias_size = framework::product(bias_dims); int scale_size = framework::product(scale_dims); + bool enable_int8 = op_desc.HasAttr("enable_int8"); nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { @@ -62,6 +63,10 @@ class SkipLayerNormOpConverter : public OpConverter { int ld = input1->getDimensions().d[2]; // hidden dimension assert(ld > 0); + if (enable_int8) { + type = 
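
The int8 handling in these converters is a straightforward rescaling by 127: the converter recovers a TensorRT dynamic range by multiplying the stored Input_scale attribute by 127, and turns the out_threshold range back into a dequantization factor for the plugin by dividing by 127. A short sketch of that arithmetic with hypothetical values (not taken from any real model):

#include <cmath>
#include <cstdio>

int main() {
  // Hypothetical values, for illustration only.
  const float input_scale_attr = 0.05f;  // per-element scale from the quant pass
  const float input_dynamic_range = input_scale_attr * 127.0f;  // what SetTensorDynamicRange expects

  const float out_threshold = 4.2f;               // max-abs range of the layer output
  const float dq_probs = out_threshold / 127.0f;  // scale handed to the plugin as dq_probs

  // Quantize / dequantize one activation value under this convention.
  const float x = 3.1f;
  const long q = std::lround(x / out_threshold * 127.0f);
  const float x_restored = static_cast<float>(q) * dq_probs;

  std::printf("range=%.2f q=%ld restored=%.3f\n", input_dynamic_range, q, x_restored);
  return 0;
}
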
static_cast(nvinfer1::DataType::kHALF); + } + const std::vector fields{ {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, @@ -90,7 +95,7 @@ class SkipLayerNormOpConverter : public OpConverter { plugin::SkipLayerNormPluginDynamic* plugin = new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, scale_size, eps, with_fp16); - layer = engine_->AddPluginV2(inputs.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); } } else { PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 0bd2b8c9bf5eef..aee39b7cf0c14c 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -31,6 +31,12 @@ class SliceOpConverter : public OpConverter { // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(input, out_scale); + } + std::vector axes = BOOST_GET_CONST(std::vector, op_desc.GetAttr("axes")); std::vector starts = @@ -90,14 +96,14 @@ class SliceOpConverter : public OpConverter { // bool ban_fp16 = engine_->disable_trt_plugin_fp16(); plugin::SpecialSlicePluginDynamic* plugin = new plugin::SpecialSlicePluginDynamic(); - layer = engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(), - plugin); + layer = engine_->AddDynamicPlugin(plugin_inputs.data(), + plugin_inputs.size(), plugin); } else { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SlicePluginDynamic* plugin = new plugin::SlicePluginDynamic(starts, ends, axes, with_fp16); - layer = engine_->AddPluginV2(&input, 1, plugin); + layer = engine_->AddDynamicPlugin(&input, 1, plugin); } #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 5d494c2093b2a9..75b317e7bfd90e 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -90,7 +90,7 @@ class SplitOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SplitPluginDynamic* plugin = new plugin::SplitPluginDynamic(axis, output_lengths, with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc index 1c971fa12e27e8..a0292b21124633 100644 --- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -45,6 +45,11 @@ class StackOpConverter : public OpConverter { for (int i = 0; i < input_num; ++i) { inputs[i] = engine_->GetITensor(input[i]); + if (op_desc.HasAttr("out_threshold")) { + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); + engine_->SetTensorDynamicRange(inputs[i], out_scale); + } } int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); @@ -59,7 +64,7 @@ class StackOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::StackPluginDynamic* plugin = new 
plugin::StackPluginDynamic(axis, input_num, with_fp16); - layer = engine_->AddPluginV2(inputs, input_num, plugin); + layer = engine_->AddDynamicPlugin(inputs, input_num, plugin); assert(layer != nullptr); #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc index 25944a2fead6cd..b2e394d14eba23 100644 --- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -65,7 +65,7 @@ class SwishOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SwishPluginDynamic* plugin = new plugin::SwishPluginDynamic(beta, with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc new file mode 100644 index 00000000000000..2d12eaf736b754 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class YoloBoxOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid yolo box op to tensorrt plugin"; + + framework::OpDesc op_desc(op, nullptr); + std::string X = op_desc.Input("X").front(); + std::string img_size = op_desc.Input("ImgSize").front(); + + auto* X_tensor = engine_->GetITensor(X); + auto* img_size_tensor = engine_->GetITensor(img_size); + + int class_num = BOOST_GET_CONST(int, op_desc.GetAttr("class_num")); + std::vector anchors = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("anchors")); + + int downsample_ratio = + BOOST_GET_CONST(int, op_desc.GetAttr("downsample_ratio")); + float conf_thresh = BOOST_GET_CONST(float, op_desc.GetAttr("conf_thresh")); + bool clip_bbox = BOOST_GET_CONST(bool, op_desc.GetAttr("clip_bbox")); + float scale_x_y = BOOST_GET_CONST(float, op_desc.GetAttr("scale_x_y")); + + int type_id = static_cast(engine_->WithFp16()); + auto input_dim = X_tensor->getDimensions(); + auto* yolo_box_plugin = new plugin::YoloBoxPlugin( + type_id ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, + anchors, class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, + input_dim.d[1], input_dim.d[2]); + + std::vector yolo_box_inputs; + yolo_box_inputs.push_back(X_tensor); + yolo_box_inputs.push_back(img_size_tensor); + + auto* yolo_box_layer = engine_->network()->addPluginV2( + yolo_box_inputs.data(), yolo_box_inputs.size(), *yolo_box_plugin); + + std::vector output_names; + output_names.push_back(op_desc.Output("Boxes").front()); + output_names.push_back(op_desc.Output("Scores").front()); + + RreplenishLayerAndOutput(yolo_box_layer, "yolo_box", output_names, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(yolo_box, YoloBoxOpConverter); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index de2924824f09de..2358e1ef976cdb 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -378,9 +378,9 @@ class TensorRTEngine { bool with_dynamic_shape() { return with_dynamic_shape_; } #if IS_TRT_VERSION_GE(6000) - nvinfer1::IPluginV2Layer* AddPluginV2(nvinfer1::ITensor* const* inputs, - int num_inputs, - plugin::DynamicPluginTensorRT* plugin) { + nvinfer1::IPluginV2Layer* AddDynamicPlugin( + nvinfer1::ITensor* const* inputs, int num_inputs, + plugin::DynamicPluginTensorRT* plugin) { owned_pluginv2_.emplace_back(plugin); return network()->addPluginV2(inputs, num_inputs, *plugin); } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 7c1b2e8001edbd..53225b79780773 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -45,6 +45,12 @@ struct SimpleOpTypeSetTeller : public Teller { #endif #if IS_TRT_VERSION_GE(7130) teller_set.insert("group_norm"); + int8_teller_set.insert("multihead_matmul"); + int8_teller_set.insert("skip_layernorm"); + int8_teller_set.insert("fused_embedding_eltwise_layernorm"); + int8_teller_set.insert("matmul"); + int8_teller_set.insert("stack"); + int8_teller_set.insert("slice"); #endif } @@ -111,10 +117,11 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", + "yolo_box", "roi_align", "affine_channel", - "multiclass_nms", "nearest_interp", + "anchor_generator", }; }; @@ -198,6 +205,15 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; } + if (op_type == "yolo_box") { + if (with_dynamic_shape) return false; + bool has_attrs = + (desc.HasAttr("class_num") && desc.HasAttr("anchors") && + desc.HasAttr("downsample_ratio") && desc.HasAttr("conf_thresh") && + desc.HasAttr("clip_bbox") && desc.HasAttr("scale_x_y")); + if (!has_attrs) return false; + } + if (op_type == "affine_channel") { if (!desc.HasAttr("data_layout")) return false; auto data_layout = framework::StringToDataLayout( diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 4107f9ef674339..1804e6c5571d3a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -5,6 +5,8 @@ nv_library(tensorrt_plugin instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu + 
anchor_generator_op_plugin.cu + yolo_box_op_plugin.cu roi_align_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu new file mode 100644 index 00000000000000..01ee86ceb48a9e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu @@ -0,0 +1,566 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +#include "paddle/fluid/operators/detection/anchor_generator_op.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#define PrepareParamsOnDevice() \ + constexpr int data_size = 4; \ + cudaMalloc(&anchor_sizes_device_, anchor_sizes_.size() * data_size); \ + cudaMalloc(&aspect_ratios_device_, aspect_ratios_.size() * data_size); \ + cudaMalloc(&stride_device_, stride_.size() * data_size); \ + cudaMalloc(&variances_device_, variances_.size() * data_size); \ + cudaMemcpy(anchor_sizes_device_, anchor_sizes_.data(), \ + anchor_sizes_.size() * data_size, cudaMemcpyHostToDevice); \ + cudaMemcpy(aspect_ratios_device_, aspect_ratios_.data(), \ + aspect_ratios_.size() * data_size, cudaMemcpyHostToDevice); \ + cudaMemcpy(stride_device_, stride_.data(), stride_.size() * data_size, \ + cudaMemcpyHostToDevice); \ + cudaMemcpy(variances_device_, variances_.data(), \ + variances_.size() * data_size, cudaMemcpyHostToDevice); + +AnchorGeneratorPlugin::AnchorGeneratorPlugin( + const nvinfer1::DataType data_type, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, const int height, + const int width, const int num_anchors, const int box_num) + : data_type_(data_type), + anchor_sizes_(anchor_sizes), + aspect_ratios_(aspect_ratios), + stride_(stride), + variances_(variances), + offset_(offset), + height_(height), + width_(width), + num_anchors_(num_anchors), + box_num_(box_num) { + // anchors must be float32, which is the generator proposals' input + PADDLE_ENFORCE_EQ(data_type_, nvinfer1::DataType::kFLOAT, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts float32.")); + PADDLE_ENFORCE_GE(height_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts height " + "greater than 0, but receive height = %d.", + height_)); + PADDLE_ENFORCE_GE(width_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts width " + "greater than 0, but receive width = %d.", + width_)); + PADDLE_ENFORCE_GE( + num_anchors_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts number of anchors greater " + "than 0, but 
receive number of anchors = %d.", + num_anchors_)); + PADDLE_ENFORCE_GE(box_num_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts box_num " + "greater than 0, but receive box_num = %d.", + box_num_)); + PrepareParamsOnDevice(); +} + +AnchorGeneratorPlugin::~AnchorGeneratorPlugin() { + auto release_device_ptr = [](void* ptr) { + if (ptr) { + cudaFree(ptr); + ptr = nullptr; + } + }; + release_device_ptr(anchor_sizes_device_); + release_device_ptr(aspect_ratios_device_); + release_device_ptr(stride_device_); + release_device_ptr(variances_device_); +} + +AnchorGeneratorPlugin::AnchorGeneratorPlugin(const void* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchor_sizes_); + DeserializeValue(&data, &length, &aspect_ratios_); + DeserializeValue(&data, &length, &stride_); + DeserializeValue(&data, &length, &variances_); + DeserializeValue(&data, &length, &offset_); + DeserializeValue(&data, &length, &height_); + DeserializeValue(&data, &length, &width_); + DeserializeValue(&data, &length, &num_anchors_); + DeserializeValue(&data, &length, &box_num_); + PrepareParamsOnDevice(); +} + +const char* AnchorGeneratorPlugin::getPluginType() const { + return "anchor_generator_plugin"; +} + +const char* AnchorGeneratorPlugin::getPluginVersion() const { return "1"; } + +int AnchorGeneratorPlugin::getNbOutputs() const { return 2; } + +nvinfer1::Dims AnchorGeneratorPlugin::getOutputDimensions( + int index, const nvinfer1::Dims* inputs, int nb_input_dims) { + nvinfer1::Dims dims{}; + dims.nbDims = 4; + dims.d[0] = height_; + dims.d[1] = width_; + dims.d[2] = num_anchors_; + dims.d[3] = 4; + return dims; +} + +bool AnchorGeneratorPlugin::supportsFormat( + nvinfer1::DataType type, nvinfer1::TensorFormat format) const { + // static shape plugin can't support different type between input/out + // it may cause addition overhead in half mode + return (type == data_type_ && format == nvinfer1::TensorFormat::kLINEAR); +} + +size_t AnchorGeneratorPlugin::getWorkspaceSize(int max_batch_size) const { + return 0; +} + +template +int AnchorGeneratorPlugin::enqueue_impl(int batch_size, + const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + const int block = 512; + const int gen_anchor_grid = (box_num_ + block - 1) / block; + T* anchors = static_cast(outputs[0]); + T* vars = static_cast(outputs[1]); + const T* anchor_sizes_device = static_cast(anchor_sizes_device_); + const T* aspect_ratios_device = static_cast(aspect_ratios_device_); + const T* stride_device = static_cast(stride_device_); + const T* variances_device = static_cast(variances_device_); + paddle::operators::GenAnchors<<>>( + anchors, aspect_ratios_device, aspect_ratios_.size(), anchor_sizes_device, + anchor_sizes_.size(), stride_device, stride_.size(), height_, width_, + offset_); + const int var_grid = (box_num_ * 4 + block - 1) / block; + paddle::operators::SetVariance<<>>( + vars, variances_device, variances_.size(), box_num_ * 4); + return cudaGetLastError() != cudaSuccess; +} + +int AnchorGeneratorPlugin::enqueue(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); +} + +int AnchorGeneratorPlugin::initialize() { return 0; } + +void AnchorGeneratorPlugin::terminate() {} + +size_t AnchorGeneratorPlugin::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += 
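
The kernel launch in enqueue_impl is a standard ceiling-division grid over box_num threads (and box_num * 4 for the variances). Plugging in hypothetical numbers, e.g. a 50x38 feature map with 3 anchor sizes and 3 aspect ratios as in the converter's num_anchors = aspect_ratios.size() * anchor_sizes.size(), gives:

#include <cstdio>

int main() {
  // Hypothetical feature-map and anchor configuration, illustration only.
  const int height = 38, width = 50;
  const int num_anchor_sizes = 3, num_aspect_ratios = 3;
  const int num_anchors = num_anchor_sizes * num_aspect_ratios;  // 9
  const int box_num = width * height * num_anchors;              // 17100

  const int block = 512;
  const int gen_anchor_grid = (box_num + block - 1) / block;  // 34 blocks for GenAnchors
  const int var_grid = (box_num * 4 + block - 1) / block;     // 134 blocks, 4 variance values per box

  std::printf("box_num=%d gen_grid=%d var_grid=%d\n", box_num, gen_anchor_grid,
              var_grid);
  return 0;
}
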
SerializedSize(data_type_); + serialize_size += SerializedSize(anchor_sizes_); + serialize_size += SerializedSize(aspect_ratios_); + serialize_size += SerializedSize(stride_); + serialize_size += SerializedSize(variances_); + serialize_size += SerializedSize(offset_); + serialize_size += SerializedSize(height_); + serialize_size += SerializedSize(width_); + serialize_size += SerializedSize(num_anchors_); + serialize_size += SerializedSize(box_num_); + return serialize_size; +} + +void AnchorGeneratorPlugin::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchor_sizes_); + SerializeValue(&buffer, aspect_ratios_); + SerializeValue(&buffer, stride_); + SerializeValue(&buffer, variances_); + SerializeValue(&buffer, offset_); + SerializeValue(&buffer, height_); + SerializeValue(&buffer, width_); + SerializeValue(&buffer, num_anchors_); + SerializeValue(&buffer, box_num_); +} + +void AnchorGeneratorPlugin::destroy() {} + +void AnchorGeneratorPlugin::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* AnchorGeneratorPlugin::getPluginNamespace() const { + return namespace_.c_str(); +} + +nvinfer1::DataType AnchorGeneratorPlugin::getOutputDataType( + int index, const nvinfer1::DataType* input_type, int nb_inputs) const { + return data_type_; +} + +bool AnchorGeneratorPlugin::isOutputBroadcastAcrossBatch( + int output_index, const bool* input_is_broadcast, int nb_inputs) const { + return true; +} + +bool AnchorGeneratorPlugin::canBroadcastInputAcrossBatch( + int input_index) const { + return false; +} + +void AnchorGeneratorPlugin::configurePlugin( + const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, const bool* input_is_broadcast, + const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, + int max_batct_size) {} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPlugin::clone() const { + auto plugin = new AnchorGeneratorPlugin( + data_type_, anchor_sizes_, aspect_ratios_, stride_, variances_, offset_, + height_, width_, num_anchors_, box_num_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +void AnchorGeneratorPluginCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* AnchorGeneratorPluginCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* AnchorGeneratorPluginCreator::getPluginName() const { + return "anchor_generator_plugin"; +} + +const char* AnchorGeneratorPluginCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +AnchorGeneratorPluginCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + int type_id = -1; + std::vector anchor_sizes, aspect_ratios, stride, variances; + float offset = .5; + int height = -1, width = -1; + int num_anchors = -1; + int box_num = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + const auto length = fc->fields[i].length; + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchor_sizes")) { + const auto* data = static_cast(fc->fields[i].data); + 
anchor_sizes.insert(anchor_sizes.end(), data, data + length); + } else if (field_name.compare("aspect_ratios")) { + const auto* data = static_cast(fc->fields[i].data); + aspect_ratios.insert(aspect_ratios.end(), data, data + length); + } else if (field_name.compare("stride")) { + const auto* data = static_cast(fc->fields[i].data); + stride.insert(stride.end(), data, data + length); + } else if (field_name.compare("variances")) { + const auto* data = static_cast(fc->fields[i].data); + variances.insert(variances.end(), data, data + length); + } else if (field_name.compare("offset")) { + offset = *static_cast(fc->fields[i].data); + } else if (field_name.compare("height")) { + height = *static_cast(fc->fields[i].data); + } else if (field_name.compare("width")) { + width = *static_cast(fc->fields[i].data); + } else if (field_name.compare("num_anchors")) { + num_anchors = *static_cast(fc->fields[i].data); + } else if (field_name.compare("box_num")) { + box_num = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + return new AnchorGeneratorPlugin(nvinfer1::DataType::kFLOAT, anchor_sizes, + aspect_ratios, stride, variances, offset, + height, width, num_anchors, box_num); +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new AnchorGeneratorPlugin(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +#if IS_TRT_VERSION_GE(6000) +AnchorGeneratorPluginDynamic::AnchorGeneratorPluginDynamic( + const nvinfer1::DataType data_type, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, + const int num_anchors) + : data_type_(data_type), + anchor_sizes_(anchor_sizes), + aspect_ratios_(aspect_ratios), + stride_(stride), + variances_(variances), + offset_(offset), + num_anchors_(num_anchors) { + // data_type_ is used to determine the output data type + // data_type_ can only be float32 + // height, width, num_anchors are calculated at configurePlugin + PADDLE_ENFORCE_EQ(data_type_, nvinfer1::DataType::kFLOAT, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts float32.")); + PADDLE_ENFORCE_GE( + num_anchors_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts number of anchors greater " + "than 0, but receive number of anchors = %d.", + num_anchors_)); + PrepareParamsOnDevice(); +} + +AnchorGeneratorPluginDynamic::~AnchorGeneratorPluginDynamic() { + auto release_device_ptr = [](void* ptr) { + if (ptr) { + cudaFree(ptr); + ptr = nullptr; + } + }; + release_device_ptr(anchor_sizes_device_); + release_device_ptr(aspect_ratios_device_); + release_device_ptr(stride_device_); + release_device_ptr(variances_device_); +} + +AnchorGeneratorPluginDynamic::AnchorGeneratorPluginDynamic(void const* data, + size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchor_sizes_); + DeserializeValue(&data, &length, &aspect_ratios_); + DeserializeValue(&data, &length, &stride_); + DeserializeValue(&data, &length, &variances_); + DeserializeValue(&data, &length, &offset_); + DeserializeValue(&data, &length, &num_anchors_); + PrepareParamsOnDevice(); +} + +nvinfer1::IPluginV2DynamicExt* AnchorGeneratorPluginDynamic::clone() const { + auto plugin = new AnchorGeneratorPluginDynamic( + data_type_, anchor_sizes_, 
aspect_ratios_, stride_, variances_, offset_,
+      num_anchors_);
+  plugin->setPluginNamespace(namespace_.c_str());
+  return plugin;
+}
+
+nvinfer1::DimsExprs AnchorGeneratorPluginDynamic::getOutputDimensions(
+    int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs,
+    nvinfer1::IExprBuilder& exprBuilder) {
+  nvinfer1::DimsExprs ret{};
+  ret.nbDims = 4;
+  ret.d[0] = inputs[0].d[2];  // feature height
+  ret.d[1] = inputs[0].d[3];  // feature width
+  ret.d[2] = exprBuilder.constant(num_anchors_);
+  ret.d[3] = exprBuilder.constant(4);
+  return ret;
+}
+
+bool AnchorGeneratorPluginDynamic::supportsFormatCombination(
+    int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs,
+    int nbOutputs) {
+  // the input can be any type and format; it doesn't matter because
+  // the anchor generator doesn't read input raw data, it only needs the shape
+  auto type = inOut[pos].type;
+  auto format = inOut[pos].format;
+#if IS_TRT_VERSION_GE(7234)
+  if (pos == 0) return true;
+#else
+  if (pos == 0) return format == nvinfer1::TensorFormat::kLINEAR;
+#endif
+  return (type == nvinfer1::DataType::kFLOAT &&
+          format == nvinfer1::TensorFormat::kLINEAR);
+}
+
+void AnchorGeneratorPluginDynamic::configurePlugin(
+    const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs,
+    const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {}
+
+size_t AnchorGeneratorPluginDynamic::getWorkspaceSize(
+    const nvinfer1::PluginTensorDesc* inputs, int nbInputs,
+    const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const {
+  return 0;
+}
+
+template <typename T>
+int AnchorGeneratorPluginDynamic::enqueue_impl(
+    const nvinfer1::PluginTensorDesc* inputDesc,
+    const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
+    void* const* outputs, void* workspace, cudaStream_t stream) {
+  const int height = inputDesc[0].dims.d[2];
+  const int width = inputDesc[0].dims.d[3];
+  const int box_num = height * width * num_anchors_;
+  const int block = 512;
+  const int gen_anchor_grid = (box_num + block - 1) / block;
+  T* anchors = static_cast<T*>(outputs[0]);
+  T* vars = static_cast<T*>(outputs[1]);
+  const T* anchor_sizes_device = static_cast<const T*>(anchor_sizes_device_);
+  const T* aspect_ratios_device = static_cast<const T*>(aspect_ratios_device_);
+  const T* stride_device = static_cast<const T*>(stride_device_);
+  const T* variances_device = static_cast<const T*>(variances_device_);
+  paddle::operators::GenAnchors<T><<<gen_anchor_grid, block, 0, stream>>>(
+      anchors, aspect_ratios_device, aspect_ratios_.size(),
+      anchor_sizes_device, anchor_sizes_.size(), stride_device,
+      stride_.size(), height, width, offset_);
+  const int var_grid = (box_num * 4 + block - 1) / block;
+  paddle::operators::SetVariance<T><<<var_grid, block, 0, stream>>>(
+      vars, variances_device, variances_.size(), box_num * 4);
+  return cudaGetLastError() != cudaSuccess;
+}
+
+int AnchorGeneratorPluginDynamic::enqueue(
+    const nvinfer1::PluginTensorDesc* inputDesc,
+    const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs,
+    void* const* outputs, void* workspace, cudaStream_t stream) {
+  assert(outputDesc[0].type == nvinfer1::DataType::kFLOAT);
+  assert(outputDesc[1].type == nvinfer1::DataType::kFLOAT);
+  return enqueue_impl<float>(inputDesc, outputDesc, inputs, outputs, workspace,
+                             stream);
+}
+
+nvinfer1::DataType AnchorGeneratorPluginDynamic::getOutputDataType(
+    int index, const nvinfer1::DataType* inputTypes, int nbInputs) const {
+  return data_type_;
+}
+
+const char* AnchorGeneratorPluginDynamic::getPluginType() const {
+  return "anchor_generator_plugin_dynamic";
+}
+
+int AnchorGeneratorPluginDynamic::getNbOutputs() const { return 2; }
+
+int
AnchorGeneratorPluginDynamic::initialize() { return 0; } + +void AnchorGeneratorPluginDynamic::terminate() {} + +size_t AnchorGeneratorPluginDynamic::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchor_sizes_); + serialize_size += SerializedSize(aspect_ratios_); + serialize_size += SerializedSize(stride_); + serialize_size += SerializedSize(variances_); + serialize_size += SerializedSize(offset_); + serialize_size += SerializedSize(num_anchors_); + return serialize_size; +} + +void AnchorGeneratorPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchor_sizes_); + SerializeValue(&buffer, aspect_ratios_); + SerializeValue(&buffer, stride_); + SerializeValue(&buffer, variances_); + SerializeValue(&buffer, offset_); + SerializeValue(&buffer, num_anchors_); +} + +void AnchorGeneratorPluginDynamic::destroy() {} + +void AnchorGeneratorPluginDynamicCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* AnchorGeneratorPluginDynamicCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* AnchorGeneratorPluginDynamicCreator::getPluginName() const { + return "anchor_generator_plugin_dynamic"; +} + +const char* AnchorGeneratorPluginDynamicCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +AnchorGeneratorPluginDynamicCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + int type_id = -1; + std::vector anchor_sizes, aspect_ratios, stride, variances; + float offset = .5; + int num_anchors = -1; + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + const auto length = fc->fields[i].length; + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchor_sizes")) { + const auto* data = static_cast(fc->fields[i].data); + anchor_sizes.insert(anchor_sizes.end(), data, data + length); + } else if (field_name.compare("aspect_ratios")) { + const auto* data = static_cast(fc->fields[i].data); + aspect_ratios.insert(aspect_ratios.end(), data, data + length); + } else if (field_name.compare("stride")) { + const auto* data = static_cast(fc->fields[i].data); + stride.insert(stride.end(), data, data + length); + } else if (field_name.compare("variances")) { + const auto* data = static_cast(fc->fields[i].data); + variances.insert(variances.end(), data, data + length); + } else if (field_name.compare("offset")) { + offset = *static_cast(fc->fields[i].data); + } else if (field_name.compare("num_anchors")) { + num_anchors = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + return new AnchorGeneratorPluginDynamic(nvinfer1::DataType::kFLOAT, + anchor_sizes, aspect_ratios, stride, + variances, offset, num_anchors); +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new AnchorGeneratorPluginDynamic(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} +#endif + +} // namespace plugin +} // namespace tensorrt +} // 
namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h new file mode 100644 index 00000000000000..aff0b6a6802f11 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h @@ -0,0 +1,201 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class AnchorGeneratorPlugin : public nvinfer1::IPluginV2Ext { + public: + explicit AnchorGeneratorPlugin( + const nvinfer1::DataType, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, const int height, + const int width, const int num_anchors, const int box_num); + AnchorGeneratorPlugin(const void* data, size_t length); + ~AnchorGeneratorPlugin() override; + const char* getPluginType() const override; + const char* getPluginVersion() const override; + int getNbOutputs() const override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nb_input_dims) override; + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const override; + size_t getWorkspaceSize(int max_batch_size) const override; + int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_type, + int nb_inputs) const override; + bool isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const override; + bool canBroadcastInputAcrossBatch(int input_index) const override; + void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, + const bool* input_is_broadcast, + const bool* output_is_broadcast, + nvinfer1::PluginFormat float_format, + int max_batct_size) override; + nvinfer1::IPluginV2Ext* clone() const override; + + private: + template + int enqueue_impl(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream); + nvinfer1::DataType data_type_; + std::vector anchor_sizes_; + std::vector aspect_ratios_; + std::vector stride_; + std::vector variances_; + float offset_; + void* anchor_sizes_device_; + 
void* aspect_ratios_device_; + void* stride_device_; + void* variances_device_; + int height_; + int width_; + int num_anchors_; + int box_num_; + std::string namespace_; +}; + +class AnchorGeneratorPluginCreator : public nvinfer1::IPluginCreator { + public: + AnchorGeneratorPluginCreator() = default; + ~AnchorGeneratorPluginCreator() override = default; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; + +REGISTER_TRT_PLUGIN_V2(AnchorGeneratorPluginCreator); + +#if IS_TRT_VERSION_GE(6000) +class AnchorGeneratorPluginDynamic : public DynamicPluginTensorRT { + public: + explicit AnchorGeneratorPluginDynamic(const nvinfer1::DataType data_type, + const std::vector& anchor_sizes, + const std::vector& aspect_ratios, + const std::vector& stride, + const std::vector& variances, + const float offset, + const int num_anchors); + AnchorGeneratorPluginDynamic(void const* data, size_t length); + ~AnchorGeneratorPluginDynamic(); + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + const char* getPluginType() const override; + int getNbOutputs() const override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + + private: + template + int enqueue_impl(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, + void* workspace, cudaStream_t stream); + nvinfer1::DataType data_type_; + std::vector anchor_sizes_; + std::vector aspect_ratios_; + std::vector stride_; + std::vector variances_; + float offset_; + void* anchor_sizes_device_; + void* aspect_ratios_device_; + void* stride_device_; + void* variances_device_; + int num_anchors_; + std::string namespace_; +}; + +class AnchorGeneratorPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + AnchorGeneratorPluginDynamicCreator() = default; + ~AnchorGeneratorPluginDynamicCreator() override = default; + void setPluginNamespace(const char* lib_namespace) 
override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; +REGISTER_TRT_PLUGIN_V2(AnchorGeneratorPluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h index 49212aae9aa90d..75a1dd85f0f2c4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -144,9 +144,9 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT { int axis_; }; -class ElementwisePluginV2Creator : public nvinfer1::IPluginCreator { +class ElementwisePluginDynamicCreator : public nvinfer1::IPluginCreator { public: - ElementwisePluginV2Creator() {} + ElementwisePluginDynamicCreator() {} const char* getPluginName() const override { return "elementwise_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -182,7 +182,7 @@ class ElementwisePluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(ElementwisePluginV2Creator); +REGISTER_TRT_PLUGIN_V2(ElementwisePluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h index 6c8381a750cba9..7de84a8fc49bcc 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h @@ -306,9 +306,10 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { } }; -class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { +class EmbEltwiseLayernormPluginDynamicCreator + : public nvinfer1::IPluginCreator { public: - EmbEltwiseLayernormPluginV2Creator() {} + EmbEltwiseLayernormPluginDynamicCreator() {} const char* getPluginName() const override { return "fused_embedding_eltwise_layernorm_plugin"; } @@ -345,7 +346,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h index 979f600a3a9cea..23e507ee477e1a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h @@ -115,9 +115,9 @@ class GeluPluginDynamic : public DynamicPluginTensorRT { void destroy() override { delete this; } }; -class GeluPluginV2Creator : public nvinfer1::IPluginCreator { +class GeluPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - GeluPluginV2Creator() {} + GeluPluginDynamicCreator() {} const char* getPluginName() const override { return "gelu_plugin"; } 
const char* getPluginVersion() const override { return "1"; } @@ -153,7 +153,7 @@ class GeluPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(GeluPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(GeluPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h index b852f5a454c07c..7147d9855755be 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h @@ -118,9 +118,9 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { float scale_; }; -class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { +class QkvToContextPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - QkvToContextPluginV2Creator() {} + QkvToContextPluginDynamicCreator() {} const char* getPluginName() const override { return "qkv_to_context_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -155,7 +155,7 @@ class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(QkvToContextPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(QkvToContextPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu index 42c0df41a1b5ef..6e7ed0054f502e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -364,6 +364,7 @@ RoiAlignPluginDynamicCreator::getFieldNames() { nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::createPlugin( const char* name, const nvinfer1::PluginFieldCollection* fc) { const nvinfer1::PluginField* fields = fc->fields; + return nullptr; } nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::deserializePlugin( diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h index 0e457fdc8f4474..ac621784550f2f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h @@ -119,9 +119,9 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { float eps_; }; -class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { +class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SkipLayerNormPluginV2Creator() {} + SkipLayerNormPluginDynamicCreator() {} const char* getPluginName() const override { return "skip_layernorm_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -156,7 +156,7 @@ class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index 340406c5e7fae8..9d4f9a35c3b6fe 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -121,9 +121,9 @@ class 
SlicePluginDynamic : public DynamicPluginTensorRT { cudaStream_t copy_stream_; }; -class SlicePluginV2Creator : public nvinfer1::IPluginCreator { +class SlicePluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SlicePluginV2Creator() {} + SlicePluginDynamicCreator() {} const char* getPluginName() const override { return "slice_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -155,7 +155,7 @@ class SlicePluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; }; -REGISTER_TRT_PLUGIN_V2(SlicePluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SlicePluginDynamicCreator); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index e43b57357fb64f..1ee895154d6b04 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -193,9 +193,9 @@ class SplitPluginDynamic : public DynamicPluginTensorRT { std::vector output_length_; }; -class SplitPluginV2Creator : public nvinfer1::IPluginCreator { +class SplitPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SplitPluginV2Creator() {} + SplitPluginDynamicCreator() {} const char* getPluginName() const override { return "split_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -231,7 +231,7 @@ class SplitPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SplitPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SplitPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h index 85cc6916238fef..11579aadcc4573 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h @@ -127,9 +127,9 @@ class SwishPluginDynamic : public DynamicPluginTensorRT { float beta_; }; -class SwishPluginV2Creator : public nvinfer1::IPluginCreator { +class SwishPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SwishPluginV2Creator() {} + SwishPluginDynamicCreator() {} const char* getPluginName() const override { return "swish_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -165,7 +165,7 @@ class SwishPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SwishPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SwishPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu new file mode 100644 index 00000000000000..13d07e774036a4 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -0,0 +1,401 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" +#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" +#include "paddle/fluid/operators/detection/yolo_box_op.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, + const std::vector& anchors, + const int class_num, const float conf_thresh, + const int downsample_ratio, const bool clip_bbox, + const float scale_x_y, const int input_h, + const int input_w) + : data_type_(data_type), + class_num_(class_num), + conf_thresh_(conf_thresh), + downsample_ratio_(downsample_ratio), + clip_bbox_(clip_bbox), + scale_x_y_(scale_x_y), + input_h_(input_h), + input_w_(input_w) { + anchors_.insert(anchors_.end(), anchors.cbegin(), anchors.cend()); + assert(data_type_ == nvinfer1::DataType::kFLOAT || + data_type_ == nvinfer1::DataType::kHALF); + assert(class_num_ > 0); + assert(input_h_ > 0); + assert(input_w_ > 0); + + cudaMalloc(&anchors_device_, anchors.size() * sizeof(int)); + cudaMemcpy(anchors_device_, anchors.data(), anchors.size() * sizeof(int), + cudaMemcpyHostToDevice); +} + +YoloBoxPlugin::YoloBoxPlugin(const void* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchors_); + DeserializeValue(&data, &length, &class_num_); + DeserializeValue(&data, &length, &conf_thresh_); + DeserializeValue(&data, &length, &downsample_ratio_); + DeserializeValue(&data, &length, &clip_bbox_); + DeserializeValue(&data, &length, &scale_x_y_); + DeserializeValue(&data, &length, &input_h_); + DeserializeValue(&data, &length, &input_w_); +} + +YoloBoxPlugin::~YoloBoxPlugin() { + if (anchors_device_ != nullptr) { + cudaFree(anchors_device_); + anchors_device_ = nullptr; + } +} + +const char* YoloBoxPlugin::getPluginType() const { return "yolo_box_plugin"; } + +const char* YoloBoxPlugin::getPluginVersion() const { return "1"; } + +int YoloBoxPlugin::getNbOutputs() const { return 2; } + +nvinfer1::Dims YoloBoxPlugin::getOutputDimensions(int index, + const nvinfer1::Dims* inputs, + int nb_input_dims) { + const int anchor_num = anchors_.size() / 2; + const int box_num = inputs[0].d[1] * inputs[0].d[2] * anchor_num; + + assert(index <= 1); + + if (index == 0) { + return nvinfer1::Dims2(box_num, 4); + } + return nvinfer1::Dims2(box_num, class_num_); +} + +bool YoloBoxPlugin::supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const { + return ((type == data_type_ || type == nvinfer1::DataType::kINT32) && + format == nvinfer1::TensorFormat::kLINEAR); +} + +size_t YoloBoxPlugin::getWorkspaceSize(int max_batch_size) const { return 0; } + +template +__device__ inline T sigmoid(T x) { + return 1. / (1. 
+ exp(-x)); +} + +template <> +__device__ inline float sigmoid(float x) { + return 1.f / (1.f + expf(-x)); +} + +template +__device__ inline void GetYoloBox(float* box, const T* x, const int* anchors, + int i, int j, int an_idx, int grid_size_h, + int grid_size_w, int input_size_h, + int input_size_w, int index, int stride, + int img_height, int img_width, float scale, + float bias) { + box[0] = static_cast( + (i + sigmoid(static_cast(x[index]) * scale + bias)) * img_width / + grid_size_w); + box[1] = static_cast( + (j + sigmoid(static_cast(x[index + stride]) * scale + bias)) * + img_height / grid_size_h); + box[2] = static_cast(expf(static_cast(x[index + 2 * stride])) * + anchors[2 * an_idx] * img_width / input_size_w); + box[3] = + static_cast(expf(static_cast(x[index + 3 * stride])) * + anchors[2 * an_idx + 1] * img_height / input_size_h); +} + +__device__ inline int GetEntryIndex(int batch, int an_idx, int hw_idx, + int an_num, int an_stride, int stride, + int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +template +__device__ inline void CalcDetectionBox(T* boxes, const float* box, + const int box_idx, const int img_height, + const int img_width, bool clip_bbox) { + float tmp_box_0, tmp_box_1, tmp_box_2, tmp_box_3; + tmp_box_0 = box[0] - box[2] / 2; + tmp_box_1 = box[1] - box[3] / 2; + tmp_box_2 = box[0] + box[2] / 2; + tmp_box_3 = box[1] + box[3] / 2; + + if (clip_bbox) { + tmp_box_0 = max(tmp_box_0, 0.f); + tmp_box_1 = max(tmp_box_1, 0.f); + tmp_box_2 = min(tmp_box_2, static_cast(img_width - 1)); + tmp_box_3 = min(tmp_box_3, static_cast(img_height - 1)); + } + + boxes[box_idx + 0] = static_cast(tmp_box_0); + boxes[box_idx + 1] = static_cast(tmp_box_1); + boxes[box_idx + 2] = static_cast(tmp_box_2); + boxes[box_idx + 3] = static_cast(tmp_box_3); +} + +template +__device__ inline void CalcLabelScore(T* scores, const T* input, + const int label_idx, const int score_idx, + const int class_num, const float conf, + const int stride) { + for (int i = 0; i < class_num; i++) { + scores[score_idx + i] = static_cast( + conf * sigmoid(static_cast(input[label_idx + i * stride]))); + } +} + +template +__global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, + T* boxes, T* scores, const float conf_thresh, + const int* anchors, const int n, const int h, + const int w, const int an_num, const int class_num, + const int box_num, int input_size_h, + int input_size_w, bool clip_bbox, const float scale, + const float bias) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + float box[4]; + for (; tid < n * box_num; tid += stride) { + int grid_num = h * w; + int i = tid / box_num; + int j = (tid % box_num) / grid_num; + int k = (tid % grid_num) / w; + int l = tid % w; + + int an_stride = (5 + class_num) * grid_num; + int img_height = imgsize[2 * i]; + int img_width = imgsize[2 * i + 1]; + + int obj_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + float conf = sigmoid(static_cast(input[obj_idx])); + int box_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + + if (conf < conf_thresh) { + for (int i = 0; i < 4; ++i) { + box[i] = 0.f; + } + } else { + GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, + input_size_w, box_idx, grid_num, img_height, img_width, + scale, bias); + } + + box_idx = (i * box_num + j * grid_num + k * w + l) * 4; + CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); + + int label_idx = + GetEntryIndex(i, 
j, k * w + l, an_num, an_stride, grid_num, 5); + int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; + CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, + grid_num); + } +} + +template +int YoloBoxPlugin::enqueue_impl(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + const int n = batch_size; + const int h = input_h_; + const int w = input_w_; + const int an_num = anchors_.size() / 2; + const int box_num = h * w * an_num; + int input_size_h = downsample_ratio_ * h; + int input_size_w = downsample_ratio_ * w; + + float bias = -0.5 * (scale_x_y_ - 1.); + constexpr int threads = 256; + + KeYoloBoxFw<<<(n * box_num + threads - 1) / threads, threads, 0, stream>>>( + reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), + reinterpret_cast(outputs[0]), reinterpret_cast(outputs[1]), + conf_thresh_, anchors_device_, n, h, w, an_num, class_num_, box_num, + input_size_h, input_size_w, clip_bbox_, scale_x_y_, bias); + return cudaGetLastError() != cudaSuccess; +} + +int YoloBoxPlugin::enqueue(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + if (data_type_ == nvinfer1::DataType::kFLOAT) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); + } else if (data_type_ == nvinfer1::DataType::kHALF) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); + } + assert("unsupported type."); +} + +int YoloBoxPlugin::initialize() { return 0; } + +void YoloBoxPlugin::terminate() {} + +size_t YoloBoxPlugin::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchors_); + serialize_size += SerializedSize(class_num_); + serialize_size += SerializedSize(conf_thresh_); + serialize_size += SerializedSize(downsample_ratio_); + serialize_size += SerializedSize(clip_bbox_); + serialize_size += SerializedSize(scale_x_y_); + serialize_size += SerializedSize(input_h_); + serialize_size += SerializedSize(input_w_); + return serialize_size; +} + +void YoloBoxPlugin::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchors_); + SerializeValue(&buffer, class_num_); + SerializeValue(&buffer, conf_thresh_); + SerializeValue(&buffer, downsample_ratio_); + SerializeValue(&buffer, clip_bbox_); + SerializeValue(&buffer, scale_x_y_); + SerializeValue(&buffer, input_h_); + SerializeValue(&buffer, input_w_); +} + +void YoloBoxPlugin::destroy() {} + +void YoloBoxPlugin::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* YoloBoxPlugin::getPluginNamespace() const { + return namespace_.c_str(); +} + +nvinfer1::DataType YoloBoxPlugin::getOutputDataType( + int index, const nvinfer1::DataType* input_type, int nb_inputs) const { + return data_type_; +} + +bool YoloBoxPlugin::isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const { + return false; +} + +bool YoloBoxPlugin::canBroadcastInputAcrossBatch(int input_index) const { + return false; +} + +void YoloBoxPlugin::configurePlugin( + const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, const bool* input_is_broadcast, + const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, + int max_batct_size) {} + +nvinfer1::IPluginV2Ext* 
YoloBoxPlugin::clone() const { + return new YoloBoxPlugin(data_type_, anchors_, class_num_, conf_thresh_, + downsample_ratio_, clip_bbox_, scale_x_y_, input_h_, + input_w_); +} + +YoloBoxPluginCreator::YoloBoxPluginCreator() {} + +void YoloBoxPluginCreator::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* YoloBoxPluginCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* YoloBoxPluginCreator::getPluginName() const { + return "yolo_box_plugin"; +} + +const char* YoloBoxPluginCreator::getPluginVersion() const { return "1"; } + +const nvinfer1::PluginFieldCollection* YoloBoxPluginCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + + int type_id = -1; + std::vector anchors; + int class_num = -1; + float conf_thresh = 0.01; + int downsample_ratio = 32; + bool clip_bbox = true; + float scale_x_y = 1.; + int h = -1; + int w = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchors")) { + const int length = fc->fields[i].length; + const int* data = static_cast(fc->fields[i].data); + anchors.insert(anchors.end(), data, data + length); + } else if (field_name.compare("class_num")) { + class_num = *static_cast(fc->fields[i].data); + } else if (field_name.compare("conf_thresh")) { + conf_thresh = *static_cast(fc->fields[i].data); + } else if (field_name.compare("downsample_ratio")) { + downsample_ratio = *static_cast(fc->fields[i].data); + } else if (field_name.compare("clip_bbox")) { + clip_bbox = *static_cast(fc->fields[i].data); + } else if (field_name.compare("scale_x_y")) { + scale_x_y = *static_cast(fc->fields[i].data); + } else if (field_name.compare("h")) { + h = *static_cast(fc->fields[i].data); + } else if (field_name.compare("w")) { + w = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + + return new YoloBoxPlugin( + type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, anchors, + class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, h, w); +} + +nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new YoloBoxPlugin(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h new file mode 100644 index 00000000000000..8ca21da7ae0377 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { + public: + explicit YoloBoxPlugin(const nvinfer1::DataType data_type, + const std::vector& anchors, const int class_num, + const float conf_thresh, const int downsample_ratio, + const bool clip_bbox, const float scale_x_y, + const int input_h, const int input_w); + YoloBoxPlugin(const void* data, size_t length); + ~YoloBoxPlugin() override; + + const char* getPluginType() const override; + const char* getPluginVersion() const override; + int getNbOutputs() const override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nb_input_dims) override; + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const override; + size_t getWorkspaceSize(int max_batch_size) const override; + int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) override; + template + int enqueue_impl(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream); + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_type, + int nb_inputs) const override; + bool isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const override; + bool canBroadcastInputAcrossBatch(int input_index) const override; + void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, + const bool* input_is_broadcast, + const bool* output_is_broadcast, + nvinfer1::PluginFormat float_format, + int max_batct_size) override; + nvinfer1::IPluginV2Ext* clone() const override; + + private: + nvinfer1::DataType data_type_; + std::vector anchors_; + int* anchors_device_; + int class_num_; + float conf_thresh_; + int downsample_ratio_; + bool clip_bbox_; + float scale_x_y_; + int input_h_; + int input_w_; + std::string namespace_; +}; + +class YoloBoxPluginCreator : public nvinfer1::IPluginCreator { + public: + YoloBoxPluginCreator(); + ~YoloBoxPluginCreator() override = default; + + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + 
nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; + +REGISTER_TRT_PLUGIN_V2(YoloBoxPluginCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc index 6d4bb70df6f3ad..9211ea246a5c5e 100644 --- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc +++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc @@ -75,14 +75,15 @@ int test_predictor_zero_copy(const AnalysisConfig& config_in, } std::vector input({1}); - auto in_tensor{predictor->GetInputTensor(predictor->GetInputNames().front())}; + auto in_tensor = + predictor->GetInputTensor(predictor->GetInputNames().front()); in_tensor->Reshape({1, 1}); in_tensor->copy_from_cpu(input.data()); predictor->ZeroCopyRun(); - auto out_tensor{ - predictor->GetOutputTensor(predictor->GetOutputNames().front())}; + auto out_tensor = + predictor->GetOutputTensor(predictor->GetOutputNames().front()); std::vector data_o(10); out_tensor->copy_to_cpu(data_o.data()); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 377ea376773899..2ea047fa13c105 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -27,12 +27,18 @@ if (WITH_ROCM) cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) endif() +if (WITH_ASCEND_CL) + cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info) +endif() + cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) if (WITH_GPU OR WITH_ROCM) set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator) elseif(WITH_XPU) set(AllocatorFacadeDeps xpu_info) +elseif(WITH_ASCEND) + set(AllocatorFacadeDeps ascend_npu_info) else () set(AllocatorFacadeDeps) endif() diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index cbeb263b5f41b9..730efa5c646885 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -32,6 +32,7 @@ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu_info.h" #endif +#include "paddle/fluid/platform/npu_info.h" DEFINE_int64( gpu_allocator_retry_time, 10000, @@ -66,6 +67,11 @@ class AllocatorFacadePrivate { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); +#endif +#ifdef PADDLE_WITH_ASCEND_CL + for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { + InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + } #endif break; } @@ -185,6 +191,12 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_ASCEND_CL + void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { + allocators_[p] = std::make_shared(p); + } +#endif + class ZeroSizeAllocator : public Allocator { public: explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 0ada2cafcc16a6..3e88d61783c9e6 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ 
b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -19,7 +19,10 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" @@ -110,6 +113,7 @@ size_t Used(const platform::CPUPlace &place) { return GetCPUBuddyAllocator()->Used(); } +// For kunlun XPU template <> void *Alloc(const platform::XPUPlace &place, size_t size) { #ifdef PADDLE_WITH_XPU @@ -219,6 +223,135 @@ size_t Used(const platform::XPUPlace &place) { #endif } +// For Ascend NPU +#ifdef PADDLE_WITH_ASCEND_CL +class NPUBuddyAllocatorList { + private: + NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) { + auto npu_num = devices_.size(); + allocators_.resize(npu_num); + init_flags_.reserve(npu_num); + for (size_t i = 0; i < npu_num; ++i) { + init_flags_.emplace_back(new std::once_flag()); + } + } + + static NPUBuddyAllocatorList *CreateNewInstance() { + return new NPUBuddyAllocatorList(); + } + + public: + static NPUBuddyAllocatorList *Instance() { + static auto *instance = CreateNewInstance(); + return instance; + } + + BuddyAllocator *Get(int npu_id) { + auto pos = std::distance( + devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id)); + PADDLE_ENFORCE_LT(pos, devices_.size(), + platform::errors::OutOfRange( + "The index exceeds the size of devices, the size of " + "devices is %d, the index is %d", + devices_.size(), pos)); + + std::call_once(*init_flags_[pos], [this, pos] { + platform::SetNPUDeviceId(devices_[pos]); + allocators_[pos].reset(new BuddyAllocator( + std::unique_ptr( + new detail::NPUAllocator(devices_[pos])), + platform::NPUMinChunkSize(), platform::NPUMaxChunkSize())); + VLOG(10) << "\n\nNOTE:\n" + << "You can set GFlags environment variable " + << "'FLAGS_fraction_of_gpu_memory_to_use' " + << "or 'FLAGS_initial_gpu_memory_in_mb' " + << "or 'FLAGS_reallocate_gpu_memory_in_mb' " + << "to change the memory size for GPU usage.\n" + << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is " + << FLAGS_fraction_of_gpu_memory_to_use + << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is " + << FLAGS_initial_gpu_memory_in_mb + << ". 
Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " + << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; + }); + + return allocators_[pos].get(); + } + + private: + std::vector devices_; + std::vector> init_flags_; + std::vector> allocators_; +}; + +BuddyAllocator *GetNPUBuddyAllocator(int npu_id) { + return NPUBuddyAllocatorList::Instance()->Get(npu_id); +} +#endif + +template <> +size_t Used(const platform::NPUPlace &place) { +#ifdef PADDLE_WITH_ASCEND_CL + return GetNPUBuddyAllocator(place.device)->Used(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void *Alloc(const platform::NPUPlace &place, size_t size) { +#ifdef PADDLE_WITH_ASCEND_CL + auto *buddy_allocator = GetNPUBuddyAllocator(place.device); + auto *ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + platform::NPUDeviceGuard(place.device); + size_t avail, total; + platform::NPUMemoryUsage(&avail, &total); + PADDLE_THROW(platform::errors::ResourceExhausted( + "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize " + "%s, GpuMaxChunkSize %s, GPU memory used: %s.", + string::HumanReadableSize(size), place.device, + string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), + string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), + string::HumanReadableSize(Used(place)))); + } else { + if (FLAGS_init_allocated_mem) { + aclrtMemset(ptr, size, 0xEF, size); + } + } + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + return ptr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void Free(const platform::NPUPlace &place, void *p, + size_t size) { +#ifdef PADDLE_WITH_ASCEND_CL + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetNPUBuddyAllocator(place.device)->Free(p); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +uint64_t Release(const platform::NPUPlace &place) { +#ifdef PADDLE_WITH_ASCEND_CL + return GetNPUBuddyAllocator(place.device)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +// For CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUBuddyAllocatorList { private: diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc index 37da748ee9c965..1fe85dd699acf1 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc @@ -61,6 +61,22 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) { } #endif +#ifdef PADDLE_WITH_ASCEND_CL +TEST(NaiveBestFitAllocatorTest, NpuAlloc) { + NaiveBestFitAllocator alloc{platform::NPUPlace(0)}; + { + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + } + sleep(10); + alloc.Release(platform::NPUPlace(0)); + + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + alloc.Release(platform::NPUPlace(0)); +} +#endif + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc new file mode 100644 index 00000000000000..faf7ae6221caaf --- /dev/null +++ 
b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/npu_allocator.h" +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/npu_info.h" + +namespace paddle { +namespace memory { +namespace allocation { + +bool NPUAllocator::IsAllocThreadSafe() const { return true; } +void NPUAllocator::FreeImpl(Allocation* allocation) { + PADDLE_ENFORCE_EQ( + BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_, + platform::errors::PermissionDenied( + "NPU memory is freed in incorrect device. This may be a bug")); + platform::RecordedNPUFree(allocation->ptr(), allocation->size(), + place_.device); + delete allocation; +} + +Allocation* NPUAllocator::AllocateImpl(size_t size) { + std::call_once(once_flag_, + [this] { platform::SetNPUDeviceId(place_.device); }); + + void* ptr; + auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device); + if (LIKELY(result == ACL_ERROR_NONE)) { + return new Allocation(ptr, size, platform::Place(place_)); + } + + size_t avail, total, actual_avail, actual_total; + bool is_limited = platform::RecordedNPUMemGetInfo( + &avail, &total, &actual_avail, &actual_total, place_.device); + + std::string err_msg; + if (is_limited) { + auto limit_size = (total >> 20); + err_msg = string::Sprintf( + "Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger " + "value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum " + "GPU memory usage is limited to %d MB.\n" + " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", + limit_size, limit_size); + } + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on NPU %d. " + "Cannot allocate %s memory on NPU %d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using NPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n" + "2. If no, please decrease the batch size of your model. %s\n\n", + place_.device, string::HumanReadableSize(size), place_.device, + string::HumanReadableSize(avail), place_.device, err_msg)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h new file mode 100644 index 00000000000000..bf668973505bab --- /dev/null +++ b/paddle/fluid/memory/allocation/npu_allocator.h @@ -0,0 +1,41 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include // NOLINT +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class NPUAllocator : public Allocator { + public: + explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {} + + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(Allocation* allocation) override; + Allocation* AllocateImpl(size_t size) override; + + private: + platform::NPUPlace place_; + std::once_flag once_flag_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt index fcae741db3667f..e9631ee739b9b8 100644 --- a/paddle/fluid/memory/detail/CMakeLists.txt +++ b/paddle/fluid/memory/detail/CMakeLists.txt @@ -6,6 +6,8 @@ if(WITH_GPU) nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place) elseif(WITH_ROCM) hip_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place) +elseif(${WITH_ASCEND_CL}) + cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info npu_info place) else() cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place) endif() diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 50c0b58f3a1dd6..55436f451a41ff 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -21,6 +21,9 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif +#ifdef PADDLE_WITH_ASCEND_CL +DECLARE_uint64(reallocate_gpu_memory_in_mb); +#endif namespace paddle { namespace memory { @@ -235,6 +238,21 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( } } #endif +#ifdef PADDLE_WITH_ASCEND_CL + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the allocation size for gpu for the first allocation. + allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes); + } else { + // Compute the re-allocation size, we store the re-allocation size when + // user set FLAGS_reallocate_gpu_memory_in_mb to fix value. + if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) { + realloc_size_ = platform::NPUReallocSize(); + } + allocate_bytes = std::max(realloc_size_, request_bytes); + } + } +#endif // Allocate a new block void* p = system_allocator_->Alloc(&index, allocate_bytes); diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 15e93deffccda8..135c3b6d04f346 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc index 2dc3e73af24162..290f3d5d1bcd47 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -19,14 +19,16 @@ limitations under the License. */ #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif +#include +#include + #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#include - +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_ASCEND_CL) DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb); @@ -342,6 +344,32 @@ TEST(BuddyAllocator, Release) { } #endif +#ifdef PADDLE_WITH_ASCEND_CL +TEST(BuddyAllocator, NpuFraction) { + // In a 16 GB machine, the pool size will be about 160 MB + FLAGS_fraction_of_gpu_memory_to_use = 0.005; + FLAGS_fraction_of_gpu_memory_to_use = 0.92; + FLAGS_initial_gpu_memory_in_mb = 0; + FLAGS_reallocate_gpu_memory_in_mb = 0; + + BuddyAllocator buddy_allocator( + std::unique_ptr(new NPUAllocator(0)), + platform::NPUMinChunkSize(), platform::NPUMaxChunkSize()); + + // Less than pool size + TestBuddyAllocator(&buddy_allocator, 10); + TestBuddyAllocator(&buddy_allocator, 10 << 10); + TestBuddyAllocator(&buddy_allocator, 10 << 20); + buddy_allocator.Release(); + + // Greater than max chunk size + TestBuddyAllocator(&buddy_allocator, 300 << 20, + /* use_system_allocator = */ true); + TestBuddyAllocator(&buddy_allocator, 1 * static_cast(1 << 30), + /* use_system_allocator = */ true); +} +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 38baf6c24bab3f..c733ba5c68c9bd 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -29,6 +29,8 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -247,6 +249,68 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; } #endif +#ifdef PADDLE_WITH_ASCEND_CL +void* NPUAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + void* p; + auto result = platform::RecordedNPUMalloc(&p, size, npu_id_); + + if (result == ACL_ERROR_NONE) { + *index = 0; + npu_alloc_size_ += size; + return p; + } else { + size_t avail, total, actual_avail, actual_total; + bool is_limited = platform::RecordedNPUMemGetInfo( + &avail, &total, &actual_avail, &actual_total, npu_id_); + + std::string err_msg; + if (is_limited) { + auto limit_size = (total >> 20); + err_msg = string::Sprintf( + "\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a " + "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " + "maximum GPU memory usage is limited to %d MB.\n" + " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", + limit_size, limit_size); + } + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on NPU %d. " + "Cannot allocate %s memory on NPU %d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using NPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n" + "2. If no, please try one of the following suggestions:\n" + " 1) Decrease the batch size of your model.\n" + " 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, " + "please set it to a higher value but less than 1.0.\n" + " The command is " + "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", + npu_id_, string::HumanReadableSize(size), npu_id_, + string::HumanReadableSize(avail), npu_id_, + FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + } +} + +void NPUAllocator::Free(void* p, size_t size, size_t index) { + VLOG(4) << "Free " << p << " size " << size; + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(npu_alloc_size_, size, + platform::errors::InvalidArgument( + "The size of memory (%d) to free exceeds the size of " + "allocated gpu memory (%d)", + size, npu_alloc_size_)); + npu_alloc_size_ -= size; + + platform::RecordedNPUFree(p, size, npu_id_); +} + +bool NPUAllocator::UseGpu() const { return true; } +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index e332bb670da235..26711ae4070f5e 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -66,6 +66,22 @@ class CUDAPinnedAllocator : public SystemAllocator { }; #endif +#ifdef PADDLE_WITH_ASCEND_CL + +class NPUAllocator : public SystemAllocator { + public: + explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {} + + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t npu_alloc_size_ = 0; + int npu_id_; +}; +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc index 13854d771a0bf6..ead188341dac46 100644 --- a/paddle/fluid/memory/detail/system_allocator_test.cc +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -85,3 +85,11 @@ TEST(GPUAllocator, AllocFailure) { } } #endif + +#ifdef PADDLE_WITH_ASCEND_CL +TEST(NPUAllocator, Alloc) { + paddle::memory::detail::NPUAllocator a(0); + TestAllocator(&a, 1 << 20); + TestAllocator(&a, 1); +} +#endif diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 7f871fab5a1470..1eb0535831bb19 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -40,7 +40,7 @@ void Copy(platform::XPUPlace dst_place, platform::CPUPlace src_place, const void* src, size_t num) { if (num <= 0) { - VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; + VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; return; } int dev_id = -1; @@ -86,7 +86,7 @@ void Copy(platform::CPUPlace dst_place, platform::XPUPlace src_place, const void* src, size_t num) { if (num <= 0) { - VLOG(0) << "memcpy 
XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; + VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; return; } int dev_id = -1; @@ -132,7 +132,7 @@ void Copy(platform::XPUPlace dst_place, platform::XPUPlace src_place, const void* src, size_t num) { if (num <= 0) { - VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; + VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; return; } int dev_id = -1; @@ -196,6 +196,101 @@ void Copy(platform::XPUPlace dst_place, } #endif +#ifdef PADDLE_WITH_ASCEND_CL +template <> +void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + platform::SetNPUDeviceId(dst_place.device); + + // NOTE(ascendrc): NPU memcpy async from host to device is a "real" async, + // which is different from CUDA. In Paddle, when async is called, "sync" + // is actually run, which means Paddle doesn't fully support async yet. + // TODO(ascendrc): Support NPU memcpy async for better performance. + stream = nullptr; + + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by stream(" << stream << ")"; + + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + } else { + platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); + } +} + +template <> +void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place, + void* dst, + platform::NPUPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + platform::SetNPUDeviceId(src_place.device); + + // NOTE(ascendrc): NPU memcpy async from device to host is a "real" async, + // which is different from CUDA. In Paddle, when async is called, "sync" + // is actually run, which means Paddle doesn't fully support async yet. + // TODO(ascendrc): Support NPU memcpy async for better performance. + stream = nullptr; + + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by stream(" << stream << ")"; + + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + } else { + platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); + } +} + +template <> +void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place, + void* dst, + platform::NPUPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by stream(" << stream << ")"; + if (dst_place == src_place) { + platform::SetNPUDeviceId(src_place.device); + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + stream); + } else { + platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); + } + } else { + if (!platform::NPUCanAccessPeer(dst_place.device, src_place.device)) { + PADDLE_THROW(platform::errors::Unavailable( + "Peer access between NPU places is not allowed.")); + } + if (stream) { + // TODO(zhiqiu): support peer access? 
+ platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + stream); + } else { + platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); + } + } +} +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h index 25490f28b65987..c630437224cd09 100644 --- a/paddle/fluid/memory/memcpy.h +++ b/paddle/fluid/memory/memcpy.h @@ -52,7 +52,27 @@ void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, gpuStream_t stream); +#endif +#ifdef PADDLE_WITH_ASCEND_CL +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU or NPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU or NPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * \param[in] stream NPU stream. + * + * \note For NPU memory copy, NPU stream need to be specified + * for asynchronously memory copy. + * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, + aclrtStream stream); #endif + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 467a5ff9063a65..cecc70cc6dda8e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -10,6 +10,7 @@ file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operators/CMakeLists copy_if_different(${pybind_file} ${pybind_file_final}) add_subdirectory(math) +add_subdirectory(eigen) add_subdirectory(controlflow) add_subdirectory(detection) add_subdirectory(elementwise) @@ -68,7 +69,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -register_operators(EXCLUDES py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op +register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) @@ -110,8 +111,9 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_fun set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_cc_function) if (WITH_GPU OR WITH_ROCM) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor eigen_cu_function) endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer) @@ -121,6 +123,12 @@ if (WITH_ASCEND) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} ascend_wrapper) endif() +if (WITH_ASCEND_CL) + cc_test(assign_op_npu_test SRCS assign_op_npu_test.cc DEPS assign_op) + cc_library(npu_op_runner SRCS npu_op_runner.cc DEPS operator npu_info) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} npu_op_runner) 
+endif() + # FIXME(typhoonzero): operator deps may not needed. # op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) # op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) @@ -134,8 +142,8 @@ set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS}) set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") cc_test(test_common_infer_shape_functions SRCS test_common_infer_shape_functions.cc DEPS common_infer_shape_functions ${COMMON_OP_DEPS} activation_op elementwise_add_op softmax_op softmax) -cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(gather_test SRCS gather_test.cc DEPS tensor) +cc_test(assign_op_test SRCS assign_op_test.cc DEPS assign_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) @@ -154,12 +162,22 @@ endif() cc_library(tensor_formatter SRCS tensor_formatter.cc DEPS ${OP_HEADER_DEPS}) if (WITH_PYTHON) cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind) + cc_library(py_layer_op SRCS py_layer_op.cc DEPS op_registry python pybind) +endif() + +if (WITH_ASCEND_CL) + cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor) + cc_test(lookup_table_v2_op_npu_test SRCS lookup_table_v2_op_npu_test.cc DEPS op_registry lookup_table_v2_op scope device_context enforce executor compare_op) endif() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") add_subdirectory(benchmark) cc_test(op_debug_string_test SRCS op_debug_string_test.cc DEPS elementwise_add_op) +if (WITH_ASCEND_CL) + cc_test(transpose_op_npu_test SRCS transpose_op_npu_test.cc DEPS op_registry transpose_op scope device_context enforce executor) +endif() + if(WITH_MKLDNN) include(mkldnn/inplace_op_tests.cmake) @@ -173,3 +191,7 @@ if(WITH_UNITY_BUILD) # The specified link dependency needs to be displayed here. 
target_link_libraries(paddle_operators_unity ${OP_HEADER_DEPS} ${COMMON_OP_DEPS}) endif() + +if(WITH_ASCEND_CL) +cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 94f2eb3672bd5d..1cac9ed9f1dd08 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -782,6 +782,26 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { } }; +template +class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("tanh_grad_grad"); + // input1: Out + op->SetInput("Out", this->Input("Out")); + // input2: ddx + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + op->SetAttrMap(this->Attrs()); + // output: ddy + op->SetOutput("DOutNew", this->InputGrad("Out")); + op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } +}; + // ReluGrad: dx = dy if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0 template @@ -1041,6 +1061,34 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +/* ========================== tanh register ============================= */ +REGISTER_OPERATOR( + tanh, ops::ActivationOp, ops::TanhOpMaker, ops::ActivationOpInferVarType, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::framework::OpDesc>, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::imperative::OpBase>, + std::conditional>(), + ops::ActFwdInplaceInferer, void>::type); +REGISTER_OPERATOR(tanh_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInferer, + ops::TanhDoubleGradMaker, + ops::TanhDoubleGradMaker) +REGISTER_OPERATOR( + tanh_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer); + +REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); +REGISTER_OP_CPU_KERNEL( + tanh_grad_grad, ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>); +/* ========================================================================== */ + /* ========================== relu register ============================= */ REGISTER_OPERATOR( relu, ops::ActivationOp, ops::ReluOpMaker, ops::ActivationOpInferVarType, diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index c6d2fbccd8e84b..781a97c1ffcc17 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -42,6 +42,10 @@ template class BaseGPUFunctor { public: using ELEMENT_TYPE = T; + + using AttrPair = std::vector>; + + AttrPair GetAttrs() { return AttrPair(); } }; /* ========================================================================== */ @@ -57,42 +61,35 @@ class ReluGPUFunctor : public BaseGPUFunctor { // for relu forward when T is double __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type* x); + const typename CudaVecType::type in) { + // relu forward : out = max(x, 0) + return in > zero_ ? in : zero_; + } // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T x) { - return x > zero_ ? 
x : zero_; + __device__ __forceinline__ T ComputeRemainder(const T in) { + // relu forward : out = max(x, 0) + return in > zero_ ? in : zero_; } }; -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type* x) { -// relu forward : out = max(x, 0) -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 - return __ldg(x) > zero_ ? __ldg(x) : zero_; -#else - return (*x) > zero_ ? (*x) : zero_; -#endif -} - template <> __device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type* xx) { - // relu forward : out = max(xx, 0) - return make_float4((xx->x > zero_) * (xx->x), (xx->y > zero_) * (xx->y), - (xx->z > zero_) * (xx->z), (xx->w > zero_) * (xx->w)); +ReluGPUFunctor::Compute(const CudaVecType::type in) { + // relu forward : out = max(in, 0) + return make_float4((in.x > zero_) * (in.x), (in.y > zero_) * (in.y), + (in.z > zero_) * (in.z), (in.w > zero_) * (in.w)); } template <> __device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type* in) { +ReluGPUFunctor::Compute(const CudaVecType::type in) { // relu forward : out = max(in, 0) #ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(__ldg(in), kzero), __ldg(in)); + return __hmul2(__hgt2(in, kzero), in); #else - const float2 xx = __half22float2(*in); + const float2 xx = __half22float2(in); return __floats2half2_rn((xx.x > 0.0f) * static_cast(xx.x), (xx.y > 0.0f) * static_cast(xx.y)); #endif @@ -112,8 +109,10 @@ class ReluGradGPUFunctor : public BaseGPUFunctor { // for relu backward when T is double __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type* out, - const typename CudaVecType::type* dout); + const typename CudaVecType::type out, + const typename CudaVecType::type dout) { + return out > zero_ ? dout : zero_; + } // when num % vecsize != 0 this func will be used __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) { @@ -124,44 +123,132 @@ class ReluGradGPUFunctor : public BaseGPUFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { -// relu backward : dx = out > 0 ? dout : 0; -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 - return __ldg(out) > zero_ ? __ldg(dout) : zero_; -#else - return (*out) > zero_ ? (*dout) : zero_; -#endif -} - template <> __device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { +ReluGradGPUFunctor::Compute(const CudaVecType::type out, + const CudaVecType::type dout) { // relu backward : dx = out > 0 ? dout : 0; - return make_float4((out->x > zero_) * (dout->x), (out->y > zero_) * (dout->y), - (out->z > zero_) * (dout->z), - (out->w > zero_) * (dout->w)); + return make_float4((out.x > zero_) * (dout.x), (out.y > zero_) * (dout.y), + (out.z > zero_) * (dout.z), (out.w > zero_) * (dout.w)); } template <> __device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { +ReluGradGPUFunctor::Compute(const CudaVecType::type out, + const CudaVecType::type dout) { // relu backward : dx = out > 0 ? 
dout : 0; #ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(__ldg(out), kzero), __ldg(dout)); + return __hmul2(__hgt2(out, kzero), dout); #else - const float2 xx = __half22float2(*out); - const float2 yy = __half22float2(*dout); + const float2 xx = __half22float2(out); + const float2 yy = __half22float2(dout); return __floats2half2_rn((xx.x > 0.0f) * static_cast(yy.x), (xx.y > 0.0f) * static_cast(yy.y)); #endif } +/* ========================================================================== */ +/* ======================== leaky relu forward ======================== + */ +template +class LeakyReluGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + float alpha_; + + public: + LeakyReluGPUFunctor() { zero_ = static_cast(0.0f); } + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha_}}; + } + // leakyrelu forward : out = x > 0 ? x : x * alpha + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type in) { + return in > zero_ ? in : static_cast(alpha_) * in; + } + + __device__ __forceinline__ T ComputeRemainder(const T in) { + // leakyrelu forward : out = x > 0 ? x : x * alpha + return in > zero_ ? in : static_cast(alpha_) * in; + } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +LeakyReluGPUFunctor::Compute(const CudaVecType::type in) { + // leakyrelu forward : out = x > 0 ? x : x * alpha + return make_float4((in.x > zero_) ? (in.x) : (in.x) * alpha_, + (in.y > zero_) ? (in.y) : (in.y) * alpha_, + (in.z > zero_) ? (in.z) : (in.z) * alpha_, + (in.w > zero_) ? (in.w) : (in.w) * alpha_); +} + +template <> +__device__ __forceinline__ CudaVecType::type +LeakyReluGPUFunctor::Compute(const CudaVecType::type in) { + // leakyrelu forward : out = x > 0 ? x : x * alpha + const float2 xx = __half22float2(in); + return __floats2half2_rn((xx.x > 0.0f) ? xx.x : xx.x * alpha_, + (xx.y > 0.0f) ? xx.y : xx.y * alpha_); +} +/* ========================================================================== */ + +/* =========================== leaky relu backward ======================= + */ +template +class LeakyReluGradGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + float alpha_; + + public: + LeakyReluGradGPUFunctor() { zero_ = static_cast(0.0f); } + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha_}}; + } + + // for leaky relu backward when T is double + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type in, + const typename CudaVecType::type dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + return in > zero_ ? dout : static_cast(alpha_) * dout; + } + + // when num % vecsize != 0 this func will be used + __device__ __forceinline__ T ComputeRemainder(const T in, const T dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + return in > zero_ ? dout : static_cast(alpha_) * dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +LeakyReluGradGPUFunctor::Compute(const CudaVecType::type in, + const CudaVecType::type dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + return make_float4((in.x > zero_) ? (dout.x) : alpha_ * (dout.x), + (in.y > zero_) ? (dout.y) : alpha_ * (dout.y), + (in.z > zero_) ? (dout.z) : alpha_ * (dout.z), + (in.w > zero_) ? 
(dout.w) : alpha_ * (dout.w)); +} + +template <> +__device__ __forceinline__ CudaVecType::type LeakyReluGradGPUFunctor< + float16>::Compute(const CudaVecType::type in, + const CudaVecType::type dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + const float2 xx = __half22float2(in); + const float2 yy = __half22float2(dout); + return __floats2half2_rn((xx.x > 0.0f) ? yy.x : alpha_ * yy.x, + (xx.y > 0.0f) ? yy.y : alpha_ * yy.y); +} + /* ========================================================================== */ template @@ -176,14 +263,23 @@ __global__ void ActivationGradKernelVec(const T* forward_data, const T* dout, const VecType* in_forward = reinterpret_cast(forward_data); const VecType* in_dout = reinterpret_cast(dout); VecType* out = reinterpret_cast(dx); - + VecType forward_vec, dout_vec; + T in_data, dout_data; for (int i = idx; i < loop; i += stride) { - out[i] = functor.Compute((in_forward + i), (in_dout + i)); +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 + forward_vec = __ldg(in_forward + i); + dout_vec = __ldg(in_dout + i); +#else + forward_vec = in_forward[i]; + dout_vec = in_dout[i]; +#endif + out[i] = functor.Compute(forward_vec, dout_vec); } while (idx == loop && tail) { - dx[num - tail] = - functor.ComputeRemainder(forward_data[num - tail], dout[num - tail]); + in_data = forward_data[num - tail]; + dout_data = dout[num - tail]; + dx[num - tail] = functor.ComputeRemainder(in_data, dout_data); --tail; } } @@ -199,9 +295,14 @@ __global__ void ActivationkernelVec(const T* src, T* dst, int num, int tail = num % vecsize; const VecType* in = reinterpret_cast(src); VecType* out = reinterpret_cast(dst); - + VecType x_vec; for (int i = idx; i < loop; i += stride) { - out[i] = functor.Compute((in + i)); +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 + x_vec = __ldg(in + i); +#else + x_vec = in[i]; +#endif + out[i] = functor.Compute(x_vec); } while (idx == loop && tail) { @@ -231,6 +332,10 @@ class ActivationGPUKernel block = 256; #endif Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } constexpr int vecsize = CudaVecType::vecsize; int grid = max((num / vecsize + block - 1) / block, 1); auto stream = context.cuda_device_context().stream(); @@ -270,7 +375,12 @@ class ActivationGradGPUKernel #ifdef __HIPCC__ block = 256; #endif + Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } constexpr int vecsize = CudaVecType::vecsize; int grid = max((numel / vecsize + block - 1) / block, 1); auto stream = context.cuda_device_context().stream(); @@ -300,12 +410,28 @@ namespace plat = paddle::platform; ops::grad_functor>, \ ops::ActivationGradKernel>); - FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL); +#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + act_type, ops::ActivationGPUKernel>, \ + ops::ActivationGPUKernel>, \ + ops::ActivationGPUKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, ops::ActivationGradGPUKernel>, \ + ops::ActivationGradGPUKernel>, \ + ops::ActivationGradGPUKernel>); + /* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, - LeakyReluGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluGPUFunctor, + LeakyReluGradGPUFunctor); REGISTER_OP_CUDA_KERNEL( leaky_relu_grad_grad, @@ -330,21 +456,7 @@ 
REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== relu register ============================ */ -REGISTER_OP_CUDA_KERNEL( - relu, ops::ActivationGPUKernel>, - ops::ActivationGPUKernel>, - ops::ActivationGPUKernel>); - -REGISTER_OP_CUDA_KERNEL( - relu_grad, ops::ActivationGradGPUKernel>, - ops::ActivationGradGPUKernel>, - ops::ActivationGradGPUKernel>); +REGISTER_ACTIVATION_GPU_KERNEL(relu, Relu, ReluGPUFunctor, ReluGradGPUFunctor); REGISTER_OP_CUDA_KERNEL( relu_grad_grad, @@ -356,6 +468,19 @@ REGISTER_OP_CUDA_KERNEL( ops::ReluGradGradFunctor>); /* ========================================================================== */ +/* =========================== tanh register ============================ */ +REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + tanh_grad_grad, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>, + ops::TanhDoubleGradKernel>); +/* ========================================================================== */ + /* =========================== sqrt register ============================= */ REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index bc7def61b2e249..fb9f956f17c0b1 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -366,6 +366,36 @@ struct TanhGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +template +struct TanhGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* Out, + const framework::Tensor* ddX, const framework::Tensor* dOut, + framework::Tensor* dOutNew, framework::Tensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); + // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out + // * ddx) + if (dOutNew) { + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); + auto dout_new = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SquareGradGrad")); + dout_new.device(*d) = + static_cast(-1) * dout * static_cast(2) * out * ddx; + } + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SquareGradGrad")); + ddout.device(*d) = (static_cast(1) - out * out) * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + // tanhshrink(x) = x - tanh(x) // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -400,7 +430,7 @@ struct HardShrinkFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - out.device(d) = x * (temp1 + temp2 > 0).template cast(); + out.device(d) = x * (temp1 + temp2).template cast(); } }; @@ -417,7 +447,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - dx.device(d) = dout * (temp1 + temp2 > 0).template cast(); + dx.device(d) 
= dout * (temp1 + temp2).template cast<T>(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -1734,6 +1764,58 @@ inline void ExtractDoubleGradTensorWithInputDOut( } } +template <typename DeviceContext, typename Functor> +class TanhDoubleGradKernel + : public framework::OpKernel<typename Functor::ELEMENT_TYPE> { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *Out, *ddX, *dOut; + framework::Tensor *dOutNew, *ddOut; + Out = ddX = dOut = nullptr; + dOutNew = ddOut = nullptr; + + // extract ddx(input) and out(input) + auto ddx_var = ctx.InputVar("DDX"); + auto out_var = ctx.InputVar("Out"); + PADDLE_ENFORCE_NOT_NULL( + ddx_var, platform::errors::NotFound( + "Cannot get input Variable ddx, variable name = %s", + ctx.InputName("DDX"))); + PADDLE_ENFORCE_NOT_NULL( + out_var, platform::errors::NotFound( + "Cannot get input Variable out, variable name = %s", + ctx.InputName("Out"))); + ddX = ctx.Input<framework::Tensor>("DDX"); + Out = ctx.Input<framework::Tensor>("Out"); + + // set output ddout + auto ddout_var = ctx.OutputVar("DDOut"); + if (ddout_var) { + ddOut = ctx.Output<framework::Tensor>("DDOut"); + } + + // extract dOut(input) + auto dout_var = ctx.InputVar("DOut"); + PADDLE_ENFORCE_NOT_NULL( + dout_var, platform::errors::NotFound( + "Cannot get input Variable dout_var, variable name = %s", + ctx.InputName("DOut"))); + dOut = ctx.Input<framework::Tensor>("DOut"); + + // set output dout_new + auto dout_new_var = ctx.OutputVar("DOutNew"); + if (dout_new_var) { + dOutNew = ctx.Output<framework::Tensor>("DOutNew"); + } + + if (dOutNew) dOutNew->mutable_data<T>(Out->dims(), ctx.GetPlace()); + if (ddOut) ddOut->mutable_data<T>(Out->dims(), ctx.GetPlace()); + auto& place = ctx.template device_context<DeviceContext>(); + Functor functor; + functor(place, Out, ddX, dOut, dOutNew, ddOut); + } +}; template class SquareDoubleGradKernel : public framework::OpKernel { @@ -2048,7 +2130,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(tanh, Tanh, TanhFunctor, TanhGradFunctor); \ __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc new file mode 100644 index 00000000000000..923b581af287d1 --- /dev/null +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -0,0 +1,368 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class PowNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto factor = ctx.Attr("factor"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Power", {*x}, {*out}, + {{"power", factor}, + {"scale", static_cast(1.0)}, + {"shift", static_cast(0.0)}}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class PowGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto factor = ctx.Attr("factor"); + + auto x_dims = x->dims(); + + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + + // NOTE(liym27): dx = dout * factor * x.pow(factor-1) + + // Step1: Compute x_pow = x.pow(factor-1) + Tensor x_pow(x->type()); + x_pow.mutable_data(x->dims(), place); + auto runner_pow = NpuOpRunner("Power", {*x}, {x_pow}, + {{"power", factor - static_cast(1)}}); + runner_pow.Run(stream); + + // Step 2: Construct a broadcast factor, which has the same shape with x. + + // 2.1 Get a factor tensor with shape [1]. + Tensor factor_tensor(framework::proto::VarType::FP32); + factor_tensor.mutable_data({1}, place); + TensorFromVector(std::vector{factor}, ctx.device_context(), + &factor_tensor); + + // 2.2 Get the factor which has the shape with x and the same value with + // factor. 
+ Tensor factor_bc_tensor(framework::proto::VarType::FP32); + factor_bc_tensor.mutable_data(x_dims, place); + auto runner_bc = NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor}, + {{"dims", framework::vectorize(x_dims)}}); + runner_bc.Run(stream); + + // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) + Tensor x_power_mul_factor(x->type()); + x_power_mul_factor.mutable_data(x->dims(), place); + auto runner_mul_1 = + NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); + runner_mul_1.Run(stream); + + // Step 4: Compute dx = dout * factor * x.pow(factor-1) + dx->mutable_data(place); + auto runner_mul_2 = + NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); + runner_mul_2.Run(stream); + } +}; + +template +class ReluNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Relu", + { + *x, + }, + {*out}, {}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class ReluGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto stream = + ctx.template device_context() + .stream(); + + dx->mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); + + runner.Run(stream); + } +}; + +template +class SqrtNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class SqrtGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto dx_runner = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +template +class LogNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + Tensor one(x->type()); + one.mutable_data(x->dims(), place); + auto one_runner = NpuOpRunner("OnesLike", {*x}, {one}, {}); + one_runner.Run(stream); + + Tensor sub(x->type()); + sub.mutable_data(x->dims(), place); + auto sub_runner = NpuOpRunner("Sub", {*x, one}, {sub}, {}); + sub_runner.Run(stream); + + auto out_runner = NpuOpRunner("Log1p", {sub}, {*out}, {}); + out_runner.Run(stream); + } +}; + +template +class LogGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = 
ctx.Input("X"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); + runner.Run(stream); + } +}; + +template +class TanhNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class TanhGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + + dx->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto dx_runner = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); + dx_runner.Run(stream); + } +}; + +template +class SquareNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("Square", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + pow, ops::PowNPUKernel, + ops::PowNPUKernel); + +REGISTER_OP_NPU_KERNEL( + pow_grad, ops::PowGradNPUKernel, + ops::PowGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu, ops::ReluNPUKernel, + ops::ReluNPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu_grad, + ops::ReluGradNPUKernel, + ops::ReluGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sqrt, ops::SqrtNPUKernel, + ops::SqrtNPUKernel); + +REGISTER_OP_NPU_KERNEL( + sqrt_grad, + ops::SqrtGradNPUKernel, + ops::SqrtGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + log, ops::LogNPUKernel, + ops::LogNPUKernel); + +REGISTER_OP_NPU_KERNEL( + log_grad, ops::LogGradNPUKernel, + ops::LogGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + tanh, ops::TanhNPUKernel, + ops::TanhNPUKernel); + +REGISTER_OP_NPU_KERNEL( + tanh_grad, + ops::TanhGradNPUKernel, + ops::TanhGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + square, ops::SquareNPUKernel, + ops::SquareNPUKernel, + ops::SquareNPUKernel); diff --git a/paddle/fluid/operators/addmm_op.h b/paddle/fluid/operators/addmm_op.h index 97e3ed9c1adda0..ecfd10d2fa6fbd 100644 --- a/paddle/fluid/operators/addmm_op.h +++ b/paddle/fluid/operators/addmm_op.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" @@ -32,8 +33,8 @@ template using EigenTensor = framework::EigenTensor; -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; using Tensor = framework::Tensor; @@ -105,7 +106,8 @@ class AddMMKernel : public framework::OpKernel { auto eigen_out = EigenTensor::From(*out); auto& place = *context.template device_context().eigen_device(); - eigen_out.device(place) = eigen_input.broadcast(bcast_dims); + EigenBroadcast, T, 2>::Eval( + place, eigen_out, eigen_input, bcast_dims); blas.GEMM(false, false, x_dims[0], y_dims[1], x_dims[1], alpha, x->data(), x_dims[1], y->data(), y_dims[1], beta, diff --git a/paddle/fluid/operators/amp/CMakeLists.txt b/paddle/fluid/operators/amp/CMakeLists.txt index b3ff52a7ae119d..2ea8bbcbc61df8 100644 --- a/paddle/fluid/operators/amp/CMakeLists.txt +++ b/paddle/fluid/operators/amp/CMakeLists.txt @@ -4,3 +4,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() register_operators() + +if(WITH_ASCEND_CL) + cc_test(check_finite_and_unscale_op_npu_test SRCS check_finite_and_unscale_op_npu_test.cc DEPS op_registry check_finite_and_unscale_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 6840e4847c4c64..2c3a9c366e4fd0 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -26,18 +26,48 @@ __global__ void InverseAndMemset(const T* s, T* o, bool* found_inf) { } template -__global__ void CheckFiniteAndUnscale(const T* in, const MT* scale, int num, - bool* found_inf, T* out) { - const int idx = threadIdx.x + blockIdx.x * blockDim.x; - - if (idx < num) { - MT val = static_cast(in[idx]) * (*scale); +__global__ void CheckFiniteAndUnscale(const T** xs, const MT* scale, + int64_t size, int64_t* starts, + bool* found_inf, T** outs) { + const int64_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t num = s_starts[size]; + int pre_xs_index = 0; + bool t_found_inf = false; + const MT t_scale = *scale; + for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) { + // get the xs's index of thread + int xs_index = pre_xs_index; + while (idx < s_starts[xs_index]) xs_index++; + // avoid some tensor's numel is zero + while (idx >= s_starts[xs_index]) xs_index++; + pre_xs_index = xs_index - 1; + + // get in data and out data + const T* in = xs[pre_xs_index]; + T* out = outs[pre_xs_index]; + int64_t in_idx = idx - s_starts[pre_xs_index]; + + // Unscale + MT val = static_cast(in[in_idx]) * t_scale; T narrow_val = static_cast(val); - out[idx] = narrow_val; + out[in_idx] = narrow_val; + + // CheckFinite if (!isfinite(narrow_val)) { - *found_inf = true; + t_found_inf = true; } } + if (t_found_inf) { + *found_inf = true; + } } template @@ -63,20 +93,53 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { InverseAndMemset<<<1, 1, 0, dev_ctx.stream()>>>( scale_data, 
inverse_scale_v, found_inf_data); - for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - const T* x_data = x->data(); - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - - int num = x->numel(); - int block = 1024; - int grid = (num + block - 1) / block; - VLOG(3) << "launch kernel"; - CheckFiniteAndUnscale<<>>( - x_data, inverse_scale_v, num, found_inf_data, out_data); - VLOG(3) << "finish kernel"; + size_t xs_size = xs.size(); + // calculate each tensor's start index and copy to device + auto h_starts_tensor = + memory::Alloc(platform::CPUPlace(), (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); + + auto d_starts_tensor = + memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); + + h_starts[0] = 0; + for (int i = 1; i <= xs_size; i++) { + // the start index value of each tensor is + // the sum of previous tensor's size + h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); + } + int64_t total_num = h_starts[xs_size]; + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_starts, platform::CPUPlace(), h_starts, + (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()); + + // copy each tensor's data address to device + auto h_mem = memory::Alloc(platform::CPUPlace(), 2 * xs_size * sizeof(T*)); + const T** h_xs = reinterpret_cast(h_mem->ptr()); + T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; + + auto d_mem = memory::Alloc(dev_ctx, 2 * xs_size * sizeof(T*)); + const T** d_xs = reinterpret_cast(d_mem->ptr()); + T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; + + for (size_t i = 0; i < xs_size; ++i) { + h_xs[i] = xs[i]->data(); + h_outs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs, + platform::CPUPlace(), h_xs, 2 * xs_size * sizeof(T*), + dev_ctx.stream()); + + // Launch Kernel + int block = 1024; + int block_num = block * 20; // each thread deal with 20 number + int grid = (total_num + block_num - 1) / block_num; + VLOG(3) << "launch kernel"; + CheckFiniteAndUnscale<<< + grid, block, (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()>>>( + d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); + VLOG(3) << "finish kernel"; } }; } // namespace operators diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc new file mode 100644 index 00000000000000..46f9f7ff089448 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + found_inf->mutable_data(ctx.GetPlace()); + + bool found_inf_data = false; + + auto stream = + ctx.template device_context() + .stream(); + + // step1: inverse scale(RealDiv) + Tensor const_tensor; + const_tensor.mutable_data({1}, ctx.GetPlace()); + TensorFromVector(std::vector{static_cast(1.0)}, ctx.device_context(), + &const_tensor); + + ctx.template device_context().Wait(); + + // Inverse(1.0/scale) + Tensor* tmp_inverse_out = const_cast(scale); + Tensor inverse_out(scale->type()); + inverse_out.Resize(scale->dims()); + inverse_out.mutable_data(ctx.GetPlace()); + auto runner_inverse = + NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); + runner_inverse.Run(stream); + tmp_inverse_out = &inverse_out; + + size_t x_size = xs.size(); + for (size_t i = 0; i < x_size; ++i) { + found_inf_data = true; + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(ctx.GetPlace()); + + // step2: CheckNumerics + // CheckNumerics runs on the Ascend AI CPU, which delivers poor + // performance. + Tensor check_xout(x->type()); + check_xout.Resize(x->dims()); + check_xout.mutable_data(ctx.GetPlace()); + try { + auto runner_checknumerics = + NpuOpRunner("CheckNumerics", {*x}, {check_xout}, + {{"message", std::string("check_nan_and_inf")}}); + runner_checknumerics.Run(stream); + } catch (platform::EnforceNotMet& exception) { + LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; + found_inf_data = true; + } catch (...) { + LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; + found_inf_data = true; + } + + if (!found_inf_data) { + // MatMul + auto runner_matmul = + NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); + runner_matmul.Run(stream); + } else { + // ZerosLike + auto runner_zeroslike = NpuOpRunner("ZerosLike", {*x}, {*out}, {}); + runner_zeroslike.Run(stream); + } // end if + } // end for + + // set found_inf to true + if (found_inf_data) { + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool* is_found_inf = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + *is_found_inf = true; + framework::TensorCopySync(found_inf_tensor, ctx.GetPlace(), found_inf); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleNPUKernel, + ops::CheckFiniteAndUnscaleNPUKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc new file mode 100644 index 00000000000000..99e81a4757d0e0 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/enforce.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +using Tensor = paddle::framework::Tensor; + +USE_OP(check_finite_and_unscale); +USE_OP_DEVICE_KERNEL(check_finite_and_unscale, NPU); + +struct InputVars { + std::string name; + f::LoDTensor *tensor; +}; + +template +void Compare(f::Scope *scope, const p::DeviceContext &ctx) { + const f::DDim dims = f::make_ddim({2, 2}); + auto place = ctx.GetPlace(); + + // init input + std::vector input_names = { + {"x", scope->Var("x")->GetMutable()}, + {"x1", scope->Var("x1")->GetMutable()}}; + + auto *scale = scope->Var("scale")->GetMutable(); + + // init output + auto *out = scope->Var("out")->GetMutable(); + auto *out1 = scope->Var("out1")->GetMutable(); + auto *found_inf = scope->Var("found_inf")->GetMutable(); + + // Initialize input data + const int num_inputs = input_names.size(); + size_t numel = static_cast(f::product(dims)); + + for (int i = 0; i < num_inputs; ++i) { + std::vector init_xs; + for (size_t j = 0; j < numel; ++j) { + if (j == 0) { + init_xs.push_back(static_cast(NAN)); + } else { + init_xs.push_back(static_cast(j + 1)); + } + } + f::TensorFromVector(init_xs, ctx, input_names[i].tensor); + input_names[i].tensor->Resize(dims); + } + + f::TensorFromVector(std::vector{static_cast(0.5)}, ctx, scale); + + ctx.Wait(); + + // run + f::AttributeMap attrs; + auto op = f::OpRegistry::CreateOp( + "check_finite_and_unscale", {{"X", {"x", "x1"}}, {"Scale", {"scale"}}}, + {{"Out", {"out", "out1"}}, {"FoundInfinite", {"found_inf"}}}, attrs); + op->Run(*scope, place); + ctx.Wait(); + + // out0 + std::vector out_vec; + f::TensorToVector(*out, ctx, &out_vec); + EXPECT_EQ(out_vec.size(), static_cast(4)); + for (size_t j = 0; j < out_vec.size(); ++j) { + VLOG(3) << "out_vec[" << j << "]:" << out_vec[j]; + } + + ctx.Wait(); + + // out0 + std::vector out1_vec; + f::TensorToVector(*out1, ctx, &out1_vec); + EXPECT_EQ(out1_vec.size(), static_cast(4)); + for (size_t j = 0; j < out1_vec.size(); ++j) { + VLOG(3) << "out1_vec[" << j << "]:" << out1_vec[j]; + } + + ctx.Wait(); + + // out found_inf + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool *is_finite_data = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + f::TensorCopy(*found_inf, place, &found_inf_tensor); + EXPECT_FALSE(*is_finite_data); + + ctx.Wait(); +} + +TEST(check_finite_and_unscale, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} + +TEST(check_finite_and_unscale, NPU_fp16) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc new file mode 100644 index 
00000000000000..dd6dbfd5c0b653 --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -0,0 +1,219 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +void Update(const platform::NPUDeviceContext& ctx, + const std::vector found_inf_vec, + const Tensor* pre_loss_scaling_tensor, const Tensor* good_in_tensor, + const Tensor* bad_in_tensor, const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, Tensor* updated_loss_scaling_tensor, + Tensor* good_out_tensor, Tensor* bad_out_tensor) { + auto place = ctx.GetPlace(); + auto stream = ctx.stream(); + if (found_inf_vec[0]) { + // good_out_data = 0 + auto g = good_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + good_out_tensor->numel() * sizeof(int), stream); + // bad_out_data = bad_in_data + 1 + Tensor factor_tensor(bad_out_tensor->type()); + factor_tensor.mutable_data({1}, place); + TensorFromVector(std::vector{1}, ctx, &factor_tensor); + auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, + {*bad_out_tensor}, {}); + runner_p2.Run(stream); + + std::vector bad_out_data; + TensorToVector(*bad_out_tensor, ctx, &bad_out_data); + if (bad_out_data[0] == decr_every_n_nan_or_inf) { + auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", decr_ratio}, + {"shift", static_cast(0)}}); + + runner_p3.Run(stream); + + std::vector new_loss_scaling; + TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); + if (new_loss_scaling[0] < static_cast(1)) { + // updated_loss_scaling_data = 1 + auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(0)}, + {"shift", static_cast(1)}}); + + runner_p4.Run(stream); + } + + // bad_out_data = 0 + auto b = bad_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(b), 0, + bad_out_tensor->numel() * sizeof(int), stream); + } + } else { + // bad_out_data = 0 + auto b = bad_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(b), 0, + bad_out_tensor->numel() * sizeof(int), stream); + + // good_out_data = good_in_data + 1 + Tensor factor_tensor(good_out_tensor->type()); + factor_tensor.mutable_data({1}, place); + TensorFromVector(std::vector{1}, ctx, &factor_tensor); + auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor}, + {*good_out_tensor}, {}); + runner_p2.Run(stream); + + std::vector good_out_data; + TensorToVector(*good_out_tensor, ctx, &good_out_data); + + if (good_out_data[0] == incr_every_n_steps) { + auto runner_p3 = 
NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", incr_ratio}, + {"shift", static_cast(0)}}); + runner_p3.Run(stream); + + std::vector new_loss_scaling; + TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); + if (!std::isfinite(new_loss_scaling[0])) { + // updated_loss_scaling_data = pre_loss_scaling_data + auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(1)}, + {"shift", static_cast(0)}}); + + runner_p4.Run(stream); + } + // good_out_data = 0 + auto g = good_out_tensor->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + good_out_tensor->numel() * sizeof(int), stream); + } + } +} + +template +class UpdateLossScalingFunctor { + public: + void operator()(const platform::NPUDeviceContext& dev_ctx, + const std::vector found_inf_vec, + const Tensor* pre_loss_scaling_tensor, + const Tensor* good_in_tensor, const Tensor* bad_in_tensor, + const int incr_every_n_steps, + const int decr_every_n_nan_or_inf, const float incr_ratio, + const float decr_ratio, Tensor* updated_loss_scaling_tensor, + Tensor* good_out_tensor, Tensor* bad_out_tensor) const { + Update(dev_ctx, found_inf_vec, pre_loss_scaling_tensor, good_in_tensor, + bad_in_tensor, incr_every_n_steps, decr_every_n_nan_or_inf, + incr_ratio, decr_ratio, updated_loss_scaling_tensor, + good_out_tensor, bad_out_tensor); + } +}; + +template +class LazyZerosNPU { + public: + void operator()(const platform::NPUDeviceContext& dev_ctx, + const std::vector found_inf_vec, + const std::vector& xs, + const std::vector& outs) const { + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + if (found_inf_vec[0]) { + VLOG(4) << "-- UpdateLossScaling: Find infinite grads. 
--"; + + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + auto g = out->mutable_data(place); + platform::NPUMemsetAsync(static_cast(g), 0, + out->numel() * sizeof(T), stream); + } + } + } +}; + +template +class UpdateLossScalingNPUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const auto xs = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + const auto* found_inf = ctx.Input("FoundInfinite"); + PADDLE_ENFORCE_EQ(found_inf->numel(), 1, + platform::errors::InvalidArgument( + "FoundInfinite must has only one element.")); + + std::vector found_inf_vec; + TensorToVector(*found_inf, ctx.device_context(), &found_inf_vec); + + LazyZerosNPU{}(dev_ctx, found_inf_vec, xs, outs); + const bool stop_update = ctx.Attr("stop_update"); + if (stop_update) { + return; + } + + const auto* pre_loss_scaling = ctx.Input("PrevLossScaling"); + const auto* good_in = ctx.Input("InGoodSteps"); + const auto* bad_in = ctx.Input("InBadSteps"); + auto* updated_loss_scaling = ctx.Output("LossScaling"); + auto* good_out = ctx.Output("OutGoodSteps"); + auto* bad_out = ctx.Output("OutBadSteps"); + + updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); + good_out->mutable_data(dev_ctx.GetPlace()); + bad_out->mutable_data(dev_ctx.GetPlace()); + + const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); + const int decr_every_n_nan_or_inf = + ctx.Attr("decr_every_n_nan_or_inf"); + const float incr_ratio = ctx.Attr("incr_ratio"); + const float decr_ratio = ctx.Attr("decr_ratio"); + UpdateLossScalingFunctor{}( + dev_ctx, found_inf_vec, pre_loss_scaling, good_in, bad_in, + incr_every_n_steps, decr_every_n_nan_or_inf, incr_ratio, decr_ratio, + updated_loss_scaling, good_out, bad_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + update_loss_scaling, + ops::UpdateLossScalingNPUKernel, + ops::UpdateLossScalingNPUKernel); diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc new file mode 100644 index 00000000000000..93689d5e495f33 --- /dev/null +++ b/paddle/fluid/operators/assign_op_npu.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
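The Update routine above implements the usual dynamic loss-scaling policy: an overflow step resets the good-step counter and, after decr_every_n_nan_or_inf consecutive overflow steps, shrinks the scale by decr_ratio (clamped to at least 1); a clean step resets the bad-step counter and, after incr_every_n_steps consecutive clean steps, grows the scale by incr_ratio unless the result would be non-finite. A scalar sketch of that policy, with illustrative names rather than the kernel's API:

#include <cmath>

struct LossScaleState {
  float scale;
  int good_steps = 0;
  int bad_steps = 0;
};

// Host-side sketch of the dynamic loss-scaling update described above.
void UpdateLossScale(bool found_inf, int incr_every_n_steps,
                     int decr_every_n_nan_or_inf, float incr_ratio,
                     float decr_ratio, LossScaleState* s) {
  if (found_inf) {
    s->good_steps = 0;
    if (++s->bad_steps == decr_every_n_nan_or_inf) {
      s->scale *= decr_ratio;               // shrink the scale
      if (s->scale < 1.0f) s->scale = 1.0f; // never drop below 1
      s->bad_steps = 0;
    }
  } else {
    s->bad_steps = 0;
    if (++s->good_steps == incr_every_n_steps) {
      float grown = s->scale * incr_ratio;  // grow the scale
      if (std::isfinite(grown)) s->scale = grown;
      s->good_steps = 0;
    }
  }
}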
*/ + +#include + +#include "paddle/fluid/operators/assign_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace framework { +class OpDesc; +class Variable; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +namespace platform { +struct CPUPlace; +struct CUDAPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { +template +class AssignNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + assign, ops::AssignNPUKernel, + ops::AssignNPUKernel, + ops::AssignNPUKernel) diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc new file mode 100644 index 00000000000000..5cf1303a229a90 --- /dev/null +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(assign); +USE_OP_DEVICE_KERNEL(assign, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + std::vector init; + init.push_back(static_cast(1.0)); + init.push_back(static_cast(2.0)); + init.push_back(static_cast(3.0)); + init.push_back(static_cast(4.0)); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({4}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + auto op = + f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {}); + + op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + ctx.Wait(); + + EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4); + EXPECT_EQ(out_vec[0], static_cast(1.0)); + EXPECT_EQ(out_vec[1], static_cast(2.0)); + EXPECT_EQ(out_vec[2], static_cast(3.0)); + EXPECT_EQ(out_vec[3], static_cast(4.0)); +} + +TEST(assign, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "assign"); +} diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu index 99153101fc326c..8bd2b7fe2d127c 100644 --- a/paddle/fluid/operators/bce_loss_op.cu +++ b/paddle/fluid/operators/bce_loss_op.cu @@ -32,6 +32,11 @@ __global__ void GPUBCELossForward(const T* x_data, const T* label_data, T one = static_cast(1.); T neg_100 = static_cast(-100.); + PADDLE_ENFORCE( + (x >= static_cast(0)) && (x <= one), + "Input is expected to be within the interval [0, 1], but recieved %f.", + x); + T term1 = max(real_log(x), neg_100); T term2 = max(real_log(one - x), neg_100); @@ -64,29 +69,13 @@ class BCELossCUDAKernel : public framework::OpKernel { auto* labels = ctx.Input("Label"); auto* out = ctx.Output("Out"); - auto x_data = x->data(); - auto out_data = out->mutable_data(ctx.GetPlace()); + const auto* x_data = x->data(); + auto* out_data = out->mutable_data(ctx.GetPlace()); auto x_numel = x->numel(); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), x_numel); - - Tensor x_cpu; - framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu); - T* x_cpu_data = x_cpu.data(); - - for (int64_t i = 0; i < x_numel; ++i) { - PADDLE_ENFORCE_GE( - x_cpu_data[i], static_cast(0), - platform::errors::InvalidArgument( - "Illegal input, input must be greater than or equal to 0")); - PADDLE_ENFORCE_LE( - x_cpu_data[i], static_cast(1), - platform::errors::InvalidArgument( - "Illegal input, input must be less than or equal to 1")); - } - auto& dev_ctx = ctx.cuda_device_context(); + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(dev_ctx, x_numel); GPUBCELossForward<<>>(x_data, labels->data(), @@ -102,9 +91,10 @@ class BCELossGradCUDAKernel : public framework::OpKernel { auto* labels = ctx.Input("Label"); auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - auto 
dx_data = dx->mutable_data(ctx.GetPlace());
   int x_numel = x->numel();
+  auto* dx_data = dx->mutable_data(ctx.GetPlace());
+
+  auto& dev_ctx = ctx.cuda_device_context();
   platform::GpuLaunchConfig config =
       platform::GetGpuLaunchConfig1D(dev_ctx, x_numel);
diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc
new file mode 100644
index 00000000000000..20b33c4e4e05a6
--- /dev/null
+++ b/paddle/fluid/operators/cast_op_npu.cc
@@ -0,0 +1,92 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_ASCEND_CL
+#include
+#include
+
+#include "paddle/fluid/operators/cast_op.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+
+static std::map
+    DTYPE_2_ACL_DTYPE = {
+        {framework::proto::VarType::BOOL, ACL_BOOL},
+        {framework::proto::VarType::INT16, ACL_INT16},
+        {framework::proto::VarType::INT32, ACL_INT32},
+        {framework::proto::VarType::INT64, ACL_INT64},
+        {framework::proto::VarType::FP16, ACL_FLOAT16},
+        {framework::proto::VarType::FP32, ACL_FLOAT},
+        {framework::proto::VarType::FP64, ACL_DOUBLE},
+};
+
+using Tensor = framework::Tensor;
+
+template
+class CastNPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input("X");
+    int dtype = ctx.Attr("out_dtype");
+
+    auto* out = ctx.Output("Out");
+
+    auto place = ctx.GetPlace();
+
+    auto iter = DTYPE_2_ACL_DTYPE.find(
+        static_cast(dtype));
+    int aclDtype = iter->second;
+
+    if (dtype == framework::proto::VarType::FP32) {
+      out->mutable_data(place);
+    } else if (dtype == framework::proto::VarType::FP16) {
+      out->mutable_data(place);
+    } else if (dtype == framework::proto::VarType::INT16) {
+      out->mutable_data(place);
+    } else if (dtype == framework::proto::VarType::INT32) {
+      out->mutable_data(place);
+    } else if (dtype == framework::proto::VarType::INT64) {
+      out->mutable_data(place);
+    } else if (dtype == framework::proto::VarType::FP64) {
+      out->mutable_data(place);
+    } else if (dtype == framework::proto::VarType::BOOL) {
+      out->mutable_data(place);
+    }
+
+    auto stream =
+        ctx.template device_context()
+            .stream();
+
+    auto runner = NpuOpRunner("Cast", {*x}, {*out},
+                              {{"dst_type", static_cast(aclDtype)}});
+    runner.Run(stream);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_NPU_KERNEL(
+    cast, ops::CastNPUKernel,
+    ops::CastNPUKernel,
+    ops::CastNPUKernel,
+    ops::CastNPUKernel,
+    ops::CastNPUKernel,
+    ops::CastNPUKernel,
+    ops::CastNPUKernel);
+#endif
diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc
index bbd43274a002d8..ca15858cf67d75 100644
--- a/paddle/fluid/operators/cast_op_xpu.cc
+++ b/paddle/fluid/operators/cast_op_xpu.cc
@@ -23,8 +23,22 @@ limitations under the License.
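The NPU cast kernel above resolves the requested VarType to an ACL dtype with an unchecked map lookup; a defensive variant would confirm the entry exists before dereferencing the iterator. A small sketch of such a checked lookup, as a hypothetical helper that is not part of the patch:

#include <map>
#include <stdexcept>
#include <string>

// Hypothetical checked lookup: map a dtype key to a backend enum value,
// throwing a descriptive error instead of dereferencing map.end().
template <typename Key, typename Value>
Value CheckedLookup(const std::map<Key, Value>& table, const Key& key,
                    const std::string& what) {
  auto it = table.find(key);
  if (it == table.end()) {
    throw std::runtime_error("Unsupported " + what);
  }
  return it->second;
}

A call shaped like CheckedLookup(DTYPE_2_ACL_DTYPE, var_type, "cast dtype") would then fail with a clear message for any dtype the table does not cover.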
*/ namespace paddle { namespace operators { +template +class XPUFPTypeTrait { + public: + using Type = T; +}; + +template <> +class XPUFPTypeTrait { + public: + using Type = float16; +}; + template class CastXPUKernel : public framework::OpKernel { + using XPUInTDType = typename XPUFPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); @@ -34,27 +48,39 @@ class CastXPUKernel : public framework::OpKernel { auto out_type = static_cast( context.Attr("out_dtype")); auto* in_data = in->data(); + + // using XPUOutTDType = typename XPUFPTypeTrait::Type; auto numel = in->numel(); auto& dev_ctx = context.template device_context(); int r = -1; if (out_type == framework::proto::VarType::FP32) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if (out_type == framework::proto::VarType::INT32) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if (out_type == framework::proto::VarType::INT64) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if ((out_type == framework::proto::VarType::BOOL) && (in_type == framework::proto::VarType::FP32)) { auto* out_data = out->mutable_data(context.GetPlace()); r = xpu::cast_v2( dev_ctx.x_context(), (const float*)in_data, reinterpret_cast(out_data), numel); + } else if (out_type == framework::proto::VarType::FP16) { + auto* out_data = + out->mutable_data(context.GetPlace()); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + reinterpret_cast(out_data), numel); + } else { PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d", in_type, out_type)); @@ -75,5 +101,7 @@ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( cast, ops::CastXPUKernel, ops::CastXPUKernel, + ops::CastXPUKernel, ops::CastXPUKernel); #endif diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 8920541b9b9dcc..977a208d20e783 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -19,6 +19,12 @@ if(WITH_NCCL OR WITH_RCCL) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() +if(WITH_ASCEND) + op_library(gen_nccl_id_op) + op_library(c_gen_nccl_id_op) +endif() + + if(WITH_GLOO) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) endif() diff --git a/paddle/fluid/operators/collective/allreduce_op.cc b/paddle/fluid/operators/collective/allreduce_op.cc index 86f1c28a9dd4f5..63b135a74cf4b7 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cc @@ -15,7 +15,7 @@ limitations under the License. 
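The XPUFPTypeTrait added above is a compile-time mapping: every element type passes through unchanged except the framework's float16, which is swapped for the device library's half type so the templated kernel can hand xpu::cast_v2 the pointer type it expects. A stand-alone sketch of the pattern, with illustrative stand-in types:

#include <cstdint>

// Stand-in types for the framework half and the device-library half.
struct FrameworkHalf { uint16_t bits; };
struct DeviceHalf { uint16_t bits; };

// Identity mapping by default...
template <typename T>
struct DeviceTypeTrait {
  using Type = T;
};

// ...specialized so the framework half maps onto the device half.
template <>
struct DeviceTypeTrait<FrameworkHalf> {
  using Type = DeviceHalf;
};

static_assert(sizeof(DeviceTypeTrait<float>::Type) == sizeof(float), "");
static_assert(sizeof(DeviceTypeTrait<FrameworkHalf>::Type) ==
                  sizeof(DeviceHalf), "");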
*/ #include // NOLINT #include -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/allreduce_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/allreduce_op.cu.cc b/paddle/fluid/operators/collective/allreduce_op.cu.cc index 9b70f78399026b..fe2e4910552706 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cu.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/allreduce_op.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 1592d809f91e26..7da30f64d1ce39 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -27,6 +27,7 @@ limitations under the License. */ namespace paddle { namespace operators { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -84,6 +85,21 @@ class CGenNCCLIdOp : public framework::OperatorBase { } }; +#else +class CGenNCCLIdOp : public framework::OperatorBase { + public: + CGenNCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + class CGenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index c4abe284d72096..700d1173e2ff68 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -15,40 +15,20 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - namespace paddle { namespace operators { -class CSyncCalcStreamOp : public framework::OperatorBase { +class CSyncCalcStreamOp : public framework::OperatorWithKernel { public: - CSyncCalcStreamOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on gpu place only for now.")); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); -#endif -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); } }; @@ -65,10 +45,36 @@ Call calculation stream synchronization. } }; +template +class CSyncCalcStreamCudaKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) + + auto place = ctx.GetPlace(); + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); +#endif + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(c_sync_calc_stream, ops::CSyncCalcStreamOp, - ops::CSyncCalcStreamOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream, ops::CSyncCalcStreamOp, + ops::CSyncCalcStreamOpMaker); + +REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, + ops::CSyncCalcStreamCudaKernel); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index adf27069f524e4..95b9cd040fe94e 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -14,45 +14,25 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" #endif namespace paddle { namespace operators { -class CSyncCommStreamOp : public framework::OperatorBase { +class CSyncCommStreamOp : public framework::OperatorWithKernel { public: - CSyncCommStreamOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on gpu place only for now.")); + using framework::OperatorWithKernel::OperatorWithKernel; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int ring_id = Attr("ring_id"); - auto stream = - platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); } }; @@ -72,10 +52,38 @@ Call communication stream synchronization. 
} }; +template +class CSyncCommStreamCudaKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + + auto place = ctx.GetPlace(); + + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); + +#ifdef PADDLE_WITH_RCCL + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); +#endif + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(c_sync_comm_stream, ops::CSyncCommStreamOp, - ops::CSyncCommStreamOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream, ops::CSyncCommStreamOp, + ops::CSyncCommStreamOpMaker); + +REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, + ops::CSyncCommStreamCudaKernel); diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index 679713d05bcb40..99a92469e8502b 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -34,6 +34,7 @@ class Scope; namespace paddle { namespace operators { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -194,6 +195,20 @@ class GenNCCLIdOp : public framework::OperatorBase { } }; +#else +class GenNCCLIdOp : public framework::OperatorBase { + public: + GenNCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + class GenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc new file mode 100644 index 00000000000000..87bb3397ca2672 --- /dev/null +++ b/paddle/fluid/operators/concat_op_npu.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class ConcatNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + framework::LoDTensor* out = ctx.Output("Out"); + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + auto axis = ctx.Attr("axis"); + + if (ctx.HasInput("AxisTensor")) { + PADDLE_THROW(platform::errors::NotFound( + "The AxisTensor is not supported on NPU now.")); + } + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + auto place = ctx.GetPlace(); + out->mutable_data(place); + + std::vector inputs; + std::vector names; + for (size_t i = 0; i < ins.size(); ++i) { + if (ins[i] && ins[i]->numel() > 0) { + inputs.push_back(*ins[i]); + names.push_back("x" + std::to_string(i)); + } else { + continue; + } + } + auto stream = + ctx.template device_context() + .stream(); + auto runner = NpuOpRunner( + "ConcatD", {inputs}, {*out}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}); + runner.AddInputNames(names); + runner.Run(stream); + } +}; + +template +class ConcatGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); + auto outs = + ctx.MultiOutput(framework::GradVarName("X")); + + PADDLE_ENFORCE_NOT_NULL(ins[0], + platform::errors::NotFound( + "The first input tensor is not initalized.")); + + auto axis = ctx.Attr("axis"); + + axis = ComputeAxis(static_cast(axis), + static_cast(ins[0]->dims().size())); + + int offset = 0; + auto stream = + ctx.template device_context() + .stream(); + for (size_t j = 0; j < outs.size(); ++j) { + // For stop gradient + // get output tensor that the name is not kEmptyVarName + if (out_var_names[j] != framework::kEmptyVarName && + outs[j]->numel() != 0UL) { + outs[j]->mutable_data(ctx.GetPlace()); + std::vector offsets; + std::vector sizes; + for (int dim = 0; dim < ins[j]->dims().size(); ++dim) { + if (dim == axis) { + offsets.push_back(offset); + sizes.push_back(ins[j]->dims()[dim]); + } else { + offsets.push_back(0); + sizes.push_back(ins[j]->dims()[dim]); + } + } + auto runner = NpuOpRunner("SliceD", {*out_grad}, {*outs[j]}, + {{"offsets", offsets}, {"size", sizes}}); + runner.Run(stream); + } + if (ins[j]->numel() != 0UL) { + offset += ins[j]->dims()[axis]; + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL(concat, ops::ConcatNPUKernel, + ops::ConcatNPUKernel, + ops::ConcatNPUKernel); + +REGISTER_OP_NPU_KERNEL(concat_grad, ops::ConcatGradNPUKernel, + ops::ConcatGradNPUKernel, + ops::ConcatGradNPUKernel); diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc index aa0002cc6d1777..be299babdba7a4 100644 --- a/paddle/fluid/operators/concat_op_xpu.cc +++ b/paddle/fluid/operators/concat_op_xpu.cc @@ -132,16 +132,14 @@ class ConcatGradXPUKernel : public framework::OpKernel { axis = ComputeAxis(static_cast(axis), static_cast(ins[0]->dims().size())); // get output tensor that the name is not kEmptyVarName - std::vector outputs; - std::vector choose_idx; - int n = 0; + std::vector 
ptrs(outs.size()); for (size_t j = 0; j < outs.size(); ++j) { if (out_var_names[j] != framework::kEmptyVarName && outs[j]->numel() != 0UL) { outs[j]->mutable_data(ctx.GetPlace()); - outputs.push_back(outs[j]); - choose_idx.push_back(j); - n++; + ptrs[j] = outs[j]->data(); + } else { + ptrs[j] = nullptr; } } PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument( @@ -157,10 +155,10 @@ class ConcatGradXPUKernel : public framework::OpKernel { axis, out_grad->dims().size())); auto input_dims = ins[0]->dims(); - std::vector split_list(n); + std::vector split_list(ins.size()); std::vector xdims_list(input_dims.size()); int total_length = 0; - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < ins.size(); ++i) { split_list[i] = ins[i]->dims()[axis]; total_length += ins[i]->dims()[axis]; } @@ -172,11 +170,6 @@ class ConcatGradXPUKernel : public framework::OpKernel { } xdims_list[axis] = total_length; - std::vector ptrs(n); - for (int i = 0; i < n; ++i) { - ptrs[i] = outputs[i]->data(); - } - auto& dev_ctx = ctx.template device_context(); int r = xpu::split(dev_ctx.x_context(), out_grad->data(), ptrs, xdims_list, split_list, axis); diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc new file mode 100644 index 00000000000000..591fb55936734f --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
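Both concat gradient kernels above (NPU via SliceD, XPU via xpu::split) recover each input's gradient by slicing the concatenated output gradient along the concat axis, so the only bookkeeping is a running offset plus each input's extent on that axis. A small host-side sketch of that computation, illustrative rather than the kernel API:

#include <cstdint>
#include <utility>
#include <vector>

// For each input extent along the concat axis, return the (offset, size)
// window to slice out of the concatenated gradient.
std::vector<std::pair<int64_t, int64_t>> SliceWindows(
    const std::vector<int64_t>& axis_extents) {
  std::vector<std::pair<int64_t, int64_t>> windows;
  int64_t offset = 0;
  for (int64_t extent : axis_extents) {
    windows.emplace_back(offset, extent);
    offset += extent;
  }
  return windows;
}
// Example: extents {2, 3, 5} -> windows {(0, 2), (2, 3), (5, 5)}.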
*/ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#ifdef PADDLE_WITH_ASCEND_CL + +namespace paddle { +namespace operators { + +template +class EqualNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class LessThanNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + // int axis = context.Attr("axis"); + z->mutable_data(ctx.GetPlace()); // allocate + auto runner = NpuOpRunner("Less", {*x, *y}, {*z}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(equal, ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel); + +REGISTER_OP_NPU_KERNEL( + less_than, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel); + +#endif diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index b9ea2ade6cb90b..6513bae839e989 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -78,6 +78,13 @@ class ConditionalOp : public framework::OperatorBase { framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); res = cpu_tensor.data()[0]; +#endif + } else if (platform::is_npu_place(ips[0]->place())) { +#ifdef PADDLE_WITH_ASCEND_CL + framework::LoDTensor cpu_tensor; + framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); + platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); + res = cpu_tensor.data()[0]; #endif } else { res = ips[0]->data()[0]; diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index d86b6b48422d94..fdd1b776bd8fa3 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -44,6 +44,11 @@ static void DataCopy(const framework::LoDTensor &src_item, TensorCopySync(src_item, platform::CPUPlace(), dst_item); } #else +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(src_item.place())) { + platform::DeviceContextPool::Instance().Get(src_item.place())->Wait(); + } +#endif TensorCopySync(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc new file mode 100644 index 00000000000000..1b0c0e444347af --- /dev/null +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
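The conditional_block and fetch changes above follow one rule for NPU tensors: a value produced on an asynchronous device queue may be read on the host only after the device context has been waited on. A plain C++ illustration of that ordering, with the device emulated by std::async rather than framework code:

#include <chrono>
#include <future>
#include <thread>

// The flag is produced asynchronously (here by another thread standing in
// for the device); get() waits before the value is read, mirroring the
// DeviceContext::Wait() call in the kernels above.
bool ReadFlagAfterWait() {
  std::future<bool> device_flag = std::async(std::launch::async, [] {
    std::this_thread::sleep_for(std::chrono::milliseconds(1));  // device work
    return true;
  });
  return device_flag.get();
}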
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class LogicalNotNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + auto runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + logical_not, + ops::LogicalNotNPUKernel); + +#endif diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 39e9d37ddc6c75..ab535e341f7575 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -1363,7 +1363,14 @@ REGISTER_OP_KERNEL( conv2d_grad_grad, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); - +// ROCM has limit thread in depthwise_conv.cu and willl result in accuracy issue +// Use depthwise_conv2d in MIOPEN to resolve this issue +REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); REGISTER_OP_CUDA_KERNEL( depthwise_conv2d_grad_grad, paddle::operators::CUDNNConvDoubleGradOpKernel, diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 364e3ab8d26c3f..94d1f707b74c2e 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -903,29 +903,19 @@ class DepthwiseConvKernel : public framework::OpKernel { "and input channel number is %d", output->dims()[1], input->dims()[1])); } - // transform tensor - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output, - &transformed_output); - - } else { - transformed_input = *input; - transformed_output = *output; - } // update padding and dilation - auto in_dims = transformed_input.dims(); + auto in_dims = input->dims(); auto filter_dims = filter.dims(); framework::DDim in_data_dims; - in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_format); + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } 
else { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } framework::DDim filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); @@ -944,16 +934,12 @@ class DepthwiseConvKernel : public framework::OpKernel { if (fuse_relu) { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, transformed_input, filter, strides, paddings, - dilations, &transformed_output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output, data_layout); } else { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, transformed_input, filter, strides, paddings, - dilations, &transformed_output); - } - if (channel_last) { - TransToChannelLast(context, &transformed_output, - output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output, data_layout); } } }; @@ -981,33 +967,18 @@ class DepthwiseConvGradKernel : public framework::OpKernel { context.Attr("padding_algorithm"); const std::string data_format = context.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - Tensor transformed_input(input->type()); - Tensor transformed_output_grad(output_grad->type()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output_grad, - &transformed_output_grad); - TransToChannelFirst(context, output_grad, - &transformed_output_grad); - - } else { - transformed_input = *input; - transformed_output_grad = *output_grad; - } - // update padding and dilation - auto in_dims = transformed_input.dims(); + auto in_dims = input->dims(); auto filter_dims = filter.dims(); framework::DDim in_data_dims; - in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_format); + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } framework::DDim filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); std::vector ksize = framework::vectorize(filter_data_dims); @@ -1025,33 +996,18 @@ class DepthwiseConvGradKernel : public framework::OpKernel { if (input_grad) { input_grad->mutable_data(context.GetPlace()); - Tensor transformed_input_grad(input_grad->type()); - if (channel_last) { - ResizeToChannelFirst(context, input_grad, - &transformed_input_grad); - - } else { - transformed_input_grad = *input_grad; - } - - set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); + set_zero(dev_ctx, input_grad, static_cast(0)); if (fuse_relu) { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, transformed_input, filter, - transformed_output_grad, strides, paddings, - dilations, &transformed_input_grad); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad, data_layout); } else { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, transformed_input, filter, - transformed_output_grad, strides, paddings, - dilations, &transformed_input_grad); - } - if (channel_last) { - TransToChannelLast(context, &transformed_input_grad, - input_grad); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad, 
data_layout); } } @@ -1061,15 +1017,13 @@ class DepthwiseConvGradKernel : public framework::OpKernel { if (fuse_relu) { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, transformed_input, - transformed_output_grad, strides, paddings, - dilations, filter_grad); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad, data_layout); } else { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, transformed_input, - transformed_output_grad, strides, paddings, - dilations, filter_grad); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad, data_layout); } } } diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index a712d31cf7e2c3..c4cd5854c0f78a 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -490,10 +490,6 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { bool deterministic = FLAGS_cudnn_deterministic; T* input_grad_data = nullptr; T* filter_grad_data = nullptr; - if (input_grad) - input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - if (filter_grad) - filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); if (input_grad) { input_grad_data = input_grad->mutable_data(ctx.GetPlace()); @@ -884,7 +880,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { int iwo_group = groups; int c_group = 1; -#if CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_group = 1; c_group = groups; groups = 1; @@ -948,7 +944,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args1.idesc.set(transformed_ddO_channel, iwo_group); args1.wdesc.set(*W, layout, iwo_group); args1.odesc.set(transformed_ddX, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args1.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); @@ -967,7 +964,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args2.idesc.set(transformed_ddO_channel, iwo_group); args2.wdesc.set(*ddW, layout, iwo_group); args2.odesc.set(transformed_X, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args2.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; workspace_size = @@ -991,7 +989,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args3.odesc.set(transformed_ddX_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args3.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; workspace_size = @@ -1013,7 +1012,8 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args4.idesc.set(transformed_dO, iwo_group); args4.wdesc.set(*ddW, layout, iwo_group); args4.odesc.set(transformed_dX_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, c_group); + args4.cdesc.set(dtype, padding_common, strides, dilations, + platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search4 = 
SearchAlgorithm; workspace_size = @@ -1083,6 +1083,10 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { if (ddW) { for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + Tensor conv_x_ddw(dO->type()); + conv_x_ddw.Resize(transformed_ddO_channel.dims()); + T* conv_x_ddw_data = conv_x_ddw.mutable_data(ctx.GetPlace()); wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1090,11 +1094,17 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { handle, &alpha, args2.odesc.desc(), x + i * group_offset_in, args2.wdesc.desc(), ddw + i * group_offset_filter, args2.cdesc.desc(), - bwd_algo2, &alpha, args2.idesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); + bwd_algo2, &beta, args2.idesc.desc(), + conv_x_ddw_data + i * group_offset_out, workspace_ptr, + workspace_size)); }, workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + handle, miopenTensorOpAdd, &alpha, args2.idesc.desc(), + transformed_ddy_channel + i * group_offset_out, &alpha, + args2.idesc.desc(), conv_x_ddw_data + i * group_offset_out, &beta, + args2.idesc.desc(), + transformed_ddy_channel + i * group_offset_out)); #else // PADDLE_WITH_HIP wkspace_handle.RunFunc( [&](void* workspace_ptr) { diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu index b4c27a63dbd2f2..388b8531571086 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cu +++ b/paddle/fluid/operators/detection/anchor_generator_op.cu @@ -49,14 +49,11 @@ __global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num, anchor_width = scale_w * base_w; anchor_height = scale_h * base_h; - T xmin = (x_ctr - 0.5 * (anchor_width - 1)); - T ymin = (y_ctr - 0.5 * (anchor_height - 1)); - T xmax = (x_ctr + 0.5 * (anchor_width - 1)); - T ymax = (y_ctr + 0.5 * (anchor_height - 1)); - out[i * 4] = xmin; - out[i * 4 + 1] = ymin; - out[i * 4 + 2] = xmax; - out[i * 4 + 3] = ymax; + T xmin = (x_ctr - .5f * (anchor_width - 1)); + T ymin = (y_ctr - .5f * (anchor_height - 1)); + T xmax = (x_ctr + .5f * (anchor_width - 1)); + T ymax = (y_ctr + .5f * (anchor_height - 1)); + reinterpret_cast(out)[i] = make_float4(xmin, ymin, xmax, ymax); } } diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h index e0e499d76a19ba..599f6935736f94 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.h +++ b/paddle/fluid/operators/detection/anchor_generator_op.h @@ -22,6 +22,19 @@ limitations under the License. 
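The HIP branch of the conv-transpose double-grad kernel above works around MIOpen's restriction that the convolution's beta must be 0: the convolution result is written to a scratch tensor and then blended into the destination with a separate tensor add. The general shape of that workaround, sketched on plain buffers, where fx stands for the precomputed convolution result:

#include <cstddef>
#include <vector>

// When a library call cannot accumulate into its output (beta must be 0),
// compute into a scratch buffer first and blend afterwards:
//   y = alpha * y + alpha * f(x)
void AccumulateViaScratch(const std::vector<float>& fx, float alpha,
                          std::vector<float>* y) {
  for (std::size_t i = 0; i < y->size(); ++i) {
    (*y)[i] = alpha * (*y)[i] + alpha * fx[i];
  }
}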
*/ namespace paddle { namespace operators { +#ifdef PADDLE_WITH_CUDA +template +extern __global__ void GenAnchors(T* out, const T* aspect_ratios, + const int ar_num, const T* anchor_sizes, + const int as_num, const T* stride, + const int sd_num, const int height, + const int width, const T offset); + +template +extern __global__ void SetVariance(T* out, const T* var, const int vnum, + const int num); +#endif + template class AnchorGeneratorOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt deleted file mode 100644 index c9db6148bc45d4..00000000000000 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ /dev/null @@ -1,76 +0,0 @@ -return() - -if(WITH_GRPC) - set(cc_generic_services "false") -else() - set(cc_generic_services "true") -endif() -configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) - -cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool) -cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder) - -cc_library(heart_beat_monitor SRCS heart_beat_monitor.cc DEPS enforce simple_threadpool) -cc_library(large_scale_kv SRCS large_scale_kv.cc DEPS enforce simple_threadpool device_context) -cc_test(heart_beat_monitor_test SRCS heart_beat_monitor_test.cc DEPS heart_beat_monitor) - -# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") -if(WITH_GRPC) - set(GRPC_DEPS grpc++_unsecure grpc_unsecure gpr zlib protobuf) - set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc) - grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc - variable_response.cc - collective_client.cc collective_server.cc - ${GRPC_SRCS} - PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder heart_beat_monitor large_scale_kv) - - set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) - - cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc - DEPS ${RPC_DEPS} scope profiler math_function) - -else() - set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) - set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc communicator.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - - set(BRPC_DEPS brpc ssl crypto protobuf leveldb zlib) - - brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc - variable_response.cc - collective_client.cc collective_server.cc - ${BRPC_SRCS} - PROTO send_recv.proto - DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS}) - - set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) - cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc - DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_read_op) -endif() - - -cc_test(rpc_server_test SRCS 
rpc_server_test.cc - DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op checkpoint_notify_op scale_op ) -cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) -cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory node) -cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) -cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) -cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator) -cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) -if(WITH_GPU OR WITH_ROCM) - cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_rpc executor ${RPC_DEPS} - selected_rows_functor scope math_function) -endif() -if(WITH_TESTING) - if(TEST rpc_server_test) - set_tests_properties(rpc_server_test PROPERTIES TIMEOUT 120) - endif() - if(TEST heart_beat_monitor_test) - set_tests_properties(heart_beat_monitor_test PROPERTIES TIMEOUT 120) - endif() -endif() diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h deleted file mode 100644 index 28a5f2ad6c7648..00000000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class ConcurrentSet { - public: - ConcurrentSet() : pool_(new ::ThreadPool(1)) {} - ~ConcurrentSet() {} - - std::future Update(const std::vector& rows) { - auto task = [this, rows] { - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& id : rows) { - sstream << id << ", "; - } - sstream << "]"; - VLOG(3) << "update ids -> " << sstream.str(); - } - for (auto row : rows) { - set_.insert(row); - } - }; - return pool_->enqueue(std::move(task)); - } - - std::future GetAndClear(std::vector* result) { - auto task = [this, &result] { - result->clear(); - for (auto& id : set_) { - result->push_back(id); - } - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& id : *result) { - sstream << id << ", "; - } - sstream << "]"; - VLOG(3) << "result ids size: " << result->size() << " " - << sstream.str(); - } - set_.clear(); - }; - return pool_->enqueue(std::move(task)); - } - - private: - std::unordered_set set_; - std::unique_ptr<::ThreadPool> pool_{nullptr}; -}; - -class AsyncSparseParamUpdateRecorder { - using TrainerToRows = std::vector>; - - public: - AsyncSparseParamUpdateRecorder( - int trainer_num, - const std::unordered_map& grad_to_param) - : trainer_num_(trainer_num), grad_to_param_(grad_to_param) { - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& item : grad_to_param) { - sstream << item.first << ":" << item.second << ", "; - } - sstream << "]"; - VLOG(3) << "trainer_num: " << trainer_num - << " grad_to_param_: " << sstream.str(); - } - for (auto& iter : grad_to_param) { - param_to_grad_[iter.second] = iter.first; - auto& param_name = iter.second; - param_to_updated_rows_[param_name] = TrainerToRows(); - auto& trainer_to_rows = param_to_updated_rows_[param_name]; - for (auto i = 0; i < trainer_num; ++i) { - trainer_to_rows.emplace_back(new ConcurrentSet()); - } - } - } - - ~AsyncSparseParamUpdateRecorder() = default; - - void Update(const std::string& grad_name, - const std::vector& update_rows) { - VLOG(3) << "update grad: " << grad_name - << " row size: " << update_rows.size(); - auto& param_name = grad_to_param_.at(grad_name); - auto& trainer_to_rows = param_to_updated_rows_.at(param_name); - - std::vector> fs; - for (auto& set : trainer_to_rows) { - fs.push_back(set->Update(update_rows)); - } - for (auto& f : fs) { - f.wait(); - } - } - - void GetAndClear(const std::string& param_name, int trainer_id, - std::vector* result) { - VLOG(3) << "GetAndClear param: " << param_name - << " for trainer: " << trainer_id; - PADDLE_ENFORCE_LT( - trainer_id, trainer_num_, - platform::errors::InvalidArgument( - "The value of trainer_id: %s should less than trainer_num: %s.", - trainer_id, trainer_num_)); - param_to_updated_rows_.at(param_name)[trainer_id] - ->GetAndClear(result) - .wait(); - } - - bool HasParam(const std::string& param_name) { - return param_to_grad_.find(param_name) != param_to_grad_.end(); - } - - bool HasGrad(const std::string& grad_name) { - return grad_to_param_.find(grad_name) != grad_to_param_.end(); - } - - private: - const int trainer_num_; - std::unordered_map grad_to_param_; - std::unordered_map param_to_grad_; - std::unordered_map param_to_updated_rows_; - - // init recorder - public: - static void Init( - int trainer_num, - const 
std::unordered_map& grad_to_param) { - InitImpl(trainer_num, grad_to_param); - } - - static AsyncSparseParamUpdateRecorder* GetInstance() { - return recorder_.get(); - } - - private: - // Init is called by GetInstance. - static void InitImpl( - int trainer_num, - const std::unordered_map& grad_to_param) { - if (recorder_ == nullptr) { - recorder_.reset( - new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param)); - } - } - - static std::once_flag init_flag_; - static std::unique_ptr recorder_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc deleted file mode 100644 index 2d78559625c91f..00000000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" -#include -#include "gtest/gtest.h" - -namespace paddle { -namespace operators { -namespace distributed { - -TEST(ConcurrentSet, All) { - ConcurrentSet concurrent_set; - std::vector in1 = {1, 2, 3, 4}; - std::vector in2 = {2, 3, 5, 6}; - - std::vector> futures; - futures.push_back(concurrent_set.Update(in1)); - futures.push_back(concurrent_set.Update(in2)); - - for (auto &f : futures) { - f.wait(); - } - - std::unordered_set in; - std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); - std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); - - std::vector ret; - concurrent_set.GetAndClear(&ret).wait(); - - std::unordered_set out; - std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); - - EXPECT_EQ(in, out); - - concurrent_set.GetAndClear(&ret).wait(); - EXPECT_EQ(ret.size(), 0UL); -} - -TEST(AsyncSparseParamUpdateRecorder, All) { - std::unordered_map grad_to_param; - grad_to_param["grad1"] = "param1"; - grad_to_param["grad2"] = "param2"; - - int trainer_num = 10; - - AsyncSparseParamUpdateRecorder recorder(trainer_num, grad_to_param); - std::vector in1 = {1, 2, 3, 4}; - std::vector in2 = {2, 3, 5, 6}; - - std::unordered_set in; - std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); - std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); - - recorder.Update("grad1", in1); - recorder.Update("grad1", in2); - - EXPECT_TRUE(recorder.HasParam("param1")); - EXPECT_TRUE(recorder.HasParam("param2")); - EXPECT_FALSE(recorder.HasParam("param3")); - - EXPECT_TRUE(recorder.HasGrad("grad1")); - EXPECT_TRUE(recorder.HasGrad("grad2")); - EXPECT_FALSE(recorder.HasGrad("grad3")); - - std::vector ret; - EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret)); - - for (int i = 0; i < trainer_num; ++i) { - std::vector ret; - std::unordered_set out; - - recorder.GetAndClear("param1", i, &ret); - 
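The ConcurrentSet removed above serializes every mutation of its row-id set through a one-thread ::ThreadPool and hands callers a std::future to wait on. A minimal standard-library-only sketch of the same idea follows; SingleWorker and RowIdSet are illustrative names, not types from the deleted code.

#include <condition_variable>
#include <cstdint>
#include <functional>
#include <future>
#include <mutex>
#include <queue>
#include <thread>
#include <unordered_set>
#include <vector>

// Single-worker executor: every task runs on one thread, so the set below
// needs no extra locking (same idea as ConcurrentSet's pool_ of size 1).
class SingleWorker {
 public:
  SingleWorker() : worker_([this] { Loop(); }) {}
  ~SingleWorker() {
    {
      std::lock_guard<std::mutex> g(mu_);
      stop_ = true;
    }
    cv_.notify_one();
    worker_.join();
  }
  std::future<void> Enqueue(std::function<void()> fn) {
    std::packaged_task<void()> task(std::move(fn));
    std::future<void> fut = task.get_future();
    {
      std::lock_guard<std::mutex> g(mu_);
      tasks_.push(std::move(task));
    }
    cv_.notify_one();
    return fut;
  }

 private:
  void Loop() {
    for (;;) {
      std::packaged_task<void()> task;
      {
        std::unique_lock<std::mutex> lk(mu_);
        cv_.wait(lk, [this] { return stop_ || !tasks_.empty(); });
        if (stop_ && tasks_.empty()) return;  // drain queue before exiting
        task = std::move(tasks_.front());
        tasks_.pop();
      }
      task();
    }
  }
  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<std::packaged_task<void()>> tasks_;
  bool stop_ = false;
  std::thread worker_;  // declared last so it starts after the other members
};

// Future-returning row-id set, mirroring ConcurrentSet::Update / GetAndClear.
class RowIdSet {
 public:
  std::future<void> Update(std::vector<int64_t> rows) {
    return worker_.Enqueue(
        [this, rows] { set_.insert(rows.begin(), rows.end()); });
  }
  std::future<void> GetAndClear(std::vector<int64_t>* out) {
    return worker_.Enqueue([this, out] {
      out->assign(set_.begin(), set_.end());
      set_.clear();
    });
  }

 private:
  std::unordered_set<int64_t> set_;
  SingleWorker worker_;
};

int main() {
  RowIdSet ids;
  ids.Update({1, 2, 3}).wait();
  ids.Update({2, 4}).wait();
  std::vector<int64_t> rows;
  ids.GetAndClear(&rows).wait();
  return rows.size() == 4 ? 0 : 1;  // {1, 2, 3, 4} after deduplication
}

Because every task runs on the same worker thread, the set needs no mutex of its own; callers get ordering guarantees simply by waiting on the returned futures, which is how the deleted recorder fans an update out to one set per trainer.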
std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); - - EXPECT_EQ(in, out); - - recorder.GetAndClear("param1", i, &ret); - EXPECT_EQ(ret.size(), 0UL); - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc deleted file mode 100644 index b2a26089c86896..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ /dev/null @@ -1,462 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace distributed { - -DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); -DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); - -BRPCClient::~BRPCClient() { Wait(); } - -void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used by other now. 
- ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to send variable %s, error text is %s.", var_h->name(), - cntl->ErrorText())); - var_h->Finish(false); - cls->DecreaseReqCount(); - return; - } - var_h->Finish(true); - cls->DecreaseReqCount(); - - VLOG(4) << "HandleSendResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - VLOG(4) << "Finish HandleSendResponse"; -} - -VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - const std::string method = kSendRPC; - VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - auto* var = p_scope->FindVar(var_name_val); - sendrecv::VariableMessage request; - distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request, - &cntl->request_attachment(), "", false, - trainer_id_); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - platform::RecordRPCEvent record_event(method); - - ch_ctx->stub->SendVariable(cntl, &request, response, done); - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - req_count_++; - - return var_h; -} -void HandleFetchBarrierResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used other now. - ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to get HandleFetchBarrierResponse %s, error text is %s.", - var_h->name(), cntl->ErrorText())); - var_h->Finish(false); - cls->DecreaseReqCount(); - return; - } - - var_h->Finish(true); - cls->DecreaseReqCount(); - - VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - VLOG(4) << "Finish HandleFetchBarrierResponse"; -} -void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, VarHandlePtr var_h, - ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx, - BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used other now. 
- ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to get variable %s, error text is %s.", var_h->name(), - cntl->ErrorText())); - cls->DecreaseReqCount(); - var_h->Finish(false); - return; - } - - VLOG(4) << "HandleGetResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - - framework::Variable* outvar = nullptr; - int trainer_id; - distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(), - *var_h->ctx(), var_h->scope(), &outvar, - &trainer_id); - VLOG(4) << "Finish HandleGetResponse"; - cls->DecreaseReqCount(); - var_h->Finish(true); -} - -VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& method_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const std::string out_varname_val = out_var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - const std::string method = kGetRPC; - VarHandlePtr var_h( - new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - platform::RecordRPCEvent record_event(method); - - if (method_name == kGetMonomerRPC) { - ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); - } else if (method_name == kGetNoBarrierRPC) { - ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done); - } else { - ch_ctx->stub->GetVariable(cntl, &req, response, done); - } - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - - req_count_++; - - return var_h; -} - -VarHandlePtr BRPCClient::AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_var_name, int64_t time_out) { - std::string var_name_no_barrier = - string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); - - return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name, - kGetNoBarrierRPC, time_out); -} - -VarHandlePtr BRPCClient::AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC, - time_out); -} - -VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, - const std::string& var_name, - int64_t time_out) { - return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out); -} - -VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, - time_out); -} - 
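Each BRPCClient::Async* call above registers a completion callback, increments req_count_, and relies on DecreaseReqCount to wake Wait() once the last outstanding request finishes. A minimal sketch of that pending-request bookkeeping follows, with std::thread standing in for the brpc callback machinery; the class and method names are illustrative only.

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

// Tracks in-flight asynchronous requests; Wait() blocks until all of them
// have completed, mirroring BRPCClient::req_count_ / sync_cond_.
class PendingRequests {
 public:
  // Launch some work; the completion path decrements the counter.
  void Launch(std::function<void()> work) {
    ++req_count_;
    threads_.emplace_back([this, work] {
      work();
      Done();
    });
  }

  void Wait() {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return req_count_.load() == 0; });
  }

  ~PendingRequests() {
    for (auto& t : threads_) t.join();
  }

 private:
  void Done() {
    if (--req_count_ <= 0) {
      std::lock_guard<std::mutex> g(mu_);  // lock so the waiter cannot miss it
      cv_.notify_all();
    }
  }

  std::atomic<int64_t> req_count_{0};
  std::mutex mu_;
  std::condition_variable cv_;
  std::vector<std::thread> threads_;
};

int main() {
  PendingRequests reqs;
  for (int i = 0; i < 4; ++i) {
    reqs.Launch([] { std::this_thread::sleep_for(std::chrono::milliseconds(10)); });
  }
  reqs.Wait();  // returns once all four "RPCs" have finished
}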
-VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - - const std::string method = kPrefetchRPC; - - VarHandlePtr var_h( - new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - auto* var = p_scope->FindVar(in_var_name_val); - sendrecv::VariableMessage req; - distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req, - &cntl->request_attachment(), out_var_name_val, - false, 0, table_name_val); - - platform::RecordRPCEvent record_event(method); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - ch_ctx->stub->PrefetchVariable(cntl, &req, response, done); - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - - req_count_++; - return var_h; -} - -VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE, - time_out); -} - -VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - auto ch_ptr = GetChannel(ep); - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - - const std::string method = kFetchBarrierRPC; - // var handle - VarHandlePtr var_h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - - platform::RecordRPCEvent record_event(method); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - ch_ctx->stub->GetVariable(cntl, &req, response, done); - - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - - return var_h; -} - -bool BRPCClient::Wait() { - VLOG(9) << "begin to brpcclient wait"; - { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); - } - VLOG(9) << "end to brpcclient wait"; - return true; -} - -ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { - VLOG(4) << "begin to GetChannel:" << ep; - { - std::lock_guard guard(chan_mutex_); - auto it = channels_.find(ep); - if (it != channels_.end()) { - VLOG(4) << "end to GetChannel:" << ep; - return it->second; - } - } - - ChannelQueuePtr q(new framework::BlockingQueue()); - - brpc::ChannelOptions options; -#ifdef PADDLE_WITH_BRPC_RDMA - options.use_rdma = true; -#endif - options.protocol = "baidu_std"; - // don't use pooled type. the server can't afford that. 
- options.connection_type = "single"; - options.connect_timeout_ms = 1000; - options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; - options.max_retry = FLAGS_max_retry; - - VLOG(1) << "create " << brpc_channel_num_per_server_ - << " brpc channels to pserver:" << ep; - - for (int i = 0; i < brpc_channel_num_per_server_; ++i) { - std::shared_ptr c(new ChannelContext()); - if (c->channel.Init(ep.c_str(), &options) != 0) { - PADDLE_THROW( - platform::errors::Unavailable("Failed to initialize channel.")); - return nullptr; - } - - c->stub.reset(new sendrecv::SendRecvService_Stub( - static_cast(&c->channel))); - q->Push(c); - } - - { - std::lock_guard guard(chan_mutex_); - channels_[ep] = q; - } - - VLOG(4) << "end to GetChannel:" << ep; - return q; -} - -VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, - int64_t time_out) { - return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out); -} - -void BRPCClient::SendComplete() { - for (auto& kv : channels_) { - AsyncSendComplete(kv.first); - } -} - -VarHandlePtr BRPCClient::AsyncSendVarMessage( - const std::string& ep, const std::string& method_name, - const sendrecv::VariableMessage& req, int64_t time_out) { - auto ch_ptr = GetChannel(ep); - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - platform::RecordRPCEvent record_event(method_name); - - VarHandlePtr var_h( - new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - if (method_name == kCheckPointNotifyRPC) { - ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); - } else if (method_name == kSendMonomerFetchBarrierRPC) { - ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); - } else { - ch_ctx->stub->SendVariable(cntl, &req, response, done); - } - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - - return var_h; -} - -VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep, - const std::string& method_name, - const std::string& message, - int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(message); - - return AsyncSendVarMessage(ep, method_name, req, time_out); -} - -VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dirname, - const std::string& varname, - const int mode, - int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(varname); - req.set_out_varname(dirname); - - return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h deleted file mode 100644 index 91f94b4c9d5a30..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
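GetChannel above keeps a small fixed pool of channel contexts per endpoint in a blocking queue: a sender Pops one before issuing an RPC and the response handler Pushes it back. A standard-library-only sketch of that checkout/return pool follows; ChannelCtx is a placeholder, not the brpc type.

#include <condition_variable>
#include <deque>
#include <memory>
#include <mutex>
#include <string>

struct ChannelCtx {  // placeholder for the brpc channel + stub pair
  std::string endpoint;
};

// Blocking pool: Pop() hands out a context, Push() returns it, and callers
// block while every context is in use (the role of BlockingQueue above).
class ChannelPool {
 public:
  ChannelPool(const std::string& ep, int n) {
    for (int i = 0; i < n; ++i) {
      pool_.push_back(std::make_shared<ChannelCtx>(ChannelCtx{ep}));
    }
  }

  std::shared_ptr<ChannelCtx> Pop() {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return !pool_.empty(); });
    auto ctx = pool_.front();
    pool_.pop_front();
    return ctx;
  }

  void Push(std::shared_ptr<ChannelCtx> ctx) {
    {
      std::lock_guard<std::mutex> g(mu_);
      pool_.push_back(std::move(ctx));
    }
    cv_.notify_one();
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::deque<std::shared_ptr<ChannelCtx>> pool_;
};

int main() {
  ChannelPool pool("127.0.0.1:8000", 4);  // 4 channels per pserver endpoint
  auto ctx = pool.Pop();                  // check out before sending an RPC
  // ... issue the request on ctx ...
  pool.Push(ctx);                         // return it so other senders can reuse it
}

Bounding the pool at a fixed size throttles each sender per endpoint, which mirrors the comment above about avoiding brpc's pooled connection type.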
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include // NOLINT -#include -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace paddle { -namespace operators { -namespace distributed { - -struct ChannelContext { - brpc::Channel channel; - std::shared_ptr stub; -}; - -typedef std::shared_ptr ChannelContextPtr; -typedef std::shared_ptr> - ChannelQueuePtr; - -class BRPCClient : public RPCClient { - public: - BRPCClient() {} - virtual ~BRPCClient(); - - VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline); - - VarHandlePtr AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) override; - - bool Wait() override; - - void SendComplete() override; - - private: - VarHandlePtr _AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_var_name, const 
std::string& method_name, - const std::string& table_name, int64_t time_out = FLAGS_rpc_deadline); - - void Proceed(); - ChannelQueuePtr GetChannel(const std::string& ep); - - VarHandlePtr AsyncSendComplete(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline); - - VarHandlePtr AsyncSendMessage(const std::string& ep, - const std::string& method_name, - const std::string& message, int64_t time_out); - - VarHandlePtr AsyncSendVarMessage(const std::string& ep, - const std::string& method_name, - const sendrecv::VariableMessage& req, - int64_t time_out); - - friend void HandleSendResponse(brpc::Controller* cntl, - sendrecv::VoidMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls); - - friend void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls); - - friend void HandleFetchBarrierResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, - ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, - BRPCClient* cls); - void DecreaseReqCount() { - if (--req_count_ <= 0) { - sync_cond_.notify_all(); - } - } - - private: - std::unordered_map channels_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - - static constexpr int brpc_channel_num_per_server_ = 4; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(BRPCClient); -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc deleted file mode 100644 index 94f0b9919ace83..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifdef PADDLE_WITH_BRPC_RDMA - -#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" -#include "brpc/channel.h" -#include "brpc/rdma/rdma_helper.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -RdmaMemPool& RdmaMemPool::Instance() { - static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool(); - return *g_rdma_mem_pool; -} - -void* RdmaMemPool::Find(const std::string& varname, int64_t size) { - pthread_rwlock_rdlock(&access_); - auto it = pool_.find(varname); - if (it == pool_.end()) { - pthread_rwlock_unlock(&access_); - return nullptr; - } - - auto info = it->second; - if (info.data_size != size) { - pthread_rwlock_unlock(&access_); - PADDLE_THROW(platform::errors::InvalidArgument( - "var:%s size:%ld != %ld", varname, size, info.data_size)); - return nullptr; - } - - pthread_rwlock_unlock(&access_); - return info.data; -} - -void RdmaMemPool::Register(const std::string& varname, void* data, - int64_t data_size) { - void* old = Find(varname, data_size); - if (old != nullptr) { - PADDLE_ENFORCE_EQ( - data, old, platform::errors::InvalidArgument("var:%s data:%ld != %ld", - varname, data, old)); - VLOG(7) << "Find on rdma:" << varname << " data:" << data - << " data_size:" << data_size; - return; - } - - VarInfo info; - info.data = data; - info.data_size = data_size; - - pthread_rwlock_wrlock(&access_); - pool_[varname] = info; - pthread_rwlock_unlock(&access_); - - if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) { - PADDLE_THROW(platform::errors::Unavailable( - "Register memory for RDMA failed. Register %s data: %s data size %d " - "error.", - varname, data, data_size)); - } - - VLOG(4) << "register on rdma:" << varname << " data:" << data - << " data_size:" << data_size; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle - -#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h deleted file mode 100644 index 156a93ec578471..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#ifdef PADDLE_WITH_BRPC_RDMA - -#include // NOLINT -#include -#include - -namespace paddle { -namespace operators { -namespace distributed { - -/* - * This class is used to avoid duplicated registion of brpc::rdma. 
- */ -class RdmaMemPool { - public: - static RdmaMemPool& Instance(); - RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {} - - virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); } - - void Register(const std::string& varname, void* data, int64_t size); - void* Find(const std::string& varname, int64_t size); - - private: - struct VarInfo { - void* data; - int64_t data_size; - - VarInfo() : data(nullptr), data_size(0) {} - }; - - private: - std::unordered_map pool_; - pthread_rwlock_t access_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle - -#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc deleted file mode 100644 index 411c0f36debd3b..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_NCCL -#include -#endif -#ifdef PADDLE_WITH_RCCL -#include -#endif -#include -#include -#include -#include // NOLINT - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class IOBufWriter { - public: - static void Append(const std::string& varname, butil::IOBuf* iobuf, int k, - const char* v, int64_t vlen) { - if (vlen >= std::numeric_limits::max() || vlen < 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Variable lenght is invalid. 
Variable name is %s, length is %d.", - varname, vlen)); - } - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - iobuf->append(v, vlen); - } - - static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v, - int64_t vlen, bool in_cuda_pinned, - void (*destroy)(void*), void* user_data) { - VLOG(7) << "AppendTCPZeroCopy " - << " k:" << k - << " data:" << static_cast(const_cast(v)) - << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - - // FIXME(gongwb): use append_zerocopy - /* - if (in_cuda_pinned) { - iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory); - } else { - iobuf->append_zerocopy(v, vlen, nullptr); - } - */ - iobuf->append(v, vlen); - destroy(user_data); - } - -#ifdef PADDLE_WITH_BRPC_RDMA - static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf, - int k, const char* v, int64_t vlen, - bool in_cuda_pinned, void (*destroy)(void*), - void* user_data) { - VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k - << " data:" << static_cast(const_cast(v)) - << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - - RdmaMemPool::Instance().Register( - varname, static_cast(const_cast(v)), vlen); - - // FIXME(gongwb): use append_zerocopy - // iobuf->append_zerocopy(v, vlen, nullptr); - iobuf->append(v, vlen); - destroy(user_data); - return; - } -#endif - - static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf, - int k, const char* v, int64_t vlen, - bool in_cuda_pinned, void (*destroy)(void*), - void* user_data) { - if (vlen >= std::numeric_limits::max() || vlen < 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Variable lenght is invalid. Variable name is %s, length is %d.", - varname, vlen)); - } - -#ifdef PADDLE_WITH_BRPC_RDMA - IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, - destroy, user_data); -#else - IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy, - user_data); -#endif - } -}; - -void SerializeToIOBuf(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - butil::IOBuf* iobuf, const std::string& out_varname, - bool var_is_not_stable, int trainer_id, - const std::string& table_name) { - std::unique_ptr payload; - - request->set_varname(name); - request->set_trainer_id(trainer_id); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. 
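IOBufWriter above frames each payload as a 4-byte field key, an 8-byte length, and then the raw bytes, with the zero-copy variants only changing how those bytes are attached to the IOBuf. A minimal sketch of that framing into a plain byte buffer follows, assuming native byte order as the original does; AppendField is an illustrative name.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Append one "field": 4-byte key, 8-byte length, then the payload bytes,
// matching the layout IOBufWriter::Append writes into the IOBuf.
void AppendField(std::vector<char>* buf, int32_t key, const char* data,
                 int64_t len) {
  const char* k = reinterpret_cast<const char*>(&key);
  const char* l = reinterpret_cast<const char*>(&len);
  buf->insert(buf->end(), k, k + sizeof(key));
  buf->insert(buf->end(), l, l + sizeof(len));
  buf->insert(buf->end(), data, data + len);
}

int main() {
  std::vector<char> buf;
  std::string payload = "tensor-bytes";
  AppendField(&buf, /*key=*/7, payload.data(),
              static_cast<int64_t>(payload.size()));

  // Reader side: peel off the key and length, then the payload.
  int32_t key = 0;
  int64_t len = 0;
  std::memcpy(&key, buf.data(), sizeof(key));
  std::memcpy(&len, buf.data() + sizeof(key), sizeof(len));
  std::cout << "key=" << key << " len=" << len << " payload="
            << std::string(buf.data() + sizeof(key) + sizeof(len), len) << "\n";
}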
- if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request->set_profile(platform::kEnableProfiler); - } else { - request->set_profile(platform::kDisableProfiler); - } - } - if (!out_varname.empty()) { - request->set_out_varname(out_varname); - } - if (!table_name.empty()) { - request->set_table_name(table_name); - } - if (var->IsType()) { - request->set_type(::sendrecv::LOD_TENSOR); - payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request))); - } else if (var->IsType()) { - request->set_type(::sendrecv::SELECTED_ROWS); - payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request))); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - } else if (var->IsType()) { - request->set_type(::sendrecv::NCCL_ID); - const ncclUniqueId& uid = var->Get(); - // TODO(gongwb): use append_zero to avoid data copy. - IOBufWriter::Append(name, iobuf, - sendrecv::VariableMessage::kSerializedFieldNumber, - uid.internal, NCCL_UNIQUE_ID_BYTES); - return; -#endif - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Serialize does not support type: %s", typeid(var->Type()).name())); - } - - PADDLE_ENFORCE_NOT_NULL( - payload, - platform::errors::InvalidArgument( - "Not support type: %s, need to be LOD_TENSOR or SELECTED_ROWS.", - var->Type())); - - // FIXME(gongwb): it seems that can use zero copy. - if (var_is_not_stable) { - IOBufWriter::Append( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size()); - } else { - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - IOBufWriter::AppendZeroCopy( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size(), - true, SerializeDestroyCallback, static_cast(payload.get())); - payload.release(); -#endif - } else { - IOBufWriter::AppendZeroCopy( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size(), - false, SerializeDestroyCallback, static_cast(payload.get())); - payload.release(); - } - } - - if (var->IsType()) { - auto* slr = var->GetMutable(); - PADDLE_ENFORCE_EQ(VectorElemName(slr->rows()), typeid(int64_t).name(), - platform::errors::InvalidArgument( - "Got wrong type: %s, expect type: int64_t", - VectorElemName(slr->rows()))); - size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - - IOBufWriter::Append(name, iobuf, - ::sendrecv::VariableMessage::kRowsFieldNumber, - reinterpret_cast(slr->rows().data()), - static_cast(rows_memory_size)); - } -} - -void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta, - const butil::IOBuf& iobuf, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - operators::distributed::BRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(iobuf, meta), 0, - platform::errors::InvalidArgument("parse iobuf to tensor error!")); - *var = resp.GetVar(); - *trainer_id = resp.GetTrainerId(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h deleted file mode 100644 index a5bdc331eb29c7..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" - -namespace paddle { -namespace operators { -namespace distributed { - -void SerializeToIOBuf(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - butil::IOBuf* iobuf, const std::string& out_varname, - bool var_is_not_stable, const int trainer_id = 0, - const std::string& table_name = std::string()); - -void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc deleted file mode 100644 index bcf20ad076b11f..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT - -#include "brpc/channel.h" -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/variable_response.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - butil::IOBuf iobuf; - sendrecv::VariableMessage msg; - int tensor_numel = 564 * 128; - - // serialize var to IOBuf - { - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, - "", false); - } - - // desrialize - { - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::BRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(iobuf, msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); - } -} - -void RunTestLodTensor(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - butil::IOBuf iobuf; - sendrecv::VariableMessage msg; - int tensor_numel = 512 * 8 * 4 * 2; - { - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 3, 8})); - tensor->set_lod(lod); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, - "", false); - } - - // check sendrecv::VariableMessage meta data - { - EXPECT_EQ(msg.varname(), "myvar"); - EXPECT_EQ(msg.type(), 0); - EXPECT_EQ(msg.dims()[0], 512); - EXPECT_EQ(msg.dims()[1], 8); - EXPECT_EQ(msg.dims()[2], 4); - EXPECT_EQ(msg.dims()[3], 2); - EXPECT_EQ(msg.lod_level(), 1); - EXPECT_EQ(msg.lod(0).lod_data(0), 1); - 
EXPECT_EQ(msg.lod(0).lod_data(1), 3); - EXPECT_EQ(msg.lod(0).lod_data(2), 8); - } - - // deserialize - { - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::BRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(iobuf, msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - for (int i = 0; i < tensor_numel; ++i) - EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); - } -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc deleted file mode 100644 index 5ca26f006bf20e..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.cc +++ /dev/null @@ -1,417 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" -#include -#include -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace sendrecv { - -namespace distributed = paddle::operators::distributed; - -typedef std::unordered_map - HandlerMap; - -class BRPCServiceImpl : public SendRecvService { - public: - explicit BRPCServiceImpl(const HandlerMap& rpc_call_map, - distributed::RPCServer* rpc_server) - : rpc_server_(rpc_server) { - VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size(); - auto it = rpc_call_map.find(distributed::kRequestSend); - if (it != rpc_call_map.end()) { - request_send_h_ = it->second; - send_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestSend))); - } - - it = rpc_call_map.find(distributed::kRequestGet); - if (it != rpc_call_map.end()) { - request_get_h_ = it->second; - get_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestGet))); - } - - it = rpc_call_map.find(distributed::kRequestGetNoBarrier); - if (it != rpc_call_map.end()) { - request_getnobarrier_h_ = it->second; - getnobarrier_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier))); - } - - it = rpc_call_map.find(distributed::kRequestPrefetch); - if (it != rpc_call_map.end()) { - request_prefetch_h_ = it->second; - prefetch_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); - } - - it = rpc_call_map.find(distributed::kRequestCheckpoint); - if (it != rpc_call_map.end()) { - request_checkpoint_h_ = it->second; - checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); - } - - it = rpc_call_map.find(distributed::kRequestGetMonomerVariable); - if (it != rpc_call_map.end()) { - request_get_monomer_handler_h_ = it->second; - } - - it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier); - if (it != rpc_call_map.end()) { - request_get_monomer_barrier_handler_h_ = it->second; - } - } - - virtual ~BRPCServiceImpl() {} - void SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - send_threads_->Run( - [=] { _SendVariable(cntl_butil, request, response, done); }); - } - - void _SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_send_h_, platform::errors::PreconditionNotMet( - "RequestSend handler should be registed first!")); - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - VLOG(3) << "RequestSend var_name:" << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - distributed::BRPCVariableResponse resp(request_send_h_->scope(), - request_send_h_->dev_ctx(), - request_send_h_->distributed_mode()); - PADDLE_ENFORCE_EQ( - resp.Parse(cntl->request_attachment(), *request), 0, - platform::errors::InvalidArgument("parse iobuf to tensor error!")); - - auto scope = resp.GetMutableLocalScope(); - auto invar = resp.GetVar(); - int 
trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = nullptr; - - request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id); - } - - void GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) override { - get_threads_->Run( - [=] { _GetVariable(cntl_butil, request, response, done); }); - } - - void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - getnobarrier_threads_->Run( - [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); }); - } - - void _GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_get_h_, platform::errors::PreconditionNotMet( - "RequestGet handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - std::string out_varname = request->out_varname(); - VLOG(3) << "RequestGet varname:" << varname - << ", out_varname:" << out_varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - auto scope = request_get_h_->scope(); - paddle::framework::Variable* invar = nullptr; - int trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = nullptr; - - request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - distributed::SerializeToIOBuf(out_varname, outvar, - *request_get_h_->dev_ctx(), response, - &cntl->response_attachment(), "", false); - } - } - - void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_getnobarrier_h_, - platform::errors::PreconditionNotMet( - "RequestGetNoBarrier handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - std::string out_varname = request->out_varname(); - int trainer_id = request->trainer_id(); - - VLOG(3) << "RequestGetNoBarrier varname:" << varname - << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id - << ", from:" << cntl->remote_side(); - - auto scope = request_getnobarrier_h_->scope(); - paddle::framework::Variable* invar = nullptr; - paddle::framework::Variable* outvar = nullptr; - - request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - distributed::SerializeToIOBuf( - out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response, - &cntl->response_attachment(), "", false); - } - } - - void PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - prefetch_threads_->Run( - [=] { _PrefetchVariable(cntl_butil, request, response, done); }); - } - - void _PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL(request_prefetch_h_, - platform::errors::PreconditionNotMet( - "kRequestPrefetch handler should be registed first!"); - - 
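The service implementation above keeps the brpc I/O thread free: each request type is bounced onto its own thread pool, and a brpc::ClosureGuard makes the response completion run when the handler scope ends. A minimal sketch of that dispatch-plus-scope-exit-completion shape follows, with std::async standing in for the per-request thread pools; the ClosureGuard here is a simplified stand-in, not the brpc class.

#include <functional>
#include <future>
#include <iostream>
#include <string>

// RAII stand-in for brpc::ClosureGuard: guarantees the completion callback
// runs exactly once when the handler scope ends.
class ClosureGuard {
 public:
  explicit ClosureGuard(std::function<void()> done) : done_(std::move(done)) {}
  ~ClosureGuard() {
    if (done_) done_();
  }
  ClosureGuard(const ClosureGuard&) = delete;
  ClosureGuard& operator=(const ClosureGuard&) = delete;

 private:
  std::function<void()> done_;
};

// Handler body, run off the I/O thread (std::async plays the thread pool).
void HandleSend(const std::string& varname, std::function<void()> done) {
  ClosureGuard guard(std::move(done));       // completion fires on every path
  if (varname.empty()) {
    std::cout << "reject: empty varname\n";  // early return still completes
    return;
  }
  std::cout << "stored " << varname << "\n";
}

int main() {
  auto fut = std::async(std::launch::async, HandleSend, std::string("param1"),
                        [] { std::cout << "done->Run()\n"; });
  fut.wait();
}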
brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - // prefetch process... - std::string in_var_name = request->varname(); - std::string out_var_name = request->out_varname(); - VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name - << ", out_var_name: " << out_var_name - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - distributed::BRPCVariableResponse resp( - request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true); - - PADDLE_ENFORCE_EQ(resp.Parse(cntl->request_attachment(), *request), 0, - platform::errors::InvalidArgument( - "parse iobuf to tensor error!")); - - auto scope = resp.GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - std::string table_name = request->table_name(); - int trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = scope->Var(out_var_name); - - request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - - distributed::SerializeToIOBuf(out_var_name, outvar, - *request_prefetch_h_->dev_ctx(), response, - &cntl->response_attachment(), "", true); - } - - void CheckpointNotify(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - checkpoint_notify_threads_->Run( - [=] { _CheckpointNotify(cntl_butil, request, response, done); }); - } - - void _CheckpointNotify(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_checkpoint_h_, - platform::errors::PreconditionNotMet( - "kRequestCheckpointNotify handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(), - request_checkpoint_h_->dev_ctx()); - - auto scope = resp.GetMutableLocalScope(); - - std::string checkpoint_notify = request->varname(); - std::string checkpoint_dir = request->out_varname(); - int trainer_id = request->trainer_id(); - - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir); - } - - void GetMonomerVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE_NOT_NULL( - request_get_monomer_handler_h_, - platform::errors::PreconditionNotMet( - "kRequestGetMonomerVariable handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - // proc request. 
- std::string varname = request->varname(); - VLOG(3) << "GetMonomerVariable " << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - rpc_server_->WaitVarCond(varname); - distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); - - auto scope = h.scope_; - auto invar = scope->FindVar(varname); - paddle::framework::Variable* outvar = nullptr; - - request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar, - request->trainer_id()); - - if (outvar) { - distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response, - &cntl->response_attachment(), "", false); - } - } - - void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE_NOT_NULL( - request_get_monomer_barrier_handler_h_, - platform::errors::PreconditionNotMet( - "RequestGetMonomerBarrier handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - rpc_server_->WaitVarCond(varname); - distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); - - paddle::framework::Scope* scope = nullptr; - paddle::framework::Variable* invar = nullptr; - paddle::framework::Variable* outvar = nullptr; - - request_get_monomer_barrier_handler_h_->Handle( - varname, scope, invar, &outvar, request->trainer_id()); - } - - private: - distributed::RequestHandler* request_send_h_{nullptr}; - distributed::RequestHandler* request_get_h_{nullptr}; - distributed::RequestHandler* request_getnobarrier_h_{nullptr}; - distributed::RequestHandler* request_prefetch_h_{nullptr}; - distributed::RequestHandler* request_checkpoint_h_{nullptr}; - distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; - distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr}; - - distributed::RPCServer* rpc_server_{nullptr}; - - // FIXME(gongwb): brpc should support process one rpc use one threadpool. - std::unique_ptr send_threads_; - std::unique_ptr get_threads_; - std::unique_ptr getnobarrier_threads_; - std::unique_ptr prefetch_threads_; - std::unique_ptr checkpoint_notify_threads_; -}; -} // namespace sendrecv - -namespace paddle { -namespace operators { -namespace distributed { - -void AsyncBRPCServer::StartServer() { - // Instance of your service. - sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this); - - // Add the service into server. Notice the second parameter, because the - // service is put on stack, we don't want server to delete it, otherwise - // use brpc::SERVER_OWNS_SERVICE. 
- if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Failed to add service into BRPC server.")); - return; - } - - brpc::ServerOptions options; -#ifdef PADDLE_WITH_BRPC_RDMA - options.use_rdma = true; -#endif - options.idle_timeout_sec = idle_timeout_s_; - options.max_concurrency = max_concurrency_; - if (server_.Start(bind_address_.c_str(), &options) != 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Failed to start EchoServer %s.", bind_address_)); - return; - } - - butil::EndPoint ep = server_.listen_address(); - selected_port_ = ep.port; - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - server_.Join(); -} - -void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } - -void AsyncBRPCServer::WaitServerReady() { - VLOG(3) << "AsyncGRPCServer is wait server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(3) << "AsyncGRPCServer WaitSeverReady"; -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.h b/paddle/fluid/operators/distributed/brpc/brpc_server.h deleted file mode 100644 index 78bbe5adc0813d..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT -#include // NOLINT -#include - -#include "brpc/server.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class AsyncBRPCServer final : public RPCServer { - public: - explicit AsyncBRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncBRPCServer() {} - void StartServer() override; - void WaitServerReady() override; - - private: - void ShutDownImpl() override; - - brpc::Server server_; - - static constexpr int idle_timeout_s_ = -1; - static constexpr int max_concurrency_ = 0; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - int ready_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc deleted file mode 100644 index 49521e8a77057b..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -namespace paddle { -namespace operators { -namespace distributed { - -namespace pb = ::google::protobuf; -using vr = ::sendrecv::VariableMessage; - -int BRPCVariableResponse::Parse(Source* source) { - pb::io::ZeroCopyInputStream* input_stream = source->contents(); - pb::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (1) { - unsigned int tag = 0; - if (!input.ReadLittleEndian32(&tag)) { - break; - } - - uint64_t num_bytes = 0; - if (!input.ReadLittleEndian64(&num_bytes)) { - break; - } - - int field = static_cast(tag); - int ret = field == 0 ? -1 : field; - switch (field) { - case vr::kSerializedFieldNumber: { - if (!ProcSerializedField(field, &input, num_bytes)) { - return ret; - } - break; - } - case vr::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - platform::errors::PreconditionNotMet( - "meta info should be got first!")); - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return ret; - } - break; - } - default: { - PADDLE_THROW(platform::errors::Unavailable( - "not surpported %u fieldnumber", field)); - return ret; - } - } - } - - return 0; -} -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h deleted file mode 100644 index 6282f08a725367..00000000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" - -#include "paddle/fluid/operators/distributed/distributed_pb.h" - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class BRPCSourceWrapper : public Source { - public: - explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return &source_; - } - - private: - butil::IOBufAsZeroCopyInputStream source_; -}; - -class BRPCVariableResponse : public VariableResponse { - public: - BRPCVariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : VariableResponse(scope, dev_ctx, create_scope) {} - - virtual ~BRPCVariableResponse() {} - - // parse attachment from iobuf - int Parse(Source* source) override; - int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) { - BRPCSourceWrapper wrapper(iobuf); - return VariableResponse::Parse(&wrapper, meta); - } -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc deleted file mode 100644 index fcd3e6abead510..00000000000000 --- a/paddle/fluid/operators/distributed/collective_client.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/distributed/collective_client.h" -#include -#include "gflags/gflags.h" - -DECLARE_int32(rpc_deadline); - -namespace paddle { -namespace operators { -namespace distributed { -std::once_flag CollectiveClient::init_flag_; -std::unique_ptr CollectiveClient::client_(nullptr); - -bool CollectiveClient::Gather(const std::vector& remote_vars, - std::vector* dst, - const platform::DeviceContext& ctx, - framework::Scope* scope, int64_t time_out) { - for (auto r : remote_vars) { - VLOG(50) << "begin gather from ep:" << r.String(); - scope->Var(r.var_name_)->GetMutable(); - VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable( - r.ep_, ctx, *scope, r.var_name_, time_out); - } - - rpc_client_->Wait(); - - for (auto r : remote_vars) { - auto select_rows = - scope->FindVar(r.var_name_)->GetMutable(); - dst->push_back(select_rows); - - VLOG(4) << "gather from ep:" << r.String() - << ", select_rows:" << GetSelectedRowsInfo(*select_rows); - - rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_); - } - - rpc_client_->Wait(); - return true; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h deleted file mode 100644 index e7d8bb8df98347..00000000000000 --- a/paddle/fluid/operators/distributed/collective_client.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include // NOLINT -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -class SelectedRows; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -DECLARE_int32(rpc_deadline); - -namespace paddle { -namespace operators { -namespace distributed { - -inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) { - std::stringstream ss; - ss << ", height:" << slr.height() << ", rows:["; - for (unsigned int i = 0; i < slr.rows().size(); i++) { - if (i != slr.rows().size() - 1) { - ss << slr.rows()[i] << ","; - } else { - ss << slr.rows()[i]; - } - } - ss << "], dims:" << slr.value().dims(); - return ss.str(); -} - -struct RemoteVar { - std::string ep_; - std::string var_name_; - int trainer_id_{0}; - - std::string String() { - std::stringstream ss; - ss << "ep:" << ep_ << ", var_name:" << var_name_ - << ", trainer_id:" << trainer_id_; - - return ss.str(); - } -}; - -class CollectiveClient { - public: - CollectiveClient() { - rpc_client_.reset(new RPCCLIENT_T()); - rpc_client_->InitImpl(); - } - virtual ~CollectiveClient() {} - - // note this function will retain the rank order. - bool Gather(const std::vector& remote_vars, - std::vector* dst, - const platform::DeviceContext& ctx, framework::Scope* scope, - int64_t time_out = FLAGS_rpc_deadline); - - static CollectiveClient* GetInstance() { - std::call_once(init_flag_, [&]() { - if (client_.get() == nullptr) { - client_.reset(new CollectiveClient()); - } - }); - return client_.get(); - } - - private: - std::unique_ptr rpc_client_; - - static std::once_flag init_flag_; - static std::unique_ptr client_; -}; -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc deleted file mode 100644 index cdd37742d2d5a5..00000000000000 --- a/paddle/fluid/operators/distributed/collective_server.cc +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/distributed/collective_server.h" -#include - -DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get"); - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag CollectiveServer::init_flag_; -std::shared_ptr CollectiveServer::collective_server_(nullptr); - -CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) { - VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in; - rpc_server_.reset(new RPCSERVER_T(end_point, fan_in)); -} - -void CollectiveServer::Stop() { - rpc_server_->ShutDown(); - server_thread_->join(); - loop_thread_->join(); -} - -void CollectiveServer::StartServer() { - get_monomer_handler_.reset(new GetMonomerHandler()); - get_monomer_handler_->SetRPCServer(rpc_server_.get()); - - get_barrier_handler_.reset(new GetMonomerBarrierHandler()); - get_barrier_handler_->SetRPCServer(rpc_server_.get()); - - rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable, - get_monomer_handler_.get(), - FLAGS_collective_get_thread_num); - rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier, - get_barrier_handler_.get(), 1); - - server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); })); - rpc_server_->WaitServerReady(); - - loop_thread_.reset(new std::thread([&]() { - while (true) { - if (rpc_server_->IsExit()) { - LOG(WARNING) << "get exit!rpc_processor break!"; - break; - } - sleep(1); - } - VLOG(1) << "CollectiveServer loop_thread end"; - })); -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h deleted file mode 100644 index 4964923286094a..00000000000000 --- a/paddle/fluid/operators/distributed/collective_server.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class CollectiveServer; - -class GetMonomerHandler final : public RequestHandler { - public: - GetMonomerHandler() : RequestHandler(true) {} - virtual ~GetMonomerHandler() {} - bool Handle(const std::string& var_name, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override { - VLOG(50) << "GetMonomerHandler recv " << var_name; - - *outvar = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - outvar, platform::errors::NotFound("var: %s is not found.", var_name)); - - return true; - } -}; - -class GetMonomerBarrierHandler final : public RequestHandler { - public: - GetMonomerBarrierHandler() : RequestHandler(true) {} - virtual ~GetMonomerBarrierHandler() {} - bool Handle(const std::string& var_name, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override { - VLOG(50) << "GetMonomerHandler recv " << var_name; - - rpc_server_->IncreaseVarBarrier(var_name); - - return true; - } -}; - -class CollectiveServer final { - public: - explicit CollectiveServer(const std::string& end_point, int fan_in); - - virtual ~CollectiveServer() {} - - void StartServer(); - - static CollectiveServer* GetInstance(const std::string& end_point, - int fan_in) { - std::call_once(init_flag_, [&]() { - if (collective_server_.get() == nullptr) { - collective_server_.reset(new CollectiveServer(end_point, fan_in)); - collective_server_->StartServer(); - } - }); - - return collective_server_.get(); - } - - std::shared_ptr GetRPCServer() { return rpc_server_; } - - void Stop(); - - private: - std::unique_ptr get_monomer_handler_; - std::unique_ptr get_barrier_handler_; - - std::shared_ptr rpc_server_; - std::shared_ptr server_thread_; - std::shared_ptr loop_thread_; - - bool ready_{false}; - - static std::once_flag init_flag_; - static std::shared_ptr collective_server_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc deleted file mode 100644 index 92b2eb4b51e59f..00000000000000 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/operators/distributed/collective_client.h" -#include "paddle/fluid/operators/distributed/collective_server.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace distributed = paddle::operators::distributed; - -std::unique_ptr StartServer( - const std::string& ep, int fan_in, framework::Scope* scope, - platform::DeviceContext* dev_ctx) { - distributed::CollectiveServer* server = - distributed::CollectiveServer::GetInstance(ep, fan_in); - - auto rpc_server = server->GetRPCServer(); - rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable, - scope, dev_ctx); - - std::cout << "StartServer return" << std::endl; - return std::unique_ptr(server); -} - -std::unique_ptr GenerateVars(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - framework::Scope* scope = new framework::Scope(); - framework::Variable* var = scope->Var("var1"); - auto* slr = var->GetMutable(); - slr->set_height(20000); - - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - - tensor->Resize(framework::make_ddim({3, 1024})); - tensor->mutable_data(place); - - paddle::operators::math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 3; ++i) rows->push_back(i); - - std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr); - - return std::unique_ptr(scope); -} - -void Gather(const std::vector& vars, - platform::DeviceContext* dev_ctx) { - distributed::CollectiveClient* client = - distributed::CollectiveClient::GetInstance(); - - framework::Scope* scope = new framework::Scope(); - framework::Variable* var = scope->Var("var1"); - var->GetMutable(); - - std::vector dst; - client->Gather(vars, &dst, *dev_ctx, scope); - std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]); - dev_ctx->Wait(); - - ASSERT_EQ(dst[0]->value().dims(), framework::make_ddim({3, 1024})); - ASSERT_EQ(dst[0]->height(), 20000); - ASSERT_EQ(dst[0]->rows().size(), static_cast(3)); - for (int i = 0; i < 3; i++) { - ASSERT_EQ(dst[0]->rows()[i], i); - } - - std::vector vec; - TensorToVector(dst[0]->value(), *dev_ctx, &vec); - for (size_t i = 0; i < 3 * 1024; i++) { - ASSERT_FLOAT_EQ(vec[i], 32.7); - } -} - -TEST(CollectiveServer, GPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - - platform::CUDAPlace place; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - std::string ep = "127.0.0.1:7164"; - auto scope = GenerateVars(place); - - auto* v1 = scope->FindVar("var1"); - std::cout << "var1:" << v1 << std::endl; - - auto server = StartServer(ep, 2, scope.get(), &ctx); - auto rpc_server = server->GetRPCServer(); - - distributed::RemoteVar var; - var.ep_ = ep; - var.var_name_ = "var1"; - var.trainer_id_ = 0; - - std::vector vars{var}; - Gather(vars, &ctx); - Gather(vars, &ctx); - - std::cout << "begin WaitVarBarrier" << std::endl; - rpc_server->WaitVarBarrier("var1"); - rpc_server->ClearRegisteredVars(); - server->Stop(); - - scope.release(); - server.release(); -} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc deleted file mode 100644 
index 4ee27a6414698f..00000000000000 --- a/paddle/fluid/operators/distributed/communicator.cc +++ /dev/null @@ -1,989 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/distributed/communicator.h" - -#include - -#include -#include // NOLINT -#include -#include // NOLINT -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/split.h" - -namespace paddle { -namespace operators { -namespace distributed { - -using Tree = - std::map>>; -using RpcCtxMap = operators::distributed::RpcCtxMap; - -inline double GetCurrentUS() { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; -} - -Communicator::Communicator() {} - -std::once_flag Communicator::init_flag_; -std::shared_ptr Communicator::communicator_(nullptr); - -void AsyncCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) { - send_varname_to_ctx_ = std::move(send_varname_to_ctx); - recv_varname_to_ctx_ = std::move(recv_varname_to_ctx); - recv_scope_ = std::move(recv_scope); - - if (send_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be send, will not start send_thread"; - } else { - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - if (iter.first == STEP_COUNTER && !need_global_step_) continue; - send_varname_to_queue_[iter.first] = - std::make_shared>>( - send_queue_size_); - } - send_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - if (recv_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be received, will not start recv_thread"; - } else { - recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - InitParams(); -} - -void AsyncCommunicator::InitParams() { RecvNoBarrier(); } - -AsyncCommunicator::~AsyncCommunicator() { - running_ = false; - if (main_thread_) main_thread_->join(); -} - -void AsyncCommunicator::SendGlobalStep(int batches) { - if (!need_global_step_) { - return; - } - - if (batches == 0) { - return; - } - - auto &var_name = STEP_COUNTER; - auto *out_var = send_scope_->Var(var_name); - auto *out_t = out_var->GetMutable(); - auto *data = out_t->mutable_data({1}, platform::CPUPlace()); - data[0] = static_cast(batches); - - auto &ctx = send_varname_to_ctx_.at(var_name); - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); -} - -void AsyncCommunicator::SendByCommunicator() { - std::vector> task_futures; - 
task_futures.reserve(send_varname_to_ctx_.size()); - VLOG(3) << "run send graph"; - - auto before_run_send_graph = GetCurrentUS(); - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - auto send_task = [this, &var_name, &var_queue] { - VLOG(3) << var_name << " merge and send; "; - std::vector> vars; - - int merged_var_num = 0; - int wait_times = 0; - while (merged_var_num < max_merge_var_num_) { - if (var_queue->Size() == 0) { - VLOG(4) << "wait_times -> " << wait_times; - if (wait_times >= send_wait_times_) { - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - wait_times++; - continue; - } else { - wait_times = 0; - - vars.push_back(var_queue->Pop()); - merged_var_num++; - } - } - auto before_merge = GetCurrentUS(); - if (var_name == STEP_COUNTER) { - SendGlobalStep(merged_var_num); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge and send " << merged_var_num << " " << var_name - << " use time " << after_merge - before_merge; - return; - } - - auto &ctx = send_varname_to_ctx_.at(var_name); - - MergeVars(var_name, vars, send_scope_.get(), ctx.merge_add); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << merged_var_num << " " << var_name << " use time " - << after_merge - before_merge; - - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); - auto after_send = GetCurrentUS(); - VLOG(3) << "send " << var_name << " use time " - << after_send - after_merge; - - if (var_name.rfind("@GRAD") != var_name.size() - 5) return; - - auto recv_param = var_name.substr(0, var_name.size() - 5); - if (recv_varname_to_ctx_.find(recv_param) == recv_varname_to_ctx_.end()) - return; - - auto recv_functor = distributed::ParameterRecv(); - recv_functor(recv_varname_to_ctx_.at(recv_param), *recv_scope_); - auto after_recv = GetCurrentUS(); - VLOG(3) << "recv " << recv_param << " use time " - << after_recv - after_send; - }; - task_futures.emplace_back(send_threadpool_->enqueue(std::move(send_task))); - } - for (auto &task_f : task_futures) { - task_f.wait(); - } - auto after_run_send_graph = GetCurrentUS(); - - VLOG(3) << "run send graph use time " - << (after_run_send_graph - before_run_send_graph); -} - -void HalfAsyncCommunicator::SendByCommunicator() { - std::vector> task_futures; - task_futures.reserve(send_varname_to_ctx_.size()); - VLOG(3) << "run send graph"; - - int batches = BatchesCounter(); - if (batches <= 0) return; - - auto before_run_send_graph = GetCurrentUS(); - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - auto send_task = [this, batches, &var_name, &var_queue] { - VLOG(3) << var_name << " merge and send; "; - auto before_task = GetCurrentUS(); - std::vector> vars; - vars.reserve(batches); - - for (int i = 0; i < batches; ++i) { - vars.push_back(var_queue->Pop()); - } - - if (var_name == STEP_COUNTER) { - SendGlobalStep(batches); - auto end_task = GetCurrentUS(); - VLOG(3) << "merge " << batches << " " << var_name << " use time " - << end_task - before_task; - return; - } - - auto &ctx = send_varname_to_ctx_.at(var_name); - - auto before_merge = GetCurrentUS(); - MergeVars(var_name, vars, send_scope_.get(), ctx.merge_add); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << batches << " " << var_name << " use time " - << after_merge - before_merge; - - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); - auto after_send = 
GetCurrentUS(); - VLOG(3) << "send " << var_name << " use time " - << after_send - before_task; - - if (var_name.rfind("@GRAD") != var_name.size() - 5) return; - - auto recv_param = var_name.substr(0, var_name.size() - 5); - if (recv_varname_to_ctx_.find(recv_param) == recv_varname_to_ctx_.end()) - return; - - auto recv_functor = distributed::ParameterRecv(); - recv_functor(recv_varname_to_ctx_.at(recv_param), *recv_scope_); - auto after_recv = GetCurrentUS(); - VLOG(3) << "recv " << recv_param << " use time " - << after_recv - after_send; - return; - }; - task_futures.emplace_back(send_threadpool_->enqueue(std::move(send_task))); - } - for (auto &task_f : task_futures) { - task_f.wait(); - } - auto after_run_send_graph = GetCurrentUS(); - - VLOG(3) << "run send graph use time " - << (after_run_send_graph - before_run_send_graph); -} - -void AsyncCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - SendByCommunicator(); - BarrierSend(); - } - VLOG(3) << "communicator stopped, send thread exit"; -} - -void HalfAsyncCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - SendByCommunicator(); - BarrierSend(); - RecvByCommunicator(); - BarrierRecv(); - BarrierWeakUp(); - } - VLOG(3) << "communicator stopped, send thread exit"; -} - -void AsyncCommunicator::RecvByCommunicator() { - VLOG(3) << "parallel run recv graph"; - if (!running_) return; - RecvNoBarrier(); - VLOG(3) << "run recv graph use time"; -} - -void AsyncCommunicator::RecvNoBarrier() { - std::vector> task_futures; - task_futures.reserve(recv_varname_to_ctx_.size()); - - for (auto &iter : recv_varname_to_ctx_) { - auto recv_task = [this, &iter] { - auto before_task = GetCurrentUS(); - auto &var_name = iter.first; - auto recv_functor = distributed::ParameterRecv(); - recv_functor(iter.second, *recv_scope_); - auto end_task = GetCurrentUS(); - VLOG(1) << "recv var " << var_name << " use time " - << (end_task - before_task); - }; - task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); - } - - for (auto &task : task_futures) { - task.wait(); - } -} - -void AsyncCommunicator::Start() { - VLOG(3) << "Communicator start"; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - VLOG(3) << "start send thread and recv thread"; - waiting_ = true; - running_ = true; - BarrierTriggerReset(max_merge_var_num_); - // start send and recv thread - main_thread_.reset( - new std::thread(std::bind(&AsyncCommunicator::MainThread, this))); - } -} - -void AsyncCommunicator::Stop() { - VLOG(3) << "Communicator stop"; - running_ = false; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - if (main_thread_) { - VLOG(3) << "stop send thread"; - main_thread_->join(); - main_thread_.reset(nullptr); - } - } - VLOG(3) << "Communicator stop done"; -} - -void AsyncCommunicator::Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) { - waiting_ = false; - - PADDLE_ENFORCE_EQ( - var_tables.size(), 1, - platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); - - auto table_name = var_tables[0]; - - if (table_name == STEP_COUNTER && !need_global_step_) return; - 
- auto before_send_op = GetCurrentUS(); - auto &queue = send_varname_to_queue_.at(table_name); - - if (table_name == STEP_COUNTER) { - auto tmp_var = std::make_shared(); - auto *tensor = tmp_var->GetMutable(); - tensor->Resize(framework::make_ddim({1})); - auto *out_d = tensor->mutable_data(platform::CPUPlace()); - out_d[0] = 1; - queue->Push(tmp_var); - } else { - PADDLE_ENFORCE_GE(var_names.size(), 1, - platform::errors::InvalidArgument( - "var_names.size() >= 1 is permitted")); - - auto *var = scope.FindVar(var_names[0]); - - PADDLE_ENFORCE_EQ( - var->IsInitialized(), true, - platform::errors::InvalidArgument("grad var should be inited")); - - auto tmp_var = std::make_shared(); - if (var->IsType()) { - framework::CopyVariable(*var, tmp_var.get()); - queue->Push(tmp_var); - } else if (var->IsType()) { - // push var into send queue by var_name - auto var_name = var_names[0]; - framework::CopyVariable(*var, tmp_var.get()); - queue->Push(tmp_var); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unknown var type to copy, only support LoDTensor/SelectedRows")); - } - } - auto after_send_op = GetCurrentUS(); - VLOG(3) << "send to " << table_name << " with queue size " << queue->Size() - << ", use time " << (after_send_op - before_send_op); -} - -void HalfAsyncCommunicator::Clean() { - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - while (var_queue->Size() > 0) { - var_queue->Pop(); - } - - VLOG(3) << "clean var: " << var_name << " done"; - } -} - -int HalfAsyncCommunicator::BatchesCounter() { - while (running_) { - if (barrier_counter_.load() >= barrier_trigger_.load() && - barrier_trigger_.load() != 0) { - break; - } else { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - } - - return barrier_counter_.load(); -} - -void HalfAsyncCommunicator::Barrier() { - barrier_counter_++; - - if (!running_) { - VLOG(3) << "Communicator is not running, release barrier"; - return; - } - - { - std::unique_lock lk(barrier_mutex_); - barrier_cond_.wait(lk, [this] { return (barrier_counter_ == 0); }); - } -} - -void HalfAsyncCommunicator::BarrierTriggerDecrement() { - barrier_trigger_--; - VLOG(3) << "BarrierTriggerDecrement decrement barrier trigger to " - << barrier_trigger_.load(); -} - -void HalfAsyncCommunicator::BarrierTriggerReset(int initial_val) { - barrier_trigger_.store(initial_val); - - VLOG(3) << "BarrierTriggerReset reset barrier trigger to " - << barrier_trigger_.load(); -} - -void HalfAsyncCommunicator::BarrierWeakUp() { - barrier_counter_.store(0); - barrier_cond_.notify_all(); -} - -void SyncCommunicator::BarrierSend() { - if (!running_) return; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id_); - - std::vector rets; - - for (auto &ep : pserver_endpoints_) { - rets.push_back(rpc_client->AsyncSendBatchBarrier(ep)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::External( - "internal error in RPCClient")); - } - - VLOG(4) << "BarrierSend with SyncCommunicator"; -} - -void SyncCommunicator::BarrierRecv() { - if (!running_) return; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id_); - - std::vector rets; - for (auto &ep : pserver_endpoints_) { - rets.push_back(rpc_client->AsyncSendFetchBarrier(ep)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::External( - "internal error in RPCClient")); - } 
- - VLOG(4) << "BarrierRecv with SyncCommunicator"; -} - -void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) { - send_varname_to_ctx_ = std::move(send_varname_to_ctx); - recv_varname_to_ctx_ = std::move(recv_varname_to_ctx); - recv_scope_ = std::move(recv_scope); - - PADDLE_ENFORCE_GT( - send_varname_to_ctx.size(), 0, - platform::errors::InvalidArgument("send var contexts can not be zero")); - - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - auto &varname = iter.first; - - if (varname == STEP_COUNTER) { - send_varname_to_queue_[varname] = - std::make_shared>>( - send_queue_size_); - } else { - auto &send_ctx = iter.second; - - send_var_nums_ += send_ctx.splited_varnames.size(); - if (!send_ctx.is_sparse) { - continue; - } - int pserver_num = static_cast(send_ctx.epmap.size()); - for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) { - sparse_id_queues_.insert( - std::pair>>>>( - send_ctx.splited_varnames[ep_idx], - std::make_shared< - BlockingQueue>>>( - send_queue_size_))); - } - } - } - send_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - - if (recv_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be received, will not start recv_thread"; - } else { - recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - delta_scope_.reset(new Scope()); - old_scope_.reset(new Scope()); - pserver_scope_.reset(new Scope()); - - InitParams(); -} - -void GeoCommunicator::Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) { - waiting_ = false; - PADDLE_ENFORCE_EQ( - var_tables.size(), 1, - platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); - - auto table_name = var_tables[0]; - if (table_name == STEP_COUNTER) return; - - auto before_send = GetCurrentUS(); - size_t splited_var_nums = - send_varname_to_ctx_[table_name].splited_varnames.size(); - - std::unordered_map> ids_table; - - for (size_t j = 0; j < splited_var_nums; j++) { - ids_table.insert(std::pair>( - send_varname_to_ctx_[table_name].splited_varnames[j], - std::unordered_set())); - } - auto *var = scope.FindVar(var_names[0]); - auto &rows = var->Get().rows(); - - // insert ids which has not been record - for (size_t j = 0; j < rows.size(); j++) { - auto ep_idx = rows[j] % splited_var_nums; - ids_table.at(send_varname_to_ctx_[table_name].splited_varnames[ep_idx]) - .insert(rows[j]); - } - - auto before_push = GetCurrentUS(); - for (auto &iter : ids_table) { - auto &key = iter.first; - auto &sparse_ids_set = iter.second; - auto sparse_ids_vec = std::make_shared>(); - sparse_ids_vec->assign(sparse_ids_set.begin(), sparse_ids_set.end()); - sparse_id_queues_.at(key)->Push(sparse_ids_vec); - VLOG(3) << "push " << sparse_ids_vec->size() << " ids to " << key - << "'s queue"; - } - auto after_send = GetCurrentUS(); - VLOG(3) << "run send " << table_name << " op finish. 
using " - << (before_push - before_send) << "; " << (after_send - before_push); -} - -void GeoCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - std::vector> tasks; - tasks.reserve(send_var_nums_); - - for (auto &iter : send_varname_to_ctx_) { - auto &var_name = iter.first; - auto &send_ctx = iter.second; - int pserver_num = static_cast(send_ctx.epmap.size()); - if (send_ctx.is_sparse) { - for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) { - auto send_recv_task = [this, ep_idx, &var_name] { - auto before_send_sparse = GetCurrentUS(); - if (var_name == STEP_COUNTER) { - return; - } - auto send_varname = - send_varname_to_ctx_.at(var_name).splited_varnames[ep_idx]; - auto sparse_ids = MergeSparseIds(send_varname); - if (sparse_ids.size() == 0) { - return; - } - SendSparse(var_name, ep_idx, sparse_ids); - auto after_send_sparse = GetCurrentUS(); - RecvSparse(var_name, ep_idx); - auto after_recv_sparse = GetCurrentUS(); - VLOG(3) - << "send recv " - << send_varname_to_ctx_.at(var_name).splited_varnames[ep_idx] - << " finish, using " << (after_send_sparse - before_send_sparse) - << " and " << (after_recv_sparse - after_send_sparse) - << "; total = " << (after_recv_sparse - before_send_sparse); - }; - tasks.emplace_back( - send_threadpool_->enqueue(std::move(send_recv_task))); - } - } else { - auto send_recv_task = [this, &var_name, &send_ctx] { - if (var_name == STEP_COUNTER) { - return; - } - SendDense(var_name); - RecvDense(var_name); - }; - tasks.emplace_back( - send_threadpool_->enqueue(std::move(send_recv_task))); - } - } - for (auto &task : tasks) { - task.wait(); - } - } -} - -std::vector GeoCommunicator::MergeSparseIds( - const std::string &send_varname) { - size_t merge_num = 0, wait_times = 0; - std::unordered_set sparse_ids; - while (merge_num < static_cast(max_merge_var_num_)) { - VLOG(3) << "Merge Number of " << send_varname << " = " << merge_num; - if (sparse_id_queues_.at(send_varname)->Size() > 0) { - wait_times = 0; - std::shared_ptr> pop_ids = - sparse_id_queues_.at(send_varname)->Pop(); - for (size_t j = 0; j < pop_ids->size(); j++) { - sparse_ids.insert(pop_ids->at(j)); - } - merge_num += 1; - VLOG(3) << "sparse_id_queues_(" << send_varname << ") pushed"; - } else if (sparse_id_queues_.at(send_varname)->Size() == 0) { - VLOG(3) << "wait_times -> " << wait_times; - if (wait_times >= static_cast(send_wait_times_)) { - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - wait_times++; - continue; - } - } - std::vector res; - res.assign(sparse_ids.begin(), sparse_ids.end()); - return res; -} -void GeoCommunicator::SendSparse(const std::string &varname, int ep_idx, - const std::vector &sparse_ids) { - auto &rpc_ctx = send_varname_to_ctx_.at(varname); - auto send_varname = rpc_ctx.splited_varnames[ep_idx]; - auto trainer_id = rpc_ctx.trainer_id; - auto endpoint = rpc_ctx.epmap[ep_idx]; - auto pserver_num = rpc_ctx.epmap.size(); - - auto *var_latest = recv_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - auto &t_latest = var_latest->Get(); - - auto dims1 = t_latest.dims()[1]; - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(send_varname); - auto *t_delta = var_delta->GetMutable(); - - auto *t_value = 
t_delta->mutable_value(); - t_value->mutable_data( - framework::make_ddim({static_cast(sparse_ids.size()), dims1}), - cpu_ctx.GetPlace()); - - std::vector *>> values; - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Get(sparse_ids, {"Param"}, &values); - - auto blas = math::GetBlas(cpu_ctx); - float coefficient = 1.0 / static_cast(trainers_); - - for (auto j = 0; j < static_cast(sparse_ids.size()); ++j) { - blas.VSUB(dims1, t_latest.data() + sparse_ids[j] * dims1, - values[j][0]->data(), t_value->data() + j * dims1); - blas.SCAL(dims1, coefficient, t_value->data() + j * dims1); - blas.VADD(dims1, values[j][0]->data(), t_value->data() + j * dims1, - values[j][0]->data()); - } - - std::vector send_rows; - send_rows.reserve(sparse_ids.size()); - for (auto idx : sparse_ids) { - send_rows.push_back(idx / pserver_num); - } - t_delta->set_height(rpc_ctx.height_sections[ep_idx]); - t_delta->set_rows(send_rows); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx_send = *pool.Get(platform::CPUPlace()); - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id); - - auto ret = rpc_client->AsyncSendVar(endpoint, cpu_ctx_send, - *delta_scope_.get(), send_varname); - ret->Wait(); -} - -void GeoCommunicator::SendDense(const std::string &varname) { - auto *var_latest = recv_scope_->FindVar(varname); - auto *var_timestamp = old_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - PADDLE_ENFORCE_EQ(var_timestamp->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - - auto &t_latest = var_latest->Get(); - auto t_timestamp = var_timestamp->GetMutable(); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); - t_delta->mutable_data(t_latest.dims(), cpu_ctx.GetPlace()); - - auto blas = math::GetBlas(cpu_ctx); - blas.VSUB(t_latest.numel(), t_latest.data(), - t_timestamp->data(), t_delta->data()); - - float coefficient = 1.0 / static_cast(trainers_); - blas.SCAL(t_latest.numel(), coefficient, t_delta->data()); - - blas.VADD(t_latest.numel(), t_timestamp->data(), - t_delta->data(), t_timestamp->data()); - - auto &ctx = send_varname_to_ctx_.at(varname); - auto send = distributed::ParameterSend(); - send(ctx, *delta_scope_, true, 1); -} - -void GeoCommunicator::RecvByCommunicator() { return; } - -void GeoCommunicator::RecvSparse(const std::string &varname, int ep_idx) { - auto train_id = recv_varname_to_ctx_.at(varname).trainer_id; - auto endpoint = recv_varname_to_ctx_.at(varname).epmap[ep_idx]; - auto splited_var_name = - recv_varname_to_ctx_.at(varname).splited_varnames[ep_idx]; - auto pserver_num = recv_varname_to_ctx_.at(varname).epmap.size(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx_recv = *pool.Get(platform::CPUPlace()); - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(train_id); - - auto *var_psrever = pserver_scope_->Var(splited_var_name); - auto handle = rpc_client->AsyncGetVar(endpoint, cpu_ctx_recv, - *pserver_scope_.get(), splited_var_name, - splited_var_name, splited_var_name); - handle->Wait(); - - auto *var_latest = recv_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ( - var_psrever->IsInitialized(), true, - platform::errors::Unavailable( - "%s in 
pserver scope is not initialized, please check", varname)); - - std::vector ids; - ids.assign(var_psrever->Get().rows().begin(), - var_psrever->Get().rows().end()); - - for (size_t j = 0; j < ids.size(); j++) { - ids[j] = ids[j] * pserver_num + ep_idx; - } - - VLOG(3) << "RecvSparse receive var: " << splited_var_name - << " ids Size: " << ids.size(); - - auto t_psrever = var_psrever->Get().value(); - - std::vector *>> old_values; - - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Get(ids, {"Param"}, &old_values); - - auto *t_latest = var_latest->GetMutable(); - - auto dims1 = t_latest->dims()[1]; - auto numel = ids.size() * dims1; - - std::vector v_delta; - v_delta.resize(numel); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto blas = math::GetBlas(cpu_ctx); - - for (auto j = 0; j < static_cast(ids.size()); ++j) { - blas.VSUB(dims1, t_psrever.data() + j * dims1, - old_values[j][0]->data(), v_delta.data() + j * dims1); - blas.VADD(dims1, t_latest->data() + ids[j] * dims1, - v_delta.data() + j * dims1, - t_latest->data() + ids[j] * dims1); - blas.VCOPY(dims1, t_psrever.data() + j * dims1, - old_values[j][0]->data()); - } -} - -void GeoCommunicator::RecvDense(const std::string &varname) { - auto *var_latest = recv_scope_->FindVar(varname); - auto *var_timestamp = old_scope_->FindVar(varname); - auto *var_psrever = pserver_scope_->Var(varname); - - auto &ctx = recv_varname_to_ctx_.at(varname); - auto recv = distributed::ParameterRecv(); - recv(ctx, *pserver_scope_); - - PADDLE_ENFORCE_EQ( - var_psrever->IsInitialized(), true, - platform::errors::Unavailable( - "%s in pserver scope is not initialized, please check", varname)); - - auto t_psrever = var_psrever->Get(); - auto t_latest = var_latest->GetMutable(); - auto t_timestamp = var_timestamp->GetMutable(); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); - t_delta->mutable_data(t_latest->dims(), cpu_ctx.GetPlace()); - - auto blas = math::GetBlas(cpu_ctx); - blas.VSUB(t_latest->numel(), t_psrever.data(), - t_timestamp->data(), t_delta->data()); - blas.VADD(t_latest->numel(), t_latest->data(), t_delta->data(), - t_latest->data()); - blas.VCOPY(t_latest->numel(), t_psrever.data(), - t_timestamp->data()); -} - -void GeoCommunicator::InitParams() { - std::vector> tasks; - tasks.reserve(recv_varname_to_ctx_.size()); - - for (auto &iter : recv_varname_to_ctx_) { - auto &var_name = iter.first; - auto &recv_ctx = iter.second; - - auto recv_task = [this, &var_name, &recv_ctx] { - if (!recv_ctx.is_sparse) { - InitDense(var_name); - } - }; - tasks.emplace_back(send_threadpool_->enqueue(std::move(recv_task))); - } - - for (auto &task : tasks) { - task.wait(); - } - InitSparse(); -} - -void GeoCommunicator::InitDense(const std::string varname) { - auto &ctx = recv_varname_to_ctx_.at(varname); - auto recv = distributed::ParameterRecv(); - recv(ctx, *recv_scope_); - - auto *global_var = recv_scope_->FindVar(varname); - global_var->GetMutable(); - - auto *old_var = old_scope_->Var(varname); - old_var->GetMutable(); - - framework::CopyVariable(*global_var, old_var); - VLOG(1) << "init dense variable " << varname << " done"; -} - -void GeoCommunicator::InitSparse() { - auto sparse_metas = string::split_string(sparse_attrs_, "#"); - - std::vector metas; - std::vector dicts; - - for (auto &sparse_meta : sparse_metas) { - auto attrs = string::split_string(sparse_meta, ":"); - - auto meta = distributed::SparseMeta(); - 
meta.name = attrs[0]; - meta.value_names = {"Param"}; - - auto dic = string::split_string(attrs[1], ","); - dicts.push_back(std::stoi(dic[0])); - meta.value_dims = {std::stoi(dic[1])}; - meta.mode = distributed::Mode::training; - meta.grad_name = "none"; - meta.cached_varnames = {}; - meta.initializer_attrs = string::split_string(attrs[2]); - meta.entry = "none"; - - VLOG(3) << "add sparse meta: " << meta.ToString(); - metas.push_back(meta); - } - - LargeScaleKV::Init(metas); - - for (auto &meta : metas) { - auto &ctx = recv_varname_to_ctx_.at(meta.name); - auto recv = distributed::ParameterRecv(); - - auto *global_var = recv_scope_->FindVar(meta.name); - auto global_value = global_var->Get(); - auto rows = global_value.dims()[0]; - auto dim1 = global_value.dims()[1]; - - recv(ctx, *recv_scope_); - VLOG(1) << "recv " << meta.name << " with global scope for init"; - - auto n_rows = global_var->Get().dims()[0]; - - PADDLE_ENFORCE_EQ( - rows, n_rows, - platform::errors::InvalidArgument( - "global var: %s origin dim must equal recved rows", meta.name)); - - std::vector ids(rows); - std::iota(ids.begin(), ids.end(), 0); - - auto *ins = distributed::LargeScaleKV::GetInstance(); - std::vector *>> values; - - ins->Get(meta.name)->Init(ids); - ins->Get(meta.name)->Get(ids, {"Param"}, &values); - - auto blas = math::GetBlas( - paddle::platform::CPUDeviceContext()); - - for (auto &id : ids) { - blas.VCOPY(dim1, global_value.data() + id * dim1, - values[id][0]->data()); - } - } - - VLOG(3) << "init sparse variable done"; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h deleted file mode 100644 index 4be3253d3923f8..00000000000000 --- a/paddle/fluid/operators/distributed/communicator.h +++ /dev/null @@ -1,490 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
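For reference while reading the deleted GeoCommunicator code above: SendDense and SendSparse send delta = (latest - old) / trainers to the parameter server and fold the same delta into the locally cached old copy, while RecvDense and RecvSparse add (pserver - old) to the local parameter and then overwrite old with the pserver value. Below is a minimal standalone sketch of that update rule on plain float buffers; the function names and the trainers argument are illustrative only, not Paddle API.

    #include <cstddef>
    #include <vector>

    // Send step: report the averaged local progress and advance the cached
    // "old" copy by exactly the amount reported (the VSUB/SCAL/VADD calls above).
    void GeoSendDense(const std::vector<float>& latest, std::vector<float>* old,
                      std::vector<float>* delta, int trainers) {
      delta->resize(latest.size());
      for (std::size_t i = 0; i < latest.size(); ++i) {
        (*delta)[i] = (latest[i] - (*old)[i]) / static_cast<float>(trainers);
        (*old)[i] += (*delta)[i];
      }
    }

    // Recv step: apply the server's progress since the last snapshot,
    // then remember the server copy as the new snapshot.
    void GeoRecvDense(std::vector<float>* latest, std::vector<float>* old,
                      const std::vector<float>& pserver) {
      for (std::size_t i = 0; i < latest->size(); ++i) {
        (*latest)[i] += pserver[i] - (*old)[i];
        (*old)[i] = pserver[i];
      }
    }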
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -DECLARE_bool(communicator_is_sgd_optimizer); - -namespace paddle { -namespace operators { -namespace distributed { - -using Scope = framework::Scope; -using Variable = framework::Variable; - -template -class BlockingQueue { - public: - explicit BlockingQueue(size_t capacity) : capacity_(capacity) { - PADDLE_ENFORCE_GT(capacity_, 0, - platform::errors::InvalidArgument( - "The capacity must be greater than 0.")); - } - - bool Push(const T &elem) { - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT( - queue_.size(), capacity_, - platform::errors::OutOfRange("The queue size: %s out of capacity:%s", - queue_.size(), capacity_)); - queue_.push_back(elem); - } - cv_.notify_one(); - return true; - } - - bool Push(T &&elem) { - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT( - queue_.size(), capacity_, - platform::errors::OutOfRange("The queue size: %s out of capacity:%s", - queue_.size(), capacity_)); - queue_.emplace_back(std::move(elem)); - } - cv_.notify_one(); - return true; - } - - T Pop() { - std::unique_lock lock(mutex_); - cv_.wait(lock, [=] { return !queue_.empty(); }); - T rc(std::move(queue_.front())); - queue_.pop_front(); - cv_.notify_one(); - return rc; - } - - size_t Cap() const { - std::lock_guard lock(mutex_); - return capacity_; - } - - size_t Size() const { - std::lock_guard lock(mutex_); - return queue_.size(); - } - - private: - const size_t capacity_; - std::deque queue_; - - mutable std::mutex mutex_; - std::condition_variable cv_; -}; - -template -using EigenVector = framework::EigenVector; - -template -inline void MergeVars(const std::string &var_name, - const std::vector> &vars, - Scope *scope, bool merge_add = true) { - PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument( - "vector vars are empty.")); - auto cpu_place = platform::CPUPlace(); - auto &var0 = vars[0]; - auto *out_var = scope->Var(var_name); - if (var0->IsType()) { - auto dims = var0->Get().dims(); - VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims - << "; merge add: " << merge_add; - // init output tensor - auto *out_t = out_var->GetMutable(); - out_t->mutable_data(dims, cpu_place); - // check the input dims - for (auto &var : vars) { - auto &var_t = var->Get(); - PADDLE_ENFORCE_EQ( - var_t.dims(), dims, - platform::errors::InvalidArgument("vars should have the same dims")); - } - - // set output tensor to 0. 
- auto cpu_ctx = paddle::platform::CPUDeviceContext(); - math::SetConstant constant_functor; - constant_functor(cpu_ctx, out_t, static_cast(0)); - // sum all vars to out - auto result = EigenVector::Flatten(*out_t); - for (auto &var : vars) { - auto &in_t = var->Get(); - auto in = EigenVector::Flatten(in_t); - result.device(*cpu_ctx.eigen_device()) = result + in; - } - if (!merge_add) { - result.device(*cpu_ctx.eigen_device()) = - result / static_cast(vars.size()); - } - } else if (var0->IsType()) { - auto &slr0 = var0->Get(); - auto *out_slr = out_var->GetMutable(); - out_slr->mutable_rows()->clear(); - out_slr->mutable_value()->mutable_data({{}}, cpu_place); - std::vector inputs; - inputs.reserve(vars.size()); - for (auto &var : vars) { - inputs.push_back(&var->Get()); - } - auto dev_ctx = paddle::platform::CPUDeviceContext(); - if (merge_add) { - math::scatter::MergeAdd merge_add; - merge_add(dev_ctx, inputs, out_slr); - } else { - math::scatter::MergeAverage - merge_average; - merge_average(dev_ctx, inputs, out_slr); - } - - VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() - << " dims: " << slr0.value().dims() << "; merge add: " << merge_add; - } else { - PADDLE_THROW(platform::errors::InvalidArgument("unsupported var type: %s!", - var0->Type())); - } -} - -using RpcCtxMap = std::unordered_map; -using SparseValue = std::unordered_map>; - -class Communicator { - public: - Communicator(); - - explicit Communicator(const std::map &envs_) { - for (auto &iter : envs_) { - envs[iter.first] = iter.second; - } - } - - virtual ~Communicator() {} - - virtual void Start() = 0; - - virtual void Stop() = 0; - - virtual bool IsRunning() { return running_; } - - virtual void Clean() {} - - virtual void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) = 0; - - virtual void RecvNoBarrier() {} - - virtual void Barrier() {} - - virtual void BarrierTriggerDecrement() {} - - virtual void BarrierTriggerReset(int init_counter) {} - - virtual void InitEnvs() = 0; - - virtual void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) {} - - static Communicator *GetInstance() { return communicator_.get(); } - - static std::shared_ptr GetInstantcePtr() { - return communicator_; - } - - template - static Communicator *InitInstance( - const RpcCtxMap &send_ctx, const RpcCtxMap &recv_ctx, Scope *recv_scope, - const std::map &envs) { - std::call_once(init_flag_, &Communicator::InitWithRpcCtx, send_ctx, - recv_ctx, recv_scope, std::ref(envs)); - return communicator_.get(); - } - - // Init is called by InitInstance. 
- template - static void InitWithRpcCtx(const RpcCtxMap &send_ctx, - const RpcCtxMap &recv_ctx, Scope *recv_scope, - const std::map &envs) { - if (communicator_.get() == nullptr) { - communicator_.reset(new T(std::ref(envs))); - communicator_->InitEnvs(); - communicator_->InitImpl(send_ctx, recv_ctx, recv_scope); - } - } - - protected: - bool running_ = false; - bool waiting_ = true; - static std::shared_ptr communicator_; - static std::once_flag init_flag_; - std::unordered_map envs; -}; - -class AsyncCommunicator : public Communicator { - public: - AsyncCommunicator() : Communicator() {} - - explicit AsyncCommunicator(const std::map &envs) - : Communicator(envs) {} - - ~AsyncCommunicator(); - - void InitEnvs() { - min_send_grad_num_before_recv_ = - std::stoi(envs.at("communicator_min_send_grad_num_before_recv")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - VLOG(0) << "AsyncCommunicator Initialized"; - } - - void Start() override; - - void Stop() override; - - void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) override; - - void InitParams(); - - virtual void MainThread(); - - void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) override; - - virtual void SendByCommunicator(); - virtual void SendGlobalStep(int batches); - - virtual void RecvByCommunicator(); - - virtual void RecvNoBarrier(); - - virtual void BarrierSend() {} - - virtual void BarrierRecv() {} - - virtual void BarrierWeakUp() {} - - protected: - int min_send_grad_num_before_recv_; - int thread_pool_size_; - int max_merge_var_num_; - int send_wait_times_; - int send_queue_size_; - int trainer_id_ = 0; - bool need_global_step_ = false; - - std::unordered_map>>> - send_varname_to_queue_; - RpcCtxMap send_varname_to_ctx_; - RpcCtxMap recv_varname_to_ctx_; - std::unique_ptr main_thread_{nullptr}; - Scope *recv_scope_; // should be global scope - std::unique_ptr send_scope_; // an independent scope - std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; - std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; - std::atomic_uint grad_num_{0}; // the num of gradient sent since last recv -}; - -class HalfAsyncCommunicator : public AsyncCommunicator { - public: - HalfAsyncCommunicator() {} - - explicit HalfAsyncCommunicator(const std::map &envs) - : AsyncCommunicator(envs) {} - - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - VLOG(0) << "HalfAsyncCommunicator Initialized"; - } - - void MainThread() override; - - void SendByCommunicator() override; - - void Clean() override; - - void Barrier() override; - - void BarrierTriggerDecrement() override; - - void BarrierTriggerReset(int initial_val) override; - - int BatchesCounter(); - - void BarrierWeakUp(); - - protected: - // mutex for Wait for barrier - 
std::mutex barrier_mutex_; - std::condition_variable barrier_cond_; - std::atomic barrier_trigger_{0}; - std::atomic barrier_counter_{0}; -}; - -class SyncCommunicator : public HalfAsyncCommunicator { - public: - SyncCommunicator() : HalfAsyncCommunicator() {} - - explicit SyncCommunicator(const std::map &envs) - : HalfAsyncCommunicator(envs) {} - - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - - trainer_id_ = std::stoi(envs.at("trainer_id")); - auto pserver_strings = envs.at("pserver_endpoints"); - pserver_endpoints_ = paddle::string::Split(pserver_strings, ','); - VLOG(0) << "SyncCommunicator Initialized"; - } - - void BarrierSend(); - - void BarrierRecv(); - - private: - std::vector pserver_endpoints_{}; -}; - -class GeoCommunicator : public AsyncCommunicator { - public: - GeoCommunicator() : AsyncCommunicator() {} - - explicit GeoCommunicator(const std::map &envs) - : AsyncCommunicator(envs) {} - - void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) override; - void MainThread() override; - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - - send_queue_size_ = max_merge_var_num_; - trainers_ = std::stoi(envs.at("trainers")); - sparse_attrs_ = envs.at("sparse_attrs"); - VLOG(0) << "GeoCommunicator Initialized"; - } - - void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) override; - - void SendByCommunicator() { return; } - - std::vector MergeSparseIds(const std::string &send_varname); - - void SendSparse(const std::string &varname, int ep_idx, - const std::vector &sparse_ids); - - void SendDense(const std::string &varname); - - void SendGlobalStep(int batches) override {} - - void RecvByCommunicator() override; - - void RecvSparse(const std::string &varname, int ep_idx); - - void RecvDense(const std::string &varname); - - void InitParams(); - - void InitSparse(); - - void InitDense(const std::string varname); - - private: - int trainers_; - std::string sparse_attrs_; - - // parameter for delta calc and send - std::shared_ptr delta_scope_; - - // parameter for storage the pserver param after last recv - std::shared_ptr old_scope_; - - // parameter on pserver - std::shared_ptr pserver_scope_; - - int send_var_nums_ = 0; - - std::unordered_map> old_sparses_; - - std::unordered_map< - std::string, - std::shared_ptr>>>> - sparse_id_queues_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator_common.h b/paddle/fluid/operators/distributed/communicator_common.h deleted file mode 100644 index 122d904eba27aa..00000000000000 --- a/paddle/fluid/operators/distributed/communicator_common.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -namespace paddle { -namespace operators { -namespace distributed { - -struct CommContext { - CommContext() = default; - - CommContext(const std::string &name, const std::vector &names, - const std::vector &emap, - const std::vector §ions, - const std::vector &origin_names, int id, - bool merge_add_ = true, bool is_sparse_ = true, - bool is_distributed_ = false) - : var_name(name), - splited_varnames(names), - epmap(emap), - height_sections(sections), - origin_varnames(origin_names), - trainer_id(id), - merge_add(merge_add_), - is_sparse(is_sparse_), - is_distributed(is_distributed_) {} - - CommContext(const CommContext &ctx) { - var_name = ctx.var_name; - splited_varnames = ctx.splited_varnames; - epmap = ctx.epmap; - height_sections = ctx.height_sections; - trainer_id = ctx.trainer_id; - merge_add = ctx.merge_add; - is_sparse = ctx.is_sparse; - origin_varnames = ctx.origin_varnames; - is_distributed = ctx.is_distributed; - } - - std::string print() const { - std::stringstream ss; - - ss << "varname: " << var_name << " trainer_id: " << trainer_id << " "; - - for (size_t i = 0; i < splited_varnames.size(); i++) { - ss << "slice varname: " << splited_varnames[i] << " ep: " << epmap[i] - << " section: " << height_sections[i] << " "; - } - - ss << "origin varnames: "; - for (size_t i = 0; i < origin_varnames.size(); i++) { - ss << origin_varnames[i] << " "; - } - - ss << " aggregation->add: " << merge_add << " "; - ss << " is_sparse: " << is_sparse << "\n"; - ss << " is_distributed: " << is_distributed << "\n"; - - return ss.str(); - } - - std::string var_name; - std::vector splited_varnames; - std::vector epmap; - std::vector height_sections; - std::vector origin_varnames; - int trainer_id; - bool merge_add; - bool is_sparse; - bool is_distributed; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc deleted file mode 100644 index 38b7c8b00317e6..00000000000000 --- a/paddle/fluid/operators/distributed/communicator_test.cc +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "paddle/fluid/operators/distributed/communicator.h" - -namespace paddle { -namespace operators { -namespace distributed { - -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; - -TEST(communicator, merge_lod_tensors) { - auto cpu_place = platform::CPUPlace(); - auto dims = framework::make_ddim({2, 3}); - std::vector> in_vars; - float out_value = 0; - for (auto i = 0; i < 10; ++i) { - auto var = std::make_shared(); - in_vars.emplace_back(var); - auto *tensor = var->GetMutable(); - auto *data = tensor->mutable_data(dims, cpu_place); - for (auto j = 0; j < tensor->numel(); ++j) { - data[j] = static_cast(i); - } - out_value += static_cast(i); - } - const std::string out_name = "Out"; - std::unique_ptr scope; - scope.reset(new framework::Scope()); - scope->Var(out_name); - for (auto i = 0; i < 10; ++i) { - MergeVars(out_name, in_vars, scope.get()); - } - auto &out_tensor = scope->FindVar(out_name)->Get(); - auto *out_data = out_tensor.data(); - ASSERT_EQ(out_tensor.dims(), dims); - for (auto i = 0; i < out_tensor.numel(); ++i) { - ASSERT_EQ(out_data[i], out_value); - } -} - -TEST(communicator, merge_selected_rows) { - auto cpu_place = platform::CPUPlace(); - int64_t width = 10; - std::vector> in_vars; - const int64_t height = 100; - for (auto i = 0; i < 10; ++i) { - std::vector rows; - for (auto k = 0; k <= i; ++k) { - rows.push_back(k); - } - auto var = std::make_shared(); - in_vars.emplace_back(var); - auto *slr = var->GetMutable(); - slr->set_height(height); - slr->set_rows(rows); - auto dims = - framework::make_ddim({static_cast(rows.size()), width}); - auto *data = slr->mutable_value()->mutable_data(dims, cpu_place); - for (size_t i = 0; i < rows.size(); ++i) { - for (auto j = 0; j < width; ++j) { - data[i * width + j] = static_cast(rows[i]); - } - } - } - const std::string out_name = "Out"; - std::unique_ptr scope; - scope.reset(new framework::Scope()); - scope->Var(out_name); - for (auto i = 0; i < 10; ++i) { - MergeVars(out_name, in_vars, scope.get()); - } - auto &out_slr = scope->FindVar(out_name)->Get(); - auto &out_t = out_slr.value(); - auto *out_data = out_t.data(); - ASSERT_EQ(out_t.dims(), framework::make_ddim({10, width})); - std::vector out_values; - out_values.reserve(10); - for (auto i = 0; i < 10; ++i) { - out_values.push_back(static_cast(i * (10 - i))); - } - for (size_t i = 0; i < out_slr.rows().size(); ++i) { - ASSERT_EQ(out_slr.rows()[i], static_cast(i)); - for (auto j = 0; j < width; ++j) { - ASSERT_EQ(out_data[i * width + j], out_values[i]); - } - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/distributed.h b/paddle/fluid/operators/distributed/distributed.h deleted file mode 100644 index 5917c18fb0d201..00000000000000 --- a/paddle/fluid/operators/distributed/distributed.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
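The two tests deleted above pin down the MergeVars semantics from the removed communicator.h: ten LoDTensor inputs filled with the constants 0 through 9 sum to 45 in every element, and for SelectedRows row i appears in 10 - i of the inputs with value i, so it merges to i * (10 - i). A minimal sketch of the dense sum-or-average behaviour on plain vectors, illustrative only and using no Paddle types:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Sum equally-shaped dense inputs element-wise; divide by the number of
    // inputs when merge_add is false (the "average the gradients" path).
    std::vector<float> MergeDense(const std::vector<std::vector<float>>& ins,
                                  bool merge_add = true) {
      std::vector<float> out(ins.front().size(), 0.0f);
      for (const auto& in : ins) {
        assert(in.size() == out.size());  // all inputs must share one shape
        for (std::size_t i = 0; i < in.size(); ++i) out[i] += in[i];
      }
      if (!merge_add) {
        for (auto& v : out) v /= static_cast<float>(ins.size());
      }
      return out;
    }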
-
-#pragma once
-
-#ifdef PADDLE_WITH_DISTRIBUTE
-
-#ifdef PADDLE_WITH_GRPC
-#include "paddle/fluid/operators/distributed/communicator.h"
-
-#include "paddle/fluid/operators/distributed/grpc/grpc_client.h"
-#include "paddle/fluid/operators/distributed/grpc/grpc_server.h"
-#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer
-#define RPCCLIENT_T paddle::operators::distributed::GRPCClient
-
-#else  // PADDLE_WITH_GRPC
-
-#include "paddle/fluid/operators/distributed/brpc/brpc_client.h"
-#include "paddle/fluid/operators/distributed/brpc/brpc_server.h"
-#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer
-#define RPCCLIENT_T paddle::operators::distributed::BRPCClient
-
-#endif  // PADDLE_WITH_GRPC
-
-#endif  // PADDLE_WITH_DISTRIBUTE
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
deleted file mode 100644
index 7d6756b41363d1..00000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// NOTE: This file was originally created by tensorflow
-// (https://github.com/tensorflow/tensorflow/) we borrow this
-// file and did some modifications so that we can send gRPC
-// requests without too much copying of the tensor data.
-
-#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h"
-
-namespace grpc {
-class ByteBuffer;
-}  // namespace grpc
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-GrpcByteBufferSource::GrpcByteBufferSource() {}
-
-bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
-  cur_ = -1;
-  left_ = 0;
-  ptr_ = nullptr;
-  byte_count_ = 0;
-  bool ok = src.Dump(&slices_).ok();
-  if (!ok) {
-    slices_.clear();
-  }
-  return ok;
-}
-
-bool GrpcByteBufferSource::Next(const void** data, int* size) {
-  // Use loop instead of if in case buffer contained empty slices.
-  while (left_ == 0) {
-    // Advance to next slice.
-    cur_++;
-    if (cur_ >= slices_.size()) {
-      return false;
-    }
-    const ::grpc::Slice& s = slices_[cur_];
-    left_ = s.size();
-    ptr_ = reinterpret_cast<const char*>(s.begin());
-  }
-
-  *data = ptr_;
-  *size = left_;
-  byte_count_ += left_;
-  ptr_ += left_;
-  left_ = 0;
-  return true;
-}
-
-void GrpcByteBufferSource::BackUp(int count) {
-  ptr_ -= count;
-  left_ += count;
-  byte_count_ -= count;
-}
-
-bool GrpcByteBufferSource::Skip(int count) {
-  const void* data;
-  int size;
-  while (Next(&data, &size)) {
-    if (size >= count) {
-      BackUp(size - count);
-      return true;
-    }
-    // size < count;
-    count -= size;
-  }
-  // error or we have too large count;
-  return false;
-}
-
-google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
-  return byte_count_;
-}
-
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
deleted file mode 100644
index 486870de7a554e..00000000000000
--- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-// NOTE: This file was originally created by tensorflow
-// (https://github.com/tensorflow/tensorflow/) we borrow this
-// file and did some modifications so that we can send gRPC
-// requests without too much copying of the tensor data.
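GrpcByteBufferSource, whose implementation was deleted above and whose declaration is removed below, follows protobuf's ZeroCopyInputStream contract: Next() yields one internal buffer at a time, BackUp(n) hands the last n bytes back unread, and Skip(count) can therefore be written as repeated Next() calls followed by one BackUp() of the excess. A small self-contained model of that contract over in-memory chunks, a sketch rather than the protobuf or gRPC classes:

    #include <cstddef>
    #include <string>
    #include <utility>
    #include <vector>

    // Minimal stand-in for a zero-copy input stream backed by a chunk list.
    class ChunkSource {
     public:
      explicit ChunkSource(std::vector<std::string> chunks)
          : chunks_(std::move(chunks)) {}

      // Yield the next unread region, usually a whole chunk.
      bool Next(const void** data, int* size) {
        if (backup_ > 0) {  // re-yield bytes that were handed back
          const std::string& c = chunks_[cur_ - 1];
          *data = c.data() + c.size() - backup_;
          *size = static_cast<int>(backup_);
          backup_ = 0;
          return true;
        }
        if (cur_ >= chunks_.size()) return false;
        const std::string& c = chunks_[cur_++];
        *data = c.data();
        *size = static_cast<int>(c.size());
        return true;
      }

      // Hand back the last `count` bytes of the region just yielded.
      void BackUp(int count) { backup_ = static_cast<std::size_t>(count); }

      // Same shape as GrpcByteBufferSource::Skip above.
      bool Skip(int count) {
        const void* data;
        int size;
        while (Next(&data, &size)) {
          if (size >= count) {
            BackUp(size - count);
            return true;
          }
          count -= size;
        }
        return false;  // ran out of data before skipping `count` bytes
      }

     private:
      std::vector<std::string> chunks_;
      std::size_t cur_ = 0;
      std::size_t backup_ = 0;
    };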
- -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "grpc++/grpc++.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -struct grpc_byte_buffer; - -namespace grpc { -// A ZeroCopyInputStream that reads from grpc_byte_buffer -class ByteBuffer; - -class GrpcBufferReader final - : public ::google::protobuf::io::ZeroCopyInputStream { - typedef void (CoreCodegenInterface::*OldReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - typedef int (CoreCodegenInterface::*NewReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - (g_core_codegen_interface->*ptr)(reader, buffer); - } - void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - int result = (g_core_codegen_interface->*ptr)(reader, buffer); - (void)result; - } - - public: - explicit GrpcBufferReader(grpc_byte_buffer* buffer) - : byte_count_(0), backup_count_(0) { - ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_, - buffer); - } - ~GrpcBufferReader() override { - g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_); - } - - bool Next(const void** data, int* size) override { - if (backup_count_ > 0) { - *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) - - backup_count_; - GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX); - *size = static_cast(backup_count_); - backup_count_ = 0; - return true; - } - if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_, - &slice_)) { - return false; - } - g_core_codegen_interface->grpc_slice_unref(slice_); - *data = GRPC_SLICE_START_PTR(slice_); - // On win x64, int is only 32bit - GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX); - byte_count_ += * size = static_cast(GRPC_SLICE_LENGTH(slice_)); - return true; - } - - void BackUp(int count) override { backup_count_ = count; } - - bool Skip(int count) override { - const void* data; - int size; - while (Next(&data, &size)) { - if (size >= count) { - BackUp(size - count); - return true; - } - // size < count; - count -= size; - } - // error or we have too large count; - return false; - } - - ::google::protobuf::int64 ByteCount() const override { - return byte_count_ - backup_count_; - } - - private: - int64_t byte_count_; - int64_t backup_count_; - grpc_byte_buffer_reader reader_; - grpc_slice slice_; -}; - -}; // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -// A ZeroCopyInputStream that reads from a grpc::ByteBuffer. -class GrpcByteBufferSource - : public ::google::protobuf::io::ZeroCopyInputStream { - public: - GrpcByteBufferSource(); - bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times. - bool Next(const void** data, int* size) override; - void BackUp(int count) override; - bool Skip(int count) override; - ::google::protobuf::int64 ByteCount() const override; - - private: - std::vector<::grpc::Slice> slices_; - size_t cur_; // Current slice index. - int left_; // Number of bytes in slices_[cur_] left to yield. - const char* ptr_; // Address of next byte in slices_[cur_] to yield. 
- ::google::protobuf::int64 byte_count_; -}; - -class GrpcByteBufferSourceWrapper : public Source { - public: - explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) - : source_(source) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return source_; - } - - private: - GrpcByteBufferSource* source_; -}; - -class GrpcByteSource : public Source { - public: - explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {} - ~GrpcByteSource() override { DeleteStream(); } - - typedef ::grpc::GrpcBufferReader Reader; - - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - DeleteStream(); - stream_ = new (&space_) Reader(buffer_); - return stream_; - } - - private: - void DeleteStream() { - if (stream_) { - stream_->~Reader(); - } - } - - grpc_byte_buffer* buffer_; // Not owned - Reader* stream_ = nullptr; // Points into space_ if non-nullptr - char space_[sizeof(Reader)]; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc deleted file mode 100644 index 97a9c14e4f1850..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ /dev/null @@ -1,671 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "glog/logging.h" // For VLOG -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_int32(rpc_client_threads, 2, ""); -DECLARE_bool(rpc_disable_reuse_port); - -namespace paddle { -namespace operators { -namespace distributed { - -void GRPCClient::InitImpl() { - // start the client process thread - // TODO(wuyi): can make this in a threadpool - client_threads_.resize(FLAGS_rpc_client_threads); - for (int i = 0; i < FLAGS_rpc_client_threads; i++) { - client_threads_[i].reset( - new std::thread(std::bind(&GRPCClient::Proceed, this))); - } -} - -void GRPCClient::SendComplete() { - std::unique_lock lk(completed_mutex_); - if (!completed_) { - for (auto& it : channels_) { - VLOG(3) << "send complete message to " << it.first; - this->AsyncSendComplete(it.first); - } - PADDLE_ENFORCE_EQ(this->Wait(), true, platform::errors::PreconditionNotMet( - "internal grpc service error.")); - completed_ = true; - } -} - -GRPCClient::~GRPCClient() { - stopped_ = true; - Wait(); - cq_.Shutdown(); - { - std::lock_guard guard(chan_mutex_); - for (auto& it : channels_) { - it.second.reset(); - } - channels_.clear(); - } - for (size_t i = 0; i < client_threads_.size(); i++) - client_threads_[i]->join(); -} - -VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kSendRPC; - - int retry_times_ = 0; - - while (true) { - SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, p_scope, p_ctx, s, method, h, this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = nullptr; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -void ProcGetResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& ret_msg) { - VLOG(4) << "ProcGetResponse"; - framework::Variable* outvar = nullptr; - // get response's trainer_id is not used - int trainer_id; - DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, - &trainer_id); -} - -void ProcGetRecvResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& 
ret_msg) { - VLOG(4) << "ProcGetRecvResponse"; - framework::Variable* outvar = nullptr; - int trainer_id; - DeserializeRecvFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, - &trainer_id); -} - -template -void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { - ::grpc::Slice slice(proto.ByteSizeLong()); - proto.SerializeWithCachedSizesToArray(const_cast(slice.begin())); - ::grpc::ByteBuffer tmp(&slice, 1); - result->Swap(&tmp); -} - -VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, - "/sendrecv.SendRecvService/GetVariable", table_name, - time_out); -} - -VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, int64_t time_out) { - std::string var_name_no_barrier = - string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); - - return _AsyncGetVar( - ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, - "/sendrecv.SendRecvService/GetVariableNoBarrier", "", time_out); -} - -VarHandlePtr GRPCClient::AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, - "/sendrecv.SendRecvService/GetMonomerVariable", "", - time_out); -} - -VarHandlePtr GRPCClient::_AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& method, - const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const std::string out_varname_val = out_varname; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - int retry_times_ = 0; - - while (true) { - GetProcessor* s = new GetProcessor(ch); - - VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, out_varname_val, table_name_val, s, method, - p_ctx, h, rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - req.set_table_name(table_name_val); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - 
std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - const std::string method = kPrefetchRPC; - int retry_times_ = 0; - - while (true) { - GetProcessor* s = new GetProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - s->Prepare(h, kPrefetchTimeout); - - auto* var = p_scope->FindVar(in_var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val, - 0, table_name_val); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, static_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = kBatchBarrierRPC; - VarHandlePtr h( - new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(BATCH_BARRIER_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - const std::string method = kFetchBarrierRPC; - VarHandlePtr h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, - const std::string& var_name, - int64_t time_out) { - const auto ch = GetChannel(ep); - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - 
const std::string method = kSendMonomerFetchBarrierRPC; - VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); - s->Prepare(h, time_out); - - VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; - - sendrecv::VariableMessage req; - req.set_varname(var_name); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = kSendCompleteRPC; - VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_trainer_id(trainer_id_); - req.set_varname(COMPLETE_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dirname, - const std::string& varname, - const int mode, - int64_t time_out) { - const auto ch = GetChannel(ep); - - CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - - const std::string method = kCheckPointNotifyRPC; - - VarHandlePtr h( - new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(varname); - req.set_table_name(std::to_string(mode)); - req.set_out_varname(dirname); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kRequestNotify; - - SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, p_scope, p_ctx, s, method, h, this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = nullptr; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/DistributeNotify", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - }); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendAndRecv(const std::string& ep, - 
const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& send_var_name, - const std::string& recv_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string send_var_name_val = send_var_name; - const std::string recv_var_name_val = recv_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kSendAndRecvRPC; - VLOG(4) << "GRPCClient::SendAndRecv Begin ,Send_var_name: " - << send_var_name_val << " Recv_var_name: " << recv_var_name_val; - int retry_times_ = 0; - - while (true) { - SendAndRecvProcessor* s = new SendAndRecvProcessor(ch); - VarHandlePtr h( - new VarHandle(ep, method, send_var_name_val, p_ctx, p_scope)); - VarHandlePtr h_recv( - new VarHandle(ep, method, recv_var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - s->RecvPrepare(h_recv); - - framework::Async([send_var_name_val, recv_var_name_val, table_name_val, - p_scope, p_ctx, s, method, h, this] { - auto* send_var = p_scope->FindVar(send_var_name_val); - send_var->GetMutable()->set_lod({}); - ::grpc::ByteBuffer buf; - VLOG(4) << "SerializeToByteBuffer: send_var_name_val: " - << send_var_name_val - << " recv_var_name_val: " << recv_var_name_val; - SerializeToByteBuffer(send_var_name_val, send_var, *p_ctx, &buf, - recv_var_name_val, trainer_id_, table_name_val); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetRecvResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendAndRecvVariable", - buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -bool GRPCClient::Wait() { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); }); - return ok_; -} - -inline bool ShouldRetry(const std::string& method, int error_code) { - if (method == kPrefetchRPC) { - return true; - } - - if (error_code == grpc::StatusCode::DEADLINE_EXCEEDED) { - return true; - } - - return false; -} - -void GRPCClient::Proceed() { - void* tag = nullptr; - bool ok = false; - - VLOG(3) << "GRPCClient Proceed begin"; - while (!stopped_ && cq_.Next(&tag, &ok)) { - BaseProcessor* c = static_cast(tag); - GPR_ASSERT(ok); - PADDLE_ENFORCE_NOT_NULL( - c, platform::errors::PreconditionNotMet("Make BaseProcessor failed.")); - - if (c->status_.ok()) { - VLOG(3) << c->GetVarHandlePtr()->String() << " process"; - c->Process(); - } else if (ShouldRetry(c->GetVarHandlePtr()->method(), - c->status_.error_code())) { - VLOG(0) << c->GetVarHandlePtr()->String() - << " meets grpc error, error_code:" << c->status_.error_code() - << " error_message:" << c->status_.error_message() - << " error_details:" << c->status_.error_details() - << " should retry!"; - c->GetVarHandlePtr()->should_retry = true; - c->Finish(false); - } else { - 
PADDLE_THROW(platform::errors::External( - "%s meets grpc error, error_code is %d, error message is %s, error " - "details is %s.", - c->GetVarHandlePtr()->String(), c->status_.error_code(), - c->status_.error_message(), c->status_.error_details())); - c->Finish(false); - } - - bool notify = false; - { - std::lock_guard lk(sync_mutex_); - req_count_--; - notify = (req_count_ <= 0 || !c->status_.ok()); - } - - delete c; - - if (notify) { - sync_cond_.notify_all(); - } - } - - // Last log message - // Avoid using VLOG() and LOG(): in the destructor of google::LogMessage() a - // static Mutex log_mutex is used for synchronization, which might have been - // destructed at this moment. - if (FLAGS_v >= 3) { - std::string msg("GRPCClient Proceed end"); - fwrite(msg.c_str(), msg.length(), 1, stderr); - } -} - -std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { - std::lock_guard guard(chan_mutex_); - auto it = channels_.find(ep); - if (it != channels_.end()) { - return it->second; - } - - // Channel configurations: - grpc::ChannelArguments args; - args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000); - if (FLAGS_rpc_disable_reuse_port) { - args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); - } - args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); - args.SetMaxSendMessageSize(std::numeric_limits::max()); - args.SetMaxReceiveMessageSize(std::numeric_limits::max()); - - auto ch = - grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args); - channels_[ep] = ch; - return ch; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h deleted file mode 100644 index 5885f944b60a15..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.h +++ /dev/null @@ -1,321 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
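Every Async* method of the GRPCClient deleted above shares one retry shape: issue the RPC, and if FLAGS_rpc_retry_times allows another attempt and ShouldRetry marked the handle (prefetch requests and DEADLINE_EXCEEDED errors), sleep a few random milliseconds and reissue. A stripped-down sketch of that bounded retry-with-jitter loop; SendWithRetry and send_once are placeholder names, not Paddle API:

    #include <chrono>
    #include <functional>
    #include <random>
    #include <thread>

    // Retry `send_once` up to `max_retries` extra times, sleeping a random
    // 0-4 ms between attempts, mirroring the client loop above.
    bool SendWithRetry(const std::function<bool()>& send_once, int max_retries) {
      std::random_device rd;
      for (int retry = 0;; ++retry) {
        if (send_once()) return true;            // RPC finished successfully
        if (retry >= max_retries) return false;  // give up, surface the error
        std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5));
      }
    }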
*/ - -#pragma once - -#include -#include -#include // NOLINT -#include // NOLINT -#include -#include -#include -#include -#include -#include // NOLINT -#include -#include // NOLINT -#include -#include - -#include "grpc++/channel.h" -#include "grpc++/generic/generic_stub.h" -#include "grpc++/grpc++.h" -#include "grpc++/support/byte_buffer.h" -#include "grpc++/support/slice.h" -#include "grpc/support/log.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace grpc { -class Channel; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -void ProcGetRecvResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -class BaseProcessor { - public: - BaseProcessor() { context_ = nullptr; } - - virtual ~BaseProcessor() {} - - virtual void Prepare(VarHandlePtr h, int64_t time_out) { - var_h_ = h; - - context_.reset(new grpc::ClientContext()); - context_->set_wait_for_ready(true); - if (time_out) { - std::chrono::system_clock::time_point deadline = - std::chrono::system_clock::now() + - std::chrono::milliseconds(time_out); - context_->set_deadline(deadline); - } - } - - void Process() { - ProcessImpl(); - var_h_->Finish(true); - } - - VarHandlePtr GetVarHandlePtr() { return var_h_; } - bool Wait() { return var_h_->Wait(); } - void Finish(bool ok) { return var_h_->Finish(ok); } - virtual void ProcessImpl() = 0; - - std::unique_ptr context_; - grpc::Status status_; - - protected: - VarHandlePtr var_h_; -}; - -typedef std::function - RequestSendCallBack; - -class SendProcessor : public BaseProcessor { - public: - explicit SendProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~SendProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_.get(), reply_); - } - } - - ::grpc::GenericStub stub_g_; - ::grpc::ByteBuffer reply_; - RequestSendCallBack response_call_back_ = nullptr; -}; - -typedef std::function - RequestGetCallBack; - -class GetProcessor : public BaseProcessor { - public: - explicit GetProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~GetProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_.get(), reply_); - } - } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; -}; - -class SendAndRecvProcessor : public BaseProcessor { - public: - explicit SendAndRecvProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~SendAndRecvProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_recv_.get(), reply_); - var_h_recv_->Finish(true); - } - } - - void RecvPrepare(VarHandlePtr h_recv) { 
var_h_recv_ = h_recv; } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; - VarHandlePtr var_h_recv_; -}; - -class BatchBarrierProcessor : public BaseProcessor { - public: - explicit BatchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~BatchBarrierProcessor() {} - - void ProcessImpl() override {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class FetchBarrierProcessor : public BaseProcessor { - public: - explicit FetchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~FetchBarrierProcessor() {} - - void ProcessImpl() override {} - sendrecv::VariableMessage reply_; - std::unique_ptr stub_; -}; - -class CheckpointNotifyProcessor : public BaseProcessor { - public: - explicit CheckpointNotifyProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~CheckpointNotifyProcessor() {} - - void ProcessImpl() override {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class GRPCClient : public RPCClient { - public: - GRPCClient() : ok_(true), completed_(false), stopped_(false) {} - virtual ~GRPCClient(); - - VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) override; - - VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendAndRecv(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& send_var_name, - const std::string& recv_var_name, - const std::string& table_name = "", 
- int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendComplete( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - bool Wait() override; - - void SendComplete() override; - - void InitImpl() override; - - private: - void Proceed(); - - std::shared_ptr GetChannel(const std::string& ep); - VarHandlePtr _AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& method, - const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline); - - private: - grpc::CompletionQueue cq_; - std::unordered_map> channels_; - std::vector> client_threads_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - bool ok_; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(GRPCClient); - - // mutex for sending complete message only once - std::mutex completed_mutex_; - bool completed_; - - volatile bool stopped_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc deleted file mode 100644 index 0fc9b695779149..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_NCCL -#include -#endif -#ifdef PADDLE_WITH_RCCL -#include -#endif -#include -#include -#include "grpcpp/impl/codegen/byte_buffer.h" -#include "grpcpp/impl/codegen/slice.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, const std::string& out_name, - const int trainer_id, - const std::string& table_name) { - platform::RecordRPCEvent record_event("serial"); - VarMsg request; - TensorPayload* payload = nullptr; - - request.set_varname(name); - request.set_trainer_id(trainer_id); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). 
It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. - if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request.set_profile(platform::kEnableProfiler); - } else { - request.set_profile(platform::kDisableProfiler); - } - } - if (!out_name.empty()) { - request.set_out_varname(out_name); - } - if (!table_name.empty()) { - request.set_table_name(table_name); - } - if (var->IsType()) { - request.set_type(::sendrecv::LOD_TENSOR); - payload = new TensorPayload(GetTensorPayload(var, ctx, &request)); - } else if (var->IsType()) { - request.set_type(::sendrecv::SELECTED_ROWS); - payload = new TensorPayload(GetSelectedRowsPayload(var, ctx, &request)); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - } else if (var->IsType()) { - request.set_type(::sendrecv::NCCL_ID); -#endif - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Serialize does not support type: %s", typeid(var->Type()).name())); - } - std::string header; - request.AppendToString(&header); - auto buffer = std::unique_ptr(new char[1024]); - void* buf = buffer.get(); - ProtoEncodeHelper e(static_cast(buf), 1024); - e.WriteRawBytes(std::string(header.data(), header.size())); -// NCCLID is copied directly to the message, return bytebuffer -// with only one slice if serializing NCCLID. -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (var->IsType()) { - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - NCCL_UNIQUE_ID_BYTES); - const ncclUniqueId& uid = var->Get(); - e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES)); - - // for serialize NCCL_ID - ::grpc::Slice slices(e.size()); - memcpy(const_cast(slices.begin()), e.data(), e.size()); - ::grpc::ByteBuffer tmp(&slices, 1); - msg->Swap(&tmp); - return; - } -#endif - PADDLE_ENFORCE_NOT_NULL( - payload, - platform::errors::InvalidArgument( - "Not support type: %s, need to be LOD_TENSOR or SELECTED_ROWS", - var->Type())); - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - payload->memory_size()); - if (payload->memory_size() >= std::numeric_limits::max()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Variable %s length %d should less than %d.", name, - payload->memory_size(), std::numeric_limits::max())); - } - // steal reference of tensor data - ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows - int num_slices = 2; // only SelectedRows have rows buffer - slices[0] = ::grpc::Slice(e.size()); - memcpy(const_cast(slices[0].begin()), e.data(), e.size()); - slices[1] = ::grpc::Slice( - grpc_slice_new_with_user_data(payload->ptr(), payload->memory_size(), - SerializeDestroyCallback, payload), - ::grpc::Slice::STEAL_REF); - - if (var->IsType()) { - auto* slr = var->GetMutable(); - ProtoEncodeHelper e2(static_cast(buf), 128); - - PADDLE_ENFORCE_EQ(VectorElemName(slr->rows()), typeid(int64_t).name(), - platform::errors::InvalidArgument( - "Got wrong type %s, expect type: int64_t", - VectorElemName(slr->rows()))); - size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - - e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); - slices[2] = ::grpc::Slice(e2.size()); - memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); - - slices[3] = ::grpc::Slice( - grpc_slice_new_with_user_data( - const_cast( - reinterpret_cast(slr->rows().data())), - rows_memory_size, [](void* backing) {}, - const_cast( - reinterpret_cast(slr->rows().data()))), - ::grpc::Slice::STEAL_REF); - num_slices = 4; - } - 
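The slices above hand the tensor and rows buffers to gRPC by reference rather than copying them into the message body. A minimal standalone sketch of that zero-copy wrapping, assuming a heap-allocated buffer payload of len bytes (hypothetical names, not taken from this file):

#include <cstddef>
#include <cstdlib>

#include "grpcpp/impl/codegen/byte_buffer.h"
#include "grpcpp/impl/codegen/slice.h"

// Wrap an existing buffer in a ByteBuffer without copying it. The destroy
// callback runs only after gRPC has finished sending, so the buffer must
// stay alive until then.
::grpc::ByteBuffer WrapWithoutCopy(void* payload, size_t len) {
  ::grpc::Slice slice(
      grpc_slice_new_with_user_data(
          payload, len, [](void* p) { std::free(p); }, payload),
      ::grpc::Slice::STEAL_REF);
  return ::grpc::ByteBuffer(&slice, 1);
}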
::grpc::ByteBuffer tmp(&slices[0], num_slices); - msg->Swap(&tmp); -} - -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - platform::RecordRPCEvent record_event("deserial"); - operators::distributed::GRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(msg), 0, - platform::errors::InvalidArgument("parse bytebuffer to tensor error!")); - *var = resp.GetVar(); - *trainer_id = resp.GetTrainerId(); -} - -void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - platform::RecordRPCEvent record_event("deserial"); - operators::distributed::GRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(msg), 0, - platform::errors::InvalidArgument("parse bytebuffer to tensor error!")); - *var = resp.GetRecvVar(); - *trainer_id = resp.GetTrainerId(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h deleted file mode 100644 index 932f3e2f069a2b..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/port.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -typedef void (*DestroyCallback)(void*); - -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, - const std::string& out_varname = std::string(), - const int trainer_id = 0, - const std::string& table_name = std::string()); - -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc deleted file mode 100644 index d407a72938a741..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT - -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - int tensor_numel = 564 * 128; - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - - // deserialize bytebuffer - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 1); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - const int64_t* rows_data = - reinterpret_cast(varmsg.rows().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 32.7); - } - for (int i = 0; i < 564; ++i) { - EXPECT_EQ(rows_data[i], i); - } - - // deserialize zero-copy - // framework::Variable var2; - // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2); - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::GRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); -} - -void RunTestLodTensor(platform::Place place, int from_type = 0) { - // serialize var to ByteBuffer - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 
3, 8})); - tensor->set_lod(lod); - int tensor_numel = 512 * 8 * 4 * 2; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg, - "outvar", 0, "table_name"); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 0); - EXPECT_EQ(varmsg.dims()[0], 512); - EXPECT_EQ(varmsg.dims()[1], 8); - EXPECT_EQ(varmsg.dims()[2], 4); - EXPECT_EQ(varmsg.dims()[3], 2); - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 31.9); - } - - // message binary - std::string str; - varmsg.SerializeToString(&str); - - // message bytebuffer - ::grpc::Slice slices_2[1]; - int num_slices = 1; - slices_2[0] = ::grpc::Slice(str.length()); - memcpy(const_cast(slices_2[0].begin()), str.c_str(), str.length()); - ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices); - - // deserialize zero-copy - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::GRPCVariableResponse resp(&scope, &ctx); - if (from_type == 0) { - EXPECT_EQ(resp.Parse(msg), 0); - } else { - EXPECT_EQ(resp.Parse(bytebuffer2), 0); - } - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); - RunTestLodTensor(place, 1); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); - RunTestLodTensor(gpu, 1); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc deleted file mode 100644 index 912520d782d756..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ /dev/null @@ -1,720 +0,0 @@ -/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include - -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" - -namespace grpc { -class ChannelArguments; -} // namespace grpc -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -namespace operators { -namespace distributed { -class GRPCVariableResponse; -} // namespace distributed -} // namespace operators -} // namespace paddle - -using ::grpc::ServerAsyncResponseWriter; - -DECLARE_bool(rpc_disable_reuse_port); -DECLARE_int32(rpc_retry_bind_port); - -namespace paddle { -namespace operators { -namespace distributed { - -enum CallStatus { PROCESS = 0, FINISH }; - -// reference: -// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server -class RequestBase { - public: - explicit RequestBase(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : service_(service), - cq_(cq), - status_(PROCESS), - request_handler_(request_handler), - req_id_(req_id) { - PADDLE_ENFORCE_NOT_NULL(cq_, platform::errors::InvalidArgument( - "ServerCompletionQueue cq are empty")); - } - virtual ~RequestBase() {} - virtual void Process() = 0; - - std::string Status2String(const std::string& method) { - std::string status = "Process"; - if (status_ == FINISH) { - status = "Finish"; - } - - std::ostringstream s; - s << method << " name:[" << GetReqName() << "]" - << ", ep:[" << ctx_.peer() << "]" - << " " << status << " using req_id:" << req_id_; - return s.str(); - } - - CallStatus Status() const { - std::lock_guard l(status_mu_); - return status_; - } - - template - void Finish(const T& reply, ServerAsyncResponseWriter* responder) { - std::lock_guard l(status_mu_); - status_ = FINISH; - responder->Finish(reply, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); - } - virtual std::string GetReqName() = 0; - - protected: - mutable std::mutex status_mu_; - ::grpc::ServerContext ctx_; - GrpcService::AsyncService* service_; - ::grpc::ServerCompletionQueue* cq_; - CallStatus status_; - RequestHandler* request_handler_; - int req_id_; -}; - -class RequestSend final : public RequestBase { - public: - explicit RequestSend(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(distributed::GrpcMethod::kSendVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestSend() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestSend var_name:" << varname << " trainer: 
" << trainer_id; - - framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestGet final : public RequestBase { - public: - explicit RequestGet(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = static_cast(distributed::GrpcMethod::kGetVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGet() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - std::string out_varname = request_.out_varname(); - std::string table_name = request_.table_name(); - int trainer_id = request_.trainer_id(); - - VLOG(4) << "RequestGet " << out_varname << " from " << varname; - - auto scope = request_handler_->scope(); - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - tmp_scope_ = std::move(scope->NewTmpScope()); - request_handler_->Handle(varname, tmp_scope_.get(), invar, &outvar, - trainer_id, out_varname, table_name); - - VLOG(1) << "before SerializeToByteBuffer"; - if (outvar) { - SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - VLOG(1) << "after SerializeToByteBuffer"; - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - std::unique_ptr tmp_scope_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestGetNoBarrier final : public RequestBase { - public: - explicit RequestGetNoBarrier(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetVariableNoBarrier); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetNoBarrier() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. 
- std::string varname = request_.varname(); - std::string out_varname = request_.out_varname(); - int trainer_id = request_.trainer_id(); - - VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname; - - auto scope = request_handler_->scope(); - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestGetMonomerVariable final : public RequestBase { - public: - explicit RequestGetMonomerVariable(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, - int req_id, RPCServer* rpc_server) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - rpc_server_(rpc_server) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetMonomerVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetMonomerVariable() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - - rpc_server_->WaitVarCond(varname); - MonomerHandle h = rpc_server_->GetMonomer(varname); - - auto scope = h.scope_; - auto invar = scope->FindVar(varname); - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, - request_.trainer_id()); - - if (outvar) { - SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - RPCServer* rpc_server_{nullptr}; -}; - -class RequestGetMonomerBarrier final : public RequestBase { - public: - explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id, - RPCServer* rpc_server) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - rpc_server_(rpc_server) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetMonomerBarrier); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetMonomerBarrier() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. 
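// Block on WaitVarCond until the monomer variable is ready, then pass the
// barrier event to the registered handler; the reply is an empty VoidMessage.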
- std::string varname = request_.varname(); - VLOG(4) << "RequestGetMonomerBarrier " << varname; - - rpc_server_->WaitVarCond(varname); - MonomerHandle h = rpc_server_->GetMonomer(varname); - - framework::Scope* scope = nullptr; - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, - request_.trainer_id()); - - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - sendrecv::VoidMessage reply_; - ServerAsyncResponseWriter responder_; - RPCServer* rpc_server_{nullptr}; -}; - -class RequestPrefetch final : public RequestBase { - public: - explicit RequestPrefetch(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - local_scope_(nullptr) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = - static_cast(distributed::GrpcMethod::kPrefetchVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestPrefetch() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - // prefetch process... - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - std::string table_name = request_->TableName(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name << " trainer: " << trainer_id; - - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - // out var must be created in local scope! 
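// The output variable is created in the request's own local scope
// (GetMutableLocalScope above) rather than looked up in the shared server scope.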
- framework::Variable* outvar = scope->Var(out_var_name); - - request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - framework::Scope* local_scope_; -}; - -class RequestCheckpointNotify final : public RequestBase { - public: - explicit RequestCheckpointNotify(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx())); - int method_id = - static_cast(distributed::GrpcMethod::kCheckpointNotify); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestCheckpointNotify() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - auto scope = request_->GetMutableLocalScope(); - - std::string checkpoint_notify = request_->Varname(); - std::string checkpoint_dir = request_->OutVarname(); - int trainer_id = request_->GetTrainerId(); - std::string table_name = request_->TableName(); - - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir; - - request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir, table_name); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - sendrecv::VoidMessage reply_; - ServerAsyncResponseWriter responder_; -}; - -class RequestNotify final : public RequestBase { - public: - explicit RequestNotify(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(distributed::GrpcMethod::kRequestNotify); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestNotify() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - VLOG(4) << "RequestNotify var_name:" << varname; - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - int trainer_id = request_->GetTrainerId(); - framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestSendAndRecv final : public RequestBase { - public: - explicit RequestSendAndRecv(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - - int method_id = - static_cast(distributed::GrpcMethod::kRequestSendAndRecv); - - 
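The RequestAsyncUnary call below passes the integer req_id through the completion queue as an opaque tag pointer, and HandleRequest later recovers it with the inverse cast. A minimal sketch of that round trip, with hypothetical helper names:

#include <cstdint>

// Encode a small request id as the opaque void* tag handed to gRPC.
inline void* EncodeReqIdTag(int req_id) {
  return reinterpret_cast<void*>(static_cast<std::intptr_t>(req_id));
}

// Decode the tag back into the request id once the completion queue
// returns it from Next().
inline int DecodeReqIdTag(void* tag) {
  return static_cast<int>(reinterpret_cast<std::intptr_t>(tag));
}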
service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestSendAndRecv() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - std::string table_name = request_->TableName(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestSendAndRecv, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name << " trainer: " << trainer_id; - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - framework::Variable* outvar = nullptr; - request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -void AsyncGRPCServer::WaitServerReady() { - VLOG(4) << "AsyncGRPCServer is waiting server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(4) << "AsyncGRPCServer WaitSeverReady"; -} - -// Define an option subclass in order to disable SO_REUSEPORT for the -// server socket. -// Come from: -// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc -class NoReusePortOption : public ::grpc::ServerBuilderOption { - public: - void UpdateArguments(::grpc::ChannelArguments* args) override { - args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); - } - - void UpdatePlugins(std::vector>* - plugins) override {} -}; - -void AsyncGRPCServer::StartServer() { - for (int i = 0; i < FLAGS_rpc_retry_bind_port; i++) { - ::grpc::ServerBuilder builder; - std::unique_ptr service( - new GrpcService::AsyncService()); - builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(), - &selected_port_); - - builder.SetMaxSendMessageSize(std::numeric_limits::max()); - builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); - if (FLAGS_rpc_disable_reuse_port) { - builder.SetOption( - std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption)); - LOG(INFO) << "set FLAGS_rpc_disable_reuse_port"; - } - builder.RegisterService(service.get()); - - for (auto t : rpc_call_map_) { - rpc_cq_[t.first].reset(builder.AddCompletionQueue().release()); - } - - server_ = builder.BuildAndStart(); - if (selected_port_ != 0) { - LOG(INFO) << "Server listening on " << bind_address_ - << " successful, selected port: " << selected_port_; - service_.reset(service.release()); - break; - } - - LOG(WARNING) << "Server listening on " << bind_address_ - << " failed, selected port: " << selected_port_ - << ", retry after 3 seconds!"; - - sleep(3); - } - - PADDLE_ENFORCE_NE( - selected_port_, 0, - platform::errors::Unavailable("can't bind to address:%s", bind_address_)); - - std::function f = - std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this, - std::placeholders::_1, std::placeholders::_2); - - for (auto& t : rpc_call_map_) { - auto& rpc_name = t.first; - auto& cq = rpc_cq_[rpc_name]; - auto threadnum = rpc_thread_num_[rpc_name]; - auto& reqs = rpc_reqs_[rpc_name]; - - reqs.reserve(kRequestBufSize); - - for (int i = 0; i < kRequestBufSize; i++) { - VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " 
I: " << i; - TryToRegisterNewOne(rpc_name, i); - } - - for (int i = 0; i < threadnum; i++) { - rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind( - &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); - VLOG(4) << t.first << " creates threads!"; - } - } - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - // wait server - server_->Wait(); - - for (auto& t : rpc_threads_) { - auto& threads = t.second; - for (size_t i = 0; i < threads.size(); ++i) { - threads[i]->join(); - VLOG(4) << t.first << " threads ends!"; - } - } -} - -void AsyncGRPCServer::ShutdownQueue() { - for (auto& t : rpc_cq_) { - t.second->Shutdown(); - VLOG(4) << t.first << " queue shutdown!"; - } -} - -void AsyncGRPCServer::ShutDownImpl() { - std::unique_lock lock(cq_mutex_); - is_shut_down_ = true; - ShutdownQueue(); - - VLOG(4) << "server_ shutdown!"; - server_->Shutdown(); -} - -void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, - int req_id) { - std::unique_lock lock(cq_mutex_); - if (is_shut_down_) { - VLOG(4) << "shutdown, do not TryToRegisterNewSendOne"; - return; - } - - VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name - << " REQ ID: " << req_id; - - auto& reqs = rpc_reqs_[rpc_name]; - auto& handler = rpc_call_map_[rpc_name]; - auto& cq = rpc_cq_[rpc_name]; - - RequestBase* b = nullptr; - if (rpc_name == kRequestSend) { - b = new RequestSend(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestGet) { - b = new RequestGet(service_.get(), cq.get(), handler, req_id); - - } else if (rpc_name == kRequestGetNoBarrier) { - b = new RequestGetNoBarrier(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestGetMonomerVariable) { - b = new RequestGetMonomerVariable(service_.get(), cq.get(), handler, req_id, - this); - } else if (rpc_name == kRequestGetMonomerBarrier) { - b = new RequestGetMonomerBarrier(service_.get(), cq.get(), handler, req_id, - this); - } else if (rpc_name == kRequestPrefetch) { - b = new RequestPrefetch(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestCheckpoint) { - b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestNotify) { - b = new RequestNotify(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestSendAndRecv) { - b = new RequestSendAndRecv(service_.get(), cq.get(), handler, req_id); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("not supported rpc: %s", rpc_name)); - } - - reqs[req_id] = b; - - VLOG(4) << "TryToRegisterNewOne status:" << b->Status(); -} - -void AsyncGRPCServer::HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne) { - void* tag = NULL; - bool ok = false; - - while (true) { - VLOG(4) << "HandleRequest " << rpc_name << " wait next"; - if (!cq->Next(&tag, &ok)) { - VLOG(4) << "CompletionQueue " << rpc_name << " shutdown!"; - break; - } - - int req_id = static_cast(reinterpret_cast(tag)); - VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id - << " get next"; - - auto& reqs = rpc_reqs_[rpc_name]; - RequestBase* base = nullptr; - { - PADDLE_ENFORCE_EQ( - (req_id >= 0 && req_id < kRequestBufSize), true, - platform::errors::OutOfRange("request id: %s out of bounds: [0, %s)", - req_id, kRequestBufSize)); - std::unique_lock lock(cq_mutex_); - base = reqs[req_id]; - } - - VLOG(3) << base->Status2String(rpc_name); - - // reference: - // 
https://github.com/tensorflow/tensorflow/issues/5596 - // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM - // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I - if (!ok) { - VLOG(4) << "completion queue:" << rpc_name << " recv no regular event" - << " context:" << base->Status2String(rpc_name); - TryToRegisterNewOne(rpc_name, req_id); - delete base; - continue; - } - - switch (base->Status()) { - case PROCESS: { - base->Process(); - break; - } - case FINISH: { - TryToRegisterNewOne(rpc_name, req_id); - delete base; - break; - } - default: { assert(false); } - } - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h deleted file mode 100644 index 3d68b7e8cebb40..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_service.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace grpc { -class ServerCompletionQueue; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestBase; - -class AsyncGRPCServer final : public RPCServer { - public: - explicit AsyncGRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncGRPCServer() {} - void WaitServerReady() override; - void StartServer() override; - - private: - // HandleRequest needs to be thread-safe. 
- void HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne); - - void TryToRegisterNewOne(const std::string& rpc_name, int req_id); - void ShutdownQueue(); - void ShutDownImpl() override; - - private: - static const int kRequestBufSize = 100; - - std::mutex cq_mutex_; - volatile bool is_shut_down_ = false; - - std::unique_ptr service_; - std::unique_ptr<::grpc::Server> server_; - - // condition of the sub program - std::condition_variable barrier_condition_; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - - int ready_; - - std::map> rpc_cq_; - std::map>> rpc_threads_; - std::map> rpc_reqs_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h deleted file mode 100644 index 10037c90853deb..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_service.h +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/platform/profiler.h" - -// NOTE: This method was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// method and did some modifications so that we can parse gRPC -// requests without too much copying of the tensor data. - -namespace grpc { -class CompletionQueue; -class Channel; -class RpcService; -class ServerCompletionQueue; -class ServerContext; - -// Support parsing/unparsing of tensorflow::VariableResponse. -// Wire-format is identical to RecvVariableResponse. 
-template <> -class SerializationTraits< - paddle::operators::distributed::GRPCVariableResponse> { - public: - static Status Serialize( - const paddle::operators::distributed::GRPCVariableResponse& msg, - grpc_byte_buffer** bp, bool* own_buffer) { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "SerializationTraits::Serialize not implemented!")); - return Status(); - } - static Status Deserialize( - grpc_byte_buffer* buffer, - paddle::operators::distributed::GRPCVariableResponse* msg, - int max_message_size = INT_MAX) { - if (buffer == nullptr) { - return Status(StatusCode::INTERNAL, "No payload"); - } - - Status result = g_core_codegen_interface->ok(); - if (result.ok()) { - paddle::operators::distributed::GrpcByteSource source(buffer); - int ret = msg->Parse(&source); - if (ret != 0) { - result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); - } - } - g_core_codegen_interface->grpc_byte_buffer_destroy(buffer); - return result; - } -}; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -enum class GrpcMethod { - kSendVariable, - kGetVariable, - kPrefetchVariable, - kCheckpointNotify, - kGetVariableNoBarrier, - kGetMonomerVariable, - kGetMonomerBarrier, - kRequestNotify, - kRequestSendAndRecv, - // when you add new handler, change kGrpcNumMethods at the same time! -}; - -static const int kGrpcNumMethods = - static_cast(GrpcMethod::kRequestSendAndRecv) + 1; - -inline const char* GrpcMethodName(GrpcMethod id) { - switch (id) { - case GrpcMethod::kSendVariable: - return "/sendrecv.SendRecvService/SendVariable"; - case GrpcMethod::kGetVariable: - return "/sendrecv.SendRecvService/GetVariable"; - case GrpcMethod::kGetVariableNoBarrier: - return "/sendrecv.SendRecvService/GetVariableNoBarrier"; - case GrpcMethod::kGetMonomerVariable: - return "/sendrecv.SendRecvService/GetMonomerVariable"; - case GrpcMethod::kGetMonomerBarrier: - return "/sendrecv.SendRecvService/GetMonomerBarrier"; - case GrpcMethod::kPrefetchVariable: - return "/sendrecv.SendRecvService/PrefetchVariable"; - case GrpcMethod::kCheckpointNotify: - return "/sendrecv.SendRecvService/CheckpointNotify"; - case GrpcMethod::kRequestNotify: - return "/sendrecv.SendRecvService/DistributeNotify"; - case GrpcMethod::kRequestSendAndRecv: - return "/sendrecv.SendRecvService/SendAndRecvVariable"; - } - - // Shouldn't be reached. - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid id: not found valid method name")); - return nullptr; -} - -class GrpcService final { - public: - class AsyncService : public ::grpc::Service { - public: - AsyncService() { - for (int i = 0; i < kGrpcNumMethods; ++i) { - AddMethod(new ::grpc::internal::RpcServiceMethod( - GrpcMethodName(static_cast(i)), - ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); - ::grpc::Service::MarkMethodAsync(i); - } - } - virtual ~AsyncService() {} - - // Make RequestAsyncUnary public for grpc_call.h - using ::grpc::Service::RequestAsyncUnary; - }; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc deleted file mode 100644 index f7679e9fc924df..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "google/protobuf/io/coded_stream.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" - -namespace google { -namespace protobuf { -namespace io { -class ZeroCopyInputStream; -} // namespace io -} // namespace protobuf -} // namespace google -namespace grpc { -class ByteBuffer; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -enum WireType { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, -}; - -inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } - -inline WireType GetTagWireType(uint32_t tag) { - return static_cast(tag & 0x7); -} - -bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, - int* result) { - uint64_t v; - if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { - *result = static_cast(v); - return true; - } else { - return false; - } -} - -int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { - GrpcByteBufferSource source; - source.Init(byte_buffer); - GrpcByteBufferSourceWrapper r(&source); - - return Parse(&r); -} - -bool ParseLodData(::google::protobuf::io::CodedInputStream* input, - std::vector* lod) { - while (true) { - auto p = input->ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - - if (!p.second) { - return (tag == 0); - } - - switch (tag) { - case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { - uint64_t v; - if (wt == WIRETYPE_VARINT) { - if (!input->ReadVarint64(&v)) { - return false; - } - lod->push_back(v); - break; - } - - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input->ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input->CurrentPosition(); - while (input->CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input->ReadVarint64(&v)) { - return tag; - } - lod->push_back(v); - } - break; - } - - return false; - } - default: { return false; } - } - } - - return true; -} - -int GRPCVariableResponse::Parse(Source* source) { - ::google::protobuf::io::ZeroCopyInputStream* input_stream = - source->contents(); - ::google::protobuf::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (true) { - auto p = input.ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - if (!p.second) { - if (tag != 0) { - return -1; - } - return 0; - } - - switch (tag) { - case sendrecv::VariableMessage::kVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - 
return tag; - } - - meta_.set_varname(temp); - break; - } - case sendrecv::VariableMessage::kTypeFieldNumber: { - uint32_t v; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_type(static_cast<::sendrecv::VarType>(v)); - break; - } - case sendrecv::VariableMessage::kDataTypeFieldNumber: { - uint32_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); - break; - } - case sendrecv::VariableMessage::kDimsFieldNumber: { - // not packed - if (wt == WIRETYPE_VARINT) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - break; - } - - // packed - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input.ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input.CurrentPosition(); - while (input.CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - } - break; - } - return tag; - } - case sendrecv::VariableMessage::kLodLevelFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_lod_level(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kLodFieldNumber: { - int length = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &length)) { - return tag; - } - - std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = - input.IncrementRecursionDepthAndPushLimit(length); - - std::vector lod_data; - if (p.second < 0 || !ParseLodData(&input, &lod_data)) { - return tag; - } - - if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { - return tag; - } - - if (lod_data.size() == 0) { - break; - } - - auto lod = meta_.add_lod(); - for (uint32_t i = 0; i < lod_data.size(); i++) { - lod->add_lod_data(lod_data[i]); - } - break; - } - case sendrecv::VariableMessage::kSlrHeightFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_slr_height(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kSerializedFieldNumber: { - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!ProcSerializedField(tag, &input, num_bytes)) { - return tag; - } - - break; - } - case sendrecv::VariableMessage::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - platform::errors::PreconditionNotMet( - "meta info should be got first!")); - - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return tag; - } - break; - } - case sendrecv::VariableMessage::kOutVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_out_varname(temp); - break; - } - case sendrecv::VariableMessage::kProfileFieldNumber: { - uint64_t profiling = 0; - if (!input.ReadVarint64(&profiling)) { - return tag; - } - meta_.set_profile(profiling); - int64_t listener_id = platform::ListenerId(); - if (listener_id <= 0) { - break; - } - if (profiling == platform::kEnableProfiler && - !platform::IsProfileEnabled()) { 
- platform::EnableProfiler(platform::ProfilerState::kCPU); - } else if (profiling == platform::kDisableProfiler && - platform::IsProfileEnabled()) { - platform::DisableProfiler( - platform::EventSortingKey::kDefault, - string::Sprintf("%s_%lld", FLAGS_rpc_server_profile_path, - listener_id)); - } - break; - } - case sendrecv::VariableMessage::kTrainerIdFieldNumber: { - uint64_t trainer_id = 0; - if (!input.ReadVarint64(&trainer_id)) { - return tag; - } - meta_.set_trainer_id(trainer_id); - break; - } - case sendrecv::VariableMessage::kTableNameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_table_name(temp); - break; - } - default: { - // Unknown tag, return unknown error. - return -1; - } - } - } - - return 0; -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h deleted file mode 100644 index 4d12b4a4bacd7f..00000000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class GRPCVariableResponse : public VariableResponse { - public: - GRPCVariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : VariableResponse(scope, dev_ctx, create_scope) {} - - virtual ~GRPCVariableResponse() {} - - int Parse(Source* source) override; - - // return: - // 0:ok. - // -1: unkown error. - // other: number of error field. 
- int Parse(const ::grpc::ByteBuffer& byte_buffer); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.cc b/paddle/fluid/operators/distributed/heart_beat_monitor.cc deleted file mode 100644 index 9f537f53348986..00000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" - -#include - -namespace paddle { -namespace operators { -namespace distributed { - -DEFINE_int32(worker_update_interval_secs, 900, - " the longest time interval between the worker update variables"); - -inline int GetCurrentUS() { - // current date/time based on current system - time_t t = std::time(0); - int now = static_cast(t); - return now; -} - -void HeartBeatMonitor::Update(const int worker_id, std::string be_monitored_var, - WorkerStatus status) { - if (status == UNINITED) { - LOG(WARNING) << "HeartBeatMonitor receive UNINITED status can not be used " - "in Update, something error"; - } - - if (!is_chief_) { - return; - } - - if ((be_monitored_var == be_monitored_var_ && status == RUNNING) || - status == COMPLETED) { - auto timestamp = GetCurrentUS(); - UnderMonitoredWorker& worker = worker_status_map_.at(worker_id); - - if (worker.status != COMPLETED) { - worker.status = status; - } - worker.timestamp = timestamp; - return; - } -} - -void HeartBeatMonitor::LostWorkerMonitor() { - VLOG(1) << "worker heartbeat monitor start at No.0 parameter server"; - while (running_) { - for (int id = 0; id < workers_; ++id) { - auto& worker = worker_status_map_.at(id); - - if (worker.status == UNINITED) { - VLOG(4) << "worker " << worker.id << " is under UNINITED"; - continue; - } - if (worker.status == COMPLETED) { - VLOG(4) << "worker " << worker.id << " is under COMPLETED"; - continue; - } - - auto timestamp = GetCurrentUS(); - - VLOG(4) << "worker " << worker.id << " status is " << worker.status - << " timestamp is " << worker.timestamp << " the interval is " - << timestamp - worker.timestamp; - - if (timestamp - worker.timestamp >= FLAGS_worker_update_interval_secs) { - PADDLE_THROW(platform::errors::ExecutionTimeout( - "the latest update of worker %d is %d secs ago, we doubt the " - "the worker is not alive and this may have a bad effect on the " - "fitting result, please check", - worker.id, FLAGS_worker_update_interval_secs)); - } - } - - std::this_thread::sleep_for(std::chrono::milliseconds(10 * 1000)); - } - VLOG(1) << "worker heartbeat monitor stopped, thread exit"; -} - -std::once_flag HeartBeatMonitor::init_flag_; -std::unique_ptr HeartBeatMonitor::monitor_(nullptr); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.h b/paddle/fluid/operators/distributed/heart_beat_monitor.h deleted 
file mode 100644 index d96433c318b357..00000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -enum WorkerStatus { UNINITED = 0, RUNNING, COMPLETED }; - -struct UnderMonitoredWorker { - int id; - WorkerStatus status; - int timestamp; - - UnderMonitoredWorker() {} - - explicit UnderMonitoredWorker(int worker_id) { - this->id = worker_id; - this->status = UNINITED; - this->timestamp = 0; - } -}; - -class HeartBeatMonitor { - public: - explicit HeartBeatMonitor(int workers, bool is_chief, - std::string be_monitored_var) - : workers_(workers), - is_chief_(is_chief), - be_monitored_var_(be_monitored_var), - running_(true) { - PADDLE_ENFORCE_GT(workers, 0, platform::errors::InvalidArgument( - "workers must greater than 0.")); - - for (auto worker_id = 0; worker_id < workers; worker_id++) { - UnderMonitoredWorker worker(worker_id); - worker_status_map_[worker_id] = std::move(worker); - } - - // we define the No.0 pserver is the first parameter server - // only No.0 will check the heartbeat of all trainers - if (is_chief) { - monitor_thread_.reset(new std::thread( - std::bind(&HeartBeatMonitor::LostWorkerMonitor, this))); - } - } - - ~HeartBeatMonitor() { - running_ = false; - if (monitor_thread_) monitor_thread_->join(); - } - - static void Init(int workers, bool is_chief, std::string be_monitored_var) { - std::call_once(init_flag_, &HeartBeatMonitor::InitImpl, workers, is_chief, - be_monitored_var); - } - - static HeartBeatMonitor* GetInstance() { return monitor_.get(); } - - void Stop() { - running_ = false; - if (!monitor_) { - VLOG(0) << "HeartBeatMonitor is not inited, do nothing"; - } else { - if (monitor_thread_) { - monitor_thread_->join(); - monitor_thread_.reset(nullptr); - } - } - } - - void Update(const int worker_id, std::string be_monitored_var, - WorkerStatus status); - - void LostWorkerMonitor(); - - private: - // Init is called by GetInstance. 
- static void InitImpl(int workers, bool is_chief, - std::string be_monitored_var) { - if (monitor_ == nullptr) { - monitor_.reset(new HeartBeatMonitor(workers, is_chief, be_monitored_var)); - } - } - - static std::once_flag init_flag_; - static std::unique_ptr<HeartBeatMonitor> monitor_; - - int workers_; - bool is_chief_; - std::string be_monitored_var_; - std::unordered_map<int, UnderMonitoredWorker> worker_status_map_; - std::unique_ptr<std::thread> monitor_thread_{nullptr}; - std::mutex mutex_; - bool running_ = false; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc b/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc deleted file mode 100644 index 8505023f63a95d..00000000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" - -#include "gtest/gtest.h" - -namespace paddle { -namespace operators { -namespace distributed { - -void run(HeartBeatMonitor* monitor) { monitor->LostWorkerMonitor(); } - -TEST(HeartBeatMonitor, All) { - int trainers = 10; - int pserver_id = 0; - std::string var = "nce_w@GRAD.block0"; - std::string var2 = "nce_w@GRAD.block2"; - - HeartBeatMonitor::Init(trainers, pserver_id == 0, var); - - auto* monitor = HeartBeatMonitor::GetInstance(); - - std::vector<int> ids{1, 3, 5, 7}; - - for (auto& id : ids) { - monitor->Update(id, var, RUNNING); - } - - monitor->Update(9, var2, RUNNING); - monitor->Update(2, var, COMPLETED); - - std::thread t(run, monitor); - t.detach(); - - std::this_thread::sleep_for(std::chrono::milliseconds(15 * 1000)); - - monitor->Stop(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h deleted file mode 100644 index da2281231fc8a3..00000000000000 --- a/paddle/fluid/operators/distributed/large_scale_kv.h +++ /dev/null @@ -1,848 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
- -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/rw_lock.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/string_helper.h" - -namespace paddle { -namespace operators { -namespace distributed { - -enum Mode { training, infer }; -enum InitType { uniform_random, fill_constant, gaussian_random }; - -inline std::vector bucket(const int v_size, const int b_size) { - int remainder = v_size % b_size; - int bucket = v_size / b_size; - std::vector ret_vec(b_size, bucket); - for (int i = 0; i < remainder; ++i) { - ret_vec[i] = ret_vec[i] + 1; - } - int cur_bucket = 0; - for (int &j : ret_vec) { - int tmp = j; - j = cur_bucket; - cur_bucket += tmp; - } - ret_vec.push_back(cur_bucket); - return ret_vec; -} - -class Initializer { - public: - Initializer() {} - - explicit Initializer(const std::vector &attrs) {} - - virtual float GetValue() = 0; - - virtual ~Initializer() {} - - protected: - std::string name_; - unsigned int seed_; -}; - -class UniformInitializer : public Initializer { - public: - explicit UniformInitializer(const std::vector &attrs) { - name_ = attrs[0]; - seed_ = static_cast(std::stoi(attrs[1])); - min_ = std::stof(attrs[2]); - max_ = std::stof(attrs[3]); - - dist_ = std::uniform_real_distribution(min_, max_); - random_engine_ = framework::GetCPURandomEngine(seed_); - } - - float GetValue() override { return dist_(*random_engine_); } - - private: - float min_; - float max_; - - std::shared_ptr random_engine_; - std::uniform_real_distribution dist_; -}; - -template -inline bool entry(const int count, const T threshold); - -template <> -inline bool entry(const int count, const std::string threshold) { - return true; -} - -template <> -inline bool entry(const int count, const int threshold) { - return count >= threshold; -} - -template <> -inline bool entry(const int count, const float threshold) { - UniformInitializer uniform = UniformInitializer({"0", "0", "1"}); - return uniform.GetValue() >= threshold; -} - -class GaussianInitializer : public Initializer { - public: - explicit GaussianInitializer(const std::vector &attrs) { - name_ = attrs[0]; - seed_ = static_cast(std::stoi(attrs[1])); - mean_ = std::stof(attrs[2]); - std_ = std::stof(attrs[3]); - - random_engine_ = framework::GetCPURandomEngine(seed_); - - dist_ = std::normal_distribution(mean_, std_); - } - - float GetValue() override { return dist_(*random_engine_); } - - private: - float std_; - float mean_; - - std::shared_ptr random_engine_; - std::normal_distribution dist_; -}; - -class FillConstantInitializer : public Initializer { - public: - explicit FillConstantInitializer(const std::vector &attrs) { - name_ = attrs[0]; - value_ = std::stof(attrs[1]); - } - - float GetValue() override { return value_; } - - private: - float value_; -}; - -struct SparseMeta { - std::string name; - std::string grad_name; - std::vector value_names; - std::vector value_dims; - std::vector cached_varnames; - 
std::vector initializer_attrs; - std::string entry; - Mode mode; - - std::string ToString() { - std::stringstream ss; - ss << "name: " << name << " "; - ss << "mode: " << mode << " "; - - for (int i = 0; i < static_cast(value_names.size()); i++) { - ss << "value_name: " << value_names[i] << " dim: " << value_dims[i] - << " "; - } - - ss << " grad var: " << grad_name; - - ss << " cached varnames: "; - for (int i = 0; i < static_cast(cached_varnames.size()); i++) { - ss << cached_varnames[i] << " "; - } - - ss << " initializer attrs: "; - for (int i = 0; i < static_cast(initializer_attrs.size()); i++) { - ss << initializer_attrs[i] << " "; - } - - ss << " entry attrs: " << entry; - - return ss.str(); - } -}; - -struct VALUE { - explicit VALUE(const std::vector &names) - : names_(names), count_(0), unseen_days_(0) { - values_.resize(names.size()); - for (int i = 0; i < static_cast(names.size()); i++) { - places[names[i]] = i; - } - } - - void set(std::vector> *values) { - values_ = std::move(*values); - } - - void set(const std::vector &names, - const std::vector> &values) { - for (int i = 0; i < static_cast(names.size()); i++) { - auto idx = places[names[i]]; - auto value = values[i]; - values_[idx].assign(value.begin(), value.end()); - } - } - - std::vector *> get() { - auto pts = std::vector *>(); - pts.reserve(values_.size()); - - for (auto &value : values_) { - pts.push_back(&value); - } - return pts; - } - - int fetch_count() { return ++count_; } - void reset_unseen_days() { unseen_days_ = 0; } - - void set_entry(bool is_entry) { is_entry_ = is_entry; } - - bool get_entry() { return is_entry_; } - - std::vector *> get(const std::vector names) { - auto pts = std::vector *>(); - pts.reserve(values_.size()); - - for (int i = 0; i < static_cast(names.size()); i++) { - pts.push_back(&(values_[places[names[i]]])); - } - return pts; - } - - std::vector names_; - int count_; - bool seen_after_last_save_; - int unseen_days_; - bool is_entry_; - std::vector> values_; - std::unordered_map places; -}; - -class ValueBlock { - public: - explicit ValueBlock(const std::vector value_names, - const std::vector value_dims, const Mode &mode, - const std::vector &init_attrs, - const std::string &entry_attr) - : value_names_(value_names), value_dims_(value_dims), mode_(mode) { - // for Initializer - for (size_t i = 0; i < value_names.size(); i++) { - auto name = value_names[i]; - auto slices = string::split_string(init_attrs[i], "&"); - - if (slices[0] == "gaussian_random") { - initializers_[name] = new GaussianInitializer(slices); - } else if (slices[0] == "fill_constant") { - initializers_[name] = new FillConstantInitializer(slices); - } else if (slices[0] == "uniform_random") { - initializers_[name] = new UniformInitializer(slices); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("%s can not be supported", name)); - } - } - - // for Entry - { - if (entry_attr == "none") { - entry_func_ = - std::bind(entry, std::placeholders::_1, "none"); - } else { - auto slices = string::split_string(entry_attr, "&"); - if (slices[0] == "count_filter") { - int threshold = std::stoi(slices[1]); - entry_func_ = std::bind(entry, std::placeholders::_1, threshold); - } else if (slices[0] == "probability") { - float threshold = std::stof(slices[1]); - entry_func_ = - std::bind(entry, std::placeholders::_1, threshold); - } - } - } - - rwlock_.reset(new framework::RWLock); - } - - ~ValueBlock() { - // for (auto init : initializers_) { - // delete init.second; - // initializers_.erase(init.first); - // } - // - // 
for (auto value : values_) { - // delete value.second; - // values_.erase(value.first); - // } - } - - void Init(const int64_t &id, std::vector> *values, - int count) { - if (Has(id)) { - PADDLE_THROW(platform::errors::AlreadyExists("id already exist, error")); - } - - if (values->size() != value_names_.size()) { - PADDLE_THROW( - platform::errors::AlreadyExists("values can not match, error")); - } - - auto value = new VALUE(value_names_); - value->set(values); - value->seen_after_last_save_ = true; - value->count_ = count; - values_[id] = value; - } - - std::vector *> Get( - const int64_t &id, const std::vector &value_names) { - rwlock_->RDLock(); - auto ret_values = values_.at(id)->get(value_names); - rwlock_->UNLock(); - return ret_values; - } - - void InitFromInitializer(const int64_t &id, - const std::vector &value_names) { - rwlock_->WRLock(); - - if (Has(id)) { - Update(id); - rwlock_->UNLock(); - return; - } - - auto rets = std::vector>(); - rets.resize(value_names_.size()); - - for (int i = 0; i < static_cast(value_names_.size()); i++) { - auto name = value_names_[i]; - auto *init = initializers_.at(name); - - auto dim = value_dims_[i]; - rets[i].resize(dim); - - for (int j = 0; j < static_cast(dim); j++) { - rets[i][j] = init->GetValue(); - } - } - - Init(id, &rets, 0); - Update(id); - rwlock_->UNLock(); - } - - bool GetEntry(const int64_t &id) { - rwlock_->RDLock(); - auto value = values_.at(id); - auto entry = value->get_entry(); - rwlock_->UNLock(); - return entry; - } - - void Set(const int64_t &id, const std::vector &value_names, - const std::vector> &values) { - rwlock_->WRLock(); - auto value = values_.at(id); - value->set(value_names, values); - rwlock_->UNLock(); - } - - void Update(const int64_t id) { - auto *value = values_.at(id); - value->reset_unseen_days(); - auto count = value->fetch_count(); - - if (!value->get_entry()) { - value->set_entry(entry_func_(count)); - } - } - - private: - bool Has(const int64_t id) { - auto got = values_.find(id); - if (got == values_.end()) { - return false; - } else { - return true; - } - } - - public: - std::unordered_map values_; - - private: - std::vector value_names_; - std::vector value_dims_; - Mode mode_; - std::function entry_func_; - std::unordered_map initializers_; - std::unique_ptr rwlock_{nullptr}; -}; - -class SparseVariable { - public: - explicit SparseVariable(const SparseMeta &meta) { - meta_.name = meta.name; - meta_.mode = meta.mode; - meta_.value_names = meta.value_names; - meta_.value_dims = meta.value_dims; - meta_.grad_name = meta.grad_name; - meta_.cached_varnames = meta.cached_varnames; - meta_.initializer_attrs = meta.initializer_attrs; - meta_.entry = meta.entry; - - for (int i = 0; i < static_cast(meta_.value_names.size()); i++) { - values_dims_[meta_.value_names[i]] = meta_.value_dims[i]; - } - - for (size_t i = 0; i < shard_num_; i++) { - auto block = std::make_shared( - meta.value_names, meta.value_dims, meta.mode, meta.initializer_attrs, - meta.entry); - shard_blocks_.emplace_back(block); - } - - rwlock_.reset(new framework::RWLock); - } - - void Init(const std::vector &ids) { - rwlock_->RDLock(); - for (auto &id : ids) { - auto *block = GetShard(id); - block->InitFromInitializer(id, meta_.value_names); - } - rwlock_->UNLock(); - } - - void Get(const std::vector &ids, - const std::vector &value_names, - std::vector *>> *values) { - values->resize(ids.size()); - - auto buckets = bucket(ids.size(), 8); - std::vector> fs; - - for (int j = 0; j < 8; ++j) { - auto begin = buckets[j]; - auto end = buckets[j 
+ 1]; - - fs.push_back( - framework::Async([begin, end, &values, &ids, &value_names, this]() { - for (int x = begin; x < end; x++) { - auto id = ids[x]; - auto *block = GetShard(id); - auto id_values = block->Get(id, value_names); - (*values)[x] = id_values; - } - })); - } - - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } - - void GetEntry(const std::vector &ids, std::vector *values) { - auto buckets = bucket(ids.size(), 8); - std::vector> fs; - - for (int j = 0; j < 8; ++j) { - auto begin = buckets[j]; - auto end = buckets[j + 1]; - - fs.push_back(framework::Async([begin, end, &values, &ids, this]() { - for (int x = begin; x < end; x++) { - auto id = ids[x]; - auto *block = GetShard(id); - auto is_entry = block->GetEntry(id); - - if (!is_entry) { - values->push_back(id); - } - } - })); - } - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } - - void Set(const std::vector &ids, - const std::vector &value_names, - const std::vector>> &values) { - for (int i = 0; i < static_cast(ids.size()); i++) { - GetShard(ids[i])->Set(ids[i], value_names, values[i]); - } - } - - void Dims(std::vector value_names, std::vector *dims) { - for (auto &name : value_names) { - dims->push_back(values_dims_.at(name)); - } - } - - std::vector CachedVarnames() const { - return meta_.cached_varnames; - } - - void Load(const std::string &dirname) { - rwlock_->WRLock(); - VLOG(1) << "load " << meta_.name << " from dir: " << dirname << " begin"; - - std::vector filenames; - for (auto &value_name : meta_.value_names) { - auto filename = string::Sprintf("%s/%s", dirname, value_name); - filenames.push_back(filename); - } - - LoadFromSelectedRows(filenames, meta_.value_names); - VLOG(1) << "load " << meta_.name << " in dir: " << dirname << " done"; - rwlock_->UNLock(); - } - - void LoadFromSelectedRows(const std::vector &filenames, - const std::vector &valuenames) { - std::vector> variables; - auto place = platform::CPUPlace(); - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto var = std::make_shared(); - variables.push_back(var); - auto &filename = filenames[i]; - std::ifstream fin(filename, std::ios::binary); - auto *selectedRows = var->GetMutable(); - - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - framework::DeserializeFromStream(fin, selectedRows, dev_ctx); - selectedRows->SyncIndex(); - } - - std::vector tensors; - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto &slr = variables[i]->Get(); - auto src_t = slr.value(); - const auto *value = src_t.data(); - tensors.push_back(value); - } - - for (int i = 1; i < static_cast(filenames.size()); i++) { - auto rows_0 = variables[0]->Get().rows(); - auto rows_i = variables[i]->Get().rows(); - - bool is_equal = std::equal(rows_0.begin(), rows_0.end(), rows_i.begin()); - - if (!is_equal) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s and %s are not equal, can not be load rightly", filenames[0], - filenames[i])); - } - } - - auto rows = variables[0]->Get().rows(); - - for (auto i = 0; i < static_cast(rows.size()); i++) { - auto id = rows[i]; - std::vector> values; - values.resize(filenames.size()); - - for (int j = 0; j < static_cast(filenames.size()); ++j) { - values[j].resize(meta_.value_dims[j]); - std::memcpy(values[j].data(), tensors[j] + i * meta_.value_dims[j], - sizeof(float) * meta_.value_dims[j]); - } - - auto *block = GetShard(id); - block->Init(id, &values, 0); - block->Update(id); - } - } - - void Save(const std::string &dirname, 
const int mode = 0) { - rwlock_->WRLock(); - VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " begin"; - - MkDirRecursively(dirname.c_str()); - - std::vector filenames; - for (auto &value_name : meta_.value_names) { - auto filename = string::Sprintf("%s/%s", dirname, value_name); - filenames.push_back(filename); - } - - SaveToSelectedRows(filenames, meta_.value_names, mode); - VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " done"; - rwlock_->UNLock(); - } - - void SaveToSelectedRows(const std::vector &filenames, - const std::vector &valuenames, - const int mode) { - for (auto &value_name : valuenames) { - auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), - value_name); - if (it == meta_.value_names.end()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "[%s] is invalid param for [%s]", value_name, meta_.name)); - } - } - - auto place = platform::CPUPlace(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - std::vector ids; - - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - if (mode == 0) { - ids.push_back(value.first); - } else { - bool id_need_save = false; - // save all params - if (mode == 1) { - id_need_save = true; - } else { - id_need_save = value.second->seen_after_last_save_; - } - - if (id_need_save) { - ids.push_back(value.first); - } - value.second->seen_after_last_save_ = false; - } - } - } - - VLOG(3) << "save " << ids.size() << " feasigns for " << meta_.name - << " with mode: " << mode; - - std::vector> variables; - std::vector tensors; - std::vector dims; - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto dim = values_dims_.at(valuenames[i]); - auto var = std::make_shared(); - auto *slr = var->GetMutable(); - auto *src_t = slr->mutable_value(); - - src_t->Resize({static_cast(ids.size()), dim}); - auto *value = src_t->mutable_data(place); - - dims.push_back(dim); - variables.push_back(var); - tensors.push_back(value); - } - - std::vector *>> values; - Get(ids, valuenames, &values); - - int64_t offset = 0; - for (auto &vss : values) { - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::memcpy(tensors[i] + offset * dims[i], vs->data(), - sizeof(float) * dims[i]); - } - offset += 1; - } - - for (auto &var : variables) { - auto *slr = var->GetMutable(); - slr->set_rows(ids); - slr->set_height(ids.size()); - } - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto &filename = filenames[i]; - auto &selectedRows = variables[i]->Get(); - - std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fout), true, - platform::errors::Unavailable( - "Cannot open %s to save variables.", filename)); - - framework::SerializeToStream(fout, selectedRows, dev_ctx); - fout.close(); - } - } - - void SaveToText(const std::vector &filenames, - const std::vector &valuenames) { - for (auto &value_name : valuenames) { - auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), - value_name); - if (it == meta_.value_names.end()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "[%s] is invalid param for [%s]", value_name, meta_.name)); - } - } - - std::vector> fouts; - - for (auto filename : filenames) { - std::unique_ptr fout(new std::ofstream(filename)); - fouts.push_back(std::move(fout)); - } - - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - std::vector *> vss = value.second->get(valuenames); - - auto id = 
value.first; - - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::stringstream ss; - ss << id << "\t"; - ss << vs->size() << "\t"; - for (auto v : (*vs)) { - ss << v << " "; - } - ss << "\n"; - - fouts[i]->write(ss.str().c_str(), sizeof(char) * ss.str().size()); - } - } - } - - for (int i = 0; i < static_cast(fouts.size()); i++) { - fouts[i]->close(); - } - } - - int64_t Size() { - int64_t cnt = 0; - - for (auto &block : shard_blocks_) { - cnt += block->values_.size(); - } - return cnt; - } - - ValueBlock *GetShard(const int64_t id) { - return shard_blocks_[id & shard_mask_].get(); - } - - SparseMeta *GetMeta() { return &meta_; } - - private: - std::unique_ptr rwlock_{nullptr}; - - SparseMeta meta_; - std::unordered_map values_dims_; - const size_t shard_mask_ = 127; - const size_t shard_num_ = 128; - std::vector> shard_blocks_; -}; - -class LargeScaleKV { - public: - LargeScaleKV() {} - - explicit LargeScaleKV(const std::vector &table_metas) { - for (auto &sparse_meta : table_metas) { - auto table_name = sparse_meta.name; - auto meta = std::shared_ptr( - new SparseVariable(std::move(sparse_meta))); - sparse_variables[table_name] = meta; - grad_to_variables[sparse_meta.grad_name] = table_name; - grad_names_.push_back(sparse_meta.grad_name); - } - } - - ~LargeScaleKV() {} - - static std::shared_ptr GetInstantcePtr() { return scale_kv_; } - - static LargeScaleKV *GetInstance() { return scale_kv_.get(); } - - static LargeScaleKV *InitInstance( - const std::vector &table_metas) { - std::call_once(init_flag_, &LargeScaleKV::Init, table_metas); - return scale_kv_.get(); - } - - static void Init(const std::vector &table_metas) { - if (scale_kv_.get() == nullptr) { - scale_kv_.reset(new LargeScaleKV(table_metas)); - } - } - - SparseVariable *Get(const std::string &name) { - auto variable = sparse_variables.at(name); - return variable.get(); - } - - bool ParamInLargeScale(const std::string &name) { - auto got = sparse_variables.find(name); - - if (got == sparse_variables.end()) { - return false; - } - - return true; - } - - bool GradInLargeScale(const std::string &name) { - auto got = grad_to_variables.find(name); - - if (got == grad_to_variables.end()) { - return false; - } - - return true; - } - - SparseVariable *GetByGrad(const std::string &name) { - return Get(grad_to_variables[name]); - } - - const std::vector &GetAllGrads() { return grad_names_; } - - private: - std::unordered_map> - sparse_variables; - std::unordered_map grad_to_variables; - std::vector grad_names_; - static std::shared_ptr scale_kv_; - static std::once_flag init_flag_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc deleted file mode 100644 index 558d70e5c3353f..00000000000000 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ /dev/null @@ -1,311 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/parameter_prefetch.h" -#include -#include -#include -#include -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/distributed.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -static void SplitIdsIntoMultipleVarsBySection( - const std::vector &in_ids, - const std::vector &in_varnames, const int tables, - const int pservers, const bool is_distibuted, framework::Scope *scope, - std::vector> *splited_ids, - std::vector> *origin_ids) { - PADDLE_ENFORCE_EQ( - in_varnames.size(), tables, - platform::errors::OutOfRange( - "send varnames size: %d not equal table number: %d, internal error", - in_varnames.size(), tables)); - - PADDLE_ENFORCE_LE( - tables, pservers, - platform::errors::OutOfRange("table number %d not equal or less than " - "pserver number: %d, internal error", - tables, pservers)); - - auto place = platform::CPUPlace(); - - std::set st(in_ids.begin(), in_ids.end()); - std::vector all_ids; - all_ids.assign(st.begin(), st.end()); - - splited_ids->resize(tables); - origin_ids->resize(tables); - - if (is_distibuted) { - for (auto &id : all_ids) { - auto pserver_id = id % pservers; - (*splited_ids)[pserver_id].push_back(id); - (*origin_ids)[pserver_id].push_back(id); - } - } else { - for (auto &id : all_ids) { - auto pserver_id = id % pservers; - (*origin_ids)[pserver_id].push_back(id); - id = id / pservers; - (*splited_ids)[pserver_id].push_back(id); - } - } - - for (size_t i = 0; i < in_varnames.size(); ++i) { - auto *id_tensor = - scope->Var(in_varnames[i])->GetMutable(); - - auto &ids = (*splited_ids)[i]; - if (!ids.empty()) { - auto *id_tensor_data = id_tensor->mutable_data( - framework::make_ddim({static_cast(ids.size()), 1}), place); - memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); - } - } -} - -typedef std::vector> TableAndEndpoints; - -void prefetch_core( - const std::vector &ids, const TableAndEndpoints &tables, - const framework::ExecutionContext &context, const framework::Scope &scope, - const bool is_distributed, - std::unordered_map> *recved_vec_map) { - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance( - context.Attr("trainer_id")); - - int pservers = context.Attr("pserver_num"); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &actual_ctx = *pool.Get(platform::CPUPlace()); - - std::unique_ptr local_scope = scope.NewTmpScope(); - - std::vector in_var_names; - std::vector out_var_names; - for (size_t i = 0; i < tables.size(); ++i) { - in_var_names.push_back("prefetch_send@" + tables[i].second); - out_var_names.push_back("prefetch_recv@" + tables[i].second); - } - - std::vector> split_ids; - std::vector> origin_ids; - SplitIdsIntoMultipleVarsBySection(ids, in_var_names, tables.size(), pservers, - is_distributed, local_scope.get(), - &split_ids, &origin_ids); - - // create output var in local scope - for (auto &name : out_var_names) { - local_scope->Var(name)->GetMutable(); - } - - std::vector rets; - for (size_t i 
= 0; i < in_var_names.size(); i++) { - if (NeedSend(*local_scope.get(), in_var_names[i])) { - VLOG(3) << "sending " << in_var_names[i] << " to " << tables[i].second - << " to get " << out_var_names[i] << " back"; - rets.push_back(rpc_client->AsyncPrefetchVar( - tables[i].second, actual_ctx, *local_scope.get(), in_var_names[i], - out_var_names[i], tables[i].first)); - } else { - VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; - } - } - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - - for (size_t o_idx = 0; o_idx < out_var_names.size(); ++o_idx) { - auto &ids_in_this_section = origin_ids[o_idx]; - - if (!ids_in_this_section.empty()) { - auto &prefetch_out_var = - local_scope->Var(out_var_names[o_idx])->Get(); - const auto *out_var_data = prefetch_out_var.data(); - auto &dims = prefetch_out_var.dims(); - - PADDLE_ENFORCE_EQ(dims.size(), 2, - platform::errors::InvalidArgument( - "The size of Tensor dims must be 2.")); - PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0], - platform::errors::InvalidArgument( - "The size of ids in this section must equal to " - "dims[0]: %s, but got %s", - dims[0], ids_in_this_section.size())); - - auto row_numel = dims[1]; - - for (int64_t i = 0; i < dims[0]; ++i) { - auto origin_id = ids_in_this_section[i]; - std::vector vecs(row_numel); - - std::copy_n(out_var_data + i * row_numel, row_numel, vecs.begin()); - (*recved_vec_map)[origin_id] = vecs; - } - } else { - VLOG(3) << "ids in this section is empty"; - } - } -} - -void prefetch(const std::string &id_name, const std::string &out_name, - const std::string &persistable_var_name, - const bool is_distributed, - const std::vector &table_names, - const std::vector &endpoints, - const framework::ExecutionContext &context, - const framework::Scope &scope) { - prefetchs({id_name}, {out_name}, persistable_var_name, is_distributed, - table_names, endpoints, context, scope); -} - -void prefetchs(const std::vector &id_var_names, - const std::vector &out_var_names, - const std::string &persistable_var_name, - const bool is_distributed, - const std::vector &table_names, - const std::vector &endpoints, - const framework::ExecutionContext &context, - const framework::Scope &scope) { - auto vec_dim_1 = 0; - auto vec_dim_0 = 0; - framework::Variable *var = scope.FindVar(persistable_var_name); - - if (var->IsType()) { - vec_dim_1 = var->Get().value().dims()[1]; - } else { - vec_dim_0 = var->Get().dims()[0]; - vec_dim_1 = var->Get().dims()[1]; - } - - PADDLE_ENFORCE_GT(vec_dim_1, 0, - platform::errors::InvalidArgument( - "lookup table var's dim must gather than 0")); - - const auto place = - scope.FindVar(id_var_names[0])->Get().place(); - - std::vector> ids_group; - std::vector ids_union; - std::vector ids_lods; - TableAndEndpoints tables; - - for (auto &id_name : id_var_names) { - auto &id_tensor = scope.FindVar(id_name)->Get(); - std::vector ids; - TensorToVector(id_tensor, context.device_context(), &ids); - ids_union.insert(ids_union.end(), ids.begin(), ids.end()); - ids_group.push_back(ids); - ids_lods.push_back(id_tensor.lod()); - } - - std::unordered_set s(ids_union.begin(), ids_union.end()); - ids_union.assign(s.begin(), s.end()); - - for (auto &i : ids_union) { - PADDLE_ENFORCE_GE( - i, 0, platform::errors::OutOfRange( - "each element in embedding should be larger or equal 0")); - if (!is_distributed) { - PADDLE_ENFORCE_LT( - i, vec_dim_0, - platform::errors::OutOfRange( - 
"embedding id must in [0, %d) when is_distributed False", - vec_dim_0)); - } - } - - for (size_t i = 0; i < table_names.size(); i++) { - tables.push_back(std::make_pair(table_names[i], endpoints[i])); - } - std::unordered_map> recved_vec_map; - prefetch_core(ids_union, tables, context, scope, is_distributed, - &recved_vec_map); - - auto padding_idx = distributed::kNoPadding; - - if (context.HasAttr("padding_idx")) { - padding_idx = context.Attr("padding_idx"); - } - - for (size_t i = 0; i < out_var_names.size(); i++) { - std::vector ids = ids_group[i]; - auto ids_size = ids.size(); - auto *out_t = - scope.FindVar(out_var_names[i])->GetMutable(); - out_t->set_lod(ids_lods[i]); - out_t->Resize( - framework::make_ddim({static_cast(ids_size), vec_dim_1})); - auto *out_d = out_t->mutable_data(place); - - if (platform::is_cpu_place(out_t->place())) { - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = ids[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1); - } else { - std::copy_n(recved_vec_map[id].begin(), vec_dim_1, - out_d + idx * vec_dim_1); - } - } - } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - std::vector ids_value_vec(ids_size * vec_dim_1); - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = ids[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(&ids_value_vec[idx * vec_dim_1], 0, sizeof(float) * vec_dim_1); - } else { - memcpy(&ids_value_vec[idx * vec_dim_1], &recved_vec_map[id][0], - sizeof(float) * vec_dim_1); - } - } - auto &gpu_place = BOOST_GET_CONST(platform::CUDAPlace, out_t->place()); - auto &cpu_place = BOOST_GET_CONST( - platform::CPUPlace, paddle::platform::CPUDeviceContext().GetPlace()); - auto stream = context.cuda_device_context().stream(); - memory::Copy(gpu_place, out_d, cpu_place, &ids_value_vec[0], - sizeof(float) * ids_size * vec_dim_1, stream); -#else - PADDLE_ENFORCE(true, platform::errors::PermissionDenied( - "Paddle is not compiled with GPU!")); -#endif - } - } -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h deleted file mode 100644 index 6fd3a998813c0b..00000000000000 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -constexpr int64_t kNoPadding = -1; - -void prefetchs(const std::vector& id_var_names, - const std::vector& out_var_names, - const std::string& persistable_var_name, const bool backfill, - const std::vector& table_names, - const std::vector& endpoints, - const framework::ExecutionContext& context, - const framework::Scope& scope); - -void prefetch(const std::string& id_name, const std::string& out_name, - const std::string& persistable_var_name, const bool backfill, - const std::vector& table_names, - const std::vector& endpoints, - const framework::ExecutionContext& context, - const framework::Scope& scope); - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc deleted file mode 100644 index d5d3c9c3c7c48f..00000000000000 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include - -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -template -void RecvSparseLodTensor(const CommContext &rpc_ctx, - const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::unique_ptr local_scope = scope.NewTmpScope(); - std::vector tensors; - std::vector rets; - std::vector recv_varnames; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; - local_scope->Var(recv_var_name); - // sparse param in recv_scope is LoDTensor - rets.push_back(rpc_client->AsyncGetVarNoBarrier( - rpc_ctx.epmap[i], cpu_ctx, *local_scope.get(), recv_var_name, - recv_var_name)); - recv_varnames.push_back(recv_var_name); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - auto &recv_var_name = recv_varnames[i]; - auto *local_var = local_scope->FindVar(recv_var_name); - const auto *value = local_var->Get().data(); - tensors.push_back(value); - } - - auto *merged_var = scope.FindVar(rpc_ctx.var_name); - - if (merged_var == nullptr || !merged_var->IsInitialized()) { - PADDLE_THROW( - platform::errors::InvalidArgument("%s must initialized at first.")); - } - auto dims1 = merged_var->Get().dims()[1]; - int64_t height = 0; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto *splited_var = local_scope->FindVar(rpc_ctx.splited_varnames[i]); - height += splited_var->Get().dims()[0]; - } - - PADDLE_ENFORCE_EQ( - merged_var->Get().dims()[0], height, - platform::errors::InvalidArgument( - "Received variable must has same dimension with local variable.")); - - auto *merged_t = merged_var->GetMutable(); - auto *merged_d = merged_t->mutable_data(cpu_place); - - auto pserver_num = rpc_ctx.splited_varnames.size(); - for (int x = 0; x < height; ++x) { - auto id = x % pserver_num; - auto idx = x / pserver_num; - std::memcpy(merged_d + x * dims1, tensors[id] + idx * dims1, - sizeof(float) * dims1); - } -} - -template -void RecvGeoSparseRecords(const CommContext &rpc_ctx, - const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::unique_ptr local_scope = scope.NewTmpScope(); - - std::vector rets; - for (size_t i = 0; i < 
rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - local_scope->Var(recv_var_name); - VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; - // sparse param in recv_scope is LoDTensor - rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, - *local_scope.get(), recv_var_name, - recv_var_name, recv_var_name)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - - int64_t height = 0; - int64_t ids_num = 0; - int64_t width = 0; - - std::vector all_ids; - auto pserver_num = rpc_ctx.splited_varnames.size(); - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *recv_var = local_scope->FindVar(recv_var_name); - auto &recv_t = recv_var->Get(); - - height += recv_t.height(); - ids_num += recv_t.rows().size(); - width = recv_t.value().dims()[1]; - - if (rpc_ctx.is_distributed) { - std::copy(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids)); - } else { - std::transform(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids), - [&](int64_t id) { return id * pserver_num + i; }); - } - } - - auto *var = scope.FindVar(rpc_ctx.var_name); - auto *t_ = var->GetMutable(); - T *out_data = - t_->mutable_value()->mutable_data({ids_num, width}, cpu_place); - t_->set_height(height); - t_->set_rows(all_ids); - - int64_t cnt = 0; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *recv_var = local_scope->FindVar(recv_var_name); - auto &recv_t = recv_var->Get(); - - auto rows = recv_t.rows().size(); - const T *in_data = recv_t.value().data(); - std::copy_n(in_data, rows * width, out_data + cnt); - cnt += rows * width; - } - t_->SyncIndex(); -} - -template -void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::vector rets; - - // variable do not spilt - if (rpc_ctx.origin_varnames.size() == 1 && - rpc_ctx.splited_varnames.size() == 1) { - auto varname = rpc_ctx.origin_varnames[0]; - const auto place = - scope.FindVar(varname)->Get().place(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &ctx = *pool.Get(place); - VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0] << " in gpu? 
" - << platform::is_gpu_place(place); - rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx, - scope, varname, varname)); - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE( - rets[i]->Wait(), 0U, - platform::errors::ExecutionTimeout("internal error in RPCClient")); - } - - VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; - return; - } else { - PADDLE_ENFORCE(false, platform::errors::Unimplemented( - "ParameterRecv can not recv dense with multi " - "parts now, add it soon.")); - } -} - -template -void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, - bool geo_records) { - VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name; - - PADDLE_ENFORCE_GE(rpc_ctx.origin_varnames.size(), 1, - platform::errors::InvalidArgument( - "origin_varnames.size() >= 1 is permitted")); - - if (rpc_ctx.is_sparse) { - if (geo_records) { - RecvGeoSparseRecords(rpc_ctx, scope); - } else { - RecvSparseLodTensor(rpc_ctx, scope); - } - } else { - RecvLodTensor(rpc_ctx, scope); - } - - VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; -} -template -void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope) { - this->operator()(rpc_ctx, scope, false); -} - -template struct ParameterRecv; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h deleted file mode 100644 index c30d21aa791e23..00000000000000 --- a/paddle/fluid/operators/distributed/parameter_recv.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" - -namespace paddle { -namespace operators { -namespace distributed { - -template -struct ParameterRecv { - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope, - bool barrier); - - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc deleted file mode 100644 index 109514ca2541c3..00000000000000 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ /dev/null @@ -1,331 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include -#include -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace framework { -class Scope; -class Tensor; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -typedef std::vector> EP_SPLIT_TABLE_PAIRS; - -inline EP_SPLIT_TABLE_PAIRS GetMultiFieldCommContext( - const CommContext &rpc_ctx, const framework::Scope &scope, - int multi_parts) { - EP_SPLIT_TABLE_PAIRS table_pairs; - - auto *send_var = scope.FindVar(rpc_ctx.var_name); - if (send_var->IsType()) { - PADDLE_ENFORCE_GE(multi_parts, 1, - platform::errors::InvalidArgument( - "multi_parts must == 1 in parameter send, now is: %d", - multi_parts)); - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - table_pairs.push_back( - std::make_pair(rpc_ctx.epmap[i], rpc_ctx.splited_varnames[i])); - } - - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "GetMultiFieldCommContext unsupported LoDTensor current!")); - } - - return table_pairs; -} // namespace distributed - -void SendByNotifyRPC(const CommContext &rpc_ctx, - const framework::Scope &scope) { - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto &send_var_name = rpc_ctx.var_name; - std::vector rets; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - if (NeedSend(scope, send_var_name)) { - for (size_t j = 0; j < rpc_ctx.epmap.size(); j++) { - auto &endpoint = rpc_ctx.epmap[j]; - VLOG(4) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncDistributeNotify(endpoint, cpu_ctx, scope, - send_var_name)); - VLOG(4) << "send var " << send_var_name << " by notify RPC done"; - } - } else { - VLOG(3) << "don't send non-initialized variable: " << rpc_ctx.var_name; - } - - for (auto &handle : rets) { - PADDLE_ENFORCE_NE(handle->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } -} - -template -void ParameterSend::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, bool sync, - int multi_parts) { - if (rpc_ctx.var_name == STEP_COUNTER) { - SendByNotifyRPC(rpc_ctx, scope); - return; - } - - std::unique_ptr local_scope = scope.NewTmpScope(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx = *pool.Get(platform::CPUPlace()); - - distributed::RPCClient *rpc_client = - 
distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::vector rets; - auto *send_var = scope.FindVar(rpc_ctx.var_name); - - if (send_var->IsType()) { - size_t out_num = rpc_ctx.splited_varnames.size(); - if (out_num > 1) { - auto &send_tensor = send_var->Get(); - auto &send_tensor_dims = send_tensor.dims(); - std::vector outs_dims; - outs_dims.reserve(out_num); - - // infer output shape - PADDLE_ENFORCE_EQ( - rpc_ctx.height_sections.size(), out_num, - platform::errors::InvalidArgument("tensor split sections size" - "should be equal to output size.")); - for (size_t i = 0; i < out_num; ++i) { - auto dim = send_tensor_dims; - dim[0] = rpc_ctx.height_sections[i]; - outs_dims.push_back(dim); - } - - // create output var in local scope - size_t row_offset = 0; - for (size_t i = 0; i < out_num; ++i) { - framework::Tensor *out = local_scope->Var(rpc_ctx.splited_varnames[i]) - ->GetMutable(); - *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); - row_offset += outs_dims[i][0]; - } - } else { - auto &send_tensor = send_var->Get(); - framework::Tensor *out = local_scope->Var(rpc_ctx.splited_varnames[0]) - ->GetMutable(); - out->ShareDataWith(send_tensor); - } - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &send_var_name = rpc_ctx.splited_varnames[i]; - auto &endpoint = rpc_ctx.epmap[i]; - VLOG(4) << " send var name: " << send_var_name - << "endpoint: " << endpoint; - if (NeedSend(*local_scope.get(), send_var_name)) { - VLOG(3) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncSendVar( - endpoint, cpu_ctx, *local_scope.get(), send_var_name)); - VLOG(4) << "send var " << send_var_name << " async handle done"; - } else { - VLOG(3) << "don't send non-initialized variable: " - << rpc_ctx.splited_varnames[i]; - } - } - } else if (send_var->IsType()) { - auto &send_slr = send_var->Get(); - - auto &send_rows = send_slr.rows(); - if (send_rows.size() == 0) { - LOG(WARNING) - << "WARNING: The variable sent to pserver is empty, which " - "may cause an unknown error. 
Please check the state of " - "use_double_buffer in pyreader/dataloader async mode, you need to " - "turn it false."; - } - - std::vector> outs_rows_idx; - std::vector> outs_dense_idx; - - auto table_pairs = GetMultiFieldCommContext(rpc_ctx, scope, 1); - outs_rows_idx.resize(table_pairs.size()); - outs_dense_idx.resize(table_pairs.size()); - - auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; - auto *src = send_slr.value().data(); - - // create output var in local scope - std::vector outs; - for (auto &table : table_pairs) { - auto *out = - local_scope->Var(table.second)->GetMutable(); - outs.push_back(out); - } - - if (!rpc_ctx.is_distributed) { - auto pserver_num = rpc_ctx.epmap.size(); - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - auto ep_idx = send_rows[i] % pserver_num; - auto id = send_rows[i] / pserver_num; - outs_rows_idx[ep_idx].push_back(id); - outs_dense_idx[ep_idx].push_back(i); - } - - auto place = platform::CPUPlace(); - - for (size_t out_idx = 0; out_idx < rpc_ctx.splited_varnames.size(); - out_idx++) { - auto rows_idx = outs_rows_idx[out_idx]; - - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - outs[out_idx]->set_height(rpc_ctx.height_sections[out_idx]); - outs[out_idx]->mutable_rows()->clear(); - outs[out_idx]->mutable_value()->mutable_data(dims, send_slr.place()); - - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[out_idx]->mutable_rows()->push_back(idx); - } - auto dst = outs[out_idx]->mutable_value()->mutable_data(place); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), - src + outs_dense_idx[out_idx][j] * row_numel, - sizeof(T) * row_numel); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("do not support GPU now")); - } - } - } - PADDLE_ENFORCE_EQ( - rows_idx.size(), outs[out_idx]->rows().size(), - platform::errors::InvalidArgument( - "rows should has the same size with tensor dim 0")); - } - } else { - auto pserver_num = rpc_ctx.epmap.size(); - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - auto out_idx = send_rows[i] % pserver_num; - outs_rows_idx[out_idx].push_back(send_rows[i]); - outs_dense_idx[out_idx].push_back(i); - } - - auto place = platform::CPUPlace(); - - for (size_t out_idx = 0; out_idx < rpc_ctx.splited_varnames.size(); - out_idx++) { - auto rows_idx = outs_rows_idx[out_idx]; - - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - - outs[out_idx]->set_height(rpc_ctx.height_sections[out_idx]); - outs[out_idx]->mutable_rows()->clear(); - outs[out_idx]->mutable_value()->mutable_data(dims, send_slr.place()); - - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[out_idx]->mutable_rows()->push_back(idx); - } - auto dst = outs[out_idx]->mutable_value()->mutable_data(place); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), - src + outs_dense_idx[out_idx][j] * row_numel, - sizeof(T) * row_numel); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("do not support GPU now")); - } - } - } - PADDLE_ENFORCE_EQ( - rows_idx.size(), outs[out_idx]->rows().size(), - platform::errors::InvalidArgument( - "rows should has the same size with tensor dim 0")); - } - } - - for (size_t i = 0; i < table_pairs.size(); i++) { - 
auto &send_var_name = table_pairs[i].second; - auto &endpoint = table_pairs[i].first; - auto need_send = NeedSend(*local_scope.get(), send_var_name); - - VLOG(4) << "send var name: " << send_var_name - << " send var endpoint: " << endpoint - << " need send: " << need_send; - - if (need_send) { - VLOG(4) << "sending " << send_var_name << " to " << endpoint; - - rets.push_back(rpc_client->AsyncSendVar( - endpoint, cpu_ctx, *local_scope.get(), send_var_name)); - VLOG(4) << "send var " << send_var_name << " async handle done"; - } else { - VLOG(4) << "don't send non-initialized variable: " - << rpc_ctx.splited_varnames[i]; - } - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unsupported var type: %s to send!", send_var->Type())); - } - - VLOG(4) << "Prepare to send var " << rpc_ctx.var_name; - if (sync) { - for (auto &handle : rets) { - VLOG(4) << "Wait send var to pserver handle: " << handle; - PADDLE_ENFORCE_NE(handle->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - } -} - -template struct ParameterSend; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h deleted file mode 100644 index cedc98b1fcadd4..00000000000000 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. 
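The borrowed encoder below leans on protobuf's base-128 varint wire format: seven payload bits per byte, with the high bit marking that more bytes follow. A minimal, self-contained sketch of that encoding, assuming only the C++ standard library (the vector-returning signature is this sketch's own convenience, not the helper's interface):

#include <cstdint>
#include <cstdio>
#include <vector>

// Encode a 32-bit unsigned value as a protobuf-style base-128 varint.
std::vector<unsigned char> ToVarint32(uint32_t v) {
  std::vector<unsigned char> out;
  while (v >= 0x80) {
    out.push_back(static_cast<unsigned char>(v | 0x80));  // set continuation bit
    v >>= 7;                                               // move on to the next 7 bits
  }
  out.push_back(static_cast<unsigned char>(v));            // last byte: high bit clear
  return out;
}

int main() {
  // 300 encodes to AC 02, the canonical protobuf documentation example.
  for (unsigned char b : ToVarint32(300)) std::printf("%02X ", b);
  std::printf("\n");
  return 0;
}

The deleted header implements the same idea in place (EncodeVarint32/EncodeVarint64 writing into a caller-provided buffer), which is what lets tensor payloads be framed for gRPC without an extra copy.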
- -#pragma once - -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -char* EncodeVarint32(char* dst, uint32_t v) { - // Operate on characters as unsigneds - unsigned char* ptr = reinterpret_cast(dst); - static const int B = 128; - if (v < (1 << 7)) { - *(ptr++) = v; - } else if (v < (1 << 14)) { - *(ptr++) = v | B; - *(ptr++) = v >> 7; - } else if (v < (1 << 21)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = v >> 14; - } else if (v < (1 << 28)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = v >> 21; - } else { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = (v >> 21) | B; - *(ptr++) = v >> 28; - } - return reinterpret_cast(ptr); -} - -char* EncodeVarint64(char* dst, uint64_t v) { - static const int B = 128; - unsigned char* ptr = reinterpret_cast(dst); - while (v >= B) { - *(ptr++) = (v & (B - 1)) | B; - v >>= 7; - } - *(ptr++) = static_cast(v); - return reinterpret_cast(ptr); -} - -int VarintLength(uint64_t v) { - int len = 1; - while (v >= 128) { - v >>= 7; - len++; - } - return len; -} - -class ProtoEncodeHelper { - public: - ProtoEncodeHelper(char* buf, int max_size) - : base_(buf), p_(buf), limit_(base_ + max_size) {} - - ~ProtoEncodeHelper() {} - - const char* data() const { return base_; } - size_t size() const { return p_ - base_; } - - void WriteUint64(int tag, uint64_t v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - Encode64(v); - } - void WriteBool(int tag, bool v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - EncodeBool(v); - } - void WriteString(int tag, const std::string& v) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(v.size()); - EncodeBytes(v.data(), v.size()); - } - void WriteVarlengthBeginning(int tag, uint32_t len) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(len); - } - void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); } - - private: - // Note: this module's behavior must match the protocol buffer wire encoding - // format. - enum { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, - }; - static uint32_t combine(uint32_t tag, uint32_t type) { - return ((tag << 3) | type); - } - inline void Encode32(uint32_t v) { - if (v < 128) { - // Fast path for single-byte values. Many of the calls will use a - // constant value for v, so the comparison will get optimized away - // when Encode32 is inlined into the caller. - *p_ = v; - p_++; - } else { - p_ = EncodeVarint32(p_, v); - } - } - void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); } - void EncodeBool(bool v) { - *p_ = (v ? 1 : 0); // Equal to varint32 encoding of 0 or 1 - p_++; - } - void EncodeBytes(const char* bytes, int N) { - memcpy(p_, bytes, N); - p_ += N; - } - - char* base_; - char* p_; - char* limit_; // Just for CHECKs -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h deleted file mode 100644 index 44359af1b1b2a6..00000000000000 --- a/paddle/fluid/operators/distributed/request_handler.h +++ /dev/null @@ -1,261 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include // NOLINT - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/macros.h" - -namespace paddle { -namespace operators { -namespace distributed { - -constexpr char kRequestSend[] = "RequestSend"; -constexpr char kRequestGet[] = "RequestGet"; -constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable"; -constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; -constexpr char kRequestPrefetch[] = "RequestPrefetch"; -constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; -constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; -constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; -constexpr char kRequestNotify[] = "RequestNotify"; -constexpr char kRequestSendAndRecv[] = "RequestSendAndRecv"; - -constexpr char kSendRPC[] = "SendRPC"; -constexpr char kGetRPC[] = "GetRPC"; -constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC"; -constexpr char kGetMonomerRPC[] = "GetMonomerRPC"; -constexpr char kPrefetchRPC[] = "PrefetchRPC"; -constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC"; -constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; -constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; -constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; -constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; -constexpr char kSendAndRecvRPC[] = "SendAndRecvRPC"; -constexpr int64_t kPrefetchTimeout = 60000; - -#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" -#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" -#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" -#define COMPLETE_MESSAGE "COMPLETE@RECV" -#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV" -#define LEARNING_RATE_DECAY_COUNTER "@LR_DECAY_COUNTER@" -#define STEP_COUNTER "@PS_STEP_COUNTER@" - -#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" -#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" - -enum DistributedMode { kSync = 0, kAsync = 1, kHalfAsync = 2, kGeo = 3 }; - -class RPCServer; - -class VarHandle { - public: - VarHandle(const std::string ep, const std::string& method, - const std::string& name, - const platform::DeviceContext* p_ctx = nullptr, - const framework::Scope* p_scope = nullptr) - : status_(kDefaultState) { - ep_ = ep; - ctx_ = p_ctx; - scope_ = p_scope; - name_ = name; - method_ = method; - } - - virtual ~VarHandle() {} - - public: - bool should_retry = false; - - bool Wait() { - int ret = kDefaultState; - { - std::unique_lock lk(sync_mutex_); - wait_cond_.wait(lk, [this] { return status_ != kDefaultState; }); - ret = status_; - } - VLOG(7) << "VarHandle wait:" << ret; - return ret != kErrorState; - } - - void Finish(bool ok) { - { - std::unique_lock lk(sync_mutex_); - status_ = ok ? 
kFinishState : kErrorState; - } - VLOG(7) << "VarHandle finish:" << ok; - wait_cond_.notify_all(); - } - - std::string String() const { - std::ostringstream s; - s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], status:[" - << status_ << "]"; - return s.str(); - } - - std::string ep() const { return ep_; } - const platform::DeviceContext* ctx() const { return ctx_; } - const framework::Scope* scope() const { return scope_; } - std::string name() const { return name_; } - std::string method() const { return method_; } - - protected: - // RPC endpoint. - std::string ep_; - const platform::DeviceContext* ctx_; - const framework::Scope* scope_; - // Variable name. - std::string name_; - // RPC method name. - std::string method_; - - protected: - std::mutex sync_mutex_; - std::condition_variable wait_cond_; - - enum VarHandleStatus { - kDefaultState = -1, - kErrorState = 0, - kFinishState = 1, - }; - VarHandleStatus status_; - - private: - DISABLE_COPY_AND_ASSIGN(VarHandle); -}; - -typedef std::shared_ptr VarHandlePtr; - -class RequestHandler { - public: - explicit RequestHandler(int distributed_mode) - : distributed_mode_(distributed_mode), - dev_ctx_(nullptr), - executor_(nullptr), - scope_(nullptr), - program_(nullptr), - rpc_server_(nullptr) {} - - virtual ~RequestHandler() {} - - // Set attributes. - void SetScope(framework::Scope* scope) { scope_ = scope; } - void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } - void SetProgram(framework::ProgramDesc* program) { program_ = program; } - void SetExecutor(framework::Executor* executor) { executor_ = executor; } - - // Used for dist lookup table prefetch - void SetPrefetchPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - prefetch_var_name_to_prepared_ctx_ = g; - } - - void SetCheckpointNotifyPreparedCtx( - std::shared_ptr g) { - checkpoint_prepared_ctx_ = g; - } - - // Used for async. - void SetGradToPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - grad_to_prepared_ctx_ = g; - } - - void SetSparseGradToParam(std::unordered_map* g) { - sparse_grad_to_param_ = g; - } - - void SetLrDecayPreparedCtx( - std::shared_ptr g) { - lr_decay_prepared_ctx_ = g; - } - - void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; } - - // Get attributes. - int distributed_mode() { return distributed_mode_; } - framework::Scope* scope() { return scope_; } - const platform::DeviceContext* dev_ctx() { return dev_ctx_; } - framework::ProgramDesc* program() { return program_; } - framework::Executor* executor() { return executor_; } - - // This function processes user's rpc request. - // The implemention is in request_handler_impl. 
- // example: - // std::string varname = request_.varname(); - // - // auto scope = request_handler_->scope(); - // auto invar = scope->FindVar(varname); - // framework::Variable* outvar = nullptr; - // - // request_handler_->Handle(varname, scope, invar, &outvar); - // if (outvar) { - // SerializeToByteBuffer(varname, outvar, - // *request_handler_->dev_ctx(), &reply_); - // } - virtual bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "", - const std::string& table_name = "") = 0; - - protected: - const int distributed_mode_; - - const platform::DeviceContext* dev_ctx_; - framework::Executor* executor_; - framework::Scope* scope_; - framework::ProgramDesc* program_; - - // used for distribute lookup table prefetch - std::unordered_map>* - prefetch_var_name_to_prepared_ctx_; - // used for checkpoint notify - std::shared_ptr checkpoint_prepared_ctx_; - - // Used for async. - std::unordered_map>* - grad_to_prepared_ctx_; - std::unordered_map* sparse_grad_to_param_; - - // used for lr decay - std::shared_ptr lr_decay_prepared_ctx_; - RPCServer* rpc_server_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc deleted file mode 100644 index 8c4f2ef57a32c8..00000000000000 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ /dev/null @@ -1,354 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/string/piece.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/split.h" - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" - -namespace paddle { -namespace operators { -namespace distributed { - -// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables -// to directory specified. 
-constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; - -bool RequestSendHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestSendHandler:" << varname; - - // Sync - if (varname == BATCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; - rpc_server_->IncreaseBatchBarrier(kRequestSend); - } else if (varname == COMPLETE_MESSAGE) { - VLOG(3) << "sync: recv complete message"; - - if (HeartBeatMonitor::GetInstance() != nullptr) { - HeartBeatMonitor::GetInstance()->Update(trainer_id, "", COMPLETED); - } - - rpc_server_->Complete(); - } else { - // Async - if (distributed_mode_ != DistributedMode::kSync) { - VLOG(3) << "async process var: " << varname; - if (varname == BATCH_BARRIER_MESSAGE) { - PADDLE_THROW(platform::errors::InvalidArgument( - "async mode should not recv BATCH_BARRIER_MESSAGE or " - "COMPLETE_MESSAGE")); - } - HeartBeatMonitor::GetInstance()->Update(trainer_id, varname, RUNNING); - - std::string run_varname = varname; - - string::Piece part_piece("@PIECE"); - string::Piece var_name_piece = string::Piece(varname); - - if (string::Contains(var_name_piece, part_piece)) { - auto varname_splits = paddle::string::Split(varname, '@'); - PADDLE_ENFORCE_EQ( - varname_splits.size(), 3, - platform::errors::InvalidArgument( - "varname: %s should be separated into 3 parts by @", varname)); - run_varname = varname_splits[0]; - scope->Rename(varname, run_varname); - } - - auto *var = scope->FindVar(run_varname); - - // for sparse ids - if (var->IsType()) { - if (distributed_mode_ == DistributedMode::kAsync || - distributed_mode_ == DistributedMode::kHalfAsync) { - auto *ins = distributed::LargeScaleKV::GetInstance(); - if (ins->GradInLargeScale(run_varname)) { - auto *large_scale_var = ins->GetByGrad(run_varname); - - for (auto name : large_scale_var->CachedVarnames()) { - scope->Var(name); - } - } - } - if (distributed_mode_ == DistributedMode::kGeo) { - if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad( - run_varname)) { - auto &grad_slr = - scope->FindVar(run_varname)->Get(); - AsyncSparseParamUpdateRecorder::GetInstance()->Update( - run_varname, grad_slr.rows()); - } - } - } - - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[run_varname].get(), - scope); - return true; - } else { // sync - rpc_server_->WaitCond(kRequestSend); - VLOG(3) << "sync: processing received var: " << varname; - PADDLE_ENFORCE_NOT_NULL( - invar, platform::errors::NotFound( - "sync: Can not find server side var %s.", varname)); - } - } - return true; -} - -bool RequestGetHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "RequestGetHandler:" << varname - << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id - << " table_name: " << table_name; - - if (distributed_mode_ == DistributedMode::kSync) { - if (varname == FETCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv fetch barrier message"; - rpc_server_->IncreaseBatchBarrier(kRequestGet); - } else { - rpc_server_->WaitCond(kRequestGet); - *outvar = scope_->FindVar(varname); - } - } else { - if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) { - if (enable_dc_asgd_) { - // NOTE: the format is determined by 
distribute_transpiler.py - std::string param_bak_name = - string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); - VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; - auto var = scope_->FindVar(varname); - auto t_orig = var->Get(); - auto param_bak = scope_->Var(param_bak_name); - auto t = param_bak->GetMutable(); - t->mutable_data(dev_ctx_->GetPlace(), t_orig.type()); - VLOG(3) << "copying " << varname << " to " << param_bak_name; - framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); - } - - if (distributed_mode_ == DistributedMode::kGeo && - AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) && - !table_name.empty()) { - VLOG(3) << "AsyncSparseParamUpdateRecorder " << varname << " exist "; - - std::vector updated_rows; - AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear( - varname, trainer_id, &updated_rows); - - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto &row_id : updated_rows) { - sstream << row_id << ", "; - } - sstream << "]"; - VLOG(3) << "updated_rows size: " << updated_rows.size() << " " - << sstream.str(); - } - - auto &origin_tensor = - scope_->FindVar(varname)->Get(); - auto *origin_tensor_data = origin_tensor.data(); - auto &dims = origin_tensor.dims(); - *outvar = scope->Var(); - auto *out_slr = (*outvar)->GetMutable(); - out_slr->set_rows(updated_rows); - out_slr->set_height(dims[0]); - auto out_dims = framework::make_ddim( - {static_cast(updated_rows.size()), dims[1]}); - auto *data = out_slr->mutable_value()->mutable_data( - out_dims, origin_tensor.place()); - auto width = dims[1]; - for (size_t i = 0; i < updated_rows.size(); ++i) { - PADDLE_ENFORCE_LT( - updated_rows[i], dims[0], - platform::errors::OutOfRange( - "The value of updated_rows: %s out of Tensor %s dims[0]: %s", - updated_rows[i], varname, dims[0])); - memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width, - sizeof(float) * width); - } - } else { - *outvar = scope_->FindVar(varname); - } - } - } - return true; -} - -bool RequestGetNoBarrierHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestGetNoBarrierHandler:" << varname - << " out_var_name: " << out_var_name; - - // get var from pserver immediately without barriers - string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE); - string::Piece var_name_piece = string::Piece(varname); - - if (string::Contains(var_name_piece, without_barrier_piece)) { - var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece); - VLOG(4) << "Get var " << var_name_piece << " with " - << WITHOUT_BARRIER_MESSAGE; - *outvar = scope_->FindVar(var_name_piece.ToString()); - return true; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE)); - } - return true; -} - -bool RequestPrefetchHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestPrefetchHandler " << varname; - - (*outvar)->GetMutable(); - - VLOG(1) << "Prefetch " - << "tablename: " << table_name << " ids:" << varname - << " out: " << out_var_name; - paddle::platform::CPUPlace cpu_place; - auto *ins = distributed::LargeScaleKV::GetInstance(); - - if 
(ins->ParamInLargeScale(table_name)) { - auto lookup_table_op = PullLargeScaleOp(table_name, varname, out_var_name); - lookup_table_op->Run(*scope, cpu_place); - } else { - auto lookup_table_op = - BuildLookupTableOp(table_name, varname, out_var_name); - lookup_table_op->Run(*scope, cpu_place); - } - - return true; -} - -bool RequestCheckpointHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "receive save var " << varname << " with path " << out_var_name - << " mode " << table_name; - - int mode = std::stoi(table_name); - - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Save(out_var_name, mode); - return true; -} - -bool RequestNotifyHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "RequestNotifyHandler: " << varname - << ", trainer_id: " << trainer_id; - - string::Piece decay_piece(STEP_COUNTER); - string::Piece var_name_piece = string::Piece(varname); - if (string::Contains(var_name_piece, decay_piece)) { - VLOG(3) << "LearningRate Decay Counter Update"; - - auto *send_var = scope->FindVar(varname); - auto send_var_tensor = send_var->Get(); - auto *send_value = - send_var_tensor.mutable_data(send_var_tensor.place()); - - auto counter = decay_counters.at(trainer_id); - counter += send_value[0]; - decay_counters.at(trainer_id) = counter; - - auto *global_step_var = this->scope()->FindVar(LEARNING_RATE_DECAY_COUNTER); - if (global_step_var == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not find LEARNING_RATE_DECAY_COUNTER ")); - } - - auto *tensor = global_step_var->GetMutable(); - auto *value = tensor->mutable_data(platform::CPUPlace()); - - auto global_counter = 0; - for (auto &trainer_counter : decay_counters) { - global_counter += trainer_counter.second; - } - value[0] = global_counter; - - if (lr_decay_prepared_ctx_.get() == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not find decay block for executor")); - } - - executor_->RunPreparedContext(lr_decay_prepared_ctx_.get(), scope_); - } - return true; -} - -bool RequestSendAndRecvHandler::Handle(const std::string &varname, - framework::Scope *Scope, - framework::Variable *var, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "SendAndRecvHandle: " << varname - << " out_var_name: " << out_var_name - << " , trainer_id: " << trainer_id; - - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), Scope); - *outvar = Scope->FindVar(out_var_name); - return true; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h deleted file mode 100644 index 6d239673f91041..00000000000000 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestSendHandler final : public RequestHandler { - public: - explicit RequestSendHandler(int distributed_mode, bool enable_dc_asgd = false) - : RequestHandler(distributed_mode) { - enable_dc_asgd_ = enable_dc_asgd; - } - virtual ~RequestSendHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - bool enable_dc_asgd_; -}; - -class RequestGetHandler final : public RequestHandler { - public: - explicit RequestGetHandler(int distributed_mode, bool enable_dc_asgd = false) - : RequestHandler(distributed_mode) { - enable_dc_asgd_ = enable_dc_asgd; - } - virtual ~RequestGetHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - bool enable_dc_asgd_; -}; - -class RequestGetNoBarrierHandler final : public RequestHandler { - public: - RequestGetNoBarrierHandler() : RequestHandler(false) {} - virtual ~RequestGetNoBarrierHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; -}; - -static inline void BuildVar(const std::string& param_name, - std::initializer_list arguments, - paddle::framework::proto::OpDesc::Var* var) { - var->set_parameter(param_name); - for (auto& arg_name : arguments) { - *var->mutable_arguments()->Add() = arg_name; - } -} - -class RequestPrefetchHandler final : public RequestHandler { - public: - explicit RequestPrefetchHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - virtual ~RequestPrefetchHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - std::unique_ptr PullLargeScaleOp( - const std::string& table_name, const std::string& id_name, - const std::string& out_name) { - framework::OpDesc desc; - 
desc.SetType("lookup_sparse_table_read"); - desc.SetInput("Ids", {id_name}); - desc.SetOutput("Out", std::vector({out_name})); - desc.SetAttr("tablename", {table_name}); - desc.SetAttr("init", true); - desc.SetAttr("value_names", std::vector({"Param"})); - - auto op = paddle::framework::OpRegistry::CreateOp(desc); - return op; - } - - std::unique_ptr BuildLookupTableOp( - const std::string& table_name, const std::string& id_name, - const std::string& out_name) { - paddle::framework::proto::OpDesc op_desc; - op_desc.set_type("lookup_table"); - BuildVar("W", {table_name.data()}, op_desc.add_inputs()); - BuildVar("Ids", {id_name.data()}, op_desc.add_inputs()); - BuildVar("Out", {out_name.data()}, op_desc.add_outputs()); - - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - return op; - } -}; - -class RequestCheckpointHandler final : public RequestHandler { - public: - explicit RequestCheckpointHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - - virtual ~RequestCheckpointHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - std::unique_ptr BuildCheckpointOp( - const std::string& varname, const std::string& file_path) { - paddle::framework::proto::OpDesc op_desc; - op_desc.set_type("save"); - BuildVar("X", {varname.data()}, op_desc.add_inputs()); - - auto attr = op_desc.mutable_attrs()->Add(); - attr->set_name("file_path"); - attr->set_type(paddle::framework::proto::AttrType::STRING); - attr->set_s(file_path); - - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - return op; - } -}; - -class RequestNotifyHandler final : public RequestHandler { - public: - explicit RequestNotifyHandler(int distributed_mode, int trainers) - : RequestHandler(distributed_mode) { - this->trainers = trainers; - for (int i = 0; i < trainers; i++) { - decay_counters[i] = 0; - } - } - virtual ~RequestNotifyHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - int trainers; - std::unordered_map decay_counters; -}; - -class RequestSendAndRecvHandler final : public RequestHandler { - public: - explicit RequestSendAndRecvHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - virtual ~RequestSendAndRecvHandler() {} - bool Handle(const std::string& varname, framework::Scope* Scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc deleted file mode 100644 index 57ce54870decf2..00000000000000 --- a/paddle/fluid/operators/distributed/rpc_client.cc +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "gflags/gflags.h" - -// default to 3min to avoid temporary network failures. -DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc"); -DEFINE_int32(rpc_retry_times, 3, "retry times for rpc"); - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag RPCClient::init_flag_; -std::unique_ptr<RPCClient> RPCClient::rpc_client_(nullptr); -int RPCClient::trainer_id_ = 0; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h deleted file mode 100644 index 2c756a6f71ff94..00000000000000 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include // NOLINT -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -DECLARE_int32(rpc_deadline); -DECLARE_int32(rpc_retry_times); - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient { - public: - RPCClient() {} - virtual ~RPCClient() {} - virtual VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncPrefetchVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& in_var_name, - const std::string& out_var_name, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendAndRecv( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& send_var_name, - const std::string& recv_var_name, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendComplete( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - // Complete tells all the pserver instances that finishe the training, - // the pserver can reduce it's barrier count, and continue to train - // with other trainers. - virtual void SendComplete() = 0; - - virtual bool Wait() = 0; - - template - static RPCClient* GetInstance(int trainer_id) { - std::call_once(init_flag_, &RPCClient::Init, trainer_id); - return rpc_client_.get(); - } - - // Init is called by GetInstance. 
- template - static void Init(int trainer_id) { - VLOG(1) << "init rpc client with trainer_id " << trainer_id; - trainer_id_ = trainer_id; - if (rpc_client_.get() == nullptr) { - rpc_client_.reset(new T()); - rpc_client_->InitImpl(); - } - } - - virtual void InitImpl() {} - - protected: - // each trainer have exact one trainer id, it should be static - static int trainer_id_; - - private: - static std::once_flag init_flag_; - static std::unique_ptr rpc_client_; -}; -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc deleted file mode 100644 index 37cf0460fb1fa1..00000000000000 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/rpc_server.h" - -#include -#include - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestHandler; - -void RPCServer::ShutDown() { - VLOG(3) << "RPCServer ShutDown "; - ShutDownImpl(); - - exit_flag_ = true; - barrier_cond_.notify_all(); - rpc_cond_.notify_all(); -} - -void RPCServer::SavePort() const { - auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); - std::ofstream port_file; - port_file.open(file_path); - port_file << selected_port_; - port_file.close(); - VLOG(3) << "selected port written to " << file_path; -} - -void RPCServer::WaitBarrier(const std::string& rpc_name) { - VLOG(3) << "WaitBarrier in: " << rpc_name; - std::unique_lock lock(this->mutex_); - barrier_cond_.wait(lock, [this, &rpc_name] { - return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || - exit_flag_.load()); - }); - - VLOG(3) << "WaitBarrier out: " << rpc_name - << " counter: " << barrier_counter_[rpc_name]; -} - -void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; - // barrier msg should make sure that it's in the right cond(send|recv) - WaitCond(rpc_name); - int b = 0; - std::unique_lock lock(mutex_); - b = ++barrier_counter_[rpc_name]; - VLOG(3) << rpc_name << " barrier_counter: " << b; - if (b >= client_num_) { - lock.unlock(); - VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for " - << rpc_name; - barrier_cond_.notify_all(); - lock.lock(); - } -} - -void RPCServer::Complete() { - { - std::unique_lock lock(mutex_); - client_num_--; - need_reset_all_vars_ = true; - - VLOG(3) << "decrease client_num to: " << client_num_; - if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { - barrier_counter_[kRequestGet]--; - } - } - barrier_cond_.notify_all(); -} - -bool RPCServer::NeedResetAllVars() { - std::unique_lock 
lock(mutex_); - return need_reset_all_vars_; -} - -int RPCServer::GetClientNum() { - std::unique_lock lock(mutex_); - return client_num_; -} - -void RPCServer::ResetBarrierCounter() { - VLOG(3) << "RPCServer ResetBarrierCounter "; - std::unique_lock lock(mutex_); - for (auto& t : barrier_counter_) { - t.second = 0; - } - need_reset_all_vars_ = false; -} - -void RPCServer::RegisterRPC(const std::string& rpc_name, - RequestHandler* handler, int thread_num) { - rpc_call_map_[rpc_name] = handler; - rpc_thread_num_[rpc_name] = thread_num; - - static int cond = -1; - rpc_cond_map_[rpc_name] = ++cond; - VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler - << ", cond: " << rpc_cond_map_[rpc_name]; -} - -void RPCServer::SetCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer SetCond " << rpc_name; - { - std::unique_lock lock(mutex_); - cur_cond_ = rpc_cond_map_[rpc_name]; - } - - rpc_cond_.notify_all(); -} - -void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer WaitCond in " << rpc_name; - int cond = 0; - { - std::unique_lock lock(mutex_); - cond = rpc_cond_map_[rpc_name]; - } - - std::unique_lock lock(mutex_); - rpc_cond_.wait( - lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); - VLOG(3) << "RPCServer WaitCond out " << rpc_name; -} - -void RPCServer::RegisterVar(const std::string& var_name, - const std::string& rpc_name, - framework::Scope* scope, - platform::DeviceContext* dev_ctx) { - MonomerHandle h; - h.var_name_ = var_name; - h.rpc_name_ = rpc_name; - h.scope_ = scope; - h.dev_ctx_ = dev_ctx; - - { - std::unique_lock lock(mutex_); - PADDLE_ENFORCE_EQ( - var_map_.find(var_name), var_map_.end(), - platform::errors::AlreadyExists("%s already in var_map.", var_name)); - var_map_[var_name] = h; - } - - rpc_cond_.notify_all(); - VLOG(3) << "RegisterVar context:" << h.String(); -} - -void RPCServer::IncreaseVarBarrier(const std::string& var_name) { - int b = 0; - MonomerHandle h; - { - std::unique_lock lock(mutex_); - b = ++var_map_[var_name].barrier_; - h = var_map_[var_name]; - } - - if (b >= client_num_) { - barrier_cond_.notify_all(); - } - - VLOG(3) << "IncreaseVarBarrier context:" << h.String(); -} - -void RPCServer::WaitVarBarrier(const std::string& var_name) { - VLOG(3) << "WaitVarBarrier var_name:" << var_name; - - std::unique_lock lock(mutex_); - barrier_cond_.wait(lock, [&]() { - return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) || - exit_flag_.load()); - }); - - VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String(); -} - -void RPCServer::SetVarCond(const std::string& var_name) { - VLOG(3) << "SetVarCond var_name:" << var_name; - { - std::unique_lock lock(mutex_); - if (var_map_.find(var_name) != var_map_.end()) { - rpc_cond_.notify_all(); - } - } -} - -void RPCServer::WaitVarCond(const std::string& var_name) { - VLOG(3) << "WaitVarCond var_name:" << var_name; - - std::unique_lock lock(mutex_); - rpc_cond_.wait(lock, [=] { - return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); - }); - - VLOG(3) << "WaitVarCond var_name:" << var_name << " end"; -} - -MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { - MonomerHandle h; - { - std::unique_lock lock(mutex_); - h = var_map_[var_name]; - } - - return h; -} - -void RPCServer::ClearRegisteredVars() { - std::unique_lock lock(mutex_); - var_map_.clear(); -} - -void RPCServer::ClearVar(const std::string& var_name) { - std::unique_lock lock(mutex_); - var_map_.erase(var_name); -} -} // namespace 
distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h deleted file mode 100644 index 2120260515e255..00000000000000 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include // NOLINT -#include -#include -#include - -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestHandler; - -struct MonomerHandle { - std::string var_name_; - std::string rpc_name_; - framework::Scope* scope_{nullptr}; - platform::DeviceContext* dev_ctx_{nullptr}; - int64_t barrier_{0}; - - std::string String() { - std::stringstream ss; - ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_ - << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_ - << ", barrier_:" << barrier_; - return ss.str(); - } -}; - -class RPCServer { - public: - explicit RPCServer(const std::string& address, int client_num) - : cur_cond_(0), - bind_address_(address), - exit_flag_(false), - selected_port_(0), - client_num_(client_num), - need_reset_all_vars_(false) {} - - virtual ~RPCServer() {} - virtual void StartServer() = 0; - virtual void WaitServerReady() = 0; - - void ShutDown(); - - bool IsExit() { return exit_flag_.load(); } - - int GetSelectedPort() const { return selected_port_; } - - int GetClientNum(); - - void SavePort() const; - - // RegisterRPC, register the rpc method name to a handler - // class, and auto generate a condition id for this call - // to be used for the barrier. - void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, - int thread_num = 1); - - int GetThreadNum(const std::string& rpc_name) { - return rpc_thread_num_[rpc_name]; - } - - // Wait util all the clients have reached the barrier for one - // rpc method. This function should be called in the - // RequestHandler if you want to run the server/client in a - // synchronous mode. 
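The barrier bookkeeping declared next boils down to a per-RPC counter guarded by a mutex and condition variable: trainers bump the counter, the server blocks until it reaches client_num_, then the counter is reset for the next round. A hedged standalone sketch of that pattern (BatchBarrier is a hypothetical name used only for illustration; the real server keys counters by rpc_name and also wakes waiters when exit_flag_ is set):

#include <condition_variable>
#include <mutex>

// Counter-based barrier: Increase() is the trainer-side bump,
// Wait() is the server-side block, Reset() starts the next round.
class BatchBarrier {
 public:
  explicit BatchBarrier(int client_num) : client_num_(client_num) {}

  void Increase() {
    std::unique_lock<std::mutex> lock(mutex_);
    if (++counter_ >= client_num_) cond_.notify_all();
  }

  void Wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    cond_.wait(lock, [this] { return counter_ >= client_num_; });
  }

  void Reset() {
    std::unique_lock<std::mutex> lock(mutex_);
    counter_ = 0;
  }

 private:
  std::mutex mutex_;
  std::condition_variable cond_;
  const int client_num_;
  int counter_{0};
};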
- void WaitBarrier(const std::string& rpc_name); - - void SetCond(const std::string& rpc_name); - void WaitCond(const std::string& rpc_name); - void IncreaseBatchBarrier(const std::string rpc_name); - - void RegisterVar(const std::string& var_name, const std::string& rpc_name, - framework::Scope* scope, platform::DeviceContext* dev_ctx); - void IncreaseVarBarrier(const std::string& var_name); - void WaitVarBarrier(const std::string& var_name); - void SetVarCond(const std::string& var_name); - void WaitVarCond(const std::string& var_name); - void ClearRegisteredVars(); - void ClearVar(const std::string& var_name); - MonomerHandle GetMonomer(const std::string& var_name); - - void Complete(); - - void ResetBarrierCounter(); - - bool NeedResetAllVars(); - - protected: - virtual void ShutDownImpl() = 0; - - private: - std::mutex mutex_; - std::unordered_map barrier_counter_; - std::condition_variable barrier_cond_; - - std::unordered_map rpc_cond_map_; - std::atomic cur_cond_; - std::condition_variable rpc_cond_; - - protected: - std::string bind_address_; - std::atomic exit_flag_; - int selected_port_; - int client_num_; - bool need_reset_all_vars_; - - std::unordered_map rpc_call_map_; - std::unordered_map rpc_thread_num_; - friend class RequestHandler; - - // TODO(gongwb): use more cond to notify or wait; - std::unordered_map var_map_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc deleted file mode 100644 index f59285400033df..00000000000000 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ /dev/null @@ -1,344 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace distributed = paddle::operators::distributed; - -USE_NO_KERNEL_OP(lookup_sparse_table_read); -USE_NO_KERNEL_OP(checkpoint_notify); -USE_OP(scale); - -std::unique_ptr g_rpc_service; -std::unique_ptr g_req_handler; - -framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { - auto root_block = program->MutableBlock(0); - auto* block = program->AppendBlock(*root_block); - - framework::OpDesc* op = block->AppendOp(); - op->SetType("scale"); - op->SetInput("X", {"x"}); - op->SetOutput("Out", {"res"}); - op->SetAttr("scale", 0.5f); - - auto& out = *root_block->Var("res"); - out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetShape({1, 10}); - - return block; -} - -void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { - auto w_var = scope->Var("w"); - w_var->GetMutable(); - - auto out_var = scope->Var("out"); - out_var->GetMutable(); - - auto ids_var = scope->Var("ids"); - ids_var->GetMutable(); - - auto x_var = scope->Var("x"); - x_var->GetMutable(); - - auto res_var = scope->Var("res"); - res_var->GetMutable(); -} - -void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto ids_var = scope->Var("ids")->GetMutable(); - int64_t* ids_ptr = - ids_var->mutable_data(framework::DDim({rows_numel, 1}), *place); - for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; - - auto x_var = scope->Var("x")->GetMutable(); - float* x_ptr = - x_var->mutable_data(framework::DDim({1, rows_numel}), *place); - for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0; -} - -void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto w = scope->Var("w")->GetMutable(); - auto w_value = w->mutable_value(); - w_value->Resize({rows_numel, 10}); - for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); - - auto ptr = w_value->mutable_data(*place); - - for (int64_t i = 0; i < w_value->numel(); ++i) { - ptr[i] = static_cast(i / 10); - } -} - -void StartServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - - std::unordered_map> - prefetch_var_name_to_prepared; - - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - - // distributed::HeartBeatMonitor::Init(1, true, "w@grad"); - - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - 
std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -void StartSendAndRecvServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - auto block = AppendSendAndRecvBlock(&program); - std::string in_var_name("x"); - std::vector prefetch_block_ids{block->ID()}; - auto prepared = exe.Prepare(program, prefetch_block_ids); - InitTensorsOnServer(&scope, &place, 10); - - std::unordered_map> - grad_to_prepared_ctx; - grad_to_prepared_ctx[in_var_name] = prepared[0]; - - g_req_handler->SetProgram(&program); - g_req_handler->SetGradToPreparedCtx(&grad_to_prepared_ctx); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(COMPLETE, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - g_req_handler.reset( - new distributed::RequestSendHandler(distributed::DistributedMode::kSync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2)); - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - std::thread server_thread(StartServer, distributed::kRequestSend); - g_rpc_service->WaitServerReady(); - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - client->AsyncSendComplete(ep); - client->Wait(); - - EXPECT_EQ(g_rpc_service->GetClientNum(), 1); - - g_rpc_service->ShutDown(); - server_thread.join(); - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} - -TEST(SENDANDRECV, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - g_req_handler.reset(new distributed::RequestSendAndRecvHandler( - distributed::DistributedMode::kAsync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - std::thread server_thread(StartSendAndRecvServer, - distributed::kRequestSendAndRecv); - g_rpc_service->WaitServerReady(); - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - framework::Scope scope; - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); - - // create var on local scope - int64_t rows_numel = 10; - InitTensorsOnClient(&scope, &place, rows_numel); - std::string in_var_name("x"); - std::string out_var_name("res"); - - client->AsyncSendAndRecv(ep, ctx, scope, in_var_name, out_var_name); - client->Wait(); - auto var = scope.Var(out_var_name); - auto value = var->GetMutable(); - auto ptr = value->mutable_data(place); - - for (int64_t i = 0; i < rows_numel; ++i) { - EXPECT_EQ(ptr[i], 0.5); - } - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} - -void StartCheckpointServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - 
framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - - std::vector metas; - - auto meta = distributed::SparseMeta(); - meta.name = "embedding.block0"; - meta.value_names = {"Param"}; - meta.value_dims = {64}; - meta.mode = distributed::Mode::training; - meta.grad_name = "embedding@Grad"; - meta.cached_varnames = {"kSparseIds"}; - meta.initializer_attrs = {"fill_constant&1.0"}; - meta.entry = "none"; - - metas.push_back(meta); - distributed::LargeScaleKV::Init(metas); - - auto* ins = distributed::LargeScaleKV::GetInstance(); - ins->Get("embedding.block0")->Init({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); - - std::unordered_map> - prefetch_var_name_to_prepared; - - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(LARGE_SCALE_CHECKPOINT, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - - paddle::framework::Scope scope; - paddle::platform::CPUPlace place; - - g_req_handler.reset(new distributed::RequestCheckpointHandler( - distributed::DistributedMode::kAsync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - - std::thread server_thread(StartCheckpointServer, - distributed::kRequestCheckpoint); - g_rpc_service->WaitServerReady(); - - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - auto save_path = - paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/base", - "embedding", "embedding.block0"); - int mode = 0; - client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); - client->Wait(); - - save_path = - paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/delta", - "embedding", "embedding.block0"); - mode = 1; - client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); - client->Wait(); - - paddle::framework::AttributeMap attrs; - - std::vector eps = {ep}; - attrs["endpoints"] = eps; - attrs["dirname"] = std::string("/tmp/large_scale_table/delta1"); - attrs["varname"] = std::string("embedding"); - attrs["mode"] = 2; - std::vector slices = {"embedding.block0"}; - attrs["slice_varnames"] = slices; - std::vector remotes = {"embedding.block0"}; - attrs["remote_varnames"] = remotes; - - auto ops = - framework::OpRegistry::CreateOp("checkpoint_notify", {}, {}, attrs, true); - ops->Run(scope, place); - - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in deleted file mode 100644 index a333642bd16fbf..00000000000000 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under -the Apache License, Version 2.0 (the "License"); you may not use this file -except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -syntax = "proto3"; -package sendrecv; - -option cc_generic_services = @cc_generic_services@; - -service SendRecvService { - // For parameter server round-robin like hashing, do not split tensors. - // Send and recv only one tensor - // TODO(typhoonzero): add streaming API - rpc SendVariable(VariableMessage) returns (VoidMessage) {} - // Argument VariableMessage for GetVariable should only contain varname. - rpc GetVariable(VariableMessage) returns (VariableMessage) {} - rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {} - // pre-fetch variable by given variable name and Ids - rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} - - rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} - rpc DistributeNotify(VariableMessage) returns (VoidMessage) {} - rpc SendAndRecvVariable(VariableMessage) returns (VariableMessage) {} - rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {} - rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} -} - -// It can be: LoDTensor、SelectedRows or NCCL_ID -enum VarType { - LOD_TENSOR = 0; - SELECTED_ROWS = 1; - NCCL_ID = 2; -} - -// VariableMessage is serialized paddle variable message. -// NOTICE(gongwb):don't modify this proto if you are not -// not familar with how we serialize in sendrecvop_utils.h -// and deserilize it in variable_response.h. -message VariableMessage { - enum Type { - // Pod Types - BOOL = 0; - INT16 = 1; - INT32 = 2; - INT64 = 3; - FP16 = 4; - FP32 = 5; - FP64 = 6; - } - - message LodData { repeated int64 lod_data = 1; } - string varname = 1; - // TODO(Yancey1989): reference framework::proto::VarDesc::VarType - VarType type = 2; - // bool persistable is not needed for sending. - // tensor info: - Type data_type = 3; - repeated int64 dims = 4; - - // lod details: - int64 lod_level = 5; - repeated LodData lod = 6; - // selected_rows height, aka. original dim0 - int64 slr_height = 7; - // tensor data - bytes serialized = 8; - // selected_rows data - bytes rows = 9; - // Look up table block execution output variable name. - string out_varname = 10; - // If 1, the ps server will start profiling, the ps - // server stops profiling and generates a profile to /tmp/profile_ps_* - // when profile switches from 1 to 2. - int64 profile = 11; - int64 trainer_id = 12; - string table_name = 13; -} - -message VoidMessage {} diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc deleted file mode 100644 index 107c74eb2670e4..00000000000000 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include
-
-#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
-
-namespace paddle {
-namespace framework {
-class Variable;
-}  // namespace framework
-namespace memory {
-namespace allocation {
-class Allocation;
-}  // namespace allocation
-}  // namespace memory
-}  // namespace paddle
-
-DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not.");
-DEFINE_int32(rpc_retry_bind_port, 3,
-             "Retry to bind the address if address is already used.");
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-using VarMsg = sendrecv::VariableMessage;
-
-static TensorPayload GetCommunicationAllocationFromTensor(
-    const platform::DeviceContext& ctx, const framework::Tensor& tensor) {
-  if (is_gpu_place(ctx.GetPlace())) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    PADDLE_ENFORCE_EQ(
-        is_gpu_place(tensor.place()), true,
-        platform::errors::PreconditionNotMet("Please run in gpu place."));
-    auto& gpu_dev_ctx =
-        reinterpret_cast<const platform::CUDADeviceContext&>(ctx);
-    auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
-    platform::CUDAPinnedPlace cuda_pinned;
-    auto result = memory::AllocShared(cuda_pinned, copy_size);
-
-    memory::Copy(cuda_pinned, result->ptr(),
-                 BOOST_GET_CONST(platform::CUDAPlace, tensor.place()),
-                 tensor.data<void>(), copy_size, gpu_dev_ctx.stream());
-    ctx.Wait();
-    return TensorPayload(result);
-#else
-    PADDLE_THROW(
-        platform::errors::Unavailable("This situation should not be happened"));
-#endif
-  } else {
-    return TensorPayload(tensor);
-  }
-}
-TensorPayload GetTensorPayload(framework::Variable* var,
-                               const platform::DeviceContext& ctx,
-                               VarMsg* request) {
-  auto tensor = var->Get<framework::LoDTensor>();
-  // FIXME(wuyi): data types in send_recv.proto is copied from
-  // framework.proto
-  request->set_data_type(static_cast<VarMsg::Type>(tensor.type()));
-  for (auto& dim : framework::vectorize(tensor.dims())) {
-    request->add_dims(dim);
-  }
-  const framework::LoD lod = tensor.lod();
-  if (lod.size() > 0) {
-    request->set_lod_level(lod.size());
-    for (auto& each : lod) {
-      VarMsg::LodData* lod_inner = request->add_lod();
-      for (auto& d : each) {
-        lod_inner->add_lod_data(d);
-      }
-    }
-  }
-  return GetCommunicationAllocationFromTensor(ctx, tensor);
-}
-
-TensorPayload GetSelectedRowsPayload(framework::Variable* var,
-                                     const platform::DeviceContext& ctx,
-                                     VarMsg* request) {
-  auto* slr = var->GetMutable<framework::SelectedRows>();
-  request->set_data_type(static_cast<VarMsg::Type>(slr->value().type()));
-  request->set_lod_level(0);
-  request->set_slr_height(slr->height());
-
-  for (auto& dim : framework::vectorize(slr->value().dims())) {
-    request->add_dims(dim);
-  }
-
-  auto* tensor = slr->mutable_value();
-  return GetCommunicationAllocationFromTensor(ctx, *tensor);
-}
-
-TensorPayload::TensorPayload(std::shared_ptr<memory::Allocation> allocation)
-    : allocation_(allocation), offset_(0), memory_size_(allocation->size()) {}
-TensorPayload::TensorPayload(const framework::Tensor& tensor)
-    : allocation_(tensor.Holder()),
-      offset_(tensor.offset()),
-      memory_size_(tensor.numel() * framework::SizeOfType(tensor.type())) {}
-void* TensorPayload::ptr() const {
-  return reinterpret_cast<void*>(
-      reinterpret_cast<uintptr_t>(allocation_->ptr()) + offset_);
-}
-size_t TensorPayload::memory_size() const { return memory_size_; }
-}  // namespace distributed
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h
deleted file mode 100644
index 84ed1ab0247124..00000000000000
--- a/paddle/fluid/operators/distributed/sendrecvop_utils.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include
-#include
-#include
-#include
-#include
-
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/distributed/distributed_pb.h"
-#include "paddle/fluid/platform/port.h"
-
-namespace paddle {
-namespace framework {
-class Tensor;
-class Variable;
-}  // namespace framework
-namespace memory {
-namespace allocation {
-class Allocation;
-}  // namespace allocation
-}  // namespace memory
-namespace platform {
-class DeviceContext;
-}  // namespace platform
-}  // namespace paddle
-
-namespace paddle {
-namespace operators {
-namespace distributed {
-
-using VarMsg = sendrecv::VariableMessage;
-
-class TensorPayload final {
- public:
-  explicit TensorPayload(const framework::Tensor& tensor);
-  explicit TensorPayload(std::shared_ptr<memory::Allocation> allocation);
-
-  TensorPayload(const TensorPayload& o) = default;
-  TensorPayload& operator=(const TensorPayload& o) = default;
-
-  void* ptr() const;
-  size_t memory_size() const;
-
- private:
-  std::shared_ptr<memory::Allocation> allocation_;
-  size_t offset_;
-  size_t memory_size_;
-};
-
-inline void SerializeDestroyCallback(void* payload) {
-  if (payload != nullptr) {
-    auto* shared_payload = reinterpret_cast<TensorPayload*>(payload);
-    delete shared_payload;
-  }
-}
-
-TensorPayload GetTensorPayload(framework::Variable* var,
-                               const platform::DeviceContext& ctx,
-                               VarMsg* request);
-
-TensorPayload GetSelectedRowsPayload(framework::Variable* var,
-                                     const platform::DeviceContext& ctx,
-                                     VarMsg* request);
-
-inline framework::proto::VarType::Type ToVarType(
-    sendrecv::VariableMessage::Type type) {
-  switch (type) {
-    case sendrecv::VariableMessage::FP32:
-      return framework::proto::VarType::FP32;  // NOLINT
-    case sendrecv::VariableMessage::FP64:
-      return framework::proto::VarType::FP64;  // NOLINT
-    case sendrecv::VariableMessage::INT32:
-      return framework::proto::VarType::INT32;  // NOLINT
-    case sendrecv::VariableMessage::INT64:
-      return framework::proto::VarType::INT64;  // NOLINT
-    case sendrecv::VariableMessage::BOOL:
-      return framework::proto::VarType::BOOL;  // NOLINT
-    default:
-      PADDLE_THROW(
-          platform::errors::InvalidArgument("Not support type id: %d.", type));
-  }
-}
-
-template