Skip to content

Commit

Permalink
Merge pull request #21 from qingshui/paddlebox
Browse files Browse the repository at this point in the history
fix paddle compile infer
  • Loading branch information
qingshui authored Jan 4, 2022
2 parents eecb4d4 + 075b552 commit 170de11
Show file tree
Hide file tree
Showing 10 changed files with 777 additions and 698 deletions.
8 changes: 7 additions & 1 deletion cmake/external/box_ps.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@ IF((NOT DEFINED BOX_PS_VER) OR (NOT DEFINED BOX_PS_URL))
SET(BOX_PS_VER "0.1.1" CACHE STRING "" FORCE)
SET(BOX_PS_NAME "box_ps" CACHE STRING "" FORCE)
#SET(BOX_PS_URL "http://box-ps.gz.bcebos.com/box_ps.tar.gz" CACHE STRING "" FORCE)
SET(BOX_PS_URL "data-im.baidu.com:/home/work/var/CI_DATA/im/static/box_ps.tar.gz/box_ps.tar.gz.30" CACHE STRING "" FORCE)
# Select the prebuilt box_ps archive matching the CUDA toolkit major version:
# builds are ABI-tied to the CUDA runtime, so a single URL cannot serve both.
IF(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
# CUDA < 11.0 (e.g. 10.2) build of box_ps
SET(BOX_PS_URL "data-im.baidu.com:/home/work/var/CI_DATA/im/static/box_ps.tar.gz/box_ps.tar.gz.32" CACHE STRING "" FORCE)
ELSE()
# CUDA >= 11.0 (e.g. 11.4) build of box_ps
SET(BOX_PS_URL "data-im.baidu.com:/home/work/var/CI_DATA/im/static/box_ps.tar.gz/box_ps.tar.gz.31" CACHE STRING "" FORCE)
ENDIF()
ENDIF()
MESSAGE(STATUS "BOX_PS_NAME: ${BOX_PS_NAME}, BOX_PS_URL: ${BOX_PS_URL}")
SET(BOX_PS_SOURCE_DIR "${THIRD_PARTY_PATH}/box_ps")
Expand Down
15 changes: 9 additions & 6 deletions paddle/fluid/framework/data_feed.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ limitations under the License. */
namespace paddle {
namespace framework {

// CUDA: number of threads launched per block.
const int CUDA_NUM_THREADS = 512;
// CUDA: blocks needed to cover N elements at CUDA_NUM_THREADS threads each
// (ceiling division, so any remainder gets one extra block).
inline int GET_BLOCKS(const int N) {
  const int rounded_up = N + CUDA_NUM_THREADS - 1;
  return rounded_up / CUDA_NUM_THREADS;
}

// Grid-stride loop: each thread starts at its global index and advances by
// the total thread count (blockDim.x * gridDim.x), so any grid size covers
// all n elements.
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
Expand Down Expand Up @@ -85,12 +92,7 @@ void MultiSlotInMemoryDataFeed::CopyForTensor(
cudaStreamSynchronize(stream);
}

// CUDA: use 512 threads per block
const int CUDA_NUM_THREADS = 512;
// CUDA: number of blocks for threads.
// Ceiling division so a partial final block still covers the tail elements.
inline int GET_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
#ifdef PADDLE_WITH_BOX_PS
// fill slot values
__global__ void FillSlotValueOffsetKernel(
const int ins_num, const int used_slot_num, size_t *slot_value_offsets,
Expand Down Expand Up @@ -269,6 +271,7 @@ void SlotPaddleBoxDataFeed::CopyRankOffset(int *dest, const int ins_num,
dest, ins_num, pv_num, max_rank, ranks, cmatchs, ad_offsets, cols);
cudaStreamSynchronize(stream);
}
#endif

} // namespace framework
} // namespace paddle
2 changes: 1 addition & 1 deletion paddle/fluid/framework/data_feed_factory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
REGISTER_DATAFEED_CLASS(MultiSlotDataFeed);
REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed);
REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed);
REGISTER_DATAFEED_CLASS(SlotPaddleBoxDataFeedWithGpuReplicaCache);
#ifdef PADDLE_WITH_BOX_PS
REGISTER_DATAFEED_CLASS(SlotPaddleBoxDataFeedWithGpuReplicaCache);
REGISTER_DATAFEED_CLASS(SlotPaddleBoxDataFeed);
REGISTER_DATAFEED_CLASS(InputTableDataFeed);
REGISTER_DATAFEED_CLASS(InputIndexDataFeed);
Expand Down
Loading

0 comments on commit 170de11

Please sign in to comment.