From 4d487c6f350b168f5e24094adeacb3c193d5d888 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 29 Nov 2016 07:40:17 +0000 Subject: [PATCH 1/4] Integrate warp-ctc as WarpCTCLayer, including unitest and layer interface. --- .gitmodules | 0 CMakeLists.txt | 5 + cmake/util.cmake | 5 + paddle/cuda/CMakeLists.txt | 34 ++- paddle/cuda/include/hl_dso_loader.h | 12 +- paddle/cuda/include/hl_gpu.h | 1 + paddle/cuda/include/hl_sequence.h | 33 +++ paddle/cuda/include/hl_warpctc_wrap.h | 94 +++++++ paddle/cuda/include/stub/hl_sequence_stub.h | 9 + paddle/cuda/src/hl_cuda_sequence.cu | 118 +++++++++ paddle/cuda/src/hl_cudart_wrap.cc | 1 + paddle/cuda/src/hl_dso_loader.cc | 25 +- paddle/cuda/src/hl_warpctc_wrap.cc | 157 +++++++++++ paddle/gserver/layers/WarpCTCLayer.cpp | 229 ++++++++++++++++ paddle/gserver/layers/WarpCTCLayer.h | 65 +++++ paddle/gserver/tests/CMakeLists.txt | 7 + paddle/gserver/tests/test_WarpCTCLayer.cpp | 247 ++++++++++++++++++ proto/ModelConfig.proto.m4 | 2 + python/paddle/trainer/config_parser.py | 21 ++ .../paddle/trainer_config_helpers/layers.py | 79 ++++++ .../protostr/test_cost_layers.protostr | 17 ++ .../tests/configs/test_cost_layers.py | 2 + 22 files changed, 1140 insertions(+), 23 deletions(-) create mode 100644 .gitmodules create mode 100644 paddle/cuda/include/hl_warpctc_wrap.h create mode 100644 paddle/cuda/src/hl_warpctc_wrap.cc create mode 100644 paddle/gserver/layers/WarpCTCLayer.cpp create mode 100644 paddle/gserver/layers/WarpCTCLayer.h create mode 100644 paddle/gserver/tests/test_WarpCTCLayer.cpp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/CMakeLists.txt b/CMakeLists.txt index af193c27ae7d8..e5e54cc8cfcfd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,6 +94,11 @@ endif() if(NOT WITH_GPU) add_definitions(-DPADDLE_ONLY_CPU) add_definitions(-DHPPL_STUB_FUNC) + + if(WITH_DSO) + add_definitions(-DPADDLE_USE_DSO) + endif(WITH_DSO) + list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) else() if(${CUDA_VERSION_MAJOR} GREATER 6) diff --git a/cmake/util.cmake b/cmake/util.cmake index a8282f07184c3..11641f6064b9d 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -148,6 +148,11 @@ function(link_paddle_exe TARGET_NAME) target_link_libraries(${TARGET_NAME} rt) endif() endif() + + if(NOT WITH_DSO) + target_link_libraries(${TARGET_NAME} + ${WARPCTC_LIBRARY}) + endif() endfunction() # link_paddle_test diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index 11dbfb54b2687..7e45d3d578982 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -15,20 +15,29 @@ else() endif() set(CUDA_CXX_WITH_GPU_SOURCES + src/hl_cudart_wrap.cc src/hl_cuda_cublas.cc src/hl_cuda_cudnn.cc - src/hl_cuda_device.cc) + src/hl_cuda_device.cc + ) -set_source_files_properties(${CUDA_CXX_WITH_GPU_SOURCES} - PROPERTIES COMPILE_FLAGS "-D__NVCC__") +if(WITH_GPU) + set(CUDA_CXX_SOURCES + src/hl_dso_loader.cc + src/hl_warpctc_wrap.cc + ${CUDA_CXX_WITH_GPU_SOURCES}) + + set_source_files_properties(${CUDA_CXX_SOURCES} + PROPERTIES COMPILE_FLAGS "-D__NVCC__") +else() + set(CUDA_CXX_SOURCES + src/hl_dso_loader.cc + src/hl_warpctc_wrap.cc) +endif() set_source_files_properties(${AVX_SOURCES} PROPERTIES COMPILE_FLAGS "-mavx") -set(CUDA_DSO_SOURCES - src/hl_dso_loader.cc - src/hl_cudart_wrap.cc) - set(CUDA_CU_SOURCES src/hl_perturbation_util.cu src/hl_cuda_aggregate.cu @@ -44,6 +53,7 @@ set(CUDA_CU_SOURCES set(CUDA_HEADERS include/hl_time.h include/hl_dso_loader.h + include/hl_warpctc_wrap.h 
include/hl_sequence.h include/hl_cuda_cublas.h include/hl_batch_transpose.h @@ -75,14 +85,14 @@ if(WITH_GPU) cuda_add_library(paddle_cuda ${CUDA_SOURCES} ${CUDA_CU_SOURCES} - ${CUDA_DSO_SOURCES} - ${CUDA_CXX_WITH_GPU_SOURCES}) + ${CUDA_CXX_SOURCES}) else() - add_library(paddle_cuda ${CUDA_SOURCES}) + add_library(paddle_cuda + ${CUDA_SOURCES} + ${CUDA_CXX_SOURCES}) endif() add_style_check_target(paddle_cuda ${CUDA_SOURCES} ${CUDA_HEADERS} - ${CUDA_DSO_SOURCES} - ${CUDA_CXX_WITH_GPU_SOURCES}) + ${CUDA_CXX_SOURCES}) diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h index 1eb9f9ca888d3..c52066e3d7ec4 100644 --- a/paddle/cuda/include/hl_dso_loader.h +++ b/paddle/cuda/include/hl_dso_loader.h @@ -18,10 +18,6 @@ limitations under the License. */ #include #include #include -#include -#include -#include -#include #include "hl_base.h" /** @@ -56,4 +52,12 @@ void GetCudartDsoHandle(void** dso_handle); */ void GetCurandDsoHandle(void** dso_handle); +/** + * @brief load the DSO of warp-ctc + * + * @param **dso_handle dso handler + * + */ +void GetWarpctcDsoHandle(void** dso_handle); + #endif // HL_DSO_LOADER_H_ diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h index 3be0df3b93b69..6dd6d1321270a 100644 --- a/paddle/cuda/include/hl_gpu.h +++ b/paddle/cuda/include/hl_gpu.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "hl_sparse.h" #include "hl_lstm.h" #include "hl_sequence.h" +#include "hl_warpctc_wrap.h" #ifdef HPPL_STUB_FUNC #include "stub/hl_cuda_stub.h" diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h index bb5124df44b49..b98d7bdeafe5d 100644 --- a/paddle/cuda/include/hl_sequence.h +++ b/paddle/cuda/include/hl_sequence.h @@ -172,6 +172,39 @@ extern void hl_sequence2batch_add(real* batch, int batchCount, bool seq2batch); +/** + * @brief Memory copy from sequence to batch, + * while padding all sequences to the same length. + * + * if seq2batch == true + * + * copy from sequence to batch: + * batch[i] = sequence[sequenceStartPositions[i]] + * + * if seq2batch == false + * + * copy from batch to sequence: + * sequence[sequenceStartPositions[i]] = batch[i] + * + * @param[in,out] batch batch matrix. + * @param[in,out] sequence sequence matrix. + * @param[in] sequenceStartPositions index vector. + * @param[in] sequenceWidth width of sequence. + * @param[in] maxSequenceLength maximum length of sequences. + * @param[in] numSequences number of sequences. + * @param[in] normByTimes whether dividing sequence's length. + * @param[in] seq2batch copy direction. + * + */ +extern void hl_sequence2batch_copy_padding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences, + bool normByTimes, + bool seq2batch); + /** * @brief dst = Op(src), src is sequence. * diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/cuda/include/hl_warpctc_wrap.h new file mode 100644 index 0000000000000..9d2379a024fe1 --- /dev/null +++ b/paddle/cuda/include/hl_warpctc_wrap.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef HL_WARPCTC_WRAP_H_ +#define HL_WARPCTC_WRAP_H_ + +#include "hl_base.h" +/// #include "hl_cuda.h" +#include "warp-ctc/include/ctc.h" + +typedef ctcStatus_t hl_warpctc_status_t; +typedef ctcOptions hl_warpctc_options_t; + +/** + * @brief Init ctc options. + * + * @param[in] blank blank label used in ctc loss function. + * @param[in] useGpu whether use gpu. + * @param[out] options handle to store cpu or gpu informations. + * + */ +extern void hl_warpctc_init(const size_t blank, + bool useGpu, + hl_warpctc_options_t* options); + +/** + * @brief Compute the connectionist temporal classification loss, + * and optionally compute the gradient with respect to the inputs. + * + * if batchGrad == nullptr + * + * only compute the ctc loss. + * + * if batchGrad != nullptr + * + * compute both ctc loss and gradient. + * + * @param[in] batchInput batch matrix of input probabilities, + * in maxSequenceLength x numSequence x numClasses + * (row-major) format. + * @param[out] batchGrad batch matrix of gradient. + * @param[in] cpuLabels labels always in CPU memory. + * @param[in] cpuLabelLengths length of all labels in CPU memory. + * @param[in] cpuInputLengths length of all sequences in CPU memory. + * @param[in] numClasses number of possible output symbols. + * @param[in] numSequences number of sequence. + * @param[out] cpuCosts cost of each sequence in CPU memory. + * @param[out] workspace workspace to store some temporary results. + * @param[in] options handle to store cpu or gpu informations. + * + */ +extern void hl_warpctc_compute_loss(const real* batchInput, + real* batchGrad, + const int* cpuLabels, + const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + real* cpuCosts, + void* workspace, + hl_warpctc_options_t* options); + +/** + * @brief Compute the required workspace size. + * There is no memory allocated operations within warp-ctc. + * + * @param[in] cpuLabelLengths length of all labels in CPU memory. + * @param[in] cpuInputLengths length of all sequences in CPU memory. + * @param[in] numClasses number of possible output symbols. + * @param[in] numSequences number of sequence. + * @param[in] options handle to store cpu or gpu informations. + * @param[out] bytes pointer to a scalar where the memory + * requirement in bytes will be placed. 
+ * + */ +extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + hl_warpctc_options_t* options, + size_t* bytes); + +#endif // HL_WARPCTC_WRAP_H_ diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h index 381f0a6f26c56..3343463a8d5fa 100644 --- a/paddle/cuda/include/stub/hl_sequence_stub.h +++ b/paddle/cuda/include/stub/hl_sequence_stub.h @@ -70,6 +70,15 @@ inline void hl_sequence2batch_add(real* batch, int batchCount, bool seq2batch) {} +inline void hl_sequence2batch_copy_padding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences, + bool normByTimes, + bool seq2batch) {} + inline void hl_sequence_avg_forward(real* dst, real* src, const int* starts, diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index 63824eaa4c201..0f1d72043935e 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -447,6 +447,124 @@ void hl_sequence2batch_add(real *batch, CHECK_SYNC("hl_sequence2batch_add failed"); } +template +__global__ +void KeSequence2BatchPadding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences) { + int batchIdx = blockIdx.y; + int sequenceStart = sequenceStartPositions[batchIdx]; + int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart; + + int sequenceIdx = blockIdx.x * blockDim.y + threadIdx.y; + int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth; + int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth; + + if (sequenceIdx < sequenceLength) { + if (seq2batch) { + /* sequence -> batch */ + if (normByTimes) { + real scale = 1.0f / (real)sequenceLength; + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i]; + } + } else { + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = sequence[sequenceBaseIdx + i]; + } + } + } else { + /* batch -> sequence */ + if (normByTimes) { + real scale = 1.0f / (real)sequenceLength; + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i]; + } + } else { + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + sequence[sequenceBaseIdx + i] = batch[batchBaseIdx + i]; + } + } + } + } else if (sequenceIdx < maxSequenceLength) { + if (seq2batch) { + /* sequence -> batch */ + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = 0; + } + } + } +} + +void hl_sequence2batch_copy_padding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences, + bool normByTimes, + bool seq2batch) { + CHECK_NOTNULL(batch); + CHECK_NOTNULL(sequence); + CHECK_NOTNULL(sequenceStartPositions); + + if (!normByTimes && numSequences == 1) { + size_t elementCount = maxSequenceLength * sequenceWidth; + if (seq2batch) { + /* sequence -> batch */ + hl_memcpy_device2device(batch, sequence, sizeof(real) * elementCount); + } else { + /* batch -> sequence */ + hl_memcpy_device2device(sequence, batch, sizeof(real) * elementCount); + } + return; + } + + const int 
CUDA_BLOCK_SIZE = 512; + + /* At least use 32 threads to copy sequenceWidth elements, + and at least 8 elements for each thread. */ + int blockDimX = ((((sequenceWidth + 7) >> 3) + 31) >> 5) << 5; + blockDimX = (blockDimX < CUDA_BLOCK_SIZE) ? blockDimX : CUDA_BLOCK_SIZE; + + int blockDimY = CUDA_BLOCK_SIZE / blockDimX; + dim3 threads(blockDimX, blockDimY); + + int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) / + CUDA_BLOCK_SIZE; + int gridDimY = numSequences; + dim3 grid(gridDimX, gridDimY); + + if (seq2batch) { + /* sequence -> batch */ + if (normByTimes) { + KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } else { + KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } + } else { + /* batch -> sequence */ + if (normByTimes) { + KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } else { + KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } + } + + CHECK_SYNC("hl_sequence2batch_copy_padding failed"); +} + __device__ inline float my_rsqrt(float x) { return rsqrtf(x); } diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc index ff6b830b7addc..a95f5557afb49 100644 --- a/paddle/cuda/src/hl_cudart_wrap.cc +++ b/paddle/cuda/src/hl_cudart_wrap.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_USE_DSO #include +#include #include "hl_dso_loader.h" /** diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc index 1a3ce08619fc3..a6ea2a3b9f4b5 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -30,6 +30,8 @@ P_DEFINE_string(cuda_dir, "build-in function in cudart already ran before main entry). " "If default, dlopen will search cuda from LD_LIBRARY_PATH"); +P_DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -92,27 +94,28 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, *dso_handle = dlopen(dlPath.c_str(), dynload_flags); // if not found, search from default path if (nullptr == *dso_handle) { - LOG(WARNING) << "Failed to find cuda library: " << dlPath; + LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" + << dlerror() << ")"; dlPath = dso_name; GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); } } - CHECK(nullptr != *dso_handle) << "Failed to find cuda library: " << dlPath - << std::endl + CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath + << " (" << dlerror() << ") \n" << "Please specify its path correctly using " - "one of the following ways: \n" // NOLINT + "one of the following ways: \n" << "Method 1. set cuda and cudnn lib path at " "runtime. " << "http://www.paddlepaddle.org/doc/ui/" "cmd_argument/" - "argument_outline.html \n" // NOLINT + "argument_outline.html \n" << "For instance, issue command: paddle train " "--use_gpu=1 " << "--cuda_dir=/usr/local/cuda/lib64 " "--cudnn_dir=/usr/local/cudnn/lib " - "...\n" // NOLINT + "...\n" << "Method 2. 
set environment variable " "LD_LIBRARY_PATH on Linux or " @@ -124,7 +127,7 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, "DYLD_LIBRARY_PATH is impossible " << "unless System Integrity Protection (SIP) " "is disabled. However, " - "method 1 " // NOLINT + "method 1 " << "always work well."; } @@ -159,3 +162,11 @@ void GetCurandDsoHandle(void** dso_handle) { GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); #endif } + +void GetWarpctcDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle); +#endif +} diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc new file mode 100644 index 0000000000000..99db0f242df74 --- /dev/null +++ b/paddle/cuda/src/hl_warpctc_wrap.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "hl_warpctc_wrap.h" +#include "hl_dso_loader.h" +#include "paddle/utils/Logging.h" + +namespace dynload { + +std::once_flag warpctc_dso_flag; +void* warpctc_dso_handle = nullptr; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load warpctc routine + * via operator overloading. When PADDLE_USE_DSO is + * false, you need to add the path of libwarp-ctc.so to + * the linked-libs of paddle or to LD_PRELOAD. + */ +#ifdef PADDLE_USE_DSO +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name, __type) \ + struct DynLoad__##__name { \ + template \ + __type operator()(Args... args) { \ + typedef __type (*warpctcFunc)(Args...); \ + std::call_once( \ + warpctc_dso_flag, GetWarpctcDsoHandle, &warpctc_dso_handle); \ + void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + } __name; // struct DynLoad__##__name +#else +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name, __type) \ + struct DynLoad__##__name { \ + template \ + __type operator()(Args... args) { \ + return __name(args...); \ + } \ + } __name; // struct DynLoad__##__name +#endif + +// include all needed warp-ctc functions +DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version, int) +DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString, const char*) +DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss, hl_warpctc_status_t) +DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size, hl_warpctc_status_t) + +#undef DYNAMIC_LOAD_WARPCTC_WRAP + +} /* namespace dynload */ + +#define WARPCTC_GET_VERSION dynload::get_warpctc_version +#define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString + +#ifndef PADDLE_TYPE_DOUBLE +#define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss +#define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size +#else +#define WARPCTC_LOG_FATAL \ + LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion \ + << "] Error: not support double precision." +#define WARPCTC_COMPUTE_LOSS(...) 
WARPCTC_LOG_FATAL(__VA_ARGS__) +#define WARPCTC_GET_WORKSPACE_SIZE(...) WARPCTC_LOG_FATAL(__VA_ARGS__) +#endif + +/** + * Check build-in warp-ctc function using glog and it also + * support << operator for more details error info. + */ +static int g_warpctcVersion = -1; +#define CHECK_WARPCTC(warpctcStat) \ + CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat) \ + << "warp-ctc [version " << g_warpctcVersion \ + << "] Error: " << WARPCTC_GET_STATUS_STRING(warpctcStat) << " " + +void hl_warpctc_init(const size_t blank, + bool useGpu, + hl_warpctc_options_t* options) { + CHECK_NOTNULL(options); + + g_warpctcVersion = WARPCTC_GET_VERSION(); + + if (useGpu) { +#ifdef __NVCC__ + options->loc = CTC_GPU; + options->stream = STREAM_DEFAULT; +#else + LOG(FATAL) << "[warpctc init] GPU is not enabled."; +#endif + } else { + options->loc = CTC_CPU; + options->num_threads = 1; + } + + options->blank_label = blank; +} + +void hl_warpctc_compute_loss(const real* batchInput, + real* batchGrad, + const int* cpuLabels, + const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + real* cpuCosts, + void* workspace, + hl_warpctc_options_t* options) { + CHECK_NOTNULL(batchInput); + CHECK_NOTNULL(cpuLabels); + CHECK_NOTNULL(cpuLabelLengths); + CHECK_NOTNULL(cpuInputLengths); + CHECK_NOTNULL(cpuCosts); + CHECK_NOTNULL(workspace); + CHECK_NOTNULL(options); + + CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(batchInput, + batchGrad, + cpuLabels, + cpuLabelLengths, + cpuInputLengths, + numClasses, + numSequences, + cpuCosts, + workspace, + *options)); +} + +void hl_warpctc_get_workspace_size(const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + hl_warpctc_options_t* options, + size_t* bytes) { + CHECK_NOTNULL(cpuLabelLengths); + CHECK_NOTNULL(cpuInputLengths); + CHECK_NOTNULL(options); + CHECK_NOTNULL(bytes); + + CHECK_WARPCTC(WARPCTC_GET_WORKSPACE_SIZE(cpuLabelLengths, + cpuInputLengths, + numClasses, + numSequences, + *options, + bytes)); +} diff --git a/paddle/gserver/layers/WarpCTCLayer.cpp b/paddle/gserver/layers/WarpCTCLayer.cpp new file mode 100644 index 0000000000000..b99e9b9c7a620 --- /dev/null +++ b/paddle/gserver/layers/WarpCTCLayer.cpp @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "WarpCTCLayer.h" + +namespace paddle { + +REGISTER_LAYER(warp_ctc, WarpCTCLayer); + +bool WarpCTCLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parament class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2UL); + + /* The inputLayers_[0] must be sequence output without softmax */ + numClasses_ = config_.size(); + CHECK_GE(numClasses_, 2UL); + CHECK_EQ(numClasses_, inputLayers_[0]->getSize()); + + blank_ = config_.blank(); + CHECK_GE(blank_, 0UL); + CHECK_LT(blank_, numClasses_); + + normByTimes_ = config_.norm_by_times(); + + // We don't need sequenceStartPositions because each sample of output_ is + // for the cost of one sequence. + setNeedSequenceInfo(false); + + return true; +} + +void WarpCTCLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& output = getInput(0); + const Argument& labels = getInput(1); + + CHECK(output.sequenceStartPositions); + CHECK(labels.sequenceStartPositions); + CHECK(labels.ids); + + size_t numSequences = labels.sequenceStartPositions->getSize() - 1; + CHECK_EQ(numSequences, output.sequenceStartPositions->getSize() - 1); + + resizeOutput(numSequences, 1); + + const int* cpuLabelStartPositions = + labels.sequenceStartPositions->getData(false); + const int* cpuOutputStartPositions = + output.sequenceStartPositions->getData(false); + + std::vector cpuLabelLengths(numSequences); + std::vector cpuOutputLengths(numSequences); + for (size_t i = 0; i < numSequences; i++) { + cpuLabelLengths[i] = + cpuLabelStartPositions[i + 1] - cpuLabelStartPositions[i]; + cpuOutputLengths[i] = + cpuOutputStartPositions[i + 1] - cpuOutputStartPositions[i]; + } + + /* Get the maximum sequence length */ + maxSequenceLength_ = 0; + maxSequenceLength_ = *std::max_element( + cpuOutputLengths.data(), cpuOutputLengths.data() + numSequences); + + Matrix::resizeOrCreate(batchValue_, + /* height */ numSequences * maxSequenceLength_, + /* width */ numClasses_, + /* trans */ false, + /* useGpu */ useGpu_); + + Matrix::resizeOrCreate(batchGrad_, + /* height */ numSequences * maxSequenceLength_, + /* width */ numClasses_, + /* trans */ false, + /* useGpu */ useGpu_); + batchGrad_->zeroMem(); + + seq2batchPadding(output.value, batchValue_, output.sequenceStartPositions); + + /* labels always in CPU memory */ + IVector::resizeOrCreate(cpuLabels_, + /* size */ (labels.ids)->getSize(), + /* useGpu */ false); + cpuLabels_->copyFrom(*(labels.ids)); + + /* labels always in CPU memory */ + Matrix::resizeOrCreate(cpuCosts_, + /* width */ numSequences, + /* height */ 1, + /* trans */ false, + /* useGpu */ false); + + /* Init warp-ctc options */ + hl_warpctc_options_t options; + hl_warpctc_init(blank_, useGpu_, &options); + + /* Get the needed workspace size */ + size_t workspaceBytes = 0; + hl_warpctc_get_workspace_size(cpuLabelLengths.data(), + cpuOutputLengths.data(), + numClasses_, + numSequences, + &options, + &workspaceBytes); + CHECK_GT(workspaceBytes, 0UL); + + size_t workspaceLength = workspaceBytes / sizeof(real) + 1; + Vector::resizeOrCreate(workspace_, + /* size */ workspaceLength, + /* useGpu */ useGpu_); + + hl_warpctc_compute_loss(batchValue_->getData(), + batchGrad_->getData(), + cpuLabels_->getData(), + cpuLabelLengths.data(), + cpuOutputLengths.data(), + numClasses_, + numSequences, + cpuCosts_->getData(), + workspace_->getData(), + &options); + + /* Copy the costs */ + output_.value->copyFrom(*cpuCosts_); +} + +void WarpCTCLayer::backward(const UpdateCallback& 
callback) { + (void)callback; + + const Argument& output = getInput(0); + CHECK(batchGrad_); + + batch2seqPadding( + output.grad, batchGrad_, output.sequenceStartPositions, normByTimes_); +} + +void WarpCTCLayer::seq2batchPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions) { + size_t numSequences = seqStartPositions->getSize() - 1; + const int* seqStartPositionsData = seqStartPositions->getData(useGpu_); + + real* seqData = seqValue->getData(); + real* batchData = batchValue->getData(); + if (useGpu_) { + hl_sequence2batch_copy_padding(batchData, + seqData, + seqStartPositionsData, + numClasses_, + maxSequenceLength_, + numSequences, + false, + true); + } else { + for (size_t i = 0; i < maxSequenceLength_; i++) { + for (size_t j = 0; j < numSequences; j++) { + size_t sequenceStart = seqStartPositionsData[j]; + size_t sequenceLength = + seqStartPositionsData[j + 1] - seqStartPositionsData[j]; + if (i < sequenceLength) { + memcpy(batchData + (i * numSequences + j) * numClasses_, + seqData + (sequenceStart + i) * numClasses_, + numClasses_ * sizeof(real)); + } else { + memset(batchData + (i * numSequences + j) * numClasses_, + 0, + numClasses_ * sizeof(real)); + } + } + } + } +} + +void WarpCTCLayer::batch2seqPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions, + bool normByTimes) { + size_t numSequences = seqStartPositions->getSize() - 1; + const int* seqStartPositionsData = seqStartPositions->getData(useGpu_); + + real* seqData = seqValue->getData(); + real* batchData = batchValue->getData(); + if (useGpu_) { + hl_sequence2batch_copy_padding(batchData, + seqData, + seqStartPositionsData, + numClasses_, + maxSequenceLength_, + numSequences, + normByTimes, + false); + } else { + for (size_t i = 0; i < numSequences; i++) { + int sequenceStart = seqStartPositionsData[i]; + int sequenceLength = + seqStartPositionsData[i + 1] - seqStartPositionsData[i]; + for (int j = 0; j < sequenceLength; j++) { + if (normByTimes) { + for (size_t k = 0; k < numClasses_; k++) { + seqData[(sequenceStart + j) * numClasses_ + k] = + batchData[(j * numSequences + i) * numClasses_ + k] / + sequenceLength; + } + } else { + memcpy(seqData + (sequenceStart + j) * numClasses_, + batchData + (j * numSequences + i) * numClasses_, + numClasses_ * sizeof(real)); + } + } + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/WarpCTCLayer.h b/paddle/gserver/layers/WarpCTCLayer.h new file mode 100644 index 0000000000000..1b0f5ba267ae5 --- /dev/null +++ b/paddle/gserver/layers/WarpCTCLayer.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" + +namespace paddle { + +/** + * @brief A layer integrating the open-source warp-ctc library + * to compute connectionist + * temporal classification cost. + * + * The config file api is warp_ctc_layer. 
+ */ +class WarpCTCLayer : public Layer { +public: + explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {} + ~WarpCTCLayer() {} + + virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + virtual void forward(PassType passType); + virtual void backward(const UpdateCallback& callback); + +protected: + /** + * sequence matrix and batch matrix copy: + * sequence (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * batch (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0) + */ + void seq2batchPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions); + void batch2seqPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions, + bool normByTimes); + +protected: + size_t numClasses_; + size_t blank_; + size_t maxSequenceLength_; + bool normByTimes_; + + MatrixPtr batchValue_; + MatrixPtr batchGrad_; + VectorPtr workspace_; + + IVectorPtr cpuLabels_; + MatrixPtr cpuCosts_; +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 0651d0b4733ea..5427dc062d566 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -62,6 +62,13 @@ add_unittest(test_RecurrentLayer test_RecurrentLayer.cpp TestUtil.cpp) +############### test_WarpCTCLayer ####################### +if(NOT WITH_DOUBLE) + add_unittest(test_WarpCTCLayer + test_WarpCTCLayer.cpp + TestUtil.cpp) +endif() + ############### test_RecurrentGradientMachine ############### # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine # I will fix it. diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp new file mode 100644 index 0000000000000..5289c9892ceb2 --- /dev/null +++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp @@ -0,0 +1,247 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/gserver/layers/Layer.h" +#include "paddle/gserver/layers/DataLayer.h" +#include "paddle/gserver/layers/CTCLayer.h" +#include "paddle/gserver/layers/WarpCTCLayer.h" +#include "ModelConfig.pb.h" + +#include "TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +P_DECLARE_bool(use_gpu); + +const real* getData(const Matrix& matrix) { + if (matrix.useGpu()) { + MatrixPtr cpuMatrix = Matrix::create( + matrix.getWidth(), matrix.getHeight(), matrix.isTransposed(), false); + cpuMatrix->copyFrom(matrix); + return cpuMatrix->getData(); + } else { + return matrix.getData(); + } +} + +void checkError(const Matrix& matrix1, const Matrix& matrix2) { + CHECK_EQ(matrix1.getHeight(), matrix2.getHeight()); + CHECK_EQ(matrix1.getWidth(), matrix2.getWidth()); + CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + + const real* data1 = getData(matrix1); + const real* data2 = getData(matrix2); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + if (fabs(data1[i * width + j] - data2[i * width + j]) > err) { + count++; + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +void initArgument(size_t batchSize, + int layerSize, + bool useGpu, + Argument& data) { + data.value = Matrix::create(batchSize, layerSize, false, useGpu); + data.grad = Matrix::create(batchSize, layerSize, false, useGpu); + data.value->randomizeUniform(); + data.value->add(-0.5); + /// data.value->sigmoid(*data.value); + data.grad->zeroMem(); + + generateSequenceStartPositions(batchSize, data.sequenceStartPositions); +} + +LayerPtr createDataLayer( + string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) { + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("data"); + layerConfig.set_size(layerSize); + LayerPtr layer = LayerPtr(new DataLayer(layerConfig)); + + DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); + dataLayer->setData(data); + dataLayer->forward(PASS_GC); + + /// std::cout << "dataLayer: " << std::endl; + /// (dataLayer->getOutput().value)->print(std::cout); + + return layer; +} + +LayerPtr createLabelLayer(string name, + size_t batchSize, + size_t numClasses, + bool useGpu) { + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("data"); + layerConfig.set_size(1); + LayerPtr layer = LayerPtr(new DataLayer(layerConfig)); + + Argument data; + data.ids = IVector::create(batchSize, useGpu); + data.ids->rand(numClasses - 1); + + generateSequenceStartPositions(batchSize, data.sequenceStartPositions); + + DataLayerPtr labelLayer = std::dynamic_pointer_cast(layer); + labelLayer->setData(data); + labelLayer->forward(PASS_GC); + + return layer; +} + +LayerPtr createCTCLayer(string name, + size_t numClasses, + bool useGpu, + bool normByTimes, + LayerPtr dataLayer, + LayerPtr labelLayer) { + LayerMap layerMap; + layerMap[dataLayer->getName()] = dataLayer; + layerMap[labelLayer->getName()] = labelLayer; + + ParameterMap parameterMap; + + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("ctc"); + layerConfig.set_size(numClasses); + layerConfig.set_norm_by_times(normByTimes); + + layerConfig.add_inputs(); + LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); + input0.set_input_layer_name(dataLayer->getName()); + + 
layerConfig.add_inputs(); + LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); + input1.set_input_layer_name(labelLayer->getName()); + + LayerPtr layer = LayerPtr(new CTCLayer(layerConfig)); + layerMap[layer->getName()] = layer; + layer->init(layerMap, parameterMap); + + ActivationFunction* softmaxActivation = ActivationFunction::create("softmax"); + + softmaxActivation->forward(dataLayer->getOutput()); + layer->forward(PASS_GC); + + layer->backward(); + softmaxActivation->backward(dataLayer->getOutput()); + + return layer; +} + +LayerPtr createWarpCTCLayer(string name, + size_t numClasses, + bool useGpu, + bool normByTimes, + LayerPtr dataLayer, + LayerPtr labelLayer) { + LayerMap layerMap; + layerMap[dataLayer->getName()] = dataLayer; + layerMap[labelLayer->getName()] = labelLayer; + + ParameterMap parameterMap; + + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("warp_ctc"); + layerConfig.set_size(numClasses); + layerConfig.set_blank(numClasses - 1); + layerConfig.set_norm_by_times(normByTimes); + + layerConfig.add_inputs(); + LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); + input0.set_input_layer_name(dataLayer->getName()); + + layerConfig.add_inputs(); + LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); + input1.set_input_layer_name(labelLayer->getName()); + + LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig)); + layerMap[layer->getName()] = layer; + layer->init(layerMap, parameterMap); + + layer->forward(PASS_GC); + layer->backward(); + + return layer; +} + +TEST(Layer, WarpCTCLayer) { + for (auto layerSize : {10, 64, 128}) { + for (auto batchSize : {1, 10, 20, 64}) { + for (auto useGpu : {false, true}) { +#ifdef PADDLE_ONLY_CPU + if (useGpu) continue; +#endif + LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize + << " useGpu=" << useGpu; + + FLAGS_use_gpu = useGpu; + + Argument data0; + initArgument(batchSize, layerSize, useGpu, data0); + + Argument data1; + data1.resizeAndCopyFrom(data0); + + LayerPtr dataLayer0 = + createDataLayer("data", batchSize, layerSize, useGpu, data0); + LayerPtr dataLayer1 = + createDataLayer("data", batchSize, layerSize, useGpu, data1); + + LayerPtr labelLayer = + createLabelLayer("label", batchSize, layerSize, useGpu); + + LayerPtr warpctcLayer = createWarpCTCLayer( + "cost", layerSize, useGpu, false, dataLayer0, labelLayer); + LayerPtr ctcLayer = createCTCLayer( + "cost", layerSize, useGpu, false, dataLayer1, labelLayer); + + /// Check loss + checkError(*(warpctcLayer->getOutput().value), + *(ctcLayer->getOutput().value)); + + /// Check gradients + checkError(*(dataLayer0->getOutput().grad), + *(dataLayer1->getOutput().grad)); + } + } + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index 68a5eb9dd2231..08108a46661b5 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -414,6 +414,8 @@ sinclude(`ModelConfigLayer.proto.m4') // to string and reinterpreted in the user's own layer implementation. 
optional string user_arg = 49; + // For WarpCTCLayer + optional uint32 blank = 50 [default = 0]; } message EvaluatorConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 9db42bf172a77..e987ad17d6543 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2993,6 +2993,27 @@ def __init__(self, name, size, inputs, norm_by_times=False, device=None): config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs') +@config_layer('warp_ctc') +class WarpCTCLayer(LayerBase): + def __init__(self, + name, + size, + inputs, + blank=0, + norm_by_times=False, + device=None): + super(WarpCTCLayer, self).__init__( + name, 'warp_ctc', size=size, inputs=inputs, device=device) + self.config.blank = blank + self.config.norm_by_times = norm_by_times + config_assert(len(self.inputs) == 2, 'WarpCTCLayer must have 2 inputs') + input_layer = self.get_input_layer(0) + config_assert( + (input_layer.active_type == '' or + input_layer.active_type == 'linear'), + "Expecting the active_type of input layer to be linear or null") + + @config_layer('recurrent_layer_group') class RecurrentLayerGroup(LayerBase): def __init__(self, name, device=None): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 9a45a5158984a..888d48722ab91 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -91,6 +91,7 @@ 'linear_comb_layer', 'convex_comb_layer', 'ctc_layer', + 'warp_ctc_layer', 'crf_layer', 'crf_decoding_layer', 'nce_layer', @@ -169,6 +170,7 @@ class LayerType(object): PRINT_LAYER = "print" CTC_LAYER = "ctc" + WARP_CTC_LAYER = "warp_ctc" CRF_LAYER = "crf" CRF_DECODING_LAYER = "crf_decoding" NCE_LAYER = 'nce' @@ -4085,6 +4087,83 @@ def ctc_layer(input, return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size) +@wrap_name_default() +@layer_support() +def warp_ctc_layer(input, + label, + size=None, + name=None, + blank=0, + norm_by_times=False, + layer_attr=None): + """ + A layer intergrating the open-source `warp-ctc + ` library, which is used in + `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin + `, to compute Connectionist Temporal + Classification (CTC) loss. + + More details of CTC can be found by referring to `Connectionist Temporal + Classification: Labelling Unsegmented Sequence Data with Recurrent + Neural Networks `_ + + Note: + - Let num_classes represent the category number. Considering the 'blank' + label needed by CTC, you need to use (num_classes + 1) as the input size. + Thus, the size of both warp_ctc_layer and 'input' layer should be set to + num_classes + 1. + - You can set 'blank' to [0, num_classes - 1], which should be consistent + as that used in your labels. + - As a native 'softmax' activation is interated to the warp-ctc library, + 'linear' activation is expected instead in the 'input' layer. + + The simple usage: + + .. code-block:: python + + ctc = warp_ctc_layer(input=input, + label=label, + size=1001, + blank=1000, + norm_by_times=False) + + :param input: The input layer. + :type input: LayerOutput + :param label: The data layer of label with variable length. + :type label: LayerOutput + :param size: category numbers + 1. + :type size: int + :param name: The name of this layer, which can not specify. 
+ :type name: basestring|None + :param blank: the 'blank' label used in ctc + :type blank: int + :param norm_by_times: Whether to normalization by times. False by default. + :type norm_by_times: bool + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None + :return: LayerOutput object. + :rtype: LayerOutput + """ + assert isinstance(input, LayerOutput) + assert isinstance(label, LayerOutput) + if label.size is not None: + if size is not None: + assert size == label.size + 1 + else: + size = label.size + 1 + Layer( + name=name, + type=LayerType.WARP_CTC_LAYER, + size=size, + blank=blank, + norm_by_times=norm_by_times, + inputs=[input.name, label.name], + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.WARP_CTC_LAYER, parents=[input, label], size=size) + + @wrap_name_default() @wrap_param_attr_default() @layer_support() diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr index f6045fe1f6825..10e59e21bc7a4 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr @@ -47,6 +47,20 @@ layers { } norm_by_times: false } +layers { + name: "__warp_ctc_layer_0__" + type: "warp_ctc" + size: 5001 + active_type: "" + inputs { + input_layer_name: "input" + } + inputs { + input_layer_name: "labels" + } + norm_by_times: false + blank: 0 +} layers { name: "crf_label" type: "data" @@ -244,6 +258,7 @@ input_layer_names: "xe-label" input_layer_names: "huber_probs" input_layer_names: "huber_label" output_layer_names: "__ctc_layer_0__" +output_layer_names: "__warp_ctc_layer_0__" output_layer_names: "__crf_layer_0__" output_layer_names: "__rank_cost_0__" output_layer_names: "__lambda_cost_0__" @@ -260,6 +275,7 @@ sub_models { layer_names: "xe-label" layer_names: "__fc_layer_0__" layer_names: "__ctc_layer_0__" + layer_names: "__warp_ctc_layer_0__" layer_names: "crf_label" layer_names: "__crf_layer_0__" layer_names: "left" @@ -289,6 +305,7 @@ sub_models { input_layer_names: "huber_probs" input_layer_names: "huber_label" output_layer_names: "__ctc_layer_0__" + output_layer_names: "__warp_ctc_layer_0__" output_layer_names: "__crf_layer_0__" output_layer_names: "__rank_cost_0__" output_layer_names: "__lambda_cost_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py index fd979a1e9f433..18ff6b48c495b 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py @@ -12,6 +12,8 @@ outputs( ctc_layer( input=seq_in, label=labels), + warp_ctc_layer( + input=seq_in, label=labels, blank=0), crf_layer( input=hidden, label=data_layer( name='crf_label', size=4)), From a816443e118f3abc69d0a474e01135d06d5f638e Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Wed, 30 Nov 2016 05:32:16 +0000 Subject: [PATCH 2/4] Add submodule warp-ctc. 
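For reference, the padded-batch layout produced by hl_sequence2batch_copy_padding
(and by WarpCTCLayer::seq2batchPadding on CPU) can be sketched in plain Python/NumPy,
with the norm-by-times branch folded into a single scale factor, as this patch now
does in the kernel. The names below are illustrative only and not part of any Paddle
API; the batch-to-sequence direction is analogous.

import numpy as np

def seq2batch_padding(seq, starts, max_len, norm_by_times=False):
    """seq: (totalTimesteps, numClasses) array; starts: per-sequence start offsets."""
    num_seqs = len(starts) - 1
    num_classes = seq.shape[1]
    batch = np.zeros((max_len * num_seqs, num_classes), dtype=seq.dtype)
    for j in range(num_seqs):
        length = starts[j + 1] - starts[j]
        scale = 1.0 / length if norm_by_times else 1.0
        for i in range(length):
            # row layout matches the CUDA kernel: timestep * numSequences + sequenceIdx
            batch[i * num_seqs + j] = scale * seq[starts[j] + i]
    return batch

starts = [0, 4, 6, 9, 10]        # four sequences of lengths 4, 2, 3, 1
seq = np.random.rand(10, 5)      # 10 timesteps in total, numClasses = 5
batch = seq2batch_padding(seq, starts, max_len=4)
print(batch.shape)               # (16, 5): maxSequenceLength * numSequences rows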
--- .gitmodules | 3 ++ .pre-commit-config.yaml | 2 + paddle/cuda/include/hl_dso_loader.h | 2 +- paddle/cuda/include/hl_warpctc_wrap.h | 1 - paddle/cuda/src/hl_cuda_sequence.cu | 24 +++------- paddle/cuda/src/hl_dso_loader.cc | 2 +- paddle/cuda/src/hl_warpctc_wrap.cc | 28 +++++------ paddle/gserver/layers/WarpCTCLayer.cpp | 18 +++----- paddle/gserver/tests/test_WarpCTCLayer.cpp | 54 +++++++++++----------- warp-ctc | 1 + 10 files changed, 62 insertions(+), 73 deletions(-) create mode 160000 warp-ctc diff --git a/.gitmodules b/.gitmodules index e69de29bb2d1d..f635e65784af4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "warp-ctc"] + path = warp-ctc + url = https://github.com/baidu-research/warp-ctc.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 90c25e435083d..942669c41ff15 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,6 +2,7 @@ sha: c25201a00e6b0514370501050cf2a8538ac12270 hooks: - id: remove-crlf + files: (?!.*warp-ctc)^.*$ - repo: https://github.com/reyoung/mirrors-yapf.git sha: v0.13.2 hooks: @@ -13,6 +14,7 @@ - id: check-merge-conflict - id: check-symlinks - id: detect-private-key + files: (?!.*warp-ctc)^.*$ - id: end-of-file-fixer - repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h index c52066e3d7ec4..e5d3d40311403 100644 --- a/paddle/cuda/include/hl_dso_loader.h +++ b/paddle/cuda/include/hl_dso_loader.h @@ -58,6 +58,6 @@ void GetCurandDsoHandle(void** dso_handle); * @param **dso_handle dso handler * */ -void GetWarpctcDsoHandle(void** dso_handle); +void GetWarpCTCDsoHandle(void** dso_handle); #endif // HL_DSO_LOADER_H_ diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/cuda/include/hl_warpctc_wrap.h index 9d2379a024fe1..dc50cf9d20829 100644 --- a/paddle/cuda/include/hl_warpctc_wrap.h +++ b/paddle/cuda/include/hl_warpctc_wrap.h @@ -16,7 +16,6 @@ limitations under the License. */ #define HL_WARPCTC_WRAP_H_ #include "hl_base.h" -/// #include "hl_cuda.h" #include "warp-ctc/include/ctc.h" typedef ctcStatus_t hl_warpctc_status_t; diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index 0f1d72043935e..e83a60ad72fa4 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -463,30 +463,18 @@ void KeSequence2BatchPadding(real* batch, int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth; int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth; + real scale = normByTimes ? 
(1.0f / (real)sequenceLength) : 1.0f; + if (sequenceIdx < sequenceLength) { if (seq2batch) { /* sequence -> batch */ - if (normByTimes) { - real scale = 1.0f / (real)sequenceLength; - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i]; - } - } else { - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - batch[batchBaseIdx + i] = sequence[sequenceBaseIdx + i]; - } + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i]; } } else { /* batch -> sequence */ - if (normByTimes) { - real scale = 1.0f / (real)sequenceLength; - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i]; - } - } else { - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - sequence[sequenceBaseIdx + i] = batch[batchBaseIdx + i]; - } + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i]; } } } else if (sequenceIdx < maxSequenceLength) { diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc index a6ea2a3b9f4b5..ce19073626a8e 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -163,7 +163,7 @@ void GetCurandDsoHandle(void** dso_handle) { #endif } -void GetWarpctcDsoHandle(void** dso_handle) { +void GetWarpCTCDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle); #else diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc index 99db0f242df74..3d3bf46158674 100644 --- a/paddle/cuda/src/hl_warpctc_wrap.cc +++ b/paddle/cuda/src/hl_warpctc_wrap.cc @@ -30,32 +30,32 @@ void* warpctc_dso_handle = nullptr; * the linked-libs of paddle or to LD_PRELOAD. */ #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_WARPCTC_WRAP(__name, __type) \ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ struct DynLoad__##__name { \ template \ - __type operator()(Args... args) { \ - typedef __type (*warpctcFunc)(Args...); \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ std::call_once( \ - warpctc_dso_flag, GetWarpctcDsoHandle, &warpctc_dso_handle); \ + warpctc_dso_flag, GetWarpCTCDsoHandle, &warpctc_dso_handle); \ void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ return reinterpret_cast(p_##_name)(args...); \ } \ } __name; // struct DynLoad__##__name #else -#define DYNAMIC_LOAD_WARPCTC_WRAP(__name, __type) \ - struct DynLoad__##__name { \ - template \ - __type operator()(Args... args) { \ - return __name(args...); \ - } \ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ } __name; // struct DynLoad__##__name #endif // include all needed warp-ctc functions -DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version, int) -DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString, const char*) -DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss, hl_warpctc_status_t) -DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size, hl_warpctc_status_t) +DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version) +DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString) +DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss) +DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size) #undef DYNAMIC_LOAD_WARPCTC_WRAP diff --git a/paddle/gserver/layers/WarpCTCLayer.cpp b/paddle/gserver/layers/WarpCTCLayer.cpp index b99e9b9c7a620..e68363a1b2bb3 100644 --- a/paddle/gserver/layers/WarpCTCLayer.cpp +++ b/paddle/gserver/layers/WarpCTCLayer.cpp @@ -100,8 +100,8 @@ void WarpCTCLayer::forward(PassType passType) { /* labels always in CPU memory */ Matrix::resizeOrCreate(cpuCosts_, - /* width */ numSequences, - /* height */ 1, + /* height */ numSequences, + /* width */ 1, /* trans */ false, /* useGpu */ false); @@ -209,17 +209,11 @@ void WarpCTCLayer::batch2seqPadding(const MatrixPtr& seqValue, int sequenceStart = seqStartPositionsData[i]; int sequenceLength = seqStartPositionsData[i + 1] - seqStartPositionsData[i]; + real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f; for (int j = 0; j < sequenceLength; j++) { - if (normByTimes) { - for (size_t k = 0; k < numClasses_; k++) { - seqData[(sequenceStart + j) * numClasses_ + k] = - batchData[(j * numSequences + i) * numClasses_ + k] / - sequenceLength; - } - } else { - memcpy(seqData + (sequenceStart + j) * numClasses_, - batchData + (j * numSequences + i) * numClasses_, - numClasses_ * sizeof(real)); + for (size_t k = 0; k < numClasses_; k++) { + seqData[(sequenceStart + j) * numClasses_ + k] = + batchData[(j * numSequences + i) * numClasses_ + k] * scale; } } } diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp index 5289c9892ceb2..aba48935a6f68 100644 --- a/paddle/gserver/tests/test_WarpCTCLayer.cpp +++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp @@ -30,7 +30,7 @@ P_DECLARE_bool(use_gpu); const real* getData(const Matrix& matrix) { if (matrix.useGpu()) { MatrixPtr cpuMatrix = Matrix::create( - matrix.getWidth(), matrix.getHeight(), matrix.isTransposed(), false); + matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false); cpuMatrix->copyFrom(matrix); return cpuMatrix->getData(); } else { @@ -200,41 +200,43 @@ LayerPtr createWarpCTCLayer(string name, TEST(Layer, WarpCTCLayer) { for (auto layerSize : {10, 64, 128}) { for (auto batchSize : {1, 10, 20, 64}) { - for (auto useGpu : {false, true}) { + for (auto normByTimes : {false, true}) { + for (auto useGpu : {false, true}) { #ifdef PADDLE_ONLY_CPU - if (useGpu) continue; + if (useGpu) continue; #endif - LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize - << " useGpu=" << useGpu; + LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize + << " normByTimes = " << normByTimes << " useGpu=" << useGpu; - FLAGS_use_gpu = useGpu; + FLAGS_use_gpu = useGpu; - Argument data0; - initArgument(batchSize, layerSize, useGpu, data0); + Argument data0; + initArgument(batchSize, layerSize, useGpu, data0); - Argument data1; - data1.resizeAndCopyFrom(data0); + Argument data1; + data1.resizeAndCopyFrom(data0); - LayerPtr dataLayer0 = - createDataLayer("data", batchSize, layerSize, useGpu, data0); - LayerPtr 
dataLayer1 = - createDataLayer("data", batchSize, layerSize, useGpu, data1); + LayerPtr dataLayer0 = + createDataLayer("data", batchSize, layerSize, useGpu, data0); + LayerPtr dataLayer1 = + createDataLayer("data", batchSize, layerSize, useGpu, data1); - LayerPtr labelLayer = - createLabelLayer("label", batchSize, layerSize, useGpu); + LayerPtr labelLayer = + createLabelLayer("label", batchSize, layerSize, useGpu); - LayerPtr warpctcLayer = createWarpCTCLayer( - "cost", layerSize, useGpu, false, dataLayer0, labelLayer); - LayerPtr ctcLayer = createCTCLayer( - "cost", layerSize, useGpu, false, dataLayer1, labelLayer); + LayerPtr warpctcLayer = createWarpCTCLayer( + "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer); + LayerPtr ctcLayer = createCTCLayer( + "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer); - /// Check loss - checkError(*(warpctcLayer->getOutput().value), - *(ctcLayer->getOutput().value)); + /// Check loss + checkError(*(warpctcLayer->getOutput().value), + *(ctcLayer->getOutput().value)); - /// Check gradients - checkError(*(dataLayer0->getOutput().grad), - *(dataLayer1->getOutput().grad)); + /// Check gradients + checkError(*(dataLayer0->getOutput().grad), + *(dataLayer1->getOutput().grad)); + } } } } diff --git a/warp-ctc b/warp-ctc new file mode 160000 index 0000000000000..bd535c8d44e03 --- /dev/null +++ b/warp-ctc @@ -0,0 +1 @@ +Subproject commit bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2 From 18b85e558a35009c3d7108e59c5ce511cf494946 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Thu, 1 Dec 2016 05:49:51 +0000 Subject: [PATCH 3/4] Add a script to auto compile the warp-ctc submodule. --- paddle/cuda/CMakeLists.txt | 3 +-- paddle/gserver/tests/CMakeLists.txt | 6 ++++- paddle/gserver/tests/test_WarpCTCLayer.cpp | 27 +++++++++++----------- paddle/scripts/travis/build_and_test.sh | 1 + paddle/scripts/travis/submodules.sh | 18 +++++++++++++++ 5 files changed, 39 insertions(+), 16 deletions(-) create mode 100755 paddle/scripts/travis/submodules.sh diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index 7e45d3d578982..10fa34b92727b 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -18,8 +18,7 @@ set(CUDA_CXX_WITH_GPU_SOURCES src/hl_cudart_wrap.cc src/hl_cuda_cublas.cc src/hl_cuda_cudnn.cc - src/hl_cuda_device.cc - ) + src/hl_cuda_device.cc) if(WITH_GPU) set(CUDA_CXX_SOURCES diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 8fc6656bf4d79..310c8ad08826f 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -71,9 +71,13 @@ add_unittest(test_RecurrentLayer ############### test_WarpCTCLayer ####################### if(NOT WITH_DOUBLE) - add_unittest(test_WarpCTCLayer + add_unittest_without_exec(test_WarpCTCLayer test_WarpCTCLayer.cpp TestUtil.cpp) + + add_test(NAME test_WarpCTCLayer + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${PROJ_ROOT}/warp-ctc/build + WORKING_DIRECTORY ${PROJ_ROOT}/paddle) endif() ############### test_RecurrentGradientMachine ############### diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp index aba48935a6f68..2dd83db345132 100644 --- a/paddle/gserver/tests/test_WarpCTCLayer.cpp +++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp @@ -38,7 +38,7 @@ const real* getData(const Matrix& matrix) { } } -void checkError(const Matrix& matrix1, const Matrix& matrix2) { +int checkError(const Matrix& matrix1, const Matrix& matrix2) { 
   CHECK_EQ(matrix1.getHeight(), matrix2.getHeight());
   CHECK_EQ(matrix1.getWidth(), matrix2.getWidth());
   CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed());
@@ -62,6 +62,7 @@ void checkError(const Matrix& matrix1, const Matrix& matrix2) {
     }
   }
   EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+  return count;
 }
 
 void initArgument(size_t batchSize,
@@ -72,7 +73,6 @@ void initArgument(size_t batchSize,
   data.grad = Matrix::create(batchSize, layerSize, false, useGpu);
   data.value->randomizeUniform();
   data.value->add(-0.5);
-  /// data.value->sigmoid(*data.value);
   data.grad->zeroMem();
 
   generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
@@ -90,9 +90,6 @@ LayerPtr createDataLayer(
   dataLayer->setData(data);
   dataLayer->forward(PASS_GC);
 
-  /// std::cout << "dataLayer: " << std::endl;
-  /// (dataLayer->getOutput().value)->print(std::cout);
-
   return layer;
 }
@@ -198,14 +195,14 @@ LayerPtr createWarpCTCLayer(string name,
 }
 
 TEST(Layer, WarpCTCLayer) {
-  for (auto layerSize : {10, 64, 128}) {
-    for (auto batchSize : {1, 10, 20, 64}) {
+  for (auto layerSize : {10, 64}) {
+    for (auto batchSize : {1, 10, 32}) {
       for (auto normByTimes : {false, true}) {
         for (auto useGpu : {false, true}) {
 #ifdef PADDLE_ONLY_CPU
           if (useGpu) continue;
 #endif
-          LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize
+          LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
                     << " normByTimes = " << normByTimes << " useGpu=" << useGpu;
 
           FLAGS_use_gpu = useGpu;
@@ -229,13 +226,17 @@ TEST(Layer, WarpCTCLayer) {
           LayerPtr ctcLayer = createCTCLayer(
              "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer);
 
-          /// Check loss
-          checkError(*(warpctcLayer->getOutput().value),
-                     *(ctcLayer->getOutput().value));
+          /// Check cost
+          LOG(INFO) << "Check cost: "
+                    << checkError(*(warpctcLayer->getOutput().value),
+                                  *(ctcLayer->getOutput().value))
+                    << " different elements.";
 
           /// Check gradients
-          checkError(*(dataLayer0->getOutput().grad),
-                     *(dataLayer1->getOutput().grad));
+          LOG(INFO) << "Check gradients: "
+                    << checkError(*(dataLayer0->getOutput().grad),
+                                  *(dataLayer1->getOutput().grad))
+                    << " different elements";
         }
       }
     }
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index 242fd982aa001..c46c119daeb61 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+./submodules.sh
 source ./common.sh
 CMAKE_EXTRA=""
 if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
diff --git a/paddle/scripts/travis/submodules.sh b/paddle/scripts/travis/submodules.sh
new file mode 100755
index 0000000000000..47bd8d87ac6a7
--- /dev/null
+++ b/paddle/scripts/travis/submodules.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+set -e
+PROJ_ROOT=$(git rev-parse --show-cdup)
+SUBMODULES=$(grep path ${PROJ_ROOT}.gitmodules | sed 's/^.*path = //')
+
+for module in $SUBMODULES
+do
+  case $module in
+  "warp-ctc")
+    if [ -d ${PROJ_ROOT}warp-ctc/build ]; then
+      rm -rf ${PROJ_ROOT}warp-ctc/build
+    fi
+    mkdir ${PROJ_ROOT}warp-ctc/build
+    cd ${PROJ_ROOT}warp-ctc/build
+    cmake ..; make
+    ;;
+  esac
+done

From 7bb7fed8336232321bcb8dfff002c224ae746cf2 Mon Sep 17 00:00:00 2001
From: Liu Yiqun
Date: Fri, 2 Dec 2016 09:22:51 +0000
Subject: [PATCH 4/4] Simplify the CMakeLists.txt and fix typos.

---
 CMakeLists.txt                                    | 13 ++++---------
 paddle/scripts/travis/build_and_test.sh           |  2 +-
 .../travis/{submodules.sh => build_submodules.sh} |  2 ++
 python/paddle/trainer_config_helpers/layers.py    | 12 ++++++------
 4 files changed, 13 insertions(+), 16 deletions(-)
 rename paddle/scripts/travis/{submodules.sh => build_submodules.sh} (93%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 28375d0cd0607..dfb5159ea1217 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,15 +77,10 @@ find_package(Git REQUIRED)
 include(version)
 add_definitions(-DPADDLE_VERSION=\"${PADDLE_VERSION}\")
-
 if(NOT WITH_GPU)
     add_definitions(-DPADDLE_ONLY_CPU)
     add_definitions(-DHPPL_STUB_FUNC)
-
-    if(WITH_DSO)
-        add_definitions(-DPADDLE_USE_DSO)
-    endif(WITH_DSO)
-
     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
     if(${CUDA_VERSION_MAJOR} GREATER 6)
@@ -107,15 +102,15 @@ else()
         set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
     endif(WITH_AVX)
 
-    if(WITH_DSO)
-        add_definitions(-DPADDLE_USE_DSO)
-    endif(WITH_DSO)
-
     # Include cuda and cudnn
     include_directories(${CUDNN_INCLUDE_DIR})
     include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)
 
+if(WITH_DSO)
+    add_definitions(-DPADDLE_USE_DSO)
+endif(WITH_DSO)
+
 if(WITH_DOUBLE)
     add_definitions(-DPADDLE_TYPE_DOUBLE)
     set(ACCURACY double)
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index c46c119daeb61..9caeb21beb15e 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-./submodules.sh
+./build_submodules.sh
 source ./common.sh
 CMAKE_EXTRA=""
 if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
diff --git a/paddle/scripts/travis/submodules.sh b/paddle/scripts/travis/build_submodules.sh
similarity index 93%
rename from paddle/scripts/travis/submodules.sh
rename to paddle/scripts/travis/build_submodules.sh
index 47bd8d87ac6a7..d458bf92bf455 100755
--- a/paddle/scripts/travis/submodules.sh
+++ b/paddle/scripts/travis/build_submodules.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 set -e
+WORK_DIR=$PWD
 PROJ_ROOT=$(git rev-parse --show-cdup)
 SUBMODULES=$(grep path ${PROJ_ROOT}.gitmodules | sed 's/^.*path = //')
 
@@ -16,3 +17,4 @@ do
     ;;
   esac
 done
+cd $WORK_DIR
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index bf043c3674b61..bec675a8cea34 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1874,7 +1874,7 @@ def img_conv_layer(input,
         param_attr.attr["initial_std"] = init_w
         param_attr.attr["initial_strategy"] = 0
         param_attr.attr["initial_smart"] = False
-    
+
     if layer_type:
         if trans:
             assert layer_type in ["exconvt"]
@@ -4125,11 +4125,11 @@ def warp_ctc_layer(input,
 
     Note:
     - Let num_classes represent the category number. Considering the 'blank'
-      label needed by CTC, you need to use (num_classes + 1) as the input size.
-      Thus, the size of both warp_ctc_layer and 'input' layer should be set to
-      num_classes + 1.
-    - You can set 'blank' to [0, num_classes - 1], which should be consistent
-      as that used in your labels.
+      label needed by CTC, you need to use (num_classes + 1) as the input
+      size. Thus, the size of both warp_ctc_layer and 'input' layer should
+      be set to num_classes + 1.
+    - You can set 'blank' to any value in the range [0, num_classes], which
+      should be consistent with the value used in your labels.
     - As a native 'softmax' activation is integrated into the warp-ctc library,
       'linear' activation is expected instead in the 'input' layer.
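
For readers configuring the layer, the note above maps onto a trainer config roughly as follows. This is an illustrative sketch rather than part of the patch series: the feature width, the 1000-category vocabulary, and the layer names are invented for the example; only the size = num_classes + 1, 'blank', and 'linear'-activation conventions come from the docstring.

    # Hypothetical trainer config sketch (not part of this patch series).
    from paddle.trainer_config_helpers import *

    num_classes = 1000  # invented category count, excluding the CTC 'blank' label

    # Feature and label inputs; the sizes here are illustrative assumptions.
    feature = data_layer(name="feature", size=120)
    label = data_layer(name="label", size=num_classes + 1)

    # The layer feeding warp_ctc_layer keeps a 'linear' activation, since
    # warp-ctc applies its own softmax; its size must be num_classes + 1.
    emission = fc_layer(input=feature,
                        size=num_classes + 1,
                        act=LinearActivation())

    cost = warp_ctc_layer(input=emission,
                          label=label,
                          size=num_classes + 1,
                          blank=num_classes,  # any value in [0, num_classes], matching the labels
                          norm_by_times=False)

With norm_by_times=True, gradients are divided by each sequence's length, matching the normByTimes scaling added to WarpCTCLayer::batch2seqPadding earlier in this series.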