Integrate warp-ctc as WarpCTCLayer, including unittest and layer inter… #651

Merged (8 commits) on Dec 7, 2016
Changes from 1 commit
Empty file added .gitmodules
Empty file.
5 changes: 5 additions & 0 deletions CMakeLists.txt
@@ -94,6 +94,11 @@ endif()
if(NOT WITH_GPU)
add_definitions(-DPADDLE_ONLY_CPU)
add_definitions(-DHPPL_STUB_FUNC)

if(WITH_DSO)
Contributor:
This duplicates line 134; it could be pulled out and written in one place.

add_definitions(-DPADDLE_USE_DSO)
Collaborator:
What does DSO refer to?

Contributor Author:
DSO is short for dynamically loading .so libraries. @gangliao implemented it to load the CUDA-related libraries, which is why it was previously used only when WITH_GPU was enabled. warp-ctc is loaded via DSO as well; since warp-ctc comes in CPU-GPU and CPU-only builds, some changes were made so that Paddle also supports DSO in CPU-only builds.
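As background, a minimal sketch of this style of dynamic loading, using dlopen/dlsym directly. It is illustrative only and is not Paddle's actual loader (which lives in src/hl_dso_loader.cc); get_warpctc_version is a function exported by warp-ctc's ctc.h, and the sketch assumes libwarpctc.so is on the loader's search path:

#include <dlfcn.h>
#include <cstdio>

int main() {
  // Load the warp-ctc shared object at runtime instead of link time.
  void* handle = dlopen("libwarpctc.so", RTLD_LAZY);
  if (handle == nullptr) {
    fprintf(stderr, "dlopen failed: %s\n", dlerror());
    return 1;
  }
  // Resolve a symbol from the loaded library.
  typedef int (*get_version_t)();
  get_version_t get_version =
      reinterpret_cast<get_version_t>(dlsym(handle, "get_warpctc_version"));
  if (get_version != nullptr) {
    printf("warp-ctc version: %d\n", get_version());
  }
  dlclose(handle);
  return 0;
}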

endif(WITH_DSO)

list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else()
if(${CUDA_VERSION_MAJOR} GREATER 6)
5 changes: 5 additions & 0 deletions cmake/util.cmake
@@ -148,6 +148,11 @@ function(link_paddle_exe TARGET_NAME)
target_link_libraries(${TARGET_NAME} rt)
endif()
endif()

if(NOT WITH_DSO)
target_link_libraries(${TARGET_NAME}
${WARPCTC_LIBRARY})
endif()
endfunction()

# link_paddle_test
34 changes: 22 additions & 12 deletions paddle/cuda/CMakeLists.txt
@@ -15,20 +15,29 @@ else()
endif()

set(CUDA_CXX_WITH_GPU_SOURCES
src/hl_cudart_wrap.cc
src/hl_cuda_cublas.cc
src/hl_cuda_cudnn.cc
- src/hl_cuda_device.cc)
src/hl_cuda_device.cc
)

- set_source_files_properties(${CUDA_CXX_WITH_GPU_SOURCES}
- PROPERTIES COMPILE_FLAGS "-D__NVCC__")
if(WITH_GPU)
set(CUDA_CXX_SOURCES
src/hl_dso_loader.cc
src/hl_warpctc_wrap.cc
${CUDA_CXX_WITH_GPU_SOURCES})

set_source_files_properties(${CUDA_CXX_SOURCES}
PROPERTIES COMPILE_FLAGS "-D__NVCC__")
else()
set(CUDA_CXX_SOURCES
src/hl_dso_loader.cc
src/hl_warpctc_wrap.cc)
endif()

set_source_files_properties(${AVX_SOURCES}
PROPERTIES COMPILE_FLAGS "-mavx")

- set(CUDA_DSO_SOURCES
- src/hl_dso_loader.cc
- src/hl_cudart_wrap.cc)

set(CUDA_CU_SOURCES
src/hl_perturbation_util.cu
src/hl_cuda_aggregate.cu
@@ -44,6 +53,7 @@ set(CUDA_CU_SOURCES
set(CUDA_HEADERS
include/hl_time.h
include/hl_dso_loader.h
include/hl_warpctc_wrap.h
include/hl_sequence.h
include/hl_cuda_cublas.h
include/hl_batch_transpose.h
@@ -75,14 +85,14 @@ if(WITH_GPU)
cuda_add_library(paddle_cuda
${CUDA_SOURCES}
${CUDA_CU_SOURCES}
- ${CUDA_DSO_SOURCES}
- ${CUDA_CXX_WITH_GPU_SOURCES})
${CUDA_CXX_SOURCES})
else()
- add_library(paddle_cuda ${CUDA_SOURCES})
add_library(paddle_cuda
${CUDA_SOURCES}
${CUDA_CXX_SOURCES})
endif()

add_style_check_target(paddle_cuda
${CUDA_SOURCES}
${CUDA_HEADERS}
- ${CUDA_DSO_SOURCES}
- ${CUDA_CXX_WITH_GPU_SOURCES})
${CUDA_CXX_SOURCES})
12 changes: 8 additions & 4 deletions paddle/cuda/include/hl_dso_loader.h
@@ -18,10 +18,6 @@ limitations under the License. */
#include <dlfcn.h>
#include <string>
#include <memory>
- #include <cuda_runtime.h>
- #include <cublas_v2.h>
- #include <curand.h>
- #include <cudnn.h>
#include "hl_base.h"

/**
@@ -56,4 +52,12 @@ void GetCudartDsoHandle(void** dso_handle);
*/
void GetCurandDsoHandle(void** dso_handle);

/**
* @brief Load the DSO of warp-ctc.
*
* @param **dso_handle dso handle
*
*/
void GetWarpctcDsoHandle(void** dso_handle);
Collaborator:
CTC is an acronym, so it should be capitalized. DSO also seems to be an acronym? So this should be GetWarpCTCDSOHandle.

Contributor Author:
@gangliao Should DSO be uniformly changed to uppercase?

Contributor:
I don't think it needs changing; Dso is fairly common, similar to the TF loader. Back when I wrote it we only had Linux, so I used the "so" naming and didn't consider DLL-style names.

Collaborator:
If it is an acronym, it should be written in all uppercase.


#endif // HL_DSO_LOADER_H_
1 change: 1 addition & 0 deletions paddle/cuda/include/hl_gpu.h
@@ -25,6 +25,7 @@ limitations under the License. */
#include "hl_sparse.h"
#include "hl_lstm.h"
#include "hl_sequence.h"
#include "hl_warpctc_wrap.h"

#ifdef HPPL_STUB_FUNC
#include "stub/hl_cuda_stub.h"
33 changes: 33 additions & 0 deletions paddle/cuda/include/hl_sequence.h
@@ -172,6 +172,39 @@ extern void hl_sequence2batch_add(real* batch,
int batchCount,
bool seq2batch);

/**
* @brief Memory copy from sequence to batch,
* while padding all sequences to the same length.
*
* if seq2batch == true
*
* copy from sequence to batch:
* batch[i] = sequence[sequenceStartPositions[i]]
*
* if seq2batch == false
*
* copy from batch to sequence:
* sequence[sequenceStartPositions[i]] = batch[i]
*
* @param[in,out] batch batch matrix.
* @param[in,out] sequence sequence matrix.
* @param[in] sequenceStartPositions index vector.
* @param[in] sequenceWidth width of sequence.
* @param[in] maxSequenceLength maximum length of sequences.
* @param[in] numSequences number of sequences.
* @param[in] normByTimes whether to divide each value by the sequence length.
* @param[in] seq2batch copy direction.
*
*/
extern void hl_sequence2batch_copy_padding(real* batch,
real* sequence,
const int* sequenceStartPositions,
const size_t sequenceWidth,
const size_t maxSequenceLength,
const size_t numSequences,
bool normByTimes,
bool seq2batch);
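
To make the copy semantics concrete, here is a CPU-side sketch of the seq2batch direction under the layout above (illustrative only; the real implementation is the CUDA kernel added in hl_cuda_sequence.cu below):

/* Sketch: CPU reference for seq2batch == true, zero-padding each
   sequence up to maxSequenceLength (illustrative only). */
for (size_t seq = 0; seq < numSequences; ++seq) {
  int start = sequenceStartPositions[seq];
  int length = sequenceStartPositions[seq + 1] - start;
  real scale = normByTimes ? 1.0f / (real)length : 1.0f;
  for (size_t t = 0; t < maxSequenceLength; ++t) {
    real* dst = batch + (t * numSequences + seq) * sequenceWidth;
    if ((int)t < length) {
      const real* src = sequence + (start + t) * sequenceWidth;
      for (size_t i = 0; i < sequenceWidth; ++i) dst[i] = scale * src[i];
    } else {
      for (size_t i = 0; i < sequenceWidth; ++i) dst[i] = 0;  // pad
    }
  }
}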

/**
* @brief dst = Op(src), src is sequence.
*
94 changes: 94 additions & 0 deletions paddle/cuda/include/hl_warpctc_wrap.h
@@ -0,0 +1,94 @@
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifndef HL_WARPCTC_WRAP_H_
#define HL_WARPCTC_WRAP_H_

#include "hl_base.h"
/// #include "hl_cuda.h"
Collaborator:
If this code is not needed, please delete it rather than commenting it out.

Contributor Author:
OK.

#include "warp-ctc/include/ctc.h"

typedef ctcStatus_t hl_warpctc_status_t;
typedef ctcOptions hl_warpctc_options_t;

/**
* @brief Initialize CTC options.
*
* @param[in] blank blank label used in the CTC loss function.
* @param[in] useGpu whether to use the GPU.
* @param[out] options handle to store CPU or GPU information.
*
*/
extern void hl_warpctc_init(const size_t blank,
bool useGpu,
hl_warpctc_options_t* options);

/**
* @brief Compute the connectionist temporal classification loss,
* and optionally compute the gradient with respect to the inputs.
*
* if batchGrad == nullptr
*
* only compute the ctc loss.
*
* if batchGrad != nullptr
*
* compute both ctc loss and gradient.
*
* @param[in] batchInput batch matrix of input probabilities,
* in maxSequenceLength x numSequence x numClasses
* (row-major) format.
* @param[out] batchGrad batch matrix of gradient.
* @param[in] cpuLabels labels, always in CPU memory.
* @param[in] cpuLabelLengths lengths of all labels, in CPU memory.
* @param[in] cpuInputLengths lengths of all sequences, in CPU memory.
* @param[in] numClasses number of possible output symbols.
* @param[in] numSequences number of sequences.
* @param[out] cpuCosts cost of each sequence, in CPU memory.
* @param[out] workspace workspace to store temporary results.
* @param[in] options handle to store CPU or GPU information.
*
*/
extern void hl_warpctc_compute_loss(const real* batchInput,
real* batchGrad,
const int* cpuLabels,
const int* cpuLabelLengths,
const int* cpuInputLengths,
const size_t numClasses,
const size_t numSequences,
real* cpuCosts,
void* workspace,
hl_warpctc_options_t* options);

/**
* @brief Compute the required workspace size.
* warp-ctc performs no memory allocation internally.
*
* @param[in] cpuLabelLengths lengths of all labels, in CPU memory.
* @param[in] cpuInputLengths lengths of all sequences, in CPU memory.
* @param[in] numClasses number of possible output symbols.
* @param[in] numSequences number of sequences.
* @param[in] options handle to store CPU or GPU information.
* @param[out] bytes pointer to a scalar where the memory
* requirement in bytes will be placed.
*
*/
extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
const int* cpuInputLengths,
const size_t numClasses,
const size_t numSequences,
hl_warpctc_options_t* options,
size_t* bytes);

#endif // HL_WARPCTC_WRAP_H_
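
A hypothetical end-to-end call sequence for the three functions above (CPU case; allocation of batchInput, batchGrad, cpuLabels, and the length/cost arrays is omitted, and the workspace must be device memory when useGpu is true):

hl_warpctc_options_t options;
hl_warpctc_init(/* blank */ 0, /* useGpu */ false, &options);

// Query the workspace size first; warp-ctc allocates nothing itself.
size_t workspaceBytes = 0;
hl_warpctc_get_workspace_size(cpuLabelLengths, cpuInputLengths,
                              numClasses, numSequences,
                              &options, &workspaceBytes);

void* workspace = malloc(workspaceBytes);
hl_warpctc_compute_loss(batchInput, batchGrad, cpuLabels,
                        cpuLabelLengths, cpuInputLengths,
                        numClasses, numSequences,
                        cpuCosts, workspace, &options);
free(workspace);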
9 changes: 9 additions & 0 deletions paddle/cuda/include/stub/hl_sequence_stub.h
@@ -70,6 +70,15 @@ inline void hl_sequence2batch_add(real* batch,
int batchCount,
bool seq2batch) {}

inline void hl_sequence2batch_copy_padding(real* batch,
real* sequence,
const int* sequenceStartPositions,
const size_t sequenceWidth,
const size_t maxSequenceLength,
const size_t numSequences,
bool normByTimes,
bool seq2batch) {}

inline void hl_sequence_avg_forward(real* dst,
real* src,
const int* starts,
118 changes: 118 additions & 0 deletions paddle/cuda/src/hl_cuda_sequence.cu
@@ -447,6 +447,124 @@ void hl_sequence2batch_add(real *batch,
CHECK_SYNC("hl_sequence2batch_add failed");
}

template<bool normByTimes, bool seq2batch>
__global__
void KeSequence2BatchPadding(real* batch,
real* sequence,
const int* sequenceStartPositions,
const size_t sequenceWidth,
const size_t maxSequenceLength,
const size_t numSequences) {
int batchIdx = blockIdx.y;
int sequenceStart = sequenceStartPositions[batchIdx];
int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;

int sequenceIdx = blockIdx.x * blockDim.y + threadIdx.y;
int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth;
int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth;

if (sequenceIdx < sequenceLength) {
if (seq2batch) {
/* sequence -> batch */
if (normByTimes) {
real scale = 1.0f / (real)sequenceLength;
for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i];
}
} else {
for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
batch[batchBaseIdx + i] = sequence[sequenceBaseIdx + i];
}
}
Contributor:
Consider merging lines 469-477 into a single for loop (same below):
real scale = normByTimes ? 1.0f / (real)sequenceLength : 1.0f;

} else {
/* batch -> sequence */
if (normByTimes) {
real scale = 1.0f / (real)sequenceLength;
for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i];
}
} else {
for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
sequence[sequenceBaseIdx + i] = batch[batchBaseIdx + i];
}
}
}
} else if (sequenceIdx < maxSequenceLength) {
if (seq2batch) {
/* sequence -> batch */
for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
batch[batchBaseIdx + i] = 0;
}
}
}
}
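
A sketch of the merge suggested in the review comment above, hoisting the scale factor so each direction needs only one loop (illustrative; not part of this commit):

/* Sketch of the reviewer's suggested merge (illustrative only). */
if (sequenceIdx < sequenceLength) {
  real scale = normByTimes ? 1.0f / (real)sequenceLength : 1.0f;
  real* dst = seq2batch ? batch + batchBaseIdx : sequence + sequenceBaseIdx;
  real* src = seq2batch ? sequence + sequenceBaseIdx : batch + batchBaseIdx;
  for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
    dst[i] = scale * src[i];
  }
} else if (sequenceIdx < maxSequenceLength && seq2batch) {
  /* zero-pad the tail of the batch */
  for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
    batch[batchBaseIdx + i] = 0;
  }
}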

void hl_sequence2batch_copy_padding(real* batch,
real* sequence,
const int* sequenceStartPositions,
const size_t sequenceWidth,
const size_t maxSequenceLength,
const size_t numSequences,
bool normByTimes,
bool seq2batch) {
CHECK_NOTNULL(batch);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(sequenceStartPositions);

if (!normByTimes && numSequences == 1) {
size_t elementCount = maxSequenceLength * sequenceWidth;
if (seq2batch) {
/* sequence -> batch */
hl_memcpy_device2device(batch, sequence, sizeof(real) * elementCount);
} else {
/* batch -> sequence */
hl_memcpy_device2device(sequence, batch, sizeof(real) * elementCount);
}
return;
}

const int CUDA_BLOCK_SIZE = 512;

/* Use at least 32 threads (rounded up to a multiple of 32) to copy
the sequenceWidth elements, targeting about 8 elements per thread. */
int blockDimX = ((((sequenceWidth + 7) >> 3) + 31) >> 5) << 5;
blockDimX = (blockDimX < CUDA_BLOCK_SIZE) ? blockDimX : CUDA_BLOCK_SIZE;

int blockDimY = CUDA_BLOCK_SIZE / blockDimX;
dim3 threads(blockDimX, blockDimY);

int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) /
CUDA_BLOCK_SIZE;
int gridDimY = numSequences;
dim3 grid(gridDimX, gridDimY);
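
/* Worked example with hypothetical sizes, sequenceWidth = 100,
   maxSequenceLength = 20, numSequences = 4:
   blockDimX = ((((100 + 7) >> 3) + 31) >> 5) << 5 = 32
     (ceil(100 / 8) = 13 threads, rounded up to a multiple of 32),
   blockDimY = 512 / 32 = 16,
   gridDimX = (20 * 32 + 511) / 512 = 2, gridDimY = 4. */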

if (seq2batch) {
/* sequence -> batch */
if (normByTimes) {
KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
} else {
KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
}
} else {
/* batch -> sequence */
if (normByTimes) {
KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
} else {
KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
batch, sequence, sequenceStartPositions,
sequenceWidth, maxSequenceLength, numSequences);
}
}

CHECK_SYNC("hl_sequence2batch_copy_padding failed");
}

__device__ inline float my_rsqrt(float x) {
return rsqrtf(x);
}