From 4d487c6f350b168f5e24094adeacb3c193d5d888 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 29 Nov 2016 07:40:17 +0000 Subject: [PATCH 1/4] Integrate warp-ctc as WarpCTCLayer, including unitest and layer interface. --- .gitmodules | 0 CMakeLists.txt | 5 + cmake/util.cmake | 5 + paddle/cuda/CMakeLists.txt | 34 ++- paddle/cuda/include/hl_dso_loader.h | 12 +- paddle/cuda/include/hl_gpu.h | 1 + paddle/cuda/include/hl_sequence.h | 33 +++ paddle/cuda/include/hl_warpctc_wrap.h | 94 +++++++ paddle/cuda/include/stub/hl_sequence_stub.h | 9 + paddle/cuda/src/hl_cuda_sequence.cu | 118 +++++++++ paddle/cuda/src/hl_cudart_wrap.cc | 1 + paddle/cuda/src/hl_dso_loader.cc | 25 +- paddle/cuda/src/hl_warpctc_wrap.cc | 157 +++++++++++ paddle/gserver/layers/WarpCTCLayer.cpp | 229 ++++++++++++++++ paddle/gserver/layers/WarpCTCLayer.h | 65 +++++ paddle/gserver/tests/CMakeLists.txt | 7 + paddle/gserver/tests/test_WarpCTCLayer.cpp | 247 ++++++++++++++++++ proto/ModelConfig.proto.m4 | 2 + python/paddle/trainer/config_parser.py | 21 ++ .../paddle/trainer_config_helpers/layers.py | 79 ++++++ .../protostr/test_cost_layers.protostr | 17 ++ .../tests/configs/test_cost_layers.py | 2 + 22 files changed, 1140 insertions(+), 23 deletions(-) create mode 100644 .gitmodules create mode 100644 paddle/cuda/include/hl_warpctc_wrap.h create mode 100644 paddle/cuda/src/hl_warpctc_wrap.cc create mode 100644 paddle/gserver/layers/WarpCTCLayer.cpp create mode 100644 paddle/gserver/layers/WarpCTCLayer.h create mode 100644 paddle/gserver/tests/test_WarpCTCLayer.cpp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/CMakeLists.txt b/CMakeLists.txt index af193c27ae7d8..e5e54cc8cfcfd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,6 +94,11 @@ endif() if(NOT WITH_GPU) add_definitions(-DPADDLE_ONLY_CPU) add_definitions(-DHPPL_STUB_FUNC) + + if(WITH_DSO) + add_definitions(-DPADDLE_USE_DSO) + endif(WITH_DSO) + list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) else() if(${CUDA_VERSION_MAJOR} GREATER 6) diff --git a/cmake/util.cmake b/cmake/util.cmake index a8282f07184c3..11641f6064b9d 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -148,6 +148,11 @@ function(link_paddle_exe TARGET_NAME) target_link_libraries(${TARGET_NAME} rt) endif() endif() + + if(NOT WITH_DSO) + target_link_libraries(${TARGET_NAME} + ${WARPCTC_LIBRARY}) + endif() endfunction() # link_paddle_test diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index 11dbfb54b2687..7e45d3d578982 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -15,20 +15,29 @@ else() endif() set(CUDA_CXX_WITH_GPU_SOURCES + src/hl_cudart_wrap.cc src/hl_cuda_cublas.cc src/hl_cuda_cudnn.cc - src/hl_cuda_device.cc) + src/hl_cuda_device.cc + ) -set_source_files_properties(${CUDA_CXX_WITH_GPU_SOURCES} - PROPERTIES COMPILE_FLAGS "-D__NVCC__") +if(WITH_GPU) + set(CUDA_CXX_SOURCES + src/hl_dso_loader.cc + src/hl_warpctc_wrap.cc + ${CUDA_CXX_WITH_GPU_SOURCES}) + + set_source_files_properties(${CUDA_CXX_SOURCES} + PROPERTIES COMPILE_FLAGS "-D__NVCC__") +else() + set(CUDA_CXX_SOURCES + src/hl_dso_loader.cc + src/hl_warpctc_wrap.cc) +endif() set_source_files_properties(${AVX_SOURCES} PROPERTIES COMPILE_FLAGS "-mavx") -set(CUDA_DSO_SOURCES - src/hl_dso_loader.cc - src/hl_cudart_wrap.cc) - set(CUDA_CU_SOURCES src/hl_perturbation_util.cu src/hl_cuda_aggregate.cu @@ -44,6 +53,7 @@ set(CUDA_CU_SOURCES set(CUDA_HEADERS include/hl_time.h include/hl_dso_loader.h + include/hl_warpctc_wrap.h 
include/hl_sequence.h include/hl_cuda_cublas.h include/hl_batch_transpose.h @@ -75,14 +85,14 @@ if(WITH_GPU) cuda_add_library(paddle_cuda ${CUDA_SOURCES} ${CUDA_CU_SOURCES} - ${CUDA_DSO_SOURCES} - ${CUDA_CXX_WITH_GPU_SOURCES}) + ${CUDA_CXX_SOURCES}) else() - add_library(paddle_cuda ${CUDA_SOURCES}) + add_library(paddle_cuda + ${CUDA_SOURCES} + ${CUDA_CXX_SOURCES}) endif() add_style_check_target(paddle_cuda ${CUDA_SOURCES} ${CUDA_HEADERS} - ${CUDA_DSO_SOURCES} - ${CUDA_CXX_WITH_GPU_SOURCES}) + ${CUDA_CXX_SOURCES}) diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h index 1eb9f9ca888d3..c52066e3d7ec4 100644 --- a/paddle/cuda/include/hl_dso_loader.h +++ b/paddle/cuda/include/hl_dso_loader.h @@ -18,10 +18,6 @@ limitations under the License. */ #include #include #include -#include -#include -#include -#include #include "hl_base.h" /** @@ -56,4 +52,12 @@ void GetCudartDsoHandle(void** dso_handle); */ void GetCurandDsoHandle(void** dso_handle); +/** + * @brief load the DSO of warp-ctc + * + * @param **dso_handle dso handler + * + */ +void GetWarpctcDsoHandle(void** dso_handle); + #endif // HL_DSO_LOADER_H_ diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h index 3be0df3b93b69..6dd6d1321270a 100644 --- a/paddle/cuda/include/hl_gpu.h +++ b/paddle/cuda/include/hl_gpu.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "hl_sparse.h" #include "hl_lstm.h" #include "hl_sequence.h" +#include "hl_warpctc_wrap.h" #ifdef HPPL_STUB_FUNC #include "stub/hl_cuda_stub.h" diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h index bb5124df44b49..b98d7bdeafe5d 100644 --- a/paddle/cuda/include/hl_sequence.h +++ b/paddle/cuda/include/hl_sequence.h @@ -172,6 +172,39 @@ extern void hl_sequence2batch_add(real* batch, int batchCount, bool seq2batch); +/** + * @brief Memory copy from sequence to batch, + * while padding all sequences to the same length. + * + * if seq2batch == true + * + * copy from sequence to batch: + * batch[i] = sequence[sequenceStartPositions[i]] + * + * if seq2batch == false + * + * copy from batch to sequence: + * sequence[sequenceStartPositions[i]] = batch[i] + * + * @param[in,out] batch batch matrix. + * @param[in,out] sequence sequence matrix. + * @param[in] sequenceStartPositions index vector. + * @param[in] sequenceWidth width of sequence. + * @param[in] maxSequenceLength maximum length of sequences. + * @param[in] numSequences number of sequences. + * @param[in] normByTimes whether dividing sequence's length. + * @param[in] seq2batch copy direction. + * + */ +extern void hl_sequence2batch_copy_padding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences, + bool normByTimes, + bool seq2batch); + /** * @brief dst = Op(src), src is sequence. * diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/cuda/include/hl_warpctc_wrap.h new file mode 100644 index 0000000000000..9d2379a024fe1 --- /dev/null +++ b/paddle/cuda/include/hl_warpctc_wrap.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef HL_WARPCTC_WRAP_H_ +#define HL_WARPCTC_WRAP_H_ + +#include "hl_base.h" +/// #include "hl_cuda.h" +#include "warp-ctc/include/ctc.h" + +typedef ctcStatus_t hl_warpctc_status_t; +typedef ctcOptions hl_warpctc_options_t; + +/** + * @brief Init ctc options. + * + * @param[in] blank blank label used in ctc loss function. + * @param[in] useGpu whether use gpu. + * @param[out] options handle to store cpu or gpu informations. + * + */ +extern void hl_warpctc_init(const size_t blank, + bool useGpu, + hl_warpctc_options_t* options); + +/** + * @brief Compute the connectionist temporal classification loss, + * and optionally compute the gradient with respect to the inputs. + * + * if batchGrad == nullptr + * + * only compute the ctc loss. + * + * if batchGrad != nullptr + * + * compute both ctc loss and gradient. + * + * @param[in] batchInput batch matrix of input probabilities, + * in maxSequenceLength x numSequence x numClasses + * (row-major) format. + * @param[out] batchGrad batch matrix of gradient. + * @param[in] cpuLabels labels always in CPU memory. + * @param[in] cpuLabelLengths length of all labels in CPU memory. + * @param[in] cpuInputLengths length of all sequences in CPU memory. + * @param[in] numClasses number of possible output symbols. + * @param[in] numSequences number of sequence. + * @param[out] cpuCosts cost of each sequence in CPU memory. + * @param[out] workspace workspace to store some temporary results. + * @param[in] options handle to store cpu or gpu informations. + * + */ +extern void hl_warpctc_compute_loss(const real* batchInput, + real* batchGrad, + const int* cpuLabels, + const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + real* cpuCosts, + void* workspace, + hl_warpctc_options_t* options); + +/** + * @brief Compute the required workspace size. + * There is no memory allocated operations within warp-ctc. + * + * @param[in] cpuLabelLengths length of all labels in CPU memory. + * @param[in] cpuInputLengths length of all sequences in CPU memory. + * @param[in] numClasses number of possible output symbols. + * @param[in] numSequences number of sequence. + * @param[in] options handle to store cpu or gpu informations. + * @param[out] bytes pointer to a scalar where the memory + * requirement in bytes will be placed. 
+ * + */ +extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + hl_warpctc_options_t* options, + size_t* bytes); + +#endif // HL_WARPCTC_WRAP_H_ diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h index 381f0a6f26c56..3343463a8d5fa 100644 --- a/paddle/cuda/include/stub/hl_sequence_stub.h +++ b/paddle/cuda/include/stub/hl_sequence_stub.h @@ -70,6 +70,15 @@ inline void hl_sequence2batch_add(real* batch, int batchCount, bool seq2batch) {} +inline void hl_sequence2batch_copy_padding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences, + bool normByTimes, + bool seq2batch) {} + inline void hl_sequence_avg_forward(real* dst, real* src, const int* starts, diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index 63824eaa4c201..0f1d72043935e 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -447,6 +447,124 @@ void hl_sequence2batch_add(real *batch, CHECK_SYNC("hl_sequence2batch_add failed"); } +template +__global__ +void KeSequence2BatchPadding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences) { + int batchIdx = blockIdx.y; + int sequenceStart = sequenceStartPositions[batchIdx]; + int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart; + + int sequenceIdx = blockIdx.x * blockDim.y + threadIdx.y; + int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth; + int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth; + + if (sequenceIdx < sequenceLength) { + if (seq2batch) { + /* sequence -> batch */ + if (normByTimes) { + real scale = 1.0f / (real)sequenceLength; + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i]; + } + } else { + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = sequence[sequenceBaseIdx + i]; + } + } + } else { + /* batch -> sequence */ + if (normByTimes) { + real scale = 1.0f / (real)sequenceLength; + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i]; + } + } else { + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + sequence[sequenceBaseIdx + i] = batch[batchBaseIdx + i]; + } + } + } + } else if (sequenceIdx < maxSequenceLength) { + if (seq2batch) { + /* sequence -> batch */ + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = 0; + } + } + } +} + +void hl_sequence2batch_copy_padding(real* batch, + real* sequence, + const int* sequenceStartPositions, + const size_t sequenceWidth, + const size_t maxSequenceLength, + const size_t numSequences, + bool normByTimes, + bool seq2batch) { + CHECK_NOTNULL(batch); + CHECK_NOTNULL(sequence); + CHECK_NOTNULL(sequenceStartPositions); + + if (!normByTimes && numSequences == 1) { + size_t elementCount = maxSequenceLength * sequenceWidth; + if (seq2batch) { + /* sequence -> batch */ + hl_memcpy_device2device(batch, sequence, sizeof(real) * elementCount); + } else { + /* batch -> sequence */ + hl_memcpy_device2device(sequence, batch, sizeof(real) * elementCount); + } + return; + } + + const int 
CUDA_BLOCK_SIZE = 512; + + /* At least use 32 threads to copy sequenceWidth elements, + and at least 8 elements for each thread. */ + int blockDimX = ((((sequenceWidth + 7) >> 3) + 31) >> 5) << 5; + blockDimX = (blockDimX < CUDA_BLOCK_SIZE) ? blockDimX : CUDA_BLOCK_SIZE; + + int blockDimY = CUDA_BLOCK_SIZE / blockDimX; + dim3 threads(blockDimX, blockDimY); + + int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) / + CUDA_BLOCK_SIZE; + int gridDimY = numSequences; + dim3 grid(gridDimX, gridDimY); + + if (seq2batch) { + /* sequence -> batch */ + if (normByTimes) { + KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } else { + KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } + } else { + /* batch -> sequence */ + if (normByTimes) { + KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } else { + KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>( + batch, sequence, sequenceStartPositions, + sequenceWidth, maxSequenceLength, numSequences); + } + } + + CHECK_SYNC("hl_sequence2batch_copy_padding failed"); +} + __device__ inline float my_rsqrt(float x) { return rsqrtf(x); } diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc index ff6b830b7addc..a95f5557afb49 100644 --- a/paddle/cuda/src/hl_cudart_wrap.cc +++ b/paddle/cuda/src/hl_cudart_wrap.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_USE_DSO #include +#include #include "hl_dso_loader.h" /** diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc index 1a3ce08619fc3..a6ea2a3b9f4b5 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -30,6 +30,8 @@ P_DEFINE_string(cuda_dir, "build-in function in cudart already ran before main entry). " "If default, dlopen will search cuda from LD_LIBRARY_PATH"); +P_DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -92,27 +94,28 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, *dso_handle = dlopen(dlPath.c_str(), dynload_flags); // if not found, search from default path if (nullptr == *dso_handle) { - LOG(WARNING) << "Failed to find cuda library: " << dlPath; + LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" + << dlerror() << ")"; dlPath = dso_name; GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); } } - CHECK(nullptr != *dso_handle) << "Failed to find cuda library: " << dlPath - << std::endl + CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath + << " (" << dlerror() << ") \n" << "Please specify its path correctly using " - "one of the following ways: \n" // NOLINT + "one of the following ways: \n" << "Method 1. set cuda and cudnn lib path at " "runtime. " << "http://www.paddlepaddle.org/doc/ui/" "cmd_argument/" - "argument_outline.html \n" // NOLINT + "argument_outline.html \n" << "For instance, issue command: paddle train " "--use_gpu=1 " << "--cuda_dir=/usr/local/cuda/lib64 " "--cudnn_dir=/usr/local/cudnn/lib " - "...\n" // NOLINT + "...\n" << "Method 2. 
set environment variable " "LD_LIBRARY_PATH on Linux or " @@ -124,7 +127,7 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, "DYLD_LIBRARY_PATH is impossible " << "unless System Integrity Protection (SIP) " "is disabled. However, " - "method 1 " // NOLINT + "method 1 " << "always work well."; } @@ -159,3 +162,11 @@ void GetCurandDsoHandle(void** dso_handle) { GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); #endif } + +void GetWarpctcDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle); +#endif +} diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc new file mode 100644 index 0000000000000..99db0f242df74 --- /dev/null +++ b/paddle/cuda/src/hl_warpctc_wrap.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "hl_warpctc_wrap.h" +#include "hl_dso_loader.h" +#include "paddle/utils/Logging.h" + +namespace dynload { + +std::once_flag warpctc_dso_flag; +void* warpctc_dso_handle = nullptr; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load warpctc routine + * via operator overloading. When PADDLE_USE_DSO is + * false, you need to add the path of libwarp-ctc.so to + * the linked-libs of paddle or to LD_PRELOAD. + */ +#ifdef PADDLE_USE_DSO +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name, __type) \ + struct DynLoad__##__name { \ + template \ + __type operator()(Args... args) { \ + typedef __type (*warpctcFunc)(Args...); \ + std::call_once( \ + warpctc_dso_flag, GetWarpctcDsoHandle, &warpctc_dso_handle); \ + void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ + return reinterpret_cast(p_##_name)(args...); \ + } \ + } __name; // struct DynLoad__##__name +#else +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name, __type) \ + struct DynLoad__##__name { \ + template \ + __type operator()(Args... args) { \ + return __name(args...); \ + } \ + } __name; // struct DynLoad__##__name +#endif + +// include all needed warp-ctc functions +DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version, int) +DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString, const char*) +DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss, hl_warpctc_status_t) +DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size, hl_warpctc_status_t) + +#undef DYNAMIC_LOAD_WARPCTC_WRAP + +} /* namespace dynload */ + +#define WARPCTC_GET_VERSION dynload::get_warpctc_version +#define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString + +#ifndef PADDLE_TYPE_DOUBLE +#define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss +#define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size +#else +#define WARPCTC_LOG_FATAL \ + LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion \ + << "] Error: not support double precision." +#define WARPCTC_COMPUTE_LOSS(...) 
WARPCTC_LOG_FATAL(__VA_ARGS__) +#define WARPCTC_GET_WORKSPACE_SIZE(...) WARPCTC_LOG_FATAL(__VA_ARGS__) +#endif + +/** + * Check build-in warp-ctc function using glog and it also + * support << operator for more details error info. + */ +static int g_warpctcVersion = -1; +#define CHECK_WARPCTC(warpctcStat) \ + CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat) \ + << "warp-ctc [version " << g_warpctcVersion \ + << "] Error: " << WARPCTC_GET_STATUS_STRING(warpctcStat) << " " + +void hl_warpctc_init(const size_t blank, + bool useGpu, + hl_warpctc_options_t* options) { + CHECK_NOTNULL(options); + + g_warpctcVersion = WARPCTC_GET_VERSION(); + + if (useGpu) { +#ifdef __NVCC__ + options->loc = CTC_GPU; + options->stream = STREAM_DEFAULT; +#else + LOG(FATAL) << "[warpctc init] GPU is not enabled."; +#endif + } else { + options->loc = CTC_CPU; + options->num_threads = 1; + } + + options->blank_label = blank; +} + +void hl_warpctc_compute_loss(const real* batchInput, + real* batchGrad, + const int* cpuLabels, + const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + real* cpuCosts, + void* workspace, + hl_warpctc_options_t* options) { + CHECK_NOTNULL(batchInput); + CHECK_NOTNULL(cpuLabels); + CHECK_NOTNULL(cpuLabelLengths); + CHECK_NOTNULL(cpuInputLengths); + CHECK_NOTNULL(cpuCosts); + CHECK_NOTNULL(workspace); + CHECK_NOTNULL(options); + + CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(batchInput, + batchGrad, + cpuLabels, + cpuLabelLengths, + cpuInputLengths, + numClasses, + numSequences, + cpuCosts, + workspace, + *options)); +} + +void hl_warpctc_get_workspace_size(const int* cpuLabelLengths, + const int* cpuInputLengths, + const size_t numClasses, + const size_t numSequences, + hl_warpctc_options_t* options, + size_t* bytes) { + CHECK_NOTNULL(cpuLabelLengths); + CHECK_NOTNULL(cpuInputLengths); + CHECK_NOTNULL(options); + CHECK_NOTNULL(bytes); + + CHECK_WARPCTC(WARPCTC_GET_WORKSPACE_SIZE(cpuLabelLengths, + cpuInputLengths, + numClasses, + numSequences, + *options, + bytes)); +} diff --git a/paddle/gserver/layers/WarpCTCLayer.cpp b/paddle/gserver/layers/WarpCTCLayer.cpp new file mode 100644 index 0000000000000..b99e9b9c7a620 --- /dev/null +++ b/paddle/gserver/layers/WarpCTCLayer.cpp @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "WarpCTCLayer.h" + +namespace paddle { + +REGISTER_LAYER(warp_ctc, WarpCTCLayer); + +bool WarpCTCLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parament class */ + Layer::init(layerMap, parameterMap); + + CHECK_EQ(inputLayers_.size(), 2UL); + + /* The inputLayers_[0] must be sequence output without softmax */ + numClasses_ = config_.size(); + CHECK_GE(numClasses_, 2UL); + CHECK_EQ(numClasses_, inputLayers_[0]->getSize()); + + blank_ = config_.blank(); + CHECK_GE(blank_, 0UL); + CHECK_LT(blank_, numClasses_); + + normByTimes_ = config_.norm_by_times(); + + // We don't need sequenceStartPositions because each sample of output_ is + // for the cost of one sequence. + setNeedSequenceInfo(false); + + return true; +} + +void WarpCTCLayer::forward(PassType passType) { + Layer::forward(passType); + + const Argument& output = getInput(0); + const Argument& labels = getInput(1); + + CHECK(output.sequenceStartPositions); + CHECK(labels.sequenceStartPositions); + CHECK(labels.ids); + + size_t numSequences = labels.sequenceStartPositions->getSize() - 1; + CHECK_EQ(numSequences, output.sequenceStartPositions->getSize() - 1); + + resizeOutput(numSequences, 1); + + const int* cpuLabelStartPositions = + labels.sequenceStartPositions->getData(false); + const int* cpuOutputStartPositions = + output.sequenceStartPositions->getData(false); + + std::vector cpuLabelLengths(numSequences); + std::vector cpuOutputLengths(numSequences); + for (size_t i = 0; i < numSequences; i++) { + cpuLabelLengths[i] = + cpuLabelStartPositions[i + 1] - cpuLabelStartPositions[i]; + cpuOutputLengths[i] = + cpuOutputStartPositions[i + 1] - cpuOutputStartPositions[i]; + } + + /* Get the maximum sequence length */ + maxSequenceLength_ = 0; + maxSequenceLength_ = *std::max_element( + cpuOutputLengths.data(), cpuOutputLengths.data() + numSequences); + + Matrix::resizeOrCreate(batchValue_, + /* height */ numSequences * maxSequenceLength_, + /* width */ numClasses_, + /* trans */ false, + /* useGpu */ useGpu_); + + Matrix::resizeOrCreate(batchGrad_, + /* height */ numSequences * maxSequenceLength_, + /* width */ numClasses_, + /* trans */ false, + /* useGpu */ useGpu_); + batchGrad_->zeroMem(); + + seq2batchPadding(output.value, batchValue_, output.sequenceStartPositions); + + /* labels always in CPU memory */ + IVector::resizeOrCreate(cpuLabels_, + /* size */ (labels.ids)->getSize(), + /* useGpu */ false); + cpuLabels_->copyFrom(*(labels.ids)); + + /* labels always in CPU memory */ + Matrix::resizeOrCreate(cpuCosts_, + /* width */ numSequences, + /* height */ 1, + /* trans */ false, + /* useGpu */ false); + + /* Init warp-ctc options */ + hl_warpctc_options_t options; + hl_warpctc_init(blank_, useGpu_, &options); + + /* Get the needed workspace size */ + size_t workspaceBytes = 0; + hl_warpctc_get_workspace_size(cpuLabelLengths.data(), + cpuOutputLengths.data(), + numClasses_, + numSequences, + &options, + &workspaceBytes); + CHECK_GT(workspaceBytes, 0UL); + + size_t workspaceLength = workspaceBytes / sizeof(real) + 1; + Vector::resizeOrCreate(workspace_, + /* size */ workspaceLength, + /* useGpu */ useGpu_); + + hl_warpctc_compute_loss(batchValue_->getData(), + batchGrad_->getData(), + cpuLabels_->getData(), + cpuLabelLengths.data(), + cpuOutputLengths.data(), + numClasses_, + numSequences, + cpuCosts_->getData(), + workspace_->getData(), + &options); + + /* Copy the costs */ + output_.value->copyFrom(*cpuCosts_); +} + +void WarpCTCLayer::backward(const UpdateCallback& 
callback) { + (void)callback; + + const Argument& output = getInput(0); + CHECK(batchGrad_); + + batch2seqPadding( + output.grad, batchGrad_, output.sequenceStartPositions, normByTimes_); +} + +void WarpCTCLayer::seq2batchPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions) { + size_t numSequences = seqStartPositions->getSize() - 1; + const int* seqStartPositionsData = seqStartPositions->getData(useGpu_); + + real* seqData = seqValue->getData(); + real* batchData = batchValue->getData(); + if (useGpu_) { + hl_sequence2batch_copy_padding(batchData, + seqData, + seqStartPositionsData, + numClasses_, + maxSequenceLength_, + numSequences, + false, + true); + } else { + for (size_t i = 0; i < maxSequenceLength_; i++) { + for (size_t j = 0; j < numSequences; j++) { + size_t sequenceStart = seqStartPositionsData[j]; + size_t sequenceLength = + seqStartPositionsData[j + 1] - seqStartPositionsData[j]; + if (i < sequenceLength) { + memcpy(batchData + (i * numSequences + j) * numClasses_, + seqData + (sequenceStart + i) * numClasses_, + numClasses_ * sizeof(real)); + } else { + memset(batchData + (i * numSequences + j) * numClasses_, + 0, + numClasses_ * sizeof(real)); + } + } + } + } +} + +void WarpCTCLayer::batch2seqPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions, + bool normByTimes) { + size_t numSequences = seqStartPositions->getSize() - 1; + const int* seqStartPositionsData = seqStartPositions->getData(useGpu_); + + real* seqData = seqValue->getData(); + real* batchData = batchValue->getData(); + if (useGpu_) { + hl_sequence2batch_copy_padding(batchData, + seqData, + seqStartPositionsData, + numClasses_, + maxSequenceLength_, + numSequences, + normByTimes, + false); + } else { + for (size_t i = 0; i < numSequences; i++) { + int sequenceStart = seqStartPositionsData[i]; + int sequenceLength = + seqStartPositionsData[i + 1] - seqStartPositionsData[i]; + for (int j = 0; j < sequenceLength; j++) { + if (normByTimes) { + for (size_t k = 0; k < numClasses_; k++) { + seqData[(sequenceStart + j) * numClasses_ + k] = + batchData[(j * numSequences + i) * numClasses_ + k] / + sequenceLength; + } + } else { + memcpy(seqData + (sequenceStart + j) * numClasses_, + batchData + (j * numSequences + i) * numClasses_, + numClasses_ * sizeof(real)); + } + } + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/WarpCTCLayer.h b/paddle/gserver/layers/WarpCTCLayer.h new file mode 100644 index 0000000000000..1b0f5ba267ae5 --- /dev/null +++ b/paddle/gserver/layers/WarpCTCLayer.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" + +namespace paddle { + +/** + * @brief A layer integrating the open-source warp-ctc library + * to compute connectionist + * temporal classification cost. + * + * The config file api is warp_ctc_layer. 
+ */ +class WarpCTCLayer : public Layer { +public: + explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {} + ~WarpCTCLayer() {} + + virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + virtual void forward(PassType passType); + virtual void backward(const UpdateCallback& callback); + +protected: + /** + * sequence matrix and batch matrix copy: + * sequence (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3) + * batch (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0) + */ + void seq2batchPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions); + void batch2seqPadding(const MatrixPtr& seqValue, + MatrixPtr& batchValue, + const ICpuGpuVectorPtr& seqStartPositions, + bool normByTimes); + +protected: + size_t numClasses_; + size_t blank_; + size_t maxSequenceLength_; + bool normByTimes_; + + MatrixPtr batchValue_; + MatrixPtr batchGrad_; + VectorPtr workspace_; + + IVectorPtr cpuLabels_; + MatrixPtr cpuCosts_; +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 0651d0b4733ea..5427dc062d566 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -62,6 +62,13 @@ add_unittest(test_RecurrentLayer test_RecurrentLayer.cpp TestUtil.cpp) +############### test_WarpCTCLayer ####################### +if(NOT WITH_DOUBLE) + add_unittest(test_WarpCTCLayer + test_WarpCTCLayer.cpp + TestUtil.cpp) +endif() + ############### test_RecurrentGradientMachine ############### # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine # I will fix it. diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp new file mode 100644 index 0000000000000..5289c9892ceb2 --- /dev/null +++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp @@ -0,0 +1,247 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/gserver/layers/Layer.h" +#include "paddle/gserver/layers/DataLayer.h" +#include "paddle/gserver/layers/CTCLayer.h" +#include "paddle/gserver/layers/WarpCTCLayer.h" +#include "ModelConfig.pb.h" + +#include "TestUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +P_DECLARE_bool(use_gpu); + +const real* getData(const Matrix& matrix) { + if (matrix.useGpu()) { + MatrixPtr cpuMatrix = Matrix::create( + matrix.getWidth(), matrix.getHeight(), matrix.isTransposed(), false); + cpuMatrix->copyFrom(matrix); + return cpuMatrix->getData(); + } else { + return matrix.getData(); + } +} + +void checkError(const Matrix& matrix1, const Matrix& matrix2) { + CHECK_EQ(matrix1.getHeight(), matrix2.getHeight()); + CHECK_EQ(matrix1.getWidth(), matrix2.getWidth()); + CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed()); +#ifndef PADDLE_TYPE_DOUBLE + real err = 1e-3; +#else + real err = 1e-10; +#endif + + int height = matrix1.getHeight(); + int width = matrix1.getWidth(); + + const real* data1 = getData(matrix1); + const real* data2 = getData(matrix2); + int count = 0; + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + if (fabs(data1[i * width + j] - data2[i * width + j]) > err) { + count++; + } + } + } + EXPECT_EQ(count, 0) << "There are " << count << " different element."; +} + +void initArgument(size_t batchSize, + int layerSize, + bool useGpu, + Argument& data) { + data.value = Matrix::create(batchSize, layerSize, false, useGpu); + data.grad = Matrix::create(batchSize, layerSize, false, useGpu); + data.value->randomizeUniform(); + data.value->add(-0.5); + /// data.value->sigmoid(*data.value); + data.grad->zeroMem(); + + generateSequenceStartPositions(batchSize, data.sequenceStartPositions); +} + +LayerPtr createDataLayer( + string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) { + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("data"); + layerConfig.set_size(layerSize); + LayerPtr layer = LayerPtr(new DataLayer(layerConfig)); + + DataLayerPtr dataLayer = std::dynamic_pointer_cast(layer); + dataLayer->setData(data); + dataLayer->forward(PASS_GC); + + /// std::cout << "dataLayer: " << std::endl; + /// (dataLayer->getOutput().value)->print(std::cout); + + return layer; +} + +LayerPtr createLabelLayer(string name, + size_t batchSize, + size_t numClasses, + bool useGpu) { + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("data"); + layerConfig.set_size(1); + LayerPtr layer = LayerPtr(new DataLayer(layerConfig)); + + Argument data; + data.ids = IVector::create(batchSize, useGpu); + data.ids->rand(numClasses - 1); + + generateSequenceStartPositions(batchSize, data.sequenceStartPositions); + + DataLayerPtr labelLayer = std::dynamic_pointer_cast(layer); + labelLayer->setData(data); + labelLayer->forward(PASS_GC); + + return layer; +} + +LayerPtr createCTCLayer(string name, + size_t numClasses, + bool useGpu, + bool normByTimes, + LayerPtr dataLayer, + LayerPtr labelLayer) { + LayerMap layerMap; + layerMap[dataLayer->getName()] = dataLayer; + layerMap[labelLayer->getName()] = labelLayer; + + ParameterMap parameterMap; + + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("ctc"); + layerConfig.set_size(numClasses); + layerConfig.set_norm_by_times(normByTimes); + + layerConfig.add_inputs(); + LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); + input0.set_input_layer_name(dataLayer->getName()); + + 
layerConfig.add_inputs(); + LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); + input1.set_input_layer_name(labelLayer->getName()); + + LayerPtr layer = LayerPtr(new CTCLayer(layerConfig)); + layerMap[layer->getName()] = layer; + layer->init(layerMap, parameterMap); + + ActivationFunction* softmaxActivation = ActivationFunction::create("softmax"); + + softmaxActivation->forward(dataLayer->getOutput()); + layer->forward(PASS_GC); + + layer->backward(); + softmaxActivation->backward(dataLayer->getOutput()); + + return layer; +} + +LayerPtr createWarpCTCLayer(string name, + size_t numClasses, + bool useGpu, + bool normByTimes, + LayerPtr dataLayer, + LayerPtr labelLayer) { + LayerMap layerMap; + layerMap[dataLayer->getName()] = dataLayer; + layerMap[labelLayer->getName()] = labelLayer; + + ParameterMap parameterMap; + + LayerConfig layerConfig; + layerConfig.set_name(name); + layerConfig.set_type("warp_ctc"); + layerConfig.set_size(numClasses); + layerConfig.set_blank(numClasses - 1); + layerConfig.set_norm_by_times(normByTimes); + + layerConfig.add_inputs(); + LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0)); + input0.set_input_layer_name(dataLayer->getName()); + + layerConfig.add_inputs(); + LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1)); + input1.set_input_layer_name(labelLayer->getName()); + + LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig)); + layerMap[layer->getName()] = layer; + layer->init(layerMap, parameterMap); + + layer->forward(PASS_GC); + layer->backward(); + + return layer; +} + +TEST(Layer, WarpCTCLayer) { + for (auto layerSize : {10, 64, 128}) { + for (auto batchSize : {1, 10, 20, 64}) { + for (auto useGpu : {false, true}) { +#ifdef PADDLE_ONLY_CPU + if (useGpu) continue; +#endif + LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize + << " useGpu=" << useGpu; + + FLAGS_use_gpu = useGpu; + + Argument data0; + initArgument(batchSize, layerSize, useGpu, data0); + + Argument data1; + data1.resizeAndCopyFrom(data0); + + LayerPtr dataLayer0 = + createDataLayer("data", batchSize, layerSize, useGpu, data0); + LayerPtr dataLayer1 = + createDataLayer("data", batchSize, layerSize, useGpu, data1); + + LayerPtr labelLayer = + createLabelLayer("label", batchSize, layerSize, useGpu); + + LayerPtr warpctcLayer = createWarpCTCLayer( + "cost", layerSize, useGpu, false, dataLayer0, labelLayer); + LayerPtr ctcLayer = createCTCLayer( + "cost", layerSize, useGpu, false, dataLayer1, labelLayer); + + /// Check loss + checkError(*(warpctcLayer->getOutput().value), + *(ctcLayer->getOutput().value)); + + /// Check gradients + checkError(*(dataLayer0->getOutput().grad), + *(dataLayer1->getOutput().grad)); + } + } + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4 index 68a5eb9dd2231..08108a46661b5 100644 --- a/proto/ModelConfig.proto.m4 +++ b/proto/ModelConfig.proto.m4 @@ -414,6 +414,8 @@ sinclude(`ModelConfigLayer.proto.m4') // to string and reinterpreted in the user's own layer implementation. 
optional string user_arg = 49; + // For WarpCTCLayer + optional uint32 blank = 50 [default = 0]; } message EvaluatorConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 9db42bf172a77..e987ad17d6543 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2993,6 +2993,27 @@ def __init__(self, name, size, inputs, norm_by_times=False, device=None): config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs') +@config_layer('warp_ctc') +class WarpCTCLayer(LayerBase): + def __init__(self, + name, + size, + inputs, + blank=0, + norm_by_times=False, + device=None): + super(WarpCTCLayer, self).__init__( + name, 'warp_ctc', size=size, inputs=inputs, device=device) + self.config.blank = blank + self.config.norm_by_times = norm_by_times + config_assert(len(self.inputs) == 2, 'WarpCTCLayer must have 2 inputs') + input_layer = self.get_input_layer(0) + config_assert( + (input_layer.active_type == '' or + input_layer.active_type == 'linear'), + "Expecting the active_type of input layer to be linear or null") + + @config_layer('recurrent_layer_group') class RecurrentLayerGroup(LayerBase): def __init__(self, name, device=None): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 9a45a5158984a..888d48722ab91 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -91,6 +91,7 @@ 'linear_comb_layer', 'convex_comb_layer', 'ctc_layer', + 'warp_ctc_layer', 'crf_layer', 'crf_decoding_layer', 'nce_layer', @@ -169,6 +170,7 @@ class LayerType(object): PRINT_LAYER = "print" CTC_LAYER = "ctc" + WARP_CTC_LAYER = "warp_ctc" CRF_LAYER = "crf" CRF_DECODING_LAYER = "crf_decoding" NCE_LAYER = 'nce' @@ -4085,6 +4087,83 @@ def ctc_layer(input, return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size) +@wrap_name_default() +@layer_support() +def warp_ctc_layer(input, + label, + size=None, + name=None, + blank=0, + norm_by_times=False, + layer_attr=None): + """ + A layer intergrating the open-source `warp-ctc + ` library, which is used in + `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin + `, to compute Connectionist Temporal + Classification (CTC) loss. + + More details of CTC can be found by referring to `Connectionist Temporal + Classification: Labelling Unsegmented Sequence Data with Recurrent + Neural Networks `_ + + Note: + - Let num_classes represent the category number. Considering the 'blank' + label needed by CTC, you need to use (num_classes + 1) as the input size. + Thus, the size of both warp_ctc_layer and 'input' layer should be set to + num_classes + 1. + - You can set 'blank' to [0, num_classes - 1], which should be consistent + as that used in your labels. + - As a native 'softmax' activation is interated to the warp-ctc library, + 'linear' activation is expected instead in the 'input' layer. + + The simple usage: + + .. code-block:: python + + ctc = warp_ctc_layer(input=input, + label=label, + size=1001, + blank=1000, + norm_by_times=False) + + :param input: The input layer. + :type input: LayerOutput + :param label: The data layer of label with variable length. + :type label: LayerOutput + :param size: category numbers + 1. + :type size: int + :param name: The name of this layer, which can not specify. 
+ :type name: basestring|None + :param blank: the 'blank' label used in ctc + :type blank: int + :param norm_by_times: Whether to normalization by times. False by default. + :type norm_by_times: bool + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None + :return: LayerOutput object. + :rtype: LayerOutput + """ + assert isinstance(input, LayerOutput) + assert isinstance(label, LayerOutput) + if label.size is not None: + if size is not None: + assert size == label.size + 1 + else: + size = label.size + 1 + Layer( + name=name, + type=LayerType.WARP_CTC_LAYER, + size=size, + blank=blank, + norm_by_times=norm_by_times, + inputs=[input.name, label.name], + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.WARP_CTC_LAYER, parents=[input, label], size=size) + + @wrap_name_default() @wrap_param_attr_default() @layer_support() diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr index f6045fe1f6825..10e59e21bc7a4 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr @@ -47,6 +47,20 @@ layers { } norm_by_times: false } +layers { + name: "__warp_ctc_layer_0__" + type: "warp_ctc" + size: 5001 + active_type: "" + inputs { + input_layer_name: "input" + } + inputs { + input_layer_name: "labels" + } + norm_by_times: false + blank: 0 +} layers { name: "crf_label" type: "data" @@ -244,6 +258,7 @@ input_layer_names: "xe-label" input_layer_names: "huber_probs" input_layer_names: "huber_label" output_layer_names: "__ctc_layer_0__" +output_layer_names: "__warp_ctc_layer_0__" output_layer_names: "__crf_layer_0__" output_layer_names: "__rank_cost_0__" output_layer_names: "__lambda_cost_0__" @@ -260,6 +275,7 @@ sub_models { layer_names: "xe-label" layer_names: "__fc_layer_0__" layer_names: "__ctc_layer_0__" + layer_names: "__warp_ctc_layer_0__" layer_names: "crf_label" layer_names: "__crf_layer_0__" layer_names: "left" @@ -289,6 +305,7 @@ sub_models { input_layer_names: "huber_probs" input_layer_names: "huber_label" output_layer_names: "__ctc_layer_0__" + output_layer_names: "__warp_ctc_layer_0__" output_layer_names: "__crf_layer_0__" output_layer_names: "__rank_cost_0__" output_layer_names: "__lambda_cost_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py index fd979a1e9f433..18ff6b48c495b 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py @@ -12,6 +12,8 @@ outputs( ctc_layer( input=seq_in, label=labels), + warp_ctc_layer( + input=seq_in, label=labels, blank=0), crf_layer( input=hidden, label=data_layer( name='crf_label', size=4)), From a816443e118f3abc69d0a474e01135d06d5f638e Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Wed, 30 Nov 2016 05:32:16 +0000 Subject: [PATCH 2/4] Add submodule warp-ctc. 
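For reference, the padded-batch layout produced by hl_sequence2batch_copy_padding
(and by WarpCTCLayer::seq2batchPadding on CPU) can be sketched in plain Python/NumPy,
with the norm-by-times branch folded into a single scale factor, as this patch now
does in the kernel. The names below are illustrative only and not part of any Paddle
API; the batch-to-sequence direction is analogous.

import numpy as np

def seq2batch_padding(seq, starts, max_len, norm_by_times=False):
    """seq: (totalTimesteps, numClasses) array; starts: per-sequence start offsets."""
    num_seqs = len(starts) - 1
    num_classes = seq.shape[1]
    batch = np.zeros((max_len * num_seqs, num_classes), dtype=seq.dtype)
    for j in range(num_seqs):
        length = starts[j + 1] - starts[j]
        scale = 1.0 / length if norm_by_times else 1.0
        for i in range(length):
            # row layout matches the CUDA kernel: timestep * numSequences + sequenceIdx
            batch[i * num_seqs + j] = scale * seq[starts[j] + i]
    return batch

starts = [0, 4, 6, 9, 10]        # four sequences of lengths 4, 2, 3, 1
seq = np.random.rand(10, 5)      # 10 timesteps in total, numClasses = 5
batch = seq2batch_padding(seq, starts, max_len=4)
print(batch.shape)               # (16, 5): maxSequenceLength * numSequences rows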
--- .gitmodules | 3 ++ .pre-commit-config.yaml | 2 + paddle/cuda/include/hl_dso_loader.h | 2 +- paddle/cuda/include/hl_warpctc_wrap.h | 1 - paddle/cuda/src/hl_cuda_sequence.cu | 24 +++------- paddle/cuda/src/hl_dso_loader.cc | 2 +- paddle/cuda/src/hl_warpctc_wrap.cc | 28 +++++------ paddle/gserver/layers/WarpCTCLayer.cpp | 18 +++----- paddle/gserver/tests/test_WarpCTCLayer.cpp | 54 +++++++++++----------- warp-ctc | 1 + 10 files changed, 62 insertions(+), 73 deletions(-) create mode 160000 warp-ctc diff --git a/.gitmodules b/.gitmodules index e69de29bb2d1d..f635e65784af4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "warp-ctc"] + path = warp-ctc + url = https://github.com/baidu-research/warp-ctc.git diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 90c25e435083d..942669c41ff15 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,6 +2,7 @@ sha: c25201a00e6b0514370501050cf2a8538ac12270 hooks: - id: remove-crlf + files: (?!.*warp-ctc)^.*$ - repo: https://github.com/reyoung/mirrors-yapf.git sha: v0.13.2 hooks: @@ -13,6 +14,7 @@ - id: check-merge-conflict - id: check-symlinks - id: detect-private-key + files: (?!.*warp-ctc)^.*$ - id: end-of-file-fixer - repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h index c52066e3d7ec4..e5d3d40311403 100644 --- a/paddle/cuda/include/hl_dso_loader.h +++ b/paddle/cuda/include/hl_dso_loader.h @@ -58,6 +58,6 @@ void GetCurandDsoHandle(void** dso_handle); * @param **dso_handle dso handler * */ -void GetWarpctcDsoHandle(void** dso_handle); +void GetWarpCTCDsoHandle(void** dso_handle); #endif // HL_DSO_LOADER_H_ diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/cuda/include/hl_warpctc_wrap.h index 9d2379a024fe1..dc50cf9d20829 100644 --- a/paddle/cuda/include/hl_warpctc_wrap.h +++ b/paddle/cuda/include/hl_warpctc_wrap.h @@ -16,7 +16,6 @@ limitations under the License. */ #define HL_WARPCTC_WRAP_H_ #include "hl_base.h" -/// #include "hl_cuda.h" #include "warp-ctc/include/ctc.h" typedef ctcStatus_t hl_warpctc_status_t; diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index 0f1d72043935e..e83a60ad72fa4 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -463,30 +463,18 @@ void KeSequence2BatchPadding(real* batch, int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth; int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth; + real scale = normByTimes ? 
(1.0f / (real)sequenceLength) : 1.0f; + if (sequenceIdx < sequenceLength) { if (seq2batch) { /* sequence -> batch */ - if (normByTimes) { - real scale = 1.0f / (real)sequenceLength; - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i]; - } - } else { - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - batch[batchBaseIdx + i] = sequence[sequenceBaseIdx + i]; - } + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i]; } } else { /* batch -> sequence */ - if (normByTimes) { - real scale = 1.0f / (real)sequenceLength; - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i]; - } - } else { - for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { - sequence[sequenceBaseIdx + i] = batch[batchBaseIdx + i]; - } + for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) { + sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i]; } } } else if (sequenceIdx < maxSequenceLength) { diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc index a6ea2a3b9f4b5..ce19073626a8e 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -163,7 +163,7 @@ void GetCurandDsoHandle(void** dso_handle) { #endif } -void GetWarpctcDsoHandle(void** dso_handle) { +void GetWarpCTCDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle); #else diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc index 99db0f242df74..3d3bf46158674 100644 --- a/paddle/cuda/src/hl_warpctc_wrap.cc +++ b/paddle/cuda/src/hl_warpctc_wrap.cc @@ -30,32 +30,32 @@ void* warpctc_dso_handle = nullptr; * the linked-libs of paddle or to LD_PRELOAD. */ #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_WARPCTC_WRAP(__name, __type) \ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ struct DynLoad__##__name { \ template \ - __type operator()(Args... args) { \ - typedef __type (*warpctcFunc)(Args...); \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using warpctcFunc = decltype(__name(args...)) (*)(Args...); \ std::call_once( \ - warpctc_dso_flag, GetWarpctcDsoHandle, &warpctc_dso_handle); \ + warpctc_dso_flag, GetWarpCTCDsoHandle, &warpctc_dso_handle); \ void* p_##_name = dlsym(warpctc_dso_handle, #__name); \ return reinterpret_cast(p_##_name)(args...); \ } \ } __name; // struct DynLoad__##__name #else -#define DYNAMIC_LOAD_WARPCTC_WRAP(__name, __type) \ - struct DynLoad__##__name { \ - template \ - __type operator()(Args... args) { \ - return __name(args...); \ - } \ +#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ } __name; // struct DynLoad__##__name #endif // include all needed warp-ctc functions -DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version, int) -DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString, const char*) -DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss, hl_warpctc_status_t) -DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size, hl_warpctc_status_t) +DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version) +DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString) +DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss) +DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size) #undef DYNAMIC_LOAD_WARPCTC_WRAP diff --git a/paddle/gserver/layers/WarpCTCLayer.cpp b/paddle/gserver/layers/WarpCTCLayer.cpp index b99e9b9c7a620..e68363a1b2bb3 100644 --- a/paddle/gserver/layers/WarpCTCLayer.cpp +++ b/paddle/gserver/layers/WarpCTCLayer.cpp @@ -100,8 +100,8 @@ void WarpCTCLayer::forward(PassType passType) { /* labels always in CPU memory */ Matrix::resizeOrCreate(cpuCosts_, - /* width */ numSequences, - /* height */ 1, + /* height */ numSequences, + /* width */ 1, /* trans */ false, /* useGpu */ false); @@ -209,17 +209,11 @@ void WarpCTCLayer::batch2seqPadding(const MatrixPtr& seqValue, int sequenceStart = seqStartPositionsData[i]; int sequenceLength = seqStartPositionsData[i + 1] - seqStartPositionsData[i]; + real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f; for (int j = 0; j < sequenceLength; j++) { - if (normByTimes) { - for (size_t k = 0; k < numClasses_; k++) { - seqData[(sequenceStart + j) * numClasses_ + k] = - batchData[(j * numSequences + i) * numClasses_ + k] / - sequenceLength; - } - } else { - memcpy(seqData + (sequenceStart + j) * numClasses_, - batchData + (j * numSequences + i) * numClasses_, - numClasses_ * sizeof(real)); + for (size_t k = 0; k < numClasses_; k++) { + seqData[(sequenceStart + j) * numClasses_ + k] = + batchData[(j * numSequences + i) * numClasses_ + k] * scale; } } } diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp index 5289c9892ceb2..aba48935a6f68 100644 --- a/paddle/gserver/tests/test_WarpCTCLayer.cpp +++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp @@ -30,7 +30,7 @@ P_DECLARE_bool(use_gpu); const real* getData(const Matrix& matrix) { if (matrix.useGpu()) { MatrixPtr cpuMatrix = Matrix::create( - matrix.getWidth(), matrix.getHeight(), matrix.isTransposed(), false); + matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false); cpuMatrix->copyFrom(matrix); return cpuMatrix->getData(); } else { @@ -200,41 +200,43 @@ LayerPtr createWarpCTCLayer(string name, TEST(Layer, WarpCTCLayer) { for (auto layerSize : {10, 64, 128}) { for (auto batchSize : {1, 10, 20, 64}) { - for (auto useGpu : {false, true}) { + for (auto normByTimes : {false, true}) { + for (auto useGpu : {false, true}) { #ifdef PADDLE_ONLY_CPU - if (useGpu) continue; + if (useGpu) continue; #endif - LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize - << " useGpu=" << useGpu; + LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize + << " normByTimes = " << normByTimes << " useGpu=" << useGpu; - FLAGS_use_gpu = useGpu; + FLAGS_use_gpu = useGpu; - Argument data0; - initArgument(batchSize, layerSize, useGpu, data0); + Argument data0; + initArgument(batchSize, layerSize, useGpu, data0); - Argument data1; - data1.resizeAndCopyFrom(data0); + Argument data1; + data1.resizeAndCopyFrom(data0); - LayerPtr dataLayer0 = - createDataLayer("data", batchSize, layerSize, useGpu, data0); - LayerPtr 
dataLayer1 = - createDataLayer("data", batchSize, layerSize, useGpu, data1); + LayerPtr dataLayer0 = + createDataLayer("data", batchSize, layerSize, useGpu, data0); + LayerPtr dataLayer1 = + createDataLayer("data", batchSize, layerSize, useGpu, data1); - LayerPtr labelLayer = - createLabelLayer("label", batchSize, layerSize, useGpu); + LayerPtr labelLayer = + createLabelLayer("label", batchSize, layerSize, useGpu); - LayerPtr warpctcLayer = createWarpCTCLayer( - "cost", layerSize, useGpu, false, dataLayer0, labelLayer); - LayerPtr ctcLayer = createCTCLayer( - "cost", layerSize, useGpu, false, dataLayer1, labelLayer); + LayerPtr warpctcLayer = createWarpCTCLayer( + "cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer); + LayerPtr ctcLayer = createCTCLayer( + "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer); - /// Check loss - checkError(*(warpctcLayer->getOutput().value), - *(ctcLayer->getOutput().value)); + /// Check loss + checkError(*(warpctcLayer->getOutput().value), + *(ctcLayer->getOutput().value)); - /// Check gradients - checkError(*(dataLayer0->getOutput().grad), - *(dataLayer1->getOutput().grad)); + /// Check gradients + checkError(*(dataLayer0->getOutput().grad), + *(dataLayer1->getOutput().grad)); + } } } } diff --git a/warp-ctc b/warp-ctc new file mode 160000 index 0000000000000..bd535c8d44e03 --- /dev/null +++ b/warp-ctc @@ -0,0 +1 @@ +Subproject commit bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2 From 18b85e558a35009c3d7108e59c5ce511cf494946 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Thu, 1 Dec 2016 05:49:51 +0000 Subject: [PATCH 3/4] Add a script to auto compile the warp-ctc submodule. --- paddle/cuda/CMakeLists.txt | 3 +-- paddle/gserver/tests/CMakeLists.txt | 6 ++++- paddle/gserver/tests/test_WarpCTCLayer.cpp | 27 +++++++++++----------- paddle/scripts/travis/build_and_test.sh | 1 + paddle/scripts/travis/submodules.sh | 18 +++++++++++++++ 5 files changed, 39 insertions(+), 16 deletions(-) create mode 100755 paddle/scripts/travis/submodules.sh diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index 7e45d3d578982..10fa34b92727b 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -18,8 +18,7 @@ set(CUDA_CXX_WITH_GPU_SOURCES src/hl_cudart_wrap.cc src/hl_cuda_cublas.cc src/hl_cuda_cudnn.cc - src/hl_cuda_device.cc - ) + src/hl_cuda_device.cc) if(WITH_GPU) set(CUDA_CXX_SOURCES diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 8fc6656bf4d79..310c8ad08826f 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -71,9 +71,13 @@ add_unittest(test_RecurrentLayer ############### test_WarpCTCLayer ####################### if(NOT WITH_DOUBLE) - add_unittest(test_WarpCTCLayer + add_unittest_without_exec(test_WarpCTCLayer test_WarpCTCLayer.cpp TestUtil.cpp) + + add_test(NAME test_WarpCTCLayer + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${PROJ_ROOT}/warp-ctc/build + WORKING_DIRECTORY ${PROJ_ROOT}/paddle) endif() ############### test_RecurrentGradientMachine ############### diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp index aba48935a6f68..2dd83db345132 100644 --- a/paddle/gserver/tests/test_WarpCTCLayer.cpp +++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp @@ -38,7 +38,7 @@ const real* getData(const Matrix& matrix) { } } -void checkError(const Matrix& matrix1, const Matrix& matrix2) { +int checkError(const Matrix& matrix1, const Matrix& matrix2) { 
   CHECK_EQ(matrix1.getHeight(), matrix2.getHeight());
   CHECK_EQ(matrix1.getWidth(), matrix2.getWidth());
   CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed());
@@ -62,6 +62,7 @@ void checkError(const Matrix& matrix1, const Matrix& matrix2) {
     }
   }
   EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+  return count;
 }
 
 void initArgument(size_t batchSize,
@@ -72,7 +73,6 @@ void initArgument(size_t batchSize,
   data.grad = Matrix::create(batchSize, layerSize, false, useGpu);
   data.value->randomizeUniform();
   data.value->add(-0.5);
-  /// data.value->sigmoid(*data.value);
   data.grad->zeroMem();
 
   generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
@@ -90,9 +90,6 @@ LayerPtr createDataLayer(
   dataLayer->setData(data);
   dataLayer->forward(PASS_GC);
 
-  /// std::cout << "dataLayer: " << std::endl;
-  /// (dataLayer->getOutput().value)->print(std::cout);
-
   return layer;
 }
@@ -198,14 +195,14 @@ LayerPtr createWarpCTCLayer(string name,
 }
 
 TEST(Layer, WarpCTCLayer) {
-  for (auto layerSize : {10, 64, 128}) {
-    for (auto batchSize : {1, 10, 20, 64}) {
+  for (auto layerSize : {10, 64}) {
+    for (auto batchSize : {1, 10, 32}) {
       for (auto normByTimes : {false, true}) {
         for (auto useGpu : {false, true}) {
 #ifdef PADDLE_ONLY_CPU
           if (useGpu) continue;
 #endif
-          LOG(INFO) << " layerSize=" << layerSize << " batchSize=" << batchSize
+          LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
                     << " normByTimes = " << normByTimes << " useGpu=" << useGpu;
 
           FLAGS_use_gpu = useGpu;
@@ -229,13 +226,17 @@ TEST(Layer, WarpCTCLayer) {
           LayerPtr ctcLayer = createCTCLayer(
              "cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer);
 
-          /// Check loss
-          checkError(*(warpctcLayer->getOutput().value),
-                     *(ctcLayer->getOutput().value));
+          /// Check cost
+          LOG(INFO) << "Check cost: "
+                    << checkError(*(warpctcLayer->getOutput().value),
+                                  *(ctcLayer->getOutput().value))
+                    << " different elements.";
 
           /// Check gradients
-          checkError(*(dataLayer0->getOutput().grad),
-                     *(dataLayer1->getOutput().grad));
+          LOG(INFO) << "Check gradients: "
+                    << checkError(*(dataLayer0->getOutput().grad),
+                                  *(dataLayer1->getOutput().grad))
+                    << " different elements";
         }
       }
     }
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index 242fd982aa001..c46c119daeb61 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
+./submodules.sh
 source ./common.sh
 CMAKE_EXTRA=""
 if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
diff --git a/paddle/scripts/travis/submodules.sh b/paddle/scripts/travis/submodules.sh
new file mode 100755
index 0000000000000..47bd8d87ac6a7
--- /dev/null
+++ b/paddle/scripts/travis/submodules.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+set -e
+PROJ_ROOT=$(git rev-parse --show-cdup)
+SUBMODULES=$(grep path ${PROJ_ROOT}.gitmodules | sed 's/^.*path = //')
+
+for module in $SUBMODULES
+do
+  case $module in
+  "warp-ctc")
+    if [ -d ${PROJ_ROOT}warp-ctc/build ]; then
+      rm -rf ${PROJ_ROOT}warp-ctc/build
+    fi
+    mkdir ${PROJ_ROOT}warp-ctc/build
+    cd ${PROJ_ROOT}warp-ctc/build
+    cmake ..; make
+    ;;
+  esac
+done

From 7bb7fed8336232321bcb8dfff002c224ae746cf2 Mon Sep 17 00:00:00 2001
From: Liu Yiqun
Date: Fri, 2 Dec 2016 09:22:51 +0000
Subject: [PATCH 4/4] Simplify the CMakeLists.txt and fix typos.

---
 CMakeLists.txt                                    | 13 ++++---------
 paddle/scripts/travis/build_and_test.sh           |  2 +-
 .../travis/{submodules.sh => build_submodules.sh} |  2 ++
 python/paddle/trainer_config_helpers/layers.py    | 12 ++++++------
 4 files changed, 13 insertions(+), 16 deletions(-)
 rename paddle/scripts/travis/{submodules.sh => build_submodules.sh} (93%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 28375d0cd0607..dfb5159ea1217 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,15 +77,10 @@ find_package(Git REQUIRED)
 include(version)
 add_definitions(-DPADDLE_VERSION=\"${PADDLE_VERSION}\")
-
 if(NOT WITH_GPU)
     add_definitions(-DPADDLE_ONLY_CPU)
     add_definitions(-DHPPL_STUB_FUNC)
-
-    if(WITH_DSO)
-        add_definitions(-DPADDLE_USE_DSO)
-    endif(WITH_DSO)
-
     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
     if(${CUDA_VERSION_MAJOR} GREATER 6)
@@ -107,15 +102,15 @@ else()
         set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
     endif(WITH_AVX)
 
-    if(WITH_DSO)
-        add_definitions(-DPADDLE_USE_DSO)
-    endif(WITH_DSO)
-
     # Include cuda and cudnn
     include_directories(${CUDNN_INCLUDE_DIR})
     include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)
 
+if(WITH_DSO)
+    add_definitions(-DPADDLE_USE_DSO)
+endif(WITH_DSO)
+
 if(WITH_DOUBLE)
     add_definitions(-DPADDLE_TYPE_DOUBLE)
     set(ACCURACY double)
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index c46c119daeb61..9caeb21beb15e 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-./submodules.sh
+./build_submodules.sh
 source ./common.sh
 CMAKE_EXTRA=""
 if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
diff --git a/paddle/scripts/travis/submodules.sh b/paddle/scripts/travis/build_submodules.sh
similarity index 93%
rename from paddle/scripts/travis/submodules.sh
rename to paddle/scripts/travis/build_submodules.sh
index 47bd8d87ac6a7..d458bf92bf455 100755
--- a/paddle/scripts/travis/submodules.sh
+++ b/paddle/scripts/travis/build_submodules.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 set -e
+WORK_DIR=$PWD
 PROJ_ROOT=$(git rev-parse --show-cdup)
 SUBMODULES=$(grep path ${PROJ_ROOT}.gitmodules | sed 's/^.*path = //')
 
@@ -16,3 +17,4 @@ do
     ;;
   esac
 done
+cd $WORK_DIR
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index bf043c3674b61..bec675a8cea34 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1874,7 +1874,7 @@ def img_conv_layer(input,
         param_attr.attr["initial_std"] = init_w
         param_attr.attr["initial_strategy"] = 0
         param_attr.attr["initial_smart"] = False
-    
+
     if layer_type:
         if trans:
             assert layer_type in ["exconvt"]
@@ -4125,11 +4125,11 @@ def warp_ctc_layer(input,
 
     Note:
     - Let num_classes represent the category number. Considering the 'blank'
-      label needed by CTC, you need to use (num_classes + 1) as the input size.
-      Thus, the size of both warp_ctc_layer and 'input' layer should be set to
-      num_classes + 1.
-    - You can set 'blank' to [0, num_classes - 1], which should be consistent
-      as that used in your labels.
+      label needed by CTC, you need to use (num_classes + 1) as the input
+      size. Thus, the size of both warp_ctc_layer and 'input' layer should
+      be set to num_classes + 1.
+    - You can set 'blank' to any value in the range [0, num_classes], which
+      should be consistent with the value used in your labels.
     - As a native 'softmax' activation is integrated into the warp-ctc library,
       'linear' activation is expected instead in the 'input' layer.
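
For readers configuring the layer, the note above maps onto a trainer config roughly as follows. This is an illustrative sketch rather than part of the patch series: the feature width, the 1000-category vocabulary, and the layer names are invented for the example; only the size = num_classes + 1, 'blank', and 'linear'-activation conventions come from the docstring.

    # Hypothetical trainer config sketch (not part of this patch series).
    from paddle.trainer_config_helpers import *

    num_classes = 1000  # invented category count, excluding the CTC 'blank' label

    # Feature and label inputs; the sizes here are illustrative assumptions.
    feature = data_layer(name="feature", size=120)
    label = data_layer(name="label", size=num_classes + 1)

    # The layer feeding warp_ctc_layer keeps a 'linear' activation, since
    # warp-ctc applies its own softmax; its size must be num_classes + 1.
    emission = fc_layer(input=feature,
                        size=num_classes + 1,
                        act=LinearActivation())

    cost = warp_ctc_layer(input=emission,
                          label=label,
                          size=num_classes + 1,
                          blank=num_classes,  # any value in [0, num_classes], matching the labels
                          norm_by_times=False)

With norm_by_times=True, gradients are divided by each sequence's length, matching the normByTimes scaling added to WarpCTCLayer::batch2seqPadding earlier in this series.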