From 2377d719473543da3a6129de3c6c32667bdb9f18 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 21 Aug 2017 09:28:03 +0800 Subject: [PATCH 1/6] Add3DPooling --- paddle/cuda/include/hl_cnn.h | 198 ++++++++- paddle/cuda/include/stub/hl_cnn_stub.h | 90 ++++ paddle/cuda/src/hl_cuda_cnn.cu | 427 ++++++++++++++++++- paddle/gserver/layers/Pool3DLayer.cpp | 198 +++++++++ paddle/gserver/layers/Pool3DLayer.h | 48 +++ paddle/gserver/tests/test_LayerGrad.cpp | 69 ++++ paddle/math/Matrix.cpp | 502 +++++++++++++++++++++++ paddle/math/Matrix.h | 254 +++++++++++- paddle/math/tests/test_matrixCompare.cpp | 204 +++++++++ paddle/parameter/Argument.cpp | 2 + paddle/parameter/Argument.h | 8 +- proto/ModelConfig.proto | 12 + 12 files changed, 1998 insertions(+), 14 deletions(-) create mode 100644 paddle/gserver/layers/Pool3DLayer.cpp create mode 100644 paddle/gserver/layers/Pool3DLayer.h diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index 9f84db72da24b..e9687d0a58d94 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -173,6 +173,202 @@ extern void hl_avgpool_backward(const int frameCnt, real* backGrad, const int outStride); +/** + * @brief Maximum pool forward. + * + * @param[in] frameCnt batch size of input image. + * @param[in] inputData input data. + * @param[in] channels number of channel. + * @param[in] depth image depth. + * @param[in] height image height. + * @param[in] width image width. + * @param[in] pooledD output image depth. + * @param[in] pooledH output image height. + * @param[in] pooledW output image width. + * @param[in] sizeZ depth of pooling window. + * @param[in] sizeY height of pooling window. + * @param[in] sizeX width of pooling window. + * @param[in] strideD pooling stride depth. + * @param[in] strideH pooling stride height. + * @param[in] strideW pooling stride width. + * @param[in] paddingD padding depth. + * @param[in] paddingH padding height. + * @param[in] paddingW padding width. + * @param[out] tgtData output data. + * @param[in] tgtStride stride between output data samples. + * + */ +extern void hl_maxpool3D_forward(const int frameCnt, + const real* inputData, + const int channels, + const int depth, + const int height, + const int width, + const int pooledD, + const int pooledH, + const int pooledW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + const int paddingD, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride); + +/** + * @brief Maximum pool backward. + * + * @param[in] frameCnt batch size of input image. + * @param[in] inputData input data. + * @param[out] outData output data. + * @param[out] outGrad output grad data. + * @param[in] channels number of channel. + * @param[in] depth image depth. + * @param[in] height image height. + * @param[in] width image width. + * @param[in] pooledD output image depth. + * @param[in] pooledH output image height. + * @param[in] pooledW output image width. + * @param[in] sizeZ depth of pooling window. + * @param[in] sizeY height of pooling window. + * @param[in] sizeX width of pooling window. + * @param[in] strideD pooling stride depth. + * @param[in] strideH pooling stride height. + * @param[in] strideW pooling stride width. + * @param[in] scaleA scale. + * @param[in] scaleB scale. + * @param[in] paddingD padding depth. + * @param[in] paddingH padding height. + * @param[in] paddingW padding width. + * @param[out] targetGrad output grad. 
+ * @param[in] outStride stride between output data samples. + * + */ +extern void hl_maxpool3D_backward(const int frameCnt, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int depth, + const int height, + const int width, + const int pooledD, + const int pooledH, + const int pooledW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + const int paddingD, + const int paddingH, + const int paddingW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride); + +/** + * @brief Averge pool forward. + * + * @param[in] frameCnt batch size of input image. + * @param[in] inputData input data. + * @param[in] channels number of channel. + * @param[in] depth image depth. + * @param[in] height image height. + * @param[in] width image width. + * @param[in] pooledD output image depth. + * @param[in] pooledH output image height. + * @param[in] pooledW output image width. + * @param[in] sizeZ depth of pooling window. + * @param[in] sizeY height of pooling window. + * @param[in] sizeX width of pooling window. + * @param[in] strideD pooling stride depth. + * @param[in] strideH pooling stride height. + * @param[in] strideW pooling stride width. + * @param[in] paddingD padding depth. + * @param[in] paddingH padding height. + * @param[in] paddingW padding width. + * @param[out] tgtData output data. + * @param[in] tgtStride stride between output data samples. + * + */ +extern void hl_avgpool3D_forward(const int frameCnt, + const real* inputData, + const int channels, + const int depth, + const int height, + const int width, + const int pooledD, + const int pooledH, + const int pooledW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + const int paddingD, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride); + +/** + * @brief Maximum pool backward. + * + * @param[in] frameCnt batch size of input image. + * @param[in] outGrad output grad data. + * @param[in] channels number of channel. + * @param[in] depth image depth. + * @param[in] height image height. + * @param[in] width image width. + * @param[in] pooledD output image depth. + * @param[in] pooledH output image height. + * @param[in] pooledW output image width. + * @param[in] sizeZ depth of pooling window. + * @param[in] sizeY height of pooling window. + * @param[in] sizeX width of pooling window. + * @param[in] strideD pooling stride depth. + * @param[in] strideH pooling stride height. + * @param[in] strideW pooling stride width. + * @param[in] paddingD padding depth. + * @param[in] paddingH padding height. + * @param[in] paddingW padding width. + * @param[in] scaleA scale. + * @param[in] scaleB scale. + * @param[out] backGrad output grad. + * @param[in] outStride stride between output data samples. + * + */ +extern void hl_avgpool3D_backward(const int frameCnt, + const real* outGrad, + const int channels, + const int depth, + const int height, + const int width, + const int pooledD, + const int pooledH, + const int pooledW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + int paddingD, + int paddingH, + int paddingW, + real scaleA, + real scaleB, + real* backGrad, + const int outStride); + /** * @brief Bilinear interpolation forward. 
* @@ -275,4 +471,4 @@ extern void hl_maxout_backward(real* inGrad, size_t featLen, size_t groups); -#endif /* HL_CNN_H_ */ +#endif // HL_CNN_H_ diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index 2bbb9fa8dfd5e..28f61781be0b9 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -87,6 +87,96 @@ inline void hl_avgpool_backward(const int frameCnt, real* backGrad, const int outStride) {} +inline void hl_maxpool3D_forward(const int frameCnt, + const real* inputData, + const int channels, + const int depth, + const int height, + const int width, + const int pooledD, + const int pooledH, + const int pooledW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + const int paddingD, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) {} + +inline void hl_maxpool3D_backward(const int frameCnt, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int depth, + const int height, + const int width, + const int pooledD, + const int pooledH, + const int pooledW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + const int paddingD, + const int paddingH, + const int paddingW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride) {} + +inline void hl_avgpool3D_forward(const int frameCnt, + const real* inputData, + const int channels, + const int depth, + const int height, + const int width, + const int pooledD, + const int pooledH, + const int pooledW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + const int paddingD, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) {} + +inline void hl_avgpool3D_backward(const int frameCnt, + const real* outGrad, + const int channels, + const int depth, + const int height, + const int width, + const int pooledD, + const int pooledH, + const int pooledW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + int paddingD, + int paddingH, + int paddingW, + real scaleA, + real scaleB, + real* backGrad, + const int outStride) {} + inline void hl_bilinear_forward(const real* inData, const size_t inImgH, const size_t inImgW, diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index aac19b1ea566a..458c347728952 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
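The kernels added in the following hunk assign one CUDA thread per output voxel: the flat thread index is decomposed into (frame, channel, pd, ph, pw) coordinates, the pooling window is shifted by the padding offset and clamped to the real input extent, and the maximum (or the running sum, for average pooling) is taken over that window. Below is a minimal host-side C++ sketch of the same index arithmetic; the struct and function names are illustrative only and are not part of this patch.

#include <algorithm>

// Illustrative sketch of the index math done inline by KeMaxPool3DForward.
struct PoolCoord { int frame, c, pd, ph, pw; };

inline PoolCoord decomposeIndex(int index, int channels,
                                int pooledD, int pooledH, int pooledW) {
  PoolCoord p;
  p.pw = index % pooledW;
  p.ph = (index / pooledW) % pooledH;
  p.pd = (index / pooledW / pooledH) % pooledD;
  p.c = (index / pooledW / pooledH / pooledD) % channels;
  p.frame = index / pooledW / pooledH / pooledD / channels;
  return p;
}

// One axis of the pooling window (the same pattern is applied to depth,
// height and width): shift by the padding, then clamp to the input extent.
inline void poolWindow(int outCoord, int stride, int ksize, int padding,
                       int imgSize, int& start, int& end) {
  start = outCoord * stride - padding;
  end = std::min(start + ksize, imgSize);
  start = std::max(start, 0);
}

Note that in the average-pooling kernels the divisor (pool_size) is computed from the padded window extent before this clamping, as KeAvgPool3DForward does below.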
@@ -353,6 +350,430 @@ void hl_avgpool_backward(const int frameCnt, CHECK_SYNC("hl_avgpool_backward failed"); } +///////////////// +__global__ void KeMaxPool3DForward(const int nthreads, + const real* inputData, + const int channels, + const int depth, + const int height, + const int width, + const int pooledD, + const int pooledH, + const int pooledW, + const int ksizeD, + const int ksizeH, + const int ksizeW, + const int strideD, + const int strideH, + const int strideW, + const int offsetD, + const int offsetH, + const int offsetW, + real* tgtData, + const int tgtStride) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads); + index += blockDim.x * gridDim.x) { + int pw = index % pooledW; + int ph = (index / pooledW) % pooledH; + int pd = (index / pooledW / pooledH) % pooledD; + int c = (index / pooledW / pooledH / pooledD) % channels; + int frameNum = index / pooledW / pooledH / pooledD / channels; + int dstart = pd * strideD - offsetD; + int hstart = ph * strideH - offsetH; + int wstart = pw * strideW - offsetW; + int dend = min(dstart + ksizeD, depth); + int hend = min(hstart + ksizeH, height); + int wend = min(wstart + ksizeW, width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + real maxval = -FLT_MAX; + inputData += (frameNum * channels + c) * depth * height * width; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (maxval < inputData[(d * height + h) * width + w]) + maxval = inputData[(d * height + h) * width + w]; + } + } + } + int tgtIndex = + index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride; + tgtData[tgtIndex] = maxval; + } +} + +void hl_maxpool3D_forward(const int frameCnt, + const real* inputData, + const int channels, + const int depth, + const int height, + const int width, + const int pooledD, + const int pooledH, + const int pooledW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + const int paddingD, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) { + int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt; + int blocks = (num_kernels + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KeMaxPool3DForward<<>>(num_kernels, + inputData, + channels, + depth, + height, + width, + pooledD, + pooledH, + pooledW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + tgtData, + tgtStride); + CHECK_SYNC("hl_maxpool3D_forward failed"); +} + +__global__ void KeMaxPool3DBackward(const int nthreads, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int depth, + const int height, + const int width, + const int pooledD, + const int pooledH, + const int pooledW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + const int padD, + const int padH, + const int padW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads); + index += blockDim.x * gridDim.x) { + // find out the local index + // find out the local offset + int offsetW = index % width + padW; + int offsetH = (index / width) % height + padH; + int offsetD = (index / width / height) % depth + padD; + int offsetC = (index / width / height / depth) % channels; + int frameNum = 
index / width / height / depth / channels; + + int pdstart = (offsetD < sizeZ) ? 0 : (offsetD - sizeZ) / strideD + 1; + int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1; + int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1; + int pdend = min(offsetD / strideD + 1, pooledD); + int phend = min(offsetH / strideH + 1, pooledH); + int pwend = min(offsetW / strideW + 1, pooledW); + + real gradient = 0; + real input = inputData[index]; + + outData += ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW); + outGrad += ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW); + for (int pd = pdstart; pd < pdend; ++pd) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + if (input == outData[(pd * pooledH + ph) * pooledW + pw]) + gradient += outGrad[(pd * pooledH + ph) * pooledW + pw]; + } + } + } + targetGrad[index] = scaleA * gradient + scaleB * targetGrad[index]; + } +} + +void hl_maxpool3D_backward(const int frameCnt, + const real* inputData, + const real* outData, + const real* outGrad, + const int channels, + const int depth, + const int height, + const int width, + const int outputD, + const int outputH, + const int outputW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + const int paddingD, + const int paddingH, + const int paddingW, + real scaleA, + real scaleB, + real* targetGrad, + const int outStride) { + int num_kernels = depth * height * width * channels * frameCnt; + int blocks = (num_kernels + 1024 - 1) / 1024; + + KeMaxPool3DBackward<<>>(num_kernels, + inputData, + outData, + outGrad, + channels, + depth, + height, + width, + outputD, + outputH, + outputW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + scaleA, + scaleB, + targetGrad, + outStride); + CHECK_SYNC("hl_maxpool3D_backward"); +} + +__global__ void KeAvgPool3DForward(const int nthreads, + const real* inputData, + const int channels, + const int depth, + const int height, + const int width, + const int pooledD, + const int pooledH, + const int pooledW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + const int padD, + const int padH, + const int padW, + real* tgtData, + const int tgtStride) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads); + index += blockDim.x * gridDim.x) { + int pw = index % pooledW; + int ph = (index / pooledW) % pooledH; + int pd = (index / pooledW / pooledH) % pooledD; + int c = (index / pooledW / pooledH / pooledD) % channels; + int frameNum = index / pooledW / pooledH / pooledD / channels; + int dstart = pd * strideD - padD; + int hstart = ph * strideH - padH; + int wstart = pw * strideW - padW; + int dend = min(dstart + sizeZ, depth + padD); + int hend = min(hstart + sizeY, height + padH); + int wend = min(wstart + sizeX, width + padW); + int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + dend = min(dend, depth); + hend = min(hend, height); + wend = min(wend, width); + + real aveval = 0; + inputData += (frameNum * channels + c) * depth * height * width; + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += inputData[(d * height + h) * width + w]; + } + } + } + int tgtIndex = + index % (pooledW * pooledH 
* pooledD * channels) + frameNum * tgtStride; + tgtData[tgtIndex] = aveval / pool_size; + } +} + +void hl_avgpool3D_forward(const int frameCnt, + const real* inputData, + const int channels, + const int depth, + const int height, + const int width, + const int pooledD, + const int pooledH, + const int pooledW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + const int paddingD, + const int paddingH, + const int paddingW, + real* tgtData, + const int tgtStride) { + int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt; + int blocks = (num_kernels + 1024 - 1) / 1024; + KeAvgPool3DForward<<>>(num_kernels, + inputData, + channels, + depth, + height, + width, + pooledD, + pooledH, + pooledW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + tgtData, + tgtStride); + CHECK_SYNC("hl_avgpool3D_forward failed"); +} + +__global__ void KeAvgPool3DBackward(const int nthreads, + const real* outGrad, + const int channels, + const int depth, + const int height, + const int width, + const int pooledD, + const int pooledH, + const int pooledW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + const int padD, + const int padH, + const int padW, + real scaleA, + real scaleB, + real* tgtGrad, + const int outStride) { + for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads); + index += blockDim.x * gridDim.x) { + int offsetW = index % width + padW; + int offsetH = (index / width) % height + padH; + int offsetD = (index / width / height) % depth + padD; + int offsetC = (index / width / height / depth) % channels; + int frameNum = index / width / height / depth / channels; + + int pdstart = (offsetD < sizeZ) ? 0 : (offsetD - sizeZ) / strideD + 1; + int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1; + int pwstart = (offsetW < sizeX) ? 
0 : (offsetW - sizeX) / strideW + 1; + int pdend = min(offsetD / strideD + 1, pooledD); + int phend = min(offsetH / strideH + 1, pooledH); + int pwend = min(offsetW / strideW + 1, pooledW); + + real gradient = 0; + outGrad += (frameNum * channels + offsetC) * pooledD * pooledH * pooledW; + + for (int pd = pdstart; pd < pdend; ++pd) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { + // figure out the pooling size + int dstart = pd * strideD - padD; + int hstart = ph * strideH - padH; + int wstart = pw * strideW - padW; + int dend = min(dstart + sizeZ, depth + padD); + int hend = min(hstart + sizeY, height + padH); + int wend = min(wstart + sizeX, width + padW); + int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart); + gradient += outGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize; + } + } + } + tgtGrad[index] = scaleA * gradient + scaleB * tgtGrad[index]; + } +} + +void hl_avgpool3D_backward(const int frameCnt, + const real* outGrad, + const int channels, + const int depth, + const int height, + const int width, + const int outputD, + const int outputH, + const int outputW, + const int sizeZ, + const int sizeY, + const int sizeX, + const int strideD, + const int strideH, + const int strideW, + int paddingD, + int paddingH, + int paddingW, + real scaleA, + real scaleB, + real* backGrad, + const int outStride) { + int num_kernels = depth * height * width * channels * frameCnt; + int blocks = (num_kernels + 1024 - 1) / 1024; + + KeAvgPool3DBackward<<>>(num_kernels, + outGrad, + channels, + depth, + height, + width, + outputD, + outputH, + outputW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + scaleA, + scaleB, + backGrad, + outStride); + CHECK_SYNC("hl_avgpool3D_backward failed"); +} +///////////////// + __global__ void KeBilinearInterpFw(const real* in, const size_t inImgH, const size_t inImgW, diff --git a/paddle/gserver/layers/Pool3DLayer.cpp b/paddle/gserver/layers/Pool3DLayer.cpp new file mode 100644 index 0000000000000..fc6b9bdd2f95d --- /dev/null +++ b/paddle/gserver/layers/Pool3DLayer.cpp @@ -0,0 +1,198 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "Pool3DLayer.h" +#include "PoolProjectionLayer.h" +#include "paddle/utils/Logging.h" + +namespace paddle { + +REGISTER_LAYER(pool3d, Pool3DLayer); + +bool Pool3DLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + Layer::init(layerMap, parameterMap); + + /* the size of inputs for pool-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1); + + const PoolConfig& conf = config_.inputs(0).pool_conf(); + poolType_ = conf.pool_type(); + channels_ = conf.channels(); + + sizeX_ = conf.size_x(); + sizeY_ = conf.size_y(); + sizeZ_ = conf.size_z(); + + strideW_ = conf.stride(); + strideH_ = conf.stride_y(); + strideD_ = conf.stride_z(); + + imgSizeW_ = conf.img_size(); + imgSizeH_ = conf.img_size_y(); + imgSizeD_ = conf.img_size_z(); + + paddingW_ = conf.padding(); + paddingH_ = conf.padding_y(); + paddingD_ = conf.padding_z(); + + outputW_ = conf.output_x(); + outputH_ = conf.output_y(); + outputD_ = conf.output_z(); + + return true; +} + +size_t Pool3DLayer::getSize() { + CHECK_EQ(inputLayers_.size(), 1UL); + + size_t layerSize = 0; + // imgSizeD_ = inputLayers_[0]->getOutput().getFrameDepth(); + // imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); + // imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); + if (imgSizeH_ == 0) { + // imgSizeH_ = imgSizeY_; + } + if (imgSizeW_ == 0) { + // imgSizeW_ = imgSize_; + } + outputD_ = outputSize(imgSizeD_, + sizeZ_, + paddingD_, + strideD_, + /* caffeMode */ false); + outputH_ = outputSize(imgSizeH_, + sizeY_, + paddingH_, + strideH_, + /* caffeMode */ false); + outputW_ = outputSize(imgSizeW_, + sizeX_, + paddingW_, + strideW_, + /* caffeMode */ false); + + layerSize = outputD_ * outputH_ * outputW_ * channels_; + getOutput().setFrameHeight(outputH_); + getOutput().setFrameWidth(outputW_); + getOutput().setFrameDepth(outputD_); + return layerSize; +} + +void Pool3DLayer::forward(PassType passType) { + Layer::forward(passType); + const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); + int batchSize = inMat->getHeight(); + int outWidth = getSize(); + resetOutput(batchSize, outWidth); + const MatrixPtr outMat = getOutputValue(); + + if (poolType_ == "avg") { + outMat->avgPool3DForward(*inMat, + imgSizeD_, + imgSizeH_, + imgSizeW_, + channels_, + sizeZ_, + sizeY_, + sizeX_, + strideD_, + strideH_, + strideW_, + outputD_, + outputH_, + outputW_, + paddingD_, + paddingH_, + paddingW_); + } else if (poolType_ == "max") { + outMat->maxPool3DForward(*inMat, + imgSizeD_, + imgSizeH_, + imgSizeW_, + channels_, + sizeZ_, + sizeY_, + sizeX_, + strideD_, + strideH_, + strideW_, + outputD_, + outputH_, + outputW_, + paddingD_, + paddingH_, + paddingW_); + } else { + LOG(FATAL) << "Unknown pool type: " << poolType_; + } + forwardActivation(); +} + +void Pool3DLayer::backward(const UpdateCallback& callback) { + backwardActivation(); + + (void)callback; + if (NULL == getInputGrad(0)) return; + MatrixPtr inMat = inputLayers_[0]->getOutputValue(); + MatrixPtr inGradMat = inputLayers_[0]->getOutputGrad(); + MatrixPtr outMat = getOutputValue(); + MatrixPtr outGradMat = getOutputGrad(); + + if (poolType_ == "avg") { + inGradMat->avgPool3DBackward(*outGradMat, + imgSizeD_, + imgSizeH_, + imgSizeW_, + sizeZ_, + sizeY_, + sizeZ_, + strideD_, + strideH_, + strideW_, + outputD_, + outputH_, + outputW_, + 1, + 1, + paddingD_, + paddingH_, + paddingW_); + } else if (poolType_ == "max") { + inGradMat->maxPool3DBackward(*inMat, + imgSizeD_, + imgSizeH_, + imgSizeW_, + *outGradMat, + *outMat, + sizeZ_, + sizeY_, + sizeZ_, + 
strideD_, + strideH_, + strideW_, + outputD_, + outputH_, + outputW_, + 1, + 1, + paddingD_, + paddingH_, + paddingW_); + } else { + LOG(FATAL) << "Unknown pool type: " << poolType_; + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/Pool3DLayer.h b/paddle/gserver/layers/Pool3DLayer.h new file mode 100644 index 0000000000000..afc65ac2b0f82 --- /dev/null +++ b/paddle/gserver/layers/Pool3DLayer.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "Layer.h" +#include "paddle/math/MathUtils.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +/** + * @brief Basic parent layer of pooling + * Pools the input within regions + */ +class Pool3DLayer : public Layer { +public: + explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {} + ~Pool3DLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; + size_t getSize(); + +protected: + int channels_; + int sizeX_, sizeY_, sizeZ_; + int strideW_, strideH_, strideD_; + int paddingW_, paddingH_, paddingD_; + int imgSizeW_, imgSizeH_, imgSizeD_; + int outputW_, outputH_, outputD_; + std::string poolType_; +}; +} // namespace paddle diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 0f312b6ca50bc..43fb255ae0b30 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1206,6 +1206,75 @@ TEST(Layer, PoolLayer) { #endif } +void setPool3DConfig(TestConfig* config, + PoolConfig* pool, + const string& poolType) { + // filter size + const int NUM_FILTERS = 16; + const int FILTER_SIZE = 3; + const int FILTER_SIZE_Y = 3; + const int FILTER_SIZE_Z = 3; + const int CHANNELS = 16; + + (*config).biasSize = 0; + (*config).layerConfig.set_type("pool3d"); + (*config).layerConfig.set_num_filters(NUM_FILTERS); + + int kw = FILTER_SIZE, kh = FILTER_SIZE_Y, kd = FILTER_SIZE_Z; + int pw = 0, ph = 0, pd = 0; + int sw = 2, sh = 2, sd = 2; + + pool->set_pool_type(poolType); + pool->set_pool_type("avg"); + pool->set_channels(CHANNELS); + pool->set_size_x(kw); + pool->set_size_y(kh); + pool->set_size_z(kd); + pool->set_padding(0); + pool->set_padding_y(0); + pool->set_padding_z(0); + pool->set_stride(sw); + pool->set_stride_y(sh); + pool->set_stride_z(sd); + pool->set_start(0); + int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false); + int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false); + int od = outputSize(pool->img_size_z(), kd, pd, sd, /* caffeMode */ false); + pool->set_output_x(ow); + pool->set_output_y(oh); + pool->set_output_z(od); +} + +void testPool3DLayer(const string& poolType, bool trans, bool useGpu) { + TestConfig config; + config.inputDefs.push_back({INPUT_DATA, "layer_0", 11664, 0}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + PoolConfig* pool 
= input->mutable_pool_conf(); + + const int IMAGE_SIZE = 9; + const int IMAGE_SIZE_Y = 9; + const int IMAGE_SIZE_Z = 9; + + pool->set_img_size(IMAGE_SIZE); + pool->set_img_size_y(IMAGE_SIZE_Y); + pool->set_img_size_z(IMAGE_SIZE_Z); + + setPool3DConfig(&config, pool, poolType); + config.layerConfig.set_size(pool->output_x() * pool->output_y() * + pool->channels()); + + testLayerGrad(config, "pool3d", 100, trans, useGpu); +} + +TEST(Layer, Pool3DLayer) { + testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false); + testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false); +#ifndef PADDLE_ONLY_CPU + testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true); + testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true); +#endif +} + void testSppLayer(const string& poolType, const int pyramidHeight, bool trans, diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 27f7d95b752d4..e7f1489b8ba4f 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -1190,6 +1190,224 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad, outGrad.getStride()); } +void GpuMatrix::maxPool3DForward(Matrix& inputMat, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; + + real* inputData = inputMat.getData(); + size_t num = inputMat.getHeight(); + size_t width = imgSizeW; + size_t height = imgSizeH; + size_t depth = imgSizeD; + CHECK(depth * height * width * channels == inputMat.getWidth()); + CHECK(height_ == inputMat.getHeight()); + CHECK(width_ == outputD * outputH * outputW * channels); + + hl_maxpool3D_forward(num, + inputData, + channels, + depth, + height, + width, + outputD, + outputH, + outputW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + data_, + getStride()); +} + +void GpuMatrix::maxPool3DBackward(Matrix& inputMat, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true && + outV.useGpu_ == true) + << "Matrix type are not equal"; + + real* inputData = inputMat.getData(); + real* outData = outV.getData(); + real* outDiff = outGrad.getData(); + size_t frameNum = inputMat.getHeight(); + size_t channels = outV.getWidth() / outputD / outputH / outputW; + size_t width = imgSizeW; + size_t height = imgSizeH; + size_t depth = imgSizeD; + CHECK(depth * height * width * channels == inputMat.getWidth()); + CHECK(height_ == inputMat.getHeight()); + CHECK(width_ == depth * width * height * channels); + CHECK(outGrad.getHeight() == outV.getHeight() && + outGrad.getWidth() == outV.getWidth()); + + hl_maxpool3D_backward(frameNum, + inputData, + outData, + outDiff, + channels, + depth, + height, + width, + outputD, + outputH, + outputW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + scaleTargets, + scaleOutput, + data_, + outGrad.getStride()); +} + +void GpuMatrix::avgPool3DForward(Matrix& inputMat, + size_t imgSizeD, + size_t 
imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; + + real* inputData = inputMat.getData(); + size_t frameNum = inputMat.getHeight(); + size_t height = imgSizeH; + size_t width = imgSizeW; + size_t depth = imgSizeD; + CHECK(depth * height * width * channels == inputMat.getWidth()); + CHECK(height_ == inputMat.getHeight()); + CHECK(width_ == outputD * outputH * outputW * channels); + + hl_avgpool3D_forward(frameNum, + inputData, + channels, + depth, + height, + width, + outputD, + outputH, + outputW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + data_, + getStride()); +} + +void GpuMatrix::avgPool3DBackward(Matrix& outGrad, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal"; + + real* outDiff = outGrad.getData(); + size_t frameNum = outGrad.getHeight(); + size_t channels = outGrad.getWidth() / outputD / outputH / outputW; + size_t height = imgSizeH; + size_t width = imgSizeW; + size_t depth = imgSizeD; + CHECK(depth * height * width * channels == width_); + CHECK(height_ == outGrad.getHeight()); + CHECK(outGrad.getWidth() == outputD * outputH * outputW * channels); + + hl_avgpool3D_backward(frameNum, + outDiff, + channels, + depth, + height, + width, + outputD, + outputH, + outputW, + sizeZ, + sizeY, + sizeX, + strideD, + strideH, + strideW, + paddingD, + paddingH, + paddingW, + scaleTargets, + scaleOutput, + data_, + outGrad.getStride()); +} + void GpuMatrix::maxSequenceForward(Matrix& input, const IVector& sequence, IVector& index) { @@ -1930,6 +2148,290 @@ void CpuMatrix::avgPoolBackward(Matrix& input, } } +void CpuMatrix::maxPool3DForward(Matrix& inputMat, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + real* inputData = inputMat.getData(); + real* outData = data_; + size_t num = inputMat.getHeight(); + size_t inWidth = imgSizeW; + size_t inHeight = imgSizeH; + size_t inDepth = imgSizeD; + CHECK(inHeight * inWidth * inDepth == inputMat.getWidth() / channels); + CHECK_EQ(num, this->getHeight()); + CHECK_EQ(channels * outputH * outputW * outputD, this->getWidth()); + size_t outStride = getStride(); + + /* initialize the data_ */ + for (size_t i = 0; i < height_; i++) { + for (size_t j = 0; j < width_; j++) { + outData[(i)*outStride + j] = -(real)FLT_MAX; + } + } + + /* pool max one by one */ + for (size_t n = 0; n < num; ++n) { // frame by frame + if (!isContiguous()) { + outData = data_ + n * outStride; + } + for (size_t c = 0; c < channels; ++c) { // channel by channel + for (size_t pd = 0; pd < outputD; ++pd) { + for (size_t ph = 0; ph < outputH; ++ph) { + for (size_t pw = 0; pw < outputW; ++pw) { + int dstart = pd * strideD - paddingD; + int hstart = ph * strideH - paddingH; + int wstart = pw * 
strideW - paddingW; + int dend = std::min(dstart + sizeZ, inDepth); + int hend = std::min(hstart + sizeY, inHeight); + int wend = std::min(wstart + sizeX, inWidth); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + outData[(pd * outputH + ph) * outputW + pw] = + std::max(outData[(pd * outputH + ph) * outputW + pw], + inputData[(d * inHeight + h) * inWidth + w]); + } + } + } + } + } + } + // compute offset + inputData += inDepth * inHeight * inWidth; + outData += outputD * outputH * outputW; + } + } +} + +void CpuMatrix::maxPool3DBackward(Matrix& image, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + size_t num = image.getHeight(); + size_t channels = size_t(width_ / imgSizeD / imgSizeH / imgSizeW); + CHECK(image.getWidth() == imgSizeD * imgSizeH * imgSizeW * channels); + CHECK(image.getHeight() == height_ && image.getWidth() == width_); + CHECK(outV.getHeight() == outGrad.getHeight() && + outV.getWidth() == outGrad.getWidth()); + + real* tgtGrad = data_; + real* inData = image.getData(); + real* otData = outV.getData(); + real* otGrad = outGrad.getData(); + + size_t outStride = outV.getStride(); + real* origOutData = otData; + real* origOutGrad = otGrad; + + for (size_t n = 0; n < num; ++n) { + if (!outV.isContiguous()) { + otData = origOutData + n * outStride; + otGrad = origOutGrad + n * outStride; + } + for (size_t c = 0; c < channels; ++c) { + for (size_t pd = 0; pd < outputD; ++pd) { + for (size_t ph = 0; ph < outputH; ++ph) { + for (size_t pw = 0; pw < outputW; ++pw) { + int dstart = pd * strideD - paddingD; + int hstart = ph * strideH - paddingH; + int wstart = pw * strideW - paddingW; + int dend = std::min(dstart + sizeZ, imgSizeD); + int hend = std::min(hstart + sizeY, imgSizeH); + int wend = std::min(wstart + sizeX, imgSizeW); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + for (int d = 0; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + tgtGrad[(d * imgSizeH + h) * imgSizeW + w] = + scaleTargets * + tgtGrad[(d * imgSizeH + h) * imgSizeW + w] + + scaleOutput * otGrad[(pd * outputH + ph) * outputW + pw] * + (inData[(d * imgSizeH + h) * imgSizeW + w] == + otData[(pd * outputH + ph) * outputW + pw]); + } + } + } + } + } + } + // offset + inData += imgSizeD * imgSizeH * imgSizeW; + tgtGrad += imgSizeD * imgSizeH * imgSizeW; + otData += outputD * outputH * outputW; + otGrad += outputD * outputH * outputW; + } + } +} + +void CpuMatrix::avgPool3DForward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + // The main loop + size_t num = input.getHeight(); + size_t inDepth = imgSizeD; + size_t inHeight = imgSizeH; + size_t inWidth = imgSizeW; + CHECK(inDepth * inHeight * inWidth * channels == input.getWidth()); + CHECK(outputD * outputH * outputW * channels * num == height_ * 
width_); + real* tgtData = data_; + real* inData = input.getData(); + + for (size_t n = 0; n < num; ++n) { + if (!isContiguous()) { + tgtData = data_ + n * getStride(); + } + for (size_t c = 0; c < channels; ++c) { + for (size_t pd = 0; pd < outputD; ++pd) { + for (size_t ph = 0; ph < outputH; ++ph) { + for (size_t pw = 0; pw < outputW; ++pw) { + int dstart = pd * strideD - paddingD; + int hstart = ph * strideH - paddingH; + int wstart = pw * strideW - paddingW; + int dend = std::min(dstart + sizeZ, inDepth + paddingD); + int hend = std::min(hstart + sizeY, inHeight + paddingH); + int wend = std::min(wstart + sizeX, inWidth + paddingW); + int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, static_cast(inDepth)); + hend = std::min(hend, static_cast(inHeight)); + wend = std::min(wend, static_cast(inWidth)); + + CHECK(poolSize); + tgtData[(pd * outputH + ph) * outputW + pw] = 0; // clear + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + tgtData[(pd * outputH + ph) * outputW + pw] += + inData[(d * inHeight + h) * inWidth + w]; + } + } + } + tgtData[(pd * outputH + ph) * outputW + pw] /= poolSize; + } + } + } + // compute offset + inData += inDepth * inHeight * inWidth; + tgtData += outputD * outputH * outputW; + } + } +} + +void CpuMatrix::avgPool3DBackward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + size_t num = input.getHeight(); + size_t channels = input.getWidth() / outputD / outputH / outputW; + CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth()); + real* inData = input.getData(); + real* outData = getData(); + + for (size_t n = 0; n < num; ++n) { + if (!input.isContiguous()) { + inData = input.getData() + n * input.getStride(); + } + for (size_t c = 0; c < channels; ++c) { + for (size_t pd = 0; pd < outputD; ++pd) { + for (size_t ph = 0; ph < outputH; ++ph) { + for (size_t pw = 0; pw < outputW; ++pw) { + int dstart = pd * strideD - paddingD; + int hstart = ph * strideH - paddingH; + int wstart = pw * strideW - paddingW; + int dend = std::min(dstart + sizeZ, imgSizeD + paddingD); + int hend = std::min(hstart + sizeY, imgSizeH + paddingH); + int wend = std::min(wstart + sizeX, imgSizeW + paddingW); + int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart); + dstart = std::max(dstart, 0); + hstart = std::max(hstart, 0); + wstart = std::max(wstart, 0); + dend = std::min(dend, static_cast(imgSizeD)); + hend = std::min(hend, static_cast(imgSizeH)); + wend = std::min(wend, static_cast(imgSizeW)); + CHECK(poolSize); + for (int d = dstart; d < dend; ++d) { + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + outData[(d * imgSizeH + h) * imgSizeW + w] += + inData[(pd * outputH + ph) * outputW + pw] / poolSize; + } + } + } + } + } + } + // offset + outData += imgSizeD * imgSizeH * imgSizeW; + inData += outputD * outputH * outputW; + } + } +} + /** * Input: one or more sequences. Each sequence contains some instances. * Output: output size is the number of input sequences (NOT input instances). 
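The output extents (outputD, outputH, outputW) passed into the routines above are computed per axis by outputSize() from paddle/math/MathUtils.h; Pool3DLayer::getSize() and both test files in this patch call it explicitly. A sketch of that relation, together with the numbers it yields for the shapes the tests use, follows; the function body here is a paraphrase for illustration, not code taken from this patch.

// Paraphrased sketch of the per-axis output-size relation (applied to D, H, W).
// caffeMode = true floors the division; caffeMode = false rounds up.
int outputSizeSketch(int imageSize, int filterSize, int padding, int stride,
                     bool caffeMode) {
  if (caffeMode) {
    return (imageSize - filterSize + 2 * padding) / stride + 1;
  }
  return (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1;
}

// Shapes used by the tests in this patch:
//   test_LayerGrad (setPool3DConfig): img 9, filter 3, pad 0, stride 2,
//     caffeMode = false  ->  (9 - 3 + 0 + 1) / 2 + 1 = 4
//   test_matrixCompare (testMatrixPool3D): depth 9, filterZ 5, padZ 1,
//     strideZ 2, caffeMode = true  ->  (9 - 5 + 2) / 2 + 1 = 4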
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index bb802bbb2c752..f1534c5ea0db3 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -928,15 +928,102 @@ class Matrix : public BaseMatrix { size_t paddingW) { LOG(FATAL) << "Not implemeted"; } - /** - * Input: one or more sequences. Each sequence contains some instances. - * - * Output: output size is the number of input sequences (NOT input - * instances). - * - * output[i] is set to max_input[i]. + * Pooling 3D forward operation, pick out the largest element + * in the sizeX of value */ + virtual void maxPool3DForward(Matrix& inputMat, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + LOG(FATAL) << "Not implemeted"; + } + + virtual void maxPool3DBackward(Matrix& image, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + LOG(FATAL) << "Not implemeted"; + } + + virtual void avgPool3DForward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + LOG(FATAL) << "Not implemeted"; + } + + virtual void avgPool3DBackward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingD, + size_t paddingH, + size_t paddingW) { + LOG(FATAL) << "Not implemeted"; + } + + /** + * Input: one or more sequences. Each sequence contains some instances. + * + * Output: output size is the number of input sequences (NOT input + * instances). + * + * output[i] is set to max_input[i]. 
+ */ virtual void maxSequenceForward(Matrix& input, const IVector& sequence, IVector& index) { @@ -1348,6 +1435,83 @@ class GpuMatrix : public Matrix { size_t paddingH, size_t paddingW); + ///////////////////////// + void maxPool3DForward(Matrix& inputMat, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t paddingD, + size_t paddingH, + size_t paddingW); + + void maxPool3DBackward(Matrix& image, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingD, + size_t paddingH, + size_t paddingW); + + void avgPool3DForward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t paddingD, + size_t paddingH, + size_t paddingW); + + void avgPool3DBackward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingD, + size_t paddingH, + size_t paddingW); + void maxSequenceForward(Matrix& input, const IVector& sequence, IVector& index); @@ -1506,6 +1670,82 @@ class CpuMatrix : public Matrix { real scaleOutput, size_t paddingH, size_t paddingW); + ////////////////////// + void maxPool3DForward(Matrix& inputMat, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t paddingD, + size_t paddingH, + size_t paddingW); + + void maxPool3DBackward(Matrix& image, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingD, + size_t paddingH, + size_t paddingW); + + void avgPool3DForward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + size_t paddingD, + size_t paddingH, + size_t paddingW); + + void avgPool3DBackward(Matrix& input, + size_t imgSizeD, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeZ, + size_t sizeY, + size_t sizeX, + size_t strideD, + size_t strideH, + size_t strideW, + size_t outputD, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingD, + size_t paddingH, + size_t paddingW); void maxSequenceForward(Matrix& input, const IVector& sequence, diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index d77478f345df9..7a961d2751f1d 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -18,6 
+18,7 @@ limitations under the License. */ #include #include "TensorCheck.h" +#include "paddle/math/MathUtils.h" #include "paddle/math/Matrix.h" #include "paddle/math/SparseMatrix.h" #include "paddle/testing/TestUtil.h" @@ -1203,4 +1204,207 @@ TEST(Matrix, warpCTC) { } } +///// +void testMatrixPool3D(int depth, int height, int width) { + int channel = 3; + int filterX = 3, filterY = 4, filterZ = 5; + int strideX = 2, strideY = 2, strideZ = 2; + int padX = 1, padY = 1, padZ = 1; + + MatrixPtr cpuImage = + std::make_shared(1, channel * depth * height * width); + MatrixPtr gpuImage = + std::make_shared(1, channel * depth * height * width); + + int outD = outputSize(depth, filterZ, padZ, strideZ, true); + int outH = outputSize(height, filterY, padZ, strideY, true); + int outW = outputSize(width, filterX, padZ, strideX, true); + + int colBufWidth = outD * outH * outW; + MatrixPtr cpuOutput = std::make_shared(1, channel * colBufWidth); + MatrixPtr gpuOutput = std::make_shared(1, channel * colBufWidth); + + cpuImage->randomizeUniform(); + gpuImage->copyFrom(*cpuImage); + // std::cout << "test maxPool3DForward...\n"; + cpuOutput->maxPool3DForward(*cpuImage, + depth, + height, + width, + channel, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + outD, + outH, + outW, + padZ, + padY, + padX); + gpuOutput->maxPool3DForward(*gpuImage, + depth, + height, + width, + channel, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + outD, + outH, + outW, + padZ, + padY, + padX); + TensorCheckErr(*cpuOutput, *gpuOutput); + + cpuImage->randomizeUniform(); + gpuImage->copyFrom(*cpuImage); + // std::cout << "test avgPool3DForward...\n"; + cpuOutput->avgPool3DForward(*cpuImage, + depth, + height, + width, + channel, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + outD, + outH, + outW, + padZ, + padY, + padX); + + gpuOutput->avgPool3DForward(*gpuImage, + depth, + height, + width, + channel, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + outD, + outH, + outW, + padZ, + padY, + padX); + TensorCheckErr(*cpuOutput, *gpuOutput); + cpuImage->randomizeUniform(); + gpuImage->copyFrom(*cpuImage); + cpuOutput->randomizeUniform(); + gpuOutput->copyFrom(*cpuOutput); + // std::cout << "test avgPool3DBackward...\n"; + cpuImage->avgPool3DBackward(*cpuOutput, + depth, + height, + width, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + outD, + outH, + outW, + 1, + 1, + padZ, + padY, + padX); + + gpuImage->avgPool3DBackward(*gpuOutput, + depth, + height, + width, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + outD, + outH, + outW, + 1, + 1, + padZ, + padY, + padX); + TensorCheckErr(*cpuImage, *gpuImage); + + cpuImage->randomizeUniform(); + gpuImage->copyFrom(*cpuImage); + cpuOutput->randomizeUniform(); + gpuOutput->copyFrom(*cpuOutput); + // std::cout << "test maxPool3DBackward...\n"; + cpuImage->maxPool3DBackward(*cpuImage, + depth, + height, + width, + *cpuOutput, + *cpuOutput, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + outD, + outH, + outW, + 1, + 1, + padZ, + padY, + padX); + + gpuImage->maxPool3DBackward(*gpuImage, + depth, + height, + width, + *gpuOutput, + *gpuOutput, + filterZ, + filterY, + filterX, + strideZ, + strideY, + strideX, + outD, + outH, + outW, + 1, + 1, + padZ, + padY, + padX); + TensorCheckErr(*cpuImage, *gpuImage); +} + +TEST(Matrix, Pool3D) { + for (auto depth : {9, 16, 64, 128}) { + for (auto height : {9, 11, 128, 256}) { + for (auto width : {9, 32, 128}) { + VLOG(3) << 
"depth=" << depth << " height=" << height + << " width=" << width; + testMatrixPool3D(depth, height, width); + } + } + } +} + #endif diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 0547ac93cd183..77fd0c5890b45 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -186,6 +186,7 @@ void Argument::resizeAndCopyFrom(const Argument& src, resizeAndCopy(strs, src.strs, useGpu, stream); frameWidth = src.frameWidth; frameHeight = src.frameHeight; + frameDepth = src.frameDepth; } int32_t Argument::resizeAndCopyFrom(const Argument& src, @@ -206,6 +207,7 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src, dataId = src.dataId; frameWidth = src.frameWidth; frameHeight = src.frameHeight; + frameDepth = src.frameDepth; if (!src.sequenceStartPositions) { // non-sequence input, copy samples directly diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index d8d7a4398f99a..ba3ad2fd4d992 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -35,6 +32,7 @@ struct Argument { strs(nullptr), frameHeight(0), frameWidth(0), + frameDepth(0), sequenceStartPositions(nullptr), subSequenceStartPositions(nullptr), cpuSequenceDims(nullptr), @@ -64,6 +62,7 @@ struct Argument { allCount = argument.allCount; frameHeight = argument.frameHeight; frameWidth = argument.frameWidth; + frameDepth = argument.frameDepth; dataId = argument.dataId; } @@ -76,6 +75,7 @@ struct Argument { // A dataBatch includes batchSize frames, one frame maybe not only vector size_t frameHeight; size_t frameWidth; + size_t frameDepth; // If NULL, each position is treated independently. // Otherwise, its size should be #NumberOfSequences + 1. @@ -136,8 +136,10 @@ struct Argument { } size_t getFrameHeight() const { return frameHeight; } size_t getFrameWidth() const { return frameWidth; } + size_t getFrameDepth() const { return frameDepth; } void setFrameHeight(size_t h) { frameHeight = h; } void setFrameWidth(size_t w) { frameWidth = w; } + void setFrameDepth(size_t d) { frameDepth = d; } int64_t getNumSequences() const { return sequenceStartPositions ? 
sequenceStartPositions->getSize() - 1 diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 4f3d5bf3f6cb9..42cf10e9d3f91 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -82,6 +82,12 @@ message ConvConfig { // if not set, use img_size optional uint32 img_size_y = 14; + + optional uint32 filter_size_z = 15 [ default = 1 ]; + optional uint32 padding_z = 16 [ default = 1 ]; + optional uint32 stride_z = 17 [ default = 1 ]; + optional uint32 output_z = 18 [ default = 1 ]; + optional uint32 img_size_z = 19 [ default = 1 ]; } message PoolConfig { @@ -124,6 +130,12 @@ message PoolConfig { // if not set, use padding optional uint32 padding_y = 13; + + optional uint32 size_z = 14 [ default = 1 ]; + optional uint32 stride_z = 15 [ default = 1 ]; + optional uint32 output_z = 16 [ default = 1 ]; + optional uint32 img_size_z = 17 [ default = 1 ]; + optional uint32 padding_z = 18 [ default = 1 ]; } message SppConfig { From 790379f1477835badbc35c563623d13ec5fd2b7a Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 23 Aug 2017 14:11:30 +0800 Subject: [PATCH 2/6] fix above comments --- paddle/cuda/include/hl_cnn.h | 106 ------------------ paddle/cuda/include/stub/hl_cnn_stub.h | 6 +- paddle/gserver/layers/Pool3DLayer.cpp | 71 +++++------- paddle/gserver/layers/Pool3DLayer.h | 1 + paddle/math/Matrix.cpp | 131 +++++++++++----------- paddle/math/Matrix.h | 135 +++++++++++------------ paddle/math/tests/test_matrixCompare.cpp | 97 ++++++++-------- 7 files changed, 208 insertions(+), 339 deletions(-) diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index e9687d0a58d94..84f1c843596d4 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -173,31 +173,6 @@ extern void hl_avgpool_backward(const int frameCnt, real* backGrad, const int outStride); -/** - * @brief Maximum pool forward. - * - * @param[in] frameCnt batch size of input image. - * @param[in] inputData input data. - * @param[in] channels number of channel. - * @param[in] depth image depth. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] pooledD output image depth. - * @param[in] pooledH output image height. - * @param[in] pooledW output image width. - * @param[in] sizeZ depth of pooling window. - * @param[in] sizeY height of pooling window. - * @param[in] sizeX width of pooling window. - * @param[in] strideD pooling stride depth. - * @param[in] strideH pooling stride height. - * @param[in] strideW pooling stride width. - * @param[in] paddingD padding depth. - * @param[in] paddingH padding height. - * @param[in] paddingW padding width. - * @param[out] tgtData output data. - * @param[in] tgtStride stride between output data samples. - * - */ extern void hl_maxpool3D_forward(const int frameCnt, const real* inputData, const int channels, @@ -219,35 +194,6 @@ extern void hl_maxpool3D_forward(const int frameCnt, real* tgtData, const int tgtStride); -/** - * @brief Maximum pool backward. - * - * @param[in] frameCnt batch size of input image. - * @param[in] inputData input data. - * @param[out] outData output data. - * @param[out] outGrad output grad data. - * @param[in] channels number of channel. - * @param[in] depth image depth. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] pooledD output image depth. - * @param[in] pooledH output image height. - * @param[in] pooledW output image width. - * @param[in] sizeZ depth of pooling window. - * @param[in] sizeY height of pooling window. 
- * @param[in] sizeX width of pooling window. - * @param[in] strideD pooling stride depth. - * @param[in] strideH pooling stride height. - * @param[in] strideW pooling stride width. - * @param[in] scaleA scale. - * @param[in] scaleB scale. - * @param[in] paddingD padding depth. - * @param[in] paddingH padding height. - * @param[in] paddingW padding width. - * @param[out] targetGrad output grad. - * @param[in] outStride stride between output data samples. - * - */ extern void hl_maxpool3D_backward(const int frameCnt, const real* inputData, const real* outData, @@ -273,31 +219,6 @@ extern void hl_maxpool3D_backward(const int frameCnt, real* targetGrad, const int outStride); -/** - * @brief Averge pool forward. - * - * @param[in] frameCnt batch size of input image. - * @param[in] inputData input data. - * @param[in] channels number of channel. - * @param[in] depth image depth. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] pooledD output image depth. - * @param[in] pooledH output image height. - * @param[in] pooledW output image width. - * @param[in] sizeZ depth of pooling window. - * @param[in] sizeY height of pooling window. - * @param[in] sizeX width of pooling window. - * @param[in] strideD pooling stride depth. - * @param[in] strideH pooling stride height. - * @param[in] strideW pooling stride width. - * @param[in] paddingD padding depth. - * @param[in] paddingH padding height. - * @param[in] paddingW padding width. - * @param[out] tgtData output data. - * @param[in] tgtStride stride between output data samples. - * - */ extern void hl_avgpool3D_forward(const int frameCnt, const real* inputData, const int channels, @@ -319,33 +240,6 @@ extern void hl_avgpool3D_forward(const int frameCnt, real* tgtData, const int tgtStride); -/** - * @brief Maximum pool backward. - * - * @param[in] frameCnt batch size of input image. - * @param[in] outGrad output grad data. - * @param[in] channels number of channel. - * @param[in] depth image depth. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] pooledD output image depth. - * @param[in] pooledH output image height. - * @param[in] pooledW output image width. - * @param[in] sizeZ depth of pooling window. - * @param[in] sizeY height of pooling window. - * @param[in] sizeX width of pooling window. - * @param[in] strideD pooling stride depth. - * @param[in] strideH pooling stride height. - * @param[in] strideW pooling stride width. - * @param[in] paddingD padding depth. - * @param[in] paddingH padding height. - * @param[in] paddingW padding width. - * @param[in] scaleA scale. - * @param[in] scaleB scale. - * @param[out] backGrad output grad. - * @param[in] outStride stride between output data samples. 
- * - */ extern void hl_avgpool3D_backward(const int frameCnt, const real* outGrad, const int channels, diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index 28f61781be0b9..6750f537bfbf6 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -169,9 +169,9 @@ inline void hl_avgpool3D_backward(const int frameCnt, const int strideD, const int strideH, const int strideW, - int paddingD, - int paddingH, - int paddingW, + const int paddingD, + const int paddingH, + const int paddingW, real scaleA, real scaleB, real* backGrad, diff --git a/paddle/gserver/layers/Pool3DLayer.cpp b/paddle/gserver/layers/Pool3DLayer.cpp index fc6b9bdd2f95d..40a913ebfc668 100644 --- a/paddle/gserver/layers/Pool3DLayer.cpp +++ b/paddle/gserver/layers/Pool3DLayer.cpp @@ -58,30 +58,9 @@ size_t Pool3DLayer::getSize() { CHECK_EQ(inputLayers_.size(), 1UL); size_t layerSize = 0; - // imgSizeD_ = inputLayers_[0]->getOutput().getFrameDepth(); - // imgSizeH_ = inputLayers_[0]->getOutput().getFrameHeight(); - // imgSizeW_ = inputLayers_[0]->getOutput().getFrameWidth(); - if (imgSizeH_ == 0) { - // imgSizeH_ = imgSizeY_; - } - if (imgSizeW_ == 0) { - // imgSizeW_ = imgSize_; - } - outputD_ = outputSize(imgSizeD_, - sizeZ_, - paddingD_, - strideD_, - /* caffeMode */ false); - outputH_ = outputSize(imgSizeH_, - sizeY_, - paddingH_, - strideH_, - /* caffeMode */ false); - outputW_ = outputSize(imgSizeW_, - sizeX_, - paddingW_, - strideW_, - /* caffeMode */ false); + outputD_ = outputSize(imgSizeD_, sizeZ_, paddingD_, strideD_, false); + outputH_ = outputSize(imgSizeH_, sizeY_, paddingH_, strideH_, false); + outputW_ = outputSize(imgSizeW_, sizeX_, paddingW_, strideW_, false); layerSize = outputD_ * outputH_ * outputW_ * channels_; getOutput().setFrameHeight(outputH_); @@ -100,37 +79,37 @@ void Pool3DLayer::forward(PassType passType) { if (poolType_ == "avg") { outMat->avgPool3DForward(*inMat, + channels_, imgSizeD_, imgSizeH_, imgSizeW_, - channels_, + outputD_, + outputH_, + outputW_, sizeZ_, sizeY_, sizeX_, strideD_, strideH_, strideW_, - outputD_, - outputH_, - outputW_, paddingD_, paddingH_, paddingW_); } else if (poolType_ == "max") { outMat->maxPool3DForward(*inMat, + channels_, imgSizeD_, imgSizeH_, imgSizeW_, - channels_, + outputD_, + outputH_, + outputW_, sizeZ_, sizeY_, sizeX_, strideD_, strideH_, strideW_, - outputD_, - outputH_, - outputW_, paddingD_, paddingH_, paddingW_); @@ -155,41 +134,41 @@ void Pool3DLayer::backward(const UpdateCallback& callback) { imgSizeD_, imgSizeH_, imgSizeW_, + outputD_, + outputH_, + outputW_, sizeZ_, sizeY_, sizeZ_, strideD_, strideH_, strideW_, - outputD_, - outputH_, - outputW_, - 1, - 1, paddingD_, paddingH_, - paddingW_); + paddingW_, + 1.0, + 1.0); } else if (poolType_ == "max") { inGradMat->maxPool3DBackward(*inMat, + *outGradMat, + *outMat, imgSizeD_, imgSizeH_, imgSizeW_, - *outGradMat, - *outMat, + outputD_, + outputH_, + outputW_, sizeZ_, sizeY_, sizeZ_, strideD_, strideH_, strideW_, - outputD_, - outputH_, - outputW_, - 1, - 1, paddingD_, paddingH_, - paddingW_); + paddingW_, + 1.0, + 1.0); } else { LOG(FATAL) << "Unknown pool type: " << poolType_; } diff --git a/paddle/gserver/layers/Pool3DLayer.h b/paddle/gserver/layers/Pool3DLayer.h index afc65ac2b0f82..8329a02f571bf 100644 --- a/paddle/gserver/layers/Pool3DLayer.h +++ b/paddle/gserver/layers/Pool3DLayer.h @@ -44,5 +44,6 @@ class Pool3DLayer : public Layer { int imgSizeW_, imgSizeH_, imgSizeD_; int outputW_, outputH_, outputD_; 
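+  /// For max pooling, maxPoolIdx_ (declared below) caches the index of the
+  /// max element inside each pooling window during the forward pass, so that
+  /// the backward pass can route gradients to the right input positions.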
std::string poolType_; + MatrixPtr maxPoolIdx_; }; } // namespace paddle diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index e7f1489b8ba4f..4f9216896c0eb 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -1191,23 +1191,23 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad, } void GpuMatrix::maxPool3DForward(Matrix& inputMat, + size_t channels, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - size_t channels, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, size_t paddingD, size_t paddingH, size_t paddingW) { - CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; + CHECK(inputMat.useGpu_) << "Matrix type are not correct"; real* inputData = inputMat.getData(); size_t num = inputMat.getHeight(); @@ -1236,32 +1236,31 @@ void GpuMatrix::maxPool3DForward(Matrix& inputMat, paddingD, paddingH, paddingW, - data_, + getData(), getStride()); } void GpuMatrix::maxPool3DBackward(Matrix& inputMat, + Matrix& outGrad, + Matrix& outV, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, size_t paddingD, size_t paddingH, - size_t paddingW) { - CHECK(inputMat.useGpu_ == true && outGrad.useGpu_ == true && - outV.useGpu_ == true) + size_t paddingW, + real scaleTargets, + real scaleOutput) { + CHECK(inputMat.useGpu_ && outGrad.useGpu_ && outV.useGpu_) << "Matrix type are not equal"; real* inputData = inputMat.getData(); @@ -1300,28 +1299,28 @@ void GpuMatrix::maxPool3DBackward(Matrix& inputMat, paddingW, scaleTargets, scaleOutput, - data_, + getData(), outGrad.getStride()); } void GpuMatrix::avgPool3DForward(Matrix& inputMat, + size_t channels, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - size_t channels, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, size_t paddingD, size_t paddingH, size_t paddingW) { - CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; + CHECK(inputMat.useGpu_) << "Matrix type are not equal"; real* inputData = inputMat.getData(); size_t frameNum = inputMat.getHeight(); @@ -1350,7 +1349,7 @@ void GpuMatrix::avgPool3DForward(Matrix& inputMat, paddingD, paddingH, paddingW, - data_, + getData(), getStride()); } @@ -1358,21 +1357,21 @@ void GpuMatrix::avgPool3DBackward(Matrix& outGrad, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, size_t paddingD, size_t paddingH, - size_t paddingW) { - CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal"; + size_t paddingW, + real scaleTargets, + real scaleOutput) { + CHECK(outGrad.useGpu_) << "Matrix type are not equal"; real* outDiff = outGrad.getData(); size_t frameNum = outGrad.getHeight(); @@ -1404,7 +1403,7 @@ void GpuMatrix::avgPool3DBackward(Matrix& outGrad, paddingW, scaleTargets, scaleOutput, - data_, + getData(), outGrad.getStride()); } @@ -2149,24 +2148,24 @@ void 
CpuMatrix::avgPoolBackward(Matrix& input, } void CpuMatrix::maxPool3DForward(Matrix& inputMat, + size_t channels, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - size_t channels, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, size_t paddingD, size_t paddingH, size_t paddingW) { real* inputData = inputMat.getData(); - real* outData = data_; + real* outData = getData(); size_t num = inputMat.getHeight(); size_t inWidth = imgSizeW; size_t inHeight = imgSizeH; @@ -2186,7 +2185,7 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat, /* pool max one by one */ for (size_t n = 0; n < num; ++n) { // frame by frame if (!isContiguous()) { - outData = data_ + n * outStride; + outData = getData() + n * outStride; } for (size_t c = 0; c < channels; ++c) { // channel by channel for (size_t pd = 0; pd < outputD; ++pd) { @@ -2201,15 +2200,18 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat, dstart = std::max(dstart, 0); hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + real maxOutData = outData[(pd * outputH + ph) * outputW + pw]; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - outData[(pd * outputH + ph) * outputW + pw] = - std::max(outData[(pd * outputH + ph) * outputW + pw], - inputData[(d * inHeight + h) * inWidth + w]); + if (maxOutData < + inputData[(d * inHeight + h) * inWidth + w]) { + maxOutData = inputData[(d * inHeight + h) * inWidth + w]; + } } } } + outData[(pd * outputH + ph) * outputW + pw] = maxOutData; } } } @@ -2221,25 +2223,25 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat, } void CpuMatrix::maxPool3DBackward(Matrix& image, + Matrix& outGrad, + Matrix& outV, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, size_t paddingD, size_t paddingH, - size_t paddingW) { + size_t paddingW, + real scaleTargets, + real scaleOutput) { size_t num = image.getHeight(); size_t channels = size_t(width_ / imgSizeD / imgSizeH / imgSizeW); CHECK(image.getWidth() == imgSizeD * imgSizeH * imgSizeW * channels); @@ -2247,19 +2249,18 @@ void CpuMatrix::maxPool3DBackward(Matrix& image, CHECK(outV.getHeight() == outGrad.getHeight() && outV.getWidth() == outGrad.getWidth()); - real* tgtGrad = data_; + real* tgtGrad = getData(); real* inData = image.getData(); real* otData = outV.getData(); real* otGrad = outGrad.getData(); size_t outStride = outV.getStride(); - real* origOutData = otData; - real* origOutGrad = otGrad; + ; for (size_t n = 0; n < num; ++n) { if (!outV.isContiguous()) { - otData = origOutData + n * outStride; - otGrad = origOutGrad + n * outStride; + otData = outV.getData() + n * outStride; + otGrad = outGrad.getData() + n * outStride; } for (size_t c = 0; c < channels; ++c) { for (size_t pd = 0; pd < outputD; ++pd) { @@ -2274,7 +2275,7 @@ void CpuMatrix::maxPool3DBackward(Matrix& image, dstart = std::max(dstart, 0); hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); - for (int d = 0; d < dend; ++d) { + for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { tgtGrad[(d * imgSizeH + h) * imgSizeW + w] = @@ -2299,19 +2300,19 @@ 
void CpuMatrix::maxPool3DBackward(Matrix& image, } void CpuMatrix::avgPool3DForward(Matrix& input, + size_t channels, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - size_t channels, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, size_t paddingD, size_t paddingH, size_t paddingW) { @@ -2322,7 +2323,7 @@ void CpuMatrix::avgPool3DForward(Matrix& input, size_t inWidth = imgSizeW; CHECK(inDepth * inHeight * inWidth * channels == input.getWidth()); CHECK(outputD * outputH * outputW * channels * num == height_ * width_); - real* tgtData = data_; + real* tgtData = getData(); real* inData = input.getData(); for (size_t n = 0; n < num; ++n) { @@ -2372,20 +2373,20 @@ void CpuMatrix::avgPool3DBackward(Matrix& input, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, size_t paddingD, size_t paddingH, - size_t paddingW) { + size_t paddingW, + real scaleTargets, + real scaleOutput) { size_t num = input.getHeight(); size_t channels = input.getWidth() / outputD / outputH / outputW; CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth()); diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index f1534c5ea0db3..dec9702433cdc 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -933,19 +933,19 @@ class Matrix : public BaseMatrix { * in the sizeX of value */ virtual void maxPool3DForward(Matrix& inputMat, + size_t channels, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - size_t channels, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, size_t paddingD, size_t paddingH, size_t paddingW) { @@ -953,42 +953,42 @@ class Matrix : public BaseMatrix { } virtual void maxPool3DBackward(Matrix& image, + Matrix& outGrad, + Matrix& outV, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, size_t paddingD, size_t paddingH, - size_t paddingW) { + size_t paddingW, + real scaleTargets, + real scaleOutput) { LOG(FATAL) << "Not implemeted"; } virtual void avgPool3DForward(Matrix& input, + size_t channels, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - size_t channels, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, size_t paddingD, size_t paddingH, size_t paddingW) { @@ -999,20 +999,20 @@ class Matrix : public BaseMatrix { size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, size_t paddingD, size_t paddingH, - size_t paddingW) { + size_t paddingW, + real scaleTargets, + real scaleOutput) { LOG(FATAL) << "Not 
implemeted"; } @@ -1435,60 +1435,59 @@ class GpuMatrix : public Matrix { size_t paddingH, size_t paddingW); - ///////////////////////// void maxPool3DForward(Matrix& inputMat, + size_t channels, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - size_t channels, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, size_t paddingD, size_t paddingH, size_t paddingW); void maxPool3DBackward(Matrix& image, + Matrix& outGrad, + Matrix& outV, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, size_t paddingD, size_t paddingH, - size_t paddingW); + size_t paddingW, + real scaleTargets, + real scaleOutput); void avgPool3DForward(Matrix& input, + size_t channels, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - size_t channels, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, size_t paddingD, size_t paddingH, size_t paddingW); @@ -1497,20 +1496,20 @@ class GpuMatrix : public Matrix { size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, size_t paddingD, size_t paddingH, - size_t paddingW); + size_t paddingW, + real scaleTargets, + real scaleOutput); void maxSequenceForward(Matrix& input, const IVector& sequence, @@ -1670,60 +1669,60 @@ class CpuMatrix : public Matrix { real scaleOutput, size_t paddingH, size_t paddingW); - ////////////////////// + void maxPool3DForward(Matrix& inputMat, + size_t channels, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - size_t channels, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, size_t paddingD, size_t paddingH, size_t paddingW); void maxPool3DBackward(Matrix& image, + Matrix& outGrad, + Matrix& outV, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, size_t paddingD, size_t paddingH, - size_t paddingW); + size_t paddingW, + real scaleTargets, + real scaleOutput); void avgPool3DForward(Matrix& input, + size_t channels, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, - size_t channels, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, size_t paddingD, size_t paddingH, size_t paddingW); @@ -1732,20 +1731,20 @@ class CpuMatrix : public Matrix { size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, + size_t outputD, + size_t outputH, + size_t outputW, size_t sizeZ, size_t sizeY, size_t sizeX, size_t 
strideD, size_t strideH, size_t strideW, - size_t outputD, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, size_t paddingD, size_t paddingH, - size_t paddingW); + size_t paddingW, + real scaleTargets, + real scaleOutput); void maxSequenceForward(Matrix& input, const IVector& sequence, diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 7a961d2751f1d..21ee8543cd2ea 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -1204,7 +1204,6 @@ TEST(Matrix, warpCTC) { } } -///// void testMatrixPool3D(int depth, int height, int width) { int channel = 3; int filterX = 3, filterY = 4, filterZ = 5; @@ -1226,38 +1225,37 @@ void testMatrixPool3D(int depth, int height, int width) { cpuImage->randomizeUniform(); gpuImage->copyFrom(*cpuImage); - // std::cout << "test maxPool3DForward...\n"; cpuOutput->maxPool3DForward(*cpuImage, + channel, depth, height, width, - channel, + outD, + outH, + outW, filterZ, filterY, filterX, strideZ, strideY, strideX, - outD, - outH, - outW, padZ, padY, padX); gpuOutput->maxPool3DForward(*gpuImage, + channel, depth, height, width, - channel, + outD, + outH, + outW, filterZ, filterY, filterX, strideZ, strideY, strideX, - outD, - outH, - outW, padZ, padY, padX); @@ -1265,39 +1263,38 @@ void testMatrixPool3D(int depth, int height, int width) { cpuImage->randomizeUniform(); gpuImage->copyFrom(*cpuImage); - // std::cout << "test avgPool3DForward...\n"; cpuOutput->avgPool3DForward(*cpuImage, + channel, depth, height, width, - channel, + outD, + outH, + outW, filterZ, filterY, filterX, strideZ, strideY, strideX, - outD, - outH, - outW, padZ, padY, padX); gpuOutput->avgPool3DForward(*gpuImage, + channel, depth, height, width, - channel, + outD, + outH, + outW, filterZ, filterY, filterX, strideZ, strideY, strideX, - outD, - outH, - outW, padZ, padY, padX); @@ -1306,98 +1303,96 @@ void testMatrixPool3D(int depth, int height, int width) { gpuImage->copyFrom(*cpuImage); cpuOutput->randomizeUniform(); gpuOutput->copyFrom(*cpuOutput); - // std::cout << "test avgPool3DBackward...\n"; cpuImage->avgPool3DBackward(*cpuOutput, depth, height, width, + outD, + outH, + outW, filterZ, filterY, filterX, strideZ, strideY, strideX, - outD, - outH, - outW, - 1, - 1, padZ, padY, - padX); + padX, + 1.0, + 1.0); gpuImage->avgPool3DBackward(*gpuOutput, depth, height, width, + outD, + outH, + outW, filterZ, filterY, filterX, strideZ, strideY, strideX, - outD, - outH, - outW, - 1, - 1, padZ, padY, - padX); + padX, + 1.0, + 1.0); TensorCheckErr(*cpuImage, *gpuImage); cpuImage->randomizeUniform(); gpuImage->copyFrom(*cpuImage); cpuOutput->randomizeUniform(); gpuOutput->copyFrom(*cpuOutput); - // std::cout << "test maxPool3DBackward...\n"; cpuImage->maxPool3DBackward(*cpuImage, + *cpuOutput, + *cpuOutput, depth, height, width, - *cpuOutput, - *cpuOutput, + outD, + outH, + outW, filterZ, filterY, filterX, strideZ, strideY, strideX, - outD, - outH, - outW, - 1, - 1, padZ, padY, - padX); + padX, + 1.0, + 1.0); gpuImage->maxPool3DBackward(*gpuImage, + *gpuOutput, + *gpuOutput, depth, height, width, - *gpuOutput, - *gpuOutput, + outD, + outH, + outW, filterZ, filterY, filterX, strideZ, strideY, strideX, - outD, - outH, - outW, - 1, - 1, padZ, padY, - padX); + padX, + 1.0, + 1.0); TensorCheckErr(*cpuImage, *gpuImage); } TEST(Matrix, Pool3D) { - for (auto depth : {9, 16, 64, 128}) { - for (auto height : {9, 11, 128, 256}) { + for (auto depth : {9, 16, 64}) { + for (auto height : {9, 11, 
128}) { for (auto width : {9, 32, 128}) { VLOG(3) << "depth=" << depth << " height=" << height << " width=" << width; From 860bf192c904627ee0446051fe97911eb11895ad Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 24 Aug 2017 19:28:56 +0800 Subject: [PATCH 3/6] Add maxPoolIdx --- paddle/cuda/include/hl_cnn.h | 4 +- paddle/cuda/include/stub/hl_cnn_stub.h | 4 +- paddle/cuda/src/hl_cuda_cnn.cu | 73 +-- paddle/gserver/layers/Pool3DLayer.cpp | 11 +- paddle/math/Matrix.cpp | 86 ++-- paddle/math/Matrix.h | 18 +- paddle/math/tests/test_matrixCompare.cpp | 564 +++++++++++++++-------- 7 files changed, 473 insertions(+), 287 deletions(-) diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index 84f1c843596d4..6b56d9ec8d3da 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -192,11 +192,10 @@ extern void hl_maxpool3D_forward(const int frameCnt, const int paddingH, const int paddingW, real* tgtData, + real* maxPoolIdxData, const int tgtStride); extern void hl_maxpool3D_backward(const int frameCnt, - const real* inputData, - const real* outData, const real* outGrad, const int channels, const int depth, @@ -217,6 +216,7 @@ extern void hl_maxpool3D_backward(const int frameCnt, real scaleA, real scaleB, real* targetGrad, + real* maxPoolIdxData, const int outStride); extern void hl_avgpool3D_forward(const int frameCnt, diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index 6750f537bfbf6..a76dbf0b6578d 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -106,11 +106,10 @@ inline void hl_maxpool3D_forward(const int frameCnt, const int paddingH, const int paddingW, real* tgtData, + real* maxPoolIdxData, const int tgtStride) {} inline void hl_maxpool3D_backward(const int frameCnt, - const real* inputData, - const real* outData, const real* outGrad, const int channels, const int depth, @@ -131,6 +130,7 @@ inline void hl_maxpool3D_backward(const int frameCnt, real scaleA, real scaleB, real* targetGrad, + real* maxPoolIdxData, const int outStride) {} inline void hl_avgpool3D_forward(const int frameCnt, diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index 458c347728952..95440c9446384 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -366,10 +366,11 @@ __global__ void KeMaxPool3DForward(const int nthreads, const int strideD, const int strideH, const int strideW, - const int offsetD, - const int offsetH, - const int offsetW, + const int padD, + const int padH, + const int padW, real* tgtData, + real* maxPoolIdxData, const int tgtStride) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads); index += blockDim.x * gridDim.x) { @@ -378,9 +379,9 @@ __global__ void KeMaxPool3DForward(const int nthreads, int pd = (index / pooledW / pooledH) % pooledD; int c = (index / pooledW / pooledH / pooledD) % channels; int frameNum = index / pooledW / pooledH / pooledD / channels; - int dstart = pd * strideD - offsetD; - int hstart = ph * strideH - offsetH; - int wstart = pw * strideW - offsetW; + int dstart = pd * strideD - padD; + int hstart = ph * strideH - padH; + int wstart = pw * strideW - padW; int dend = min(dstart + ksizeD, depth); int hend = min(hstart + ksizeH, height); int wend = min(wstart + ksizeW, width); @@ -388,18 +389,22 @@ __global__ void KeMaxPool3DForward(const int nthreads, hstart = max(hstart, 0); wstart = max(wstart, 0); real maxval = -FLT_MAX; + int maxIdx = -1; inputData += 
(frameNum * channels + c) * depth * height * width; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - if (maxval < inputData[(d * height + h) * width + w]) + if (maxval < inputData[(d * height + h) * width + w]) { maxval = inputData[(d * height + h) * width + w]; + maxIdx = (d * height + h) * width + w; + } } } } int tgtIndex = index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride; tgtData[tgtIndex] = maxval; + maxPoolIdxData[tgtIndex] = maxIdx; } } @@ -418,10 +423,11 @@ void hl_maxpool3D_forward(const int frameCnt, const int strideD, const int strideH, const int strideW, - const int paddingD, - const int paddingH, - const int paddingW, + const int padD, + const int padH, + const int padW, real* tgtData, + real* maxPoolIdxData, const int tgtStride) { int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; @@ -443,17 +449,16 @@ void hl_maxpool3D_forward(const int frameCnt, strideD, strideH, strideW, - paddingD, - paddingH, - paddingW, + padD, + padH, + padW, tgtData, + maxPoolIdxData, tgtStride); CHECK_SYNC("hl_maxpool3D_forward failed"); } __global__ void KeMaxPool3DBackward(const int nthreads, - const real* inputData, - const real* outData, const real* outGrad, const int channels, const int depth, @@ -474,33 +479,35 @@ __global__ void KeMaxPool3DBackward(const int nthreads, real scaleA, real scaleB, real* targetGrad, + real* maxPoolIdxData, const int outStride) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads); index += blockDim.x * gridDim.x) { - // find out the local index - // find out the local offset - int offsetW = index % width + padW; - int offsetH = (index / width) % height + padH; - int offsetD = (index / width / height) % depth + padD; + int offsetW = index % width; + int offsetH = (index / width) % height; + int offsetD = (index / width / height) % depth; int offsetC = (index / width / height / depth) % channels; int frameNum = index / width / height / depth / channels; - int pdstart = (offsetD < sizeZ) ? 0 : (offsetD - sizeZ) / strideD + 1; - int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1; - int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1; - int pdend = min(offsetD / strideD + 1, pooledD); - int phend = min(offsetH / strideH + 1, pooledH); - int pwend = min(offsetW / strideW + 1, pooledW); + int pdstart = + (offsetD + padD < sizeZ) ? 0 : (offsetD + padD - sizeZ) / strideD + 1; + int phstart = + (offsetH + padH < sizeY) ? 0 : (offsetH + padH - sizeY) / strideH + 1; + int pwstart = + (offsetW + padW < sizeX) ? 
0 : (offsetW + padW - sizeX) / strideW + 1; + int pdend = min((offsetD + padD) / strideD + 1, pooledD); + int phend = min((offsetH + padH) / strideH + 1, pooledH); + int pwend = min((offsetW + padW) / strideW + 1, pooledW); real gradient = 0; - real input = inputData[index]; - - outData += ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW); outGrad += ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW); + maxPoolIdxData += + ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW); for (int pd = pdstart; pd < pdend; ++pd) { for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { - if (input == outData[(pd * pooledH + ph) * pooledW + pw]) + if (((offsetD * height + offsetH) * width + offsetW) == + maxPoolIdxData[(pd * pooledH + ph) * pooledW + pw]) gradient += outGrad[(pd * pooledH + ph) * pooledW + pw]; } } @@ -510,8 +517,6 @@ __global__ void KeMaxPool3DBackward(const int nthreads, } void hl_maxpool3D_backward(const int frameCnt, - const real* inputData, - const real* outData, const real* outGrad, const int channels, const int depth, @@ -532,13 +537,12 @@ void hl_maxpool3D_backward(const int frameCnt, real scaleA, real scaleB, real* targetGrad, + real* maxPoolIdxData, const int outStride) { int num_kernels = depth * height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; KeMaxPool3DBackward<<>>(num_kernels, - inputData, - outData, outGrad, channels, depth, @@ -559,6 +563,7 @@ void hl_maxpool3D_backward(const int frameCnt, scaleA, scaleB, targetGrad, + maxPoolIdxData, outStride); CHECK_SYNC("hl_maxpool3D_backward"); } diff --git a/paddle/gserver/layers/Pool3DLayer.cpp b/paddle/gserver/layers/Pool3DLayer.cpp index 40a913ebfc668..199f21adb1a59 100644 --- a/paddle/gserver/layers/Pool3DLayer.cpp +++ b/paddle/gserver/layers/Pool3DLayer.cpp @@ -72,9 +72,10 @@ size_t Pool3DLayer::getSize() { void Pool3DLayer::forward(PassType passType) { Layer::forward(passType); const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); - int batchSize = inMat->getHeight(); - int outWidth = getSize(); + size_t batchSize = inMat->getHeight(); + size_t outWidth = getSize(); resetOutput(batchSize, outWidth); + Matrix::resizeOrCreate(maxPoolIdx_, batchSize, outWidth, false, useGpu_); const MatrixPtr outMat = getOutputValue(); if (poolType_ == "avg") { @@ -97,6 +98,7 @@ void Pool3DLayer::forward(PassType passType) { paddingW_); } else if (poolType_ == "max") { outMat->maxPool3DForward(*inMat, + *maxPoolIdx_, channels_, imgSizeD_, imgSizeH_, @@ -149,9 +151,8 @@ void Pool3DLayer::backward(const UpdateCallback& callback) { 1.0, 1.0); } else if (poolType_ == "max") { - inGradMat->maxPool3DBackward(*inMat, - *outGradMat, - *outMat, + inGradMat->maxPool3DBackward(*outGradMat, + *maxPoolIdx_, imgSizeD_, imgSizeH_, imgSizeW_, diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 4f9216896c0eb..54c2eae475047 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -1191,6 +1191,7 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad, } void GpuMatrix::maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, size_t channels, size_t imgSizeD, size_t imgSizeH, @@ -1210,6 +1211,7 @@ void GpuMatrix::maxPool3DForward(Matrix& inputMat, CHECK(inputMat.useGpu_) << "Matrix type are not correct"; real* inputData = inputMat.getData(); + real* maxPoolIdxData = maxPoolIdx.getData(); size_t num = inputMat.getHeight(); size_t width = imgSizeW; size_t height = imgSizeH; @@ -1237,12 +1239,12 @@ void GpuMatrix::maxPool3DForward(Matrix& 
inputMat, paddingH, paddingW, getData(), + maxPoolIdxData, getStride()); } -void GpuMatrix::maxPool3DBackward(Matrix& inputMat, - Matrix& outGrad, - Matrix& outV, +void GpuMatrix::maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, @@ -1260,26 +1262,21 @@ void GpuMatrix::maxPool3DBackward(Matrix& inputMat, size_t paddingW, real scaleTargets, real scaleOutput) { - CHECK(inputMat.useGpu_ && outGrad.useGpu_ && outV.useGpu_) - << "Matrix type are not equal"; + CHECK(outGrad.useGpu_ && maxPoolIdx.useGpu_) << "Matrix type are not equal"; - real* inputData = inputMat.getData(); - real* outData = outV.getData(); real* outDiff = outGrad.getData(); - size_t frameNum = inputMat.getHeight(); - size_t channels = outV.getWidth() / outputD / outputH / outputW; + real* maxPoolIdxData = maxPoolIdx.getData(); + size_t frameNum = getHeight(); + size_t channels = outGrad.getWidth() / outputD / outputH / outputW; size_t width = imgSizeW; size_t height = imgSizeH; size_t depth = imgSizeD; - CHECK(depth * height * width * channels == inputMat.getWidth()); - CHECK(height_ == inputMat.getHeight()); + CHECK(depth * height * width * channels == getWidth()); CHECK(width_ == depth * width * height * channels); - CHECK(outGrad.getHeight() == outV.getHeight() && - outGrad.getWidth() == outV.getWidth()); + CHECK(outGrad.getHeight() == maxPoolIdx.getHeight() && + outGrad.getWidth() == maxPoolIdx.getWidth()); hl_maxpool3D_backward(frameNum, - inputData, - outData, outDiff, channels, depth, @@ -1300,6 +1297,7 @@ void GpuMatrix::maxPool3DBackward(Matrix& inputMat, scaleTargets, scaleOutput, getData(), + maxPoolIdxData, outGrad.getStride()); } @@ -2148,6 +2146,7 @@ void CpuMatrix::avgPoolBackward(Matrix& input, } void CpuMatrix::maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, size_t channels, size_t imgSizeD, size_t imgSizeH, @@ -2166,6 +2165,7 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat, size_t paddingW) { real* inputData = inputMat.getData(); real* outData = getData(); + real* maxPoolIdxData = maxPoolIdx.getData(); size_t num = inputMat.getHeight(); size_t inWidth = imgSizeW; size_t inHeight = imgSizeH; @@ -2179,6 +2179,7 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat, for (size_t i = 0; i < height_; i++) { for (size_t j = 0; j < width_; j++) { outData[(i)*outStride + j] = -(real)FLT_MAX; + maxPoolIdxData[(i)*outStride + j] = -1; } } @@ -2186,6 +2187,7 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat, for (size_t n = 0; n < num; ++n) { // frame by frame if (!isContiguous()) { outData = getData() + n * outStride; + maxPoolIdxData = maxPoolIdx.getData() + n * outStride; } for (size_t c = 0; c < channels; ++c) { // channel by channel for (size_t pd = 0; pd < outputD; ++pd) { @@ -2200,6 +2202,7 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat, dstart = std::max(dstart, 0); hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); + int maxIdx = -1; real maxOutData = outData[(pd * outputH + ph) * outputW + pw]; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { @@ -2207,24 +2210,26 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat, if (maxOutData < inputData[(d * inHeight + h) * inWidth + w]) { maxOutData = inputData[(d * inHeight + h) * inWidth + w]; + maxIdx = (d * inHeight + h) * inWidth + w; } } } } outData[(pd * outputH + ph) * outputW + pw] = maxOutData; + maxPoolIdxData[(pd * outputH + ph) * outputW + pw] = maxIdx; } } } // compute offset inputData += inDepth * inHeight * inWidth; outData += outputD * 
outputH * outputW; + maxPoolIdxData += outputD * outputH * outputW; } } } -void CpuMatrix::maxPool3DBackward(Matrix& image, - Matrix& outGrad, - Matrix& outV, +void CpuMatrix::maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, @@ -2242,59 +2247,38 @@ void CpuMatrix::maxPool3DBackward(Matrix& image, size_t paddingW, real scaleTargets, real scaleOutput) { - size_t num = image.getHeight(); + size_t num = getHeight(); size_t channels = size_t(width_ / imgSizeD / imgSizeH / imgSizeW); - CHECK(image.getWidth() == imgSizeD * imgSizeH * imgSizeW * channels); - CHECK(image.getHeight() == height_ && image.getWidth() == width_); - CHECK(outV.getHeight() == outGrad.getHeight() && - outV.getWidth() == outGrad.getWidth()); + CHECK(maxPoolIdx.getHeight() == outGrad.getHeight() && + maxPoolIdx.getWidth() == outGrad.getWidth()); real* tgtGrad = getData(); - real* inData = image.getData(); - real* otData = outV.getData(); real* otGrad = outGrad.getData(); + real* maxPoolIdxData = maxPoolIdx.getData(); - size_t outStride = outV.getStride(); + size_t outStride = outGrad.getStride(); ; for (size_t n = 0; n < num; ++n) { - if (!outV.isContiguous()) { - otData = outV.getData() + n * outStride; + if (!outGrad.isContiguous()) { otGrad = outGrad.getData() + n * outStride; + maxPoolIdxData = maxPoolIdx.getData() + n * outStride; } for (size_t c = 0; c < channels; ++c) { for (size_t pd = 0; pd < outputD; ++pd) { for (size_t ph = 0; ph < outputH; ++ph) { for (size_t pw = 0; pw < outputW; ++pw) { - int dstart = pd * strideD - paddingD; - int hstart = ph * strideH - paddingH; - int wstart = pw * strideW - paddingW; - int dend = std::min(dstart + sizeZ, imgSizeD); - int hend = std::min(hstart + sizeY, imgSizeH); - int wend = std::min(wstart + sizeX, imgSizeW); - dstart = std::max(dstart, 0); - hstart = std::max(hstart, 0); - wstart = std::max(wstart, 0); - for (int d = dstart; d < dend; ++d) { - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - tgtGrad[(d * imgSizeH + h) * imgSizeW + w] = - scaleTargets * - tgtGrad[(d * imgSizeH + h) * imgSizeW + w] + - scaleOutput * otGrad[(pd * outputH + ph) * outputW + pw] * - (inData[(d * imgSizeH + h) * imgSizeW + w] == - otData[(pd * outputH + ph) * outputW + pw]); - } - } - } + const size_t index = (pd * outputH + ph) * outputW + pw; + const size_t tgtIdx = static_cast(maxPoolIdxData[index]); + tgtGrad[tgtIdx] = + scaleTargets * tgtGrad[tgtIdx] + scaleOutput * otGrad[index]; } } } // offset - inData += imgSizeD * imgSizeH * imgSizeW; tgtGrad += imgSizeD * imgSizeH * imgSizeW; - otData += outputD * outputH * outputW; otGrad += outputD * outputH * outputW; + maxPoolIdxData += outputD * outputH * outputW; } } } diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index dec9702433cdc..e674c1e9abef2 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -933,6 +933,7 @@ class Matrix : public BaseMatrix { * in the sizeX of value */ virtual void maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, size_t channels, size_t imgSizeD, size_t imgSizeH, @@ -952,9 +953,8 @@ class Matrix : public BaseMatrix { LOG(FATAL) << "Not implemeted"; } - virtual void maxPool3DBackward(Matrix& image, - Matrix& outGrad, - Matrix& outV, + virtual void maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, @@ -1436,6 +1436,7 @@ class GpuMatrix : public Matrix { size_t paddingW); void maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, size_t 
channels, size_t imgSizeD, size_t imgSizeH, @@ -1453,9 +1454,8 @@ class GpuMatrix : public Matrix { size_t paddingH, size_t paddingW); - void maxPool3DBackward(Matrix& image, - Matrix& outGrad, - Matrix& outV, + void maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, @@ -1671,6 +1671,7 @@ class CpuMatrix : public Matrix { size_t paddingW); void maxPool3DForward(Matrix& inputMat, + Matrix& maxPoolIdx, size_t channels, size_t imgSizeD, size_t imgSizeH, @@ -1688,9 +1689,8 @@ class CpuMatrix : public Matrix { size_t paddingH, size_t paddingW); - void maxPool3DBackward(Matrix& image, - Matrix& outGrad, - Matrix& outV, + void maxPool3DBackward(Matrix& outGrad, + Matrix& maxPoolIdx, size_t imgSizeD, size_t imgSizeH, size_t imgSizeW, diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 21ee8543cd2ea..d7ad6f18ac0aa 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -1204,202 +1204,398 @@ TEST(Matrix, warpCTC) { } } -void testMatrixPool3D(int depth, int height, int width) { - int channel = 3; - int filterX = 3, filterY = 4, filterZ = 5; - int strideX = 2, strideY = 2, strideZ = 2; - int padX = 1, padY = 1, padZ = 1; - - MatrixPtr cpuImage = - std::make_shared(1, channel * depth * height * width); - MatrixPtr gpuImage = - std::make_shared(1, channel * depth * height * width); - - int outD = outputSize(depth, filterZ, padZ, strideZ, true); - int outH = outputSize(height, filterY, padZ, strideY, true); - int outW = outputSize(width, filterX, padZ, strideX, true); - - int colBufWidth = outD * outH * outW; - MatrixPtr cpuOutput = std::make_shared(1, channel * colBufWidth); - MatrixPtr gpuOutput = std::make_shared(1, channel * colBufWidth); - - cpuImage->randomizeUniform(); - gpuImage->copyFrom(*cpuImage); - cpuOutput->maxPool3DForward(*cpuImage, - channel, - depth, - height, - width, - outD, - outH, - outW, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX); - gpuOutput->maxPool3DForward(*gpuImage, - channel, - depth, - height, - width, - outD, - outH, - outW, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX); - TensorCheckErr(*cpuOutput, *gpuOutput); +void testMaxPool3DFwdBwd(int numSamples, + int channels, + int imgSizeD, + int imgSizeH, + int imgSizeW, + int ksizeD, + int ksizeH, + int ksizeW, + int strideD, + int strideH, + int strideW, + int padD, + int padH, + int padW) { + int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true); + int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); + int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); + + int inWidth = channels * imgSizeD * imgSizeH * imgSizeW; + MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); - cpuImage->randomizeUniform(); - gpuImage->copyFrom(*cpuImage); - cpuOutput->avgPool3DForward(*cpuImage, - channel, - depth, - height, - width, - outD, - outH, - outW, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX); - - gpuOutput->avgPool3DForward(*gpuImage, - channel, - depth, - height, - width, - outD, - outH, - outW, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX); - TensorCheckErr(*cpuOutput, *gpuOutput); - cpuImage->randomizeUniform(); - gpuImage->copyFrom(*cpuImage); - cpuOutput->randomizeUniform(); - 
gpuOutput->copyFrom(*cpuOutput); - cpuImage->avgPool3DBackward(*cpuOutput, - depth, - height, - width, - outD, - outH, - outW, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX, - 1.0, - 1.0); - - gpuImage->avgPool3DBackward(*gpuOutput, - depth, - height, - width, - outD, - outH, - outW, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX, - 1.0, - 1.0); - TensorCheckErr(*cpuImage, *gpuImage); - - cpuImage->randomizeUniform(); - gpuImage->copyFrom(*cpuImage); - cpuOutput->randomizeUniform(); - gpuOutput->copyFrom(*cpuOutput); - cpuImage->maxPool3DBackward(*cpuImage, - *cpuOutput, - *cpuOutput, - depth, - height, - width, + int outWidth = channels * outD * outH * outW; + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + MatrixPtr maxIdx = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr maxIdxGpu = GpuMatrix::create(numSamples, outWidth, false, true); + + input->randomizeUniform(); + target->randomizeUniform(); + inputGpu->copyFrom(*input); + targetGpu->copyFrom(*target); + + target->maxPool3DForward(*input, + *maxIdx, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW); + targetGpu->maxPool3DForward(*inputGpu, + *maxIdxGpu, + channels, + imgSizeD, + imgSizeH, + imgSizeW, outD, outH, outW, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX, - 1.0, - 1.0); - - gpuImage->maxPool3DBackward(*gpuImage, - *gpuOutput, - *gpuOutput, - depth, - height, - width, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW); + MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false); + targetCheck->copyFrom(*targetGpu); + checkMatrixEqual(target, targetCheck); + + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->maxPool3DBackward(*targetGrad, + *maxIdx, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW, + 1.0, + 1.0); + inputGpuGrad->maxPool3DBackward(*targetGpuGrad, + *maxIdxGpu, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW, + 1.0, + 1.0); + MatrixPtr targetBwdCheck = + CpuMatrix::create(numSamples, inWidth, false, false); + targetBwdCheck->copyFrom(*inputGpuGrad); + checkMatrixEqual(inputGrad, targetBwdCheck); +} + +void testAvgPool3DFwdBwd(int numSamples, + int channels, + int imgSizeD, + int imgSizeH, + int imgSizeW, + int ksizeD, + int ksizeH, + int ksizeW, + int strideD, + int strideH, + int strideW, + int padD, + int padH, + int padW) { + int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true); + int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); + int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); + + int inWidth = imgSizeD * imgSizeH * imgSizeW * channels; + 
MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true); + + int outWidth = channels * outD * outH * outW; + MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true); + + input->randomizeUniform(); + target->randomizeUniform(); + inputGpu->copyFrom(*input); + targetGpu->copyFrom(*target); + + target->avgPool3DForward(*input, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW); + + targetGpu->avgPool3DForward(*inputGpu, + channels, + imgSizeD, + imgSizeH, + imgSizeW, outD, outH, outW, - filterZ, - filterY, - filterX, - strideZ, - strideY, - strideX, - padZ, - padY, - padX, - 1.0, - 1.0); - TensorCheckErr(*cpuImage, *gpuImage); + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW); + + TensorCheckErr(*target, *targetGpu); + + MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false); + MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true); + MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false); + MatrixPtr targetGpuGrad = + GpuMatrix::create(numSamples, outWidth, false, true); + + inputGrad->randomizeUniform(); + targetGrad->randomizeUniform(); + inputGpuGrad->copyFrom(*inputGrad); + targetGpuGrad->copyFrom(*targetGrad); + + inputGrad->avgPool3DBackward(*targetGrad, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW, + 1.0, + 1.0); + + inputGpuGrad->avgPool3DBackward(*targetGpuGrad, + imgSizeD, + imgSizeH, + imgSizeW, + outD, + outH, + outW, + ksizeD, + ksizeH, + ksizeW, + strideD, + strideH, + strideW, + padD, + padH, + padW, + 1.0, + 1.0); + TensorCheckErr(*inputGrad, *inputGpuGrad); } -TEST(Matrix, Pool3D) { - for (auto depth : {9, 16, 64}) { - for (auto height : {9, 11, 128}) { - for (auto width : {9, 32, 128}) { - VLOG(3) << "depth=" << depth << " height=" << height - << " width=" << width; - testMatrixPool3D(depth, height, width); +// TODO(yi): I noticed many such blindly combinatorial tests in this +// file. They are no help to locate defects at all. 
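+//
+// For reference, the pooled sizes used by these tests follow the usual
+// pooling geometry: out = (in + 2 * pad - ksize) / stride + 1, rounded up or
+// down depending on the ceil/floor flag passed to outputSize(). For example,
+// the in = 9, ksize = 3, pad = 1, stride = 2 case exercised below gives
+// (9 + 2 - 3) / 2 + 1 = 5 in either mode.
+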
+TEST(Matrix, Pool3DFwdBwd) { + for (auto numSamples : {1, 3}) { + for (auto channels : {3}) { + for (auto imgSizeD : {9, 16}) { + for (auto imgSizeH : {9, 32}) { + for (auto imgSizeW : {9, 32}) { + for (auto sizeX : {3}) { + for (auto sizeY : {3}) { + for (auto sizeZ : {3}) { + for (auto sD : {2}) { + for (auto sH : {2}) { + for (auto sW : {2}) { + for (auto pD : {0, (sizeZ - 1) / 2}) { + for (auto pH : {0, (sizeY - 1) / 2}) { + for (auto pW : {0, (sizeX - 1) / 2}) { + VLOG(3) << " numSamples=" << numSamples + << " channels=" << channels + << " imgSizeD=" << imgSizeD + << " imgSizeH=" << imgSizeH + << " imgSizeW=" << imgSizeW + << " sizeX=" << sizeX + << " sizeY=" << sizeY + << " sizeZ=" << sizeZ << " strideD=" << sD + << " strideH=" << sH << " strideW=" << sW + << " padingD=" << pD << " padingH=" << pH + << " padingW=" << pW; + + testMaxPool3DFwdBwd(numSamples, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + sizeX, + sizeY, + sizeZ, + sD, + sH, + sW, + pD, + pH, + pW); + testAvgPool3DFwdBwd(numSamples, + channels, + imgSizeD, + imgSizeH, + imgSizeW, + sizeX, + sizeY, + sizeZ, + sD, + sH, + sW, + pD, + pH, + pW); + } + } + } + } + } + } + } + } + } + } + } } } } + + // + // for (auto numSamples : {1, 3}) { + // for (auto channels : {1, 3}) { + // for (auto imgSizeD : {9,16}) { + // for (auto imgSizeH : {9, 32}) { + // for (auto imgSizeW : {9, 32}) { + // for (auto sizeX : {2, 3}) { + // for (auto sizeY : {2, 3}) { + // for (auto sizeZ : {2,3}){ + // for (auto sD : {1, 2}) { + // for (auto sH : {1, 2}) { + // for (auto sW : {1, 2}) { + // for (auto pD : {0, (sizeZ - 1) / 2}){ + // for (auto pH : {0, (sizeY - 1) / 2}) { + // for (auto pW : {0, (sizeX - 1) / 2}) { + // VLOG(3) << " numSamples=" << numSamples + // << " channels=" << channels + // << " imgSizeD=" << imgSizeD + // << " imgSizeH=" << imgSizeH + // << " imgSizeW=" << imgSizeW + // << " sizeX=" << sizeX + // << " sizeY=" << sizeY + // << " sizeZ=" << sizeZ + // << " strideD=" << sD + // << " strideH=" << sH + // << " strideW=" << sW + // << " padingD=" << pD + // << " padingH=" << pH + // << " padingW=" << pW; + // + // testMaxPool3DFwdBwd(numSamples, + // channels, + // imgSizeD, + // imgSizeH, + // imgSizeW, + // sizeX, + // sizeY, + // sizeZ, + // sD, + // sH, + // sW, + // pD, + // pH, + // pW); + // testAvgPool3DFwdBwd(numSamples, + // channels, + // imgSizeD, + // imgSizeH, + // imgSizeW, + // sizeX, + // sizeY, + // sizeZ, + // sD, + // sH, + // sW, + // pD, + // pH, + // pW); + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } + // } } #endif From b1c0bad9fe8258ac9c12141c07fddb8600f781c5 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Sat, 26 Aug 2017 13:09:05 +0800 Subject: [PATCH 4/6] Add config parser for pooling3D --- paddle/math/Matrix.cpp | 2 - proto/ModelConfig.proto | 1 + python/paddle/trainer/config_parser.py | 120 +++++++++++++- .../paddle/trainer_config_helpers/layers.py | 146 +++++++++++++++++- .../tests/configs/test_pooling3D_layer.py | 38 +++++ .../tests/layers_test.py | 2 +- 6 files changed, 304 insertions(+), 5 deletions(-) create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 54c2eae475047..e93a154556bc3 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -2255,9 +2255,7 @@ void CpuMatrix::maxPool3DBackward(Matrix& outGrad, real* tgtGrad = getData(); real* otGrad = outGrad.getData(); real* maxPoolIdxData = maxPoolIdx.getData(); - size_t 
outStride = outGrad.getStride(); - ; for (size_t n = 0; n < num; ++n) { if (!outGrad.isContiguous()) { diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 42cf10e9d3f91..259f3c33c398a 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -495,6 +495,7 @@ message LayerConfig { // to indicate rectangle image data optional uint64 height = 50; optional uint64 width = 51; + optional uint64 depth = 57 [ default = 1 ]; // blank label used in ctc loss optional uint32 blank = 52 [ default = 0 ]; diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index b7b696ef0c13e..405c5e1f13acc 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -903,6 +903,31 @@ def __init__( self.add_keys(locals()) +@config_class +class Pool3d(Cfg): + def __init__( + self, + pool_type, + channels, + size_x, + size_y=None, + size_z=None, + start=None, + stride=None, # 1 by defalut in protobuf + stride_y=None, + stride_z=None, + padding=None, # 0 by defalut in protobuf + padding_y=None, + padding_z=None): + self.add_keys(locals()) + self.filter_size_y = size_y if size_y else size_x + self.filter_size_z = size_z if size_z else size_x + self.padding_y = padding_y if padding_y else padding + self.padding_z = padding_z if padding_z else padding + self.stride_y = stride_y if stride_y else stride + self.stride_z = stride_z if stride_z else stride + + @config_class class SpatialPyramidPool(Cfg): def __init__(self, pool_type, pyramid_height, channels): @@ -1167,6 +1192,20 @@ def get_img_size(input_layer_name, channels): return img_size, img_size_y +def get_img3d_size(input_layer_name, channels): + input = g_layer_map[input_layer_name] + img_pixels = input.size / channels + img_size = input.width + img_size_y = input.height + img_size_z = input.depth + + config_assert( + img_size * img_size_y * img_size_z == img_pixels, + "Input layer %s: Incorrect input image size %d * %d * %d for input image pixels %d" + % (input_layer_name, img_size, img_size_y, img_size_z, img_pixels)) + return img_size, img_size_y, img_size_z + + def parse_bilinear(bilinear, input_layer_name, bilinear_conf): parse_image(bilinear, input_layer_name, bilinear_conf.image_conf) bilinear_conf.out_size_x = bilinear.out_size_x @@ -1204,6 +1243,45 @@ def parse_pool(pool, input_layer_name, pool_conf, ceil_mode): pool_conf.stride_y, not ceil_mode) +def parse_pool3d(pool, input_layer_name, pool_conf, ceil_mode): + pool_conf.pool_type = pool.pool_type + config_assert(pool.pool_type in ['max-projection', 'avg-projection'], + "pool-type %s is not in " + "['max-projection', 'avg-projection']" % pool.pool_type) + + pool_conf.channels = pool.channels + + pool_conf.size_x = pool.size_x + pool_conf.stride = pool.stride + pool_conf.padding = pool.padding + + pool_conf.size_y = default(pool.size_y, pool_conf.size_x) + pool_conf.size_z = default(pool.size_z, pool_conf.size_x) + pool_conf.stride_y = default(pool.stride_y, pool_conf.stride) + pool_conf.stride_z = default(pool.stride_z, pool_conf.stride) + pool_conf.padding_y = default(pool.padding_y, pool_conf.padding) + pool_conf.padding_z = default(pool.padding_z, pool_conf.padding) + + pool_conf.img_size, pool_conf.img_size_y, pool_conf.img_size_z = \ + get_img3d_size(input_layer_name, pool.channels) + + config_assert(not pool.start, "start is deprecated in pooling.") + + if pool.padding is not None: + pool_conf.padding = pool.padding + pool_conf.padding_y = default(pool.padding_y, pool_conf.padding) + 
pool_conf.padding_z = default(pool.padding_z, pool_conf.padding) + pool_conf.output_x = cnn_output_size(pool_conf.img_size, pool_conf.size_x, + pool_conf.padding, pool_conf.stride, + not ceil_mode) + pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y, + pool_conf.padding_y, + pool_conf.stride_y, not ceil_mode) + pool_conf.output_z = cnn_output_size(pool_conf.img_size_z, pool_conf.size_z, + pool_conf.padding_z, + pool_conf.stride_z, not ceil_mode) + + def parse_spp(spp, input_layer_name, spp_conf): parse_image(spp, input_layer_name, spp_conf.image_conf) spp_conf.pool_type = spp.pool_type @@ -1580,6 +1658,9 @@ def set_layer_height_width(self, height, width): self.config.height = height self.config.width = width + def set_layer_depth(self, depth): + self.config.depth = depth + def set_cnn_layer(self, input_layer_name, height, @@ -1763,11 +1844,19 @@ def __init__(self, name, inputs, size, input_num, num_classes, @config_layer('data') class DataLayer(LayerBase): - def __init__(self, name, size, height=None, width=None, device=None): + def __init__(self, + name, + size, + depth=None, + height=None, + width=None, + device=None): super(DataLayer, self).__init__( name, 'data', size, inputs=[], device=device) if height and width: self.set_layer_height_width(height, width) + if depth: + self.set_layer_depth(depth) ''' @@ -1995,6 +2084,35 @@ def __init__(self, name, inputs, ceil_mode=True, **xargs): pool_conf.channels) +@config_layer('pool3d') +class Pool3DLayer(LayerBase): + def __init__(self, name, inputs, ceil_mode=True, **xargs): + super(Pool3DLayer, self).__init__( + name, 'pool3d', 0, inputs=inputs, **xargs) + for input_index in xrange(len(self.inputs)): + input_layer = self.get_input_layer(input_index) + pool_conf = self.config.inputs[input_index].pool_conf + parse_pool3d(self.inputs[input_index].pool, input_layer.name, + pool_conf, ceil_mode) + self.set_cnn_layer(name, pool_conf.output_z, pool_conf.output_y, + pool_conf.output_x, pool_conf.channels) + + def set_cnn_layer(self, + input_layer_name, + depth, + height, + width, + channels, + is_print=True): + size = depth * height * width * channels + self.set_layer_size(size) + self.set_layer_height_width(height, width) + self.set_layer_depth(depth) + if is_print: + print("output for %s: c = %d, d = %d, h = %d, w = %d, size = %d" % + (input_layer_name, channels, depth, height, width, size)) + + @config_layer('spp') class SpatialPyramidPoolLayer(LayerBase): def __init__(self, name, inputs, **xargs): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 1bc55c8696015..5c5e737b56de1 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -133,6 +133,7 @@ 'clip_layer', 'slice_projection', 'kmax_sequence_score_layer', + 'img_pool3d_layer', ] @@ -161,6 +162,7 @@ class LayerType(object): EXCONVTRANS_LAYER = 'exconvt' CUDNNCONV_LAYER = 'cudnn_conv' POOL_LAYER = 'pool' + POOL3D_LAYER = 'pool3d' BATCH_NORM_LAYER = 'batch_norm' NORM_LAYER = 'norm' SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm' @@ -878,7 +880,8 @@ def mixed_layer(size=0, @layer_support() -def data_layer(name, size, height=None, width=None, layer_attr=None): +def data_layer(name, size, depth=None, height=None, width=None, + layer_attr=None): """ Define DataLayer For NeuralNetwork. 
@@ -905,6 +908,7 @@ def data_layer(name, size, height=None, width=None, layer_attr=None):
         type=LayerType.DATA,
         name=name,
         size=size,
+        depth=depth,
         height=height,
         width=width,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
@@ -2610,6 +2614,146 @@ def img_pool_layer(input,
         size=l.config.size)
 
 
+@wrap_name_default("pool3d")
+@layer_support()
+def img_pool3d_layer(input,
+                     pool_size,
+                     name=None,
+                     num_channels=None,
+                     pool_type=None,
+                     stride=1,
+                     padding=0,
+                     layer_attr=None,
+                     pool_size_y=None,
+                     stride_y=None,
+                     padding_y=None,
+                     pool_size_z=None,
+                     stride_z=None,
+                     padding_z=None,
+                     ceil_mode=True):
+    """
+    3D image pooling layer.
+
+    For details of the pooling layer, please refer to ufldl's pooling_ .
+
+    .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
+
+    - ceil_mode=True:
+
+    .. math::
+
+        w = 1 + int(ceil((input\_width + 2 * padding - pool\_size) / float(stride)))
+        h = 1 + int(ceil((input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)))
+        d = 1 + int(ceil((input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z)))
+
+    - ceil_mode=False:
+
+    .. math::
+
+        w = 1 + int(floor((input\_width + 2 * padding - pool\_size) / float(stride)))
+        h = 1 + int(floor((input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)))
+        d = 1 + int(floor((input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z)))
+
+    The example usage is:
+
+    .. code-block:: python
+
+        maxpool = img_pool3d_layer(input=conv,
+                                   pool_size=3,
+                                   num_channels=8,
+                                   stride=1,
+                                   padding=1,
+                                   pool_type=MaxPooling())
+
+    :param padding: pooling padding width.
+    :type padding: int|tuple|list
+    :param name: name of pooling layer
+    :type name: basestring.
+    :param input: layer's input
+    :type input: LayerOutput
+    :param pool_size: pooling window width
+    :type pool_size: int|tuple|list
+    :param num_channels: number of input channel.
+    :type num_channels: int
+    :param pool_type: pooling type. MaxPooling or AvgPooling. Default is
+                      MaxPooling.
+    :type pool_type: BasePoolingType
+    :param stride: stride width of pooling.
+    :type stride: int|tuple|list
+    :param layer_attr: Extra Layer attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :param ceil_mode: Whether to use the ceil mode to calculate output height,
+                      width and depth. Default is True. Otherwise the floor mode is used.
+
+    :type ceil_mode: bool
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if num_channels is None:
+        assert input.num_filters is not None
+        num_channels = input.num_filters
+
+    if pool_type is None:
+        pool_type = MaxPooling()
+    elif isinstance(pool_type, AvgPooling):
+        pool_type.name = 'avg'
+
+    type_name = pool_type.name + '-projection' \
+        if (
+        isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
+        else pool_type.name
+
+    if isinstance(pool_size, collections.Sequence):
+        assert len(pool_size) == 3
+        pool_size, pool_size_y, pool_size_z = pool_size
+    else:
+        pool_size_y = pool_size
+        pool_size_z = pool_size
+
+    if isinstance(stride, collections.Sequence):
+        assert len(stride) == 3
+        stride, stride_y, stride_z = stride
+    else:
+        stride_y = stride
+        stride_z = stride
+
+    if isinstance(padding, collections.Sequence):
+        assert len(padding) == 3
+        padding, padding_y, padding_z = padding
+    else:
+        padding_y = padding
+        padding_z = padding
+
+    l = Layer(
+        name=name,
+        type=LayerType.POOL3D_LAYER,
+        inputs=[
+            Input(
+                input.name,
+                pool=Pool3d(
+                    pool_type=type_name,
+                    channels=num_channels,
+                    size_x=pool_size,
+                    start=None,
+                    stride=stride,
+                    padding=padding,
+                    size_y=pool_size_y,
+                    stride_y=stride_y,
+                    padding_y=padding_y,
+                    size_z=pool_size_z,
+                    stride_z=stride_z,
+                    padding_z=padding_z))
+        ],
+        ceil_mode=ceil_mode,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name,
+        LayerType.POOL3D_LAYER,
+        parents=[input],
+        num_filters=num_channels,
+        size=l.config.size)
+
+
 @wrap_name_default("spp")
 @layer_support()
 def spp_layer(input,
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
new file mode 100644
index 0000000000000..0dbb921d41986
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
@@ -0,0 +1,38 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=100, learning_rate=1e-5)
+
+data_2d = data_layer(name='data_2d', size=6000, height=20, width=10)
+
+pool_2d = img_pool_layer(
+    name="pool___2d",
+    input=data_2d,
+    num_channels=30,
+    pool_size=5,
+    stride=3,
+    padding=1,
+    pool_type=AvgPooling())
+outputs(pool_2d)
+
+data_3d = data_layer(
+    name='data_3d_1', size=60000, depth=10, height=20, width=10)
+
+pool_3d_1 = img_pool3d_layer(
+    name="pool_3d_1",
+    input=data_3d,
+    num_channels=30,
+    pool_size=5,
+    stride=3,
+    padding=1,
+    pool_type=AvgPooling())
+outputs(pool_3d_1)
+
+pool_3d_2 = img_pool3d_layer(
+    name="pool_3d_2",
+    input=data_3d,
+    num_channels=30,
+    pool_size=[5, 5, 5],
+    stride=[3, 3, 3],
+    padding=[1, 1, 1],
+    pool_type=MaxPooling())
+outputs(pool_3d_2)
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test.py b/python/paddle/trainer_config_helpers/tests/layers_test.py
index 05902ea293df5..52218972bfd88 100644
--- a/python/paddle/trainer_config_helpers/tests/layers_test.py
+++ b/python/paddle/trainer_config_helpers/tests/layers_test.py
@@ -16,4 +16,4 @@
 
 if __name__ == '__main__':
     parse_config_and_serialize(
-        'trainer_config_helpers/tests/layers_test_config.py', '')
+        'trainer_config_helpers/tests/configs/test_pooling3D_layer.py', '')

From f557b0c4c5e1ac97fdc092ed85993c4dda72fd2d Mon Sep 17 00:00:00 2001
From: chengduoZH
Date: Thu, 31 Aug 2017 00:11:44 +0800
Subject: [PATCH 5/6] fix data_layer for 3D data

---
 python/paddle/trainer_config_helpers/layers.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/paddle/trainer_config_helpers/layers.py
b/python/paddle/trainer_config_helpers/layers.py index ebb6f36504f50..c92764e1f9f9c 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -929,11 +929,13 @@ def data_layer(name, size, depth=None, height=None, width=None, width=width, **ExtraLayerAttribute.to_kwargs(layer_attr)) + if depth is None: + depth = 1 num_filters = None if height is not None and width is not None: - num_filters = size / (width * height) - assert num_filters * width * height == size, \ - "size=%s width=%s height=%s" % (size, width, height) + num_filters = size / (width * height * depth) + assert num_filters * width * height * depth == size, \ + "size=%s width=%s height=%s depth=%s" % (size, width, height, depth) return LayerOutput(name, LayerType.DATA, size=size, num_filters=num_filters) From d747c5d5119b7e564b9b7dcc7d7528ac91972712 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 31 Aug 2017 13:57:59 +0800 Subject: [PATCH 6/6] fix layers_test.py --- paddle/cuda/src/hl_cuda_cnn.cu | 5 +++-- paddle/parameter/Argument.h | 3 +++ .../paddle/trainer_config_helpers/tests/configs/file_list.sh | 2 +- python/paddle/trainer_config_helpers/tests/layers_test.py | 2 +- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index 95440c9446384..9ba3d14261753 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -1,8 +1,11 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -350,7 +353,6 @@ void hl_avgpool_backward(const int frameCnt, CHECK_SYNC("hl_avgpool_backward failed"); } -///////////////// __global__ void KeMaxPool3DForward(const int nthreads, const real* inputData, const int channels, @@ -777,7 +779,6 @@ void hl_avgpool3D_backward(const int frameCnt, outStride); CHECK_SYNC("hl_avgpool3D_backward failed"); } -///////////////// __global__ void KeBilinearInterpFw(const real* in, const size_t inImgH, diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index 7b59199dded5b..9ed63462b1a60 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -1,8 +1,11 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index 1ca5c8a07ebb7..e7dc08c6dd06a 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -9,6 +9,6 @@ test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer test_kmax_seq_socre_layer test_seq_select_layers test_scale_shift_layer -test_seq_slice_layer) +test_seq_slice_layer test_pooling3D_layer) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/layers_test.py b/python/paddle/trainer_config_helpers/tests/layers_test.py index 52218972bfd88..05902ea293df5 100644 --- a/python/paddle/trainer_config_helpers/tests/layers_test.py +++ b/python/paddle/trainer_config_helpers/tests/layers_test.py @@ -16,4 +16,4 @@ if __name__ == '__main__': parse_config_and_serialize( - 'trainer_config_helpers/tests/configs/test_pooling3D_layer.py', '') + 'trainer_config_helpers/tests/layers_test_config.py', '')
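
A note for reviewers on the output-shape arithmetic: parse_pool3d derives output_x, output_y and output_z with cnn_output_size, and the img_pool3d_layer docstring gives the corresponding ceil/floor formulas. The sketch below is a minimal stand-alone rendering of that formula, assuming cnn_output_size rounds up when ceil_mode is true; the helper name pool3d_output_size is illustrative only and is not part of this patch.

    import math


    def pool3d_output_size(img_size, pool_size, padding, stride, ceil_mode=True):
        # One spatial axis of the pooled output, matching the formulas in the
        # img_pool3d_layer docstring (ceil_mode selects ceil vs. floor rounding).
        rounding = math.ceil if ceil_mode else math.floor
        return 1 + int(rounding((img_size + 2 * padding - pool_size) / float(stride)))


    # pool_3d_1 from test_pooling3D_layer.py: depth=10, height=20, width=10,
    # channels=30, pool_size=5, stride=3, padding=1, ceil_mode=True (default).
    depth_out = pool3d_output_size(10, 5, 1, 3)    # 1 + ceil(7 / 3.0)  = 4
    height_out = pool3d_output_size(20, 5, 1, 3)   # 1 + ceil(17 / 3.0) = 7
    width_out = pool3d_output_size(10, 5, 1, 3)    # 1 + ceil(7 / 3.0)  = 4
    layer_size = 30 * depth_out * height_out * width_out   # 30 * 4 * 7 * 4 = 3360
    print(depth_out, height_out, width_out, layer_size)

The same arithmetic explains the data_layer change in PATCH 5: with depth=10, height=20, width=10 and size=60000, the inferred channel count is 60000 / (10 * 20 * 10) = 30.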
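The new matrix tests (testMaxPool3DFwdBwd, testAvgPool3DFwdBwd) compare the CPU and GPU paths against each other rather than against a third implementation. For readers who want an independent cross-check of the forward pass, here is a small NumPy reference under the same ceil-mode shape convention; the function name, the NCDHW layout assumption, and the requirement that every window overlaps the input are mine, not part of the patch.

    import numpy as np


    def max_pool3d_forward_ref(x, ksize, stride, padding):
        # Reference 3D max pooling over an NCDHW array (illustrative only).
        # Windows are clipped to the input; configurations where a window lies
        # entirely in the padding region are not handled.
        n, c, d, h, w = x.shape
        kd, kh, kw = ksize
        sd, sh, sw = stride
        pd, ph, pw = padding
        od = 1 + int(np.ceil((d + 2 * pd - kd) / float(sd)))
        oh = 1 + int(np.ceil((h + 2 * ph - kh) / float(sh)))
        ow = 1 + int(np.ceil((w + 2 * pw - kw) / float(sw)))
        out = np.empty((n, c, od, oh, ow), dtype=x.dtype)
        for zo in range(od):
            zs, ze = max(zo * sd - pd, 0), min(zo * sd - pd + kd, d)
            for yo in range(oh):
                ys, ye = max(yo * sh - ph, 0), min(yo * sh - ph + kh, h)
                for xo in range(ow):
                    xs, xe = max(xo * sw - pw, 0), min(xo * sw - pw + kw, w)
                    out[:, :, zo, yo, xo] = \
                        x[:, :, zs:ze, ys:ye, xs:xe].max(axis=(2, 3, 4))
        return out


    # Shapes mirror the ranges exercised in Pool3DFwdBwd: imgSize 9, window 3,
    # stride 2, padding 1.
    x = np.random.rand(2, 3, 9, 9, 9).astype('float32')
    y = max_pool3d_forward_ref(x, ksize=(3, 3, 3), stride=(2, 2, 2), padding=(1, 1, 1))
    print(y.shape)   # (2, 3, 5, 5, 5)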