[WIP]: index_sample grad basic kernel realization. Needs to be optimized.
PaddlePaddle · Jan 14, 2021 · 866dd28 · 866dd28
1 parent b04cc91
commit 866dd28
Showing 1 changed file with 17 additions and 6 deletions.
diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu
@@ -40,17 +40,25 @@ __global__ void index_kernel(const IndexT* p_index, const T* p_input,
 }
 
 template <typename T, typename IndexT = int>
-__global__ void index_kernel_grad(const IndexT* p_index, const T* p_input,
-                                  T* p_output, size_t stride_index,
+__global__ void index_kernel_grad(const IndexT* p_index, T* p_input,
+                                  const T* p_output, size_t stride_index,
                                   size_t stride_input, size_t height) {
+  extern __shared__ T s_buf[];
   int ix = blockDim.x * blockIdx.x + threadIdx.x;
   int iy = blockDim.y * blockIdx.y + threadIdx.y;
   int tid = iy * stride_index + ix;
   int tid_y = iy * stride_input + ix;
+  s_buf[tid_y] = p_input[tid_y];
+  s_buf[tid_y] = 0;
 
   if (ix < stride_index & iy < height) {
-    IndexT idx = p_index[tid];
-    p_output[tid_y - ix + idx] += p_input[tid];
+    for (int i = 0; i < stride_index; ++i) {
+      if (ix == i) {
+        IndexT idx = p_index[tid];
+        s_buf[tid_y - ix + idx] += p_output[tid];
+      }
+    }
+    p_input[tid_y] = s_buf[tid_y];
   }
 }
 
@@ -178,6 +186,7 @@ class IndexSampleGradCUDAKernel : public framework::OpKernel<T> {
     auto stream =
         ctx.template device_context<platform::CUDADeviceContext>().stream();
 
+    auto input_num = input_grad->numel();
     auto input_dim = input_grad->dims();
     auto index_dim = index->dims();
     size_t batch_size = index_dim[0];
@@ -194,12 +203,14 @@ class IndexSampleGradCUDAKernel : public framework::OpKernel<T> {
 
     if (index_type == framework::proto::VarType::INT64) {
       const int64_t* index_data = index->data<int64_t>();
-      index_kernel_grad<T, int64_t><<<grid_dim, block_dim, 0, stream>>>(
+      index_kernel_grad<
+          T, int64_t><<<grid_dim, block_dim, input_num * sizeof(T), stream>>>(
           index_data, output_grad_data, input_grad_data, index_length,
           input_length, batch_size);
     } else if (index_type == framework::proto::VarType::INT32) {
       const int* index_data = index->data<int>();
-      index_kernel_grad<T, int><<<grid_dim, block_dim, 0, stream>>>(
+      index_kernel_grad<
+          T, int><<<grid_dim, block_dim, input_num * sizeof(T), stream>>>(
           index_data, output_grad_data, input_grad_data, index_length,
           input_length, batch_size);
     }