From aa83a1bec3cf11b1202089e488c50501403b961d Mon Sep 17 00:00:00 2001
From: haohongxiang <haohongxiang@baidu.com>
Date: Mon, 12 Jul 2021 15:06:12 +0800
Subject: [PATCH 1/2] Fix gather_op by adding OurOfRangeCheck for param[Index],
 test=develop

---
 paddle/fluid/operators/gather.cu.h | 36 ++++++++++++++++++++++++++----
 paddle/fluid/operators/gather.h    | 22 +++++++++++++++++-
 2 files changed, 53 insertions(+), 5 deletions(-)
diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h
index 95cb428abdf34..2647ecff936b3 100644
--- a/paddle/fluid/operators/gather.cu.h
+++ b/paddle/fluid/operators/gather.cu.h
@@ -30,13 +30,21 @@ using platform::DeviceContext;
 
 template <typename T, typename IndexT = int>
 __global__ void GatherCUDAKernel(const T* params, const IndexT* indices,
-                                 T* output, size_t index_size,
-                                 size_t slice_size) {
+                                 T* output, size_t input_size,
+                                 size_t index_size, size_t slice_size,
+                                 size_t end_size) {
   CUDA_KERNEL_LOOP(i, index_size * slice_size) {
     int indices_i = i / slice_size;
     int slice_i = i - indices_i * slice_size;  // offset inside the slice
     IndexT gather_i = indices[indices_i];
     IndexT params_i = gather_i * slice_size + slice_i;
+    PADDLE_ENFORCE(
+        gather_i >= 0 && gather_i < input_size,
+        "The index is out of bounds, "
+        "please check whether the dimensions of index and "
+        "input meet the requirements. It should "
+        "be less than [%d] and greater than or equal to 0, but received [%d]",
+        input_size, gather_i);
     *(output + i) = *(params + params_i);
   }
 }
@@ -58,7 +66,7 @@ __global__ void GatherNdCUDAKernel(const T* input, const int* input_dims,
           "The index is out of bounds, "
           "please check whether the dimensions of index and "
           "input meet the requirements. It should "
-          "be less than [%d] and greater or equal to 0, but received [%d]",
+          "be less than [%d] and greater than or equal to 0, but received [%d]",
           input_dims[j], index_value);
       gather_i += (index_value * temp);
       temp *= input_dims[j];
@@ -91,15 +99,26 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
                           " the second dimension should be 1."));
   }
 
+  const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
+  auto cplace = platform::CPUPlace();
+  auto index_dims = index.dims();
+  auto index_dims_size = index_dims.size();
+  auto input_dims = src.dims();
+  auto input_dims_size = input_dims.size();
+
   int index_size = index.dims()[0];
 
   auto src_dims = src.dims();
   framework::DDim output_dims(src_dims);
   output_dims[0] = index_size;
 
+  // final dim
+  int64_t end_size = index_dims[index_dims_size - 1];
   // slice size
   int slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+  // input size
+  int input_size = src_dims[0] * slice_size;
 
   const T* p_src = src.data<T>();
   const IndexT* p_index = index.data<IndexT>();
@@ -112,7 +131,7 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
   GatherCUDAKernel<T, IndexT><<<
       grid, block, 0,
       reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      p_src, p_index, p_output, index_size, slice_size);
+      p_src, p_index, p_output, input_size, index_size, slice_size, end_size);
 }
 
 template <typename DeviceContext, typename T, typename IndexT = int>
@@ -177,6 +196,15 @@ __global__ void GatherGPUKernel(const T* input, const U* index, T* out,
     int next_idx = idx - outer_size * inner_dim_index;
     int index_dim_index = next_idx / outer_dim_size;
     int index_val = index[index_dim_index];
+
+    PADDLE_ENFORCE(
+        index_val >= 0 && index_val < input_index_dim_size,
+        "The index is out of bounds, "
+        "please check whether the dimensions of index and "
+        "input meet the requirements. It should "
+        "be less than [%d] and greater than or equal to 0, but received [%d]",
+        input_index_dim_size, index_val);
+
     int out_dim_index = next_idx - outer_dim_size * index_dim_index;
     int input_index =
         inner_dim_index * (outer_dim_size * input_index_dim_size) +
diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h
index 8deab709220d7..b7fa4022882b7 100644
--- a/paddle/fluid/operators/gather.h
+++ b/paddle/fluid/operators/gather.h
@@ -67,11 +67,25 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
   // slice size
   int slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+  // input size
+  int input_size = src_dims[0] * slice_size;
 
   const size_t slice_bytes = slice_size * sizeof(T);
 
   for (int64_t i = 0; i < index_size; ++i) {
     IndexT index_ = p_index[i];
+    PADDLE_ENFORCE_LT(p_index[i], input_size,
+                      platform::errors::OutOfRange(
+                          "The element of Index must be less than the size of "
+                          "input dim size of axis which is %d, but received "
+                          "index element which is %d in the %d index.",
+                          input_size, p_index[i], i));
+    PADDLE_ENFORCE_GE(p_index[i], 0UL,
+                      platform::errors::OutOfRange(
+                          "The element of Index must be greater than or equal "
+                          "to 0, but received index element which is %d in the "
+                          "%d index.",
+                          p_index[i], i));
     memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
   }
 }
@@ -141,11 +155,17 @@ void GatherV2Function(const Tensor* input, const Tensor* index, int axis,
   int input_index_dim_size = input_dim[axis_index];
   for (int i = 0; i < index_size; i++) {
     PADDLE_ENFORCE_LT(index_data[i], input_index_dim_size,
-                      platform::errors::InvalidArgument(
+                      platform::errors::OutOfRange(
                           "The element of Index must be less than the size of "
                           "input dim size of axis which is %d, but received "
                           "index element which is %d in the %d index.",
                           input_index_dim_size, index_data[i], i));
+    PADDLE_ENFORCE_GE(index_data[i], 0UL,
+                      platform::errors::OutOfRange(
+                          "The element of Index must be greater than or equal "
+                          "to 0, but received index element which is %d in the "
+                          "%d index.",
+                          index_data[i], i));
   }
 
   int inner_dim_size = 1;

From 3bba806f469e7a35868ed10733429d3c14e2c169 Mon Sep 17 00:00:00 2001
From: haohongxiang <haohongxiang@baidu.com>
Date: Tue, 13 Jul 2021 10:40:33 +0800
Subject: [PATCH 2/2] Code Optimization, test=develop

---
 paddle/fluid/operators/gather.cu.h | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h
index 2647ecff936b3..6469307bc5652 100644
--- a/paddle/fluid/operators/gather.cu.h
+++ b/paddle/fluid/operators/gather.cu.h
@@ -31,8 +31,7 @@ using platform::DeviceContext;
 template <typename T, typename IndexT = int>
 __global__ void GatherCUDAKernel(const T* params, const IndexT* indices,
                                  T* output, size_t input_size,
-                                 size_t index_size, size_t slice_size,
-                                 size_t end_size) {
+                                 size_t index_size, size_t slice_size) {
   CUDA_KERNEL_LOOP(i, index_size * slice_size) {
     int indices_i = i / slice_size;
     int slice_i = i - indices_i * slice_size;  // offset inside the slice
@@ -99,21 +98,13 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
                           " the second dimension should be 1."));
   }
 
-  const auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace());
-  auto cplace = platform::CPUPlace();
-  auto index_dims = index.dims();
-  auto index_dims_size = index_dims.size();
-  auto input_dims = src.dims();
-  auto input_dims_size = input_dims.size();
-
+  // index size
   int index_size = index.dims()[0];
 
   auto src_dims = src.dims();
   framework::DDim output_dims(src_dims);
   output_dims[0] = index_size;
 
-  // final dim
-  int64_t end_size = index_dims[index_dims_size - 1];
   // slice size
   int slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
@@ -131,7 +122,7 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
   GatherCUDAKernel<T, IndexT><<<
       grid, block, 0,
       reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
-      p_src, p_index, p_output, input_size, index_size, slice_size, end_size);
+      p_src, p_index, p_output, input_size, index_size, slice_size);
 }
 
 template <typename DeviceContext, typename T, typename IndexT = int>