Skip to content

Commit c0d2715

Browse files
authored
cummax cummin (#73111)
1 parent ed70d23 commit c0d2715

File tree

1 file changed

+7
-7
lines changed

1 file changed

+7
-7
lines changed

paddle/phi/kernels/gpu/cum_maxmin_kernel.cu

Lines changed: 7 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -93,30 +93,30 @@ template <typename T1,
9393
__global__ void KernelScanInnerWithIndices(const T1* x_data,
9494
T1* values_data,
9595
T2* indices_data,
96-
int num_rows,
97-
int row_size,
96+
int64_t num_rows,
97+
int64_t row_size,
9898
T1 init,
9999
BinaryFunction binary_op) {
100100
__shared__ T1 vbuf[num_threads_y][2 * num_threads_x];
101101
__shared__ T2 ibuf[num_threads_y][2 * num_threads_x];
102102
T1* row_buf = vbuf[threadIdx.y];
103103
T2* row_idx_buf = ibuf[threadIdx.y];
104104

105-
for (int block_row = blockIdx.x * blockDim.y; block_row < num_rows;
105+
for (int64_t block_row = blockIdx.x * blockDim.y; block_row < num_rows;
106106
block_row += blockDim.y * gridDim.x) {
107-
int row = block_row + threadIdx.y;
107+
int64_t row = block_row + threadIdx.y;
108108
const T1* row_self = x_data + row * row_size;
109109
T1* row_values = values_data + row * row_size;
110110
T2* row_indices = indices_data + row * row_size;
111111
T1 block_total = init;
112112
T2 block_idx_final = 0;
113113
// Perform scan on one block at a time, keeping track of the total value of
114114
// all blocks processed so far.
115-
for (int block_col = 0; block_col < row_size;
115+
for (int64_t block_col = 0; block_col < row_size;
116116
block_col += 2 * num_threads_x) {
117117
// Load data into shared memory (two values per thread).
118-
int col1 = block_col + threadIdx.x;
119-
int col2 = block_col + num_threads_x + threadIdx.x;
118+
int64_t col1 = block_col + threadIdx.x;
119+
int64_t col2 = block_col + num_threads_x + threadIdx.x;
120120
if (row < num_rows) {
121121
if (col1 < row_size) {
122122
row_buf[threadIdx.x] = *reinterpret_cast<const T1*>(&row_self[col1]);

0 commit comments

Comments (0)