@@ -93,30 +93,30 @@ template <typename T1,
9393__global__ void KernelScanInnerWithIndices (const T1* x_data,
9494 T1* values_data,
9595 T2* indices_data,
96- int num_rows,
97- int row_size,
96+ int64_t num_rows,
97+ int64_t row_size,
9898 T1 init,
9999 BinaryFunction binary_op) {
100100 __shared__ T1 vbuf[num_threads_y][2 * num_threads_x];
101101 __shared__ T2 ibuf[num_threads_y][2 * num_threads_x];
102102 T1* row_buf = vbuf[threadIdx .y ];
103103 T2* row_idx_buf = ibuf[threadIdx .y ];
104104
105- for (int block_row = blockIdx .x * blockDim .y ; block_row < num_rows;
105+ for (int64_t block_row = blockIdx .x * blockDim .y ; block_row < num_rows;
106106 block_row += blockDim .y * gridDim .x ) {
107- int row = block_row + threadIdx .y ;
107+ int64_t row = block_row + threadIdx .y ;
108108 const T1* row_self = x_data + row * row_size;
109109 T1* row_values = values_data + row * row_size;
110110 T2* row_indices = indices_data + row * row_size;
111111 T1 block_total = init;
112112 T2 block_idx_final = 0 ;
113113 // Perform scan on one block at a time, keeping track of the total value of
114114 // all blocks processed so far.
115- for (int block_col = 0 ; block_col < row_size;
115+ for (int64_t block_col = 0 ; block_col < row_size;
116116 block_col += 2 * num_threads_x) {
117117 // Load data into shared memory (two values per thread).
118- int col1 = block_col + threadIdx .x ;
119- int col2 = block_col + num_threads_x + threadIdx .x ;
118+ int64_t col1 = block_col + threadIdx .x ;
119+ int64_t col2 = block_col + num_threads_x + threadIdx .x ;
120120 if (row < num_rows) {
121121 if (col1 < row_size) {
122122 row_buf[threadIdx .x ] = *reinterpret_cast <const T1*>(&row_self[col1]);
0 commit comments