From cd7d69af58a6a4196b6e2c8f5255ca7b4b642f83 Mon Sep 17 00:00:00 2001
From: Connor Goggins
Date: Tue, 3 Mar 2020 22:29:45 -0800
Subject: [PATCH] [Large Tensor] Backport of Fixed RNN op (#17632)

* Changed relevant function args to index_t

* Added nightly test for RNN

* Added fix for LSTM, GRU, RNN-ReLU, RNN-tanh

* Using const instead of literals

* Added nightly test for RNN ReLU & tanh, LSTM, GRU

* Type assertion to force evaluation of output NDArray

* Incorporated latest round of comments
---
 src/operator/rnn-inl.h            |  40 +--
 src/operator/rnn_impl.h           | 408 +++++++++++++++---------------
 tests/nightly/test_large_array.py |  36 ++-
 3 files changed, 259 insertions(+), 225 deletions(-)

diff --git a/src/operator/rnn-inl.h b/src/operator/rnn-inl.h
index 557c1117739a..d41b5b4f030b 100644
--- a/src/operator/rnn-inl.h
+++ b/src/operator/rnn-inl.h
@@ -63,7 +63,7 @@ struct RNNParam : public dmlc::Parameter {
   bool bidirectional, state_outputs;
   int mode;
   float p;
-  int seq_length_, batch_size_, input_size_;
+  index_t seq_length_, batch_size_, input_size_;
   bool use_sequence_length;
   dmlc::optional projection_size;
@@ -122,8 +122,8 @@
   }
 };

-inline int GetRnnParamSize(int num_layer,
-                           int input_size,
+inline index_t GetRnnParamSize(int num_layer,
+                               index_t input_size,
                            int state_size,
                            int direction,
                            int mode,
@@ -140,14 +140,14 @@ inline int GetRnnParamSize(int num_layer,
       size *= 3;
       break;
   }
-  int size1 = (input_size + state_size + 2) * size;  // first layer size
-  int size2 = (state_size * direction + state_size + 2) * size;  // other layers size
+  index_t size1 = (input_size + state_size + 2) * size;  // first layer size
+  index_t size2 = (state_size * direction + state_size + 2) * size;  // other layers size
   if (projection_size.has_value()) {
-    int proj_size = projection_size.value();
+    index_t proj_size = projection_size.value();
     size1 = (input_size + proj_size + 2) * size;
     size2 = (proj_size * direction + proj_size + 2) * size;
   }
-  int param_size = size1 + (num_layer - 1) * size2;
+  index_t param_size = size1 + (num_layer - 1) * size2;
   if (projection_size.has_value()) {
     param_size += projection_size.value() * state_size * num_layer * direction;
   }
@@ -182,8 +182,8 @@ inline int GetRnnBiasSize(int num_layer,
 * - output -> h[t](, c[t] additionally with Lstm) time by time(sz: NxH(x2))
 * - intermediate y[1...T] as next layer's inputs(sz: TxNxHxD)
 */
-inline size_t GetRNNWorkspaceSize(int seq_length,
-                                  int batch_size,
+inline size_t GetRNNWorkspaceSize(index_t seq_length,
+                                  index_t batch_size,
                                   int hidden_size,
                                   int projection_size,
                                   int direction,
@@ -214,8 +214,8 @@ inline size_t GetRNNWorkspaceSize(int seq_length,

 inline size_t GetRNNReserveSpaceSize(int num_layer,
                                      int direction,
-                                     int seq_length,
-                                     int batch_size,
+                                     index_t seq_length,
+                                     index_t batch_size,
                                      int hidden_size,
                                      int mode) {
   size_t size = 0;
@@ -279,9 +279,9 @@ void RNNForwardTraining(DType* ws,
                         bool state_outputs,
                         const int num_layers,
                         const int direction,
-                        const int seq_length,
-                        const int batch_size,
-                        const int input_size,
+                        const index_t seq_length,
+                        const index_t batch_size,
+                        const index_t input_size,
                         const int state_size,
                         DType* x_ptr,
                         DType* hx_ptr,
@@ -321,9 +321,9 @@ void RNNForwardInference(DType* ws,
                          bool state_outputs,
                          const int num_layers,
                          const int direction,
-                         const int seq_length,
-                         const int batch_size,
-                         const int input_size,
+                         const index_t seq_length,
+                         const index_t batch_size,
+                         const index_t input_size,
                          const int state_size,
                          const int projection_size,
                          DType* x_ptr,
@@ -363,9 +363,9 @@
void RNNBackward(DType* ws, DType* rs, const int num_layers, const int direction, - const int seq_length, - const int batch_size, - const int input_size, + const index_t seq_length, + const index_t batch_size, + const index_t input_size, const int state_size, DType* x_ptr, DType* hx_ptr, diff --git a/src/operator/rnn_impl.h b/src/operator/rnn_impl.h index 008ba7d315c6..459345797936 100644 --- a/src/operator/rnn_impl.h +++ b/src/operator/rnn_impl.h @@ -59,9 +59,9 @@ void LstmForwardTrainingSingleLayer(DType* ws, DType* rs, bool state_outputs, bool bid, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -88,17 +88,17 @@ void LstmForwardTrainingSingleLayer(DType* ws, const int offset = bid ? H : 0; const DType alpha = 1.0; const DType beta = 0.0; - const int cell_size = N * H; + const index_t cell_size = N * H; linalg_gemm(x, wx, yx_flat, alpha, beta, false, true); const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - for (int i = 0; i < T; ++i) { - int t = bid ? T - 1 - i : i; + for (index_t i = 0; i < T; ++i) { + index_t t = bid ? T - 1 - i : i; linalg_gemm(i ? h : hx, wh, yh_flat, alpha, beta, false, true); #pragma omp parallel for num_threads(omp_threads) - for (int jk = 0; jk < cell_size; ++jk) { - int j = jk / H; - int k = jk % H; + for (index_t jk = 0; jk < cell_size; ++jk) { + index_t j = jk / H; + index_t k = jk % H; DType it = sigmoid(yx[t][j][0][k] + yh[j][0][k] + bx[0][k] + bh[0][k]); DType ft = sigmoid(yx[t][j][1][k] + yh[j][1][k] + bx[1][k] + bh[1][k]); DType gt = tanh(yx[t][j][2][k] + yh[j][2][k] + bx[2][k] + bh[2][k]); @@ -127,9 +127,9 @@ void LstmForwardTraining(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -145,16 +145,16 @@ void LstmForwardTraining(DType* ws, const int total_layers = D * L; Tensor hx(hx_ptr, Shape3(total_layers, N, H)); Tensor cx(cx_ptr, Shape3(total_layers, N, H)); - const int b_size = 2 * H * 4; - const int r_size = D * T * N * H * 6; - const int y_offset = T * N * H * 5; - const int cell_size = N * H; + const index_t b_size = 2 * H * 4; + const index_t r_size = D * T * N * H * 6; + const index_t y_offset = T * N * H * 5; + const index_t cell_size = N * H; unsigned int seed_ = 17 + rand() % 4096; // NOLINT(runtime/threadsafe_fn) int idx = 0; // state & cell state's idx; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int i = 0; i < L; ++i) { - const int input_size = i ? H * D : I; - const int w_size = (input_size + H) * H * 4; + const index_t input_size = i ? 
H * D : I; + const index_t w_size = (input_size + H) * H * 4; Tensor x(x_ptr, Shape2(T * N, input_size)); Tensor y(rs2 + y_offset, Shape3(T, N, H * D)); LstmForwardTrainingSingleLayer(ws, rs2, state_outputs, false, T, N, input_size, H, x, @@ -175,7 +175,7 @@ void LstmForwardTraining(DType* ws, b_ptr += b_size; if (dropout > 0.0f) { #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < T * N * H * D; j++) { + for (index_t j = 0; j < T * N * H * D; j++) { int rand_data = rand_r(&seed_); if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { dropout_random[i * T * N * H * D + j] = 0; @@ -196,7 +196,7 @@ void LstmForwardTraining(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { y_ptr[i] = (rs2 + y_offset)[i]; } } @@ -205,9 +205,9 @@ template void LstmForwardInferenceSingleLayer(DType* ws, bool state_outputs, bool bid, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const int P, const Tensor &x, @@ -237,19 +237,19 @@ void LstmForwardInferenceSingleLayer(DType* ws, const int proj_offset = bid ? P : 0; const DType alpha = 1.0; const DType beta = 0.0; - const int cell_size = N * H; + const index_t cell_size = N * H; linalg_gemm(x, wx, yx_flat, alpha, beta, false, true); const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); - for (int i = 0; i < T; ++i) { - int t = bid ? T - 1 - i : i; + for (index_t i = 0; i < T; ++i) { + index_t t = bid ? T - 1 - i : i; if (P > 0) { linalg_gemm(i ? r : hx, wh, yh_flat, alpha, beta, false, true); } else { linalg_gemm(i ? h : hx, wh, yh_flat, alpha, beta, false, true); } #pragma omp parallel for num_threads(omp_threads) - for (int jk = 0; jk < cell_size; ++jk) { + for (index_t jk = 0; jk < cell_size; ++jk) { int j = jk / H; int k = jk % H; DType it = sigmoid(yx[t][j][0][k] + yh[j][0][k] + bx[0][k] + bh[0][k]); @@ -282,9 +282,9 @@ void LstmForwardInference(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const int P, DType* x_ptr, @@ -298,16 +298,16 @@ void LstmForwardInference(DType* ws, const int total_layers = D * L; Tensor hx(hx_ptr, Shape3(total_layers, N, P ? P : H)); Tensor cx(cx_ptr, Shape3(total_layers, N, H)); - const int b_size = 2 * H * 4; - const int cell_size = N * H; - const int projection_size = (P ? P : H) * N; + const index_t b_size = 2 * H * 4; + const index_t cell_size = N * H; + const index_t projection_size = (P ? P : H) * N; DType* y_tmp_ptr = ws + (T + 1) * cell_size * 4 + cell_size * 2; DType* y_cur_ptr = y_ptr; int idx = 0; // state & cell state's idx; bool flag = L % 2 ? false : true; for (int i = 0; i < L; ++i) { - const int input_size = i ? (P ? P : H) * D : I; - int w_size = (input_size + (P ? P : H)) * H * 4; + const index_t input_size = i ? (P ? P : H) * D : I; + index_t w_size = (input_size + (P ? 
P : H)) * H * 4; if (P > 0) { w_size += P * H; } @@ -351,9 +351,9 @@ void LstmBackwardSingleLayer(DType* ws, DType* rs, DType* tmp_buf, bool bid, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -403,41 +403,41 @@ void LstmBackwardSingleLayer(DType* ws, const DType beta0 = 0.0; const DType beta1 = 1.0; const DType beta2 = 2.0; - const int cell_size = N * H; + const index_t cell_size = N * H; if (dhy_ptr != nullptr) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dh.dptr_[i] = dhy_ptr[i]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dh.dptr_[i] = 0; } } if (dcy_ptr != nullptr) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dc.dptr_[i] = dcy_ptr[i]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < cell_size; ++i) { + for (index_t i = 0; i < cell_size; ++i) { dc.dptr_[i] = 0; } } - for (int i = T - 1; i >= 0; --i) { - int t = bid ? T - 1 - i : i; - int tnext = bid ? t + 1 : t - 1; + for (index_t i = T - 1; i >= 0; --i) { + index_t t = bid ? T - 1 - i : i; + index_t tnext = bid ? t + 1 : t - 1; const Tensor& dhnext = i ? dh : dhx; const Tensor& dcnext = i ? dc : dcx; const Tensor& hnext = i ? htmp : hx; const Tensor& cnext = i ? c[i - 1] : cx; #pragma omp parallel for num_threads(omp_threads) - for (int jk = 0; jk < cell_size; ++jk) { - int j = jk / H; - int k = jk % H; + for (index_t jk = 0; jk < cell_size; ++jk) { + index_t j = jk / H; + index_t k = jk % H; DType tc = tanh(c[i][j][k]); DType it = ifgo[i][j][k][0]; DType ft = ifgo[i][j][k][1]; @@ -480,13 +480,13 @@ void LstmBackwardSingleLayer(DType* ws, if (req_params != kNullOp && req_params != kAddTo) { linalg_gemm(dyx, x, dwx, alpha, beta0, true, false); } - const int row = T * N; - const int col = H * 4; + const index_t row = T * N; + const index_t col = H * 4; if (req_params != kNullOp) { if (req_params != kAddTo) { - for (int i = 0; i < row; ++i) { + for (index_t i = 0; i < row; ++i) { #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < col; ++j) { + for (index_t j = 0; j < col; ++j) { dbx[j] += dyx[i][j]; dbh[j] = dbx[j]; } @@ -495,20 +495,20 @@ void LstmBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf, Shape2(col, T)); const Tensor tmp_dbh(tmp_buf + col * T, Shape2(col, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < col * T; ++i) { + for (index_t i = 0; i < col * T; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < col; ++j) { - for (int i = 0; i < N; ++i) { + for (index_t j = 0; j < col; ++j) { + for (index_t i = 0; i < N; ++i) { tmp_dbx[j][t] += dyx[t * N + i][j]; tmp_dbh[j][t] = tmp_dbx[j][t]; } } #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < col; ++j) { + for (index_t j = 0; j < col; ++j) { dbx[j] += tmp_dbx[j][t] + dbx[j]; dbh[j] += tmp_dbh[j][t] + dbh[j]; } @@ -522,9 +522,9 @@ void LstmBackward(DType* ws, DType* rs, const int L, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ 
-553,16 +553,16 @@ void LstmBackward(DType* ws, Tensor cx(cx_ptr, Shape3(total_layers, N, H)); Tensor dhx(dhx_ptr, Shape3(total_layers, N, H)); Tensor dcx(dcx_ptr, Shape3(total_layers, N, H)); - const int b_size = 2 * H * 4; - const int r_size = D * T * N * H * 6; - const int y_offset = T * N * H * 5; - const int w_size1 = (I + H) * H * 4; // first layer - const int w_size2 = (D * H + H) * H * 4; // other layers - const int cell_size = N * H; + const index_t b_size = 2 * H * 4; + const index_t r_size = D * T * N * H * 6; + const index_t y_offset = T * N * H * 5; + const index_t w_size1 = (I + H) * H * 4; // first layer + const index_t w_size2 = (D * H + H) * H * 4; // other layers + const index_t cell_size = N * H; DType* dy_tmp_ptr = ws2 + T * cell_size * 4 + cell_size * 3; for (int i = L - 1; i >= 0; --i) { - const int input_size = i ? H * D : I; - const int w_size = i ? w_size2 : w_size1; + const index_t input_size = i ? H * D : I; + const index_t w_size = i ? w_size2 : w_size1; int idx = i * D; DType* w_cur_ptr = i ? w_ptr + (w_size1 + (i - 1) * w_size2) * D : w_ptr; DType* dw_cur_ptr = i ? dw_ptr + (w_size1 + (i - 1) * w_size2) * D : dw_ptr; @@ -594,7 +594,7 @@ void LstmBackward(DType* ws, dropout_random = dropout_random - T * N * D * H; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); #pragma omp parallel for num_threads(omp_threads) - for (int j = 0; j < T * N * D * H; j++) { + for (index_t j = 0; j < T * N * D * H; j++) { if (dropout_random[j] == 0) { dx.dptr_[j] = 0; } else { @@ -611,9 +611,9 @@ void GruForwardInferenceSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -650,13 +650,13 @@ void GruForwardInferenceSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -674,7 +674,7 @@ void GruForwardInferenceSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[3 * H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -688,11 +688,11 @@ void GruForwardInferenceSingleLayer(DType* ws, } gemmC1_t = gemmC1 + t * N * 3 * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + bx[0][j] + bh[0][j]); zt[i * H + j] = sigmoid(gemmC1_t[ztb + j] + gemmC2[ztb + j] @@ -715,11 +715,11 @@ void GruForwardInferenceSingleLayer(DType* ws, linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; 
++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + back_bx[0][j] + back_bh[0][j]); zt[i * H + j] = sigmoid(gemmC1_t[ztb + j] + @@ -739,7 +739,7 @@ void GruForwardInferenceSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -747,7 +747,7 @@ void GruForwardInferenceSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -761,9 +761,9 @@ void GruForwardInference(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -814,9 +814,9 @@ void GruForwardTrainingSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -862,13 +862,13 @@ void GruForwardTrainingSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -887,7 +887,7 @@ void GruForwardTrainingSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[3 * H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -905,11 +905,11 @@ void GruForwardTrainingSingleLayer(DType* ws, gemmC1_t = gemmC1 + t * N * 3 * H; DType* Mnht = Mnh + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; Mnht[i * H + j] = gemmC2[ntb + j] + bh[2][j]; rt[i * H + j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + bx[0][j] + bh[0][j]); @@ -937,11 +937,11 @@ void GruForwardTrainingSingleLayer(DType* ws, DType* back_Mnht = back_Mnh + (T - 1 - t) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int rtb = i * 3 * H; - int ztb = i * 3 * H + H; - int ntb = i * 3 * H + 2 * H; + index_t rtb = i * 3 * H; + index_t ztb = i * 3 * H + H; + index_t ntb = i * 3 * H + 2 * H; back_Mnht[i * H + j] = gemmC2[ntb + j] + back_bh[2][j]; rt[i * H + 
j] = sigmoid(gemmC1_t[rtb + j] + gemmC2[rtb + j] + back_bx[0][j] + back_bh[0][j]); @@ -963,7 +963,7 @@ void GruForwardTrainingSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -971,7 +971,7 @@ void GruForwardTrainingSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -986,9 +986,9 @@ void GruForwardTraining(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1025,7 +1025,7 @@ void GruForwardTraining(DType* ws, if (dropout > 0.0f && l > 0) { const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { + for (index_t i = 0; i < T * N * I; i++) { int rand_data = rand_r(&seed_); if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { dropout_random[(l - 1) * T * N * I + i] = 0; @@ -1057,7 +1057,7 @@ void GruForwardTraining(DType* ws, } const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { y_ptr[i] = y_l[i]; } } @@ -1066,9 +1066,9 @@ template void GruBackwardSingleLayer(DType* ws, DType* tmp_buf, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -1134,7 +1134,7 @@ void GruBackwardSingleLayer(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { dht1[i] = dhy_ptr[i]; } else { @@ -1143,7 +1143,7 @@ void GruBackwardSingleLayer(DType* ws, } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + j] = hx[i][j]; } @@ -1151,7 +1151,7 @@ void GruBackwardSingleLayer(DType* ws, if (D == 2) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { back_dht1[i] = dhy_ptr[N * H + i]; } else { @@ -1159,13 +1159,13 @@ void GruBackwardSingleLayer(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + H + j] = hx[N + i][j]; } } } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { if (t) { ht1 = y_ptr + (t - 1) * N * D * H; } else { @@ -1175,7 +1175,7 @@ void GruBackwardSingleLayer(DType* ws, dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { dht1[i * H + j] += dyt[i * D * H + j]; } @@ -1188,7 +1188,7 @@ void GruBackwardSingleLayer(DType* ws, dat = da + t * N * 3 * H; dart = dar + t * N * 3 * H; 
#pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { int nid = i * 3 * H + 2 * H + j; int zid = i * 3 * H + H + j; @@ -1234,7 +1234,7 @@ void GruBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { dbx[i] += da[j * 3 * H + i]; dbh[i] += dar[j * 3 * H + i]; } @@ -1243,15 +1243,15 @@ void GruBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H * 3, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + 3 * H * T, Shape2(H * 3, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T * 3; ++i) { + for (index_t i = 0; i < H * T * 3; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N; ++j) { + for (index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += da[t * N * 3 * H + j * 3 * H + i]; tmp_dbh[i][t] += dar[t * N * 3 * H + j * 3 * H + i]; } @@ -1281,7 +1281,7 @@ void GruBackwardSingleLayer(DType* ws, } if (D == 2) { - for (int t = 0; t < T; ++t) { + for (index_t t = 0; t < T; ++t) { if (t == T-1) { back_ht1 = hx_; } else { @@ -1291,7 +1291,7 @@ void GruBackwardSingleLayer(DType* ws, // add dy[T, N, D, H] to dhy[D, N, H] dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { back_dht1[i * H + j] += dyt[i * D * H + H + j]; } @@ -1305,12 +1305,12 @@ void GruBackwardSingleLayer(DType* ws, dart = dar + t * N * 3 * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int nid = i * 3 * H + 2 * H + j; - int zid = i * 3 * H + H + j; - int rid = i * 3 * H + j; - int id = i * H + j; + index_t nid = i * 3 * H + 2 * H + j; + index_t zid = i * 3 * H + H + j; + index_t rid = i * 3 * H + j; + index_t id = i * H + j; dat[nid] = back_dht1[id] * (1 - zt[id]) * (1 - nt[id] * nt[id]); dart[zid] = dat[zid] = back_dht1[id] * (back_ht1[i * D * H + H + j] - nt[id]) * zt[id] * (1 - zt[id]); @@ -1352,7 +1352,7 @@ void GruBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { back_dbx[i] += da[j * 3 * H + i]; back_dbh[i] += dar[j * 3 * H + i]; } @@ -1361,14 +1361,14 @@ void GruBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H * 3, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + 3 * H * T, Shape2(H * 3, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T * 3; ++i) { + for (index_t i = 0; i < H * T * 3; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < 3 * H; ++i) { - for (int j = 0; j < N; ++j) { + for (index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += da[t * N * 3 * H + j * 3 * H + i]; tmp_dbh[i][t] += dar[t * N * 3 * H + j * 3 * H + i]; } @@ -1399,7 +1399,7 @@ void GruBackwardSingleLayer(DType* ws, } if (req_state != kNullOp) { #pragma omp 
parallel for num_threads(omp_threads) - for (int i = 0; i < N * H * D; ++i) { + for (index_t i = 0; i < N * H * D; ++i) { dhx[i] = dht1[i]; } } @@ -1410,9 +1410,9 @@ void GruBackward(DType* ws, DType* rs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1464,7 +1464,7 @@ void GruBackward(DType* ws, DType* dhx_l = dhx_ptr + (L - 1) * D * N * H; DType* dy_l = dy_ptr; Tensor hx(hx_ptr, Shape3(L, D * N, H)); - int inputsize = I; + index_t inputsize = I; DType* y_tmp = y_l - T * N * H * D; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int l = L - 1; l >= 0; --l) { @@ -1483,7 +1483,7 @@ void GruBackward(DType* ws, if (dropout > 0.0f && l > 0 && req_data != kNullOp) { dropout_random = dropout_random - T * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { + for (index_t i = 0; i < T * N * I; i++) { if (dropout_random[i] == 0) { dx_l[i] = 0; } else { @@ -1493,7 +1493,7 @@ void GruBackward(DType* ws, } if (l > 0) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { dy_l[i] = dx_l[i]; } gateR_l = gateR_l - T * D * N * H; @@ -1527,9 +1527,9 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -1564,13 +1564,13 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -1588,7 +1588,7 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -1602,9 +1602,9 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, } gemmC1_t = gemmC1 + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + bx[0][j] + gemmC2[tb + j] + bh[0][j]); @@ -1626,9 +1626,9 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { back_ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + back_bx[0][j] + gemmC2[tb + j] + back_bh[0][j]); @@ -1647,7 +1647,7 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for 
num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -1655,7 +1655,7 @@ void VanillaRNNForwardInferenceSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -1669,9 +1669,9 @@ void VanillaRNNForwardInference(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1724,9 +1724,9 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, DType* tmp_buf, bool state_outputs, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -1765,13 +1765,13 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); if (D == 1) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * H + j] = hx[i][j]; } } else { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { y_ptr[i * D * H + j] = hx[i][j]; back_ht_1[i * D * H + j] = hx[N + i][j]; @@ -1790,7 +1790,7 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, linalg_gemm(x, back_wx, dback_gemmC1, alpha, beta, false, true); } - for (int t = 0; t < T; t++) { + for (index_t t = 0; t < T; t++) { // perform the first direction, X * wx and H * wh for each step // ht-1 * wh, ht-1:[N, H] wh:[H, H] Tensor dht_1(ht_1, Shape2(N, D * H)); @@ -1805,9 +1805,9 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, nt = gateN + t * N * H; gemmC1_t = gemmC1 + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { nt[tb + j] = ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + bx[0][j] + gemmC2[tb + j] + bh[0][j]); @@ -1829,9 +1829,9 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, dback_ht_1_tmp = reshape(dback_ht_1.T(), Shape3(D, H, N)); linalg_gemm(dback_ht_1_tmp[1], back_wh, dgemmC2, alpha, beta, true, true); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int tb = i * H; + index_t tb = i * H; if (mode == 1) { nt[tb + j] = back_ht[i * D * H + j] = tanh(gemmC1_t[tb + j] + back_bx[0][j] + gemmC2[tb + j] + back_bh[0][j]); @@ -1851,7 +1851,7 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, if (D == 1) { DType* y_start = y_ptr + (T - 1) * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * H + j]; } @@ -1859,7 +1859,7 @@ void VanillaRNNForwardTrainingSingleLayer(DType* ws, DType* y_start = y_ptr + (T - 1) * N * H * D; DType* y_back_start = y_ptr + H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; i++) + for (index_t i = 0; i < N; i++) for (int 
j = 0; j < H; j++) { hy_ptr[i * H + j] = y_start[i * D * H + j]; hy_ptr[N * H + i * H + j] = y_back_start[i * D * H + j]; @@ -1874,9 +1874,9 @@ void VanillaRNNForwardTraining(DType* ws, bool state_outputs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -1911,7 +1911,7 @@ void VanillaRNNForwardTraining(DType* ws, } if (dropout > 0.0f && l > 0) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { + for (index_t i = 0; i < T * N * I; i++) { int rand_data = rand_r(&seed_); if (static_cast(rand_data % 1000) < static_cast(1000 * dropout)) { dropout_random[(l - 1) * T * N * I + i] = 0; @@ -1939,7 +1939,7 @@ void VanillaRNNForwardTraining(DType* ws, wh_l = wx_l + I * H; } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * H * D; ++i) { + for (index_t i = 0; i < T * N * H * D; ++i) { y_ptr[i] = y_l[i]; } } @@ -1948,9 +1948,9 @@ template void VanillaRNNBackwardSingleLayer(DType* ws, DType* tmp_buf, const int D, - const int T, - const int N, - const int I, + const index_t T, + const index_t N, + const index_t I, const int H, const Tensor &x, const Tensor &hx, @@ -2008,7 +2008,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { dht1[i] = dhy_ptr[i]; } else { @@ -2017,7 +2017,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + j] = hx[i][j]; } @@ -2025,7 +2025,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, if (D == 2) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H; ++i) { + for (index_t i = 0; i < N * H; ++i) { if (dhy_ptr) { back_dht1[i] = dhy_ptr[N * H + i]; } else { @@ -2033,13 +2033,13 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } } #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { hx_[i * D * H + H + j] = hx[N + i][j]; } } } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { if (t) { ht1 = y_ptr + (t - 1) * N * D * H; } else { @@ -2049,7 +2049,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { dht1[i * H + j] += dyt[i * D * H + j]; } @@ -2058,9 +2058,9 @@ void VanillaRNNBackwardSingleLayer(DType* ws, nt = gateN + t * N * H; dart = dar + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int id = i * H + j; + index_t id = i * H + j; if (mode == 1) { dart[id] = dht1[id] * (1 - nt[id] * nt[id]); } else { @@ -2099,7 +2099,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { dbx[i] += dar[j * H + i]; dbh[i] = dbx[i]; } @@ -2108,15 +2108,15 @@ void VanillaRNNBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + H * 
T, Shape2(H, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T; ++i) { + for (index_t i = 0; i < H * T; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N; ++j) { + for (index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += dar[t * N * H + j * H + i]; tmp_dbh[i][t] = tmp_dbx[i][t]; } @@ -2146,7 +2146,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } if (D == 2) { - for (int t = 0; t < T; ++t) { + for (index_t t = 0; t < T; ++t) { if (t == T-1) { back_ht1 = hx_; } else { @@ -2156,7 +2156,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, // add dy[T, N, D, H] to dhy[D, N, H] dyt = dy_ptr + t * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { back_dht1[i * H + j] += dyt[i * D * H + H + j]; } @@ -2166,9 +2166,9 @@ void VanillaRNNBackwardSingleLayer(DType* ws, dart = dar + t * N * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N; ++i) { + for (index_t i = 0; i < N; ++i) { for (int j = 0; j < H; ++j) { - int id = i * H + j; + index_t id = i * H + j; if (mode == 1) { dart[id] = back_dht1[id] * (1 - nt[id] * nt[id]); } else { @@ -2208,7 +2208,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, if (req_params != kAddTo) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N * T; ++j) { + for (index_t j = 0; j < N * T; ++j) { back_dbx[i] += dar[j * H + i]; back_dbh[i] = back_dbx[i]; } @@ -2217,15 +2217,15 @@ void VanillaRNNBackwardSingleLayer(DType* ws, const Tensor tmp_dbx(tmp_buf + T * N * D * H, Shape2(H, T)); const Tensor tmp_dbh(tmp_buf + T * N * D * H + H * T, Shape2(H, T)); #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < H * T; ++i) { + for (index_t i = 0; i < H * T; ++i) { tmp_dbx.dptr_[i] = 0; tmp_dbh.dptr_[i] = 0; } - for (int t = T - 1; t >= 0; --t) { + for (index_t t = T - 1; t >= 0; --t) { #pragma omp parallel for num_threads(omp_threads) for (int i = 0; i < H; ++i) { - for (int j = 0; j < N; ++j) { + for (index_t j = 0; j < N; ++j) { tmp_dbx[i][t] += dar[t * N * H + j * H + i]; tmp_dbh[i][t] = tmp_dbx[i][t]; } @@ -2256,7 +2256,7 @@ void VanillaRNNBackwardSingleLayer(DType* ws, } if (req_state != kNullOp) { #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < N * H * D; ++i) { + for (index_t i = 0; i < N * H * D; ++i) { dhx[i] = dht1[i]; } } @@ -2267,9 +2267,9 @@ void VanillaRNNBackward(DType* ws, DType* rs, const int L, const int D, - const int T, - const int N, - int I, + const index_t T, + const index_t N, + index_t I, const int H, DType* x_ptr, DType* hx_ptr, @@ -2319,7 +2319,7 @@ void VanillaRNNBackward(DType* ws, DType* dhx_l = dhx_ptr + (L - 1) * D * N * H; DType* dy_l = dy_ptr; Tensor hx(hx_ptr, Shape3(L, D * N, H)); - int inputsize = I; + index_t inputsize = I; DType* y_tmp = y_l - T * N * H * D; const int omp_threads = mxnet::engine::OpenMP::Get()->GetRecommendedOMPThreadCount(); for (int l = L - 1; l >= 0; --l) { @@ -2338,7 +2338,7 @@ void VanillaRNNBackward(DType* ws, if (dropout > 0.0f && l > 0 && req_data != kNullOp) { dropout_random = dropout_random - T * N * D * H; #pragma omp parallel for num_threads(omp_threads) - for (int i = 0; i < T * N * I; i++) { + for (index_t i = 0; i < T * N * I; i++) { if (dropout_random[i] == 0) { dx_l[i] = 
0;
       } else {
@@ -2348,7 +2348,7 @@ void VanillaRNNBackward(DType* ws,
   }
   if (l > 0) {
 #pragma omp parallel for num_threads(omp_threads)
-      for (int i = 0; i < T * N * H * D; ++i) {
+      for (index_t i = 0; i < T * N * H * D; ++i) {
         dy_l[i] = dx_l[i];
       }
       gateN_l = gateN_l - T * D * N * H;
diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py
index 1528bc02dce6..39fee72290b5 100644
--- a/tests/nightly/test_large_array.py
+++ b/tests/nightly/test_large_array.py
@@ -39,6 +39,7 @@
 SMALL_Y = 50
 LARGE_SIZE = LARGE_X * SMALL_Y
 LARGE_TENSOR_SHAPE = 2**32
+RNN_LARGE_TENSOR = 2**28


 def test_nn():
@@ -479,7 +480,6 @@ def check_embedding():
         assert out.shape[0] == LARGE_TENSOR_SHAPE
         assert out.shape[1] == 1
-        assert out.shape[2] == 1

     def check_spatial_transformer():
         data = nd.random_normal(shape=(2, 2**29, 1, 6))
@@ -504,6 +504,39 @@ def check_ravel():
         assert out.shape[0] == LARGE_TENSOR_SHAPE

+    def check_rnn():
+        data = nd.random_normal(shape=(RNN_LARGE_TENSOR, 4, 4))
+        parameters_relu_tanh = nd.random_normal(shape=(7,))
+        parameters_lstm = nd.random_normal(shape=(28,))
+        parameters_gru = nd.random_normal(shape=(21,))
+        state = nd.random_normal(shape=(1, 4, 1))
+        state_cell = nd.random_normal(shape=(1, 4, 1))
+        mode_relu = 'rnn_relu'
+        mode_tanh = 'rnn_tanh'
+        mode_lstm = 'lstm'
+        mode_gru = 'gru'
+        state_size = 1
+        num_layers = 1
+
+        out_relu = nd.RNN(data=data, parameters=parameters_relu_tanh, state=state, mode=mode_relu,
+                          state_size=state_size, num_layers=num_layers)
+
+        out_tanh = nd.RNN(data=data, parameters=parameters_relu_tanh, state=state, mode=mode_tanh,
+                          state_size=state_size, num_layers=num_layers)
+
+        out_lstm = nd.RNN(data=data, parameters=parameters_lstm, state=state, mode=mode_lstm,
+                          state_cell=state_cell, state_size=state_size, num_layers=num_layers)
+
+        out_gru = nd.RNN(data=data, parameters=parameters_gru, state=state, mode=mode_gru,
+                         state_size=state_size, num_layers=num_layers)
+
+        for out in [out_relu, out_tanh, out_lstm, out_gru]:
+            assert out.shape[0] == RNN_LARGE_TENSOR
+            assert out.shape[1] == 4
+            assert out.shape[2] == 1
+
+            assert type(out[0, 0, 0].asscalar()).__name__ == 'float32'
+
     check_gluon_embedding()
     check_fully_connected()
     check_dense()
@@ -527,6 +560,7 @@ def check_ravel():
     check_embedding()
     check_spatial_transformer()
     check_ravel()
+    check_rnn()


 def test_tensor():
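
Reviewer note (illustrative only, not part of the patch): the motivation for widening the seq_length/batch_size/input_size arguments from int to index_t is that the element counts involved can exceed the signed 32-bit range. A minimal sketch of the arithmetic, using the same shape the new check_rnn() test feeds to the operator:

    import numpy as np

    # Shape used by check_rnn(): (RNN_LARGE_TENSOR, 4, 4) with RNN_LARGE_TENSOR = 2**28.
    seq_length, batch_size, input_size = 2**28, 4, 4
    n_elements = seq_length * batch_size * input_size   # 2**32 elements in the data tensor

    # A signed 32-bit index cannot address this many elements, which is why the
    # size computations and loop counters in the hunks above were widened to index_t
    # (a 64-bit type when MXNet is built with int64 tensor support).
    assert n_elements == 2**32
    assert n_elements > np.iinfo(np.int32).max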
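
Reviewer note (illustrative only, not part of the patch): the final asscalar() assertion in check_rnn() is what the "Type assertion to force evaluation of output NDArray" bullet refers to. MXNet executes NDArray operations asynchronously, so copying one element back to a Python scalar blocks until the RNN output has actually been computed, making the nightly test fail loudly if the large-tensor forward pass is broken. A small sketch of the same pattern on a hypothetical toy array:

    from mxnet import nd

    out = nd.ones((2, 3, 4), dtype='float32')  # stands in for the RNN output NDArray
    val = out[0, 0, 0].asscalar()              # copy one element to the host; blocks until
                                               # the asynchronous computation has finished
    assert type(val).__name__ == 'float32'     # same dtype check as the nightly test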