ARIMA: pre-allocation of temporary memory to reduce latencies #3895

Merged · 7 commits · Jun 1, 2021 · Changes from 6 commits
15 changes: 12 additions & 3 deletions cpp/bench/sg/arima_loglikelihood.cu
@@ -63,10 +63,13 @@ class ArimaLoglikelihood : public TsFixtureRandom<DataT> {

// Benchmark loop
this->loopOnState(state, [this]() {
+ ARIMAMemory<double> arima_mem(order, this->params.batch_size,
+                               this->params.n_obs, temp_mem);

// Evaluate log-likelihood
- batched_loglike(*this->handle, this->data.X, this->params.batch_size,
-                 this->params.n_obs, order, param, loglike, residual, true,
-                 false);
+ batched_loglike(*this->handle, arima_mem, this->data.X,
+                 this->params.batch_size, this->params.n_obs, order, param,
+                 loglike, residual, true, false);
});
}

@@ -86,6 +89,11 @@ class ArimaLoglikelihood : public TsFixtureRandom<DataT> {
this->params.batch_size * sizeof(DataT), stream);
residual = (DataT*)allocator->allocate(
this->params.batch_size * this->params.n_obs * sizeof(DataT), stream);

+ // Temporary memory
+ size_t temp_buf_size = ARIMAMemory<double>::compute_size(
+   order, this->params.batch_size, this->params.n_obs);
+ temp_mem = (char*)allocator->allocate(temp_buf_size, stream);
}

void deallocateBuffers(const ::benchmark::State& state) {
@@ -110,6 +118,7 @@ class ArimaLoglikelihood : public TsFixtureRandom<DataT> {
DataT* param;
DataT* loglike;
DataT* residual;
+ char* temp_mem;
};

std::vector<ArimaParams> getInputs() {
160 changes: 159 additions & 1 deletion cpp/include/cuml/tsa/arima_common.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -207,4 +207,162 @@ struct ARIMAParams {
}
};

/**
* Structure to manage ARIMA temporary memory allocations
* @note The user is expected to pass a preallocated buffer to the constructor,
* and ownership is not transferred to this struct! The buffer must remain
* allocated for as long as the object lives, and the caller deallocates it
* afterwards.
*/
template <typename T, int ALIGN = 256>
struct ARIMAMemory {
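// All pointer members below are views carved out of a single external
// buffer provided by the caller; the struct itself allocates nothing.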
T *params_mu, *params_ar, *params_ma, *params_sar, *params_sma,
*params_sigma2, *Tparams_mu, *Tparams_ar, *Tparams_ma, *Tparams_sar,
*Tparams_sma, *Tparams_sigma2, *d_params, *d_Tparams, *Z_dense, *R_dense,
*T_dense, *RQR_dense, *RQ_dense, *P_dense, *alpha_dense, *ImT_dense,
*ImT_inv_dense, *T_values, *v_tmp_dense, *m_tmp_dense, *K_dense, *TP_dense,
*vs, *y_diff, *loglike, *loglike_base, *loglike_pert, *x_pert, *F_buffer,
*sumLogF_buffer, *sigma2_buffer, *I_m_AxA_dense, *I_m_AxA_inv_dense,
*Ts_dense, *RQRs_dense, *Ps_dense;
T **Z_batches, **R_batches, **T_batches, **RQR_batches, **RQ_batches,
**P_batches, **alpha_batches, **ImT_batches, **ImT_inv_batches,
**v_tmp_batches, **m_tmp_batches, **K_batches, **TP_batches,
**I_m_AxA_batches, **I_m_AxA_inv_batches, **Ts_batches, **RQRs_batches,
**Ps_batches;
int *T_col_index, *T_row_index, *ImT_inv_P, *ImT_inv_info, *I_m_AxA_P,
*I_m_AxA_info;

size_t size;

protected:
char* buf;

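// Reserves room for n_elem elements of ValType, rounding the running
// offset up to a multiple of ALIGN so the next sub-buffer starts aligned
// (e.g. with ALIGN=256, 10 doubles = 80 bytes still advance size by 256)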
template <bool assign, typename ValType>
inline void append_buffer(ValType*& ptr, size_t n_elem) {
if (assign) {
ptr = reinterpret_cast<ValType*>(buf + size);
}
size += ((n_elem * sizeof(ValType) + ALIGN - 1) / ALIGN) * ALIGN;
}

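// Single layout routine used for two passes: with assign=false it only
// accumulates the total aligned size (for compute_size); with assign=true
// it also carves each pointer out of the caller-provided buffer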
template <bool assign>
inline void buf_offsets(const ARIMAOrder& order, int batch_size, int n_obs,
char* in_buf = nullptr) {
buf = in_buf;
size = 0;

int r = order.r();
int rd = order.rd();
int N = order.complexity();
int n_diff = order.n_diff();

append_buffer<assign>(params_mu, order.k * batch_size);
append_buffer<assign>(params_ar, order.p * batch_size);
append_buffer<assign>(params_ma, order.q * batch_size);
append_buffer<assign>(params_sar, order.P * batch_size);
append_buffer<assign>(params_sma, order.Q * batch_size);
append_buffer<assign>(params_sigma2, batch_size);

append_buffer<assign>(Tparams_mu, order.k * batch_size);
append_buffer<assign>(Tparams_ar, order.p * batch_size);
append_buffer<assign>(Tparams_ma, order.q * batch_size);
append_buffer<assign>(Tparams_sar, order.P * batch_size);
append_buffer<assign>(Tparams_sma, order.Q * batch_size);
append_buffer<assign>(Tparams_sigma2, batch_size);

append_buffer<assign>(d_params, N * batch_size);
append_buffer<assign>(d_Tparams, N * batch_size);
append_buffer<assign>(Z_dense, rd * batch_size);
append_buffer<assign>(Z_batches, batch_size);
append_buffer<assign>(R_dense, rd * batch_size);
append_buffer<assign>(R_batches, batch_size);
append_buffer<assign>(T_dense, rd * rd * batch_size);
append_buffer<assign>(T_batches, batch_size);
append_buffer<assign>(RQ_dense, rd * batch_size);
append_buffer<assign>(RQ_batches, batch_size);
append_buffer<assign>(RQR_dense, rd * rd * batch_size);
append_buffer<assign>(RQR_batches, batch_size);
append_buffer<assign>(P_dense, rd * rd * batch_size);
append_buffer<assign>(P_batches, batch_size);
append_buffer<assign>(alpha_dense, rd * batch_size);
append_buffer<assign>(alpha_batches, batch_size);
append_buffer<assign>(ImT_dense, r * r * batch_size);
append_buffer<assign>(ImT_batches, batch_size);
append_buffer<assign>(ImT_inv_dense, r * r * batch_size);
append_buffer<assign>(ImT_inv_batches, batch_size);
append_buffer<assign>(ImT_inv_P, r * batch_size);
append_buffer<assign>(ImT_inv_info, batch_size);
append_buffer<assign>(T_values, rd * rd * batch_size);
append_buffer<assign>(T_col_index, rd * rd);
append_buffer<assign>(T_row_index, rd + 1);
append_buffer<assign>(v_tmp_dense, rd * batch_size);
append_buffer<assign>(v_tmp_batches, batch_size);
append_buffer<assign>(m_tmp_dense, rd * rd * batch_size);
append_buffer<assign>(m_tmp_batches, batch_size);
append_buffer<assign>(K_dense, rd * batch_size);
append_buffer<assign>(K_batches, batch_size);
append_buffer<assign>(TP_dense, rd * rd * batch_size);
append_buffer<assign>(TP_batches, batch_size);
append_buffer<assign>(F_buffer, n_obs * batch_size);
append_buffer<assign>(sumLogF_buffer, batch_size);
append_buffer<assign>(sigma2_buffer, batch_size);

append_buffer<assign>(vs, n_obs * batch_size);
append_buffer<assign>(y_diff, n_obs * batch_size);
append_buffer<assign>(loglike, batch_size);
append_buffer<assign>(loglike_base, batch_size);
append_buffer<assign>(loglike_pert, batch_size);
append_buffer<assign>(x_pert, N * batch_size);

if (n_diff > 0) {
append_buffer<assign>(Ts_dense, r * r * batch_size);
append_buffer<assign>(Ts_batches, batch_size);
append_buffer<assign>(RQRs_dense, r * r * batch_size);
append_buffer<assign>(RQRs_batches, batch_size);
append_buffer<assign>(Ps_dense, r * r * batch_size);
append_buffer<assign>(Ps_batches, batch_size);
}

if (r <= 5) {
// Note: temp mem for the direct Lyapunov solver grows very quickly
// (its dense buffers scale as r^4 per series)! This solver is used
// iff the condition above is satisfied
append_buffer<assign>(I_m_AxA_dense, r * r * r * r * batch_size);
append_buffer<assign>(I_m_AxA_batches, batch_size);
append_buffer<assign>(I_m_AxA_inv_dense, r * r * r * r * batch_size);
append_buffer<assign>(I_m_AxA_inv_batches, batch_size);
append_buffer<assign>(I_m_AxA_P, r * r * batch_size);
append_buffer<assign>(I_m_AxA_info, batch_size);
}
}

/** Protected constructor: runs the layout pass only to compute the
 * required size, without assigning pointers */
ARIMAMemory(const ARIMAOrder& order, int batch_size, int n_obs) {
buf_offsets<false>(order, batch_size, n_obs);
}

public:
/** Constructor to create pointers from buffer
* @param[in] order ARIMA order
* @param[in] batch_size Number of series in the batch
* @param[in] n_obs Length of the series
* @param[in] in_buf Pointer to the temporary memory buffer.
* Ownership is retained by the caller
*/
ARIMAMemory(const ARIMAOrder& order, int batch_size, int n_obs,
char* in_buf) {
buf_offsets<true>(order, batch_size, n_obs, in_buf);
}

/** Static method to get the size of the required buffer allocation
* @param[in] order ARIMA order
* @param[in] batch_size Number of series in the batch
* @param[in] n_obs Length of the series
* @return Buffer size in bytes
*/
static size_t compute_size(const ARIMAOrder& order, int batch_size,
int n_obs) {
ARIMAMemory temp(order, batch_size, n_obs);
return temp.size;
}
};

} // namespace ML
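For reference, a minimal usage sketch of the new API, mirroring the benchmark change above (the allocator/stream handling and variable names such as order, batch_size, n_obs, d_y are illustrative assumptions, not part of this diff):

// Query the required size and allocate the temporary buffer once.
size_t temp_buf_size =
  ARIMAMemory<double>::compute_size(order, batch_size, n_obs);
char* temp_mem = (char*)allocator->allocate(temp_buf_size, stream);

// Constructing ARIMAMemory is cheap: it only computes pointer offsets
// into temp_mem, so it can sit inside a hot loop.
ARIMAMemory<double> arima_mem(order, batch_size, n_obs, temp_mem);
batched_loglike(handle, arima_mem, d_y, batch_size, n_obs, order, d_params,
                loglike, d_vs);

// The struct never owns temp_mem: the caller deallocates it afterwards
// (deallocate signature assumed to mirror the allocator used above).
allocator->deallocate(temp_mem, temp_buf_size, stream);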
52 changes: 31 additions & 21 deletions cpp/include/cuml/tsa/batched_arima.hpp
@@ -68,6 +68,7 @@ void batched_diff(raft::handle_t& handle, double* d_y_diff, const double* d_y,
* in a batched context.
*
* @param[in] handle cuML handle
+ * @param[in] arima_mem Pre-allocated temporary memory
* @param[in] d_y Series to fit: shape = (n_obs, batch_size) and
* expects column major data layout. (device)
* @param[in] batch_size Number of time series
@@ -91,13 +92,14 @@ void batched_diff(raft::handle_t& handle, double* d_y_diff, const double* d_y,
* @param[out] d_lower Lower limit of the prediction interval
* @param[out] d_upper Upper limit of the prediction interval
*/
- void batched_loglike(raft::handle_t& handle, const double* d_y, int batch_size,
-                      int n_obs, const ARIMAOrder& order, const double* d_params,
-                      double* loglike, double* d_vs, bool trans = true,
-                      bool host_loglike = true, LoglikeMethod method = MLE,
-                      int truncate = 0, int fc_steps = 0, double* d_fc = nullptr,
-                      double level = 0, double* d_lower = nullptr,
-                      double* d_upper = nullptr);
+ void batched_loglike(raft::handle_t& handle,
+                      const ARIMAMemory<double>& arima_mem, const double* d_y,
+                      int batch_size, int n_obs, const ARIMAOrder& order,
+                      const double* d_params, double* loglike, double* d_vs,
+                      bool trans = true, bool host_loglike = true,
+                      LoglikeMethod method = MLE, int truncate = 0,
+                      int fc_steps = 0, double* d_fc = nullptr, double level = 0,
+                      double* d_lower = nullptr, double* d_upper = nullptr);

/**
* Compute the loglikelihood of the given parameter on the given time series
@@ -107,6 +109,7 @@ void batched_loglike(raft::handle_t& handle, const double* d_y, int batch_size,
* to avoid useless packing / unpacking
*
* @param[in] handle cuML handle
+ * @param[in] arima_mem Pre-allocated temporary memory
* @param[in] d_y Series to fit: shape = (n_obs, batch_size) and
* expects column major data layout. (device)
* @param[in] batch_size Number of time series
@@ -129,8 +132,9 @@ void batched_loglike(raft::handle_t& handle, const double* d_y, int batch_size,
* @param[out] d_lower Lower limit of the prediction interval
* @param[out] d_upper Upper limit of the prediction interval
*/
- void batched_loglike(raft::handle_t& handle, const double* d_y, int batch_size,
-                      int n_obs, const ARIMAOrder& order,
+ void batched_loglike(raft::handle_t& handle,
+                      const ARIMAMemory<double>& arima_mem, const double* d_y,
+                      int batch_size, int n_obs, const ARIMAOrder& order,
const ARIMAParams<double>& params, double* loglike,
double* d_vs, bool trans = true, bool host_loglike = true,
LoglikeMethod method = MLE, int truncate = 0,
@@ -141,6 +145,7 @@ void batched_loglike(raft::handle_t& handle, const double* d_y, int batch_size,
* Compute the gradient of the log-likelihood
*
* @param[in] handle cuML handle
+ * @param[in] arima_mem Pre-allocated temporary memory
* @param[in] d_y Series to fit: shape = (n_obs, batch_size) and
* expects column major data layout. (device)
* @param[in] batch_size Number of time series
@@ -154,17 +159,19 @@ void batched_loglike(raft::handle_t& handle, const double* d_y, int batch_size,
* @param[in] truncate For CSS, start the sum-of-squares after a given
* number of observations
*/
- void batched_loglike_grad(raft::handle_t& handle, const double* d_y,
-                           int batch_size, int n_obs, const ARIMAOrder& order,
-                           const double* d_x, double* d_grad, double h,
-                           bool trans = true, LoglikeMethod method = MLE,
-                           int truncate = 0);
+ void batched_loglike_grad(raft::handle_t& handle,
+                           const ARIMAMemory<double>& arima_mem,
+                           const double* d_y, int batch_size, int n_obs,
+                           const ARIMAOrder& order, const double* d_x,
+                           double* d_grad, double h, bool trans = true,
+                           LoglikeMethod method = MLE, int truncate = 0);

/**
* Batched in-sample and out-of-sample prediction of a time-series given all
* the model parameters
*
* @param[in] handle cuML handle
+ * @param[in] arima_mem Pre-allocated temporary memory
* @param[in] d_y Batched Time series to predict.
* Shape: (num_samples, batch size) (device)
* @param[in] batch_size Total number of batched time series
@@ -181,16 +188,17 @@ void batched_loglike_grad(raft::handle_t& handle, const double* d_y,
* @param[out] d_lower Lower limit of the prediction interval
* @param[out] d_upper Upper limit of the prediction interval
*/
- void predict(raft::handle_t& handle, const double* d_y, int batch_size,
-              int n_obs, int start, int end, const ARIMAOrder& order,
-              const ARIMAParams<double>& params, double* d_y_p,
-              bool pre_diff = true, double level = 0, double* d_lower = nullptr,
-              double* d_upper = nullptr);
+ void predict(raft::handle_t& handle, const ARIMAMemory<double>& arima_mem,
+              const double* d_y, int batch_size, int n_obs, int start, int end,
+              const ARIMAOrder& order, const ARIMAParams<double>& params,
+              double* d_y_p, bool pre_diff = true, double level = 0,
+              double* d_lower = nullptr, double* d_upper = nullptr);

/**
* Compute an information criterion (AIC, AICc, BIC)
*
* @param[in] handle cuML handle
+ * @param[in] arima_mem Pre-allocated temporary memory
* @param[in] d_y Series to fit: shape = (n_obs, batch_size) and
* expects column major data layout. (device)
* @param[in] batch_size Total number of batched time series
@@ -203,8 +211,10 @@ void predict(raft::handle_t& handle, const double* d_y, int batch_size,
* @param[in] ic_type Type of information criterion wanted.
* 0: AIC, 1: AICc, 2: BIC
*/
- void information_criterion(raft::handle_t& handle, const double* d_y,
-                            int batch_size, int n_obs, const ARIMAOrder& order,
+ void information_criterion(raft::handle_t& handle,
+                            const ARIMAMemory<double>& arima_mem,
+                            const double* d_y, int batch_size, int n_obs,
+                            const ARIMAOrder& order,
const ARIMAParams<double>& params, double* ic,
int ic_type);

21 changes: 12 additions & 9 deletions cpp/include/cuml/tsa/batched_kalman.hpp
@@ -29,6 +29,7 @@ namespace ML {
* provide the resulting prediction as well as loglikelihood fit.
*
* @param[in] handle cuML handle
+ * @param[in] arima_mem Pre-allocated temporary memory
* @param[in] d_ys_b Batched time series
* Shape (nobs, batch_size) (col-major, device)
* @param[in] nobs Number of samples per time series
@@ -46,20 +47,20 @@ namespace ML {
* @param[out] d_lower Lower limit of the prediction interval
* @param[out] d_upper Upper limit of the prediction interval
*/
- void batched_kalman_filter(raft::handle_t& handle, const double* d_ys_b,
-                            int nobs, const ARIMAParams<double>& params,
-                            const ARIMAOrder& order, int batch_size,
-                            double* d_loglike, double* d_vs, int fc_steps = 0,
-                            double* d_fc = nullptr, double level = 0,
-                            double* d_lower = nullptr,
-                            double* d_upper = nullptr);
+ void batched_kalman_filter(
+   raft::handle_t& handle, const ARIMAMemory<double>& arima_mem,
+   const double* d_ys_b, int nobs, const ARIMAParams<double>& params,
+   const ARIMAOrder& order, int batch_size, double* d_loglike, double* d_vs,
+   int fc_steps = 0, double* d_fc = nullptr, double level = 0,
+   double* d_lower = nullptr, double* d_upper = nullptr);

/**
* Convenience function for batched "jones transform" used in ARIMA to ensure
* certain properties of the AR and MA parameters (takes host array and
* returns host array)
*
* @param[in] handle cuML handle
+ * @param[in] arima_mem Pre-allocated temporary memory
* @param[in] order ARIMA hyper-parameters
* @param[in] batch_size Number of time series analyzed.
* @param[in] isInv Do the inverse transform?
@@ -68,7 +69,9 @@ void batched_kalman_filter(raft::handle_t& handle, const double* d_ys_b,
* (expects pre-allocated array of size
* (p+q)*batch_size) (host)
*/
- void batched_jones_transform(raft::handle_t& handle, const ARIMAOrder& order,
-                              int batch_size, bool isInv, const double* h_params,
+ void batched_jones_transform(raft::handle_t& handle,
+                              const ARIMAMemory<double>& arima_mem,
+                              const ARIMAOrder& order, int batch_size,
+                              bool isInv, const double* h_params,
double* h_Tparams);
} // namespace ML