Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix data loader int32 overflow #818

Merged
merged 3 commits into the base branch
Oct 9, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions ppfleetx/data/data_tools/cpp/fast_index_map_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ void build_blending_indices(
}
}

py::array build_sample_idx(const py::array_t<int32_t> &sizes_,
const py::array_t<int32_t> &doc_idx_,
py::array build_sample_idx(const py::array_t<int64_t> &sizes_,
const py::array_t<int64_t> &doc_idx_,
const int32_t seq_length, const int32_t num_epochs,
const int64_t tokens_per_epoch) {
/* Sample index (sample_idx) is used for gpt2 like dataset for which
Expand All @@ -110,7 +110,7 @@ py::array build_sample_idx(const py::array_t<int32_t> &sizes_,

// Mapping and it's length (1D).
int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length;
int32_t *sample_idx = new int32_t[2 * (num_samples + 1)];
int64_t *sample_idx = new int64_t[2 * (num_samples + 1)];

cout << " using:" << endl << std::flush;
cout << " number of documents: " << doc_idx_.shape(0) / num_epochs
Expand All @@ -128,15 +128,15 @@ py::array build_sample_idx(const py::array_t<int32_t> &sizes_,
// Index into doc_idx.
int64_t doc_idx_index = 0;
// Begining offset for each document.
int32_t doc_offset = 0;
int64_t doc_offset = 0;
// Start with first document and no offset.
sample_idx[2 * sample_index] = doc_idx_index;
sample_idx[2 * sample_index + 1] = doc_offset;
++sample_index;

while (sample_index <= num_samples) {
// Start with a fresh sequence.
int32_t remaining_seq_length = seq_length + 1;
int64_t remaining_seq_length = seq_length + 1;
while (remaining_seq_length != 0) {
// Get the document length.
auto doc_id = doc_idx[doc_idx_index];
Expand Down Expand Up @@ -164,12 +164,12 @@ py::array build_sample_idx(const py::array_t<int32_t> &sizes_,

// Method to deallocate memory.
py::capsule free_when_done(sample_idx, [](void *mem_) {
int32_t *mem = reinterpret_cast<int32_t *>(mem_);
int64_t *mem = reinterpret_cast<int64_t *>(mem_);
delete[] mem;
});

// Return the numpy array.
const auto byte_size = sizeof(int32_t);
const auto byte_size = sizeof(int64_t);
return py::array(std::vector<int64_t>{num_samples + 1, 2}, // shape
{2 * byte_size, byte_size}, // C-style contiguous strides
sample_idx, // the data pointer
Expand Down