Print time duration for each PCA step (#32)

oap-project · Mar 5, 2021 · e1c33d9
1 parent 3252ae9
commit e1c33d9
Showing 1 changed file with 17 additions and 4 deletions.
diff --git a/mllib-dal/src/main/native/PCADALImpl.cpp b/mllib-dal/src/main/native/PCADALImpl.cpp
@@ -41,6 +41,8 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL
   int nThreadsNew = services::Environment::getInstance()->getNumberOfThreads();
   cout << "oneDAL (native): Number of threads used: " << nThreadsNew << endl;
 
+  auto t1 = std::chrono::high_resolution_clock::now();
+
   pca::Distributed<step1Local, algorithmFPType, pca::svdDense> localAlgorithm;
 
   /* Set the input data set to the algorithm */
@@ -49,6 +51,12 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL
   /* Compute PCA decomposition */
   localAlgorithm.compute();
 
+  auto t2 = std::chrono::high_resolution_clock::now();
+  auto duration = std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count();
+  std::cout << "PCA (native): local step took " << duration << " secs" << std::endl;
+
+  t1 = std::chrono::high_resolution_clock::now();
+
   /* Serialize partial results required by step 2 */
   services::SharedPtr<byte> serializedData;
   InputDataArchive dataArch;
@@ -60,26 +68,31 @@ JNIEXPORT jlong JNICALL Java_org_apache_spark_ml_feature_PCADALImpl_cPCATrainDAL
   byte* nodeResults = new byte[perNodeArchLength];
   dataArch.copyArchiveToArray(nodeResults, perNodeArchLength);
 
+  t2 = std::chrono::high_resolution_clock::now();
+
+  duration = std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count();
+  std::cout << "PCA (native): serializing partial results took " << duration << " secs" << std::endl;
+
   vector<size_t> recv_counts(comm_size * perNodeArchLength);
   for (int i = 0; i < comm_size; i++) recv_counts[i] = perNodeArchLength;
 
   cout << "PCA (native): ccl_allgatherv receiving " << perNodeArchLength * nBlocks << " bytes" << endl;
 
-  auto t1 = std::chrono::high_resolution_clock::now();
+  t1 = std::chrono::high_resolution_clock::now();
 
   /* Transfer partial results to step 2 on the root node */
   // MPI_Gather(nodeResults, perNodeArchLength, MPI_CHAR, serializedData.get(),
   // perNodeArchLength, MPI_CHAR, ccl_root, MPI_COMM_WORLD);
   ccl::allgatherv(nodeResults, perNodeArchLength, serializedData.get(), recv_counts,
                   ccl::datatype::uint8, comm).wait();
 
-  auto t2 = std::chrono::high_resolution_clock::now();
+  t2 = std::chrono::high_resolution_clock::now();
 
-  auto duration = std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count();
+  duration = std::chrono::duration_cast<std::chrono::seconds>( t2 - t1 ).count();
   std::cout << "PCA (native): ccl_allgatherv took " << duration << " secs" << std::endl;
 
   if (rankId == ccl_root) {
-    auto t1 = std::chrono::high_resolution_clock::now();        
+    auto t1 = std::chrono::high_resolution_clock::now();
 
     /* Create an algorithm for principal component analysis using the svdDense method
      * on the master node */