[ML-279] Fix CPU_GPU_PROFILE build being unable to run DAAL on CPU (#280)

* Fix CPU_GPU_PROFILE jar being unable to run DAAL on CPU
  Signed-off-by: minmingzhu <minming.zhu@intel.com>
* update
  Signed-off-by: minmingzhu <minming.zhu@intel.com>
* update
  Signed-off-by: minmingzhu <minming.zhu@intel.com>
* Update ci-tests.yml
* update
  Signed-off-by: minmingzhu <minming.zhu@intel.com>
* update
  Signed-off-by: minmingzhu <minming.zhu@intel.com>
* update
  Signed-off-by: minmingzhu <minming.zhu@intel.com>
* update
* retrigger checks
---------
Signed-off-by: minmingzhu <minming.zhu@intel.com>
oap-project · Apr 28, 2023 · 4e1ce4d · 4e1ce4d
1 parent bbd9fc2
commit 4e1ce4d
Show file tree

Hide file tree

Showing 8 changed files with 95 additions and 101 deletions.
diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
@@ -74,7 +74,7 @@ jobs:
         run: |
           ${{github.workspace}}/dev/ci/ci-yarn-test.sh
   standalone-test:
-    name: Standalone Test for Examples (CPU)
+    name: Standalone CPU_GPU_PROFILE Test for Examples (CPU)
     runs-on: ubuntu-20.04
     steps:
       - uses: actions/checkout@v3

diff --git a/dev/ci/ci-standalone-test.sh b/dev/ci/ci-standalone-test.sh
@@ -17,7 +17,7 @@ source /opt/intel/oneapi/setvars.sh --ccl-configuration=cpu
 # Prepare lib resources
 cd $GITHUB_WORKSPACE/mllib-dal
 ../dev/prepare-build-deps.sh
-./build.sh -p CPU_ONLY_PROFILE -q
+./build.sh -p CPU_GPU_PROFILE -q
 
 # Setup cluster
 source $GITHUB_WORKSPACE/dev/test-cluster/standalone/setup-cluster.sh

diff --git a/mllib-dal/src/main/native/CorrelationImpl.cpp b/mllib-dal/src/main/native/CorrelationImpl.cpp
@@ -27,14 +27,12 @@
 
 using namespace std;
 #ifdef CPU_GPU_PROFILE
-using namespace oneapi::dal;
-#else
+namespace covariance_gpu = oneapi::dal::covariance;
+#endif
 using namespace daal;
-using namespace daal::algorithms;
 using namespace daal::services;
-#endif
+namespace covariance_cpu = daal::algorithms::covariance;
 
-#ifdef CPU_ONLY_PROFILE
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
 static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
@@ -46,10 +44,10 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
 
     const bool isRoot = (rankId == ccl_root);
 
-    covariance::Distributed<step1Local, algorithmFPType> localAlgorithm;
+    covariance_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm;
 
     /* Set the input data set to the algorithm */
-    localAlgorithm.input.set(covariance::data, pData);
+    localAlgorithm.input.set(covariance_cpu::data, pData);
 
     /* Compute covariance */
     localAlgorithm.compute();
@@ -89,35 +87,36 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
     if (isRoot) {
         auto t1 = std::chrono::high_resolution_clock::now();
         /* Create an algorithm to compute covariance on the master node */
-        covariance::Distributed<step2Master, algorithmFPType> masterAlgorithm;
+        covariance_cpu::Distributed<step2Master, algorithmFPType>
+            masterAlgorithm;
 
         for (size_t i = 0; i < nBlocks; i++) {
             /* Deserialize partial results from step 1 */
             OutputDataArchive dataArch(serializedData.get() +
                                            perNodeArchLength * i,
                                        perNodeArchLength);
 
-            covariance::PartialResultPtr dataForStep2FromStep1(
-                new covariance::PartialResult());
+            covariance_cpu::PartialResultPtr dataForStep2FromStep1(
+                new covariance_cpu::PartialResult());
             dataForStep2FromStep1->deserialize(dataArch);
 
             /* Set local partial results as input for the master-node algorithm
              */
-            masterAlgorithm.input.add(covariance::partialResults,
+            masterAlgorithm.input.add(covariance_cpu::partialResults,
                                       dataForStep2FromStep1);
         }
 
         /* Set the parameter to choose the type of the output matrix */
         masterAlgorithm.parameter.outputMatrixType =
-            covariance::correlationMatrix;
+            covariance_cpu::correlationMatrix;
 
         /* Merge and finalizeCompute covariance decomposition on the master node
          */
         masterAlgorithm.compute();
         masterAlgorithm.finalizeCompute();
 
         /* Retrieve the algorithm results */
-        covariance::ResultPtr result = masterAlgorithm.getResult();
+        covariance_cpu::ResultPtr result = masterAlgorithm.getResult();
         auto t2 = std::chrono::high_resolution_clock::now();
         auto duration =
             std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
@@ -126,7 +125,7 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
                   << duration / 1000 << " secs" << std::endl;
 
         /* Print the results */
-        printNumericTable(result->get(covariance::correlation),
+        printNumericTable(result->get(covariance_cpu::correlation),
                           "Correlation first 20 columns of "
                           "correlation matrix:",
                           1, 20);
@@ -138,13 +137,12 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
             env->GetFieldID(clazz, "correlationNumericTable", "J");
 
         NumericTablePtr *correlation =
-            new NumericTablePtr(result->get(covariance::correlation));
+            new NumericTablePtr(result->get(covariance_cpu::correlation));
 
         env->SetLongField(resultObj, correlationNumericTableField,
                           (jlong)correlation);
     }
 }
-#endif
 
 #ifdef CPU_GPU_PROFILE
 static void doCorrelationOneAPICompute(
@@ -156,9 +154,9 @@ static void doCorrelationOneAPICompute(
     homogen_table htable =
         *reinterpret_cast<const homogen_table *>(pNumTabData);
 
-    const auto cor_desc = covariance::descriptor{}.set_result_options(
-        covariance::result_options::cor_matrix |
-        covariance::result_options::means);
+    const auto cor_desc = covariance_gpu::descriptor{}.set_result_options(
+        covariance_gpu::result_options::cor_matrix |
+        covariance_gpu::result_options::means);
     auto t1 = std::chrono::high_resolution_clock::now();
     const auto result_train = preview::compute(comm, cor_desc, htable);
     if (isRoot) {
@@ -200,7 +198,6 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
     int rankId = cclComm.rank();
     ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
     switch (device) {
-#ifdef CPU_ONLY_PROFILE
     case ComputeDevice::host:
     case ComputeDevice::cpu: {
         NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
@@ -213,8 +210,9 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
                   << nThreadsNew << std::endl;
         doCorrelationDaalCompute(env, obj, rankId, cclComm, pData, executorNum,
                                  resultObj);
+        break;
     }
-#else
+#ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
         int nGpu = env->GetArrayLength(gpuIdxArray);
         std::cout << "oneDAL (native): use GPU kernels with " << nGpu
@@ -224,7 +222,6 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
         jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
 
         int size = cclComm.size();
-        ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
 
         auto queue =
             getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
@@ -234,9 +231,10 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
             preview::spmd::make_communicator<preview::spmd::backend::ccl>(
                 queue, size, rankId, kvs);
         doCorrelationOneAPICompute(env, pNumTabData, comm, resultObj);
+        env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
+        break;
     }
 #endif
     }
-
     return 0;
 }
diff --git a/mllib-dal/src/main/native/KMeansImpl.cpp b/mllib-dal/src/main/native/KMeansImpl.cpp
@@ -29,14 +29,12 @@
 
 using namespace std;
 #ifdef CPU_GPU_PROFILE
-using namespace oneapi::dal;
-#else
+namespace kmeans_gpu = oneapi::dal::kmeans;
+#endif
 using namespace daal;
-using namespace daal::algorithms;
 using namespace daal::services;
-#endif
+namespace kmeans_cpu = daal::algorithms::kmeans;
 
-#ifdef CPU_ONLY_PROFILE
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
 static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
@@ -75,11 +73,12 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
     centroids->deserialize(outArch);
 
     /* Create an algorithm to compute k-means on local nodes */
-    kmeans::Distributed<step1Local, algorithmFPType> localAlgorithm(nClusters);
+    kmeans_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm(
+        nClusters);
 
     /* Set the input data set to the algorithm */
-    localAlgorithm.input.set(kmeans::data, pData);
-    localAlgorithm.input.set(kmeans::inputCentroids, centroids);
+    localAlgorithm.input.set(kmeans_cpu::data, pData);
+    localAlgorithm.input.set(kmeans_cpu::inputCentroids, centroids);
 
     /* Compute k-means */
     localAlgorithm.compute();
@@ -108,21 +107,21 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
 
     if (isRoot) {
         /* Create an algorithm to compute k-means on the master node */
-        kmeans::Distributed<step2Master, algorithmFPType> masterAlgorithm(
+        kmeans_cpu::Distributed<step2Master, algorithmFPType> masterAlgorithm(
             nClusters);
 
         for (size_t i = 0; i < nBlocks; i++) {
             /* Deserialize partial results from step 1 */
             OutputDataArchive dataArch(&serializedData[perNodeArchLength * i],
                                        perNodeArchLength);
 
-            kmeans::PartialResultPtr dataForStep2FromStep1(
-                new kmeans::PartialResult());
+            kmeans_cpu::PartialResultPtr dataForStep2FromStep1(
+                new kmeans_cpu::PartialResult());
             dataForStep2FromStep1->deserialize(dataArch);
 
             /* Set local partial results as input for the master-node algorithm
              */
-            masterAlgorithm.input.add(kmeans::partialResults,
+            masterAlgorithm.input.add(kmeans_cpu::partialResults,
                                       dataForStep2FromStep1);
         }
 
@@ -131,11 +130,11 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
         masterAlgorithm.finalizeCompute();
 
         ret_cost = masterAlgorithm.getResult()
-                       ->get(kmeans::objectiveFunction)
+                       ->get(kmeans_cpu::objectiveFunction)
                        ->getValue<algorithmFPType>(0, 0);
 
         /* Retrieve the algorithm results */
-        return masterAlgorithm.getResult()->get(kmeans::centroids);
+        return masterAlgorithm.getResult()->get(kmeans_cpu::centroids);
     }
     return NumericTablePtr();
 }
@@ -239,7 +238,6 @@ static jlong doKMeansDaalCompute(JNIEnv *env, jobject obj, int rankId,
         return (jlong)0;
     }
 }
-#endif
 
 #ifdef CPU_GPU_PROFILE
 static jlong doKMeansOneAPICompute(
@@ -253,13 +251,13 @@ static jlong doKMeansOneAPICompute(
         *reinterpret_cast<const homogen_table *>(pNumTabData);
     homogen_table centroids =
         *reinterpret_cast<const homogen_table *>(pNumTabCenters);
-    const auto kmeans_desc = kmeans::descriptor<>()
+    const auto kmeans_desc = kmeans_gpu::descriptor<>()
                                  .set_cluster_count(clusterNum)
                                  .set_max_iteration_count(iterationNum)
                                  .set_accuracy_threshold(tolerance);
-    kmeans::train_input local_input{htable, centroids};
+    kmeans_gpu::train_input local_input{htable, centroids};
     auto t1 = std::chrono::high_resolution_clock::now();
-    kmeans::train_result result_train =
+    kmeans_gpu::train_result result_train =
         preview::train(comm, kmeans_desc, local_input);
     if (isRoot) {
         std::cout << "Iteration count: " << result_train.get_iteration_count()
@@ -314,7 +312,6 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
     int rankId = cclComm.rank();
     ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
     switch (device) {
-#ifdef CPU_ONLY_PROFILE
     case ComputeDevice::host:
     case ComputeDevice::cpu: {
         NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
@@ -329,8 +326,9 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
         ret = doKMeansDaalCompute(env, obj, rankId, cclComm, pData, centroids,
                                   clusterNum, tolerance, iterationNum,
                                   executorNum, resultObj);
+        break;
     }
-#else
+#ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
         int nGpu = env->GetArrayLength(gpuIdxArray);
         std::cout << "oneDAL (native): use GPU kernels with " << nGpu
@@ -340,7 +338,6 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
         jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
 
         int size = cclComm.size();
-        ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
 
         auto queue =
             getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
@@ -352,6 +349,9 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
         ret =
             doKMeansOneAPICompute(env, pNumTabData, pNumTabCenters, clusterNum,
                                   tolerance, iterationNum, comm, resultObj);
+
+        env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
+        break;
     }
 #endif
     }

diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile
@@ -32,11 +32,12 @@ $(info )
 CFLAGS_COMMON := -g -Wall -Wno-deprecated-declarations -fPIC -std=c++17 \
                  -I $(I_MPI_ROOT)/include \
                  -I $(DAALROOT)/include \
+                 -I $(CCL_ROOT)/include/cpu/oneapi/ \
                  -I $(CMPLR_ROOT)/linux/include \
                  -I $(CMPLR_ROOT)/linux/include/sycl
 
 ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
-  CFLAGS := $(CFLAGS_COMMON)  -I $(CCL_ROOT)/include/cpu/oneapi/
+  CFLAGS := $(CFLAGS_COMMON)
 else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
   CFLAGS := $(CFLAGS_COMMON) -fsycl \
             -fsycl-device-code-split=per_kernel \
@@ -46,14 +47,15 @@ else
   exit 1
 endif
 
-INCS := -I $(JAVA_HOME)/include \
+INCS := -I $(CCL_ROOT)/include/cpu \
+        -I $(JAVA_HOME)/include \
         -I $(JAVA_HOME)/include/linux \
         -I $(DAALROOT)/include \
         -I ./javah \
         -I ./
 
 ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
-  INCS := $(INCS)  -I $(CCL_ROOT)/include/cpu
+  INCS := $(INCS)
 else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
   INCS := $(INCS) -I $(CCL_ROOT)/include/cpu_gpu_dpcpp
 else
@@ -62,17 +64,15 @@ else
 endif
 
 # Use static link if possible, TBB is only available as dynamic libs
-LIBS_COMMON :=-L $(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \
-        -L $(DAALROOT)/lib/intel64 -lonedal_core -lonedal_thread -lonedal_dpc \
-        -L $(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc \
-        -L $(I_MPI_ROOT)
+LIBS_COMMON := -L$(CCL_ROOT)/lib/cpu -lccl \
+        -L$(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \
+        -L$(DAALROOT)/lib/intel64 -lonedal_core -lonedal_thread -lonedal_dpc \
+        -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc \
+        -L$(I_MPI_ROOT)
 
-ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
-      LIBS_COMMON := $(LIBS_COMMON) \
-                     -L $(CCL_ROOT)/lib/cpu -lccl
-else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
+ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
       LIBS_COMMON := $(LIBS_COMMON) \
-                     -L $(CCL_ROOT)/lib/cpu_gpu_dpcpp -lccl
+                     -L$(CCL_ROOT)/lib/cpu_gpu_dpcpp -lccl
 endif
 
 ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)

diff --git a/mllib-dal/src/main/native/OneCCL.cpp b/mllib-dal/src/main/native/OneCCL.cpp
@@ -100,11 +100,9 @@ Java_com_intel_oap_mllib_OneCCL_00024_c_1initDpcpp(JNIEnv *env, jobject) {
 
 JNIEXPORT void JNICALL
 Java_com_intel_oap_mllib_OneCCL_00024_c_1cleanup(JNIEnv *env, jobject obj) {
-
+    std::cerr << "OneCCL (native): cleanup" << std::endl;
     g_kvs.pop_back();
     g_comms.pop_back();
-
-    std::cerr << "OneCCL (native): cleanup" << std::endl;
 }
 
 JNIEXPORT jboolean JNICALL