From 88e801e67d24917a2230a57e2f26ab562b47d0e1 Mon Sep 17 00:00:00 2001
From: minmingzhu <minming.zhu@intel.com>
Date: Tue, 25 Apr 2023 10:14:29 +0000
Subject: [PATCH 1/9] fix CPU_GPU_PROFILE jar being unable to run DAAL on CPU

Signed-off-by: minmingzhu <minming.zhu@intel.com>
---
 .github/workflows/ci-tests.yml                | 27 ++++++++-
 dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh  | 37 ++++++++++++
 ...=> ci-standalone-CPU_ONLY_PROFILE-test.sh} |  0
 mllib-dal/src/main/native/CorrelationImpl.cpp | 40 ++++++-------
 mllib-dal/src/main/native/KMeansImpl.cpp      | 37 ++++++------
 mllib-dal/src/main/native/Makefile            | 24 ++++----
 mllib-dal/src/main/native/PCAImpl.cpp         | 60 +++++++++----------
 mllib-dal/src/main/native/SummarizerImpl.cpp  | 10 +---
 8 files changed, 141 insertions(+), 94 deletions(-)
 create mode 100755 dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh
 rename dev/ci/{ci-standalone-test.sh => ci-standalone-CPU_ONLY_PROFILE-test.sh} (100%)

diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index 52e2917c7..86ca043f5 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -73,8 +73,31 @@ jobs:
       - name: Cluster Test
         run: |
           ${{github.workspace}}/dev/ci/ci-yarn-test.sh
-  standalone-test:
-    name: Standalone Test for Examples (CPU)
+  standalone-CPU_ONLY_PROFILE-test:  # runs dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh on a fresh runner
+    name: Standalone Test for Examples (CPU_ONLY_PROFILE)
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up JDK 1.8
+        uses: actions/setup-java@v1
+        with:
+          java-version: 1.8
+      - name: Restore cached dependencies
+        uses: actions/cache@v3
+        with:
+          path: |
+            #/var/cache/apt/archives/*.deb
+            ~/.m2/repository
+            /opt/intel/oneapi
+            ~/opt
+          key: ${{ runner.os }}_spark-3.2.0_hadoop-3.2.0_oneapi-2023.0.0
+          restore-keys: |
+            ${{ runner.os }}-
+      - name: Cluster Test
+        run: |
+          ${{github.workspace}}/dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh
+  standalone-CPU_GPU_PROFILE-test:  # NOTE(review): confirm this job's run step calls ci-standalone-CPU-GPU-PROFILE-test.sh
+    name: Standalone Test for Examples (CPU_GPU_PROFILE)
     runs-on: ubuntu-20.04
     steps:
       - uses: actions/checkout@v3
diff --git a/dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh b/dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh
new file mode 100755
index 000000000..a0a8ae2fe
--- /dev/null
+++ b/dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# CI driver: build mllib-dal with CPU_GPU_PROFILE, then build and run all examples on the standalone test cluster.
+# Exit as soon as any command fails.
+set -e
+
+# The DEBUG trap runs before each command and shifts the previous command into last_command.
+trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG
+# Report the last command and the exit code on exit (the EXIT trap also runs on success, with code 0).
+trap 'echo "\"${last_command}\" command failed with exit code $?."' EXIT
+
+# Install dependencies for building
+"$GITHUB_WORKSPACE/dev/install-build-deps-ubuntu.sh"
+
+# Setup building envs (NOTE(review): oneCCL is configured for cpu although the build profile is CPU_GPU_PROFILE - confirm)
+source /opt/intel/oneapi/setvars.sh --ccl-configuration=cpu
+
+# Prepare lib resources and build the native library for the CPU_GPU_PROFILE
+cd "$GITHUB_WORKSPACE/mllib-dal"
+../dev/prepare-build-deps.sh
+./build.sh -p CPU_GPU_PROFILE -q
+
+# Setup cluster (sourced so that variables it sets stay visible to the rest of this script)
+source "$GITHUB_WORKSPACE/dev/test-cluster/standalone/setup-cluster.sh"
+
+# Setup OAP MLlib envs
+cp "$GITHUB_WORKSPACE/dev/test-cluster/standalone/env.sh" "$GITHUB_WORKSPACE/conf"
+cd "$GITHUB_WORKSPACE/examples"
+
+
+echo "========================================="
+echo "Cluster Testing with Spark Version: $SPARK_VERSION"
+echo "========================================="
+
+# Build and run all examples
+./build-all-scala.sh
+./run-all-scala.sh
+./run-all-pyspark.sh
diff --git a/dev/ci/ci-standalone-test.sh b/dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh
similarity index 100%
rename from dev/ci/ci-standalone-test.sh
rename to dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh
diff --git a/mllib-dal/src/main/native/CorrelationImpl.cpp b/mllib-dal/src/main/native/CorrelationImpl.cpp
index a86e33a1a..1ce43470d 100644
--- a/mllib-dal/src/main/native/CorrelationImpl.cpp
+++ b/mllib-dal/src/main/native/CorrelationImpl.cpp
@@ -27,14 +27,12 @@
 
 using namespace std;
 #ifdef CPU_GPU_PROFILE
-using namespace oneapi::dal;
-#else
+namespace covariance_gpu = oneapi::dal::covariance;
+#endif
 using namespace daal;
-using namespace daal::algorithms;
 using namespace daal::services;
-#endif
+namespace covariance_cpu = daal::algorithms::covariance;
 
-#ifdef CPU_ONLY_PROFILE
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
 static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
@@ -46,10 +44,10 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
 
     const bool isRoot = (rankId == ccl_root);
 
-    covariance::Distributed<step1Local, algorithmFPType> localAlgorithm;
+    covariance_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm;
 
     /* Set the input data set to the algorithm */
-    localAlgorithm.input.set(covariance::data, pData);
+    localAlgorithm.input.set(covariance_cpu::data, pData);
 
     /* Compute covariance */
     localAlgorithm.compute();
@@ -89,7 +87,8 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
     if (isRoot) {
         auto t1 = std::chrono::high_resolution_clock::now();
         /* Create an algorithm to compute covariance on the master node */
-        covariance::Distributed<step2Master, algorithmFPType> masterAlgorithm;
+        covariance_cpu::Distributed<step2Master, algorithmFPType>
+            masterAlgorithm;
 
         for (size_t i = 0; i < nBlocks; i++) {
             /* Deserialize partial results from step 1 */
@@ -97,19 +96,19 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
                                            perNodeArchLength * i,
                                        perNodeArchLength);
 
-            covariance::PartialResultPtr dataForStep2FromStep1(
-                new covariance::PartialResult());
+            covariance_cpu::PartialResultPtr dataForStep2FromStep1(
+                new covariance_cpu::PartialResult());
             dataForStep2FromStep1->deserialize(dataArch);
 
             /* Set local partial results as input for the master-node algorithm
              */
-            masterAlgorithm.input.add(covariance::partialResults,
+            masterAlgorithm.input.add(covariance_cpu::partialResults,
                                       dataForStep2FromStep1);
         }
 
         /* Set the parameter to choose the type of the output matrix */
         masterAlgorithm.parameter.outputMatrixType =
-            covariance::correlationMatrix;
+            covariance_cpu::correlationMatrix;
 
         /* Merge and finalizeCompute covariance decomposition on the master node
          */
@@ -117,7 +116,7 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
         masterAlgorithm.finalizeCompute();
 
         /* Retrieve the algorithm results */
-        covariance::ResultPtr result = masterAlgorithm.getResult();
+        covariance_cpu::ResultPtr result = masterAlgorithm.getResult();
         auto t2 = std::chrono::high_resolution_clock::now();
         auto duration =
             std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
@@ -126,7 +125,7 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
                   << duration / 1000 << " secs" << std::endl;
 
         /* Print the results */
-        printNumericTable(result->get(covariance::correlation),
+        printNumericTable(result->get(covariance_cpu::correlation),
                           "Correlation first 20 columns of "
                           "correlation matrix:",
                           1, 20);
@@ -138,13 +137,12 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
             env->GetFieldID(clazz, "correlationNumericTable", "J");
 
         NumericTablePtr *correlation =
-            new NumericTablePtr(result->get(covariance::correlation));
+            new NumericTablePtr(result->get(covariance_cpu::correlation));
 
         env->SetLongField(resultObj, correlationNumericTableField,
                           (jlong)correlation);
     }
 }
-#endif
 
 #ifdef CPU_GPU_PROFILE
 static void doCorrelationOneAPICompute(
@@ -156,9 +154,9 @@ static void doCorrelationOneAPICompute(
     homogen_table htable =
         *reinterpret_cast<const homogen_table *>(pNumTabData);
 
-    const auto cor_desc = covariance::descriptor{}.set_result_options(
-        covariance::result_options::cor_matrix |
-        covariance::result_options::means);
+    const auto cor_desc = covariance_gpu::descriptor{}.set_result_options(
+        covariance_gpu::result_options::cor_matrix |
+        covariance_gpu::result_options::means);
     auto t1 = std::chrono::high_resolution_clock::now();
     const auto result_train = preview::compute(comm, cor_desc, htable);
     if (isRoot) {
@@ -200,7 +198,6 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
     int rankId = cclComm.rank();
     ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
     switch (device) {
-#ifdef CPU_ONLY_PROFILE
     case ComputeDevice::host:
     case ComputeDevice::cpu: {
         NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
@@ -214,7 +211,7 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
         doCorrelationDaalCompute(env, obj, rankId, cclComm, pData, executorNum,
                                  resultObj);
     }
-#else
+#ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
         int nGpu = env->GetArrayLength(gpuIdxArray);
         std::cout << "oneDAL (native): use GPU kernels with " << nGpu
@@ -237,6 +234,5 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
     }
 #endif
     }
-
     return 0;
 }
diff --git a/mllib-dal/src/main/native/KMeansImpl.cpp b/mllib-dal/src/main/native/KMeansImpl.cpp
index 8a7d969ff..ac08d9c2d 100644
--- a/mllib-dal/src/main/native/KMeansImpl.cpp
+++ b/mllib-dal/src/main/native/KMeansImpl.cpp
@@ -29,14 +29,12 @@
 
 using namespace std;
 #ifdef CPU_GPU_PROFILE
-using namespace oneapi::dal;
-#else
+namespace kmeans_gpu = oneapi::dal::kmeans;
+#endif
 using namespace daal;
-using namespace daal::algorithms;
 using namespace daal::services;
-#endif
+namespace kmeans_cpu = daal::algorithms::kmeans;
 
-#ifdef CPU_ONLY_PROFILE
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
 static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
@@ -75,11 +73,12 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
     centroids->deserialize(outArch);
 
     /* Create an algorithm to compute k-means on local nodes */
-    kmeans::Distributed<step1Local, algorithmFPType> localAlgorithm(nClusters);
+    kmeans_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm(
+        nClusters);
 
     /* Set the input data set to the algorithm */
-    localAlgorithm.input.set(kmeans::data, pData);
-    localAlgorithm.input.set(kmeans::inputCentroids, centroids);
+    localAlgorithm.input.set(kmeans_cpu::data, pData);
+    localAlgorithm.input.set(kmeans_cpu::inputCentroids, centroids);
 
     /* Compute k-means */
     localAlgorithm.compute();
@@ -108,7 +107,7 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
 
     if (isRoot) {
         /* Create an algorithm to compute k-means on the master node */
-        kmeans::Distributed<step2Master, algorithmFPType> masterAlgorithm(
+        kmeans_cpu::Distributed<step2Master, algorithmFPType> masterAlgorithm(
             nClusters);
 
         for (size_t i = 0; i < nBlocks; i++) {
@@ -116,13 +115,13 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
             OutputDataArchive dataArch(&serializedData[perNodeArchLength * i],
                                        perNodeArchLength);
 
-            kmeans::PartialResultPtr dataForStep2FromStep1(
-                new kmeans::PartialResult());
+            kmeans_cpu::PartialResultPtr dataForStep2FromStep1(
+                new kmeans_cpu::PartialResult());
             dataForStep2FromStep1->deserialize(dataArch);
 
             /* Set local partial results as input for the master-node algorithm
              */
-            masterAlgorithm.input.add(kmeans::partialResults,
+            masterAlgorithm.input.add(kmeans_cpu::partialResults,
                                       dataForStep2FromStep1);
         }
 
@@ -131,11 +130,11 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
         masterAlgorithm.finalizeCompute();
 
         ret_cost = masterAlgorithm.getResult()
-                       ->get(kmeans::objectiveFunction)
+                       ->get(kmeans_cpu::objectiveFunction)
                        ->getValue<algorithmFPType>(0, 0);
 
         /* Retrieve the algorithm results */
-        return masterAlgorithm.getResult()->get(kmeans::centroids);
+        return masterAlgorithm.getResult()->get(kmeans_cpu::centroids);
     }
     return NumericTablePtr();
 }
@@ -239,7 +238,6 @@ static jlong doKMeansDaalCompute(JNIEnv *env, jobject obj, int rankId,
         return (jlong)0;
     }
 }
-#endif
 
 #ifdef CPU_GPU_PROFILE
 static jlong doKMeansOneAPICompute(
@@ -253,13 +251,13 @@ static jlong doKMeansOneAPICompute(
         *reinterpret_cast<const homogen_table *>(pNumTabData);
     homogen_table centroids =
         *reinterpret_cast<const homogen_table *>(pNumTabCenters);
-    const auto kmeans_desc = kmeans::descriptor<>()
+    const auto kmeans_desc = kmeans_gpu::descriptor<>()
                                  .set_cluster_count(clusterNum)
                                  .set_max_iteration_count(iterationNum)
                                  .set_accuracy_threshold(tolerance);
-    kmeans::train_input local_input{htable, centroids};
+    kmeans_gpu::train_input local_input{htable, centroids};
     auto t1 = std::chrono::high_resolution_clock::now();
-    kmeans::train_result result_train =
+    kmeans_gpu::train_result result_train =
         preview::train(comm, kmeans_desc, local_input);
     if (isRoot) {
         std::cout << "Iteration count: " << result_train.get_iteration_count()
@@ -314,7 +312,6 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
     int rankId = cclComm.rank();
     ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
     switch (device) {
-#ifdef CPU_ONLY_PROFILE
     case ComputeDevice::host:
     case ComputeDevice::cpu: {
         NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
@@ -330,7 +327,7 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
                                   clusterNum, tolerance, iterationNum,
                                   executorNum, resultObj);
     }
-#else
+#ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
         int nGpu = env->GetArrayLength(gpuIdxArray);
         std::cout << "oneDAL (native): use GPU kernels with " << nGpu
diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile
index 6b4a4eb8a..304b9a789 100644
--- a/mllib-dal/src/main/native/Makefile
+++ b/mllib-dal/src/main/native/Makefile
@@ -32,11 +32,12 @@ $(info )
 CFLAGS_COMMON := -g -Wall -Wno-deprecated-declarations -fPIC -std=c++17 \
                  -I $(I_MPI_ROOT)/include \
                  -I $(DAALROOT)/include \
+                 -I $(CCL_ROOT)/include/cpu/oneapi/ \
                  -I $(CMPLR_ROOT)/linux/include \
                  -I $(CMPLR_ROOT)/linux/include/sycl
 
 ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
-  CFLAGS := $(CFLAGS_COMMON)  -I $(CCL_ROOT)/include/cpu/oneapi/
+  CFLAGS := $(CFLAGS_COMMON)
 else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
   CFLAGS := $(CFLAGS_COMMON) -fsycl \
             -fsycl-device-code-split=per_kernel \
@@ -46,14 +47,15 @@ else
   exit 1
 endif
 
-INCS := -I $(JAVA_HOME)/include \
+INCS := -I $(CCL_ROOT)/include/cpu \
+        -I $(JAVA_HOME)/include \
         -I $(JAVA_HOME)/include/linux \
         -I $(DAALROOT)/include \
         -I ./javah \
         -I ./
 
 ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
-  INCS := $(INCS)  -I $(CCL_ROOT)/include/cpu
+  INCS := $(INCS)
 else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
   INCS := $(INCS) -I $(CCL_ROOT)/include/cpu_gpu_dpcpp
 else
@@ -62,17 +64,15 @@ else
 endif
 
 # Use static link if possible, TBB is only available as dynamic libs
-LIBS_COMMON :=-L $(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \
-        -L $(DAALROOT)/lib/intel64 -lonedal_core -lonedal_thread -lonedal_dpc \
-        -L $(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc \
-        -L $(I_MPI_ROOT)
+LIBS_COMMON := -L$(CCL_ROOT)/lib/cpu -lccl \
+        -L$(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \
+        -L$(DAALROOT)/lib/intel64 -lonedal_core -lonedal_thread -lonedal_dpc \
+        -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc \
+        -L$(I_MPI_ROOT)
 
-ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
-      LIBS_COMMON := $(LIBS_COMMON) \
-                     -L $(CCL_ROOT)/lib/cpu -lccl
-else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
+ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
       LIBS_COMMON := $(LIBS_COMMON) \
-                     -L $(CCL_ROOT)/lib/cpu_gpu_dpcpp -lccl
+                     -L$(CCL_ROOT)/lib/cpu_gpu_dpcpp -lccl
 endif
 
 ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
diff --git a/mllib-dal/src/main/native/PCAImpl.cpp b/mllib-dal/src/main/native/PCAImpl.cpp
index 1996d7291..d07c06acc 100644
--- a/mllib-dal/src/main/native/PCAImpl.cpp
+++ b/mllib-dal/src/main/native/PCAImpl.cpp
@@ -29,14 +29,14 @@
 
 using namespace std;
 #ifdef CPU_GPU_PROFILE
-using namespace oneapi::dal;
-#else
+namespace pca_gpu = oneapi::dal::pca;
+namespace covariance_gpu = oneapi::dal::covariance;
+#endif
 using namespace daal;
-using namespace daal::algorithms;
 using namespace daal::services;
-#endif
+namespace pca_cpu = daal::algorithms::pca;
+namespace covariance_cpu = daal::algorithms::covariance;
 
-#ifdef CPU_ONLY_PROFILE
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
 static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
@@ -48,10 +48,10 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
 
     const bool isRoot = (rankId == ccl_root);
 
-    covariance::Distributed<step1Local, algorithmFPType> localAlgorithm;
+    covariance_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm;
 
     /* Set the input data set to the algorithm */
-    localAlgorithm.input.set(covariance::data, pData);
+    localAlgorithm.input.set(covariance_cpu::data, pData);
 
     /* Compute covariance for PCA*/
     localAlgorithm.compute();
@@ -90,7 +90,8 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
     if (isRoot) {
         auto t1 = std::chrono::high_resolution_clock::now();
         /* Create an algorithm to compute covariance on the master node */
-        covariance::Distributed<step2Master, algorithmFPType> masterAlgorithm;
+        covariance_cpu::Distributed<step2Master, algorithmFPType>
+            masterAlgorithm;
 
         for (size_t i = 0; i < nBlocks; i++) {
             /* Deserialize partial results from step 1 */
@@ -98,19 +99,19 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
                                            perNodeArchLength * i,
                                        perNodeArchLength);
 
-            covariance::PartialResultPtr dataForStep2FromStep1(
-                new covariance::PartialResult());
+            covariance_cpu::PartialResultPtr dataForStep2FromStep1(
+                new covariance_cpu::PartialResult());
             dataForStep2FromStep1->deserialize(dataArch);
 
             /* Set local partial results as input for the master-node algorithm
              */
-            masterAlgorithm.input.add(covariance::partialResults,
+            masterAlgorithm.input.add(covariance_cpu::partialResults,
                                       dataForStep2FromStep1);
         }
 
         /* Set the parameter to choose the type of the output matrix */
         masterAlgorithm.parameter.outputMatrixType =
-            covariance::covarianceMatrix;
+            covariance_cpu::covarianceMatrix;
 
         /* Merge and finalizeCompute covariance decomposition on the master node
          */
@@ -118,7 +119,8 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
         masterAlgorithm.finalizeCompute();
 
         /* Retrieve the algorithm results */
-        covariance::ResultPtr covariance_result = masterAlgorithm.getResult();
+        covariance_cpu::ResultPtr covariance_result =
+            masterAlgorithm.getResult();
         auto t2 = std::chrono::high_resolution_clock::now();
         auto duration =
             std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
@@ -130,12 +132,12 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
 
         /* Create an algorithm for principal component analysis using the
          * correlation method*/
-        pca::Batch<algorithmFPType> algorithm;
+        pca_cpu::Batch<algorithmFPType> algorithm;
 
         /* Set the algorithm input data*/
-        algorithm.input.set(pca::correlation,
-                            covariance_result->get(covariance::covariance));
-        algorithm.parameter.resultsToCompute = pca::eigenvalue;
+        algorithm.input.set(pca_cpu::correlation,
+                            covariance_result->get(covariance_cpu::covariance));
+        algorithm.parameter.resultsToCompute = pca_cpu::eigenvalue;
 
         /* Compute results of the PCA algorithm*/
         algorithm.compute();
@@ -148,11 +150,11 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
                   << " secs" << std::endl;
 
         /* Print the results */
-        pca::ResultPtr result = algorithm.getResult();
-        printNumericTable(result->get(pca::eigenvalues),
+        pca_cpu::ResultPtr result = algorithm.getResult();
+        printNumericTable(result->get(pca_cpu::eigenvalues),
                           "First 10 eigenvalues with first 20 dimensions:", 10,
                           20);
-        printNumericTable(result->get(pca::eigenvectors),
+        printNumericTable(result->get(pca_cpu::eigenvectors),
                           "First 10 eigenvectors with first 20 dimensions:", 10,
                           20);
 
@@ -166,16 +168,15 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
             env->GetFieldID(clazz, "explainedVarianceNumericTable", "J");
 
         NumericTablePtr *eigenvalues =
-            new NumericTablePtr(result->get(pca::eigenvalues));
+            new NumericTablePtr(result->get(pca_cpu::eigenvalues));
         NumericTablePtr *eigenvectors =
-            new NumericTablePtr(result->get(pca::eigenvectors));
+            new NumericTablePtr(result->get(pca_cpu::eigenvectors));
 
         env->SetLongField(resultObj, pcNumericTableField, (jlong)eigenvectors);
         env->SetLongField(resultObj, explainedVarianceNumericTableField,
                           (jlong)eigenvalues);
     }
 }
-#endif
 
 #ifdef CPU_GPU_PROFILE
 static void doPCAOneAPICompute(
@@ -187,8 +188,8 @@ static void doPCAOneAPICompute(
     homogen_table htable =
         *reinterpret_cast<const homogen_table *>(pNumTabData);
 
-    const auto cov_desc = covariance::descriptor{}.set_result_options(
-        covariance::result_options::cov_matrix);
+    const auto cov_desc = covariance_gpu::descriptor{}.set_result_options(
+        covariance_gpu::result_options::cov_matrix);
 
     auto t1 = std::chrono::high_resolution_clock::now();
     const auto result = preview::compute(comm, cov_desc, htable);
@@ -199,9 +200,9 @@ static void doPCAOneAPICompute(
               << " secs" << std::endl;
     if (isRoot) {
         using float_t = double;
-        using method_t = pca::method::precomputed;
-        using task_t = pca::task::dim_reduction;
-        using descriptor_t = pca::descriptor<float_t, method_t, task_t>;
+        using method_t = pca_gpu::method::precomputed;
+        using task_t = pca_gpu::task::dim_reduction;
+        using descriptor_t = pca_gpu::descriptor<float_t, method_t, task_t>;
         const auto pca_desc = descriptor_t().set_deterministic(true);
 
         t1 = std::chrono::high_resolution_clock::now();
@@ -254,7 +255,6 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
     int rankId = cclComm.rank();
     ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
     switch (device) {
-#ifdef CPU_ONLY_PROFILE
     case ComputeDevice::host:
     case ComputeDevice::cpu: {
         NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
@@ -268,7 +268,7 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
         doPCADAALCompute(env, obj, rankId, cclComm, pData, executorNum,
                          resultObj);
     }
-#else
+#ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
         int nGpu = env->GetArrayLength(gpuIdxArray);
         std::cout << "oneDAL (native): use GPU kernels with " << nGpu
diff --git a/mllib-dal/src/main/native/SummarizerImpl.cpp b/mllib-dal/src/main/native/SummarizerImpl.cpp
index 22c50ace2..6786dc794 100644
--- a/mllib-dal/src/main/native/SummarizerImpl.cpp
+++ b/mllib-dal/src/main/native/SummarizerImpl.cpp
@@ -28,13 +28,10 @@
 using namespace std;
 #ifdef CPU_GPU_PROFILE
 using namespace oneapi::dal;
-#else
+#endif
 using namespace daal;
 using namespace daal::algorithms;
 using namespace daal::services;
-#endif
-
-#ifdef CPU_ONLY_PROFILE
 
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
@@ -199,7 +196,6 @@ static void doSummarizerDAALCompute(JNIEnv *env, jobject obj, int rankId,
         env->SetLongField(resultObj, minimumNumericTableField, (jlong)min);
     }
 }
-#endif
 
 #ifdef CPU_GPU_PROFILE
 static void doSummarizerOneAPICompute(
@@ -271,7 +267,6 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
     int rankId = cclComm.rank();
     ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
     switch (device) {
-#ifdef CPU_ONLY_PROFILE
     case ComputeDevice::host:
     case ComputeDevice::cpu: {
         NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
@@ -285,7 +280,7 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
         doSummarizerDAALCompute(env, obj, rankId, cclComm, pData, executorNum,
                                 resultObj);
     }
-#else
+#ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
         int nGpu = env->GetArrayLength(gpuIdxArray);
         std::cout << "oneDAL (native): use GPU kernels with " << nGpu
@@ -308,6 +303,5 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
     }
 #endif
     }
-
     return 0;
 }

From d5a7ea8941fb279264f1e6ed4ad71c69d5b10dfa Mon Sep 17 00:00:00 2001
From: minmingzhu <minming.zhu@intel.com>
Date: Tue, 25 Apr 2023 10:38:54 +0000
Subject: [PATCH 2/9] release GPU index array elements after oneAPI compute

Signed-off-by: minmingzhu <minming.zhu@intel.com>
---
 mllib-dal/src/main/native/CorrelationImpl.cpp | 1 +
 mllib-dal/src/main/native/KMeansImpl.cpp      | 2 ++
 mllib-dal/src/main/native/PCAImpl.cpp         | 1 +
 mllib-dal/src/main/native/SummarizerImpl.cpp  | 1 +
 4 files changed, 5 insertions(+)

diff --git a/mllib-dal/src/main/native/CorrelationImpl.cpp b/mllib-dal/src/main/native/CorrelationImpl.cpp
index 1ce43470d..dddc35284 100644
--- a/mllib-dal/src/main/native/CorrelationImpl.cpp
+++ b/mllib-dal/src/main/native/CorrelationImpl.cpp
@@ -231,6 +231,7 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
             preview::spmd::make_communicator<preview::spmd::backend::ccl>(
                 queue, size, rankId, kvs);
         doCorrelationOneAPICompute(env, pNumTabData, comm, resultObj);
+        env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
     }
 #endif
     }
diff --git a/mllib-dal/src/main/native/KMeansImpl.cpp b/mllib-dal/src/main/native/KMeansImpl.cpp
index ac08d9c2d..d864ea671 100644
--- a/mllib-dal/src/main/native/KMeansImpl.cpp
+++ b/mllib-dal/src/main/native/KMeansImpl.cpp
@@ -349,6 +349,8 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
         ret =
             doKMeansOneAPICompute(env, pNumTabData, pNumTabCenters, clusterNum,
                                   tolerance, iterationNum, comm, resultObj);
+
+        env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
     }
 #endif
     }
diff --git a/mllib-dal/src/main/native/PCAImpl.cpp b/mllib-dal/src/main/native/PCAImpl.cpp
index d07c06acc..935942f0f 100644
--- a/mllib-dal/src/main/native/PCAImpl.cpp
+++ b/mllib-dal/src/main/native/PCAImpl.cpp
@@ -288,6 +288,7 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
             preview::spmd::make_communicator<preview::spmd::backend::ccl>(
                 queue, size, rankId, kvs);
         doPCAOneAPICompute(env, pNumTabData, comm, resultObj);
+        env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
     }
 #endif
     }
diff --git a/mllib-dal/src/main/native/SummarizerImpl.cpp b/mllib-dal/src/main/native/SummarizerImpl.cpp
index 6786dc794..20f0e87d6 100644
--- a/mllib-dal/src/main/native/SummarizerImpl.cpp
+++ b/mllib-dal/src/main/native/SummarizerImpl.cpp
@@ -300,6 +300,7 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
             preview::spmd::make_communicator<preview::spmd::backend::ccl>(
                 queue, size, rankId, kvs);
         doSummarizerOneAPICompute(env, pNumTabData, comm, resultObj);
+        env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
     }
 #endif
     }

From c6f7241e2bb05c0165079d96a7b1ea55636f644b Mon Sep 17 00:00:00 2001
From: minmingzhu <minming.zhu@intel.com>
Date: Tue, 25 Apr 2023 13:20:27 +0000
Subject: [PATCH 3/9] add missing break to device switch cases and drop duplicate device lookup

Signed-off-by: minmingzhu <minming.zhu@intel.com>
---
 mllib-dal/src/main/native/CorrelationImpl.cpp | 3 ++-
 mllib-dal/src/main/native/KMeansImpl.cpp      | 3 ++-
 mllib-dal/src/main/native/OneCCL.cpp          | 4 +---
 mllib-dal/src/main/native/PCAImpl.cpp         | 3 ++-
 mllib-dal/src/main/native/SummarizerImpl.cpp  | 3 ++-
 5 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/mllib-dal/src/main/native/CorrelationImpl.cpp b/mllib-dal/src/main/native/CorrelationImpl.cpp
index dddc35284..aba09e5a1 100644
--- a/mllib-dal/src/main/native/CorrelationImpl.cpp
+++ b/mllib-dal/src/main/native/CorrelationImpl.cpp
@@ -210,6 +210,7 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
                   << nThreadsNew << std::endl;
         doCorrelationDaalCompute(env, obj, rankId, cclComm, pData, executorNum,
                                  resultObj);
+        break;
     }
 #ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
@@ -221,7 +222,6 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
         jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
 
         int size = cclComm.size();
-        ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
 
         auto queue =
             getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
@@ -232,6 +232,7 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
                 queue, size, rankId, kvs);
         doCorrelationOneAPICompute(env, pNumTabData, comm, resultObj);
         env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
+        break;
     }
 #endif
     }
diff --git a/mllib-dal/src/main/native/KMeansImpl.cpp b/mllib-dal/src/main/native/KMeansImpl.cpp
index d864ea671..f167d41ed 100644
--- a/mllib-dal/src/main/native/KMeansImpl.cpp
+++ b/mllib-dal/src/main/native/KMeansImpl.cpp
@@ -326,6 +326,7 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
         ret = doKMeansDaalCompute(env, obj, rankId, cclComm, pData, centroids,
                                   clusterNum, tolerance, iterationNum,
                                   executorNum, resultObj);
+        break;
     }
 #ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
@@ -337,7 +338,6 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
         jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
 
         int size = cclComm.size();
-        ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
 
         auto queue =
             getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
@@ -351,6 +351,7 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
                                   tolerance, iterationNum, comm, resultObj);
 
         env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
+        break;
     }
 #endif
     }
diff --git a/mllib-dal/src/main/native/OneCCL.cpp b/mllib-dal/src/main/native/OneCCL.cpp
index bbfc9fbe9..6264d36ca 100644
--- a/mllib-dal/src/main/native/OneCCL.cpp
+++ b/mllib-dal/src/main/native/OneCCL.cpp
@@ -100,11 +100,9 @@ Java_com_intel_oap_mllib_OneCCL_00024_c_1initDpcpp(JNIEnv *env, jobject) {
 
 JNIEXPORT void JNICALL
 Java_com_intel_oap_mllib_OneCCL_00024_c_1cleanup(JNIEnv *env, jobject obj) {
-
+    std::cerr << "OneCCL (native): cleanup" << std::endl;
     g_kvs.pop_back();
     g_comms.pop_back();
-
-    std::cerr << "OneCCL (native): cleanup" << std::endl;
 }
 
 JNIEXPORT jboolean JNICALL
diff --git a/mllib-dal/src/main/native/PCAImpl.cpp b/mllib-dal/src/main/native/PCAImpl.cpp
index 935942f0f..5fabd3e24 100644
--- a/mllib-dal/src/main/native/PCAImpl.cpp
+++ b/mllib-dal/src/main/native/PCAImpl.cpp
@@ -267,6 +267,7 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
                   << nThreadsNew << std::endl;
         doPCADAALCompute(env, obj, rankId, cclComm, pData, executorNum,
                          resultObj);
+        break;
     }
 #ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
@@ -278,7 +279,6 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
         jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
 
         int size = cclComm.size();
-        ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
 
         auto queue =
             getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
@@ -289,6 +289,7 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
                 queue, size, rankId, kvs);
         doPCAOneAPICompute(env, pNumTabData, comm, resultObj);
         env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
+        break;
     }
 #endif
     }
diff --git a/mllib-dal/src/main/native/SummarizerImpl.cpp b/mllib-dal/src/main/native/SummarizerImpl.cpp
index 20f0e87d6..8333be531 100644
--- a/mllib-dal/src/main/native/SummarizerImpl.cpp
+++ b/mllib-dal/src/main/native/SummarizerImpl.cpp
@@ -279,6 +279,7 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
                   << nThreadsNew << std::endl;
         doSummarizerDAALCompute(env, obj, rankId, cclComm, pData, executorNum,
                                 resultObj);
+        break;
     }
 #ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
@@ -290,7 +291,6 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
         jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
 
         int size = cclComm.size();
-        ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
 
         auto queue =
             getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
@@ -301,6 +301,7 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
                 queue, size, rankId, kvs);
         doSummarizerOneAPICompute(env, pNumTabData, comm, resultObj);
         env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
+        break;
     }
 #endif
     }

From 6e0b0b9c42e277b838bf0d03d610925347fb86b6 Mon Sep 17 00:00:00 2001
From: minmingzhu <45281494+minmingzhu@users.noreply.github.com>
Date: Tue, 25 Apr 2023 22:05:05 +0800
Subject: [PATCH 4/9] Point standalone CI jobs at their per-profile test scripts

---
 .github/workflows/ci-tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index 86ca043f5..c96021b0c 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -95,7 +95,7 @@ jobs:
             ${{ runner.os }}-
       - name: Cluster Test
         run: |
-          ${{github.workspace}}/dev/ci/ci-standalone-test.sh
+          ${{github.workspace}}/dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh
 standalone-CPU_GPU_PROFILE-test:
     name: Standalone Test for Examples
     runs-on: ubuntu-20.04
@@ -118,4 +118,4 @@ standalone-CPU_GPU_PROFILE-test:
             ${{ runner.os }}-
       - name: Cluster Test
         run: |
-          ${{github.workspace}}/dev/ci/ci-standalone-test.sh
+          ${{github.workspace}}/dev/ci/ci-standalone-CPU_GPU_PROFILE-test.sh

From b8947afef3fb29acebe147d3e1666fad3207c13c Mon Sep 17 00:00:00 2001
From: minmingzhu <minming.zhu@intel.com>
Date: Tue, 25 Apr 2023 14:19:15 +0000
Subject: [PATCH 5/9] Fix indentation of CPU_GPU_PROFILE CI job and give standalone jobs distinct names

Signed-off-by: minmingzhu <minming.zhu@intel.com>
---
 .github/workflows/ci-tests.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index c96021b0c..5647d2e8f 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -74,7 +74,7 @@ jobs:
         run: |
           ${{github.workspace}}/dev/ci/ci-yarn-test.sh
   standalone-CPU_ONLY_PROFILE-test:
-    name: Standalone Test for Examples
+    name: Standalone CPU_ONLY_PROFILE Test for Examples
     runs-on: ubuntu-20.04
     steps:
       - uses: actions/checkout@v3
@@ -96,8 +96,8 @@ jobs:
       - name: Cluster Test
         run: |
           ${{github.workspace}}/dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh
-standalone-CPU_GPU_PROFILE-test:
-    name: Standalone Test for Examples
+  standalone-CPU_GPU_PROFILE-test:
+    name: Standalone CPU_GPU_PROFILE Test for Examples
     runs-on: ubuntu-20.04
     steps:
       - uses: actions/checkout@v3

From 59db0d7f550bb49173c1c24f98a2f659b06af9d8 Mon Sep 17 00:00:00 2001
From: minmingzhu <minming.zhu@intel.com>
Date: Tue, 25 Apr 2023 15:52:41 +0000
Subject: [PATCH 6/9] Use hyphens instead of underscores in standalone CI job ids and script names

Signed-off-by: minmingzhu <minming.zhu@intel.com>
---
 .github/workflows/ci-tests.yml                              | 6 +++---
 ...OFILE-test.sh => ci-standalone-CPU-ONLY-PROFILE-test.sh} | 0
 2 files changed, 3 insertions(+), 3 deletions(-)
 rename dev/ci/{ci-standalone-CPU_ONLY_PROFILE-test.sh => ci-standalone-CPU-ONLY-PROFILE-test.sh} (100%)

diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index 5647d2e8f..6ea681198 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -73,7 +73,7 @@ jobs:
       - name: Cluster Test
         run: |
           ${{github.workspace}}/dev/ci/ci-yarn-test.sh
-  standalone-CPU_ONLY_PROFILE-test:
+  standalone-CPU-ONLY-PROFILE-test:
     name: Standalone CPU_ONLY_PROFILE Test for Examples
     runs-on: ubuntu-20.04
     steps:
@@ -96,7 +96,7 @@ jobs:
       - name: Cluster Test
         run: |
           ${{github.workspace}}/dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh
-  standalone-CPU_GPU_PROFILE-test:
+  standalone-CPU-GPU-PROFILE-test:
     name: Standalone CPU_GPU_PROFILE Test for Examples
     runs-on: ubuntu-20.04
     steps:
@@ -118,4 +118,4 @@ jobs:
             ${{ runner.os }}-
       - name: Cluster Test
         run: |
-          ${{github.workspace}}/dev/ci/ci-standalone-CPU_GPU_PROFILE-test.sh
+          ${{github.workspace}}/dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh
diff --git a/dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh b/dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh
similarity index 100%
rename from dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh
rename to dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh

From 90aa4f2cd9a0c5e92ac7bcfbf13dabcb6dbbfa7f Mon Sep 17 00:00:00 2001
From: minmingzhu <minming.zhu@intel.com>
Date: Tue, 25 Apr 2023 16:05:08 +0000
Subject: [PATCH 7/9] Point CPU-only CI job at renamed ci-standalone-CPU-ONLY-PROFILE-test.sh

Signed-off-by: minmingzhu <minming.zhu@intel.com>
---
 .github/workflows/ci-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index 6ea681198..4c00dd63a 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -95,7 +95,7 @@ jobs:
             ${{ runner.os }}-
       - name: Cluster Test
         run: |
-          ${{github.workspace}}/dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh
+          ${{github.workspace}}/dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh
   standalone-CPU-GPU-PROFILE-test:
     name: Standalone CPU_GPU_PROFILE Test for Examples
     runs-on: ubuntu-20.04

From 954c15f5a579a188d4675b4c7c6d7865167e2432 Mon Sep 17 00:00:00 2001
From: minmingzhu <minming.zhu@intel.com>
Date: Wed, 26 Apr 2023 06:06:25 +0000
Subject: [PATCH 8/9] Remove CPU-only standalone CI job and script; restore single standalone-test job

---
 .github/workflows/ci-tests.yml                | 29 ++-------------
 dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh | 37 -------------------
 ...-PROFILE-test.sh => ci-standalone-test.sh} |  0
 3 files changed, 3 insertions(+), 63 deletions(-)
 delete mode 100755 dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh
 rename dev/ci/{ci-standalone-CPU-GPU-PROFILE-test.sh => ci-standalone-test.sh} (100%)

diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index 4c00dd63a..96018b1fa 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -73,8 +73,8 @@ jobs:
       - name: Cluster Test
         run: |
           ${{github.workspace}}/dev/ci/ci-yarn-test.sh
-  standalone-CPU-ONLY-PROFILE-test:
-    name: Standalone CPU_ONLY_PROFILE Test for Examples
+  standalone-test:
+    name: Standalone CPU_GPU_PROFILE Test for Examples (CPU)
     runs-on: ubuntu-20.04
     steps:
       - uses: actions/checkout@v3
@@ -95,27 +95,4 @@ jobs:
             ${{ runner.os }}-
       - name: Cluster Test
         run: |
-          ${{github.workspace}}/dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh
-  standalone-CPU-GPU-PROFILE-test:
-    name: Standalone CPU_GPU_PROFILE Test for Examples
-    runs-on: ubuntu-20.04
-    steps:
-      - uses: actions/checkout@v3
-      - name: Set up JDK 1.8
-        uses: actions/setup-java@v1
-        with:
-          java-version: 1.8
-      - name: Restore cached dependencies
-        uses: actions/cache@v3
-        with:
-          path: |
-            #/var/cache/apt/archives/*.deb
-            ~/.m2/repository
-            /opt/intel/oneapi
-            ~/opt
-          key: ${{ runner.os }}_spark-3.2.0_hadoop-3.2.0_oneapi-2023.0.0
-          restore-keys: |
-            ${{ runner.os }}-
-      - name: Cluster Test
-        run: |
-          ${{github.workspace}}/dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh
+          ${{github.workspace}}/dev/ci/ci-standalone-test.sh
diff --git a/dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh b/dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh
deleted file mode 100755
index edc5af65b..000000000
--- a/dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/env bash
-
-# exit when any command fails
-set -e
-
-# keep track of the last executed command
-trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG
-# echo an error message before exiting
-trap 'echo "\"${last_command}\" command filed with exit code $?."' EXIT
-
-# Install dependencies for building
-$GITHUB_WORKSPACE/dev/install-build-deps-ubuntu.sh
-
-# Setup building envs
-source /opt/intel/oneapi/setvars.sh --ccl-configuration=cpu
-
-# Prepare lib resources
-cd $GITHUB_WORKSPACE/mllib-dal
-../dev/prepare-build-deps.sh
-./build.sh -p CPU_ONLY_PROFILE -q
-
-# Setup cluster
-source $GITHUB_WORKSPACE/dev/test-cluster/standalone/setup-cluster.sh
-
-# Setup OAP MLlib envs
-cp $GITHUB_WORKSPACE/dev/test-cluster/standalone/env.sh $GITHUB_WORKSPACE/conf
-cd $GITHUB_WORKSPACE/examples
-
-
-echo "========================================="
-echo "Cluster Testing with Spark Version: $SPARK_VERSION"
-echo "========================================="
-
-# Build and run all examples
-./build-all-scala.sh
-./run-all-scala.sh
-./run-all-pyspark.sh
diff --git a/dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh b/dev/ci/ci-standalone-test.sh
similarity index 100%
rename from dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh
rename to dev/ci/ci-standalone-test.sh

From 558b216428f2b1f252c690f6e943d2eec637c927 Mon Sep 17 00:00:00 2001
From: minmingzhu <minming.zhu@intel.com>
Date: Wed, 26 Apr 2023 06:39:21 +0000
Subject: [PATCH 9/9] retrigger checks