diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index 52e2917c7..96018b1fa 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -74,7 +74,7 @@ jobs:
       run: |
         ${{github.workspace}}/dev/ci/ci-yarn-test.sh
   standalone-test:
-    name: Standalone Test for Examples (CPU)
+    name: Standalone CPU_GPU_PROFILE Test for Examples (CPU)
     runs-on: ubuntu-20.04
     steps:
     - uses: actions/checkout@v3
diff --git a/dev/ci/ci-standalone-test.sh b/dev/ci/ci-standalone-test.sh
index edc5af65b..a0a8ae2fe 100755
--- a/dev/ci/ci-standalone-test.sh
+++ b/dev/ci/ci-standalone-test.sh
@@ -17,7 +17,7 @@ source /opt/intel/oneapi/setvars.sh --ccl-configuration=cpu
 # Prepare lib resources
 cd $GITHUB_WORKSPACE/mllib-dal
 ../dev/prepare-build-deps.sh
-./build.sh -p CPU_ONLY_PROFILE -q
+./build.sh -p CPU_GPU_PROFILE -q
 
 # Setup cluster
 source $GITHUB_WORKSPACE/dev/test-cluster/standalone/setup-cluster.sh
diff --git a/mllib-dal/src/main/native/CorrelationImpl.cpp b/mllib-dal/src/main/native/CorrelationImpl.cpp
index a86e33a1a..aba09e5a1 100644
--- a/mllib-dal/src/main/native/CorrelationImpl.cpp
+++ b/mllib-dal/src/main/native/CorrelationImpl.cpp
@@ -27,14 +27,12 @@ using namespace std;
 
 #ifdef CPU_GPU_PROFILE
-using namespace oneapi::dal;
-#else
+namespace covariance_gpu = oneapi::dal::covariance;
+#endif
 using namespace daal;
-using namespace daal::algorithms;
 using namespace daal::services;
-#endif
+namespace covariance_cpu = daal::algorithms::covariance;
 
-#ifdef CPU_ONLY_PROFILE
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
 static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
@@ -46,10 +44,10 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
 
     const bool isRoot = (rankId == ccl_root);
 
-    covariance::Distributed<step1Local, algorithmFPType> localAlgorithm;
+    covariance_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm;
 
     /* Set the input data set to the algorithm */
-    localAlgorithm.input.set(covariance::data, pData);
+    localAlgorithm.input.set(covariance_cpu::data, pData);
 
     /* Compute covariance */
     localAlgorithm.compute();
@@ -89,7 +87,8 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
     if (isRoot) {
         auto t1 = std::chrono::high_resolution_clock::now();
         /* Create an algorithm to compute covariance on the master node */
-        covariance::Distributed<step2Master, algorithmFPType> masterAlgorithm;
+        covariance_cpu::Distributed<step2Master, algorithmFPType>
+            masterAlgorithm;
 
         for (size_t i = 0; i < nBlocks; i++) {
             /* Deserialize partial results from step 1 */
@@ -97,19 +96,19 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
                                        perNodeArchLength * i,
                                        perNodeArchLength);
 
-            covariance::PartialResultPtr dataForStep2FromStep1(
-                new covariance::PartialResult());
+            covariance_cpu::PartialResultPtr dataForStep2FromStep1(
+                new covariance_cpu::PartialResult());
             dataForStep2FromStep1->deserialize(dataArch);
 
             /* Set local partial results as input for the master-node
              * algorithm */
-            masterAlgorithm.input.add(covariance::partialResults,
+            masterAlgorithm.input.add(covariance_cpu::partialResults,
                                       dataForStep2FromStep1);
         }
 
         /* Set the parameter to choose the type of the output matrix */
         masterAlgorithm.parameter.outputMatrixType =
-            covariance::correlationMatrix;
+            covariance_cpu::correlationMatrix;
 
         /* Merge and finalizeCompute covariance decomposition on the master
          * node */
@@ -117,7 +116,7 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
         masterAlgorithm.finalizeCompute();
 
         /* Retrieve the algorithm results */
-        covariance::ResultPtr result = masterAlgorithm.getResult();
+        covariance_cpu::ResultPtr result = masterAlgorithm.getResult();
         auto t2 = std::chrono::high_resolution_clock::now();
         auto duration =
             std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
@@ -126,7 +125,7 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
                   << duration / 1000 << " secs" << std::endl;
 
         /* Print the results */
-        printNumericTable(result->get(covariance::correlation),
+        printNumericTable(result->get(covariance_cpu::correlation),
                           "Correlation first 20 columns of "
                           "correlation matrix:",
                           1, 20);
@@ -138,13 +137,12 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
             env->GetFieldID(clazz, "correlationNumericTable", "J");
 
         NumericTablePtr *correlation =
-            new NumericTablePtr(result->get(covariance::correlation));
+            new NumericTablePtr(result->get(covariance_cpu::correlation));
 
         env->SetLongField(resultObj, correlationNumericTableField,
                           (jlong)correlation);
     }
 }
-#endif
 
 #ifdef CPU_GPU_PROFILE
 static void doCorrelationOneAPICompute(
@@ -156,9 +154,9 @@ static void doCorrelationOneAPICompute(
     homogen_table htable =
         *reinterpret_cast<const homogen_table *>(pNumTabData);
 
-    const auto cor_desc = covariance::descriptor{}.set_result_options(
-        covariance::result_options::cor_matrix |
-        covariance::result_options::means);
+    const auto cor_desc = covariance_gpu::descriptor{}.set_result_options(
+        covariance_gpu::result_options::cor_matrix |
+        covariance_gpu::result_options::means);
     auto t1 = std::chrono::high_resolution_clock::now();
     const auto result_train = preview::compute(comm, cor_desc, htable);
     if (isRoot) {
@@ -200,7 +198,6 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
     int rankId = cclComm.rank();
     ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
     switch (device) {
-#ifdef CPU_ONLY_PROFILE
     case ComputeDevice::host:
     case ComputeDevice::cpu: {
         NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
@@ -213,8 +210,9 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
                   << nThreadsNew << std::endl;
         doCorrelationDaalCompute(env, obj, rankId, cclComm, pData,
                                  executorNum, resultObj);
+        break;
     }
-#else
+#ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
         int nGpu = env->GetArrayLength(gpuIdxArray);
         std::cout << "oneDAL (native): use GPU kernels with " << nGpu
@@ -224,7 +222,6 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
 
         jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
         int size = cclComm.size();
-        ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
 
         auto queue =
             getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
@@ -234,9 +231,10 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
             preview::spmd::make_communicator<preview::spmd::backend::ccl>(
                 queue, size, rankId, kvs);
         doCorrelationOneAPICompute(env, pNumTabData, comm, resultObj);
+        env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
+        break;
     }
 #endif
     }
-
     return 0;
 }
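Note: the recurring change in these native sources replaces the mutually exclusive "using namespace" blocks (oneDAL under CPU_GPU_PROFILE, DAAL otherwise) with explicit namespace aliases, so the DAAL CPU path and the oneDAL GPU path can coexist in a single translation unit built with CPU_GPU_PROFILE. A minimal, self-contained sketch of why the aliases are needed; the two backend namespaces below are toy stand-ins, not the real DAAL/oneDAL headers:

```cpp
#include <iostream>

// Toy stand-ins for daal::algorithms::covariance (CPU) and
// oneapi::dal::covariance (GPU); illustrative only.
namespace daal_like { namespace covariance {
inline void compute() { std::cout << "CPU (DAAL) path\n"; }
}} // namespace daal_like::covariance
namespace onedal_like { namespace covariance {
inline void compute() { std::cout << "GPU (oneDAL) path\n"; }
}} // namespace onedal_like::covariance

// With `using namespace daal_like;` and `using namespace onedal_like;`
// active in one file, a bare covariance::compute() would be ambiguous.
// Aliases keep both backends available and every call site explicit:
namespace covariance_cpu = daal_like::covariance;
namespace covariance_gpu = onedal_like::covariance;

int main() {
    covariance_cpu::compute(); // CPU branch of the dispatch switch
    covariance_gpu::compute(); // GPU branch, #ifdef-guarded in the patch
    return 0;
}
```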
diff --git a/mllib-dal/src/main/native/KMeansImpl.cpp b/mllib-dal/src/main/native/KMeansImpl.cpp
index 8a7d969ff..f167d41ed 100644
--- a/mllib-dal/src/main/native/KMeansImpl.cpp
+++ b/mllib-dal/src/main/native/KMeansImpl.cpp
@@ -29,14 +29,12 @@ using namespace std;
 
 #ifdef CPU_GPU_PROFILE
-using namespace oneapi::dal;
-#else
+namespace kmeans_gpu = oneapi::dal::kmeans;
+#endif
 using namespace daal;
-using namespace daal::algorithms;
 using namespace daal::services;
-#endif
+namespace kmeans_cpu = daal::algorithms::kmeans;
 
-#ifdef CPU_ONLY_PROFILE
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
 static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
@@ -75,11 +73,12 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
     centroids->deserialize(outArch);
 
     /* Create an algorithm to compute k-means on local nodes */
-    kmeans::Distributed<step1Local, algorithmFPType> localAlgorithm(nClusters);
+    kmeans_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm(
+        nClusters);
 
     /* Set the input data set to the algorithm */
-    localAlgorithm.input.set(kmeans::data, pData);
-    localAlgorithm.input.set(kmeans::inputCentroids, centroids);
+    localAlgorithm.input.set(kmeans_cpu::data, pData);
+    localAlgorithm.input.set(kmeans_cpu::inputCentroids, centroids);
 
     /* Compute k-means */
     localAlgorithm.compute();
@@ -108,7 +107,7 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
 
     if (isRoot) {
         /* Create an algorithm to compute k-means on the master node */
-        kmeans::Distributed<step2Master, algorithmFPType> masterAlgorithm(
+        kmeans_cpu::Distributed<step2Master, algorithmFPType> masterAlgorithm(
             nClusters);
 
         for (size_t i = 0; i < nBlocks; i++) {
@@ -116,13 +115,13 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
             OutputDataArchive dataArch(&serializedData[perNodeArchLength * i],
                                        perNodeArchLength);
 
-            kmeans::PartialResultPtr dataForStep2FromStep1(
-                new kmeans::PartialResult());
+            kmeans_cpu::PartialResultPtr dataForStep2FromStep1(
+                new kmeans_cpu::PartialResult());
             dataForStep2FromStep1->deserialize(dataArch);
 
             /* Set local partial results as input for the master-node
              * algorithm */
-            masterAlgorithm.input.add(kmeans::partialResults,
+            masterAlgorithm.input.add(kmeans_cpu::partialResults,
                                       dataForStep2FromStep1);
         }
 
@@ -131,11 +130,11 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
         masterAlgorithm.finalizeCompute();
 
         ret_cost = masterAlgorithm.getResult()
-                       ->get(kmeans::objectiveFunction)
+                       ->get(kmeans_cpu::objectiveFunction)
                        ->getValue(0, 0);
 
         /* Retrieve the algorithm results */
-        return masterAlgorithm.getResult()->get(kmeans::centroids);
+        return masterAlgorithm.getResult()->get(kmeans_cpu::centroids);
     }
     return NumericTablePtr();
 }
@@ -239,7 +238,6 @@ static jlong doKMeansDaalCompute(JNIEnv *env, jobject obj, int rankId,
         return (jlong)0;
     }
 }
-#endif
 
 #ifdef CPU_GPU_PROFILE
 static jlong doKMeansOneAPICompute(
@@ -253,13 +251,13 @@ static jlong doKMeansOneAPICompute(
         *reinterpret_cast<const homogen_table *>(pNumTabData);
     homogen_table centroids =
         *reinterpret_cast<const homogen_table *>(pNumTabCenters);
-    const auto kmeans_desc = kmeans::descriptor<>()
+    const auto kmeans_desc = kmeans_gpu::descriptor<>()
                                  .set_cluster_count(clusterNum)
                                  .set_max_iteration_count(iterationNum)
                                  .set_accuracy_threshold(tolerance);
-    kmeans::train_input local_input{htable, centroids};
+    kmeans_gpu::train_input local_input{htable, centroids};
     auto t1 = std::chrono::high_resolution_clock::now();
-    kmeans::train_result result_train =
+    kmeans_gpu::train_result result_train =
         preview::train(comm, kmeans_desc, local_input);
     if (isRoot) {
         std::cout << "Iteration count: " << result_train.get_iteration_count()
@@ -314,7 +312,6 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
     int rankId = cclComm.rank();
     ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
     switch (device) {
-#ifdef CPU_ONLY_PROFILE
     case ComputeDevice::host:
     case ComputeDevice::cpu: {
         NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
@@ -329,8 +326,9 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
         ret = doKMeansDaalCompute(env, obj, rankId, cclComm, pData, centroids,
                                   clusterNum, tolerance, iterationNum,
                                   executorNum, resultObj);
+        break;
     }
-#else
+#ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
         int nGpu = env->GetArrayLength(gpuIdxArray);
         std::cout << "oneDAL (native): use GPU kernels with " << nGpu
@@ -340,7 +338,6 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
 
         jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
         int size = cclComm.size();
-        ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
 
         auto queue =
             getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
@@ -352,6 +349,9 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
         ret = doKMeansOneAPICompute(env, pNumTabData, pNumTabCenters,
                                     clusterNum, tolerance, iterationNum, comm,
                                     resultObj);
+
+        env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
+        break;
     }
 #endif
     }
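Note: besides the namespace aliases, every GPU branch in this patch gains two fixes: a break so the case no longer falls through now that CPU and GPU cases can be compiled together, and an env->ReleaseIntArrayElements call to pair with the earlier GetIntArrayElements. A minimal JNI sketch of the Get/Release pairing; the function and class names are hypothetical, only the JNI calls themselves are real:

```cpp
#include <jni.h>

// Hypothetical native method: sums the GPU indices passed from Java.
// GetIntArrayElements may pin the Java array or hand back a copy; either
// way it must be matched by ReleaseIntArrayElements, or the buffer leaks
// (the omission this patch fixes on each GPU branch).
extern "C" JNIEXPORT jint JNICALL
Java_Example_sumGpuIndices(JNIEnv *env, jobject, jintArray gpuIdxArray) {
    jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, nullptr);
    if (gpuIndices == nullptr)
        return -1; // allocation failed, a Java exception is already pending

    jint sum = 0;
    const jsize n = env->GetArrayLength(gpuIdxArray);
    for (jsize i = 0; i < n; i++)
        sum += gpuIndices[i];

    // Mode 0: copy back changes (if a copy was made) and free the buffer.
    env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
    return sum;
}
```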
diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile
index 6b4a4eb8a..304b9a789 100644
--- a/mllib-dal/src/main/native/Makefile
+++ b/mllib-dal/src/main/native/Makefile
@@ -32,11 +32,12 @@ $(info )
 CFLAGS_COMMON := -g -Wall -Wno-deprecated-declarations -fPIC -std=c++17 \
     -I $(I_MPI_ROOT)/include \
     -I $(DAALROOT)/include \
+    -I $(CCL_ROOT)/include/cpu/oneapi/ \
     -I $(CMPLR_ROOT)/linux/include \
     -I $(CMPLR_ROOT)/linux/include/sycl
 
 ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
-    CFLAGS := $(CFLAGS_COMMON) -I $(CCL_ROOT)/include/cpu/oneapi/
+    CFLAGS := $(CFLAGS_COMMON)
 else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
     CFLAGS := $(CFLAGS_COMMON) -fsycl \
         -fsycl-device-code-split=per_kernel \
@@ -46,14 +47,15 @@ else
     exit 1
 endif
 
-INCS := -I $(JAVA_HOME)/include \
+INCS := -I $(CCL_ROOT)/include/cpu \
+    -I $(JAVA_HOME)/include \
     -I $(JAVA_HOME)/include/linux \
     -I $(DAALROOT)/include \
     -I ./javah \
     -I ./
 
 ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
-    INCS := $(INCS) -I $(CCL_ROOT)/include/cpu
+    INCS := $(INCS)
 else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
     INCS := $(INCS) -I $(CCL_ROOT)/include/cpu_gpu_dpcpp
 else
@@ -62,17 +64,15 @@ else
 endif
 
 # Use static link if possible, TBB is only available as dynamic libs
-LIBS_COMMON :=-L $(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \
-    -L $(DAALROOT)/lib/intel64 -lonedal_core -lonedal_thread -lonedal_dpc \
-    -L $(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc \
-    -L $(I_MPI_ROOT)
+LIBS_COMMON := -L$(CCL_ROOT)/lib/cpu -lccl \
+    -L$(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \
+    -L$(DAALROOT)/lib/intel64 -lonedal_core -lonedal_thread -lonedal_dpc \
+    -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc \
+    -L$(I_MPI_ROOT)
 
-ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
-    LIBS_COMMON := $(LIBS_COMMON) \
-        -L $(CCL_ROOT)/lib/cpu -lccl
-else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
+ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
     LIBS_COMMON := $(LIBS_COMMON) \
-        -L $(CCL_ROOT)/lib/cpu_gpu_dpcpp -lccl
+        -L$(CCL_ROOT)/lib/cpu_gpu_dpcpp -lccl
 endif
 
 ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
diff --git a/mllib-dal/src/main/native/OneCCL.cpp b/mllib-dal/src/main/native/OneCCL.cpp
index bbfc9fbe9..6264d36ca 100644
--- a/mllib-dal/src/main/native/OneCCL.cpp
+++ b/mllib-dal/src/main/native/OneCCL.cpp
@@ -100,11 +100,9 @@ Java_com_intel_oap_mllib_OneCCL_00024_c_1initDpcpp(JNIEnv *env, jobject) {
 
 JNIEXPORT void JNICALL
 Java_com_intel_oap_mllib_OneCCL_00024_c_1cleanup(JNIEnv *env, jobject obj) {
-
+    std::cerr << "OneCCL (native): cleanup" << std::endl;
     g_kvs.pop_back();
     g_comms.pop_back();
-
-    std::cerr << "OneCCL (native): cleanup" << std::endl;
 }
 
 JNIEXPORT jboolean JNICALL
diff --git a/mllib-dal/src/main/native/PCAImpl.cpp b/mllib-dal/src/main/native/PCAImpl.cpp
index 1996d7291..5fabd3e24 100644
--- a/mllib-dal/src/main/native/PCAImpl.cpp
+++ b/mllib-dal/src/main/native/PCAImpl.cpp
@@ -29,14 +29,14 @@ using namespace std;
 
 #ifdef CPU_GPU_PROFILE
-using namespace oneapi::dal;
-#else
+namespace pca_gpu = oneapi::dal::pca;
+namespace covariance_gpu = oneapi::dal::covariance;
+#endif
 using namespace daal;
-using namespace daal::algorithms;
 using namespace daal::services;
-#endif
+namespace pca_cpu = daal::algorithms::pca;
+namespace covariance_cpu = daal::algorithms::covariance;
 
-#ifdef CPU_ONLY_PROFILE
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
 static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
@@ -48,10 +48,10 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
 
     const bool isRoot = (rankId == ccl_root);
 
-    covariance::Distributed<step1Local, algorithmFPType> localAlgorithm;
+    covariance_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm;
 
     /* Set the input data set to the algorithm */
-    localAlgorithm.input.set(covariance::data, pData);
+    localAlgorithm.input.set(covariance_cpu::data, pData);
 
     /* Compute covariance for PCA*/
     localAlgorithm.compute();
@@ -90,7 +90,8 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
     if (isRoot) {
         auto t1 = std::chrono::high_resolution_clock::now();
         /* Create an algorithm to compute covariance on the master node */
-        covariance::Distributed<step2Master, algorithmFPType> masterAlgorithm;
+        covariance_cpu::Distributed<step2Master, algorithmFPType>
+            masterAlgorithm;
 
         for (size_t i = 0; i < nBlocks; i++) {
             /* Deserialize partial results from step 1 */
@@ -98,19 +99,19 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
                                        perNodeArchLength * i,
                                        perNodeArchLength);
 
-            covariance::PartialResultPtr dataForStep2FromStep1(
-                new covariance::PartialResult());
+            covariance_cpu::PartialResultPtr dataForStep2FromStep1(
+                new covariance_cpu::PartialResult());
             dataForStep2FromStep1->deserialize(dataArch);
 
             /* Set local partial results as input for the master-node
              * algorithm */
-            masterAlgorithm.input.add(covariance::partialResults,
+            masterAlgorithm.input.add(covariance_cpu::partialResults,
                                       dataForStep2FromStep1);
         }
 
         /* Set the parameter to choose the type of the output matrix */
         masterAlgorithm.parameter.outputMatrixType =
-            covariance::covarianceMatrix;
+            covariance_cpu::covarianceMatrix;
 
         /* Merge and finalizeCompute covariance decomposition on the master
          * node */
@@ -118,7 +119,8 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
         masterAlgorithm.finalizeCompute();
 
         /* Retrieve the algorithm results */
-        covariance::ResultPtr covariance_result = masterAlgorithm.getResult();
+        covariance_cpu::ResultPtr covariance_result =
+            masterAlgorithm.getResult();
         auto t2 = std::chrono::high_resolution_clock::now();
         auto duration =
             std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
@@ -130,12 +132,12 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
 
         /* Create an algorithm for principal component analysis using the
          * correlation method*/
-        pca::Batch<algorithmFPType> algorithm;
+        pca_cpu::Batch<algorithmFPType> algorithm;
 
         /* Set the algorithm input data*/
-        algorithm.input.set(pca::correlation,
-                            covariance_result->get(covariance::covariance));
-        algorithm.parameter.resultsToCompute = pca::eigenvalue;
+        algorithm.input.set(pca_cpu::correlation,
+                            covariance_result->get(covariance_cpu::covariance));
+        algorithm.parameter.resultsToCompute = pca_cpu::eigenvalue;
 
         /* Compute results of the PCA algorithm*/
         algorithm.compute();
@@ -148,11 +150,11 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
                   << " secs" << std::endl;
 
         /* Print the results */
-        pca::ResultPtr result = algorithm.getResult();
-        printNumericTable(result->get(pca::eigenvalues),
+        pca_cpu::ResultPtr result = algorithm.getResult();
+        printNumericTable(result->get(pca_cpu::eigenvalues),
                           "First 10 eigenvalues with first 20 dimensions:", 10,
                           20);
-        printNumericTable(result->get(pca::eigenvectors),
+        printNumericTable(result->get(pca_cpu::eigenvectors),
                           "First 10 eigenvectors with first 20 dimensions:",
                           10, 20);
 
@@ -166,16 +168,15 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
             env->GetFieldID(clazz, "explainedVarianceNumericTable", "J");
 
         NumericTablePtr *eigenvalues =
-            new NumericTablePtr(result->get(pca::eigenvalues));
+            new NumericTablePtr(result->get(pca_cpu::eigenvalues));
         NumericTablePtr *eigenvectors =
-            new NumericTablePtr(result->get(pca::eigenvectors));
+            new NumericTablePtr(result->get(pca_cpu::eigenvectors));
 
         env->SetLongField(resultObj, pcNumericTableField, (jlong)eigenvectors);
         env->SetLongField(resultObj, explainedVarianceNumericTableField,
                           (jlong)eigenvalues);
     }
 }
-#endif
 
 #ifdef CPU_GPU_PROFILE
 static void doPCAOneAPICompute(
@@ -187,8 +188,8 @@ static void doPCAOneAPICompute(
     homogen_table htable =
         *reinterpret_cast<const homogen_table *>(pNumTabData);
 
-    const auto cov_desc = covariance::descriptor{}.set_result_options(
-        covariance::result_options::cov_matrix);
+    const auto cov_desc = covariance_gpu::descriptor{}.set_result_options(
+        covariance_gpu::result_options::cov_matrix);
 
     auto t1 = std::chrono::high_resolution_clock::now();
     const auto result = preview::compute(comm, cov_desc, htable);
@@ -199,9 +200,9 @@ static void doPCAOneAPICompute(
               << " secs" << std::endl;
     if (isRoot) {
         using float_t = double;
-        using method_t = pca::method::precomputed;
-        using task_t = pca::task::dim_reduction;
-        using descriptor_t = pca::descriptor<float_t, method_t, task_t>;
+        using method_t = pca_gpu::method::precomputed;
+        using task_t = pca_gpu::task::dim_reduction;
+        using descriptor_t = pca_gpu::descriptor<float_t, method_t, task_t>;
 
         const auto pca_desc = descriptor_t().set_deterministic(true);
         t1 = std::chrono::high_resolution_clock::now();
@@ -254,7 +255,6 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
     int rankId = cclComm.rank();
     ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
     switch (device) {
-#ifdef CPU_ONLY_PROFILE
     case ComputeDevice::host:
     case ComputeDevice::cpu: {
         NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
@@ -267,8 +267,9 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
                   << nThreadsNew << std::endl;
         doPCADAALCompute(env, obj, rankId, cclComm, pData, executorNum,
                          resultObj);
+        break;
     }
-#else
+#ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
         int nGpu = env->GetArrayLength(gpuIdxArray);
         std::cout << "oneDAL (native): use GPU kernels with " << nGpu
@@ -278,7 +279,6 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
 
         jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
         int size = cclComm.size();
-        ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
 
         auto queue =
             getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
@@ -288,6 +288,8 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
             preview::spmd::make_communicator<preview::spmd::backend::ccl>(
                 queue, size, rankId, kvs);
         doPCAOneAPICompute(env, pNumTabData, comm, resultObj);
+        env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
+        break;
     }
 #endif
     }
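Note: after this patch, CorrelationImpl, KMeansImpl, PCAImpl, and SummarizerImpl below all share the same dispatch shape: the host/cpu cases compile in both build profiles, and only the gpu case stays behind CPU_GPU_PROFILE. A compressed sketch of the resulting control flow; ComputeDevice and the handlers here are stand-ins for the real types and compute functions:

```cpp
// Sketch of the unified dispatch after this patch (illustrative types).
enum class ComputeDevice { host, cpu, gpu };

[[maybe_unused]] static void runCpu() { /* DAAL distributed kernels */ }
[[maybe_unused]] static void runGpu() { /* oneDAL SYCL kernels */ }

void dispatch(ComputeDevice device) {
    switch (device) {
    case ComputeDevice::host:
    case ComputeDevice::cpu:
        runCpu();
        break; // needed now that the gpu case can be compiled alongside
#ifdef CPU_GPU_PROFILE
    case ComputeDevice::gpu:
        runGpu();
        break;
#endif
    default:
        break; // unknown device: nothing to do in this sketch
    }
}
```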
diff --git a/mllib-dal/src/main/native/SummarizerImpl.cpp b/mllib-dal/src/main/native/SummarizerImpl.cpp
index 22c50ace2..8333be531 100644
--- a/mllib-dal/src/main/native/SummarizerImpl.cpp
+++ b/mllib-dal/src/main/native/SummarizerImpl.cpp
@@ -28,13 +28,10 @@ using namespace std;
 
 #ifdef CPU_GPU_PROFILE
 using namespace oneapi::dal;
-#else
+#endif
 using namespace daal;
 using namespace daal::algorithms;
 using namespace daal::services;
-#endif
-
-#ifdef CPU_ONLY_PROFILE
 
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
@@ -199,7 +196,6 @@ static void doSummarizerDAALCompute(JNIEnv *env, jobject obj, int rankId,
         env->SetLongField(resultObj, minimumNumericTableField, (jlong)min);
     }
 }
-#endif
 
 #ifdef CPU_GPU_PROFILE
 static void doSummarizerOneAPICompute(
@@ -271,7 +267,6 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
     int rankId = cclComm.rank();
     ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
     switch (device) {
-#ifdef CPU_ONLY_PROFILE
     case ComputeDevice::host:
     case ComputeDevice::cpu: {
         NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
@@ -284,8 +279,9 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
                   << nThreadsNew << std::endl;
         doSummarizerDAALCompute(env, obj, rankId, cclComm, pData, executorNum,
                                 resultObj);
+        break;
     }
-#else
+#ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
         int nGpu = env->GetArrayLength(gpuIdxArray);
         std::cout << "oneDAL (native): use GPU kernels with " << nGpu
@@ -295,7 +291,6 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
 
         jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
         int size = cclComm.size();
-        ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
 
         auto queue =
             getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
@@ -305,9 +300,10 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
             preview::spmd::make_communicator<preview::spmd::backend::ccl>(
                 queue, size, rankId, kvs);
         doSummarizerOneAPICompute(env, pNumTabData, comm, resultObj);
+        env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
+        break;
     }
 #endif
     }
-
     return 0;
 }