diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml
index 52e2917c7..96018b1fa 100644
--- a/.github/workflows/ci-tests.yml
+++ b/.github/workflows/ci-tests.yml
@@ -74,7 +74,7 @@ jobs:
       run: |
         ${{github.workspace}}/dev/ci/ci-yarn-test.sh
   standalone-test:
-    name: Standalone Test for Examples (CPU)
+    name: Standalone CPU_GPU_PROFILE Test for Examples (CPU)
     runs-on: ubuntu-20.04
     steps:
     - uses: actions/checkout@v3
diff --git a/dev/ci/ci-standalone-test.sh b/dev/ci/ci-standalone-test.sh
index edc5af65b..a0a8ae2fe 100755
--- a/dev/ci/ci-standalone-test.sh
+++ b/dev/ci/ci-standalone-test.sh
@@ -17,7 +17,7 @@ source /opt/intel/oneapi/setvars.sh --ccl-configuration=cpu
 # Prepare lib resources
 cd $GITHUB_WORKSPACE/mllib-dal
 ../dev/prepare-build-deps.sh
-./build.sh -p CPU_ONLY_PROFILE -q
+./build.sh -p CPU_GPU_PROFILE -q
 
 # Setup cluster
 source $GITHUB_WORKSPACE/dev/test-cluster/standalone/setup-cluster.sh
diff --git a/mllib-dal/src/main/native/CorrelationImpl.cpp b/mllib-dal/src/main/native/CorrelationImpl.cpp
index a86e33a1a..aba09e5a1 100644
--- a/mllib-dal/src/main/native/CorrelationImpl.cpp
+++ b/mllib-dal/src/main/native/CorrelationImpl.cpp
@@ -27,14 +27,12 @@ using namespace std;
 
 #ifdef CPU_GPU_PROFILE
-using namespace oneapi::dal;
-#else
+namespace covariance_gpu = oneapi::dal::covariance;
+#endif
 using namespace daal;
-using namespace daal::algorithms;
 using namespace daal::services;
-#endif
+namespace covariance_cpu = daal::algorithms::covariance;
 
-#ifdef CPU_ONLY_PROFILE
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
 static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
@@ -46,10 +44,10 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
 
     const bool isRoot = (rankId == ccl_root);
 
-    covariance::Distributed<step1Local, algorithmFPType> localAlgorithm;
+    covariance_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm;
 
     /* Set the input data set to the algorithm */
-    localAlgorithm.input.set(covariance::data, pData);
+    localAlgorithm.input.set(covariance_cpu::data, pData);
 
     /* Compute covariance */
     localAlgorithm.compute();
@@ -89,7 +87,8 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
     if (isRoot) {
         auto t1 = std::chrono::high_resolution_clock::now();
         /* Create an algorithm to compute covariance on the master node */
-        covariance::Distributed<step2Master, algorithmFPType> masterAlgorithm;
+        covariance_cpu::Distributed<step2Master, algorithmFPType>
+            masterAlgorithm;
 
         for (size_t i = 0; i < nBlocks; i++) {
             /* Deserialize partial results from step 1 */
@@ -97,19 +96,19 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
                                        perNodeArchLength * i,
                                        perNodeArchLength);
 
-            covariance::PartialResultPtr dataForStep2FromStep1(
-                new covariance::PartialResult());
+            covariance_cpu::PartialResultPtr dataForStep2FromStep1(
+                new covariance_cpu::PartialResult());
             dataForStep2FromStep1->deserialize(dataArch);
 
             /* Set local partial results as input for the master-node
              * algorithm */
-            masterAlgorithm.input.add(covariance::partialResults,
+            masterAlgorithm.input.add(covariance_cpu::partialResults,
                                       dataForStep2FromStep1);
         }
 
         /* Set the parameter to choose the type of the output matrix */
         masterAlgorithm.parameter.outputMatrixType =
-            covariance::correlationMatrix;
+            covariance_cpu::correlationMatrix;
 
         /* Merge and finalizeCompute covariance decomposition on the master
          * node */
@@ -117,7 +116,7 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
         masterAlgorithm.finalizeCompute();
 
         /* Retrieve the algorithm results */
-        covariance::ResultPtr result = masterAlgorithm.getResult();
+        covariance_cpu::ResultPtr result = masterAlgorithm.getResult();
         auto t2 = std::chrono::high_resolution_clock::now();
         auto duration =
             std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
@@ -126,7 +125,7 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
                   << duration / 1000 << " secs" << std::endl;
 
         /* Print the results */
-        printNumericTable(result->get(covariance::correlation),
+        printNumericTable(result->get(covariance_cpu::correlation),
                           "Correlation first 20 columns of "
                           "correlation matrix:",
                           1, 20);
@@ -138,13 +137,12 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
             env->GetFieldID(clazz, "correlationNumericTable", "J");
 
         NumericTablePtr *correlation =
-            new NumericTablePtr(result->get(covariance::correlation));
+            new NumericTablePtr(result->get(covariance_cpu::correlation));
 
         env->SetLongField(resultObj, correlationNumericTableField,
                           (jlong)correlation);
     }
 }
-#endif
 
 #ifdef CPU_GPU_PROFILE
 static void doCorrelationOneAPICompute(
@@ -156,9 +154,9 @@ static void doCorrelationOneAPICompute(
     homogen_table htable =
         *reinterpret_cast<const homogen_table *>(pNumTabData);
 
-    const auto cor_desc = covariance::descriptor{}.set_result_options(
-        covariance::result_options::cor_matrix |
-        covariance::result_options::means);
+    const auto cor_desc = covariance_gpu::descriptor{}.set_result_options(
+        covariance_gpu::result_options::cor_matrix |
+        covariance_gpu::result_options::means);
     auto t1 = std::chrono::high_resolution_clock::now();
     const auto result_train = preview::compute(comm, cor_desc, htable);
     if (isRoot) {
@@ -200,7 +198,6 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
     int rankId = cclComm.rank();
     ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
     switch (device) {
-#ifdef CPU_ONLY_PROFILE
     case ComputeDevice::host:
     case ComputeDevice::cpu: {
         NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
@@ -213,8 +210,9 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
                   << nThreadsNew << std::endl;
         doCorrelationDaalCompute(env, obj, rankId, cclComm, pData,
                                  executorNum, resultObj);
+        break;
     }
-#else
+#ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
         int nGpu = env->GetArrayLength(gpuIdxArray);
         std::cout << "oneDAL (native): use GPU kernels with " << nGpu
@@ -224,7 +222,6 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
 
         jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
         int size = cclComm.size();
-        ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
 
         auto queue =
             getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
@@ -234,9 +231,10 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
             preview::spmd::make_communicator<preview::spmd::backend::ccl>(
                 queue, size, rankId, kvs);
         doCorrelationOneAPICompute(env, pNumTabData, comm, resultObj);
+        env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
+        break;
     }
 #endif
     }
-
     return 0;
 }
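Note: the recurring change in these native sources replaces the mutually exclusive "using namespace" blocks (oneDAL under CPU_GPU_PROFILE, DAAL otherwise) with explicit namespace aliases, so the DAAL CPU path and the oneDAL GPU path can coexist in a single translation unit built with CPU_GPU_PROFILE. A minimal, self-contained sketch of why the aliases are needed; the two backend namespaces below are toy stand-ins, not the real DAAL/oneDAL headers:

```cpp
#include <iostream>

// Toy stand-ins for daal::algorithms::covariance (CPU) and
// oneapi::dal::covariance (GPU); illustrative only.
namespace daal_like { namespace covariance {
inline void compute() { std::cout << "CPU (DAAL) path\n"; }
}} // namespace daal_like::covariance
namespace onedal_like { namespace covariance {
inline void compute() { std::cout << "GPU (oneDAL) path\n"; }
}} // namespace onedal_like::covariance

// With `using namespace daal_like;` and `using namespace onedal_like;`
// active in one file, a bare covariance::compute() would be ambiguous.
// Aliases keep both backends available and every call site explicit:
namespace covariance_cpu = daal_like::covariance;
namespace covariance_gpu = onedal_like::covariance;

int main() {
    covariance_cpu::compute(); // CPU branch of the dispatch switch
    covariance_gpu::compute(); // GPU branch, #ifdef-guarded in the patch
    return 0;
}
```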
diff --git a/mllib-dal/src/main/native/KMeansImpl.cpp b/mllib-dal/src/main/native/KMeansImpl.cpp
index 8a7d969ff..f167d41ed 100644
--- a/mllib-dal/src/main/native/KMeansImpl.cpp
+++ b/mllib-dal/src/main/native/KMeansImpl.cpp
@@ -29,14 +29,12 @@ using namespace std;
 
 #ifdef CPU_GPU_PROFILE
-using namespace oneapi::dal;
-#else
+namespace kmeans_gpu = oneapi::dal::kmeans;
+#endif
 using namespace daal;
-using namespace daal::algorithms;
 using namespace daal::services;
-#endif
+namespace kmeans_cpu = daal::algorithms::kmeans;
 
-#ifdef CPU_ONLY_PROFILE
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
 static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
@@ -75,11 +73,12 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
     centroids->deserialize(outArch);
 
     /* Create an algorithm to compute k-means on local nodes */
-    kmeans::Distributed<step1Local, algorithmFPType> localAlgorithm(nClusters);
+    kmeans_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm(
+        nClusters);
 
     /* Set the input data set to the algorithm */
-    localAlgorithm.input.set(kmeans::data, pData);
-    localAlgorithm.input.set(kmeans::inputCentroids, centroids);
+    localAlgorithm.input.set(kmeans_cpu::data, pData);
+    localAlgorithm.input.set(kmeans_cpu::inputCentroids, centroids);
 
     /* Compute k-means */
     localAlgorithm.compute();
@@ -108,7 +107,7 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
 
     if (isRoot) {
         /* Create an algorithm to compute k-means on the master node */
-        kmeans::Distributed<step2Master, algorithmFPType> masterAlgorithm(
+        kmeans_cpu::Distributed<step2Master, algorithmFPType> masterAlgorithm(
             nClusters);
 
         for (size_t i = 0; i < nBlocks; i++) {
@@ -116,13 +115,13 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
             OutputDataArchive dataArch(&serializedData[perNodeArchLength * i],
                                        perNodeArchLength);
 
-            kmeans::PartialResultPtr dataForStep2FromStep1(
-                new kmeans::PartialResult());
+            kmeans_cpu::PartialResultPtr dataForStep2FromStep1(
+                new kmeans_cpu::PartialResult());
             dataForStep2FromStep1->deserialize(dataArch);
 
             /* Set local partial results as input for the master-node
              * algorithm */
-            masterAlgorithm.input.add(kmeans::partialResults,
+            masterAlgorithm.input.add(kmeans_cpu::partialResults,
                                       dataForStep2FromStep1);
         }
 
@@ -131,11 +130,11 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
         masterAlgorithm.finalizeCompute();
 
         ret_cost = masterAlgorithm.getResult()
-                       ->get(kmeans::objectiveFunction)
+                       ->get(kmeans_cpu::objectiveFunction)
                        ->getValue(0, 0);
 
         /* Retrieve the algorithm results */
-        return masterAlgorithm.getResult()->get(kmeans::centroids);
+        return masterAlgorithm.getResult()->get(kmeans_cpu::centroids);
     }
     return NumericTablePtr();
 }
@@ -239,7 +238,6 @@ static jlong doKMeansDaalCompute(JNIEnv *env, jobject obj, int rankId,
         return (jlong)0;
     }
 }
-#endif
 
 #ifdef CPU_GPU_PROFILE
 static jlong doKMeansOneAPICompute(
@@ -253,13 +251,13 @@ static jlong doKMeansOneAPICompute(
         *reinterpret_cast<const homogen_table *>(pNumTabData);
     homogen_table centroids =
         *reinterpret_cast<const homogen_table *>(pNumTabCenters);
-    const auto kmeans_desc = kmeans::descriptor<>()
+    const auto kmeans_desc = kmeans_gpu::descriptor<>()
                                  .set_cluster_count(clusterNum)
                                  .set_max_iteration_count(iterationNum)
                                  .set_accuracy_threshold(tolerance);
-    kmeans::train_input local_input{htable, centroids};
+    kmeans_gpu::train_input local_input{htable, centroids};
     auto t1 = std::chrono::high_resolution_clock::now();
-    kmeans::train_result result_train =
+    kmeans_gpu::train_result result_train =
         preview::train(comm, kmeans_desc, local_input);
     if (isRoot) {
         std::cout << "Iteration count: " << result_train.get_iteration_count()
@@ -314,7 +312,6 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
     int rankId = cclComm.rank();
     ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
     switch (device) {
-#ifdef CPU_ONLY_PROFILE
     case ComputeDevice::host:
     case ComputeDevice::cpu: {
         NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
@@ -329,8 +326,9 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
         ret = doKMeansDaalCompute(env, obj, rankId, cclComm, pData, centroids,
                                   clusterNum, tolerance, iterationNum,
                                   executorNum, resultObj);
+        break;
     }
-#else
+#ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
         int nGpu = env->GetArrayLength(gpuIdxArray);
         std::cout << "oneDAL (native): use GPU kernels with " << nGpu
@@ -340,7 +338,6 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
 
         jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
         int size = cclComm.size();
-        ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
 
         auto queue =
             getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
@@ -352,6 +349,9 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
         ret = doKMeansOneAPICompute(env, pNumTabData, pNumTabCenters,
                                     clusterNum, tolerance, iterationNum, comm,
                                     resultObj);
+
+        env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
+        break;
     }
 #endif
     }
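Note: besides the namespace aliases, every GPU branch in this patch gains two fixes: a break so the case no longer falls through now that CPU and GPU cases can be compiled together, and an env->ReleaseIntArrayElements call to pair with the earlier GetIntArrayElements. A minimal JNI sketch of the Get/Release pairing; the function and class names are hypothetical, only the JNI calls themselves are real:

```cpp
#include <jni.h>

// Hypothetical native method: sums the GPU indices passed from Java.
// GetIntArrayElements may pin the Java array or hand back a copy; either
// way it must be matched by ReleaseIntArrayElements, or the buffer leaks
// (the omission this patch fixes on each GPU branch).
extern "C" JNIEXPORT jint JNICALL
Java_Example_sumGpuIndices(JNIEnv *env, jobject, jintArray gpuIdxArray) {
    jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, nullptr);
    if (gpuIndices == nullptr)
        return -1; // allocation failed, a Java exception is already pending

    jint sum = 0;
    const jsize n = env->GetArrayLength(gpuIdxArray);
    for (jsize i = 0; i < n; i++)
        sum += gpuIndices[i];

    // Mode 0: copy back changes (if a copy was made) and free the buffer.
    env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
    return sum;
}
```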
diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile
index 6b4a4eb8a..304b9a789 100644
--- a/mllib-dal/src/main/native/Makefile
+++ b/mllib-dal/src/main/native/Makefile
@@ -32,11 +32,12 @@ $(info )
 CFLAGS_COMMON := -g -Wall -Wno-deprecated-declarations -fPIC -std=c++17 \
     -I $(I_MPI_ROOT)/include \
     -I $(DAALROOT)/include \
+    -I $(CCL_ROOT)/include/cpu/oneapi/ \
     -I $(CMPLR_ROOT)/linux/include \
     -I $(CMPLR_ROOT)/linux/include/sycl
 
 ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
-    CFLAGS := $(CFLAGS_COMMON) -I $(CCL_ROOT)/include/cpu/oneapi/
+    CFLAGS := $(CFLAGS_COMMON)
 else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
     CFLAGS := $(CFLAGS_COMMON) -fsycl \
         -fsycl-device-code-split=per_kernel \
@@ -46,14 +47,15 @@ else
     exit 1
 endif
 
-INCS := -I $(JAVA_HOME)/include \
+INCS := -I $(CCL_ROOT)/include/cpu \
+    -I $(JAVA_HOME)/include \
     -I $(JAVA_HOME)/include/linux \
     -I $(DAALROOT)/include \
     -I ./javah \
     -I ./
 
 ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
-    INCS := $(INCS) -I $(CCL_ROOT)/include/cpu
+    INCS := $(INCS)
 else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
     INCS := $(INCS) -I $(CCL_ROOT)/include/cpu_gpu_dpcpp
 else
@@ -62,17 +64,15 @@ else
 endif
 
 # Use static link if possible, TBB is only available as dynamic libs
-LIBS_COMMON :=-L $(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \
-    -L $(DAALROOT)/lib/intel64 -lonedal_core -lonedal_thread -lonedal_dpc \
-    -L $(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc \
-    -L $(I_MPI_ROOT)
+LIBS_COMMON := -L$(CCL_ROOT)/lib/cpu -lccl \
+    -L$(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \
+    -L$(DAALROOT)/lib/intel64 -lonedal_core -lonedal_thread -lonedal_dpc \
+    -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc \
+    -L$(I_MPI_ROOT)
 
-ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
-    LIBS_COMMON := $(LIBS_COMMON) \
-        -L $(CCL_ROOT)/lib/cpu -lccl
-else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
+ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
     LIBS_COMMON := $(LIBS_COMMON) \
-        -L $(CCL_ROOT)/lib/cpu_gpu_dpcpp -lccl
+        -L$(CCL_ROOT)/lib/cpu_gpu_dpcpp -lccl
 endif
 
 ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
diff --git a/mllib-dal/src/main/native/OneCCL.cpp b/mllib-dal/src/main/native/OneCCL.cpp
index bbfc9fbe9..6264d36ca 100644
--- a/mllib-dal/src/main/native/OneCCL.cpp
+++ b/mllib-dal/src/main/native/OneCCL.cpp
@@ -100,11 +100,9 @@ Java_com_intel_oap_mllib_OneCCL_00024_c_1initDpcpp(JNIEnv *env, jobject) {
 
 JNIEXPORT void JNICALL
 Java_com_intel_oap_mllib_OneCCL_00024_c_1cleanup(JNIEnv *env, jobject obj) {
-
+    std::cerr << "OneCCL (native): cleanup" << std::endl;
     g_kvs.pop_back();
     g_comms.pop_back();
-
-    std::cerr << "OneCCL (native): cleanup" << std::endl;
 }
 
 JNIEXPORT jboolean JNICALL
diff --git a/mllib-dal/src/main/native/PCAImpl.cpp b/mllib-dal/src/main/native/PCAImpl.cpp
index 1996d7291..5fabd3e24 100644
--- a/mllib-dal/src/main/native/PCAImpl.cpp
+++ b/mllib-dal/src/main/native/PCAImpl.cpp
@@ -29,14 +29,14 @@ using namespace std;
 
 #ifdef CPU_GPU_PROFILE
-using namespace oneapi::dal;
-#else
+namespace pca_gpu = oneapi::dal::pca;
+namespace covariance_gpu = oneapi::dal::covariance;
+#endif
 using namespace daal;
-using namespace daal::algorithms;
 using namespace daal::services;
-#endif
+namespace pca_cpu = daal::algorithms::pca;
+namespace covariance_cpu = daal::algorithms::covariance;
 
-#ifdef CPU_ONLY_PROFILE
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
 static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
@@ -48,10 +48,10 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
 
     const bool isRoot = (rankId == ccl_root);
 
-    covariance::Distributed<step1Local, algorithmFPType> localAlgorithm;
+    covariance_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm;
 
     /* Set the input data set to the algorithm */
-    localAlgorithm.input.set(covariance::data, pData);
+    localAlgorithm.input.set(covariance_cpu::data, pData);
 
     /* Compute covariance for PCA*/
     localAlgorithm.compute();
@@ -90,7 +90,8 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
     if (isRoot) {
         auto t1 = std::chrono::high_resolution_clock::now();
         /* Create an algorithm to compute covariance on the master node */
-        covariance::Distributed<step2Master, algorithmFPType> masterAlgorithm;
+        covariance_cpu::Distributed<step2Master, algorithmFPType>
+            masterAlgorithm;
 
         for (size_t i = 0; i < nBlocks; i++) {
             /* Deserialize partial results from step 1 */
@@ -98,19 +99,19 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
                                        perNodeArchLength * i,
                                        perNodeArchLength);
 
-            covariance::PartialResultPtr dataForStep2FromStep1(
-                new covariance::PartialResult());
+            covariance_cpu::PartialResultPtr dataForStep2FromStep1(
+                new covariance_cpu::PartialResult());
             dataForStep2FromStep1->deserialize(dataArch);
 
             /* Set local partial results as input for the master-node
              * algorithm */
-            masterAlgorithm.input.add(covariance::partialResults,
+            masterAlgorithm.input.add(covariance_cpu::partialResults,
                                       dataForStep2FromStep1);
         }
 
         /* Set the parameter to choose the type of the output matrix */
         masterAlgorithm.parameter.outputMatrixType =
-            covariance::covarianceMatrix;
+            covariance_cpu::covarianceMatrix;
 
         /* Merge and finalizeCompute covariance decomposition on the master
          * node */
@@ -118,7 +119,8 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
         masterAlgorithm.finalizeCompute();
 
         /* Retrieve the algorithm results */
-        covariance::ResultPtr covariance_result = masterAlgorithm.getResult();
+        covariance_cpu::ResultPtr covariance_result =
+            masterAlgorithm.getResult();
         auto t2 = std::chrono::high_resolution_clock::now();
         auto duration =
             std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
@@ -130,12 +132,12 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
 
         /* Create an algorithm for principal component analysis using the
          * correlation method*/
-        pca::Batch<algorithmFPType> algorithm;
+        pca_cpu::Batch<algorithmFPType> algorithm;
 
         /* Set the algorithm input data*/
-        algorithm.input.set(pca::correlation,
-                            covariance_result->get(covariance::covariance));
-        algorithm.parameter.resultsToCompute = pca::eigenvalue;
+        algorithm.input.set(pca_cpu::correlation,
+                            covariance_result->get(covariance_cpu::covariance));
+        algorithm.parameter.resultsToCompute = pca_cpu::eigenvalue;
 
         /* Compute results of the PCA algorithm*/
         algorithm.compute();
@@ -148,11 +150,11 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
                   << " secs" << std::endl;
 
         /* Print the results */
-        pca::ResultPtr result = algorithm.getResult();
-        printNumericTable(result->get(pca::eigenvalues),
+        pca_cpu::ResultPtr result = algorithm.getResult();
+        printNumericTable(result->get(pca_cpu::eigenvalues),
                           "First 10 eigenvalues with first 20 dimensions:", 10,
                           20);
-        printNumericTable(result->get(pca::eigenvectors),
+        printNumericTable(result->get(pca_cpu::eigenvectors),
                           "First 10 eigenvectors with first 20 dimensions:",
                           10, 20);
 
@@ -166,16 +168,15 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId,
             env->GetFieldID(clazz, "explainedVarianceNumericTable", "J");
 
         NumericTablePtr *eigenvalues =
-            new NumericTablePtr(result->get(pca::eigenvalues));
+            new NumericTablePtr(result->get(pca_cpu::eigenvalues));
         NumericTablePtr *eigenvectors =
-            new NumericTablePtr(result->get(pca::eigenvectors));
+            new NumericTablePtr(result->get(pca_cpu::eigenvectors));
 
         env->SetLongField(resultObj, pcNumericTableField, (jlong)eigenvectors);
         env->SetLongField(resultObj, explainedVarianceNumericTableField,
                           (jlong)eigenvalues);
     }
 }
-#endif
 
 #ifdef CPU_GPU_PROFILE
 static void doPCAOneAPICompute(
@@ -187,8 +188,8 @@ static void doPCAOneAPICompute(
     homogen_table htable =
         *reinterpret_cast<const homogen_table *>(pNumTabData);
 
-    const auto cov_desc = covariance::descriptor{}.set_result_options(
-        covariance::result_options::cov_matrix);
+    const auto cov_desc = covariance_gpu::descriptor{}.set_result_options(
+        covariance_gpu::result_options::cov_matrix);
 
     auto t1 = std::chrono::high_resolution_clock::now();
     const auto result = preview::compute(comm, cov_desc, htable);
@@ -199,9 +200,9 @@ static void doPCAOneAPICompute(
               << " secs" << std::endl;
     if (isRoot) {
         using float_t = double;
-        using method_t = pca::method::precomputed;
-        using task_t = pca::task::dim_reduction;
-        using descriptor_t = pca::descriptor<float_t, method_t, task_t>;
+        using method_t = pca_gpu::method::precomputed;
+        using task_t = pca_gpu::task::dim_reduction;
+        using descriptor_t = pca_gpu::descriptor<float_t, method_t, task_t>;
 
         const auto pca_desc = descriptor_t().set_deterministic(true);
         t1 = std::chrono::high_resolution_clock::now();
@@ -254,7 +255,6 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
     int rankId = cclComm.rank();
     ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
     switch (device) {
-#ifdef CPU_ONLY_PROFILE
     case ComputeDevice::host:
     case ComputeDevice::cpu: {
         NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
@@ -267,8 +267,9 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
                   << nThreadsNew << std::endl;
         doPCADAALCompute(env, obj, rankId, cclComm, pData, executorNum,
                          resultObj);
+        break;
     }
-#else
+#ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
         int nGpu = env->GetArrayLength(gpuIdxArray);
         std::cout << "oneDAL (native): use GPU kernels with " << nGpu
@@ -278,7 +279,6 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
 
         jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
         int size = cclComm.size();
-        ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
 
         auto queue =
             getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
@@ -288,6 +288,8 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL(
             preview::spmd::make_communicator<preview::spmd::backend::ccl>(
                 queue, size, rankId, kvs);
         doPCAOneAPICompute(env, pNumTabData, comm, resultObj);
+        env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
+        break;
     }
 #endif
     }
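Note: after this patch, CorrelationImpl, KMeansImpl, PCAImpl, and SummarizerImpl below all share the same dispatch shape: the host/cpu cases compile in both build profiles, and only the gpu case stays behind CPU_GPU_PROFILE. A compressed sketch of the resulting control flow; ComputeDevice and the handlers here are stand-ins for the real types and compute functions:

```cpp
// Sketch of the unified dispatch after this patch (illustrative types).
enum class ComputeDevice { host, cpu, gpu };

[[maybe_unused]] static void runCpu() { /* DAAL distributed kernels */ }
[[maybe_unused]] static void runGpu() { /* oneDAL SYCL kernels */ }

void dispatch(ComputeDevice device) {
    switch (device) {
    case ComputeDevice::host:
    case ComputeDevice::cpu:
        runCpu();
        break; // needed now that the gpu case can be compiled alongside
#ifdef CPU_GPU_PROFILE
    case ComputeDevice::gpu:
        runGpu();
        break;
#endif
    default:
        break; // unknown device: nothing to do in this sketch
    }
}
```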
diff --git a/mllib-dal/src/main/native/SummarizerImpl.cpp b/mllib-dal/src/main/native/SummarizerImpl.cpp
index 22c50ace2..8333be531 100644
--- a/mllib-dal/src/main/native/SummarizerImpl.cpp
+++ b/mllib-dal/src/main/native/SummarizerImpl.cpp
@@ -28,13 +28,10 @@ using namespace std;
 
 #ifdef CPU_GPU_PROFILE
 using namespace oneapi::dal;
-#else
+#endif
 using namespace daal;
 using namespace daal::algorithms;
 using namespace daal::services;
-#endif
-
-#ifdef CPU_ONLY_PROFILE
 
 typedef double algorithmFPType; /* Algorithm floating-point type */
 
@@ -199,7 +196,6 @@ static void doSummarizerDAALCompute(JNIEnv *env, jobject obj, int rankId,
         env->SetLongField(resultObj, minimumNumericTableField, (jlong)min);
     }
 }
-#endif
 
 #ifdef CPU_GPU_PROFILE
 static void doSummarizerOneAPICompute(
@@ -271,7 +267,6 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
     int rankId = cclComm.rank();
     ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
     switch (device) {
-#ifdef CPU_ONLY_PROFILE
     case ComputeDevice::host:
     case ComputeDevice::cpu: {
         NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
@@ -284,8 +279,9 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
                   << nThreadsNew << std::endl;
         doSummarizerDAALCompute(env, obj, rankId, cclComm, pData, executorNum,
                                 resultObj);
+        break;
     }
-#else
+#ifdef CPU_GPU_PROFILE
     case ComputeDevice::gpu: {
         int nGpu = env->GetArrayLength(gpuIdxArray);
         std::cout << "oneDAL (native): use GPU kernels with " << nGpu
@@ -295,7 +291,6 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
 
         jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);
         int size = cclComm.size();
-        ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
 
         auto queue =
             getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
@@ -305,9 +300,10 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL(
             preview::spmd::make_communicator<preview::spmd::backend::ccl>(
                 queue, size, rankId, kvs);
         doSummarizerOneAPICompute(env, pNumTabData, comm, resultObj);
+        env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
+        break;
     }
 #endif
     }
-
     return 0;
 }