From 88e801e67d24917a2230a57e2f26ab562b47d0e1 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 25 Apr 2023 10:14:29 +0000 Subject: [PATCH 1/9] fix GPU-CPU-PROFILE jar can't not run daal CPU Signed-off-by: minmingzhu --- .github/workflows/ci-tests.yml | 27 ++++++++- dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh | 37 ++++++++++++ ...=> ci-standalone-CPU_ONLY_PROFILE-test.sh} | 0 mllib-dal/src/main/native/CorrelationImpl.cpp | 40 ++++++------- mllib-dal/src/main/native/KMeansImpl.cpp | 37 ++++++------ mllib-dal/src/main/native/Makefile | 24 ++++---- mllib-dal/src/main/native/PCAImpl.cpp | 60 +++++++++---------- mllib-dal/src/main/native/SummarizerImpl.cpp | 10 +--- 8 files changed, 141 insertions(+), 94 deletions(-) create mode 100755 dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh rename dev/ci/{ci-standalone-test.sh => ci-standalone-CPU_ONLY_PROFILE-test.sh} (100%) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 52e2917c7..86ca043f5 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -73,8 +73,31 @@ jobs: - name: Cluster Test run: | ${{github.workspace}}/dev/ci/ci-yarn-test.sh - standalone-test: - name: Standalone Test for Examples (CPU) + standalone-CPU_ONLY_PROFILE-test: + name: Standalone Test for Examples + runs-on: ubuntu-20.04 + steps: + - uses: actions/checkout@v3 + - name: Set up JDK 1.8 + uses: actions/setup-java@v1 + with: + java-version: 1.8 + - name: Restore cached dependencies + uses: actions/cache@v3 + with: + path: | + #/var/cache/apt/archives/*.deb + ~/.m2/repository + /opt/intel/oneapi + ~/opt + key: ${{ runner.os }}_spark-3.2.0_hadoop-3.2.0_oneapi-2023.0.0 + restore-keys: | + ${{ runner.os }}- + - name: Cluster Test + run: | + ${{github.workspace}}/dev/ci/ci-standalone-test.sh +standalone-CPU_GPU_PROFILE-test: + name: Standalone Test for Examples runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 diff --git a/dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh b/dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh new file mode 100755 index 000000000..a0a8ae2fe --- /dev/null +++ b/dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +# exit when any command fails +set -e + +# keep track of the last executed command +trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG +# echo an error message before exiting +trap 'echo "\"${last_command}\" command filed with exit code $?."' EXIT + +# Install dependencies for building +$GITHUB_WORKSPACE/dev/install-build-deps-ubuntu.sh + +# Setup building envs +source /opt/intel/oneapi/setvars.sh --ccl-configuration=cpu + +# Prepare lib resources +cd $GITHUB_WORKSPACE/mllib-dal +../dev/prepare-build-deps.sh +./build.sh -p CPU_GPU_PROFILE -q + +# Setup cluster +source $GITHUB_WORKSPACE/dev/test-cluster/standalone/setup-cluster.sh + +# Setup OAP MLlib envs +cp $GITHUB_WORKSPACE/dev/test-cluster/standalone/env.sh $GITHUB_WORKSPACE/conf +cd $GITHUB_WORKSPACE/examples + + +echo "=========================================" +echo "Cluster Testing with Spark Version: $SPARK_VERSION" +echo "=========================================" + +# Build and run all examples +./build-all-scala.sh +./run-all-scala.sh +./run-all-pyspark.sh diff --git a/dev/ci/ci-standalone-test.sh b/dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh similarity index 100% rename from dev/ci/ci-standalone-test.sh rename to dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh diff --git a/mllib-dal/src/main/native/CorrelationImpl.cpp b/mllib-dal/src/main/native/CorrelationImpl.cpp index a86e33a1a..1ce43470d 100644 --- a/mllib-dal/src/main/native/CorrelationImpl.cpp +++ b/mllib-dal/src/main/native/CorrelationImpl.cpp @@ -27,14 +27,12 @@ using namespace std; #ifdef CPU_GPU_PROFILE -using namespace oneapi::dal; -#else +namespace covariance_gpu = oneapi::dal::covariance; +#endif using namespace daal; -using namespace daal::algorithms; using namespace daal::services; -#endif +namespace covariance_cpu = daal::algorithms::covariance; -#ifdef CPU_ONLY_PROFILE typedef double algorithmFPType; /* Algorithm floating-point type */ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId, @@ -46,10 +44,10 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId, const bool isRoot = (rankId == ccl_root); - covariance::Distributed localAlgorithm; + covariance_cpu::Distributed localAlgorithm; /* Set the input data set to the algorithm */ - localAlgorithm.input.set(covariance::data, pData); + localAlgorithm.input.set(covariance_cpu::data, pData); /* Compute covariance */ localAlgorithm.compute(); @@ -89,7 +87,8 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId, if (isRoot) { auto t1 = std::chrono::high_resolution_clock::now(); /* Create an algorithm to compute covariance on the master node */ - covariance::Distributed masterAlgorithm; + covariance_cpu::Distributed + masterAlgorithm; for (size_t i = 0; i < nBlocks; i++) { /* Deserialize partial results from step 1 */ @@ -97,19 +96,19 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId, perNodeArchLength * i, perNodeArchLength); - covariance::PartialResultPtr dataForStep2FromStep1( - new covariance::PartialResult()); + covariance_cpu::PartialResultPtr dataForStep2FromStep1( + new covariance_cpu::PartialResult()); dataForStep2FromStep1->deserialize(dataArch); /* Set local partial results as input for the master-node algorithm */ - masterAlgorithm.input.add(covariance::partialResults, + masterAlgorithm.input.add(covariance_cpu::partialResults, dataForStep2FromStep1); } /* Set the parameter to choose the type of the output matrix */ masterAlgorithm.parameter.outputMatrixType = - covariance::correlationMatrix; + covariance_cpu::correlationMatrix; /* Merge and finalizeCompute covariance decomposition on the master node */ @@ -117,7 +116,7 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId, masterAlgorithm.finalizeCompute(); /* Retrieve the algorithm results */ - covariance::ResultPtr result = masterAlgorithm.getResult(); + covariance_cpu::ResultPtr result = masterAlgorithm.getResult(); auto t2 = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(t2 - t1) @@ -126,7 +125,7 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId, << duration / 1000 << " secs" << std::endl; /* Print the results */ - printNumericTable(result->get(covariance::correlation), + printNumericTable(result->get(covariance_cpu::correlation), "Correlation first 20 columns of " "correlation matrix:", 1, 20); @@ -138,13 +137,12 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId, env->GetFieldID(clazz, "correlationNumericTable", "J"); NumericTablePtr *correlation = - new NumericTablePtr(result->get(covariance::correlation)); + new NumericTablePtr(result->get(covariance_cpu::correlation)); env->SetLongField(resultObj, correlationNumericTableField, (jlong)correlation); } } -#endif #ifdef CPU_GPU_PROFILE static void doCorrelationOneAPICompute( @@ -156,9 +154,9 @@ static void doCorrelationOneAPICompute( homogen_table htable = *reinterpret_cast(pNumTabData); - const auto cor_desc = covariance::descriptor{}.set_result_options( - covariance::result_options::cor_matrix | - covariance::result_options::means); + const auto cor_desc = covariance_gpu::descriptor{}.set_result_options( + covariance_gpu::result_options::cor_matrix | + covariance_gpu::result_options::means); auto t1 = std::chrono::high_resolution_clock::now(); const auto result_train = preview::compute(comm, cor_desc, htable); if (isRoot) { @@ -200,7 +198,6 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL( int rankId = cclComm.rank(); ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); switch (device) { -#ifdef CPU_ONLY_PROFILE case ComputeDevice::host: case ComputeDevice::cpu: { NumericTablePtr pData = *((NumericTablePtr *)pNumTabData); @@ -214,7 +211,7 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL( doCorrelationDaalCompute(env, obj, rankId, cclComm, pData, executorNum, resultObj); } -#else +#ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { int nGpu = env->GetArrayLength(gpuIdxArray); std::cout << "oneDAL (native): use GPU kernels with " << nGpu @@ -237,6 +234,5 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL( } #endif } - return 0; } diff --git a/mllib-dal/src/main/native/KMeansImpl.cpp b/mllib-dal/src/main/native/KMeansImpl.cpp index 8a7d969ff..ac08d9c2d 100644 --- a/mllib-dal/src/main/native/KMeansImpl.cpp +++ b/mllib-dal/src/main/native/KMeansImpl.cpp @@ -29,14 +29,12 @@ using namespace std; #ifdef CPU_GPU_PROFILE -using namespace oneapi::dal; -#else +namespace kmeans_gpu = oneapi::dal::kmeans; +#endif using namespace daal; -using namespace daal::algorithms; using namespace daal::services; -#endif +namespace kmeans_cpu = daal::algorithms::kmeans; -#ifdef CPU_ONLY_PROFILE typedef double algorithmFPType; /* Algorithm floating-point type */ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm, @@ -75,11 +73,12 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm, centroids->deserialize(outArch); /* Create an algorithm to compute k-means on local nodes */ - kmeans::Distributed localAlgorithm(nClusters); + kmeans_cpu::Distributed localAlgorithm( + nClusters); /* Set the input data set to the algorithm */ - localAlgorithm.input.set(kmeans::data, pData); - localAlgorithm.input.set(kmeans::inputCentroids, centroids); + localAlgorithm.input.set(kmeans_cpu::data, pData); + localAlgorithm.input.set(kmeans_cpu::inputCentroids, centroids); /* Compute k-means */ localAlgorithm.compute(); @@ -108,7 +107,7 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm, if (isRoot) { /* Create an algorithm to compute k-means on the master node */ - kmeans::Distributed masterAlgorithm( + kmeans_cpu::Distributed masterAlgorithm( nClusters); for (size_t i = 0; i < nBlocks; i++) { @@ -116,13 +115,13 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm, OutputDataArchive dataArch(&serializedData[perNodeArchLength * i], perNodeArchLength); - kmeans::PartialResultPtr dataForStep2FromStep1( - new kmeans::PartialResult()); + kmeans_cpu::PartialResultPtr dataForStep2FromStep1( + new kmeans_cpu::PartialResult()); dataForStep2FromStep1->deserialize(dataArch); /* Set local partial results as input for the master-node algorithm */ - masterAlgorithm.input.add(kmeans::partialResults, + masterAlgorithm.input.add(kmeans_cpu::partialResults, dataForStep2FromStep1); } @@ -131,11 +130,11 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm, masterAlgorithm.finalizeCompute(); ret_cost = masterAlgorithm.getResult() - ->get(kmeans::objectiveFunction) + ->get(kmeans_cpu::objectiveFunction) ->getValue(0, 0); /* Retrieve the algorithm results */ - return masterAlgorithm.getResult()->get(kmeans::centroids); + return masterAlgorithm.getResult()->get(kmeans_cpu::centroids); } return NumericTablePtr(); } @@ -239,7 +238,6 @@ static jlong doKMeansDaalCompute(JNIEnv *env, jobject obj, int rankId, return (jlong)0; } } -#endif #ifdef CPU_GPU_PROFILE static jlong doKMeansOneAPICompute( @@ -253,13 +251,13 @@ static jlong doKMeansOneAPICompute( *reinterpret_cast(pNumTabData); homogen_table centroids = *reinterpret_cast(pNumTabCenters); - const auto kmeans_desc = kmeans::descriptor<>() + const auto kmeans_desc = kmeans_gpu::descriptor<>() .set_cluster_count(clusterNum) .set_max_iteration_count(iterationNum) .set_accuracy_threshold(tolerance); - kmeans::train_input local_input{htable, centroids}; + kmeans_gpu::train_input local_input{htable, centroids}; auto t1 = std::chrono::high_resolution_clock::now(); - kmeans::train_result result_train = + kmeans_gpu::train_result result_train = preview::train(comm, kmeans_desc, local_input); if (isRoot) { std::cout << "Iteration count: " << result_train.get_iteration_count() @@ -314,7 +312,6 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe int rankId = cclComm.rank(); ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); switch (device) { -#ifdef CPU_ONLY_PROFILE case ComputeDevice::host: case ComputeDevice::cpu: { NumericTablePtr pData = *((NumericTablePtr *)pNumTabData); @@ -330,7 +327,7 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe clusterNum, tolerance, iterationNum, executorNum, resultObj); } -#else +#ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { int nGpu = env->GetArrayLength(gpuIdxArray); std::cout << "oneDAL (native): use GPU kernels with " << nGpu diff --git a/mllib-dal/src/main/native/Makefile b/mllib-dal/src/main/native/Makefile index 6b4a4eb8a..304b9a789 100644 --- a/mllib-dal/src/main/native/Makefile +++ b/mllib-dal/src/main/native/Makefile @@ -32,11 +32,12 @@ $(info ) CFLAGS_COMMON := -g -Wall -Wno-deprecated-declarations -fPIC -std=c++17 \ -I $(I_MPI_ROOT)/include \ -I $(DAALROOT)/include \ + -I $(CCL_ROOT)/include/cpu/oneapi/ \ -I $(CMPLR_ROOT)/linux/include \ -I $(CMPLR_ROOT)/linux/include/sycl ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE) - CFLAGS := $(CFLAGS_COMMON) -I $(CCL_ROOT)/include/cpu/oneapi/ + CFLAGS := $(CFLAGS_COMMON) else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE) CFLAGS := $(CFLAGS_COMMON) -fsycl \ -fsycl-device-code-split=per_kernel \ @@ -46,14 +47,15 @@ else exit 1 endif -INCS := -I $(JAVA_HOME)/include \ +INCS := -I $(CCL_ROOT)/include/cpu \ + -I $(JAVA_HOME)/include \ -I $(JAVA_HOME)/include/linux \ -I $(DAALROOT)/include \ -I ./javah \ -I ./ ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE) - INCS := $(INCS) -I $(CCL_ROOT)/include/cpu + INCS := $(INCS) else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE) INCS := $(INCS) -I $(CCL_ROOT)/include/cpu_gpu_dpcpp else @@ -62,17 +64,15 @@ else endif # Use static link if possible, TBB is only available as dynamic libs -LIBS_COMMON :=-L $(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \ - -L $(DAALROOT)/lib/intel64 -lonedal_core -lonedal_thread -lonedal_dpc \ - -L $(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc \ - -L $(I_MPI_ROOT) +LIBS_COMMON := -L$(CCL_ROOT)/lib/cpu -lccl \ + -L$(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \ + -L$(DAALROOT)/lib/intel64 -lonedal_core -lonedal_thread -lonedal_dpc \ + -L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc \ + -L$(I_MPI_ROOT) -ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE) - LIBS_COMMON := $(LIBS_COMMON) \ - -L $(CCL_ROOT)/lib/cpu -lccl -else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE) +ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE) LIBS_COMMON := $(LIBS_COMMON) \ - -L $(CCL_ROOT)/lib/cpu_gpu_dpcpp -lccl + -L$(CCL_ROOT)/lib/cpu_gpu_dpcpp -lccl endif ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE) diff --git a/mllib-dal/src/main/native/PCAImpl.cpp b/mllib-dal/src/main/native/PCAImpl.cpp index 1996d7291..d07c06acc 100644 --- a/mllib-dal/src/main/native/PCAImpl.cpp +++ b/mllib-dal/src/main/native/PCAImpl.cpp @@ -29,14 +29,14 @@ using namespace std; #ifdef CPU_GPU_PROFILE -using namespace oneapi::dal; -#else +namespace pca_gpu = oneapi::dal::pca; +namespace covariance_gpu = oneapi::dal::covariance; +#endif using namespace daal; -using namespace daal::algorithms; using namespace daal::services; -#endif +namespace pca_cpu = daal::algorithms::pca; +namespace covariance_cpu = daal::algorithms::covariance; -#ifdef CPU_ONLY_PROFILE typedef double algorithmFPType; /* Algorithm floating-point type */ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId, @@ -48,10 +48,10 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId, const bool isRoot = (rankId == ccl_root); - covariance::Distributed localAlgorithm; + covariance_cpu::Distributed localAlgorithm; /* Set the input data set to the algorithm */ - localAlgorithm.input.set(covariance::data, pData); + localAlgorithm.input.set(covariance_cpu::data, pData); /* Compute covariance for PCA*/ localAlgorithm.compute(); @@ -90,7 +90,8 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId, if (isRoot) { auto t1 = std::chrono::high_resolution_clock::now(); /* Create an algorithm to compute covariance on the master node */ - covariance::Distributed masterAlgorithm; + covariance_cpu::Distributed + masterAlgorithm; for (size_t i = 0; i < nBlocks; i++) { /* Deserialize partial results from step 1 */ @@ -98,19 +99,19 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId, perNodeArchLength * i, perNodeArchLength); - covariance::PartialResultPtr dataForStep2FromStep1( - new covariance::PartialResult()); + covariance_cpu::PartialResultPtr dataForStep2FromStep1( + new covariance_cpu::PartialResult()); dataForStep2FromStep1->deserialize(dataArch); /* Set local partial results as input for the master-node algorithm */ - masterAlgorithm.input.add(covariance::partialResults, + masterAlgorithm.input.add(covariance_cpu::partialResults, dataForStep2FromStep1); } /* Set the parameter to choose the type of the output matrix */ masterAlgorithm.parameter.outputMatrixType = - covariance::covarianceMatrix; + covariance_cpu::covarianceMatrix; /* Merge and finalizeCompute covariance decomposition on the master node */ @@ -118,7 +119,8 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId, masterAlgorithm.finalizeCompute(); /* Retrieve the algorithm results */ - covariance::ResultPtr covariance_result = masterAlgorithm.getResult(); + covariance_cpu::ResultPtr covariance_result = + masterAlgorithm.getResult(); auto t2 = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast(t2 - t1) @@ -130,12 +132,12 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId, /* Create an algorithm for principal component analysis using the * correlation method*/ - pca::Batch algorithm; + pca_cpu::Batch algorithm; /* Set the algorithm input data*/ - algorithm.input.set(pca::correlation, - covariance_result->get(covariance::covariance)); - algorithm.parameter.resultsToCompute = pca::eigenvalue; + algorithm.input.set(pca_cpu::correlation, + covariance_result->get(covariance_cpu::covariance)); + algorithm.parameter.resultsToCompute = pca_cpu::eigenvalue; /* Compute results of the PCA algorithm*/ algorithm.compute(); @@ -148,11 +150,11 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId, << " secs" << std::endl; /* Print the results */ - pca::ResultPtr result = algorithm.getResult(); - printNumericTable(result->get(pca::eigenvalues), + pca_cpu::ResultPtr result = algorithm.getResult(); + printNumericTable(result->get(pca_cpu::eigenvalues), "First 10 eigenvalues with first 20 dimensions:", 10, 20); - printNumericTable(result->get(pca::eigenvectors), + printNumericTable(result->get(pca_cpu::eigenvectors), "First 10 eigenvectors with first 20 dimensions:", 10, 20); @@ -166,16 +168,15 @@ static void doPCADAALCompute(JNIEnv *env, jobject obj, int rankId, env->GetFieldID(clazz, "explainedVarianceNumericTable", "J"); NumericTablePtr *eigenvalues = - new NumericTablePtr(result->get(pca::eigenvalues)); + new NumericTablePtr(result->get(pca_cpu::eigenvalues)); NumericTablePtr *eigenvectors = - new NumericTablePtr(result->get(pca::eigenvectors)); + new NumericTablePtr(result->get(pca_cpu::eigenvectors)); env->SetLongField(resultObj, pcNumericTableField, (jlong)eigenvectors); env->SetLongField(resultObj, explainedVarianceNumericTableField, (jlong)eigenvalues); } } -#endif #ifdef CPU_GPU_PROFILE static void doPCAOneAPICompute( @@ -187,8 +188,8 @@ static void doPCAOneAPICompute( homogen_table htable = *reinterpret_cast(pNumTabData); - const auto cov_desc = covariance::descriptor{}.set_result_options( - covariance::result_options::cov_matrix); + const auto cov_desc = covariance_gpu::descriptor{}.set_result_options( + covariance_gpu::result_options::cov_matrix); auto t1 = std::chrono::high_resolution_clock::now(); const auto result = preview::compute(comm, cov_desc, htable); @@ -199,9 +200,9 @@ static void doPCAOneAPICompute( << " secs" << std::endl; if (isRoot) { using float_t = double; - using method_t = pca::method::precomputed; - using task_t = pca::task::dim_reduction; - using descriptor_t = pca::descriptor; + using method_t = pca_gpu::method::precomputed; + using task_t = pca_gpu::task::dim_reduction; + using descriptor_t = pca_gpu::descriptor; const auto pca_desc = descriptor_t().set_deterministic(true); t1 = std::chrono::high_resolution_clock::now(); @@ -254,7 +255,6 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL( int rankId = cclComm.rank(); ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); switch (device) { -#ifdef CPU_ONLY_PROFILE case ComputeDevice::host: case ComputeDevice::cpu: { NumericTablePtr pData = *((NumericTablePtr *)pNumTabData); @@ -268,7 +268,7 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL( doPCADAALCompute(env, obj, rankId, cclComm, pData, executorNum, resultObj); } -#else +#ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { int nGpu = env->GetArrayLength(gpuIdxArray); std::cout << "oneDAL (native): use GPU kernels with " << nGpu diff --git a/mllib-dal/src/main/native/SummarizerImpl.cpp b/mllib-dal/src/main/native/SummarizerImpl.cpp index 22c50ace2..6786dc794 100644 --- a/mllib-dal/src/main/native/SummarizerImpl.cpp +++ b/mllib-dal/src/main/native/SummarizerImpl.cpp @@ -28,13 +28,10 @@ using namespace std; #ifdef CPU_GPU_PROFILE using namespace oneapi::dal; -#else +#endif using namespace daal; using namespace daal::algorithms; using namespace daal::services; -#endif - -#ifdef CPU_ONLY_PROFILE typedef double algorithmFPType; /* Algorithm floating-point type */ @@ -199,7 +196,6 @@ static void doSummarizerDAALCompute(JNIEnv *env, jobject obj, int rankId, env->SetLongField(resultObj, minimumNumericTableField, (jlong)min); } } -#endif #ifdef CPU_GPU_PROFILE static void doSummarizerOneAPICompute( @@ -271,7 +267,6 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL( int rankId = cclComm.rank(); ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); switch (device) { -#ifdef CPU_ONLY_PROFILE case ComputeDevice::host: case ComputeDevice::cpu: { NumericTablePtr pData = *((NumericTablePtr *)pNumTabData); @@ -285,7 +280,7 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL( doSummarizerDAALCompute(env, obj, rankId, cclComm, pData, executorNum, resultObj); } -#else +#ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { int nGpu = env->GetArrayLength(gpuIdxArray); std::cout << "oneDAL (native): use GPU kernels with " << nGpu @@ -308,6 +303,5 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL( } #endif } - return 0; } From d5a7ea8941fb279264f1e6ed4ad71c69d5b10dfa Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 25 Apr 2023 10:38:54 +0000 Subject: [PATCH 2/9] update Signed-off-by: minmingzhu --- mllib-dal/src/main/native/CorrelationImpl.cpp | 1 + mllib-dal/src/main/native/KMeansImpl.cpp | 2 ++ mllib-dal/src/main/native/PCAImpl.cpp | 1 + mllib-dal/src/main/native/SummarizerImpl.cpp | 1 + 4 files changed, 5 insertions(+) diff --git a/mllib-dal/src/main/native/CorrelationImpl.cpp b/mllib-dal/src/main/native/CorrelationImpl.cpp index 1ce43470d..dddc35284 100644 --- a/mllib-dal/src/main/native/CorrelationImpl.cpp +++ b/mllib-dal/src/main/native/CorrelationImpl.cpp @@ -231,6 +231,7 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL( preview::spmd::make_communicator( queue, size, rankId, kvs); doCorrelationOneAPICompute(env, pNumTabData, comm, resultObj); + env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); } #endif } diff --git a/mllib-dal/src/main/native/KMeansImpl.cpp b/mllib-dal/src/main/native/KMeansImpl.cpp index ac08d9c2d..d864ea671 100644 --- a/mllib-dal/src/main/native/KMeansImpl.cpp +++ b/mllib-dal/src/main/native/KMeansImpl.cpp @@ -349,6 +349,8 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe ret = doKMeansOneAPICompute(env, pNumTabData, pNumTabCenters, clusterNum, tolerance, iterationNum, comm, resultObj); + + env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); } #endif } diff --git a/mllib-dal/src/main/native/PCAImpl.cpp b/mllib-dal/src/main/native/PCAImpl.cpp index d07c06acc..935942f0f 100644 --- a/mllib-dal/src/main/native/PCAImpl.cpp +++ b/mllib-dal/src/main/native/PCAImpl.cpp @@ -288,6 +288,7 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL( preview::spmd::make_communicator( queue, size, rankId, kvs); doPCAOneAPICompute(env, pNumTabData, comm, resultObj); + env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); } #endif } diff --git a/mllib-dal/src/main/native/SummarizerImpl.cpp b/mllib-dal/src/main/native/SummarizerImpl.cpp index 6786dc794..20f0e87d6 100644 --- a/mllib-dal/src/main/native/SummarizerImpl.cpp +++ b/mllib-dal/src/main/native/SummarizerImpl.cpp @@ -300,6 +300,7 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL( preview::spmd::make_communicator( queue, size, rankId, kvs); doSummarizerOneAPICompute(env, pNumTabData, comm, resultObj); + env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); } #endif } From c6f7241e2bb05c0165079d96a7b1ea55636f644b Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 25 Apr 2023 13:20:27 +0000 Subject: [PATCH 3/9] update Signed-off-by: minmingzhu --- mllib-dal/src/main/native/CorrelationImpl.cpp | 3 ++- mllib-dal/src/main/native/KMeansImpl.cpp | 3 ++- mllib-dal/src/main/native/OneCCL.cpp | 4 +--- mllib-dal/src/main/native/PCAImpl.cpp | 3 ++- mllib-dal/src/main/native/SummarizerImpl.cpp | 3 ++- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/mllib-dal/src/main/native/CorrelationImpl.cpp b/mllib-dal/src/main/native/CorrelationImpl.cpp index dddc35284..aba09e5a1 100644 --- a/mllib-dal/src/main/native/CorrelationImpl.cpp +++ b/mllib-dal/src/main/native/CorrelationImpl.cpp @@ -210,6 +210,7 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL( << nThreadsNew << std::endl; doCorrelationDaalCompute(env, obj, rankId, cclComm, pData, executorNum, resultObj); + break; } #ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { @@ -221,7 +222,6 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL( jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); int size = cclComm.size(); - ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); auto queue = getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu); @@ -232,6 +232,7 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL( queue, size, rankId, kvs); doCorrelationOneAPICompute(env, pNumTabData, comm, resultObj); env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); + break; } #endif } diff --git a/mllib-dal/src/main/native/KMeansImpl.cpp b/mllib-dal/src/main/native/KMeansImpl.cpp index d864ea671..f167d41ed 100644 --- a/mllib-dal/src/main/native/KMeansImpl.cpp +++ b/mllib-dal/src/main/native/KMeansImpl.cpp @@ -326,6 +326,7 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe ret = doKMeansDaalCompute(env, obj, rankId, cclComm, pData, centroids, clusterNum, tolerance, iterationNum, executorNum, resultObj); + break; } #ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { @@ -337,7 +338,6 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); int size = cclComm.size(); - ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); auto queue = getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu); @@ -351,6 +351,7 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe tolerance, iterationNum, comm, resultObj); env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); + break; } #endif } diff --git a/mllib-dal/src/main/native/OneCCL.cpp b/mllib-dal/src/main/native/OneCCL.cpp index bbfc9fbe9..6264d36ca 100644 --- a/mllib-dal/src/main/native/OneCCL.cpp +++ b/mllib-dal/src/main/native/OneCCL.cpp @@ -100,11 +100,9 @@ Java_com_intel_oap_mllib_OneCCL_00024_c_1initDpcpp(JNIEnv *env, jobject) { JNIEXPORT void JNICALL Java_com_intel_oap_mllib_OneCCL_00024_c_1cleanup(JNIEnv *env, jobject obj) { - + std::cerr << "OneCCL (native): cleanup" << std::endl; g_kvs.pop_back(); g_comms.pop_back(); - - std::cerr << "OneCCL (native): cleanup" << std::endl; } JNIEXPORT jboolean JNICALL diff --git a/mllib-dal/src/main/native/PCAImpl.cpp b/mllib-dal/src/main/native/PCAImpl.cpp index 935942f0f..5fabd3e24 100644 --- a/mllib-dal/src/main/native/PCAImpl.cpp +++ b/mllib-dal/src/main/native/PCAImpl.cpp @@ -267,6 +267,7 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL( << nThreadsNew << std::endl; doPCADAALCompute(env, obj, rankId, cclComm, pData, executorNum, resultObj); + break; } #ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { @@ -278,7 +279,6 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL( jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); int size = cclComm.size(); - ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); auto queue = getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu); @@ -289,6 +289,7 @@ Java_com_intel_oap_mllib_feature_PCADALImpl_cPCATrainDAL( queue, size, rankId, kvs); doPCAOneAPICompute(env, pNumTabData, comm, resultObj); env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); + break; } #endif } diff --git a/mllib-dal/src/main/native/SummarizerImpl.cpp b/mllib-dal/src/main/native/SummarizerImpl.cpp index 20f0e87d6..8333be531 100644 --- a/mllib-dal/src/main/native/SummarizerImpl.cpp +++ b/mllib-dal/src/main/native/SummarizerImpl.cpp @@ -279,6 +279,7 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL( << nThreadsNew << std::endl; doSummarizerDAALCompute(env, obj, rankId, cclComm, pData, executorNum, resultObj); + break; } #ifdef CPU_GPU_PROFILE case ComputeDevice::gpu: { @@ -290,7 +291,6 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL( jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0); int size = cclComm.size(); - ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal); auto queue = getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu); @@ -301,6 +301,7 @@ Java_com_intel_oap_mllib_stat_SummarizerDALImpl_cSummarizerTrainDAL( queue, size, rankId, kvs); doSummarizerOneAPICompute(env, pNumTabData, comm, resultObj); env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0); + break; } #endif } From 6e0b0b9c42e277b838bf0d03d610925347fb86b6 Mon Sep 17 00:00:00 2001 From: minmingzhu <45281494+minmingzhu@users.noreply.github.com> Date: Tue, 25 Apr 2023 22:05:05 +0800 Subject: [PATCH 4/9] Update ci-tests.yml --- .github/workflows/ci-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 86ca043f5..c96021b0c 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -95,7 +95,7 @@ jobs: ${{ runner.os }}- - name: Cluster Test run: | - ${{github.workspace}}/dev/ci/ci-standalone-test.sh + ${{github.workspace}}/dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh standalone-CPU_GPU_PROFILE-test: name: Standalone Test for Examples runs-on: ubuntu-20.04 @@ -118,4 +118,4 @@ standalone-CPU_GPU_PROFILE-test: ${{ runner.os }}- - name: Cluster Test run: | - ${{github.workspace}}/dev/ci/ci-standalone-test.sh + ${{github.workspace}}/dev/ci/ci-standalone-CPU_GPU_PROFILE-test.sh From b8947afef3fb29acebe147d3e1666fad3207c13c Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 25 Apr 2023 14:19:15 +0000 Subject: [PATCH 5/9] update Signed-off-by: minmingzhu --- .github/workflows/ci-tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index c96021b0c..5647d2e8f 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -74,7 +74,7 @@ jobs: run: | ${{github.workspace}}/dev/ci/ci-yarn-test.sh standalone-CPU_ONLY_PROFILE-test: - name: Standalone Test for Examples + name: Standalone CPU_ONLY_PROFILE Test for Examples runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 @@ -96,8 +96,8 @@ jobs: - name: Cluster Test run: | ${{github.workspace}}/dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh -standalone-CPU_GPU_PROFILE-test: - name: Standalone Test for Examples + standalone-CPU_GPU_PROFILE-test: + name: Standalone CPU_GPU_PROFILE Test for Examples runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 From 59db0d7f550bb49173c1c24f98a2f659b06af9d8 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 25 Apr 2023 15:52:41 +0000 Subject: [PATCH 6/9] update Signed-off-by: minmingzhu --- .github/workflows/ci-tests.yml | 6 +++--- ...OFILE-test.sh => ci-standalone-CPU-ONLY-PROFILE-test.sh} | 0 2 files changed, 3 insertions(+), 3 deletions(-) rename dev/ci/{ci-standalone-CPU_ONLY_PROFILE-test.sh => ci-standalone-CPU-ONLY-PROFILE-test.sh} (100%) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 5647d2e8f..6ea681198 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -73,7 +73,7 @@ jobs: - name: Cluster Test run: | ${{github.workspace}}/dev/ci/ci-yarn-test.sh - standalone-CPU_ONLY_PROFILE-test: + standalone-CPU-ONLY-PROFILE-test: name: Standalone CPU_ONLY_PROFILE Test for Examples runs-on: ubuntu-20.04 steps: @@ -96,7 +96,7 @@ jobs: - name: Cluster Test run: | ${{github.workspace}}/dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh - standalone-CPU_GPU_PROFILE-test: + standalone-CPU-GPU-PROFILE-test: name: Standalone CPU_GPU_PROFILE Test for Examples runs-on: ubuntu-20.04 steps: @@ -118,4 +118,4 @@ jobs: ${{ runner.os }}- - name: Cluster Test run: | - ${{github.workspace}}/dev/ci/ci-standalone-CPU_GPU_PROFILE-test.sh + ${{github.workspace}}/dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh diff --git a/dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh b/dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh similarity index 100% rename from dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh rename to dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh From 90aa4f2cd9a0c5e92ac7bcfbf13dabcb6dbbfa7f Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Tue, 25 Apr 2023 16:05:08 +0000 Subject: [PATCH 7/9] update Signed-off-by: minmingzhu --- .github/workflows/ci-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 6ea681198..4c00dd63a 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -95,7 +95,7 @@ jobs: ${{ runner.os }}- - name: Cluster Test run: | - ${{github.workspace}}/dev/ci/ci-standalone-CPU_ONLY_PROFILE-test.sh + ${{github.workspace}}/dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh standalone-CPU-GPU-PROFILE-test: name: Standalone CPU_GPU_PROFILE Test for Examples runs-on: ubuntu-20.04 From 954c15f5a579a188d4675b4c7c6d7865167e2432 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 26 Apr 2023 06:06:25 +0000 Subject: [PATCH 8/9] update --- .github/workflows/ci-tests.yml | 29 ++------------- dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh | 37 ------------------- ...-PROFILE-test.sh => ci-standalone-test.sh} | 0 3 files changed, 3 insertions(+), 63 deletions(-) delete mode 100755 dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh rename dev/ci/{ci-standalone-CPU-GPU-PROFILE-test.sh => ci-standalone-test.sh} (100%) diff --git a/.github/workflows/ci-tests.yml b/.github/workflows/ci-tests.yml index 4c00dd63a..96018b1fa 100644 --- a/.github/workflows/ci-tests.yml +++ b/.github/workflows/ci-tests.yml @@ -73,8 +73,8 @@ jobs: - name: Cluster Test run: | ${{github.workspace}}/dev/ci/ci-yarn-test.sh - standalone-CPU-ONLY-PROFILE-test: - name: Standalone CPU_ONLY_PROFILE Test for Examples + standalone-test: + name: Standalone CPU_GPU_PROFILE Test for Examples (CPU) runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v3 @@ -95,27 +95,4 @@ jobs: ${{ runner.os }}- - name: Cluster Test run: | - ${{github.workspace}}/dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh - standalone-CPU-GPU-PROFILE-test: - name: Standalone CPU_GPU_PROFILE Test for Examples - runs-on: ubuntu-20.04 - steps: - - uses: actions/checkout@v3 - - name: Set up JDK 1.8 - uses: actions/setup-java@v1 - with: - java-version: 1.8 - - name: Restore cached dependencies - uses: actions/cache@v3 - with: - path: | - #/var/cache/apt/archives/*.deb - ~/.m2/repository - /opt/intel/oneapi - ~/opt - key: ${{ runner.os }}_spark-3.2.0_hadoop-3.2.0_oneapi-2023.0.0 - restore-keys: | - ${{ runner.os }}- - - name: Cluster Test - run: | - ${{github.workspace}}/dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh + ${{github.workspace}}/dev/ci/ci-standalone-test.sh diff --git a/dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh b/dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh deleted file mode 100755 index edc5af65b..000000000 --- a/dev/ci/ci-standalone-CPU-ONLY-PROFILE-test.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/env bash - -# exit when any command fails -set -e - -# keep track of the last executed command -trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG -# echo an error message before exiting -trap 'echo "\"${last_command}\" command filed with exit code $?."' EXIT - -# Install dependencies for building -$GITHUB_WORKSPACE/dev/install-build-deps-ubuntu.sh - -# Setup building envs -source /opt/intel/oneapi/setvars.sh --ccl-configuration=cpu - -# Prepare lib resources -cd $GITHUB_WORKSPACE/mllib-dal -../dev/prepare-build-deps.sh -./build.sh -p CPU_ONLY_PROFILE -q - -# Setup cluster -source $GITHUB_WORKSPACE/dev/test-cluster/standalone/setup-cluster.sh - -# Setup OAP MLlib envs -cp $GITHUB_WORKSPACE/dev/test-cluster/standalone/env.sh $GITHUB_WORKSPACE/conf -cd $GITHUB_WORKSPACE/examples - - -echo "=========================================" -echo "Cluster Testing with Spark Version: $SPARK_VERSION" -echo "=========================================" - -# Build and run all examples -./build-all-scala.sh -./run-all-scala.sh -./run-all-pyspark.sh diff --git a/dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh b/dev/ci/ci-standalone-test.sh similarity index 100% rename from dev/ci/ci-standalone-CPU-GPU-PROFILE-test.sh rename to dev/ci/ci-standalone-test.sh From 558b216428f2b1f252c690f6e943d2eec637c927 Mon Sep 17 00:00:00 2001 From: minmingzhu Date: Wed, 26 Apr 2023 06:39:21 +0000 Subject: [PATCH 9/9] retrigger checks