Skip to content

Commit

Permalink
[ML-279] Fix CPU_GPU_PROFILE build being unable to run DAAL on CPU (#280)
Browse files Browse the repository at this point in the history
* Fix CPU_GPU_PROFILE jar being unable to run DAAL on CPU

Signed-off-by: minmingzhu <minming.zhu@intel.com>

* update

Signed-off-by: minmingzhu <minming.zhu@intel.com>

* update

Signed-off-by: minmingzhu <minming.zhu@intel.com>

* Update ci-tests.yml

* update

Signed-off-by: minmingzhu <minming.zhu@intel.com>

* update

Signed-off-by: minmingzhu <minming.zhu@intel.com>

* update

Signed-off-by: minmingzhu <minming.zhu@intel.com>

* update

* retrigger checks

---------

Signed-off-by: minmingzhu <minming.zhu@intel.com>
  • Loading branch information
minmingzhu authored Apr 28, 2023
1 parent bbd9fc2 commit 4e1ce4d
Show file tree
Hide file tree
Showing 8 changed files with 95 additions and 101 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ jobs:
run: |
${{github.workspace}}/dev/ci/ci-yarn-test.sh
standalone-test:
name: Standalone Test for Examples (CPU)
name: Standalone CPU_GPU_PROFILE Test for Examples (CPU)
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion dev/ci/ci-standalone-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ source /opt/intel/oneapi/setvars.sh --ccl-configuration=cpu
# Prepare lib resources
cd $GITHUB_WORKSPACE/mllib-dal
../dev/prepare-build-deps.sh
./build.sh -p CPU_ONLY_PROFILE -q
./build.sh -p CPU_GPU_PROFILE -q

# Setup cluster
source $GITHUB_WORKSPACE/dev/test-cluster/standalone/setup-cluster.sh
Expand Down
44 changes: 21 additions & 23 deletions mllib-dal/src/main/native/CorrelationImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,12 @@

using namespace std;
#ifdef CPU_GPU_PROFILE
using namespace oneapi::dal;
#else
namespace covariance_gpu = oneapi::dal::covariance;
#endif
using namespace daal;
using namespace daal::algorithms;
using namespace daal::services;
#endif
namespace covariance_cpu = daal::algorithms::covariance;

#ifdef CPU_ONLY_PROFILE
typedef double algorithmFPType; /* Algorithm floating-point type */

static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
Expand All @@ -46,10 +44,10 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,

const bool isRoot = (rankId == ccl_root);

covariance::Distributed<step1Local, algorithmFPType> localAlgorithm;
covariance_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm;

/* Set the input data set to the algorithm */
localAlgorithm.input.set(covariance::data, pData);
localAlgorithm.input.set(covariance_cpu::data, pData);

/* Compute covariance */
localAlgorithm.compute();
Expand Down Expand Up @@ -89,35 +87,36 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
if (isRoot) {
auto t1 = std::chrono::high_resolution_clock::now();
/* Create an algorithm to compute covariance on the master node */
covariance::Distributed<step2Master, algorithmFPType> masterAlgorithm;
covariance_cpu::Distributed<step2Master, algorithmFPType>
masterAlgorithm;

for (size_t i = 0; i < nBlocks; i++) {
/* Deserialize partial results from step 1 */
OutputDataArchive dataArch(serializedData.get() +
perNodeArchLength * i,
perNodeArchLength);

covariance::PartialResultPtr dataForStep2FromStep1(
new covariance::PartialResult());
covariance_cpu::PartialResultPtr dataForStep2FromStep1(
new covariance_cpu::PartialResult());
dataForStep2FromStep1->deserialize(dataArch);

/* Set local partial results as input for the master-node algorithm
*/
masterAlgorithm.input.add(covariance::partialResults,
masterAlgorithm.input.add(covariance_cpu::partialResults,
dataForStep2FromStep1);
}

/* Set the parameter to choose the type of the output matrix */
masterAlgorithm.parameter.outputMatrixType =
covariance::correlationMatrix;
covariance_cpu::correlationMatrix;

/* Merge and finalizeCompute covariance decomposition on the master node
*/
masterAlgorithm.compute();
masterAlgorithm.finalizeCompute();

/* Retrieve the algorithm results */
covariance::ResultPtr result = masterAlgorithm.getResult();
covariance_cpu::ResultPtr result = masterAlgorithm.getResult();
auto t2 = std::chrono::high_resolution_clock::now();
auto duration =
std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
Expand All @@ -126,7 +125,7 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
<< duration / 1000 << " secs" << std::endl;

/* Print the results */
printNumericTable(result->get(covariance::correlation),
printNumericTable(result->get(covariance_cpu::correlation),
"Correlation first 20 columns of "
"correlation matrix:",
1, 20);
Expand All @@ -138,13 +137,12 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
env->GetFieldID(clazz, "correlationNumericTable", "J");

NumericTablePtr *correlation =
new NumericTablePtr(result->get(covariance::correlation));
new NumericTablePtr(result->get(covariance_cpu::correlation));

env->SetLongField(resultObj, correlationNumericTableField,
(jlong)correlation);
}
}
#endif

#ifdef CPU_GPU_PROFILE
static void doCorrelationOneAPICompute(
Expand All @@ -156,9 +154,9 @@ static void doCorrelationOneAPICompute(
homogen_table htable =
*reinterpret_cast<const homogen_table *>(pNumTabData);

const auto cor_desc = covariance::descriptor{}.set_result_options(
covariance::result_options::cor_matrix |
covariance::result_options::means);
const auto cor_desc = covariance_gpu::descriptor{}.set_result_options(
covariance_gpu::result_options::cor_matrix |
covariance_gpu::result_options::means);
auto t1 = std::chrono::high_resolution_clock::now();
const auto result_train = preview::compute(comm, cor_desc, htable);
if (isRoot) {
Expand Down Expand Up @@ -200,7 +198,6 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
int rankId = cclComm.rank();
ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
switch (device) {
#ifdef CPU_ONLY_PROFILE
case ComputeDevice::host:
case ComputeDevice::cpu: {
NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
Expand All @@ -213,8 +210,9 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
<< nThreadsNew << std::endl;
doCorrelationDaalCompute(env, obj, rankId, cclComm, pData, executorNum,
resultObj);
break;
}
#else
#ifdef CPU_GPU_PROFILE
case ComputeDevice::gpu: {
int nGpu = env->GetArrayLength(gpuIdxArray);
std::cout << "oneDAL (native): use GPU kernels with " << nGpu
Expand All @@ -224,7 +222,6 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);

int size = cclComm.size();
ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);

auto queue =
getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
Expand All @@ -234,9 +231,10 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
doCorrelationOneAPICompute(env, pNumTabData, comm, resultObj);
env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
break;
}
#endif
}

return 0;
}
42 changes: 21 additions & 21 deletions mllib-dal/src/main/native/KMeansImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,12 @@

using namespace std;
#ifdef CPU_GPU_PROFILE
using namespace oneapi::dal;
#else
namespace kmeans_gpu = oneapi::dal::kmeans;
#endif
using namespace daal;
using namespace daal::algorithms;
using namespace daal::services;
#endif
namespace kmeans_cpu = daal::algorithms::kmeans;

#ifdef CPU_ONLY_PROFILE
typedef double algorithmFPType; /* Algorithm floating-point type */

static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
Expand Down Expand Up @@ -75,11 +73,12 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
centroids->deserialize(outArch);

/* Create an algorithm to compute k-means on local nodes */
kmeans::Distributed<step1Local, algorithmFPType> localAlgorithm(nClusters);
kmeans_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm(
nClusters);

/* Set the input data set to the algorithm */
localAlgorithm.input.set(kmeans::data, pData);
localAlgorithm.input.set(kmeans::inputCentroids, centroids);
localAlgorithm.input.set(kmeans_cpu::data, pData);
localAlgorithm.input.set(kmeans_cpu::inputCentroids, centroids);

/* Compute k-means */
localAlgorithm.compute();
Expand Down Expand Up @@ -108,21 +107,21 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,

if (isRoot) {
/* Create an algorithm to compute k-means on the master node */
kmeans::Distributed<step2Master, algorithmFPType> masterAlgorithm(
kmeans_cpu::Distributed<step2Master, algorithmFPType> masterAlgorithm(
nClusters);

for (size_t i = 0; i < nBlocks; i++) {
/* Deserialize partial results from step 1 */
OutputDataArchive dataArch(&serializedData[perNodeArchLength * i],
perNodeArchLength);

kmeans::PartialResultPtr dataForStep2FromStep1(
new kmeans::PartialResult());
kmeans_cpu::PartialResultPtr dataForStep2FromStep1(
new kmeans_cpu::PartialResult());
dataForStep2FromStep1->deserialize(dataArch);

/* Set local partial results as input for the master-node algorithm
*/
masterAlgorithm.input.add(kmeans::partialResults,
masterAlgorithm.input.add(kmeans_cpu::partialResults,
dataForStep2FromStep1);
}

Expand All @@ -131,11 +130,11 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
masterAlgorithm.finalizeCompute();

ret_cost = masterAlgorithm.getResult()
->get(kmeans::objectiveFunction)
->get(kmeans_cpu::objectiveFunction)
->getValue<algorithmFPType>(0, 0);

/* Retrieve the algorithm results */
return masterAlgorithm.getResult()->get(kmeans::centroids);
return masterAlgorithm.getResult()->get(kmeans_cpu::centroids);
}
return NumericTablePtr();
}
Expand Down Expand Up @@ -239,7 +238,6 @@ static jlong doKMeansDaalCompute(JNIEnv *env, jobject obj, int rankId,
return (jlong)0;
}
}
#endif

#ifdef CPU_GPU_PROFILE
static jlong doKMeansOneAPICompute(
Expand All @@ -253,13 +251,13 @@ static jlong doKMeansOneAPICompute(
*reinterpret_cast<const homogen_table *>(pNumTabData);
homogen_table centroids =
*reinterpret_cast<const homogen_table *>(pNumTabCenters);
const auto kmeans_desc = kmeans::descriptor<>()
const auto kmeans_desc = kmeans_gpu::descriptor<>()
.set_cluster_count(clusterNum)
.set_max_iteration_count(iterationNum)
.set_accuracy_threshold(tolerance);
kmeans::train_input local_input{htable, centroids};
kmeans_gpu::train_input local_input{htable, centroids};
auto t1 = std::chrono::high_resolution_clock::now();
kmeans::train_result result_train =
kmeans_gpu::train_result result_train =
preview::train(comm, kmeans_desc, local_input);
if (isRoot) {
std::cout << "Iteration count: " << result_train.get_iteration_count()
Expand Down Expand Up @@ -314,7 +312,6 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
int rankId = cclComm.rank();
ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
switch (device) {
#ifdef CPU_ONLY_PROFILE
case ComputeDevice::host:
case ComputeDevice::cpu: {
NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
Expand All @@ -329,8 +326,9 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
ret = doKMeansDaalCompute(env, obj, rankId, cclComm, pData, centroids,
clusterNum, tolerance, iterationNum,
executorNum, resultObj);
break;
}
#else
#ifdef CPU_GPU_PROFILE
case ComputeDevice::gpu: {
int nGpu = env->GetArrayLength(gpuIdxArray);
std::cout << "oneDAL (native): use GPU kernels with " << nGpu
Expand All @@ -340,7 +338,6 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);

int size = cclComm.size();
ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);

auto queue =
getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
Expand All @@ -352,6 +349,9 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
ret =
doKMeansOneAPICompute(env, pNumTabData, pNumTabCenters, clusterNum,
tolerance, iterationNum, comm, resultObj);

env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
break;
}
#endif
}
Expand Down
24 changes: 12 additions & 12 deletions mllib-dal/src/main/native/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,12 @@ $(info )
CFLAGS_COMMON := -g -Wall -Wno-deprecated-declarations -fPIC -std=c++17 \
-I $(I_MPI_ROOT)/include \
-I $(DAALROOT)/include \
-I $(CCL_ROOT)/include/cpu/oneapi/ \
-I $(CMPLR_ROOT)/linux/include \
-I $(CMPLR_ROOT)/linux/include/sycl

ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
CFLAGS := $(CFLAGS_COMMON) -I $(CCL_ROOT)/include/cpu/oneapi/
CFLAGS := $(CFLAGS_COMMON)
else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
CFLAGS := $(CFLAGS_COMMON) -fsycl \
-fsycl-device-code-split=per_kernel \
Expand All @@ -46,14 +47,15 @@ else
exit 1
endif

INCS := -I $(JAVA_HOME)/include \
INCS := -I $(CCL_ROOT)/include/cpu \
-I $(JAVA_HOME)/include \
-I $(JAVA_HOME)/include/linux \
-I $(DAALROOT)/include \
-I ./javah \
-I ./

ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
INCS := $(INCS) -I $(CCL_ROOT)/include/cpu
INCS := $(INCS)
else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
INCS := $(INCS) -I $(CCL_ROOT)/include/cpu_gpu_dpcpp
else
Expand All @@ -62,17 +64,15 @@ else
endif

# Use static link if possible, TBB is only available as dynamic libs
LIBS_COMMON :=-L $(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \
-L $(DAALROOT)/lib/intel64 -lonedal_core -lonedal_thread -lonedal_dpc \
-L $(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc \
-L $(I_MPI_ROOT)
LIBS_COMMON := -L$(CCL_ROOT)/lib/cpu -lccl \
-L$(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \
-L$(DAALROOT)/lib/intel64 -lonedal_core -lonedal_thread -lonedal_dpc \
-L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc \
-L$(I_MPI_ROOT)

ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
LIBS_COMMON := $(LIBS_COMMON) \
-L $(CCL_ROOT)/lib/cpu -lccl
else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
LIBS_COMMON := $(LIBS_COMMON) \
-L $(CCL_ROOT)/lib/cpu_gpu_dpcpp -lccl
-L$(CCL_ROOT)/lib/cpu_gpu_dpcpp -lccl
endif

ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
Expand Down
4 changes: 1 addition & 3 deletions mllib-dal/src/main/native/OneCCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,9 @@ Java_com_intel_oap_mllib_OneCCL_00024_c_1initDpcpp(JNIEnv *env, jobject) {

JNIEXPORT void JNICALL
Java_com_intel_oap_mllib_OneCCL_00024_c_1cleanup(JNIEnv *env, jobject obj) {

std::cerr << "OneCCL (native): cleanup" << std::endl;
g_kvs.pop_back();
g_comms.pop_back();

std::cerr << "OneCCL (native): cleanup" << std::endl;
}

JNIEXPORT jboolean JNICALL
Expand Down
Loading

0 comments on commit 4e1ce4d

Please sign in to comment.