Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML-279] Fix CPU_GPU_PROFILE can't run daal cpu #280

Merged
merged 9 commits into from
Apr 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ jobs:
run: |
${{github.workspace}}/dev/ci/ci-yarn-test.sh
standalone-test:
name: Standalone Test for Examples (CPU)
name: Standalone CPU_GPU_PROFILE Test for Examples (CPU)
runs-on: ubuntu-20.04
steps:
- uses: actions/checkout@v3
Expand Down
2 changes: 1 addition & 1 deletion dev/ci/ci-standalone-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ source /opt/intel/oneapi/setvars.sh --ccl-configuration=cpu
# Prepare lib resources
cd $GITHUB_WORKSPACE/mllib-dal
../dev/prepare-build-deps.sh
./build.sh -p CPU_ONLY_PROFILE -q
./build.sh -p CPU_GPU_PROFILE -q

# Setup cluster
source $GITHUB_WORKSPACE/dev/test-cluster/standalone/setup-cluster.sh
Expand Down
44 changes: 21 additions & 23 deletions mllib-dal/src/main/native/CorrelationImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,12 @@

using namespace std;
#ifdef CPU_GPU_PROFILE
using namespace oneapi::dal;
#else
namespace covariance_gpu = oneapi::dal::covariance;
#endif
using namespace daal;
using namespace daal::algorithms;
using namespace daal::services;
#endif
namespace covariance_cpu = daal::algorithms::covariance;

#ifdef CPU_ONLY_PROFILE
typedef double algorithmFPType; /* Algorithm floating-point type */

static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
Expand All @@ -46,10 +44,10 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,

const bool isRoot = (rankId == ccl_root);

covariance::Distributed<step1Local, algorithmFPType> localAlgorithm;
covariance_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm;

/* Set the input data set to the algorithm */
localAlgorithm.input.set(covariance::data, pData);
localAlgorithm.input.set(covariance_cpu::data, pData);

/* Compute covariance */
localAlgorithm.compute();
Expand Down Expand Up @@ -89,35 +87,36 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
if (isRoot) {
auto t1 = std::chrono::high_resolution_clock::now();
/* Create an algorithm to compute covariance on the master node */
covariance::Distributed<step2Master, algorithmFPType> masterAlgorithm;
covariance_cpu::Distributed<step2Master, algorithmFPType>
masterAlgorithm;

for (size_t i = 0; i < nBlocks; i++) {
/* Deserialize partial results from step 1 */
OutputDataArchive dataArch(serializedData.get() +
perNodeArchLength * i,
perNodeArchLength);

covariance::PartialResultPtr dataForStep2FromStep1(
new covariance::PartialResult());
covariance_cpu::PartialResultPtr dataForStep2FromStep1(
new covariance_cpu::PartialResult());
dataForStep2FromStep1->deserialize(dataArch);

/* Set local partial results as input for the master-node algorithm
*/
masterAlgorithm.input.add(covariance::partialResults,
masterAlgorithm.input.add(covariance_cpu::partialResults,
dataForStep2FromStep1);
}

/* Set the parameter to choose the type of the output matrix */
masterAlgorithm.parameter.outputMatrixType =
covariance::correlationMatrix;
covariance_cpu::correlationMatrix;

/* Merge and finalizeCompute covariance decomposition on the master node
*/
masterAlgorithm.compute();
masterAlgorithm.finalizeCompute();

/* Retrieve the algorithm results */
covariance::ResultPtr result = masterAlgorithm.getResult();
covariance_cpu::ResultPtr result = masterAlgorithm.getResult();
auto t2 = std::chrono::high_resolution_clock::now();
auto duration =
std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
Expand All @@ -126,7 +125,7 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
<< duration / 1000 << " secs" << std::endl;

/* Print the results */
printNumericTable(result->get(covariance::correlation),
printNumericTable(result->get(covariance_cpu::correlation),
"Correlation first 20 columns of "
"correlation matrix:",
1, 20);
Expand All @@ -138,13 +137,12 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
env->GetFieldID(clazz, "correlationNumericTable", "J");

NumericTablePtr *correlation =
new NumericTablePtr(result->get(covariance::correlation));
new NumericTablePtr(result->get(covariance_cpu::correlation));

env->SetLongField(resultObj, correlationNumericTableField,
(jlong)correlation);
}
}
#endif

#ifdef CPU_GPU_PROFILE
static void doCorrelationOneAPICompute(
Expand All @@ -156,9 +154,9 @@ static void doCorrelationOneAPICompute(
homogen_table htable =
*reinterpret_cast<const homogen_table *>(pNumTabData);

const auto cor_desc = covariance::descriptor{}.set_result_options(
covariance::result_options::cor_matrix |
covariance::result_options::means);
const auto cor_desc = covariance_gpu::descriptor{}.set_result_options(
covariance_gpu::result_options::cor_matrix |
covariance_gpu::result_options::means);
auto t1 = std::chrono::high_resolution_clock::now();
const auto result_train = preview::compute(comm, cor_desc, htable);
if (isRoot) {
Expand Down Expand Up @@ -200,7 +198,6 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
int rankId = cclComm.rank();
ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
switch (device) {
#ifdef CPU_ONLY_PROFILE
case ComputeDevice::host:
case ComputeDevice::cpu: {
NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
Expand All @@ -213,8 +210,9 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
<< nThreadsNew << std::endl;
doCorrelationDaalCompute(env, obj, rankId, cclComm, pData, executorNum,
resultObj);
break;
}
#else
#ifdef CPU_GPU_PROFILE
case ComputeDevice::gpu: {
int nGpu = env->GetArrayLength(gpuIdxArray);
std::cout << "oneDAL (native): use GPU kernels with " << nGpu
Expand All @@ -224,7 +222,6 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);

int size = cclComm.size();
ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);

auto queue =
getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
Expand All @@ -234,9 +231,10 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
doCorrelationOneAPICompute(env, pNumTabData, comm, resultObj);
env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
break;
}
#endif
}

return 0;
}
42 changes: 21 additions & 21 deletions mllib-dal/src/main/native/KMeansImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,12 @@

using namespace std;
#ifdef CPU_GPU_PROFILE
using namespace oneapi::dal;
#else
namespace kmeans_gpu = oneapi::dal::kmeans;
#endif
using namespace daal;
using namespace daal::algorithms;
using namespace daal::services;
#endif
namespace kmeans_cpu = daal::algorithms::kmeans;

#ifdef CPU_ONLY_PROFILE
typedef double algorithmFPType; /* Algorithm floating-point type */

static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
Expand Down Expand Up @@ -75,11 +73,12 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
centroids->deserialize(outArch);

/* Create an algorithm to compute k-means on local nodes */
kmeans::Distributed<step1Local, algorithmFPType> localAlgorithm(nClusters);
kmeans_cpu::Distributed<step1Local, algorithmFPType> localAlgorithm(
nClusters);

/* Set the input data set to the algorithm */
localAlgorithm.input.set(kmeans::data, pData);
localAlgorithm.input.set(kmeans::inputCentroids, centroids);
localAlgorithm.input.set(kmeans_cpu::data, pData);
localAlgorithm.input.set(kmeans_cpu::inputCentroids, centroids);

/* Compute k-means */
localAlgorithm.compute();
Expand Down Expand Up @@ -108,21 +107,21 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,

if (isRoot) {
/* Create an algorithm to compute k-means on the master node */
kmeans::Distributed<step2Master, algorithmFPType> masterAlgorithm(
kmeans_cpu::Distributed<step2Master, algorithmFPType> masterAlgorithm(
nClusters);

for (size_t i = 0; i < nBlocks; i++) {
/* Deserialize partial results from step 1 */
OutputDataArchive dataArch(&serializedData[perNodeArchLength * i],
perNodeArchLength);

kmeans::PartialResultPtr dataForStep2FromStep1(
new kmeans::PartialResult());
kmeans_cpu::PartialResultPtr dataForStep2FromStep1(
new kmeans_cpu::PartialResult());
dataForStep2FromStep1->deserialize(dataArch);

/* Set local partial results as input for the master-node algorithm
*/
masterAlgorithm.input.add(kmeans::partialResults,
masterAlgorithm.input.add(kmeans_cpu::partialResults,
dataForStep2FromStep1);
}

Expand All @@ -131,11 +130,11 @@ static NumericTablePtr kmeans_compute(int rankId, ccl::communicator &comm,
masterAlgorithm.finalizeCompute();

ret_cost = masterAlgorithm.getResult()
->get(kmeans::objectiveFunction)
->get(kmeans_cpu::objectiveFunction)
->getValue<algorithmFPType>(0, 0);

/* Retrieve the algorithm results */
return masterAlgorithm.getResult()->get(kmeans::centroids);
return masterAlgorithm.getResult()->get(kmeans_cpu::centroids);
}
return NumericTablePtr();
}
Expand Down Expand Up @@ -239,7 +238,6 @@ static jlong doKMeansDaalCompute(JNIEnv *env, jobject obj, int rankId,
return (jlong)0;
}
}
#endif

#ifdef CPU_GPU_PROFILE
static jlong doKMeansOneAPICompute(
Expand All @@ -253,13 +251,13 @@ static jlong doKMeansOneAPICompute(
*reinterpret_cast<const homogen_table *>(pNumTabData);
homogen_table centroids =
*reinterpret_cast<const homogen_table *>(pNumTabCenters);
const auto kmeans_desc = kmeans::descriptor<>()
const auto kmeans_desc = kmeans_gpu::descriptor<>()
.set_cluster_count(clusterNum)
.set_max_iteration_count(iterationNum)
.set_accuracy_threshold(tolerance);
kmeans::train_input local_input{htable, centroids};
kmeans_gpu::train_input local_input{htable, centroids};
auto t1 = std::chrono::high_resolution_clock::now();
kmeans::train_result result_train =
kmeans_gpu::train_result result_train =
preview::train(comm, kmeans_desc, local_input);
if (isRoot) {
std::cout << "Iteration count: " << result_train.get_iteration_count()
Expand Down Expand Up @@ -314,7 +312,6 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
int rankId = cclComm.rank();
ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
switch (device) {
#ifdef CPU_ONLY_PROFILE
case ComputeDevice::host:
case ComputeDevice::cpu: {
NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
Expand All @@ -329,8 +326,9 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
ret = doKMeansDaalCompute(env, obj, rankId, cclComm, pData, centroids,
clusterNum, tolerance, iterationNum,
executorNum, resultObj);
break;
}
#else
#ifdef CPU_GPU_PROFILE
case ComputeDevice::gpu: {
int nGpu = env->GetArrayLength(gpuIdxArray);
std::cout << "oneDAL (native): use GPU kernels with " << nGpu
Expand All @@ -340,7 +338,6 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);

int size = cclComm.size();
ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);

auto queue =
getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);
Expand All @@ -352,6 +349,9 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
ret =
doKMeansOneAPICompute(env, pNumTabData, pNumTabCenters, clusterNum,
tolerance, iterationNum, comm, resultObj);

env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
break;
}
#endif
}
Expand Down
24 changes: 12 additions & 12 deletions mllib-dal/src/main/native/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,12 @@ $(info )
CFLAGS_COMMON := -g -Wall -Wno-deprecated-declarations -fPIC -std=c++17 \
-I $(I_MPI_ROOT)/include \
-I $(DAALROOT)/include \
-I $(CCL_ROOT)/include/cpu/oneapi/ \
-I $(CMPLR_ROOT)/linux/include \
-I $(CMPLR_ROOT)/linux/include/sycl

ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
CFLAGS := $(CFLAGS_COMMON) -I $(CCL_ROOT)/include/cpu/oneapi/
CFLAGS := $(CFLAGS_COMMON)
else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
CFLAGS := $(CFLAGS_COMMON) -fsycl \
-fsycl-device-code-split=per_kernel \
Expand All @@ -46,14 +47,15 @@ else
exit 1
endif

INCS := -I $(JAVA_HOME)/include \
INCS := -I $(CCL_ROOT)/include/cpu \
-I $(JAVA_HOME)/include \
-I $(JAVA_HOME)/include/linux \
-I $(DAALROOT)/include \
-I ./javah \
-I ./

ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
INCS := $(INCS) -I $(CCL_ROOT)/include/cpu
INCS := $(INCS)
else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
INCS := $(INCS) -I $(CCL_ROOT)/include/cpu_gpu_dpcpp
else
Expand All @@ -62,17 +64,15 @@ else
endif

# Use static link if possible, TBB is only available as dynamic libs
LIBS_COMMON :=-L $(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \
-L $(DAALROOT)/lib/intel64 -lonedal_core -lonedal_thread -lonedal_dpc \
-L $(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc \
-L $(I_MPI_ROOT)
LIBS_COMMON := -L$(CCL_ROOT)/lib/cpu -lccl \
-L$(CMPLR_ROOT)/linux/compiler/lib/intel64_lin -l:libirc.a \
-L$(DAALROOT)/lib/intel64 -lonedal_core -lonedal_thread -lonedal_dpc \
-L$(TBBROOT)/lib/intel64/gcc4.8 -ltbb -ltbbmalloc \
-L$(I_MPI_ROOT)

ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
LIBS_COMMON := $(LIBS_COMMON) \
-L $(CCL_ROOT)/lib/cpu -lccl
else ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
ifeq ($(PLATFORM_PROFILE),CPU_GPU_PROFILE)
LIBS_COMMON := $(LIBS_COMMON) \
-L $(CCL_ROOT)/lib/cpu_gpu_dpcpp -lccl
-L$(CCL_ROOT)/lib/cpu_gpu_dpcpp -lccl
endif

ifeq ($(PLATFORM_PROFILE),CPU_ONLY_PROFILE)
Expand Down
4 changes: 1 addition & 3 deletions mllib-dal/src/main/native/OneCCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,11 +100,9 @@ Java_com_intel_oap_mllib_OneCCL_00024_c_1initDpcpp(JNIEnv *env, jobject) {

// JNI-exported native function (JNIEXPORT/JNICALL) for the OneCCL cleanup
// call. It removes the last element of g_kvs and of g_comms and writes a
// cleanup message to stderr. Neither `env` nor `obj` is used in the body.
// NOTE(review): the log statement appears both before and after the pops in
// this listing — presumably the removed and added lines of a diff rendered
// together; confirm which single occurrence the real file keeps.
JNIEXPORT void JNICALL
Java_com_intel_oap_mllib_OneCCL_00024_c_1cleanup(JNIEnv *env, jobject obj) {

std::cerr << "OneCCL (native): cleanup" << std::endl;
// g_kvs and g_comms are declared outside this view; presumably they are
// containers filled by the matching init call — TODO confirm they cannot be
// empty when cleanup runs.
g_kvs.pop_back();
g_comms.pop_back();

std::cerr << "OneCCL (native): cleanup" << std::endl;
}

JNIEXPORT jboolean JNICALL
Expand Down
Loading