Create new HomogenTable with the assigned GPU before running algorithms.
Signed-off-by: minmingzhu <minming.zhu@intel.com>
minmingzhu committed Oct 11, 2023
1 parent 262a746 commit 44489c8
Showing 27 changed files with 335 additions and 147 deletions.
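Every file below applies the same change: instead of dereferencing a homogen_table that was built before the GPU was assigned, each native entry point now receives the raw host buffer together with its row and column counts, copies it into USM shared memory on the SYCL queue of the assigned GPU, and constructs the homogen_table against that queue. The following is a minimal sketch of that pattern, not the literal project code; the header paths, the float element type, and the helper name are assumptions, and it presumes oneDAL is built with the DPC++ (data-parallel) interface.

#include <cstdint>
#include <sycl/sycl.hpp>
#include "oneapi/dal/table/homogen.hpp"

namespace dal = oneapi::dal;

// Stage a host buffer on the assigned GPU's queue and wrap it in a table.
static dal::homogen_table make_gpu_table(sycl::queue &queue, const float *host,
                                         std::int64_t numRows,
                                         std::int64_t numCols) {
    // USM shared memory is reachable from both the host and the device.
    auto *data = sycl::malloc_shared<float>(numRows * numCols, queue);
    // Blocking copy of the row-major host data into the USM allocation.
    queue.memcpy(data, host, sizeof(float) * numRows * numCols).wait();
    // The default deleter frees the USM buffer when the table is destroyed.
    return dal::homogen_table{
        queue, data, numRows, numCols,
        dal::detail::make_default_delete<const float>(queue)};
}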
26 changes: 18 additions & 8 deletions mllib-dal/src/main/native/CorrelationImpl.cpp
@@ -149,13 +149,22 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, size_t rankId,

#ifdef CPU_GPU_PROFILE
static void doCorrelationOneAPICompute(
JNIEnv *env, jlong pNumTabData,
JNIEnv *env, jlong pNumTabData, jlong numRows, jlong numClos,
preview::spmd::communicator<preview::spmd::device_memory_access::usm> comm,
jobject resultObj) {
jobject resultObj, sycl::queue &queue) {
logger::println(logger::INFO, "oneDAL (native): GPU compute start");
const bool isRoot = (comm.get_rank() == ccl_root);
homogen_table htable =
*reinterpret_cast<const homogen_table *>(pNumTabData);
GpuAlgorithmFPType *htableArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabData);
auto data =
sycl::malloc_shared<GpuAlgorithmFPType>(numRows * numClos, queue);
queue
.memcpy(data, htableArray,
sizeof(GpuAlgorithmFPType) * numRows * numClos)
.wait();
homogen_table htable{
queue, data, numRows, numClos,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};

const auto cor_desc =
covariance_gpu::descriptor<GpuAlgorithmFPType>{}.set_result_options(
@@ -195,9 +204,9 @@ static void doCorrelationOneAPICompute(

JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
JNIEnv *env, jobject obj, jlong pNumTabData, jint executorNum,
jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray,
jobject resultObj) {
JNIEnv *env, jobject obj, jlong pNumTabData, jlong numRows, jlong numClos,
jint executorNum, jint executorCores, jint computeDeviceOrdinal,
jintArray gpuIdxArray, jobject resultObj) {
logger::println(logger::INFO,
"oneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());
@@ -240,7 +249,8 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
auto comm =
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
doCorrelationOneAPICompute(env, pNumTabData, comm, resultObj);
doCorrelationOneAPICompute(env, pNumTabData, numRows, numClos, comm,
resultObj, queue);
env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
break;
}
56 changes: 37 additions & 19 deletions mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp
@@ -208,25 +208,41 @@ jobject collect_model(JNIEnv *env, const df::model<Task> &m,
}

static jobject doRFClassifierOneAPICompute(
JNIEnv *env, jlong pNumTabFeature, jlong pNumTabLabel, jint executorNum,
JNIEnv *env, jlong pNumTabFeature, jlong featureRows, jlong featureCols,
jlong pNumTabLabel, jlong labelCols, jint executorNum,
jint computeDeviceOrdinal, jint classCount, jint treeCount,
jint numFeaturesPerNode, jint minObservationsLeafNode,
jint minObservationsSplitNode, jdouble minWeightFractionLeafNode,
jdouble minImpurityDecreaseSplitNode, jint maxTreeDepth, jlong seed,
jint maxBins, jboolean bootstrap,
preview::spmd::communicator<preview::spmd::device_memory_access::usm> comm,
jobject resultObj) {
jobject resultObj, sycl::queue &queue) {
logger::println(logger::INFO, "oneDAL (native): GPU compute start");
const bool isRoot = (comm.get_rank() == ccl_root);
homogen_table hFeaturetable =
*reinterpret_cast<const homogen_table *>(pNumTabFeature);
homogen_table hLabeltable =
*reinterpret_cast<const homogen_table *>(pNumTabLabel);
logger::println(logger::INFO,
"doRFClassifierOneAPICompute get_column_count = %d",
hFeaturetable.get_column_count());
logger::println(logger::INFO, "doRFClassifierOneAPICompute classCount = %d",
classCount);
GpuAlgorithmFPType *htableFeatureArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabFeature);
GpuAlgorithmFPType *htableLabelArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabLabel);

auto featureData = sycl::malloc_shared<GpuAlgorithmFPType>(
featureRows * featureCols, queue);
queue
.memcpy(featureData, htableFeatureArray,
sizeof(GpuAlgorithmFPType) * featureRows * featureCols)
.wait();
homogen_table hFeaturetable{
queue, featureData, featureRows, featureCols,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};

auto labelData =
sycl::malloc_shared<GpuAlgorithmFPType>(featureRows * labelCols, queue);
queue
.memcpy(labelData, htableLabelArray,
sizeof(GpuAlgorithmFPType) * featureRows * labelCols)
.wait();
homogen_table hLabeltable{
queue, labelData, featureRows, labelCols,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};

const auto df_desc =
df::descriptor<GpuAlgorithmFPType, df::method::hist,
@@ -300,9 +316,10 @@ static jobject doRFClassifierOneAPICompute(
*/
JNIEXPORT jobject JNICALL
Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassifierTrainDAL(
JNIEnv *env, jobject obj, jlong pNumTabFeature, jlong pNumTabLabel,
jint executorNum, jint computeDeviceOrdinal, jint classCount,
jint treeCount, jint numFeaturesPerNode, jint minObservationsLeafNode,
JNIEnv *env, jobject obj, jlong pNumTabFeature, jlong featureRows,
jlong featureCols, jlong pNumTabLabel, jlong labelCols, jint executorNum,
jint computeDeviceOrdinal, jint classCount, jint treeCount,
jint numFeaturesPerNode, jint minObservationsLeafNode,
jint minObservationsSplitNode, jdouble minWeightFractionLeafNode,
jdouble minImpurityDecreaseSplitNode, jint maxTreeDepth, jlong seed,
jint maxBins, jboolean bootstrap, jintArray gpuIdxArray,
@@ -333,11 +350,12 @@ Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassif
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
jobject hashmapObj = doRFClassifierOneAPICompute(
env, pNumTabFeature, pNumTabLabel, executorNum,
computeDeviceOrdinal, classCount, treeCount, numFeaturesPerNode,
minObservationsLeafNode, minObservationsSplitNode,
minWeightFractionLeafNode, minImpurityDecreaseSplitNode,
maxTreeDepth, seed, maxBins, bootstrap, comm, resultObj);
env, pNumTabFeature, featureRows, featureCols, pNumTabLabel,
labelCols, executorNum, computeDeviceOrdinal, classCount, treeCount,
numFeaturesPerNode, minObservationsLeafNode,
minObservationsSplitNode, minWeightFractionLeafNode,
minImpurityDecreaseSplitNode, maxTreeDepth, seed, maxBins,
bootstrap, comm, resultObj, queue);
return hashmapObj;
}
default: {
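The classifier and regressor paths repeat the copy-and-wrap logic twice, once for the feature buffer and once for the label buffer. A hedged refactoring sketch follows; toDeviceTable is a hypothetical helper that is not part of this commit, and it relies on the aliases these files already use (homogen_table, detail, GpuAlgorithmFPType).

// Hypothetical helper (not in this commit) that folds the repeated
// malloc_shared + memcpy + homogen_table construction into one place.
template <typename Fp>
static homogen_table toDeviceTable(sycl::queue &queue, jlong pNumTab,
                                   jlong rows, jlong cols) {
    auto *host = reinterpret_cast<Fp *>(pNumTab);
    auto *device = sycl::malloc_shared<Fp>(rows * cols, queue);
    queue.memcpy(device, host, sizeof(Fp) * rows * cols).wait();
    return homogen_table{queue, device, rows, cols,
                         detail::make_default_delete<const Fp>(queue)};
}

// Possible use inside doRFClassifierOneAPICompute:
//   homogen_table hFeaturetable = toDeviceTable<GpuAlgorithmFPType>(
//       queue, pNumTabFeature, featureRows, featureCols);
//   homogen_table hLabeltable = toDeviceTable<GpuAlgorithmFPType>(
//       queue, pNumTabLabel, featureRows, labelCols);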
50 changes: 35 additions & 15 deletions mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp
@@ -207,18 +207,38 @@ jobject collect_model(JNIEnv *env, const df::model<Task> &m,
}

static jobject doRFRegressorOneAPICompute(
JNIEnv *env, jlong pNumTabFeature, jlong pNumTabLabel, jint executorNum,
JNIEnv *env, jlong pNumTabFeature, jlong featureRows, jlong featureCols,
jlong pNumTabLabel, jlong labelCols, jint executorNum,
jint computeDeviceOrdinal, jint treeCount, jint numFeaturesPerNode,
jint minObservationsLeafNode, jint maxTreeDepth, jlong seed, jint maxbins,
jboolean bootstrap,
preview::spmd::communicator<preview::spmd::device_memory_access::usm> comm,
jobject resultObj) {
jobject resultObj, sycl::queue &queue) {
logger::println(logger::INFO, "OneDAL (native): GPU compute start");
const bool isRoot = (comm.get_rank() == ccl_root);
homogen_table hFeaturetable =
*reinterpret_cast<const homogen_table *>(pNumTabFeature);
homogen_table hLabeltable =
*reinterpret_cast<const homogen_table *>(pNumTabLabel);
GpuAlgorithmFPType *htableFeatureArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabFeature);
GpuAlgorithmFPType *htableLabelArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabLabel);
auto featureData = sycl::malloc_shared<GpuAlgorithmFPType>(
featureRows * featureCols, queue);
queue
.memcpy(featureData, htableFeatureArray,
sizeof(GpuAlgorithmFPType) * featureRows * featureCols)
.wait();
homogen_table hFeaturetable{
queue, featureData, featureRows, featureCols,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};

auto labelData =
sycl::malloc_shared<GpuAlgorithmFPType>(featureRows * labelCols, queue);
queue
.memcpy(labelData, htableLabelArray,
sizeof(GpuAlgorithmFPType) * featureRows * labelCols)
.wait();
homogen_table hLabeltable{
queue, labelData, featureRows, labelCols,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};
logger::println(logger::INFO,
"doRFRegressorOneAPICompute get_column_count = %d",
hFeaturetable.get_column_count());
@@ -290,11 +310,11 @@ static jobject doRFRegressorOneAPICompute(

JNIEXPORT jobject JNICALL
Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTrainDAL(
JNIEnv *env, jobject obj, jlong pNumTabFeature, jlong pNumTabLabel,
jint executorNum, jint computeDeviceOrdinal, jint treeCount,
jint numFeaturesPerNode, jint minObservationsLeafNode, jint maxTreeDepth,
jlong seed, jint maxbins, jboolean bootstrap, jintArray gpuIdxArray,
jobject resultObj) {
JNIEnv *env, jobject obj, jlong pNumTabFeature, jlong featureRows,
jlong featureCols, jlong pNumTabLabel, jlong labelCols, jint executorNum,
jint computeDeviceOrdinal, jint treeCount, jint numFeaturesPerNode,
jint minObservationsLeafNode, jint maxTreeDepth, jlong seed, jint maxbins,
jboolean bootstrap, jintArray gpuIdxArray, jobject resultObj) {
logger::println(logger::INFO,
"OneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());
@@ -322,10 +342,10 @@ Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTra
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
jobject hashmapObj = doRFRegressorOneAPICompute(
env, pNumTabFeature, pNumTabLabel, executorNum,
computeDeviceOrdinal, treeCount, numFeaturesPerNode,
minObservationsLeafNode, maxTreeDepth, seed, maxbins, bootstrap,
comm, resultObj);
env, pNumTabFeature, featureRows, featureCols, pNumTabLabel,
labelCols, executorNum, computeDeviceOrdinal, treeCount,
numFeaturesPerNode, minObservationsLeafNode, maxTreeDepth, seed,
maxbins, bootstrap, comm, resultObj, queue);
return hashmapObj;
}
default: {
34 changes: 22 additions & 12 deletions mllib-dal/src/main/native/KMeansImpl.cpp
@@ -243,14 +243,24 @@ static jlong doKMeansDaalCompute(JNIEnv *env, jobject obj, size_t rankId,

#ifdef CPU_GPU_PROFILE
static jlong doKMeansOneAPICompute(
JNIEnv *env, jlong pNumTabData, jlong pNumTabCenters, jint clusterNum,
jdouble tolerance, jint iterationNum,
JNIEnv *env, jlong pNumTabData, jlong numRows, jlong numClos,
jlong pNumTabCenters, jint clusterNum, jdouble tolerance, jint iterationNum,
preview::spmd::communicator<preview::spmd::device_memory_access::usm> comm,
jobject resultObj) {
jobject resultObj, sycl::queue &queue) {
logger::println(logger::INFO, "OneDAL (native): GPU compute start");
const bool isRoot = (comm.get_rank() == ccl_root);
homogen_table htable =
*reinterpret_cast<const homogen_table *>(pNumTabData);
GpuAlgorithmFPType *htableArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabData);
auto data =
sycl::malloc_shared<GpuAlgorithmFPType>(numRows * numClos, queue);
queue
.memcpy(data, htableArray,
sizeof(GpuAlgorithmFPType) * numRows * numClos)
.wait();
homogen_table htable{
queue, data, numRows, numClos,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};

homogen_table centroids =
*reinterpret_cast<const homogen_table *>(pNumTabCenters);
const auto kmeans_desc = kmeans_gpu::descriptor<GpuAlgorithmFPType>()
@@ -303,10 +313,10 @@ static jlong doKMeansOneAPICompute(
*/
JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCenters(
JNIEnv *env, jobject obj, jlong pNumTabData, jlong pNumTabCenters,
jint clusterNum, jdouble tolerance, jint iterationNum, jint executorNum,
jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray,
jobject resultObj) {
JNIEnv *env, jobject obj, jlong pNumTabData, jlong numRows, jlong numClos,
jlong pNumTabCenters, jint clusterNum, jdouble tolerance, jint iterationNum,
jint executorNum, jint executorCores, jint computeDeviceOrdinal,
jintArray gpuIdxArray, jobject resultObj) {
logger::println(logger::INFO,
"OneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());
@@ -352,9 +362,9 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
auto comm =
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
ret =
doKMeansOneAPICompute(env, pNumTabData, pNumTabCenters, clusterNum,
tolerance, iterationNum, comm, resultObj);
ret = doKMeansOneAPICompute(env, pNumTabData, numRows, numClos,
pNumTabCenters, clusterNum, tolerance,
iterationNum, comm, resultObj, queue);

env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
break;
46 changes: 33 additions & 13 deletions mllib-dal/src/main/native/LinearRegressionImpl.cpp
@@ -216,9 +216,10 @@ ridge_regression_compute(size_t rankId, ccl::communicator &comm,
#ifdef CPU_GPU_PROFILE
static jlong doLROneAPICompute(JNIEnv *env, size_t rankId,
ccl::communicator &cclComm, sycl::queue &queue,
jlong pData, jlong pLabel,
jboolean jfitIntercept, jint executorNum,
jobject resultObj) {
jlong pNumTabFeature, jlong featureRows,
jlong featureCols, jlong pNumTabLabel,
jlong labelCols, jboolean jfitIntercept,
jint executorNum, jobject resultObj) {
logger::println(logger::INFO,
"oneDAL (native): GPU compute start , rankid %d", rankId);
const bool isRoot = (rankId == ccl_root);
@@ -228,9 +229,29 @@ static jlong doLROneAPICompute(JNIEnv *env, size_t rankId,
ccl::shared_ptr_class<ccl::kvs> &kvs = getKvs();
auto comm = preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);

homogen_table xtrain = *reinterpret_cast<const homogen_table *>(pData);
homogen_table ytrain = *reinterpret_cast<const homogen_table *>(pLabel);
GpuAlgorithmFPType *htableFeatureArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabFeature);
GpuAlgorithmFPType *htableLabelArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabLabel);
auto featureData = sycl::malloc_shared<GpuAlgorithmFPType>(
featureRows * featureCols, queue);
queue
.memcpy(featureData, htableFeatureArray,
sizeof(GpuAlgorithmFPType) * featureRows * featureCols)
.wait();
homogen_table xtrain{
queue, featureData, featureRows, featureCols,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};

auto labelData =
sycl::malloc_shared<GpuAlgorithmFPType>(featureRows * labelCols, queue);
queue
.memcpy(labelData, htableLabelArray,
sizeof(GpuAlgorithmFPType) * featureRows * labelCols)
.wait();
homogen_table ytrain{
queue, labelData, featureRows, labelCols,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};

linear_regression_gpu::train_input local_input{xtrain, ytrain};
const auto linear_regression_desc =
Expand All @@ -256,7 +277,8 @@ static jlong doLROneAPICompute(JNIEnv *env, size_t rankId,
*/
JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTrainDAL(
JNIEnv *env, jobject obj, jlong data, jlong label, jboolean fitIntercept,
JNIEnv *env, jobject obj, jlong feature, jlong featureRows,
jlong featureCols, jlong label, jlong labelCols, jboolean fitIntercept,
jdouble regParam, jdouble elasticNetParam, jint executorNum,
jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray,
jobject resultObj) {
@@ -288,16 +310,14 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra
auto queue =
getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);

jlong pDatagpu = (jlong)data;
jlong pLabelgpu = (jlong)label;
resultptr =
doLROneAPICompute(env, rankId, cclComm, queue, pDatagpu, pLabelgpu,
fitIntercept, executorNum, resultObj);
resultptr = doLROneAPICompute(
env, rankId, cclComm, queue, feature, featureRows, featureCols,
label, labelCols, fitIntercept, executorNum, resultObj);
env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
#endif
} else {
NumericTablePtr pLabel = *((NumericTablePtr *)label);
NumericTablePtr pData = *((NumericTablePtr *)data);
NumericTablePtr pData = *((NumericTablePtr *)feature);

// Set number of threads for oneDAL to use for each rank
services::Environment::getInstance()->setNumberOfThreads(executorCores);
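A note on memory lifetime that applies to all of the constructions above: because each table is created with detail::make_default_delete<...>(queue), the USM allocation should be released automatically, via sycl::free on that queue, when the last handle to the table goes away, so the JNI layer does not free the staged buffer itself. This is my reading of the oneDAL ownership model, sketched below with placeholder names (hostPtr, rows, cols).

#include <cstdint>
#include <sycl/sycl.hpp>
#include "oneapi/dal/table/homogen.hpp"

// Ownership sketch (illustrative, not project code): the deleter handed to
// homogen_table is expected to free the USM buffer once the last table
// handle referencing it is destroyed.
static void compute_and_release(sycl::queue &queue, const float *hostPtr,
                                std::int64_t rows, std::int64_t cols) {
    auto *buf = sycl::malloc_shared<float>(rows * cols, queue);
    queue.memcpy(buf, hostPtr, sizeof(float) * rows * cols).wait();
    {
        oneapi::dal::homogen_table table{
            queue, buf, rows, cols,
            oneapi::dal::detail::make_default_delete<const float>(queue)};
        // ... pass `table` to the oneDAL algorithm here ...
    } // `table` goes out of scope: its deleter frees `buf` on `queue`,
      // so no explicit sycl::free(buf, queue) should follow.
}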

