Create new HomogenTable with the assigned GPU before running algorithms.
Signed-off-by: minmingzhu <minming.zhu@intel.com>
minmingzhu committed Oct 11, 2023
1 parent 262a746 commit 44489c8
Showing 27 changed files with 335 additions and 147 deletions.
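Every file below applies the same change: instead of dereferencing a homogen_table that was built before the GPU was assigned, each native entry point now receives the raw host buffer together with its row and column counts, copies it into USM shared memory on the SYCL queue of the assigned GPU, and constructs the homogen_table against that queue. The following is a minimal sketch of that pattern, not the literal project code; the header paths, the float element type, and the helper name are assumptions, and it presumes oneDAL is built with the DPC++ (data-parallel) interface.

#include <cstdint>
#include <sycl/sycl.hpp>
#include "oneapi/dal/table/homogen.hpp"

namespace dal = oneapi::dal;

// Stage a host buffer on the assigned GPU's queue and wrap it in a table.
static dal::homogen_table make_gpu_table(sycl::queue &queue, const float *host,
                                         std::int64_t numRows,
                                         std::int64_t numCols) {
    // USM shared memory is reachable from both the host and the device.
    auto *data = sycl::malloc_shared<float>(numRows * numCols, queue);
    // Blocking copy of the row-major host data into the USM allocation.
    queue.memcpy(data, host, sizeof(float) * numRows * numCols).wait();
    // The default deleter frees the USM buffer when the table is destroyed.
    return dal::homogen_table{
        queue, data, numRows, numCols,
        dal::detail::make_default_delete<const float>(queue)};
}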
26 changes: 18 additions & 8 deletions mllib-dal/src/main/native/CorrelationImpl.cpp
@@ -149,13 +149,22 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, size_t rankId,

#ifdef CPU_GPU_PROFILE
static void doCorrelationOneAPICompute(
JNIEnv *env, jlong pNumTabData,
JNIEnv *env, jlong pNumTabData, jlong numRows, jlong numClos,
preview::spmd::communicator<preview::spmd::device_memory_access::usm> comm,
jobject resultObj) {
jobject resultObj, sycl::queue &queue) {
logger::println(logger::INFO, "oneDAL (native): GPU compute start");
const bool isRoot = (comm.get_rank() == ccl_root);
homogen_table htable =
*reinterpret_cast<const homogen_table *>(pNumTabData);
GpuAlgorithmFPType *htableArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabData);
auto data =
sycl::malloc_shared<GpuAlgorithmFPType>(numRows * numClos, queue);
queue
.memcpy(data, htableArray,
sizeof(GpuAlgorithmFPType) * numRows * numClos)
.wait();
homogen_table htable{
queue, data, numRows, numClos,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};

const auto cor_desc =
covariance_gpu::descriptor<GpuAlgorithmFPType>{}.set_result_options(
@@ -195,9 +204,9 @@ static void doCorrelationOneAPICompute(

JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
JNIEnv *env, jobject obj, jlong pNumTabData, jint executorNum,
jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray,
jobject resultObj) {
JNIEnv *env, jobject obj, jlong pNumTabData, jlong numRows, jlong numClos,
jint executorNum, jint executorCores, jint computeDeviceOrdinal,
jintArray gpuIdxArray, jobject resultObj) {
logger::println(logger::INFO,
"oneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());
@@ -240,7 +249,8 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
auto comm =
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
doCorrelationOneAPICompute(env, pNumTabData, comm, resultObj);
doCorrelationOneAPICompute(env, pNumTabData, numRows, numClos, comm,
resultObj, queue);
env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
break;
}
56 changes: 37 additions & 19 deletions mllib-dal/src/main/native/DecisionForestClassifierImpl.cpp
@@ -208,25 +208,41 @@ jobject collect_model(JNIEnv *env, const df::model<Task> &m,
}

static jobject doRFClassifierOneAPICompute(
JNIEnv *env, jlong pNumTabFeature, jlong pNumTabLabel, jint executorNum,
JNIEnv *env, jlong pNumTabFeature, jlong featureRows, jlong featureCols,
jlong pNumTabLabel, jlong labelCols, jint executorNum,
jint computeDeviceOrdinal, jint classCount, jint treeCount,
jint numFeaturesPerNode, jint minObservationsLeafNode,
jint minObservationsSplitNode, jdouble minWeightFractionLeafNode,
jdouble minImpurityDecreaseSplitNode, jint maxTreeDepth, jlong seed,
jint maxBins, jboolean bootstrap,
preview::spmd::communicator<preview::spmd::device_memory_access::usm> comm,
jobject resultObj) {
jobject resultObj, sycl::queue &queue) {
logger::println(logger::INFO, "oneDAL (native): GPU compute start");
const bool isRoot = (comm.get_rank() == ccl_root);
homogen_table hFeaturetable =
*reinterpret_cast<const homogen_table *>(pNumTabFeature);
homogen_table hLabeltable =
*reinterpret_cast<const homogen_table *>(pNumTabLabel);
logger::println(logger::INFO,
"doRFClassifierOneAPICompute get_column_count = %d",
hFeaturetable.get_column_count());
logger::println(logger::INFO, "doRFClassifierOneAPICompute classCount = %d",
classCount);
GpuAlgorithmFPType *htableFeatureArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabFeature);
GpuAlgorithmFPType *htableLabelArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabLabel);

auto featureData = sycl::malloc_shared<GpuAlgorithmFPType>(
featureRows * featureCols, queue);
queue
.memcpy(featureData, htableFeatureArray,
sizeof(GpuAlgorithmFPType) * featureRows * featureCols)
.wait();
homogen_table hFeaturetable{
queue, featureData, featureRows, featureCols,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};

auto labelData =
sycl::malloc_shared<GpuAlgorithmFPType>(featureRows * labelCols, queue);
queue
.memcpy(labelData, htableLabelArray,
sizeof(GpuAlgorithmFPType) * featureRows * labelCols)
.wait();
homogen_table hLabeltable{
queue, labelData, featureRows, labelCols,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};

const auto df_desc =
df::descriptor<GpuAlgorithmFPType, df::method::hist,
@@ -300,9 +316,10 @@ static jobject doRFClassifierOneAPICompute(
*/
JNIEXPORT jobject JNICALL
Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassifierTrainDAL(
JNIEnv *env, jobject obj, jlong pNumTabFeature, jlong pNumTabLabel,
jint executorNum, jint computeDeviceOrdinal, jint classCount,
jint treeCount, jint numFeaturesPerNode, jint minObservationsLeafNode,
JNIEnv *env, jobject obj, jlong pNumTabFeature, jlong featureRows,
jlong featureCols, jlong pNumTabLabel, jlong labelCols, jint executorNum,
jint computeDeviceOrdinal, jint classCount, jint treeCount,
jint numFeaturesPerNode, jint minObservationsLeafNode,
jint minObservationsSplitNode, jdouble minWeightFractionLeafNode,
jdouble minImpurityDecreaseSplitNode, jint maxTreeDepth, jlong seed,
jint maxBins, jboolean bootstrap, jintArray gpuIdxArray,
@@ -333,11 +350,12 @@ Java_com_intel_oap_mllib_classification_RandomForestClassifierDALImpl_cRFClassif
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
jobject hashmapObj = doRFClassifierOneAPICompute(
env, pNumTabFeature, pNumTabLabel, executorNum,
computeDeviceOrdinal, classCount, treeCount, numFeaturesPerNode,
minObservationsLeafNode, minObservationsSplitNode,
minWeightFractionLeafNode, minImpurityDecreaseSplitNode,
maxTreeDepth, seed, maxBins, bootstrap, comm, resultObj);
env, pNumTabFeature, featureRows, featureCols, pNumTabLabel,
labelCols, executorNum, computeDeviceOrdinal, classCount, treeCount,
numFeaturesPerNode, minObservationsLeafNode,
minObservationsSplitNode, minWeightFractionLeafNode,
minImpurityDecreaseSplitNode, maxTreeDepth, seed, maxBins,
bootstrap, comm, resultObj, queue);
return hashmapObj;
}
default: {
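The classifier and regressor paths repeat the copy-and-wrap logic twice, once for the feature buffer and once for the label buffer. A hedged refactoring sketch follows; toDeviceTable is a hypothetical helper that is not part of this commit, and it relies on the aliases these files already use (homogen_table, detail, GpuAlgorithmFPType).

// Hypothetical helper (not in this commit) that folds the repeated
// malloc_shared + memcpy + homogen_table construction into one place.
template <typename Fp>
static homogen_table toDeviceTable(sycl::queue &queue, jlong pNumTab,
                                   jlong rows, jlong cols) {
    auto *host = reinterpret_cast<Fp *>(pNumTab);
    auto *device = sycl::malloc_shared<Fp>(rows * cols, queue);
    queue.memcpy(device, host, sizeof(Fp) * rows * cols).wait();
    return homogen_table{queue, device, rows, cols,
                         detail::make_default_delete<const Fp>(queue)};
}

// Possible use inside doRFClassifierOneAPICompute:
//   homogen_table hFeaturetable = toDeviceTable<GpuAlgorithmFPType>(
//       queue, pNumTabFeature, featureRows, featureCols);
//   homogen_table hLabeltable = toDeviceTable<GpuAlgorithmFPType>(
//       queue, pNumTabLabel, featureRows, labelCols);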
50 changes: 35 additions & 15 deletions mllib-dal/src/main/native/DecisionForestRegressorImpl.cpp
@@ -207,18 +207,38 @@ jobject collect_model(JNIEnv *env, const df::model<Task> &m,
}

static jobject doRFRegressorOneAPICompute(
JNIEnv *env, jlong pNumTabFeature, jlong pNumTabLabel, jint executorNum,
JNIEnv *env, jlong pNumTabFeature, jlong featureRows, jlong featureCols,
jlong pNumTabLabel, jlong labelCols, jint executorNum,
jint computeDeviceOrdinal, jint treeCount, jint numFeaturesPerNode,
jint minObservationsLeafNode, jint maxTreeDepth, jlong seed, jint maxbins,
jboolean bootstrap,
preview::spmd::communicator<preview::spmd::device_memory_access::usm> comm,
jobject resultObj) {
jobject resultObj, sycl::queue &queue) {
logger::println(logger::INFO, "OneDAL (native): GPU compute start");
const bool isRoot = (comm.get_rank() == ccl_root);
homogen_table hFeaturetable =
*reinterpret_cast<const homogen_table *>(pNumTabFeature);
homogen_table hLabeltable =
*reinterpret_cast<const homogen_table *>(pNumTabLabel);
GpuAlgorithmFPType *htableFeatureArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabFeature);
GpuAlgorithmFPType *htableLabelArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabLabel);
auto featureData = sycl::malloc_shared<GpuAlgorithmFPType>(
featureRows * featureCols, queue);
queue
.memcpy(featureData, htableFeatureArray,
sizeof(GpuAlgorithmFPType) * featureRows * featureCols)
.wait();
homogen_table hFeaturetable{
queue, featureData, featureRows, featureCols,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};

auto labelData =
sycl::malloc_shared<GpuAlgorithmFPType>(featureRows * labelCols, queue);
queue
.memcpy(labelData, htableLabelArray,
sizeof(GpuAlgorithmFPType) * featureRows * labelCols)
.wait();
homogen_table hLabeltable{
queue, labelData, featureRows, labelCols,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};
logger::println(logger::INFO,
"doRFRegressorOneAPICompute get_column_count = %d",
hFeaturetable.get_column_count());
@@ -290,11 +310,11 @@ static jobject doRFRegressorOneAPICompute(

JNIEXPORT jobject JNICALL
Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTrainDAL(
JNIEnv *env, jobject obj, jlong pNumTabFeature, jlong pNumTabLabel,
jint executorNum, jint computeDeviceOrdinal, jint treeCount,
jint numFeaturesPerNode, jint minObservationsLeafNode, jint maxTreeDepth,
jlong seed, jint maxbins, jboolean bootstrap, jintArray gpuIdxArray,
jobject resultObj) {
JNIEnv *env, jobject obj, jlong pNumTabFeature, jlong featureRows,
jlong featureCols, jlong pNumTabLabel, jlong labelCols, jint executorNum,
jint computeDeviceOrdinal, jint treeCount, jint numFeaturesPerNode,
jint minObservationsLeafNode, jint maxTreeDepth, jlong seed, jint maxbins,
jboolean bootstrap, jintArray gpuIdxArray, jobject resultObj) {
logger::println(logger::INFO,
"OneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());
@@ -322,10 +342,10 @@ Java_com_intel_oap_mllib_regression_RandomForestRegressorDALImpl_cRFRegressorTra
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
jobject hashmapObj = doRFRegressorOneAPICompute(
env, pNumTabFeature, pNumTabLabel, executorNum,
computeDeviceOrdinal, treeCount, numFeaturesPerNode,
minObservationsLeafNode, maxTreeDepth, seed, maxbins, bootstrap,
comm, resultObj);
env, pNumTabFeature, featureRows, featureCols, pNumTabLabel,
labelCols, executorNum, computeDeviceOrdinal, treeCount,
numFeaturesPerNode, minObservationsLeafNode, maxTreeDepth, seed,
maxbins, bootstrap, comm, resultObj, queue);
return hashmapObj;
}
default: {
34 changes: 22 additions & 12 deletions mllib-dal/src/main/native/KMeansImpl.cpp
@@ -243,14 +243,24 @@ static jlong doKMeansDaalCompute(JNIEnv *env, jobject obj, size_t rankId,

#ifdef CPU_GPU_PROFILE
static jlong doKMeansOneAPICompute(
JNIEnv *env, jlong pNumTabData, jlong pNumTabCenters, jint clusterNum,
jdouble tolerance, jint iterationNum,
JNIEnv *env, jlong pNumTabData, jlong numRows, jlong numClos,
jlong pNumTabCenters, jint clusterNum, jdouble tolerance, jint iterationNum,
preview::spmd::communicator<preview::spmd::device_memory_access::usm> comm,
jobject resultObj) {
jobject resultObj, sycl::queue &queue) {
logger::println(logger::INFO, "OneDAL (native): GPU compute start");
const bool isRoot = (comm.get_rank() == ccl_root);
homogen_table htable =
*reinterpret_cast<const homogen_table *>(pNumTabData);
GpuAlgorithmFPType *htableArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabData);
auto data =
sycl::malloc_shared<GpuAlgorithmFPType>(numRows * numClos, queue);
queue
.memcpy(data, htableArray,
sizeof(GpuAlgorithmFPType) * numRows * numClos)
.wait();
homogen_table htable{
queue, data, numRows, numClos,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};

homogen_table centroids =
*reinterpret_cast<const homogen_table *>(pNumTabCenters);
const auto kmeans_desc = kmeans_gpu::descriptor<GpuAlgorithmFPType>()
@@ -303,10 +313,10 @@ static jlong doKMeansOneAPICompute(
*/
JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCenters(
JNIEnv *env, jobject obj, jlong pNumTabData, jlong pNumTabCenters,
jint clusterNum, jdouble tolerance, jint iterationNum, jint executorNum,
jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray,
jobject resultObj) {
JNIEnv *env, jobject obj, jlong pNumTabData, jlong numRows, jlong numClos,
jlong pNumTabCenters, jint clusterNum, jdouble tolerance, jint iterationNum,
jint executorNum, jint executorCores, jint computeDeviceOrdinal,
jintArray gpuIdxArray, jobject resultObj) {
logger::println(logger::INFO,
"OneDAL (native): use DPC++ kernels; device %s",
ComputeDeviceString[computeDeviceOrdinal].c_str());
@@ -352,9 +362,9 @@ Java_com_intel_oap_mllib_clustering_KMeansDALImpl_cKMeansOneapiComputeWithInitCe
auto comm =
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
ret =
doKMeansOneAPICompute(env, pNumTabData, pNumTabCenters, clusterNum,
tolerance, iterationNum, comm, resultObj);
ret = doKMeansOneAPICompute(env, pNumTabData, numRows, numClos,
pNumTabCenters, clusterNum, tolerance,
iterationNum, comm, resultObj, queue);

env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
break;
46 changes: 33 additions & 13 deletions mllib-dal/src/main/native/LinearRegressionImpl.cpp
@@ -216,9 +216,10 @@ ridge_regression_compute(size_t rankId, ccl::communicator &comm,
#ifdef CPU_GPU_PROFILE
static jlong doLROneAPICompute(JNIEnv *env, size_t rankId,
ccl::communicator &cclComm, sycl::queue &queue,
jlong pData, jlong pLabel,
jboolean jfitIntercept, jint executorNum,
jobject resultObj) {
jlong pNumTabFeature, jlong featureRows,
jlong featureCols, jlong pNumTabLabel,
jlong labelCols, jboolean jfitIntercept,
jint executorNum, jobject resultObj) {
logger::println(logger::INFO,
"oneDAL (native): GPU compute start , rankid %d", rankId);
const bool isRoot = (rankId == ccl_root);
@@ -228,9 +229,29 @@ static jlong doLROneAPICompute(JNIEnv *env, size_t rankId,
ccl::shared_ptr_class<ccl::kvs> &kvs = getKvs();
auto comm = preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);

homogen_table xtrain = *reinterpret_cast<const homogen_table *>(pData);
homogen_table ytrain = *reinterpret_cast<const homogen_table *>(pLabel);
GpuAlgorithmFPType *htableFeatureArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabFeature);
GpuAlgorithmFPType *htableLabelArray =
reinterpret_cast<GpuAlgorithmFPType *>(pNumTabLabel);
auto featureData = sycl::malloc_shared<GpuAlgorithmFPType>(
featureRows * featureCols, queue);
queue
.memcpy(featureData, htableFeatureArray,
sizeof(GpuAlgorithmFPType) * featureRows * featureCols)
.wait();
homogen_table xtrain{
queue, featureData, featureRows, featureCols,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};

auto labelData =
sycl::malloc_shared<GpuAlgorithmFPType>(featureRows * labelCols, queue);
queue
.memcpy(labelData, htableLabelArray,
sizeof(GpuAlgorithmFPType) * featureRows * labelCols)
.wait();
homogen_table ytrain{
queue, labelData, featureRows, labelCols,
detail::make_default_delete<const GpuAlgorithmFPType>(queue)};

linear_regression_gpu::train_input local_input{xtrain, ytrain};
const auto linear_regression_desc =
Expand All @@ -256,7 +277,8 @@ static jlong doLROneAPICompute(JNIEnv *env, size_t rankId,
*/
JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTrainDAL(
JNIEnv *env, jobject obj, jlong data, jlong label, jboolean fitIntercept,
JNIEnv *env, jobject obj, jlong feature, jlong featureRows,
jlong featureCols, jlong label, jlong labelCols, jboolean fitIntercept,
jdouble regParam, jdouble elasticNetParam, jint executorNum,
jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray,
jobject resultObj) {
@@ -288,16 +310,14 @@ Java_com_intel_oap_mllib_regression_LinearRegressionDALImpl_cLinearRegressionTra
auto queue =
getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);

jlong pDatagpu = (jlong)data;
jlong pLabelgpu = (jlong)label;
resultptr =
doLROneAPICompute(env, rankId, cclComm, queue, pDatagpu, pLabelgpu,
fitIntercept, executorNum, resultObj);
resultptr = doLROneAPICompute(
env, rankId, cclComm, queue, feature, featureRows, featureCols,
label, labelCols, fitIntercept, executorNum, resultObj);
env->ReleaseIntArrayElements(gpuIdxArray, gpuIndices, 0);
#endif
} else {
NumericTablePtr pLabel = *((NumericTablePtr *)label);
NumericTablePtr pData = *((NumericTablePtr *)data);
NumericTablePtr pData = *((NumericTablePtr *)feature);

// Set number of threads for oneDAL to use for each rank
services::Environment::getInstance()->setNumberOfThreads(executorCores);
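A note on memory lifetime that applies to all of the constructions above: because each table is created with detail::make_default_delete<...>(queue), the USM allocation should be released automatically, via sycl::free on that queue, when the last handle to the table goes away, so the JNI layer does not free the staged buffer itself. This is my reading of the oneDAL ownership model, sketched below with placeholder names (hostPtr, rows, cols).

#include <cstdint>
#include <sycl/sycl.hpp>
#include "oneapi/dal/table/homogen.hpp"

// Ownership sketch (illustrative, not project code): the deleter handed to
// homogen_table is expected to free the USM buffer once the last table
// handle referencing it is destroyed.
static void compute_and_release(sycl::queue &queue, const float *hostPtr,
                                std::int64_t rows, std::int64_t cols) {
    auto *buf = sycl::malloc_shared<float>(rows * cols, queue);
    queue.memcpy(buf, hostPtr, sizeof(float) * rows * cols).wait();
    {
        oneapi::dal::homogen_table table{
            queue, buf, rows, cols,
            oneapi::dal::detail::make_default_delete<const float>(queue)};
        // ... pass `table` to the oneDAL algorithm here ...
    } // `table` goes out of scope: its deleter frees `buf` on `queue`,
      // so no explicit sycl::free(buf, queue) should follow.
}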

