Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ML-250] Replace sycl::gpu_selector{}.select_device() with manual GPU allocation #235

Merged
merged 26 commits into from
Apr 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions mllib-dal/src/main/native/Common.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#pragma once

#ifndef ONEDAL_DATA_PARALLEL
#define ONEDAL_DATA_PARALLEL
#endif

#include "GPU.h"
#include "Communicator.hpp"
#include "OutputHelpers.hpp"
#include "oneapi/dal/table/homogen.hpp"
15 changes: 6 additions & 9 deletions mllib-dal/src/main/native/Communicator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,10 @@ class ccl_info {
friend class de::singleton<ccl_info>;

private:
ccl_info(int size, int rankId, const ccl::string &ipPort) {
ccl_info(int size, int rankId, ccl::shared_ptr_class<ccl::kvs> keyvs) {
rank = rankId;
rank_count = size;
ccl::string ccl_ip_port(ipPort);
auto kvs_attr = ccl::create_kvs_attr();
kvs_attr.set<ccl::kvs_attr_id::ip_port>(ccl_ip_port);
kvs = ccl::create_main_kvs(kvs_attr);
kvs = keyvs;
}

public:
Expand All @@ -49,17 +46,17 @@ class ccl_info {
};

// Builds a host-memory (non-USM) oneDAL SPMD communicator on top of an
// already-created CCL key-value store.
//
// @param size total number of ranks
// @param rank rank id of the calling process
// @param kvs  shared main KVS handle created at ccl::init time
// @return a ccl-backed communicator with device_memory_access::none
template <typename Backend>
communicator<device_memory_access::none>
make_communicator(int size, int rank, const ccl::shared_ptr_class<ccl::kvs> kvs) {
    auto &info = de::singleton<ccl_info>::get(size, rank, kvs);
    // integral cast
    return oneapi::dal::detail::ccl_communicator<device_memory_access::none>{
        info.kvs, info.rank, info.rank_count};
}

template <typename Backend>
communicator<device_memory_access::usm> make_communicator(sycl::queue& queue, int size, int rank, const ccl::string &ipPort) {
auto& info = de::singleton<ccl_info>::get(size, rank, ipPort);
communicator<device_memory_access::usm> make_communicator(sycl::queue& queue, int size, int rank, const ccl::shared_ptr_class<ccl::kvs> kvs) {
auto& info = de::singleton<ccl_info>::get(size, rank, kvs);
return oneapi::dal::detail::ccl_communicator<device_memory_access::usm>{
queue,
info.kvs,
Expand Down
72 changes: 35 additions & 37 deletions mllib-dal/src/main/native/CorrelationImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,8 @@
#include <chrono>

#ifdef CPU_GPU_PROFILE
#include "GPU.h"
#ifndef ONEDAL_DATA_PARALLEL
#define ONEDAL_DATA_PARALLEL
#endif
#include "Communicator.hpp"
#include "OutputHelpers.hpp"
#include "Common.hpp"
#include "oneapi/dal/algo/covariance.hpp"
#include "oneapi/dal/table/homogen.hpp"
#endif

#include "OneCCL.h"
Expand Down Expand Up @@ -153,41 +147,30 @@ static void doCorrelationDaalCompute(JNIEnv *env, jobject obj, int rankId,
#endif

#ifdef CPU_GPU_PROFILE
static void doCorrelationOneAPICompute(JNIEnv *env, jint rankId,
jlong pNumTabData, jint executorNum,
const ccl::string &ipPort,
ComputeDevice &device,
jobject resultObj) {
std::cout << "oneDAL (native): GPU compute start , rankid " << rankId
<< std::endl;
const bool isRoot = (rankId == ccl_root);
static void doCorrelationOneAPICompute(
JNIEnv *env, jlong pNumTabData,
preview::spmd::communicator<preview::spmd::device_memory_access::usm> comm,
jobject resultObj) {
std::cout << "oneDAL (native): GPU compute start" << std::endl;
const bool isRoot = (comm.get_rank() == ccl_root);
homogen_table htable =
*reinterpret_cast<const homogen_table *>(pNumTabData);

const auto cor_desc = covariance::descriptor{}.set_result_options(
covariance::result_options::cor_matrix |
covariance::result_options::means);
auto queue = getQueue(device);
auto comm = preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, executorNum, rankId, ipPort);
auto t1 = std::chrono::high_resolution_clock::now();
const auto result_train = preview::compute(comm, cor_desc, htable);
auto t2 = std::chrono::high_resolution_clock::now();
auto duration =
std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count();
std::cout << "Correlation (native): rankid " << rankId
<< "; computing step took " << duration / 1000 << " secs"
<< std::endl;
if (isRoot) {
std::cout << "Mean:\n" << result_train.get_means() << std::endl;
std::cout << "Correlation:\n"
<< result_train.get_cor_matrix() << std::endl;
t2 = std::chrono::high_resolution_clock::now();
duration =
auto t2 = std::chrono::high_resolution_clock::now();
auto duration =
std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1)
.count();
std::cout << "Correlation batch(native): computing step took "
<< duration / 1000 << " secs in end. " << std::endl;
<< duration / 1000 << " secs." << std::endl;
// Return all covariance & mean
jclass clazz = env->GetObjectClass(resultObj);

Expand All @@ -208,18 +191,18 @@ static void doCorrelationOneAPICompute(JNIEnv *env, jint rankId,
JNIEXPORT jlong JNICALL
Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
JNIEnv *env, jobject obj, jlong pNumTabData, jint executorNum,
jint executorCores, jint computeDeviceOrdinal, jint rankId, jstring ipPort,
jint executorCores, jint computeDeviceOrdinal, jintArray gpuIdxArray,
jobject resultObj) {
std::cout << "oneDAL (native): use DPC++ kernels " << std::endl;
const char *ipPortPtr = env->GetStringUTFChars(ipPort, 0);
std::string ipPortStr = std::string(ipPortPtr);

std::cout << "oneDAL (native): use DPC++ kernels "
<< "; device " << ComputeDeviceString[computeDeviceOrdinal]
<< std::endl;
ccl::communicator &cclComm = getComm();
int rankId = cclComm.rank();
ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);
switch (device) {
#ifdef CPU_ONLY_PROFILE
case ComputeDevice::host:
case ComputeDevice::cpu: {
ccl::communicator &comm = getComm();
NumericTablePtr pData = *((NumericTablePtr *)pNumTabData);
// Set number of threads for oneDAL to use for each rank
services::Environment::getInstance()->setNumberOfThreads(executorCores);
Expand All @@ -228,17 +211,32 @@ Java_com_intel_oap_mllib_stat_CorrelationDALImpl_cCorrelationTrainDAL(
services::Environment::getInstance()->getNumberOfThreads();
std::cout << "oneDAL (native): Number of CPU threads used"
<< nThreadsNew << std::endl;
doCorrelationDaalCompute(env, obj, rankId, comm, pData, executorNum,
doCorrelationDaalCompute(env, obj, rankId, cclComm, pData, executorNum,
resultObj);
}
#else
case ComputeDevice::gpu: {
doCorrelationOneAPICompute(env, rankId, pNumTabData, executorNum,
ipPortStr, device, resultObj);
int nGpu = env->GetArrayLength(gpuIdxArray);
std::cout << "oneDAL (native): use GPU kernels with " << nGpu
<< " GPU(s)"
<< " rankid " << rankId << std::endl;

jint *gpuIndices = env->GetIntArrayElements(gpuIdxArray, 0);

int size = cclComm.size();
ComputeDevice device = getComputeDeviceByOrdinal(computeDeviceOrdinal);

auto queue =
getAssignedGPU(device, cclComm, size, rankId, gpuIndices, nGpu);

ccl::shared_ptr_class<ccl::kvs> &kvs = getKvs();
auto comm =
preview::spmd::make_communicator<preview::spmd::backend::ccl>(
queue, size, rankId, kvs);
doCorrelationOneAPICompute(env, pNumTabData, comm, resultObj);
}
#endif
}

env->ReleaseStringUTFChars(ipPort, ipPortPtr);
return 0;
}
61 changes: 39 additions & 22 deletions mllib-dal/src/main/native/GPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,24 +50,6 @@ static int getLocalRank(ccl::communicator &comm, int size, int rank) {
// return 0;
}

sycl::device getAssignedGPU(ccl::communicator &comm, int size, int rankId,
jint *gpu_indices, int n_gpu) {
auto local_rank = getLocalRank(comm, size, rankId);
auto gpus = get_gpus();

std::cout << "rank: " << rankId << " size: " << size
<< " local_rank: " << local_rank << " n_gpu: " << n_gpu
<< std::endl;

auto gpu_selected = gpu_indices[local_rank % n_gpu];
std::cout << "GPU selected for current rank: " << gpu_selected << std::endl;

// In case gpu_selected index is larger than number of GPU SYCL devices
auto rank_gpu = gpus[gpu_selected % gpus.size()];

return rank_gpu;
}

static sycl::queue getSyclQueue(const sycl::device device) {
g_mtx.lock();
if (!g_queueVector.empty()) {
Expand All @@ -83,9 +65,8 @@ static sycl::queue getSyclQueue(const sycl::device device) {
}
}

sycl::queue getQueue(const ComputeDevice device) {
std::cout << "Get Queue" << std::endl;

// Maps the current rank to one of the GPUs listed in gpu_indices (round-robin
// over the node-local rank) and returns a fresh SYCL queue on that device.
// HOST/CPU devices are not supported and terminate the process.
//
// @param device      requested compute device (must be ComputeDevice::gpu)
// @param comm        CCL communicator used to derive the node-local rank
// @param size        total number of ranks
// @param rankId      global rank of the calling process
// @param gpu_indices JNI int array of candidate GPU ordinals for this node
// @param n_gpu       number of entries in gpu_indices
// @return a sycl::queue bound to the GPU assigned to this rank
sycl::queue getAssignedGPU(const ComputeDevice device, ccl::communicator &comm,
                           int size, int rankId, jint *gpu_indices, int n_gpu) {
    switch (device) {
    case ComputeDevice::host:
    case ComputeDevice::cpu: {
        std::cout << "Not implemented for HOST/CPU device, Please run on "
                     "GPU device."
                  << std::endl;
        exit(-1);
    }
    case ComputeDevice::gpu: {
        std::cout << "selector GPU" << std::endl;
        auto local_rank = getLocalRank(comm, size, rankId);
        auto gpus = get_gpus();

        std::cout << "rank: " << rankId << " size: " << size
                  << " local_rank: " << local_rank << " n_gpu: " << n_gpu
                  << std::endl;

        // Node-local rank cycles through the caller-supplied GPU ordinals.
        auto gpu_selected = gpu_indices[local_rank % n_gpu];
        std::cout << "GPU selected for current rank: " << gpu_selected
                  << std::endl;

        // In case gpu_selected index is larger than number of GPU SYCL devices
        auto rank_gpu = gpus[gpu_selected % gpus.size()];
        sycl::queue q{rank_gpu};
        return q;
    }

    default: {
        std::cout << "No Device!" << std::endl;
        exit(-1);
    }
    }
}

// Returns a SYCL queue on the default GPU chosen by sycl::gpu_selector
// (no explicit rank-to-GPU mapping — see getAssignedGPU for that).
// HOST/CPU devices are not supported and terminate the process.
//
// @param device requested compute device (must be ComputeDevice::gpu)
// @return a (possibly cached, via getSyclQueue) queue on the default GPU
sycl::queue getQueue(const ComputeDevice device) {
    std::cout << "Get Queue" << std::endl;

    switch (device) {
    case ComputeDevice::host:
    case ComputeDevice::cpu: {
        std::cout << "Not implemented for HOST/CPU device, Please run on "
                     "GPU device."
                  << std::endl;
        exit(-1);
    }
    case ComputeDevice::gpu: {
        // Debug "selector GPU" prints removed as requested in code review.
        auto device_gpu = sycl::gpu_selector{}.select_device();
        return getSyclQueue(device_gpu);
    }
    default: {
        std::cout << "No Device!" << std::endl;
        exit(-1);
    }
    }
}
8 changes: 2 additions & 6 deletions mllib-dal/src/main/native/GPU.h
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
#pragma once

#ifndef ONEDAL_DATA_PARALLEL
#define ONEDAL_DATA_PARALLEL
#endif

#include "service.h"
#include <CL/cl.h>
#include <CL/sycl.hpp>
#include <daal_sycl.h>
#include <jni.h>
#include <oneapi/ccl.hpp>

// Selects the GPU assigned to this rank (round-robin over gpu_indices by
// node-local rank) and returns a queue on it. Replaces the old overload that
// returned a bare sycl::device.
sycl::queue getAssignedGPU(const ComputeDevice device, ccl::communicator &comm,
                           int size, int rankId, jint *gpu_indices, int n_gpu);

// Returns a queue on the default GPU chosen by sycl::gpu_selector.
sycl::queue getQueue(const ComputeDevice device);
Loading