Add API and logic to remap GPUs inside the framework.
liuliu committed Nov 19, 2024
1 parent 4ecbf13 commit 9a56808
Showing 5 changed files with 96 additions and 26 deletions.
17 changes: 9 additions & 8 deletions lib/nnc/ccv_nnc.h
@@ -894,6 +894,15 @@ void ccv_nnc_set_queue_watermark(int state);
 * @return How many in-flight GPU commands there can be.
*/
CCV_WARN_UNUSED(int) ccv_nnc_queue_watermark(void);
/**
* Set the device mapping to use a custom order for devices rather than the driver-imposed order. This is helpful
* for managing code that has no control over which GPU gets used. The previous permutation is cleared on
* each call, and you can pass a 0-size device map to clear all custom mappings.
* @param type Currently, only CCV_NNC_STREAM_CONTEXT_GPU on NVIDIA systems is supported.
* @param device_map The array of device mappings, maximum 64 devices.
* @param size The size of the array; only the first 64 entries will be used.
*/
void ccv_nnc_set_device_permutation(const int type, const int* const device_map, const int size);
/**
* Quantize a given memory region of a given datatype / memory residence into an nbits palette.
* @param input The input memory region, it can be CCV_64F, CCV_32F or CCV_16F.
@@ -1073,14 +1082,6 @@ void ccv_nnc_stream_signal_free(ccv_nnc_stream_signal_t* const signal);
* @return The number of devices.
*/
CCV_WARN_UNUSED(int) ccv_nnc_device_count(const int type);
/**
* Remap a source device as the destination device.
* @param type The type of devices (CCV_NNC_STREAM_CONTEXT_GPU / CCV_NNC_STREAM_CONTEXT_CPU)
* @param source The original device id.
* @param destination The new device id.
* @return 0 if the device remap is successful, -1 if it is not.
*/
CCV_WARN_UNUSED(int) ccv_nnc_device_remap(const int type, const int source, const int destination);
/**
* The neighbor discovery function that will be called with the device id.
*/
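For reference, here is a minimal usage sketch of the new API (not part of the commit). It mirrors the call pattern added to test/int/nnc/cifar.tests.c below; the include path, the ccv_nnc_init() setup call, and the reversed permutation for a 4-GPU machine are illustrative assumptions, not prescribed by this change.

#include <nnc/ccv_nnc.h> /* include path assumed; adjust to your build setup */

int main(void)
{
	ccv_nnc_init(); /* usual library initialization */
	/* Illustrative permutation for a 4-GPU box: framework device 0 now refers to driver device 3, and so on. */
	int device_map[4] = {3, 2, 1, 0};
	ccv_nnc_set_device_permutation(CCV_STREAM_CONTEXT_GPU, device_map, 4);
	/* ... allocate tensors and stream contexts against GPU ids 0..3; they resolve to the permuted driver devices ... */
	/* A 0-size device map clears the permutation and restores the driver-imposed order. */
	ccv_nnc_set_device_permutation(CCV_STREAM_CONTEXT_GPU, 0, 0);
	return 0;
}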
9 changes: 9 additions & 0 deletions lib/nnc/ccv_nnc_cmd.c
@@ -730,3 +730,12 @@ void ccv_nnc_set_queue_watermark(int watermark)
ccv_nnc_mps_set_queue_watermark(watermark);
#endif
}

void ccv_nnc_set_device_permutation(const int type, const int* const device_map, const int size)
{
if (type != CCV_STREAM_CONTEXT_GPU)
return;
#ifdef HAVE_CUDA
cusetdevicemap(device_map, size);
#endif
}
92 changes: 74 additions & 18 deletions lib/nnc/gpu/ccv_nnc_compat.cu
@@ -6,6 +6,7 @@ extern "C" {
}

static void cutrigmp(void);
static int cudevicemap(const int device_id);

#ifdef HAVE_CUDNN
struct cudnn_free_list_s {
@@ -16,8 +17,10 @@ KHASH_MAP_INIT_INT(cudnn_free, struct cudnn_free_list_s*);
static pthread_mutex_t g_cudnn_mutex = PTHREAD_MUTEX_INITIALIZER;
static khash_t(cudnn_free)* g_cudnn = 0;

cudnnHandle_t cudnn_get(const int type)
static cudnnHandle_t cudnn_get(int type)
{
const int device_id = cudevicemap(CCV_STREAM_GET_DEVICE_ID(type));
CCV_STREAM_SET_DEVICE_ID(type, device_id);
pthread_mutex_lock(&g_cudnn_mutex);
if (!g_cudnn)
g_cudnn = kh_init(cudnn_free);
@@ -45,8 +48,10 @@ cudnnHandle_t cudnn_get(const int type)
return cudnn;
}

void cudnn_save(const int type, cudnnHandle_t cudnn)
static void cudnn_save(int type, cudnnHandle_t cudnn)
{
const int device_id = cudevicemap(CCV_STREAM_GET_DEVICE_ID(type));
CCV_STREAM_SET_DEVICE_ID(type, device_id);
pthread_mutex_lock(&g_cudnn_mutex);
int ret;
khiter_t i = kh_put(cudnn_free, g_cudnn, type, &ret);
@@ -57,7 +62,7 @@ void cudnn_save(const int type, cudnnHandle_t cudnn)
pthread_mutex_unlock(&g_cudnn_mutex);
}

void cudnn_pressure(const int device_id)
static void cudnn_pressure(const int device_id)
{
pthread_mutex_lock(&g_cudnn_mutex);
if (g_cudnn)
@@ -160,6 +165,54 @@ void cuunregmp(const int slot)
pthread_mutex_unlock(&g_mp_mutex);
}

static int cuda_device_map[64] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};

static int cuda_device_reverse_map[64] = {
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};

void cusetdevicemap(const int* const device_map, const int size)
{
int i;
for (i = 0; i < sizeof(cuda_device_reverse_map) / sizeof(cuda_device_reverse_map[0]); i++)
cuda_device_reverse_map[i] = -1;
for (i = size; i < sizeof(cuda_device_map) / sizeof(cuda_device_map[0]); i++)
cuda_device_map[i] = -1;
for (i = 0; i < ccv_min(sizeof(cuda_device_map) / sizeof(cuda_device_map[0]), size); i++)
{
cuda_device_map[i] = device_map[i];
cuda_device_reverse_map[device_map[i]] = i;
}
}

static int cudevicemap(const int device_id)
{
if (device_id >= sizeof(cuda_device_map) / sizeof(cuda_device_map[0]))
return device_id;
const int new_device_id = cuda_device_map[device_id];
if (new_device_id < 0)
return device_id;
return new_device_id;
}

static int cudevicereversemap(const int device_id)
{
if (device_id >= sizeof(cuda_device_reverse_map) / sizeof(cuda_device_reverse_map[0]))
return device_id;
const int new_device_id = cuda_device_reverse_map[device_id];
if (new_device_id < 0)
return device_id;
return new_device_id;
}

static void cutrigmp(void)
{
int device_id;
@@ -170,7 +223,7 @@
{
cump_t* const mp = (cump_t*)ccv_array_get(g_mp_h, i);
if (mp->device_id == device_id && mp->func)
mp->func(device_id, mp->ctx);
mp->func(cudevicereversemap(device_id), mp->ctx);
}
pthread_mutex_unlock(&g_mp_mutex);
// Set back the device id.
@@ -183,7 +236,7 @@
void* cumalloc(int device, size_t size)
{
void* ptr = 0;
CUDA_ENFORCE(cudaSetDevice(device));
CUDA_ENFORCE(cudaSetDevice(cudevicemap(device)));
cudaMalloc(&ptr, size);
if (ptr == 0)
{
@@ -195,14 +248,14 @@ void* cumalloc(int device, size_t size)

void cufree(int device, void* ptr)
{
CUDA_ENFORCE(cudaSetDevice(device));
CUDA_ENFORCE(cudaSetDevice(cudevicemap(device)));
CUDA_ENFORCE(cudaFree(ptr));
}

void cudevice(int device)
{
if (device >= 0)
CUDA_ENFORCE(cudaSetDevice(device));
CUDA_ENFORCE(cudaSetDevice(cudevicemap(device)));
}

void cumemcpy(void* dest, const int dest_type, const void* src, const int src_type, size_t n)
@@ -211,18 +264,18 @@ void cumemcpy(void* dest, const int dest_type, const void* src, const int src_ty
return;
if (CCV_TENSOR_GET_MEMORY(src_type) == CCV_TENSOR_CPU_MEMORY && CCV_TENSOR_GET_MEMORY(dest_type) == CCV_TENSOR_GPU_MEMORY) {
const int device_b = CCV_TENSOR_GET_DEVICE_ID(dest_type);
CUDA_ENFORCE(cudaSetDevice(device_b));
CUDA_ENFORCE(cudaSetDevice(cudevicemap(device_b)));
CUDA_ENFORCE(cudaMemcpy(dest, src, n, cudaMemcpyHostToDevice));
} else if (CCV_TENSOR_GET_MEMORY(src_type) == CCV_TENSOR_GPU_MEMORY && CCV_TENSOR_GET_MEMORY(dest_type) == CCV_TENSOR_CPU_MEMORY) {
const int device_a = CCV_TENSOR_GET_DEVICE_ID(src_type);
CUDA_ENFORCE(cudaSetDevice(device_a));
CUDA_ENFORCE(cudaSetDevice(cudevicemap(device_a)));
CUDA_ENFORCE(cudaMemcpy(dest, src, n, cudaMemcpyDeviceToHost));
} else if (CCV_TENSOR_GET_MEMORY(src_type) == CCV_TENSOR_CPU_MEMORY && CCV_TENSOR_GET_MEMORY(dest_type) == CCV_TENSOR_CPU_MEMORY)
CUDA_ENFORCE(cudaMemcpy(dest, src, n, cudaMemcpyHostToHost));
else if (CCV_TENSOR_GET_MEMORY(src_type) == CCV_TENSOR_GPU_MEMORY && CCV_TENSOR_GET_MEMORY(dest_type) == CCV_TENSOR_GPU_MEMORY) {
const int device_a = CCV_TENSOR_GET_DEVICE_ID(src_type);
const int device_b = CCV_TENSOR_GET_DEVICE_ID(dest_type);
CUDA_ENFORCE(cudaSetDevice(device_b));
CUDA_ENFORCE(cudaSetDevice(cudevicemap(device_b)));
if (device_a == device_b)
CUDA_ENFORCE(cudaMemcpy(dest, src, n, cudaMemcpyDeviceToDevice));
else
@@ -334,7 +387,7 @@ static ccv_nnc_stream_context_device_local_t* _ccv_nnc_stream_compat_device_loca
}
return stream_compat->_heap_gpus + device_id;
} else {
CUDA_ENFORCE(cudaSetDevice(device_id));
CUDA_ENFORCE(cudaSetDevice(cudevicemap(device_id)));
return &stream_compat->_inline_gpu;
}
}
@@ -359,7 +412,7 @@ ccv_nnc_stream_signal_t* ccv_nnc_init_stream_signal(ccv_nnc_stream_signal_t* con
assert(CCV_STREAM_GET_CONTEXT(((int*)signal)[0]) == CCV_STREAM_CONTEXT_GPU);
ccv_nnc_stream_compat_signal_t* compat_signal = (ccv_nnc_stream_compat_signal_t*)ccrealloc(signal, sizeof(ccv_nnc_stream_compat_signal_t));
const int device = CCV_STREAM_GET_DEVICE_ID(compat_signal->super.type);
CUDA_ENFORCE(cudaSetDevice(device));
CUDA_ENFORCE(cudaSetDevice(cudevicemap(device)));
CUDA_ENFORCE(cudaEventCreateWithFlags(&compat_signal->event, cudaEventDisableTiming));
return (ccv_nnc_stream_signal_t*)compat_signal;
}
@@ -388,7 +441,7 @@ void ccv_nnc_deinit_stream_signal(ccv_nnc_stream_signal_t* const signal)
{
ccv_nnc_stream_compat_signal_t* compat_signal = (ccv_nnc_stream_compat_signal_t*)signal;
const int device = CCV_STREAM_GET_DEVICE_ID(compat_signal->super.type);
CUDA_ENFORCE(cudaSetDevice(device));
CUDA_ENFORCE(cudaSetDevice(cudevicemap(device)));
CUDA_ENFORCE(cudaEventDestroy(compat_signal->event));
}

@@ -458,6 +511,7 @@ void* ccv_nnc_stream_compat_get_workspace(const ccv_nnc_stream_context_t* const
return device_local->workspace;
int device_id;
CUDA_ENFORCE(cudaGetDevice(&device_id));
device_id = cudevicereversemap(device_id);
device_local->workspace_size = workspace_size;
if (device_local->workspace)
CUDA_ENFORCE(cudaFree(device_local->workspace));
@@ -494,7 +548,7 @@ void ccv_nnc_stream_compat_drain(ccv_nnc_stream_context_t* const stream_context)
stream_compat->_heap_gpus[i].workspace_size = 0;
}
} else if (stream_compat->_inline_gpu.workspace) {
CUDA_ENFORCE(cudaSetDevice(device));
CUDA_ENFORCE(cudaSetDevice(cudevicemap(device)));
CUDA_ENFORCE(cudaFree(stream_compat->_inline_gpu.workspace));
stream_compat->_inline_gpu.workspace = 0;
stream_compat->_inline_gpu.workspace_size = 0;
@@ -613,7 +667,7 @@ void ccv_nnc_deinit_stream_context(ccv_nnc_stream_context_t* const stream_contex
cuunregmp(stream_compat->_heap_gpus[i].mp_hook - 1);
}
} else {
CUDA_ENFORCE(cudaSetDevice(device));
CUDA_ENFORCE(cudaSetDevice(cudevicemap(device)));
if (stream_compat->_inline_gpu.workspace)
{
CUDA_ENFORCE(cudaFree(stream_compat->_inline_gpu.workspace));
@@ -657,6 +711,7 @@ int ccv_nnc_stream_context_get_device(const ccv_nnc_stream_context_t* const stre
{
int device = 0;
CUDA_ENFORCE(cudaGetDevice(&device));
device = cudevicereversemap(device);
return device;
}
const ccv_nnc_stream_context_compat_t* stream_compat = (const ccv_nnc_stream_context_compat_t*)stream_context;
@@ -838,6 +893,7 @@ cudnnHandle_t ccv_nnc_stream_context_get_cudnn(const ccv_nnc_stream_context_t* c
{
int device_id;
CUDA_ENFORCE(cudaGetDevice(&device_id));
device_id = cudevicereversemap(device_id);
device_local->mp_hook = curegmp(device_id, _ccv_nnc_device_local_drain, device_local) + 1;
}
}
@@ -1440,14 +1496,14 @@ ncclComm_t ccv_nnc_nccl_get_comm(ccv_nnc_stream_context_t* const stream, const i
stream_compat->super.resource_container[0] = (ccv_nnc_stream_resource_container_t*)cccalloc(1, sizeof(ccv_nnc_stream_resource_container_compat_t));
ccv_nnc_stream_resource_container_compat_t* const resource_container_compat = (ccv_nnc_stream_resource_container_compat_t*)stream_compat->super.resource_container[0];
if (resource_container_compat->comms && resource_container_compat->comm_count == device_count)
return resource_container_compat->comms[device_id];
return resource_container_compat->comms[cudevicemap(device_id)];
if (resource_container_compat->comms)
resource_container_compat->comms = (ncclComm_t*)ccrealloc(resource_container_compat->comms, sizeof(ncclComm_t) * device_count);
else
resource_container_compat->comms = (ncclComm_t*)ccmalloc(sizeof(ncclComm_t) * device_count);
_ccv_nnc_nccl_redo_comms(resource_container_compat->comms, resource_container_compat->comm_count, device_count);
resource_container_compat->comm_count = device_count;
return resource_container_compat->comms[device_id];
return resource_container_compat->comms[cudevicemap(device_id)];
} else {
static ncclComm_t comms[CCV_TENSOR_GET_DEVICE_ID(CCV_COMPUTE_DEVICE_ANY)];
static int comm_count = 0;
@@ -1456,7 +1512,7 @@
_ccv_nnc_nccl_redo_comms(comms, comm_count, device_count);
comm_count = device_count;
}
return comms[device_id];
return comms[cudevicemap(device_id)];
}
}

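To make the semantics of the new mapping helpers concrete, here is a standalone sketch (not part of the commit): the forward map translates a framework device id into the driver id before cudaSetDevice, and the reverse map translates the id reported by cudaGetDevice back into the framework's numbering so callers keep seeing the permuted order. set_map, to_driver, and to_framework are hypothetical stand-ins for cusetdevicemap, cudevicemap, and cudevicereversemap.

#include <assert.h>
#include <stddef.h>

#define MAP_SIZE 64 /* matches the 64-entry tables above */

static int forward_map[MAP_SIZE]; /* framework id -> driver id, -1 means identity */
static int reverse_map[MAP_SIZE]; /* driver id -> framework id, -1 means identity */

/* Mirrors cusetdevicemap: reset both tables, then record the permutation and its inverse. */
static void set_map(const int* map, int size)
{
	int i;
	for (i = 0; i < MAP_SIZE; i++)
		reverse_map[i] = -1;
	for (i = size; i < MAP_SIZE; i++)
		forward_map[i] = -1;
	for (i = 0; i < (size < MAP_SIZE ? size : MAP_SIZE); i++) {
		forward_map[i] = map[i];
		reverse_map[map[i]] = i;
	}
}

/* Analogous to cudevicemap: framework device id -> driver device id. */
static int to_driver(int framework_id)
{
	if (framework_id >= MAP_SIZE || forward_map[framework_id] < 0)
		return framework_id;
	return forward_map[framework_id];
}

/* Analogous to cudevicereversemap: driver device id -> framework device id. */
static int to_framework(int driver_id)
{
	if (driver_id >= MAP_SIZE || reverse_map[driver_id] < 0)
		return driver_id;
	return reverse_map[driver_id];
}

int main(void)
{
	set_map(NULL, 0); /* no permutation yet: identity mapping */
	int map[4] = {3, 2, 1, 0};
	set_map(map, 4);
	assert(to_driver(0) == 3); /* framework GPU 0 runs on driver GPU 3 */
	assert(to_framework(3) == 0); /* the driver id reported back maps to framework GPU 0 */
	set_map(NULL, 0); /* a 0-size map restores the driver order */
	assert(to_driver(0) == 0);
	return 0;
}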
1 change: 1 addition & 0 deletions lib/nnc/gpu/ccv_nnc_compat.h
@@ -33,6 +33,7 @@ typedef void(*cump_f)(int device_id, void* const context);
int curegmp(int device_id, cump_f func, void* const context); // register memory pressure handler
void cuunregmp(const int id); // un-register memory pressure handler.
void cusetprofiler(int state);
void cusetdevicemap(const int* const device_map, const int size);

// Stream context
CCV_WARN_UNUSED(ccv_nnc_stream_context_t*) ccv_nnc_init_stream_context(ccv_nnc_stream_context_t* const stream_context);
3 changes: 3 additions & 0 deletions test/int/nnc/cifar.tests.c
@@ -82,6 +82,8 @@ static ccv_cnnp_model_t* _cifar_10_dawn(const int softmax)
static int train_cifar_10(const int epoch_limit, ccv_array_t* const training_set, const int batch_size, const float mean[3], ccv_array_t* const test_set)
{
ccv_cnnp_model_t* const cifar_10 = _cifar_10_dawn(1);
int device_map[4] = {3, 2, 1, 0};
ccv_nnc_set_device_permutation(CCV_STREAM_CONTEXT_GPU, device_map, 4);
const int device_count = ccv_nnc_device_count(CCV_STREAM_CONTEXT_GPU);
if (device_count < 1)
return -1;
@@ -226,6 +228,7 @@ static int train_cifar_10(const int epoch_limit, ccv_array_t* const training_set
ccv_nnc_stream_context_free(stream_contexts[1]);
for (i = 0; i < device_count; i++)
ccv_nnc_tensor_free(cpu_outputs[i]);
ccv_nnc_set_device_permutation(CCV_STREAM_CONTEXT_GPU, 0, 0);
return correct;
}

Expand Down

0 comments on commit 9a56808

Please sign in to comment.