forked from IBMSparkGPU/CUDA-MLlib
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutilities.cu
94 lines (86 loc) · 3.34 KB
/
utilities.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include "utilities.h"
// Process-wide GPU selection state shared by all threads.
// deviceCount doubles as the initialization flag: -1 = not yet initialized,
// 0 = no usable GPUs, >0 = number of GPUs detected by cudaGetDeviceCount.
// deviceToUse is the round-robin cursor advanced by get_gpu(); it stays -1
// when no GPUs exist, which is what get_gpu() returns in that case.
struct GPUState {
int deviceCount; // number of GPUs to use
int deviceToUse; // GPU to use (round-robin)
pthread_mutex_t initLock; // serializes the one-time initialization in get_gpu()
} gpustate = {-1,-1,PTHREAD_MUTEX_INITIALIZER};
// Like cudaMalloc, but tries the GPU first; if device allocation fails it
// falls back to mapped, write-combined pinned host memory and stores the
// device-visible pointer for that buffer in *devPtr.
// Memory allocated via this function must be freed with freeBest, which
// dispatches on the pointer's memory type.
//
// devPtr: out-parameter receiving a pointer usable in device code.
// size:   allocation size in bytes.
// Returns cudaSuccess, or the error from the host-side fallback allocation.
cudaError_t mallocBest ( void **devPtr, size_t size ) {
    cudaError_t returnVal = cudaSuccess;
    if (cudaMalloc(devPtr, size) != cudaSuccess) {
        // CUDA errors are sticky: clear the failed cudaMalloc's error state so
        // it does not resurface from an unrelated later call (e.g. inside the
        // checkCudaErrors below or in the caller).
        cudaGetLastError();
        //fprintf(stderr, "Unable to malloc %zu bytes on the device - using host memory\n", size);
        void* h_ptr;
        // Write-combined memory is fast for host writes and device reads, but
        // host reads of this buffer will be slow.
        returnVal = cudaHostAlloc(&h_ptr, size, cudaHostAllocMapped | cudaHostAllocWriteCombined);
        if (returnVal == cudaSuccess) checkCudaErrors(cudaHostGetDevicePointer(devPtr, h_ptr, 0));
    }
    return returnVal;
}
// Counterpart to mallocBest: releases memory that may live either on the
// device (from cudaMalloc) or in mapped pinned host memory (from
// cudaHostAlloc), selecting the matching free routine by querying the
// pointer's attributes. Always returns cudaSuccess; failures inside the
// CUDA calls are reported through checkCudaErrors.
cudaError_t freeBest ( void *devPtr ) {
    cudaPointerAttributes attrs;
    checkCudaErrors(cudaPointerGetAttributes(&attrs, devPtr));
    //fprintf(stderr, "freeing memory of type %i\n", attrs.memoryType);
    const bool residesOnDevice = (attrs.memoryType == cudaMemoryTypeDevice);
    if (residesOnDevice) {
        checkCudaErrors(cudaFree(devPtr));
    } else {
        checkCudaErrors(cudaFreeHost(devPtr));
    }
    return cudaSuccess;
}
// Returns the index of the GPU the caller should run on, or -1 if no GPUs
// are available. Devices are handed out round-robin across all detected
// GPUs. Thread-safe: one-time initialization is guarded by
// gpustate.initLock, and the round-robin cursor is advanced with an atomic
// compare-and-swap.
int get_gpu() {
    if (gpustate.deviceCount == 1)
        return 0; // return immediately for the common case of 1 GPU
    else if (gpustate.deviceCount > 1) { // multiple GPUs: claim a slot
        int newval, oldval;
        do {
            oldval = gpustate.deviceToUse;
            newval = (oldval == gpustate.deviceCount-1) ? 0 : oldval+1;
        } while (!__sync_bool_compare_and_swap(&gpustate.deviceToUse, oldval, newval));
        // Return the value this thread installed, not the shared variable:
        // another thread may already have advanced gpustate.deviceToUse
        // again, so re-reading it would hand out an unclaimed slot.
        return newval;
    }
    else if (gpustate.deviceCount == -1) { // not yet initialized... run initialization
        pthread_mutex_lock(&gpustate.initLock);
        // check if another thread already completed initialization
        if (gpustate.deviceCount != -1) {
            pthread_mutex_unlock(&gpustate.initLock);
            return get_gpu();
        }
        // continue with initialization
        if (cudaGetDeviceCount(&gpustate.deviceCount)) {
            fprintf(stderr, "Cuda Error in GetDeviceCount: %s\n", cudaGetErrorString(cudaGetLastError()));
            gpustate.deviceCount = 0;
        }
        else if (gpustate.deviceCount <= 0)
            gpustate.deviceCount = 0;
        else
            gpustate.deviceToUse = 0;
        // Reset every device once up front so later work starts from a clean
        // context. NOTE(review): cudaDeviceReset() destroys any existing
        // context in this process — confirm no other component holds live
        // allocations at this point.
        for (int deviceID=0; deviceID<gpustate.deviceCount; deviceID++) {
            cudaSetDevice(deviceID);
            cudaDeviceReset();
        }
        pthread_mutex_unlock(&gpustate.initLock);
        // Re-dispatch now that deviceCount is final (it is no longer -1, so
        // this recursion terminates) so multi-GPU callers claim a proper
        // round-robin slot instead of all receiving device 0.
        return get_gpu();
    }
    // deviceCount == 0: no GPUs; deviceToUse is still -1.
    return gpustate.deviceToUse;
}