Skip to content

Commit

Permalink
Merge pull request #73 from borisfom/caffe-0.14
Browse files Browse the repository at this point in the history
Adding out of memory handler, making arena bins more detailed
  • Loading branch information
borisfom committed Nov 10, 2015
2 parents 62ad22d + 18fc267 commit 934b378
Show file tree
Hide file tree
Showing 2 changed files with 171 additions and 161 deletions.
326 changes: 168 additions & 158 deletions 3rdparty/cub/cub/util_allocator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -269,9 +269,9 @@ struct CachingDeviceAllocator
unsigned int max_bin, ///< Maximum bin
size_t max_cached_bytes, ///< Maximum aggregate cached bytes per device
bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called.
bool debug = false ///< Whether or not to print (de)allocation events to stdout
)
:
bool debug = false ///< Whether or not to print (de)allocation events to stdout
)
:
spin_lock(0),
bin_growth(bin_growth),
min_bin(min_bin),
Expand Down Expand Up @@ -302,8 +302,8 @@ struct CachingDeviceAllocator
* sets a maximum of 6,291,455 cached bytes per device
*/
CachingDeviceAllocator(
bool skip_cleanup = false,
bool debug = false)
bool skip_cleanup = false,
bool debug = false)
:
spin_lock(0),
bin_growth(8),
Expand Down Expand Up @@ -368,106 +368,123 @@ struct CachingDeviceAllocator

*d_ptr = NULL;

int entrypoint_device = INVALID_DEVICE_ORDINAL;
int entrypoint_device = INVALID_DEVICE_ORDINAL;
cudaError_t error = cudaSuccess;

do {

if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
if (device == INVALID_DEVICE_ORDINAL)
device = entrypoint_device;

// Round up to nearest bin size
unsigned int bin;
size_t bin_bytes;
NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
if (bin < min_bin) {
bin = min_bin;
bin_bytes = min_bin_bytes;
}

// Check if bin is greater than our maximum bin
if (bin > max_bin)
{
// Allocate the request exactly and give out-of-range bin
bin = (unsigned int) -1;
bin_bytes = bytes;
}

BlockDescriptor search_key(bin_bytes, bin, device, active_stream);

// Lock
Lock(&spin_lock);

// Find the range of freed blocks big enough within the same bin on the same device
CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);

// Look for freed blocks from the active stream or from other idle streams
bool found = false;
while ((block_itr != cached_blocks.end()) &&
(block_itr->device == device) &&
(block_itr->bin == search_key.bin))
{
cudaStream_t prev_stream = block_itr->associated_stream;
if ((active_stream == prev_stream) || (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
{
// Reuse existing cache block. Insert into live blocks.
found = true;
search_key = *block_itr;
search_key.associated_stream = active_stream;
live_blocks.insert(search_key);

// Remove from free blocks
cached_blocks.erase(block_itr);
cached_bytes[device].free -= search_key.bytes;
cached_bytes[device].busy += search_key.bytes;

if (debug) CubLog("\tdevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) prev_stream);

break;
}

block_itr++;
}

if (device == INVALID_DEVICE_ORDINAL) {
if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
return error;
device = entrypoint_device;
}

// Round up to nearest bin size
unsigned int bin;
size_t bin_bytes;
NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
if (bin < min_bin) {
bin = min_bin;
bin_bytes = min_bin_bytes;
}

if (!found)
{
Unlock(&spin_lock);
// Set to specified device
if (device != entrypoint_device) {
if (CubDebug(error = cudaSetDevice(device))) break;
}

// Allocate
if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;
if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) break;


Lock(&spin_lock);
// Insert into live blocks
live_blocks.insert(search_key);
cached_bytes[device].busy += search_key.bytes;

if (debug) CubLog("\tdevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);
}
// Check if bin is greater than our maximum bin
if (bin > max_bin)
{
// Allocate the request exactly and give out-of-range bin
bin = (unsigned int) -1;
bin_bytes = bytes;
}

// Copy device pointer to output parameter
*d_ptr = search_key.d_ptr;
if (debug) CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
(long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);

} while(0);
BlockDescriptor search_key(bin_bytes, bin, device, active_stream);

Unlock(&spin_lock);
// Lock while we search
Lock(&spin_lock);

// Attempt to revert back to previous device if necessary
if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
// Find the range of freed blocks big enough within the same bin on the same device
CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);

// Look for freed blocks from the active stream or from other idle streams
bool found = false;
while ( (block_itr != cached_blocks.end())
&& (block_itr->device == device)
&& (block_itr->bin == search_key.bin)) {
cudaStream_t prev_stream = block_itr->associated_stream;
if ((active_stream == prev_stream)
|| (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) {
// Reuse existing cache block. Insert into live blocks.
found = true;
search_key = *block_itr;
search_key.associated_stream = active_stream;
live_blocks.insert(search_key);

// Remove from free blocks
cached_blocks.erase(block_itr);
cached_bytes[device].free -= search_key.bytes;
cached_bytes[device].busy += search_key.bytes;

if (debug) CubLog("\tdevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n",
device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) prev_stream);

break;
}

block_itr++;
}
// done searching. Unlock.

Unlock(&spin_lock);

if (!found)
{

// Set to specified device. Entrypoint may not be set.
if (device != entrypoint_device) {
if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
return error;
if (CubDebug(error = cudaSetDevice(device))) return error;
}

// Allocate
error = cudaMalloc(&search_key.d_ptr, search_key.bytes);

if (error != cudaSuccess) {
if (debug) CubLog("\tdevice %d failed to allocate %lld bytes for stream %lld",
device, (long long) search_key.bytes, (long long) search_key.associated_stream);

// if (search_key.bytes < cached_bytes[device]) {
// free all cached memory (for all devices), synchronize and retry once
cudaDeviceSynchronize();
cudaThreadSynchronize();
FreeAllCached();
cudaDeviceSynchronize();
cudaThreadSynchronize();
error = cudaMalloc(&search_key.d_ptr, search_key.bytes);
// }
}
if (CubDebug(error))
return error;
if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming)))
return error;

// Insert into live blocks
Lock(&spin_lock);
live_blocks.insert(search_key);
cached_bytes[device].busy += search_key.bytes;
Unlock(&spin_lock);

if (debug) CubLog("\tdevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n",
device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream);

// Attempt to revert back to previous device if necessary
if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
}

// Copy device pointer to output parameter
*d_ptr = search_key.d_ptr;
if (debug) CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n",
(long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);

return error;

Expand Down Expand Up @@ -515,73 +532,66 @@ struct CachingDeviceAllocator
int entrypoint_device = INVALID_DEVICE_ORDINAL;
cudaError_t error = cudaSuccess;

if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error;
if (device == INVALID_DEVICE_ORDINAL) {
if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
return error;
device = entrypoint_device;
}

if (device == INVALID_DEVICE_ORDINAL)
device = entrypoint_device;

BlockDescriptor search_key(d_ptr, device);

do {

// Set to specified device
if (device != entrypoint_device) {
if (CubDebug(error = cudaSetDevice(device))) break;
}

// Lock
Lock(&spin_lock);

// Find corresponding block descriptor
BusyBlocks::iterator block_itr = live_blocks.find(search_key);
if (block_itr == live_blocks.end())
{
// Cannot find pointer
if (CubDebug(error = cudaErrorUnknown)) break;
}
else
{
// Remove from live blocks
search_key = *block_itr;
live_blocks.erase(block_itr);

cached_bytes[device].busy -= search_key.bytes;

// Check if we should keep the returned allocation
if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)
{
// Signal the event in the associated stream
if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) break;

// Insert returned allocation into free blocks
cached_blocks.insert(search_key);
cached_bytes[device].free += search_key.bytes;

if (debug) CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);
}
else {
// This means : actually delete the block after we release the lock
d_ptr = 0;
}
}
} while (0);
bool recached = false;

// Lock
Lock(&spin_lock);

// Find corresponding block descriptor
BusyBlocks::iterator block_itr = live_blocks.find(search_key);
if (block_itr != live_blocks.end()) {
// Remove from live blocks
search_key = *block_itr;
live_blocks.erase(block_itr);
cached_bytes[device].busy -= search_key.bytes;

// Check if we should keep the returned allocation
if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)
{
// Insert returned allocation into free blocks
cached_blocks.insert(search_key);
cached_bytes[device].free += search_key.bytes;
recached = true;
if (debug) {
CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n",
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(),
(long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);
}
}
}

Unlock(&spin_lock);

if (!d_ptr)
{
// Free device memory
if (recached) {
// Signal the event in the associated stream
if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream)))
return error;
} else {
// Set to specified device. Entrypoint may not be set.
if (device != entrypoint_device) {
if (CubDebug(error = cudaGetDevice(&entrypoint_device)))
return error;
if (CubDebug(error = cudaSetDevice(device))) return error;
}

// Actually free device memory
if (CubDebug(error = cudaFree(d_ptr))) return error;
if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error;

if (debug) CubLog("\tdevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy);
}
if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;

if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
{
if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
}
}

return error;
Expand Down Expand Up @@ -654,7 +664,7 @@ struct CachingDeviceAllocator
if (debug) CubLog("\tdevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n",
current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].free);
}

Unlock(&spin_lock);

// Attempt to revert back to entry-point device if necessary
Expand Down
6 changes: 3 additions & 3 deletions src/caffe/util/gpu_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,9 +162,9 @@ namespace caffe {
// if you are paranoid, that doesn't mean they are not after you :)
delete cubAlloc;

cubAlloc = new cub::CachingDeviceAllocator( 8, // defaults
3,
7,
cubAlloc = new cub::CachingDeviceAllocator( 2, // defaults
6,
16,
poolsize_,
false,
debug_);
Expand Down

0 comments on commit 934b378

Please sign in to comment.