diff --git a/3rdparty/cub/cub/util_allocator.cuh b/3rdparty/cub/cub/util_allocator.cuh index 051f82198c3..7f6bebe7f74 100644 --- a/3rdparty/cub/cub/util_allocator.cuh +++ b/3rdparty/cub/cub/util_allocator.cuh @@ -269,9 +269,9 @@ struct CachingDeviceAllocator unsigned int max_bin, ///< Maximum bin size_t max_cached_bytes, ///< Maximum aggregate cached bytes per device bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called. - bool debug = false ///< Whether or not to print (de)allocation events to stdout - ) - : + bool debug = false ///< Whether or not to print (de)allocation events to stdout + ) + : spin_lock(0), bin_growth(bin_growth), min_bin(min_bin), @@ -302,8 +302,8 @@ struct CachingDeviceAllocator * sets a maximum of 6,291,455 cached bytes per device */ CachingDeviceAllocator( - bool skip_cleanup = false, - bool debug = false) + bool skip_cleanup = false, + bool debug = false) : spin_lock(0), bin_growth(8), @@ -368,106 +368,123 @@ struct CachingDeviceAllocator *d_ptr = NULL; - int entrypoint_device = INVALID_DEVICE_ORDINAL; + int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; - do { - - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; - if (device == INVALID_DEVICE_ORDINAL) - device = entrypoint_device; - - // Round up to nearest bin size - unsigned int bin; - size_t bin_bytes; - NearestPowerOf(bin, bin_bytes, bin_growth, bytes); - if (bin < min_bin) { - bin = min_bin; - bin_bytes = min_bin_bytes; - } - - // Check if bin is greater than our maximum bin - if (bin > max_bin) - { - // Allocate the request exactly and give out-of-range bin - bin = (unsigned int) -1; - bin_bytes = bytes; - } - - BlockDescriptor search_key(bin_bytes, bin, device, active_stream); - - // Lock - Lock(&spin_lock); - - // Find the range of freed blocks big enough within the same bin on the same device - CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); - - // Look for freed blocks from the active stream or from other idle streams - bool found = false; - while ((block_itr != cached_blocks.end()) && - (block_itr->device == device) && - (block_itr->bin == search_key.bin)) - { - cudaStream_t prev_stream = block_itr->associated_stream; - if ((active_stream == prev_stream) || (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) - { - // Reuse existing cache block. Insert into live blocks. - found = true; - search_key = *block_itr; - search_key.associated_stream = active_stream; - live_blocks.insert(search_key); - - // Remove from free blocks - cached_blocks.erase(block_itr); - cached_bytes[device].free -= search_key.bytes; - cached_bytes[device].busy += search_key.bytes; - - if (debug) CubLog("\tdevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", - device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) prev_stream); - - break; - } - - block_itr++; - } - + if (device == INVALID_DEVICE_ORDINAL) { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + return error; + device = entrypoint_device; + } + + // Round up to nearest bin size + unsigned int bin; + size_t bin_bytes; + NearestPowerOf(bin, bin_bytes, bin_growth, bytes); + if (bin < min_bin) { + bin = min_bin; + bin_bytes = min_bin_bytes; + } - if (!found) - { - Unlock(&spin_lock); - // Set to specified device - if (device != entrypoint_device) { - if (CubDebug(error = cudaSetDevice(device))) break; - } - - // Allocate - if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break; - if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) break; - - - Lock(&spin_lock); - // Insert into live blocks - live_blocks.insert(search_key); - cached_bytes[device].busy += search_key.bytes; - - if (debug) CubLog("\tdevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", - device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); - } + // Check if bin is greater than our maximum bin + if (bin > max_bin) + { + // Allocate the request exactly and give out-of-range bin + bin = (unsigned int) -1; + bin_bytes = bytes; + } - // Copy device pointer to output parameter - *d_ptr = search_key.d_ptr; - if (debug) CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", - (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy); - - } while(0); + BlockDescriptor search_key(bin_bytes, bin, device, active_stream); - Unlock(&spin_lock); + // Lock while we search + Lock(&spin_lock); - // Attempt to revert back to previous device if necessary - if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) - { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; - } + // Find the range of freed blocks big enough within the same bin on the same device + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + + // Look for freed blocks from the active stream or from other idle streams + bool found = false; + while ( (block_itr != cached_blocks.end()) + && (block_itr->device == device) + && (block_itr->bin == search_key.bin)) { + cudaStream_t prev_stream = block_itr->associated_stream; + if ((active_stream == prev_stream) + || (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) { + // Reuse existing cache block. Insert into live blocks. + found = true; + search_key = *block_itr; + search_key.associated_stream = active_stream; + live_blocks.insert(search_key); + + // Remove from free blocks + cached_blocks.erase(block_itr); + cached_bytes[device].free -= search_key.bytes; + cached_bytes[device].busy += search_key.bytes; + + if (debug) CubLog("\tdevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", + device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) prev_stream); + + break; + } + + block_itr++; + } + // done searching. Unlock. + + Unlock(&spin_lock); + + if (!found) + { + + // Set to specified device. Entrypoint may not be set. + if (device != entrypoint_device) { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + return error; + if (CubDebug(error = cudaSetDevice(device))) return error; + } + + // Allocate + error = cudaMalloc(&search_key.d_ptr, search_key.bytes); + + if (error != cudaSuccess) { + if (debug) CubLog("\tdevice %d failed to allocate %lld bytes for stream %lld", + device, (long long) search_key.bytes, (long long) search_key.associated_stream); + + // if (search_key.bytes < cached_bytes[device]) { + // free all cached memory (for all devices), synchrionize and retry once + cudaDeviceSynchronize(); + cudaThreadSynchronize(); + FreeAllCached(); + cudaDeviceSynchronize(); + cudaThreadSynchronize(); + error = cudaMalloc(&search_key.d_ptr, search_key.bytes); + // } + } + if (CubDebug(error)) + return error; + if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + return error; + + // Insert into live blocks + Lock(&spin_lock); + live_blocks.insert(search_key); + cached_bytes[device].busy += search_key.bytes; + Unlock(&spin_lock); + + if (debug) CubLog("\tdevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", + device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); + + // Attempt to revert back to previous device if necessary + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + } + + // Copy device pointer to output parameter + *d_ptr = search_key.d_ptr; + if (debug) CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy); return error; @@ -515,73 +532,66 @@ struct CachingDeviceAllocator int entrypoint_device = INVALID_DEVICE_ORDINAL; cudaError_t error = cudaSuccess; - if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (device == INVALID_DEVICE_ORDINAL) { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + return error; + device = entrypoint_device; + } - if (device == INVALID_DEVICE_ORDINAL) - device = entrypoint_device; - BlockDescriptor search_key(d_ptr, device); - - do { - - // Set to specified device - if (device != entrypoint_device) { - if (CubDebug(error = cudaSetDevice(device))) break; - } - - // Lock - Lock(&spin_lock); - - // Find corresponding block descriptor - BusyBlocks::iterator block_itr = live_blocks.find(search_key); - if (block_itr == live_blocks.end()) - { - // Cannot find pointer - if (CubDebug(error = cudaErrorUnknown)) break; - } - else - { - // Remove from live blocks - search_key = *block_itr; - live_blocks.erase(block_itr); - - cached_bytes[device].busy -= search_key.bytes; - - // Check if we should keep the returned allocation - if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes) - { - // Signal the event in the associated stream - if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) break; - - // Insert returned allocation into free blocks - cached_blocks.insert(search_key); - cached_bytes[device].free += search_key.bytes; - - if (debug) CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", - device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy); - } - else { - // This means : actually delete the block after we release the lock - d_ptr = 0; - } - } - } while (0); + bool recached = false; + // Lock + Lock(&spin_lock); + + // Find corresponding block descriptor + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes[device].busy -= search_key.bytes; + + // Check if we should keep the returned allocation + if (cached_bytes[device].free + search_key.bytes <= max_cached_bytes) + { + // Insert returned allocation into free blocks + cached_blocks.insert(search_key); + cached_bytes[device].free += search_key.bytes; + recached = true; + if (debug) { + CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), + (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy); + } + } + } + Unlock(&spin_lock); - if (!d_ptr) - { - // Free device memory + if (recached) { + // Signal the event in the associated stream + if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) + return error; + } else { + // Set to specified device. Entrypoint may not be set. + if (device != entrypoint_device) { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + return error; + if (CubDebug(error = cudaSetDevice(device))) return error; + } + + // Actually free device memory if (CubDebug(error = cudaFree(d_ptr))) return error; if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; - + if (debug) CubLog("\tdevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].busy); - } - - if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) - { - if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } } return error; @@ -654,7 +664,7 @@ struct CachingDeviceAllocator if (debug) CubLog("\tdevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].free); } - + Unlock(&spin_lock); // Attempt to revert back to entry-point device if necessary diff --git a/src/caffe/util/gpu_memory.cpp b/src/caffe/util/gpu_memory.cpp index acbf60c4233..7a219418b61 100644 --- a/src/caffe/util/gpu_memory.cpp +++ b/src/caffe/util/gpu_memory.cpp @@ -162,9 +162,9 @@ namespace caffe { // if you are paranoid, that doesn't mean they are not after you :) delete cubAlloc; - cubAlloc = new cub::CachingDeviceAllocator( 8, // defaults - 3, - 7, + cubAlloc = new cub::CachingDeviceAllocator( 2, // defaults + 6, + 16, poolsize_, false, debug_);