Skip to content

Commit

Permalink
add sample gpu memory pool
Browse files Browse the repository at this point in the history
  • Loading branch information
qingshui committed Dec 15, 2021
1 parent b4e0903 commit 30310f1
Show file tree
Hide file tree
Showing 6 changed files with 242 additions and 7 deletions.
24 changes: 23 additions & 1 deletion paddle/fluid/memory/allocation/allocator_facade.cc
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,23 @@ class AllocatorFacadePrivate {
break;
}

case AllocatorStrategy::kSamplePool: {
InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) {
InitSampleCUDAAllocator(platform::CUDAPlace(dev_id));
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
break;
}

default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"Unsupported allocator strategy: %d", static_cast<int>(strategy)));
Expand All @@ -123,7 +140,8 @@ class AllocatorFacadePrivate {
InitZeroSizeAllocators();
InitSystemAllocators();

if (FLAGS_gpu_allocator_retry_time > 0) {
if (strategy != AllocatorStrategy::kSamplePool &&
FLAGS_gpu_allocator_retry_time > 0) {
WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time);
}

Expand Down Expand Up @@ -188,6 +206,10 @@ class AllocatorFacadePrivate {
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
cuda_allocator, platform::GpuMinChunkSize());
}
void InitSampleCUDAAllocator(platform::CUDAPlace p) {
auto cuda_allocator = std::make_shared<CUDAAllocator>(p);
allocators_[p] = std::make_shared<SampleAllocator>(cuda_allocator);
}
#endif

#ifdef PADDLE_WITH_XPU
Expand Down
4 changes: 3 additions & 1 deletion paddle/fluid/memory/allocation/allocator_strategy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ static AllocatorStrategy GetStrategyFromFlag() {
if (FLAGS_allocator_strategy == "thread_local") {
return AllocatorStrategy::kThreadLocal;
}

if (FLAGS_allocator_strategy == "sample_pool") {
return AllocatorStrategy::kSamplePool;
}
PADDLE_THROW(platform::errors::InvalidArgument(
"Unsupported allocator strategy: %s, condicates are naive_best_fit, "
"auto_growth or thread_local.",
Expand Down
7 changes: 6 additions & 1 deletion paddle/fluid/memory/allocation/allocator_strategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@ namespace paddle {
namespace memory {
namespace allocation {

enum class AllocatorStrategy { kNaiveBestFit, kAutoGrowth, kThreadLocal };
enum class AllocatorStrategy {
kNaiveBestFit,
kAutoGrowth,
kThreadLocal,
kSamplePool
};

extern AllocatorStrategy GetAllocatorStrategy();

Expand Down
144 changes: 144 additions & 0 deletions paddle/fluid/memory/allocation/retry_allocator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@

#include "paddle/fluid/memory/allocation/retry_allocator.h"

DEFINE_int32(sample_max_bin_bytes, 2048, "sample max bytes in pool MB");
DEFINE_int32(sample_bin_growth, 2, "sample growth memory by bin");
DEFINE_int32(sample_min_bin, 8, "sample min bin number");
DEFINE_bool(sample_debug_info, false, "sample print debug info");

namespace paddle {
namespace memory {
namespace allocation {
Expand Down Expand Up @@ -94,6 +99,145 @@ Allocation* RetryAllocator::AllocateImpl(size_t size) {
}
}

static const unsigned int INVALID_BIN = (unsigned int)-1;
SampleAllocator::BlockDescriptor::BlockDescriptor(Allocation* ptr)
: d_ptr(ptr), bytes(0), used(0), bin(INVALID_BIN) {}
SampleAllocator::BlockDescriptor::BlockDescriptor()
: d_ptr(NULL), bytes(0), used(0), bin(INVALID_BIN) {}
bool SampleAllocator::BlockDescriptor::ptrcompare(const BlockDescriptor& a,
const BlockDescriptor& b) {
return (a.d_ptr < b.d_ptr);
}
bool SampleAllocator::BlockDescriptor::sizecompare(const BlockDescriptor& a,
const BlockDescriptor& b) {
return (a.bytes < b.bytes);
}
SampleAllocator::SampleAllocator(std::shared_ptr<Allocator> allocator)
: allocator_(std::move(allocator)),
bin_growth_(FLAGS_sample_bin_growth),
min_bin_(FLAGS_sample_min_bin),
min_bin_bytes_(pow(bin_growth_, min_bin_)),
max_bin_bytes_(FLAGS_sample_max_bin_bytes * 1024 * 1024),
cached_blocks_(BlockDescriptor::sizecompare),
live_blocks_(BlockDescriptor::ptrcompare) {
PADDLE_ENFORCE_NOT_NULL(
allocator_, platform::errors::InvalidArgument(
"Underlying allocator of SampleAllocator is NULL"));
VLOG(0) << "SampleAllocator init";
}
void SampleAllocator::FreeImpl(Allocation* allocation) {
if (allocation == NULL) {
return;
}
bool recached = false;
BlockDescriptor search_key(allocation);

mutex_.lock();
auto block_itr = live_blocks_.find(search_key);
if (block_itr != live_blocks_.end()) {
search_key = *block_itr;
live_blocks_.erase(block_itr);
cached_bytes_.live -= search_key.bytes;
cached_bytes_.used -= search_key.used;
if (search_key.bin != INVALID_BIN) {
recached = true;
cached_blocks_.insert(search_key);
cached_bytes_.free += search_key.bytes;
}
}
mutex_.unlock();

if (!recached) {
allocator_->Free(allocation);
}

if (FLAGS_sample_debug_info && search_key.bin == INVALID_BIN) {
VLOG(0) << "pool total: " << (cached_bytes_.live >> 20)
<< "MB, used: " << (cached_bytes_.used >> 20) << "MB, free"
<< (cached_bytes_.free >> 20)
<< "MB, free big memory: " << search_key.bytes << " bytes";
}
}
// alloc memory
Allocation* SampleAllocator::AllocateImpl(size_t bytes) {
// Create a block descriptor for the requested allocation
bool found = false;
BlockDescriptor search_key;
search_key.used = bytes;
if (bytes > max_bin_bytes_) {
search_key.bin = INVALID_BIN;
search_key.bytes = bytes;
} else {
if (bytes < min_bin_bytes_) {
search_key.bin = min_bin_;
search_key.bytes = min_bin_bytes_;
} else {
search_key.bin = 0;
search_key.bytes = 1;
while (search_key.bytes < bytes) {
search_key.bytes *= bin_growth_;
++search_key.bin;
}
}
mutex_.lock();
auto block_itr = cached_blocks_.lower_bound(search_key);
if ((block_itr != cached_blocks_.end()) &&
(block_itr->bin == search_key.bin)) {
found = true;
search_key = *block_itr;
search_key.used = bytes;
live_blocks_.insert(search_key);
// Remove from free blocks
cached_bytes_.free -= search_key.bytes;
cached_bytes_.live += search_key.bytes;
cached_bytes_.used += search_key.used;
cached_blocks_.erase(block_itr);
}
mutex_.unlock();
}
if (!found) {
try {
search_key.d_ptr = allocator_->Allocate(search_key.bytes).release();
} catch (BadAlloc&) {
// release all free cache
FreeAllCache();
// cuda malloc
search_key.d_ptr = allocator_->Allocate(search_key.bytes).release();
} catch (...) {
throw;
}
mutex_.lock();
live_blocks_.insert(search_key);
cached_bytes_.live += search_key.bytes;
cached_bytes_.used += search_key.used;
mutex_.unlock();

if (FLAGS_sample_debug_info && search_key.bin == INVALID_BIN) {
VLOG(0) << "pool total: " << (cached_bytes_.live >> 20)
<< "MB, used: " << (cached_bytes_.used >> 20) << "MB, free"
<< (cached_bytes_.free >> 20)
<< "MB, cuda alloc big memory: " << bytes << " bytes";
}
}
return search_key.d_ptr;
}
void SampleAllocator::FreeAllCache(void) {
mutex_.lock();
if (cached_blocks_.empty()) {
mutex_.unlock();
return;
}
while (!cached_blocks_.empty()) {
auto begin = cached_blocks_.begin();
allocator_->Free(begin->d_ptr);
cached_bytes_.free -= begin->bytes;
cached_blocks_.erase(begin);
}
mutex_.unlock();
}

void SampleAllocator::GetMemInfo(TotalBytes* info) { *info = cached_bytes_; }

} // namespace allocation
} // namespace memory
} // namespace paddle
66 changes: 62 additions & 4 deletions paddle/fluid/memory/allocation/retry_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
#include <condition_variable> // NOLINT
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include <utility>

#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/enforce.h"

Expand All @@ -45,9 +45,9 @@ class RetryAllocator : public Allocator {
bool IsAllocThreadSafe() const override { return true; }

protected:
void FreeImpl(Allocation* allocation) override;
Allocation* AllocateImpl(size_t size) override;
uint64_t ReleaseImpl(const platform::Place& place) override {
void FreeImpl(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size) override;
uint64_t ReleaseImpl(const platform::Place &place) override {
return underlying_allocator_->Release(place);
}

Expand All @@ -60,6 +60,64 @@ class RetryAllocator : public Allocator {
std::atomic<size_t> waited_allocate_size_{0};
};

class SampleAllocator : public Allocator {
/**
* Descriptor for device memory allocations
*/
struct BlockDescriptor {
Allocation *d_ptr; // Device pointer
size_t bytes; // Size of allocation in bytes
size_t used; // Real used
unsigned int bin; // Bin enumeration

explicit BlockDescriptor(Allocation *ptr);
BlockDescriptor();
static bool ptrcompare(const BlockDescriptor &a, const BlockDescriptor &b);
static bool sizecompare(const BlockDescriptor &a, const BlockDescriptor &b);
};
// BlockDescriptor comparator function interface
typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
/// Set type for cached blocks (ordered by size)
typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
/// Set type for live blocks (ordered by ptr)
typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;

public:
// Total Bytes
struct TotalBytes {
size_t free = 0;
size_t live = 0;
size_t used = 0;
};
explicit SampleAllocator(std::shared_ptr<Allocator> allocator);
bool IsAllocThreadSafe() const override { return true; }
void GetMemInfo(TotalBytes *info);

protected:
void FreeImpl(Allocation *allocation) override;
Allocation *AllocateImpl(size_t size) override;
uint64_t ReleaseImpl(const platform::Place &place) override {
FreeAllCache();
return allocator_->Release(place);
}
void FreeAllCache(void);

private:
std::shared_ptr<Allocator> allocator_;
std::mutex mutex_;

unsigned int bin_growth_; /// Geometric growth factor for bin-sizes
unsigned int min_bin_; /// Minimum bin enumeration
size_t min_bin_bytes_;
size_t max_bin_bytes_;

TotalBytes cached_bytes_; /// Map of device ordinal to aggregate cached bytes
/// on that device
CachedBlocks
cached_blocks_; /// Set of cached device allocations available for reuse
BusyBlocks live_blocks_;
};

} // namespace allocation
} // namespace memory
} // namespace paddle
4 changes: 4 additions & 0 deletions python/paddle/fluid/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,10 @@ def __bootstrap__():
'call_stack_level',
'sort_sum_gradient',
'max_inplace_grad_add',
'sample_max_bin_bytes',
'sample_bin_growth',
'sample_min_bin',
'sample_debug_info',
]
if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory')
Expand Down

0 comments on commit 30310f1

Please sign in to comment.