
Commit 93d6e3f

Add Decay Range.
1 parent 5b8f8e1 commit 93d6e3f

File tree

src/backend/backend.h
src/backend/decayrange.h

2 files changed: +355 −4 lines changed

src/backend/backend.h

Lines changed: 8 additions & 4 deletions

@@ -3,6 +3,7 @@
 #include "../pal/pal.h"
 #include "commitrange.h"
 #include "commonconfig.h"
+#include "decayrange.h"
 #include "empty_range.h"
 #include "globalrange.h"
 #include "largebuddyrange.h"
@@ -148,9 +149,10 @@ namespace snmalloc
     using GlobalR = GlobalRange<StatsR>;

 # ifdef SNMALLOC_META_PROTECTED
+    using CommittedRange =
+      DecayRange<CommitRange<GlobalR, DefaultPal>, DefaultPal, Pagemap>;
     // Source for object allocations
-    using ObjectRange =
-      LargeBuddyRange<CommitRange<GlobalR, DefaultPal>, 21, 21, Pagemap>;
+    using ObjectRange = LargeBuddyRange<CommittedRange, 21, 21, Pagemap>;
     // Set up protected range for metadata
     using SubR = CommitRange<SubRange<GlobalR, DefaultPal, 6>, DefaultPal>;
     using MetaRange =
@@ -159,8 +161,10 @@ namespace snmalloc
 # else
     // Source for object allocations and metadata
     // No separation between the two
-    using ObjectRange = SmallBuddyRange<
-      LargeBuddyRange<CommitRange<GlobalR, DefaultPal>, 21, 21, Pagemap>>;
+    using CommittedRange =
+      DecayRange<CommitRange<GlobalR, DefaultPal>, DefaultPal, Pagemap>;
+    using ObjectRange =
+      SmallBuddyRange<LargeBuddyRange<CommittedRange, 21, 21, Pagemap>>;
     using GlobalMetaRange = GlobalRange<ObjectRange>;
 # endif
 #endif
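
For context on the backend.h change: the backend composes its address-space management from nested range templates, each of which wraps a parent range behind the same alloc_range/dealloc_range interface plus a State handle, and this commit threads DecayRange between CommitRange and the buddy ranges in both configurations. Below is a minimal, self-contained sketch of that layering pattern; ToyParentRange and PassThroughRange are invented stand-ins for illustration only and omit everything the real ranges do (committing memory, buddy allocation, pagemap bookkeeping).

// Sketch only: toy stand-ins for the range-layering pattern, not snmalloc code.
#include <cstddef>
#include <cstdio>
#include <cstdlib>

// A trivial "parent" range that hands out memory straight from malloc.
class ToyParentRange
{
public:
  class State;

  static constexpr bool Aligned = false;
  static constexpr bool ConcurrencySafe = true;

  void* alloc_range(size_t size)
  {
    return std::malloc(size);
  }

  void dealloc_range(void* base, size_t)
  {
    std::free(base);
  }
};

// Each range exposes a State handle that owns the range and gives -> access.
class ToyParentRange::State
{
  ToyParentRange r{};

public:
  constexpr State() = default;

  ToyParentRange* operator->()
  {
    return &r;
  }
};

// A layer in the same shape as DecayRange/CommitRange: it owns its parent's
// State and forwards requests; a real layer would add caching, commit or
// decommit, or buddy allocation on the way through.
template<typename ParentRange>
class PassThroughRange
{
  typename ParentRange::State parent{};

public:
  class State
  {
    PassThroughRange r{};

  public:
    constexpr State() = default;

    PassThroughRange* operator->()
    {
      return &r;
    }
  };

  static constexpr bool Aligned = ParentRange::Aligned;
  static constexpr bool ConcurrencySafe = false;

  void* alloc_range(size_t size)
  {
    return parent->alloc_range(size);
  }

  void dealloc_range(void* base, size_t size)
  {
    parent->dealloc_range(base, size);
  }
};

int main()
{
  // Compose layers by template nesting, as backend.h does with
  // LargeBuddyRange<DecayRange<CommitRange<...>, ...>, ...>.
  using ObjectRange = PassThroughRange<PassThroughRange<ToyParentRange>>;
  ObjectRange::State range{};

  void* p = range->alloc_range(1024);
  std::printf("allocated %p\n", p);
  range->dealloc_range(p, 1024);
}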

src/backend/decayrange.h

Lines changed: 347 additions & 0 deletions
@@ -0,0 +1,347 @@
#pragma once

#include "../ds/ptrwrap.h"
#include "../pal/pal_ds.h"
#include "largebuddyrange.h"

namespace snmalloc
{
  template<SNMALLOC_CONCEPT(RBRep) Rep>
  class RepList
  {
    uintptr_t head = 0;

    RepList(uintptr_t head) : head(head) {}

  public:
    constexpr RepList() = default;

    [[nodiscard]] bool is_empty() const
    {
      return head == 0;
    }

    RepList get_next()
    {
      SNMALLOC_ASSERT(!is_empty());
      auto next_field = &(Rep::ref(false, head));
      auto next = Rep::get(next_field);
      return {next};
    }

    capptr::Chunk<void> get_capability()
    {
      return capptr::Chunk<void>(reinterpret_cast<void*>(head));
    }

    RepList cons(capptr::Chunk<void> new_head_cap)
    {
      auto new_head = new_head_cap.unsafe_uintptr();
      auto field = &(Rep::ref(false, new_head));
      Rep::set(field, head);
      return {new_head};
    }

    template<typename F>
    void forall(F f)
    {
      auto curr = *this;
      while (!curr.is_empty())
      {
        auto next = curr.get_next();

        f(curr.get_capability());

        curr = next;
      }
    }
  };

  /**
   * Concurrent Stack
   *
   * This stack supports the following clients
   * (push|pop)* || pop_all* || ... || pop_all*
   *
   * That is, a single thread that can do push and pop, and other threads
   * that do pop_all. If pop_all returns a value, it returns all of the
   * stack; however, it may return nullptr if it races with either a push
   * or a pop.
   *
   * The primary use case is single-threaded access, where other threads
   * can attempt to steal all the values.
   */
  template<SNMALLOC_CONCEPT(RBRep) Rep>
  class RepStack
  {
    static constexpr auto empty = RepList<Rep>{};

  private:
    alignas(CACHELINE_SIZE) std::atomic<RepList<Rep>> stack{};

    RepList<Rep> take()
    {
      if (stack.load(std::memory_order_relaxed).is_empty())
        return empty;
      return stack.exchange(empty, std::memory_order_acquire);
    }

    void replace(RepList<Rep> new_head)
    {
      SNMALLOC_ASSERT(stack.load().is_empty());
      stack.store(new_head, std::memory_order_release);
    }

  public:
    constexpr RepStack() = default;

    void push(capptr::Chunk<void> new_head_cap)
    {
      auto old_head = take();
      auto new_head = old_head.cons(new_head_cap);
      replace(new_head);
    }

    capptr::Chunk<void> pop()
    {
      auto old_head = take();
      if (old_head.is_empty())
        return nullptr;

      auto next = old_head.get_next();
      replace(next);

      return old_head.get_capability();
    }

    RepList<Rep> pop_all()
    {
      return take();
    }
  };

  /**
   * This range slowly filters back memory to the parent range.
   * It locally caches memory and after it hasn't been used for some time
   * it goes back to its parent range.
   */
  template<typename ParentRange, typename PAL, typename Pagemap>
  class DecayRange
  {
    /**
     * How many slab sizes can be provided.
     */
    static constexpr size_t NUM_SLAB_SIZES = Pal::address_bits - MIN_CHUNK_BITS;

    /**
     * Number of free stacks per chunk size that each allocator will use.
     * For performance ideally a power of 2. We will return to the central
     * pool anything that has not been used in the last NUM_EPOCHS - 1, where
     * each epoch is separated by DecayMemoryTimerObject::PERIOD.
     * I.e. if period is 500ms and num of epochs is 4, then we will return to
     * the central pool anything not used for the last 1500-2000ms.
     */
    static constexpr size_t NUM_EPOCHS = 4;
    static_assert(bits::is_pow2(NUM_EPOCHS), "Code assumes power of two.");

    /**
     * Stack of ranges that have been returned for reuse.
     */
    ModArray<
      NUM_SLAB_SIZES,
      ModArray<NUM_EPOCHS, RepStack<BuddyChunkRep<Pagemap>>>>
      chunk_stack;

    typename ParentRange::State parent{};

    /**
     * Which is the current epoch to place dealloced chunks, and the
     * first place we look for allocating chunks.
     */
    static inline // alignas(CACHELINE_SIZE)
      std::atomic<size_t>
        epoch{0};

    /**
     * Flag to ensure one-shot registration with the PAL.
     */
    static inline std::atomic_bool registered_timer{false};

    std::atomic_bool registered_local{false};

    /**
     * All activated DecayRanges.
     */
    static inline std::atomic<DecayRange*> all_local{nullptr};

    DecayRange* all_local_next{nullptr};

    static void handle_decay_tick()
    {
      static_assert(
        ParentRange::ConcurrencySafe,
        "Parent must be concurrency safe, as dealloc_range is called here on "
        "potentially another thread's state.");
      auto new_epoch = (epoch + 1) % NUM_EPOCHS;
      // Flush old index for all threads.
      auto curr = all_local.load(std::memory_order_acquire);
      while (curr != nullptr)
      {
        for (size_t sc = 0; sc < NUM_SLAB_SIZES; sc++)
        {
          auto old_stack = curr->chunk_stack[sc][new_epoch].pop_all();

          old_stack.forall([curr, sc](auto cap) {
            curr->parent->dealloc_range(cap, MIN_CHUNK_SIZE << sc);
          });
        }
        curr = curr->all_local_next;
      }

      // Advance current index
      epoch = new_epoch;
    }

    class DecayMemoryTimerObject : public PalTimerObject
    {
      /***
       * Method for callback object to perform lazy decommit.
       */
      static void process(PalTimerObject*)
      {
#ifdef SNMALLOC_TRACING
        message<1024>("DecayRange::handle_decay_tick timer");
#endif
        handle_decay_tick();
      }

      // Specify that we notify the ChunkAllocator every 500ms.
      static constexpr size_t PERIOD = 500;

    public:
      constexpr DecayMemoryTimerObject() : PalTimerObject(&process, PERIOD) {}
    };

    static inline DecayMemoryTimerObject timer_object;

  public:
    class State
    {
      DecayRange commit_range{};

    public:
      constexpr State() = default;

      DecayRange* operator->()
      {
        return &commit_range;
      }
    };

    static constexpr bool Aligned = ParentRange::Aligned;

    static constexpr bool ConcurrencySafe = false;

    constexpr DecayRange() = default;

    capptr::Chunk<void> alloc_range(size_t size)
    {
      // Check local cache

      if constexpr (pal_supports<Time, PAL>)
      {
        auto slab_sizeclass = bits::next_pow2_bits(size) - MIN_CHUNK_BITS;
        // Try local cache of chunks first
        for (size_t e = 0; e < NUM_EPOCHS; e++)
        {
          auto p = chunk_stack[slab_sizeclass][(epoch - e) % NUM_EPOCHS].pop();

          if (p != nullptr)
          {
#ifdef SNMALLOC_TRACING
            message<1024>(
              "DecayRange::alloc_range: returning from local cache: {} on {}",
              address_cast(p), this);
#endif
            return p;
          }
        }
      }

      // Loop to possibly flush all the other local threads caches.
      // Note that flushing passes to the parent range, which may consolidate
      // blocks and thus be able to service this request.
      // Alternatively, we could implement stealing, but that wouldn't
      // be able to consolidate.
      capptr::Chunk<void> result;
      for (auto i = NUM_EPOCHS; i > 0; i--)
      {
        // Nothing in local cache, so allocate from parent.
        result = parent->alloc_range(size);
        if (result != nullptr)
        {
#ifdef SNMALLOC_TRACING
          message<1024>(
            "DecayRange::alloc_range: returning from parent: {} on {}",
            address_cast(result), this);
#endif
          return result;
        }

        // We have run out of memory.
        // Try to free some memory to the parent.
#ifdef SNMALLOC_TRACING
        message<1024>("DecayRange::handle_decay_tick OOM");
#endif
        handle_decay_tick();
      }

      // Last try.
      result = parent->alloc_range(size);

#ifdef SNMALLOC_TRACING
      message<1024>(
        "DecayRange::alloc_range: returning from parent last try: {} on {}",
        address_cast(result), this);
#endif

      return result;
    }

    void dealloc_range(capptr::Chunk<void> base, size_t size)
    {
      if constexpr (!pal_supports<Time, PAL>)
      {
        parent->dealloc_range(base, size);
        return;
      }

      if (!registered_timer.exchange(true))
      {
        // Register with the PAL.
        PAL::register_timer(&timer_object);
      }

      // Check we have registered
      if (!registered_local.exchange(true))
      {
        // Add to the list of local states.
        auto* head = all_local.load();
        do
        {
          all_local_next = head;
        } while (!all_local.compare_exchange_strong(head, this));
      }

      auto slab_sizeclass = bits::next_pow2_bits(size) - MIN_CHUNK_BITS;
      // Add to local cache.
#ifdef SNMALLOC_TRACING
      message<1024>(
        "DecayRange::dealloc_range: returning to local cache: {} on {}",
        address_cast(base), this);
#endif
      chunk_stack[slab_sizeclass][epoch].push(base);
    }
  };
} // namespace snmalloc
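
Two of the pieces above may benefit from a plainer illustration. First, RepList/RepStack implement the "single owner plus stealers" protocol described in the Concurrent Stack comment: the owning thread briefly detaches the whole list for every push and pop, so a concurrent pop_all (here, from the decay timer) either takes everything or observes an empty stack. A minimal sketch of that protocol with ordinary pointers follows; Node and OwnerStack are invented for illustration, whereas the real RepStack threads its links through pagemap entries via BuddyChunkRep.

// Sketch only: toy analogue of RepStack's usage contract, not snmalloc code.
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

struct Node
{
  int value;
  Node* next = nullptr;
};

class OwnerStack
{
  std::atomic<Node*> stack{nullptr};

  // Owner-only: detach the whole list so push/pop never race with pop_all.
  Node* take()
  {
    if (stack.load(std::memory_order_relaxed) == nullptr)
      return nullptr;
    return stack.exchange(nullptr, std::memory_order_acquire);
  }

  void replace(Node* head)
  {
    stack.store(head, std::memory_order_release);
  }

public:
  void push(Node* n) // owner thread only
  {
    Node* head = take();
    n->next = head;
    replace(n);
  }

  Node* pop() // owner thread only
  {
    Node* head = take();
    if (head == nullptr)
      return nullptr;
    replace(head->next);
    return head;
  }

  Node* pop_all() // safe from any thread; may return nullptr on a race
  {
    return stack.exchange(nullptr, std::memory_order_acquire);
  }
};

int main()
{
  OwnerStack s;
  std::vector<Node> nodes(8);
  for (int i = 0; i < 8; i++)
  {
    nodes[i].value = i;
    s.push(&nodes[i]);
  }

  // A "decay tick" style thread steals everything currently cached.
  std::thread stealer([&] {
    Node* stolen = s.pop_all();
    for (Node* c = stolen; c != nullptr; c = c->next)
      std::printf("stolen %d\n", c->value);
  });
  stealer.join();

  // The owner keeps using the stack afterwards.
  if (Node* n = s.pop())
    std::printf("owner popped %d\n", n->value);
  else
    std::printf("owner found the cache empty\n");
}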
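
Second, the decay window itself: dealloc_range pushes a freed chunk onto the stack for the current epoch, and every PERIOD milliseconds the PAL timer flushes the stack that is about to become current before advancing the epoch, so a cached chunk goes back to the parent after between NUM_EPOCHS - 1 and NUM_EPOCHS periods, depending on when within its epoch it was freed. The small self-contained program below works through that window and the power-of-two wrap-around used by the (epoch - e) % NUM_EPOCHS probe in alloc_range; the constants mirror the defaults in this file, but the program itself is illustrative only.

#include <cstddef>
#include <cstdio>

int main()
{
  constexpr size_t NUM_EPOCHS = 4;  // must be a power of two (see the static_assert)
  constexpr size_t PERIOD_MS = 500; // DecayMemoryTimerObject::PERIOD

  // A chunk freed into epoch e is flushed by the tick that makes e current
  // again, i.e. NUM_EPOCHS ticks after e last became current, so it survives
  // in the local cache for between these two bounds.
  constexpr size_t min_ms = (NUM_EPOCHS - 1) * PERIOD_MS; // 1500 ms
  constexpr size_t max_ms = NUM_EPOCHS * PERIOD_MS;       // 2000 ms
  std::printf("unused chunks return to the parent after %zu-%zu ms\n", min_ms, max_ms);

  // alloc_range probes epochs from newest to oldest; with unsigned arithmetic
  // and a power-of-two NUM_EPOCHS, (epoch - e) % NUM_EPOCHS wraps correctly
  // even when e exceeds epoch.
  size_t epoch = 1;
  for (size_t e = 0; e < NUM_EPOCHS; e++)
    std::printf("probe epoch %zu\n", (epoch - e) % NUM_EPOCHS);
}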
