Skip to content

Commit 55427ad

Browse files
authored
Several improvements to ARC shrinking (#16197)
- When receiving a memory pressure signal from the OS, be more strict in trying to free some memory. Otherwise the kernel may come again and request much more. Return as the result how much arc_c was actually reduced due to this request, which may be less than requested. - On Linux, when receiving a direct reclaim from some file system (that may be ZFS), instead of ignoring the request completely, just shrink the ARC, but do not wait for eviction. Waiting there may cause a deadlock. Ignoring it as before may put extra pressure on other caches and/or swap, and cause OOM if nothing helps. Not waiting may result in more ARC being evicted later, and may be too late if the OOM killer activates right now, but I hope it is better than doing nothing at all. - On Linux, set arc_no_grow before waiting for reclaim, not after, or the ARC may grow back while we are waiting. - On Linux, add a new parameter zfs_arc_shrinker_seeks to balance the ARC eviction cost relative to the page cache and other subsystems. - Slightly update the Linux arc_set_sys_free() math for new kernels. Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Reviewed-by: Rob Norris <rob.norris@klarasystems.com> Reviewed-by: Tony Hutter <hutter2@llnl.gov>
1 parent c7ada64 commit 55427ad

File tree

5 files changed

+123
-94
lines changed

5 files changed

+123
-94
lines changed

include/sys/arc_impl.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1058,10 +1058,10 @@ extern uint_t arc_lotsfree_percent;
10581058
extern uint64_t zfs_arc_min;
10591059
extern uint64_t zfs_arc_max;
10601060

1061-
extern void arc_reduce_target_size(int64_t to_free);
1061+
extern uint64_t arc_reduce_target_size(uint64_t to_free);
10621062
extern boolean_t arc_reclaim_needed(void);
10631063
extern void arc_kmem_reap_soon(void);
1064-
extern void arc_wait_for_eviction(uint64_t, boolean_t);
1064+
extern void arc_wait_for_eviction(uint64_t, boolean_t, boolean_t);
10651065

10661066
extern void arc_lowmem_init(void);
10671067
extern void arc_lowmem_fini(void);

man/man4/zfs.4

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -831,6 +831,13 @@ even with a small average compressed block size of ~8 KiB.
831831
The parameter can be set to 0 (zero) to disable the limit,
832832
and only applies on Linux.
833833
.
834+
.It Sy zfs_arc_shrinker_seeks Ns = Ns Sy 2 Pq int
835+
Relative cost of ARC eviction on Linux, AKA number of seeks needed to
836+
restore an evicted page.
837+
Larger values make the ARC more precious and evictions smaller, compared to
838+
other kernel subsystems.
839+
Value of 4 means parity with page cache.
840+
.
834841
.It Sy zfs_arc_sys_free Ns = Ns Sy 0 Ns B Pq u64
835842
The target number of bytes the ARC should leave as free memory on the system.
836843
If zero, equivalent to the bigger of

module/os/freebsd/zfs/arc_os.c

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -149,26 +149,25 @@ static eventhandler_tag arc_event_lowmem = NULL;
149149
static void
150150
arc_lowmem(void *arg __unused, int howto __unused)
151151
{
152-
int64_t free_memory, to_free;
152+
int64_t can_free, free_memory, to_free;
153153

154154
arc_no_grow = B_TRUE;
155155
arc_warm = B_TRUE;
156156
arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
157+
157158
free_memory = arc_available_memory();
158-
int64_t can_free = arc_c - arc_c_min;
159-
if (can_free <= 0)
160-
return;
161-
to_free = (can_free >> arc_shrink_shift) - MIN(free_memory, 0);
159+
can_free = arc_c - arc_c_min;
160+
to_free = (MAX(can_free, 0) >> arc_shrink_shift) - MIN(free_memory, 0);
162161
DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
163-
arc_reduce_target_size(to_free);
162+
to_free = arc_reduce_target_size(to_free);
164163

165164
/*
166165
* It is unsafe to block here in arbitrary threads, because we can come
167166
* here from ARC itself and may hold ARC locks and thus risk a deadlock
168167
* with ARC reclaim thread.
169168
*/
170169
if (curproc == pageproc)
171-
arc_wait_for_eviction(to_free, B_FALSE);
170+
arc_wait_for_eviction(to_free, B_FALSE, B_FALSE);
172171
}
173172

174173
void

module/os/linux/zfs/arc_os.c

Lines changed: 47 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
#include <linux/page_compat.h>
5050
#include <linux/notifier.h>
5151
#include <linux/memory.h>
52+
#include <linux/version.h>
5253
#endif
5354
#include <sys/callb.h>
5455
#include <sys/kstat.h>
@@ -58,6 +59,7 @@
5859
#include <sys/trace_zfs.h>
5960
#include <sys/aggsum.h>
6061

62+
#ifdef _KERNEL
6163
/*
6264
* This is a limit on how many pages the ARC shrinker makes available for
6365
* eviction in response to one page allocation attempt. Note that in
@@ -72,11 +74,20 @@
7274
* See also the comment in arc_shrinker_count().
7375
* Set to 0 to disable limit.
7476
*/
75-
int zfs_arc_shrinker_limit = 10000;
77+
static int zfs_arc_shrinker_limit = 10000;
78+
79+
/*
80+
* Relative cost of ARC eviction, AKA number of seeks needed to restore evicted
81+
* page. Bigger values make ARC more precious and evictions smaller compared
82+
* to other kernel subsystems. Value of 4 means parity with page cache,
83+
* according to my reading of kernel's do_shrink_slab() and other code.
84+
*/
85+
static int zfs_arc_shrinker_seeks = DEFAULT_SEEKS;
7686

7787
#ifdef CONFIG_MEMORY_HOTPLUG
7888
static struct notifier_block arc_hotplug_callback_mem_nb;
7989
#endif
90+
#endif
8091

8192
/*
8293
* Return a default max arc size based on the amount of physical memory.
@@ -170,22 +181,7 @@ static unsigned long
170181
arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
171182
{
172183
/*
173-
* __GFP_FS won't be set if we are called from ZFS code (see
174-
* kmem_flags_convert(), which removes it). To avoid a deadlock, we
175-
* don't allow evicting in this case. We return 0 rather than
176-
* SHRINK_STOP so that the shrinker logic doesn't accumulate a
177-
* deficit against us.
178-
*/
179-
if (!(sc->gfp_mask & __GFP_FS)) {
180-
return (0);
181-
}
182-
183-
/*
184-
* This code is reached in the "direct reclaim" case, where the
185-
* kernel (outside ZFS) is trying to allocate a page, and the system
186-
* is low on memory.
187-
*
188-
* The kernel's shrinker code doesn't understand how many pages the
184+
* The kernel's shrinker code may not understand how many pages the
189185
* ARC's callback actually frees, so it may ask the ARC to shrink a
190186
* lot for one page allocation. This is problematic because it may
191187
* take a long time, thus delaying the page allocation, and because
@@ -204,40 +200,44 @@ arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
204200
*
205201
* See also the comment above zfs_arc_shrinker_limit.
206202
*/
203+
int64_t can_free = btop(arc_evictable_memory());
207204
int64_t limit = zfs_arc_shrinker_limit != 0 ?
208205
zfs_arc_shrinker_limit : INT64_MAX;
209-
return (MIN(limit, btop((int64_t)arc_evictable_memory())));
206+
return (MIN(can_free, limit));
210207
}
211208

212209
static unsigned long
213210
arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
214211
{
215-
ASSERT((sc->gfp_mask & __GFP_FS) != 0);
216-
217212
/* The arc is considered warm once reclaim has occurred */
218213
if (unlikely(arc_warm == B_FALSE))
219214
arc_warm = B_TRUE;
220215

216+
/*
217+
* We are experiencing memory pressure which the arc_evict_zthr was
218+
* unable to keep up with. Set arc_no_grow to briefly pause ARC
219+
* growth to avoid compounding the memory pressure.
220+
*/
221+
arc_no_grow = B_TRUE;
222+
221223
/*
222224
* Evict the requested number of pages by reducing arc_c and waiting
223-
* for the requested amount of data to be evicted.
225+
* for the requested amount of data to be evicted. To avoid deadlock
226+
* do not wait for eviction if we may be called from ZFS itself (see
227+
* kmem_flags_convert() removing __GFP_FS). It may cause excessive
228+
* eviction later if many evictions are accumulated, but just skipping
229+
* the eviction is not good either if most of memory is used by ARC.
224230
*/
225-
arc_reduce_target_size(ptob(sc->nr_to_scan));
226-
arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE);
231+
uint64_t to_free = arc_reduce_target_size(ptob(sc->nr_to_scan));
232+
if (sc->gfp_mask & __GFP_FS)
233+
arc_wait_for_eviction(to_free, B_FALSE, B_FALSE);
227234
if (current->reclaim_state != NULL)
228235
#ifdef HAVE_RECLAIM_STATE_RECLAIMED
229-
current->reclaim_state->reclaimed += sc->nr_to_scan;
236+
current->reclaim_state->reclaimed += btop(to_free);
230237
#else
231-
current->reclaim_state->reclaimed_slab += sc->nr_to_scan;
238+
current->reclaim_state->reclaimed_slab += btop(to_free);
232239
#endif
233240

234-
/*
235-
* We are experiencing memory pressure which the arc_evict_zthr was
236-
* unable to keep up with. Set arc_no_grow to briefly pause arc
237-
* growth to avoid compounding the memory pressure.
238-
*/
239-
arc_no_grow = B_TRUE;
240-
241241
/*
242242
* When direct reclaim is observed it usually indicates a rapid
243243
* increase in memory pressure. This occurs because the kswapd
@@ -250,7 +250,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
250250
ARCSTAT_BUMP(arcstat_memory_direct_count);
251251
}
252252

253-
return (sc->nr_to_scan);
253+
return (btop(to_free));
254254
}
255255

256256
static struct shrinker *arc_shrinker = NULL;
@@ -304,9 +304,7 @@ arc_set_sys_free(uint64_t allmem)
304304
* arc_wait_for_eviction() will wait until at least the
305305
* high_wmark_pages() are free (see arc_evict_state_impl()).
306306
*
307-
* Note: Even when the system is very low on memory, the kernel's
308-
* shrinker code may only ask for one "batch" of pages (512KB) to be
309-
* evicted. If concurrent allocations consume these pages, there may
307+
* Note: If concurrent allocations consume these pages, there may
310308
* still be insufficient free pages, and the OOM killer takes action.
311309
*
312310
* By setting arc_sys_free large enough, and having
@@ -318,20 +316,26 @@ arc_set_sys_free(uint64_t allmem)
318316
* It's hard to iterate the zones from a linux kernel module, which
319317
* makes it difficult to determine the watermark dynamically. Instead
320318
* we compute the maximum high watermark for this system, based
321-
* on the amount of memory, assuming default parameters on Linux kernel
322-
* 5.3.
319+
* on the amount of memory, using the same method as the kernel uses
320+
* to calculate its internal `min_free_kbytes` variable. See
321+
* torvalds/linux@ee8eb9a5fe86 for the change in the upper clamp value
322+
* from 64M to 256M.
323323
*/
324324

325325
/*
326326
* Base wmark_low is 4 * the square root of Kbytes of RAM.
327327
*/
328-
long wmark = 4 * int_sqrt(allmem/1024) * 1024;
328+
long wmark = int_sqrt(allmem / 1024 * 16) * 1024;
329329

330330
/*
331-
* Clamp to between 128K and 64MB.
331+
* Clamp to between 128K and 256MB (64MB on kernels older than 5.7).
332332
*/
333333
wmark = MAX(wmark, 128 * 1024);
334+
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 7, 0)
335+
wmark = MIN(wmark, 256 * 1024 * 1024);
336+
#else
334337
wmark = MIN(wmark, 64 * 1024 * 1024);
338+
#endif
335339

336340
/*
337341
* watermark_boost can increase the wmark by up to 150%.
@@ -357,7 +361,7 @@ arc_lowmem_init(void)
357361
* swapping out pages when it is preferable to shrink the arc.
358362
*/
359363
arc_shrinker = spl_register_shrinker("zfs-arc-shrinker",
360-
arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS);
364+
arc_shrinker_count, arc_shrinker_scan, zfs_arc_shrinker_seeks);
361365
VERIFY(arc_shrinker);
362366

363367
arc_set_sys_free(allmem);
@@ -500,3 +504,5 @@ arc_unregister_hotplug(void)
500504

501505
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW,
502506
"Limit on number of pages that ARC shrinker can reclaim at once");
507+
ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_seeks, INT, ZMOD_RD,
508+
"Relative cost of ARC eviction vs other kernel subsystems");

0 commit comments

Comments
 (0)