4949#include <linux/page_compat.h>
5050#include <linux/notifier.h>
5151#include <linux/memory.h>
52+ #include <linux/version.h>
5253#endif
5354#include <sys/callb.h>
5455#include <sys/kstat.h>
5859#include <sys/trace_zfs.h>
5960#include <sys/aggsum.h>
6061
#ifdef _KERNEL
/*
 * This is a limit on how many pages the ARC shrinker makes available for
 * eviction in response to one page allocation attempt.  Note that in
 * practice the kernel may ask for far more than this per allocation; the
 * limit bounds how much eviction work a single shrinker call can trigger.
 * (Part of the original rationale was elided in this view.)
 *
 * See also the comment in arc_shrinker_count().
 * Set to 0 to disable limit.
 */
static int zfs_arc_shrinker_limit = 10000;

/*
 * Relative cost of ARC eviction, AKA number of seeks needed to restore evicted
 * page.  Bigger values make ARC more precious and evictions smaller comparing
 * to other kernel subsystems.  Value of 4 means parity with page cache,
 * according to my reading of kernel's do_shrink_slab() and other code.
 */
static int zfs_arc_shrinker_seeks = DEFAULT_SEEKS;

#ifdef CONFIG_MEMORY_HOTPLUG
/* Callback used to react to memory hot-add/remove events. */
static struct notifier_block arc_hotplug_callback_mem_nb;
#endif
#endif
8091
8192/*
8293 * Return a default max arc size based on the amount of physical memory.
@@ -170,22 +181,7 @@ static unsigned long
170181arc_shrinker_count (struct shrinker * shrink , struct shrink_control * sc )
171182{
172183 /*
173- * __GFP_FS won't be set if we are called from ZFS code (see
174- * kmem_flags_convert(), which removes it). To avoid a deadlock, we
175- * don't allow evicting in this case. We return 0 rather than
176- * SHRINK_STOP so that the shrinker logic doesn't accumulate a
177- * deficit against us.
178- */
179- if (!(sc -> gfp_mask & __GFP_FS )) {
180- return (0 );
181- }
182-
183- /*
184- * This code is reached in the "direct reclaim" case, where the
185- * kernel (outside ZFS) is trying to allocate a page, and the system
186- * is low on memory.
187- *
188- * The kernel's shrinker code doesn't understand how many pages the
184+ * The kernel's shrinker code may not understand how many pages the
189185 * ARC's callback actually frees, so it may ask the ARC to shrink a
190186 * lot for one page allocation. This is problematic because it may
191187 * take a long time, thus delaying the page allocation, and because
@@ -204,40 +200,44 @@ arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
204200 *
205201 * See also the comment above zfs_arc_shrinker_limit.
206202 */
203+ int64_t can_free = btop (arc_evictable_memory ());
207204 int64_t limit = zfs_arc_shrinker_limit != 0 ?
208205 zfs_arc_shrinker_limit : INT64_MAX ;
209- return (MIN (limit , btop (( int64_t ) arc_evictable_memory ()) ));
206+ return (MIN (can_free , limit ));
210207}
211208
212209static unsigned long
213210arc_shrinker_scan (struct shrinker * shrink , struct shrink_control * sc )
214211{
215- ASSERT ((sc -> gfp_mask & __GFP_FS ) != 0 );
216-
217212 /* The arc is considered warm once reclaim has occurred */
218213 if (unlikely (arc_warm == B_FALSE ))
219214 arc_warm = B_TRUE ;
220215
216+ /*
217+ * We are experiencing memory pressure which the arc_evict_zthr was
218+ * unable to keep up with. Set arc_no_grow to briefly pause ARC
219+ * growth to avoid compounding the memory pressure.
220+ */
221+ arc_no_grow = B_TRUE ;
222+
221223 /*
222224 * Evict the requested number of pages by reducing arc_c and waiting
223- * for the requested amount of data to be evicted.
225+ * for the requested amount of data to be evicted. To avoid deadlock
226+ * do not wait for eviction if we may be called from ZFS itself (see
227+ * kmem_flags_convert() removing __GFP_FS). It may cause excessive
228+ * eviction later if many evictions are accumulated, but just skipping
229+ * the eviction is not good either if most of memory is used by ARC.
224230 */
225- arc_reduce_target_size (ptob (sc -> nr_to_scan ));
226- arc_wait_for_eviction (ptob (sc -> nr_to_scan ), B_FALSE );
231+ uint64_t to_free = arc_reduce_target_size (ptob (sc -> nr_to_scan ));
232+ if (sc -> gfp_mask & __GFP_FS )
233+ arc_wait_for_eviction (to_free , B_FALSE , B_FALSE );
227234 if (current -> reclaim_state != NULL )
228235#ifdef HAVE_RECLAIM_STATE_RECLAIMED
229- current -> reclaim_state -> reclaimed += sc -> nr_to_scan ;
236+ current -> reclaim_state -> reclaimed += btop ( to_free ) ;
230237#else
231- current -> reclaim_state -> reclaimed_slab += sc -> nr_to_scan ;
238+ current -> reclaim_state -> reclaimed_slab += btop ( to_free ) ;
232239#endif
233240
234- /*
235- * We are experiencing memory pressure which the arc_evict_zthr was
236- * unable to keep up with. Set arc_no_grow to briefly pause arc
237- * growth to avoid compounding the memory pressure.
238- */
239- arc_no_grow = B_TRUE ;
240-
241241 /*
242242 * When direct reclaim is observed it usually indicates a rapid
243243 * increase in memory pressure. This occurs because the kswapd
@@ -250,7 +250,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
250250 ARCSTAT_BUMP (arcstat_memory_direct_count );
251251 }
252252
253- return (sc -> nr_to_scan );
253+ return (btop ( to_free ) );
254254}
255255
256256static struct shrinker * arc_shrinker = NULL ;
@@ -304,9 +304,7 @@ arc_set_sys_free(uint64_t allmem)
304304 * arc_wait_for_eviction() will wait until at least the
305305 * high_wmark_pages() are free (see arc_evict_state_impl()).
306306 *
307- * Note: Even when the system is very low on memory, the kernel's
308- * shrinker code may only ask for one "batch" of pages (512KB) to be
309- * evicted. If concurrent allocations consume these pages, there may
307+ * Note: If concurrent allocations consume these pages, there may
310308 * still be insufficient free pages, and the OOM killer takes action.
311309 *
312310 * By setting arc_sys_free large enough, and having
@@ -318,20 +316,26 @@ arc_set_sys_free(uint64_t allmem)
318316 * It's hard to iterate the zones from a linux kernel module, which
319317 * makes it difficult to determine the watermark dynamically. Instead
320318 * we compute the maximum high watermark for this system, based
321- * on the amount of memory, assuming default parameters on Linux kernel
322- * 5.3.
319+ * on the amount of memory, using the same method as the kernel uses
320+ * to calculate its internal `min_free_kbytes` variable. See
321+ * torvalds/linux@ee8eb9a5fe86 for the change in the upper clamp value
322+ * from 64M to 256M.
323323 */
324324
325325 /*
326326 * Base wmark_low is 4 * the square root of Kbytes of RAM.
327327 */
328- long wmark = 4 * int_sqrt (allmem / 1024 ) * 1024 ;
328+ long wmark = int_sqrt (allmem / 1024 * 16 ) * 1024 ;
329329
330330 /*
331- * Clamp to between 128K and 64MB.
331+ * Clamp to between 128K and 256/ 64MB.
332332 */
333333 wmark = MAX (wmark , 128 * 1024 );
334+ #if LINUX_VERSION_CODE >= KERNEL_VERSION (5 , 7 , 0 )
335+ wmark = MIN (wmark , 256 * 1024 * 1024 );
336+ #else
334337 wmark = MIN (wmark , 64 * 1024 * 1024 );
338+ #endif
335339
336340 /*
337341 * watermark_boost can increase the wmark by up to 150%.
@@ -357,7 +361,7 @@ arc_lowmem_init(void)
357361 * swapping out pages when it is preferable to shrink the arc.
358362 */
359363 arc_shrinker = spl_register_shrinker ("zfs-arc-shrinker" ,
360- arc_shrinker_count , arc_shrinker_scan , DEFAULT_SEEKS );
364+ arc_shrinker_count , arc_shrinker_scan , zfs_arc_shrinker_seeks );
361365 VERIFY (arc_shrinker );
362366
363367 arc_set_sys_free (allmem );
@@ -500,3 +504,5 @@ arc_unregister_hotplug(void)
500504
501505ZFS_MODULE_PARAM (zfs_arc , zfs_arc_ , shrinker_limit , INT , ZMOD_RW ,
502506 "Limit on number of pages that ARC shrinker can reclaim at once" );
507+ ZFS_MODULE_PARAM (zfs_arc , zfs_arc_ , shrinker_seeks , INT , ZMOD_RD ,
508+ "Relative cost of ARC eviction vs other kernel subsystems" );