diff --git a/include/os/linux/spl/sys/shrinker.h b/include/os/linux/spl/sys/shrinker.h index e519a527cd18..cc34d8ab1931 100644 --- a/include/os/linux/spl/sys/shrinker.h +++ b/include/os/linux/spl/sys/shrinker.h @@ -84,7 +84,7 @@ __ ## varname ## _wrapper(struct shrinker *shrink, struct shrink_control *sc)\ \ static struct shrinker varname = { \ .shrink = __ ## varname ## _wrapper, \ - .seeks = seek_cost \ + .seeks = seek_cost, \ } #define SHRINK_STOP (-1) @@ -97,7 +97,7 @@ static struct shrinker varname = { \ static struct shrinker varname = { \ .count_objects = countfunc, \ .scan_objects = scanfunc, \ - .seeks = seek_cost \ + .seeks = seek_cost, \ } #else diff --git a/include/os/linux/zfs/sys/trace_arc.h b/include/os/linux/zfs/sys/trace_arc.h index 5ce5b38a3ca3..faf2bd3d56d8 100644 --- a/include/os/linux/zfs/sys/trace_arc.h +++ b/include/os/linux/zfs/sys/trace_arc.h @@ -354,6 +354,41 @@ DEFINE_EVENT(zfs_l2arc_evict_class, name, \ /* END CSTYLED */ DEFINE_L2ARC_EVICT_EVENT(zfs_l2arc__evict); +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * uint64_t, ..., + * uint64_t, ..., + * uint64_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_arc_wait_for_eviction_class, + TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count), + TP_ARGS(amount, arc_evict_count, aew_count), + TP_STRUCT__entry( + __field(uint64_t, amount) + __field(uint64_t, arc_evict_count) + __field(uint64_t, aew_count) + ), + TP_fast_assign( + __entry->amount = amount; + __entry->arc_evict_count = arc_evict_count; + __entry->aew_count = aew_count; + ), + TP_printk("amount %llu arc_evict_count %llu aew_count %llu", + __entry->amount, __entry->arc_evict_count, __entry->aew_count) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(name) \ +DEFINE_EVENT(zfs_arc_wait_for_eviction_class, name, \ + TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count), \ + TP_ARGS(amount,
arc_evict_count, aew_count)) +/* END CSTYLED */ +DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(zfs_arc__wait__for__eviction); + #endif /* _TRACE_ARC_H */ #undef TRACE_INCLUDE_PATH @@ -376,6 +411,7 @@ DEFINE_DTRACE_PROBE1(l2arc__miss); DEFINE_DTRACE_PROBE2(l2arc__read); DEFINE_DTRACE_PROBE2(l2arc__write); DEFINE_DTRACE_PROBE2(l2arc__iodone); +DEFINE_DTRACE_PROBE3(arc__wait__for__eviction); DEFINE_DTRACE_PROBE4(arc__miss); DEFINE_DTRACE_PROBE4(l2arc__evict); diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index eb90d5bc9f4c..bb9163ba7977 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -846,15 +846,11 @@ typedef struct arc_stats { kstat_named_t arcstat_cached_only_in_progress; } arc_stats_t; -typedef enum free_memory_reason_t { - FMR_UNKNOWN, - FMR_NEEDFREE, - FMR_LOTSFREE, - FMR_SWAPFS_MINFREE, - FMR_PAGES_PP_MAXIMUM, - FMR_HEAP_ARENA, - FMR_ZIO_ARENA, -} free_memory_reason_t; +typedef struct arc_evict_waiter { + list_node_t aew_node; + kcondvar_t aew_cv; + uint64_t aew_count; +} arc_evict_waiter_t; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -870,7 +866,6 @@ typedef enum free_memory_reason_t { #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ #define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ -#define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */ extern taskq_t *arc_prune_taskq; extern arc_stats_t arc_stats; @@ -879,10 +874,6 @@ extern boolean_t arc_warm; extern int arc_grow_retry; extern int arc_no_grow_shift; extern int arc_shrink_shift; -extern zthr_t *arc_evict_zthr; -extern kmutex_t arc_evict_lock; -extern kcondvar_t arc_evict_waiters_cv; -extern boolean_t arc_evict_needed; extern kmutex_t arc_prune_mtx; extern list_t arc_prune_list; extern aggsum_t arc_size; @@ -897,6 +888,7 @@ extern void arc_reduce_target_size(int64_t to_free); extern boolean_t arc_reclaim_needed(void); extern void 
arc_kmem_reap_soon(void); extern boolean_t arc_is_overflowing(void); +extern void arc_wait_for_eviction(uint64_t); extern void arc_lowmem_init(void); extern void arc_lowmem_fini(void); diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index c2abd9d805a3..c209acbe16e6 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -861,6 +861,23 @@ increased to reduce the memory footprint. Default value: \fB8192\fR. .RE +.sp +.ne 2 +.na +\fBzfs_arc_eviction_pct\fR (int) +.ad +.RS 12n +When \fBarc_is_overflowing()\fR, \fBarc_get_data_impl()\fR waits for this +percent of the requested amount of data to be evicted. For example, by +default for every 2KB that's evicted, 1KB of it may be "reused" by a new +allocation. Since this is above 100%, it ensures that progress is made +towards getting \fBarc_size\fR under \fBarc_c\fR. Since this is finite, it +ensures that allocations can still happen, even during the potentially long +time that \fBarc_size\fR is more than \fBarc_c\fR. +.sp +Default value: \fB200\fR. +.RE + .sp .ne 2 .na @@ -1148,6 +1165,29 @@ only operates during memory pressure/reclaim. Default value: \fB0\fR% (disabled). .RE +.sp +.ne 2 +.na +\fBzfs_arc_shrinker_limit\fR (int) +.ad +.RS 12n +This is a limit on how many pages the ARC shrinker makes available for +eviction in response to one page allocation attempt. Note that in +practice, the kernel's shrinker can ask us to evict up to about 4x this +for one allocation attempt. +.sp +The default limit of 10,000 (in practice, 160MB per allocation attempt with +4K pages) limits the amount of time spent attempting to reclaim ARC memory to +less than 100ms per allocation attempt, even with a small average compressed +block size of ~8KB. +.sp +The parameter can be set to 0 (zero) to disable the limit. +.sp +This parameter only applies on Linux. +.sp +Default value: \fB10,000\fR. 
+.RE + .sp .ne 2 .na diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c index 554896d856a3..5f4b5df4a99a 100644 --- a/module/os/freebsd/zfs/arc_os.c +++ b/module/os/freebsd/zfs/arc_os.c @@ -52,9 +52,6 @@ extern struct vfsops zfs_vfsops; uint_t zfs_arc_free_target = 0; -int64_t last_free_memory; -free_memory_reason_t last_free_reason; - static void arc_free_target_init(void *unused __unused) { @@ -100,7 +97,6 @@ arc_available_memory(void) { int64_t lowest = INT64_MAX; int64_t n __unused; - free_memory_reason_t r = FMR_UNKNOWN; /* * Cooperate with pagedaemon when it's time for it to scan @@ -109,7 +105,6 @@ arc_available_memory(void) n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); if (n < lowest) { lowest = n; - r = FMR_LOTSFREE; } #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) /* @@ -126,13 +121,10 @@ arc_available_memory(void) n = uma_avail() - (long)(uma_limit() / 4); if (n < lowest) { lowest = n; - r = FMR_HEAP_ARENA; } #endif - last_free_memory = lowest; - last_free_reason = r; - DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r); + DTRACE_PROBE1(arc__available_memory, int64_t, lowest); return (lowest); } @@ -223,18 +215,15 @@ arc_lowmem(void *arg __unused, int howto __unused) DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free); arc_reduce_target_size(to_free); - mutex_enter(&arc_evict_lock); - arc_evict_needed = B_TRUE; - zthr_wakeup(arc_evict_zthr); - /* * It is unsafe to block here in arbitrary threads, because we can come * here from ARC itself and may hold ARC locks and thus risk a deadlock * with ARC reclaim thread. 
*/ if (curproc == pageproc) - (void) cv_wait(&arc_evict_waiters_cv, &arc_evict_lock); - mutex_exit(&arc_evict_lock); + arc_wait_for_eviction(to_free); + else + arc_wait_for_eviction(0); } void diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index 9c3a6a4e2987..92f9bae8ccd3 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -57,8 +57,22 @@ #include #include -int64_t last_free_memory; -free_memory_reason_t last_free_reason; +/* + * This is a limit on how many pages the ARC shrinker makes available for + * eviction in response to one page allocation attempt. Note that in + * practice, the kernel's shrinker can ask us to evict up to about 4x this + * for one allocation attempt. + * + * The default limit of 10,000 (in practice, 160MB per allocation attempt + * with 4K pages) limits the amount of time spent attempting to reclaim ARC + * memory to less than 100ms per allocation attempt, even with a small + * average compressed block size of ~8KB. + * + * See also the comment in arc_shrinker_count(). + * Set to 0 to disable limit. + */ +int zfs_arc_shrinker_limit = 10000; + /* * Return a default max arc size based on the amount of physical memory. @@ -104,16 +118,6 @@ arc_free_memory(void) #endif /* CONFIG_HIGHMEM */ } -/* - * Additional reserve of pages for pp_reserve. - */ -int64_t arc_pages_pp_reserve = 64; - -/* - * Additional reserve of pages for swapfs. - */ -int64_t arc_swapfs_reserve = 64; - /* * Return the amount of memory that can be consumed before reclaim will be * needed. 
Positive if there is sufficient free memory, negative indicates @@ -122,25 +126,7 @@ int64_t arc_swapfs_reserve = 64; int64_t arc_available_memory(void) { - int64_t lowest = INT64_MAX; - free_memory_reason_t r = FMR_UNKNOWN; - int64_t n; - - if (arc_need_free > 0) { - lowest = -arc_need_free; - r = FMR_NEEDFREE; - } - - n = arc_free_memory() - arc_sys_free - arc_need_free; - if (n < lowest) { - lowest = n; - r = FMR_LOTSFREE; - } - - last_free_memory = lowest; - last_free_reason = r; - - return (lowest); + return (arc_free_memory() - arc_sys_free); } static uint64_t @@ -174,84 +160,84 @@ arc_evictable_memory(void) static unsigned long arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { - return (btop((int64_t)arc_evictable_memory())); + /* + * __GFP_FS won't be set if we are called from ZFS code (see + * kmem_flags_convert(), which removes it). To avoid a deadlock, we + * don't allow evicting in this case. We return 0 rather than + * SHRINK_STOP so that the shrinker logic doesn't accumulate a + * deficit against us. + */ + if (!(sc->gfp_mask & __GFP_FS)) { + return (0); + } + + /* + * This code is reached in the "direct reclaim" case, where the + * kernel (outside ZFS) is trying to allocate a page, and the system + * is low on memory. + * + * The kernel's shrinker code doesn't understand how many pages the + * ARC's callback actually frees, so it may ask the ARC to shrink a + * lot for one page allocation. This is problematic because it may + * take a long time, thus delaying the page allocation, and because + * it may force the ARC to unnecessarily shrink very small. + * + * Therefore, we limit the amount of data that we say is evictable, + * which limits the amount that the shrinker will ask us to evict for + * one page allocation attempt. + * + * In practice, we may be asked to shrink 4x the limit to satisfy one + * page allocation, before the kernel's shrinker code gives up on us. 
+ * When that happens, we rely on the kernel code to find the pages + * that we freed before invoking the OOM killer. This happens in + * __alloc_pages_slowpath(), which retries and finds the pages we + * freed when it calls get_page_from_freelist(). + * + * See also the comment above zfs_arc_shrinker_limit. + */ + int64_t limit = zfs_arc_shrinker_limit != 0 ? + zfs_arc_shrinker_limit : INT64_MAX; + return (MIN(limit, btop((int64_t)arc_evictable_memory()))); } static unsigned long arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) { - int64_t pages; + ASSERT((sc->gfp_mask & __GFP_FS) != 0); /* The arc is considered warm once reclaim has occurred */ if (unlikely(arc_warm == B_FALSE)) arc_warm = B_TRUE; - /* Return the potential number of reclaimable pages */ - pages = btop((int64_t)arc_evictable_memory()); - - /* Not allowed to perform filesystem reclaim */ - if (!(sc->gfp_mask & __GFP_FS)) - return (SHRINK_STOP); - - /* Reclaim in progress */ - if (mutex_tryenter(&arc_evict_lock) == 0) { - ARCSTAT_INCR(arcstat_need_free, ptob(sc->nr_to_scan)); - return (0); - } - - mutex_exit(&arc_evict_lock); + /* + * Evict the requested number of pages by reducing arc_c and waiting + * for the requested amount of data to be evicted. + */ + arc_reduce_target_size(ptob(sc->nr_to_scan)); + arc_wait_for_eviction(ptob(sc->nr_to_scan)); + if (current->reclaim_state != NULL) + current->reclaim_state->reclaimed_slab += sc->nr_to_scan; /* - * Evict the requested number of pages by shrinking arc_c the - * requested amount. + * We are experiencing memory pressure which the arc_evict_zthr was + * unable to keep up with. Set arc_no_grow to briefly pause arc + * growth to avoid compounding the memory pressure. */ - if (pages > 0) { - arc_reduce_target_size(ptob(sc->nr_to_scan)); - - /* - * Repeated calls to the arc shrinker can reduce arc_c - * drastically, potentially all the way to arc_c_min. 
While - * arc_c is below arc_size, ZFS can't process read/write - * requests, because arc_get_data_impl() will block. To - * ensure that arc_c doesn't shrink faster than the evict - * thread can keep up, we wait for eviction here. - */ - mutex_enter(&arc_evict_lock); - if (arc_is_overflowing()) { - arc_evict_needed = B_TRUE; - zthr_wakeup(arc_evict_zthr); - (void) cv_wait(&arc_evict_waiters_cv, - &arc_evict_lock); - } - mutex_exit(&arc_evict_lock); - - if (current_is_kswapd()) - arc_kmem_reap_soon(); - pages = MAX((int64_t)pages - - (int64_t)btop(arc_evictable_memory()), 0); - /* - * We've shrunk what we can, wake up threads. - */ - cv_broadcast(&arc_evict_waiters_cv); - } else - pages = SHRINK_STOP; + arc_no_grow = B_TRUE; /* * When direct reclaim is observed it usually indicates a rapid * increase in memory pressure. This occurs because the kswapd * threads were unable to asynchronously keep enough free memory - * available. In this case set arc_no_grow to briefly pause arc - * growth to avoid compounding the memory pressure. + * available. */ if (current_is_kswapd()) { ARCSTAT_BUMP(arcstat_memory_indirect_count); } else { - arc_no_grow = B_TRUE; - arc_kmem_reap_soon(); ARCSTAT_BUMP(arcstat_memory_direct_count); } - return (pages); + return (sc->nr_to_scan); } SPL_SHRINKER_DECLARE(arc_shrinker, @@ -305,9 +291,56 @@ arc_lowmem_init(void) */ spl_register_shrinker(&arc_shrinker); - /* Set to 1/64 of all memory or a minimum of 512K */ - arc_sys_free = MAX(allmem / 64, (512 * 1024)); - arc_need_free = 0; + /* + * The ARC tries to keep at least this much memory available for the + * system. This gives the ARC time to shrink in response to memory + * pressure, before running completely out of memory and invoking the + * direct-reclaim ARC shrinker. + * + * This should be more than twice high_wmark_pages(), so that + * arc_wait_for_eviction() will wait until at least the + * high_wmark_pages() are free (see arc_evict_state_impl()). 
+ * + * Note: Even when the system is very low on memory, the kernel's + * shrinker code may only ask for one "batch" of pages (512KB) to be + * evicted. If concurrent allocations consume these pages, there may + * still be insufficient free pages, and the OOM killer takes action. + * + * By setting arc_sys_free large enough, and having + * arc_wait_for_eviction() wait until there is at least arc_sys_free/2 + * free memory, it is much less likely that concurrent allocations can + * consume all the memory that was evicted before checking for + * OOM. + * + * It's hard to iterate the zones from a linux kernel module, which + * makes it difficult to determine the watermark dynamically. Instead + * we compute the maximum high watermark for this system, based + * on the amount of memory, assuming default parameters on Linux kernel + * 5.3. + */ + + /* + * Base wmark_low is 4 * the square root of Kbytes of RAM. + */ + long wmark = 4 * int_sqrt(allmem/1024) * 1024; + + /* + * Clamp to between 128K and 64MB. + */ + wmark = MAX(wmark, 128 * 1024); + wmark = MIN(wmark, 64 * 1024 * 1024); + + /* + * watermark_boost can increase the wmark by up to 150%. + */ + wmark += wmark * 150 / 100; + + /* + * arc_sys_free needs to be more than 2x the watermark, because + * arc_wait_for_eviction() waits for half of arc_sys_free. Bump this up + * to 3x to ensure we're above it. 
+ */ + arc_sys_free = wmark * 3 + allmem / 32; } void @@ -348,15 +381,11 @@ int64_t arc_available_memory(void) { int64_t lowest = INT64_MAX; - free_memory_reason_t r = FMR_UNKNOWN; /* Every 100 calls, free a small amount */ if (spa_get_random(100) == 0) lowest = -1024; - last_free_memory = lowest; - last_free_reason = r; - return (lowest); } @@ -429,3 +458,8 @@ arc_prune_async(int64_t adjust) } mutex_exit(&arc_prune_mtx); } + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW, + "Limit on number of pages that ARC shrinker can reclaim at once"); +/* END CSTYLED */ diff --git a/module/zfs/arc.c b/module/zfs/arc.c index dcf710ad14c7..3ec98917d1f7 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -313,17 +313,38 @@ boolean_t arc_watch = B_FALSE; * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves * arc_available_memory(). */ -static zthr_t *arc_reap_zthr; +static zthr_t *arc_reap_zthr; /* * This thread's job is to keep arc_size under arc_c, by calling * arc_evict(), which improves arc_is_overflowing(). */ -zthr_t *arc_evict_zthr; +static zthr_t *arc_evict_zthr; -kmutex_t arc_evict_lock; -kcondvar_t arc_evict_waiters_cv; -boolean_t arc_evict_needed = B_FALSE; +static kmutex_t arc_evict_lock; +static boolean_t arc_evict_needed = B_FALSE; + +/* + * Count of bytes evicted since boot. + */ +static uint64_t arc_evict_count; + +/* + * List of arc_evict_waiter_t's, representing threads waiting for the + * arc_evict_count to reach specific values. + */ +static list_t arc_evict_waiters; + +/* + * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of + * the requested amount of data to be evicted. For example, by default for + * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation. + * Since this is above 100%, it ensures that progress is made towards getting + * arc_size under arc_c. 
Since this is finite, it ensures that allocations + * can still happen, even during the potentially long time that arc_size is + * more than arc_c. + */ +int zfs_arc_eviction_pct = 200; /* * The number of headers to evict in arc_evict_state_impl() before @@ -632,6 +653,7 @@ arc_state_t *arc_mfu; #define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit) #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ +#define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ /* size of all b_rabd's in entire arc */ #define arc_raw_size ARCSTAT(arcstat_raw_size) @@ -3859,6 +3881,20 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) return (bytes_evicted); } +static void +arc_set_need_free(void) +{ + ASSERT(MUTEX_HELD(&arc_evict_lock)); + int64_t remaining = arc_free_memory() - arc_sys_free / 2; + arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters); + if (aw == NULL) { + arc_need_free = MAX(-remaining, 0); + } else { + arc_need_free = + MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count)); + } +} + static uint64_t arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, uint64_t spa, int64_t bytes) @@ -3938,29 +3974,6 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, if (evicted != 0) evict_count++; - /* - * If arc_size isn't overflowing, signal any - * threads that might happen to be waiting. - * - * For each header evicted, we wake up a single - * thread. If we used cv_broadcast, we could - * wake up "too many" threads causing arc_size - * to significantly overflow arc_c; since - * arc_get_data_impl() doesn't check for overflow - * when it's woken up (it doesn't because it's - * possible for the ARC to be overflowing while - * full of un-evictable buffers, and the - * function should proceed in this case). 
- * - * If threads are left sleeping, due to not - * using cv_broadcast here, they will be woken - * up via cv_broadcast in arc_evict_cb() just - * before arc_evict_zthr sleeps. - */ - mutex_enter(&arc_evict_lock); - if (!arc_is_overflowing()) - cv_signal(&arc_evict_waiters_cv); - mutex_exit(&arc_evict_lock); } else { ARCSTAT_BUMP(arcstat_mutex_miss); } @@ -3968,6 +3981,32 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, multilist_sublist_unlock(mls); + /* + * Increment the count of evicted bytes, and wake up any threads that + * are waiting for the count to reach this value. Since the list is + * ordered by ascending aew_count, we pop off the beginning of the + * list until we reach the end, or a waiter that's past the current + * "count". Doing this outside the loop reduces the number of times + * we need to acquire the global arc_evict_lock. + * + * Only wake when there's sufficient free memory in the system + * (specifically, arc_sys_free/2, which by default is a bit more than + * 1/64th of RAM). See the comments in arc_wait_for_eviction(). + */ + mutex_enter(&arc_evict_lock); + arc_evict_count += bytes_evicted; + + if ((int64_t)(arc_free_memory() - arc_sys_free / 2) > 0) { + arc_evict_waiter_t *aw; + while ((aw = list_head(&arc_evict_waiters)) != NULL && + aw->aew_count <= arc_evict_count) { + list_remove(&arc_evict_waiters, aw); + cv_broadcast(&aw->aew_cv); + } + } + arc_set_need_free(); + mutex_exit(&arc_evict_lock); + /* * If the ARC size is reduced from arc_c_max to arc_c_min (especially * if the average cached block is small), eviction can be on-CPU for @@ -4582,7 +4621,16 @@ void arc_reduce_target_size(int64_t to_free) { uint64_t asize = aggsum_value(&arc_size); - uint64_t c = arc_c; + + /* + * All callers want the ARC to actually evict (at least) this much + * memory. Therefore we reduce from the lower of the current size and + * the target size. 
This way, even if arc_c is much higher than + * arc_size (as can be the case after many calls to arc_freed(), we will + * immediately have arc_c < arc_size and therefore the arc_evict_zthr + * will evict. + */ + uint64_t c = MIN(arc_c, asize); if (c > to_free && c - to_free > arc_c_min) { arc_c = c - to_free; @@ -4693,18 +4741,18 @@ arc_evict_cb_check(void *arg, zthr_t *zthr) arc_ksp->ks_update(arc_ksp, KSTAT_READ); /* - * We have to rely on arc_get_data_impl() to tell us when to evict, - * rather than checking if we are overflowing here, so that we are - * sure to not leave arc_get_data_impl() waiting on - * arc_evict_waiters_cv. If we have become "not overflowing" since - * arc_get_data_impl() checked, we need to wake it up. We could - * broadcast the CV here, but arc_get_data_impl() may have not yet - * gone to sleep. We would need to use a mutex to ensure that this - * function doesn't broadcast until arc_get_data_impl() has gone to - * sleep (e.g. the arc_evict_lock). However, the lock ordering of - * such a lock would necessarily be incorrect with respect to the - * zthr_lock, which is held before this function is called, and is - * held by arc_get_data_impl() when it calls zthr_wakeup(). + * We have to rely on arc_wait_for_eviction() to tell us when to + * evict, rather than checking if we are overflowing here, so that we + * are sure to not leave arc_wait_for_eviction() waiting on aew_cv. + * If we have become "not overflowing" since arc_wait_for_eviction() + * checked, we need to wake it up. We could broadcast the CV here, + * but arc_wait_for_eviction() may have not yet gone to sleep. We + * would need to use a mutex to ensure that this function doesn't + * broadcast until arc_wait_for_eviction() has gone to sleep (e.g. + * the arc_evict_lock). 
However, the lock ordering of such a lock + * would necessarily be incorrect with respect to the zthr_lock, + * which is held before this function is called, and is held by + * arc_wait_for_eviction() when it calls zthr_wakeup(). */ return (arc_evict_needed); } @@ -4743,8 +4791,11 @@ arc_evict_cb(void *arg, zthr_t *zthr) * can't evict anything more, so we should wake * arc_get_data_impl() sooner. */ - cv_broadcast(&arc_evict_waiters_cv); - arc_need_free = 0; + arc_evict_waiter_t *aw; + while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) { + cv_broadcast(&aw->aew_cv); + } + arc_set_need_free(); } mutex_exit(&arc_evict_lock); spl_fstrans_unmark(cookie); @@ -4824,9 +4875,6 @@ arc_reap_cb(void *arg, zthr_t *zthr) int64_t to_free = (arc_c >> arc_shrink_shift) - free_memory; if (to_free > 0) { -#ifdef _KERNEL - to_free = MAX(to_free, arc_need_free); -#endif arc_reduce_target_size(to_free); } spl_fstrans_unmark(cookie); @@ -5007,6 +5055,64 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) } } +/* + * Wait for the specified amount of data (in bytes) to be evicted from the + * ARC, and for there to be sufficient free memory in the system. Waiting for + * eviction ensures that the memory used by the ARC decreases. Waiting for + * free memory ensures that the system won't run out of free pages, regardless + * of ARC behavior and settings. See arc_lowmem_init(). 
+ */ +void +arc_wait_for_eviction(uint64_t amount) +{ + mutex_enter(&arc_evict_lock); + if (arc_is_overflowing()) { + arc_evict_needed = B_TRUE; + zthr_wakeup(arc_evict_zthr); + + if (amount != 0) { + arc_evict_waiter_t aw; + list_link_init(&aw.aew_node); + cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL); + + /* + * The last waiter's count may trail arc_evict_count; + * wakeups are deferred while free memory is low. + */ + arc_evict_waiter_t *last = + list_tail(&arc_evict_waiters); + aw.aew_count = MAX(last != NULL ? last->aew_count : 0, + arc_evict_count) + amount; + + list_insert_tail(&arc_evict_waiters, &aw); + + arc_set_need_free(); + + DTRACE_PROBE3(arc__wait__for__eviction, + uint64_t, amount, + uint64_t, arc_evict_count, + uint64_t, aw.aew_count); + + /* + * We will be woken up either when arc_evict_count + * reaches aew_count, or when the ARC is no longer + * overflowing and eviction completes. + */ + cv_wait(&aw.aew_cv, &arc_evict_lock); + + /* + * In case of "false" wakeup, we will still be on the + * list. + */ + if (list_link_active(&aw.aew_node)) + list_remove(&arc_evict_waiters, &aw); + + cv_destroy(&aw.aew_cv); + } + } + mutex_exit(&arc_evict_lock); +} + /* * Allocate a block and return it to the caller. If we are hitting the * hard limit for the cache size, we must sleep, waiting for the eviction @@ -5022,40 +5128,26 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) arc_adapt(size, state); /* - * If arc_size is currently overflowing, and has grown past our - * upper limit, we must be adding data faster than the evict - * thread can evict. Thus, to ensure we don't compound the + * If arc_size is currently overflowing, we must be adding data + * faster than we are evicting. To ensure we don't compound the * problem by adding more data and forcing arc_size to grow even - * further past it's target size, we halt and wait for the - * eviction thread to catch up. 
+ * further past its target size, we wait for the eviction thread to + * make some progress. 
We also wait for there to be sufficient free + * memory in the system, as measured by arc_free_memory(). + * + * Specifically, we wait for zfs_arc_eviction_pct percent of the + * requested size to be evicted. This should be more than 100%, to + * ensure that progress is also made towards getting arc_size + * under arc_c. See the comment above zfs_arc_eviction_pct. * - * It's also possible that the reclaim thread is unable to evict - * enough buffers to get arc_size below the overflow limit (e.g. - * due to buffers being un-evictable, or hash lock collisions). - * In this case, we want to proceed regardless if we're - * overflowing; thus we don't use a while loop here. + * We do the overflowing check without holding the arc_evict_lock to + * reduce lock contention in this hot path. Note that + * arc_wait_for_eviction() will acquire the lock and check again to + * ensure we are truly overflowing before blocking. */ if (arc_is_overflowing()) { - mutex_enter(&arc_evict_lock); - - /* - * Now that we've acquired the lock, we may no longer be - * over the overflow limit, lets check. - * - * We're ignoring the case of spurious wake ups. If that - * were to happen, it'd let this thread consume an ARC - * buffer before it should have (i.e. before we're under - * the overflow limit and were signalled by the reclaim - * thread). As long as that is a rare occurrence, it - * shouldn't cause any harm. 
- */ - if (arc_is_overflowing()) { - arc_evict_needed = B_TRUE; - zthr_wakeup(arc_evict_zthr); - (void) cv_wait(&arc_evict_waiters_cv, - &arc_evict_lock); - } - mutex_exit(&arc_evict_lock); + arc_wait_for_eviction(size * + zfs_arc_eviction_pct / 100); } VERIFY3U(hdr->b_type, ==, type); @@ -7269,7 +7361,8 @@ arc_init(void) { uint64_t percent, allmem = arc_all_memory(); mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&arc_evict_waiters_cv, NULL, CV_DEFAULT, NULL); + list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t), + offsetof(arc_evict_waiter_t, aew_node)); arc_min_prefetch_ms = 1000; arc_min_prescient_prefetch_ms = 6000; @@ -7402,7 +7495,7 @@ arc_fini(void) (void) zthr_cancel(arc_reap_zthr); mutex_destroy(&arc_evict_lock); - cv_destroy(&arc_evict_waiters_cv); + list_destroy(&arc_evict_waiters); /* * buf_fini() must proceed arc_state_fini() because buf_fin() may @@ -10357,4 +10450,7 @@ ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW, "Percentage of excess dnodes to try to unpin"); + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW, + "When full, ARC allocation waits for eviction of this % of alloc size"); /* END CSTYLED */