Skip to content

Commit

Permalink
Revise ARC shrinker algorithm
Browse files Browse the repository at this point in the history
The ARC shrinker callback `arc_shrinker_count/_scan()` is invoked by the
kernel's shrinker mechanism when the system is running low on free
pages.  This happens via 2 code paths:

1. "direct reclaim": The system is attempting to allocate a page, but we
are low on memory.  The ARC shrinker callback is invoked from the
page-allocation code path.

2. "indirect reclaim": kswapd notices that there aren't many free pages,
so it invokes the ARC shrinker callback.

In both cases, the kernel's shrinker code requests that the ARC shrinker
callback release some of its cache, and then it measures how many pages
were released.  However, its measurement of released pages does not
include pages that are freed via `__free_pages()`, which is how the ARC
releases memory (via `abd_free_chunks()`).  Rather, the kernel shrinker
code is looking for pages to be placed on the lists of reclaimable pages
(which is separate from actually-free pages).

Because the kernel shrinker code doesn't detect that the ARC has
released pages, it may call the ARC shrinker callback many times,
resulting in the ARC "collapsing" down to `arc_c_min`.  This has several
negative impacts:

1. ZFS doesn't use RAM to cache data effectively.

2. In the direct reclaim case, a single page allocation may wait a long
time (e.g. more than a minute) while we evict the entire ARC.

3. Even with the improvements made in 67c0f0d ("ARC shrinking blocks
reads/writes"), occasionally `arc_size` may stay above `arc_c` for the
entire time of the ARC collapse, thus blocking ZFS read/write operations
in `arc_get_data_impl()`.

To address these issues, this commit limits the ways that the ARC
shrinker callback can be used by the kernel shrinker code, and mitigates
the impact of arc_is_overflowing() on ZFS read/write operations.

With this commit:

1. We limit the amount of data that can be reclaimed from the ARC via
the "direct reclaim" shrinker.  This limits the amount of time it takes
to allocate a single page.

2. We do not allow the ARC to shrink via kswapd (indirect reclaim).
Instead we rely on `arc_evict_zthr` to monitor free memory and reduce
the ARC target size to keep sufficient free memory in the system.  Note
that we can't simply rely on limiting the amount that we reclaim at once
(as for the direct reclaim case), because kswapd's "boosted" logic can
invoke the callback an unlimited number of times (see
`balance_pgdat()`).

3. When `arc_is_overflowing()` and we want to allocate memory,
`arc_get_data_impl()` will wait only for a multiple of the requested
amount of data to be evicted, rather than waiting for the ARC to no
longer be overflowing.  This allows ZFS reads/writes to make progress
even while the ARC is overflowing, while also ensuring that the eviction
thread makes progress towards reducing the total amount of memory used
by the ARC.

4. The amount of memory that the ARC always tries to keep free for the
rest of the system, `arc_sys_free`, is increased.

Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
  • Loading branch information
ahrens committed Jul 20, 2020
1 parent 7761a20 commit e69daa0
Show file tree
Hide file tree
Showing 7 changed files with 277 additions and 197 deletions.
4 changes: 2 additions & 2 deletions include/os/linux/spl/sys/shrinker.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ __ ## varname ## _wrapper(struct shrinker *shrink, struct shrink_control *sc)\
\
static struct shrinker varname = { \
.shrink = __ ## varname ## _wrapper, \
.seeks = seek_cost \
.seeks = seek_cost, \
}

#define SHRINK_STOP (-1)
Expand All @@ -97,7 +97,7 @@ static struct shrinker varname = { \
static struct shrinker varname = { \
.count_objects = countfunc, \
.scan_objects = scanfunc, \
.seeks = seek_cost \
.seeks = seek_cost, \
}

#else
Expand Down
36 changes: 36 additions & 0 deletions include/os/linux/zfs/sys/trace_arc.h
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,41 @@ DEFINE_EVENT(zfs_l2arc_evict_class, name, \
/* END CSTYLED */
DEFINE_L2ARC_EVICT_EVENT(zfs_l2arc__evict);

/*
* Generic support for three argument tracepoints of the form:
*
* DTRACE_PROBE3(...,
* uint64_t, ...,
* uint64_t, ...,
* uint64_t, ...);
*/
/* BEGIN CSTYLED */
/*
 * Event class shared by tracepoints that report three uint64_t values:
 * amount, arc_evict_count and aew_count.  Each argument is copied
 * unchanged into a same-named trace entry field and printed as an
 * unsigned decimal.
 * NOTE(review): presumably `amount` is the quantity the caller asked to
 * have evicted, and the two counts are the global eviction counter and the
 * waiter's target value -- confirm against arc_wait_for_eviction().
 */
DECLARE_EVENT_CLASS(zfs_arc_wait_for_eviction_class,
TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count),
TP_ARGS(amount, arc_evict_count, aew_count),
/* One trace entry field per tracepoint argument. */
TP_STRUCT__entry(
__field(uint64_t, amount)
__field(uint64_t, arc_evict_count)
__field(uint64_t, aew_count)
),
/* Record the arguments as passed; no derived values are computed. */
TP_fast_assign(
__entry->amount = amount;
__entry->arc_evict_count = arc_evict_count;
__entry->aew_count = aew_count;
),
TP_printk("amount %llu arc_evict_count %llu aew_count %llu",
__entry->amount, __entry->arc_evict_count, __entry->aew_count)
);
/* END CSTYLED */

/* BEGIN CSTYLED */
/*
 * Defines a tracepoint called `name` as a member of
 * zfs_arc_wait_for_eviction_class, taking the same three uint64_t
 * arguments as the class (amount, arc_evict_count, aew_count).
 *
 * Every line but the last ends in a line continuation and the
 * DEFINE_EVENT() invocation is closed inside the macro body, so that
 * "DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(x);" expands to one complete
 * DEFINE_EVENT(...) invocation followed by the caller's semicolon.
 */
#define DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(name) \
DEFINE_EVENT(zfs_arc_wait_for_eviction_class, name, \
	TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count), \
	TP_ARGS(amount, arc_evict_count, aew_count))
/* END CSTYLED */
DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(zfs_arc__wait__for__eviction);

#endif /* _TRACE_ARC_H */

#undef TRACE_INCLUDE_PATH
Expand All @@ -376,6 +411,7 @@ DEFINE_DTRACE_PROBE1(l2arc__miss);
DEFINE_DTRACE_PROBE2(l2arc__read);
DEFINE_DTRACE_PROBE2(l2arc__write);
DEFINE_DTRACE_PROBE2(l2arc__iodone);
DEFINE_DTRACE_PROBE3(arc__wait__for__eviction);
DEFINE_DTRACE_PROBE4(arc__miss);
DEFINE_DTRACE_PROBE4(l2arc__evict);

Expand Down
21 changes: 6 additions & 15 deletions include/sys/arc_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -823,7 +823,6 @@ typedef struct arc_stats {
kstat_named_t arcstat_l2_rebuild_log_blks;
kstat_named_t arcstat_memory_throttle_count;
kstat_named_t arcstat_memory_direct_count;
kstat_named_t arcstat_memory_indirect_count;
kstat_named_t arcstat_memory_all_bytes;
kstat_named_t arcstat_memory_free_bytes;
kstat_named_t arcstat_memory_available_bytes;
Expand All @@ -846,15 +845,11 @@ typedef struct arc_stats {
kstat_named_t arcstat_cached_only_in_progress;
} arc_stats_t;

/*
 * Enumerated reason codes.  No explicit values are assigned, so the
 * enumerators are numbered sequentially starting from FMR_UNKNOWN == 0.
 * NOTE(review): judging by the type name, each code presumably identifies
 * which memory limit determined a free-memory calculation -- confirm
 * against the users of this type, which are not visible here.
 */
typedef enum free_memory_reason_t {
FMR_UNKNOWN,
FMR_NEEDFREE,
FMR_LOTSFREE,
FMR_SWAPFS_MINFREE,
FMR_PAGES_PP_MAXIMUM,
FMR_HEAP_ARENA,
FMR_ZIO_ARENA,
} free_memory_reason_t;
/*
 * Record for one eviction waiter: a list linkage node, a condition
 * variable, and a 64-bit count.
 * NOTE(review): presumably one record is queued per thread blocked in
 * arc_wait_for_eviction(), and the thread is signalled once the eviction
 * counter reaches aew_count -- confirm against the implementation.
 */
typedef struct arc_evict_waiter {
list_node_t aew_node;	/* linkage for the list holding this record */
kcondvar_t aew_cv;	/* condition variable belonging to this waiter */
uint64_t aew_count;	/* NOTE(review): looks like a wake-up threshold */
} arc_evict_waiter_t;

#define ARCSTAT(stat) (arc_stats.stat.value.ui64)

Expand All @@ -870,18 +865,13 @@ typedef enum free_memory_reason_t {
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */
#define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */

extern taskq_t *arc_prune_taskq;
extern arc_stats_t arc_stats;
extern hrtime_t arc_growtime;
extern boolean_t arc_warm;
extern int arc_grow_retry;
extern int arc_shrink_shift;
extern zthr_t *arc_evict_zthr;
extern kmutex_t arc_evict_lock;
extern kcondvar_t arc_evict_waiters_cv;
extern boolean_t arc_evict_needed;
extern kmutex_t arc_prune_mtx;
extern list_t arc_prune_list;
extern aggsum_t arc_size;
Expand All @@ -896,6 +886,7 @@ extern void arc_reduce_target_size(int64_t to_free);
extern boolean_t arc_reclaim_needed(void);
extern void arc_kmem_reap_soon(void);
extern boolean_t arc_is_overflowing(void);
extern void arc_wait_for_eviction(uint64_t);

extern void arc_lowmem_init(void);
extern void arc_lowmem_fini(void);
Expand Down
12 changes: 6 additions & 6 deletions module/os/freebsd/zfs/arc_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -223,18 +223,18 @@ arc_lowmem(void *arg __unused, int howto __unused)
DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
arc_reduce_target_size(to_free);

mutex_enter(&arc_evict_lock);
arc_evict_needed = B_TRUE;
zthr_wakeup(arc_evict_zthr);

mutex_enter(&arc_adjust_lock);
arc_adjust_needed = B_TRUE;
zthr_wakeup(arc_adjust_zthr);
/*
* It is unsafe to block here in arbitrary threads, because we can come
* here from ARC itself and may hold ARC locks and thus risk a deadlock
* with ARC reclaim thread.
*/
if (curproc == pageproc)
(void) cv_wait(&arc_evict_waiters_cv, &arc_evict_lock);
mutex_exit(&arc_evict_lock);
arc_wait_for_eviction(to_free);
else
arc_wait_for_eviction(0);
}

void
Expand Down
1 change: 0 additions & 1 deletion module/os/freebsd/zfs/sysctl_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,6 @@ extern arc_stats_t arc_stats;
#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */
#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
#define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */
#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */

static int
Expand Down
Loading

0 comments on commit e69daa0

Please sign in to comment.