Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix ARC ghost states eviction accounting. #12279

Merged
merged 1 commit into from
Jul 13, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion include/sys/arc_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -984,7 +984,6 @@ extern unsigned long zfs_arc_max;
extern void arc_reduce_target_size(int64_t to_free);
extern boolean_t arc_reclaim_needed(void);
extern void arc_kmem_reap_soon(void);
extern boolean_t arc_is_overflowing(void);
extern void arc_wait_for_eviction(uint64_t);

extern void arc_lowmem_init(void);
Expand Down
24 changes: 13 additions & 11 deletions man/man4/zfs.4
Original file line number Diff line number Diff line change
Expand Up @@ -712,20 +712,22 @@ equivalent to the greater of the number of online CPUs and
The ARC size is considered to be overflowing if it exceeds the current
ARC target size
.Pq Sy arc_c
by a threshold determined by this parameter.
The threshold is calculated as a fraction of
.Sy arc_c
using the formula
.Sy arc_c >> zfs_arc_overflow_shift .
by thresholds determined by this parameter.
Exceeding by
.Sy ( arc_c >> zfs_arc_overflow_shift ) * 0.5
starts the ARC reclamation process.
If that appears insufficient, exceeding by
.Sy ( arc_c >> zfs_arc_overflow_shift ) * 1.5
blocks new buffer allocation until the reclaim thread catches up.
Once started, the reclamation process continues until the ARC size returns
below the target size.
.Pp
The default value of
.Sy 8
causes the ARC to be considered overflowing if it exceeds the target size by
.Em 1/256th Pq Em 0.3%
of the target size.
.Pp
When the ARC is overflowing, new buffer allocations are stalled until
the reclaim thread catches up and the overflow condition no longer exists.
causes the ARC to start reclamation if it exceeds the target size by
.Em 0.2%
of the target size, and block allocations by
.Em 0.6% .
.
.It Sy zfs_arc_p_min_shift Ns = Ns Sy 0 Pq int
If nonzero, this will update
Expand Down
2 changes: 0 additions & 2 deletions module/os/freebsd/zfs/arc_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -234,8 +234,6 @@ arc_lowmem(void *arg __unused, int howto __unused)
*/
if (curproc == pageproc)
arc_wait_for_eviction(to_free);
else
arc_wait_for_eviction(0);
}

void
Expand Down
155 changes: 94 additions & 61 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -826,6 +826,12 @@ typedef enum arc_fill_flags {
ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */
} arc_fill_flags_t;

/*
 * Degree to which the ARC size exceeds its target, as reported by
 * arc_is_overflowing() and acted upon by arc_wait_for_eviction().
 */
typedef enum arc_ovf_level {
	ARC_OVF_NONE = 0,	/* No overflow; nothing needs to be done. */
	ARC_OVF_SOME = 1,	/* Mild overflow; eviction is only woken up. */
	ARC_OVF_SEVERE = 2	/* Heavy overflow; caller waits for eviction. */
} arc_ovf_level_t;

static kmutex_t l2arc_feed_thr_lock;
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
Expand Down Expand Up @@ -3861,9 +3867,18 @@ arc_buf_destroy(arc_buf_t *buf, void* tag)
* - arc_mru_ghost -> deleted
* - arc_mfu_ghost -> arc_l2c_only
* - arc_mfu_ghost -> deleted
*
* Return total size of evicted data buffers for eviction progress tracking.
* When evicting from ghost states return logical buffer size to make eviction
* progress at the same (or at least comparable) rate as from non-ghost states.
*
* Return *real_evicted for actual ARC size reduction to wake up threads
* waiting for it. For non-ghost states it includes size of evicted data
* buffers (the headers are not freed there). For ghost states it includes
* only the evicted headers size.
*/
static int64_t
arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, uint64_t *real_evicted)
amotin marked this conversation as resolved.
Show resolved Hide resolved
{
arc_state_t *evicted_state, *state;
int64_t bytes_evicted = 0;
Expand All @@ -3873,6 +3888,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
ASSERT(MUTEX_HELD(hash_lock));
ASSERT(HDR_HAS_L1HDR(hdr));

*real_evicted = 0;
state = hdr->b_l1hdr.b_state;
if (GHOST_STATE(state)) {
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
Expand Down Expand Up @@ -3909,9 +3925,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
*/
hdr = arc_hdr_realloc(hdr, hdr_full_cache,
hdr_l2only_cache);
*real_evicted += HDR_FULL_SIZE - HDR_L2ONLY_SIZE;
} else {
arc_change_state(arc_anon, hdr, hash_lock);
arc_hdr_destroy(hdr);
*real_evicted += HDR_FULL_SIZE;
amotin marked this conversation as resolved.
Show resolved Hide resolved
}
return (bytes_evicted);
}
Expand All @@ -3935,8 +3953,10 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
ARCSTAT_BUMP(arcstat_mutex_miss);
break;
}
if (buf->b_data != NULL)
if (buf->b_data != NULL) {
amotin marked this conversation as resolved.
Show resolved Hide resolved
bytes_evicted += HDR_GET_LSIZE(hdr);
*real_evicted += HDR_GET_LSIZE(hdr);
}
mutex_exit(&buf->b_evict_lock);
arc_buf_destroy_impl(buf);
}
Expand Down Expand Up @@ -3972,6 +3992,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
arc_cksum_free(hdr);

bytes_evicted += arc_hdr_size(hdr);
*real_evicted += arc_hdr_size(hdr);

/*
* If this hdr is being evicted and has a compressed
Expand Down Expand Up @@ -4013,7 +4034,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
uint64_t spa, int64_t bytes)
{
multilist_sublist_t *mls;
uint64_t bytes_evicted = 0;
uint64_t bytes_evicted = 0, real_evicted = 0;
arc_buf_hdr_t *hdr;
kmutex_t *hash_lock;
int evict_count = 0;
Expand Down Expand Up @@ -4074,10 +4095,13 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
ASSERT(!MUTEX_HELD(hash_lock));

if (mutex_tryenter(hash_lock)) {
uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
uint64_t revicted;
uint64_t evicted = arc_evict_hdr(hdr, hash_lock,
&revicted);
mutex_exit(hash_lock);

bytes_evicted += evicted;
real_evicted += revicted;

/*
* If evicted is zero, arc_evict_hdr() must have
Expand Down Expand Up @@ -4107,7 +4131,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
* 1/64th of RAM). See the comments in arc_wait_for_eviction().
*/
mutex_enter(&arc_evict_lock);
arc_evict_count += bytes_evicted;
arc_evict_count += real_evicted;

if (arc_free_memory() > arc_sys_free / 2) {
arc_evict_waiter_t *aw;
Expand Down Expand Up @@ -5121,7 +5145,7 @@ arc_adapt(int bytes, arc_state_t *state)
* Check if arc_size has grown past our upper threshold, determined by
* zfs_arc_overflow_shift.
*/
boolean_t
static arc_ovf_level_t
arc_is_overflowing(void)
{
/* Always allow at least one block of overflow */
Expand All @@ -5137,8 +5161,10 @@ arc_is_overflowing(void)
* in the ARC. In practice, that's in the tens of MB, which is low
* enough to be safe.
*/
return (aggsum_lower_bound(&arc_sums.arcstat_size) >=
(int64_t)arc_c + overflow);
int64_t over = aggsum_lower_bound(&arc_sums.arcstat_size) -
arc_c - overflow / 2;
return (over < 0 ? ARC_OVF_NONE :
over < overflow ? ARC_OVF_SOME : ARC_OVF_SEVERE);
This conversation was marked as resolved.
Show resolved Hide resolved
}

static abd_t *
Expand Down Expand Up @@ -5180,58 +5206,73 @@ arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
void
arc_wait_for_eviction(uint64_t amount)
{
mutex_enter(&arc_evict_lock);
if (arc_is_overflowing()) {
arc_evict_needed = B_TRUE;
zthr_wakeup(arc_evict_zthr);

if (amount != 0) {
arc_evict_waiter_t aw;
list_link_init(&aw.aew_node);
cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
switch (arc_is_overflowing()) {
case ARC_OVF_NONE:
return;
case ARC_OVF_SOME:
/*
* This is a bit racy without taking arc_evict_lock, but the
* worst that can happen is that we either call zthr_wakeup() an
* extra time due to a race with another thread here, or the flag
* we set gets cleared by arc_evict_cb(). The latter is unlikely
* due to the big hysteresis, and also not important, since at this
* level of overflow the eviction is purely advisory. At the same
* time, taking the global lock here every time without waiting for
* the actual eviction would create significant lock contention.
*/
if (!arc_evict_needed) {
arc_evict_needed = B_TRUE;
zthr_wakeup(arc_evict_zthr);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we don't need the arc_evict_lock to set arc_evict_needed? could we also change it without the lock in arc_reduce_target_size() and arc_evict_cb()?

Copy link
Member Author

@amotin amotin Jun 29, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a bit racy, but since we are not in overflow, it does not matter. arc_reduce_target_size() in addition to arc_evict_needed also changes the arc_c, potentially significantly, so additional fence makes sense to me. In arc_evict_cb() the locking is needed to reliably handle arc_evict_waiters list, but since it is called from only one thread, it does not matter much to performance.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if this makes sense to me. Are you saying we don't need the arc_evict_lock here because we're not actually going to sleep which makes the comment surrounding arc_evict_needed/arc_wait_for_eviction dance meaningless in this case? Nonetheless, we need to either document this significantly or change the consumers of arc_evict_needed to be consistent. My fear is that another developer would see the lack of lock here and then deem this a bug or want to remove the lock in other places.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added a comment here. Comments around arc_evict_needed/arc_wait_for_eviction are about arc_evict_waiters, not touched here.

}
return;
case ARC_OVF_SEVERE:
default:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it would be safer (to future code changes) to have this be case 2: and have the default panic. That ensures that we handle all of the possible return values.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would add another branching and the panic code itself, while in this case it is really safe (just slower) to take the default path.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that the current code is correct. My concern is that if someone adds a new variant of the enum that has some different meaning, we want to make sure that they think about how that should be handled here, instead of just treating it the same as ARC_OVF_SEVERE. In other words, it's generally good practice for switch statements to handle all the variants and panic if an unknown variant is encountered - it's defensive against future changes. But I see that there are plenty of places in the existing code that don't do that, so oh well.

Copy link
Member Author

@amotin amotin Jun 30, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On FreeBSD there is an __assert_unreachable() macro, which turns into panic() when built with debug, into __builtin_unreachable() otherwise if the compiler supports it, and into nothing if not. We could introduce something like that in ZFS, if somebody knows how to properly spell it on Linux.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I understand the argument against default: panic("invalid arc_ovf_level_t")

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is a waste of time for additional branching and a code trashing.

Copy link
Member

@ahrens ahrens Jul 8, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I disagree. I think any performance cost is trivial (but will happily change my mind if you have performance measurements to the contrary). In my opinion, indicating what states are expected and allowed makes the code more clear.

That said, while this is an interesting discussion of philosophy, I don't think it's hugely consequential to this PR - as I mentioned there are other places that have similarly problematic use of default. So I wouldn't hold up your PR over this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's say I have seen the effects of similar additional cases in some profiles before. Depending on the situation, the compiler may turn a switch into a number of ifs, and additional branching in a hot path never helps. In this particular case I believe the compiler will inline arc_is_overflowing(), since it is now static and used in only one place, and just throw out any additional code we'd add there.

I've described above how I see it being done properly: paranoid in debug builds, maybe even more optimized in production, and clearly visible in the code.

{
arc_evict_waiter_t aw;
list_link_init(&aw.aew_node);
cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);

uint64_t last_count = 0;
if (!list_is_empty(&arc_evict_waiters)) {
arc_evict_waiter_t *last =
list_tail(&arc_evict_waiters);
last_count = last->aew_count;
}
/*
* Note, the last waiter's count may be less than
* arc_evict_count if we are low on memory in which
* case arc_evict_state_impl() may have deferred
* wakeups (but still incremented arc_evict_count).
*/
aw.aew_count =
MAX(last_count, arc_evict_count) + amount;
uint64_t last_count = 0;
mutex_enter(&arc_evict_lock);
if (!list_is_empty(&arc_evict_waiters)) {
arc_evict_waiter_t *last =
list_tail(&arc_evict_waiters);
last_count = last->aew_count;
} else if (!arc_evict_needed) {
arc_evict_needed = B_TRUE;
zthr_wakeup(arc_evict_zthr);
}
/*
* Note, the last waiter's count may be less than
* arc_evict_count if we are low on memory in which
* case arc_evict_state_impl() may have deferred
* wakeups (but still incremented arc_evict_count).
*/
aw.aew_count = MAX(last_count, arc_evict_count) + amount;

list_insert_tail(&arc_evict_waiters, &aw);
list_insert_tail(&arc_evict_waiters, &aw);

arc_set_need_free();
arc_set_need_free();

DTRACE_PROBE3(arc__wait__for__eviction,
uint64_t, amount,
uint64_t, arc_evict_count,
uint64_t, aw.aew_count);
DTRACE_PROBE3(arc__wait__for__eviction,
uint64_t, amount,
uint64_t, arc_evict_count,
uint64_t, aw.aew_count);

/*
* We will be woken up either when arc_evict_count
* reaches aew_count, or when the ARC is no longer
* overflowing and eviction completes.
*/
/*
* We will be woken up either when arc_evict_count reaches
* aew_count, or when the ARC is no longer overflowing and
* eviction completes.
* In case of "false" wakeup, we will still be on the list.
*/
do {
cv_wait(&aw.aew_cv, &arc_evict_lock);
} while (list_link_active(&aw.aew_node));
mutex_exit(&arc_evict_lock);

/*
* In case of "false" wakeup, we will still be on the
* list.
*/
if (list_link_active(&aw.aew_node))
list_remove(&arc_evict_waiters, &aw);

cv_destroy(&aw.aew_cv);
}
cv_destroy(&aw.aew_cv);
}
}
mutex_exit(&arc_evict_lock);
}

/*
Expand Down Expand Up @@ -5262,16 +5303,8 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
* requested size to be evicted. This should be more than 100%, to
* ensure that that progress is also made towards getting arc_size
* under arc_c. See the comment above zfs_arc_eviction_pct.
*
* We do the overflowing check without holding the arc_evict_lock to
* reduce lock contention in this hot path. Note that
* arc_wait_for_eviction() will acquire the lock and check again to
* ensure we are truly overflowing before blocking.
*/
if (arc_is_overflowing()) {
arc_wait_for_eviction(size *
zfs_arc_eviction_pct / 100);
}
arc_wait_for_eviction(size * zfs_arc_eviction_pct / 100);

VERIFY3U(hdr->b_type, ==, type);
if (type == ARC_BUFC_METADATA) {
Expand Down