Skip to content

Commit

Permalink
Fixes in persistent L2ARC
Browse files Browse the repository at this point in the history
In l2arc_add_vdev() first decide whether the device is eligible for
L2ARC rebuild or whole device trim and then add it to the list of cache
devices. Otherwise l2arc_feed_thread() might already start writing on
the device invalidating previous content as l2ad_hand = l2ad_start.
However l2arc_rebuild_vdev() needs the device present in the cache
device list to figure out its l2arc_dev_t. Fix this by moving most of
l2arc_rebuild_vdev() in a new function l2arc_rebuild_dev() which does
not need to search in the cache device list.

In contrast to l2arc_add_vdev() we do not have to worry about
l2arc_feed_thread() invalidating previous content when onlining a
cache device. The device parameters (l2ad*) are not cleared when
offlining the device and writing new buffers will not invalidate
all previous content. In worst case only buffers that have not had
their log block written to the device will be lost.

Retire persist_l2arc_00{4,5,8} tests since they cover code already
covered by the remaining ones. Test persist_l2arc_006 is renamed to
persist_l2arc_004 and persist_l2arc_007 is renamed to persist_l2arc_005.

Fix a typo in persist_l2arc_004, and remove an assertion that is not
always true from l2arc_arcstats_pos. Also update an assertion in
persist_l2arc_005 and explain why in a comment.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: George Amanakis <gamanakis@gmail.com>
Closes openzfs#12365
  • Loading branch information
gamanakis authored Jul 26, 2021
1 parent 037af3e commit ab8a8f0
Show file tree
Hide file tree
Showing 10 changed files with 162 additions and 491 deletions.
177 changes: 102 additions & 75 deletions module/zfs/arc.c
Original file line number Diff line number Diff line change
Expand Up @@ -9749,6 +9749,80 @@ l2arc_vdev_get(vdev_t *vd)
return (dev);
}

static void
l2arc_rebuild_dev(l2arc_dev_t *dev, boolean_t reopen)
{
l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr;
uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize;
spa_t *spa = dev->l2ad_spa;

/*
* The L2ARC has to hold at least the payload of one log block for
* them to be restored (persistent L2ARC). The payload of a log block
* depends on the amount of its log entries. We always write log blocks
* with 1022 entries. How many of them are committed or restored depends
* on the size of the L2ARC device. Thus the maximum payload of
* one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device
* is less than that, we reduce the amount of committed and restored
* log entries per block so as to enable persistence.
*/
if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) {
dev->l2ad_log_entries = 0;
} else {
dev->l2ad_log_entries = MIN((dev->l2ad_end -
dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
L2ARC_LOG_BLK_MAX_ENTRIES);
}

/*
* Read the device header, if an error is returned do not rebuild L2ARC.
*/
if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) {
/*
* If we are onlining a cache device (vdev_reopen) that was
* still present (l2arc_vdev_present()) and rebuild is enabled,
* we should evict all ARC buffers and pointers to log blocks
* and reclaim their space before restoring its contents to
* L2ARC.
*/
if (reopen) {
if (!l2arc_rebuild_enabled) {
return;
} else {
l2arc_evict(dev, 0, B_TRUE);
/* start a new log block */
dev->l2ad_log_ent_idx = 0;
dev->l2ad_log_blk_payload_asize = 0;
dev->l2ad_log_blk_payload_start = 0;
}
}
/*
* Just mark the device as pending for a rebuild. We won't
* be starting a rebuild in line here as it would block pool
* import. Instead spa_load_impl will hand that off to an
* async task which will call l2arc_spa_rebuild_start.
*/
dev->l2ad_rebuild = B_TRUE;
} else if (spa_writeable(spa)) {
/*
* In this case TRIM the whole device if l2arc_trim_ahead > 0,
* otherwise create a new header. We zero out the memory holding
* the header to reset dh_start_lbps. If we TRIM the whole
* device the new header will be written by
* vdev_trim_l2arc_thread() at the end of the TRIM to update the
* trim_state in the header too. When reading the header, if
* trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0
* we opt to TRIM the whole device again.
*/
if (l2arc_trim_ahead > 0) {
dev->l2ad_trim_all = B_TRUE;
} else {
bzero(l2dhdr, l2dhdr_asize);
l2arc_dev_hdr_update(dev);
}
}
}

/*
* Add a vdev for use by the L2ARC. By this point the spa has already
* validated the vdev and opened it.
Expand Down Expand Up @@ -9801,99 +9875,52 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
zfs_refcount_create(&adddev->l2ad_lb_asize);
zfs_refcount_create(&adddev->l2ad_lb_count);

/*
* Decide if dev is eligible for L2ARC rebuild or whole device
* trimming. This has to happen before the device is added in the
* cache device list and l2arc_dev_mtx is released. Otherwise
* l2arc_feed_thread() might already start writing on the
* device.
*/
l2arc_rebuild_dev(adddev, B_FALSE);

/*
* Add device to global list
*/
mutex_enter(&l2arc_dev_mtx);
list_insert_head(l2arc_dev_list, adddev);
atomic_inc_64(&l2arc_ndev);
mutex_exit(&l2arc_dev_mtx);

/*
* Decide if vdev is eligible for L2ARC rebuild
*/
l2arc_rebuild_vdev(adddev->l2ad_vdev, B_FALSE);
}

/*
* Decide if a vdev is eligible for L2ARC rebuild, called from vdev_reopen()
* in case of onlining a cache device.
*/
void
l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen)
{
l2arc_dev_t *dev = NULL;
l2arc_dev_hdr_phys_t *l2dhdr;
uint64_t l2dhdr_asize;
spa_t *spa;

dev = l2arc_vdev_get(vd);
ASSERT3P(dev, !=, NULL);
spa = dev->l2ad_spa;
l2dhdr = dev->l2ad_dev_hdr;
l2dhdr_asize = dev->l2ad_dev_hdr_asize;

/*
* The L2ARC has to hold at least the payload of one log block for
* them to be restored (persistent L2ARC). The payload of a log block
* depends on the amount of its log entries. We always write log blocks
* with 1022 entries. How many of them are committed or restored depends
* on the size of the L2ARC device. Thus the maximum payload of
* one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device
* is less than that, we reduce the amount of committed and restored
* log entries per block so as to enable persistence.
*/
if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) {
dev->l2ad_log_entries = 0;
} else {
dev->l2ad_log_entries = MIN((dev->l2ad_end -
dev->l2ad_start) >> SPA_MAXBLOCKSHIFT,
L2ARC_LOG_BLK_MAX_ENTRIES);
}

/*
* Read the device header, if an error is returned do not rebuild L2ARC.
*/
if (l2arc_dev_hdr_read(dev) == 0 && dev->l2ad_log_entries > 0) {
/*
* If we are onlining a cache device (vdev_reopen) that was
* still present (l2arc_vdev_present()) and rebuild is enabled,
* we should evict all ARC buffers and pointers to log blocks
* and reclaim their space before restoring its contents to
* L2ARC.
*/
if (reopen) {
if (!l2arc_rebuild_enabled) {
return;
} else {
l2arc_evict(dev, 0, B_TRUE);
/* start a new log block */
dev->l2ad_log_ent_idx = 0;
dev->l2ad_log_blk_payload_asize = 0;
dev->l2ad_log_blk_payload_start = 0;
}
}
/*
* Just mark the device as pending for a rebuild. We won't
* be starting a rebuild in line here as it would block pool
* import. Instead spa_load_impl will hand that off to an
* async task which will call l2arc_spa_rebuild_start.
*/
dev->l2ad_rebuild = B_TRUE;
} else if (spa_writeable(spa)) {
/*
* In this case TRIM the whole device if l2arc_trim_ahead > 0,
* otherwise create a new header. We zero out the memory holding
* the header to reset dh_start_lbps. If we TRIM the whole
* device the new header will be written by
* vdev_trim_l2arc_thread() at the end of the TRIM to update the
* trim_state in the header too. When reading the header, if
* trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0
* we opt to TRIM the whole device again.
*/
if (l2arc_trim_ahead > 0) {
dev->l2ad_trim_all = B_TRUE;
} else {
bzero(l2dhdr, l2dhdr_asize);
l2arc_dev_hdr_update(dev);
}
}
* In contrast to l2arc_add_vdev() we do not have to worry about
* l2arc_feed_thread() invalidating previous content when onlining a
* cache device. The device parameters (l2ad*) are not cleared when
* offlining the device and writing new buffers will not invalidate
* all previous content. In worst case only buffers that have not had
* their log block written to the device will be lost.
* When onlining the cache device (ie offline->online without exporting
* the pool in between) this happens:
* vdev_reopen() -> vdev_open() -> l2arc_rebuild_vdev()
* | |
* vdev_is_dead() = B_FALSE l2ad_rebuild = B_TRUE
* During the time where vdev_is_dead = B_FALSE and until l2ad_rebuild
* is set to B_TRUE we might write additional buffers to the device.
*/
l2arc_rebuild_dev(dev, reopen);
}

/*
Expand Down
3 changes: 1 addition & 2 deletions tests/runfiles/common.run
Original file line number Diff line number Diff line change
Expand Up @@ -935,8 +935,7 @@ tags = ['functional', 'log_spacemap']
[tests/functional/l2arc]
tests = ['l2arc_arcstats_pos', 'l2arc_mfuonly_pos', 'l2arc_l2miss_pos',
'persist_l2arc_001_pos', 'persist_l2arc_002_pos',
'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos',
'persist_l2arc_006_pos', 'persist_l2arc_007_pos', 'persist_l2arc_008_pos']
'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos']
tags = ['functional', 'l2arc']

[tests/functional/zpool_influxdb]
Expand Down
2 changes: 0 additions & 2 deletions tests/test-runner/bin/zts-report.py.in
Original file line number Diff line number Diff line change
Expand Up @@ -223,8 +223,6 @@ maybe = {
'history/history_008_pos': ['FAIL', known_reason],
'history/history_010_pos': ['SKIP', exec_reason],
'io/mmap': ['SKIP', fio_reason],
'l2arc/persist_l2arc_005_pos': ['FAIL', known_reason],
'l2arc/persist_l2arc_007_pos': ['FAIL', '11887'],
'largest_pool/largest_pool_001_pos': ['FAIL', known_reason],
'mmp/mmp_on_uberblocks': ['FAIL', known_reason],
'pyzfs/pyzfs_unittest': ['SKIP', python_deps_reason],
Expand Down
5 changes: 1 addition & 4 deletions tests/zfs-tests/tests/functional/l2arc/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,7 @@ dist_pkgdata_SCRIPTS = \
persist_l2arc_002_pos.ksh \
persist_l2arc_003_neg.ksh \
persist_l2arc_004_pos.ksh \
persist_l2arc_005_pos.ksh \
persist_l2arc_006_pos.ksh \
persist_l2arc_007_pos.ksh \
persist_l2arc_008_pos.ksh
persist_l2arc_005_pos.ksh

dist_pkgdata_DATA = \
l2arc.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ typeset l2_mru_end=$(get_arcstat l2_mru_asize)
typeset l2_prefetch_end=$(get_arcstat l2_prefetch_asize)
typeset l2_asize_end=$(get_arcstat l2_asize)

log_must test $(( $l2_mfu_end - $l2_mfu_init )) -gt 0
log_must test $(( $l2_mru_end + $l2_mfu_end + $l2_prefetch_end - \
$l2_asize_end )) -eq 0
log_must test $(( $l2_mru_init + $l2_mfu_init + $l2_prefetch_init - \
Expand Down
55 changes: 27 additions & 28 deletions tests/zfs-tests/tests/functional/l2arc/persist_l2arc_004_pos.ksh
Original file line number Diff line number Diff line change
Expand Up @@ -23,25 +23,24 @@

#
# DESCRIPTION:
# Persistent L2ARC restores all written log blocks
# Off/onlining an L2ARC device results in rebuilding L2ARC, vdev not
# present.
#
# STRATEGY:
# 1. Create pool with a cache device.
# 2. Create a random file in that pool, smaller than the cache device
# and random read for 10 sec.
# 3. Export pool.
# 4. Read amount of log blocks written.
# 5. Import pool.
# 6. Read amount of log blocks built.
# 7. Compare the two amounts.
# 8. Read the file written in (2) and check if l2_hits in
# /proc/spl/kstat/zfs/arcstats increased.
# 9. Check if the labels of the L2ARC device are intact.
# 2. Create a random file in that pool and random read for 10 sec.
# 3. Read the amount of log blocks written from the header of the
# L2ARC device.
# 4. Offline the L2ARC device and export pool.
# 5. Import pool and online the L2ARC device.
# 6. Read the amount of log blocks rebuilt in arcstats and compare to
# (3).
# 7. Check if the labels of the L2ARC device are intact.
#

verify_runnable "global"

log_assert "Persistent L2ARC restores all written log blocks."
log_assert "Off/onlining an L2ARC device results in rebuilding L2ARC, vdev not present."

function cleanup
{
Expand All @@ -50,47 +49,47 @@ function cleanup
fi

log_must set_tunable32 L2ARC_NOPREFETCH $noprefetch
log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE \
$rebuild_blocks_min_l2size
}
log_onexit cleanup

# L2ARC_NOPREFETCH is set to 0 to let L2ARC handle prefetches
typeset noprefetch=$(get_tunable L2ARC_NOPREFETCH)
typeset rebuild_blocks_min_l2size=$(get_tunable L2ARC_REBUILD_BLOCKS_MIN_L2SIZE)
log_must set_tunable32 L2ARC_NOPREFETCH 0
log_must set_tunable32 L2ARC_REBUILD_BLOCKS_MIN_L2SIZE 0

typeset fill_mb=800
typeset cache_sz=$(( 2 * $fill_mb ))
typeset cache_sz=$(( floor($fill_mb / 2) ))
export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M

log_must truncate -s ${cache_sz}M $VDEV_CACHE

typeset log_blk_start=$(get_arcstat l2_log_blk_writes)

log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE

log_must fio $FIO_SCRIPTS/mkfiles.fio
log_must fio $FIO_SCRIPTS/random_reads.fio

arcstat_quiescence_noecho l2_size
log_must zpool offline $TESTPOOL $VDEV_CACHE
arcstat_quiescence_noecho l2_size
log_must zpool export $TESTPOOL
arcstat_quiescence_noecho l2_feeds

typeset log_blk_end=$(get_arcstat l2_log_blk_writes)
typeset log_blk_rebuild_start=$(get_arcstat l2_rebuild_log_blks)
typeset l2_rebuild_log_blk_start=$(get_arcstat l2_rebuild_log_blks)
typeset l2_dh_log_blk=$(zdb -l $VDEV_CACHE | grep log_blk_count | \
awk '{print $2}')

log_must zpool import -d $VDIR $TESTPOOL

typeset l2_hits_start=$(get_arcstat l2_hits)

log_must fio $FIO_SCRIPTS/random_reads.fio
log_must zpool online $TESTPOOL $VDEV_CACHE
arcstat_quiescence_noecho l2_size

typeset log_blk_rebuild_end=$(arcstat_quiescence_echo l2_rebuild_log_blks)
typeset l2_hits_end=$(get_arcstat l2_hits)

log_must test $(( $log_blk_rebuild_end - $log_blk_rebuild_start )) -eq \
$(( $log_blk_end - $log_blk_start ))
typeset l2_rebuild_log_blk_end=$(arcstat_quiescence_echo l2_rebuild_log_blks)

log_must test $l2_hits_end -gt $l2_hits_start
log_must test $l2_dh_log_blk -eq $(( $l2_rebuild_log_blk_end - \
$l2_rebuild_log_blk_start ))
log_must test $l2_dh_log_blk -gt 0

log_must zpool offline $TESTPOOL $VDEV_CACHE
arcstat_quiescence_noecho l2_size
Expand All @@ -99,4 +98,4 @@ log_must zdb -lq $VDEV_CACHE

log_must zpool destroy -f $TESTPOOL

log_pass "Persistent L2ARC restores all written log blocks."
log_pass "Off/onlining an L2ARC device results in rebuilding L2ARC, vdev not present."
Loading

0 comments on commit ab8a8f0

Please sign in to comment.