From c27703e512d56f9fe78e9bbfb08a18d64bf9bc1d Mon Sep 17 00:00:00 2001 From: Tom Caputi Date: Tue, 21 Aug 2018 16:51:48 -0400 Subject: [PATCH] OpenZFS 9403 - assertion failed in arc_buf_destroy() when concurrently reading block with checksum error Porting notes: * zio_decompress_fail_fraction has been set to (1 << 18) by default in ztest. * The ability to zinject decompression errors has been added, but this only works at the zio_decompress() level, where we have all of the inf we need to match against the user's zinject options. * The decompress_fault test has been added to test the new zinject functionality Authored by: Matt Ahrens Reviewed by: George Wilson Reviewed by: Paul Dagnelie Reviewed by: Pavel Zakharov Approved by: Matt Ahrens Ported-by: Tom Caputi OpenZFS-issue: https://illumos.org/issues/9403 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/fa98e487a9 --- cmd/zinject/zinject.c | 10 ++-- cmd/ztest/ztest.c | 13 +++-- man/man5/zfs-module-parameters.5 | 13 +++++ man/man8/zinject.8 | 1 + module/zfs/arc.c | 51 +++++++++++++---- module/zfs/dbuf.c | 37 +++++++------ module/zfs/zio.c | 6 ++ module/zfs/zio_compress.c | 18 +++++- tests/runfiles/linux.run | 2 +- .../tests/functional/fault/Makefile.am | 1 + .../functional/fault/decompress_fault.ksh | 55 +++++++++++++++++++ .../tests/functional/fault/decrypt_fault.ksh | 4 +- 12 files changed, 172 insertions(+), 39 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/fault/decompress_fault.ksh diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c index d6298f8f49a4..ee15ff8a4f11 100644 --- a/cmd/zinject/zinject.c +++ b/cmd/zinject/zinject.c @@ -116,9 +116,9 @@ * specified. * * The '-e' option takes a string describing the errno to simulate. This must - * be one of 'io', 'checksum', or 'decrypt'. In most cases this will result - * in the same behavior, but RAID-Z will produce a different set of ereports - * for this situation. + * be one of 'io', 'checksum', 'decompress', or 'decrypt'. In most cases this + * will result in the same behavior, but RAID-Z will produce a different set of + * ereports for this situation. * * The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is * specified, then the ARC cache is flushed appropriately. If '-u' is @@ -300,7 +300,7 @@ usage(void) "\n" "\t\t-q\tQuiet mode. Only print out the handler number added.\n" "\t\t-e\tInject a specific error. Must be one of 'io',\n" - "\t\t\t'checksum', or decrypt. Default is 'io'.\n" + "\t\t\t'checksum', 'decompress', or decrypt. Default is 'io'.\n" "\t\t-l\tInject error at a particular block level. Default is " "0.\n" "\t\t-m\tAutomatically remount underlying filesystem.\n" @@ -774,6 +774,8 @@ main(int argc, char **argv) error = EIO; } else if (strcasecmp(optarg, "checksum") == 0) { error = ECKSUM; + } else if (strcasecmp(optarg, "decompress") == 0) { + error = EINVAL; } else if (strcasecmp(optarg, "decrypt") == 0) { error = EACCES; } else if (strcasecmp(optarg, "nxio") == 0) { diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 8fb6200d55e6..6e697b27be14 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -193,7 +193,7 @@ static const ztest_shared_opts_t ztest_opts_defaults = { .zo_init = 1, .zo_time = 300, /* 5 minutes */ .zo_maxloops = 50, /* max loops during spa_freeze() */ - .zo_metaslab_force_ganging = 32 << 10 + .zo_metaslab_force_ganging = 32 << 10, }; extern uint64_t metaslab_force_ganging; @@ -204,6 +204,7 @@ extern boolean_t zfs_compressed_arc_enabled; extern int zfs_abd_scatter_enabled; extern int dmu_object_alloc_chunk_shift; extern boolean_t zfs_force_some_double_word_sm_entries; +extern unsigned long zio_decompress_fail_fraction; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; @@ -483,9 +484,6 @@ static int zc_cb_counter = 0; */ #define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) -extern uint64_t metaslab_gang_bang; -extern uint64_t metaslab_df_alloc_threshold; - enum ztest_object { ZTEST_META_DNODE = 0, ZTEST_DIROBJ, @@ -7367,6 +7365,13 @@ main(int argc, char **argv) */ zfs_force_some_double_word_sm_entries = B_TRUE; + /* + * Force rare decompression failures to exercise those code paths. + * This will touch many more code paths than zinjecting "decompress" + * errors since those only operate at the zio_decompress() level. + */ + zio_decompress_fail_fraction = (1 << 18); + action.sa_handler = sig_handler; sigemptyset(&action.sa_mask); action.sa_flags = 0; diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index eae8dc42870f..62a16f662907 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -2511,6 +2511,19 @@ to limit potential SLOG device abuse by single active ZIL writer. Default value: \fB786,432\fR. .RE +.sp +.ne 2 +.na +\fBzio_decompress_fail_fraction\fR (int) +.ad +.RS 12n +If non-zero, this value represents the denominator of the probability that zfs +should induce a decompression failure. For instance, for a 5% decompression +failure rate, this value should be set to 20. +.sp +Default value: \fB0\fR. +.RE + .sp .ne 2 .na diff --git a/man/man8/zinject.8 b/man/man8/zinject.8 index 7b664415f5ff..e97db839b808 100644 --- a/man/man8/zinject.8 +++ b/man/man8/zinject.8 @@ -108,6 +108,7 @@ A vdev specified by path or GUID. .BI "\-e" " device_error" Specify .BR "checksum" " for an ECKSUM error," +.BR "decompress" " for a data decompression error," .BR "decrypt" " for a data decryption error," .BR "corrupt" " to flip a bit in the data after a read," .BR "dtl" " for an ECHILD error," diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 7f2929c17e1a..b28da93a1ab4 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ @@ -5803,10 +5803,12 @@ arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t **bufp = arg; if (buf == NULL) { + ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; } else { + ASSERT(zio == NULL || zio->io_error == 0); *bufp = buf; - ASSERT(buf->b_data); + ASSERT(buf->b_data != NULL); } } @@ -5934,12 +5936,6 @@ arc_read_done(zio_t *zio) &acb->acb_zb, acb->acb_private, acb->acb_encrypted, acb->acb_compressed, acb->acb_noauth, B_TRUE, &acb->acb_buf); - if (error != 0) { - (void) remove_reference(hdr, hash_lock, - acb->acb_private); - arc_buf_destroy_impl(acb->acb_buf); - acb->acb_buf = NULL; - } /* * Assert non-speculative zios didn't fail because an @@ -5962,9 +5958,34 @@ arc_read_done(zio_t *zio) } } - if (zio->io_error == 0) + if (error != 0) { + /* + * Decompression or decryption failed. Set + * io_error so that when we call acb_done + * (below), we will indicate that the read + * failed. Note that in the unusual case + * where one callback is compressed and another + * uncompressed, we will mark all of them + * as failed, even though the uncompressed + * one can't actually fail. In this case, + * the hdr will not be anonymous, because + * if there are multiple callbacks, it's + * because multiple threads found the same + * arc buf in the hash table. + */ zio->io_error = error; + } } + + /* + * If there are multiple callbacks, we must have the hash lock, + * because the only way for multiple threads to find this hdr is + * in the hash table. This ensures that if there are multiple + * callbacks, the hdr is not anonymous. If it were anonymous, + * we couldn't use arc_buf_destroy() in the error case below. + */ + ASSERT(callback_cnt < 2 || hash_lock != NULL); + hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); if (callback_cnt == 0) @@ -6006,7 +6027,17 @@ arc_read_done(zio_t *zio) /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { - if (acb->acb_done) { + if (acb->acb_done != NULL) { + if (zio->io_error != 0 && acb->acb_buf != NULL) { + /* + * If arc_buf_alloc_impl() fails during + * decompression, the buf will still be + * allocated, and needs to be freed here. + */ + arc_buf_destroy(acb->acb_buf, + acb->acb_private); + acb->acb_buf = NULL; + } acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, acb->acb_buf, acb->acb_private); } diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index f0bee6c2bb59..3cd71cf6ef5a 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ @@ -1191,25 +1191,26 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, ASSERT(refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - if (db->db_level == 0 && db->db_freed_in_flight) { - /* we were freed in flight; disregard any error */ - if (buf == NULL) { - buf = arc_alloc_buf(db->db_objset->os_spa, - db, DBUF_GET_BUFC_TYPE(db), db->db.db_size); - } + if (buf == NULL) { + /* i/o error */ + ASSERT(zio == NULL || zio->io_error != 0); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT3P(db->db_buf, ==, NULL); + db->db_state = DB_UNCACHED; + } else if (db->db_level == 0 && db->db_freed_in_flight) { + /* freed in flight */ + ASSERT(zio == NULL || zio->io_error == 0); arc_release(buf, db); bzero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; - } else if (buf != NULL) { + } else { + /* success */ + ASSERT(zio == NULL || zio->io_error == 0); dbuf_set_data(db, buf); db->db_state = DB_CACHED; - } else { - ASSERT(db->db_blkid != DMU_BONUS_BLKID); - ASSERT3P(db->db_buf, ==, NULL); - db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL, B_FALSE); @@ -2848,6 +2849,13 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); ASSERT3S(dpa->dpa_curlevel, >, 0); + if (abuf == NULL) { + ASSERT(zio == NULL || zio->io_error != 0); + kmem_free(dpa, sizeof (*dpa)); + return; + } + ASSERT(zio == NULL || zio->io_error == 0); + /* * The dpa_dnode is only valid if we are called with a NULL * zio. This indicates that the arc_read() returned without @@ -2880,11 +2888,6 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, dbuf_rele(db, FTAG); } - if (abuf == NULL) { - kmem_free(dpa, sizeof (*dpa)); - return; - } - dpa->dpa_curlevel--; uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index a8f6a928fdc2..15d24f104078 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -389,6 +389,12 @@ zio_decompress(zio_t *zio, abd_t *data, uint64_t size) zio->io_abd, tmp, zio->io_size, size); abd_return_buf_copy(data, tmp, size); +#ifdef _KERNEL + printk(KERN_DEBUG "HERE1: %d %d\n", zio_injection_enabled, ret); +#endif + if (zio_injection_enabled && ret == 0) + ret = zio_handle_fault_injection(zio, EINVAL); + if (ret != 0) zio->io_error = SET_ERROR(EIO); } diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c index 971e8de8b5a0..f5cbc3e8218a 100644 --- a/module/zfs/zio_compress.c +++ b/module/zfs/zio_compress.c @@ -28,7 +28,7 @@ */ /* - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. */ #include @@ -37,6 +37,12 @@ #include #include +/* + * If nonzero, every 1/X decompression attempts will fail, simulating + * an undetected memory error. + */ +unsigned long zio_decompress_fail_fraction = 0; + /* * Compression vectors. */ @@ -148,5 +154,15 @@ zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len); abd_return_buf(src, tmp, s_len); + /* + * Decompression shouldn't fail, because we've already verifyied + * the checksum. However, for extra protection (e.g. against bitflips + * in non-ECC RAM), we handle this error (and test it). + */ + ASSERT0(ret); + if (zio_decompress_fail_fraction != 0 && + spa_get_random(zio_decompress_fail_fraction) == 0) + ret = SET_ERROR(EINVAL); + return (ret); } diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 5e0dd0832e25..9a93b8d56136 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -523,7 +523,7 @@ tags = ['functional', 'exec'] [tests/functional/fault] tests = ['auto_online_001_pos', 'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_ashift', 'auto_spare_multiple', - 'scrub_after_resilver', 'decrypt_fault'] + 'scrub_after_resilver', 'decrypt_fault', 'decompress_fault'] tags = ['functional', 'fault'] [tests/functional/features/async_destroy] diff --git a/tests/zfs-tests/tests/functional/fault/Makefile.am b/tests/zfs-tests/tests/functional/fault/Makefile.am index 1153ad8d660b..285e331a1a1d 100644 --- a/tests/zfs-tests/tests/functional/fault/Makefile.am +++ b/tests/zfs-tests/tests/functional/fault/Makefile.am @@ -9,6 +9,7 @@ dist_pkgdata_SCRIPTS = \ auto_spare_ashift.ksh \ auto_spare_multiple.ksh \ decrypt_fault.ksh \ + decompress_fault.ksh \ scrub_after_resilver.ksh dist_pkgdata_DATA = \ diff --git a/tests/zfs-tests/tests/functional/fault/decompress_fault.ksh b/tests/zfs-tests/tests/functional/fault/decompress_fault.ksh new file mode 100755 index 000000000000..051fac6e0ccb --- /dev/null +++ b/tests/zfs-tests/tests/functional/fault/decompress_fault.ksh @@ -0,0 +1,55 @@ +#!/bin/ksh -p +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2018 by Datto Inc. +# All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/fault/fault.cfg + +# +# DESCRIPTION: +# Test that injected decompression errors are handled correctly. +# +# STRATEGY: +# 1. Create an compressed dataset with a test file +# 2. Inject decompression errors on the file 20% of the time +# 3. Read the file to confirm that errors are handled correctly +# 4. Confirm that the decompression injection was added to the ZED logs +# + +log_assert "Testing that injected decompression errors are handled correctly" + +function cleanup +{ + log_must eval "echo 1 > /sys/module/zfs/parameters/zfs_compressed_arc_enabled" + log_must zinject -c all + default_cleanup_noexit +} + +log_onexit cleanup + +default_mirror_setup_noexit $DISK1 $DISK2 +log_must eval "echo 0 > /sys/module/zfs/parameters/zfs_compressed_arc_enabled" +log_must zfs create -o compression=on $TESTPOOL/fs +mntpt=$(get_prop mountpoint $TESTPOOL/fs) +write_compressible $mntpt 32m 1 0 "testfile" +log_must sync +log_must zfs umount $TESTPOOL/fs +log_must zfs mount $TESTPOOL/fs +log_must zinject -a -t data -e decompress -f 20 $mntpt/testfile.0 +log_mustnot eval "cat $mntpt/testfile.0 > /dev/null" +log_must eval "zpool events $TESTPOOL | grep -q 'data'" + +log_pass "Injected decompression errors are handled correctly" diff --git a/tests/zfs-tests/tests/functional/fault/decrypt_fault.ksh b/tests/zfs-tests/tests/functional/fault/decrypt_fault.ksh index 10008f22eee3..ca698f778367 100755 --- a/tests/zfs-tests/tests/functional/fault/decrypt_fault.ksh +++ b/tests/zfs-tests/tests/functional/fault/decrypt_fault.ksh @@ -11,7 +11,7 @@ # # -# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2018 by Datto Inc. # All rights reserved. # @@ -23,7 +23,7 @@ # Test that injected decryption errors are handled correctly. # # STRATEGY: -# 1. Create an encrypted dataset with an test file +# 1. Create an encrypted dataset with a test file # 2. Inject decryption errors on the file 20% of the time # 3. Read the file to confirm that errors are handled correctly # 4. Confirm that the decryption injection was added to the ZED logs