From b908bb2b07a73df4febd4e651e6f0a43f7538465 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Wed, 25 Jan 2023 14:24:31 -0800 Subject: [PATCH 1/3] EIO caused by encryption + recursive gang Encrypted blocks can not have 3 DVAs, because they use the space of the 3rd DVA for the IV+salt. zio_write_gang_block() takes this into account, setting `gbh_copies` to no more than 2 in this case. Gang member BP's do not have the X (encrypted) bit set (nor do they have the DMU level and type fields set), because encryption is not handled at this level. The gang block is reassembled, and then encryption (and compression) are handled. To check if this gang block is encrypted, the code in zio_write_gang_block() checks `pio->io_bp`. This is normally fine, because the block that's being ganged is typically the encrypted BP. The problem is that if there is "recursive ganging", where a gang member is itself a gang block, then when zio_write_gang_block() is called to create a gang block for a gang member, `pio->io_bp` is the gang member's BP, which doesn't have the X bit set, so the number of DVA's is not restricted to 2. It should instead be looking at the "gang leader", i.e. the top-level gang block, to determine how many DVA's can be used, to avoid a "NDVA's inversion" (where a child has more DVA's than its parent). gang leader BP: X (encrypted) bit set, 2 DVA's, IV+salt in 3rd DVA's space: ``` DVA[0]=<1:...:100400> DVA[1]=<0:...:100400> salt=... iv=... [L0 ZFS plain file] fletcher4 uncompressed encrypted LE gang unique double size=100000L/100000P birth=... fill=1 cksum=... ``` leader's GBH contains a BP with gang bit set and 3 DVA's: ``` DVA[0]=<1:...:55600> DVA[1]=<0:...:55600> [L0 unallocated] fletcher4 uncompressed unencrypted LE contiguous unique double size=55600L/55600P birth=... fill=0 cksum=... DVA[0]=<1:...:55600> DVA[1]=<0:...:55600> [L0 unallocated] fletcher4 uncompressed unencrypted LE contiguous unique double size=55600L/55600P birth=... fill=0 cksum=... 
DVA[0]=<1:...:55600> DVA[1]=<0:...:55600> DVA[2]=<1:...:200> [L0 unallocated] fletcher4 uncompressed unencrypted LE gang unique double size=55400L/55400P birth=... fill=0 cksum=... ``` On nondebug bits, having the 3rd DVA in the gang block works for the most part, because it's true that all 3 DVA's are available in the gang member BP (in the GBH). However, for accounting purposes, gang block DVA's ASIZE include all the space allocated below them, i.e. the 512-byte gang block header (GBH) as well as the gang members below that. We see that above where the gang leader BP is 1MB logical (and after compression: 0x`100000P`), but the ASIZE of each DVA is 2 sectors (1KB) more than 1MB (0x`100400`). Since there are 3 copies of a block below it, we increment the ASIZE of the 3rd DVA of the gang leader by the space used by the 3rd DVA of the child (1 sector, in this case). But there isn't really a 3rd DVA of the parent; the salt is stored in place of the 3rd DVA's ASIZE. So when zio_write_gang_member_ready() increments the parent's BP's `DVA[2]`'s ASIZE, it's actually incrementing the parent's salt. When we later try to read the encrypted recursively-ganged block, the salt doesn't match what we used to write it, so MAC verification fails and we get an EIO. ``` zio_encrypt(): encrypted 515/2/0/403 salt: 25 25 bb 9d ad d6 cd 89 zio_decrypt(): decrypting 515/2/0/403 salt: 26 25 bb 9d ad d6 cd 89 ``` This commit addresses the problem by not increasing the number of copies of the GBH beyond 2 (even for non-encrypted blocks). This simplifies the logic while maintaining the ability to traverse all metadata (including gang blocks) even if one copy is lost. (Note that 3 copies of the GBH will still be created if requested, e.g. for `copies=3` or MOS blocks.) Additionally, the code that increments the parent's DVA's ASIZE is made to check the parent DVA's NDVAS even on nondebug bits. 
So if there's a similar bug in the future, it will cause a panic when trying to write, rather than corrupting the parent BP and causing an error when reading. Signed-off-by: Matthew Ahrens Caused-by: #14356 Closes #14413 --- module/zfs/zio.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 5d7ed6d582a2..d888a584a93c 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2778,7 +2778,7 @@ zio_write_gang_member_ready(zio_t *zio) ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); - ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); + VERIFY3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { @@ -2816,18 +2816,20 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) uint64_t resid = pio->io_size; uint64_t lsize; int copies = gio->io_prop.zp_copies; - int gbh_copies; zio_prop_t zp; int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); /* - * encrypted blocks need DVA[2] free so encrypted gang headers can't - * have a third copy. + * If one copy was requested, store 2 copies of the GBH, so that we + * can still traverse all the data (e.g. to free or scrub) even if a + * block is damaged. Note that we can't store 3 copies of the GBH in + * all cases, e.g. with encryption, which uses DVA[2] for the IV+salt. 
*/ - gbh_copies = MIN(copies + 1, spa_max_replication(spa)); - if (BP_IS_ENCRYPTED(bp) && gbh_copies >= SPA_DVAS_PER_BP) - gbh_copies = SPA_DVAS_PER_BP - 1; + int gbh_copies = copies; + if (gbh_copies == 1) { + gbh_copies = MIN(2, spa_max_replication(spa)); + } int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { From a09c459b6e8c96244bba1f9b28eaa206fd88a39a Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 30 Jan 2023 13:35:16 -0800 Subject: [PATCH 2/3] Add test case Signed-off-by: Brian Behlendorf --- tests/runfiles/common.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../functional/no_space/enospc_ganging.ksh | 86 +++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) create mode 100755 tests/zfs-tests/tests/functional/no_space/enospc_ganging.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 005c539fc89d..3d55eff82c3d 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -704,7 +704,7 @@ tags = ['functional', 'nestedfs'] [tests/functional/no_space] tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos', - 'enospc_df', 'enospc_rm'] + 'enospc_df', 'enospc_ganging.ksh', 'enospc_rm'] tags = ['functional', 'no_space'] [tests/functional/nopwrite] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index bbe94f9177ae..ad2ec4670556 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1539,6 +1539,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/no_space/enospc_002_pos.ksh \ functional/no_space/enospc_003_pos.ksh \ functional/no_space/enospc_df.ksh \ + functional/no_space/enospc_ganging.ksh \ functional/no_space/enospc_rm.ksh \ functional/no_space/setup.ksh \ functional/online_offline/cleanup.ksh \ diff --git a/tests/zfs-tests/tests/functional/no_space/enospc_ganging.ksh b/tests/zfs-tests/tests/functional/no_space/enospc_ganging.ksh new file mode 100755 
index 000000000000..1d35fba5dbfa --- /dev/null +++ b/tests/zfs-tests/tests/functional/no_space/enospc_ganging.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Exercise gang block IO paths for non-encrypted and encrypted datasets. +# + +verify_runnable "both" +log_assert "Verify IO when file system is full and ganging." + +function cleanup +{ + log_must set_tunable64 METASLAB_FORCE_GANGING $metaslab_force_ganging + default_cleanup_noexit +} + +log_onexit cleanup + +default_setup_noexit $DISKS + +typeset metaslab_force_ganging=$(get_tunable METASLAB_FORCE_GANGING) +shift=$(random_int_between 15 17) +log_must set_tunable64 METASLAB_FORCE_GANGING $((2**$shift)) + +keyfile=/$TESTPOOL/keyencfods +log_must eval "echo 'password' > $keyfile" +bs=1024k +count=512 + +log_must dd if=/dev/urandom of=$TESTDIR/data bs=$bs count=$count +data_checksum=$(sha256digest $TESTDIR/data) + +# Test common large block configuration. +log_must zfs create -o recordsize=1m -o primarycache=metadata $TESTPOOL/gang +mntpnt=$(get_prop mountpoint $TESTPOOL/gang) + +log_must dd if=$TESTDIR/data of=$mntpnt/file bs=$bs count=$count +sync_pool $TESTPOOL +log_must dd if=$mntpnt/file of=$TESTDIR/out bs=$bs count=$count +out_checksum=$(sha256digest $TESTDIR/out) + +if [[ "$data_checksum" != "$out_checksum" ]]; then + log_fail "checksum mismatch ($data_checksum != $out_checksum)" +fi + +log_must rm -f $TESTDIR/out +log_must zfs destroy $TESTPOOL/gang + +# Test common large block configuration with encryption. 
+log_must zfs create \ + -o recordsize=1m \ + -o primarycache=metadata \ + -o compression=off \ + -o encryption=on \ + -o keyformat=passphrase \ + -o keylocation=file://$keyfile \ + -o copies=2 \ + $TESTPOOL/gang +mntpnt=$(get_prop mountpoint $TESTPOOL/gang) + +log_must dd if=$TESTDIR/data of=$mntpnt/file bs=$bs count=$count +sync_pool $TESTPOOL +log_must dd if=$mntpnt/file of=$TESTDIR/out bs=$bs count=$count +out_checksum=$(sha256digest $TESTDIR/out) + +if [[ "$data_checksum" != "$out_checksum" ]]; then + log_fail "checksum mismatch ($data_checksum != $out_checksum)" +fi + +log_must rm -f $TESTDIR/out +log_must zfs destroy $TESTPOOL/gang + +log_pass "Verified IO when file system is full and ganging." From 2c8160a6498da25909a8fa53c5e21abf1bbe6da5 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Tue, 31 Jan 2023 12:33:51 -0800 Subject: [PATCH 3/3] Update tests/runfiles/common.run Co-authored-by: Brian Behlendorf --- tests/runfiles/common.run | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 3d55eff82c3d..7a7cf927c77e 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -704,7 +704,7 @@ tags = ['functional', 'nestedfs'] [tests/functional/no_space] tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos', - 'enospc_df', 'enospc_ganging.ksh', 'enospc_rm'] + 'enospc_df', 'enospc_ganging', 'enospc_rm'] tags = ['functional', 'no_space'] [tests/functional/nopwrite]