From 480d809703c55f54f2ea8b69d22ea36346cfd5e2 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Thu, 16 Mar 2023 18:00:14 +0100 Subject: [PATCH 001/180] Refine some details for the github actions update Set the retention-days variable to 14 days for these artifacts: - the zloop error logs - the zloop vdev files - the compiled modules Add the abality to re-run some part of the functional testings. Fix some comments and remove the deleting of the modules artifact. Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Signed-off-by: Tino Reichardt Closes #14637 --- .../workflows/scripts/setup-dependencies.sh | 2 +- .github/workflows/zfs-linux-tests.yml | 2 ++ .github/workflows/zfs-linux.yml | 29 +++++++------------ 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/.github/workflows/scripts/setup-dependencies.sh b/.github/workflows/scripts/setup-dependencies.sh index c788f0afab9f..440d5e8e5ac9 100755 --- a/.github/workflows/scripts/setup-dependencies.sh +++ b/.github/workflows/scripts/setup-dependencies.sh @@ -37,7 +37,7 @@ function mod_install() { fi echo "::group::Install and load modules" - # delete kernel-shipped zfs modules, be sure about correct modules + # don't use kernel-shipped zfs modules sudo sed -i.bak 's/updates/extra updates/' /etc/depmod.d/ubuntu.conf sudo apt-get install --fix-missing ./*.deb diff --git a/.github/workflows/zfs-linux-tests.yml b/.github/workflows/zfs-linux-tests.yml index f72ff7f5a73f..c4fe930d092c 100644 --- a/.github/workflows/zfs-linux-tests.yml +++ b/.github/workflows/zfs-linux-tests.yml @@ -41,6 +41,7 @@ jobs: path: | /var/tmp/zloop/*/ !/var/tmp/zloop/*/vdev/ + retention-days: 14 if-no-files-found: ignore - uses: actions/upload-artifact@v3 if: failure() @@ -48,6 +49,7 @@ jobs: name: Zpool-files-${{ inputs.os }} path: | /var/tmp/zloop/*/vdev/ + retention-days: 14 if-no-files-found: ignore sanity: diff --git a/.github/workflows/zfs-linux.yml b/.github/workflows/zfs-linux.yml index 4681bea50210..be3908deb948 100644 --- a/.github/workflows/zfs-linux.yml +++ b/.github/workflows/zfs-linux.yml @@ -20,17 +20,12 @@ jobs: - name: Build modules run: .github/workflows/scripts/setup-dependencies.sh build - name: Prepare modules upload - run: tar czf modules-${{ matrix.os }}.tgz *.deb .github scripts/zfs-tests-color.sh tests/test-runner tests/ImageOS.txt + run: tar czf modules-${{ matrix.os }}.tgz *.deb .github tests/test-runner tests/ImageOS.txt - uses: actions/upload-artifact@v3 with: name: modules-${{ matrix.os }} path: modules-${{ matrix.os }}.tgz - - name: Prepare scripts upload - run: tar czf scripts.tgz .github tests/test-runner - - uses: actions/upload-artifact@v3 - with: - name: scripts - path: scripts.tgz + retention-days: 14 testings: name: Testing @@ -52,20 +47,18 @@ jobs: - uses: actions/download-artifact@v3 - name: Generating summary run: | - tar xzf scripts/scripts.tgz .github tests + tar xzf modules-22.04/modules-22.04.tgz .github tests .github/workflows/scripts/generate-summary.sh # up to 4 steps, each can have 1 MiB output (for debugging log files) - - run: .github/workflows/scripts/generate-summary.sh 1 - - run: .github/workflows/scripts/generate-summary.sh 2 - - run: .github/workflows/scripts/generate-summary.sh 3 - - run: .github/workflows/scripts/generate-summary.sh 4 + - name: Summary for errors #1 + run: .github/workflows/scripts/generate-summary.sh 1 + - name: Summary for errors #2 + run: .github/workflows/scripts/generate-summary.sh 2 + - name: Summary for errors #3 + run: .github/workflows/scripts/generate-summary.sh 3 + - 
name: Summary for errors #4 + run: .github/workflows/scripts/generate-summary.sh 4 - uses: actions/upload-artifact@v3 with: name: Summary Files path: Summary/ - - uses: geekyeggo/delete-artifact@v2 - with: - name: modules-20.04 - - uses: geekyeggo/delete-artifact@v2 - with: - name: modules-22.04 From 60cfd3bbc22cd51a2e7e9c8341d9909cdb5dac85 Mon Sep 17 00:00:00 2001 From: naivekun Date: Fri, 17 Mar 2023 02:54:10 +0800 Subject: [PATCH 002/180] QAT: Fix uninitialized seed in QAT compression CpaDcRqResults have to be initialized with checksum=1 for adler32. Otherwise when error CPA_DC_OVERFLOW occurred, the next compress operation will continue on previously part-compressed data, and write invalid checksum data. When zfs decompress the compressed data, a invalid checksum will occurred and lead to #14463 Reviewed-by: Tino Reichardt Reviewed-by: Weigang Li Reviewed-by: Chengfei Zhu Signed-off-by: naivekun Closes #14632 Closes #14463 --- module/os/linux/zfs/qat_compress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/linux/zfs/qat_compress.c b/module/os/linux/zfs/qat_compress.c index 7088f6bd1c6f..07d5d34dae33 100644 --- a/module/os/linux/zfs/qat_compress.c +++ b/module/os/linux/zfs/qat_compress.c @@ -247,7 +247,7 @@ qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, Cpa8U *buffer_meta_src = NULL; Cpa8U *buffer_meta_dst = NULL; Cpa32U buffer_meta_size = 0; - CpaDcRqResults dc_results; + CpaDcRqResults dc_results = {.checksum = 1}; CpaStatus status = CPA_STATUS_FAIL; Cpa32U hdr_sz = 0; Cpa32U compressed_sz; From fa468025859f4d73ee74ece7f2f8c13ad6e5aa73 Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Thu, 16 Mar 2023 17:27:49 -0400 Subject: [PATCH 003/180] Fix possible bad bit shift in dnode_next_offset_level() 031d7c2fe6afaa78943bd0a563b91fc84ace42d7 did not handle reverse iteration, such that the original issue theoretically could still occur. Note that contrary to the claim in the ZFS disk format specification that a maximum of 6 levels are possible, 9 levels are possible with recordsize=512 and and indirect block size of 16KB. In this unusual configuration, span will be 65. The maximum size of span at 70 can be reached at recordsize=16K and an indirect blocksize of 16KB. When we are at this indirection level and are traversing backward, the minimum value is start, but we cannot calculate that with 64-bit arithmetic, so we avoid the calculation and instead rely on the earlier statement that did `*offset = start;`. Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Reported-by: Coverity (CID-1466214) Closes #14618 --- module/zfs/dnode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index d334386e0a9c..367bfaa80726 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -2597,8 +2597,9 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, if (inc < 0) { /* traversing backwards; position offset at the end */ - ASSERT3U(*offset, <=, start); - *offset = MIN(*offset + (1ULL << span) - 1, start); + if (span < 8 * sizeof (*offset)) + *offset = MIN(*offset + (1ULL << span) - 1, + start); } else if (*offset < start) { *offset = start; } From d520f6434247fcfe9e37a117267eefb71276e255 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 17 Mar 2023 20:31:08 -0400 Subject: [PATCH 004/180] FreeBSD: Remove extra arc_reduce_target_size() call Remove arc_reduce_target_size() call from arc_prune_task(). 
The idea of arc_prune_task() is to remove external references on ARC
metadata, such as vnodes. Since arc_prune_async() is called only from
ARC itself, it makes no sense to create a parasitic loop between ARC
eviction and the pruning, threatening to drop ARC to its minimum. I
can't guess why it was added as part of the FreeBSD to OpenZFS
integration.

Reviewed-by: Brian Behlendorf
Reviewed-by: Brian Atkinson
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #14639
---
 module/os/freebsd/zfs/arc_os.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c
index a2ff0f386a9d..12f16edb1e2b 100644
--- a/module/os/freebsd/zfs/arc_os.c
+++ b/module/os/freebsd/zfs/arc_os.c
@@ -140,8 +140,6 @@ arc_prune_task(void *arg)
 {
 	uint64_t nr_scan = (uintptr_t)arg;
 
-	arc_reduce_target_size(ptob(nr_scan));
-
 #ifndef __ILP32__
 	if (nr_scan > INT_MAX)
 		nr_scan = INT_MAX;

From 0f9e7354145ca3196f77b18b3d02858121ece31c Mon Sep 17 00:00:00 2001
From: Tino Reichardt
Date: Wed, 22 Mar 2023 16:39:48 +0100
Subject: [PATCH 005/180] Remove unused constant EdonR256_BLOCK_BITSIZE

Reviewed-by: Brian Behlendorf
Reviewed-by: Brian Atkinson
Reviewed-by: George Melikov
Signed-off-by: Tino Reichardt
Closes #14650
---
 include/sys/edonr.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/sys/edonr.h b/include/sys/edonr.h
index 4bdfa249b9bf..b19b5eb42c29 100644
--- a/include/sys/edonr.h
+++ b/include/sys/edonr.h
@@ -50,8 +50,6 @@ extern "C" {
 /* Specific algorithm definitions */
 #define	EdonR512_DIGEST_SIZE	64
 #define	EdonR512_BLOCK_SIZE	128
-
-#define	EdonR256_BLOCK_BITSIZE	512
 #define	EdonR512_BLOCK_BITSIZE	1024
 
 typedef struct {

From 1eca40f3ade7ffc1847c66dd5cad67c94ba8fd35 Mon Sep 17 00:00:00 2001
From: Timothy Day
Date: Wed, 22 Mar 2023 12:22:52 -0400
Subject: [PATCH 006/180] Fix kmodtool for packaging mainline Linux

kmodtool currently incorrectly identifies official RHEL kernels, as
opposed to custom kernels. This can cause the OpenZFS kmod RPM build
to break. The issue can be reproduced by building a set of mainline
Linux RPMs, installing them, and then attempting to build the OpenZFS
kmod package against them.
Reviewed-by: Brian Behlendorf Signed-off-by: Timothy Day Closes #14617 --- scripts/kmodtool | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/scripts/kmodtool b/scripts/kmodtool index 6eea8fe50f31..0d6af2c7f1d1 100755 --- a/scripts/kmodtool +++ b/scripts/kmodtool @@ -333,36 +333,36 @@ print_customrpmtemplate () { for kernel in ${1} do - if [[ -e "${buildroot}/usr/src/kernels/${kernel}" ]] ; then - # this looks like a Fedora/RH kernel -- print a normal template (which includes the proper BR) and be happy :) - kernel_versions="${kernel_versions}${kernel}___${buildroot}%{_usrsrc}/kernels/${kernel} " + if [[ -e "${prefix}/lib/modules/${kernel}/build/Makefile" ]]; then + # likely a user-build-kernel with available buildfiles + # fixme: we should check if uname from Makefile is the same as ${kernel} - # parse kernel versions string and print template - local kernel_verrelarch=${kernel%%${kernels_known_variants}} - print_rpmtemplate_per_kmodpkg --redhat ${kernel} ${kernel##${kernel_verrelarch}} + kernel_versions="${kernel_versions}${kernel}___${prefix}/lib/modules/${kernel}/build/ " + print_rpmtemplate_per_kmodpkg --custom "${kernel}" # create development package if [[ -n "${devel}" ]]; then # create devel package including common headers - print_rpmtemplate_kmoddevelpkg --redhat ${kernel} ${kernel##${kernel_verrelarch}} + print_rpmtemplate_kmoddevelpkg --custom "${kernel}" # create devel package - print_rpmtemplate_per_kmoddevelpkg --redhat ${kernel} ${kernel##${kernel_verrelarch}} + print_rpmtemplate_per_kmoddevelpkg --custom "${kernel}" fi - elif [[ -e "${prefix}/lib/modules/${kernel}/build/Makefile" ]]; then - # likely a user-build-kernel with available buildfiles - # fixme: we should check if uname from Makefile is the same as ${kernel} + elif [[ -e "${buildroot}/usr/src/kernels/${kernel}" ]]; then + # this looks like a Fedora/RH kernel -- print a normal template (which includes the proper BR) and be happy :) + kernel_versions="${kernel_versions}${kernel}___${buildroot}%{_usrsrc}/kernels/${kernel} " - kernel_versions="${kernel_versions}${kernel}___${prefix}/lib/modules/${kernel}/build/ " - print_rpmtemplate_per_kmodpkg --custom "${kernel}" + # parse kernel versions string and print template + local kernel_verrelarch=${kernel%%${kernels_known_variants}} + print_rpmtemplate_per_kmodpkg --redhat ${kernel} ${kernel##${kernel_verrelarch}} # create development package if [[ -n "${devel}" ]]; then # create devel package including common headers - print_rpmtemplate_kmoddevelpkg --custom "${kernel}" + print_rpmtemplate_kmoddevelpkg --redhat ${kernel} ${kernel##${kernel_verrelarch}} # create devel package - print_rpmtemplate_per_kmoddevelpkg --custom "${kernel}" + print_rpmtemplate_per_kmoddevelpkg --redhat ${kernel} ${kernel##${kernel_verrelarch}} fi else error_out 2 "Don't know how to handle ${kernel} -- ${prefix}/lib/modules/${kernel}/build/Makefile not found" From 9fa007d35dd3ab231034e766f643510c5976ffc9 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Wed, 22 Mar 2023 17:24:41 +0100 Subject: [PATCH 007/180] Fix build on FreeBSD Constify some variables after d1807f168edd09ca26a5a0c6b570686b982808ad. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Tino Reichardt Reviewed-by: Richard Yao Signed-off-by: Pawel Jakub Dawidek Closes #14656 --- module/os/freebsd/zfs/spa_os.c | 2 +- module/os/freebsd/zfs/vdev_geom.c | 2 +- module/os/freebsd/zfs/zfs_ioctl_os.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/module/os/freebsd/zfs/spa_os.c b/module/os/freebsd/zfs/spa_os.c index 449c1624817e..1b9f1a4ec9dc 100644 --- a/module/os/freebsd/zfs/spa_os.c +++ b/module/os/freebsd/zfs/spa_os.c @@ -184,7 +184,7 @@ spa_import_rootpool(const char *name, bool checkpointrewind) spa_t *spa; vdev_t *rvd; nvlist_t *config, *nvtop; - char *pname; + const char *pname; int error; /* diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c index fef6a1b88e36..196d67b4b595 100644 --- a/module/os/freebsd/zfs/vdev_geom.c +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -543,7 +543,7 @@ process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, uint64_t pool_guid; uint64_t vdev_guid; uint64_t id, txg, known_txg; - char *pname; + const char *pname; if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || strcmp(pname, name) != 0) diff --git a/module/os/freebsd/zfs/zfs_ioctl_os.c b/module/os/freebsd/zfs/zfs_ioctl_os.c index 7f7e2b72c51a..8f44cced5d95 100644 --- a/module/os/freebsd/zfs/zfs_ioctl_os.c +++ b/module/os/freebsd/zfs/zfs_ioctl_os.c @@ -99,7 +99,7 @@ zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) char name[MAXNAMELEN]; spa_t *spa; vdev_t *vd; - char *command; + const char *command; uint64_t pool_guid; uint64_t vdev_guid; int error; From 5b5f518687551b5e245d7515d0c81b174c47acfb Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 25 Mar 2023 04:14:39 +1100 Subject: [PATCH 008/180] man: add ZIO_STAGE_BRT_FREE to zpool-events And bump all the values after it, matching the header update in 67a1b037. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Atkinson Signed-off-by: Rob Norris Closes #14665 --- man/man8/zpool-events.8 | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index 1b79149b52e2..0ba93e4166e7 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -406,28 +406,30 @@ ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W--- ZIO_STAGE_NOP_WRITE:0x00000100:-W--- -ZIO_STAGE_DDT_READ_START:0x00000200:R---- -ZIO_STAGE_DDT_READ_DONE:0x00000400:R---- -ZIO_STAGE_DDT_WRITE:0x00000800:-W--- -ZIO_STAGE_DDT_FREE:0x00001000:--F-- +ZIO_STAGE_BRT_FREE:0x00000200:--F-- -ZIO_STAGE_GANG_ASSEMBLE:0x00002000:RWFC- -ZIO_STAGE_GANG_ISSUE:0x00004000:RWFC- +ZIO_STAGE_DDT_READ_START:0x00000400:R---- +ZIO_STAGE_DDT_READ_DONE:0x00000800:R---- +ZIO_STAGE_DDT_WRITE:0x00001000:-W--- +ZIO_STAGE_DDT_FREE:0x00002000:--F-- -ZIO_STAGE_DVA_THROTTLE:0x00008000:-W--- -ZIO_STAGE_DVA_ALLOCATE:0x00010000:-W--- -ZIO_STAGE_DVA_FREE:0x00020000:--F-- -ZIO_STAGE_DVA_CLAIM:0x00040000:---C- +ZIO_STAGE_GANG_ASSEMBLE:0x00004000:RWFC- +ZIO_STAGE_GANG_ISSUE:0x00008000:RWFC- -ZIO_STAGE_READY:0x00080000:RWFCI +ZIO_STAGE_DVA_THROTTLE:0x00010000:-W--- +ZIO_STAGE_DVA_ALLOCATE:0x00020000:-W--- +ZIO_STAGE_DVA_FREE:0x00040000:--F-- +ZIO_STAGE_DVA_CLAIM:0x00080000:---C- -ZIO_STAGE_VDEV_IO_START:0x00100000:RW--I -ZIO_STAGE_VDEV_IO_DONE:0x00200000:RW--I -ZIO_STAGE_VDEV_IO_ASSESS:0x00400000:RW--I +ZIO_STAGE_READY:0x00100000:RWFCI -ZIO_STAGE_CHECKSUM_VERIFY:0x00800000:R---- +ZIO_STAGE_VDEV_IO_START:0x00200000:RW--I +ZIO_STAGE_VDEV_IO_DONE:0x00400000:RW--I +ZIO_STAGE_VDEV_IO_ASSESS:0x00800000:RW--I -ZIO_STAGE_DONE:0x01000000:RWFCI +ZIO_STAGE_CHECKSUM_VERIFY:0x01000000:R---- + +ZIO_STAGE_DONE:0x02000000:RWFCI .TE . .Sh I/O FLAGS From ce0e1cc402505493a890e7fc0819e582ae686b3b Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Fri, 24 Mar 2023 18:18:35 +0100 Subject: [PATCH 009/180] Fix cloning into already dirty dbufs. Undirty the dbuf and destroy its buffer when cloning into it. 
Coverity ID: CID-1535375 Reported-by: Richard Yao Reported-by: Benjamin Coddington Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Pawel Jakub Dawidek Closes #14655 --- include/sys/dbuf.h | 1 + module/zfs/dbuf.c | 3 +-- module/zfs/dmu.c | 35 ++++++++++++++++++++++++----------- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index a06316362e57..fb26a83b1844 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -382,6 +382,7 @@ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx); +boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 80ea1bfe4197..617c850296b4 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -175,7 +175,6 @@ struct { continue; \ } -static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags); @@ -2518,7 +2517,7 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr) * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. */ -static boolean_t +boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { uint64_t txg = tx->tx_txg; diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index e6bade11c859..23b6667524cc 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2190,7 +2190,8 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, for (int i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; - bp = db->db_blkptr; + + mutex_enter(&db->db_mtx); /* * If the block is not on the disk yet, it has no BP assigned. @@ -2212,10 +2213,16 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, * The block was modified in the same * transaction group. */ + mutex_exit(&db->db_mtx); error = SET_ERROR(EAGAIN); goto out; } + } else { + bp = db->db_blkptr; } + + mutex_exit(&db->db_mtx); + if (bp == NULL) { /* * The block was created in this transaction group, @@ -2273,19 +2280,23 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); - if (db->db_state == DB_UNCACHED) { - /* - * XXX-PJD: If the dbuf is already cached, calling - * dmu_buf_will_not_fill() will panic on assertion - * (db->db_buf == NULL) in dbuf_clear_data(), - * which is called from dbuf_noread() in DB_NOFILL - * case. I'm not 100% sure this is the right thing - * to do, but it seems to work. 
- */ - dmu_buf_will_not_fill(dbuf, tx); + mutex_enter(&db->db_mtx); + + VERIFY(!dbuf_undirty(db, tx)); + ASSERT(list_head(&db->db_dirty_records) == NULL); + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; } + mutex_exit(&db->db_mtx); + + dmu_buf_will_not_fill(dbuf, tx); + + mutex_enter(&db->db_mtx); + dr = list_head(&db->db_dirty_records); + VERIFY(dr != NULL); ASSERT3U(dr->dr_txg, ==, tx->tx_txg); dl = &dr->dt.dl; dl->dr_overridden_by = *bp; @@ -2301,6 +2312,8 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, BP_PHYSICAL_BIRTH(bp); } + mutex_exit(&db->db_mtx); + /* * When data in embedded into BP there is no need to create * BRT entry as there is no data block. Just copy the BP as From d2d4f8554f62bd82fae873b68c2c122dfd959d3f Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Fri, 24 Mar 2023 10:20:07 -0700 Subject: [PATCH 010/180] Fix prefetching of indirect blocks while destroying When traversing a tree of block pointers (e.g. for `zfs destroy ` or `zfs send`), we prefetch the indirect blocks that will be needed, in `traverse_prefetch_metadata()`. In the case of `zfs destroy `, we do a little traversing each txg, and resume the traversal the next txg. So the indirect blocks that will be needed, and thus are candidates for prefetching, does not include blocks that are before the resume point. The problem is that the logic for determining if the indirect blocks are before the resume point is incorrect, causing the (up to 1024) L1 indirect blocks that are inside the first L2 to not be prefetched. In practice, if we are able to read many more than 1024 blocks per txg, then this will be inconsequential. But if i/o latency is more than a few milliseconds, almost no L1's will be prefetched, so they will be read serially, and thus the destroying will be very slow. This can be observed as `zpool get freeing` decreasing very slowly. Specifically: When we first examine the L2 that contains the block we'll be resuming from, we have not yet resumed, so `td_resume` is nonzero. At this point, all calls to `traverse_prefetch_metadata()` will fail, even if the L1 in question is after the resume point. It isn't until the callback is issued for the resume point that we zero out `td_resume`, but by this point we've already attempted and failed to prefetch everything under this L2 indirect block. This commit addresses the issue by reusing the existing `resume_skip_check()` to determine if the L1's bookmark is before or after the resume point. To do so, this function is made non-mutating (the caller now zeros `td_resume`). Note, this bug likely predates (was not introduced by) #11803. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens Closes #14603 --- module/zfs/dmu_traverse.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c index 244b9b4cbcbc..809f7f6165f9 100644 --- a/module/zfs/dmu_traverse.c +++ b/module/zfs/dmu_traverse.c @@ -154,10 +154,10 @@ typedef enum resume_skip { * Otherwise returns RESUME_SKIP_NONE. */ static resume_skip_t -resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, +resume_skip_check(const traverse_data_t *td, const dnode_phys_t *dnp, const zbookmark_phys_t *zb) { - if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { + if (td->td_resume != NULL) { /* * If we already visited this bp & everything below, * don't bother doing it again. 
@@ -165,12 +165,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) return (RESUME_SKIP_ALL); - /* - * If we found the block we're trying to resume from, zero - * the bookmark out to indicate that we have resumed. - */ if (memcmp(zb, td->td_resume, sizeof (*zb)) == 0) { - memset(td->td_resume, 0, sizeof (*zb)); if (td->td_flags & TRAVERSE_POST) return (RESUME_SKIP_CHILDREN); } @@ -182,7 +177,7 @@ resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, * Returns B_TRUE, if prefetch read is issued, otherwise B_FALSE. */ static boolean_t -traverse_prefetch_metadata(traverse_data_t *td, +traverse_prefetch_metadata(traverse_data_t *td, const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | @@ -192,11 +187,10 @@ traverse_prefetch_metadata(traverse_data_t *td, if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) return (B_FALSE); /* - * If we are in the process of resuming, don't prefetch, because - * some children will not be needed (and in fact may have already - * been freed). + * If this bp is before the resume point, it may have already been + * freed. */ - if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) + if (resume_skip_check(td, dnp, zb) != RESUME_SKIP_NONE) return (B_FALSE); if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) return (B_FALSE); @@ -344,7 +338,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + pidx); - if (traverse_prefetch_metadata(td, + if (traverse_prefetch_metadata(td, dnp, &((blkptr_t *)buf->b_data)[pidx], czb) == B_TRUE) { prefetched++; @@ -506,12 +500,12 @@ prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); - traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb); + traverse_prefetch_metadata(td, dnp, &dnp->dn_blkptr[j], &czb); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); - traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb); + traverse_prefetch_metadata(td, dnp, DN_SPILL_BLKPTR(dnp), &czb); } } From 80f2cdcd67ccae69d26426063e6689278c45d147 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Mon, 13 Mar 2023 23:23:04 +0100 Subject: [PATCH 011/180] Add more ANSI colors to libzfs Reviewed-by: WHR Reviewed-by: Brian Behlendorf Signed-off-by: Ethan Coe-Renner Signed-off-by: Tino Reichardt Closes #14621 --- include/libzutil.h | 6 ++++++ lib/libzfs/libzfs_diff.c | 6 +++--- lib/libzfs/libzfs_util.c | 13 ++++++++----- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/include/libzutil.h b/include/libzutil.h index 948ac08cd772..465e463f0c1f 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -169,10 +169,16 @@ struct zfs_cmd; /* * List of colors to use */ +#define ANSI_BLACK "\033[0;30m" #define ANSI_RED "\033[0;31m" #define ANSI_GREEN "\033[0;32m" #define ANSI_YELLOW "\033[0;33m" #define ANSI_BLUE "\033[0;34m" +#define ANSI_BOLD_BLUE "\033[1;34m" /* light blue */ +#define ANSI_MAGENTA "\033[0;35m" +#define ANSI_CYAN "\033[0;36m" +#define ANSI_GRAY "\033[0;37m" + #define ANSI_RESET "\033[0m" #define ANSI_BOLD "\033[1m" diff --git a/lib/libzfs/libzfs_diff.c b/lib/libzfs/libzfs_diff.c index 1330e7c3052a..da2b26ef99ce 100644 --- a/lib/libzfs/libzfs_diff.c +++ b/lib/libzfs/libzfs_diff.c @@ -55,10 +55,10 @@ 
#define ZDIFF_REMOVED '-' #define ZDIFF_RENAMED "R" -#define ZDIFF_ADDED_COLOR ANSI_GREEN +#define ZDIFF_ADDED_COLOR ANSI_GREEN #define ZDIFF_MODIFIED_COLOR ANSI_YELLOW -#define ZDIFF_REMOVED_COLOR ANSI_RED -#define ZDIFF_RENAMED_COLOR ANSI_BLUE +#define ZDIFF_REMOVED_COLOR ANSI_RED +#define ZDIFF_RENAMED_COLOR ANSI_BOLD_BLUE /* * Given a {dsname, object id}, get the object path diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index b0d7b2afaf7b..60695f8a63f4 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -2011,10 +2011,11 @@ use_color(void) } /* - * color_start() and color_end() are used for when you want to colorize a block - * of text. For example: + * The functions color_start() and color_end() are used for when you want + * to colorize a block of text. * - * color_start(ANSI_RED_FG) + * For example: + * color_start(ANSI_RED) * printf("hello"); * printf("world"); * color_end(); @@ -2022,7 +2023,7 @@ use_color(void) void color_start(const char *color) { - if (use_color()) { + if (color && use_color()) { fputs(color, stdout); fflush(stdout); } @@ -2038,7 +2039,9 @@ color_end(void) } -/* printf() with a color. If color is NULL, then do a normal printf. */ +/* + * printf() with a color. If color is NULL, then do a normal printf. + */ int printf_color(const char *color, const char *format, ...) { From 7bde396aa236712ac3eab9867e703f4c79fb1ac4 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Mon, 13 Mar 2023 23:30:09 +0100 Subject: [PATCH 012/180] Colorize zpool iostat output Use a bold header and colorize the space suffixes in iostat by order of magnitude like this: - K is green - M is yellow - G is red - T is lightblue - P is magenta - E is cyan - 0 space is colored gray Reviewed-by: WHR Reviewed-by: Brian Behlendorf Signed-off-by: Ethan Coe-Renner Signed-off-by: Tino Reichardt Closes #14621 Closes #14459 --- cmd/zpool/zpool_main.c | 35 ++++++++++++++++++++++++++++++++++- man/man8/zpool.8 | 2 ++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 3335569faa1a..d79c1608b09f 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -4221,6 +4221,8 @@ print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, unsigned int namewidth; const char *title; + color_start(ANSI_BOLD); + if (cb->cb_flags & IOS_ANYHISTO_M) { title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; } else if (cb->cb_vdevs.cb_names_count) { @@ -4254,6 +4256,8 @@ print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, if (cb->vcdl != NULL) print_cmd_columns(cb->vcdl, 1); + color_end(); + printf("\n"); } @@ -4263,6 +4267,35 @@ print_iostat_header(iostat_cbdata_t *cb) print_iostat_header_impl(cb, 0, NULL); } +/* + * Prints a size string (i.e. 120M) with the suffix ("M") colored + * by order of magnitude. Uses column_size to add padding. 
+ */ +static void +print_stat_color(char *statbuf, unsigned int column_size) +{ + fputs(" ", stdout); + if (*statbuf == '0') { + color_start(ANSI_GRAY); + fputc('0', stdout); + column_size--; + } else { + for (; *statbuf; statbuf++) { + if (*statbuf == 'K') color_start(ANSI_GREEN); + else if (*statbuf == 'M') color_start(ANSI_YELLOW); + else if (*statbuf == 'G') color_start(ANSI_RED); + else if (*statbuf == 'T') color_start(ANSI_BOLD_BLUE); + else if (*statbuf == 'P') color_start(ANSI_MAGENTA); + else if (*statbuf == 'E') color_start(ANSI_CYAN); + fputc(*statbuf, stdout); + if (--column_size <= 0) + break; + } + } + color_end(); + for (; column_size > 0; column_size--) + fputc(' ', stdout); +} /* * Display a single statistic. @@ -4278,7 +4311,7 @@ print_one_stat(uint64_t value, enum zfs_nicenum_format format, if (scripted) printf("\t%s", buf); else - printf(" %*s", column_size, buf); + print_stat_color(buf, column_size); } /* diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 48e4eb3c557b..e8eadffa6fcf 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -445,6 +445,8 @@ to dump core on exit for the purposes of running .It Sy ZFS_COLOR Use ANSI color in .Nm zpool Cm status +and +.Nm zpool Cm iostat output. .It Sy ZPOOL_IMPORT_PATH The search path for devices or files to use with the pool. From 2bd0490fafe5ebb811dee9b0eb5d5c245c7a1cbf Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Mon, 13 Mar 2023 23:31:44 +0100 Subject: [PATCH 013/180] Add colored output to zfs list Use a bold header row and colorize the AVAIL column based on the used space percentage of volume. We define these colors: - when > 80%, use yellow - when > 90%, use red Reviewed-by: WHR Reviewed-by: Brian Behlendorf Signed-off-by: Ethan Coe-Renner Signed-off-by: Tino Reichardt Closes #14621 Closes #14350 --- cmd/zfs/zfs_main.c | 30 ++++++++++++++++++++++++++++++ man/man8/zfs.8 | 2 ++ 2 files changed, 32 insertions(+) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 64a38faa178f..f918036cb9b7 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -3441,6 +3441,8 @@ print_header(list_cbdata_t *cb) boolean_t first = B_TRUE; boolean_t right_justify; + color_start(ANSI_BOLD); + for (; pl != NULL; pl = pl->pl_next) { if (!first) { (void) printf(" "); @@ -3467,9 +3469,31 @@ print_header(list_cbdata_t *cb) (void) printf("%-*s", (int)pl->pl_width, header); } + color_end(); + (void) printf("\n"); } +/* + * Decides on the color that the avail value should be printed in. + * > 80% used = yellow + * > 90% used = red + */ +static const char * +zfs_list_avail_color(zfs_handle_t *zhp) +{ + uint64_t used = zfs_prop_get_int(zhp, ZFS_PROP_USED); + uint64_t avail = zfs_prop_get_int(zhp, ZFS_PROP_AVAILABLE); + int percentage = (int)((double)avail / MAX(avail + used, 1) * 100); + + if (percentage > 20) + return (NULL); + else if (percentage > 10) + return (ANSI_YELLOW); + else + return (ANSI_RED); +} + /* * Given a dataset and a list of fields, print out all the properties according * to the described layout. 
@@ -3532,6 +3556,9 @@ print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) right_justify = B_FALSE; } + if (pl->pl_prop == ZFS_PROP_AVAILABLE) + color_start(zfs_list_avail_color(zhp)); + /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width @@ -3543,6 +3570,9 @@ print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) (void) printf("%*s", (int)pl->pl_width, propstr); else (void) printf("%-*s", (int)pl->pl_width, propstr); + + if (pl->pl_prop == ZFS_PROP_AVAILABLE) + color_end(); } (void) putchar('\n'); diff --git a/man/man8/zfs.8 b/man/man8/zfs.8 index d12377f9b4f2..dd578cb74aac 100644 --- a/man/man8/zfs.8 +++ b/man/man8/zfs.8 @@ -740,6 +740,8 @@ command will be undone if the share is ever unshared (like via a reboot). .It Sy ZFS_COLOR Use ANSI color in .Nm zfs Cm diff +and +.Nm zfs Cm list output. .It Sy ZFS_MOUNT_HELPER Cause From 460d887c439079422b642da87a77dbb896f5e64a Mon Sep 17 00:00:00 2001 From: George Wilson Date: Fri, 24 Mar 2023 13:27:07 -0400 Subject: [PATCH 014/180] panic loop when removing slog device There is a window in the slog removal code where a panic loop could ensue if the system crashes during that operation. The original design of slog removal did not persisted any state because the removal happened synchronously. This was changed by a later commit which persisted the vdev_removing flag and exposed this bug. If a slog removal is in progress and happens to crash after persisting the vdev_removing flag to the label but before the vdev is removed from the spa config, then the pool will continue to panic on import. Here's a sample of the panic: [ 134.387411] VERIFY0(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)) failed (0 == 22) [ 134.393865] PANIC at dmu.c:1135:dmu_write() [ 134.396035] Kernel panic - not syncing: VERIFY0(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)) failed (0 == 22) [ 134.397857] CPU: 2 PID: 5914 Comm: txg_sync Kdump: loaded Tainted: P OE 5.4.0-1100-dx2023020205-b3751f8c2-azure #106 [ 134.407938] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS 090008 12/07/2018 [ 134.407938] Call Trace: [ 134.407938] dump_stack+0x57/0x6d [ 134.407938] panic+0xfb/0x2d7 [ 134.407938] spl_panic+0xcf/0x102 [spl] [ 134.407938] ? traverse_impl+0x1ca/0x420 [zfs] [ 134.407938] ? dmu_object_alloc_impl+0x3b4/0x3c0 [zfs] [ 134.407938] ? dnode_hold+0x1b/0x20 [zfs] [ 134.407938] dmu_write+0xc3/0xd0 [zfs] [ 134.407938] ? space_map_alloc+0x55/0x80 [zfs] [ 134.407938] metaslab_sync+0x61a/0x830 [zfs] [ 134.407938] ? queued_spin_unlock+0x9/0x10 [zfs] [ 134.407938] vdev_sync+0x72/0x190 [zfs] [ 134.407938] spa_sync_iterate_to_convergence+0x160/0x250 [zfs] [ 134.407938] spa_sync+0x2f7/0x670 [zfs] [ 134.407938] txg_sync_thread+0x22d/0x2d0 [zfs] [ 134.407938] ? txg_dispatch_callbacks+0xf0/0xf0 [zfs] [ 134.407938] thread_generic_wrapper+0x83/0xa0 [spl] [ 134.407938] kthread+0x104/0x140 [ 134.407938] ? kasan_check_write.constprop.0+0x10/0x10 [spl] [ 134.407938] ? kthread_park+0x90/0x90 [ 134.457802] ret_from_fork+0x1f/0x40 This change no longer persists the vdev_removing flag when removing slog devices and also cleans up some code that was added which is not used. 
Reviewed-by: Brian Behlendorf Reviewed-by: Matthew Ahrens Reviewed-by: Mark Maybee Signed-off-by: George Wilson Closes #14652 --- include/sys/vdev.h | 5 ++--- module/zfs/vdev_label.c | 32 ++++++++++++-------------------- 2 files changed, 14 insertions(+), 23 deletions(-) diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 7a7c70dc1598..d529bbcdd9a4 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -186,9 +186,8 @@ extern boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx); typedef enum vdev_config_flag { VDEV_CONFIG_SPARE = 1 << 0, VDEV_CONFIG_L2CACHE = 1 << 1, - VDEV_CONFIG_REMOVING = 1 << 2, - VDEV_CONFIG_MOS = 1 << 3, - VDEV_CONFIG_MISSING = 1 << 4 + VDEV_CONFIG_MOS = 1 << 2, + VDEV_CONFIG_MISSING = 1 << 3 } vdev_config_flag_t; extern void vdev_post_kobj_evt(vdev_t *vd); diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index d81bc29f2bc4..f61be65a2c72 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -500,7 +500,12 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, fnvlist_add_uint64(nv, ZPOOL_CONFIG_NONALLOCATING, vd->vdev_noalloc); } - if (vd->vdev_removing) { + + /* + * Slog devices are removed synchronously so don't + * persist the vdev_removing flag to the label. + */ + if (vd->vdev_removing && !vd->vdev_islog) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, vd->vdev_removing); } @@ -644,35 +649,22 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (!vd->vdev_ops->vdev_op_leaf) { nvlist_t **child; - int c, idx; + uint64_t c; ASSERT(!vd->vdev_ishole); child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); - for (c = 0, idx = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - - /* - * If we're generating an nvlist of removing - * vdevs then skip over any device which is - * not being removed. - */ - if ((flags & VDEV_CONFIG_REMOVING) && - !cvd->vdev_removing) - continue; - - child[idx++] = vdev_config_generate(spa, cvd, + for (c = 0; c < vd->vdev_children; c++) { + child[c] = vdev_config_generate(spa, vd->vdev_child[c], getstats, flags); } - if (idx) { - fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - (const nvlist_t * const *)child, idx); - } + fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + (const nvlist_t * const *)child, vd->vdev_children); - for (c = 0; c < idx; c++) + for (c = 0; c < vd->vdev_children; c++) nvlist_free(child[c]); kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); From 0ad5f4344238b548e2240a405d418f7af9290623 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Fri, 24 Mar 2023 13:29:19 -0400 Subject: [PATCH 015/180] Drop lying to the compiler in the fletcher4 code This is probably the uncontroversial part of #13631, which fixes a real problem people are having. There's still things to improve in our code after this is merged, but it should stop the breakage that people have reported, where we lie about a type always being aligned and then pass in stack objects with no alignment requirement and hope for the best. Of course, our SIMD code was written with unaligned accesses, so it doesn't care if we drop this...but some auto-vectorized code that gcc emits sure does, since we told it it can assume they're aligned. 
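To make the failure mode concrete, here is a minimal stand-alone sketch
(illustrative only, not ZFS code; the lane_t/lane_add names are made up)
of what can happen when a type promises 16-byte alignment but a caller
hands it an object that is only 8-byte aligned:

#include <stdint.h>
#include <string.h>

typedef struct {
	uint64_t v[2] __attribute__((aligned(16)));	/* the alignment "lie" */
} lane_t;

/*
 * Because of the alignment promise, gcc may auto-vectorize this with
 * aligned SIMD loads/stores (e.g. movaps/vmovdqa on x86).
 */
static void
lane_add(lane_t *dst, const lane_t *src)
{
	dst->v[0] += src->v[0];
	dst->v[1] += src->v[1];
}

int
main(void)
{
	/* A 16-byte aligned buffer, then deliberately offset by 8. */
	_Alignas(16) char buf[sizeof (lane_t) + 8];
	lane_t *bad = (lane_t *)(buf + 8);	/* only 8-byte aligned: UB */
	lane_t one = { .v = { 1, 1 } };		/* compiler aligns this one */

	memset(bad, 0, sizeof (*bad));
	lane_add(bad, &one);			/* may fault at runtime */
	return (0);
}

With -O2/-O3 auto-vectorization the aligned loads can crash even though
the equivalent scalar code appears to work, which is why dropping the
attribute is safer than trying to fix up every caller.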
Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Reviewed-by: Richard Yao Signed-off-by: Rich Ercolani Closes #14649 --- include/zfs_fletcher.h | 24 ++++------------------ module/zcommon/zfs_fletcher.c | 4 ---- module/zcommon/zfs_fletcher_aarch64_neon.c | 2 -- module/zcommon/zfs_fletcher_avx512.c | 2 -- module/zcommon/zfs_fletcher_intel.c | 2 -- module/zcommon/zfs_fletcher_sse.c | 2 -- module/zcommon/zfs_fletcher_superscalar.c | 4 ---- module/zcommon/zfs_fletcher_superscalar4.c | 4 ---- 8 files changed, 4 insertions(+), 40 deletions(-) diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h index e913a0bd7fda..ca1a092928d6 100644 --- a/include/zfs_fletcher.h +++ b/include/zfs_fletcher.h @@ -76,19 +76,19 @@ typedef struct zfs_fletcher_superscalar { } zfs_fletcher_superscalar_t; typedef struct zfs_fletcher_sse { - uint64_t v[2] __attribute__((aligned(16))); + uint64_t v[2]; } zfs_fletcher_sse_t; typedef struct zfs_fletcher_avx { - uint64_t v[4] __attribute__((aligned(32))); + uint64_t v[4]; } zfs_fletcher_avx_t; typedef struct zfs_fletcher_avx512 { - uint64_t v[8] __attribute__((aligned(64))); + uint64_t v[8]; } zfs_fletcher_avx512_t; typedef struct zfs_fletcher_aarch64_neon { - uint64_t v[2] __attribute__((aligned(16))); + uint64_t v[2]; } zfs_fletcher_aarch64_neon_t; @@ -161,20 +161,4 @@ _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_aarch64_neon_ops; } #endif -#if defined(ZFS_UBSAN_ENABLED) -#if defined(__has_attribute) -#if __has_attribute(no_sanitize_undefined) -#define ZFS_NO_SANITIZE_UNDEFINED __attribute__((no_sanitize_undefined)) -#elif __has_attribute(no_sanitize) -#define ZFS_NO_SANITIZE_UNDEFINED __attribute__((no_sanitize("undefined"))) -#else -#error "Compiler has to support attribute " - "`no_sanitize_undefined` or `no_sanitize(\"undefined\")`" - "when compiling with UBSan enabled" -#endif /* __has_attribute(no_sanitize_undefined) */ -#endif /* defined(__has_attribute) */ -#else -#define ZFS_NO_SANITIZE_UNDEFINED -#endif /* defined(ZFS_UBSAN_ENABLED) */ - #endif /* _ZFS_FLETCHER_H */ diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index eae854f3d452..1d9b1cffc0b2 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -301,21 +301,18 @@ fletcher_2_byteswap(const void *buf, uint64_t size, (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp); } -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx) { ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0); } -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) { memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t)); } -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) @@ -339,7 +336,6 @@ fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d); } -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c index cd5fe545a19d..26f2115c44bf 100644 --- a/module/zcommon/zfs_fletcher_aarch64_neon.c +++ b/module/zcommon/zfs_fletcher_aarch64_neon.c @@ -48,14 +48,12 @@ #include #include -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx) { memset(ctx->aarch64_neon, 0, 4 * sizeof (zfs_fletcher_aarch64_neon_t)); } -ZFS_NO_SANITIZE_UNDEFINED static void 
fletcher_4_aarch64_neon_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) { diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c index 81182ead2caf..95fc2b151a7d 100644 --- a/module/zcommon/zfs_fletcher_avx512.c +++ b/module/zcommon/zfs_fletcher_avx512.c @@ -35,14 +35,12 @@ #define __asm __asm__ __volatile__ #endif -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx) { memset(ctx->avx512, 0, 4 * sizeof (zfs_fletcher_avx512_t)); } -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) { diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c index 6108bda7a07c..34590a15572d 100644 --- a/module/zcommon/zfs_fletcher_intel.c +++ b/module/zcommon/zfs_fletcher_intel.c @@ -47,14 +47,12 @@ #include #include -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_avx2_init(fletcher_4_ctx_t *ctx) { memset(ctx->avx, 0, 4 * sizeof (zfs_fletcher_avx_t)); } -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) { diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c index 096472c9af5f..8ab9b9acb83b 100644 --- a/module/zcommon/zfs_fletcher_sse.c +++ b/module/zcommon/zfs_fletcher_sse.c @@ -49,14 +49,12 @@ #include #include -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_sse2_init(fletcher_4_ctx_t *ctx) { memset(ctx->sse, 0, 4 * sizeof (zfs_fletcher_sse_t)); } -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) { diff --git a/module/zcommon/zfs_fletcher_superscalar.c b/module/zcommon/zfs_fletcher_superscalar.c index 8b5b72a7b8b5..2a80816ff3ec 100644 --- a/module/zcommon/zfs_fletcher_superscalar.c +++ b/module/zcommon/zfs_fletcher_superscalar.c @@ -47,14 +47,12 @@ #include #include -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_superscalar_init(fletcher_4_ctx_t *ctx) { memset(ctx->superscalar, 0, 4 * sizeof (zfs_fletcher_superscalar_t)); } -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_superscalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) { @@ -70,7 +68,6 @@ fletcher_4_superscalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) ZIO_SET_CHECKSUM(zcp, A, B, C, D); } -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_superscalar_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) @@ -110,7 +107,6 @@ fletcher_4_superscalar_native(fletcher_4_ctx_t *ctx, ctx->superscalar[3].v[1] = d2; } -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_superscalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) diff --git a/module/zcommon/zfs_fletcher_superscalar4.c b/module/zcommon/zfs_fletcher_superscalar4.c index bef387933917..0b52bb63d003 100644 --- a/module/zcommon/zfs_fletcher_superscalar4.c +++ b/module/zcommon/zfs_fletcher_superscalar4.c @@ -47,14 +47,12 @@ #include #include -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_superscalar4_init(fletcher_4_ctx_t *ctx) { memset(ctx->superscalar, 0, 4 * sizeof (zfs_fletcher_superscalar_t)); } -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_superscalar4_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) { @@ -84,7 +82,6 @@ fletcher_4_superscalar4_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) ZIO_SET_CHECKSUM(zcp, A, B, C, D); } -ZFS_NO_SANITIZE_UNDEFINED static void fletcher_4_superscalar4_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) @@ -150,7 +147,6 @@ fletcher_4_superscalar4_native(fletcher_4_ctx_t *ctx, ctx->superscalar[3].v[3] = d4; } -ZFS_NO_SANITIZE_UNDEFINED static 
void fletcher_4_superscalar4_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) From a05263b7aa2ae38b50a159b3d2b22f3a454178ab Mon Sep 17 00:00:00 2001 From: Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Date: Fri, 24 Mar 2023 22:30:38 +0500 Subject: [PATCH 016/180] Update vdev state for spare vdev zfsd fetches new pool configuration through ZFS_IOC_POOL_STATS but it does not get updated nvlist configuration for spare vdev since the configuration is read by spa_spares->sav_config. In this commit, updating the vdev state for spare vdev that is consumed by zfsd on spare disk hotplug. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Ameer Hamza Closes #14653 --- module/zfs/spa.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 95b38f09825b..0573daa92f9d 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -5393,13 +5393,15 @@ spa_add_spares(spa_t *spa, nvlist_t *config) for (i = 0; i < nspares; i++) { guid = fnvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID); + VERIFY0(nvlist_lookup_uint64_array(spares[i], + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { - VERIFY0(nvlist_lookup_uint64_array(spares[i], - ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, - &vsc)); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; + } else { + vs->vs_state = + spa->spa_spares.sav_vdevs[i]->vdev_state; } } } From ae0b1f66c707cff09bfde54aade784a016559a34 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Mon, 27 Mar 2023 14:29:19 -0400 Subject: [PATCH 017/180] linux 6.3 compat: add another bdev_io_acct case Linux 6.3+, and backports from it (6.2.8+), changed the signatures on bdev_io_{start,end}_acct. Add a case for it. Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #14658 Closes #14668 --- config/kernel-generic_io_acct.m4 | 98 ++++++++++++------- include/os/linux/kernel/linux/blkdev_compat.h | 10 +- 2 files changed, 69 insertions(+), 39 deletions(-) diff --git a/config/kernel-generic_io_acct.m4 b/config/kernel-generic_io_acct.m4 index a8a448c6fe96..a6a109004294 100644 --- a/config/kernel-generic_io_acct.m4 +++ b/config/kernel-generic_io_acct.m4 @@ -2,7 +2,20 @@ dnl # dnl # Check for generic io accounting interface. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT], [ - ZFS_LINUX_TEST_SRC([bdev_io_acct], [ + ZFS_LINUX_TEST_SRC([bdev_io_acct_63], [ + #include + ], [ + struct block_device *bdev = NULL; + struct bio *bio = NULL; + unsigned long passed_time = 0; + unsigned long start_time; + + start_time = bdev_start_io_acct(bdev, bio_op(bio), + passed_time); + bdev_end_io_acct(bdev, bio_op(bio), bio_sectors(bio), start_time); + ]) + + ZFS_LINUX_TEST_SRC([bdev_io_acct_old], [ #include ], [ struct block_device *bdev = NULL; @@ -63,74 +76,85 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT], [ AC_DEFUN([ZFS_AC_KERNEL_GENERIC_IO_ACCT], [ dnl # - dnl # 5.19 API, + dnl # Linux 6.3, and then backports thereof, changed + dnl # the signatures on bdev_start_io_acct/bdev_end_io_acct dnl # - dnl # disk_start_io_acct() and disk_end_io_acct() have been replaced by - dnl # bdev_start_io_acct() and bdev_end_io_acct(). 
- dnl # - AC_MSG_CHECKING([whether generic bdev_*_io_acct() are available]) - ZFS_LINUX_TEST_RESULT([bdev_io_acct], [ + AC_MSG_CHECKING([whether 6.3+ bdev_*_io_acct() are available]) + ZFS_LINUX_TEST_RESULT([bdev_io_acct_63], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BDEV_IO_ACCT, 1, [bdev_*_io_acct() available]) + AC_DEFINE(HAVE_BDEV_IO_ACCT_63, 1, [bdev_*_io_acct() available]) ], [ AC_MSG_RESULT(no) dnl # - dnl # 5.12 API, + dnl # 5.19 API, dnl # - dnl # bio_start_io_acct() and bio_end_io_acct() became GPL-exported - dnl # so use disk_start_io_acct() and disk_end_io_acct() instead + dnl # disk_start_io_acct() and disk_end_io_acct() have been replaced by + dnl # bdev_start_io_acct() and bdev_end_io_acct(). dnl # - AC_MSG_CHECKING([whether generic disk_*_io_acct() are available]) - ZFS_LINUX_TEST_RESULT([disk_io_acct], [ + AC_MSG_CHECKING([whether pre-6.3 bdev_*_io_acct() are available]) + ZFS_LINUX_TEST_RESULT([bdev_io_acct_old], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_DISK_IO_ACCT, 1, [disk_*_io_acct() available]) + AC_DEFINE(HAVE_BDEV_IO_ACCT_OLD, 1, [bdev_*_io_acct() available]) ], [ AC_MSG_RESULT(no) - dnl # - dnl # 5.7 API, + dnl # 5.12 API, dnl # - dnl # Added bio_start_io_acct() and bio_end_io_acct() helpers. + dnl # bio_start_io_acct() and bio_end_io_acct() became GPL-exported + dnl # so use disk_start_io_acct() and disk_end_io_acct() instead dnl # - AC_MSG_CHECKING([whether generic bio_*_io_acct() are available]) - ZFS_LINUX_TEST_RESULT([bio_io_acct], [ + AC_MSG_CHECKING([whether generic disk_*_io_acct() are available]) + ZFS_LINUX_TEST_RESULT([disk_io_acct], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BIO_IO_ACCT, 1, [bio_*_io_acct() available]) + AC_DEFINE(HAVE_DISK_IO_ACCT, 1, [disk_*_io_acct() available]) ], [ AC_MSG_RESULT(no) dnl # - dnl # 4.14 API, + dnl # 5.7 API, dnl # - dnl # generic_start_io_acct/generic_end_io_acct now require - dnl # request_queue to be provided. No functional changes, - dnl # but preparation for inflight accounting. + dnl # Added bio_start_io_acct() and bio_end_io_acct() helpers. dnl # - AC_MSG_CHECKING([whether generic_*_io_acct wants 4 args]) - ZFS_LINUX_TEST_RESULT_SYMBOL([generic_acct_4args], - [generic_start_io_acct], [block/bio.c], [ + AC_MSG_CHECKING([whether generic bio_*_io_acct() are available]) + ZFS_LINUX_TEST_RESULT([bio_io_acct], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_GENERIC_IO_ACCT_4ARG, 1, - [generic_*_io_acct() 4 arg available]) + AC_DEFINE(HAVE_BIO_IO_ACCT, 1, [bio_*_io_acct() available]) ], [ AC_MSG_RESULT(no) dnl # - dnl # 3.19 API addition + dnl # 4.14 API, dnl # - dnl # torvalds/linux@394ffa50 allows us to increment - dnl # iostat counters without generic_make_request(). + dnl # generic_start_io_acct/generic_end_io_acct now require + dnl # request_queue to be provided. No functional changes, + dnl # but preparation for inflight accounting. dnl # - AC_MSG_CHECKING( - [whether generic_*_io_acct wants 3 args]) - ZFS_LINUX_TEST_RESULT_SYMBOL([generic_acct_3args], + AC_MSG_CHECKING([whether generic_*_io_acct wants 4 args]) + ZFS_LINUX_TEST_RESULT_SYMBOL([generic_acct_4args], [generic_start_io_acct], [block/bio.c], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_GENERIC_IO_ACCT_3ARG, 1, - [generic_*_io_acct() 3 arg available]) + AC_DEFINE(HAVE_GENERIC_IO_ACCT_4ARG, 1, + [generic_*_io_acct() 4 arg available]) ], [ AC_MSG_RESULT(no) + + dnl # + dnl # 3.19 API addition + dnl # + dnl # torvalds/linux@394ffa50 allows us to increment + dnl # iostat counters without generic_make_request(). 
+ dnl # + AC_MSG_CHECKING( + [whether generic_*_io_acct wants 3 args]) + ZFS_LINUX_TEST_RESULT_SYMBOL([generic_acct_3args], + [generic_start_io_acct], [block/bio.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GENERIC_IO_ACCT_3ARG, 1, + [generic_*_io_acct() 3 arg available]) + ], [ + AC_MSG_RESULT(no) + ]) ]) ]) ]) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index f04eb5b2593f..c7405ffab8ba 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -592,7 +592,10 @@ blk_generic_start_io_acct(struct request_queue *q __attribute__((unused)), struct gendisk *disk __attribute__((unused)), int rw __attribute__((unused)), struct bio *bio) { -#if defined(HAVE_BDEV_IO_ACCT) +#if defined(HAVE_BDEV_IO_ACCT_63) + return (bdev_start_io_acct(bio->bi_bdev, bio_op(bio), + jiffies)); +#elif defined(HAVE_BDEV_IO_ACCT_OLD) return (bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio), bio_op(bio), jiffies)); #elif defined(HAVE_DISK_IO_ACCT) @@ -618,7 +621,10 @@ blk_generic_end_io_acct(struct request_queue *q __attribute__((unused)), struct gendisk *disk __attribute__((unused)), int rw __attribute__((unused)), struct bio *bio, unsigned long start_time) { -#if defined(HAVE_BDEV_IO_ACCT) +#if defined(HAVE_BDEV_IO_ACCT_63) + bdev_end_io_acct(bio->bi_bdev, bio_op(bio), bio_sectors(bio), + start_time); +#elif defined(HAVE_BDEV_IO_ACCT_OLD) bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time); #elif defined(HAVE_DISK_IO_ACCT) disk_end_io_acct(disk, bio_op(bio), start_time); From aebd94cc8541e0ec3b1de57edbd57c4280213089 Mon Sep 17 00:00:00 2001 From: Rob N Date: Tue, 28 Mar 2023 05:55:54 +1100 Subject: [PATCH 018/180] config: don't link libudev on FreeBSD FreeBSD has a libudev shim in libudev-devd. If present, configure would detect it and produce binaries linked against it, even though nothing used it. That is surprising and unnecessary, so lets remove it. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #14669 --- config/user.m4 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/user.m4 b/config/user.m4 index a7241f44f1fd..6ec27a5b2cf5 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -14,11 +14,11 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ AM_COND_IF([BUILD_LINUX], [ ZFS_AC_CONFIG_USER_UDEV ZFS_AC_CONFIG_USER_SYSTEMD + ZFS_AC_CONFIG_USER_LIBUDEV ZFS_AC_CONFIG_USER_LIBUUID ZFS_AC_CONFIG_USER_LIBBLKID ]) ZFS_AC_CONFIG_USER_LIBTIRPC - ZFS_AC_CONFIG_USER_LIBUDEV ZFS_AC_CONFIG_USER_LIBCRYPTO ZFS_AC_CONFIG_USER_LIBAIO ZFS_AC_CONFIG_USER_LIBATOMIC From a604d3243b0f29ac0a93fc39ece8850f03419d22 Mon Sep 17 00:00:00 2001 From: George Wilson Date: Tue, 28 Mar 2023 11:13:32 -0400 Subject: [PATCH 019/180] Revert "Do not hold spa_config in ZIL while blocked on IO" This reverts commit 7d638df09be7482935bcf6ec8e4ea2ac8a8be1a8. 
Reviewed-by: Prakash Surya Reviewed-by: Brian Behlendorf Signed-off-by: George Wilson Closes #14678 --- module/zfs/zil.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index fba1c1999612..eb26e4b32998 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1384,6 +1384,8 @@ zil_lwb_flush_vdevs_done(zio_t *zio) itx_t *itx; uint64_t txg; + spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); @@ -1522,6 +1524,8 @@ zil_lwb_write_done(zio_t *zio) zil_vdev_node_t *zv; lwb_t *nlwb; + ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); + ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); @@ -1583,7 +1587,6 @@ zil_lwb_write_done(zio_t *zio) return; } - spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); if (vd != NULL) { @@ -1599,7 +1602,6 @@ zil_lwb_write_done(zio_t *zio) } kmem_free(zv, sizeof (*zv)); } - spa_config_exit(spa, SCL_STATE, FTAG); } static void @@ -1878,6 +1880,8 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) */ memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); + spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); + zil_lwb_add_block(lwb, &lwb->lwb_blk); lwb->lwb_issued_timestamp = gethrtime(); lwb->lwb_state = LWB_STATE_ISSUED; From 64bfa6bae3dd44bc93b1e9141b746231796a42de Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 28 Mar 2023 08:19:03 -0700 Subject: [PATCH 020/180] Additional limits on hole reporting Holding the zp->z_rangelock as a RL_READER over the range 0-UINT64_MAX is sufficient to prevent the dnode from being re-dirtied by concurrent writers. To avoid potentially looping multiple times for external caller which do not take the rangelock holes are not reported after the first sync. While not optimal this is always functionally correct. This change adds the missing rangelock calls on FreeBSD to zvol_cdev_ioctl(). Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #14512 Closes #14641 --- module/os/freebsd/zfs/zvol_os.c | 3 +++ module/zfs/dmu.c | 32 ++++++++++++++++++-------------- module/zfs/zfs_vnops.c | 2 +- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 631e020db9c9..26578491fd67 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -1212,7 +1212,10 @@ zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, hole = (cmd == FIOSEEKHOLE); noff = *off; + lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX, + RL_READER); error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); + zfs_rangelock_exit(lr); *off = noff; break; } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 23b6667524cc..ce985d833f58 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2116,18 +2116,18 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) } /* - * This function is only called from zfs_holey_common() for zpl_llseek() - * in order to determine the location of holes. In order to accurately - * report holes all dirty data must be synced to disk. This causes extremely - * poor performance when seeking for holes in a dirty file. As a compromise, - * only provide hole data when the dnode is clean. 
When a dnode is dirty - * report the dnode as having no holes which is always a safe thing to do. + * Reports the location of data and holes in an object. In order to + * accurately report holes all dirty data must be synced to disk. This + * causes extremely poor performance when seeking for holes in a dirty file. + * As a compromise, only provide hole data when the dnode is clean. When + * a dnode is dirty report the dnode as having no holes by returning EBUSY + * which is always safe to do. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; - int err; + int restarted = 0, err; restart: err = dnode_hold(os, object, FTAG, &dn); @@ -2139,19 +2139,23 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) if (dnode_is_dirty(dn)) { /* * If the zfs_dmu_offset_next_sync module option is enabled - * then strict hole reporting has been requested. Dirty - * dnodes must be synced to disk to accurately report all - * holes. When disabled dirty dnodes are reported to not - * have any holes which is always safe. + * then hole reporting has been requested. Dirty dnodes + * must be synced to disk to accurately report holes. * - * When called by zfs_holey_common() the zp->z_rangelock - * is held to prevent zfs_write() and mmap writeback from - * re-dirtying the dnode after txg_wait_synced(). + * Provided a RL_READER rangelock spanning 0-UINT64_MAX is + * held by the caller only a single restart will be required. + * We tolerate callers which do not hold the rangelock by + * returning EBUSY and not reporting holes after one restart. */ if (zfs_dmu_offset_next_sync) { rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); + + if (restarted) + return (SET_ERROR(EBUSY)); + txg_wait_synced(dmu_objset_pool(os), 0); + restarted = 1; goto restart; } diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index db80be783899..91b594e41cda 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -111,7 +111,7 @@ zfs_holey_common(znode_t *zp, ulong_t cmd, loff_t *off) if (zn_has_cached_data(zp, 0, file_sz - 1)) zn_flush_cached_data(zp, B_FALSE); - lr = zfs_rangelock_enter(&zp->z_rangelock, 0, file_sz, RL_READER); + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_READER); error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); zfs_rangelock_exit(lr); From 65d10bd87c408bfa13fa27bb6ad3ecc0e2e3775b Mon Sep 17 00:00:00 2001 From: Kevin Jin <33590050+jxdking@users.noreply.github.com> Date: Tue, 28 Mar 2023 11:43:41 -0400 Subject: [PATCH 021/180] Fix short-lived txg caused by autotrim Current autotrim causes short-lived txg through: 1. calling txg_wait_synced() in metaslab_enable() 2. calling txg_wait_open() with should_quiesce = true This patch addresses all the issues mentioned above. A new cv, vdev_autotrim_kick_cv is added to kick autotrim activity. It will be signaled once a txg is synced so that it does not change the original autotrim pace. Also because it is a cv, the wait is interruptible which speeds up the vdev_autotrim_stop_wait() call. Finally, combining big zfs_txg_timeout, txg_wait_open() also causes delay when exporting a pool. 
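
As a sketch of the pattern (a user-space POSIX-threads model, not the kernel
code; the function names only mirror vdev_autotrim_kick() and
vdev_autotrim_wait_kick() from the diff below): the sync path broadcasts the
condition variable once per synced txg, and the autotrim thread simply counts
kicks instead of forcing txgs, while a requested stop interrupts the wait
immediately.

/*
 * Model only: POSIX primitives stand in for the kernel kmutex/kcondvar.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t autotrim_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t autotrim_kick_cv = PTHREAD_COND_INITIALIZER;
static bool exit_wanted;

/* One kick per synced txg (models vdev_autotrim_kick()). */
static void
autotrim_kick(void)
{
	pthread_mutex_lock(&autotrim_lock);
	pthread_cond_broadcast(&autotrim_kick_cv);
	pthread_mutex_unlock(&autotrim_lock);
}

/*
 * Wait for the given number of kicks; return true if the wait was aborted
 * because a stop was requested (models vdev_autotrim_wait_kick()).
 */
static bool
autotrim_wait_kick(int kicks)
{
	pthread_mutex_lock(&autotrim_lock);
	for (int i = 0; i < kicks && !exit_wanted; i++)
		pthread_cond_wait(&autotrim_kick_cv, &autotrim_lock);
	bool aborted = exit_wanted;
	pthread_mutex_unlock(&autotrim_lock);
	return (aborted);
}

/* Pretend sync thread: a few txgs, then a stop request. */
static void *
sync_thread(void *arg)
{
	(void) arg;
	for (int txg = 0; txg < 3; txg++)
		autotrim_kick();
	pthread_mutex_lock(&autotrim_lock);
	exit_wanted = true;
	pthread_cond_broadcast(&autotrim_kick_cv);
	pthread_mutex_unlock(&autotrim_lock);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, sync_thread, NULL);
	printf("wait aborted: %d\n", autotrim_wait_kick(100));
	pthread_join(t, NULL);
	return (0);
}
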
Reviewed-by: Brian Behlendorf Signed-off-by: jxdking Issue #8993 Closes #12194 --- include/sys/vdev_impl.h | 1 + include/sys/vdev_trim.h | 1 + module/zfs/spa.c | 3 ++ module/zfs/vdev.c | 2 ++ module/zfs/vdev_trim.c | 72 +++++++++++++++++++++++++++++++++-------- 5 files changed, 65 insertions(+), 14 deletions(-) diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 73c0206efa2e..7cfffe3b4eed 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -329,6 +329,7 @@ struct vdev { list_node_t vdev_trim_node; kmutex_t vdev_autotrim_lock; kcondvar_t vdev_autotrim_cv; + kcondvar_t vdev_autotrim_kick_cv; kthread_t *vdev_autotrim_thread; /* Protects vdev_trim_thread and vdev_trim_state. */ kmutex_t vdev_trim_lock; diff --git a/include/sys/vdev_trim.h b/include/sys/vdev_trim.h index 2a217f0d43ce..7a94d4af098f 100644 --- a/include/sys/vdev_trim.h +++ b/include/sys/vdev_trim.h @@ -41,6 +41,7 @@ extern void vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state); extern void vdev_trim_stop_wait(spa_t *spa, list_t *vd_list); extern void vdev_trim_restart(vdev_t *vd); extern void vdev_autotrim(spa_t *spa); +extern void vdev_autotrim_kick(spa_t *spa); extern void vdev_autotrim_stop_all(spa_t *spa); extern void vdev_autotrim_stop_wait(vdev_t *vd); extern void vdev_autotrim_restart(spa_t *spa); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 0573daa92f9d..dc202978c0f6 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -9449,6 +9449,9 @@ spa_sync(spa_t *spa, uint64_t txg) spa_update_dspace(spa); + if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) + vdev_autotrim_kick(spa); + /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 1af5baeecd2d..ad932a7ba764 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -696,6 +696,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL); cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); @@ -1148,6 +1149,7 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_trim_io_lock); cv_destroy(&vd->vdev_trim_cv); cv_destroy(&vd->vdev_autotrim_cv); + cv_destroy(&vd->vdev_autotrim_kick_cv); cv_destroy(&vd->vdev_trim_io_cv); mutex_destroy(&vd->vdev_rebuild_lock); diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 5b5076c8722c..0d71b9434342 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -182,6 +182,25 @@ vdev_autotrim_should_stop(vdev_t *tvd) spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF); } +/* + * Wait for given number of kicks, return true if the wait is aborted due to + * vdev_autotrim_exit_wanted. + */ +static boolean_t +vdev_autotrim_wait_kick(vdev_t *vd, int num_of_kick) +{ + mutex_enter(&vd->vdev_autotrim_lock); + for (int i = 0; i < num_of_kick; i++) { + if (vd->vdev_autotrim_exit_wanted) + break; + cv_wait(&vd->vdev_autotrim_kick_cv, &vd->vdev_autotrim_lock); + } + boolean_t exit_wanted = vd->vdev_autotrim_exit_wanted; + mutex_exit(&vd->vdev_autotrim_lock); + + return (exit_wanted); +} + /* * The sync task for updating the on-disk state of a manual TRIM. This * is scheduled by vdev_trim_change_state(). 
@@ -1190,7 +1209,6 @@ vdev_autotrim_thread(void *arg) while (!vdev_autotrim_should_stop(vd)) { int txgs_per_trim = MAX(zfs_trim_txg_batch, 1); - boolean_t issued_trim = B_FALSE; uint64_t extent_bytes_max = zfs_trim_extent_bytes_max; uint64_t extent_bytes_min = zfs_trim_extent_bytes_min; @@ -1224,6 +1242,8 @@ vdev_autotrim_thread(void *arg) i += txgs_per_trim) { metaslab_t *msp = vd->vdev_ms[i]; range_tree_t *trim_tree; + boolean_t issued_trim = B_FALSE; + boolean_t wait_aborted = B_FALSE; spa_config_exit(spa, SCL_CONFIG, FTAG); metaslab_disable(msp); @@ -1374,7 +1394,18 @@ vdev_autotrim_thread(void *arg) range_tree_vacate(trim_tree, NULL, NULL); range_tree_destroy(trim_tree); - metaslab_enable(msp, issued_trim, B_FALSE); + /* + * Wait for couples of kicks, to ensure the trim io is + * synced. If the wait is aborted due to + * vdev_autotrim_exit_wanted, we need to signal + * metaslab_enable() to wait for sync. + */ + if (issued_trim) { + wait_aborted = vdev_autotrim_wait_kick(vd, + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE); + } + + metaslab_enable(msp, wait_aborted, B_FALSE); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); for (uint64_t c = 0; c < children; c++) { @@ -1388,17 +1419,14 @@ vdev_autotrim_thread(void *arg) } kmem_free(tap, sizeof (trim_args_t) * children); + + if (vdev_autotrim_should_stop(vd)) + break; } spa_config_exit(spa, SCL_CONFIG, FTAG); - /* - * After completing the group of metaslabs wait for the next - * open txg. This is done to make sure that a minimum of - * zfs_trim_txg_batch txgs will occur before these metaslabs - * are trimmed again. - */ - txg_wait_open(spa_get_dsl(spa), 0, issued_trim); + vdev_autotrim_wait_kick(vd, 1); shift++; spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -1476,11 +1504,9 @@ vdev_autotrim_stop_wait(vdev_t *tvd) mutex_enter(&tvd->vdev_autotrim_lock); if (tvd->vdev_autotrim_thread != NULL) { tvd->vdev_autotrim_exit_wanted = B_TRUE; - - while (tvd->vdev_autotrim_thread != NULL) { - cv_wait(&tvd->vdev_autotrim_cv, - &tvd->vdev_autotrim_lock); - } + cv_broadcast(&tvd->vdev_autotrim_kick_cv); + cv_wait(&tvd->vdev_autotrim_cv, + &tvd->vdev_autotrim_lock); ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL); tvd->vdev_autotrim_exit_wanted = B_FALSE; @@ -1488,6 +1514,24 @@ vdev_autotrim_stop_wait(vdev_t *tvd) mutex_exit(&tvd->vdev_autotrim_lock); } +void +vdev_autotrim_kick(spa_t *spa) +{ + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + vdev_t *root_vd = spa->spa_root_vdev; + vdev_t *tvd; + + for (uint64_t i = 0; i < root_vd->vdev_children; i++) { + tvd = root_vd->vdev_child[i]; + + mutex_enter(&tvd->vdev_autotrim_lock); + if (tvd->vdev_autotrim_thread != NULL) + cv_broadcast(&tvd->vdev_autotrim_kick_cv); + mutex_exit(&tvd->vdev_autotrim_lock); + } +} + /* * Wait for all of the vdev_autotrim_thread associated with the pool to * be terminated (canceled or stopped). From 431083f75bdd3efaee992bdd672625ec7240d252 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Wed, 29 Mar 2023 01:51:58 +0200 Subject: [PATCH 022/180] Fixes in persistent error log Address the following bugs in persistent error log: 1) Check nested clones, eg "fs->snap->clone->snap2->clone2". 2) When deleting files containing error blocks in those clones (from "clone" the example above), do not break the check chain. 3) When deleting files in the originating fs before syncing the errlog to disk, do not break the check chain. This happens because at the time of introducing the error block in the error list, we do not have its birth txg and the head filesystem. 
If the original file is deleted before the error list is synced to the error log (which is when we actually lookup the birth txg and the head filesystem), then we do not have access to this info anymore and break the check chain. The most prominent change is related to achieving (3). We expand the spa_error_entry_t structure to accommodate the newly introduced zbookmark_err_phys_t structure (containing the birth txg of the error block).Due to compatibility reasons we cannot remove the zbookmark_phys_t structure and we also need to place the new structure after se_avl, so it is not accounted for in avl_find(). Then we modify spa_log_error() to also provide the birth txg of the error block. With these changes in place we simplify the previously introduced function get_head_and_birth_txg() (now named get_head_ds()). We chose not to follow the same approach for the head filesystem (thus completely removing get_head_ds()) to avoid introducing new lock contentions. The stack sizes of nested functions (as measured by checkstack.pl in the linux kernel) are: check_filesystem [zfs]: 272 (was 912) check_clones [zfs]: 64 We also introduced two new tests covering the above changes. Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #14633 --- include/sys/spa.h | 4 +- include/sys/spa_impl.h | 1 + include/sys/zio.h | 4 +- man/man7/zpool-features.7 | 1 + module/zfs/arc.c | 7 +- module/zfs/dbuf.c | 3 +- module/zfs/dmu_send.c | 2 +- module/zfs/dsl_scan.c | 4 +- module/zfs/spa_errlog.c | 340 ++++++++++-------- module/zfs/zio.c | 6 +- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 2 + .../zpool_status/zpool_status_003_pos.ksh | 2 + .../zpool_status/zpool_status_005_pos.ksh | 20 +- .../zpool_status/zpool_status_006_pos.ksh | 97 +++++ .../zpool_status/zpool_status_007_pos.ksh | 98 +++++ 16 files changed, 422 insertions(+), 172 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh diff --git a/include/sys/spa.h b/include/sys/spa.h index ab07103fe8ba..79c46aa07709 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -65,6 +65,7 @@ typedef struct spa_aux_vdev spa_aux_vdev_t; typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; typedef struct zbookmark_phys zbookmark_phys_t; +typedef struct zbookmark_err_phys zbookmark_err_phys_t; struct bpobj; struct bplist; @@ -1134,7 +1135,8 @@ extern const char *spa_state_to_name(spa_t *spa); /* error handling */ struct zbookmark_phys; -extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); +extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, + const uint64_t *birth); extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb); extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 8ccd58b584ca..5782c54bd78f 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -66,6 +66,7 @@ typedef struct spa_error_entry { zbookmark_phys_t se_bookmark; char *se_name; avl_node_t se_avl; + zbookmark_err_phys_t se_zep; /* not accounted in avl_find */ } spa_error_entry_t; typedef struct spa_history_phys { diff --git a/include/sys/zio.h b/include/sys/zio.h index 78603d0ebeba..3463682a1065 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -303,12 +303,12 @@ struct zbookmark_phys { uint64_t zb_blkid; }; 
-typedef struct zbookmark_err_phys { +struct zbookmark_err_phys { uint64_t zb_object; int64_t zb_level; uint64_t zb_blkid; uint64_t zb_birth; -} zbookmark_err_phys_t; +}; #define SET_BOOKMARK(zb, objset, object, level, blkid) \ { \ diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index a4d595cd3cd9..4cd7526858a3 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -565,6 +565,7 @@ and keyed by the head id. In case of encrypted filesystems with unloaded keys or unmounted encrypted filesystems we are unable to check their snapshots or clones for errors and these will not be reported. +In this case no filenames will be reported either. With this feature enabled, every dataset affected by an error block is listed in the output of .Nm zpool Cm status . diff --git a/module/zfs/arc.c b/module/zfs/arc.c index aff438777c8c..e32707bbe5c3 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -2209,7 +2209,7 @@ arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, * (and generate an ereport) before leaving the ARC. */ ret = SET_ERROR(EIO); - spa_log_error(spa, zb); + spa_log_error(spa, zb, &buf->b_hdr->b_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); } @@ -5540,7 +5540,8 @@ arc_read_done(zio_t *zio) ASSERT(BP_IS_PROTECTED(bp)); error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { - spa_log_error(zio->io_spa, &acb->acb_zb); + spa_log_error(zio->io_spa, &acb->acb_zb, + &zio->io_bp->blk_birth); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, zio->io_spa, NULL, &acb->acb_zb, zio, 0); @@ -5833,7 +5834,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, */ rc = SET_ERROR(EIO); if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { - spa_log_error(spa, zb); + spa_log_error(spa, zb, &hdr->b_birth); (void) zfs_ereport_post( FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, zb, NULL, 0); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 617c850296b4..c7f76e8d96f8 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1620,7 +1620,8 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, * If this is not true it indicates tampering and we report an error. 
*/ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) { - spa_log_error(db->db_objset->os_spa, &zb); + spa_log_error(db->db_objset->os_spa, &zb, + &db->db_blkptr->blk_birth); zfs_panic_recover("unencrypted block in encrypted " "object set %llu", dmu_objset_id(db->db_objset)); err = SET_ERROR(EIO); diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index f86a0a5b1c57..5b7f5543ad09 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1123,7 +1123,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, */ if (sta->os->os_encrypted && !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { - spa_log_error(spa, zb); + spa_log_error(spa, zb, &bp->blk_birth); zfs_panic_recover("unencrypted block in encrypted " "object set %llu", dmu_objset_id(sta->os)); return (SET_ERROR(EIO)); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 8e3fd126caa5..d6a9365df120 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -1881,7 +1881,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, if (dnp != NULL && dnp->dn_bonuslen > DN_MAX_BONUS_LEN(dnp)) { scn->scn_phys.scn_errors++; - spa_log_error(spa, zb); + spa_log_error(spa, zb, &bp->blk_birth); return (SET_ERROR(EINVAL)); } @@ -1976,7 +1976,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, * by arc_read() for the cases above. */ scn->scn_phys.scn_errors++; - spa_log_error(spa, zb); + spa_log_error(spa, zb, &bp->blk_birth); return (SET_ERROR(EINVAL)); } diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index c6d97eed2892..41cb9d01273c 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -135,6 +135,10 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) } #ifdef _KERNEL +static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, + uint64_t *snap_obj_array, zbookmark_err_phys_t *zep, void* uaddr, + uint64_t *count); + static void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) { @@ -152,74 +156,22 @@ name_to_object(char *buf, uint64_t *obj) ASSERT(*buf == '\0'); } -static int -get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj, - uint64_t *head_dataset_id) +/* + * Retrieve the head filesystem. + */ +static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds) { - dsl_pool_t *dp = spa->spa_dsl_pool; dsl_dataset_t *ds; - objset_t *os; - - int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds); - if (error != 0) { - return (error); - } - ASSERT(head_dataset_id); - *head_dataset_id = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; - - error = dmu_objset_from_ds(ds, &os); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - - /* - * If the key is not loaded dbuf_dnode_findbp() will error out with - * EACCES. However in that case dnode_hold() will eventually call - * dbuf_read()->zio_wait() which may call spa_log_error(). This will - * lead to a deadlock due to us holding the mutex spa_errlist_lock. - * Avoid this by checking here if the keys are loaded, if not return. - * If the keys are not loaded the head_errlog feature is meaningless - * as we cannot figure out the birth txg of the block pointer. 
- */ - if (dsl_dataset_get_keystatus(ds->ds_dir) == - ZFS_KEYSTATUS_UNAVAILABLE) { - zep->zb_birth = 0; - dsl_dataset_rele(ds, FTAG); - return (0); - } + int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, + dsobj, FTAG, &ds); - dnode_t *dn; - blkptr_t bp; - - error = dnode_hold(os, zep->zb_object, FTAG, &dn); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); + if (error != 0) return (error); - } - rw_enter(&dn->dn_struct_rwlock, RW_READER); - error = dbuf_dnode_findbp(dn, zep->zb_level, zep->zb_blkid, &bp, NULL, - NULL); - if (error == 0 && BP_IS_HOLE(&bp)) - error = SET_ERROR(ENOENT); - - /* - * If the key is loaded but the encrypted filesystem is unmounted when - * a scrub is run, then dbuf_dnode_findbp() will still error out with - * EACCES (possibly due to the key mapping being removed upon - * unmounting). In that case the head_errlog feature is also - * meaningless as we cannot figure out the birth txg of the block - * pointer. - */ - if (error == EACCES) - error = 0; - else if (!error) - zep->zb_birth = bp.blk_birth; - - rw_exit(&dn->dn_struct_rwlock); - dnode_rele(dn, FTAG); + ASSERT(head_ds); + *head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; dsl_dataset_rele(ds, FTAG); + return (error); } @@ -229,7 +181,7 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj, * during spa_errlog_sync(). */ void -spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) +spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth) { spa_error_entry_t search; spa_error_entry_t *new; @@ -262,8 +214,26 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); new->se_bookmark = *zb; - avl_insert(tree, new, where); + /* + * If the head_errlog feature is enabled, store the birth txg now. In + * case the file is deleted before spa_errlog_sync() runs, we will not + * be able to retrieve the birth txg. + */ + if (spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + new->se_zep.zb_object = zb->zb_object; + new->se_zep.zb_level = zb->zb_level; + new->se_zep.zb_blkid = zb->zb_blkid; + + /* + * birth may end up being NULL, e.g. in zio_done(). We + * will handle this in process_error_block(). + */ + if (birth != NULL) + new->se_zep.zb_birth = *birth; + } + + avl_insert(tree, new, where); mutex_exit(&spa->spa_errlist_lock); } @@ -336,20 +306,28 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, error = find_birth_txg(ds, zep, &latest_txg); /* - * If we cannot figure out the current birth txg of the block pointer - * error out. If the filesystem is encrypted and the key is not loaded + * If the filesystem is encrypted and the key is not loaded * or the encrypted filesystem is not mounted the error will be EACCES. - * In that case do not return an error. + * In that case report an error in the head filesystem and return. */ if (error == EACCES) { dsl_dataset_rele(ds, FTAG); + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + error = copyout_entry(&zb, uaddr, count); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } return (0); } - if (error) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - if (zep->zb_birth == latest_txg) { + + /* + * If find_birth_txg() errors out otherwise, let txg_to_consider be + * equal to the spa's syncing txg: if check_filesystem() errors out + * then affected snapshots or clones will not be checked. + */ + if (error == 0 && zep->zb_birth == latest_txg) { /* Block neither free nor rewritten. 
*/ zbookmark_phys_t zb; zep_to_zb(head_ds, zep, &zb); @@ -359,44 +337,55 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, return (error); } check_snapshot = B_FALSE; - } else { - ASSERT3U(zep->zb_birth, <, latest_txg); + } else if (error == 0) { txg_to_consider = latest_txg; } - /* How many snapshots reference this block. */ - uint64_t snap_count; - error = zap_count(spa->spa_meta_objset, - dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } + /* + * Retrieve the number of snapshots if the dataset is not a snapshot. + */ + uint64_t snap_count = 0; + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { - if (snap_count == 0) { - /* File system has no snapshot. */ - dsl_dataset_rele(ds, FTAG); - return (0); + error = zap_count(spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (snap_count == 0) { + /* Filesystem without snapshots. */ + dsl_dataset_rele(ds, FTAG); + return (0); + } } - uint64_t *snap_obj_array = kmem_alloc(snap_count * sizeof (uint64_t), + uint64_t *snap_obj_array = kmem_zalloc(snap_count * sizeof (uint64_t), KM_SLEEP); int aff_snap_count = 0; uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + uint64_t zap_clone = dsl_dir_phys(ds->ds_dir)->dd_clones; + + dsl_dataset_rele(ds, FTAG); /* Check only snapshots created from this file system. */ while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && snap_obj_txg <= txg_to_consider) { - dsl_dataset_rele(ds, FTAG); error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); if (error != 0) goto out; - if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) - break; + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) { + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsl_dataset_rele(ds, FTAG); + continue; + } boolean_t affected = B_TRUE; if (check_snapshot) { @@ -405,6 +394,7 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, affected = (error == 0 && zep->zb_birth == blk_txg); } + /* Report errors in snapshots. */ if (affected) { snap_obj_array[aff_snap_count] = snap_obj; aff_snap_count++; @@ -416,37 +406,77 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, dsl_dataset_rele(ds, FTAG); goto out; } - - /* - * Only clones whose origins were affected could also - * have affected snapshots. - */ - zap_cursor_t zc; - zap_attribute_t za; - for (zap_cursor_init(&zc, spa->spa_meta_objset, - dsl_dataset_phys(ds)->ds_next_clones_obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { - error = check_filesystem(spa, - za.za_first_integer, zep, uaddr, count); - - if (error != 0) { - zap_cursor_fini(&zc); - goto out; - } - } - zap_cursor_fini(&zc); } - snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsl_dataset_rele(ds, FTAG); + } + + if (zap_clone != 0 && aff_snap_count > 0) { + error = check_clones(spa, zap_clone, snap_count, snap_obj_array, + zep, uaddr, count); } - dsl_dataset_rele(ds, FTAG); out: kmem_free(snap_obj_array, sizeof (*snap_obj_array)); return (error); } +/* + * Clone checking. 
+ */ +static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, + uint64_t *snap_obj_array, zbookmark_err_phys_t *zep, void* uaddr, + uint64_t *count) +{ + int error = 0; + zap_cursor_t *zc; + zap_attribute_t *za; + + zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + for (zap_cursor_init(zc, spa->spa_meta_objset, zap_clone); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_dataset_t *clone; + error = dsl_dataset_hold_obj(dp, za->za_first_integer, + FTAG, &clone); + + if (error != 0) + break; + + /* + * Only clones whose origins were affected could also + * have affected snapshots. + */ + boolean_t found = B_FALSE; + for (int i = 0; i < snap_count; i++) { + if (dsl_dir_phys(clone->ds_dir)->dd_origin_obj + == snap_obj_array[i]) + found = B_TRUE; + } + dsl_dataset_rele(clone, FTAG); + + if (!found) + continue; + + error = check_filesystem(spa, za->za_first_integer, zep, + uaddr, count); + + if (error != 0) + break; + } + + kmem_free(za, sizeof (*za)); + kmem_free(zc, sizeof (*zc)); + zap_cursor_fini(zc); + + return (error); +} + static int find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, uint64_t *top_affected_fs) @@ -474,12 +504,13 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, void *uaddr, uint64_t *count) { /* - * If the zb_birth is 0 it means we failed to retrieve the birth txg - * of the block pointer. This happens when an encrypted filesystem is - * not mounted or when the key is not loaded. Do not proceed to + * If zb_birth == 0 or head_ds == 0 it means we failed to retrieve the + * birth txg or the head filesystem of the block pointer. This may + * happen e.g. when an encrypted filesystem is not mounted or when + * the key is not loaded. In this case do not proceed to * check_filesystem(), instead do the accounting here. */ - if (zep->zb_birth == 0) { + if (zep->zb_birth == 0 || head_ds == 0) { zbookmark_phys_t zb; zep_to_zb(head_ds, zep, &zb); int error = copyout_entry(&zb, uaddr, count); @@ -697,11 +728,10 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, zep.zb_birth = 0; /* - * We cannot use get_head_and_birth_txg() because it will - * acquire the pool config lock, which we already have. In case - * of an error we simply continue. + * In case of an error we should simply continue instead of + * returning prematurely. See the next comment. 
*/ - uint64_t head_dataset_obj; + uint64_t head_ds; dsl_pool_t *dp = spa->spa_dsl_pool; dsl_dataset_t *ds; objset_t *os; @@ -710,8 +740,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, if (error != 0) continue; - head_dataset_obj = - dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; /* * The objset and the dnode are required for getting the block @@ -751,14 +780,14 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, uint64_t err_obj; error = zap_lookup_int_key(spa->spa_meta_objset, *newobj, - head_dataset_obj, &err_obj); + head_ds, &err_obj); if (error == ENOENT) { err_obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); (void) zap_update_int_key(spa->spa_meta_objset, - *newobj, head_dataset_obj, err_obj, tx); + *newobj, head_ds, err_obj, tx); } char buf[64]; @@ -875,20 +904,21 @@ process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count) } for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) { - zbookmark_err_phys_t zep; - zep.zb_object = se->se_bookmark.zb_object; - zep.zb_level = se->se_bookmark.zb_level; - zep.zb_blkid = se->se_bookmark.zb_blkid; - zep.zb_birth = 0; + uint64_t head_ds = 0; + int error = get_head_ds(spa, se->se_bookmark.zb_objset, + &head_ds); - uint64_t head_ds_obj; - int error = get_head_and_birth_txg(spa, &zep, - se->se_bookmark.zb_objset, &head_ds_obj); + /* + * If get_head_ds() errors out, set the head filesystem + * to the filesystem stored in the bookmark of the + * error block. + */ + if (error != 0) + head_ds = se->se_bookmark.zb_objset; - if (!error) - error = process_error_block(spa, head_ds_obj, &zep, - uaddr, count); - if (error) + error = process_error_block(spa, head_ds, + &se->se_zep, uaddr, count); + if (error != 0) return (error); } return (0); @@ -914,8 +944,9 @@ spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count) #ifdef _KERNEL /* * The pool config lock is needed to hold a dataset_t via (among other - * places) process_error_list() -> get_head_and_birth_txg(), and lock - * ordering requires that we get it before the spa_errlog_lock. + * places) process_error_list() -> process_error_block()-> + * find_top_affected_fs(), and lock ordering requires that we get it + * before the spa_errlog_lock. */ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); mutex_enter(&spa->spa_errlog_lock); @@ -1011,34 +1042,33 @@ sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) } else { for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { zbookmark_err_phys_t zep; - zep.zb_object = se->se_bookmark.zb_object; - zep.zb_level = se->se_bookmark.zb_level; - zep.zb_blkid = se->se_bookmark.zb_blkid; - zep.zb_birth = 0; + zep.zb_object = se->se_zep.zb_object; + zep.zb_level = se->se_zep.zb_level; + zep.zb_blkid = se->se_zep.zb_blkid; + zep.zb_birth = se->se_zep.zb_birth; + + uint64_t head_ds = 0; + int error = get_head_ds(spa, se->se_bookmark.zb_objset, + &head_ds); /* - * If we cannot find out the head dataset and birth txg - * of the present error block, we simply continue. - * Reinserting that error block to the error lists, - * even if we are not syncing the final txg, results - * in duplicate posting of errors. + * If get_head_ds() errors out, set the head filesystem + * to the filesystem stored in the bookmark of the + * error block. 
*/ - uint64_t head_dataset_obj; - int error = get_head_and_birth_txg(spa, &zep, - se->se_bookmark.zb_objset, &head_dataset_obj); - if (error) - continue; + if (error != 0) + head_ds = se->se_bookmark.zb_objset; uint64_t err_obj; error = zap_lookup_int_key(spa->spa_meta_objset, - *obj, head_dataset_obj, &err_obj); + *obj, head_ds, &err_obj); if (error == ENOENT) { err_obj = zap_create(spa->spa_meta_objset, DMU_OT_ERROR_LOG, DMU_OT_NONE, 0, tx); (void) zap_update_int_key(spa->spa_meta_objset, - *obj, head_dataset_obj, err_obj, tx); + *obj, head_ds, err_obj, tx); } errphys_to_name(&zep, buf, sizeof (buf)); @@ -1108,7 +1138,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg) /* * The pool config lock is needed to hold a dataset_t via - * sync_error_list() -> get_head_and_birth_txg(), and lock ordering + * sync_error_list() -> get_head_ds(), and lock ordering * requires that we get it before the spa_errlog_lock. */ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 1b1a1831f333..0924fb6f40bc 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -570,7 +570,8 @@ zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) if (ret == ECKSUM) { zio->io_error = SET_ERROR(EIO); if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { - spa_log_error(spa, &zio->io_bookmark); + spa_log_error(spa, &zio->io_bookmark, + &zio->io_bp->blk_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, spa, NULL, &zio->io_bookmark, zio, 0); } @@ -4718,7 +4719,8 @@ zio_done(zio_t *zio) * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ - spa_log_error(zio->io_spa, &zio->io_bookmark); + spa_log_error(zio->io_spa, &zio->io_bookmark, + &zio->io_bp->blk_birth); (void) zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, NULL, &zio->io_bookmark, zio, 0); } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index e4824457e926..50a9309acea5 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -494,7 +494,8 @@ tags = ['functional', 'cli_root', 'zpool_split'] [tests/functional/cli_root/zpool_status] tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', 'zpool_status_004_pos', - 'zpool_status_005_pos', 'zpool_status_features_001_pos'] + 'zpool_status_005_pos', 'zpool_status_006_pos', + 'zpool_status_007_pos', 'zpool_status_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 9e738227f743..92d62b503f65 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1169,6 +1169,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_status/zpool_status_003_pos.ksh \ functional/cli_root/zpool_status/zpool_status_004_pos.ksh \ functional/cli_root/zpool_status/zpool_status_005_pos.ksh \ + functional/cli_root/zpool_status/zpool_status_006_pos.ksh \ + functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh index a243944e48ea..b501aac5ad6d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh +++ 
b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh @@ -61,11 +61,13 @@ dd if=/$TESTPOOL2/10m_file bs=1M || true log_must zfs snapshot $TESTPOOL2@snap log_must zfs clone $TESTPOOL2@snap $TESTPOOL2/clone +log_must zfs create $TESTPOOL2/$TESTFS1 # Look to see that snapshot, clone and filesystem our files report errors log_must zpool status -v $TESTPOOL2 log_must eval "zpool status -v | grep '$TESTPOOL2@snap:/10m_file'" log_must eval "zpool status -v | grep '$TESTPOOL2/clone/10m_file'" log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'" +log_mustnot eval "zpool status -v | grep '$TESTFS1'" log_pass "'zpool status -v' outputs affected filesystem, snapshot & clone" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh index 3eb7825ca295..04cd1892380d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh @@ -24,7 +24,6 @@ # Copyright (c) 2022 George Amanakis. All rights reserved. # -. $STF_SUITE/include/libtest.shlib # # DESCRIPTION: # Verify correct output with 'zpool status -v' after corrupting a file @@ -34,7 +33,12 @@ # 2. zinject checksum errors # 3. Unmount the filesystem and unload the key # 4. Scrub the pool -# 5. Verify we report errors in the pool in 'zpool status -v' +# 5. Verify we report that errors were detected but we do not report +# the filename since the key is not loaded. +# 6. Load the key and mount the encrypted fs. +# 7. Verify we report errors in the pool in 'zpool status -v' + +. $STF_SUITE/include/libtest.shlib verify_runnable "both" @@ -66,13 +70,21 @@ log_must dd if=/dev/urandom of=$file bs=1024 count=1024 oflag=sync log_must eval "echo 'aaaaaaaa' >> "$file corrupt_blocks_at_level $file 0 -log_must zfs unmount $TESTPOOL2/$TESTFS1 -log_must zfs unload-key $TESTPOOL2/$TESTFS1 +log_must zfs umount $TESTPOOL2/$TESTFS1 +log_must zfs unload-key -a log_must zpool sync $TESTPOOL2 log_must zpool scrub $TESTPOOL2 log_must zpool wait -t scrub $TESTPOOL2 log_must zpool status -v $TESTPOOL2 log_must eval "zpool status -v $TESTPOOL2 | \ grep \"Permanent errors have been detected\"" +log_mustnot eval "zpool status -v $TESTPOOL2 | grep '$file'" + +log_must eval "cat /$TESTPOOL2/pwd | zfs load-key $TESTPOOL2/$TESTFS1" +log_must zfs mount $TESTPOOL2/$TESTFS1 +log_must zpool status -v $TESTPOOL2 +log_must eval "zpool status -v $TESTPOOL2 | \ + grep \"Permanent errors have been detected\"" +log_must eval "zpool status -v $TESTPOOL2 | grep '$file'" log_pass "Verify reporting errors with unloaded keys works" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_006_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_006_pos.ksh new file mode 100755 index 000000000000..d6f4a4fe2d86 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_006_pos.ksh @@ -0,0 +1,97 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 George Amanakis. All rights reserved. +# + +# +# DESCRIPTION: +# Verify reporting errors when deleting files +# +# STRATEGY: +# 1. Create a pool, and a file +# 2. zinject checksum errors +# 3. Create snapshots and clones like: +# fs->snap1->clone1->snap2->clone2->... +# 4. Read the original file and immediately delete it +# 5. Delete the file in clone2 +# 6. Snapshot clone2->snapxx and clone into snapxx->clonexx +# 7. Verify we report errors in the pool in 'zpool status -v' +# 8. Promote clone1 +# 9. Verify we report errors in the pool in 'zpool status -v' + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "both" + +function cleanup +{ + log_must zinject -c all + destroy_pool $TESTPOOL2 + rm -f $TESTDIR/vdev_a +} + +log_assert "Verify reporting errors when deleting files" +log_onexit cleanup + +typeset file="/$TESTPOOL2/$TESTFILE0" + +truncate -s $MINVDEVSIZE $TESTDIR/vdev_a +log_must zpool create -f -o feature@head_errlog=enabled $TESTPOOL2 $TESTDIR/vdev_a +log_must dd if=/dev/urandom of=$file bs=1024 count=1024 oflag=sync +log_must zinject -t data -e checksum -f 100 -am $file + +for i in {1..3}; do + lastfs="$(zfs list -r $TESTPOOL2 | tail -1 | awk '{print $1}')" + log_must zfs snap $lastfs@snap$i + log_must zfs clone $lastfs@snap$i $TESTPOOL2/clone$i +done + +log_mustnot dd if=$file of=/dev/null bs=1024 +log_must rm $file /$TESTPOOL2/clone2/$TESTFILE0 +log_must zfs snap $TESTPOOL2/clone2@snapxx +log_must zfs clone $TESTPOOL2/clone2@snapxx $TESTPOOL2/clonexx +log_must zpool status -v $TESTPOOL2 + +log_must eval "zpool status -v $TESTPOOL2 | \ + grep \"Permanent errors have been detected\"" +log_must eval "zpool status -v | grep '$TESTPOOL2@snap1:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1@snap2:/$TESTFILE0'" +log_mustnot eval "zpool status -v | grep '$TESTPOOL2/clone2/$TESTFILE0'" +log_mustnot eval "zpool status -v | grep '$TESTPOOL2/clonexx/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'" + +log_must zfs promote $TESTPOOL2/clone1 +log_must eval "zpool status -v $TESTPOOL2 | \ + grep \"Permanent errors have been detected\"" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1@snap1:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1@snap2:/$TESTFILE0'" +log_mustnot eval "zpool status -v | grep '$TESTPOOL2/clone2/$TESTFILE0'" +log_mustnot eval "zpool status -v | grep '$TESTPOOL2/clonexx/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'" + +log_pass "Verify reporting errors when deleting files" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh new file mode 100755 index 000000000000..c9849379f779 --- /dev/null +++ 
b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh @@ -0,0 +1,98 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 George Amanakis. All rights reserved. +# + +# +# DESCRIPTION: +# Verify reporting errors when deleting corrupted files after scrub +# +# STRATEGY: +# 1. Create a pool, and a file +# 2. Corrupt the file +# 3. Create snapshots and clones like: +# fs->snap1->clone1->snap2->clone2->... +# 4. Read the original file and immediately delete it +# 5. Delete the file in clone2 +# 6. Snapshot clone2->snapxx and clone into snapxx->clonexx +# 7. Verify we report errors in the pool in 'zpool status -v' +# 8. Promote clone1 +# 9. Verify we report errors in the pool in 'zpool status -v' + +. $STF_SUITE/include/libtest.shlib + +verify_runnable "both" + +function cleanup +{ + destroy_pool $TESTPOOL2 + rm -f $TESTDIR/vdev_a +} + +log_assert "Verify reporting errors when deleting corrupted files after scrub" +log_onexit cleanup + +typeset file="/$TESTPOOL2/$TESTFS1/$TESTFILE0" + +truncate -s $MINVDEVSIZE $TESTDIR/vdev_a +log_must zpool create -f $TESTPOOL2 $TESTDIR/vdev_a +log_must zfs create -o primarycache=none $TESTPOOL2/$TESTFS1 +log_must dd if=/dev/urandom of=$file bs=1024 count=1024 oflag=sync +corrupt_blocks_at_level $file 0 + +lastfs="$(zfs list -r $TESTPOOL2 | tail -1 | awk '{print $1}')" +for i in {1..3}; do + log_must zfs snap $lastfs@snap$i + log_must zfs clone $lastfs@snap$i $TESTPOOL2/clone$i + lastfs="$(zfs list -r $TESTPOOL2/clone$i | tail -1 | awk '{print $1}')" +done + +log_must zpool scrub -w $TESTPOOL2 +log_must rm $file /$TESTPOOL2/clone2/$TESTFILE0 +log_must zfs snap $TESTPOOL2/clone2@snapxx +log_must zfs clone $TESTPOOL2/clone2@snapxx $TESTPOOL2/clonexx +log_must zpool status -v $TESTPOOL2 + +log_must eval "zpool status -v $TESTPOOL2 | \ + grep \"Permanent errors have been detected\"" +log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1@snap1:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1@snap2:/$TESTFILE0'" +log_mustnot eval "zpool status -v | grep '$TESTPOOL2/clone2/$TESTFILE0'" +log_mustnot eval "zpool status -v | grep '$TESTPOOL2/clonexx/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'" + +log_must zfs promote $TESTPOOL2/clone1 +log_must eval "zpool status -v $TESTPOOL2 | \ + grep \"Permanent errors have been detected\"" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1@snap1:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1/$TESTFILE0'" +log_must eval "zpool status -v | grep 
'$TESTPOOL2/clone1@snap2:/$TESTFILE0'" +log_mustnot eval "zpool status -v | grep '$TESTPOOL2/clone2/$TESTFILE0'" +log_mustnot eval "zpool status -v | grep '$TESTPOOL2/clonexx/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'" + +log_pass "Verify reporting errors when deleting corrupted files after scrub" From 21c4b2a944dce7e45a9f0c959624fe66b825fae9 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Wed, 29 Mar 2023 15:39:36 -0700 Subject: [PATCH 023/180] Linux 6.2 compat: META Update the META file to reflect compatibility with the 6.2 kernel. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #14689 --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 18b3f3498eeb..8779e512f7be 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.1 +Linux-Maximum: 6.2 Linux-Minimum: 3.10 From 1142362ff606ab7a1262d7d6f1f9be2205825065 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 31 Mar 2023 09:43:54 -0700 Subject: [PATCH 024/180] Use vmem_zalloc to silence allocation warning The kmem allocation in zfs_prune_aliases() will trigger a large allocation warning on systems with 64K pages. Resolve this by switching to vmem_alloc() which internally uses kvmalloc() so the right allocator will be used based on the allocation size. Reviewed-by: Richard Yao Reviewed-by: Tino Reichardt Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #8491 Closes #14694 --- module/os/linux/zfs/zfs_vfsops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 2d9b27a90884..48945b8af8c1 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1194,7 +1194,7 @@ zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) int objects = 0; int i = 0, j = 0; - zp_array = kmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); + zp_array = vmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); mutex_enter(&zfsvfs->z_znodes_lock); while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { @@ -1230,7 +1230,7 @@ zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) zrele(zp); } - kmem_free(zp_array, max_array * sizeof (znode_t *)); + vmem_free(zp_array, max_array * sizeof (znode_t *)); return (objects); } From c5431f14655ce05d1ea99cb012806f0e5873d257 Mon Sep 17 00:00:00 2001 From: youzhongyang Date: Fri, 31 Mar 2023 12:46:22 -0400 Subject: [PATCH 025/180] linux 6.3 compat: needs REQ_PREFLUSH | REQ_OP_WRITE Modify bio_set_flush() so if kernel version is >= 4.10, flags REQ_PREFLUSH and REQ_OP_WRITE are set together. 
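
For context, a small illustration (stub constants, not the real
<linux/blk_types.h> values, and this is my reading rather than anything
stated here): bi_opf carries an operation in its low bits plus modifier
flags above them, so REQ_PREFLUSH on its own leaves the operation bits at
zero, i.e. a read, while REQ_PREFLUSH | REQ_OP_WRITE marks the empty flush
bio as a write, which is what the hunk below switches to.

/* Stub layout only; the real values live in <linux/blk_types.h>. */
#include <stdio.h>

#define STUB_OP_MASK	0xffu		/* low bits: the operation */
#define STUB_OP_WRITE	0x01u		/* stands in for REQ_OP_WRITE */
#define STUB_PREFLUSH	(1u << 16)	/* stands in for REQ_PREFLUSH */

int
main(void)
{
	unsigned int old_opf = STUB_PREFLUSH;			/* op == 0 */
	unsigned int new_opf = STUB_PREFLUSH | STUB_OP_WRITE;	/* op == write */

	printf("old op bits %#x, new op bits %#x\n",
	    old_opf & STUB_OP_MASK, new_opf & STUB_OP_MASK);
	return (0);
}
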
Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Youzhong Yang Closes #14695 --- include/os/linux/kernel/linux/blkdev_compat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index c7405ffab8ba..c5c6385be6ff 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -426,7 +426,7 @@ static inline void bio_set_flush(struct bio *bio) { #if defined(HAVE_REQ_PREFLUSH) /* >= 4.10 */ - bio_set_op_attrs(bio, 0, REQ_PREFLUSH); + bio_set_op_attrs(bio, 0, REQ_PREFLUSH | REQ_OP_WRITE); #elif defined(WRITE_FLUSH_FUA) /* >= 2.6.37 and <= 4.9 */ bio_set_op_attrs(bio, 0, WRITE_FLUSH_FUA); #else From 3399a30ee02d0d31ba2d43d0ce0a2fd90d5c575d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Fri, 31 Mar 2023 18:47:48 +0200 Subject: [PATCH 026/180] contrib: dracut: fix race with root=zfs:dset when necessities required MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This had always worked in my testing, but a user on hardware reported this to happen 100%, and I reproduced it once with cold VM host caches. dracut-zfs-generator runs as a systemd generator, i.e. at Some Relatively Early Time; if root= is a fixed dataset, it tries to "solve [necessities] statically at generation time". If by that point zfs-import.target hasn't popped (because the import is taking a non-negligible amount of time for whatever reason), it'll see no children for the root datase, and as such generate no mounts. This has never had any right to work. No-one caught this earlier because it's just that much more convenient to have root=zfs:AUTO, which orders itself properly. To fix this, always run zfs-nonroot-necessities.service; this additionally simplifies the implementation by: * making BOOTFS from zfs-env-bootfs.service be the real, canonical, root dataset name, not just "whatever the first bootfs is", and only set it if we're ZFS-booting * zfs-{rollback,snapshot}-bootfs.service can use this instead of re-implementing it * having zfs-env-bootfs.service also set BOOTFSFLAGS * this means the sysroot.mount drop-in can be fixed text * zfs-nonroot-necessities.service can also be constant and always enabled, because it's conditioned on BOOTFS being set There is no longer any code generated at run-time (the sysroot.mount drop-in is an unavoidable gratuitous cp). The flow of BOOTFS{,FLAGS} from zfs-env-bootfs.service to sysroot.mount is not noted explicitly in dracut.zfs(7), because (a) at some point it's just visual noise and (b) it's already ordered via d-p-m.s from z-i.t. 
Reviewed-by: Brian Behlendorf Signed-off-by: Ahelenia Ziemiańska Closes #14690 --- contrib/dracut/90zfs/module-setup.sh.in | 3 + .../dracut/90zfs/zfs-env-bootfs.service.in | 15 ++++- contrib/dracut/90zfs/zfs-generator.sh.in | 67 ++----------------- contrib/dracut/90zfs/zfs-lib.sh.in | 2 +- .../90zfs/zfs-nonroot-necessities.service.in | 20 ++++++ .../90zfs/zfs-rollback-bootfs.service.in | 3 +- .../90zfs/zfs-snapshot-bootfs.service.in | 3 +- contrib/dracut/Makefile.am | 1 + man/man7/dracut.zfs.7 | 16 ++--- 9 files changed, 54 insertions(+), 76 deletions(-) create mode 100644 contrib/dracut/90zfs/zfs-nonroot-necessities.service.in diff --git a/contrib/dracut/90zfs/module-setup.sh.in b/contrib/dracut/90zfs/module-setup.sh.in index 78c74e7423bb..e55cb60e1612 100755 --- a/contrib/dracut/90zfs/module-setup.sh.in +++ b/contrib/dracut/90zfs/module-setup.sh.in @@ -81,6 +81,9 @@ install() { inst_simple "${moddir}/zfs-env-bootfs.service" "${systemdsystemunitdir}/zfs-env-bootfs.service" systemctl -q --root "${initdir}" add-wants zfs-import.target zfs-env-bootfs.service + inst_simple "${moddir}/zfs-nonroot-necessities.service" "${systemdsystemunitdir}/zfs-nonroot-necessities.service" + systemctl -q --root "${initdir}" add-requires initrd-root-fs.target zfs-nonroot-necessities.service + # Add user-provided unit overrides: # - /etc/systemd/system/${_service} # - /etc/systemd/system/${_service}.d/overrides.conf diff --git a/contrib/dracut/90zfs/zfs-env-bootfs.service.in b/contrib/dracut/90zfs/zfs-env-bootfs.service.in index 34c88037cac2..7ebab4c1a58d 100644 --- a/contrib/dracut/90zfs/zfs-env-bootfs.service.in +++ b/contrib/dracut/90zfs/zfs-env-bootfs.service.in @@ -1,6 +1,5 @@ [Unit] -Description=Set BOOTFS environment for dracut -Documentation=man:zpool(8) +Description=Set BOOTFS and BOOTFSFLAGS environment variables for dracut DefaultDependencies=no After=zfs-import-cache.service After=zfs-import-scan.service @@ -8,7 +7,17 @@ Before=zfs-import.target [Service] Type=oneshot -ExecStart=/bin/sh -c "exec systemctl set-environment BOOTFS=$(@sbindir@/zpool list -H -o bootfs | grep -m1 -vFx -)" +ExecStart=/bin/sh -c ' \ + . /lib/dracut-zfs-lib.sh; \ + decode_root_args || exit 0; \ + [ "$root" = "zfs:AUTO" ] && root="$(@sbindir@/zpool list -H -o bootfs | grep -m1 -vFx -)"; \ + rootflags="$(getarg rootflags=)"; \ + case ",$rootflags," in \ + *,zfsutil,*) ;; \ + ,,) rootflags=zfsutil ;; \ + *) rootflags="zfsutil,$rootflags" ;; \ + esac; \ + exec systemctl set-environment BOOTFS="$root" BOOTFSFLAGS="$rootflags"' [Install] WantedBy=zfs-import.target diff --git a/contrib/dracut/90zfs/zfs-generator.sh.in b/contrib/dracut/90zfs/zfs-generator.sh.in index 56f7ca9785ba..4e1eb7490e0d 100755 --- a/contrib/dracut/90zfs/zfs-generator.sh.in +++ b/contrib/dracut/90zfs/zfs-generator.sh.in @@ -14,81 +14,24 @@ GENERATOR_DIR="$1" . 
/lib/dracut-zfs-lib.sh decode_root_args || exit 0 -[ -z "${rootflags}" ] && rootflags=$(getarg rootflags=) -case ",${rootflags}," in - *,zfsutil,*) ;; - ,,) rootflags=zfsutil ;; - *) rootflags="zfsutil,${rootflags}" ;; -esac - [ -n "$debug" ] && echo "zfs-generator: writing extension for sysroot.mount to $GENERATOR_DIR/sysroot.mount.d/zfs-enhancement.conf" >> /dev/kmsg -mkdir -p "$GENERATOR_DIR"/sysroot.mount.d "$GENERATOR_DIR"/initrd-root-fs.target.requires "$GENERATOR_DIR"/dracut-pre-mount.service.d +mkdir -p "$GENERATOR_DIR"/sysroot.mount.d "$GENERATOR_DIR"/dracut-pre-mount.service.d + { echo "[Unit]" echo "Before=initrd-root-fs.target" echo "After=zfs-import.target" echo echo "[Mount]" - if [ "${root}" = "zfs:AUTO" ]; then - echo "PassEnvironment=BOOTFS" - echo 'What=${BOOTFS}' - else - echo "What=${root}" - fi + echo "PassEnvironment=BOOTFS BOOTFSFLAGS" + echo 'What=${BOOTFS}' echo "Type=zfs" - echo "Options=${rootflags}" + echo 'Options=${BOOTFSFLAGS}' } > "$GENERATOR_DIR"/sysroot.mount.d/zfs-enhancement.conf ln -fs ../sysroot.mount "$GENERATOR_DIR"/initrd-root-fs.target.requires/sysroot.mount - -if [ "${root}" = "zfs:AUTO" ]; then - { - echo "[Unit]" - echo "Before=initrd-root-fs.target" - echo "After=sysroot.mount" - echo "DefaultDependencies=no" - echo - echo "[Service]" - echo "Type=oneshot" - echo "PassEnvironment=BOOTFS" - echo "ExecStart=/bin/sh -c '" ' \ - . /lib/dracut-zfs-lib.sh; \ - _zfs_nonroot_necessities_cb() { \ - zfs mount | grep -m1 -q "^$1 " && return 0; \ - echo "Mounting $1 on /sysroot$2"; \ - mount -o zfsutil -t zfs "$1" "/sysroot$2"; \ - }; \ - for_relevant_root_children "${BOOTFS}" _zfs_nonroot_necessities_cb;' \ - "'" - } > "$GENERATOR_DIR"/zfs-nonroot-necessities.service - ln -fs ../zfs-nonroot-necessities.service "$GENERATOR_DIR"/initrd-root-fs.target.requires/zfs-nonroot-necessities.service -else - # We can solve this statically at generation time, so do! - _zfs_generator_cb() { - dset="${1}" - mpnt="${2}" - unit="$(systemd-escape --suffix=mount -p "/sysroot${mpnt}")" - - { - echo "[Unit]" - echo "Before=initrd-root-fs.target" - echo "After=sysroot.mount" - echo - echo "[Mount]" - echo "Where=/sysroot${mpnt}" - echo "What=${dset}" - echo "Type=zfs" - echo "Options=zfsutil" - } > "$GENERATOR_DIR/${unit}" - ln -fs ../"${unit}" "$GENERATOR_DIR"/initrd-root-fs.target.requires/"${unit}" - } - - for_relevant_root_children "${root}" _zfs_generator_cb -fi - - { echo "[Unit]" echo "After=zfs-import.target" diff --git a/contrib/dracut/90zfs/zfs-lib.sh.in b/contrib/dracut/90zfs/zfs-lib.sh.in index 3a43e514d6f9..7139e2e6fe4b 100755 --- a/contrib/dracut/90zfs/zfs-lib.sh.in +++ b/contrib/dracut/90zfs/zfs-lib.sh.in @@ -39,7 +39,7 @@ mount_dataset() { # for_relevant_root_children DATASET EXEC # Runs "EXEC dataset mountpoint" for all children of DATASET that are needed for system bringup -# Used by zfs-generator.sh and friends, too! +# Used by zfs-nonroot-necessities.service and friends, too! for_relevant_root_children() { dataset="${1}" exec="${2}" diff --git a/contrib/dracut/90zfs/zfs-nonroot-necessities.service.in b/contrib/dracut/90zfs/zfs-nonroot-necessities.service.in new file mode 100644 index 000000000000..8f420c737c72 --- /dev/null +++ b/contrib/dracut/90zfs/zfs-nonroot-necessities.service.in @@ -0,0 +1,20 @@ +[Unit] +Before=initrd-root-fs.target +After=sysroot.mount +DefaultDependencies=no +ConditionEnvironment=BOOTFS + +[Service] +Type=oneshot +PassEnvironment=BOOTFS +ExecStart=/bin/sh -c ' \ + . 
/lib/dracut-zfs-lib.sh; \ + _zfs_nonroot_necessities_cb() { \ + @sbindir@/zfs mount | grep -m1 -q "^$1 " && return 0; \ + echo "Mounting $1 on /sysroot$2"; \ + mount -o zfsutil -t zfs "$1" "/sysroot$2"; \ + }; \ + for_relevant_root_children "${BOOTFS}" _zfs_nonroot_necessities_cb' + +[Install] +RequiredBy=initrd-root-fs.target diff --git a/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in b/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in index a29cf3a3dd81..68fdcb1f323e 100644 --- a/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in +++ b/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in @@ -5,8 +5,9 @@ After=zfs-import.target dracut-pre-mount.service zfs-snapshot-bootfs.service Before=dracut-mount.service DefaultDependencies=no ConditionKernelCommandLine=bootfs.rollback +ConditionEnvironment=BOOTFS [Service] Type=oneshot -ExecStart=/bin/sh -c '. /lib/dracut-zfs-lib.sh; decode_root_args || exit; [ "$root" = "zfs:AUTO" ] && root="$BOOTFS"; SNAPNAME="$(getarg bootfs.rollback)"; exec @sbindir@/zfs rollback -Rf "$root@${SNAPNAME:-%v}"' +ExecStart=/bin/sh -c '. /lib/dracut-lib.sh; SNAPNAME="$(getarg bootfs.rollback)"; exec @sbindir@/zfs rollback -Rf "$BOOTFS@${SNAPNAME:-%v}"' RemainAfterExit=yes diff --git a/contrib/dracut/90zfs/zfs-snapshot-bootfs.service.in b/contrib/dracut/90zfs/zfs-snapshot-bootfs.service.in index 9e73d1a78724..a675b5b2ea98 100644 --- a/contrib/dracut/90zfs/zfs-snapshot-bootfs.service.in +++ b/contrib/dracut/90zfs/zfs-snapshot-bootfs.service.in @@ -5,8 +5,9 @@ After=zfs-import.target dracut-pre-mount.service Before=dracut-mount.service DefaultDependencies=no ConditionKernelCommandLine=bootfs.snapshot +ConditionEnvironment=BOOTFS [Service] Type=oneshot -ExecStart=-/bin/sh -c '. /lib/dracut-zfs-lib.sh; decode_root_args || exit; [ "$root" = "zfs:AUTO" ] && root="$BOOTFS"; SNAPNAME="$(getarg bootfs.snapshot)"; exec @sbindir@/zfs snapshot "$root@${SNAPNAME:-%v}"' +ExecStart=-/bin/sh -c '. /lib/dracut-lib.sh; SNAPNAME="$(getarg bootfs.snapshot)"; exec @sbindir@/zfs snapshot "$BOOTFS@${SNAPNAME:-%v}"' RemainAfterExit=yes diff --git a/contrib/dracut/Makefile.am b/contrib/dracut/Makefile.am index 73ca52b66316..b432ab76a6d8 100644 --- a/contrib/dracut/Makefile.am +++ b/contrib/dracut/Makefile.am @@ -16,6 +16,7 @@ pkgdracut_90_SCRIPTS = \ pkgdracut_90_DATA = \ %D%/90zfs/zfs-env-bootfs.service \ + %D%/90zfs/zfs-nonroot-necessities.service \ %D%/90zfs/zfs-rollback-bootfs.service \ %D%/90zfs/zfs-snapshot-bootfs.service diff --git a/man/man7/dracut.zfs.7 b/man/man7/dracut.zfs.7 index 2db2f6639eaf..c1475c695e83 100644 --- a/man/man7/dracut.zfs.7 +++ b/man/man7/dracut.zfs.7 @@ -1,6 +1,6 @@ .\" SPDX-License-Identifier: 0BSD .\" -.Dd April 4, 2022 +.Dd March 28, 2023 .Dt DRACUT.ZFS 7 .Os . 
@@ -28,13 +28,13 @@ zfs-import-scan.service \(da \(da | zfs-import-c zfs-import.target \(-> dracut-pre-mount.service | \(ua | | dracut-zfs-generator | - | ____________________/| + | _____________________/| |/ \(da - | sysroot.mount \(<-\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em dracut-zfs-generator - | | \(da | - | \(da sysroot-{usr,etc,lib,&c.}.mount | - | initrd-root-fs.target \(<-\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em or \(da - | | zfs-nonroot-necessities.service + | sysroot.mount \(<-\(em\(em\(em dracut-zfs-generator + | | + | \(da + | initrd-root-fs.target \(<-\(em zfs-nonroot-necessities.service + | | | | \(da | \(da dracut-mount.service | zfs-snapshot-bootfs.service | | @@ -42,7 +42,7 @@ zfs-import-scan.service \(da \(da | zfs-import-c \(da … | zfs-rollback-bootfs.service | | | \(da | - | sysroot-usr.mount \(<-\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em/ + | /sysroot/{usr,etc,lib,&c.} \(<-\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em\(em/ | | | \(da | initrd-fs.target From 6ecdd35bdbcdeae0adabefe107677620e88d5548 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Wed, 5 Apr 2023 18:57:01 +0200 Subject: [PATCH 027/180] Fix "Add colored output to zfs list" Running `zfs list -o avail rpool` resulted in a core dump. This commit will fix this. Run the needed overhead only, when `use_color()` is true. Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Tino Reichardt Closes #14712 --- cmd/zfs/zfs_main.c | 17 +++++++++++++++-- include/libzutil.h | 1 + lib/libzfs/libzfs.abi | 4 ++++ lib/libzfs/libzfs_util.c | 2 +- 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index f918036cb9b7..d65b01579037 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -3556,8 +3556,21 @@ print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) right_justify = B_FALSE; } - if (pl->pl_prop == ZFS_PROP_AVAILABLE) - color_start(zfs_list_avail_color(zhp)); + /* + * zfs_list_avail_color() needs ZFS_PROP_AVAILABLE + USED + * - so we need another for() search for the USED part + * - when no colors wanted, we can skip the whole thing + */ + if (use_color() && pl->pl_prop == ZFS_PROP_AVAILABLE) { + zprop_list_t *pl2 = cb->cb_proplist; + for (; pl2 != NULL; pl2 = pl2->pl_next) { + if (pl2->pl_prop == ZFS_PROP_USED) { + color_start(zfs_list_avail_color(zhp)); + /* found it, no need for more loops */ + break; + } + } + } /* * If this is being called in scripted mode, or if this is the diff --git a/include/libzutil.h b/include/libzutil.h index 465e463f0c1f..237ff976ba62 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -182,6 +182,7 @@ struct zfs_cmd; #define ANSI_RESET "\033[0m" #define ANSI_BOLD "\033[1m" +_LIBZUTIL_H int use_color(void); _LIBZUTIL_H void color_start(const char *color); _LIBZUTIL_H void color_end(void); _LIBZUTIL_H int printf_color(const char *color, const char *format, ...); diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 99e1b8cdf695..2b61710f5592 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -259,6 +259,7 @@ + @@ -5338,6 +5339,9 @@ + + + diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 60695f8a63f4..393971ddf13c 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -1965,7 +1965,7 @@ zfs_version_print(void) * Return 1 if the user requested ANSI color output, and our terminal supports * it. Return 0 for no color. 
*/ -static int +int use_color(void) { static int use_color = -1; From 8eb2f2605717572d92041534a0556d6d75c47d99 Mon Sep 17 00:00:00 2001 From: youzhongyang Date: Wed, 5 Apr 2023 13:01:38 -0400 Subject: [PATCH 028/180] Linux 6.3 compat: writepage_t first arg struct folio* The type def of writepage_t in kernel 6.3 is changed to take struct folio* as the first argument. We need to detect this change and pass correct function to write_cache_pages(). Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Youzhong Yang Closes #14699 --- config/kernel-writepage_t.m4 | 26 ++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/zfs/zpl_file.c | 28 +++++++++++++++++++++++++--- 3 files changed, 53 insertions(+), 3 deletions(-) create mode 100644 config/kernel-writepage_t.m4 diff --git a/config/kernel-writepage_t.m4 b/config/kernel-writepage_t.m4 new file mode 100644 index 000000000000..3a0cffd98570 --- /dev/null +++ b/config/kernel-writepage_t.m4 @@ -0,0 +1,26 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_WRITEPAGE_T], [ + dnl # + dnl # 6.3 API change + dnl # The writepage_t function type now has its first argument as + dnl # struct folio* instead of struct page* + dnl # + ZFS_LINUX_TEST_SRC([writepage_t_folio], [ + #include + int putpage(struct folio *folio, + struct writeback_control *wbc, void *data) + { return 0; } + writepage_t func = putpage; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_WRITEPAGE_T], [ + AC_MSG_CHECKING([whether int (*writepage_t)() takes struct folio*]) + ZFS_LINUX_TEST_RESULT([writepage_t_folio], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_WRITEPAGE_T_FOLIO, 1, + [int (*writepage_t)() takes struct folio*]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + diff --git a/config/kernel.m4 b/config/kernel.m4 index 4c7569841f33..fb07f5004d3c 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -151,6 +151,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_IDMAP_MNT_API ZFS_AC_KERNEL_SRC_IATTR_VFSID ZFS_AC_KERNEL_SRC_FILEMAP + ZFS_AC_KERNEL_SRC_WRITEPAGE_T case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -281,6 +282,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_IDMAP_MNT_API ZFS_AC_KERNEL_IATTR_VFSID ZFS_AC_KERNEL_FILEMAP + ZFS_AC_KERNEL_WRITEPAGE_T case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 0a50f80ea68d..ce22e9a9e0e4 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -736,6 +736,29 @@ zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) return (0); } +#ifdef HAVE_WRITEPAGE_T_FOLIO +static int +zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) +{ + (void) zpl_putpage(&pp->page, wbc, data); + return (0); +} +#endif + +static inline int +zpl_write_cache_pages(struct address_space *mapping, + struct writeback_control *wbc, void *data) +{ + int result; + +#ifdef HAVE_WRITEPAGE_T_FOLIO + result = write_cache_pages(mapping, wbc, zpl_putfolio, data); +#else + result = write_cache_pages(mapping, wbc, zpl_putpage, data); +#endif + return (result); +} + static int zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -760,7 +783,7 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) */ boolean_t for_sync = (sync_mode == WB_SYNC_ALL); wbc->sync_mode = WB_SYNC_NONE; - result = write_cache_pages(mapping, wbc, zpl_putpage, &for_sync); + result = zpl_write_cache_pages(mapping, wbc, &for_sync); if (sync_mode != wbc->sync_mode) { if 
((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (result); @@ -776,8 +799,7 @@ zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) * details). That being said, this is a no-op in most cases. */ wbc->sync_mode = sync_mode; - result = write_cache_pages(mapping, wbc, zpl_putpage, - &for_sync); + result = zpl_write_cache_pages(mapping, wbc, &for_sync); } return (result); } From 1038f87c4edcc66d7d9446efb9b0d9ed50beda19 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 6 Apr 2023 02:42:22 +0900 Subject: [PATCH 029/180] Fix some signedness issues in arc_evict() It may happen that "wanted total ARC size" (wt) is negative, that was expected. But multiplication product of it and unsigned fractions result in unsigned value, incorrectly shifted right with a sing loss. Reviewed-by: Brian Behlendorf Reviewed-by: Prakash Surya Reviewed-by: Paul Dagnelie Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14692 --- module/zfs/arc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index e32707bbe5c3..c50228a2682f 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -4465,7 +4465,7 @@ arc_evict(void) */ int64_t prune = 0; int64_t dn = wmsum_value(&arc_sums.arcstat_dnode_size); - w = wt * (arc_meta >> 16) >> 16; + w = wt * (int64_t)(arc_meta >> 16) >> 16; if (zfs_refcount_count(&arc_mru->arcs_size[ARC_BUFC_METADATA]) + zfs_refcount_count(&arc_mfu->arcs_size[ARC_BUFC_METADATA]) - zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) - @@ -4481,7 +4481,7 @@ arc_evict(void) arc_prune_async(prune); /* Evict MRU metadata. */ - w = wt * (arc_meta * arc_pm >> 48) >> 16; + w = wt * (int64_t)(arc_meta * arc_pm >> 48) >> 16; e = MIN((int64_t)(asize - arc_c), (int64_t)(mrum - w)); bytes = arc_evict_impl(arc_mru, ARC_BUFC_METADATA, e); total_evicted += bytes; @@ -4489,7 +4489,7 @@ arc_evict(void) asize -= bytes; /* Evict MFU metadata. */ - w = wt * (arc_meta >> 16) >> 16; + w = wt * (int64_t)(arc_meta >> 16) >> 16; e = MIN((int64_t)(asize - arc_c), (int64_t)(m - w)); bytes = arc_evict_impl(arc_mfu, ARC_BUFC_METADATA, e); total_evicted += bytes; @@ -4498,7 +4498,7 @@ arc_evict(void) /* Evict MRU data. */ wt -= m - total_evicted; - w = wt * (arc_pd >> 16) >> 16; + w = wt * (int64_t)(arc_pd >> 16) >> 16; e = MIN((int64_t)(asize - arc_c), (int64_t)(mrud - w)); bytes = arc_evict_impl(arc_mru, ARC_BUFC_DATA, e); total_evicted += bytes; From b66c2a0899a71fce6454ff92e77e3838eafb27c8 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Thu, 6 Apr 2023 10:29:27 -0700 Subject: [PATCH 030/180] Storage device expansion "silently" fails on degraded vdev When a vdev is degraded or faulted, we refuse to expand it when doing online -e. However, we also don't actually cause the online command to fail, even though the disk didn't expand. This is confusing and misleading, and can result in violated expectations. 
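To make the failure visible, the change records the vdev's state before the
online request and reports a failed expansion afterwards. A condensed sketch
of that logic follows; zhp, vdevname, and flags stand in for the locals used
in zpool_do_online(), and the error printing is simplified:

    boolean_t avail_spare, l2cache;
    uint_t vsc;
    vdev_state_t oldstate, newstate;

    /* Capture the vdev's state before asking the kernel to online it. */
    nvlist_t *tgt = zpool_find_vdev(zhp, vdevname, &avail_spare,
        &l2cache, NULL);
    if (tgt == NULL)
        return (1);
    oldstate = ((vdev_stat_t *)fnvlist_lookup_uint64_array(tgt,
        ZPOOL_CONFIG_VDEV_STATS, &vsc))->vs_state;

    if (zpool_vdev_online(zhp, vdevname, flags, &newstate) == 0 &&
        newstate != VDEV_STATE_HEALTHY && (flags & ZFS_ONLINE_EXPAND)) {
        /*
         * Expansion was requested but the device did not come back
         * healthy.  Treat this as a hard error only when the device
         * had been at least DEGRADED, i.e. it was expected to be
         * usable and expandable.
         */
        (void) fprintf(stderr, "%s: failed to expand usable space on "
            "unhealthy device '%s'\n",
            oldstate >= VDEV_STATE_DEGRADED ? "error" : "warning",
            vdevname);
        if (oldstate >= VDEV_STATE_DEGRADED)
            return (1);
    }

The full change in cmd/zpool/zpool_main.c below carries the same logic, with
gettext() message handling and loop bookkeeping preserved.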
Reviewed-by: Brian Behlendorf Reviewed-by: Matthew Ahrens Reviewed-by: Mark Maybee Signed-off-by: Paul Dagnelie Closes 14145 --- cmd/zpool/zpool_main.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index d79c1608b09f..9475beaa2368 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -6936,6 +6936,17 @@ zpool_do_online(int argc, char **argv) return (1); for (i = 1; i < argc; i++) { + vdev_state_t oldstate; + boolean_t avail_spare, l2cache; + nvlist_t *tgt = zpool_find_vdev(zhp, argv[i], &avail_spare, + &l2cache, NULL); + if (tgt == NULL) { + ret = 1; + continue; + } + uint_t vsc; + oldstate = ((vdev_stat_t *)fnvlist_lookup_uint64_array(tgt, + ZPOOL_CONFIG_VDEV_STATS, &vsc))->vs_state; if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " @@ -6949,6 +6960,17 @@ zpool_do_online(int argc, char **argv) (void) printf(gettext("use 'zpool " "replace' to replace devices " "that are no longer present\n")); + if ((flags & ZFS_ONLINE_EXPAND)) { + (void) printf(gettext("%s: failed " + "to expand usable space on " + "unhealthy device '%s'\n"), + (oldstate >= VDEV_STATE_DEGRADED ? + "error" : "warning"), argv[i]); + if (oldstate >= VDEV_STATE_DEGRADED) { + ret = 1; + break; + } + } } } else { ret = 1; From ece7ab7e7de81a9a51923d7baa7db3577de4aace Mon Sep 17 00:00:00 2001 From: Rob N Date: Fri, 7 Apr 2023 03:31:19 +1000 Subject: [PATCH 031/180] vdev: expose zfs_vdev_def_queue_depth as a module parameter It was previously available only to FreeBSD. Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Seagate Technology LLC Closes #14718 --- man/man4/zfs.4 | 5 +++++ module/os/freebsd/zfs/sysctl_os.c | 8 -------- module/zfs/vdev_queue.c | 3 +++ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index e8e2cfec61e8..566caae7b4e9 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1292,6 +1292,11 @@ as fuller devices will tend to be slower than empty devices. Also see .Sy zio_dva_throttle_enabled . . +.It Sy zfs_vdev_def_queue_depth Ns = Ns Sy 32 Pq uint +Default queue depth for each vdev IO allocator. +Higher values allow for better coalescing of sequential writes before sending +them to the disk, but can increase transaction commit times. +. .It Sy zfs_vdev_failfast_mask Ns = Ns Sy 1 Pq uint Defines if the driver should retire on a given error type. 
The following options may be bitwise-ored together: diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index eccb91deff4f..cc616f33db96 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -887,14 +887,6 @@ SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, " (LEGACY)"); /* END CSTYLED */ -extern uint_t zfs_vdev_def_queue_depth; - -/* BEGIN CSTYLED */ -SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, - CTLFLAG_RWTUN, &zfs_vdev_def_queue_depth, 0, - "Default queue depth for each allocator"); -/* END CSTYLED */ - /* zio.c */ /* BEGIN CSTYLED */ diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index ec55674393ce..1a75d68abd9e 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -1119,3 +1119,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW, "Queue depth percentage for each top-level vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW, + "Default queue depth for each allocator"); From a3f82aec933660558d6512a83481527ef731ff0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Matu=C5=A1ka?= Date: Thu, 6 Apr 2023 19:35:02 +0200 Subject: [PATCH 032/180] Miscellaneous FreBSD compilation bugfixes Add missing machine/md_var.h to spl/sys/simd_aarch64.h and spl/sys/simd_arm.h In spl/sys/simd_x86.h, PCB_FPUNOSAVE exists only on amd64, use PCB_NPXNOSAVE on i386 In FreeBSD sys/elf_common.h redefines AT_UID and AT_GID on FreeBSD, we need a hack in vnode.h similar to Linux. sys/simd.h needs to be included early. In zfs_freebsd_copy_file_range() we pass a (size_t *)lenp to zfs_clone_range() that expects a (uint64_t *) Allow compiling armv6 world by limiting ARM macros in sha256_impl.c and sha512_impl.c to __ARM_ARCH > 6 Reviewed-by: Alexander Motin Reviewed-by: Tino Reichardt Reviewed-by: Richard Yao Reviewed-by: Pawel Jakub Dawidek Reviewed-by: Signed-off-by: WHR Signed-off-by: Martin Matuska Closes #14674 --- include/os/freebsd/spl/sys/simd_aarch64.h | 1 + include/os/freebsd/spl/sys/simd_arm.h | 1 + include/os/freebsd/spl/sys/simd_x86.h | 4 ++++ include/os/freebsd/spl/sys/vnode.h | 4 ++++ module/icp/algs/blake3/blake3.c | 1 + module/icp/algs/blake3/blake3_generic.c | 1 + module/icp/algs/blake3/blake3_impl.c | 2 +- module/icp/algs/sha2/sha256_impl.c | 6 +++--- module/icp/algs/sha2/sha512_impl.c | 6 +++--- module/os/freebsd/zfs/zfs_vnops_os.c | 7 ++++--- module/zcommon/zfs_fletcher.c | 2 +- module/zcommon/zfs_prop.c | 6 ++++-- module/zfs/vdev_raidz_math.c | 2 +- 13 files changed, 29 insertions(+), 14 deletions(-) diff --git a/include/os/freebsd/spl/sys/simd_aarch64.h b/include/os/freebsd/spl/sys/simd_aarch64.h index 847c2ed29189..df33bdaeccf8 100644 --- a/include/os/freebsd/spl/sys/simd_aarch64.h +++ b/include/os/freebsd/spl/sys/simd_aarch64.h @@ -45,6 +45,7 @@ #include #include +#include #define kfpu_allowed() 1 #define kfpu_initialize(tsk) do {} while (0) diff --git a/include/os/freebsd/spl/sys/simd_arm.h b/include/os/freebsd/spl/sys/simd_arm.h index f6362cd6bb54..178fbc3b3c6e 100644 --- a/include/os/freebsd/spl/sys/simd_arm.h +++ b/include/os/freebsd/spl/sys/simd_arm.h @@ -44,6 +44,7 @@ #include #include +#include #define kfpu_allowed() 1 #define kfpu_initialize(tsk) do {} while (0) diff --git a/include/os/freebsd/spl/sys/simd_x86.h b/include/os/freebsd/spl/sys/simd_x86.h index 6512d4fcba4f..8e93b558dfe8 100644 --- a/include/os/freebsd/spl/sys/simd_x86.h +++ 
b/include/os/freebsd/spl/sys/simd_x86.h @@ -45,6 +45,10 @@ fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX);\ } +#ifndef PCB_FPUNOSAVE +#define PCB_FPUNOSAVE PCB_NPXNOSAVE +#endif + #define kfpu_end() { \ if (__predict_false(curpcb->pcb_flags & PCB_FPUNOSAVE)) \ fpu_kern_leave(curthread, NULL); \ diff --git a/include/os/freebsd/spl/sys/vnode.h b/include/os/freebsd/spl/sys/vnode.h index 483d12ae59a2..ab1727dca0c9 100644 --- a/include/os/freebsd/spl/sys/vnode.h +++ b/include/os/freebsd/spl/sys/vnode.h @@ -143,6 +143,10 @@ vn_flush_cached_data(vnode_t *vp, boolean_t sync) /* * Attributes of interest to the caller of setattr or getattr. */ + +#undef AT_UID +#undef AT_GID + #define AT_MODE 0x00002 #define AT_UID 0x00004 #define AT_GID 0x00008 diff --git a/module/icp/algs/blake3/blake3.c b/module/icp/algs/blake3/blake3.c index 4f93e4ff2051..0bab7a3a7593 100644 --- a/module/icp/algs/blake3/blake3.c +++ b/module/icp/algs/blake3/blake3.c @@ -25,6 +25,7 @@ * Copyright (c) 2021-2022 Tino Reichardt */ +#include #include #include diff --git a/module/icp/algs/blake3/blake3_generic.c b/module/icp/algs/blake3/blake3_generic.c index ca7197a26f39..fbe184969672 100644 --- a/module/icp/algs/blake3/blake3_generic.c +++ b/module/icp/algs/blake3/blake3_generic.c @@ -25,6 +25,7 @@ * Copyright (c) 2021-2022 Tino Reichardt */ +#include #include #include "blake3_impl.h" diff --git a/module/icp/algs/blake3/blake3_impl.c b/module/icp/algs/blake3/blake3_impl.c index b59fde1a4dd3..f3f48c2dfa1a 100644 --- a/module/icp/algs/blake3/blake3_impl.c +++ b/module/icp/algs/blake3/blake3_impl.c @@ -23,10 +23,10 @@ * Copyright (c) 2021-2022 Tino Reichardt */ +#include #include #include #include -#include #include "blake3_impl.h" diff --git a/module/icp/algs/sha2/sha256_impl.c b/module/icp/algs/sha2/sha256_impl.c index 278d7e577d72..01ce5cbd814c 100644 --- a/module/icp/algs/sha2/sha256_impl.c +++ b/module/icp/algs/sha2/sha256_impl.c @@ -23,10 +23,10 @@ * Copyright (c) 2022 Tino Reichardt */ +#include #include #include #include -#include #include #include @@ -118,7 +118,7 @@ const sha256_ops_t sha256_shani_impl = { }; #endif -#elif defined(__aarch64__) || defined(__arm__) +#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH > 6) static boolean_t sha256_have_neon(void) { return (kfpu_allowed() && zfs_neon_available()); @@ -192,7 +192,7 @@ static const sha256_ops_t *const sha256_impls[] = { #if defined(__x86_64) && defined(HAVE_SSE4_1) &sha256_shani_impl, #endif -#if defined(__aarch64__) || defined(__arm__) +#if defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH > 6) &sha256_armv7_impl, &sha256_neon_impl, &sha256_armv8_impl, diff --git a/module/icp/algs/sha2/sha512_impl.c b/module/icp/algs/sha2/sha512_impl.c index 991e832b15ca..27b35a639a54 100644 --- a/module/icp/algs/sha2/sha512_impl.c +++ b/module/icp/algs/sha2/sha512_impl.c @@ -23,10 +23,10 @@ * Copyright (c) 2022 Tino Reichardt */ +#include #include #include #include -#include #include #include @@ -108,7 +108,7 @@ const sha512_ops_t sha512_armv8_impl = { .name = "armv8-ce" }; -#elif defined(__arm__) +#elif defined(__arm__) && __ARM_ARCH > 6 extern void zfs_sha512_block_armv7(uint64_t s[8], const void *, size_t); const sha512_ops_t sha512_armv7_impl = { .is_supported = sha2_is_supported, @@ -168,7 +168,7 @@ static const sha512_ops_t *const sha512_impls[] = { &sha512_armv7_impl, &sha512_armv8_impl, #endif -#if defined(__arm__) +#if defined(__arm__) && __ARM_ARCH > 6 &sha512_armv7_impl, &sha512_neon_impl, #endif diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c 
b/module/os/freebsd/zfs/zfs_vnops_os.c index 9169244b1a13..b3405b7593f4 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -29,12 +29,12 @@ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ - #include #include #include #include #include +#include #include #include #include @@ -85,7 +85,6 @@ #include #include #include -#include #include #include #include @@ -6241,6 +6240,7 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) struct mount *mp; struct uio io; int error; + uint64_t len = *ap->a_lenp; /* * TODO: If offset/length is not aligned to recordsize, use @@ -6289,7 +6289,8 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) goto unlock; error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp), - ap->a_outoffp, ap->a_lenp, ap->a_fsizetd->td_ucred); + ap->a_outoffp, &len, ap->a_fsizetd->td_ucred); + *ap->a_lenp = (size_t)len; unlock: if (invp != outvp) diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c index 1d9b1cffc0b2..619ddef0243a 100644 --- a/module/zcommon/zfs_fletcher.c +++ b/module/zcommon/zfs_fletcher.c @@ -136,8 +136,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 9c65702b8d43..3db6fd13f4ae 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -30,6 +30,10 @@ /* Portions Copyright 2010 Robert Milkowski */ +#if defined(_KERNEL) +#include +#endif + #include #include #include @@ -1037,8 +1041,6 @@ zfs_prop_align_right(zfs_prop_t prop) #if defined(_KERNEL) -#include - #if defined(HAVE_KERNEL_FPU_INTERNAL) uint8_t **zfs_kfpu_fpregs; EXPORT_SYMBOL(zfs_kfpu_fpregs); diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c index 66f211c430ce..e12b96170f55 100644 --- a/module/zfs/vdev_raidz_math.c +++ b/module/zfs/vdev_raidz_math.c @@ -22,6 +22,7 @@ * Copyright (C) 2016 Gvozden Nešković. All rights reserved. */ +#include #include #include #include @@ -29,7 +30,6 @@ #include #include #include -#include /* Opaque implementation with NULL methods to represent original methods */ static const raidz_impl_ops_t vdev_raidz_original_impl = { From 0b8fdb8ade5d5f0f4ab5be4e488643c2b6c312be Mon Sep 17 00:00:00 2001 From: Andrew Innes Date: Fri, 7 Apr 2023 01:40:23 +0800 Subject: [PATCH 033/180] ZTS: Use inbuilt monotonic time Make the test runner try to use the included python monotonic time function instead of calling librt. This makes the test runner work on macos where librt wasn't available. 
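For context, the librt path being demoted to a fallback here is just a ctypes
binding to the POSIX monotonic clock. A rough standalone C equivalent of what
that fallback reads (illustrative only, not part of the patch):

    #include <stdio.h>
    #include <time.h>

    int
    main(void)
    {
        struct timespec ts;

        /* Same clock the ctypes fallback queries through librt. */
        if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
            perror("clock_gettime");
            return (1);
        }

        /* test-runner combines the fields as tv_sec + tv_nsec * 1e-9 */
        printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
        return (0);
    }

Python's time.monotonic (available since 3.3) wraps the platform's monotonic
clock, including on macOS, which is why importing it can replace the
hand-rolled binding wherever it exists.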
Reviewed-by: Tino Reichardt Signed-off-by: Andrew Innes Closes #14700 --- tests/test-runner/bin/test-runner.py.in | 34 ++++++++++++------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/test-runner/bin/test-runner.py.in b/tests/test-runner/bin/test-runner.py.in index 28276ebc47e3..c454bf8d7c6a 100755 --- a/tests/test-runner/bin/test-runner.py.in +++ b/tests/test-runner/bin/test-runner.py.in @@ -47,25 +47,25 @@ LOG_OUT = 'LOG_OUT' LOG_ERR = 'LOG_ERR' LOG_FILE_OBJ = None +try: + from time import monotonic as monotonic_time +except ImportError: + class timespec(ctypes.Structure): + _fields_ = [ + ('tv_sec', ctypes.c_long), + ('tv_nsec', ctypes.c_long) + ] -class timespec(ctypes.Structure): - _fields_ = [ - ('tv_sec', ctypes.c_long), - ('tv_nsec', ctypes.c_long) - ] - - -librt = ctypes.CDLL('librt.so.1', use_errno=True) -clock_gettime = librt.clock_gettime -clock_gettime.argtypes = [ctypes.c_int, ctypes.POINTER(timespec)] - + librt = ctypes.CDLL('librt.so.1', use_errno=True) + clock_gettime = librt.clock_gettime + clock_gettime.argtypes = [ctypes.c_int, ctypes.POINTER(timespec)] -def monotonic_time(): - t = timespec() - if clock_gettime(CLOCK_MONOTONIC, ctypes.pointer(t)) != 0: - errno_ = ctypes.get_errno() - raise OSError(errno_, os.strerror(errno_)) - return t.tv_sec + t.tv_nsec * 1e-9 + def monotonic_time(): + t = timespec() + if clock_gettime(CLOCK_MONOTONIC, ctypes.pointer(t)) != 0: + errno_ = ctypes.get_errno() + raise OSError(errno_, os.strerror(errno_)) + return t.tv_sec + t.tv_nsec * 1e-9 class Result(object): From 8ab674ab9c8f4f73fec5582b7186fab633c324e3 Mon Sep 17 00:00:00 2001 From: Damian Szuberski Date: Fri, 7 Apr 2023 03:43:24 +1000 Subject: [PATCH 034/180] ZTS: add existing tests to runfiles Some test cases were committed to the repository but never added to runfiles. Move `zfs_unshare_008_pos` to the Linux-only runfile. 
Reviewed-by: Brian Behlendorf Signed-off-by: szubersk Closes #14701 --- tests/runfiles/common.run | 15 ++++++++------- tests/runfiles/linux.run | 6 +++++- tests/zfs-tests/tests/Makefile.am | 1 + 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 50a9309acea5..4233c0285c4b 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -106,7 +106,7 @@ tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', 'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict', 'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult', 'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', 'tst.snapshot_neg', - 'tst.snapshot_recursive', 'tst.snapshot_simple', + 'tst.snapshot_recursive', 'tst.snapshot_rename', 'tst.snapshot_simple', 'tst.bookmark.create', 'tst.bookmark.copy', 'tst.terminate_by_signal' ] @@ -151,7 +151,8 @@ tags = ['functional', 'cli_root', 'zfs_change-key'] tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos', 'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos', 'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg', - 'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested'] + 'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested', + 'zfs_clone_rm_nested'] tags = ['functional', 'cli_root', 'zfs_clone'] [tests/functional/cli_root/zfs_copies] @@ -266,8 +267,8 @@ tags = ['functional', 'cli_root', 'zfs_rollback'] [tests/functional/cli_root/zfs_send] tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', 'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos', - 'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_raw', - 'zfs_send_sparse', 'zfs_send-b', 'zfs_send_skip_missing'] + 'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_encrypted_unloaded', + 'zfs_send_raw', 'zfs_send_sparse', 'zfs_send-b', 'zfs_send_skip_missing'] tags = ['functional', 'cli_root', 'zfs_send'] [tests/functional/cli_root/zfs_set] @@ -310,7 +311,7 @@ tags = ['functional', 'cli_root', 'zfs_unmount'] [tests/functional/cli_root/zfs_unshare] tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos', 'zfs_unshare_004_neg', 'zfs_unshare_005_neg', 'zfs_unshare_006_pos', - 'zfs_unshare_007_pos', 'zfs_unshare_008_pos'] + 'zfs_unshare_007_pos'] tags = ['functional', 'cli_root', 'zfs_unshare'] [tests/functional/cli_root/zfs_upgrade] @@ -794,13 +795,13 @@ tests = ['removal_all_vdev', 'removal_cancel', 'removal_check_space', 'removal_nopwrite', 'removal_remap_deadlists', 'removal_resume_export', 'removal_sanity', 'removal_with_add', 'removal_with_create_fs', 'removal_with_dedup', - 'removal_with_errors', 'removal_with_export', + 'removal_with_errors', 'removal_with_export', 'removal_with_indirect', 'removal_with_ganging', 'removal_with_faulted', 'removal_with_remove', 'removal_with_scrub', 'removal_with_send', 'removal_with_send_recv', 'removal_with_snapshot', 'removal_with_write', 'removal_with_zdb', 'remove_expanded', 'remove_mirror', 'remove_mirror_sanity', 'remove_raidz', - 'remove_indirect', 'remove_attach_mirror'] + 'remove_indirect', 'remove_attach_mirror', 'removal_reservation'] tags = ['functional', 'removal'] [tests/functional/rename_dirs] diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 15755408b5ad..4df770d61f07 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -52,6 +52,10 @@ tests = ['zfs_share_005_pos', 'zfs_share_007_neg', 'zfs_share_009_neg', 
'zfs_share_012_pos', 'zfs_share_013_pos'] tags = ['functional', 'cli_root', 'zfs_share'] +[tests/functional/cli_root/zfs_unshare:Linux] +tests = ['zfs_unshare_008_pos'] +tags = ['functional', 'cli_root', 'zfs_unshare'] + [tests/functional/cli_root/zfs_sysfs:Linux] tests = ['zfeature_set_unsupported', 'zfs_get_unsupported', 'zfs_set_unsupported', 'zfs_sysfs_live', 'zpool_get_unsupported', @@ -121,7 +125,7 @@ post = tags = ['functional', 'largest_pool'] [tests/functional/mmap:Linux] -tests = ['mmap_libaio_001_pos'] +tests = ['mmap_libaio_001_pos', 'mmap_sync_001_pos'] tags = ['functional', 'mmap'] [tests/functional/mmp:Linux] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 92d62b503f65..a470573616af 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1703,6 +1703,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/removal/removal_with_export.ksh \ functional/removal/removal_with_faulted.ksh \ functional/removal/removal_with_ganging.ksh \ + functional/removal/removal_with_indirect.ksh \ functional/removal/removal_with_remove.ksh \ functional/removal/removal_with_scrub.ksh \ functional/removal/removal_with_send.ksh \ From a8a127e2c9f352ba797cd6625d61840b425d6534 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Thu, 6 Apr 2023 19:46:18 +0200 Subject: [PATCH 035/180] Fix typo in check_clones() Run kmem_free() after zap_cursor_fini(). Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Reviewed-by: Adam Moss Signed-off-by: George Amanakis Closes #14702 --- module/zfs/spa_errlog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 41cb9d01273c..af144ef16978 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -470,9 +470,9 @@ static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, break; } + zap_cursor_fini(zc); kmem_free(za, sizeof (*za)); kmem_free(zc, sizeof (*zc)); - zap_cursor_fini(zc); return (error); } From ff73574cd83580e4dd5905a43695bd5d0f4911b3 Mon Sep 17 00:00:00 2001 From: Rob N Date: Fri, 7 Apr 2023 03:52:50 +1000 Subject: [PATCH 036/180] vdev: expose zfs_vdev_max_ms_shift as a module parameter Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Reviewed-by: Alexander Motin Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Seagate Technology LLC Closes #14719 --- man/man4/zfs.4 | 5 ++++- module/zfs/vdev.c | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 566caae7b4e9..d529147464fe 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -357,7 +357,10 @@ and the allocation can't actually be satisfied When a vdev is added, target this number of metaslabs per top-level vdev. . .It Sy zfs_vdev_default_ms_shift Ns = Ns Sy 29 Po 512 MiB Pc Pq uint -Default limit for metaslab size. +Default lower limit for metaslab size. +. +.It Sy zfs_vdev_max_ms_shift Ns = Ns Sy 34 Po 16 GiB Pc Pq uint +Default upper limit for metaslab size. . 
.It Sy zfs_vdev_max_auto_ashift Ns = Ns Sy 14 Pq uint Maximum ashift used when optimizing for logical \[->] physical sector size on diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index ad932a7ba764..7cf858c05051 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -96,7 +96,7 @@ static uint_t zfs_vdev_ms_count_limit = 1ULL << 17; static uint_t zfs_vdev_default_ms_shift = 29; /* upper limit for metaslab size (16G) */ -static const uint_t zfs_vdev_max_ms_shift = 34; +static uint_t zfs_vdev_max_ms_shift = 34; int vdev_validate_skip = B_FALSE; @@ -6288,7 +6288,10 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW, "Target number of metaslabs per top-level vdev"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW, - "Default limit for metaslab size"); + "Default lower limit for metaslab size"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW, + "Default upper limit for metaslab size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW, "Minimum number of metaslabs per top-level vdev"); From baca06c258e07522165cb8e33ff2c0224ad0da57 Mon Sep 17 00:00:00 2001 From: Rob N Date: Tue, 11 Apr 2023 04:53:02 +1000 Subject: [PATCH 037/180] libzfs: add v2 iterator interfaces f6a0dac84 modified the zfs_iter_* functions to take a new "flags" parameter, and introduced a variety of flags to ask the kernel to limit the results in various ways, reducing the amount of work the caller needed to do to filter out things they didn't need. Unfortunately this change broke the ABI for existing clients (read: older versions of the `zfs` program), and was reverted 399b98198. dc95911d2 reintroduced the original patch, with the understanding that a backwards-compatible fix would be made before the 2.2 release branch was tagged. This commit is that fix. This introduces zfs_iter_*_v2 functions that have the new flags argument, and reverts the existing functions to not have the flags parameter, as they were before. The old functions are now reimplemented in terms of the new, with flags set to 0. Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Original-patch-by: George Wilson Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. 
Closes #14597 --- cmd/zfs/zfs_iter.c | 6 +- cmd/zfs/zfs_main.c | 30 ++++----- cmd/zpool/zpool_main.c | 2 +- contrib/pam_zfs_key/pam_zfs_key.c | 4 +- include/libzfs.h | 26 ++++++-- lib/libzfs/libzfs.abi | 102 +++++++++++++++++++++++------- lib/libzfs/libzfs_changelist.c | 7 +- lib/libzfs/libzfs_crypto.c | 2 +- lib/libzfs/libzfs_dataset.c | 18 +++--- lib/libzfs/libzfs_iter.c | 75 ++++++++++++++++++---- lib/libzfs/libzfs_mount.c | 4 +- lib/libzfs/libzfs_sendrecv.c | 21 +++--- 12 files changed, 208 insertions(+), 89 deletions(-) diff --git a/cmd/zfs/zfs_iter.c b/cmd/zfs/zfs_iter.c index 6665627d43e3..202cb0e82b5a 100644 --- a/cmd/zfs/zfs_iter.c +++ b/cmd/zfs/zfs_iter.c @@ -143,19 +143,19 @@ zfs_callback(zfs_handle_t *zhp, void *data) (cb->cb_types & (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME))) && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { - (void) zfs_iter_filesystems(zhp, cb->cb_flags, + (void) zfs_iter_filesystems_v2(zhp, cb->cb_flags, zfs_callback, data); } if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | ZFS_TYPE_BOOKMARK)) == 0) && include_snaps) { - (void) zfs_iter_snapshots(zhp, cb->cb_flags, + (void) zfs_iter_snapshots_v2(zhp, cb->cb_flags, zfs_callback, data, 0, 0); } if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | ZFS_TYPE_BOOKMARK)) == 0) && include_bmarks) { - (void) zfs_iter_bookmarks(zhp, cb->cb_flags, + (void) zfs_iter_bookmarks_v2(zhp, cb->cb_flags, zfs_callback, data); } diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index d65b01579037..e28f1d04f350 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -1532,7 +1532,8 @@ destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) int err; assert(cb->cb_firstsnap == NULL); assert(cb->cb_prevsnap == NULL); - err = zfs_iter_snapshots_sorted(fs_zhp, 0, destroy_print_cb, cb, 0, 0); + err = zfs_iter_snapshots_sorted_v2(fs_zhp, 0, destroy_print_cb, cb, 0, + 0); if (cb->cb_firstsnap != NULL) { uint64_t used = 0; if (err == 0) { @@ -1558,7 +1559,7 @@ snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg) if (!cb->cb_doclones && !cb->cb_defer_destroy) { cb->cb_target = zhp; cb->cb_first = B_TRUE; - err = zfs_iter_dependents(zhp, 0, B_TRUE, + err = zfs_iter_dependents_v2(zhp, 0, B_TRUE, destroy_check_dependent, cb); } @@ -1576,7 +1577,7 @@ gather_snapshots(zfs_handle_t *zhp, void *arg) destroy_cbdata_t *cb = arg; int err = 0; - err = zfs_iter_snapspec(zhp, 0, cb->cb_snapspec, + err = zfs_iter_snapspec_v2(zhp, 0, cb->cb_snapspec, snapshot_to_nvl_cb, cb); if (err == ENOENT) err = 0; @@ -1590,7 +1591,7 @@ gather_snapshots(zfs_handle_t *zhp, void *arg) } if (cb->cb_recurse) - err = zfs_iter_filesystems(zhp, 0, gather_snapshots, cb); + err = zfs_iter_filesystems_v2(zhp, 0, gather_snapshots, cb); out: zfs_close(zhp); @@ -1615,7 +1616,7 @@ destroy_clones(destroy_cbdata_t *cb) * false while destroying the clones. */ cb->cb_defer_destroy = B_FALSE; - err = zfs_iter_dependents(zhp, 0, B_FALSE, + err = zfs_iter_dependents_v2(zhp, 0, B_FALSE, destroy_callback, cb); cb->cb_defer_destroy = defer; zfs_close(zhp); @@ -1825,9 +1826,8 @@ zfs_do_destroy(int argc, char **argv) * Check for any dependents and/or clones. 
*/ cb.cb_first = B_TRUE; - if (!cb.cb_doclones && - zfs_iter_dependents(zhp, 0, B_TRUE, destroy_check_dependent, - &cb) != 0) { + if (!cb.cb_doclones && zfs_iter_dependents_v2(zhp, 0, B_TRUE, + destroy_check_dependent, &cb) != 0) { rv = 1; goto out; } @@ -1837,7 +1837,7 @@ zfs_do_destroy(int argc, char **argv) goto out; } cb.cb_batchedsnaps = fnvlist_alloc(); - if (zfs_iter_dependents(zhp, 0, B_FALSE, destroy_callback, + if (zfs_iter_dependents_v2(zhp, 0, B_FALSE, destroy_callback, &cb) != 0) { rv = 1; goto out; @@ -4065,7 +4065,7 @@ rollback_check(zfs_handle_t *zhp, void *data) } if (cbp->cb_recurse) { - if (zfs_iter_dependents(zhp, 0, B_TRUE, + if (zfs_iter_dependents_v2(zhp, 0, B_TRUE, rollback_check_dependent, cbp) != 0) { zfs_close(zhp); return (-1); @@ -4164,10 +4164,10 @@ zfs_do_rollback(int argc, char **argv) if (cb.cb_create > 0) min_txg = cb.cb_create; - if ((ret = zfs_iter_snapshots(zhp, 0, rollback_check, &cb, + if ((ret = zfs_iter_snapshots_v2(zhp, 0, rollback_check, &cb, min_txg, 0)) != 0) goto out; - if ((ret = zfs_iter_bookmarks(zhp, 0, rollback_check, &cb)) != 0) + if ((ret = zfs_iter_bookmarks_v2(zhp, 0, rollback_check, &cb)) != 0) goto out; if ((ret = cb.cb_error) != 0) @@ -4309,7 +4309,7 @@ zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) free(name); if (sd->sd_recursive) - rv = zfs_iter_filesystems(zhp, 0, zfs_snapshot_cb, sd); + rv = zfs_iter_filesystems_v2(zhp, 0, zfs_snapshot_cb, sd); zfs_close(zhp); return (rv); } @@ -6373,7 +6373,7 @@ zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un) if (un && opts.recursive) { struct deleg_perms data = { un, update_perm_nvl }; - if (zfs_iter_filesystems(zhp, 0, set_deleg_perms, + if (zfs_iter_filesystems_v2(zhp, 0, set_deleg_perms, &data) != 0) goto cleanup0; } @@ -6751,7 +6751,7 @@ get_one_dataset(zfs_handle_t *zhp, void *data) /* * Iterate over any nested datasets. 
*/ - if (zfs_iter_filesystems(zhp, 0, get_one_dataset, data) != 0) { + if (zfs_iter_filesystems_v2(zhp, 0, get_one_dataset, data) != 0) { zfs_close(zhp); return (1); } diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 9475beaa2368..20f9cd679534 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -8855,7 +8855,7 @@ check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs) (*count)++; } - zfs_iter_filesystems(zhp, 0, check_unsupp_fs, unsupp_fs); + zfs_iter_filesystems_v2(zhp, 0, check_unsupp_fs, unsupp_fs); zfs_close(zhp); diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index 99cdb8d7733f..6ba5b5fba75f 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -535,8 +535,8 @@ zfs_key_config_get_dataset(zfs_key_config_t *config) return (NULL); } - (void) zfs_iter_filesystems(zhp, 0, find_dsname_by_prop_value, - config); + (void) zfs_iter_filesystems_v2(zhp, 0, + find_dsname_by_prop_value, config); zfs_close(zhp); char *dsname = config->dsname; config->dsname = NULL; diff --git a/include/libzfs.h b/include/libzfs.h index 4f8eeb72ad95..7ec9768d8e93 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -656,17 +656,29 @@ _LIBZFS_H void zprop_print_one_property(const char *, zprop_get_cbdata_t *, typedef int (*zfs_iter_f)(zfs_handle_t *, void *); _LIBZFS_H int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *); -_LIBZFS_H int zfs_iter_children(zfs_handle_t *, int, zfs_iter_f, void *); -_LIBZFS_H int zfs_iter_dependents(zfs_handle_t *, int, boolean_t, zfs_iter_f, +_LIBZFS_H int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); +_LIBZFS_H int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); -_LIBZFS_H int zfs_iter_filesystems(zfs_handle_t *, int, zfs_iter_f, void *); -_LIBZFS_H int zfs_iter_snapshots(zfs_handle_t *, int, zfs_iter_f, void *, +_LIBZFS_H int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); +_LIBZFS_H int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *, uint64_t, uint64_t); -_LIBZFS_H int zfs_iter_snapshots_sorted(zfs_handle_t *, int, zfs_iter_f, void *, +_LIBZFS_H int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *, uint64_t, uint64_t); -_LIBZFS_H int zfs_iter_snapspec(zfs_handle_t *, int, const char *, zfs_iter_f, +_LIBZFS_H int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *); -_LIBZFS_H int zfs_iter_bookmarks(zfs_handle_t *, int, zfs_iter_f, void *); +_LIBZFS_H int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *); + +_LIBZFS_H int zfs_iter_children_v2(zfs_handle_t *, int, zfs_iter_f, void *); +_LIBZFS_H int zfs_iter_dependents_v2(zfs_handle_t *, int, boolean_t, zfs_iter_f, + void *); +_LIBZFS_H int zfs_iter_filesystems_v2(zfs_handle_t *, int, zfs_iter_f, void *); +_LIBZFS_H int zfs_iter_snapshots_v2(zfs_handle_t *, int, zfs_iter_f, void *, + uint64_t, uint64_t); +_LIBZFS_H int zfs_iter_snapshots_sorted_v2(zfs_handle_t *, int, zfs_iter_f, + void *, uint64_t, uint64_t); +_LIBZFS_H int zfs_iter_snapspec_v2(zfs_handle_t *, int, const char *, + zfs_iter_f, void *); +_LIBZFS_H int zfs_iter_bookmarks_v2(zfs_handle_t *, int, zfs_iter_f, void *); _LIBZFS_H int zfs_iter_mounted(zfs_handle_t *, zfs_iter_f, void *); typedef struct get_all_cb { diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 2b61710f5592..41e74fd8db19 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -339,14 +339,21 @@ + + + + + + + @@ -2261,18 +2268,31 @@ - - - - + + + + + + + + + + - - - - - + + + + + + + + + + 
+ + @@ -3305,10 +3325,16 @@ - - - - + + + + + + + + + + @@ -3882,19 +3908,34 @@ - - - - - - + + + + + + + + + + + + + + + - - - - + + + + + + + + + + @@ -5091,6 +5132,14 @@ + + + + + + + + @@ -5100,6 +5149,13 @@ + + + + + + + diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c index d7ea60822419..dd14c570ec03 100644 --- a/lib/libzfs/libzfs_changelist.c +++ b/lib/libzfs/libzfs_changelist.c @@ -552,7 +552,7 @@ change_one(zfs_handle_t *zhp, void *data) } if (!clp->cl_alldependents) - ret = zfs_iter_children(zhp, 0, change_one, data); + ret = zfs_iter_children_v2(zhp, 0, change_one, data); /* * If we added the handle to the changelist, we will re-use it @@ -721,11 +721,12 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, return (NULL); } } else if (clp->cl_alldependents) { - if (zfs_iter_dependents(zhp, 0, B_TRUE, change_one, clp) != 0) { + if (zfs_iter_dependents_v2(zhp, 0, B_TRUE, change_one, + clp) != 0) { changelist_free(clp); return (NULL); } - } else if (zfs_iter_children(zhp, 0, change_one, clp) != 0) { + } else if (zfs_iter_children_v2(zhp, 0, change_one, clp) != 0) { changelist_free(clp); return (NULL); } diff --git a/lib/libzfs/libzfs_crypto.c b/lib/libzfs/libzfs_crypto.c index 40059063e21a..8f2a50d55e87 100644 --- a/lib/libzfs/libzfs_crypto.c +++ b/lib/libzfs/libzfs_crypto.c @@ -1226,7 +1226,7 @@ load_keys_cb(zfs_handle_t *zhp, void *arg) cb->cb_numfailed++; out: - (void) zfs_iter_filesystems(zhp, 0, load_keys_cb, cb); + (void) zfs_iter_filesystems_v2(zhp, 0, load_keys_cb, cb); zfs_close(zhp); /* always return 0, since this function is best effort */ diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 8fa36fa95a17..138eca19acc3 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -757,7 +757,7 @@ zfs_open(libzfs_handle_t *hdl, const char *path, int types) * Iterate bookmarks to find the right one. 
*/ errno = 0; - if ((zfs_iter_bookmarks(pzhp, 0, zfs_open_bookmarks_cb, + if ((zfs_iter_bookmarks_v2(pzhp, 0, zfs_open_bookmarks_cb, &cb_data) == 0) && (cb_data.zhp == NULL)) { (void) zfs_error(hdl, EZFS_NOENT, errbuf); zfs_close(pzhp); @@ -2476,7 +2476,7 @@ get_clones_cb(zfs_handle_t *zhp, void *arg) } out: - (void) zfs_iter_children(zhp, 0, get_clones_cb, gca); + (void) zfs_iter_children_v2(zhp, 0, get_clones_cb, gca); zfs_close(zhp); return (0); } @@ -3925,7 +3925,7 @@ zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) if (lzc_exists(name)) fnvlist_add_boolean(dd->nvl, name); - rv = zfs_iter_filesystems(zhp, 0, zfs_check_snap_cb, dd); + rv = zfs_iter_filesystems_v2(zhp, 0, zfs_check_snap_cb, dd); zfs_close(zhp); return (rv); } @@ -4163,7 +4163,7 @@ zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) fnvlist_add_boolean(sd->sd_nvl, name); - rv = zfs_iter_filesystems(zhp, 0, zfs_snapshot_cb, sd); + rv = zfs_iter_filesystems_v2(zhp, 0, zfs_snapshot_cb, sd); } zfs_close(zhp); @@ -4340,7 +4340,7 @@ rollback_destroy(zfs_handle_t *zhp, void *data) rollback_data_t *cbp = data; if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { - cbp->cb_error |= zfs_iter_dependents(zhp, 0, B_FALSE, + cbp->cb_error |= zfs_iter_dependents_v2(zhp, 0, B_FALSE, rollback_destroy_dependent, cbp); cbp->cb_error |= zfs_destroy(zhp, B_FALSE); @@ -4380,10 +4380,10 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) if (cb.cb_create > 0) min_txg = cb.cb_create; - (void) zfs_iter_snapshots(zhp, 0, rollback_destroy, &cb, + (void) zfs_iter_snapshots_v2(zhp, 0, rollback_destroy, &cb, min_txg, 0); - (void) zfs_iter_bookmarks(zhp, 0, rollback_destroy, &cb); + (void) zfs_iter_bookmarks_v2(zhp, 0, rollback_destroy, &cb); if (cb.cb_error) return (-1); @@ -4964,7 +4964,7 @@ zfs_hold_one(zfs_handle_t *zhp, void *arg) fnvlist_add_string(ha->nvl, name, ha->tag); if (ha->recursive) - rv = zfs_iter_filesystems(zhp, 0, zfs_hold_one, ha); + rv = zfs_iter_filesystems_v2(zhp, 0, zfs_hold_one, ha); zfs_close(zhp); return (rv); } @@ -5095,7 +5095,7 @@ zfs_release_one(zfs_handle_t *zhp, void *arg) } if (ha->recursive) - rv = zfs_iter_filesystems(zhp, 0, zfs_release_one, ha); + rv = zfs_iter_filesystems_v2(zhp, 0, zfs_release_one, ha); zfs_close(zhp); return (rv); } diff --git a/lib/libzfs/libzfs_iter.c b/lib/libzfs/libzfs_iter.c index 681fe5b4748d..452d8fd6ab71 100644 --- a/lib/libzfs/libzfs_iter.c +++ b/lib/libzfs/libzfs_iter.c @@ -103,7 +103,14 @@ zfs_do_list_ioctl(zfs_handle_t *zhp, int arg, zfs_cmd_t *zc) * Iterate over all child filesystems */ int -zfs_iter_filesystems(zfs_handle_t *zhp, int flags, zfs_iter_f func, void *data) +zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + return (zfs_iter_filesystems_v2(zhp, 0, func, data)); +} + +int +zfs_iter_filesystems_v2(zfs_handle_t *zhp, int flags, zfs_iter_f func, + void *data) { zfs_cmd_t zc = {"\0"}; zfs_handle_t *nzhp; @@ -143,7 +150,15 @@ zfs_iter_filesystems(zfs_handle_t *zhp, int flags, zfs_iter_f func, void *data) * Iterate over all snapshots */ int -zfs_iter_snapshots(zfs_handle_t *zhp, int flags, zfs_iter_f func, +zfs_iter_snapshots(zfs_handle_t *zhp, boolean_t simple, zfs_iter_f func, + void *data, uint64_t min_txg, uint64_t max_txg) +{ + return (zfs_iter_snapshots_v2(zhp, simple ? 
ZFS_ITER_SIMPLE : 0, func, + data, min_txg, max_txg)); +} + +int +zfs_iter_snapshots_v2(zfs_handle_t *zhp, int flags, zfs_iter_f func, void *data, uint64_t min_txg, uint64_t max_txg) { zfs_cmd_t zc = {"\0"}; @@ -197,7 +212,13 @@ zfs_iter_snapshots(zfs_handle_t *zhp, int flags, zfs_iter_f func, * Iterate over all bookmarks */ int -zfs_iter_bookmarks(zfs_handle_t *zhp, int flags __maybe_unused, +zfs_iter_bookmarks(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + return (zfs_iter_bookmarks_v2(zhp, 0, func, data)); +} + +int +zfs_iter_bookmarks_v2(zfs_handle_t *zhp, int flags __maybe_unused, zfs_iter_f func, void *data) { zfs_handle_t *nzhp; @@ -305,7 +326,15 @@ zfs_snapshot_compare(const void *larg, const void *rarg) } int -zfs_iter_snapshots_sorted(zfs_handle_t *zhp, int flags, zfs_iter_f callback, +zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, + void *data, uint64_t min_txg, uint64_t max_txg) +{ + return (zfs_iter_snapshots_sorted_v2(zhp, 0, callback, data, + min_txg, max_txg)); +} + +int +zfs_iter_snapshots_sorted_v2(zfs_handle_t *zhp, int flags, zfs_iter_f callback, void *data, uint64_t min_txg, uint64_t max_txg) { int ret = 0; @@ -316,7 +345,7 @@ zfs_iter_snapshots_sorted(zfs_handle_t *zhp, int flags, zfs_iter_f callback, avl_create(&avl, zfs_snapshot_compare, sizeof (zfs_node_t), offsetof(zfs_node_t, zn_avlnode)); - ret = zfs_iter_snapshots(zhp, flags, zfs_sort_snaps, &avl, min_txg, + ret = zfs_iter_snapshots_v2(zhp, flags, zfs_sort_snaps, &avl, min_txg, max_txg); for (node = avl_first(&avl); node != NULL; node = AVL_NEXT(&avl, node)) @@ -379,7 +408,14 @@ snapspec_cb(zfs_handle_t *zhp, void *arg) * return ENOENT at the end. */ int -zfs_iter_snapspec(zfs_handle_t *fs_zhp, int flags, const char *spec_orig, +zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig, + zfs_iter_f func, void *arg) +{ + return (zfs_iter_snapspec_v2(fs_zhp, 0, spec_orig, func, arg)); +} + +int +zfs_iter_snapspec_v2(zfs_handle_t *fs_zhp, int flags, const char *spec_orig, zfs_iter_f func, void *arg) { char *buf, *comma_separated, *cp; @@ -419,7 +455,7 @@ zfs_iter_snapspec(zfs_handle_t *fs_zhp, int flags, const char *spec_orig, } } - err = zfs_iter_snapshots_sorted(fs_zhp, flags, + err = zfs_iter_snapshots_sorted_v2(fs_zhp, flags, snapspec_cb, &ssa, 0, 0); if (ret == 0) ret = err; @@ -456,14 +492,20 @@ zfs_iter_snapspec(zfs_handle_t *fs_zhp, int flags, const char *spec_orig, * and as close as possible. 
*/ int -zfs_iter_children(zfs_handle_t *zhp, int flags, zfs_iter_f func, void *data) +zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + return (zfs_iter_children_v2(zhp, 0, func, data)); +} + +int +zfs_iter_children_v2(zfs_handle_t *zhp, int flags, zfs_iter_f func, void *data) { int ret; - if ((ret = zfs_iter_snapshots(zhp, flags, func, data, 0, 0)) != 0) + if ((ret = zfs_iter_snapshots_v2(zhp, flags, func, data, 0, 0)) != 0) return (ret); - return (zfs_iter_filesystems(zhp, flags, func, data)); + return (zfs_iter_filesystems_v2(zhp, flags, func, data)); } @@ -524,10 +566,10 @@ iter_dependents_cb(zfs_handle_t *zhp, void *arg) isf.zhp = zhp; isf.next = ida->stack; ida->stack = &isf; - err = zfs_iter_filesystems(zhp, ida->flags, + err = zfs_iter_filesystems_v2(zhp, ida->flags, iter_dependents_cb, ida); if (err == 0) - err = zfs_iter_snapshots(zhp, ida->flags, + err = zfs_iter_snapshots_v2(zhp, ida->flags, iter_dependents_cb, ida, 0, 0); ida->stack = isf.next; } @@ -541,7 +583,14 @@ iter_dependents_cb(zfs_handle_t *zhp, void *arg) } int -zfs_iter_dependents(zfs_handle_t *zhp, int flags, boolean_t allowrecursion, +zfs_iter_dependents(zfs_handle_t *zhp, boolean_t allowrecursion, + zfs_iter_f func, void *data) +{ + return (zfs_iter_dependents_v2(zhp, 0, allowrecursion, func, data)); +} + +int +zfs_iter_dependents_v2(zfs_handle_t *zhp, int flags, boolean_t allowrecursion, zfs_iter_f func, void *data) { iter_dependents_arg_t ida; diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c index 024f449baa0b..5d1fe651c97e 100644 --- a/lib/libzfs/libzfs_mount.c +++ b/lib/libzfs/libzfs_mount.c @@ -940,7 +940,7 @@ zfs_iter_cb(zfs_handle_t *zhp, void *data) } libzfs_add_handle(cbp, zhp); - if (zfs_iter_filesystems(zhp, 0, zfs_iter_cb, cbp) != 0) { + if (zfs_iter_filesystems_v2(zhp, 0, zfs_iter_cb, cbp) != 0) { zfs_close(zhp); return (-1); } @@ -1289,7 +1289,7 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) * over all child filesystems. */ libzfs_add_handle(&cb, zfsp); - if (zfs_iter_filesystems(zfsp, 0, zfs_iter_cb, &cb) != 0) + if (zfs_iter_filesystems_v2(zfsp, 0, zfs_iter_cb, &cb) != 0) goto out; /* diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 23402f86a8ae..87a30f54fea8 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -288,7 +288,7 @@ send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv); /* * Collect guid, valid props, optionally holds, etc. of a snapshot. - * This interface is intended for use as a zfs_iter_snapshots_sorted visitor. + * This interface is intended for use as a zfs_iter_snapshots_v2_sorted visitor. */ static int send_iterate_snap(zfs_handle_t *zhp, void *arg) @@ -619,8 +619,8 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg) min_txg = fromsnap_txg; if (!sd->replicate && tosnap_txg != 0) max_txg = tosnap_txg; - (void) zfs_iter_snapshots_sorted(zhp, 0, send_iterate_snap, sd, - min_txg, max_txg); + (void) zfs_iter_snapshots_sorted_v2(zhp, 0, send_iterate_snap, + sd, min_txg, max_txg); } else { char snapname[MAXPATHLEN] = { 0 }; zfs_handle_t *snap; @@ -662,7 +662,7 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg) /* Iterate over children. */ if (sd->recursive) - rv = zfs_iter_filesystems(zhp, 0, send_iterate_fs, sd); + rv = zfs_iter_filesystems_v2(zhp, 0, send_iterate_fs, sd); out: /* Restore saved fields. 
*/ @@ -1083,7 +1083,7 @@ send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap, /* * Send a single filesystem snapshot, updating the send dump data. - * This interface is intended for use as a zfs_iter_snapshots_sorted visitor. + * This interface is intended for use as a zfs_iter_snapshots_v2_sorted visitor. */ static int dump_snapshot(zfs_handle_t *zhp, void *arg) @@ -1293,7 +1293,7 @@ dump_filesystem(zfs_handle_t *zhp, send_dump_data_t *sdd) zhp->zfs_name, sdd->tosnap); } } - rv = zfs_iter_snapshots_sorted(zhp, 0, dump_snapshot, sdd, + rv = zfs_iter_snapshots_sorted_v2(zhp, 0, dump_snapshot, sdd, min_txg, max_txg); } else { char snapname[MAXPATHLEN] = { 0 }; @@ -3162,9 +3162,9 @@ guid_to_name_cb(zfs_handle_t *zhp, void *arg) return (EEXIST); } - err = zfs_iter_children(zhp, 0, guid_to_name_cb, gtnd); + err = zfs_iter_children_v2(zhp, 0, guid_to_name_cb, gtnd); if (err != EEXIST && gtnd->bookmark_ok) - err = zfs_iter_bookmarks(zhp, 0, guid_to_name_cb, gtnd); + err = zfs_iter_bookmarks_v2(zhp, 0, guid_to_name_cb, gtnd); zfs_close(zhp); return (err); } @@ -3218,9 +3218,10 @@ guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, continue; int err = guid_to_name_cb(zfs_handle_dup(zhp), >nd); if (err != EEXIST) - err = zfs_iter_children(zhp, 0, guid_to_name_cb, >nd); + err = zfs_iter_children_v2(zhp, 0, guid_to_name_cb, + >nd); if (err != EEXIST && bookmark_ok) - err = zfs_iter_bookmarks(zhp, 0, guid_to_name_cb, + err = zfs_iter_bookmarks_v2(zhp, 0, guid_to_name_cb, >nd); zfs_close(zhp); if (err == EEXIST) From dee77f45d0f961de0b421f36bbde4196624a13c5 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Fri, 7 Apr 2023 00:40:34 +0000 Subject: [PATCH 038/180] module: resync part of Makefile.bsd sha256-armv8.S and sha512-armv8.S need the same treatment as the sse bits; removal of -mgeneral-regs-only from flags. This fixes errors about requiring NEON, which is a difference in clang vs. gcc treatment of -mgeneral-regs-only being specified on asm files. Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Kyle Evans Closes #14715 --- module/Makefile.bsd | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 8ec094d4ad1c..365609fb8585 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -507,6 +507,16 @@ CFLAGS.zstd_lazy.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} CFLAGS.zstd_ldm.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} CFLAGS.zstd_opt.c+= ${__ZFS_ZSTD_AARCH64_FLAGS} +sha256-armv8.o: sha256-armv8.S + ${CC} -c ${CFLAGS:N-mgeneral-regs-only} ${WERROR} ${.IMPSRC} \ + -o ${.TARGET} + ${CTFCONVERT_CMD} + +sha512-armv8.o: sha512-armv8.S + ${CC} -c ${CFLAGS:N-mgeneral-regs-only} ${WERROR} ${.IMPSRC} \ + -o ${.TARGET} + ${CTFCONVERT_CMD} + b3_aarch64_sse2.o: b3_aarch64_sse2.S ${CC} -c ${CFLAGS:N-mgeneral-regs-only} ${WERROR} ${.IMPSRC} \ -o ${.TARGET} From d0cbd9feaf5b82130f2e679256c71e0c7413aae9 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Fri, 7 Apr 2023 00:40:34 +0000 Subject: [PATCH 039/180] module: freebsd: fix aarch64 fpu handling Just like x86, aarch64 needs to use the fpu_kern(9) API around FPU usage, otherwise we panic promptly at boot as soon as ZFS attempts to do checksum benchmarking. 
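The macros changed below are what SIMD consumers wrap their vector code in,
so on this platform they must perform a real FPU enter/leave rather than
expand to no-ops. An illustrative caller pattern (hypothetical function name,
not a verbatim excerpt from the checksum code):

    static void
    bench_one_simd_impl(void)
    {
        if (!kfpu_allowed())
            return;     /* caller falls back to a scalar implementation */

        kfpu_begin();   /* fpu_kern_enter(..., FPU_KERN_NOCTX) on FreeBSD */
        /* ... NEON/SHA2/BLAKE3 vector code runs here ... */
        kfpu_end();     /* fpu_kern_leave() when FPU state was entered */
    }

Without the fpu_kern(9) bracketing, the first vector instruction executed in
kernel context touches FPU state the kernel has not saved, which is the
boot-time panic described above.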
Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Kyle Evans Closes #14715 --- include/os/freebsd/spl/sys/simd_aarch64.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/include/os/freebsd/spl/sys/simd_aarch64.h b/include/os/freebsd/spl/sys/simd_aarch64.h index df33bdaeccf8..234f401db791 100644 --- a/include/os/freebsd/spl/sys/simd_aarch64.h +++ b/include/os/freebsd/spl/sys/simd_aarch64.h @@ -44,13 +44,23 @@ #define _FREEBSD_SIMD_AARCH64_H #include +#include #include +#include #include +#include #define kfpu_allowed() 1 #define kfpu_initialize(tsk) do {} while (0) -#define kfpu_begin() do {} while (0) -#define kfpu_end() do {} while (0) +#define kfpu_begin() do { \ + if (__predict_false(!is_fpu_kern_thread(0))) \ + fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX); \ +} while (0) + +#define kfpu_end() do { \ + if (__predict_false(curthread->td_pcb->pcb_fpflags & PCB_FP_NOSAVE)) \ + fpu_kern_leave(curthread, NULL); \ +} while (0) #define kfpu_init() (0) #define kfpu_fini() do {} while (0) From d4dc53dad2f6c3a2d107f1ba0e8d66228c845e00 Mon Sep 17 00:00:00 2001 From: youzhongyang Date: Mon, 10 Apr 2023 17:15:36 -0400 Subject: [PATCH 040/180] Linux 6.3 compat: idmapped mount API changes Linux kernel 6.3 changed a bunch of APIs to use the dedicated idmap type for mounts (struct mnt_idmap), we need to detect these changes and make zfs work with the new APIs. Reviewed-by: Brian Behlendorf Signed-off-by: Youzhong Yang Closes #14682 --- config/kernel-acl.m4 | 34 ++++++-- config/kernel-generic_fillattr.m4 | 33 +++++-- config/kernel-inode-create.m4 | 41 +++++++-- config/kernel-inode-getattr.m4 | 63 ++++++++++---- config/kernel-inode-permission.m4 | 35 ++++++-- config/kernel-inode-setattr.m4 | 87 +++++++++++++++++++ config/kernel-is_owner_or_cap.m4 | 25 +++++- config/kernel-mkdir.m4 | 55 +++++++++--- config/kernel-mknod.m4 | 34 +++++++- config/kernel-rename.m4 | 56 ++++++++---- config/kernel-setattr-prepare.m4 | 44 +++++++--- config/kernel-symlink.m4 | 33 +++++-- config/kernel-tmpfile.m4 | 33 +++++-- config/kernel-xattr-handler.m4 | 91 +++++++++++++------- config/kernel.m4 | 6 +- include/os/freebsd/spl/sys/types.h | 2 +- include/os/freebsd/zfs/sys/zfs_vnops_os.h | 10 +-- include/os/linux/kernel/linux/vfs_compat.h | 21 ++++- include/os/linux/kernel/linux/xattr_compat.h | 17 +++- include/os/linux/spl/sys/cred.h | 30 ++++--- include/os/linux/spl/sys/types.h | 15 +++- include/os/linux/zfs/sys/policy.h | 6 +- include/os/linux/zfs/sys/zfs_vnops_os.h | 15 ++-- include/os/linux/zfs/sys/zpl.h | 13 ++- include/sys/zfs_acl.h | 10 +-- module/os/freebsd/zfs/zfs_acl.c | 10 +-- module/os/freebsd/zfs/zfs_vnops_os.c | 10 +-- module/os/linux/spl/spl-cred.c | 12 +++ module/os/linux/zfs/policy.c | 13 +-- module/os/linux/zfs/zfs_acl.c | 33 ++++--- module/os/linux/zfs/zfs_dir.c | 4 +- module/os/linux/zfs/zfs_ioctl_os.c | 4 + module/os/linux/zfs/zfs_vnops_os.c | 35 ++++---- module/os/linux/zfs/zfs_znode.c | 2 +- module/os/linux/zfs/zpl_ctldir.c | 53 +++++++++--- module/os/linux/zfs/zpl_file.c | 10 +-- module/os/linux/zfs/zpl_inode.c | 77 ++++++++++++----- module/os/linux/zfs/zpl_xattr.c | 25 +++--- module/zfs/zfs_replay.c | 14 +-- module/zfs/zfs_vnops.c | 4 +- 40 files changed, 821 insertions(+), 294 deletions(-) create mode 100644 config/kernel-inode-setattr.m4 diff --git a/config/kernel-acl.m4 b/config/kernel-acl.m4 index 6e92da97d0fe..be08c3c60724 100644 --- a/config/kernel-acl.m4 +++ b/config/kernel-acl.m4 @@ -236,7 +236,22 @@ dnl # dnl # 6.2 
API change, dnl # set_acl() second paramter changed to a struct dentry * dnl # +dnl # 6.3 API change, +dnl # set_acl() first parameter changed to struct mnt_idmap * +dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_SET_ACL], [ + ZFS_LINUX_TEST_SRC([inode_operations_set_acl_mnt_idmap_dentry], [ + #include + + int set_acl_fn(struct mnt_idmap *idmap, + struct dentry *dent, struct posix_acl *acl, + int type) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .set_acl = set_acl_fn, + }; + ],[]) ZFS_LINUX_TEST_SRC([inode_operations_set_acl_userns_dentry], [ #include @@ -281,17 +296,24 @@ AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_SET_ACL], [ AC_DEFINE(HAVE_SET_ACL, 1, [iops->set_acl() exists]) AC_DEFINE(HAVE_SET_ACL_USERNS, 1, [iops->set_acl() takes 4 args]) ],[ - ZFS_LINUX_TEST_RESULT([inode_operations_set_acl_userns_dentry], [ + ZFS_LINUX_TEST_RESULT([inode_operations_set_acl_mnt_idmap_dentry], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_SET_ACL, 1, [iops->set_acl() exists]) - AC_DEFINE(HAVE_SET_ACL_USERNS_DENTRY_ARG2, 1, - [iops->set_acl() takes 4 args, arg2 is struct dentry *]) + AC_DEFINE(HAVE_SET_ACL_IDMAP_DENTRY, 1, + [iops->set_acl() takes 4 args, arg1 is struct mnt_idmap *]) ],[ - ZFS_LINUX_TEST_RESULT([inode_operations_set_acl], [ + ZFS_LINUX_TEST_RESULT([inode_operations_set_acl_userns_dentry], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_SET_ACL, 1, [iops->set_acl() exists, takes 3 args]) + AC_DEFINE(HAVE_SET_ACL, 1, [iops->set_acl() exists]) + AC_DEFINE(HAVE_SET_ACL_USERNS_DENTRY_ARG2, 1, + [iops->set_acl() takes 4 args, arg2 is struct dentry *]) ],[ - ZFS_LINUX_REQUIRE_API([i_op->set_acl()], [3.14]) + ZFS_LINUX_TEST_RESULT([inode_operations_set_acl], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SET_ACL, 1, [iops->set_acl() exists, takes 3 args]) + ],[ + ZFS_LINUX_REQUIRE_API([i_op->set_acl()], [3.14]) + ]) ]) ]) ]) diff --git a/config/kernel-generic_fillattr.m4 b/config/kernel-generic_fillattr.m4 index 0acd5d53103f..02dee4d4c000 100644 --- a/config/kernel-generic_fillattr.m4 +++ b/config/kernel-generic_fillattr.m4 @@ -4,7 +4,10 @@ dnl # dnl # generic_fillattr in linux/fs.h now requires a struct user_namespace* dnl # as the first arg, to support idmapped mounts. 
dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR_USERNS], [ +dnl # 6.3 API +dnl # generic_fillattr() now takes struct mnt_idmap* as the first argument +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR], [ ZFS_LINUX_TEST_SRC([generic_fillattr_userns], [ #include ],[ @@ -13,16 +16,32 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR_USERNS], [ struct kstat *k = NULL; generic_fillattr(userns, in, k); ]) + + ZFS_LINUX_TEST_SRC([generic_fillattr_mnt_idmap], [ + #include + ],[ + struct mnt_idmap *idmap = NULL; + struct inode *in = NULL; + struct kstat *k = NULL; + generic_fillattr(idmap, in, k); + ]) ]) -AC_DEFUN([ZFS_AC_KERNEL_GENERIC_FILLATTR_USERNS], [ - AC_MSG_CHECKING([whether generic_fillattr requires struct user_namespace*]) - ZFS_LINUX_TEST_RESULT([generic_fillattr_userns], [ +AC_DEFUN([ZFS_AC_KERNEL_GENERIC_FILLATTR], [ + AC_MSG_CHECKING([whether generic_fillattr requires struct mnt_idmap*]) + ZFS_LINUX_TEST_RESULT([generic_fillattr_mnt_idmap], [ AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_GENERIC_FILLATTR_USERNS, 1, - [generic_fillattr requires struct user_namespace*]) + AC_DEFINE(HAVE_GENERIC_FILLATTR_IDMAP, 1, + [generic_fillattr requires struct mnt_idmap*]) ],[ - AC_MSG_RESULT([no]) + AC_MSG_CHECKING([whether generic_fillattr requires struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([generic_fillattr_userns], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_GENERIC_FILLATTR_USERNS, 1, + [generic_fillattr requires struct user_namespace*]) + ],[ + AC_MSG_RESULT([no]) + ]) ]) ]) diff --git a/config/kernel-inode-create.m4 b/config/kernel-inode-create.m4 index a6ea11fb61b2..9e9e43180976 100644 --- a/config/kernel-inode-create.m4 +++ b/config/kernel-inode-create.m4 @@ -1,4 +1,22 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_CREATE], [ + dnl # + dnl # 6.3 API change + dnl # The first arg is changed to struct mnt_idmap * + dnl # + ZFS_LINUX_TEST_SRC([create_mnt_idmap], [ + #include + #include + + int inode_create(struct mnt_idmap *idmap, + struct inode *inode ,struct dentry *dentry, + umode_t umode, bool flag) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .create = inode_create, + }; + ],[]) + dnl # dnl # 5.12 API change that added the struct user_namespace* arg dnl # to the front of this function type's arg list. 
@@ -35,19 +53,28 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_CREATE], [ ]) AC_DEFUN([ZFS_AC_KERNEL_CREATE], [ - AC_MSG_CHECKING([whether iops->create() takes struct user_namespace*]) - ZFS_LINUX_TEST_RESULT([create_userns], [ + AC_MSG_CHECKING([whether iops->create() takes struct mnt_idmap*]) + ZFS_LINUX_TEST_RESULT([create_mnt_idmap], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_IOPS_CREATE_USERNS, 1, - [iops->create() takes struct user_namespace*]) + AC_DEFINE(HAVE_IOPS_CREATE_IDMAP, 1, + [iops->create() takes struct mnt_idmap*]) ],[ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether iops->create() passes flags]) - ZFS_LINUX_TEST_RESULT([create_flags], [ + AC_MSG_CHECKING([whether iops->create() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([create_userns], [ AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOPS_CREATE_USERNS, 1, + [iops->create() takes struct user_namespace*]) ],[ - ZFS_LINUX_TEST_ERROR([iops->create()]) + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether iops->create() passes flags]) + ZFS_LINUX_TEST_RESULT([create_flags], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([iops->create()]) + ]) ]) ]) ]) diff --git a/config/kernel-inode-getattr.m4 b/config/kernel-inode-getattr.m4 index f62e82f5230a..c8bfb07862ab 100644 --- a/config/kernel-inode-getattr.m4 +++ b/config/kernel-inode-getattr.m4 @@ -1,4 +1,24 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_GETATTR], [ + dnl # + dnl # Linux 6.3 API + dnl # The first arg of getattr I/O operations handler type + dnl # is changed to struct mnt_idmap* + dnl # + ZFS_LINUX_TEST_SRC([inode_operations_getattr_mnt_idmap], [ + #include + + int test_getattr( + struct mnt_idmap *idmap, + const struct path *p, struct kstat *k, + u32 request_mask, unsigned int query_flags) + { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .getattr = test_getattr, + }; + ],[]) + dnl # dnl # Linux 5.12 API dnl # The getattr I/O operations handler type was extended to require @@ -55,37 +75,48 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_GETATTR], [ AC_DEFUN([ZFS_AC_KERNEL_INODE_GETATTR], [ dnl # - dnl # Kernel 5.12 test + dnl # Kernel 6.3 test dnl # - AC_MSG_CHECKING([whether iops->getattr() takes user_namespace]) - ZFS_LINUX_TEST_RESULT([inode_operations_getattr_userns], [ + AC_MSG_CHECKING([whether iops->getattr() takes mnt_idmap]) + ZFS_LINUX_TEST_RESULT([inode_operations_getattr_mnt_idmap], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_USERNS_IOPS_GETATTR, 1, - [iops->getattr() takes struct user_namespace*]) + AC_DEFINE(HAVE_IDMAP_IOPS_GETATTR, 1, + [iops->getattr() takes struct mnt_idmap*]) ],[ AC_MSG_RESULT(no) - dnl # - dnl # Kernel 4.11 test + dnl # Kernel 5.12 test dnl # - AC_MSG_CHECKING([whether iops->getattr() takes a path]) - ZFS_LINUX_TEST_RESULT([inode_operations_getattr_path], [ + AC_MSG_CHECKING([whether iops->getattr() takes user_namespace]) + ZFS_LINUX_TEST_RESULT([inode_operations_getattr_userns], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_PATH_IOPS_GETATTR, 1, - [iops->getattr() takes a path]) + AC_DEFINE(HAVE_USERNS_IOPS_GETATTR, 1, + [iops->getattr() takes struct user_namespace*]) ],[ AC_MSG_RESULT(no) dnl # - dnl # Kernel < 4.11 test + dnl # Kernel 4.11 test dnl # - AC_MSG_CHECKING([whether iops->getattr() takes a vfsmount]) - ZFS_LINUX_TEST_RESULT([inode_operations_getattr_vfsmount], [ + AC_MSG_CHECKING([whether iops->getattr() takes a path]) + ZFS_LINUX_TEST_RESULT([inode_operations_getattr_path], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_VFSMOUNT_IOPS_GETATTR, 1, - [iops->getattr() takes a vfsmount]) + AC_DEFINE(HAVE_PATH_IOPS_GETATTR, 1, + 
[iops->getattr() takes a path]) ],[ AC_MSG_RESULT(no) + + dnl # + dnl # Kernel < 4.11 test + dnl # + AC_MSG_CHECKING([whether iops->getattr() takes a vfsmount]) + ZFS_LINUX_TEST_RESULT([inode_operations_getattr_vfsmount], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_VFSMOUNT_IOPS_GETATTR, 1, + [iops->getattr() takes a vfsmount]) + ],[ + AC_MSG_RESULT(no) + ]) ]) ]) ]) diff --git a/config/kernel-inode-permission.m4 b/config/kernel-inode-permission.m4 index ba9ff5d43d4d..01d23635b0c9 100644 --- a/config/kernel-inode-permission.m4 +++ b/config/kernel-inode-permission.m4 @@ -1,4 +1,22 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_PERMISSION], [ + dnl # + dnl # 6.3 API change + dnl # iops->permission() now takes struct mnt_idmap* + dnl # as its first arg + dnl # + ZFS_LINUX_TEST_SRC([permission_mnt_idmap], [ + #include + #include + + int inode_permission(struct mnt_idmap *idmap, + struct inode *inode, int mask) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .permission = inode_permission, + }; + ],[]) + dnl # dnl # 5.12 API change that added the struct user_namespace* arg dnl # to the front of this function type's arg list. @@ -18,12 +36,19 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_PERMISSION], [ ]) AC_DEFUN([ZFS_AC_KERNEL_PERMISSION], [ - AC_MSG_CHECKING([whether iops->permission() takes struct user_namespace*]) - ZFS_LINUX_TEST_RESULT([permission_userns], [ + AC_MSG_CHECKING([whether iops->permission() takes struct mnt_idmap*]) + ZFS_LINUX_TEST_RESULT([permission_mnt_idmap], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_IOPS_PERMISSION_USERNS, 1, - [iops->permission() takes struct user_namespace*]) + AC_DEFINE(HAVE_IOPS_PERMISSION_IDMAP, 1, + [iops->permission() takes struct mnt_idmap*]) ],[ - AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether iops->permission() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([permission_userns], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOPS_PERMISSION_USERNS, 1, + [iops->permission() takes struct user_namespace*]) + ],[ + AC_MSG_RESULT(no) + ]) ]) ]) diff --git a/config/kernel-inode-setattr.m4 b/config/kernel-inode-setattr.m4 new file mode 100644 index 000000000000..45755b4eb273 --- /dev/null +++ b/config/kernel-inode-setattr.m4 @@ -0,0 +1,87 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_SETATTR], [ + dnl # + dnl # Linux 6.3 API + dnl # The first arg of setattr I/O operations handler type + dnl # is changed to struct mnt_idmap* + dnl # + ZFS_LINUX_TEST_SRC([inode_operations_setattr_mnt_idmap], [ + #include + + int test_setattr( + struct mnt_idmap *idmap, + struct dentry *de, struct iattr *ia) + { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .setattr = test_setattr, + }; + ],[]) + + dnl # + dnl # Linux 5.12 API + dnl # The setattr I/O operations handler type was extended to require + dnl # a struct user_namespace* as its first arg, to support idmapped + dnl # mounts. 
+ dnl # + ZFS_LINUX_TEST_SRC([inode_operations_setattr_userns], [ + #include + + int test_setattr( + struct user_namespace *userns, + struct dentry *de, struct iattr *ia) + { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .setattr = test_setattr, + }; + ],[]) + + ZFS_LINUX_TEST_SRC([inode_operations_setattr], [ + #include + + int test_setattr( + struct dentry *de, struct iattr *ia) + { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .setattr = test_setattr, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_INODE_SETATTR], [ + dnl # + dnl # Kernel 6.3 test + dnl # + AC_MSG_CHECKING([whether iops->setattr() takes mnt_idmap]) + ZFS_LINUX_TEST_RESULT([inode_operations_setattr_mnt_idmap], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IDMAP_IOPS_SETATTR, 1, + [iops->setattr() takes struct mnt_idmap*]) + ],[ + AC_MSG_RESULT(no) + dnl # + dnl # Kernel 5.12 test + dnl # + AC_MSG_CHECKING([whether iops->setattr() takes user_namespace]) + ZFS_LINUX_TEST_RESULT([inode_operations_setattr_userns], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_USERNS_IOPS_SETATTR, 1, + [iops->setattr() takes struct user_namespace*]) + ],[ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether iops->setattr() exists]) + ZFS_LINUX_TEST_RESULT([inode_operations_setattr], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOPS_SETATTR, 1, + [iops->setattr() exists]) + ],[ + AC_MSG_RESULT(no) + ]) + ]) + ]) +]) diff --git a/config/kernel-is_owner_or_cap.m4 b/config/kernel-is_owner_or_cap.m4 index a90cf3da641d..4e9c002b77f2 100644 --- a/config/kernel-is_owner_or_cap.m4 +++ b/config/kernel-is_owner_or_cap.m4 @@ -16,12 +16,20 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE], [ (void) inode_owner_or_capable(ip); ]) - ZFS_LINUX_TEST_SRC([inode_owner_or_capable_idmapped], [ + ZFS_LINUX_TEST_SRC([inode_owner_or_capable_userns], [ #include ],[ struct inode *ip = NULL; (void) inode_owner_or_capable(&init_user_ns, ip); ]) + + ZFS_LINUX_TEST_SRC([inode_owner_or_capable_mnt_idmap], [ + #include + #include + ],[ + struct inode *ip = NULL; + (void) inode_owner_or_capable(&nop_mnt_idmap, ip); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE], [ @@ -35,12 +43,21 @@ AC_DEFUN([ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE], [ AC_MSG_CHECKING( [whether inode_owner_or_capable() takes user_ns]) - ZFS_LINUX_TEST_RESULT([inode_owner_or_capable_idmapped], [ + ZFS_LINUX_TEST_RESULT([inode_owner_or_capable_userns], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_INODE_OWNER_OR_CAPABLE_IDMAPPED, 1, + AC_DEFINE(HAVE_INODE_OWNER_OR_CAPABLE_USERNS, 1, [inode_owner_or_capable() takes user_ns]) ],[ - ZFS_LINUX_TEST_ERROR([capability]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING( + [whether inode_owner_or_capable() takes mnt_idmap]) + ZFS_LINUX_TEST_RESULT([inode_owner_or_capable_mnt_idmap], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_OWNER_OR_CAPABLE_IDMAP, 1, + [inode_owner_or_capable() takes mnt_idmap]) + ], [ + ZFS_LINUX_TEST_ERROR([capability]) + ]) ]) ]) ]) diff --git a/config/kernel-mkdir.m4 b/config/kernel-mkdir.m4 index 6667ed04fa4c..7407a791b846 100644 --- a/config/kernel-mkdir.m4 +++ b/config/kernel-mkdir.m4 @@ -2,6 +2,22 @@ dnl # dnl # Supported mkdir() interfaces checked newest to oldest. 
dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_MKDIR], [ + dnl # + dnl # 6.3 API change + dnl # mkdir() takes struct mnt_idmap * as the first arg + dnl # + ZFS_LINUX_TEST_SRC([mkdir_mnt_idmap], [ + #include + + int mkdir(struct mnt_idmap *idmap, + struct inode *inode, struct dentry *dentry, + umode_t umode) { return 0; } + static const struct inode_operations + iops __attribute__ ((unused)) = { + .mkdir = mkdir, + }; + ],[]) + dnl # dnl # 5.12 API change dnl # The struct user_namespace arg was added as the first argument to @@ -43,25 +59,36 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MKDIR], [ AC_DEFUN([ZFS_AC_KERNEL_MKDIR], [ dnl # - dnl # 5.12 API change - dnl # The struct user_namespace arg was added as the first argument to - dnl # mkdir() of the iops structure. + dnl # 6.3 API change + dnl # mkdir() takes struct mnt_idmap * as the first arg dnl # - AC_MSG_CHECKING([whether iops->mkdir() takes struct user_namespace*]) - ZFS_LINUX_TEST_RESULT([mkdir_user_namespace], [ + AC_MSG_CHECKING([whether iops->mkdir() takes struct mnt_idmap*]) + ZFS_LINUX_TEST_RESULT([mkdir_mnt_idmap], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_IOPS_MKDIR_USERNS, 1, - [iops->mkdir() takes struct user_namespace*]) + AC_DEFINE(HAVE_IOPS_MKDIR_IDMAP, 1, + [iops->mkdir() takes struct mnt_idmap*]) ],[ - AC_MSG_RESULT(no) - - AC_MSG_CHECKING([whether iops->mkdir() takes umode_t]) - ZFS_LINUX_TEST_RESULT([inode_operations_mkdir], [ + dnl # + dnl # 5.12 API change + dnl # The struct user_namespace arg was added as the first argument to + dnl # mkdir() of the iops structure. + dnl # + AC_MSG_CHECKING([whether iops->mkdir() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([mkdir_user_namespace], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_MKDIR_UMODE_T, 1, - [iops->mkdir() takes umode_t]) + AC_DEFINE(HAVE_IOPS_MKDIR_USERNS, 1, + [iops->mkdir() takes struct user_namespace*]) ],[ - ZFS_LINUX_TEST_ERROR([mkdir()]) + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether iops->mkdir() takes umode_t]) + ZFS_LINUX_TEST_RESULT([inode_operations_mkdir], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MKDIR_UMODE_T, 1, + [iops->mkdir() takes umode_t]) + ],[ + ZFS_LINUX_TEST_ERROR([mkdir()]) + ]) ]) ]) ]) diff --git a/config/kernel-mknod.m4 b/config/kernel-mknod.m4 index ffe45106003a..1494ec1ae4d4 100644 --- a/config/kernel-mknod.m4 +++ b/config/kernel-mknod.m4 @@ -1,4 +1,22 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MKNOD], [ + dnl # + dnl # 6.3 API change + dnl # The first arg is now struct mnt_idmap* + dnl # + ZFS_LINUX_TEST_SRC([mknod_mnt_idmap], [ + #include + #include + + int tmp_mknod(struct mnt_idmap *idmap, + struct inode *inode ,struct dentry *dentry, + umode_t u, dev_t d) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .mknod = tmp_mknod, + }; + ],[]) + dnl # dnl # 5.12 API change that added the struct user_namespace* arg dnl # to the front of this function type's arg list. 
@@ -19,12 +37,20 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MKNOD], [ ]) AC_DEFUN([ZFS_AC_KERNEL_MKNOD], [ - AC_MSG_CHECKING([whether iops->mknod() takes struct user_namespace*]) - ZFS_LINUX_TEST_RESULT([mknod_userns], [ + AC_MSG_CHECKING([whether iops->mknod() takes struct mnt_idmap*]) + ZFS_LINUX_TEST_RESULT([mknod_mnt_idmap], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_IOPS_MKNOD_USERNS, 1, - [iops->mknod() takes struct user_namespace*]) + AC_DEFINE(HAVE_IOPS_MKNOD_IDMAP, 1, + [iops->mknod() takes struct mnt_idmap*]) ],[ AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether iops->mknod() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([mknod_userns], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOPS_MKNOD_USERNS, 1, + [iops->mknod() takes struct user_namespace*]) + ],[ + AC_MSG_RESULT(no) + ]) ]) ]) diff --git a/config/kernel-rename.m4 b/config/kernel-rename.m4 index a2b0800ab4d2..57c3eed78974 100644 --- a/config/kernel-rename.m4 +++ b/config/kernel-rename.m4 @@ -71,39 +71,61 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [ .rename = rename_fn, }; ],[]) + + dnl # + dnl # 6.3 API change - the first arg is now struct mnt_idmap* + dnl # + ZFS_LINUX_TEST_SRC([inode_operations_rename_mnt_idmap], [ + #include + int rename_fn(struct mnt_idmap *idmap, struct inode *sip, + struct dentry *sdp, struct inode *tip, struct dentry *tdp, + unsigned int flags) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .rename = rename_fn, + }; + ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_RENAME], [ - AC_MSG_CHECKING([whether iops->rename() takes struct user_namespace*]) - ZFS_LINUX_TEST_RESULT([inode_operations_rename_userns], [ + AC_MSG_CHECKING([whether iops->rename() takes struct mnt_idmap*]) + ZFS_LINUX_TEST_RESULT([inode_operations_rename_mnt_idmap], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_IOPS_RENAME_USERNS, 1, - [iops->rename() takes struct user_namespace*]) + AC_DEFINE(HAVE_IOPS_RENAME_IDMAP, 1, + [iops->rename() takes struct mnt_idmap*]) ],[ - AC_MSG_RESULT(no) - - AC_MSG_CHECKING([whether iops->rename2() exists]) - ZFS_LINUX_TEST_RESULT([inode_operations_rename2], [ + AC_MSG_CHECKING([whether iops->rename() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([inode_operations_rename_userns], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_RENAME2, 1, [iops->rename2() exists]) + AC_DEFINE(HAVE_IOPS_RENAME_USERNS, 1, + [iops->rename() takes struct user_namespace*]) ],[ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether iops->rename() wants flags]) - ZFS_LINUX_TEST_RESULT([inode_operations_rename_flags], [ + AC_MSG_CHECKING([whether iops->rename2() exists]) + ZFS_LINUX_TEST_RESULT([inode_operations_rename2], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1, - [iops->rename() wants flags]) + AC_DEFINE(HAVE_RENAME2, 1, [iops->rename2() exists]) ],[ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether struct inode_operations_wrapper takes .rename2()]) - ZFS_LINUX_TEST_RESULT([dir_inode_operations_wrapper_rename2], [ + AC_MSG_CHECKING([whether iops->rename() wants flags]) + ZFS_LINUX_TEST_RESULT([inode_operations_rename_flags], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_RENAME2_OPERATIONS_WRAPPER, 1, - [struct inode_operations_wrapper takes .rename2()]) + AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1, + [iops->rename() wants flags]) ],[ AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether struct inode_operations_wrapper takes .rename2()]) + ZFS_LINUX_TEST_RESULT([dir_inode_operations_wrapper_rename2], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_RENAME2_OPERATIONS_WRAPPER, 1, + [struct inode_operations_wrapper takes .rename2()]) + ],[ + 
AC_MSG_RESULT(no) + ]) ]) ]) ]) diff --git a/config/kernel-setattr-prepare.m4 b/config/kernel-setattr-prepare.m4 index 24245aa53448..e02d6263e9c9 100644 --- a/config/kernel-setattr-prepare.m4 +++ b/config/kernel-setattr-prepare.m4 @@ -27,26 +27,48 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SETATTR_PREPARE], [ int error __attribute__ ((unused)) = setattr_prepare(userns, dentry, attr); ]) + + dnl # + dnl # 6.3 API change + dnl # The first arg of setattr_prepare() is changed to struct mnt_idmap* + dnl # + ZFS_LINUX_TEST_SRC([setattr_prepare_mnt_idmap], [ + #include + ], [ + struct dentry *dentry = NULL; + struct iattr *attr = NULL; + struct mnt_idmap *idmap = NULL; + int error __attribute__ ((unused)) = + setattr_prepare(idmap, dentry, attr); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_SETATTR_PREPARE], [ - AC_MSG_CHECKING([whether setattr_prepare() is available and accepts struct user_namespace*]) - ZFS_LINUX_TEST_RESULT_SYMBOL([setattr_prepare_userns], + AC_MSG_CHECKING([whether setattr_prepare() is available and accepts struct mnt_idmap*]) + ZFS_LINUX_TEST_RESULT_SYMBOL([setattr_prepare_mnt_idmap], [setattr_prepare], [fs/attr.c], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_SETATTR_PREPARE_USERNS, 1, - [setattr_prepare() accepts user_namespace]) + AC_DEFINE(HAVE_SETATTR_PREPARE_IDMAP, 1, + [setattr_prepare() accepts mnt_idmap]) ], [ - AC_MSG_RESULT(no) - - AC_MSG_CHECKING([whether setattr_prepare() is available, doesn't accept user_namespace]) - ZFS_LINUX_TEST_RESULT_SYMBOL([setattr_prepare], - [setattr_prepare], [fs/attr.c], [ + AC_MSG_CHECKING([whether setattr_prepare() is available and accepts struct user_namespace*]) + ZFS_LINUX_TEST_RESULT_SYMBOL([setattr_prepare_userns], + [setattr_prepare], [fs/attr.c], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_SETATTR_PREPARE_NO_USERNS, 1, - [setattr_prepare() is available, doesn't accept user_namespace]) + AC_DEFINE(HAVE_SETATTR_PREPARE_USERNS, 1, + [setattr_prepare() accepts user_namespace]) ], [ AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether setattr_prepare() is available, doesn't accept user_namespace]) + ZFS_LINUX_TEST_RESULT_SYMBOL([setattr_prepare], + [setattr_prepare], [fs/attr.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SETATTR_PREPARE_NO_USERNS, 1, + [setattr_prepare() is available, doesn't accept user_namespace]) + ], [ + AC_MSG_RESULT(no) + ]) ]) ]) ]) diff --git a/config/kernel-symlink.m4 b/config/kernel-symlink.m4 index d90366d04b72..a0333ed66a7c 100644 --- a/config/kernel-symlink.m4 +++ b/config/kernel-symlink.m4 @@ -1,4 +1,20 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SYMLINK], [ + dnl # + dnl # 6.3 API change that changed the first arg + dnl # to struct mnt_idmap* + dnl # + ZFS_LINUX_TEST_SRC([symlink_mnt_idmap], [ + #include + #include + int tmp_symlink(struct mnt_idmap *idmap, + struct inode *inode ,struct dentry *dentry, + const char *path) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .symlink = tmp_symlink, + }; + ],[]) dnl # dnl # 5.12 API change that added the struct user_namespace* arg dnl # to the front of this function type's arg list. 
@@ -19,12 +35,19 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SYMLINK], [ ]) AC_DEFUN([ZFS_AC_KERNEL_SYMLINK], [ - AC_MSG_CHECKING([whether iops->symlink() takes struct user_namespace*]) - ZFS_LINUX_TEST_RESULT([symlink_userns], [ + AC_MSG_CHECKING([whether iops->symlink() takes struct mnt_idmap*]) + ZFS_LINUX_TEST_RESULT([symlink_mnt_idmap], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_IOPS_SYMLINK_USERNS, 1, - [iops->symlink() takes struct user_namespace*]) + AC_DEFINE(HAVE_IOPS_SYMLINK_IDMAP, 1, + [iops->symlink() takes struct mnt_idmap*]) ],[ - AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether iops->symlink() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([symlink_userns], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOPS_SYMLINK_USERNS, 1, + [iops->symlink() takes struct user_namespace*]) + ],[ + AC_MSG_RESULT(no) + ]) ]) ]) diff --git a/config/kernel-tmpfile.m4 b/config/kernel-tmpfile.m4 index 0e1deb3612f3..cc18b8f65a88 100644 --- a/config/kernel-tmpfile.m4 +++ b/config/kernel-tmpfile.m4 @@ -4,6 +4,19 @@ dnl # Add support for i_op->tmpfile dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_TMPFILE], [ dnl # + dnl # 6.3 API change + dnl # The first arg is now struct mnt_idmap * + dnl # + ZFS_LINUX_TEST_SRC([inode_operations_tmpfile_mnt_idmap], [ + #include + int tmpfile(struct mnt_idmap *idmap, + struct inode *inode, struct file *file, + umode_t mode) { return 0; } + static struct inode_operations + iops __attribute__ ((unused)) = { + .tmpfile = tmpfile, + }; + ],[]) dnl # 6.1 API change dnl # use struct file instead of struct dentry dnl # @@ -44,23 +57,29 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_TMPFILE], [ AC_DEFUN([ZFS_AC_KERNEL_TMPFILE], [ AC_MSG_CHECKING([whether i_op->tmpfile() exists]) - ZFS_LINUX_TEST_RESULT([inode_operations_tmpfile], [ + ZFS_LINUX_TEST_RESULT([inode_operations_tmpfile_mnt_idmap], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_TMPFILE, 1, [i_op->tmpfile() exists]) - AC_DEFINE(HAVE_TMPFILE_USERNS, 1, [i_op->tmpfile() has userns]) - ],[ - ZFS_LINUX_TEST_RESULT([inode_operations_tmpfile_dentry_userns], [ + AC_DEFINE(HAVE_TMPFILE_IDMAP, 1, [i_op->tmpfile() has mnt_idmap]) + ], [ + ZFS_LINUX_TEST_RESULT([inode_operations_tmpfile], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_TMPFILE, 1, [i_op->tmpfile() exists]) AC_DEFINE(HAVE_TMPFILE_USERNS, 1, [i_op->tmpfile() has userns]) - AC_DEFINE(HAVE_TMPFILE_DENTRY, 1, [i_op->tmpfile() uses old dentry signature]) ],[ - ZFS_LINUX_TEST_RESULT([inode_operations_tmpfile_dentry], [ + ZFS_LINUX_TEST_RESULT([inode_operations_tmpfile_dentry_userns], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_TMPFILE, 1, [i_op->tmpfile() exists]) + AC_DEFINE(HAVE_TMPFILE_USERNS, 1, [i_op->tmpfile() has userns]) AC_DEFINE(HAVE_TMPFILE_DENTRY, 1, [i_op->tmpfile() uses old dentry signature]) ],[ - ZFS_LINUX_REQUIRE_API([i_op->tmpfile()], [3.11]) + ZFS_LINUX_TEST_RESULT([inode_operations_tmpfile_dentry], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_TMPFILE, 1, [i_op->tmpfile() exists]) + AC_DEFINE(HAVE_TMPFILE_DENTRY, 1, [i_op->tmpfile() uses old dentry signature]) + ],[ + ZFS_LINUX_REQUIRE_API([i_op->tmpfile()], [3.11]) + ]) ]) ]) ]) diff --git a/config/kernel-xattr-handler.m4 b/config/kernel-xattr-handler.m4 index b6cbfa155007..6b8a08dbcc80 100644 --- a/config/kernel-xattr-handler.m4 +++ b/config/kernel-xattr-handler.m4 @@ -179,6 +179,21 @@ dnl # dnl # Supported xattr handler set() interfaces checked newest to oldest. 
dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_SET], [ + ZFS_LINUX_TEST_SRC([xattr_handler_set_mnt_idmap], [ + #include + + int set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) + { return 0; } + static const struct xattr_handler + xops __attribute__ ((unused)) = { + .set = set, + }; + ],[]) + ZFS_LINUX_TEST_SRC([xattr_handler_set_userns], [ #include @@ -240,53 +255,63 @@ AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_SET], [ dnl # The xattr_handler->set() callback was changed to 8 arguments, and dnl # struct user_namespace* was inserted as arg #2 dnl # - AC_MSG_CHECKING([whether xattr_handler->set() wants dentry, inode, and user_namespace]) - ZFS_LINUX_TEST_RESULT([xattr_handler_set_userns], [ + dnl # 6.3 API change, + dnl # The xattr_handler->set() callback 2nd arg is now struct mnt_idmap * + dnl # + AC_MSG_CHECKING([whether xattr_handler->set() wants dentry, inode, and mnt_idmap]) + ZFS_LINUX_TEST_RESULT([xattr_handler_set_mnt_idmap], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_XATTR_SET_USERNS, 1, - [xattr_handler->set() takes user_namespace]) - ],[ - dnl # - dnl # 4.7 API change, - dnl # The xattr_handler->set() callback was changed to take both - dnl # dentry and inode. - dnl # - AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether xattr_handler->set() wants dentry and inode]) - ZFS_LINUX_TEST_RESULT([xattr_handler_set_dentry_inode], [ + AC_DEFINE(HAVE_XATTR_SET_IDMAP, 1, + [xattr_handler->set() takes mnt_idmap]) + ], [ + AC_MSG_CHECKING([whether xattr_handler->set() wants dentry, inode, and user_namespace]) + ZFS_LINUX_TEST_RESULT([xattr_handler_set_userns], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_XATTR_SET_DENTRY_INODE, 1, - [xattr_handler->set() wants both dentry and inode]) + AC_DEFINE(HAVE_XATTR_SET_USERNS, 1, + [xattr_handler->set() takes user_namespace]) ],[ dnl # - dnl # 4.4 API change, - dnl # The xattr_handler->set() callback was changed to take a - dnl # xattr_handler, and handler_flags argument was removed and - dnl # should be accessed by handler->flags. + dnl # 4.7 API change, + dnl # The xattr_handler->set() callback was changed to take both + dnl # dentry and inode. dnl # AC_MSG_RESULT(no) - AC_MSG_CHECKING( - [whether xattr_handler->set() wants xattr_handler]) - ZFS_LINUX_TEST_RESULT([xattr_handler_set_xattr_handler], [ + AC_MSG_CHECKING([whether xattr_handler->set() wants dentry and inode]) + ZFS_LINUX_TEST_RESULT([xattr_handler_set_dentry_inode], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_XATTR_SET_HANDLER, 1, - [xattr_handler->set() wants xattr_handler]) + AC_DEFINE(HAVE_XATTR_SET_DENTRY_INODE, 1, + [xattr_handler->set() wants both dentry and inode]) ],[ dnl # - dnl # 2.6.33 API change, - dnl # The xattr_handler->set() callback was changed - dnl # to take a dentry instead of an inode, and a - dnl # handler_flags argument was added. + dnl # 4.4 API change, + dnl # The xattr_handler->set() callback was changed to take a + dnl # xattr_handler, and handler_flags argument was removed and + dnl # should be accessed by handler->flags. 
dnl # AC_MSG_RESULT(no) AC_MSG_CHECKING( - [whether xattr_handler->set() wants dentry]) - ZFS_LINUX_TEST_RESULT([xattr_handler_set_dentry], [ + [whether xattr_handler->set() wants xattr_handler]) + ZFS_LINUX_TEST_RESULT([xattr_handler_set_xattr_handler], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_XATTR_SET_DENTRY, 1, - [xattr_handler->set() wants dentry]) + AC_DEFINE(HAVE_XATTR_SET_HANDLER, 1, + [xattr_handler->set() wants xattr_handler]) ],[ - ZFS_LINUX_TEST_ERROR([xattr set()]) + dnl # + dnl # 2.6.33 API change, + dnl # The xattr_handler->set() callback was changed + dnl # to take a dentry instead of an inode, and a + dnl # handler_flags argument was added. + dnl # + AC_MSG_RESULT(no) + AC_MSG_CHECKING( + [whether xattr_handler->set() wants dentry]) + ZFS_LINUX_TEST_RESULT([xattr_handler_set_dentry], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_XATTR_SET_DENTRY, 1, + [xattr_handler->set() wants dentry]) + ],[ + ZFS_LINUX_TEST_ERROR([xattr set()]) + ]) ]) ]) ]) diff --git a/config/kernel.m4 b/config/kernel.m4 index fb07f5004d3c..439ffdf5a898 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -71,6 +71,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_SRC_XATTR ZFS_AC_KERNEL_SRC_ACL + ZFS_AC_KERNEL_SRC_INODE_SETATTR ZFS_AC_KERNEL_SRC_INODE_GETATTR ZFS_AC_KERNEL_SRC_INODE_SET_FLAGS ZFS_AC_KERNEL_SRC_INODE_SET_IVERSION @@ -133,7 +134,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_KSTRTOUL ZFS_AC_KERNEL_SRC_PERCPU ZFS_AC_KERNEL_SRC_CPU_HOTPLUG - ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR_USERNS + ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR ZFS_AC_KERNEL_SRC_MKNOD ZFS_AC_KERNEL_SRC_SYMLINK ZFS_AC_KERNEL_SRC_BIO_MAX_SEGS @@ -202,6 +203,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_XATTR ZFS_AC_KERNEL_ACL + ZFS_AC_KERNEL_INODE_SETATTR ZFS_AC_KERNEL_INODE_GETATTR ZFS_AC_KERNEL_INODE_SET_FLAGS ZFS_AC_KERNEL_INODE_SET_IVERSION @@ -264,7 +266,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_KSTRTOUL ZFS_AC_KERNEL_PERCPU ZFS_AC_KERNEL_CPU_HOTPLUG - ZFS_AC_KERNEL_GENERIC_FILLATTR_USERNS + ZFS_AC_KERNEL_GENERIC_FILLATTR ZFS_AC_KERNEL_MKNOD ZFS_AC_KERNEL_SYMLINK ZFS_AC_KERNEL_BIO_MAX_SEGS diff --git a/include/os/freebsd/spl/sys/types.h b/include/os/freebsd/spl/sys/types.h index 558843dcaa74..ebc93f4f4485 100644 --- a/include/os/freebsd/spl/sys/types.h +++ b/include/os/freebsd/spl/sys/types.h @@ -105,7 +105,7 @@ typedef u_longlong_t len_t; typedef longlong_t diskaddr_t; -typedef void zuserns_t; +typedef void zidmap_t; #include #endif /* !_OPENSOLARIS_SYS_TYPES_H_ */ diff --git a/include/os/freebsd/zfs/sys/zfs_vnops_os.h b/include/os/freebsd/zfs/sys/zfs_vnops_os.h index 839ee629a5ab..eddcab575b91 100644 --- a/include/os/freebsd/zfs/sys/zfs_vnops_os.h +++ b/include/os/freebsd/zfs/sys/zfs_vnops_os.h @@ -35,23 +35,23 @@ int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, int *rbehind, int *rahead, int last_size); extern int zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags); extern int zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, - znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp, zuserns_t *mnt_ns); + znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns); extern int zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags); extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr, - zuserns_t *mnt_ns); + zidmap_t *mnt_ns); extern int zfs_rename(znode_t *sdzp, const char *snm, znode_t *tdzp, const 
char *tnm, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, - zuserns_t *mnt_ns); + zidmap_t *mnt_ns); extern int zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, - const char *link, znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns); + const char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns); extern int zfs_link(znode_t *tdzp, znode_t *sp, const char *name, cred_t *cr, int flags); extern int zfs_space(znode_t *zp, int cmd, struct flock *bfp, int flag, offset_t offset, cred_t *cr); extern int zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, - zuserns_t *mnt_ns); + zidmap_t *mnt_ns); extern int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr); extern int zfs_write_simple(znode_t *zp, const void *data, size_t len, diff --git a/include/os/linux/kernel/linux/vfs_compat.h b/include/os/linux/kernel/linux/vfs_compat.h index fd0b9e8e1068..e156ed41c28c 100644 --- a/include/os/linux/kernel/linux/vfs_compat.h +++ b/include/os/linux/kernel/linux/vfs_compat.h @@ -341,7 +341,8 @@ static inline void zfs_gid_write(struct inode *ip, gid_t gid) * 4.9 API change */ #if !(defined(HAVE_SETATTR_PREPARE_NO_USERNS) || \ - defined(HAVE_SETATTR_PREPARE_USERNS)) + defined(HAVE_SETATTR_PREPARE_USERNS) || \ + defined(HAVE_SETATTR_PREPARE_IDMAP)) static inline int setattr_prepare(struct dentry *dentry, struct iattr *ia) { @@ -396,6 +397,15 @@ func(struct user_namespace *user_ns, const struct path *path, \ return (func##_impl(user_ns, path, stat, request_mask, \ query_flags)); \ } +#elif defined(HAVE_IDMAP_IOPS_GETATTR) +#define ZPL_GETATTR_WRAPPER(func) \ +static int \ +func(struct mnt_idmap *user_ns, const struct path *path, \ + struct kstat *stat, u32 request_mask, unsigned int query_flags) \ +{ \ + return (func##_impl(user_ns, path, stat, request_mask, \ + query_flags)); \ +} #else #error #endif @@ -447,8 +457,15 @@ zpl_is_32bit_api(void) * 5.12 API change * To support id-mapped mounts, generic_fillattr() was modified to * accept a new struct user_namespace* as its first arg. + * + * 6.3 API change + * generic_fillattr() first arg is changed to struct mnt_idmap * + * */ -#ifdef HAVE_GENERIC_FILLATTR_USERNS +#ifdef HAVE_GENERIC_FILLATTR_IDMAP +#define zpl_generic_fillattr(idmap, ip, sp) \ + generic_fillattr(idmap, ip, sp) +#elif defined(HAVE_GENERIC_FILLATTR_USERNS) #define zpl_generic_fillattr(user_ns, ip, sp) \ generic_fillattr(user_ns, ip, sp) #else diff --git a/include/os/linux/kernel/linux/xattr_compat.h b/include/os/linux/kernel/linux/xattr_compat.h index ff80fbb06413..bcc7289ad857 100644 --- a/include/os/linux/kernel/linux/xattr_compat.h +++ b/include/os/linux/kernel/linux/xattr_compat.h @@ -133,13 +133,28 @@ fn(const struct xattr_handler *handler, struct dentry *dentry, \ #error "Unsupported kernel" #endif +/* + * 6.3 API change, + * The xattr_handler->set() callback was changed to take the + * struct mnt_idmap* as the first arg, to support idmapped + * mounts. + */ +#if defined(HAVE_XATTR_SET_IDMAP) +#define ZPL_XATTR_SET_WRAPPER(fn) \ +static int \ +fn(const struct xattr_handler *handler, struct mnt_idmap *user_ns, \ + struct dentry *dentry, struct inode *inode, const char *name, \ + const void *buffer, size_t size, int flags) \ +{ \ + return (__ ## fn(user_ns, inode, name, buffer, size, flags)); \ +} /* * 5.12 API change, * The xattr_handler->set() callback was changed to take the * struct user_namespace* as the first arg, to support idmapped * mounts. 
*/ -#if defined(HAVE_XATTR_SET_USERNS) +#elif defined(HAVE_XATTR_SET_USERNS) #define ZPL_XATTR_SET_WRAPPER(fn) \ static int \ fn(const struct xattr_handler *handler, struct user_namespace *user_ns, \ diff --git a/include/os/linux/spl/sys/cred.h b/include/os/linux/spl/sys/cred.h index 75ad400d312d..7fd5f644863f 100644 --- a/include/os/linux/spl/sys/cred.h +++ b/include/os/linux/spl/sys/cred.h @@ -48,6 +48,8 @@ extern struct task_struct init_task; #define SGID_TO_KGID(x) (KGIDT_INIT(x)) #define KGIDP_TO_SGIDP(x) (&(x)->val) +extern zidmap_t *zfs_get_init_idmap(void); + /* Check if the user ns is the initial one */ static inline boolean_t zfs_is_init_userns(struct user_namespace *user_ns) @@ -74,36 +76,39 @@ static inline boolean_t zfs_no_idmapping(struct user_namespace *mnt_userns, return (zfs_is_init_userns(mnt_userns) || mnt_userns == fs_userns); } -static inline uid_t zfs_uid_to_vfsuid(struct user_namespace *mnt_userns, +static inline uid_t zfs_uid_to_vfsuid(zidmap_t *mnt_userns, struct user_namespace *fs_userns, uid_t uid) { - if (zfs_no_idmapping(mnt_userns, fs_userns)) + struct user_namespace *owner = idmap_owner(mnt_userns); + if (zfs_no_idmapping(owner, fs_userns)) return (uid); if (!zfs_is_init_userns(fs_userns)) uid = from_kuid(fs_userns, KUIDT_INIT(uid)); if (uid == (uid_t)-1) return (uid); - return (__kuid_val(make_kuid(mnt_userns, uid))); + return (__kuid_val(make_kuid(owner, uid))); } -static inline gid_t zfs_gid_to_vfsgid(struct user_namespace *mnt_userns, +static inline gid_t zfs_gid_to_vfsgid(zidmap_t *mnt_userns, struct user_namespace *fs_userns, gid_t gid) { - if (zfs_no_idmapping(mnt_userns, fs_userns)) + struct user_namespace *owner = idmap_owner(mnt_userns); + if (zfs_no_idmapping(owner, fs_userns)) return (gid); if (!zfs_is_init_userns(fs_userns)) gid = from_kgid(fs_userns, KGIDT_INIT(gid)); if (gid == (gid_t)-1) return (gid); - return (__kgid_val(make_kgid(mnt_userns, gid))); + return (__kgid_val(make_kgid(owner, gid))); } -static inline uid_t zfs_vfsuid_to_uid(struct user_namespace *mnt_userns, +static inline uid_t zfs_vfsuid_to_uid(zidmap_t *mnt_userns, struct user_namespace *fs_userns, uid_t uid) { - if (zfs_no_idmapping(mnt_userns, fs_userns)) + struct user_namespace *owner = idmap_owner(mnt_userns); + if (zfs_no_idmapping(owner, fs_userns)) return (uid); - uid = from_kuid(mnt_userns, KUIDT_INIT(uid)); + uid = from_kuid(owner, KUIDT_INIT(uid)); if (uid == (uid_t)-1) return (uid); if (zfs_is_init_userns(fs_userns)) @@ -111,12 +116,13 @@ static inline uid_t zfs_vfsuid_to_uid(struct user_namespace *mnt_userns, return (__kuid_val(make_kuid(fs_userns, uid))); } -static inline gid_t zfs_vfsgid_to_gid(struct user_namespace *mnt_userns, +static inline gid_t zfs_vfsgid_to_gid(zidmap_t *mnt_userns, struct user_namespace *fs_userns, gid_t gid) { - if (zfs_no_idmapping(mnt_userns, fs_userns)) + struct user_namespace *owner = idmap_owner(mnt_userns); + if (zfs_no_idmapping(owner, fs_userns)) return (gid); - gid = from_kgid(mnt_userns, KGIDT_INIT(gid)); + gid = from_kgid(owner, KGIDT_INIT(gid)); if (gid == (gid_t)-1) return (gid); if (zfs_is_init_userns(fs_userns)) diff --git a/include/os/linux/spl/sys/types.h b/include/os/linux/spl/sys/types.h index cae1bbddf105..a7666187ec23 100644 --- a/include/os/linux/spl/sys/types.h +++ b/include/os/linux/spl/sys/types.h @@ -55,6 +55,19 @@ typedef int major_t; typedef int minor_t; struct user_namespace; -typedef struct user_namespace zuserns_t; +#ifdef HAVE_IOPS_CREATE_IDMAP +#include +struct mnt_idmap { + struct user_namespace *owner; + 
refcount_t count; +}; +typedef struct mnt_idmap zidmap_t; +#define idmap_owner(p) (((struct mnt_idmap *)p)->owner) +#else +typedef struct user_namespace zidmap_t; +#define idmap_owner(p) ((struct user_namespace *)p) +#endif + +extern zidmap_t *zfs_init_idmap; #endif /* _SPL_TYPES_H */ diff --git a/include/os/linux/zfs/sys/policy.h b/include/os/linux/zfs/sys/policy.h index b182da95b8e0..0c265db78591 100644 --- a/include/os/linux/zfs/sys/policy.h +++ b/include/os/linux/zfs/sys/policy.h @@ -47,14 +47,14 @@ int secpolicy_vnode_create_gid(const cred_t *); int secpolicy_vnode_remove(const cred_t *); int secpolicy_vnode_setdac(const cred_t *, uid_t); int secpolicy_vnode_setid_retain(struct znode *, const cred_t *, boolean_t); -int secpolicy_vnode_setids_setgids(const cred_t *, gid_t, zuserns_t *, - zuserns_t *); +int secpolicy_vnode_setids_setgids(const cred_t *, gid_t, zidmap_t *, + struct user_namespace *); int secpolicy_zinject(const cred_t *); int secpolicy_zfs(const cred_t *); int secpolicy_zfs_proc(const cred_t *, proc_t *); void secpolicy_setid_clear(vattr_t *, cred_t *); int secpolicy_setid_setsticky_clear(struct inode *, vattr_t *, - const vattr_t *, cred_t *, zuserns_t *, zuserns_t *); + const vattr_t *, cred_t *, zidmap_t *, struct user_namespace *); int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, mode_t); int secpolicy_vnode_setattr(cred_t *, struct inode *, struct vattr *, const struct vattr *, int, int (void *, int, cred_t *), void *); diff --git a/include/os/linux/zfs/sys/zfs_vnops_os.h b/include/os/linux/zfs/sys/zfs_vnops_os.h index 1caec0ef4f1d..7a1db7deeec8 100644 --- a/include/os/linux/zfs/sys/zfs_vnops_os.h +++ b/include/os/linux/zfs/sys/zfs_vnops_os.h @@ -46,25 +46,24 @@ extern int zfs_lookup(znode_t *dzp, char *nm, znode_t **zpp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp); extern int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, - zuserns_t *mnt_ns); + zidmap_t *mnt_ns); extern int zfs_tmpfile(struct inode *dip, vattr_t *vapzfs, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, - zuserns_t *mnt_ns); + zidmap_t *mnt_ns); extern int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags); extern int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, - znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp, zuserns_t *mnt_ns); + znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns); extern int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags); extern int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr); -extern int zfs_getattr_fast(struct user_namespace *, struct inode *ip, - struct kstat *sp); +extern int zfs_getattr_fast(zidmap_t *, struct inode *ip, struct kstat *sp); extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr, - zuserns_t *mnt_ns); + zidmap_t *mnt_ns); extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, - zuserns_t *mnt_ns); + zidmap_t *mnt_ns); extern int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, - char *link, znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns); + char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns); extern int zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr); extern int zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, int flags); diff --git a/include/os/linux/zfs/sys/zpl.h 
b/include/os/linux/zfs/sys/zpl.h index ac1f01a86c41..2b302e9dab07 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -39,7 +39,7 @@ /* zpl_inode.c */ extern void zpl_vap_init(vattr_t *vap, struct inode *dir, - umode_t mode, cred_t *cr, zuserns_t *mnt_ns); + umode_t mode, cred_t *cr, zidmap_t *mnt_ns); extern const struct inode_operations zpl_inode_operations; #ifdef HAVE_RENAME2_OPERATIONS_WRAPPER @@ -68,7 +68,10 @@ extern int zpl_xattr_security_init(struct inode *ip, struct inode *dip, const struct qstr *qstr); #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) -#if defined(HAVE_SET_ACL_USERNS) +#if defined(HAVE_SET_ACL_IDMAP_DENTRY) +extern int zpl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, + struct posix_acl *acl, int type); +#elif defined(HAVE_SET_ACL_USERNS) extern int zpl_set_acl(struct user_namespace *userns, struct inode *ip, struct posix_acl *acl, int type); #elif defined(HAVE_SET_ACL_USERNS_DENTRY_ARG2) @@ -189,13 +192,15 @@ zpl_dir_emit_dots(struct file *file, zpl_dir_context_t *ctx) #if defined(HAVE_INODE_OWNER_OR_CAPABLE) #define zpl_inode_owner_or_capable(ns, ip) inode_owner_or_capable(ip) -#elif defined(HAVE_INODE_OWNER_OR_CAPABLE_IDMAPPED) +#elif defined(HAVE_INODE_OWNER_OR_CAPABLE_USERNS) #define zpl_inode_owner_or_capable(ns, ip) inode_owner_or_capable(ns, ip) +#elif defined(HAVE_INODE_OWNER_OR_CAPABLE_IDMAP) +#define zpl_inode_owner_or_capable(idmap, ip) inode_owner_or_capable(idmap, ip) #else #error "Unsupported kernel" #endif -#ifdef HAVE_SETATTR_PREPARE_USERNS +#if defined(HAVE_SETATTR_PREPARE_USERNS) || defined(HAVE_SETATTR_PREPARE_IDMAP) #define zpl_setattr_prepare(ns, dentry, ia) setattr_prepare(ns, dentry, ia) #else /* diff --git a/include/sys/zfs_acl.h b/include/sys/zfs_acl.h index e5c570c474a3..e19288528849 100644 --- a/include/sys/zfs_acl.h +++ b/include/sys/zfs_acl.h @@ -206,7 +206,7 @@ struct zfsvfs; #ifdef _KERNEL int zfs_acl_ids_create(struct znode *, int, vattr_t *, - cred_t *, vsecattr_t *, zfs_acl_ids_t *, zuserns_t *); + cred_t *, vsecattr_t *, zfs_acl_ids_t *, zidmap_t *); void zfs_acl_ids_free(zfs_acl_ids_t *); boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *, uint64_t); int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); @@ -216,15 +216,15 @@ void zfs_oldace_byteswap(ace_t *, int); void zfs_ace_byteswap(void *, size_t, boolean_t); extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr); extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *, - zuserns_t *); + zidmap_t *); int zfs_fastaccesschk_execute(struct znode *, cred_t *); -extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *, zuserns_t *); +extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *, zidmap_t *); extern int zfs_zaccess_unix(void *, int, cred_t *); extern int zfs_acl_access(struct znode *, int, cred_t *); int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); -int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *, zuserns_t *); +int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *, zidmap_t *); int zfs_zaccess_rename(struct znode *, struct znode *, - struct znode *, struct znode *, cred_t *cr, zuserns_t *mnt_ns); + struct znode *, struct znode *, cred_t *cr, zidmap_t *mnt_ns); void zfs_acl_free(zfs_acl_t *); int zfs_vsec_2_aclp(struct zfsvfs *, umode_t, vsecattr_t *, cred_t *, struct zfs_fuid_info **, zfs_acl_t **); diff --git a/module/os/freebsd/zfs/zfs_acl.c b/module/os/freebsd/zfs/zfs_acl.c index 9f735dbb558c..a077076927a1 
100644 --- a/module/os/freebsd/zfs/zfs_acl.c +++ b/module/os/freebsd/zfs/zfs_acl.c @@ -1619,7 +1619,7 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, */ int zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, - vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids, zuserns_t *mnt_ns) + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids, zidmap_t *mnt_ns) { int error; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; @@ -2341,7 +2341,7 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) */ int zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr, - zuserns_t *mnt_ns) + zidmap_t *mnt_ns) { uint32_t working_mode; int error; @@ -2471,7 +2471,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr, */ int zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr, - zuserns_t *mnt_ns) + zidmap_t *mnt_ns) { return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr, mnt_ns)); @@ -2541,7 +2541,7 @@ zfs_delete_final_check(znode_t *zp, znode_t *dzp, * */ int -zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr, zuserns_t *mnt_ns) +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr, zidmap_t *mnt_ns) { uint32_t dzp_working_mode = 0; uint32_t zp_working_mode = 0; @@ -2628,7 +2628,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr, zuserns_t *mnt_ns) int zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, - znode_t *tzp, cred_t *cr, zuserns_t *mnt_ns) + znode_t *tzp, cred_t *cr, zidmap_t *mnt_ns) { int add_perm; int error; diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index b3405b7593f4..8abd7239ad2e 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -1053,7 +1053,7 @@ zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, */ int zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, - znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zuserns_t *mnt_ns) + znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns) { (void) excl, (void) mode, (void) flag; znode_t *zp; @@ -1405,7 +1405,7 @@ zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags) */ int zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, - cred_t *cr, int flags, vsecattr_t *vsecp, zuserns_t *mnt_ns) + cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns) { (void) flags, (void) vsecp; znode_t *zp; @@ -2159,7 +2159,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) * vp - ctime updated, mtime updated if size changed. 
*/ int -zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zuserns_t *mnt_ns) +zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) { vnode_t *vp = ZTOV(zp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; @@ -3420,7 +3420,7 @@ zfs_do_rename_impl(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, int zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname, - cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns) + cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns) { struct componentname scn, tcn; vnode_t *sdvp, *tdvp; @@ -3477,7 +3477,7 @@ zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname, */ int zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, - const char *link, znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns) + const char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns) { (void) flags; znode_t *zp; diff --git a/module/os/linux/spl/spl-cred.c b/module/os/linux/spl/spl-cred.c index f81b9540a639..d407fc66b2de 100644 --- a/module/os/linux/spl/spl-cred.c +++ b/module/os/linux/spl/spl-cred.c @@ -145,6 +145,18 @@ crgetgid(const cred_t *cr) return (KGID_TO_SGID(cr->fsgid)); } +/* Return the initial user ns or nop_mnt_idmap */ +zidmap_t * +zfs_get_init_idmap(void) +{ +#ifdef HAVE_IOPS_CREATE_IDMAP + return ((zidmap_t *)&nop_mnt_idmap); +#else + return ((zidmap_t *)&init_user_ns); +#endif +} + +EXPORT_SYMBOL(zfs_get_init_idmap); EXPORT_SYMBOL(crhold); EXPORT_SYMBOL(crfree); EXPORT_SYMBOL(crgetuid); diff --git a/module/os/linux/zfs/policy.c b/module/os/linux/zfs/policy.c index eaf38df864d3..5d1b4383412a 100644 --- a/module/os/linux/zfs/policy.c +++ b/module/os/linux/zfs/policy.c @@ -124,7 +124,7 @@ secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner) if (crgetuid(cr) == owner) return (0); - if (zpl_inode_owner_or_capable(kcred->user_ns, ip)) + if (zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (0); #if defined(CONFIG_USER_NS) @@ -214,8 +214,8 @@ secpolicy_vnode_setid_retain(struct znode *zp __maybe_unused, const cred_t *cr, * Determine that subject can set the file setgid flag. */ int -secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid, zuserns_t *mnt_ns, - zuserns_t *fs_ns) +secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid, zidmap_t *mnt_ns, + struct user_namespace *fs_ns) { gid = zfs_gid_to_vfsgid(mnt_ns, fs_ns, gid); #if defined(CONFIG_USER_NS) @@ -286,8 +286,8 @@ secpolicy_setid_clear(vattr_t *vap, cred_t *cr) * Determine that subject can set the file setid flags. 
*/ static int -secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner, zuserns_t *mnt_ns, - zuserns_t *fs_ns) +secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner, zidmap_t *mnt_ns, + struct user_namespace *fs_ns) { owner = zfs_uid_to_vfsuid(mnt_ns, fs_ns, owner); @@ -315,7 +315,8 @@ secpolicy_vnode_stky_modify(const cred_t *cr) int secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap, - const vattr_t *ovap, cred_t *cr, zuserns_t *mnt_ns, zuserns_t *fs_ns) + const vattr_t *ovap, cred_t *cr, zidmap_t *mnt_ns, + struct user_namespace *fs_ns) { int error; diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c index db1bb9577197..df4ebc3870be 100644 --- a/module/os/linux/zfs/zfs_acl.c +++ b/module/os/linux/zfs/zfs_acl.c @@ -1802,7 +1802,7 @@ zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t va_mode, zfs_acl_t *paclp, */ int zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, - vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids, zuserns_t *mnt_ns) + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids, zidmap_t *mnt_ns) { int error; zfsvfs_t *zfsvfs = ZTOZSB(dzp); @@ -1981,7 +1981,7 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) return (SET_ERROR(ENOSYS)); if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr, - kcred->user_ns))) + zfs_init_idmap))) return (error); mutex_enter(&zp->z_acl_lock); @@ -2141,7 +2141,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) return (SET_ERROR(EPERM)); if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, - kcred->user_ns))) + zfs_init_idmap))) return (error); error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp, @@ -2286,7 +2286,7 @@ zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) */ static int zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, - boolean_t anyaccess, cred_t *cr, zuserns_t *mnt_ns) + boolean_t anyaccess, cred_t *cr, zidmap_t *mnt_ns) { zfsvfs_t *zfsvfs = ZTOZSB(zp); zfs_acl_t *aclp; @@ -2420,7 +2420,7 @@ zfs_has_access(znode_t *zp, cred_t *cr) uint32_t have = ACE_ALL_PERMS; if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr, - kcred->user_ns) != 0) { + zfs_init_idmap) != 0) { uid_t owner; owner = zfs_fuid_map_id(ZTOZSB(zp), @@ -2451,7 +2451,7 @@ zfs_has_access(znode_t *zp, cred_t *cr) */ static int zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr, - zuserns_t *mnt_ns) + zidmap_t *mnt_ns) { int err, mask; int unmapped = 0; @@ -2464,11 +2464,10 @@ zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr, return (unmapped ? 
SET_ERROR(EPERM) : 0); } -#if defined(HAVE_IOPS_PERMISSION_USERNS) +#if (defined(HAVE_IOPS_PERMISSION_USERNS) || \ + defined(HAVE_IOPS_PERMISSION_IDMAP)) if (mnt_ns) err = generic_permission(mnt_ns, ZTOI(zp), mask); - else - err = generic_permission(cr->user_ns, ZTOI(zp), mask); #else err = generic_permission(ZTOI(zp), mask); #endif @@ -2483,7 +2482,7 @@ zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr, static int zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, - boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr, zuserns_t *mnt_ns) + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr, zidmap_t *mnt_ns) { zfsvfs_t *zfsvfs = ZTOZSB(zp); int err; @@ -2540,7 +2539,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, static int zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, - cred_t *cr, zuserns_t *mnt_ns) + cred_t *cr, zidmap_t *mnt_ns) { if (*working_mode != ACE_WRITE_DATA) return (SET_ERROR(EACCES)); @@ -2612,7 +2611,7 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) if ((error = zfs_enter(ZTOZSB(zdp), FTAG)) != 0) return (error); error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, - kcred->user_ns); + zfs_init_idmap); zfs_exit(ZTOZSB(zdp), FTAG); return (error); } @@ -2625,7 +2624,7 @@ zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) */ int zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr, - zuserns_t *mnt_ns) + zidmap_t *mnt_ns) { uint32_t working_mode; int error; @@ -2774,7 +2773,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr, */ int zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr, - zuserns_t *mnt_ns) + zidmap_t *mnt_ns) { return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr, mnt_ns)); @@ -2788,7 +2787,7 @@ zfs_zaccess_unix(void *zp, int mode, cred_t *cr) { int v4_mode = zfs_unix_to_v4(mode >> 6); - return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr, kcred->user_ns)); + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr, zfs_init_idmap)); } /* See zfs_zaccess_delete() */ @@ -2865,7 +2864,7 @@ static const boolean_t zfs_write_implies_delete_child = B_TRUE; * zfs_write_implies_delete_child */ int -zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr, zuserns_t *mnt_ns) +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr, zidmap_t *mnt_ns) { uint32_t wanted_dirperms; uint32_t dzp_working_mode = 0; @@ -2996,7 +2995,7 @@ zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr, zuserns_t *mnt_ns) int zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, - znode_t *tzp, cred_t *cr, zuserns_t *mnt_ns) + znode_t *tzp, cred_t *cr, zidmap_t *mnt_ns) { int add_perm; int error; diff --git a/module/os/linux/zfs/zfs_dir.c b/module/os/linux/zfs/zfs_dir.c index 1fec4ea09317..1eeabe53d23c 100644 --- a/module/os/linux/zfs/zfs_dir.c +++ b/module/os/linux/zfs/zfs_dir.c @@ -1120,7 +1120,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr) *xzpp = NULL; if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, - &acl_ids, kcred->user_ns)) != 0) + &acl_ids, zfs_init_idmap)) != 0) return (error); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) { zfs_acl_ids_free(&acl_ids); @@ -1269,7 +1269,7 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) if ((uid = crgetuid(cr)) == downer || uid == fowner || zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, - kcred->user_ns) == 0) + zfs_init_idmap) == 0) return (0); else return 
(secpolicy_vnode_remove(cr)); diff --git a/module/os/linux/zfs/zfs_ioctl_os.c b/module/os/linux/zfs/zfs_ioctl_os.c index f068f544f0ec..663474ea49ab 100644 --- a/module/os/linux/zfs/zfs_ioctl_os.c +++ b/module/os/linux/zfs/zfs_ioctl_os.c @@ -282,6 +282,8 @@ zfsdev_detach(void) #define ZFS_DEBUG_STR "" #endif +zidmap_t *zfs_init_idmap; + static int openzfs_init_os(void) { @@ -305,6 +307,8 @@ openzfs_init_os(void) printk(KERN_NOTICE "ZFS: Posix ACLs disabled by kernel\n"); #endif /* CONFIG_FS_POSIX_ACL */ + zfs_init_idmap = (zidmap_t *)zfs_get_init_idmap(); + return (0); } diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index b8af3e3b058d..234c4d5ef0e0 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -487,7 +487,7 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, */ if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, - B_TRUE, cr, kcred->user_ns))) { + B_TRUE, cr, zfs_init_idmap))) { zrele(*zpp); *zpp = NULL; } @@ -506,7 +506,7 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, */ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, - kcred->user_ns))) { + zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } @@ -551,7 +551,7 @@ zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, - zuserns_t *mnt_ns) + zidmap_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); @@ -799,7 +799,7 @@ zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, - zuserns_t *mnt_ns) + zidmap_t *mnt_ns) { (void) excl, (void) mode, (void) flag; znode_t *zp = NULL, *dzp = ITOZ(dip); @@ -984,7 +984,7 @@ zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) return (error); } - if ((error = zfs_zaccess_delete(dzp, zp, cr, kcred->user_ns))) { + if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { goto out; } @@ -1179,7 +1179,7 @@ zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) */ int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, - cred_t *cr, int flags, vsecattr_t *vsecp, zuserns_t *mnt_ns) + cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); @@ -1400,7 +1400,7 @@ zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, return (error); } - if ((error = zfs_zaccess_delete(dzp, zp, cr, kcred->user_ns))) { + if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { goto out; } @@ -1652,8 +1652,7 @@ zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) * RETURN: 0 (always succeeds) */ int -zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip, - struct kstat *sp) +zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); @@ -1841,7 +1840,7 @@ zfs_setattr_dir(znode_t *dzp) * ip - ctime updated, mtime updated if size changed. 
*/ int -zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zuserns_t *mnt_ns) +zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) { struct inode *ip; zfsvfs_t *zfsvfs = ZTOZSB(zp); @@ -2038,10 +2037,10 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zuserns_t *mnt_ns) * Take ownership or chgrp to group we are a member of */ - uid = zfs_uid_to_vfsuid((struct user_namespace *)mnt_ns, - zfs_i_user_ns(ip), vap->va_uid); - gid = zfs_gid_to_vfsgid((struct user_namespace *)mnt_ns, - zfs_i_user_ns(ip), vap->va_gid); + uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip), + vap->va_uid); + gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip), + vap->va_gid); take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); take_group = (mask & ATTR_GID) && zfs_groupmember(zfsvfs, gid, cr); @@ -2680,7 +2679,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) */ int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, - cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zuserns_t *mnt_ns) + cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns) { znode_t *szp, *tzp; zfsvfs_t *zfsvfs = ZTOZSB(sdzp); @@ -3213,7 +3212,7 @@ zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, */ int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, - znode_t **zpp, cred_t *cr, int flags, zuserns_t *mnt_ns) + znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns) { znode_t *zp; zfs_dirlock_t *dl; @@ -3521,7 +3520,7 @@ zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, - kcred->user_ns))) { + zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } @@ -4136,7 +4135,7 @@ zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, * operates directly on inodes, so we need to check access rights. */ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, - kcred->user_ns))) { + zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index 38cdccfd8084..c104cd661bf5 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -1963,7 +1963,7 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) } VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, - cr, NULL, &acl_ids, kcred->user_ns)); + cr, NULL, &acl_ids, zfs_init_idmap)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); diff --git a/module/os/linux/zfs/zpl_ctldir.c b/module/os/linux/zfs/zpl_ctldir.c index f0779c81dc75..68a7de78f471 100644 --- a/module/os/linux/zfs/zpl_ctldir.c +++ b/module/os/linux/zfs/zpl_ctldir.c @@ -103,7 +103,11 @@ zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir) * Get root directory attributes. 
*/ static int -#ifdef HAVE_USERNS_IOPS_GETATTR +#ifdef HAVE_IDMAP_IOPS_GETATTR +zpl_root_getattr_impl(struct mnt_idmap *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#elif defined(HAVE_USERNS_IOPS_GETATTR) zpl_root_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) @@ -115,9 +119,11 @@ zpl_root_getattr_impl(const struct path *path, struct kstat *stat, (void) request_mask, (void) query_flags; struct inode *ip = path->dentry->d_inode; -#ifdef HAVE_USERNS_IOPS_GETATTR +#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, ip, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP) + generic_fillattr(user_ns, ip, stat); #else (void) user_ns; #endif @@ -312,6 +318,10 @@ static int zpl_snapdir_rename2(struct user_namespace *user_ns, struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) +#elif defined(HAVE_IOPS_RENAME_IDMAP) +zpl_snapdir_rename2(struct mnt_idmap *user_ns, struct inode *sdip, + struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, + unsigned int flags) #else zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) @@ -333,7 +343,9 @@ zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, return (error); } -#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS) +#if (!defined(HAVE_RENAME_WANTS_FLAGS) && \ + !defined(HAVE_IOPS_RENAME_USERNS) && \ + !defined(HAVE_IOPS_RENAME_IDMAP)) static int zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) @@ -360,6 +372,9 @@ static int #ifdef HAVE_IOPS_MKDIR_USERNS zpl_snapdir_mkdir(struct user_namespace *user_ns, struct inode *dip, struct dentry *dentry, umode_t mode) +#elif defined(HAVE_IOPS_MKDIR_IDMAP) +zpl_snapdir_mkdir(struct mnt_idmap *user_ns, struct inode *dip, + struct dentry *dentry, umode_t mode) #else zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) #endif @@ -371,10 +386,10 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); -#ifdef HAVE_IOPS_MKDIR_USERNS +#if (defined(HAVE_IOPS_MKDIR_USERNS) || defined(HAVE_IOPS_MKDIR_IDMAP)) zpl_vap_init(vap, dip, mode | S_IFDIR, cr, user_ns); #else - zpl_vap_init(vap, dip, mode | S_IFDIR, cr, kcred->user_ns); + zpl_vap_init(vap, dip, mode | S_IFDIR, cr, zfs_init_idmap); #endif error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); @@ -395,7 +410,11 @@ zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) * Get snapshot directory attributes. 
*/ static int -#ifdef HAVE_USERNS_IOPS_GETATTR +#ifdef HAVE_IDMAP_IOPS_GETATTR +zpl_snapdir_getattr_impl(struct mnt_idmap *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#elif defined(HAVE_USERNS_IOPS_GETATTR) zpl_snapdir_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) @@ -411,9 +430,11 @@ zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, if ((error = zpl_enter(zfsvfs, FTAG)) != 0) return (error); -#ifdef HAVE_USERNS_IOPS_GETATTR +#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, ip, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP) + generic_fillattr(user_ns, ip, stat); #else (void) user_ns; #endif @@ -471,7 +492,9 @@ const struct file_operations zpl_fops_snapdir = { const struct inode_operations zpl_ops_snapdir = { .lookup = zpl_snapdir_lookup, .getattr = zpl_snapdir_getattr, -#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) +#if (defined(HAVE_RENAME_WANTS_FLAGS) || \ + defined(HAVE_IOPS_RENAME_USERNS) || \ + defined(HAVE_IOPS_RENAME_IDMAP)) .rename = zpl_snapdir_rename2, #else .rename = zpl_snapdir_rename, @@ -562,6 +585,10 @@ static int zpl_shares_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#elif defined(HAVE_IDMAP_IOPS_GETATTR) +zpl_shares_getattr_impl(struct mnt_idmap *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) #else zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) @@ -577,9 +604,11 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, return (error); if (zfsvfs->z_shares_dir == 0) { -#ifdef HAVE_USERNS_IOPS_GETATTR +#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) #ifdef HAVE_GENERIC_FILLATTR_USERNS generic_fillattr(user_ns, path->dentry->d_inode, stat); +#elif defined(HAVE_GENERIC_FILLATTR_IDMAP) + generic_fillattr(user_ns, path->dentry->d_inode, stat); #else (void) user_ns; #endif @@ -594,12 +623,8 @@ zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); if (error == 0) { -#ifdef HAVE_USERNS_IOPS_GETATTR -#ifdef HAVE_GENERIC_FILLATTR_USERNS +#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) error = -zfs_getattr_fast(user_ns, ZTOI(dzp), stat); -#else - (void) user_ns; -#endif #else error = -zfs_getattr_fast(kcred->user_ns, ZTOI(dzp), stat); #endif diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index ce22e9a9e0e4..e690525d3cd4 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1049,7 +1049,7 @@ __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) !capable(CAP_LINUX_IMMUTABLE)) return (-EPERM); - if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) + if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (-EACCES); xva_init(xva); @@ -1096,7 +1096,7 @@ zpl_ioctl_setflags(struct file *filp, void __user *arg) crhold(cr); cookie = spl_fstrans_mark(); - err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, kcred->user_ns); + err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); @@ -1144,7 +1144,7 @@ zpl_ioctl_setxattr(struct 
file *filp, void __user *arg) crhold(cr); cookie = spl_fstrans_mark(); - err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, kcred->user_ns); + err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); @@ -1179,7 +1179,7 @@ __zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva) !capable(CAP_LINUX_IMMUTABLE)) return (-EPERM); - if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) + if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (-EACCES); xva_init(xva); @@ -1232,7 +1232,7 @@ zpl_ioctl_setdosflags(struct file *filp, void __user *arg) crhold(cr); cookie = spl_fstrans_mark(); - err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, kcred->user_ns); + err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c index 993447e54683..5f5ad186a61c 100644 --- a/module/os/linux/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -113,12 +113,12 @@ zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) void zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr, - zuserns_t *mnt_ns) + zidmap_t *mnt_ns) { vap->va_mask = ATTR_MODE; vap->va_mode = mode; - vap->va_uid = zfs_vfsuid_to_uid((struct user_namespace *)mnt_ns, + vap->va_uid = zfs_vfsuid_to_uid(mnt_ns, zfs_i_user_ns(dir), crgetuid(cr)); if (dir->i_mode & S_ISGID) { @@ -126,7 +126,7 @@ zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr, if (S_ISDIR(mode)) vap->va_mode |= S_ISGID; } else { - vap->va_gid = zfs_vfsgid_to_gid((struct user_namespace *)mnt_ns, + vap->va_gid = zfs_vfsgid_to_gid(mnt_ns, zfs_i_user_ns(dir), crgetgid(cr)); } } @@ -135,6 +135,9 @@ static int #ifdef HAVE_IOPS_CREATE_USERNS zpl_create(struct user_namespace *user_ns, struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) +#elif defined(HAVE_IOPS_CREATE_IDMAP) +zpl_create(struct mnt_idmap *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool flag) #else zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) #endif @@ -144,8 +147,8 @@ zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) vattr_t *vap; int error; fstrans_cookie_t cookie; -#ifndef HAVE_IOPS_CREATE_USERNS - zuserns_t *user_ns = kcred->user_ns; +#if !(defined(HAVE_IOPS_CREATE_USERNS) || defined(HAVE_IOPS_CREATE_IDMAP)) + zidmap_t *user_ns = kcred->user_ns; #endif crhold(cr); @@ -181,6 +184,9 @@ static int #ifdef HAVE_IOPS_MKNOD_USERNS zpl_mknod(struct user_namespace *user_ns, struct inode *dir, struct dentry *dentry, umode_t mode, +#elif defined(HAVE_IOPS_MKNOD_IDMAP) +zpl_mknod(struct mnt_idmap *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode, #else zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, #endif @@ -191,8 +197,8 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, vattr_t *vap; int error; fstrans_cookie_t cookie; -#ifndef HAVE_IOPS_MKNOD_USERNS - zuserns_t *user_ns = kcred->user_ns; +#if !(defined(HAVE_IOPS_MKNOD_USERNS) || defined(HAVE_IOPS_MKNOD_IDMAP)) + zidmap_t *user_ns = kcred->user_ns; #endif /* @@ -234,7 +240,10 @@ zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, #ifdef HAVE_TMPFILE static int -#ifndef HAVE_TMPFILE_DENTRY +#ifdef HAVE_TMPFILE_IDMAP +zpl_tmpfile(struct mnt_idmap *userns, struct inode *dir, + struct file *file, umode_t mode) +#elif !defined(HAVE_TMPFILE_DENTRY) 
zpl_tmpfile(struct user_namespace *userns, struct inode *dir, struct file *file, umode_t mode) #else @@ -251,8 +260,8 @@ zpl_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) vattr_t *vap; int error; fstrans_cookie_t cookie; -#ifndef HAVE_TMPFILE_USERNS - zuserns_t *userns = kcred->user_ns; +#if !(defined(HAVE_TMPFILE_USERNS) || defined(HAVE_TMPFILE_IDMAP)) + zidmap_t *userns = kcred->user_ns; #endif crhold(cr); @@ -330,6 +339,9 @@ static int #ifdef HAVE_IOPS_MKDIR_USERNS zpl_mkdir(struct user_namespace *user_ns, struct inode *dir, struct dentry *dentry, umode_t mode) +#elif defined(HAVE_IOPS_MKDIR_IDMAP) +zpl_mkdir(struct mnt_idmap *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode) #else zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) #endif @@ -339,8 +351,8 @@ zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) znode_t *zp; int error; fstrans_cookie_t cookie; -#ifndef HAVE_IOPS_MKDIR_USERNS - zuserns_t *user_ns = kcred->user_ns; +#if !(defined(HAVE_IOPS_MKDIR_USERNS) || defined(HAVE_IOPS_MKDIR_IDMAP)) + zidmap_t *user_ns = kcred->user_ns; #endif crhold(cr); @@ -403,6 +415,10 @@ static int zpl_getattr_impl(struct user_namespace *user_ns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#elif defined(HAVE_IDMAP_IOPS_GETATTR) +zpl_getattr_impl(struct mnt_idmap *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) #else zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) @@ -419,7 +435,7 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, * XXX query_flags currently ignored. */ -#ifdef HAVE_USERNS_IOPS_GETATTR +#if (defined(HAVE_USERNS_IOPS_GETATTR) || defined(HAVE_IDMAP_IOPS_GETATTR)) error = -zfs_getattr_fast(user_ns, ip, stat); #else error = -zfs_getattr_fast(kcred->user_ns, ip, stat); @@ -458,9 +474,12 @@ zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, ZPL_GETATTR_WRAPPER(zpl_getattr); static int -#ifdef HAVE_SETATTR_PREPARE_USERNS +#ifdef HAVE_USERNS_IOPS_SETATTR zpl_setattr(struct user_namespace *user_ns, struct dentry *dentry, struct iattr *ia) +#elif defined(HAVE_IDMAP_IOPS_SETATTR) +zpl_setattr(struct mnt_idmap *user_ns, struct dentry *dentry, + struct iattr *ia) #else zpl_setattr(struct dentry *dentry, struct iattr *ia) #endif @@ -473,8 +492,10 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) #ifdef HAVE_SETATTR_PREPARE_USERNS error = zpl_setattr_prepare(user_ns, dentry, ia); +#elif defined(HAVE_SETATTR_PREPARE_IDMAP) + error = zpl_setattr_prepare(user_ns, dentry, ia); #else - error = zpl_setattr_prepare(kcred->user_ns, dentry, ia); + error = zpl_setattr_prepare(zfs_init_idmap, dentry, ia); #endif if (error) return (error); @@ -506,10 +527,12 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip); cookie = spl_fstrans_mark(); -#ifdef HAVE_SETATTR_PREPARE_USERNS +#ifdef HAVE_USERNS_IOPS_SETATTR + error = -zfs_setattr(ITOZ(ip), vap, 0, cr, user_ns); +#elif defined(HAVE_IDMAP_IOPS_SETATTR) error = -zfs_setattr(ITOZ(ip), vap, 0, cr, user_ns); #else - error = -zfs_setattr(ITOZ(ip), vap, 0, cr, kcred->user_ns); + error = -zfs_setattr(ITOZ(ip), vap, 0, cr, zfs_init_idmap); #endif if (!error && (ia->ia_valid & ATTR_MODE)) error = zpl_chmod_acl(ip); @@ -527,6 +550,10 @@ static int zpl_rename2(struct user_namespace *user_ns, struct inode *sdip, struct dentry 
*sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int rflags) +#elif defined(HAVE_IOPS_RENAME_IDMAP) +zpl_rename2(struct mnt_idmap *user_ns, struct inode *sdip, + struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, + unsigned int rflags) #else zpl_rename2(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int rflags) @@ -536,8 +563,8 @@ zpl_rename2(struct inode *sdip, struct dentry *sdentry, vattr_t *wo_vap = NULL; int error; fstrans_cookie_t cookie; -#ifndef HAVE_IOPS_RENAME_USERNS - zuserns_t *user_ns = kcred->user_ns; +#if !(defined(HAVE_IOPS_RENAME_USERNS) || defined(HAVE_IOPS_RENAME_IDMAP)) + zidmap_t *user_ns = kcred->user_ns; #endif crhold(cr); @@ -561,7 +588,8 @@ zpl_rename2(struct inode *sdip, struct dentry *sdentry, #if !defined(HAVE_IOPS_RENAME_USERNS) && \ !defined(HAVE_RENAME_WANTS_FLAGS) && \ - !defined(HAVE_RENAME2) + !defined(HAVE_RENAME2) && \ + !defined(HAVE_IOPS_RENAME_IDMAP) static int zpl_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) @@ -574,6 +602,9 @@ static int #ifdef HAVE_IOPS_SYMLINK_USERNS zpl_symlink(struct user_namespace *user_ns, struct inode *dir, struct dentry *dentry, const char *name) +#elif defined(HAVE_IOPS_SYMLINK_IDMAP) +zpl_symlink(struct mnt_idmap *user_ns, struct inode *dir, + struct dentry *dentry, const char *name) #else zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) #endif @@ -583,8 +614,8 @@ zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) znode_t *zp; int error; fstrans_cookie_t cookie; -#ifndef HAVE_IOPS_SYMLINK_USERNS - zuserns_t *user_ns = kcred->user_ns; +#if !(defined(HAVE_IOPS_SYMLINK_USERNS) || defined(HAVE_IOPS_SYMLINK_IDMAP)) + zidmap_t *user_ns = kcred->user_ns; #endif crhold(cr); @@ -802,6 +833,8 @@ const struct inode_operations zpl_dir_inode_operations = { .rename2 = zpl_rename2, #elif defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) .rename = zpl_rename2, +#elif defined(HAVE_IOPS_RENAME_IDMAP) + .rename = zpl_rename2, #else .rename = zpl_rename, #endif diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index 4156d686732a..96d85991811e 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -499,7 +499,7 @@ zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, vap->va_gid = crgetgid(cr); error = -zfs_create(dxzp, (char *)name, vap, 0, 0644, &xzp, - cr, ATTR_NOACLCHECK, NULL, kcred->user_ns); + cr, ATTR_NOACLCHECK, NULL, zfs_init_idmap); if (error) goto out; } @@ -738,7 +738,7 @@ __zpl_xattr_user_get(struct inode *ip, const char *name, ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get); static int -__zpl_xattr_user_set(struct user_namespace *user_ns, +__zpl_xattr_user_set(zidmap_t *user_ns, struct inode *ip, const char *name, const void *value, size_t size, int flags) { @@ -848,7 +848,7 @@ __zpl_xattr_trusted_get(struct inode *ip, const char *name, ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get); static int -__zpl_xattr_trusted_set(struct user_namespace *user_ns, +__zpl_xattr_trusted_set(zidmap_t *user_ns, struct inode *ip, const char *name, const void *value, size_t size, int flags) { @@ -918,7 +918,7 @@ __zpl_xattr_security_get(struct inode *ip, const char *name, ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get); static int -__zpl_xattr_security_set(struct user_namespace *user_ns, +__zpl_xattr_security_set(zidmap_t *user_ns, struct inode *ip, const char *name, const void *value, 
size_t size, int flags) { @@ -1061,6 +1061,9 @@ int #ifdef HAVE_SET_ACL_USERNS zpl_set_acl(struct user_namespace *userns, struct inode *ip, struct posix_acl *acl, int type) +#elif defined(HAVE_SET_ACL_IDMAP_DENTRY) +zpl_set_acl(struct mnt_idmap *userns, struct dentry *dentry, + struct posix_acl *acl, int type) #elif defined(HAVE_SET_ACL_USERNS_DENTRY_ARG2) zpl_set_acl(struct user_namespace *userns, struct dentry *dentry, struct posix_acl *acl, int type) @@ -1070,6 +1073,8 @@ zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type) { #ifdef HAVE_SET_ACL_USERNS_DENTRY_ARG2 return (zpl_set_acl_impl(d_inode(dentry), acl, type)); +#elif defined(HAVE_SET_ACL_IDMAP_DENTRY) + return (zpl_set_acl_impl(d_inode(dentry), acl, type)); #else return (zpl_set_acl_impl(ip, acl, type)); #endif /* HAVE_SET_ACL_USERNS_DENTRY_ARG2 */ @@ -1313,7 +1318,7 @@ __zpl_xattr_acl_get_default(struct inode *ip, const char *name, ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default); static int -__zpl_xattr_acl_set_access(struct user_namespace *mnt_ns, +__zpl_xattr_acl_set_access(zidmap_t *mnt_ns, struct inode *ip, const char *name, const void *value, size_t size, int flags) { @@ -1328,12 +1333,12 @@ __zpl_xattr_acl_set_access(struct user_namespace *mnt_ns, if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); -#if defined(HAVE_XATTR_SET_USERNS) +#if defined(HAVE_XATTR_SET_USERNS) || defined(HAVE_XATTR_SET_IDMAP) if (!zpl_inode_owner_or_capable(mnt_ns, ip)) return (-EPERM); #else (void) mnt_ns; - if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) + if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (-EPERM); #endif @@ -1359,7 +1364,7 @@ __zpl_xattr_acl_set_access(struct user_namespace *mnt_ns, ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access); static int -__zpl_xattr_acl_set_default(struct user_namespace *mnt_ns, +__zpl_xattr_acl_set_default(zidmap_t *mnt_ns, struct inode *ip, const char *name, const void *value, size_t size, int flags) { @@ -1374,12 +1379,12 @@ __zpl_xattr_acl_set_default(struct user_namespace *mnt_ns, if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); -#if defined(HAVE_XATTR_SET_USERNS) +#if defined(HAVE_XATTR_SET_USERNS) || defined(HAVE_XATTR_SET_IDMAP) if (!zpl_inode_owner_or_capable(mnt_ns, ip)) return (-EPERM); #else (void) mnt_ns; - if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) + if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (-EPERM); #endif diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 04dfda56b3f1..09c7be853bf9 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -389,7 +389,7 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) #if defined(__linux__) error = zfs_create(dzp, name, &xva.xva_vattr, - 0, 0, &zp, kcred, vflg, &vsec, kcred->user_ns); + 0, 0, &zp, kcred, vflg, &vsec, zfs_init_idmap); #else error = zfs_create(dzp, name, &xva.xva_vattr, 0, 0, &zp, kcred, vflg, &vsec, NULL); @@ -424,7 +424,7 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) } #if defined(__linux__) error = zfs_mkdir(dzp, name, &xva.xva_vattr, - &zp, kcred, vflg, &vsec, kcred->user_ns); + &zp, kcred, vflg, &vsec, zfs_init_idmap); #else error = zfs_mkdir(dzp, name, &xva.xva_vattr, &zp, kcred, vflg, &vsec, NULL); @@ -540,7 +540,7 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) #if defined(__linux__) error = zfs_create(dzp, name, &xva.xva_vattr, - 0, 0, &zp, kcred, vflg, NULL, kcred->user_ns); + 0, 0, &zp, kcred, vflg, NULL, zfs_init_idmap); #else error = 
zfs_create(dzp, name, &xva.xva_vattr, 0, 0, &zp, kcred, vflg, NULL, NULL); @@ -563,7 +563,7 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) #if defined(__linux__) error = zfs_mkdir(dzp, name, &xva.xva_vattr, - &zp, kcred, vflg, NULL, kcred->user_ns); + &zp, kcred, vflg, NULL, zfs_init_idmap); #else error = zfs_mkdir(dzp, name, &xva.xva_vattr, &zp, kcred, vflg, NULL, NULL); @@ -578,7 +578,7 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) link = name + strlen(name) + 1; #if defined(__linux__) error = zfs_symlink(dzp, name, &xva.xva_vattr, - link, &zp, kcred, vflg, kcred->user_ns); + link, &zp, kcred, vflg, zfs_init_idmap); #else error = zfs_symlink(dzp, name, &xva.xva_vattr, link, &zp, kcred, vflg, NULL); @@ -699,7 +699,7 @@ do_zfs_replay_rename(zfsvfs_t *zfsvfs, lr_rename_t *lr, char *sname, #if defined(__linux__) error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags, - wo_vap, kcred->user_ns); + wo_vap, zfs_init_idmap); #else error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg, rflags, wo_vap, NULL); @@ -977,7 +977,7 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) lr->lr_uid, lr->lr_gid); #if defined(__linux__) - error = zfs_setattr(zp, vap, 0, kcred, kcred->user_ns); + error = zfs_setattr(zp, vap, 0, kcred, zfs_init_idmap); #else error = zfs_setattr(zp, vap, 0, kcred, NULL); #endif diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 91b594e41cda..a6a27222bf4c 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -172,14 +172,14 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) if (flag & V_ACE_MASK) #if defined(__linux__) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, - kcred->user_ns); + zfs_init_idmap); #else error = zfs_zaccess(zp, mode, flag, B_FALSE, cr, NULL); #endif else #if defined(__linux__) - error = zfs_zaccess_rwx(zp, mode, flag, cr, kcred->user_ns); + error = zfs_zaccess_rwx(zp, mode, flag, cr, zfs_init_idmap); #else error = zfs_zaccess_rwx(zp, mode, flag, cr, NULL); #endif From 678a3b8f999b9571691187f5224164c8bc58ec34 Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Mon, 10 Apr 2023 15:24:27 -0600 Subject: [PATCH 041/180] Trim needless zeroes from checksum events The ereport.fs.zfs.checksum event contains histograms of the bits that were wrongly set or cleared according to their bit position in a 64-bit word. So the maximum value that any histogram bucket could have would be 64. But ZFS currently uses a uint32_t to hold each bucket. As a result, the event report is full of needless zeroes. Change the bucket size to uint8_t, stripping 768 needless zeros from each event. 
Original event format: ``` class=ereport.fs.zfs.checksum ena=639460469834258433 pool=testpool.1933 pool_guid=4979719877084416563 pool_state=0 pool_context=0 pool_failmode=wait vdev_guid=4136721804819128578 vdev_type=file vdev_path=/tmp/kyua.1TxP3A/2/work/file1.1933 vdev_ashift=9 vdev_complete_ts=609837019678 vdev_delta_ts=33450 vdev_read_errors=0 vdev_write_errors=0 vdev_cksum_errors=20 vdev_delays=0 parent_guid=2751977006639883417 parent_type=raidz vdev_spare_guids= zio_err=0 zio_flags=1048752 zio_stage=4194304 zio_pipeline=65011712 zio_delay=0 zio_timestamp=0 zio_delta=0 zio_priority=4 zio_offset=702976 zio_size=1024 zio_objset=24 zio_object=0 zio_level=3 zio_blkid=0 bad_ranges=0000000000000400 bad_ranges_min_gap=8 bad_range_sets=0000079e bad_range_clears=00000854 bad_set_histogram=000000210000001a000000150000001d000000240000001b000000220000001b000000210000002100000018000000260000002300000025000000210000001e000000250000001b0000001d0000001e0000001600000025000000180000001b000000240000001b000000240000001b0000001c000000210000001b0000001e000000210000001a0000001e000000220000001d0000001b000000200000001f0000001a000000250000001f0000001d0000001b0000001d000000240000001d0000001b0000001b0000001f00000024000000190000001a0000001f0000001e000000240000001e0000002400000021000000200000001d0000001d00000021 bad_cleared_histogram=000000220000002700000021000000210000001b0000001a000000250000001f0000001c0000001e0000002400000022000000220000002400000022000000240000002200000021000000220000001b0000002100000021000000190000001b000000240000002400000020000000290000002a00000028000000250000002400000020000000270000002500000016000000270000001c000000210000001f000000240000001c0000002100000022000000240000002100000023000000210000002700000022000000240000001b00000022000000210000001c00000023000000150000002600000020000000270000001e0000001d0000002400000026 time=00000016806457270000000323406839 eid=458 ``` New format: ``` class=ereport.fs.zfs.checksum ena=96599319807790081 pool=testpool.1933 pool_guid=1236902063710799041 pool_state=0 pool_context=0 pool_failmode=wait vdev_guid=2774253874431514999 vdev_type=file vdev_path=/tmp/kyua.6Temlq/2/work/file1.1933 vdev_ashift=9 vdev_complete_ts=92124283803 vdev_delta_ts=46670 vdev_read_errors=0 vdev_write_errors=0 vdev_cksum_errors=20 vdev_delays=0 parent_guid=8090931855087882905 parent_type=raidz vdev_spare_guids= zio_err=0 zio_flags=1048752 zio_stage=4194304 zio_pipeline=65011712 zio_delay=0 zio_timestamp=0 zio_delta=0 zio_priority=4 zio_offset=1028608 zio_size=512 zio_objset=0 zio_object=0 zio_level=0 zio_blkid=4 bad_ranges=0000000000000200 bad_ranges_min_gap=8 bad_range_sets=0000061f bad_range_clears=000001f4 bad_set_histogram=1719161c1c1c101618171a151a1a19161e1c171d1816161c191f1a18192117191c131d171b1613151a171419161a1b1319101b14171b18151e191a1b141a1c17 bad_cleared_histogram=06090a0808070a0b020609060506090a01090a050a0a0509070609080d050d0607080d060507080c04070807070a0608020c080c080908040808090a05090a07 time=00000016806477050000000604157480 eid=62 ``` Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: Alan Somers Sponsored-by: Axcient Closes #14716 --- module/zfs/zfs_fm.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index 7169e49ac46a..bdd0e96c327a 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -755,8 +755,8 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, typedef struct zfs_ecksum_info { /* histograms of set and cleared bits by bit number in a 64-bit word */ - 
uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY]; - uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; + uint8_t zei_histogram_set[sizeof (uint64_t) * NBBY]; + uint8_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; /* inline arrays of bits set and cleared. */ uint64_t zei_bits_set[ZFM_MAX_INLINE]; @@ -781,7 +781,7 @@ typedef struct zfs_ecksum_info { } zfs_ecksum_info_t; static void -update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count) +update_histogram(uint64_t value_arg, uint8_t *hist, uint32_t *count) { size_t i; size_t bits = 0; @@ -1052,10 +1052,10 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, } else { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, - DATA_TYPE_UINT32_ARRAY, + DATA_TYPE_UINT8_ARRAY, NBBY * sizeof (uint64_t), eip->zei_histogram_set, FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, - DATA_TYPE_UINT32_ARRAY, + DATA_TYPE_UINT8_ARRAY, NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, NULL); } From 574e09d8c6b49f223417e9aadb14367ac9db18f6 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Wed, 12 Apr 2023 17:53:53 +0200 Subject: [PATCH 042/180] Fix in check_filesystem() Fix the code in case of missing snapshots. Previously the check was in a conditional that would be executed if the filesystem had snapshots. Reviewed-by: Brian Behlendorf Reviewed-by: Tino Reichardt Signed-off-by: George Amanakis Closes #14735 --- module/zfs/spa_errlog.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index af144ef16978..3bc8619b51a8 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -354,12 +354,12 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, dsl_dataset_rele(ds, FTAG); return (error); } + } - if (snap_count == 0) { - /* Filesystem without snapshots. */ - dsl_dataset_rele(ds, FTAG); - return (0); - } + if (snap_count == 0) { + /* Filesystem without snapshots. */ + dsl_dataset_rele(ds, FTAG); + return (0); } uint64_t *snap_obj_array = kmem_zalloc(snap_count * sizeof (uint64_t), From 6e015933f88fe7ba5de45cf263028de1ee04460a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Wed, 12 Apr 2023 19:08:49 +0200 Subject: [PATCH 043/180] initramfs: source user scripts from /e/z/initramfs-tools-load-key{,.d/*} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By dropping in a file in a directory (for packages) or by making a file (for local administrators), custom key loading methods may be provided for the rootfs and necessities. Reviewed-by: Brian Behlendorf Signed-off-by: Nicholas Morris Signed-off-by: Ahelenia Ziemiańska Co-authored-by: Nicholas Morris Supersedes: #14704 Closes: #13757 Closes #14733 --- contrib/initramfs/README.md | 25 ++++++++++++++++++++++++- contrib/initramfs/hooks/zfs.in | 3 +++ contrib/initramfs/scripts/zfs | 10 ++++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/contrib/initramfs/README.md b/contrib/initramfs/README.md index 34e9bab3c756..68647fa9fc3d 100644 --- a/contrib/initramfs/README.md +++ b/contrib/initramfs/README.md @@ -78,7 +78,30 @@ To use this feature: 1. Install the `dropbear-initramfs` package. You may wish to uninstall the `cryptsetup-initramfs` package to avoid warnings. 2. Add your SSH key(s) to `/etc/dropbear-initramfs/authorized_keys`. 
Note - that Dropbear does not support ed25519 keys before version 2020.79; + that Dropbear does not support ed25519 keys before version 2020.79; in that case, use RSA (2048-bit or more) instead. 3. Rebuild the initramfs with your keys: `update-initramfs -u` 4. During the system boot, login via SSH and run: `zfsunlock` + +### Unlocking a ZFS encrypted root via alternate means + +If present, a shell program at `/etc/zfs/initramfs-tools-load-key` +and files matching `/etc/zfs/initramfs-tools-load-key.d/*` +will be copied to the initramfs during generation +and sourced to load the key, if required. + +The `$ENCRYPTIONROOT` to load the key for and `$KEYLOCATION` variables are set, +and all initramfs-tools functions are available; +use unquoted `$ZPOOL` and `$ZFS` to run `zpool` and `zfs`. + +A successful return (and loaded key) stops the search. +A failure return is non-fatal, +and loading keys proceeds as normal if no hook succeeds. + +A trivial example of a key-loading drop-in that uses the BLAKE2 checksum +of the file at the `keylocation` as the key follows. + +```sh +key="$(b2sum "${KEYLOCATION#file://}")" || return +printf '%s\n' "${key%% *}" | $ZFS load-key -L prompt "$ENCRYPTIONROOT" +``` diff --git a/contrib/initramfs/hooks/zfs.in b/contrib/initramfs/hooks/zfs.in index 28dd252eea52..6cd7e6f1ea3b 100755 --- a/contrib/initramfs/hooks/zfs.in +++ b/contrib/initramfs/hooks/zfs.in @@ -41,6 +41,9 @@ copy_file cache "@sysconfdir@/zfs/zpool.cache" copy_file config "@initconfdir@/zfs" copy_file config "@sysconfdir@/zfs/zfs-functions" copy_file config "@sysconfdir@/zfs/vdev_id.conf" +for f in "@sysconfdir@/zfs/initramfs-tools-load-key" "@sysconfdir@/zfs/initramfs-tools-load-key.d/"*; do + copy_file config "$f" +done copy_file rule "@udevruledir@/60-zvol.rules" copy_file rule "@udevruledir@/69-vdev.rules" diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs index 23aa95efc8f4..7f977a30f75b 100644 --- a/contrib/initramfs/scripts/zfs +++ b/contrib/initramfs/scripts/zfs @@ -420,6 +420,16 @@ decrypt_fs() # Continue only if the key needs to be loaded [ "$KEYSTATUS" = "unavailable" ] || return 0 + # Try extensions first + for f in "/etc/zfs/initramfs-tools-load-key" "/etc/zfs/initramfs-tools-load-key.d/"*; do + [ -r "$f" ] || continue + (. "$f") && { + # Successful return and actually-loaded key: we're done + KEYSTATUS="$(get_fs_value "${ENCRYPTIONROOT}" keystatus)" + [ "$KEYSTATUS" = "unavailable" ] || return 0 + } + done + # Do not prompt if key is stored noninteractively, if ! [ "${KEYLOCATION}" = "prompt" ]; then $ZFS load-key "${ENCRYPTIONROOT}" From c71fe716401f6919068f84b389dcd1b7ec2b8b0e Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Thu, 13 Apr 2023 08:15:05 +0900 Subject: [PATCH 044/180] Fix data corruption when cloning embedded blocks Don't overwrite blk_phys_birth, as for embedded blocks it is part of the payload. 
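(Background: an embedded block pointer stores its compressed payload inline in the blkptr_t itself, reusing the words that normally hold the DVAs, checksum, and physical birth txg, so copying blk_phys_birth from the source block pointer would clobber part of the embedded data in the clone.)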
Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Issue #13392 Closes #14739 --- module/zfs/dmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index ce985d833f58..cda1472a77aa 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2312,8 +2312,10 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dl->dr_overridden_by.blk_phys_birth = 0; } else { dl->dr_overridden_by.blk_birth = dr->dr_txg; - dl->dr_overridden_by.blk_phys_birth = - BP_PHYSICAL_BIRTH(bp); + if (!BP_IS_EMBEDDED(bp)) { + dl->dr_overridden_by.blk_phys_birth = + BP_PHYSICAL_BIRTH(bp); + } } mutex_exit(&db->db_mtx); From 27a82cbb3ef2e30a54860b955fb257fb7f8307cd Mon Sep 17 00:00:00 2001 From: youzhongyang Date: Thu, 13 Apr 2023 12:12:03 -0400 Subject: [PATCH 045/180] Linux 6.3 compat: Fix memcpy "detected field-spanning write" error Add a new union member of flexible array to dnode_phys_t and use it in the macro so we can silence the memcpy() fortify error. Reviewed-by: Brian Behlendorf Signed-off-by: Youzhong Yang Closes #14737 --- include/sys/dnode.h | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/include/sys/dnode.h b/include/sys/dnode.h index 2d741ea36bd0..dbe7350d4da7 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -120,7 +120,11 @@ extern "C" { #define DN_MAX_LEVELS (DIV_ROUND_UP(DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT, \ DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT) + 1) -#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ +/* + * Use the flexible array instead of the fixed length one dn_bonus + * to address memcpy/memmove fortify error + */ +#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus_flexible + \ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) #define DN_MAX_BONUS_LEN(dnp) \ ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? \ @@ -266,6 +270,10 @@ typedef struct dnode_phys { sizeof (blkptr_t)]; blkptr_t dn_spill; }; + struct { + blkptr_t __dn_ignore4; + uint8_t dn_bonus_flexible[]; + }; }; } dnode_phys_t; From ac18dc77f3703940682aecb442f4e58aa2c14f1a Mon Sep 17 00:00:00 2001 From: dodexahedron Date: Thu, 13 Apr 2023 09:15:34 -0700 Subject: [PATCH 046/180] Minor improvements to zpoolconcepts.7 * Fixed one typo (effects -> affects) * Re-worded raidz description to make it clearer that it is not quite the same as RAID5, though similar * Clarified that data is not necessarily written in a static stripe width * Minor grammar consistency improvement * Noted that "volumes" means zvols * Fixed a couple of split infinitives * Clarified that hot spares come from the same pool they were assigned to * "we" -> ZFS * Fixed warnings thrown by mandoc, and removed unnecessary wordiness in one fixed line. Reviewed-by: Brian Behlendorf Signed-off-by: Brandon Thetford Closes #14726 --- man/man7/zpoolconcepts.7 | 64 +++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7 index 4ef96b157564..db3fd4926236 100644 --- a/man/man7/zpoolconcepts.7 +++ b/man/man7/zpoolconcepts.7 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd June 2, 2021 +.Dd April 7, 2023 .Dt ZPOOLCONCEPTS 7 .Os . @@ -36,7 +36,7 @@ . 
.Sh DESCRIPTION .Ss Virtual Devices (vdevs) -A "virtual device" describes a single device or a collection of devices +A "virtual device" describes a single device or a collection of devices, organized according to certain performance and fault characteristics. The following virtual devices are supported: .Bl -tag -width "special" @@ -66,13 +66,14 @@ A mirror of two or more devices. Data is replicated in an identical fashion across all components of a mirror. A mirror with .Em N No disks of size Em X No can hold Em X No bytes and can withstand Em N-1 -devices failing without losing data. +devices failing, without losing data. .It Sy raidz , raidz1 , raidz2 , raidz3 -A variation on RAID-5 that allows for better distribution of parity and -eliminates the RAID-5 -.Qq write hole +A distributed-parity layout, similar to RAID-5/6, with improved distribution of +parity, and which does not suffer from the RAID-5/6 +.Qq write hole , .Pq in which data and parity become inconsistent after a power loss . -Data and parity is striped across all disks within a raidz group. +Data and parity is striped across all disks within a raidz group, though not +necessarily in a consistent stripe width. .Pp A raidz group can have single, double, or triple parity, meaning that the raidz group can sustain one, two, or three failures, respectively, without @@ -96,8 +97,8 @@ The minimum number of devices in a raidz group is one more than the number of parity disks. The recommended number is between 3 and 9 to help increase performance. .It Sy draid , draid1 , draid2 , draid3 -A variant of raidz that provides integrated distributed hot spares which -allows for faster resilvering while retaining the benefits of raidz. +A variant of raidz that provides integrated distributed hot spares, allowing +for faster resilvering, while retaining the benefits of raidz. A dRAID vdev is constructed from multiple internal raidz groups, each with .Em D No data devices and Em P No parity devices . These groups are distributed over all of the children in order to fully @@ -105,12 +106,12 @@ utilize the available disk performance. .Pp Unlike raidz, dRAID uses a fixed stripe width (padding as necessary with zeros) to allow fully sequential resilvering. -This fixed stripe width significantly effects both usable capacity and IOPS. +This fixed stripe width significantly affects both usable capacity and IOPS. For example, with the default .Em D=8 No and Em 4 KiB No disk sectors the minimum allocation size is Em 32 KiB . If using compression, this relatively large allocation size can reduce the effective compression ratio. -When using ZFS volumes and dRAID, the default of the +When using ZFS volumes (zvols) and dRAID, the default of the .Sy volblocksize property is increased to account for the allocation size. If a dRAID pool will hold a significant amount of small blocks, it is @@ -118,7 +119,7 @@ recommended to also add a mirrored .Sy special vdev to store those blocks. .Pp -In regards to I/O, performance is similar to raidz since for any read all +In regards to I/O, performance is similar to raidz since, for any read, all .Em D No data disks must be accessed . Delivered random IOPS can be reasonably approximated as .Sy floor((N-S)/(D+P))*single_drive_IOPS . @@ -178,7 +179,7 @@ For more information, see the .Sx Intent Log section. .It Sy dedup -A device dedicated solely for deduplication tables. +A device solely dedicated for deduplication tables. The redundancy of this device should match the redundancy of the other normal devices in the pool. 
If more than one dedup device is specified, then @@ -230,7 +231,7 @@ each a mirror of two disks: ZFS supports a rich set of mechanisms for handling device failure and data corruption. All metadata and data is checksummed, and ZFS automatically repairs bad data -from a good copy when corruption is detected. +from a good copy, when corruption is detected. .Pp In order to take advantage of these features, a pool must make use of some form of redundancy, using either mirrored or raidz groups. @@ -247,7 +248,7 @@ A faulted pool has corrupted metadata, or one or more faulted devices, and insufficient replicas to continue functioning. .Pp The health of the top-level vdev, such as a mirror or raidz device, -is potentially impacted by the state of its associated vdevs, +is potentially impacted by the state of its associated vdevs or component devices. A top-level vdev or component device is in one of the following states: .Bl -tag -width "DEGRADED" @@ -319,14 +320,15 @@ In this case, checksum errors are reported for all disks on which the block is stored. .Pp If a device is removed and later re-attached to the system, -ZFS attempts online the device automatically. +ZFS attempts to bring the device online automatically. Device attachment detection is hardware-dependent and might not be supported on all platforms. . .Ss Hot Spares ZFS allows devices to be associated with pools as .Qq hot spares . -These devices are not actively used in the pool, but when an active device +These devices are not actively used in the pool. +But, when an active device fails, it is automatically replaced by a hot spare. To create a pool with hot spares, specify a .Sy spare @@ -343,10 +345,10 @@ Once a spare replacement is initiated, a new .Sy spare vdev is created within the configuration that will remain there until the original device is replaced. -At this point, the hot spare becomes available again if another device fails. +At this point, the hot spare becomes available again, if another device fails. .Pp -If a pool has a shared spare that is currently being used, the pool can not be -exported since other pools may use this shared spare, which may lead to +If a pool has a shared spare that is currently being used, the pool cannot be +exported, since other pools may use this shared spare, which may lead to potential data corruption. .Pp Shared spares add some risk. @@ -390,7 +392,7 @@ See the .Sx EXAMPLES section for an example of mirroring multiple log devices. .Pp -Log devices can be added, replaced, attached, detached and removed. +Log devices can be added, replaced, attached, detached, and removed. In addition, log devices are imported and exported as part of the pool that contains them. Mirrored devices can be removed by specifying the top-level mirror vdev. @@ -423,8 +425,8 @@ This can be disabled by setting .Sy l2arc_rebuild_enabled Ns = Ns Sy 0 . For cache devices smaller than .Em 1 GiB , -we do not write the metadata structures -required for rebuilding the L2ARC in order not to waste space. +ZFS does not write the metadata structures +required for rebuilding the L2ARC, to conserve space. This can be changed with .Sy l2arc_rebuild_blocks_min_l2size . The cache device header @@ -435,21 +437,21 @@ Setting will result in scanning the full-length ARC lists for cacheable content to be written in L2ARC (persistent ARC). 
If a cache device is added with -.Nm zpool Cm add -its label and header will be overwritten and its contents are not going to be +.Nm zpool Cm add , +its label and header will be overwritten and its contents will not be restored in L2ARC, even if the device was previously part of the pool. If a cache device is onlined with -.Nm zpool Cm online +.Nm zpool Cm online , its contents will be restored in L2ARC. -This is useful in case of memory pressure +This is useful in case of memory pressure, where the contents of the cache device are not fully restored in L2ARC. -The user can off- and online the cache device when there is less memory pressure -in order to fully restore its contents to L2ARC. +The user can off- and online the cache device when there is less memory +pressure, to fully restore its contents to L2ARC. . .Ss Pool checkpoint Before starting critical procedures that include destructive actions .Pq like Nm zfs Cm destroy , -an administrator can checkpoint the pool's state and in the case of a +an administrator can checkpoint the pool's state and, in the case of a mistake or failure, rewind the entire pool back to the checkpoint. Otherwise, the checkpoint can be discarded when the procedure has completed successfully. @@ -485,7 +487,7 @@ current state of the pool won't be scanned during a scrub. . .Ss Special Allocation Class Allocations in the special class are dedicated to specific block types. -By default this includes all metadata, the indirect blocks of user data, and +By default, this includes all metadata, the indirect blocks of user data, and any deduplication tables. The class can also be provisioned to accept small file blocks. .Pp From 3b5af2013992231645e0a462eef4171d1c48de17 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Tue, 18 Apr 2023 08:42:09 +0900 Subject: [PATCH 047/180] Fix VERIFY(!zil_replaying(zilog, tx)) panic The zfs_log_clone_range() function is never called from the zfs_clone_range_replay() function, so I assumed it is safe to assert that zil_replaying() is never TRUE here. It turns out zil_replaying() also returns TRUE when the sync property is set to disabled. Fix the problem by just returning if zil_replaying() returns TRUE. Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Reported by: Florian Smeets Signed-off-by: Pawel Jakub Dawidek Closes #14758 --- module/zfs/zfs_log.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c index d009c58d8644..50325907b0d1 100644 --- a/module/zfs/zfs_log.c +++ b/module/zfs/zfs_log.c @@ -905,9 +905,7 @@ zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, uint64_t partlen, max_log_data; size_t i, partnbps; - VERIFY(!zil_replaying(zilog, tx)); - - if (zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked) return; max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t)); From 8ed62440eff5a3ea93b0a9b6f0b7e5e7290b667a Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Mon, 17 Apr 2023 20:38:09 -0400 Subject: [PATCH 048/180] Work around Raspberry Pi kernel packaging oddities On Debian and Ubuntu and friends, you get something like "linux-image-$(uname -r)" and "linux-headers-$(uname -r)" you can put a Depends on. On Raspberry Pi OS, you get "raspberrypi-kernel" and "raspberrypi-kernel-headers", with version numbers like 20230411. 
There is not, as far as I can tell, a reasonable way to map that to a kernel version short of reaching out and digging around in the changelogs or Makefile, so just special-case it so the packages don't fail to install at install time. They still might not build if the versions don't match, but I don't see a way to do anything about that... Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #14745 Closes #14747 --- contrib/debian/control.modules.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/debian/control.modules.in b/contrib/debian/control.modules.in index 70a165266d16..34eb7fafba7c 100644 --- a/contrib/debian/control.modules.in +++ b/contrib/debian/control.modules.in @@ -5,7 +5,7 @@ Maintainer: ZFS on Linux specific mailing list Build-Depends: debhelper-compat (= 10), dkms (>> 2.1.1.2-5), libtool, - linux-headers-_KVERS_ + linux-headers-_KVERS_ | raspberrypi-kernel-headers Standards-Version: 4.3.0 Homepage: http://www.openzfs.org/ Vcs-Git: https://github.com/openzfs/zfs.git @@ -14,7 +14,7 @@ Vcs-Browser: https://github.com/openzfs/zfs Package: openzfs-zfs-modules-_KVERS_ Architecture: _ARCH_ Provides: openzfs-zfs-modules -Depends: linux-image-_KVERS_ +Depends: linux-image-_KVERS_ | raspberrypi-kernel Recommends: openzfsutils Replaces: zfs-modules-_KVERS_ Conflicts: zfs-modules-_KVERS_ From accfdeb948dab87a9576084a582f5bfeda0bd2f2 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 18 Apr 2023 08:41:52 -0700 Subject: [PATCH 049/180] Revert "ZFS_IOC_COUNT_FILLED does unnecessary txg_wait_synced()" This reverts commit 4b3133e671b958fa2c915a4faf57812820124a7b. Users identified this commit as a possible source of data corruption: https://github.com/openzfs/zfs/issues/14753 Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Issue #14753 Closes #14761 --- module/zfs/dnode.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index 367bfaa80726..d15268cd7bc7 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -1764,29 +1764,20 @@ dnode_try_claim(objset_t *os, uint64_t object, int slots) } /* - * Checks if the dnode might contain any uncommitted changes to data blocks. - * Dirty metadata (e.g. bonus buffer) does not count. + * Checks if the dnode contains any uncommitted dirty records. */ boolean_t dnode_is_dirty(dnode_t *dn) { mutex_enter(&dn->dn_mtx); + for (int i = 0; i < TXG_SIZE; i++) { - list_t *list = &dn->dn_dirty_records[i]; - for (dbuf_dirty_record_t *dr = list_head(list); - dr != NULL; dr = list_next(list, dr)) { - if (dr->dr_dbuf == NULL || - (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) { - mutex_exit(&dn->dn_mtx); - return (B_TRUE); - } - } - if (dn->dn_free_ranges[i] != NULL) { + if (multilist_link_active(&dn->dn_dirty_link[i])) { mutex_exit(&dn->dn_mtx); return (B_TRUE); } } + mutex_exit(&dn->dn_mtx); return (B_FALSE); @@ -2650,9 +2641,7 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_phys->dn_nlevels == 0) { - if (!(flags & DNODE_FIND_HOLE)) { - error = SET_ERROR(ESRCH); - } + error = SET_ERROR(ESRCH); goto out; } From f9e1c63f8c32141bb18c0270d565e3bfc1bbd233 Mon Sep 17 00:00:00 2001 From: Low-power Date: Wed, 19 Apr 2023 02:34:41 +0800 Subject: [PATCH 050/180] Values printed by zpool-iostat(8) should be right-aligned This inappropriate left-alignment was introduced in 7bb7b1f. 
Reviewed-by: Tony Hutter Reviewed-by: Matthew Ahrens Reviewed-by: Tino Reichardt Signed-off-by: WHR Closes #14751 --- cmd/zpool/zpool_main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 20f9cd679534..27e805943443 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -4272,13 +4272,17 @@ print_iostat_header(iostat_cbdata_t *cb) * by order of magnitude. Uses column_size to add padding. */ static void -print_stat_color(char *statbuf, unsigned int column_size) +print_stat_color(const char *statbuf, unsigned int column_size) { fputs(" ", stdout); + size_t len = strlen(statbuf); + while (len < column_size) { + fputc(' ', stdout); + column_size--; + } if (*statbuf == '0') { color_start(ANSI_GRAY); fputc('0', stdout); - column_size--; } else { for (; *statbuf; statbuf++) { if (*statbuf == 'K') color_start(ANSI_GREEN); @@ -4293,8 +4297,6 @@ print_stat_color(char *statbuf, unsigned int column_size) } } color_end(); - for (; column_size > 0; column_size--) - fputc(' ', stdout); } /* From 23f84d161ed0ef91854be6da301ccca5ede11eec Mon Sep 17 00:00:00 2001 From: youzhongyang Date: Tue, 18 Apr 2023 21:10:40 -0400 Subject: [PATCH 051/180] Silence clang warning of flexible array not at end Reviewed-by: Brian Behlendorf Reviewed-by: Jorgen Lundman Signed-off-by: Youzhong Yang Closes #14764 --- include/sys/dmu_objset.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index d22c682875d8..9f6e0fdd601b 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -72,6 +72,10 @@ struct dmu_tx; */ #define OBJSET_CRYPT_PORTABLE_FLAGS_MASK (0) +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgnu-variable-sized-type-not-at-end" +#endif typedef struct objset_phys { dnode_phys_t os_meta_dnode; zil_header_t os_zil_header; @@ -88,6 +92,9 @@ typedef struct objset_phys { char os_pad1[OBJSET_PHYS_SIZE_V3 - OBJSET_PHYS_SIZE_V2 - sizeof (dnode_phys_t)]; } objset_phys_t; +#if defined(__clang__) +#pragma clang diagnostic pop +#endif typedef int (*dmu_objset_upgrade_cb_t)(objset_t *); From 3d37e7e5f540f513ab1d8fa61d9208c43b889401 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Wed, 19 Apr 2023 18:03:42 +0200 Subject: [PATCH 052/180] zfsprops.7: update mandlock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h=f7e33bdbd6d1bdf9c3df8bba5abcf3399f957ac3 https://git.kernel.org/pub/scm/docs/man-pages/man-pages.git/commit/?id=7e59106e9c34458540f7d382d5b49071d1b7104f Fixes: commit fb9baa9b2045a193a3caf0a46b5cac5ef7a84b61 ("zfsprops.8: remove nbmand-not-used-on-Linux and pointer to mount(8)") Reviewed-by: Brian Behlendorf Signed-off-by: Ahelenia Ziemiańska Closes #14765 --- man/man7/zfsprops.7 | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 index 1f6ffb59e32f..8f6b919cfc0b 100644 --- a/man/man7/zfsprops.7 +++ b/man/man7/zfsprops.7 @@ -38,7 +38,7 @@ .\" Copyright (c) 2019, Kjeld Schouten-Lebbing .\" Copyright (c) 2022 Hewlett Packard Enterprise Development LP. .\" -.Dd July 21, 2022 +.Dd April 18, 2023 .Dt ZFSPROPS 7 .Os . @@ -80,7 +80,9 @@ for zettabyte The following are all valid .Pq and equal specifications: -.Li 1536M, 1.5g, 1.50GB . +.Li 1536M , +.Li 1.5g , +.Li 1.50GB . 
.Pp The values of non-numeric properties are case sensitive and must be lowercase, except for @@ -1254,10 +1256,12 @@ location. Controls whether the file system should be mounted with .Sy nbmand .Pq Non-blocking mandatory locks . -This is used for SMB clients. Changes to this property only take effect when the file system is umounted and remounted. -Support for these locks is scarce and not described by POSIX. +This was only supported by Linux prior to 5.15, and was buggy there, +and is not supported by +.Fx . +On Solaris it's used for SMB clients. .It Sy overlay Ns = Ns Sy on Ns | Ns Sy off Allow mounting on a busy directory or a directory which already contains files or directories. From 719534ca8e8d39e94aef35d753c1c41179791ef5 Mon Sep 17 00:00:00 2001 From: Ameer Hamza <106930537+ixhamza@users.noreply.github.com> Date: Wed, 19 Apr 2023 21:04:32 +0500 Subject: [PATCH 053/180] Fix "Detach spare vdev in case if resilvering does not happen" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Spare vdev should detach from the pool when a disk is reinserted. However, spare detachment depends on the completion of resilvering, and if resilver does not schedule, the spare vdev keeps attached to the pool until the next resilvering. When a zfs pool contains several disks (25+ mirror), resilvering does not always happen when a disk is reinserted. In this patch, spare vdev is manually detached from the pool when resilvering does not occur and it has been tested on both Linux and FreeBSD. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #14722 --- include/sys/spa.h | 1 + module/zfs/spa.c | 5 +++-- module/zfs/vdev.c | 12 +++++++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index 79c46aa07709..a1c102020223 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -787,6 +787,7 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); #define SPA_ASYNC_L2CACHE_REBUILD 0x800 #define SPA_ASYNC_L2CACHE_TRIM 0x1000 #define SPA_ASYNC_REBUILD_DONE 0x2000 +#define SPA_ASYNC_DETACH_SPARE 0x4000 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); diff --git a/module/zfs/spa.c b/module/zfs/spa.c index dc202978c0f6..e76700a9caa3 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -7058,7 +7058,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent - * is a replacing vdev. + * is a replacing or a spare vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) @@ -8281,7 +8281,8 @@ spa_async_thread(void *arg) * If any devices are done replacing, detach them. 
*/ if (tasks & SPA_ASYNC_RESILVER_DONE || - tasks & SPA_ASYNC_REBUILD_DONE) { + tasks & SPA_ASYNC_REBUILD_DONE || + tasks & SPA_ASYNC_DETACH_SPARE) { spa_vdev_resilver_done(spa); } diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 7cf858c05051..241be8fd856c 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4183,9 +4183,19 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if (wasoffline || (oldstate < VDEV_STATE_DEGRADED && - vd->vdev_state >= VDEV_STATE_DEGRADED)) + vd->vdev_state >= VDEV_STATE_DEGRADED)) { spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); + /* + * Asynchronously detach spare vdev if resilver or + * rebuild is not required + */ + if (vd->vdev_unspare && + !dsl_scan_resilvering(spa->spa_dsl_pool) && + !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) && + !vdev_rebuild_active(tvd)) + spa_async_request(spa, SPA_ASYNC_DETACH_SPARE); + } return (spa_vdev_state_exit(spa, vd, 0)); } From d4657835c8a5da816ab098fd2f1d62480865d087 Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Wed, 19 Apr 2023 13:20:02 -0700 Subject: [PATCH 054/180] ZTS: send-c_volume is flaky We use block_device_wait to wait for the zvol block device to actually appear, and we log the result of the dd calls by using an intermediate file. Reviewed-by: George Melikov Reviewed-by: John Wren Kennedy Reviewed-by: Brian Behlendorf Signed-off-by: Paul Dagnelie Closes #14767 --- tests/zfs-tests/tests/functional/rsend/send-c_volume.ksh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/zfs-tests/tests/functional/rsend/send-c_volume.ksh b/tests/zfs-tests/tests/functional/rsend/send-c_volume.ksh index 988ed91b9918..1bf234823459 100755 --- a/tests/zfs-tests/tests/functional/rsend/send-c_volume.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send-c_volume.ksh @@ -29,6 +29,7 @@ function cleanup { + rm $BACKDIR/copy log_must_busy zfs destroy -r $vol cleanup_pool $POOL2 } @@ -60,7 +61,9 @@ log_must eval "zfs recv -d $POOL2 <$BACKDIR/full" verify_stream_size $BACKDIR/full $vol verify_stream_size $BACKDIR/full $vol2 -md5=$(dd if=$voldev2 bs=1024k count=$megs 2>/dev/null | md5digest) +block_device_wait $voldev2 +log_must dd if=$voldev2 of=$BACKDIR/copy bs=1024k count=$megs +md5=$(md5digest $BACKDIR/copy) [[ $md5 = $md5_1 ]] || log_fail "md5 mismatch: $md5 != $md5_1" # Repeat, for an incremental send @@ -72,7 +75,9 @@ log_must eval "zfs recv -d $POOL2 <$BACKDIR/inc" verify_stream_size $BACKDIR/inc $vol 90 $vol@snap verify_stream_size $BACKDIR/inc $vol2 90 $vol2@snap -md5=$(dd skip=$megs if=$voldev2 bs=1024k count=$megs 2>/dev/null | md5digest) +block_device_wait $voldev2 +log_must dd skip=$megs if=$voldev2 of=$BACKDIR/copy bs=1024k count=$megs +md5=$(md5digest $BACKDIR/copy) [[ $md5 = $md5_2 ]] || log_fail "md5 mismatch: $md5 != $md5_2" log_pass "Verify compressed send works with volumes" From 71d191ef25d1c60e6725c07b6b94a0184f7db2eb Mon Sep 17 00:00:00 2001 From: Herb Wartens Date: Wed, 19 Apr 2023 13:22:59 -0700 Subject: [PATCH 055/180] Allow MMP to bypass waiting for other threads At our site we have seen cases when multi-modifier protection is enabled (multihost=on) on our pool and the pool gets suspended due to a single disk that is failing and responding very slowly. Our pools have 90 disks in them and we expect disks to fail. The current version of MMP requires that we wait for other writers before moving on. When a disk is responding very slowly, we observed that waiting here was bad enough to cause the pool to suspend. 
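For context, the protection in question is the pool-level multihost property
mentioned above; on a hypothetical pool named tank it is enabled and inspected
with:

  % zpool set multihost=on tank
  % zpool get multihost tank
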
This change allows the MMP thread to bypass waiting for other threads and reduces the chances the pool gets suspended. Reviewed-by: Brian Behlendorf Signed-off-by: Herb Wartens Closes #14659 --- include/sys/spa.h | 2 ++ module/zfs/mmp.c | 2 +- module/zfs/spa_misc.c | 29 ++++++++++++++++++++++++++--- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index a1c102020223..b96a9ef1d42f 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -977,6 +977,8 @@ extern int spa_import_progress_set_state(uint64_t pool_guid, extern int spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw); +extern void spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, + krw_t rw); extern void spa_config_exit(spa_t *spa, int locks, const void *tag); extern int spa_config_held(spa_t *spa, int locks, krw_t rw); diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c index ef0e01df390f..25eea0752941 100644 --- a/module/zfs/mmp.c +++ b/module/zfs/mmp.c @@ -445,7 +445,7 @@ mmp_write_uberblock(spa_t *spa) uint64_t offset; hrtime_t lock_acquire_time = gethrtime(); - spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER); + spa_config_enter_mmp(spa, SCL_STATE, mmp_tag, RW_READER); lock_acquire_time = gethrtime() - lock_acquire_time; if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 1475a4a53f4a..54a0eeccf27b 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -493,8 +493,9 @@ spa_config_tryenter(spa_t *spa, int locks, const void *tag, krw_t rw) return (1); } -void -spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) +static void +spa_config_enter_impl(spa_t *spa, int locks, const void *tag, krw_t rw, + int mmp_flag) { (void) tag; int wlocks_held = 0; @@ -509,7 +510,8 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { - while (scl->scl_writer || scl->scl_write_wanted) { + while (scl->scl_writer || + (!mmp_flag && scl->scl_write_wanted)) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { @@ -527,6 +529,27 @@ spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) ASSERT3U(wlocks_held, <=, locks); } +void +spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) +{ + spa_config_enter_impl(spa, locks, tag, rw, 0); +} + +/* + * The spa_config_enter_mmp() allows the mmp thread to cut in front of + * outstanding write lock requests. This is needed since the mmp updates are + * time sensitive and failure to service them promptly will result in a + * suspended pool. This pool suspension has been seen in practice when there is + * a single disk in a pool that is responding slowly and presumably about to + * fail. + */ + +void +spa_config_enter_mmp(spa_t *spa, int locks, const void *tag, krw_t rw) +{ + spa_config_enter_impl(spa, locks, tag, rw, 1); +} + void spa_config_exit(spa_t *spa, int locks, const void *tag) { From 3e4ed4213d7b4e8892e9def8b06363391d8dbd60 Mon Sep 17 00:00:00 2001 From: rob-wing <98866084+rob-wing@users.noreply.github.com> Date: Thu, 20 Apr 2023 09:07:56 -0800 Subject: [PATCH 056/180] Create zap for root vdev And add it to the AVZ, this is not backwards compatible with older pools due to an assertion in spa_sync() that verifies the number of ZAPs of all vdevs matches the number of ZAPs in the AVZ. 
Granted, the assertion only applies to #DEBUG builds - still, a feature flag is introduced to avoid the assertion, com.klarasystems:vdev_zaps_v2 Notably, this allows to get/set properties on the root vdev: % zpool set user:prop=value root-0 Before this commit, it was already possible to get/set properties on top-level vdevs with the syntax - (e.g. mirror-0): % zpool set user:prop=value mirror-0 This syntax also applies to the root vdev as it is is of type 'root' with a vdev_id of 0, root-0. The keyword 'root' as an alias for 'root-0'. The following tests have been added: - zpool get all properties from root vdev - zpool set a property on root vdev - verify root vdev ZAP is created Reviewed-by: Brian Behlendorf Signed-off-by: Rob Wing Sponsored-by: Seagate Technology Submitted-by: Klara, Inc. Closes #14405 --- cmd/zdb/zdb.c | 3 + cmd/zpool/zpool_main.c | 65 +++++++++++------ include/sys/fs/zfs.h | 1 + include/sys/vdev_impl.h | 1 + include/zfeature_common.h | 1 + lib/libzfs/libzfs.abi | 11 +-- lib/libzfs/libzfs_pool.c | 1 + lib/libzutil/zutil_import.c | 5 +- man/man7/zpool-features.7 | 16 ++++ module/zcommon/zfeature_common.c | 6 ++ module/zfs/spa.c | 11 +++ module/zfs/vdev.c | 31 +++++++- module/zfs/vdev_label.c | 6 ++ tests/runfiles/common.run | 4 +- tests/zfs-tests/tests/Makefile.am | 3 + .../cli_root/zpool_get/vdev_get.cfg | 73 +++++++++++++++++++ .../cli_root/zpool_get/vdev_get_001_pos.ksh | 62 ++++++++++++++++ .../cli_root/zpool_get/zpool_get.cfg | 1 + .../cli_root/zpool_set/vdev_set_001_pos.ksh | 52 +++++++++++++ .../functional/vdev_zaps/vdev_zaps.kshlib | 13 ++++ .../vdev_zaps/vdev_zaps_001_pos.ksh | 3 +- .../vdev_zaps/vdev_zaps_002_pos.ksh | 1 + .../vdev_zaps/vdev_zaps_003_pos.ksh | 1 + .../vdev_zaps/vdev_zaps_004_pos.ksh | 1 + .../vdev_zaps/vdev_zaps_005_pos.ksh | 1 + .../vdev_zaps/vdev_zaps_006_pos.ksh | 1 + .../vdev_zaps/vdev_zaps_007_pos.ksh | 1 + 27 files changed, 338 insertions(+), 37 deletions(-) create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_001_pos.ksh diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index e87826f7467b..c93ed4399afd 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -7630,6 +7630,9 @@ mos_leak_vdev(vdev_t *vd) mos_obj_refd(space_map_object(ms->ms_sm)); } + if (vd->vdev_root_zap != 0) + mos_obj_refd(vd->vdev_root_zap); + if (vd->vdev_top_zap != 0) { mos_obj_refd(vd->vdev_top_zap); mos_leak_vdev_top_zap(vd); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 27e805943443..4965cba52692 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -10002,32 +10002,32 @@ get_callback_vdev(zpool_handle_t *zhp, char *vdevname, void *data) } static int -get_callback_vdev_width_cb(void *zhp_data, nvlist_t *nv, void *data) +get_callback_vdev_cb(void *zhp_data, nvlist_t *nv, void *data) { zpool_handle_t *zhp = zhp_data; zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; - char *vdevname = zpool_vdev_name(g_zfs, zhp, nv, - cbp->cb_vdevs.cb_name_flags); + char *vdevname; + const char *type; int ret; - /* Adjust the column widths for the vdev properties */ - ret = vdev_expand_proplist(zhp, vdevname, &cbp->cb_proplist); - - return (ret); -} + /* + * zpool_vdev_name() transforms the root vdev name (i.e., root-0) to the + * pool name for display purposes, which is not desired. Fallback to + * zpool_vdev_name() when not dealing with the root vdev. 
+ */ + type = fnvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE); + if (zhp != NULL && strcmp(type, "root") == 0) + vdevname = strdup("root-0"); + else + vdevname = zpool_vdev_name(g_zfs, zhp, nv, + cbp->cb_vdevs.cb_name_flags); -static int -get_callback_vdev_cb(void *zhp_data, nvlist_t *nv, void *data) -{ - zpool_handle_t *zhp = zhp_data; - zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; - char *vdevname = zpool_vdev_name(g_zfs, zhp, nv, - cbp->cb_vdevs.cb_name_flags); - int ret; + (void) vdev_expand_proplist(zhp, vdevname, &cbp->cb_proplist); - /* Display the properties */ ret = get_callback_vdev(zhp, vdevname, data); + free(vdevname); + return (ret); } @@ -10042,7 +10042,6 @@ get_callback(zpool_handle_t *zhp, void *data) if (cbp->cb_type == ZFS_TYPE_VDEV) { if (strcmp(cbp->cb_vdevs.cb_names[0], "all-vdevs") == 0) { - for_each_vdev(zhp, get_callback_vdev_width_cb, data); for_each_vdev(zhp, get_callback_vdev_cb, data); } else { /* Adjust column widths for vdev properties */ @@ -10119,6 +10118,7 @@ zpool_do_get(int argc, char **argv) int ret; int c, i; char *propstr = NULL; + char *vdev = NULL; cb.cb_first = B_TRUE; @@ -10216,10 +10216,17 @@ zpool_do_get(int argc, char **argv) } else if (are_all_pools(1, argv)) { /* The first arg is a pool name */ if ((argc == 2 && strcmp(argv[1], "all-vdevs") == 0) || + (argc == 2 && strcmp(argv[1], "root") == 0) || are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb.cb_vdevs)) { + + if (strcmp(argv[1], "root") == 0) + vdev = strdup("root-0"); + else + vdev = strdup(argv[1]); + /* ... and the rest are vdev names */ - cb.cb_vdevs.cb_names = argv + 1; + cb.cb_vdevs.cb_names = &vdev; cb.cb_vdevs.cb_names_count = argc - 1; cb.cb_type = ZFS_TYPE_VDEV; argc = 1; /* One pool to process */ @@ -10264,6 +10271,9 @@ zpool_do_get(int argc, char **argv) else zprop_free_list(cb.cb_proplist); + if (vdev != NULL) + free(vdev); + return (ret); } @@ -10365,6 +10375,7 @@ zpool_do_set(int argc, char **argv) { set_cbdata_t cb = { 0 }; int error; + char *vdev = NULL; current_prop_type = ZFS_TYPE_POOL; if (argc > 1 && argv[1][0] == '-') { @@ -10413,13 +10424,20 @@ zpool_do_set(int argc, char **argv) /* argv[1], when supplied, is vdev name */ if (argc == 2) { - if (!are_vdevs_in_pool(1, argv + 1, argv[0], &cb.cb_vdevs)) { + + if (strcmp(argv[1], "root") == 0) + vdev = strdup("root-0"); + else + vdev = strdup(argv[1]); + + if (!are_vdevs_in_pool(1, &vdev, argv[0], &cb.cb_vdevs)) { (void) fprintf(stderr, gettext( "cannot find '%s' in '%s': device not in pool\n"), - argv[1], argv[0]); + vdev, argv[0]); + free(vdev); return (EINVAL); } - cb.cb_vdevs.cb_names = argv + 1; + cb.cb_vdevs.cb_names = &vdev; cb.cb_vdevs.cb_names_count = 1; cb.cb_type = ZFS_TYPE_VDEV; } @@ -10427,6 +10445,9 @@ zpool_do_set(int argc, char **argv) error = for_each_pool(1, argv, B_TRUE, NULL, ZFS_TYPE_POOL, B_FALSE, set_callback, &cb); + if (vdev != NULL) + free(vdev); + return (error); } diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 25babd4ea8cf..0734ff12280e 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -816,6 +816,7 @@ typedef struct zpool_load_policy { #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" #define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ #define ZPOOL_CONFIG_ERRATA "errata" /* not stored on disk */ +#define ZPOOL_CONFIG_VDEV_ROOT_ZAP "com.klarasystems:vdev_zap_root" #define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" #define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" #define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS 
"com.delphix:has_per_vdev_zaps" diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 7cfffe3b4eed..ea3043c82a39 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -277,6 +277,7 @@ struct vdev { kthread_t *vdev_open_thread; /* thread opening children */ kthread_t *vdev_validate_thread; /* thread validating children */ uint64_t vdev_crtxg; /* txg when top-level was added */ + uint64_t vdev_root_zap; /* * Top-level vdev state. diff --git a/include/zfeature_common.h b/include/zfeature_common.h index ef915a70952e..7066c699e203 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -79,6 +79,7 @@ typedef enum spa_feature { SPA_FEATURE_HEAD_ERRLOG, SPA_FEATURE_BLAKE3, SPA_FEATURE_BLOCK_CLONING, + SPA_FEATURE_AVZ_V2, SPA_FEATURES } spa_feature_t; diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 41e74fd8db19..f9aed4e0d57e 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -595,7 +595,7 @@ - + @@ -5808,7 +5808,8 @@ - + + @@ -8694,8 +8695,8 @@ - - + + @@ -8772,7 +8773,7 @@ - + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index f9b7cc004d6b..ae4c861590fd 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2859,6 +2859,7 @@ zpool_vdev_is_interior(const char *name) strncmp(name, VDEV_TYPE_SPARE, strlen(VDEV_TYPE_SPARE)) == 0 || strncmp(name, VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 || + strncmp(name, VDEV_TYPE_ROOT, strlen(VDEV_TYPE_ROOT)) == 0 || strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) return (B_TRUE); diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index 65f462e42cd0..19d8a4742813 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1927,9 +1927,8 @@ for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func, if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) return (ret); - /* Don't run our function on root or indirect vdevs */ - if ((strcmp(type, VDEV_TYPE_ROOT) != 0) && - (strcmp(type, VDEV_TYPE_INDIRECT) != 0)) { + /* Don't run our function on indirect vdevs */ + if (strcmp(type, VDEV_TYPE_INDIRECT) != 0) { ret |= func(zhp, nv, data); } diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index 4cd7526858a3..efe9e833996a 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -858,6 +858,22 @@ by user and group. \*[instant-never] \*[remount-upgrade] . +.feature com.klarasystems vdev_zaps_v2 no +This feature creates a ZAP object for the root vdev. +.Pp +This feature becomes active after the next +.Nm zpool Cm import +or +.Nm zpool reguid . +. +Properties can be retrieved or set on the root vdev using +.Nm zpool Cm get +and +.Nm zpool Cm set +with +.Sy root +as the vdev name which is an alias for +.Sy root-0 . 
.feature org.openzfs zilsaxattr yes extensible_dataset This feature enables .Sy xattr Ns = Ns Sy sa diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index 6fe1da8ed46f..4c9b7ed72a0f 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -731,6 +731,12 @@ zpool_feature_init(void) ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + zfeature_register(SPA_FEATURE_AVZ_V2, + "com.klarasystems:vdev_zaps_v2", "vdev_zaps_v2", + "Support for root vdev ZAP.", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, + sfeatures); + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zfs/spa.c b/module/zfs/spa.c index e76700a9caa3..67601211d6c2 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -3044,6 +3044,12 @@ vdev_count_verify_zaps(vdev_t *vd) spa_t *spa = vd->vdev_spa; uint64_t total = 0; + if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && + vd->vdev_root_zap != 0) { + total++; + ASSERT0(zap_lookup_int(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, vd->vdev_root_zap)); + } if (vd->vdev_top_zap != 0) { total++; ASSERT0(zap_lookup_int(spa->spa_meta_objset, @@ -8626,6 +8632,11 @@ spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) { spa_t *spa = vd->vdev_spa; + if (vd->vdev_root_zap != 0 && + spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) { + VERIFY0(zap_add_int(spa->spa_meta_objset, avz, + vd->vdev_root_zap, tx)); + } if (vd->vdev_top_zap != 0) { VERIFY0(zap_add_int(spa->spa_meta_objset, avz, vd->vdev_top_zap, tx)); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 241be8fd856c..4bfd95861e02 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -397,7 +397,9 @@ vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value) uint64_t objid; int err; - if (vd->vdev_top_zap != 0) { + if (vd->vdev_root_zap != 0) { + objid = vd->vdev_root_zap; + } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; @@ -898,6 +900,14 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, &vd->vdev_crtxg); + if (vd->vdev_ops == &vdev_root_ops && + (alloctype == VDEV_ALLOC_LOAD || + alloctype == VDEV_ALLOC_SPLIT || + alloctype == VDEV_ALLOC_ROOTPOOL)) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, + &vd->vdev_root_zap); + } + /* * If we're a top-level vdev, try to load the allocation parameters. */ @@ -3347,6 +3357,12 @@ vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) vdev_zap_allocation_data(vd, tx); } } + if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 && + spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { + if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) + spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx); + vd->vdev_root_zap = vdev_create_link_zap(vd, tx); + } for (uint64_t i = 0; i < vd->vdev_children; i++) { vdev_construct_zaps(vd->vdev_child[i], tx); @@ -5683,12 +5699,17 @@ vdev_props_set_sync(void *arg, dmu_tx_t *tx) /* * Set vdev property values in the vdev props mos object. 
*/ - if (vd->vdev_top_zap != 0) { + if (vd->vdev_root_zap != 0) { + objid = vd->vdev_root_zap; + } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; } else { - panic("vdev not top or leaf"); + /* + * XXX: implement vdev_props_set_check() + */ + panic("vdev not root/top/leaf"); } switch (prop = vdev_name_to_prop(propname)) { @@ -5891,7 +5912,9 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops); - if (vd->vdev_top_zap != 0) { + if (vd->vdev_root_zap != 0) { + objid = vd->vdev_root_zap; + } else if (vd->vdev_top_zap != 0) { objid = vd->vdev_top_zap; } else if (vd->vdev_leaf_zap != 0) { objid = vd->vdev_leaf_zap; diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index f61be65a2c72..85c7134ca4c4 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -573,6 +573,12 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_top_zap); } + if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap != 0 && + spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP, + vd->vdev_root_zap); + } + if (vd->vdev_resilver_deferred) { ASSERT(vd->vdev_ops->vdev_op_leaf); ASSERT(spa->spa_resilver_deferred); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 4233c0285c4b..cc4ce03677cb 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -394,7 +394,7 @@ tags = ['functional', 'cli_root', 'zpool_export'] [tests/functional/cli_root/zpool_get] tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos', - 'zpool_get_004_neg', 'zpool_get_005_pos'] + 'zpool_get_004_neg', 'zpool_get_005_pos', 'vdev_get_001_pos'] tags = ['functional', 'cli_root', 'zpool_get'] [tests/functional/cli_root/zpool_history] @@ -482,7 +482,7 @@ tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', - 'zpool_set_ashift', 'zpool_set_features'] + 'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index a470573616af..e671a3f6b02b 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -178,6 +178,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/cli_root/zpool_expand/zpool_expand.cfg \ functional/cli_root/zpool_export/zpool_export.cfg \ functional/cli_root/zpool_export/zpool_export.kshlib \ + functional/cli_root/zpool_get/vdev_get.cfg \ functional/cli_root/zpool_get/zpool_get.cfg \ functional/cli_root/zpool_get/zpool_get_parsable.cfg \ functional/cli_root/zpool_import/blockfiles/cryptv0.dat.bz2 \ @@ -1032,6 +1033,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_export/zpool_export_004_pos.ksh \ functional/cli_root/zpool_get/cleanup.ksh \ functional/cli_root/zpool_get/setup.ksh \ + functional/cli_root/zpool_get/vdev_get_001_pos.ksh \ functional/cli_root/zpool_get/zpool_get_001_pos.ksh \ functional/cli_root/zpool_get/zpool_get_002_pos.ksh \ functional/cli_root/zpool_get/zpool_get_003_pos.ksh \ @@ -1146,6 +1148,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_set/cleanup.ksh \ functional/cli_root/zpool_set/setup.ksh \ functional/cli_root/zpool/setup.ksh \ + 
functional/cli_root/zpool_set/vdev_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_003_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg new file mode 100644 index 000000000000..71a64d4fae7a --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg @@ -0,0 +1,73 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022, Klara Inc. +# + +# Set the expected properties of a vdev +typeset -a properties=( + capacity + state + guid + asize + psize + ashift + size + free + allocated + comment + expandsize + fragmentation + bootsize + parity + path + devid + physpath + encpath + fru + parent + children + numchildren + read_errors + write_errors + checksum_errors + initialize_errors + null_ops + read_ops + write_ops + free_ops + claim_ops + trim_ops + null_bytes + read_bytes + write_bytes + free_bytes + claim_bytes + trim_bytes + removing + allocating + failfast + checksum_n + checksum_t + io_n + io_t +) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get_001_pos.ksh new file mode 100755 index 000000000000..bca2337861d4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get_001_pos.ksh @@ -0,0 +1,62 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_get/vdev_get.cfg + +# +# DESCRIPTION: +# +# zpool get root works as expected +# +# STRATEGY: +# +# 1. use zpool get to retrieve properties from root vdev +# 2. 
verify expected properties match detected properties +# + +log_assert "zpool get all on root vdev" + +EXPECT="$(zpool get -H all ${TESTPOOL} root | wc -l)" +if [ $? -ne 0 ]; then + log_fail "cannot retrieve properties from root vdev" +fi + +i=0; +while [ $i -lt "${#properties[@]}" ] +do + log_must zpool get -H "${properties[$i]}" "$TESTPOOL" root + i=$(($i+1)) +done + +EXPECT=$((EXPECT)) +if [ $i -gt $EXPECT ]; then + log_fail "found vdev properties not in vdev_get.cfg: $i/$EXPECT." +elif [ $i -lt $EXPECT ]; then + log_fail "expected properties not found in vdev_get.cfg: $i/$EXPECT." +fi + +log_pass "zpool get all on root vdev" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 097cd52e4777..160a0ca2e6db 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -104,5 +104,6 @@ if is_linux || is_freebsd; then "feature@head_errlog" "feature@blake3" "feature@block_cloning" + "feature@vdev_zaps_v2" ) fi diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_001_pos.ksh new file mode 100755 index 000000000000..a1f3efb90577 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_001_pos.ksh @@ -0,0 +1,52 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2022, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# +# zpool set comment property on root vdev +# +# STRATEGY: +# 1. set a property on root vdev +# 2. verify the property is set +# + +log_assert "zpool set comment property on root vdev" + +log_must zpool set comment="openzfs" ${TESTPOOL} root + +COMMENT="$(zpool get -H -o value comment ${TESTPOOL} root)" +if [ $? 
-ne 0 ]; then + log_fail "cant retrieve comment property from root vdev" +fi + +if [ "$COMMENT" != "openzfs" ]; then + log_fail "unexpected value for comment property: $COMMENT != \"openzfs\"" +fi + +log_pass "zpool set comment property on root vdev" diff --git a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps.kshlib b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps.kshlib index ad5bd9e7f81b..c68a5b2c4c83 100644 --- a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps.kshlib +++ b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps.kshlib @@ -34,6 +34,10 @@ function get_top_vd_zap # dsk conf { get_conf_section "$1" "$2" | awk '/com.delphix:vdev_zap_top: [0-9]+/ {print $2}' } +function get_root_vd_zap # conf +{ + awk '/com.klarasystems:vdev_zap_root: [0-9]+/ {print $2}' "$1" +} function assert_has_sentinel # conf { @@ -54,6 +58,15 @@ function assert_zap_common # pool vd lvl zapobj fi } +function assert_root_zap # pool conf +{ + typeset pool=$1 + typeset conf=$2 + + root_zap=$(get_root_vd_zap $conf) + assert_zap_common $pool "root vdev" "root" $root_zap +} + function assert_top_zap # pool vd conf { typeset pool=$1 diff --git a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_001_pos.ksh b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_001_pos.ksh index b67cc6d973e6..bdc8dcd468ae 100755 --- a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_001_pos.ksh @@ -21,7 +21,7 @@ # # Strategy: # 1. Create a pool with one disk. -# 2. Verify that the disk has a top and leaf ZAP in its config and the MOS. +# 2. Verify that the disk has a root, top and leaf ZAP in its config and the MOS. # . $STF_SUITE/include/libtest.shlib @@ -35,6 +35,7 @@ log_must zpool create -f $TESTPOOL $DISK conf="$TESTDIR/vz001" log_must eval "zdb -PC $TESTPOOL > $conf" +assert_root_zap $TESTPOOL "$conf" assert_top_zap $TESTPOOL $DISK "$conf" assert_leaf_zap $TESTPOOL $DISK "$conf" assert_has_sentinel "$conf" diff --git a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_002_pos.ksh b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_002_pos.ksh index c571973b080b..35c4f64fa463 100755 --- a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_002_pos.ksh @@ -36,6 +36,7 @@ conf="$TESTDIR/vz002" log_must eval "zdb -PC $TESTPOOL > $conf" assert_has_sentinel "$conf" +assert_root_zap $TESTPOOL "$conf" for DISK in $DISKS; do assert_top_zap $TESTPOOL $DISK "$conf" assert_leaf_zap $TESTPOOL $DISK "$conf" diff --git a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_003_pos.ksh b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_003_pos.ksh index 015729576a7d..bb6875c339c3 100755 --- a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_003_pos.ksh @@ -37,6 +37,7 @@ conf="$TESTDIR/vz003" log_must eval "zdb -PC $TESTPOOL > $conf" assert_has_sentinel "$conf" +assert_root_zap $TESTPOOL "$conf" assert_top_zap $TESTPOOL "type: 'mirror'" "$conf" for DISK in $DISKS; do assert_leaf_zap $TESTPOOL $DISK "$conf" diff --git a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_004_pos.ksh b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_004_pos.ksh index 3d0f55d5a9a7..e82e398c6d27 100755 --- a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_004_pos.ksh +++ b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_004_pos.ksh @@ -40,6 +40,7 @@ log_must zpool create -f $TESTPOOL $DISK 
conf="$TESTDIR/vz004" log_must eval "zdb -PC $TESTPOOL > $conf" assert_has_sentinel "$conf" +assert_root_zap $TESTPOOL "$conf" orig_top=$(get_top_vd_zap $DISK $conf) orig_leaf=$(get_leaf_vd_zap $DISK $conf) assert_zap_common $TESTPOOL $DISK "top" $orig_top diff --git a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_005_pos.ksh b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_005_pos.ksh index 1d82218bf283..4b9b45e149d7 100755 --- a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_005_pos.ksh @@ -37,6 +37,7 @@ log_must zpool create -f $TESTPOOL $DISK conf="$TESTDIR/vz005" log_must eval "zdb -PC $TESTPOOL > $conf" assert_has_sentinel "$conf" +assert_root_zap $TESTPOOL "$conf" orig_top=$(get_top_vd_zap $DISK $conf) orig_leaf=$(get_leaf_vd_zap $DISK $conf) assert_zap_common $TESTPOOL $DISK "top" $orig_top diff --git a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_006_pos.ksh b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_006_pos.ksh index ce94336c7c5d..2ac493b8b0d2 100755 --- a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_006_pos.ksh +++ b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_006_pos.ksh @@ -39,6 +39,7 @@ conf="$TESTDIR/vz006" log_must eval "zdb -PC $TESTPOOL > $conf" assert_has_sentinel "$conf" +assert_root_zap $TESTPOOL "$conf" orig_top=$(get_top_vd_zap ${DISK_ARR[1]} $conf) assert_zap_common $TESTPOOL ${DISK_ARR[1]} "top" $orig_top assert_leaf_zap $TESTPOOL ${DISK_ARR[1]} "$conf" diff --git a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_007_pos.ksh b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_007_pos.ksh index c7f12c633706..c7a4a62de436 100755 --- a/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_007_pos.ksh @@ -39,6 +39,7 @@ conf="$TESTDIR/vz007" log_must eval "zdb -PC $TESTPOOL > $conf" assert_has_sentinel "$conf" +assert_root_zap $TESTPOOL "$conf" orig_top=$(get_top_vd_zap "type: 'mirror'" $conf) orig_leaf0=$(get_leaf_vd_zap ${DISK_ARR[0]} $conf) orig_leaf1=$(get_leaf_vd_zap ${DISK_ARR[1]} $conf) From a10c64561648f9cb9f90959c57625d3ffdaea156 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 20 Apr 2023 10:25:16 -0700 Subject: [PATCH 057/180] ZTS: zvol_misc_trim retry busy export Retry the export if the pool is busy due to an open zvol. Observed in the CI on Fedora 37. 
cannot export 'testpool': pool is busy ERROR: zpool export testpool exited 1 Reviewed-by: George Melikov Signed-off-by: Brian Behlendorf Closes #14769 --- .../tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh index 2c4ef28ab826..46cac3ecb6c2 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh @@ -125,12 +125,12 @@ log_must $trimcmd $zvolpath set_blk_mq 1 -log_must zpool export $TESTPOOL +log_must_busy zpool export $TESTPOOL log_must zpool import $TESTPOOL do_test set_blk_mq 0 -log_must zpool export $TESTPOOL +log_must_busy zpool export $TESTPOOL log_must zpool import $TESTPOOL do_test From ab71b24d20df7ec59c6b3a2b560af263ad262d9b Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 11 Apr 2023 17:50:43 +0000 Subject: [PATCH 058/180] Linux: zfs_zaccess_trivial() should always call generic_permission() Building with Clang on Linux generates a warning that err could be uninitialized if mnt_ns is a NULL pointer. However, mnt_ns should never be NULL, so there is no need to put this behind an if statement. Taking it outside of the if statement means that the possibility of err being uninitialized goes from being always zero in a way that the compiler could not realize to a way that is always zero in a way that the compiler can realize. Sponsored-By: Wasabi Technology, Inc. Reviewed-by: Brian Behlendorf Reviewed-by: Youzhong Yang Signed-off-by: Richard Yao Closes #14738 --- module/os/linux/zfs/zfs_acl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c index df4ebc3870be..ff26f47f2e04 100644 --- a/module/os/linux/zfs/zfs_acl.c +++ b/module/os/linux/zfs/zfs_acl.c @@ -2466,8 +2466,7 @@ zfs_zaccess_trivial(znode_t *zp, uint32_t *working_mode, cred_t *cr, #if (defined(HAVE_IOPS_PERMISSION_USERNS) || \ defined(HAVE_IOPS_PERMISSION_IDMAP)) - if (mnt_ns) - err = generic_permission(mnt_ns, ZTOI(zp), mask); + err = generic_permission(mnt_ns, ZTOI(zp), mask); #else err = generic_permission(ZTOI(zp), mask); #endif From 135d9a9048e3716c755373182720d0eba170285f Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Tue, 11 Apr 2023 17:56:16 +0000 Subject: [PATCH 059/180] Linux: Suppress -Wordered-compare-function-pointers in tracepoint code Clang points out that there is a comparison against -1, but we cannot fix it because that is from the kernel headers, which we must support. We can workaround this by using a pragma. Sponsored-By: Wasabi Technology, Inc. 
Reviewed-by: Brian Behlendorf Reviewed-by: Youzhong Yang Signed-off-by: Richard Yao Closes #14738 --- include/os/linux/zfs/sys/trace_zil.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/os/linux/zfs/sys/trace_zil.h b/include/os/linux/zfs/sys/trace_zil.h index 6dd18c5974b9..fb03d3149f8f 100644 --- a/include/os/linux/zfs/sys/trace_zil.h +++ b/include/os/linux/zfs/sys/trace_zil.h @@ -152,6 +152,9 @@ * zilog_t *, ..., * itx_t *, ...); */ + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wordered-compare-function-pointers" /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_zil_process_itx_class, TP_PROTO(zilog_t *zilog, itx_t *itx), @@ -169,6 +172,7 @@ DECLARE_EVENT_CLASS(zfs_zil_process_itx_class, ZILOG_TP_PRINTK_ARGS, ITX_TP_PRINTK_ARGS) ); /* END CSTYLED */ +#pragma clang diagnostic pop #define DEFINE_ZIL_PROCESS_ITX_EVENT(name) \ DEFINE_EVENT(zfs_zil_process_itx_class, name, \ From 8eae2d214cfa53862833eeeda9a5c1e9d5ded47d Mon Sep 17 00:00:00 2001 From: Allan Jude Date: Fri, 21 Apr 2023 13:20:36 -0400 Subject: [PATCH 060/180] Add support for zpool user properties Usage: zpool set org.freebsd:comment="this is my pool" poolname Tests are based on zfs_set's user property tests. Also stop truncating property values at MAXNAMELEN, use ZFS_MAXPROPLEN. Reviewed-by: Brian Behlendorf Signed-off-by: Allan Jude Signed-off-by: Mateusz Piotrowski Sponsored-by: Beckhoff Automation GmbH & Co. KG. Sponsored-by: Klara Inc. Closes #11680 --- cmd/zpool/zpool_main.c | 18 +- include/libzfs.h | 2 + lib/libzfs/libzfs.abi | 131 +++++++------ lib/libzfs/libzfs_pool.c | 101 +++++++++- lib/libzfs/libzfs_util.c | 1 + man/man7/zpoolprops.7 | 55 +++++- module/zfs/spa.c | 128 +++++++++---- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 3 + .../zpool_set/user_property_001_pos.ksh | 89 +++++++++ .../zpool_set/user_property_002_neg.ksh | 88 +++++++++ .../zpool_set/zpool_set_common.kshlib | 178 ++++++++++++++++++ 12 files changed, 688 insertions(+), 109 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_set/user_property_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_set/user_property_002_neg.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 4965cba52692..301c5f4bfc6f 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -6071,11 +6071,14 @@ print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) zpool_prop_get_feature(zhp, pl->pl_user_prop, property, sizeof (property)) == 0) { propstr = property; + } else if (zfs_prop_user(pl->pl_user_prop) && + zpool_get_userprop(zhp, pl->pl_user_prop, property, + sizeof (property), NULL) == 0) { + propstr = property; } else { propstr = "-"; } - /* * If this is being called in scripted mode, or if this is the * last column and it is left-justified, don't include a width @@ -10035,7 +10038,7 @@ static int get_callback(zpool_handle_t *zhp, void *data) { zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; - char value[MAXNAMELEN]; + char value[ZFS_MAXPROPLEN]; zprop_source_t srctype; zprop_list_t *pl; int vid; @@ -10070,6 +10073,17 @@ get_callback(zpool_handle_t *zhp, void *data) continue; if (pl->pl_prop == ZPROP_INVAL && + zfs_prop_user(pl->pl_user_prop)) { + srctype = ZPROP_SRC_LOCAL; + + if (zpool_get_userprop(zhp, pl->pl_user_prop, + value, sizeof (value), &srctype) != 0) + continue; + + zprop_print_one_property(zpool_get_name(zhp), + cbp, 
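As an illustrative follow-up to the usage line above (pool and property names
are examples only), such a property can be read back and cleared with the usual
commands; per the updated man page, setting an empty value clears it:

  % zpool get org.freebsd:comment poolname
  % zpool set org.freebsd:comment= poolname
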
pl->pl_user_prop, value, srctype, + NULL, NULL); + } else if (pl->pl_prop == ZPROP_INVAL && (zpool_prop_feature(pl->pl_user_prop) || zpool_prop_unsupported(pl->pl_user_prop))) { srctype = ZPROP_SRC_LOCAL; diff --git a/include/libzfs.h b/include/libzfs.h index 7ec9768d8e93..87d1ed738f2b 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -333,6 +333,8 @@ _LIBZFS_H const char *zpool_get_state_str(zpool_handle_t *); _LIBZFS_H int zpool_set_prop(zpool_handle_t *, const char *, const char *); _LIBZFS_H int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, size_t proplen, zprop_source_t *, boolean_t literal); +_LIBZFS_H int zpool_get_userprop(zpool_handle_t *, const char *, char *, + size_t proplen, zprop_source_t *); _LIBZFS_H uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, zprop_source_t *); _LIBZFS_H int zpool_props_refresh(zpool_handle_t *); diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index f9aed4e0d57e..732863dcffc7 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -259,8 +259,8 @@ - + @@ -492,6 +492,7 @@ + @@ -2267,32 +2268,19 @@ - - - - - - - - - - - - - - - - - + + + + - - - - - + + + + + @@ -3324,17 +3312,11 @@ - - - - - - - - - - + + + + @@ -3907,35 +3889,20 @@ - - - - - - - - - - - - - - - - - - - - - + + + + + + - - - - + + + + @@ -5131,6 +5098,27 @@ + + + + + + + + + + + + + + + + + + + + + @@ -5163,6 +5151,19 @@ + + + + + + + + + + + + + @@ -5395,9 +5396,6 @@ - - - @@ -6169,6 +6167,14 @@ + + + + + + + + @@ -7852,6 +7858,9 @@ + + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index ae4c861590fd..4fb71b4e0dc8 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -426,6 +426,37 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, return (0); } +/* + * Get a zpool property value for 'propname' and return the value in + * a pre-allocated buffer. + */ +int +zpool_get_userprop(zpool_handle_t *zhp, const char *propname, char *buf, + size_t len, zprop_source_t *srctype) +{ + nvlist_t *nv, *nvl; + uint64_t ival; + const char *value; + zprop_source_t source = ZPROP_SRC_LOCAL; + + nvl = zhp->zpool_props; + if (nvlist_lookup_nvlist(nvl, propname, &nv) == 0) { + if (nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0) + source = ival; + verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); + } else { + source = ZPROP_SRC_DEFAULT; + value = "-"; + } + + if (srctype) + *srctype = source; + + (void) strlcpy(buf, value, len); + + return (0); +} + /* * Check if the bootfs name has the same pool name as it is set to. * Assuming bootfs is a valid dataset name. @@ -549,6 +580,44 @@ zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, (void) no_memory(hdl); goto error; } + continue; + } else if (prop == ZPOOL_PROP_INVAL && + zfs_prop_user(propname)) { + /* + * This is a user property: make sure it's a + * string, and that it's less than ZAP_MAXNAMELEN. 
+ */ + if (nvpair_type(elem) != DATA_TYPE_STRING) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a string"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property name '%s' is too long"), + propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + (void) nvpair_value_string(elem, &strval); + + if (strlen(strval) >= ZFS_MAXPROPLEN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property value '%s' is too long"), + strval); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (nvlist_add_string(retprops, propname, + strval) != 0) { + (void) no_memory(hdl); + goto error; + } + continue; } @@ -855,9 +924,30 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp, features = zpool_get_features(zhp); if ((*plp)->pl_all && firstexpand) { + /* Handle userprops in the all properties case */ + if (zhp->zpool_props == NULL && zpool_props_refresh(zhp)) + return (-1); + + nvp = NULL; + while ((nvp = nvlist_next_nvpair(zhp->zpool_props, nvp)) != + NULL) { + const char *propname = nvpair_name(nvp); + + if (!zfs_prop_user(propname)) + continue; + + entry = zfs_alloc(hdl, sizeof (zprop_list_t)); + entry->pl_prop = ZPROP_USERPROP; + entry->pl_user_prop = zfs_strdup(hdl, propname); + entry->pl_width = strlen(entry->pl_user_prop); + entry->pl_all = B_TRUE; + + *last = entry; + last = &entry->pl_next; + } + for (i = 0; i < SPA_FEATURES; i++) { - zprop_list_t *entry = zfs_alloc(hdl, - sizeof (zprop_list_t)); + entry = zfs_alloc(hdl, sizeof (zprop_list_t)); entry->pl_prop = ZPROP_USERPROP; entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s", spa_feature_table[i].fi_uname); @@ -874,7 +964,6 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp, nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) { char *propname; boolean_t found; - zprop_list_t *entry; if (zfeature_is_supported(nvpair_name(nvp))) continue; @@ -920,6 +1009,12 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp, NULL, literal) == 0) { if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); + } else if (entry->pl_prop == ZPROP_INVAL && + zfs_prop_user(entry->pl_user_prop) && + zpool_get_userprop(zhp, entry->pl_user_prop, buf, + sizeof (buf), NULL) == 0) { + if (strlen(buf) > entry->pl_width) + entry->pl_width = strlen(buf); } } diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 393971ddf13c..4b8a20160e02 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -1774,6 +1774,7 @@ addlist(libzfs_handle_t *hdl, const char *propname, zprop_list_t **listp, * a user-defined property. */ if (prop == ZPROP_USERPROP && ((type == ZFS_TYPE_POOL && + !zfs_prop_user(propname) && !zpool_prop_feature(propname) && !zpool_prop_unsupported(propname)) || ((type == ZFS_TYPE_DATASET) && !zfs_prop_user(propname) && diff --git a/man/man7/zpoolprops.7 b/man/man7/zpoolprops.7 index 12b9b11903df..7709d85226dc 100644 --- a/man/man7/zpoolprops.7 +++ b/man/man7/zpoolprops.7 @@ -26,8 +26,9 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" Copyright (c) 2021, Colm Buckley +.\" Copyright (c) 2023, Klara Inc. .\" -.Dd May 27, 2021 +.Dd April 18, 2023 .Dt ZPOOLPROPS 7 .Os . @@ -40,6 +41,12 @@ Each pool has several properties associated with it. Some properties are read-only statistics while others are configurable and change the behavior of the pool. 
.Pp +User properties have no effect on ZFS behavior. +Use them to annotate pools in a way that is meaningful in your environment. +For more information about user properties, see the +.Sx User Properties +section. +.Pp The following are read-only properties: .Bl -tag -width "unsupported@guid" .It Sy allocated @@ -431,3 +438,49 @@ backwards compatibility. Once feature flags are enabled on a pool this property will no longer have a value. .El +. +.Ss User Properties +In addition to the standard native properties, ZFS supports arbitrary user +properties. +User properties have no effect on ZFS behavior, but applications or +administrators can use them to annotate pools. +.Pp +User property names must contain a colon +.Pq Qq Sy \&: +character to distinguish them from native properties. +They may contain lowercase letters, numbers, and the following punctuation +characters: colon +.Pq Qq Sy \&: , +dash +.Pq Qq Sy - , +period +.Pq Qq Sy \&. , +and underscore +.Pq Qq Sy _ . +The expected convention is that the property name is divided into two portions +such as +.Ar module : Ns Ar property , +but this namespace is not enforced by ZFS. +User property names can be at most 256 characters, and cannot begin with a dash +.Pq Qq Sy - . +.Pp +When making programmatic use of user properties, it is strongly suggested to use +a reversed DNS domain name for the +.Ar module +component of property names to reduce the chance that two +independently-developed packages use the same property name for different +purposes. +.Pp +The values of user properties are arbitrary strings and +are never validated. +All of the commands that operate on properties +.Po Nm zpool Cm list , +.Nm zpool Cm get , +.Nm zpool Cm set , +and so forth +.Pc +can be used to manipulate both native properties and user properties. +Use +.Nm zpool Cm set Ar name Ns = +to clear a user property. +Property values are limited to 8192 bytes. diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 67601211d6c2..dd4a442d97a1 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -296,6 +296,22 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, nvlist_free(propval); } +/* + * Add a user property (source=src, propname=propval) to an nvlist. + */ +static void +spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, + zprop_source_t src) +{ + nvlist_t *propval; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); + VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); + nvlist_free(propval); +} + /* * Get property values from the spa configuration. 
*/ @@ -471,7 +487,8 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; - if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) + if ((prop = zpool_name_to_prop(za.za_name)) == + ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name)) continue; switch (za.za_integer_length) { @@ -514,7 +531,13 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) kmem_free(strval, za.za_num_integers); break; } - spa_prop_add_list(*nvp, prop, strval, 0, src); + if (prop != ZPOOL_PROP_INVAL) { + spa_prop_add_list(*nvp, prop, strval, 0, src); + } else { + src = ZPROP_SRC_LOCAL; + spa_prop_add_user(*nvp, za.za_name, strval, + src); + } kmem_free(strval, za.za_num_integers); break; @@ -556,36 +579,47 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) switch (prop) { case ZPOOL_PROP_INVAL: - if (!zpool_prop_feature(propname)) { - error = SET_ERROR(EINVAL); - break; - } - /* * Sanitize the input. */ - if (nvpair_type(elem) != DATA_TYPE_UINT64) { - error = SET_ERROR(EINVAL); - break; - } + if (zfs_prop_user(propname)) { + if (strlen(propname) >= ZAP_MAXNAMELEN) { + error = SET_ERROR(ENAMETOOLONG); + break; + } - if (nvpair_value_uint64(elem, &intval) != 0) { - error = SET_ERROR(EINVAL); - break; - } + if (strlen(fnvpair_value_string(elem)) >= + ZAP_MAXVALUELEN) { + error = SET_ERROR(E2BIG); + break; + } + } else if (zpool_prop_feature(propname)) { + if (nvpair_type(elem) != DATA_TYPE_UINT64) { + error = SET_ERROR(EINVAL); + break; + } - if (intval != 0) { - error = SET_ERROR(EINVAL); - break; - } + if (nvpair_value_uint64(elem, &intval) != 0) { + error = SET_ERROR(EINVAL); + break; + } + + if (intval != 0) { + error = SET_ERROR(EINVAL); + break; + } + + fname = strchr(propname, '@') + 1; + if (zfeature_lookup_name(fname, NULL) != 0) { + error = SET_ERROR(EINVAL); + break; + } - fname = strchr(propname, '@') + 1; - if (zfeature_lookup_name(fname, NULL) != 0) { + has_feature = B_TRUE; + } else { error = SET_ERROR(EINVAL); break; } - - has_feature = B_TRUE; break; case ZPOOL_PROP_VERSION: @@ -792,6 +826,12 @@ spa_prop_set(spa_t *spa, nvlist_t *nvp) prop == ZPOOL_PROP_READONLY) continue; + if (prop == ZPOOL_PROP_INVAL && + zfs_prop_user(nvpair_name(elem))) { + need_sync = B_TRUE; + break; + } + if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { uint64_t ver = 0; @@ -8800,24 +8840,11 @@ spa_sync_props(void *arg, dmu_tx_t *tx) const char *strval, *fname; zpool_prop_t prop; const char *propname; + const char *elemname = nvpair_name(elem); zprop_type_t proptype; spa_feature_t fid; - switch (prop = zpool_name_to_prop(nvpair_name(elem))) { - case ZPOOL_PROP_INVAL: - /* - * We checked this earlier in spa_prop_validate(). 
- */ - ASSERT(zpool_prop_feature(nvpair_name(elem))); - - fname = strchr(nvpair_name(elem), '@') + 1; - VERIFY0(zfeature_lookup_name(fname, &fid)); - - spa_feature_enable(spa, fid, tx); - spa_history_log_internal(spa, "set", tx, - "%s=enabled", nvpair_name(elem)); - break; - + switch (prop = zpool_name_to_prop(elemname)) { case ZPOOL_PROP_VERSION: intval = fnvpair_value_uint64(elem); /* @@ -8860,7 +8887,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } spa_history_log_internal(spa, "set", tx, - "%s=%s", nvpair_name(elem), strval); + "%s=%s", elemname, strval); break; case ZPOOL_PROP_COMPATIBILITY: strval = fnvpair_value_string(elem); @@ -8879,6 +8906,20 @@ spa_sync_props(void *arg, dmu_tx_t *tx) "%s=%s", nvpair_name(elem), strval); break; + case ZPOOL_PROP_INVAL: + if (zpool_prop_feature(elemname)) { + fname = strchr(elemname, '@') + 1; + VERIFY0(zfeature_lookup_name(fname, &fid)); + + spa_feature_enable(spa, fid, tx); + spa_history_log_internal(spa, "set", tx, + "%s=enabled", elemname); + break; + } else if (!zfs_prop_user(elemname)) { + ASSERT(zpool_prop_feature(elemname)); + break; + } + zfs_fallthrough; default: /* * Set pool property values in the poolprops mos object. @@ -8893,6 +8934,11 @@ spa_sync_props(void *arg, dmu_tx_t *tx) /* normalize the property name */ propname = zpool_prop_to_name(prop); proptype = zpool_prop_get_type(prop); + if (prop == ZPOOL_PROP_INVAL && + zfs_prop_user(elemname)) { + propname = elemname; + proptype = PROP_TYPE_STRING; + } if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); @@ -8901,7 +8947,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "set", tx, - "%s=%s", nvpair_name(elem), strval); + "%s=%s", elemname, strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); @@ -8914,7 +8960,7 @@ spa_sync_props(void *arg, dmu_tx_t *tx) spa->spa_pool_props_object, propname, 8, 1, &intval, tx)); spa_history_log_internal(spa, "set", tx, - "%s=%lld", nvpair_name(elem), + "%s=%lld", elemname, (longlong_t)intval); switch (prop) { diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index cc4ce03677cb..55991cfeaf78 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -482,7 +482,8 @@ tags = ['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', - 'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos'] + 'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos', + 'user_property_001_pos', 'user_property_002_neg'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index e671a3f6b02b..74295b86ddc2 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1149,10 +1149,13 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_set/setup.ksh \ functional/cli_root/zpool/setup.ksh \ functional/cli_root/zpool_set/vdev_set_001_pos.ksh \ + functional/cli_root/zpool_set/zpool_set_common.kshlib \ functional/cli_root/zpool_set/zpool_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_003_neg.ksh \ functional/cli_root/zpool_set/zpool_set_ashift.ksh \ + 
functional/cli_root/zpool_set/user_property_001_pos.ksh \ + functional/cli_root/zpool_set/user_property_002_neg.ksh \ functional/cli_root/zpool_set/zpool_set_features.ksh \ functional/cli_root/zpool_split/cleanup.ksh \ functional/cli_root/zpool_split/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/user_property_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_set/user_property_001_pos.ksh new file mode 100755 index 000000000000..4b9097933f37 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_set/user_property_001_pos.ksh @@ -0,0 +1,89 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib + +# +# DESCRIPTION: +# ZFS can set any valid user-defined pool property. +# +# STRATEGY: +# 1. Combine all kind of valid characters into a valid user-defined +# property name. +# 2. Random get a string as the value. +# 3. Verify all the valid user-defined pool properties can be set to a +# pool. +# + +verify_runnable "both" + +log_assert "ZFS can set any valid user-defined pool property." +log_onexit cleanup_user_prop $TESTPOOL + +typeset -a names=() +typeset -a values=() + +# Longest property name (255 bytes, which is the 256-byte limit minus 1 byte +# for the null byte) +names+=("$(awk 'BEGIN { printf "x:"; while (c++ < (256 - 2 - 1)) printf "a" }')") +values+=("long-property-name") +# Longest property value (the limits are 1024 on FreeBSD and 4096 on Linux, so +# pick the right one; the longest value can use limit minus 1 bytes for the +# null byte) +if is_linux; then + typeset ZFS_MAXPROPLEN=4096 +else + typeset ZFS_MAXPROPLEN=1024 +fi +names+=("long:property:value") +values+=("$(awk -v max="$ZFS_MAXPROPLEN" 'BEGIN { while (c++ < (max - 1)) printf "A" }')") +# Valid property names +for i in {1..10}; do + typeset -i len + ((len = RANDOM % 32)) + names+=("$(valid_user_property $len)") + ((len = RANDOM % 512)) + values+=("$(user_property_value $len)") +done + +typeset -i i=0 +while ((i < ${#names[@]})); do + typeset name="${names[$i]}" + typeset value="${values[$i]}" + + log_must eval "zpool set $name='$value' $TESTPOOL" + log_must eval "check_user_prop $TESTPOOL $name '$value'" + + ((i += 1)) +done + +log_pass "ZFS can set any valid user-defined pool property passed." 
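For a quick sense of what this test automates, a hand-run session might look
roughly like the following. The pool name "tank" and the property name are
hypothetical; the name simply follows the reversed-DNS convention suggested in
the zpoolprops.7 text added earlier in this patch:

    zpool set com.example:backup-policy=daily tank
    zpool get com.example:backup-policy tank
      NAME  PROPERTY                   VALUE  SOURCE
      tank  com.example:backup-policy  daily  local
    zpool set com.example:backup-policy= tank

The last command clears the property by setting it to the empty string, as
described in the new man page section; check_user_prop() in
zpool_set_common.kshlib performs the same get-and-compare step via
"zpool get -p -H -o value".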
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/user_property_002_neg.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_set/user_property_002_neg.ksh new file mode 100755 index 000000000000..7c8fcba6e471 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_set/user_property_002_neg.ksh @@ -0,0 +1,88 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara Inc. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib + +# +# DESCRIPTION: +# ZFS can handle any invalid user-defined pool property. +# +# STRATEGY: +# 1. Combine all kind of invalid user pool property names. +# 2. Random get a string as the value. +# 3. Verify all the invalid user-defined pool properties can not be set +# to the pool. +# + +verify_runnable "both" + +log_assert "ZFS can handle any invalid user pool property." +log_onexit cleanup_user_prop $TESTPOOL + +typeset -a names=() +typeset -a values=() + +# Too long property name (256 bytes, which is the 256-byte limit minus 1 byte +# for the null byte plus 1 byte to reach back over the limit) +names+=("$(awk 'BEGIN { printf "x:"; while (c++ < (256 - 2 - 1 + 1)) printf "a" }')") +values+=("too-long-property-name") +# Too long property value (the limits are 1024 on FreeBSD and 4096 on Linux, so +# pick the right one; the too long value is, e.g., the limit minus 1 bytes for the +# null byte plus 1 byte to reach back over the limit) +if is_linux; then + typeset ZFS_MAXPROPLEN=4096 +else + typeset ZFS_MAXPROPLEN=1024 +fi +names+=("too:long:property:value") +values+=("$(awk -v max="$ZFS_MAXPROPLEN" 'BEGIN { while (c++ < (max - 1 + 1)) printf "A" }')") +# Invalid property names +for i in {1..10}; do + typeset -i len + ((len = RANDOM % 32)) + names+=("$(invalid_user_property $len)") + ((len = RANDOM % 512)) + values+=("$(user_property_value $len)") +done + +typeset -i i=0 +while ((i < ${#names[@]})); do + typeset name="${names[$i]}" + typeset value="${values[$i]}" + + log_mustnot zpool set $name=$value $TESTPOOL + log_mustnot check_user_prop $TESTPOOL \"$name\" \"$value\" + + ((i += 1)) +done + +log_pass "ZFS can handle invalid user pool property passed." 
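For contrast with the positive test, the rejections exercised here look
roughly like this when run by hand (pool name hypothetical, exact error
messages may differ between platforms):

    zpool set nocolon=value tank     fails: user property names must contain ':'
    zpool set Bad:Name=value tank    fails: only lowercase letters, digits and
                                     ':' '-' '.' '_' are accepted in names
    (a name of 256 bytes or more fails with ENAMETOOLONG in spa_prop_validate())

invalid_user_property() in zpool_set_common.kshlib builds such names by mixing
characters from INVALID_NAME_CHAR into an otherwise valid name.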
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib b/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib new file mode 100644 index 000000000000..346e4a16b2ad --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_common.kshlib @@ -0,0 +1,178 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2014, 2016 by Delphix. All rights reserved. +# Copyright (c) 2023 by Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +set -A VALID_NAME_CHAR a b c d e f g h i j k l m n o p q r s t u v w x y z \ + 0 1 2 3 4 5 6 7 8 9 ':' '-' '.' '_' +set -A INVALID_NAME_CHAR A B C D E F G H I J K L M N O P Q R S T U V W X Y Z \ + '`' '~' '!' '@' '#' '$' '%' '^' '&' '(' ')' '+' '=' '|' "\\" '{' '[' ']' \ + '}' ';' '"' '<' ',' '>' '?' '/' ' ' +set -A ALL_CHAR ${VALID_NAME_CHAR[*]} ${INVALID_NAME_CHAR[*]} + +# +# Cleanup all the user properties of the pool. +# +# $1 pool name +# +function cleanup_user_prop +{ + typeset pool=$1 + + typeset user_prop + user_prop=$(zpool get -H -o property all $pool | grep ":") + + typeset prop + for prop in $user_prop; do + zpool set $prop="" $pool || + log_must zpool set $prop="" $pool + done +} + +# +# Random select character from the specified character set and combine into a +# random string +# +# $1 character set name +# $2 String length +# +function random_string +{ + typeset char_set=${1:-VALID_NAME_CHAR} + typeset -i len=${2:-5} + + eval typeset -i count=\${#$char_set[@]} + + # No consumers want an empty string. 
+ ((len == 0)) && len=3 + + typeset str + typeset -i i=0 + while ((i < len)); do + typeset -i ind + ((ind = RANDOM % count)) + eval str=\${str}\${$char_set[\$ind]} + + ((i += 1)) + done + + echo "$str" +} + +# +# Get valid user-defined property name +# +# $1 user-defined property name length +# +function valid_user_property +{ + typeset -i sumlen=${1:-10} + ((sumlen < 2 )) && sumlen=2 + typeset -i len + ((len = RANDOM % sumlen)) + typeset part1 part2 + + while true; do + part1="$(random_string VALID_NAME_CHAR $len)" + if [[ "$part1" == "-"* ]]; then + continue + fi + break + done + ((len = sumlen - (len + 1))) + + while true; do + part2="$(random_string VALID_NAME_CHAR $len)" + if [[ -z $part1 && -z $part2 ]]; then + continue + fi + break + done + + echo "${part1}:${part2}" +} + +# +# Get invalid user-defined property name +# +# $1 user-defined property name length +# +function invalid_user_property +{ + typeset -i sumlen=${1:-10} + ((sumlen == 0)) && sumlen=1 + typeset -i len + ((len = RANDOM % sumlen)) + + typeset part1 part2 + while true; do + part1="$(random_string VALID_NAME_CHAR $len)" + ((len = sumlen - len)) + part2="$(random_string INVALID_NAME_CHAR $len)" + + # Avoid $part1 is *:* and $part2 is "=*" + if [[ "$part1" == *":"* && "$part2" == "="* ]]; then + continue + fi + break + done + + echo "${part1}${part2}" +} + +# +# Get user-defined property value +# +# $1 user-defined property name length +# +function user_property_value +{ + typeset -i len=${1:-100} + + random_string ALL_CHAR $len +} + +# +# Check if the user-defined property is identical to the expected value. +# +# $1 pool +# $2 user property +# $3 expected value +# +function check_user_prop +{ + typeset pool=$1 + typeset user_prop="$2" + typeset expect_value="$3" + typeset value=$(zpool get -p -H -o value "$user_prop" $pool 2>&1) + + [ "$expect_value" = "$value" ] +} From 62cc9d4f6b5ad3a16b97f6389828b20331863f40 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Fri, 21 Apr 2023 19:22:52 +0200 Subject: [PATCH 061/180] FreeBSD: make zfs_vfs_held() definition consistent with declaration Noticed while attempting to change FreeBSD's boolean_t into an actual bool: in include/sys/zfs_ioctl_impl.h, zfs_vfs_held() is declared to return a boolean_t, but in module/os/freebsd/zfs/zfs_ioctl_os.c it is defined to return an int. Make the definition match the declaration. Reviewed-by: Alexander Motin Reviewed-by: Brian Atkinson Signed-off-by: Dimitry Andric Closes #14776 --- module/os/freebsd/zfs/zfs_ioctl_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/freebsd/zfs/zfs_ioctl_os.c b/module/os/freebsd/zfs/zfs_ioctl_os.c index 8f44cced5d95..a835e013d630 100644 --- a/module/os/freebsd/zfs/zfs_ioctl_os.c +++ b/module/os/freebsd/zfs/zfs_ioctl_os.c @@ -59,7 +59,7 @@ zfs_vfs_ref(zfsvfs_t **zfvp) return (error); } -int +boolean_t zfs_vfs_held(zfsvfs_t *zfsvfs) { return (zfsvfs->z_vfs != NULL); From a7982d5d30dabd2e206011f54226b25d6c70c4d6 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Apr 2023 21:12:17 +0000 Subject: [PATCH 062/180] FreeBSD: fix up EXDEV handling for clone_range API contract requires VOPs to handle EXDEV internally, worst case by falling back to the generic copy routine. This broke with the recent changes. While here whack custom loop to lock 2 vnodes with vn_lock_pair, which provides the same functionality internally. write start/finish around it plays no role so got eliminated. One difference is that vn_lock_pair always takes an exclusive lock on both vnodes. 
I did not patch around it because current code takes an exclusive lock on the target vnode. zfs supports shared-locking for writes, so this serializes different calls to the routine as is, despite range locking inside. At the same time you may notice the source vnode can get some traffic if only shared-locked, thus once more this goes the safer route of exclusive-locking. Note this should be patched to use shared-locking for both once the feature is considered stable. Technically the switch to vn_lock_pair should be a separate change, but it would only introduce churn immediately whacked by the rest of the patch. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Mateusz Guzik Sponsored by: Rubicon Communications, LLC ("Netgate") Closes #14723 --- module/os/freebsd/zfs/zfs_vnops_os.c | 63 +++++++++++++++------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 8abd7239ad2e..0ec4d40ce790 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6249,56 +6249,59 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) * need something else than vn_generic_copy_file_range(). */ - /* Lock both vnodes, avoiding risk of deadlock. */ - do { - mp = NULL; - error = vn_start_write(outvp, &mp, V_WAIT); - if (error == 0) { - error = vn_lock(outvp, LK_EXCLUSIVE); - if (error == 0) { - if (invp == outvp) - break; - error = vn_lock(invp, LK_SHARED | LK_NOWAIT); - if (error == 0) - break; - VOP_UNLOCK(outvp); - if (mp != NULL) - vn_finished_write(mp); - mp = NULL; - error = vn_lock(invp, LK_SHARED); - if (error == 0) - VOP_UNLOCK(invp); - } + vn_start_write(outvp, &mp, V_WAIT); + if (invp == outvp) { + if (vn_lock(outvp, LK_EXCLUSIVE) != 0) { + goto bad_write_fallback; } - if (mp != NULL) - vn_finished_write(mp); - } while (error == 0); - if (error != 0) - return (error); + } else { +#if __FreeBSD_version >= 1400086 + vn_lock_pair(invp, false, LK_EXCLUSIVE, outvp, false, + LK_EXCLUSIVE); +#else + vn_lock_pair(invp, false, outvp, false); +#endif + if (VN_IS_DOOMED(invp) || VN_IS_DOOMED(outvp)) { + goto bad_locked_fallback; + } + } + #ifdef MAC error = mac_vnode_check_write(curthread->td_ucred, ap->a_outcred, outvp); if (error != 0) - goto unlock; + goto out_locked; #endif io.uio_offset = *ap->a_outoffp; io.uio_resid = *ap->a_lenp; error = vn_rlimit_fsize(outvp, &io, ap->a_fsizetd); if (error != 0) - goto unlock; + goto out_locked; error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp), - ap->a_outoffp, &len, ap->a_fsizetd->td_ucred); + ap->a_outoffp, &len, ap->a_outcred); + if (error == EXDEV) + goto bad_locked_fallback; *ap->a_lenp = (size_t)len; - -unlock: +out_locked: if (invp != outvp) VOP_UNLOCK(invp); VOP_UNLOCK(outvp); if (mp != NULL) vn_finished_write(mp); + return (error); +bad_locked_fallback: + if (invp != outvp) + VOP_UNLOCK(invp); + VOP_UNLOCK(outvp); +bad_write_fallback: + if (mp != NULL) + vn_finished_write(mp); + error = vn_generic_copy_file_range(ap->a_invp, ap->a_inoffp, + ap->a_outvp, ap->a_outoffp, ap->a_lenp, ap->a_flags, + ap->a_incred, ap->a_outcred, ap->a_fsizetd); return (error); } From ff0e135e25fa6b523f93d8022304b1be60a67be7 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 5 Apr 2023 21:28:52 +0000 Subject: [PATCH 063/180] FreeBSD: try to fallback early if can't do optimized copy Not complete, but already shaves on some locking. 
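As a rough userland illustration of the gate added below (the pool name is
hypothetical), the optimized path depends on an ordinary pool feature that
can be inspected or enabled with zpool:

    zpool get feature@block_cloning tank
    zpool set feature@block_cloning=enabled tank

If the feature is not enabled on the destination pool, the new check sends
the request to vn_generic_copy_file_range() before any vnode locks are
taken.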
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Mateusz Guzik Sponsored by: Rubicon Communications, LLC ("Netgate") Closes #14723 --- module/os/freebsd/zfs/zfs_vnops_os.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/module/os/freebsd/zfs/zfs_vnops_os.c b/module/os/freebsd/zfs/zfs_vnops_os.c index 0ec4d40ce790..d29f00a0cbe4 100644 --- a/module/os/freebsd/zfs/zfs_vnops_os.c +++ b/module/os/freebsd/zfs/zfs_vnops_os.c @@ -6235,6 +6235,7 @@ struct vop_copy_file_range_args { static int zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) { + zfsvfs_t *outzfsvfs; struct vnode *invp = ap->a_invp; struct vnode *outvp = ap->a_outvp; struct mount *mp; @@ -6250,6 +6251,13 @@ zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap) */ vn_start_write(outvp, &mp, V_WAIT); + if (__predict_true(mp == outvp->v_mount)) { + outzfsvfs = (zfsvfs_t *)mp->mnt_data; + if (!spa_feature_is_enabled(dmu_objset_spa(outzfsvfs->z_os), + SPA_FEATURE_BLOCK_CLONING)) { + goto bad_write_fallback; + } + } if (invp == outvp) { if (vn_lock(outvp, LK_EXCLUSIVE) != 0) { goto bad_write_fallback; From 81a2b2e6a6d9c71d314588207de9775a5ea6f72f Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 25 Apr 2023 01:15:42 +0200 Subject: [PATCH 064/180] FreeBSD: add missing vop_fplookup assignments It became illegal to not have them as of 5f6df177758b9dff88e4b6069aeb2359e8b0c493 ("vfs: validate that vop vectors provide all or none fplookup vops") upstream. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Mateusz Guzik Closes #14788 --- module/os/freebsd/zfs/zfs_ctldir.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c index ca2f4419d1c4..d00efa44f2bc 100644 --- a/module/os/freebsd/zfs/zfs_ctldir.c +++ b/module/os/freebsd/zfs/zfs_ctldir.c @@ -801,6 +801,9 @@ static struct vop_vector zfsctl_ops_root = { .vop_default = &default_vnodeops, #if __FreeBSD_version >= 1300121 .vop_fplookup_vexec = VOP_EAGAIN, +#endif +#if __FreeBSD_version >= 1300139 + .vop_fplookup_symlink = VOP_EAGAIN, #endif .vop_open = zfsctl_common_open, .vop_close = zfsctl_common_close, @@ -1129,6 +1132,9 @@ static struct vop_vector zfsctl_ops_snapdir = { .vop_default = &default_vnodeops, #if __FreeBSD_version >= 1300121 .vop_fplookup_vexec = VOP_EAGAIN, +#endif +#if __FreeBSD_version >= 1300139 + .vop_fplookup_symlink = VOP_EAGAIN, #endif .vop_open = zfsctl_common_open, .vop_close = zfsctl_common_close, @@ -1236,6 +1242,9 @@ static struct vop_vector zfsctl_ops_snapshot = { .vop_default = NULL, /* ensure very restricted access */ #if __FreeBSD_version >= 1300121 .vop_fplookup_vexec = VOP_EAGAIN, +#endif +#if __FreeBSD_version >= 1300139 + .vop_fplookup_symlink = VOP_EAGAIN, #endif .vop_open = zfsctl_common_open, .vop_close = zfsctl_common_close, From 6b6aaf6dc2e65c63c74fbd7840c14627e9a91ce2 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Mon, 24 Apr 2023 19:55:07 -0400 Subject: [PATCH 065/180] Taught zdb -bb to print metadata totals People often want estimates of how much of their pool is occupied by metadata, but they end up using lots of text processing on zdb's output to get it. So let's just...provide it for them. Now, zdb -bbbs will output something like: Blocks LSIZE PSIZE ASIZE avg comp %Total Type [...] 
68 1.06M 272K 544K 8K 4.00 0.00 L6 Total 1.71K 212M 6.85M 13.7M 8K 30.91 0.00 L5 Total 1.71K 212M 6.85M 13.7M 8K 30.91 0.00 L4 Total 1.73K 214M 6.92M 13.8M 8K 30.89 0.00 L3 Total 18.7K 2.29G 111M 221M 11.8K 21.19 0.00 L2 Total 3.56M 454G 28.4G 56.9G 16.0K 15.97 0.19 L1 Total 308M 36.8T 28.2T 28.6T 95.1K 1.30 99.80 L0 Total 311M 37.3T 28.3T 28.6T 94.2K 1.32 100.00 Total 50.4M 774G 113G 291G 5.77K 6.85 0.99 Metadata Total Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #14746 --- cmd/zdb/zdb.c | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index c93ed4399afd..64ec3eb0028c 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -6812,12 +6812,15 @@ dump_block_stats(spa_t *spa) if (dump_opt['b'] >= 2) { int l, t, level; + char csize[32], lsize[32], psize[32], asize[32]; + char avg[32], gang[32]; (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" "\t avg\t comp\t%%Total\tType\n"); + zfs_blkstat_t *mdstats = umem_zalloc(sizeof (zfs_blkstat_t), + UMEM_NOFAIL); + for (t = 0; t <= ZDB_OT_TOTAL; t++) { - char csize[32], lsize[32], psize[32], asize[32]; - char avg[32], gang[32]; const char *typename; /* make sure nicenum has enough space */ @@ -6860,6 +6863,15 @@ dump_block_stats(spa_t *spa) if (zb->zb_asize == 0) continue; + if (level != ZB_TOTAL && t < DMU_OT_NUMTYPES && + (level > 0 || DMU_OT_IS_METADATA(t))) { + mdstats->zb_count += zb->zb_count; + mdstats->zb_lsize += zb->zb_lsize; + mdstats->zb_psize += zb->zb_psize; + mdstats->zb_asize += zb->zb_asize; + mdstats->zb_gangs += zb->zb_gangs; + } + if (dump_opt['b'] < 3 && level != ZB_TOTAL) continue; @@ -6905,6 +6917,24 @@ dump_block_stats(spa_t *spa) } } } + zdb_nicenum(mdstats->zb_count, csize, + sizeof (csize)); + zdb_nicenum(mdstats->zb_lsize, lsize, + sizeof (lsize)); + zdb_nicenum(mdstats->zb_psize, psize, + sizeof (psize)); + zdb_nicenum(mdstats->zb_asize, asize, + sizeof (asize)); + zdb_nicenum(mdstats->zb_asize / mdstats->zb_count, avg, + sizeof (avg)); + zdb_nicenum(mdstats->zb_gangs, gang, sizeof (gang)); + + (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" + "\t%5.2f\t%6.2f\t", + csize, lsize, psize, asize, avg, + (double)mdstats->zb_lsize / mdstats->zb_psize, + 100.0 * mdstats->zb_asize / tzb->zb_asize); + (void) printf("%s\n", "Metadata Total"); /* Output a table summarizing block sizes in the pool */ if (dump_opt['b'] >= 2) { From 6d59d5df9808902a3cb6064605c753ec2ab8d2d7 Mon Sep 17 00:00:00 2001 From: Han Gao Date: Wed, 26 Apr 2023 07:05:45 +0800 Subject: [PATCH 066/180] Add loongarch64 support Add loongarch64 definitions & lua module setjmp asm LoongArch is a new RISC ISA, which is a bit like MIPS or RISC-V. 
Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Signed-off-by: Han Gao Signed-off-by: WANG Xuerui Closes #13422 --- include/os/linux/spl/sys/isa_defs.h | 18 +++++- lib/libspl/include/sys/isa_defs.h | 18 +++++- module/lua/ldo.c | 2 + module/lua/setjmp/setjmp.S | 2 + module/lua/setjmp/setjmp_loongarch64.S | 82 ++++++++++++++++++++++++++ 5 files changed, 120 insertions(+), 2 deletions(-) create mode 100644 module/lua/setjmp/setjmp_loongarch64.S diff --git a/include/os/linux/spl/sys/isa_defs.h b/include/os/linux/spl/sys/isa_defs.h index 5801ec92bc2b..7c95c94c1cf1 100644 --- a/include/os/linux/spl/sys/isa_defs.h +++ b/include/os/linux/spl/sys/isa_defs.h @@ -195,10 +195,26 @@ #define _SUNOS_VTOC_16 +/* + * LoongArch arch specific defines + * only LoongArch64 is supported yet + */ +#elif defined(__loongarch__) && defined(__loongarch_lp64) + +#if !defined(_LP64) +#define _LP64 +#endif + +#define _ZFS_LITTLE_ENDIAN +#define _SUNOS_VTOC_16 + +/* not all LoongArch cores support unaligned accesses in hardware */ +#define _ALIGNMENT_REQUIRED 1 + #else /* * Currently supported: - * x86_64, x32, i386, arm, powerpc, s390, sparc, mips, and RV64G + * x86_64, x32, i386, arm, powerpc, s390, sparc, mips, RV64G, and LoongArch64 */ #error "Unsupported ISA type" #endif diff --git a/lib/libspl/include/sys/isa_defs.h b/lib/libspl/include/sys/isa_defs.h index 114cca4f1545..302f31e989cb 100644 --- a/lib/libspl/include/sys/isa_defs.h +++ b/lib/libspl/include/sys/isa_defs.h @@ -246,10 +246,26 @@ extern "C" { #define _SUNOS_VTOC_16 +/* + * LoongArch arch specific defines + * only LoongArch64 is supported yet + */ +#elif defined(__loongarch__) && defined(__loongarch_lp64) + +#if !defined(_LP64) +#define _LP64 +#endif + +#define _ZFS_LITTLE_ENDIAN +#define _SUNOS_VTOC_16 + +/* not all LoongArch cores support unaligned accesses in hardware */ +#define _ALIGNMENT_REQUIRED 1 + #else /* * Currently supported: - * x86_64, x32, i386, arm, powerpc, s390, sparc, mips, and RV64G + * x86_64, x32, i386, arm, powerpc, s390, sparc, mips, RV64G, and LoongArch64 */ #error "Unsupported ISA type" #endif diff --git a/module/lua/ldo.c b/module/lua/ldo.c index bf525588e260..38bd4e08a73d 100644 --- a/module/lua/ldo.c +++ b/module/lua/ldo.c @@ -84,6 +84,8 @@ static intptr_t stack_remaining(void) { #define JMP_BUF_CNT 18 #elif defined(__riscv) #define JMP_BUF_CNT 64 +#elif defined(__loongarch_lp64) +#define JMP_BUF_CNT 64 #else #define JMP_BUF_CNT 1 #endif diff --git a/module/lua/setjmp/setjmp.S b/module/lua/setjmp/setjmp.S index 1f461a0a4ef3..6f03eea92711 100644 --- a/module/lua/setjmp/setjmp.S +++ b/module/lua/setjmp/setjmp.S @@ -16,4 +16,6 @@ #include "setjmp_s390x.S" #elif defined(__riscv) #include "setjmp_rv64g.S" +#elif defined(__loongarch_lp64) +#include "setjmp_loongarch64.S" #endif diff --git a/module/lua/setjmp/setjmp_loongarch64.S b/module/lua/setjmp/setjmp_loongarch64.S new file mode 100644 index 000000000000..216b829ff236 --- /dev/null +++ b/module/lua/setjmp/setjmp_loongarch64.S @@ -0,0 +1,82 @@ +/*- + * Copyright 2022 Han Gao + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if __loongarch_lp64 + +#define ENTRY(symbol) \ + .text; \ + .globl symbol; \ + .align 3; \ + .type symbol, @function; \ + symbol: + +#define END(function) \ + .size function, .- function; + +ENTRY(setjmp) + st.d $ra, $a0, 0*8 + st.d $sp, $a0, 1*8 + st.d $r21, $a0, 2*8 + st.d $fp, $a0, 3*8 + st.d $s0, $a0, 4*8 + st.d $s1, $a0, 5*8 + st.d $s2, $a0, 6*8 + st.d $s3, $a0, 7*8 + st.d $s4, $a0, 8*8 + st.d $s5, $a0, 9*8 + st.d $s6, $a0, 10*8 + st.d $s7, $a0, 11*8 + st.d $s8, $a0, 12*8 + + li.w $a0, 0 + jr $ra +END(setjmp) + +ENTRY(longjmp) + ld.d $ra, $a0, 0*8 + ld.d $sp, $a0, 1*8 + ld.d $r21, $a0, 2*8 + ld.d $fp, $a0, 3*8 + ld.d $s0, $a0, 4*8 + ld.d $s1, $a0, 5*8 + ld.d $s2, $a0, 6*8 + ld.d $s3, $a0, 7*8 + ld.d $s4, $a0, 8*8 + ld.d $s5, $a0, 9*8 + ld.d $s6, $a0, 10*8 + ld.d $s7, $a0, 11*8 + ld.d $s8, $a0, 12*8 + + sltui $a0, $a1, 1 + add.d $a0, $a0, $a1 // a0 = (a1 == 0) ? 1 : a1 + jr $ra +END(longjmp) + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif + +#endif From 0e8a42bbee2c29f1a2c49fdbda403e839af4b38d Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 25 Apr 2023 16:40:55 -0700 Subject: [PATCH 067/180] Revert "Fix data race between zil_commit() and zil_suspend()" This reverts commit 4c856fb333ac57d9b4a6ddd44407fd022a702f00 to resolve a newly introduced deadlock which in practice in more disruptive that the issue this commit intended to address. 
Reviewed-by: Richard Yao Reviewed-by: Mark Maybee Signed-off-by: Brian Behlendorf Issue #14775 Closes #14790 --- include/sys/zil_impl.h | 1 - module/zfs/zil.c | 27 --------------------------- 2 files changed, 28 deletions(-) diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index f44a01afee5c..bb85bf6d1eb1 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -183,7 +183,6 @@ struct zilog { uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */ uint64_t zl_replaying_seq; /* current replay seq number */ - krwlock_t zl_suspend_lock; /* protects suspend count */ uint32_t zl_suspend; /* log suspend count */ kcondvar_t zl_cv_suspend; /* log suspend completion */ uint8_t zl_suspending; /* log is currently suspending */ diff --git a/module/zfs/zil.c b/module/zfs/zil.c index eb26e4b32998..d1631c2ac9db 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -3317,21 +3317,6 @@ zil_commit(zilog_t *zilog, uint64_t foid) return; } - /* - * The ->zl_suspend_lock rwlock ensures that all in-flight - * zil_commit() operations finish before suspension begins and that - * no more begin. Without it, it is possible for the scheduler to - * preempt us right after the zilog->zl_suspend suspend check, run - * another thread that runs zil_suspend() and after the other thread - * has finished its call to zil_commit_impl(), resume this thread while - * zil is suspended. This can trigger an assertion failure in - * VERIFY(list_is_empty(&lwb->lwb_itxs)). If it is held, it means that - * `zil_suspend()` is executing in another thread, so we go to - * txg_wait_synced(). - */ - if (!rw_tryenter(&zilog->zl_suspend_lock, RW_READER)) - goto wait; - /* * If the ZIL is suspended, we don't want to dirty it by calling * zil_commit_itx_assign() below, nor can we write out @@ -3340,14 +3325,11 @@ zil_commit(zilog_t *zilog, uint64_t foid) * semantics, and avoid calling those functions altogether. 
*/ if (zilog->zl_suspend > 0) { - rw_exit(&zilog->zl_suspend_lock); -wait: txg_wait_synced(zilog->zl_dmu_pool, 0); return; } zil_commit_impl(zilog, foid); - rw_exit(&zilog->zl_suspend_lock); } void @@ -3612,8 +3594,6 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_lwb_io_cv, NULL, CV_DEFAULT, NULL); - rw_init(&zilog->zl_suspend_lock, NULL, RW_DEFAULT, NULL); - return (zilog); } @@ -3653,8 +3633,6 @@ zil_free(zilog_t *zilog) cv_destroy(&zilog->zl_cv_suspend); cv_destroy(&zilog->zl_lwb_io_cv); - rw_destroy(&zilog->zl_suspend_lock); - kmem_free(zilog, sizeof (zilog_t)); } @@ -3782,14 +3760,11 @@ zil_suspend(const char *osname, void **cookiep) return (error); zilog = dmu_objset_zil(os); - rw_enter(&zilog->zl_suspend_lock, RW_WRITER); - mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); - rw_exit(&zilog->zl_suspend_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } @@ -3803,7 +3778,6 @@ zil_suspend(const char *osname, void **cookiep) if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); - rw_exit(&zilog->zl_suspend_lock); dmu_objset_rele(os, suspend_tag); return (0); } @@ -3812,7 +3786,6 @@ zil_suspend(const char *osname, void **cookiep) dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; - rw_exit(&zilog->zl_suspend_lock); if (zilog->zl_suspend > 1) { /* From d960beca61b3c5ede8eb595ceb091666e0d4a17c Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 26 Apr 2023 08:43:39 -0700 Subject: [PATCH 068/180] zdb: Fix minor memory leak Commit 6b6aaf6dc2e65c63c74fbd7840c14627e9a91ce2 introduced a small memory leak in zdb. This was detected by the LeakSanitizer and was causing all ztest runs to fail. Reviewed-by: Igor Kozhukhov Reviewed-by: Rich Ercolani Signed-off-by: Brian Behlendorf Closes #14796 --- cmd/zdb/zdb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 64ec3eb0028c..ec5d1acacf85 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -6940,6 +6940,8 @@ dump_block_stats(spa_t *spa) if (dump_opt['b'] >= 2) { dump_size_histograms(zcb); } + + umem_free(mdstats, sizeof (zfs_blkstat_t)); } (void) printf("\n"); From b69cb06664ad74f5907f3a6675657b7d4b3daa44 Mon Sep 17 00:00:00 2001 From: Rob N Date: Thu, 27 Apr 2023 01:50:44 +1000 Subject: [PATCH 069/180] tests/zdb_encrypted: parse numbers a little more robustly On FreeBSD, `wc` prints some leading spaces, while on Linux it does not. So we tell ksh to expect an integer, and it does the rest. 
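A minimal sketch of the behaviour in question; the file name and byte count
are made up, only the whitespace handling matters:

    size="$(wc -c < somefile)"           FreeBSD: "      42", Linux: "42"
    typeset -i size=$(wc -c < somefile)  ksh evaluates the value as an
                                         integer, so both give 42

With the plain string form the leading spaces stay embedded in $size and leak
into any later string comparison; with the integer form the variable is
identical on both platforms.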
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #14791 Closes #14797 --- .../zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted.ksh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted.ksh index 4572f64947a1..0218c2ea1033 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_encrypted.ksh @@ -55,8 +55,8 @@ log_must eval "echo $PASSPHRASE | zfs create -o mountpoint=$TESTDIR2" \ echo 'my great encrypted text' > $file -obj="$(ls -i $file | cut -d' ' -f1)" -size="$(wc -c < $file)" +typeset -i obj=$(ls -i $file | cut -d' ' -f1) +typeset -i size=$(wc -c < $file) log_note "test file $file is objid $obj, size $size" From 88b8777159f69237ca59b31e8c246c567e836c68 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 20 Apr 2023 08:59:38 +0000 Subject: [PATCH 070/180] FreeBSD: add missing vn state transition for .zfs Signed-off-by: Mateusz Guzik Closes #14774 --- module/os/freebsd/zfs/zfs_ctldir.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c index d00efa44f2bc..8ba5e0b242a2 100644 --- a/module/os/freebsd/zfs/zfs_ctldir.c +++ b/module/os/freebsd/zfs/zfs_ctldir.c @@ -204,6 +204,10 @@ sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id, return (error); } +#if __FreeBSD_version >= 1400077 + vn_set_state(vp, VSTATE_CONSTRUCTED); +#endif + *vpp = vp; return (0); } From e37a89d5d0ba07da09998de6701a6c0ec43903b7 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Thu, 20 Apr 2023 09:00:03 +0000 Subject: [PATCH 071/180] FreeBSD: fix up EINVAL from getdirentries on .zfs Without the change: /.zfs /.zfs/snapshot find: /.zfs: Invalid argument Signed-off-by: Mateusz Guzik Closes #14774 --- module/os/freebsd/zfs/zfs_ctldir.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c index 8ba5e0b242a2..420d887b661e 100644 --- a/module/os/freebsd/zfs/zfs_ctldir.c +++ b/module/os/freebsd/zfs/zfs_ctldir.c @@ -679,6 +679,17 @@ zfsctl_root_readdir(struct vop_readdir_args *ap) ASSERT3S(vp->v_type, ==, VDIR); + /* + * FIXME: this routine only ever emits 3 entries and does not tolerate + * being called with a buffer too small to handle all of them. + * + * The check below facilitates the idiom of repeating calls until the + * count to return is 0. + */ + if (zfs_uio_offset(&uio) == 3 * sizeof(entry)) { + return (0); + } + error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, &uio, &dots_offset); if (error != 0) { From bba7cbf0a481ab16f9a9a4874b7dbd5682e4d3a4 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 26 Apr 2023 12:20:43 -0400 Subject: [PATCH 072/180] Fix positive ABD size assertion in abd_verify(). Gang ABDs without childred are legal, and they do have zero size. For other ABD types zero size doesn't have much sense and likely not working correctly now. Reviewed-by: Igor Kozhukhov Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #14795 --- module/zfs/abd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/zfs/abd.c b/module/zfs/abd.c index d4921d0ba7db..26222d2efe3f 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -109,7 +109,6 @@ void abd_verify(abd_t *abd) { #ifdef ZFS_DEBUG - ASSERT3U(abd->abd_size, >, 0); ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | @@ -118,6 +117,7 @@ abd_verify(abd_t *abd) IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); if (abd_is_linear(abd)) { + ASSERT3U(abd->abd_size, >, 0); ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL); } else if (abd_is_gang(abd)) { uint_t child_sizes = 0; @@ -130,6 +130,7 @@ abd_verify(abd_t *abd) } ASSERT3U(abd->abd_size, ==, child_sizes); } else { + ASSERT3U(abd->abd_size, >, 0); abd_verify_scatter(abd); } #endif From b5411618f727c4ce5f787bb97d1c87f20c66027a Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 26 Apr 2023 11:49:16 -0700 Subject: [PATCH 073/180] Fix checkstyle warning Resolve a missed checkstyle warning. Reviewed-by: Alexander Motin Reviewed-by: Mateusz Guzik Reviewed-by: George Melikov Signed-off-by: Brian Behlendorf Closes #14799 --- module/os/freebsd/zfs/zfs_ctldir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c index 420d887b661e..28445a18b809 100644 --- a/module/os/freebsd/zfs/zfs_ctldir.c +++ b/module/os/freebsd/zfs/zfs_ctldir.c @@ -686,7 +686,7 @@ zfsctl_root_readdir(struct vop_readdir_args *ap) * The check below facilitates the idiom of repeating calls until the * count to return is 0. */ - if (zfs_uio_offset(&uio) == 3 * sizeof(entry)) { + if (zfs_uio_offset(&uio) == 3 * sizeof (entry)) { return (0); } From ee728008a4279dbbbe5332f8b9a886f3b8d91e00 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Wed, 26 Apr 2023 21:40:26 +0200 Subject: [PATCH 074/180] Fix BLAKE3 aarch64 assembly for FreeBSD and macOS The x18 register isn't useable within FreeBSD kernel space, so we have to fix the BLAKE3 aarch64 assembly for not using it. 
The source files are here: https://github.com/mcmilk/BLAKE3-tests Reviewed-by: Kyle Evans Signed-off-by: Tino Reichardt Closes #14728 --- .../icp/asm-aarch64/blake3/b3_aarch64_sse2.S | 4163 +++++++-------- .../icp/asm-aarch64/blake3/b3_aarch64_sse41.S | 4447 ++++++++--------- 2 files changed, 4078 insertions(+), 4532 deletions(-) diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S index 8237f0eb5a4e..dc2719d142db 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S @@ -22,480 +22,61 @@ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale - * Copyright (c) 2022 Tino Reichardt + * Copyright (c) 2022-2023 Tino Reichardt * * This is converted assembly: SSE2 -> ARMv8-A * Used tools: SIMDe https://github.com/simd-everywhere/simde + * + * Should work on FreeBSD, Linux and macOS + * see: https://github.com/mcmilk/BLAKE3-tests/blob/master/contrib/simde.sh */ #if defined(__aarch64__) .text - .section .rodata.cst16,"aM",@progbits,16 - .p2align 4 -.LCPI0_0: - .word 1779033703 - .word 3144134277 - .word 1013904242 - .word 2773480762 -.LCPI0_1: - .xword 0 - .xword -4294967296 -.LCPI0_2: - .xword -1 - .xword 4294967295 + .section .note.gnu.property,"a",@note + .p2align 3 + .word 4 + .word 16 + .word 5 + .asciz "GNU" + .word 3221225472 + .word 4 + .word 3 + .word 0 +.Lsec_end0: .text .globl zfs_blake3_compress_in_place_sse2 .p2align 2 .type zfs_blake3_compress_in_place_sse2,@function zfs_blake3_compress_in_place_sse2: .cfi_startproc - ldp q3, q2, [x0] - ldp q5, q6, [x1] - add x10, x1, #32 - lsr x11, x3, #32 - fmov s4, w3 - ld2 { v17.4s, v18.4s }, [x10] - adrp x10, .LCPI0_2 - and w8, w2, #0xff - mov v4.s[1], w11 - ldr q1, [x10, :lo12:.LCPI0_2] - and w9, w4, #0xff - adrp x12, .LCPI0_0 - mov v4.s[2], w8 - uzp1 v19.4s, v5.4s, v6.4s - add v3.4s, v2.4s, v3.4s - ldr q7, [x12, :lo12:.LCPI0_0] - mov v4.s[3], w9 - add v3.4s, v3.4s, v19.4s - uzp2 v5.4s, v5.4s, v6.4s - ext v21.16b, v18.16b, v18.16b, #12 - uzp1 v6.4s, v19.4s, v19.4s - ext v22.16b, v19.16b, v19.16b, #12 - eor v4.16b, v3.16b, v4.16b - ext v20.16b, v17.16b, v17.16b, #12 - ext v6.16b, v6.16b, v19.16b, #8 - ext v19.16b, v19.16b, v22.16b, #12 - zip1 v22.2d, v21.2d, v5.2d - rev32 v24.8h, v4.8h - mov v4.16b, v1.16b - zip2 v23.4s, v5.4s, v21.4s - uzp2 v6.4s, v6.4s, v5.4s - bsl v4.16b, v22.16b, v20.16b - add v3.4s, v3.4s, v5.4s - zip1 v5.4s, v23.4s, v20.4s - zip1 v22.4s, v20.4s, v23.4s - add v23.4s, v24.4s, v7.4s - ext v7.16b, v6.16b, v6.16b, #4 - ext v25.16b, v4.16b, v4.16b, #12 - ext v5.16b, v22.16b, v5.16b, #8 - eor v2.16b, v23.16b, v2.16b - uzp1 v4.4s, v4.4s, v25.4s - uzp1 v22.4s, v7.4s, v7.4s - ext v25.16b, v7.16b, v7.16b, #12 - ext v22.16b, v22.16b, v7.16b, #8 - ext v7.16b, v7.16b, v25.16b, #12 - ushr v25.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - orr v2.16b, v2.16b, v25.16b - add v3.4s, v3.4s, v2.4s - eor v24.16b, v3.16b, v24.16b - add v3.4s, v3.4s, v17.4s - ushr v17.4s, v24.4s, #8 - shl v18.4s, v24.4s, #24 - orr v17.16b, v18.16b, v17.16b - add v18.4s, v17.4s, v23.4s - eor v2.16b, v18.16b, v2.16b - ushr v23.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - ext v3.16b, v3.16b, v3.16b, #12 - orr v2.16b, v2.16b, v23.16b - ext v17.16b, v17.16b, v17.16b, #8 - add v3.4s, v2.4s, v3.4s - adrp x11, .LCPI0_1 - eor v17.16b, v3.16b, v17.16b - ldr q16, [x11, :lo12:.LCPI0_1] - ext v18.16b, v18.16b, v18.16b, #4 - rev32 v24.8h, v17.8h - movi v0.2d, #0xffffffff00000000 - add v23.4s, v3.4s, 
v21.4s - mov v21.s[1], v20.s[2] - add v20.4s, v18.4s, v24.4s - bit v19.16b, v21.16b, v0.16b - eor v3.16b, v20.16b, v2.16b - uzp2 v2.4s, v22.4s, v19.4s - zip1 v17.2d, v5.2d, v19.2d - zip2 v18.4s, v19.4s, v5.4s - ushr v21.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - ext v22.16b, v2.16b, v2.16b, #4 - bsl v16.16b, v4.16b, v17.16b - zip1 v17.4s, v18.4s, v4.4s - zip1 v18.4s, v4.4s, v18.4s - orr v21.16b, v3.16b, v21.16b - ext v25.16b, v16.16b, v16.16b, #12 - ext v3.16b, v18.16b, v17.16b, #8 - uzp1 v18.4s, v22.4s, v22.4s - ext v26.16b, v22.16b, v22.16b, #12 - add v23.4s, v23.4s, v21.4s - uzp1 v17.4s, v16.4s, v25.4s - ext v16.16b, v18.16b, v22.16b, #8 - ext v18.16b, v22.16b, v26.16b, #12 - eor v22.16b, v23.16b, v24.16b - add v6.4s, v23.4s, v6.4s - ushr v23.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - orr v22.16b, v22.16b, v23.16b - add v20.4s, v22.4s, v20.4s - eor v21.16b, v20.16b, v21.16b - ushr v23.4s, v21.4s, #7 - shl v21.4s, v21.4s, #25 - ext v6.16b, v6.16b, v6.16b, #4 - orr v21.16b, v21.16b, v23.16b - ext v22.16b, v22.16b, v22.16b, #8 - add v6.4s, v21.4s, v6.4s - eor v22.16b, v6.16b, v22.16b - ext v20.16b, v20.16b, v20.16b, #12 - add v6.4s, v6.4s, v19.4s - rev32 v19.8h, v22.8h - add v20.4s, v20.4s, v19.4s - eor v21.16b, v20.16b, v21.16b - ushr v22.4s, v21.4s, #12 - shl v21.4s, v21.4s, #20 - orr v21.16b, v21.16b, v22.16b - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - ushr v22.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v19.16b, v19.16b, v22.16b - add v20.4s, v19.4s, v20.4s - eor v21.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #12 - ushr v22.4s, v21.4s, #7 - shl v21.4s, v21.4s, #25 - add v6.4s, v6.4s, v4.4s - orr v21.16b, v21.16b, v22.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - ext v20.16b, v20.16b, v20.16b, #4 - rev32 v19.8h, v19.8h - add v20.4s, v20.4s, v19.4s - add v6.4s, v6.4s, v5.4s - mov v5.s[1], v4.s[2] - eor v4.16b, v20.16b, v21.16b - ushr v21.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - orr v21.16b, v4.16b, v21.16b - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - add v2.4s, v6.4s, v2.4s - ushr v6.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v6.16b, v19.16b, v6.16b - add v19.4s, v6.4s, v20.4s - eor v20.16b, v19.16b, v21.16b - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v20.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v2.4s, v20.4s, v2.4s - eor v6.16b, v2.16b, v6.16b - ext v19.16b, v19.16b, v19.16b, #12 - rev32 v6.8h, v6.8h - add v19.4s, v19.4s, v6.4s - mov v22.16b, v0.16b - eor v20.16b, v19.16b, v20.16b - bsl v22.16b, v5.16b, v7.16b - ushr v21.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - add v2.4s, v2.4s, v22.4s - orr v20.16b, v20.16b, v21.16b - add v2.4s, v2.4s, v20.4s - eor v6.16b, v2.16b, v6.16b - ushr v21.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - orr v6.16b, v6.16b, v21.16b - add v19.4s, v6.4s, v19.4s - eor v20.16b, v19.16b, v20.16b - ext v2.16b, v2.16b, v2.16b, #12 - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - add v2.4s, v2.4s, v17.4s - orr v20.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v2.4s, v2.4s, v20.4s - eor v6.16b, v2.16b, v6.16b - uzp2 v5.4s, v16.4s, v22.4s - zip1 v7.2d, v3.2d, v22.2d - zip2 v16.4s, v22.4s, v3.4s - ext v19.16b, v19.16b, v19.16b, #4 - rev32 v22.8h, v6.8h - ext v23.16b, v5.16b, v5.16b, #4 - bif v7.16b, v17.16b, v1.16b - zip1 v24.4s, v16.4s, v17.4s - zip1 v16.4s, v17.4s, v16.4s - add v21.4s, v2.4s, v3.4s - mov v3.s[1], v17.s[2] - add v17.4s, v19.4s, v22.4s - mov v19.16b, v0.16b - ext v25.16b, 
v7.16b, v7.16b, #12 - ext v4.16b, v16.16b, v24.16b, #8 - uzp1 v16.4s, v23.4s, v23.4s - bsl v19.16b, v3.16b, v18.16b - eor v2.16b, v17.16b, v20.16b - uzp1 v7.4s, v7.4s, v25.4s - ext v25.16b, v16.16b, v23.16b, #8 - zip1 v3.2d, v4.2d, v19.2d - ushr v20.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - ext v24.16b, v23.16b, v23.16b, #12 - uzp2 v6.4s, v25.4s, v19.4s - zip2 v18.4s, v19.4s, v4.4s - bif v3.16b, v7.16b, v1.16b - orr v20.16b, v2.16b, v20.16b - ext v16.16b, v23.16b, v24.16b, #12 - ext v23.16b, v6.16b, v6.16b, #4 - zip1 v24.4s, v18.4s, v7.4s - zip1 v18.4s, v7.4s, v18.4s - ext v25.16b, v3.16b, v3.16b, #12 - add v21.4s, v21.4s, v20.4s - ext v2.16b, v18.16b, v24.16b, #8 - uzp1 v18.4s, v23.4s, v23.4s - ext v24.16b, v23.16b, v23.16b, #12 - uzp1 v3.4s, v3.4s, v25.4s - eor v22.16b, v21.16b, v22.16b - ext v25.16b, v18.16b, v23.16b, #8 - dup v18.4s, v2.s[3] - ext v23.16b, v23.16b, v24.16b, #12 - add v5.4s, v21.4s, v5.4s - trn1 v21.4s, v3.4s, v3.4s - ushr v24.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - ext v18.16b, v21.16b, v18.16b, #8 - orr v21.16b, v22.16b, v24.16b - add v17.4s, v21.4s, v17.4s - eor v20.16b, v17.16b, v20.16b - ushr v22.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - ext v5.16b, v5.16b, v5.16b, #4 - orr v20.16b, v20.16b, v22.16b - ext v21.16b, v21.16b, v21.16b, #8 - add v5.4s, v20.4s, v5.4s - eor v21.16b, v5.16b, v21.16b - ext v17.16b, v17.16b, v17.16b, #12 - add v5.4s, v5.4s, v19.4s - rev32 v19.8h, v21.8h - add v17.4s, v17.4s, v19.4s - eor v20.16b, v17.16b, v20.16b - ushr v21.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - orr v20.16b, v20.16b, v21.16b - add v5.4s, v5.4s, v20.4s - eor v19.16b, v5.16b, v19.16b - ushr v21.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v19.16b, v19.16b, v21.16b - add v17.4s, v19.4s, v17.4s - eor v20.16b, v17.16b, v20.16b - ext v5.16b, v5.16b, v5.16b, #12 - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - add v5.4s, v5.4s, v7.4s - orr v20.16b, v20.16b, v21.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v5.4s, v5.4s, v20.4s - eor v19.16b, v5.16b, v19.16b - ext v17.16b, v17.16b, v17.16b, #4 - rev32 v22.8h, v19.8h - add v21.4s, v5.4s, v4.4s - mov v4.s[1], v7.s[2] - add v19.4s, v17.4s, v22.4s - bit v16.16b, v4.16b, v0.16b - eor v5.16b, v19.16b, v20.16b - uzp2 v4.4s, v25.4s, v16.4s - zip1 v7.2d, v2.2d, v16.2d - zip2 v17.4s, v16.4s, v2.4s - ushr v20.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - ext v24.16b, v4.16b, v4.16b, #4 - bif v7.16b, v3.16b, v1.16b - zip1 v25.4s, v17.4s, v3.4s - zip1 v17.4s, v3.4s, v17.4s - orr v20.16b, v5.16b, v20.16b - ext v26.16b, v7.16b, v7.16b, #12 - ext v5.16b, v17.16b, v25.16b, #8 - uzp1 v17.4s, v24.4s, v24.4s - ext v25.16b, v24.16b, v24.16b, #12 - bit v23.16b, v18.16b, v0.16b - add v21.4s, v21.4s, v20.4s - uzp1 v7.4s, v7.4s, v26.4s - ext v26.16b, v17.16b, v24.16b, #8 - ext v17.16b, v24.16b, v25.16b, #12 - eor v22.16b, v21.16b, v22.16b - add v6.4s, v21.4s, v6.4s - zip1 v21.2d, v5.2d, v23.2d - zip2 v24.4s, v23.4s, v5.4s - bif v21.16b, v7.16b, v1.16b - zip1 v1.4s, v24.4s, v7.4s - zip1 v24.4s, v7.4s, v24.4s - ext v1.16b, v24.16b, v1.16b, #8 - ushr v24.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - orr v22.16b, v22.16b, v24.16b - add v19.4s, v22.4s, v19.4s - ext v24.16b, v21.16b, v21.16b, #12 - eor v20.16b, v19.16b, v20.16b - uzp1 v21.4s, v21.4s, v24.4s - ushr v24.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - orr v20.16b, v20.16b, v24.16b - ext v6.16b, v6.16b, v6.16b, #4 - ext v22.16b, v22.16b, v22.16b, #8 - add v6.4s, v20.4s, v6.4s - eor v22.16b, v6.16b, v22.16b - ext v19.16b, v19.16b, v19.16b, #12 - add v6.4s, v6.4s, v16.4s - rev32 v16.8h, v22.8h - add 
v19.4s, v19.4s, v16.4s - eor v20.16b, v19.16b, v20.16b - ushr v22.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - orr v20.16b, v20.16b, v22.16b - add v6.4s, v6.4s, v20.4s - eor v16.16b, v6.16b, v16.16b - ext v6.16b, v6.16b, v6.16b, #12 - add v3.4s, v6.4s, v3.4s - ushr v6.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - orr v6.16b, v16.16b, v6.16b - add v16.4s, v6.4s, v19.4s - eor v19.16b, v16.16b, v20.16b - ushr v20.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - orr v19.16b, v19.16b, v20.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v3.4s, v3.4s, v19.4s - eor v6.16b, v3.16b, v6.16b - ext v16.16b, v16.16b, v16.16b, #4 - add v2.4s, v3.4s, v2.4s - rev32 v3.8h, v6.8h - add v6.4s, v16.4s, v3.4s - eor v16.16b, v6.16b, v19.16b - ushr v19.4s, v16.4s, #12 - shl v16.4s, v16.4s, #20 - orr v16.16b, v16.16b, v19.16b - add v2.4s, v2.4s, v16.4s - eor v3.16b, v2.16b, v3.16b - add v2.4s, v2.4s, v4.4s - ushr v4.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v4.16b - add v4.4s, v3.4s, v6.4s - eor v6.16b, v4.16b, v16.16b - ushr v16.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v6.16b, v6.16b, v16.16b - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v6.4s, v2.4s - eor v3.16b, v2.16b, v3.16b - ext v4.16b, v4.16b, v4.16b, #12 - rev32 v3.8h, v3.8h - add v4.4s, v4.4s, v3.4s - eor v6.16b, v4.16b, v6.16b - ushr v16.4s, v6.4s, #12 - shl v6.4s, v6.4s, #20 - add v2.4s, v2.4s, v23.4s - orr v6.16b, v6.16b, v16.16b - add v2.4s, v2.4s, v6.4s - eor v3.16b, v2.16b, v3.16b - ushr v16.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v16.16b - add v4.4s, v3.4s, v4.4s - eor v6.16b, v4.16b, v6.16b - ext v2.16b, v2.16b, v2.16b, #12 - ushr v16.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - add v2.4s, v2.4s, v7.4s - orr v6.16b, v6.16b, v16.16b - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v2.4s, v6.4s - eor v3.16b, v2.16b, v3.16b - ext v4.16b, v4.16b, v4.16b, #4 - rev32 v3.8h, v3.8h - add v2.4s, v2.4s, v5.4s - mov v5.s[1], v7.s[2] - add v4.4s, v4.4s, v3.4s - bsl v0.16b, v5.16b, v17.16b - eor v5.16b, v4.16b, v6.16b - ushr v6.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - orr v5.16b, v5.16b, v6.16b - add v2.4s, v2.4s, v5.4s - eor v3.16b, v2.16b, v3.16b - ushr v6.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v6.16b - add v4.4s, v3.4s, v4.4s - uzp2 v18.4s, v26.4s, v18.4s - eor v5.16b, v4.16b, v5.16b - add v2.4s, v2.4s, v18.4s - ushr v6.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v5.16b, v5.16b, v6.16b - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v5.4s, v2.4s - eor v3.16b, v2.16b, v3.16b - ext v4.16b, v4.16b, v4.16b, #12 - add v0.4s, v2.4s, v0.4s - rev32 v2.8h, v3.8h - add v3.4s, v4.4s, v2.4s - eor v4.16b, v3.16b, v5.16b - ushr v5.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - orr v4.16b, v4.16b, v5.16b - add v0.4s, v0.4s, v4.4s - eor v2.16b, v0.16b, v2.16b - ushr v5.4s, v2.4s, #8 - shl v2.4s, v2.4s, #24 - orr v2.16b, v2.16b, v5.16b - add v3.4s, v2.4s, v3.4s - eor v4.16b, v3.16b, v4.16b - ext v0.16b, v0.16b, v0.16b, #12 - ushr v5.4s, v4.4s, #7 - shl v4.4s, v4.4s, #25 - add v0.4s, v0.4s, v21.4s - orr v4.16b, v4.16b, v5.16b - ext v2.16b, v2.16b, v2.16b, #8 - add v0.4s, v0.4s, v4.4s - eor v2.16b, v0.16b, v2.16b - ext v3.16b, v3.16b, v3.16b, #4 - add v0.4s, v0.4s, v1.4s - rev32 v1.8h, v2.8h - add v2.4s, v3.4s, v1.4s - eor v3.16b, v2.16b, v4.16b - ushr v4.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - orr v3.16b, v3.16b, v4.16b - add v0.4s, v0.4s, v3.4s - eor v1.16b, v0.16b, v1.16b - ushr v4.4s, v1.4s, #8 - shl v1.4s, v1.4s, #24 - orr v1.16b, v1.16b, v4.16b - add v2.4s, v1.4s, v2.4s - 
eor v3.16b, v2.16b, v3.16b - ext v0.16b, v0.16b, v0.16b, #4 - ext v2.16b, v2.16b, v2.16b, #12 - ushr v4.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - ext v1.16b, v1.16b, v1.16b, #8 + hint #25 + .cfi_negate_ra_state + sub sp, sp, #96 + stp x29, x30, [sp, #64] + add x29, sp, #64 + str x19, [sp, #80] + .cfi_def_cfa w29, 32 + .cfi_offset w19, -16 + .cfi_offset w30, -24 + .cfi_offset w29, -32 + mov x19, x0 + mov w5, w4 + mov x4, x3 + mov w3, w2 + mov x2, x1 + mov x0, sp + mov x1, x19 + bl compress_pre + ldp q0, q1, [sp] + ldp q2, q3, [sp, #32] eor v0.16b, v2.16b, v0.16b - orr v2.16b, v3.16b, v4.16b - eor v1.16b, v2.16b, v1.16b - stp q0, q1, [x0] + eor v1.16b, v3.16b, v1.16b + ldp x29, x30, [sp, #64] + stp q0, q1, [x19] + ldr x19, [sp, #80] + add sp, sp, #96 + hint #29 ret .Lfunc_end0: .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-zfs_blake3_compress_in_place_sse2 @@ -504,483 +85,518 @@ zfs_blake3_compress_in_place_sse2: .section .rodata.cst16,"aM",@progbits,16 .p2align 4 .LCPI1_0: - .word 1779033703 - .word 3144134277 - .word 1013904242 - .word 2773480762 -.LCPI1_1: - .xword 0 - .xword -4294967296 -.LCPI1_2: - .xword -1 - .xword 4294967295 + .xword -4942790177982912921 + .xword -6534734903820487822 .text - .globl zfs_blake3_compress_xof_sse2 .p2align 2 - .type zfs_blake3_compress_xof_sse2,@function -zfs_blake3_compress_xof_sse2: + .type compress_pre,@function +compress_pre: .cfi_startproc - ldp q3, q2, [x0] - ldp q5, q6, [x1] - add x10, x1, #32 - lsr x11, x3, #32 - fmov s4, w3 - ld2 { v17.4s, v18.4s }, [x10] - adrp x10, .LCPI1_2 - and w8, w2, #0xff - mov v4.s[1], w11 - ldr q1, [x10, :lo12:.LCPI1_2] - and w9, w4, #0xff - adrp x12, .LCPI1_0 - mov v4.s[2], w8 - uzp1 v19.4s, v5.4s, v6.4s - add v3.4s, v2.4s, v3.4s - ldr q7, [x12, :lo12:.LCPI1_0] - mov v4.s[3], w9 - add v3.4s, v3.4s, v19.4s - uzp2 v5.4s, v5.4s, v6.4s - ext v21.16b, v18.16b, v18.16b, #12 - uzp1 v6.4s, v19.4s, v19.4s - ext v22.16b, v19.16b, v19.16b, #12 - eor v4.16b, v3.16b, v4.16b - ext v20.16b, v17.16b, v17.16b, #12 - ext v6.16b, v6.16b, v19.16b, #8 - ext v19.16b, v19.16b, v22.16b, #12 - zip1 v22.2d, v21.2d, v5.2d - rev32 v24.8h, v4.8h - mov v4.16b, v1.16b - zip2 v23.4s, v5.4s, v21.4s - uzp2 v6.4s, v6.4s, v5.4s - bsl v4.16b, v22.16b, v20.16b - add v3.4s, v3.4s, v5.4s - zip1 v5.4s, v23.4s, v20.4s - zip1 v22.4s, v20.4s, v23.4s - add v23.4s, v24.4s, v7.4s - ext v7.16b, v6.16b, v6.16b, #4 - ext v25.16b, v4.16b, v4.16b, #12 - ext v5.16b, v22.16b, v5.16b, #8 - eor v2.16b, v23.16b, v2.16b - uzp1 v4.4s, v4.4s, v25.4s - uzp1 v22.4s, v7.4s, v7.4s - ext v25.16b, v7.16b, v7.16b, #12 - ext v22.16b, v22.16b, v7.16b, #8 - ext v7.16b, v7.16b, v25.16b, #12 - ushr v25.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - orr v2.16b, v2.16b, v25.16b - add v3.4s, v3.4s, v2.4s - eor v24.16b, v3.16b, v24.16b - add v3.4s, v3.4s, v17.4s - ushr v17.4s, v24.4s, #8 - shl v18.4s, v24.4s, #24 - orr v17.16b, v18.16b, v17.16b - add v18.4s, v17.4s, v23.4s - eor v2.16b, v18.16b, v2.16b - ushr v23.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - ext v3.16b, v3.16b, v3.16b, #12 - orr v2.16b, v2.16b, v23.16b - ext v17.16b, v17.16b, v17.16b, #8 - add v3.4s, v2.4s, v3.4s - adrp x11, .LCPI1_1 - eor v17.16b, v3.16b, v17.16b - ldr q16, [x11, :lo12:.LCPI1_1] - ext v18.16b, v18.16b, v18.16b, #4 - rev32 v24.8h, v17.8h - movi v0.2d, #0xffffffff00000000 - add v23.4s, v3.4s, v21.4s - mov v21.s[1], v20.s[2] - add v20.4s, v18.4s, v24.4s - bit v19.16b, v21.16b, v0.16b - eor v3.16b, v20.16b, v2.16b - uzp2 v2.4s, v22.4s, v19.4s - zip1 v17.2d, v5.2d, v19.2d - zip2 v18.4s, v19.4s, v5.4s - ushr v21.4s, 
v3.4s, #12 - shl v3.4s, v3.4s, #20 - ext v22.16b, v2.16b, v2.16b, #4 - bsl v16.16b, v4.16b, v17.16b - zip1 v17.4s, v18.4s, v4.4s - zip1 v18.4s, v4.4s, v18.4s - orr v21.16b, v3.16b, v21.16b - ext v25.16b, v16.16b, v16.16b, #12 - ext v3.16b, v18.16b, v17.16b, #8 - uzp1 v18.4s, v22.4s, v22.4s - ext v26.16b, v22.16b, v22.16b, #12 - add v23.4s, v23.4s, v21.4s - uzp1 v17.4s, v16.4s, v25.4s - ext v16.16b, v18.16b, v22.16b, #8 - ext v18.16b, v22.16b, v26.16b, #12 - eor v22.16b, v23.16b, v24.16b - add v6.4s, v23.4s, v6.4s - ushr v23.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - orr v22.16b, v22.16b, v23.16b - add v20.4s, v22.4s, v20.4s - eor v21.16b, v20.16b, v21.16b - ushr v23.4s, v21.4s, #7 - shl v21.4s, v21.4s, #25 - ext v6.16b, v6.16b, v6.16b, #4 - orr v21.16b, v21.16b, v23.16b - ext v22.16b, v22.16b, v22.16b, #8 - add v6.4s, v21.4s, v6.4s - eor v22.16b, v6.16b, v22.16b - ext v20.16b, v20.16b, v20.16b, #12 - add v6.4s, v6.4s, v19.4s - rev32 v19.8h, v22.8h - add v20.4s, v20.4s, v19.4s - eor v21.16b, v20.16b, v21.16b - ushr v22.4s, v21.4s, #12 - shl v21.4s, v21.4s, #20 - orr v21.16b, v21.16b, v22.16b - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - ushr v22.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v19.16b, v19.16b, v22.16b - add v20.4s, v19.4s, v20.4s - eor v21.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #12 - ushr v22.4s, v21.4s, #7 - shl v21.4s, v21.4s, #25 - add v6.4s, v6.4s, v4.4s - orr v21.16b, v21.16b, v22.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - ext v20.16b, v20.16b, v20.16b, #4 - rev32 v19.8h, v19.8h - add v20.4s, v20.4s, v19.4s - add v6.4s, v6.4s, v5.4s - mov v5.s[1], v4.s[2] - eor v4.16b, v20.16b, v21.16b - ushr v21.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - orr v21.16b, v4.16b, v21.16b - add v6.4s, v6.4s, v21.4s - eor v19.16b, v6.16b, v19.16b - add v2.4s, v6.4s, v2.4s - ushr v6.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v6.16b, v19.16b, v6.16b - add v19.4s, v6.4s, v20.4s - eor v20.16b, v19.16b, v21.16b - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v20.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v2.4s, v20.4s, v2.4s - eor v6.16b, v2.16b, v6.16b - ext v19.16b, v19.16b, v19.16b, #12 - rev32 v6.8h, v6.8h - add v19.4s, v19.4s, v6.4s - mov v22.16b, v0.16b - eor v20.16b, v19.16b, v20.16b - bsl v22.16b, v5.16b, v7.16b - ushr v21.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - add v2.4s, v2.4s, v22.4s - orr v20.16b, v20.16b, v21.16b - add v2.4s, v2.4s, v20.4s - eor v6.16b, v2.16b, v6.16b - ushr v21.4s, v6.4s, #8 - shl v6.4s, v6.4s, #24 - orr v6.16b, v6.16b, v21.16b - add v19.4s, v6.4s, v19.4s - eor v20.16b, v19.16b, v20.16b - ext v2.16b, v2.16b, v2.16b, #12 - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - add v2.4s, v2.4s, v17.4s - orr v20.16b, v20.16b, v21.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v2.4s, v2.4s, v20.4s - eor v6.16b, v2.16b, v6.16b - uzp2 v5.4s, v16.4s, v22.4s - zip1 v7.2d, v3.2d, v22.2d - zip2 v16.4s, v22.4s, v3.4s - ext v19.16b, v19.16b, v19.16b, #4 - rev32 v22.8h, v6.8h - ext v23.16b, v5.16b, v5.16b, #4 - bif v7.16b, v17.16b, v1.16b - zip1 v24.4s, v16.4s, v17.4s - zip1 v16.4s, v17.4s, v16.4s - add v21.4s, v2.4s, v3.4s - mov v3.s[1], v17.s[2] - add v17.4s, v19.4s, v22.4s - mov v19.16b, v0.16b - ext v25.16b, v7.16b, v7.16b, #12 - ext v4.16b, v16.16b, v24.16b, #8 - uzp1 v16.4s, v23.4s, v23.4s - bsl v19.16b, v3.16b, v18.16b - eor v2.16b, v17.16b, v20.16b - uzp1 v7.4s, v7.4s, v25.4s - ext v25.16b, v16.16b, v23.16b, #8 - zip1 v3.2d, 
v4.2d, v19.2d - ushr v20.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - ext v24.16b, v23.16b, v23.16b, #12 - uzp2 v6.4s, v25.4s, v19.4s - zip2 v18.4s, v19.4s, v4.4s - bif v3.16b, v7.16b, v1.16b - orr v20.16b, v2.16b, v20.16b - ext v16.16b, v23.16b, v24.16b, #12 - ext v23.16b, v6.16b, v6.16b, #4 - zip1 v24.4s, v18.4s, v7.4s - zip1 v18.4s, v7.4s, v18.4s - ext v25.16b, v3.16b, v3.16b, #12 - add v21.4s, v21.4s, v20.4s - ext v2.16b, v18.16b, v24.16b, #8 - uzp1 v18.4s, v23.4s, v23.4s - ext v24.16b, v23.16b, v23.16b, #12 - uzp1 v3.4s, v3.4s, v25.4s - eor v22.16b, v21.16b, v22.16b - ext v25.16b, v18.16b, v23.16b, #8 - dup v18.4s, v2.s[3] - ext v23.16b, v23.16b, v24.16b, #12 - add v5.4s, v21.4s, v5.4s - trn1 v21.4s, v3.4s, v3.4s - ushr v24.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - ext v18.16b, v21.16b, v18.16b, #8 - orr v21.16b, v22.16b, v24.16b - add v17.4s, v21.4s, v17.4s - eor v20.16b, v17.16b, v20.16b - ushr v22.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - ext v5.16b, v5.16b, v5.16b, #4 - orr v20.16b, v20.16b, v22.16b - ext v21.16b, v21.16b, v21.16b, #8 - add v5.4s, v20.4s, v5.4s - eor v21.16b, v5.16b, v21.16b - ext v17.16b, v17.16b, v17.16b, #12 - add v5.4s, v5.4s, v19.4s - rev32 v19.8h, v21.8h - add v17.4s, v17.4s, v19.4s - eor v20.16b, v17.16b, v20.16b - ushr v21.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - orr v20.16b, v20.16b, v21.16b - add v5.4s, v5.4s, v20.4s - eor v19.16b, v5.16b, v19.16b - ushr v21.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v19.16b, v19.16b, v21.16b - add v17.4s, v19.4s, v17.4s - eor v20.16b, v17.16b, v20.16b - ext v5.16b, v5.16b, v5.16b, #12 - ushr v21.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - add v5.4s, v5.4s, v7.4s - orr v20.16b, v20.16b, v21.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v5.4s, v5.4s, v20.4s - eor v19.16b, v5.16b, v19.16b - ext v17.16b, v17.16b, v17.16b, #4 - rev32 v22.8h, v19.8h - add v21.4s, v5.4s, v4.4s - mov v4.s[1], v7.s[2] - add v19.4s, v17.4s, v22.4s - bit v16.16b, v4.16b, v0.16b - eor v5.16b, v19.16b, v20.16b - uzp2 v4.4s, v25.4s, v16.4s - zip1 v7.2d, v2.2d, v16.2d - zip2 v17.4s, v16.4s, v2.4s - ushr v20.4s, v5.4s, #12 - shl v5.4s, v5.4s, #20 - ext v24.16b, v4.16b, v4.16b, #4 - bif v7.16b, v3.16b, v1.16b - zip1 v25.4s, v17.4s, v3.4s - zip1 v17.4s, v3.4s, v17.4s - orr v20.16b, v5.16b, v20.16b - ext v26.16b, v7.16b, v7.16b, #12 - ext v5.16b, v17.16b, v25.16b, #8 - uzp1 v17.4s, v24.4s, v24.4s - ext v25.16b, v24.16b, v24.16b, #12 - bit v23.16b, v18.16b, v0.16b - add v21.4s, v21.4s, v20.4s - uzp1 v7.4s, v7.4s, v26.4s - ext v26.16b, v17.16b, v24.16b, #8 - ext v17.16b, v24.16b, v25.16b, #12 - eor v22.16b, v21.16b, v22.16b - add v6.4s, v21.4s, v6.4s - zip1 v21.2d, v5.2d, v23.2d - zip2 v24.4s, v23.4s, v5.4s - bif v21.16b, v7.16b, v1.16b - zip1 v1.4s, v24.4s, v7.4s - zip1 v24.4s, v7.4s, v24.4s - ext v1.16b, v24.16b, v1.16b, #8 - ushr v24.4s, v22.4s, #8 - shl v22.4s, v22.4s, #24 - orr v22.16b, v22.16b, v24.16b - add v19.4s, v22.4s, v19.4s - ext v24.16b, v21.16b, v21.16b, #12 - eor v20.16b, v19.16b, v20.16b - uzp1 v21.4s, v21.4s, v24.4s - ushr v24.4s, v20.4s, #7 - shl v20.4s, v20.4s, #25 - orr v20.16b, v20.16b, v24.16b - ext v6.16b, v6.16b, v6.16b, #4 - ext v22.16b, v22.16b, v22.16b, #8 - add v6.4s, v20.4s, v6.4s - eor v22.16b, v6.16b, v22.16b - ext v19.16b, v19.16b, v19.16b, #12 - add v6.4s, v6.4s, v16.4s - rev32 v16.8h, v22.8h - add v19.4s, v19.4s, v16.4s - eor v20.16b, v19.16b, v20.16b - ushr v22.4s, v20.4s, #12 - shl v20.4s, v20.4s, #20 - orr v20.16b, v20.16b, v22.16b - add v6.4s, v6.4s, v20.4s - eor v16.16b, v6.16b, v16.16b - ext v6.16b, v6.16b, 
v6.16b, #12 - add v3.4s, v6.4s, v3.4s - ushr v6.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - orr v6.16b, v16.16b, v6.16b - add v16.4s, v6.4s, v19.4s - eor v19.16b, v16.16b, v20.16b - ushr v20.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - orr v19.16b, v19.16b, v20.16b - ext v6.16b, v6.16b, v6.16b, #8 - add v3.4s, v3.4s, v19.4s - eor v6.16b, v3.16b, v6.16b - ext v16.16b, v16.16b, v16.16b, #4 - add v2.4s, v3.4s, v2.4s - rev32 v3.8h, v6.8h - add v6.4s, v16.4s, v3.4s - eor v16.16b, v6.16b, v19.16b - ushr v19.4s, v16.4s, #12 - shl v16.4s, v16.4s, #20 - orr v16.16b, v16.16b, v19.16b - add v2.4s, v2.4s, v16.4s - eor v3.16b, v2.16b, v3.16b - add v2.4s, v2.4s, v4.4s - ushr v4.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v4.16b - add v4.4s, v3.4s, v6.4s - eor v6.16b, v4.16b, v16.16b - ushr v16.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - orr v6.16b, v6.16b, v16.16b - ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v6.4s, v2.4s + hint #34 + fmov s1, w3 + movi d0, #0x0000ff000000ff + ldr q2, [x1] + fmov d3, x4 + adrp x8, .LCPI1_0 + mov v1.s[1], w5 + str q2, [x0] + ldr q4, [x8, :lo12:.LCPI1_0] + add x8, x2, #32 + ldr q5, [x1, #16] + and v0.8b, v1.8b, v0.8b + stp q5, q4, [x0, #16] + mov v3.d[1], v0.d[0] + str q3, [x0, #48] + ldp q0, q6, [x2] + uzp1 v1.4s, v0.4s, v6.4s + uzp2 v0.4s, v0.4s, v6.4s + add v2.4s, v2.4s, v1.4s + uzp1 v18.4s, v1.4s, v1.4s + add v2.4s, v2.4s, v5.4s eor v3.16b, v2.16b, v3.16b - ext v4.16b, v4.16b, v4.16b, #12 + add v2.4s, v2.4s, v0.4s rev32 v3.8h, v3.8h - add v4.4s, v4.4s, v3.4s - eor v6.16b, v4.16b, v6.16b - ushr v16.4s, v6.4s, #12 - shl v6.4s, v6.4s, #20 - add v2.4s, v2.4s, v23.4s - orr v6.16b, v6.16b, v16.16b - add v2.4s, v2.4s, v6.4s + add v4.4s, v3.4s, v4.4s + eor v5.16b, v4.16b, v5.16b + ushr v6.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v6.16b + add v2.4s, v2.4s, v5.4s eor v3.16b, v2.16b, v3.16b - ushr v16.4s, v3.4s, #8 + ushr v6.4s, v3.4s, #8 shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v16.16b + orr v3.16b, v3.16b, v6.16b + ld2 { v6.4s, v7.4s }, [x8] add v4.4s, v3.4s, v4.4s - eor v6.16b, v4.16b, v6.16b - ext v2.16b, v2.16b, v2.16b, #12 - ushr v16.4s, v6.4s, #7 - shl v6.4s, v6.4s, #25 - add v2.4s, v2.4s, v7.4s - orr v6.16b, v6.16b, v16.16b ext v3.16b, v3.16b, v3.16b, #8 add v2.4s, v2.4s, v6.4s - eor v3.16b, v2.16b, v3.16b + eor v5.16b, v4.16b, v5.16b ext v4.16b, v4.16b, v4.16b, #4 - rev32 v3.8h, v3.8h + ext v6.16b, v6.16b, v6.16b, #12 + ext v2.16b, v2.16b, v2.16b, #12 + ushr v16.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v16.16b + ext v16.16b, v7.16b, v7.16b, #12 add v2.4s, v2.4s, v5.4s - mov v5.s[1], v7.s[2] + mov v7.16b, v16.16b + eor v3.16b, v3.16b, v2.16b + add v2.4s, v2.4s, v16.4s + mov v7.s[1], v6.s[2] + rev32 v3.8h, v3.8h add v4.4s, v4.4s, v3.4s - bsl v0.16b, v5.16b, v17.16b - eor v5.16b, v4.16b, v6.16b - ushr v6.4s, v5.4s, #12 + eor v5.16b, v4.16b, v5.16b + ushr v17.4s, v5.4s, #12 shl v5.4s, v5.4s, #20 - orr v5.16b, v5.16b, v6.16b + orr v5.16b, v5.16b, v17.16b add v2.4s, v2.4s, v5.4s eor v3.16b, v2.16b, v3.16b - ushr v6.4s, v3.4s, #8 + ushr v17.4s, v3.4s, #8 shl v3.4s, v3.4s, #24 - orr v3.16b, v3.16b, v6.16b + orr v3.16b, v3.16b, v17.16b + ext v17.16b, v18.16b, v1.16b, #8 add v4.4s, v3.4s, v4.4s - uzp2 v18.4s, v26.4s, v18.4s + uzp2 v17.4s, v17.4s, v0.4s + ext v3.16b, v3.16b, v3.16b, #8 eor v5.16b, v4.16b, v5.16b - add v2.4s, v2.4s, v18.4s - ushr v6.4s, v5.4s, #7 + add v2.4s, v2.4s, v17.4s + ext v4.16b, v4.16b, v4.16b, #12 + ushr v18.4s, v5.4s, #7 shl v5.4s, v5.4s, #25 ext v2.16b, v2.16b, v2.16b, #4 
- orr v5.16b, v5.16b, v6.16b + orr v5.16b, v5.16b, v18.16b + ext v18.16b, v1.16b, v1.16b, #12 + add v2.4s, v2.4s, v5.4s + ext v1.16b, v1.16b, v18.16b, #12 + zip1 v18.2d, v16.2d, v0.2d + zip2 v0.4s, v0.4s, v16.4s + eor v3.16b, v3.16b, v2.16b + rev64 v1.4s, v1.4s + mov v18.s[3], v6.s[3] + zip1 v16.4s, v0.4s, v6.4s + rev32 v3.8h, v3.8h + trn2 v1.4s, v1.4s, v7.4s + zip1 v0.4s, v6.4s, v0.4s + add v4.4s, v4.4s, v3.4s + add v2.4s, v2.4s, v1.4s + ext v6.16b, v0.16b, v16.16b, #8 + eor v5.16b, v4.16b, v5.16b + ushr v7.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v7.16b + add v7.4s, v2.4s, v5.4s + eor v2.16b, v7.16b, v3.16b + ext v7.16b, v7.16b, v7.16b, #12 + ushr v3.4s, v2.4s, #8 + shl v2.4s, v2.4s, #24 + orr v3.16b, v2.16b, v3.16b + ext v2.16b, v18.16b, v18.16b, #12 + add v4.4s, v3.4s, v4.4s + uzp1 v2.4s, v18.4s, v2.4s ext v3.16b, v3.16b, v3.16b, #8 - add v2.4s, v5.4s, v2.4s - eor v3.16b, v2.16b, v3.16b + eor v5.16b, v4.16b, v5.16b + add v7.4s, v7.4s, v2.4s + ext v4.16b, v4.16b, v4.16b, #4 + ushr v18.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v18.16b + add v7.4s, v7.4s, v5.4s + eor v3.16b, v3.16b, v7.16b + add v7.4s, v7.4s, v6.4s + rev32 v3.8h, v3.8h + add v4.4s, v4.4s, v3.4s + eor v5.16b, v4.16b, v5.16b + ushr v0.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v0.16b, v5.16b, v0.16b + add v5.4s, v7.4s, v0.4s + ext v7.16b, v17.16b, v17.16b, #4 + eor v3.16b, v5.16b, v3.16b + uzp1 v17.4s, v7.4s, v7.4s + ushr v16.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v16.16b + ext v16.16b, v17.16b, v7.16b, #8 + add v4.4s, v3.4s, v4.4s + uzp2 v16.4s, v16.4s, v1.4s + ext v3.16b, v3.16b, v3.16b, #8 + eor v0.16b, v4.16b, v0.16b + add v5.4s, v5.4s, v16.4s + ext v4.16b, v4.16b, v4.16b, #12 + ushr v17.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v0.16b, v0.16b, v17.16b + ext v17.16b, v7.16b, v7.16b, #12 + add v5.4s, v5.4s, v0.4s + ext v7.16b, v7.16b, v17.16b, #12 + mov v17.16b, v6.16b + eor v3.16b, v3.16b, v5.16b + rev64 v7.4s, v7.4s + mov v17.s[1], v2.s[2] + rev32 v3.8h, v3.8h + add v4.4s, v4.4s, v3.4s + eor v18.16b, v4.16b, v0.16b + trn2 v0.4s, v7.4s, v17.4s + ushr v7.4s, v18.4s, #12 + shl v17.4s, v18.4s, #20 + add v5.4s, v5.4s, v0.4s + zip1 v18.2d, v6.2d, v1.2d + zip2 v1.4s, v1.4s, v6.4s + orr v7.16b, v17.16b, v7.16b + mov v18.s[3], v2.s[3] + zip1 v6.4s, v1.4s, v2.4s + add v5.4s, v5.4s, v7.4s + zip1 v1.4s, v2.4s, v1.4s + eor v3.16b, v5.16b, v3.16b + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v1.16b, v6.16b, #8 + ushr v17.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v17.16b, v3.16b, v17.16b + ext v3.16b, v18.16b, v18.16b, #12 + add v4.4s, v17.4s, v4.4s + uzp1 v3.4s, v18.4s, v3.4s + ext v17.16b, v17.16b, v17.16b, #8 + eor v7.16b, v4.16b, v7.16b + add v5.4s, v5.4s, v3.4s + ext v4.16b, v4.16b, v4.16b, #4 + ushr v18.4s, v7.4s, #7 + shl v7.4s, v7.4s, #25 + orr v7.16b, v7.16b, v18.16b + add v5.4s, v5.4s, v7.4s + eor v17.16b, v17.16b, v5.16b + add v5.4s, v5.4s, v6.4s + rev32 v17.8h, v17.8h + add v4.4s, v4.4s, v17.4s + eor v2.16b, v4.16b, v7.16b + ext v7.16b, v16.16b, v16.16b, #4 + ushr v1.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v1.16b, v2.16b, v1.16b + add v2.4s, v5.4s, v1.4s + eor v5.16b, v2.16b, v17.16b + uzp1 v17.4s, v7.4s, v7.4s + ushr v16.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + orr v5.16b, v5.16b, v16.16b + ext v16.16b, v17.16b, v7.16b, #8 + add v4.4s, v5.4s, v4.4s + uzp2 v16.4s, v16.4s, v0.4s + ext v5.16b, v5.16b, v5.16b, #8 + eor v1.16b, v4.16b, v1.16b + add v2.4s, v2.4s, v16.4s ext v4.16b, v4.16b, v4.16b, #12 - add v0.4s, v2.4s, 
v0.4s - rev32 v2.8h, v3.8h - add v3.4s, v4.4s, v2.4s - eor v4.16b, v3.16b, v5.16b - ushr v5.4s, v4.4s, #12 + ushr v17.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v1.16b, v1.16b, v17.16b + ext v17.16b, v7.16b, v7.16b, #12 + add v2.4s, v2.4s, v1.4s + ext v7.16b, v7.16b, v17.16b, #12 + mov v17.16b, v6.16b + eor v5.16b, v5.16b, v2.16b + rev64 v7.4s, v7.4s + mov v17.s[1], v3.s[2] + rev32 v5.8h, v5.8h + add v4.4s, v4.4s, v5.4s + eor v18.16b, v4.16b, v1.16b + trn2 v1.4s, v7.4s, v17.4s + ushr v7.4s, v18.4s, #12 + shl v17.4s, v18.4s, #20 + add v2.4s, v2.4s, v1.4s + zip1 v18.2d, v6.2d, v0.2d + zip2 v0.4s, v0.4s, v6.4s + orr v7.16b, v17.16b, v7.16b + mov v18.s[3], v3.s[3] + add v2.4s, v2.4s, v7.4s + eor v5.16b, v2.16b, v5.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v17.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + orr v5.16b, v5.16b, v17.16b + add v17.4s, v5.4s, v4.4s + ext v4.16b, v18.16b, v18.16b, #12 + ext v5.16b, v5.16b, v5.16b, #8 + eor v7.16b, v17.16b, v7.16b + uzp1 v4.4s, v18.4s, v4.4s + ext v17.16b, v17.16b, v17.16b, #4 + ushr v18.4s, v7.4s, #7 + shl v7.4s, v7.4s, #25 + add v2.4s, v2.4s, v4.4s + orr v7.16b, v7.16b, v18.16b + add v2.4s, v2.4s, v7.4s + eor v5.16b, v5.16b, v2.16b + rev32 v5.8h, v5.8h + add v6.4s, v17.4s, v5.4s + zip1 v17.4s, v0.4s, v3.4s + zip1 v0.4s, v3.4s, v0.4s + eor v3.16b, v6.16b, v7.16b + ext v0.16b, v0.16b, v17.16b, #8 + ushr v7.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v2.4s, v2.4s, v0.4s + orr v3.16b, v3.16b, v7.16b + ext v7.16b, v16.16b, v16.16b, #4 + add v2.4s, v2.4s, v3.4s + uzp1 v17.4s, v7.4s, v7.4s + eor v5.16b, v2.16b, v5.16b + ushr v16.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + orr v5.16b, v5.16b, v16.16b + ext v16.16b, v17.16b, v7.16b, #8 + add v6.4s, v5.4s, v6.4s + uzp2 v16.4s, v16.4s, v1.4s + ext v5.16b, v5.16b, v5.16b, #8 + eor v3.16b, v6.16b, v3.16b + add v2.4s, v2.4s, v16.4s + ext v6.16b, v6.16b, v6.16b, #12 + ushr v17.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v3.16b, v3.16b, v17.16b + add v17.4s, v2.4s, v3.4s + eor v2.16b, v5.16b, v17.16b + ext v5.16b, v7.16b, v7.16b, #12 + rev32 v18.8h, v2.8h + ext v2.16b, v7.16b, v5.16b, #12 + mov v5.16b, v0.16b + add v6.4s, v6.4s, v18.4s + rev64 v2.4s, v2.4s + mov v5.s[1], v4.s[2] + eor v3.16b, v6.16b, v3.16b + trn2 v2.4s, v2.4s, v5.4s + ushr v5.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v7.4s, v17.4s, v2.4s + orr v3.16b, v3.16b, v5.16b + add v5.4s, v7.4s, v3.4s + eor v7.16b, v5.16b, v18.16b + zip1 v18.2d, v0.2d, v1.2d + ext v5.16b, v5.16b, v5.16b, #12 + zip2 v0.4s, v1.4s, v0.4s + ushr v17.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + mov v18.s[3], v4.s[3] + orr v7.16b, v7.16b, v17.16b + ext v17.16b, v18.16b, v18.16b, #12 + add v6.4s, v7.4s, v6.4s + ext v7.16b, v7.16b, v7.16b, #8 + eor v19.16b, v6.16b, v3.16b + uzp1 v3.4s, v18.4s, v17.4s + ext v6.16b, v6.16b, v6.16b, #4 + ushr v17.4s, v19.4s, #7 + shl v18.4s, v19.4s, #25 + add v5.4s, v5.4s, v3.4s + orr v17.16b, v18.16b, v17.16b + add v5.4s, v5.4s, v17.4s + eor v7.16b, v7.16b, v5.16b + rev32 v7.8h, v7.8h + add v1.4s, v6.4s, v7.4s + zip1 v6.4s, v0.4s, v4.4s + zip1 v0.4s, v4.4s, v0.4s + eor v4.16b, v1.16b, v17.16b + ext v6.16b, v0.16b, v6.16b, #8 + ushr v0.4s, v4.4s, #12 shl v4.4s, v4.4s, #20 - orr v4.16b, v4.16b, v5.16b - add v0.4s, v0.4s, v4.4s - eor v2.16b, v0.16b, v2.16b - ushr v5.4s, v2.4s, #8 - shl v2.4s, v2.4s, #24 - orr v2.16b, v2.16b, v5.16b - add v3.4s, v2.4s, v3.4s - eor v4.16b, v3.16b, v4.16b - ext v0.16b, v0.16b, v0.16b, #12 - ushr v5.4s, v4.4s, #7 - shl v4.4s, v4.4s, #25 - add v0.4s, v0.4s, v21.4s - 
orr v4.16b, v4.16b, v5.16b - ext v2.16b, v2.16b, v2.16b, #8 - add v0.4s, v0.4s, v4.4s - eor v2.16b, v0.16b, v2.16b - ext v3.16b, v3.16b, v3.16b, #4 - add v0.4s, v0.4s, v1.4s - rev32 v1.8h, v2.8h - add v2.4s, v3.4s, v1.4s - eor v3.16b, v2.16b, v4.16b - ushr v4.4s, v3.4s, #12 + add v5.4s, v5.4s, v6.4s + zip1 v20.2d, v6.2d, v2.2d + orr v0.16b, v4.16b, v0.16b + mov v20.s[3], v3.s[3] + add v4.4s, v5.4s, v0.4s + eor v5.16b, v4.16b, v7.16b + ext v7.16b, v16.16b, v16.16b, #4 + ushr v16.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + uzp1 v17.4s, v7.4s, v7.4s + orr v5.16b, v5.16b, v16.16b + ext v16.16b, v17.16b, v7.16b, #8 + add v1.4s, v5.4s, v1.4s + uzp2 v16.4s, v16.4s, v2.4s + zip2 v2.4s, v2.4s, v6.4s + eor v0.16b, v1.16b, v0.16b + add v4.4s, v4.4s, v16.4s + ext v1.16b, v1.16b, v1.16b, #12 + ext v16.16b, v16.16b, v16.16b, #4 + ushr v17.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v17.16b, v0.16b, v17.16b + ext v0.16b, v5.16b, v5.16b, #8 + ext v5.16b, v7.16b, v7.16b, #12 + add v4.4s, v4.4s, v17.4s + eor v0.16b, v0.16b, v4.16b + rev32 v18.8h, v0.8h + ext v0.16b, v7.16b, v5.16b, #12 + mov v5.16b, v6.16b + add v7.4s, v1.4s, v18.4s + rev64 v1.4s, v0.4s + mov v5.s[1], v3.s[2] + eor v17.16b, v7.16b, v17.16b + trn2 v1.4s, v1.4s, v5.4s + ushr v19.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v4.4s, v4.4s, v1.4s + orr v17.16b, v17.16b, v19.16b + add v19.4s, v4.4s, v17.4s + eor v4.16b, v19.16b, v18.16b + ext v19.16b, v19.16b, v19.16b, #12 + ushr v18.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + orr v18.16b, v4.16b, v18.16b + ext v4.16b, v20.16b, v20.16b, #12 + add v7.4s, v18.4s, v7.4s + uzp1 v4.4s, v20.4s, v4.4s + ext v18.16b, v18.16b, v18.16b, #8 + eor v17.16b, v7.16b, v17.16b + add v19.4s, v19.4s, v4.4s + ext v7.16b, v7.16b, v7.16b, #4 + ushr v20.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + orr v17.16b, v17.16b, v20.16b + add v19.4s, v19.4s, v17.4s + eor v18.16b, v18.16b, v19.16b + rev32 v18.8h, v18.8h + add v6.4s, v7.4s, v18.4s + zip1 v7.4s, v2.4s, v3.4s + zip1 v2.4s, v3.4s, v2.4s + eor v3.16b, v6.16b, v17.16b + ext v2.16b, v2.16b, v7.16b, #8 + ushr v7.4s, v3.4s, #12 shl v3.4s, v3.4s, #20 - orr v3.16b, v3.16b, v4.16b - add v0.4s, v0.4s, v3.4s - eor v1.16b, v0.16b, v1.16b - ushr v4.4s, v1.4s, #8 - shl v1.4s, v1.4s, #24 - orr v1.16b, v1.16b, v4.16b - add v2.4s, v1.4s, v2.4s - eor v3.16b, v2.16b, v3.16b - ushr v4.4s, v3.4s, #7 + add v17.4s, v19.4s, v2.4s + zip1 v1.2d, v2.2d, v1.2d + zip2 v0.4s, v0.4s, v2.4s + orr v3.16b, v3.16b, v7.16b + mov v1.s[3], v4.s[3] + add v7.4s, v17.4s, v3.4s + eor v17.16b, v7.16b, v18.16b + ext v7.16b, v7.16b, v7.16b, #4 + ushr v18.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + orr v17.16b, v17.16b, v18.16b + ext v18.16b, v16.16b, v16.16b, #8 + add v6.4s, v17.4s, v6.4s + uzp2 v5.4s, v18.4s, v5.4s + eor v3.16b, v6.16b, v3.16b + ext v5.16b, v5.16b, v18.16b, #4 + ext v6.16b, v6.16b, v6.16b, #12 + ushr v18.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + add v5.4s, v7.4s, v5.4s + ext v7.16b, v17.16b, v17.16b, #8 + ext v17.16b, v16.16b, v16.16b, #12 + orr v3.16b, v3.16b, v18.16b + ext v16.16b, v16.16b, v17.16b, #12 + add v5.4s, v3.4s, v5.4s + mov v17.16b, v2.16b + rev64 v16.4s, v16.4s + eor v7.16b, v7.16b, v5.16b + mov v17.s[1], v4.s[2] + rev32 v7.8h, v7.8h + trn2 v16.4s, v16.4s, v17.4s + add v6.4s, v6.4s, v7.4s + add v5.4s, v5.4s, v16.4s + eor v3.16b, v6.16b, v3.16b + ushr v17.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + orr v3.16b, v3.16b, v17.16b + add v5.4s, v5.4s, v3.4s + eor v7.16b, v5.16b, v7.16b + ext v5.16b, v5.16b, v5.16b, #12 + ushr v16.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 
+ orr v7.16b, v7.16b, v16.16b + ext v16.16b, v1.16b, v1.16b, #12 + add v6.4s, v7.4s, v6.4s + uzp1 v1.4s, v1.4s, v16.4s + eor v3.16b, v6.16b, v3.16b + add v1.4s, v5.4s, v1.4s + ext v5.16b, v7.16b, v7.16b, #8 + ext v6.16b, v6.16b, v6.16b, #4 + ushr v16.4s, v3.4s, #7 shl v3.4s, v3.4s, #25 + orr v3.16b, v3.16b, v16.16b + add v1.4s, v1.4s, v3.4s + eor v5.16b, v5.16b, v1.16b + rev32 v5.8h, v5.8h + add v2.4s, v6.4s, v5.4s + zip1 v6.4s, v0.4s, v4.4s + zip1 v0.4s, v4.4s, v0.4s + eor v3.16b, v2.16b, v3.16b + ext v0.16b, v0.16b, v6.16b, #8 + ushr v4.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v0.4s, v1.4s, v0.4s + orr v1.16b, v3.16b, v4.16b + add v0.4s, v0.4s, v1.4s + eor v3.16b, v0.16b, v5.16b ext v0.16b, v0.16b, v0.16b, #4 - ext v1.16b, v1.16b, v1.16b, #8 - ext v2.16b, v2.16b, v2.16b, #12 + ushr v4.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 orr v3.16b, v3.16b, v4.16b + add v2.4s, v3.4s, v2.4s + ext v3.16b, v3.16b, v3.16b, #8 + eor v1.16b, v2.16b, v1.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v4.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + stp q2, q3, [x0, #32] + orr v1.16b, v1.16b, v4.16b + stp q0, q1, [x0] + ret +.Lfunc_end1: + .size compress_pre, .Lfunc_end1-compress_pre + .cfi_endproc + + .globl zfs_blake3_compress_xof_sse2 + .p2align 2 + .type zfs_blake3_compress_xof_sse2,@function +zfs_blake3_compress_xof_sse2: + .cfi_startproc + hint #25 + .cfi_negate_ra_state + sub sp, sp, #96 + stp x29, x30, [sp, #64] + add x29, sp, #64 + stp x20, x19, [sp, #80] + .cfi_def_cfa w29, 32 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w30, -24 + .cfi_offset w29, -32 + mov x20, x0 + mov x19, x5 + mov w5, w4 + mov x4, x3 + mov w3, w2 + mov x2, x1 + mov x0, sp + mov x1, x20 + bl compress_pre + ldp q0, q1, [sp] + ldp q2, q3, [sp, #32] eor v0.16b, v2.16b, v0.16b - eor v3.16b, v3.16b, v1.16b - stp q0, q3, [x5] - ldr q0, [x0] + eor v1.16b, v3.16b, v1.16b + ldp x29, x30, [sp, #64] + stp q0, q1, [x19] + ldr q0, [x20] eor v0.16b, v0.16b, v2.16b - str q0, [x5, #32] - ldr q0, [x0, #16] - eor v0.16b, v0.16b, v1.16b - str q0, [x5, #48] + str q0, [x19, #32] + ldr q0, [x20, #16] + eor v0.16b, v0.16b, v3.16b + str q0, [x19, #48] + ldp x20, x19, [sp, #80] + add sp, sp, #96 + hint #29 ret -.Lfunc_end1: - .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-zfs_blake3_compress_xof_sse2 +.Lfunc_end2: + .size zfs_blake3_compress_xof_sse2, .Lfunc_end2-zfs_blake3_compress_xof_sse2 .cfi_endproc .section .rodata.cst16,"aM",@progbits,16 .p2align 4 -.LCPI2_0: +.LCPI3_0: .word 0 .word 1 .word 2 @@ -991,19 +607,21 @@ zfs_blake3_compress_xof_sse2: .type zfs_blake3_hash_many_sse2,@function zfs_blake3_hash_many_sse2: .cfi_startproc + hint #25 + .cfi_negate_ra_state stp d15, d14, [sp, #-160]! 
stp d13, d12, [sp, #16] stp d11, d10, [sp, #32] stp d9, d8, [sp, #48] stp x29, x30, [sp, #64] + add x29, sp, #64 stp x28, x27, [sp, #80] stp x26, x25, [sp, #96] stp x24, x23, [sp, #112] stp x22, x21, [sp, #128] stp x20, x19, [sp, #144] - mov x29, sp - sub sp, sp, #384 - .cfi_def_cfa w29, 160 + sub sp, sp, #464 + .cfi_def_cfa w29, 96 .cfi_offset w19, -8 .cfi_offset w20, -16 .cfi_offset w21, -24 @@ -1024,1414 +642,1406 @@ zfs_blake3_hash_many_sse2: .cfi_offset b13, -144 .cfi_offset b14, -152 .cfi_offset b15, -160 - ldr x26, [x29, #168] - ldrb w27, [x29, #160] mov w19, w6 mov x20, x4 - mov x22, x2 - mov x28, x1 + mov x24, x1 + ldr x26, [x29, #104] + ldrb w27, [x29, #96] cmp x1, #4 - mov x24, x0 str x3, [sp, #40] - b.lo .LBB2_8 - adrp x9, .LCPI2_0 - ldr q0, [x9, :lo12:.LCPI2_0] - sbfx w11, w5, #0, #1 - dup v1.4s, w11 - mov w9, #58983 + b.lo .LBB3_6 + adrp x8, .LCPI3_0 + sbfx w9, w5, #0, #1 mov w10, #44677 - and v0.16b, v1.16b, v0.16b mov w11, #62322 - mov w12, #62778 - orr w8, w7, w19 - movk w9, #27145, lsl #16 movk w10, #47975, lsl #16 movk w11, #15470, lsl #16 + ldr q0, [x8, :lo12:.LCPI3_0] + dup v1.4s, w9 + mov w9, #58983 + orr w8, w7, w19 + movk w9, #27145, lsl #16 + and v0.16b, v1.16b, v0.16b + dup v1.4s, w11 + movi v24.4s, #64 + dup v2.4s, w9 + mov w9, #62778 + movk w9, #42319, lsl #16 str q0, [sp, #16] orr v0.4s, #128, lsl #24 - movk w12, #42319, lsl #16 + stp q2, q1, [sp, #48] str q0, [sp] -.LBB2_2: - ldr x0, [sp, #40] - mov x13, x0 - ld1r { v20.4s }, [x13], #4 - add x14, x0, #8 - add x15, x0, #12 - add x16, x0, #16 - add x17, x0, #20 - add x18, x0, #24 - add x0, x0, #28 - ld1r { v17.4s }, [x14] - ld1r { v6.4s }, [x15] - ld1r { v8.4s }, [x16] - ld1r { v9.4s }, [x17] - ld1r { v31.4s }, [x18] - ld1r { v26.4s }, [x13] - ld1r { v15.4s }, [x0] - cbz x22, .LBB2_7 + dup v0.4s, w10 + str q0, [sp, #80] + b .LBB3_3 +.LBB3_2: + zip1 v0.4s, v12.4s, v31.4s + add x10, x20, #4 + zip1 v1.4s, v29.4s, v30.4s + tst w5, #0x1 + zip1 v2.4s, v28.4s, v23.4s + csel x20, x10, x20, ne + zip1 v3.4s, v13.4s, v25.4s + add x0, x0, #32 + zip2 v6.4s, v12.4s, v31.4s + sub x24, x24, #4 + zip1 v4.2d, v0.2d, v1.2d + cmp x24, #3 + zip2 v7.4s, v29.4s, v30.4s + zip1 v5.2d, v2.2d, v3.2d + zip2 v0.2d, v0.2d, v1.2d + zip2 v1.2d, v2.2d, v3.2d + zip2 v2.4s, v28.4s, v23.4s + zip2 v3.4s, v13.4s, v25.4s + stp q4, q5, [x26] + zip2 v4.2d, v6.2d, v7.2d + stp q0, q1, [x26, #32] + zip1 v0.2d, v6.2d, v7.2d + zip1 v1.2d, v2.2d, v3.2d + zip2 v2.2d, v2.2d, v3.2d + stp q0, q1, [x26, #64] + stp q4, q2, [x26, #96] + add x26, x26, #128 + b.ls .LBB3_6 +.LBB3_3: + ldr x14, [sp, #40] + mov x10, x14 + add x11, x14, #8 + add x12, x14, #12 + add x13, x14, #16 + ld1r { v12.4s }, [x10], #4 + ld1r { v29.4s }, [x11] + add x11, x14, #20 + ld1r { v30.4s }, [x12] + add x12, x14, #24 + ld1r { v28.4s }, [x13] + ld1r { v23.4s }, [x11] + add x11, x14, #28 + ld1r { v13.4s }, [x12] + ld1r { v31.4s }, [x10] + ld1r { v25.4s }, [x11] + cbz x2, .LBB3_2 ldr q1, [sp, #16] dup v0.4s, w20 - ldp x13, x14, [x24] - ldp x15, x16, [x24, #16] + lsr x12, x20, #32 + mov x10, xzr + ldp x13, x14, [x0, #16] add v1.4s, v0.4s, v1.4s + mov x15, x2 movi v0.4s, #128, lsl #24 - str q1, [sp, #64] + mov w4, w8 + str q1, [sp, #112] eor v0.16b, v1.16b, v0.16b ldr q1, [sp] - lsr x18, x20, #32 - mov x17, xzr cmgt v0.4s, v1.4s, v0.4s - dup v1.4s, w18 + dup v1.4s, w12 + ldp x11, x12, [x0] sub v0.4s, v1.4s, v0.4s - mov w18, w8 - str q0, [sp, #48] -.LBB2_4: - mov w2, #16 - bfi x2, x17, #6, #58 - ldr q1, [x13, x2] - ldr q3, [x14, x2] - ldr q2, [x15, x2] - ldr q4, [x16, x2] - mov w2, #32 - bfi x2, 
x17, #6, #58 - ldr q5, [x13, x2] - ldr q18, [x14, x2] - ldr q19, [x15, x2] - ldr q23, [x16, x2] - mov w2, #48 - lsl x3, x17, #6 - bfi x2, x17, #6, #58 - add x17, x17, #1 - ldr q0, [x13, x3] - ldr q21, [x14, x3] - ldr q7, [x15, x3] - ldr q16, [x16, x3] - cmp x17, x22 - ldr q13, [x13, x2] - ldr q14, [x14, x2] - ldr q29, [x15, x2] - ldr q10, [x16, x2] - csel w2, w27, wzr, eq - orr w18, w2, w18 - mov x0, xzr - and w18, w18, #0xff - add x3, x3, #256 -.LBB2_5: - ldr x2, [x24, x0] - add x0, x0, #8 - cmp x0, #32 - add x2, x2, x3 - prfm pldl1keep, [x2] - b.ne .LBB2_5 - dup v22.4s, w18 - str q22, [sp, #192] - zip1 v27.4s, v0.4s, v21.4s - zip2 v21.4s, v0.4s, v21.4s - zip1 v0.4s, v7.4s, v16.4s - zip2 v22.4s, v7.4s, v16.4s - zip1 v7.4s, v1.4s, v3.4s - zip1 v25.4s, v2.4s, v4.4s - zip2 v16.4s, v2.4s, v4.4s - zip1 v11.4s, v19.4s, v23.4s - zip2 v12.4s, v19.4s, v23.4s - zip1 v19.4s, v13.4s, v14.4s - zip2 v23.4s, v13.4s, v14.4s - zip1 v13.4s, v29.4s, v10.4s - zip2 v14.4s, v29.4s, v10.4s - add v10.4s, v20.4s, v8.4s - add v2.4s, v26.4s, v9.4s - ext v20.16b, v22.16b, v21.16b, #8 - ext v26.16b, v25.16b, v7.16b, #8 - zip2 v24.4s, v1.4s, v3.4s - add v1.4s, v6.4s, v15.4s - ext v6.16b, v0.16b, v27.16b, #8 - ext v20.16b, v21.16b, v20.16b, #8 - mov v21.d[1], v22.d[0] - ext v22.16b, v7.16b, v26.16b, #8 - mov v7.d[1], v25.d[0] - add v3.4s, v17.4s, v31.4s - str q1, [sp, #144] - ext v1.16b, v27.16b, v6.16b, #8 - mov v6.16b, v7.16b - zip1 v28.4s, v5.4s, v18.4s - stur q1, [x29, #-80] - mov v1.16b, v27.16b - mov v27.16b, v24.16b - add v3.4s, v3.4s, v6.4s - ldr q6, [sp, #64] - ext v29.16b, v16.16b, v24.16b, #8 - mov v1.d[1], v0.d[0] - ext v0.16b, v11.16b, v28.16b, #8 - mov v27.d[1], v16.d[0] - ext v16.16b, v14.16b, v23.16b, #8 - stur q7, [x29, #-144] - ext v7.16b, v24.16b, v29.16b, #8 - ext v29.16b, v28.16b, v0.16b, #8 - ext v0.16b, v23.16b, v16.16b, #8 - mov v23.d[1], v14.d[0] - stp q0, q23, [sp, #80] - add v0.4s, v10.4s, v1.4s - eor v16.16b, v0.16b, v6.16b - ldr q6, [sp, #48] - add v2.4s, v2.4s, v21.4s - mov v28.d[1], v11.d[0] - zip2 v18.4s, v5.4s, v18.4s - eor v10.16b, v2.16b, v6.16b - movi v6.4s, #64 - eor v11.16b, v3.16b, v6.16b - ldr q6, [sp, #144] - dup v17.4s, w9 - ext v30.16b, v12.16b, v18.16b, #8 - rev32 v16.8h, v16.8h - dup v5.4s, w10 - ext v25.16b, v18.16b, v30.16b, #8 - mov v30.16b, v23.16b - mov v23.16b, v1.16b - str q1, [sp, #160] - rev32 v10.8h, v10.8h - add v1.4s, v16.4s, v17.4s - add v17.4s, v6.4s, v27.4s - ldr q6, [sp, #192] - dup v4.4s, w11 - rev32 v11.8h, v11.8h - add v5.4s, v10.4s, v5.4s - eor v8.16b, v1.16b, v8.16b - stur q21, [x29, #-128] - mov v18.d[1], v12.d[0] - add v4.4s, v11.4s, v4.4s - eor v9.16b, v5.16b, v9.16b - ushr v12.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - ldur q21, [x29, #-80] - ext v26.16b, v13.16b, v19.16b, #8 - eor v31.16b, v4.16b, v31.16b - orr v8.16b, v8.16b, v12.16b - ushr v12.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - ext v26.16b, v19.16b, v26.16b, #8 - mov v19.d[1], v13.d[0] - orr v9.16b, v9.16b, v12.16b - ushr v12.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v13.16b, v17.16b, v6.16b - orr v31.16b, v31.16b, v12.16b - dup v12.4s, w12 - rev32 v13.8h, v13.8h - add v12.4s, v13.4s, v12.4s - add v0.4s, v0.4s, v21.4s - eor v14.16b, v12.16b, v15.16b - add v0.4s, v0.4s, v8.4s - add v2.4s, v2.4s, v20.4s - ushr v15.4s, v14.4s, #12 - shl v14.4s, v14.4s, #20 - eor v16.16b, v0.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v22.4s - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add 
v17.4s, v17.4s, v7.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v14.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v13.16b, v17.16b, v13.16b - add v1.4s, v16.4s, v1.4s - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v13.4s, #8 - shl v13.4s, v13.4s, #24 - eor v8.16b, v1.16b, v8.16b - add v5.4s, v10.4s, v5.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v11.4s, v4.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v13.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v14.16b, v12.16b, v14.16b - add v0.4s, v0.4s, v28.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #7 - shl v14.4s, v14.4s, #25 - add v0.4s, v0.4s, v9.4s - add v2.4s, v2.4s, v18.4s - orr v14.16b, v14.16b, v15.16b - eor v13.16b, v0.16b, v13.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v19.4s - rev32 v13.8h, v13.8h - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v14.4s - add v17.4s, v17.4s, v30.4s - add v4.4s, v4.4s, v13.4s - rev32 v16.8h, v16.8h - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - eor v9.16b, v4.16b, v9.16b - add v12.4s, v12.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v17.16b, v11.16b - mov v24.16b, v7.16b - stur q7, [x29, #-112] - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v1.4s, v10.4s - rev32 v11.8h, v11.8h - mov v7.16b, v26.16b - add v3.4s, v3.4s, v26.4s - ldr q26, [sp, #80] - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v14.16b, v1.16b, v14.16b - add v5.4s, v5.4s, v11.4s - add v0.4s, v0.4s, v29.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #12 - shl v14.4s, v14.4s, #20 - eor v8.16b, v5.16b, v8.16b - add v0.4s, v0.4s, v9.4s - add v2.4s, v2.4s, v25.4s - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v13.16b, v0.16b, v13.16b - add v2.4s, v2.4s, v31.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v13.4s, #8 - shl v13.4s, v13.4s, #24 - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v14.4s - add v17.4s, v17.4s, v26.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v17.16b, v11.16b - add v4.4s, v13.4s, v4.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v9.16b, v4.16b, v9.16b - add v12.4s, v16.4s, v12.4s - str q22, [sp, #128] - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v10.4s, v1.4s - ldur q22, [x29, #-128] - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v14.16b, v1.16b, v14.16b - add v5.4s, v11.4s, v5.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #7 - shl v14.4s, v14.4s, #25 - eor v8.16b, v5.16b, v8.16b - mov v6.16b, v18.16b - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - ldur q18, [x29, #-144] - orr v8.16b, v8.16b, v15.16b - add v0.4s, v0.4s, v22.4s - add v0.4s, v0.4s, v8.4s - add v2.4s, v2.4s, v20.4s - eor v16.16b, v0.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v24.4s - rev32 v16.8h, 
v16.8h - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v18.4s - add v1.4s, v1.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v14.4s - eor v8.16b, v1.16b, v8.16b - add v5.4s, v5.4s, v10.4s - rev32 v11.8h, v11.8h - eor v13.16b, v17.16b, v13.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v4.4s, v11.4s - rev32 v13.8h, v13.8h - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v12.4s, v13.4s - add v0.4s, v0.4s, v27.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v14.16b, v12.16b, v14.16b - add v0.4s, v0.4s, v8.4s - add v2.4s, v2.4s, v6.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #12 - shl v14.4s, v14.4s, #20 - eor v16.16b, v0.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v23.4s - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v7.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v14.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v13.16b, v17.16b, v13.16b - add v1.4s, v16.4s, v1.4s - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v13.4s, #8 - shl v13.4s, v13.4s, #24 - eor v8.16b, v1.16b, v8.16b - add v5.4s, v10.4s, v5.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v11.4s, v4.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v13.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v14.16b, v12.16b, v14.16b - add v0.4s, v0.4s, v21.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #7 - shl v14.4s, v14.4s, #25 - add v0.4s, v0.4s, v9.4s - add v2.4s, v2.4s, v19.4s - orr v14.16b, v14.16b, v15.16b - eor v13.16b, v0.16b, v13.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v29.4s - str q28, [sp, #112] - rev32 v13.8h, v13.8h - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v14.4s - add v17.4s, v17.4s, v26.4s - add v4.4s, v4.4s, v13.4s - rev32 v16.8h, v16.8h - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - ldp q28, q23, [sp, #112] - eor v9.16b, v4.16b, v9.16b - add v12.4s, v12.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v17.16b, v11.16b - ldr q21, [sp, #96] - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v1.4s, v10.4s - rev32 v11.8h, v11.8h - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v14.16b, v1.16b, v14.16b - add v5.4s, v5.4s, v11.4s - add v0.4s, v0.4s, v25.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #12 - shl v14.4s, v14.4s, #20 - eor v8.16b, v5.16b, v8.16b - add v0.4s, v0.4s, v9.4s - add v2.4s, v2.4s, v23.4s - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v13.16b, v0.16b, v13.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v21.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v13.4s, #8 - shl v13.4s, v13.4s, #24 - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v14.4s - add v17.4s, v17.4s, v28.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v3.16b, v10.16b - add v17.4s, 
v17.4s, v8.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v17.16b, v11.16b - add v4.4s, v13.4s, v4.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v9.16b, v4.16b, v9.16b - add v12.4s, v16.4s, v12.4s - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v10.4s, v1.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v14.16b, v1.16b, v14.16b - add v5.4s, v11.4s, v5.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #7 - shl v14.4s, v14.4s, #25 - eor v8.16b, v5.16b, v8.16b - mov v30.16b, v29.16b - mov v29.16b, v25.16b - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - ldur q25, [x29, #-112] - orr v8.16b, v8.16b, v15.16b - add v0.4s, v0.4s, v20.4s - add v0.4s, v0.4s, v8.4s - add v2.4s, v2.4s, v6.4s - eor v16.16b, v0.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v7.4s - rev32 v16.8h, v16.8h - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v25.4s - add v1.4s, v1.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v14.4s - eor v8.16b, v1.16b, v8.16b - add v5.4s, v5.4s, v10.4s - rev32 v11.8h, v11.8h - eor v13.16b, v17.16b, v13.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v4.4s, v11.4s - rev32 v13.8h, v13.8h - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v12.4s, v13.4s - add v0.4s, v0.4s, v18.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v14.16b, v12.16b, v14.16b - add v0.4s, v0.4s, v8.4s - add v2.4s, v2.4s, v19.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #12 - shl v14.4s, v14.4s, #20 - eor v16.16b, v0.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v22.4s - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v21.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v14.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v13.16b, v17.16b, v13.16b - add v1.4s, v16.4s, v1.4s - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v13.4s, #8 - shl v13.4s, v13.4s, #24 - eor v8.16b, v1.16b, v8.16b - add v5.4s, v10.4s, v5.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v11.4s, v4.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v13.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v14.16b, v12.16b, v14.16b + str q0, [sp, #96] +.LBB3_5: + add x17, x11, x10 + add x21, x12, x10 + add x16, x13, x10 + add x6, x14, x10 + subs x15, x15, #1 + add x10, x10, #64 + ldp q0, q1, [x17] + csel w3, w27, wzr, eq + orr w3, w3, w4 + mov w4, w19 + and w3, w3, #0xff + ldp q3, q6, [x21] + dup v2.4s, w3 + zip1 v21.4s, v0.4s, v3.4s + zip2 v19.4s, v0.4s, v3.4s + ldp q5, q7, [x16] + zip1 v17.4s, v1.4s, v6.4s + zip2 v22.4s, v1.4s, v6.4s + ldp q16, q18, [x6] + zip1 v4.4s, v5.4s, v16.4s + zip2 v0.4s, v5.4s, v16.4s + ldp q26, q27, [x17, #32] + zip1 v1.4s, v7.4s, v18.4s + zip2 v3.4s, v7.4s, 
v18.4s + zip2 v20.2d, v19.2d, v0.2d + mov v19.d[1], v0.d[0] + dup v18.4s, w9 + ldp q8, q9, [x21, #32] + stur q19, [x29, #-208] + zip2 v7.4s, v26.4s, v8.4s + zip1 v10.4s, v26.4s, v8.4s + ldp q11, q5, [x16, #32] + zip2 v26.2d, v17.2d, v1.2d + stp q7, q26, [sp, #192] + mov v17.d[1], v1.d[0] + add v1.4s, v23.4s, v31.4s + ldp q16, q6, [x6, #32] + stur q17, [x29, #-256] + add v1.4s, v1.4s, v19.4s + zip1 v8.4s, v11.4s, v16.4s + zip2 v7.4s, v11.4s, v16.4s + zip1 v11.4s, v27.4s, v9.4s + zip2 v9.4s, v27.4s, v9.4s + zip2 v27.2d, v21.2d, v4.2d + mov v21.d[1], v4.d[0] + str q7, [sp, #224] + add v4.4s, v28.4s, v12.4s + zip1 v15.4s, v5.4s, v6.4s + zip2 v14.4s, v5.4s, v6.4s + stur q27, [x29, #-192] + zip2 v16.2d, v22.2d, v3.2d + stp q20, q21, [x29, #-240] + add v0.4s, v4.4s, v21.4s + ldp q6, q4, [sp, #96] + mov v22.d[1], v3.d[0] + add v5.4s, v25.4s, v30.4s + add v3.4s, v13.4s, v29.4s + eor v6.16b, v1.16b, v6.16b + add v1.4s, v1.4s, v20.4s + str q22, [sp, #256] + eor v4.16b, v0.16b, v4.16b + add v5.4s, v5.4s, v22.4s + add v3.4s, v3.4s, v17.4s + ldr q17, [sp, #48] + rev32 v6.8h, v6.8h + rev32 v4.8h, v4.8h + eor v2.16b, v5.16b, v2.16b + eor v7.16b, v3.16b, v24.16b add v0.4s, v0.4s, v27.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #7 - shl v14.4s, v14.4s, #25 - add v0.4s, v0.4s, v9.4s - add v2.4s, v2.4s, v30.4s - orr v14.16b, v14.16b, v15.16b - eor v13.16b, v0.16b, v13.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v29.4s - rev32 v13.8h, v13.8h - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v14.4s - add v17.4s, v17.4s, v28.4s - add v4.4s, v4.4s, v13.4s - rev32 v16.8h, v16.8h - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - eor v9.16b, v4.16b, v9.16b - add v12.4s, v12.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v17.16b, v11.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v1.4s, v10.4s - rev32 v11.8h, v11.8h - ldr q24, [sp, #160] - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v14.16b, v1.16b, v14.16b - add v5.4s, v5.4s, v11.4s - stur q7, [x29, #-64] - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v14.4s, #12 - shl v14.4s, v14.4s, #20 - eor v8.16b, v5.16b, v8.16b - mov v7.16b, v26.16b - add v3.4s, v3.4s, v26.4s - ldur q26, [x29, #-80] - orr v14.16b, v14.16b, v15.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - add v0.4s, v0.4s, v23.4s - orr v8.16b, v8.16b, v15.16b - add v15.4s, v0.4s, v9.4s - add v2.4s, v2.4s, v24.4s - eor v0.16b, v15.16b, v13.16b - add v2.4s, v2.4s, v31.4s - ushr v13.4s, v0.4s, #8 - shl v0.4s, v0.4s, #24 - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v14.4s - add v17.4s, v17.4s, v26.4s - orr v0.16b, v0.16b, v13.16b - ushr v13.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - orr v16.16b, v16.16b, v13.16b - ushr v13.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v17.16b, v11.16b - add v4.4s, v0.4s, v4.4s - orr v10.16b, v10.16b, v13.16b - ushr v13.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v9.16b, v4.16b, v9.16b - add v12.4s, v16.4s, v12.4s - orr v11.16b, v11.16b, v13.16b - ushr v13.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v12.16b, v31.16b - orr v9.16b, v9.16b, v13.16b - ushr v13.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - add v1.4s, v10.4s, v1.4s - orr v31.16b, v31.16b, v13.16b - eor v13.16b, v1.16b, v14.16b - add v5.4s, v11.4s, v5.4s - ushr v14.4s, v13.4s, #7 - shl v13.4s, v13.4s, #25 - eor v8.16b, v5.16b, v8.16b - orr v13.16b, v13.16b, v14.16b - ushr v14.4s, v8.4s, #7 - shl v8.4s, 
v8.4s, #25 - stur q6, [x29, #-96] - orr v8.16b, v8.16b, v14.16b - add v14.4s, v15.4s, v6.4s - ldur q6, [x29, #-64] - mov v18.16b, v19.16b - add v14.4s, v14.4s, v8.4s - add v2.4s, v2.4s, v18.4s - eor v16.16b, v14.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v21.4s - rev32 v16.8h, v16.8h - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v6.4s - add v1.4s, v1.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v13.4s - eor v8.16b, v1.16b, v8.16b - add v5.4s, v5.4s, v10.4s - rev32 v11.8h, v11.8h - eor v0.16b, v17.16b, v0.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v4.4s, v11.4s - rev32 v0.8h, v0.8h + add v21.4s, v4.4s, v17.4s + rev32 v31.8h, v2.8h + ldr q2, [sp, #80] + rev32 v7.8h, v7.8h + mov v27.16b, v16.16b + eor v17.16b, v21.16b, v28.16b + add v29.4s, v6.4s, v2.4s + ldr q2, [sp, #64] + add v24.4s, v31.4s, v18.4s str q27, [sp, #176] - mov v27.16b, v30.16b - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v12.4s, v0.4s - add v14.4s, v14.4s, v25.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v13.16b, v12.16b, v13.16b - add v14.4s, v14.4s, v8.4s - add v2.4s, v2.4s, v27.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #12 - shl v13.4s, v13.4s, #20 - eor v16.16b, v14.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v20.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v7.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v13.4s - mov v30.16b, v23.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v0.16b, v17.16b, v0.16b - add v1.4s, v16.4s, v1.4s - ldur q23, [x29, #-144] - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v0.4s, #8 - shl v0.4s, v0.4s, #24 - eor v8.16b, v1.16b, v8.16b - add v5.4s, v10.4s, v5.4s - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v11.4s, v4.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v0.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v13.16b, v12.16b, v13.16b - add v14.4s, v14.4s, v23.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #7 - shl v13.4s, v13.4s, #25 - add v14.4s, v14.4s, v9.4s - add v2.4s, v2.4s, v29.4s - orr v13.16b, v13.16b, v15.16b - eor v0.16b, v14.16b, v0.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v30.4s - rev32 v0.8h, v0.8h - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v13.4s - add v17.4s, v17.4s, v26.4s - add v4.4s, v4.4s, v0.4s - rev32 v16.8h, v16.8h - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - ldur q22, [x29, #-128] - eor v9.16b, v4.16b, v9.16b - add v12.4s, v12.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v17.16b, v11.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v1.4s, v10.4s - rev32 v11.8h, v11.8h - ldr q26, [sp, #176] - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v13.16b, v1.16b, v13.16b - add v5.4s, v5.4s, v11.4s - add v14.4s, v14.4s, v24.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #12 
- shl v13.4s, v13.4s, #20 - eor v8.16b, v5.16b, v8.16b - add v14.4s, v14.4s, v9.4s - add v2.4s, v2.4s, v22.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v0.16b, v14.16b, v0.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v28.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v0.4s, #8 - shl v0.4s, v0.4s, #24 - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v13.4s + ushr v19.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v30.4s, v7.4s, v2.4s + eor v18.16b, v29.16b, v23.16b + orr v12.16b, v17.16b, v19.16b + eor v17.16b, v30.16b, v13.16b + eor v19.16b, v24.16b, v25.16b + ushr v23.4s, v18.4s, #12 + shl v18.4s, v18.4s, #20 + ushr v25.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + ushr v28.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v13.16b, v18.16b, v23.16b + orr v25.16b, v17.16b, v25.16b + orr v2.16b, v19.16b, v28.16b + add v28.4s, v0.4s, v12.4s + add v0.4s, v3.4s, v26.4s + add v18.4s, v1.4s, v13.4s + add v3.4s, v5.4s, v16.4s + eor v1.16b, v28.16b, v4.16b + add v17.4s, v0.4s, v25.4s + eor v0.16b, v18.16b, v6.16b + add v19.4s, v3.4s, v2.4s + ushr v16.4s, v1.4s, #8 + shl v3.4s, v1.4s, #24 + eor v4.16b, v17.16b, v7.16b + ushr v6.4s, v0.4s, #8 + shl v1.4s, v0.4s, #24 + eor v5.16b, v19.16b, v31.16b + ushr v23.4s, v4.4s, #8 + shl v4.4s, v4.4s, #24 + orr v7.16b, v3.16b, v16.16b + orr v6.16b, v1.16b, v6.16b + ushr v31.4s, v5.4s, #8 + shl v0.4s, v5.4s, #24 + orr v5.16b, v4.16b, v23.16b + add v4.4s, v7.4s, v21.4s + ldr q21, [sp, #192] + add v3.4s, v6.4s, v29.4s + orr v31.16b, v0.16b, v31.16b + add v23.4s, v5.4s, v30.4s + eor v0.16b, v4.16b, v12.16b + eor v1.16b, v3.16b, v13.16b + add v16.4s, v31.4s, v24.4s + eor v20.16b, v23.16b, v25.16b + ushr v24.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v29.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + ushr v30.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + orr v25.16b, v0.16b, v24.16b + orr v0.16b, v1.16b, v29.16b + mov v29.16b, v10.16b + orr v1.16b, v20.16b, v30.16b + mov v20.16b, v10.16b + mov v24.16b, v21.16b + ldr q20, [sp, #224] + mov v29.d[1], v8.d[0] + mov v13.16b, v9.16b + zip2 v30.2d, v10.2d, v8.2d + zip2 v8.2d, v21.2d, v20.2d + mov v26.16b, v11.16b + mov v24.d[1], v20.d[0] + add v20.4s, v28.4s, v29.4s + mov v13.d[1], v14.d[0] + str q8, [sp, #128] + eor v2.16b, v16.16b, v2.16b + mov v26.d[1], v15.d[0] + str q24, [sp, #192] + add v20.4s, v20.4s, v0.4s + add v19.4s, v19.4s, v13.4s + ushr v12.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + zip2 v10.2d, v9.2d, v14.2d + add v18.4s, v18.4s, v24.4s add v17.4s, v17.4s, v26.4s - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v17.16b, v11.16b - add v4.4s, v0.4s, v4.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v9.16b, v4.16b, v9.16b - add v12.4s, v16.4s, v12.4s - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v10.4s, v1.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v13.16b, v1.16b, v13.16b - add v5.4s, v11.4s, v5.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #7 - shl v13.4s, v13.4s, #25 - eor v8.16b, v5.16b, v8.16b - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - orr v8.16b, v8.16b, v15.16b - add v14.4s, v14.4s, v18.4s - add v14.4s, v14.4s, v8.4s - add v2.4s, v2.4s, v27.4s - eor 
v16.16b, v14.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v7.4s - rev32 v16.8h, v16.8h - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s + mov v14.16b, v26.16b + eor v26.16b, v20.16b, v31.16b + stp q10, q30, [sp, #224] + add v19.4s, v19.4s, v25.4s + orr v2.16b, v2.16b, v12.16b + add v18.4s, v18.4s, v1.4s + rev32 v26.8h, v26.8h + eor v5.16b, v19.16b, v5.16b + add v17.4s, v17.4s, v2.4s + eor v7.16b, v18.16b, v7.16b + add v23.4s, v23.4s, v26.4s + rev32 v5.8h, v5.8h + eor v6.16b, v17.16b, v6.16b + rev32 v7.8h, v7.8h + eor v0.16b, v23.16b, v0.16b + add v3.4s, v3.4s, v5.4s + rev32 v6.8h, v6.8h + add v16.4s, v16.4s, v7.4s + ushr v31.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v25.16b, v3.16b, v25.16b + add v4.4s, v4.4s, v6.4s + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + add v20.4s, v20.4s, v30.4s + zip2 v21.2d, v11.2d, v15.2d + ushr v11.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v31.16b + add v19.4s, v19.4s, v10.4s + add v20.4s, v20.4s, v0.4s + orr v1.16b, v1.16b, v11.16b + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v18.4s, v18.4s, v8.4s + add v19.4s, v19.4s, v25.4s + eor v26.16b, v20.16b, v26.16b + orr v2.16b, v2.16b, v11.16b add v17.4s, v17.4s, v21.4s - add v1.4s, v1.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v3.16b, v11.16b - add v17.4s, v17.4s, v13.4s - eor v8.16b, v1.16b, v8.16b - add v5.4s, v5.4s, v10.4s - rev32 v11.8h, v11.8h - eor v0.16b, v17.16b, v0.16b - add v14.4s, v14.4s, v6.4s - ldur q6, [x29, #-96] - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v4.4s, v11.4s - rev32 v0.8h, v0.8h - stur q20, [x29, #-160] - mov v20.16b, v29.16b - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v12.4s, v0.4s - mov v19.16b, v29.16b - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v13.16b, v12.16b, v13.16b - add v14.4s, v14.4s, v8.4s - add v2.4s, v2.4s, v20.4s - mov v19.16b, v28.16b - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #12 - shl v13.4s, v13.4s, #20 - eor v16.16b, v14.16b, v16.16b - add v2.4s, v2.4s, v9.4s + add v18.4s, v18.4s, v1.4s + eor v5.16b, v19.16b, v5.16b + ushr v31.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v17.4s, v17.4s, v2.4s + ushr v11.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + eor v7.16b, v18.16b, v7.16b + orr v26.16b, v26.16b, v31.16b + eor v6.16b, v17.16b, v6.16b + orr v5.16b, v5.16b, v11.16b + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + add v23.4s, v26.4s, v23.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + orr v7.16b, v7.16b, v31.16b + add v3.4s, v5.4s, v3.4s + eor v0.16b, v23.16b, v0.16b + ldp q28, q12, [x29, #-256] + orr v6.16b, v6.16b, v11.16b + add v16.4s, v7.4s, v16.4s + eor v25.16b, v3.16b, v25.16b + ushr v31.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v4.4s, v6.4s, v4.4s + ushr v11.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + add v18.4s, v18.4s, v12.4s + mov v15.16b, v29.16b + ldur q29, [x29, #-208] + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v11.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + str q15, [sp, #160] + add v20.4s, v20.4s, v29.4s + add v18.4s, v18.4s, v0.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v27.4s + eor v6.16b, v6.16b, v18.16b + orr v2.16b, v2.16b, v11.16b + 
add v19.4s, v19.4s, v28.4s + eor v7.16b, v7.16b, v20.16b + add v17.4s, v17.4s, v1.4s + rev32 v6.8h, v6.8h + add v19.4s, v19.4s, v2.4s + rev32 v7.8h, v7.8h + eor v5.16b, v17.16b, v5.16b add v3.4s, v3.4s, v6.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v19.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v3.16b, v11.16b + eor v26.16b, v19.16b, v26.16b + add v4.4s, v4.4s, v7.4s + rev32 v5.8h, v5.8h + eor v0.16b, v3.16b, v0.16b + rev32 v26.8h, v26.8h + eor v25.16b, v4.16b, v25.16b + add v23.4s, v23.4s, v5.4s + ushr v11.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + add v16.4s, v16.4s, v26.4s + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b + add v18.4s, v18.4s, v24.4s + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v20.4s, v20.4s, v22.4s + add v18.4s, v18.4s, v0.4s + mov v9.16b, v30.16b + mov v30.16b, v21.16b + ldur q21, [x29, #-224] + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + str q30, [sp, #144] + add v17.4s, v17.4s, v21.4s + ldur q21, [x29, #-192] + eor v6.16b, v18.16b, v6.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v30.4s + eor v7.16b, v20.16b, v7.16b + add v17.4s, v17.4s, v1.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + add v19.4s, v19.4s, v2.4s + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + eor v5.16b, v17.16b, v5.16b + orr v6.16b, v6.16b, v11.16b + eor v26.16b, v19.16b, v26.16b + orr v7.16b, v7.16b, v31.16b + ushr v31.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + add v3.4s, v6.4s, v3.4s + ushr v11.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v4.4s, v7.4s, v4.4s + orr v5.16b, v5.16b, v31.16b + eor v0.16b, v3.16b, v0.16b + orr v26.16b, v26.16b, v11.16b + eor v25.16b, v4.16b, v25.16b + add v23.4s, v5.4s, v23.4s + ushr v11.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v16.4s, v26.4s, v16.4s + ushr v31.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b + add v20.4s, v20.4s, v21.4s + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v0.4s + add v19.4s, v19.4s, v10.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v18.4s, v18.4s, v14.4s + eor v26.16b, v20.16b, v26.16b + add v19.4s, v19.4s, v25.4s + orr v2.16b, v2.16b, v11.16b + add v17.4s, v17.4s, v9.4s + ldr q9, [sp, #208] + add v18.4s, v18.4s, v1.4s + rev32 v26.8h, v26.8h + eor v5.16b, v19.16b, v5.16b + add v17.4s, v17.4s, v2.4s + eor v7.16b, v18.16b, v7.16b + add v23.4s, v23.4s, v26.4s + rev32 v5.8h, v5.8h + eor v6.16b, v17.16b, v6.16b + rev32 v7.8h, v7.8h + eor v0.16b, v23.16b, v0.16b + add v3.4s, v3.4s, v5.4s + rev32 v6.8h, v6.8h + add v16.4s, v16.4s, v7.4s + ushr v31.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v25.16b, v3.16b, v25.16b + add v4.4s, v4.4s, v6.4s + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + add v20.4s, v20.4s, v8.4s + ushr v11.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v31.16b + add v19.4s, v19.4s, v15.4s + add v20.4s, v20.4s, v0.4s + orr v1.16b, v1.16b, v11.16b + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v18.4s, v18.4s, v9.4s + add v19.4s, v19.4s, 
v25.4s + eor v26.16b, v20.16b, v26.16b + orr v2.16b, v2.16b, v11.16b add v17.4s, v17.4s, v13.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v0.16b, v17.16b, v0.16b - add v1.4s, v16.4s, v1.4s - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v0.4s, #8 - shl v0.4s, v0.4s, #24 - eor v8.16b, v1.16b, v8.16b - add v5.4s, v10.4s, v5.4s - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v11.4s, v4.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v4.16b, v31.16b - add v12.4s, v0.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v13.16b, v12.16b, v13.16b - add v14.4s, v14.4s, v25.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #7 - shl v13.4s, v13.4s, #25 - add v14.4s, v14.4s, v9.4s - add v2.4s, v2.4s, v30.4s - orr v13.16b, v13.16b, v15.16b - eor v0.16b, v14.16b, v0.16b - add v2.4s, v2.4s, v31.4s - add v3.4s, v3.4s, v24.4s - rev32 v0.8h, v0.8h - eor v16.16b, v2.16b, v16.16b - add v3.4s, v3.4s, v13.4s - add v17.4s, v17.4s, v26.4s - mov v29.16b, v27.16b - add v4.4s, v4.4s, v0.4s - rev32 v16.8h, v16.8h - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - ldur q27, [x29, #-160] - eor v9.16b, v4.16b, v9.16b - add v12.4s, v12.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v17.16b, v11.16b - ldur q6, [x29, #-80] - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v1.4s, v10.4s - rev32 v11.8h, v11.8h - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v13.16b, v1.16b, v13.16b - add v5.4s, v5.4s, v11.4s - add v14.4s, v14.4s, v22.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #12 - shl v13.4s, v13.4s, #20 - eor v8.16b, v5.16b, v8.16b - add v14.4s, v14.4s, v9.4s - add v2.4s, v2.4s, v27.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v0.16b, v14.16b, v0.16b - add v2.4s, v2.4s, v31.4s + add v18.4s, v18.4s, v1.4s + eor v5.16b, v19.16b, v5.16b + ushr v31.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v17.4s, v17.4s, v2.4s + ushr v11.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + eor v7.16b, v18.16b, v7.16b + orr v26.16b, v26.16b, v31.16b + eor v6.16b, v17.16b, v6.16b + orr v5.16b, v5.16b, v11.16b + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + add v23.4s, v26.4s, v23.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + orr v7.16b, v7.16b, v31.16b + add v3.4s, v5.4s, v3.4s + eor v0.16b, v23.16b, v0.16b + orr v6.16b, v6.16b, v11.16b + add v16.4s, v7.4s, v16.4s + eor v25.16b, v3.16b, v25.16b + ushr v31.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v4.4s, v6.4s, v4.4s + ushr v11.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + add v18.4s, v18.4s, v24.4s + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v11.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v12.4s + add v18.4s, v18.4s, v0.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v30.4s + eor v6.16b, v6.16b, v18.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v27.4s + eor v7.16b, v7.16b, v20.16b + add v17.4s, v17.4s, v1.4s + rev32 v6.8h, v6.8h + add v19.4s, v19.4s, v2.4s + rev32 v7.8h, v7.8h + eor v5.16b, v17.16b, v5.16b add v3.4s, v3.4s, v6.4s - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v0.4s, #8 - shl v0.4s, v0.4s, #24 - eor v16.16b, 
v2.16b, v16.16b - add v3.4s, v3.4s, v13.4s - add v17.4s, v17.4s, v23.4s - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v3.16b, v10.16b - add v17.4s, v17.4s, v8.4s - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - eor v11.16b, v17.16b, v11.16b - add v4.4s, v0.4s, v4.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v11.4s, #8 - shl v11.4s, v11.4s, #24 - eor v9.16b, v4.16b, v9.16b - add v12.4s, v16.4s, v12.4s - orr v11.16b, v11.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v31.16b, v12.16b, v31.16b - add v1.4s, v10.4s, v1.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - eor v13.16b, v1.16b, v13.16b - add v5.4s, v11.4s, v5.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #7 - shl v13.4s, v13.4s, #25 - eor v8.16b, v5.16b, v8.16b - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - orr v8.16b, v8.16b, v15.16b - add v14.4s, v14.4s, v29.4s - add v14.4s, v14.4s, v8.4s - add v2.4s, v2.4s, v20.4s - mov v28.16b, v7.16b - eor v16.16b, v14.16b, v16.16b - add v2.4s, v2.4s, v9.4s - add v3.4s, v3.4s, v19.4s - rev32 v16.8h, v16.8h - eor v10.16b, v2.16b, v10.16b - add v3.4s, v3.4s, v31.4s - add v17.4s, v17.4s, v28.4s - add v1.4s, v1.4s, v16.4s - rev32 v10.8h, v10.8h - eor v11.16b, v3.16b, v11.16b + eor v26.16b, v19.16b, v26.16b + add v4.4s, v4.4s, v7.4s + rev32 v5.8h, v5.8h + eor v0.16b, v3.16b, v0.16b + rev32 v26.8h, v26.8h + eor v25.16b, v4.16b, v25.16b + add v23.4s, v23.4s, v5.4s + ushr v11.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + add v16.4s, v16.4s, v26.4s + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b + add v18.4s, v18.4s, v14.4s + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v20.4s, v20.4s, v28.4s + add v18.4s, v18.4s, v0.4s + mov v10.16b, v13.16b + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v29.4s + eor v6.16b, v18.16b, v6.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v10.4s + eor v7.16b, v20.16b, v7.16b + add v17.4s, v17.4s, v1.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + add v19.4s, v19.4s, v2.4s + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + eor v5.16b, v17.16b, v5.16b + orr v6.16b, v6.16b, v11.16b + eor v26.16b, v19.16b, v26.16b + orr v7.16b, v7.16b, v31.16b + ushr v31.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + add v3.4s, v6.4s, v3.4s + ushr v11.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v4.4s, v7.4s, v4.4s + orr v5.16b, v5.16b, v31.16b + eor v0.16b, v3.16b, v0.16b + mov v22.16b, v8.16b + ldp q8, q28, [sp, #240] + orr v26.16b, v26.16b, v11.16b + eor v25.16b, v4.16b, v25.16b + add v23.4s, v5.4s, v23.4s + ushr v11.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v16.4s, v26.4s, v16.4s + ushr v31.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b + add v20.4s, v20.4s, v28.4s + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v0.4s + add v19.4s, v19.4s, v15.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v18.4s, v18.4s, v8.4s + eor v26.16b, v20.16b, v26.16b + add v19.4s, v19.4s, v25.4s + orr v2.16b, v2.16b, v11.16b + add v17.4s, v17.4s, v22.4s + ldur q22, [x29, #-256] + add v18.4s, v18.4s, v1.4s + rev32 v26.8h, 
v26.8h + eor v5.16b, v19.16b, v5.16b + add v17.4s, v17.4s, v2.4s + eor v7.16b, v18.16b, v7.16b + add v23.4s, v23.4s, v26.4s + rev32 v5.8h, v5.8h + eor v6.16b, v17.16b, v6.16b + rev32 v7.8h, v7.8h + eor v0.16b, v23.16b, v0.16b + add v3.4s, v3.4s, v5.4s + rev32 v6.8h, v6.8h + add v16.4s, v16.4s, v7.4s + ushr v31.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v25.16b, v3.16b, v25.16b + add v4.4s, v4.4s, v6.4s + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + add v20.4s, v20.4s, v9.4s + mov v13.16b, v12.16b + mov v12.16b, v27.16b + mov v27.16b, v9.16b + ldur q9, [x29, #-192] + mov v21.16b, v15.16b + ldr q15, [sp, #224] + ushr v11.4s, v1.4s, #12 + ldur q21, [x29, #-224] + shl v1.4s, v1.4s, #20 + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v31.16b + add v19.4s, v19.4s, v9.4s + add v20.4s, v20.4s, v0.4s + orr v1.16b, v1.16b, v11.16b + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v18.4s, v18.4s, v21.4s + add v19.4s, v19.4s, v25.4s + eor v26.16b, v20.16b, v26.16b + orr v2.16b, v2.16b, v11.16b + add v17.4s, v17.4s, v15.4s + add v18.4s, v18.4s, v1.4s + eor v5.16b, v19.16b, v5.16b + ushr v31.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v17.4s, v17.4s, v2.4s + ushr v11.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + eor v7.16b, v18.16b, v7.16b + orr v26.16b, v26.16b, v31.16b + eor v6.16b, v17.16b, v6.16b + orr v5.16b, v5.16b, v11.16b + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + add v23.4s, v26.4s, v23.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + orr v7.16b, v7.16b, v31.16b + add v3.4s, v5.4s, v3.4s + eor v0.16b, v23.16b, v0.16b + orr v6.16b, v6.16b, v11.16b + add v16.4s, v7.4s, v16.4s + eor v25.16b, v3.16b, v25.16b + ushr v31.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v4.4s, v6.4s, v4.4s + ushr v11.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + add v18.4s, v18.4s, v14.4s + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v11.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v24.4s + add v18.4s, v18.4s, v0.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v10.4s + eor v6.16b, v6.16b, v18.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v30.4s + eor v7.16b, v7.16b, v20.16b + add v17.4s, v17.4s, v1.4s + rev32 v6.8h, v6.8h + add v19.4s, v19.4s, v2.4s + rev32 v7.8h, v7.8h + eor v5.16b, v17.16b, v5.16b + add v3.4s, v3.4s, v6.4s + eor v26.16b, v19.16b, v26.16b + add v4.4s, v4.4s, v7.4s + rev32 v5.8h, v5.8h + eor v0.16b, v3.16b, v0.16b + rev32 v26.8h, v26.8h + eor v25.16b, v4.16b, v25.16b + add v23.4s, v23.4s, v5.4s + ushr v11.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + add v16.4s, v16.4s, v26.4s + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b + add v18.4s, v18.4s, v8.4s + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v20.4s, v20.4s, v12.4s + add v18.4s, v18.4s, v0.4s + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s add v17.4s, v17.4s, v13.4s - eor v8.16b, v1.16b, v8.16b - add v5.4s, v5.4s, v10.4s - rev32 v11.8h, v11.8h - eor v0.16b, v17.16b, v0.16b - ushr v15.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - eor v9.16b, v5.16b, v9.16b - add v4.4s, v4.4s, v11.4s - rev32 v0.8h, v0.8h - orr v8.16b, v8.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor 
v31.16b, v4.16b, v31.16b - add v12.4s, v12.4s, v0.4s - add v14.4s, v14.4s, v21.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - eor v13.16b, v12.16b, v13.16b - add v14.4s, v14.4s, v8.4s - add v2.4s, v2.4s, v30.4s - orr v31.16b, v31.16b, v15.16b - ushr v15.4s, v13.4s, #12 - shl v13.4s, v13.4s, #20 - eor v16.16b, v14.16b, v16.16b - add v2.4s, v2.4s, v9.4s - orr v13.16b, v13.16b, v15.16b - ushr v15.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v10.16b, v2.16b, v10.16b - orr v16.16b, v16.16b, v15.16b - ushr v15.4s, v10.4s, #8 - shl v10.4s, v10.4s, #24 - add v3.4s, v3.4s, v18.4s - orr v10.16b, v10.16b, v15.16b - add v15.4s, v3.4s, v31.4s - eor v3.16b, v15.16b, v11.16b - ushr v11.4s, v3.4s, #8 - shl v3.4s, v3.4s, #24 - orr v11.16b, v3.16b, v11.16b - add v3.4s, v17.4s, v6.4s - add v17.4s, v3.4s, v13.4s - eor v0.16b, v17.16b, v0.16b - ushr v3.4s, v0.4s, #8 - shl v0.4s, v0.4s, #24 - add v1.4s, v16.4s, v1.4s - orr v0.16b, v0.16b, v3.16b - eor v3.16b, v1.16b, v8.16b - ushr v8.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - add v5.4s, v10.4s, v5.4s - orr v8.16b, v3.16b, v8.16b - eor v3.16b, v5.16b, v9.16b - add v4.4s, v11.4s, v4.4s - ushr v9.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - eor v31.16b, v4.16b, v31.16b - mov v7.16b, v23.16b - mov v23.16b, v28.16b - mov v28.16b, v6.16b - orr v3.16b, v3.16b, v9.16b - ushr v9.4s, v31.4s, #7 - shl v31.4s, v31.4s, #25 - ldur q6, [x29, #-64] - orr v31.16b, v31.16b, v9.16b - add v9.4s, v0.4s, v12.4s - eor v12.16b, v9.16b, v13.16b - ushr v13.4s, v12.4s, #7 - shl v12.4s, v12.4s, #25 - orr v12.16b, v12.16b, v13.16b - add v13.4s, v14.4s, v6.4s - add v13.4s, v13.4s, v3.4s - eor v0.16b, v13.16b, v0.16b - add v2.4s, v2.4s, v24.4s - rev32 v14.8h, v0.8h - add v0.4s, v2.4s, v31.4s - add v6.4s, v4.4s, v14.4s - eor v2.16b, v0.16b, v16.16b - eor v3.16b, v6.16b, v3.16b - rev32 v16.8h, v2.8h - ushr v4.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - add v2.4s, v9.4s, v16.4s - orr v4.16b, v3.16b, v4.16b - eor v3.16b, v2.16b, v31.16b - ushr v31.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - orr v3.16b, v3.16b, v31.16b - add v31.4s, v15.4s, v22.4s - add v31.4s, v31.4s, v12.4s - add v17.4s, v17.4s, v7.4s - eor v9.16b, v31.16b, v10.16b - add v17.4s, v17.4s, v8.4s - rev32 v9.8h, v9.8h - eor v11.16b, v17.16b, v11.16b - add v1.4s, v1.4s, v9.4s - rev32 v11.8h, v11.8h - eor v10.16b, v1.16b, v12.16b - add v5.4s, v5.4s, v11.4s - ushr v12.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v8.16b, v5.16b, v8.16b - orr v10.16b, v10.16b, v12.16b - ushr v12.4s, v8.4s, #12 - shl v8.4s, v8.4s, #20 - orr v8.16b, v8.16b, v12.16b - add v12.4s, v13.4s, v27.4s - add v12.4s, v12.4s, v4.4s - eor v13.16b, v12.16b, v14.16b - ldur q14, [x29, #-96] - mov v25.16b, v29.16b - add v29.4s, v12.4s, v20.4s - add v20.4s, v31.4s, v26.4s - add v0.4s, v0.4s, v14.4s - add v0.4s, v0.4s, v3.4s - eor v16.16b, v0.16b, v16.16b - add v0.4s, v0.4s, v30.4s - ldur q30, [x29, #-112] + ldr q13, [sp, #160] + eor v6.16b, v18.16b, v6.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v15.4s + eor v7.16b, v20.16b, v7.16b + add v17.4s, v17.4s, v1.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + add v19.4s, v19.4s, v2.4s + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + eor v5.16b, v17.16b, v5.16b + orr v6.16b, v6.16b, v11.16b + eor v26.16b, v19.16b, v26.16b + orr v7.16b, v7.16b, v31.16b + ushr v31.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + add v3.4s, v6.4s, v3.4s + ushr v11.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v4.4s, v7.4s, v4.4s + orr v5.16b, v5.16b, v31.16b + eor v0.16b, v3.16b, v0.16b + orr v26.16b, 
v26.16b, v11.16b + eor v25.16b, v4.16b, v25.16b + add v23.4s, v5.4s, v23.4s + ushr v11.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v16.4s, v26.4s, v16.4s + ushr v31.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b + add v20.4s, v20.4s, v22.4s + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v0.4s + add v19.4s, v19.4s, v9.4s + mov v29.16b, v14.16b + ldr q14, [sp, #128] + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v18.4s, v18.4s, v14.4s + eor v26.16b, v20.16b, v26.16b + add v19.4s, v19.4s, v25.4s + orr v2.16b, v2.16b, v11.16b + add v17.4s, v17.4s, v27.4s + add v18.4s, v18.4s, v1.4s + rev32 v26.8h, v26.8h + eor v5.16b, v19.16b, v5.16b + add v17.4s, v17.4s, v2.4s + eor v7.16b, v18.16b, v7.16b + add v23.4s, v23.4s, v26.4s + rev32 v5.8h, v5.8h + eor v6.16b, v17.16b, v6.16b + rev32 v7.8h, v7.8h + eor v0.16b, v23.16b, v0.16b + add v3.4s, v3.4s, v5.4s + rev32 v6.8h, v6.8h + add v16.4s, v16.4s, v7.4s + ushr v31.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v25.16b, v3.16b, v25.16b + add v4.4s, v4.4s, v6.4s + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + add v20.4s, v20.4s, v21.4s + ushr v11.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v31.16b + add v19.4s, v19.4s, v28.4s + add v20.4s, v20.4s, v0.4s + mov v12.16b, v27.16b + ldur q27, [x29, #-208] + orr v1.16b, v1.16b, v11.16b + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v18.4s, v18.4s, v27.4s + add v19.4s, v19.4s, v25.4s + eor v26.16b, v20.16b, v26.16b + orr v2.16b, v2.16b, v11.16b + add v17.4s, v17.4s, v13.4s + add v18.4s, v18.4s, v1.4s + eor v5.16b, v19.16b, v5.16b + ushr v31.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v17.4s, v17.4s, v2.4s + ushr v11.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + eor v7.16b, v18.16b, v7.16b + orr v26.16b, v26.16b, v31.16b + eor v6.16b, v17.16b, v6.16b + orr v5.16b, v5.16b, v11.16b + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + add v23.4s, v26.4s, v23.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + orr v7.16b, v7.16b, v31.16b + add v3.4s, v5.4s, v3.4s + eor v0.16b, v23.16b, v0.16b + orr v6.16b, v6.16b, v11.16b + add v16.4s, v7.4s, v16.4s + eor v25.16b, v3.16b, v25.16b + ushr v31.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + add v4.4s, v6.4s, v4.4s + ushr v11.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + add v18.4s, v18.4s, v8.4s + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v11.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v29.4s + add v18.4s, v18.4s, v0.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v15.4s + eor v6.16b, v6.16b, v18.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v10.4s + eor v7.16b, v7.16b, v20.16b + add v17.4s, v17.4s, v1.4s + rev32 v6.8h, v6.8h + add v19.4s, v19.4s, v2.4s + rev32 v7.8h, v7.8h + eor v5.16b, v17.16b, v5.16b + add v3.4s, v3.4s, v6.4s + eor v26.16b, v19.16b, v26.16b + add v4.4s, v4.4s, v7.4s + rev32 v5.8h, v5.8h + eor v0.16b, v3.16b, v0.16b + rev32 v26.8h, v26.8h + eor v25.16b, v4.16b, v25.16b + add v23.4s, v23.4s, v5.4s + ushr v11.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + add v16.4s, v16.4s, v26.4s + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, 
v11.16b + add v18.4s, v18.4s, v14.4s + mov v30.16b, v29.16b + mov v29.16b, v15.16b + ldr q15, [sp, #144] + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v20.4s, v20.4s, v15.4s + add v18.4s, v18.4s, v0.4s + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v24.4s + eor v6.16b, v18.16b, v6.16b + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v13.4s + eor v7.16b, v20.16b, v7.16b + add v17.4s, v17.4s, v1.4s + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + add v19.4s, v19.4s, v2.4s + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + eor v5.16b, v17.16b, v5.16b + orr v6.16b, v6.16b, v11.16b + eor v26.16b, v19.16b, v26.16b + orr v7.16b, v7.16b, v31.16b + ushr v31.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + add v3.4s, v6.4s, v3.4s + ushr v11.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v4.4s, v7.4s, v4.4s + orr v5.16b, v5.16b, v31.16b + eor v0.16b, v3.16b, v0.16b + orr v26.16b, v26.16b, v11.16b + eor v25.16b, v4.16b, v25.16b + add v23.4s, v5.4s, v23.4s + ushr v11.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + mov v9.16b, v28.16b + mov v28.16b, v10.16b + ldr q10, [sp, #176] + add v16.4s, v26.4s, v16.4s + ushr v31.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v23.16b, v1.16b + orr v0.16b, v0.16b, v11.16b add v20.4s, v20.4s, v10.4s - eor v31.16b, v20.16b, v9.16b + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v0.4s + add v19.4s, v19.4s, v9.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v18.4s, v18.4s, v12.4s + eor v26.16b, v20.16b, v26.16b + add v19.4s, v19.4s, v25.4s + orr v2.16b, v2.16b, v11.16b + add v17.4s, v17.4s, v21.4s + add v18.4s, v18.4s, v1.4s + rev32 v26.8h, v26.8h + eor v5.16b, v19.16b, v5.16b + add v17.4s, v17.4s, v2.4s + eor v7.16b, v18.16b, v7.16b + add v23.4s, v23.4s, v26.4s + rev32 v5.8h, v5.8h + eor v6.16b, v17.16b, v6.16b + rev32 v7.8h, v7.8h + eor v0.16b, v23.16b, v0.16b + add v3.4s, v3.4s, v5.4s + rev32 v6.8h, v6.8h + add v16.4s, v16.4s, v7.4s + ushr v31.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v25.16b, v3.16b, v25.16b + add v4.4s, v4.4s, v6.4s + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + ushr v11.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + eor v2.16b, v4.16b, v2.16b + add v20.4s, v20.4s, v27.4s + orr v25.16b, v25.16b, v31.16b + add v19.4s, v19.4s, v22.4s + mov v9.16b, v22.16b + ldur q22, [x29, #-240] + orr v1.16b, v1.16b, v11.16b + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v20.4s, v20.4s, v0.4s + add v18.4s, v18.4s, v22.4s + add v19.4s, v19.4s, v25.4s + mov v24.16b, v21.16b + ldur q21, [x29, #-192] + orr v2.16b, v2.16b, v11.16b + eor v26.16b, v20.16b, v26.16b + add v17.4s, v17.4s, v21.4s + add v18.4s, v18.4s, v1.4s + eor v5.16b, v19.16b, v5.16b + ushr v31.4s, v26.4s, #8 + add v17.4s, v17.4s, v2.4s + shl v26.4s, v26.4s, #24 + ushr v11.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + eor v7.16b, v18.16b, v7.16b + orr v26.16b, v26.16b, v31.16b + eor v6.16b, v17.16b, v6.16b + orr v5.16b, v5.16b, v11.16b + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + add v23.4s, v26.4s, v23.4s + orr v7.16b, v7.16b, v31.16b + add v3.4s, v5.4s, v3.4s + orr v6.16b, v6.16b, v11.16b + eor v0.16b, v23.16b, v0.16b + add v16.4s, v7.4s, v16.4s + eor v25.16b, v3.16b, v25.16b + add v4.4s, v6.4s, v4.4s + ushr 
v31.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v11.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v16.16b, v1.16b + orr v0.16b, v0.16b, v31.16b + eor v2.16b, v4.16b, v2.16b + orr v25.16b, v25.16b, v11.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v8.4s + add v18.4s, v18.4s, v14.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s + add v17.4s, v17.4s, v13.4s + add v18.4s, v18.4s, v0.4s + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v29.4s + eor v7.16b, v7.16b, v20.16b + add v17.4s, v17.4s, v1.4s + eor v6.16b, v6.16b, v18.16b + add v19.4s, v19.4s, v2.4s + rev32 v7.8h, v7.8h + eor v5.16b, v17.16b, v5.16b + rev32 v6.8h, v6.8h + eor v26.16b, v19.16b, v26.16b + add v4.4s, v4.4s, v7.4s + rev32 v5.8h, v5.8h + add v3.4s, v3.4s, v6.4s + rev32 v26.8h, v26.8h + eor v25.16b, v4.16b, v25.16b + add v23.4s, v23.4s, v5.4s + eor v0.16b, v3.16b, v0.16b + add v16.4s, v16.4s, v26.4s + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + ushr v11.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v1.16b, v23.16b, v1.16b + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + orr v0.16b, v0.16b, v11.16b + ushr v31.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 add v20.4s, v20.4s, v28.4s + add v18.4s, v18.4s, v12.4s + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v1.16b, v1.16b, v31.16b + add v20.4s, v20.4s, v25.4s add v17.4s, v17.4s, v30.4s - add v17.4s, v17.4s, v8.4s - eor v9.16b, v17.16b, v11.16b - ushr v28.4s, v13.4s, #8 - shl v11.4s, v13.4s, #24 - orr v28.16b, v11.16b, v28.16b - ushr v11.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - orr v16.16b, v16.16b, v11.16b - ushr v11.4s, v31.4s, #8 - shl v31.4s, v31.4s, #24 - add v6.4s, v28.4s, v6.4s - orr v31.16b, v31.16b, v11.16b - ushr v11.4s, v9.4s, #8 - shl v9.4s, v9.4s, #24 - add v2.4s, v16.4s, v2.4s - eor v4.16b, v6.16b, v4.16b - orr v9.16b, v9.16b, v11.16b - add v1.4s, v31.4s, v1.4s - eor v3.16b, v2.16b, v3.16b - ushr v11.4s, v4.4s, #7 - shl v4.4s, v4.4s, #25 - add v5.4s, v9.4s, v5.4s - eor v10.16b, v1.16b, v10.16b - orr v4.16b, v4.16b, v11.16b - ushr v11.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - eor v8.16b, v5.16b, v8.16b - orr v3.16b, v3.16b, v11.16b - ushr v11.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - orr v10.16b, v10.16b, v11.16b - ushr v11.4s, v8.4s, #7 - shl v8.4s, v8.4s, #25 - orr v8.16b, v8.16b, v11.16b - add v29.4s, v29.4s, v8.4s - eor v16.16b, v29.16b, v16.16b - add v0.4s, v0.4s, v4.4s - mov v12.16b, v26.16b - add v17.4s, v17.4s, v19.4s - add v26.4s, v29.4s, v23.4s - eor v29.16b, v0.16b, v31.16b - add v20.4s, v20.4s, v3.4s - rev32 v16.8h, v16.8h - stur q18, [x29, #-176] - mov v18.16b, v27.16b - add v0.4s, v0.4s, v24.4s - eor v27.16b, v20.16b, v9.16b - add v17.4s, v17.4s, v10.4s - rev32 v24.8h, v29.8h - add v1.4s, v1.4s, v16.4s + add v18.4s, v18.4s, v0.4s + orr v2.16b, v2.16b, v11.16b + add v19.4s, v19.4s, v21.4s + eor v7.16b, v20.16b, v7.16b + add v17.4s, v17.4s, v1.4s + eor v6.16b, v18.16b, v6.16b + add v19.4s, v19.4s, v2.4s + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + eor v5.16b, v17.16b, v5.16b + orr v7.16b, v7.16b, v31.16b + eor v26.16b, v19.16b, v26.16b + orr v6.16b, v6.16b, v11.16b + ushr v31.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + ushr v11.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + add v4.4s, v7.4s, v4.4s + orr v5.16b, v5.16b, v31.16b + add v3.4s, v6.4s, v3.4s + orr v26.16b, v26.16b, v11.16b + eor v25.16b, v4.16b, v25.16b + add v23.4s, v5.4s, v23.4s + eor v0.16b, v3.16b, v0.16b + add v16.4s, 
v26.4s, v16.4s + ushr v31.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + ushr v11.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v1.16b, v23.16b, v1.16b + orr v25.16b, v25.16b, v31.16b + eor v2.16b, v16.16b, v2.16b + orr v0.16b, v0.16b, v11.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v20.4s, v20.4s, v15.4s + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v1.16b, v31.16b + add v18.4s, v18.4s, v24.4s + add v20.4s, v20.4s, v0.4s + add v19.4s, v19.4s, v9.4s + mov v8.16b, v13.16b + ldur q13, [x29, #-208] + orr v2.16b, v2.16b, v11.16b + add v18.4s, v18.4s, v1.4s + add v17.4s, v17.4s, v13.4s + eor v26.16b, v20.16b, v26.16b + add v19.4s, v19.4s, v25.4s + eor v7.16b, v18.16b, v7.16b + add v17.4s, v17.4s, v2.4s + rev32 v26.8h, v26.8h + eor v5.16b, v19.16b, v5.16b + rev32 v7.8h, v7.8h + eor v6.16b, v17.16b, v6.16b + add v23.4s, v23.4s, v26.4s + rev32 v5.8h, v5.8h + add v16.4s, v16.4s, v7.4s + rev32 v6.8h, v6.8h + eor v0.16b, v23.16b, v0.16b + add v3.4s, v3.4s, v5.4s + eor v1.16b, v16.16b, v1.16b + add v4.4s, v4.4s, v6.4s + ushr v31.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v25.16b, v3.16b, v25.16b + ushr v11.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + orr v0.16b, v0.16b, v31.16b + eor v2.16b, v4.16b, v2.16b + ushr v31.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + orr v1.16b, v1.16b, v11.16b + ushr v11.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v20.4s, v20.4s, v22.4s + orr v25.16b, v25.16b, v31.16b + add v19.4s, v19.4s, v10.4s + mov v27.16b, v12.16b + mov v12.16b, v30.16b + mov v29.16b, v21.16b + mov v21.16b, v24.16b + ldr q24, [sp, #192] + mov v30.16b, v22.16b + ldr q22, [sp, #256] + orr v2.16b, v2.16b, v11.16b + add v20.4s, v20.4s, v0.4s + add v18.4s, v18.4s, v24.4s + add v19.4s, v19.4s, v25.4s + add v17.4s, v17.4s, v22.4s + eor v26.16b, v20.16b, v26.16b + add v18.4s, v18.4s, v1.4s + eor v5.16b, v19.16b, v5.16b + add v17.4s, v17.4s, v2.4s + ushr v31.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + ushr v11.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + eor v7.16b, v18.16b, v7.16b + eor v6.16b, v17.16b, v6.16b + orr v26.16b, v26.16b, v31.16b + orr v5.16b, v5.16b, v11.16b + ushr v31.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + ushr v11.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + add v23.4s, v26.4s, v23.4s + orr v7.16b, v7.16b, v31.16b + add v3.4s, v5.4s, v3.4s + orr v6.16b, v6.16b, v11.16b + eor v0.16b, v23.16b, v0.16b + add v16.4s, v7.4s, v16.4s + eor v25.16b, v3.16b, v25.16b + add v4.4s, v6.4s, v4.4s + ushr v31.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v11.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + eor v1.16b, v16.16b, v1.16b + eor v2.16b, v4.16b, v2.16b + orr v0.16b, v0.16b, v31.16b + orr v25.16b, v25.16b, v11.16b + ushr v31.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + ushr v11.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v20.4s, v20.4s, v14.4s + add v18.4s, v18.4s, v27.4s + ldr q27, [sp, #224] + orr v1.16b, v1.16b, v31.16b + orr v2.16b, v2.16b, v11.16b add v20.4s, v20.4s, v25.4s - eor v25.16b, v17.16b, v28.16b - rev32 v27.8h, v27.8h - add v5.4s, v5.4s, v24.4s - eor v28.16b, v1.16b, v8.16b - rev32 v25.8h, v25.8h - add v6.4s, v6.4s, v27.4s - eor v4.16b, v5.16b, v4.16b - ushr v31.4s, v28.4s, #12 - shl v28.4s, v28.4s, #20 - add v2.4s, v2.4s, v25.4s - eor v3.16b, v6.16b, v3.16b - orr v28.16b, v28.16b, v31.16b - ushr v31.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - eor v29.16b, v2.16b, v10.16b - orr v4.16b, v4.16b, v31.16b - ushr v31.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - add v26.4s, v26.4s, v28.4s - orr v3.16b, v3.16b, v31.16b - ushr v31.4s, v29.4s, #12 - shl v29.4s, v29.4s, #20 - eor v16.16b, v26.16b, 
v16.16b - add v0.4s, v0.4s, v4.4s - add v17.4s, v17.4s, v12.4s - orr v29.16b, v29.16b, v31.16b - eor v24.16b, v0.16b, v24.16b - add v0.4s, v0.4s, v22.4s - add v20.4s, v20.4s, v3.4s - ushr v22.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - add v23.4s, v26.4s, v21.4s - eor v21.16b, v20.16b, v27.16b add v17.4s, v17.4s, v29.4s - orr v16.16b, v16.16b, v22.16b - ushr v22.4s, v24.4s, #8 - shl v24.4s, v24.4s, #24 - eor v25.16b, v17.16b, v25.16b - orr v22.16b, v24.16b, v22.16b + add v18.4s, v18.4s, v0.4s + add v19.4s, v19.4s, v8.4s + eor v7.16b, v7.16b, v20.16b + add v17.4s, v17.4s, v1.4s + eor v6.16b, v6.16b, v18.16b + add v19.4s, v19.4s, v2.4s + rev32 v7.8h, v7.8h + eor v5.16b, v17.16b, v5.16b + rev32 v6.8h, v6.8h + eor v26.16b, v19.16b, v26.16b + add v4.4s, v4.4s, v7.4s + rev32 v5.8h, v5.8h + add v3.4s, v3.4s, v6.4s + rev32 v26.8h, v26.8h + eor v25.16b, v4.16b, v25.16b + add v23.4s, v23.4s, v5.4s + eor v0.16b, v3.16b, v0.16b + add v16.4s, v16.4s, v26.4s + ushr v29.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + ushr v31.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v1.16b, v23.16b, v1.16b + eor v2.16b, v16.16b, v2.16b + orr v25.16b, v25.16b, v29.16b + orr v0.16b, v0.16b, v31.16b + ushr v29.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + ushr v31.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v18.4s, v18.4s, v21.4s + ldr q21, [sp, #240] + add v20.4s, v20.4s, v27.4s + prfm pldl1keep, [x17, #256] + orr v1.16b, v1.16b, v29.16b + prfm pldl1keep, [x21, #256] + orr v2.16b, v2.16b, v31.16b + prfm pldl1keep, [x16, #256] + add v18.4s, v18.4s, v0.4s + prfm pldl1keep, [x6, #256] + add v17.4s, v17.4s, v21.4s + add v19.4s, v19.4s, v22.4s + add v20.4s, v20.4s, v25.4s + eor v6.16b, v18.16b, v6.16b + add v17.4s, v17.4s, v1.4s + add v19.4s, v19.4s, v2.4s + eor v7.16b, v20.16b, v7.16b + ushr v22.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + eor v5.16b, v17.16b, v5.16b + eor v26.16b, v19.16b, v26.16b + ushr v21.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + orr v6.16b, v6.16b, v22.16b + ushr v22.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 + ushr v29.4s, v26.4s, #8 + shl v26.4s, v26.4s, #24 + orr v7.16b, v7.16b, v21.16b + orr v5.16b, v5.16b, v22.16b + add v3.4s, v6.4s, v3.4s + orr v21.16b, v26.16b, v29.16b + add v4.4s, v7.4s, v4.4s + add v22.4s, v5.4s, v23.4s + eor v0.16b, v3.16b, v0.16b + add v16.4s, v21.4s, v16.4s + eor v23.16b, v4.16b, v25.16b + eor v1.16b, v22.16b, v1.16b + ushr v25.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v2.16b, v16.16b, v2.16b + ushr v26.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + orr v0.16b, v0.16b, v25.16b + ushr v25.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + ushr v29.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v20.4s, v20.4s, v28.4s + orr v23.16b, v23.16b, v26.16b + orr v1.16b, v1.16b, v25.16b + orr v2.16b, v2.16b, v29.16b + add v20.4s, v20.4s, v0.4s + add v18.4s, v18.4s, v13.4s + add v17.4s, v17.4s, v30.4s + add v19.4s, v19.4s, v10.4s + eor v21.16b, v20.16b, v21.16b + add v18.4s, v18.4s, v1.4s + add v17.4s, v17.4s, v2.4s + add v19.4s, v19.4s, v23.4s + rev32 v21.8h, v21.8h + eor v7.16b, v18.16b, v7.16b + eor v6.16b, v17.16b, v6.16b + eor v5.16b, v19.16b, v5.16b + add v22.4s, v22.4s, v21.4s + rev32 v7.8h, v7.8h + rev32 v6.8h, v6.8h + rev32 v5.8h, v5.8h + eor v0.16b, v22.16b, v0.16b + add v16.4s, v16.4s, v7.4s + add v4.4s, v4.4s, v6.4s + add v3.4s, v3.4s, v5.4s + ushr v25.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v1.16b, v16.16b, v1.16b + eor v2.16b, v4.16b, v2.16b + eor v23.16b, v3.16b, v23.16b + orr v0.16b, v0.16b, v25.16b + ushr v25.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + ushr v26.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + 
ushr v27.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + orr v1.16b, v1.16b, v25.16b + add v20.4s, v20.4s, v24.4s + orr v2.16b, v2.16b, v26.16b + orr v23.16b, v23.16b, v27.16b + add v18.4s, v18.4s, v12.4s + add v17.4s, v17.4s, v9.4s + add v19.4s, v19.4s, v15.4s + add v20.4s, v20.4s, v0.4s + add v18.4s, v18.4s, v1.4s + add v17.4s, v17.4s, v2.4s + add v19.4s, v19.4s, v23.4s + eor v21.16b, v20.16b, v21.16b + eor v7.16b, v18.16b, v7.16b + eor v6.16b, v17.16b, v6.16b + eor v5.16b, v19.16b, v5.16b ushr v24.4s, v21.4s, #8 shl v21.4s, v21.4s, #24 + ushr v25.4s, v7.4s, #8 + shl v7.4s, v7.4s, #24 + ushr v26.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + ushr v27.4s, v5.4s, #8 + shl v5.4s, v5.4s, #24 orr v21.16b, v21.16b, v24.16b - ushr v24.4s, v25.4s, #8 - shl v25.4s, v25.4s, #24 - add v1.4s, v16.4s, v1.4s - orr v24.16b, v25.16b, v24.16b - add v5.4s, v22.4s, v5.4s - eor v25.16b, v1.16b, v28.16b - add v6.4s, v21.4s, v6.4s - eor v4.16b, v5.16b, v4.16b - ushr v27.4s, v25.4s, #7 - shl v25.4s, v25.4s, #25 - add v2.4s, v24.4s, v2.4s - eor v3.16b, v6.16b, v3.16b - orr v25.16b, v25.16b, v27.16b - ushr v27.4s, v4.4s, #7 - shl v4.4s, v4.4s, #25 - ldur q19, [x29, #-176] - eor v26.16b, v2.16b, v29.16b - orr v4.16b, v4.16b, v27.16b - ushr v27.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - orr v3.16b, v3.16b, v27.16b - ushr v27.4s, v26.4s, #7 - shl v26.4s, v26.4s, #25 - add v20.4s, v20.4s, v18.4s - add v17.4s, v17.4s, v30.4s - orr v26.16b, v26.16b, v27.16b - add v0.4s, v0.4s, v3.4s - eor v16.16b, v0.16b, v16.16b - add v0.4s, v0.4s, v19.4s - add v19.4s, v20.4s, v26.4s - add v17.4s, v17.4s, v25.4s - eor v20.16b, v19.16b, v22.16b - add v7.4s, v19.4s, v7.4s - eor v19.16b, v17.16b, v21.16b - ldur q21, [x29, #-64] - add v23.4s, v23.4s, v4.4s - eor v24.16b, v23.16b, v24.16b - rev32 v16.8h, v16.8h - add v17.4s, v17.4s, v21.4s - rev32 v21.8h, v24.8h - add v6.4s, v6.4s, v21.4s - rev32 v20.8h, v20.8h - add v2.4s, v2.4s, v16.4s - eor v4.16b, v6.16b, v4.16b - rev32 v19.8h, v19.8h - add v1.4s, v1.4s, v20.4s - eor v3.16b, v2.16b, v3.16b - ushr v24.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - add v5.4s, v5.4s, v19.4s - eor v22.16b, v1.16b, v26.16b - orr v4.16b, v4.16b, v24.16b - ushr v24.4s, v3.4s, #12 - shl v3.4s, v3.4s, #20 - add v18.4s, v23.4s, v14.4s - eor v23.16b, v5.16b, v25.16b - orr v3.16b, v3.16b, v24.16b - ushr v24.4s, v22.4s, #12 - shl v22.4s, v22.4s, #20 - orr v22.16b, v22.16b, v24.16b - ushr v24.4s, v23.4s, #12 - shl v23.4s, v23.4s, #20 - orr v23.16b, v23.16b, v24.16b - add v18.4s, v18.4s, v4.4s - add v0.4s, v0.4s, v3.4s - add v24.4s, v17.4s, v23.4s - eor v17.16b, v18.16b, v21.16b - add v7.4s, v7.4s, v22.4s - eor v16.16b, v0.16b, v16.16b - ushr v21.4s, v17.4s, #8 - shl v17.4s, v17.4s, #24 - eor v20.16b, v7.16b, v20.16b - orr v21.16b, v17.16b, v21.16b - ushr v17.4s, v16.4s, #8 - shl v16.4s, v16.4s, #24 - eor v19.16b, v24.16b, v19.16b - orr v16.16b, v16.16b, v17.16b - ushr v17.4s, v20.4s, #8 - shl v20.4s, v20.4s, #24 - orr v25.16b, v20.16b, v17.16b - ushr v17.4s, v19.4s, #8 - shl v19.4s, v19.4s, #24 - orr v19.16b, v19.16b, v17.16b - add v1.4s, v25.4s, v1.4s - eor v22.16b, v1.16b, v22.16b - eor v20.16b, v1.16b, v18.16b - add v1.4s, v19.4s, v5.4s - eor v26.16b, v1.16b, v0.16b - add v0.4s, v21.4s, v6.4s - eor v5.16b, v1.16b, v23.16b - eor v1.16b, v0.16b, v4.16b - eor v17.16b, v0.16b, v7.16b - add v0.4s, v16.4s, v2.4s - eor v2.16b, v0.16b, v3.16b - eor v6.16b, v0.16b, v24.16b - ushr v0.4s, v1.4s, #7 + orr v7.16b, v7.16b, v25.16b + orr v6.16b, v6.16b, v26.16b + orr v5.16b, v5.16b, v27.16b + add v22.4s, v21.4s, v22.4s + add v16.4s, v7.4s, v16.4s 
+ add v4.4s, v6.4s, v4.4s + add v3.4s, v5.4s, v3.4s + eor v0.16b, v22.16b, v0.16b + eor v1.16b, v16.16b, v1.16b + eor v2.16b, v4.16b, v2.16b + eor v23.16b, v3.16b, v23.16b + ushr v24.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v25.4s, v1.4s, #7 shl v1.4s, v1.4s, #25 - orr v0.16b, v1.16b, v0.16b - ushr v1.4s, v2.4s, #7 + ushr v26.4s, v2.4s, #7 shl v2.4s, v2.4s, #25 - orr v1.16b, v2.16b, v1.16b - ushr v2.4s, v22.4s, #7 - shl v3.4s, v22.4s, #25 - orr v2.16b, v3.16b, v2.16b - ushr v3.4s, v5.4s, #7 - shl v4.4s, v5.4s, #25 - orr v3.16b, v4.16b, v3.16b - eor v8.16b, v16.16b, v3.16b - eor v9.16b, v25.16b, v0.16b - eor v31.16b, v1.16b, v19.16b - cmp x17, x22 - eor v15.16b, v2.16b, v21.16b - mov w18, w19 - b.ne .LBB2_4 -.LBB2_7: - zip1 v0.4s, v20.4s, v26.4s - zip2 v1.4s, v20.4s, v26.4s - zip1 v2.4s, v17.4s, v6.4s - zip2 v3.4s, v17.4s, v6.4s - zip1 v4.4s, v8.4s, v9.4s - zip2 v5.4s, v8.4s, v9.4s - zip1 v6.4s, v31.4s, v15.4s - zip2 v7.4s, v31.4s, v15.4s - add x13, x20, #4 - tst w5, #0x1 - sub x28, x28, #4 - zip1 v16.2d, v0.2d, v2.2d - zip2 v0.2d, v0.2d, v2.2d - zip1 v2.2d, v1.2d, v3.2d - zip2 v1.2d, v1.2d, v3.2d - zip1 v3.2d, v4.2d, v6.2d - zip2 v4.2d, v4.2d, v6.2d - zip1 v6.2d, v5.2d, v7.2d - zip2 v5.2d, v5.2d, v7.2d - add x24, x24, #32 - csel x20, x13, x20, ne - cmp x28, #3 - stp q16, q3, [x26] - stp q0, q4, [x26, #32] - stp q2, q6, [x26, #64] - stp q1, q5, [x26, #96] - add x26, x26, #128 - b.hi .LBB2_2 -.LBB2_8: - cbz x28, .LBB2_16 + ushr v27.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + orr v0.16b, v0.16b, v24.16b + orr v1.16b, v1.16b, v25.16b + orr v2.16b, v2.16b, v26.16b + orr v23.16b, v23.16b, v27.16b + movi v24.4s, #64 + eor v12.16b, v4.16b, v20.16b + eor v31.16b, v18.16b, v3.16b + eor v29.16b, v17.16b, v22.16b + eor v30.16b, v16.16b, v19.16b + eor v28.16b, v7.16b, v23.16b + eor v23.16b, v6.16b, v0.16b + eor v13.16b, v1.16b, v5.16b + eor v25.16b, v2.16b, v21.16b + cbnz x15, .LBB3_5 + b .LBB3_2 +.LBB3_6: + cbz x24, .LBB3_14 orr w8, w7, w19 - and x21, x5, #0x1 - stur w8, [x29, #-64] -.LBB2_10: + and x22, x5, #0x1 + stur w8, [x29, #-192] +.LBB3_8: ldr x8, [sp, #40] - ldr x25, [x24] - ldur w4, [x29, #-64] - ldp q1, q0, [x8] - mov x8, x22 - stp q1, q0, [x29, #-48] -.LBB2_11: - subs x23, x8, #1 - b.eq .LBB2_13 - cbnz x8, .LBB2_14 - b .LBB2_15 -.LBB2_13: - orr w4, w4, w27 -.LBB2_14: - sub x0, x29, #48 - mov w2, #64 - mov x1, x25 - mov x3, x20 - bl zfs_blake3_compress_in_place_sse2 + mov x28, x0 + ldr x25, [x0] + mov x23, x2 + ldur w5, [x29, #-192] + ldp q0, q1, [x8] + mov x8, x2 + b .LBB3_11 +.LBB3_9: + orr w5, w5, w27 +.LBB3_10: + sub x0, x29, #144 + sub x1, x29, #176 + mov x2, x25 + mov w3, #64 + mov x4, x20 + bl compress_pre + ldp q0, q1, [x29, #-144] add x25, x25, #64 - mov x8, x23 - mov w4, w19 - b .LBB2_11 -.LBB2_15: - ldp q0, q1, [x29, #-48] - add x20, x20, x21 - add x24, x24, #8 - subs x28, x28, #1 - stp q0, q1, [x26], #32 - b.ne .LBB2_10 -.LBB2_16: - add sp, sp, #384 + mov x8, x21 + mov w5, w19 + ldp q2, q3, [x29, #-112] + eor v0.16b, v2.16b, v0.16b + eor v1.16b, v3.16b, v1.16b +.LBB3_11: + subs x21, x8, #1 + stp q0, q1, [x29, #-176] + b.eq .LBB3_9 + cbnz x8, .LBB3_10 + ldp q1, q0, [x29, #-176] + mov x0, x28 + add x20, x20, x22 + add x0, x28, #8 + subs x24, x24, #1 + mov x2, x23 + stp q1, q0, [x26], #32 + b.ne .LBB3_8 +.LBB3_14: + add sp, sp, #464 ldp x20, x19, [sp, #144] ldp x22, x21, [sp, #128] ldp x24, x23, [sp, #112] @@ -2442,9 +2052,10 @@ zfs_blake3_hash_many_sse2: ldp d11, d10, [sp, #32] ldp d13, d12, [sp, #16] ldp d15, d14, [sp], #160 + hint #29 ret -.Lfunc_end2: - .size 
zfs_blake3_hash_many_sse2, .Lfunc_end2-zfs_blake3_hash_many_sse2 +.Lfunc_end3: + .size zfs_blake3_hash_many_sse2, .Lfunc_end3-zfs_blake3_hash_many_sse2 .cfi_endproc .section ".note.GNU-stack","",@progbits -#endif +#endif \ No newline at end of file diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S index a05baec96ce5..c4c2dfc5bcde 100644 --- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S @@ -22,17 +22,72 @@ /* * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 * Copyright (c) 2019-2022 Samuel Neves - * Copyright (c) 2022 Tino Reichardt + * Copyright (c) 2022-2023 Tino Reichardt * * This is converted assembly: SSE4.1 -> ARMv8-A * Used tools: SIMDe https://github.com/simd-everywhere/simde + * + * Should work on FreeBSD, Linux and macOS + * see: https://github.com/mcmilk/BLAKE3-tests/blob/master/contrib/simde.sh */ #if defined(__aarch64__) .text + .section .note.gnu.property,"a",@note + .p2align 3 + .word 4 + .word 16 + .word 5 + .asciz "GNU" + .word 3221225472 + .word 4 + .word 3 + .word 0 +.Lsec_end0: + .text + .globl zfs_blake3_compress_in_place_sse41 + .p2align 2 + .type zfs_blake3_compress_in_place_sse41,@function +zfs_blake3_compress_in_place_sse41: + .cfi_startproc + hint #25 + .cfi_negate_ra_state + sub sp, sp, #96 + stp x29, x30, [sp, #64] + add x29, sp, #64 + str x19, [sp, #80] + .cfi_def_cfa w29, 32 + .cfi_offset w19, -16 + .cfi_offset w30, -24 + .cfi_offset w29, -32 + mov x19, x0 + mov w5, w4 + mov x4, x3 + mov w3, w2 + mov x2, x1 + mov x0, sp + mov x1, x19 + bl compress_pre + ldp q0, q1, [sp] + ldp q2, q3, [sp, #32] + eor v0.16b, v2.16b, v0.16b + eor v1.16b, v3.16b, v1.16b + ldp x29, x30, [sp, #64] + stp q0, q1, [x19] + ldr x19, [sp, #80] + add sp, sp, #96 + hint #29 + ret +.Lfunc_end0: + .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41 + .cfi_endproc + .section .rodata.cst16,"aM",@progbits,16 .p2align 4 -.LCPI0_0: +.LCPI1_0: + .xword -4942790177982912921 + .xword -6534734903820487822 +.LCPI1_1: .byte 2 .byte 3 .byte 0 @@ -49,12 +104,7 @@ .byte 15 .byte 12 .byte 13 -.LCPI0_1: - .word 1779033703 - .word 3144134277 - .word 1013904242 - .word 2773480762 -.LCPI0_2: +.LCPI1_2: .byte 1 .byte 2 .byte 3 @@ -71,477 +121,497 @@ .byte 14 .byte 15 .byte 12 -.LCPI0_3: - .byte 0 - .byte 1 - .byte 2 - .byte 3 - .byte 20 - .byte 21 - .byte 22 - .byte 23 - .byte 8 - .byte 9 - .byte 10 - .byte 11 - .byte 28 - .byte 29 - .byte 30 - .byte 31 -.LCPI0_4: - .byte 0 - .byte 1 - .byte 2 - .byte 3 - .byte 4 - .byte 5 - .byte 6 - .byte 7 - .byte 8 - .byte 9 - .byte 10 - .byte 11 - .byte 28 - .byte 29 - .byte 30 - .byte 31 .text - .globl zfs_blake3_compress_in_place_sse41 .p2align 2 - .type zfs_blake3_compress_in_place_sse41,@function -zfs_blake3_compress_in_place_sse41: + .type compress_pre,@function +compress_pre: .cfi_startproc - ldp q7, q6, [x0] - ldp q17, q18, [x1] - add x12, x1, #32 - ld2 { v4.4s, v5.4s }, [x12] - lsr x10, x3, #32 - fmov s16, w3 - adrp x13, .LCPI0_0 - adrp x11, .LCPI0_1 - and w8, w2, #0xff - mov v16.s[1], w10 - ldr q0, [x13, :lo12:.LCPI0_0] - ldr q20, [x11, :lo12:.LCPI0_1] - adrp x11, .LCPI0_4 - and w9, w4, #0xff - ldr q2, [x11, :lo12:.LCPI0_4] - mov v16.s[2], w8 - uzp1 v21.4s, v17.4s, v18.4s - add v7.4s, v6.4s, v7.4s - adrp x12, .LCPI0_3 - mov v16.s[3], w9 - uzp2 v18.4s, v17.4s, v18.4s - add v7.4s, v7.4s, v21.4s - ext v17.16b, v5.16b, v5.16b, #12 - ldr q3, [x12, :lo12:.LCPI0_3] - ext v24.16b, v4.16b, v4.16b, #12 
- eor v16.16b, v7.16b, v16.16b - mov v27.16b, v17.16b - uzp1 v19.4s, v21.4s, v21.4s - ext v25.16b, v21.16b, v21.16b, #12 - zip2 v28.4s, v18.4s, v17.4s - tbl v29.16b, { v16.16b }, v0.16b - mov v27.s[1], v24.s[2] - zip1 v23.2d, v17.2d, v18.2d - ext v19.16b, v19.16b, v21.16b, #8 - add v22.4s, v29.4s, v20.4s - ext v26.16b, v21.16b, v25.16b, #12 - tbl v20.16b, { v23.16b, v24.16b }, v2.16b - zip1 v21.4s, v28.4s, v24.4s - zip1 v23.4s, v24.4s, v28.4s - uzp2 v19.4s, v19.4s, v18.4s - eor v24.16b, v22.16b, v6.16b - ext v25.16b, v20.16b, v20.16b, #12 - ext v6.16b, v23.16b, v21.16b, #8 - add v7.4s, v7.4s, v18.4s - ext v18.16b, v19.16b, v19.16b, #4 - tbl v16.16b, { v26.16b, v27.16b }, v3.16b - uzp1 v21.4s, v20.4s, v25.4s - mov v26.16b, v6.16b - ext v23.16b, v18.16b, v18.16b, #12 - mov v26.s[1], v21.s[2] - adrp x10, .LCPI0_2 - ext v25.16b, v18.16b, v23.16b, #12 - uzp1 v23.4s, v18.4s, v18.4s - ldr q1, [x10, :lo12:.LCPI0_2] - ext v18.16b, v23.16b, v18.16b, #8 - ushr v23.4s, v24.4s, #12 - shl v24.4s, v24.4s, #20 - orr v23.16b, v24.16b, v23.16b - add v7.4s, v7.4s, v23.4s - eor v27.16b, v29.16b, v7.16b - add v4.4s, v7.4s, v4.4s - tbl v7.16b, { v25.16b, v26.16b }, v3.16b - tbl v26.16b, { v27.16b }, v1.16b - add v22.4s, v22.4s, v26.4s - uzp2 v18.4s, v18.4s, v16.4s - eor v23.16b, v23.16b, v22.16b - ext v5.16b, v18.16b, v18.16b, #4 - ushr v27.4s, v23.4s, #7 - shl v23.4s, v23.4s, #25 - uzp1 v25.4s, v5.4s, v5.4s - orr v23.16b, v23.16b, v27.16b - ext v28.16b, v4.16b, v4.16b, #12 - ext v4.16b, v25.16b, v5.16b, #8 - ext v25.16b, v26.16b, v26.16b, #8 - add v26.4s, v28.4s, v23.4s - eor v25.16b, v26.16b, v25.16b - ext v22.16b, v22.16b, v22.16b, #4 - tbl v25.16b, { v25.16b }, v0.16b - add v22.4s, v22.4s, v25.4s - eor v23.16b, v23.16b, v22.16b - add v17.4s, v26.4s, v17.4s - ushr v26.4s, v23.4s, #12 - shl v23.4s, v23.4s, #20 - orr v23.16b, v23.16b, v26.16b - add v17.4s, v17.4s, v23.4s - eor v25.16b, v25.16b, v17.16b - add v17.4s, v17.4s, v19.4s - tbl v19.16b, { v25.16b }, v1.16b - add v22.4s, v22.4s, v19.4s - eor v23.16b, v23.16b, v22.16b - ushr v25.4s, v23.4s, #7 - shl v23.4s, v23.4s, #25 - ext v17.16b, v17.16b, v17.16b, #4 - orr v23.16b, v23.16b, v25.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v17.4s, v17.4s, v23.4s - eor v19.16b, v17.16b, v19.16b - ext v22.16b, v22.16b, v22.16b, #12 - tbl v19.16b, { v19.16b }, v0.16b - add v22.4s, v22.4s, v19.4s - eor v23.16b, v23.16b, v22.16b - ushr v25.4s, v23.4s, #12 - shl v23.4s, v23.4s, #20 - add v17.4s, v17.4s, v16.4s - orr v23.16b, v23.16b, v25.16b - add v17.4s, v17.4s, v23.4s - ext v25.16b, v17.16b, v17.16b, #12 - eor v17.16b, v19.16b, v17.16b - tbl v17.16b, { v17.16b }, v1.16b - add v19.4s, v22.4s, v17.4s - eor v22.16b, v23.16b, v19.16b - add v25.4s, v25.4s, v21.4s - zip1 v20.2d, v6.2d, v16.2d - ushr v23.4s, v22.4s, #7 - shl v22.4s, v22.4s, #25 - zip2 v24.4s, v16.4s, v6.4s - tbl v26.16b, { v20.16b, v21.16b }, v2.16b - orr v22.16b, v22.16b, v23.16b - zip1 v16.4s, v24.4s, v21.4s - zip1 v20.4s, v21.4s, v24.4s - ext v21.16b, v26.16b, v26.16b, #12 - ext v17.16b, v17.16b, v17.16b, #8 - add v25.4s, v25.4s, v22.4s - ext v16.16b, v20.16b, v16.16b, #8 - uzp1 v21.4s, v26.4s, v21.4s - eor v26.16b, v25.16b, v17.16b - ext v19.16b, v19.16b, v19.16b, #4 - tbl v26.16b, { v26.16b }, v0.16b - mov v29.16b, v16.16b - add v19.4s, v19.4s, v26.4s - ext v27.16b, v5.16b, v5.16b, #12 - mov v29.s[1], v21.s[2] - eor v22.16b, v22.16b, v19.16b - ext v28.16b, v5.16b, v27.16b, #12 - ushr v27.4s, v22.4s, #12 - shl v22.4s, v22.4s, #20 - add v6.4s, v25.4s, v6.4s - orr v22.16b, v22.16b, v27.16b - add v6.4s, 
v6.4s, v22.4s - eor v26.16b, v26.16b, v6.16b - add v6.4s, v6.4s, v18.4s - tbl v18.16b, { v26.16b }, v1.16b - add v19.4s, v19.4s, v18.4s - eor v22.16b, v22.16b, v19.16b - ushr v26.4s, v22.4s, #7 - shl v22.4s, v22.4s, #25 - ext v6.16b, v6.16b, v6.16b, #4 - orr v22.16b, v22.16b, v26.16b - ext v18.16b, v18.16b, v18.16b, #8 - add v6.4s, v6.4s, v22.4s - eor v18.16b, v6.16b, v18.16b - ext v19.16b, v19.16b, v19.16b, #12 - tbl v18.16b, { v18.16b }, v0.16b - add v19.4s, v19.4s, v18.4s - eor v22.16b, v22.16b, v19.16b - ushr v26.4s, v22.4s, #12 - shl v22.4s, v22.4s, #20 - add v6.4s, v6.4s, v7.4s - orr v22.16b, v22.16b, v26.16b - add v6.4s, v6.4s, v22.4s - ext v26.16b, v6.16b, v6.16b, #12 - eor v6.16b, v18.16b, v6.16b - uzp2 v4.4s, v4.4s, v7.4s - zip2 v25.4s, v7.4s, v16.4s - add v26.4s, v26.4s, v21.4s - zip1 v20.2d, v16.2d, v7.2d - tbl v6.16b, { v6.16b }, v1.16b - ext v24.16b, v4.16b, v4.16b, #4 - tbl v27.16b, { v20.16b, v21.16b }, v2.16b - zip1 v7.4s, v25.4s, v21.4s - zip1 v20.4s, v21.4s, v25.4s - add v18.4s, v19.4s, v6.4s - uzp1 v5.4s, v24.4s, v24.4s - ext v21.16b, v27.16b, v27.16b, #12 - ext v7.16b, v20.16b, v7.16b, #8 - eor v19.16b, v22.16b, v18.16b - ext v5.16b, v5.16b, v24.16b, #8 - tbl v17.16b, { v28.16b, v29.16b }, v3.16b - uzp1 v21.4s, v27.4s, v21.4s - mov v28.16b, v7.16b - ushr v22.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - ext v23.16b, v24.16b, v24.16b, #12 - uzp2 v5.4s, v5.4s, v17.4s - mov v28.s[1], v21.s[2] - orr v19.16b, v19.16b, v22.16b - ext v27.16b, v24.16b, v23.16b, #12 - ext v23.16b, v5.16b, v5.16b, #4 - ext v6.16b, v6.16b, v6.16b, #8 - ext v25.16b, v18.16b, v18.16b, #4 - add v18.4s, v26.4s, v19.4s - uzp1 v24.4s, v23.4s, v23.4s - eor v6.16b, v18.16b, v6.16b - ext v24.16b, v24.16b, v23.16b, #8 - add v16.4s, v18.4s, v16.4s - tbl v18.16b, { v27.16b, v28.16b }, v3.16b - tbl v27.16b, { v6.16b }, v0.16b - uzp2 v6.4s, v24.4s, v18.4s - add v24.4s, v25.4s, v27.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - orr v19.16b, v19.16b, v25.16b - add v16.4s, v16.4s, v19.4s - eor v25.16b, v27.16b, v16.16b - add v4.4s, v16.4s, v4.4s - tbl v16.16b, { v25.16b }, v1.16b - add v24.4s, v24.4s, v16.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - ext v4.16b, v4.16b, v4.16b, #4 - orr v19.16b, v19.16b, v25.16b - ext v16.16b, v16.16b, v16.16b, #8 - add v4.4s, v4.4s, v19.4s - eor v16.16b, v4.16b, v16.16b - ext v24.16b, v24.16b, v24.16b, #12 - tbl v25.16b, { v16.16b }, v0.16b - add v24.4s, v24.4s, v25.4s - eor v16.16b, v19.16b, v24.16b - ushr v19.4s, v16.4s, #12 - shl v16.4s, v16.4s, #20 - add v4.4s, v4.4s, v17.4s - orr v19.16b, v16.16b, v19.16b - add v27.4s, v4.4s, v19.4s - eor v25.16b, v25.16b, v27.16b - tbl v25.16b, { v25.16b }, v1.16b - add v24.4s, v24.4s, v25.4s - zip2 v26.4s, v17.4s, v7.4s - ext v4.16b, v27.16b, v27.16b, #12 - eor v19.16b, v19.16b, v24.16b - add v28.4s, v4.4s, v21.4s - zip1 v20.2d, v7.2d, v17.2d - zip1 v4.4s, v26.4s, v21.4s - zip1 v17.4s, v21.4s, v26.4s - ushr v26.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - orr v19.16b, v19.16b, v26.16b - ext v25.16b, v25.16b, v25.16b, #8 - add v27.4s, v28.4s, v19.4s - eor v25.16b, v27.16b, v25.16b - ext v24.16b, v24.16b, v24.16b, #4 - tbl v25.16b, { v25.16b }, v0.16b - add v24.4s, v24.4s, v25.4s - eor v19.16b, v19.16b, v24.16b - add v7.4s, v27.4s, v7.4s - ushr v27.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - orr v19.16b, v19.16b, v27.16b - add v7.4s, v7.4s, v19.4s - eor v25.16b, v25.16b, v7.16b - add v5.4s, v7.4s, v5.4s - tbl v7.16b, { v25.16b }, v1.16b - add v24.4s, v24.4s, 
v7.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - ext v5.16b, v5.16b, v5.16b, #4 - orr v19.16b, v19.16b, v25.16b + hint #34 + fmov s1, w3 + movi d0, #0x0000ff000000ff + ldr q2, [x1] + adrp x8, .LCPI1_0 + mov v1.s[1], w5 + str q2, [x0] + ldr q4, [x8, :lo12:.LCPI1_0] + ldr q5, [x1, #16] + adrp x8, .LCPI1_1 + and v0.8b, v1.8b, v0.8b + fmov d1, x4 + stp q5, q4, [x0, #16] + mov v1.d[1], v0.d[0] + str q1, [x0, #48] + ldp q6, q7, [x2] + uzp1 v3.4s, v6.4s, v7.4s + add v0.4s, v2.4s, v3.4s + uzp2 v2.4s, v6.4s, v7.4s + add v16.4s, v0.4s, v5.4s + ldr q0, [x8, :lo12:.LCPI1_1] + adrp x8, .LCPI1_2 + eor v1.16b, v16.16b, v1.16b + add v7.4s, v16.4s, v2.4s + tbl v1.16b, { v1.16b }, v0.16b + add v4.4s, v1.4s, v4.4s + eor v5.16b, v4.16b, v5.16b + ushr v6.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v6.16b + add v6.4s, v7.4s, v5.4s + eor v7.16b, v1.16b, v6.16b + ldr q1, [x8, :lo12:.LCPI1_2] + add x8, x2, #32 + tbl v7.16b, { v7.16b }, v1.16b + ld2 { v16.4s, v17.4s }, [x8] + add v4.4s, v4.4s, v7.4s ext v7.16b, v7.16b, v7.16b, #8 - add v5.4s, v5.4s, v19.4s - eor v7.16b, v5.16b, v7.16b - ext v24.16b, v24.16b, v24.16b, #12 - tbl v7.16b, { v7.16b }, v0.16b - add v24.4s, v24.4s, v7.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - tbl v16.16b, { v20.16b, v21.16b }, v2.16b - add v5.4s, v5.4s, v18.4s - orr v19.16b, v19.16b, v25.16b - ext v20.16b, v16.16b, v16.16b, #12 - ext v4.16b, v17.16b, v4.16b, #8 - add v5.4s, v5.4s, v19.4s - uzp1 v21.4s, v16.4s, v20.4s - mov v17.16b, v4.16b - ext v25.16b, v5.16b, v5.16b, #12 - mov v17.s[1], v21.s[2] - add v25.4s, v25.4s, v21.4s - zip1 v20.2d, v4.2d, v18.2d - ext v22.16b, v23.16b, v23.16b, #12 - zip2 v26.4s, v18.4s, v4.4s - tbl v18.16b, { v20.16b, v21.16b }, v2.16b - eor v5.16b, v7.16b, v5.16b - ext v16.16b, v23.16b, v22.16b, #12 - ext v22.16b, v6.16b, v6.16b, #4 - zip1 v27.4s, v26.4s, v21.4s - zip1 v20.4s, v21.4s, v26.4s - ext v21.16b, v18.16b, v18.16b, #12 - tbl v5.16b, { v5.16b }, v1.16b - ext v20.16b, v20.16b, v27.16b, #8 - uzp1 v27.4s, v18.4s, v21.4s - uzp1 v18.4s, v22.4s, v22.4s - add v21.4s, v24.4s, v5.4s - ext v18.16b, v18.16b, v22.16b, #8 - eor v19.16b, v19.16b, v21.16b - tbl v7.16b, { v16.16b, v17.16b }, v3.16b - uzp2 v18.4s, v18.4s, v17.4s - zip2 v16.4s, v16.4s, v20.4s - ushr v17.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - orr v17.16b, v19.16b, v17.16b - ext v5.16b, v5.16b, v5.16b, #8 - add v19.4s, v25.4s, v17.4s - eor v5.16b, v19.16b, v5.16b - ext v21.16b, v21.16b, v21.16b, #4 - tbl v5.16b, { v5.16b }, v0.16b - add v4.4s, v19.4s, v4.4s - add v19.4s, v21.4s, v5.4s - eor v17.16b, v17.16b, v19.16b - ushr v21.4s, v17.4s, #12 - shl v17.4s, v17.4s, #20 - orr v17.16b, v17.16b, v21.16b - add v4.4s, v4.4s, v17.4s - eor v5.16b, v5.16b, v4.16b - tbl v5.16b, { v5.16b }, v1.16b - add v4.4s, v4.4s, v6.4s - add v6.4s, v19.4s, v5.4s - eor v17.16b, v17.16b, v6.16b - ushr v19.4s, v17.4s, #7 - shl v17.4s, v17.4s, #25 - ext v4.16b, v4.16b, v4.16b, #4 - orr v17.16b, v17.16b, v19.16b - ext v5.16b, v5.16b, v5.16b, #8 - add v4.4s, v4.4s, v17.4s + add v6.4s, v6.4s, v16.4s eor v5.16b, v4.16b, v5.16b + ext v4.16b, v4.16b, v4.16b, #4 + ext v16.16b, v16.16b, v16.16b, #12 ext v6.16b, v6.16b, v6.16b, #12 - tbl v5.16b, { v5.16b }, v0.16b + ushr v18.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v18.16b + ext v18.16b, v17.16b, v17.16b, #12 add v6.4s, v6.4s, v5.4s - eor v17.16b, v17.16b, v6.16b - ushr v19.4s, v17.4s, #12 - shl v17.4s, v17.4s, #20 + mov v17.16b, v18.16b + eor v7.16b, v7.16b, 
v6.16b + add v6.4s, v6.4s, v18.4s + mov v17.s[1], v16.s[2] + tbl v7.16b, { v7.16b }, v0.16b add v4.4s, v4.4s, v7.4s - orr v17.16b, v17.16b, v19.16b - add v4.4s, v4.4s, v17.4s - eor v5.16b, v5.16b, v4.16b - tbl v5.16b, { v5.16b }, v1.16b - mov v29.16b, v20.16b + eor v5.16b, v4.16b, v5.16b + ushr v19.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v19.16b + uzp1 v19.4s, v3.4s, v3.4s + add v6.4s, v6.4s, v5.4s + ext v19.16b, v19.16b, v3.16b, #8 + eor v7.16b, v7.16b, v6.16b + uzp2 v19.4s, v19.4s, v2.4s + tbl v7.16b, { v7.16b }, v1.16b + add v6.4s, v6.4s, v19.4s + add v4.4s, v4.4s, v7.4s + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #8 + eor v5.16b, v4.16b, v5.16b ext v4.16b, v4.16b, v4.16b, #12 + ushr v20.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v20.16b + ext v20.16b, v3.16b, v3.16b, #12 add v6.4s, v6.4s, v5.4s - mov v29.s[1], v27.s[2] - add v4.4s, v4.4s, v27.4s - zip1 v26.2d, v20.2d, v7.2d - zip1 v7.4s, v16.4s, v27.4s - zip1 v16.4s, v27.4s, v16.4s - eor v17.16b, v17.16b, v6.16b - ext v7.16b, v16.16b, v7.16b, #8 - ushr v16.4s, v17.4s, #7 - shl v17.4s, v17.4s, #25 - orr v16.16b, v17.16b, v16.16b - ext v5.16b, v5.16b, v5.16b, #8 - add v4.4s, v4.4s, v16.4s + ext v3.16b, v3.16b, v20.16b, #12 + eor v7.16b, v7.16b, v6.16b + rev64 v3.4s, v3.4s + tbl v7.16b, { v7.16b }, v0.16b + trn2 v3.4s, v3.4s, v17.4s + add v4.4s, v4.4s, v7.4s + add v6.4s, v6.4s, v3.4s eor v5.16b, v4.16b, v5.16b - ext v6.16b, v6.16b, v6.16b, #4 - tbl v5.16b, { v5.16b }, v0.16b + ushr v17.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v17.16b + zip1 v17.2d, v18.2d, v2.2d + zip2 v2.4s, v2.4s, v18.4s add v6.4s, v6.4s, v5.4s - eor v16.16b, v16.16b, v6.16b - ushr v17.4s, v16.4s, #12 - shl v16.4s, v16.4s, #20 - add v4.4s, v4.4s, v20.4s - orr v16.16b, v16.16b, v17.16b - add v4.4s, v4.4s, v16.4s - eor v5.16b, v5.16b, v4.16b - tbl v5.16b, { v5.16b }, v1.16b + mov v17.s[3], v16.s[3] + zip1 v18.4s, v2.4s, v16.4s + zip1 v2.4s, v16.4s, v2.4s + eor v7.16b, v7.16b, v6.16b + ext v6.16b, v6.16b, v6.16b, #12 + ext v16.16b, v2.16b, v18.16b, #8 + tbl v7.16b, { v7.16b }, v1.16b + add v20.4s, v4.4s, v7.4s + ext v4.16b, v17.16b, v17.16b, #12 + ext v7.16b, v7.16b, v7.16b, #8 + eor v5.16b, v20.16b, v5.16b + uzp1 v4.4s, v17.4s, v4.4s + ushr v17.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v6.4s, v6.4s, v4.4s + orr v5.16b, v5.16b, v17.16b + ext v17.16b, v20.16b, v20.16b, #4 add v6.4s, v6.4s, v5.4s - eor v16.16b, v16.16b, v6.16b + eor v7.16b, v7.16b, v6.16b + add v6.4s, v6.4s, v16.4s + tbl v7.16b, { v7.16b }, v0.16b + add v17.4s, v17.4s, v7.4s + eor v5.16b, v17.16b, v5.16b + ushr v2.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v2.16b, v5.16b, v2.16b + add v5.4s, v6.4s, v2.4s + ext v6.16b, v19.16b, v19.16b, #4 + eor v7.16b, v7.16b, v5.16b + uzp1 v18.4s, v6.4s, v6.4s + tbl v7.16b, { v7.16b }, v1.16b + ext v18.16b, v18.16b, v6.16b, #8 + add v17.4s, v17.4s, v7.4s + uzp2 v18.4s, v18.4s, v3.4s + ext v7.16b, v7.16b, v7.16b, #8 + eor v2.16b, v17.16b, v2.16b + add v5.4s, v5.4s, v18.4s + ext v17.16b, v17.16b, v17.16b, #12 + ushr v19.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v2.16b, v2.16b, v19.16b + ext v19.16b, v6.16b, v6.16b, #12 + add v5.4s, v5.4s, v2.4s + ext v6.16b, v6.16b, v19.16b, #12 + mov v19.16b, v16.16b + eor v7.16b, v7.16b, v5.16b + rev64 v6.4s, v6.4s + mov v19.s[1], v4.s[2] + tbl v7.16b, { v7.16b }, v0.16b + add v17.4s, v17.4s, v7.4s + eor v20.16b, v17.16b, v2.16b + trn2 v2.4s, v6.4s, v19.4s + ushr v6.4s, v20.4s, #12 + shl v19.4s, v20.4s, #20 + add v5.4s, v5.4s, 
v2.4s + orr v6.16b, v19.16b, v6.16b + add v19.4s, v5.4s, v6.4s + eor v5.16b, v7.16b, v19.16b + zip1 v7.2d, v16.2d, v3.2d + zip2 v3.4s, v3.4s, v16.4s + tbl v20.16b, { v5.16b }, v1.16b + mov v7.s[3], v4.s[3] + add v17.4s, v17.4s, v20.4s + ext v5.16b, v7.16b, v7.16b, #12 + eor v6.16b, v17.16b, v6.16b + uzp1 v5.4s, v7.4s, v5.4s + ext v7.16b, v19.16b, v19.16b, #12 + ext v17.16b, v17.16b, v17.16b, #4 + ushr v19.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + add v7.4s, v7.4s, v5.4s + orr v6.16b, v6.16b, v19.16b + ext v19.16b, v20.16b, v20.16b, #8 + add v7.4s, v7.4s, v6.4s + eor v19.16b, v19.16b, v7.16b + tbl v19.16b, { v19.16b }, v0.16b + add v16.4s, v17.4s, v19.4s + zip1 v17.4s, v3.4s, v4.4s + zip1 v3.4s, v4.4s, v3.4s + eor v4.16b, v16.16b, v6.16b + ext v17.16b, v3.16b, v17.16b, #8 + ushr v3.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + add v6.4s, v7.4s, v17.4s + orr v3.16b, v4.16b, v3.16b + add v4.4s, v6.4s, v3.4s + ext v6.16b, v18.16b, v18.16b, #4 + eor v7.16b, v19.16b, v4.16b + uzp1 v18.4s, v6.4s, v6.4s + tbl v7.16b, { v7.16b }, v1.16b + ext v18.16b, v18.16b, v6.16b, #8 + add v16.4s, v16.4s, v7.4s + uzp2 v18.4s, v18.4s, v2.4s + ext v7.16b, v7.16b, v7.16b, #8 + eor v3.16b, v16.16b, v3.16b add v4.4s, v4.4s, v18.4s - ushr v17.4s, v16.4s, #7 - shl v16.4s, v16.4s, #25 - ext v23.16b, v22.16b, v22.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ushr v19.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 ext v4.16b, v4.16b, v4.16b, #4 - orr v16.16b, v16.16b, v17.16b - ext v28.16b, v22.16b, v23.16b, #12 - ext v5.16b, v5.16b, v5.16b, #8 - add v4.4s, v16.4s, v4.4s - tbl v3.16b, { v28.16b, v29.16b }, v3.16b - eor v5.16b, v4.16b, v5.16b - ext v6.16b, v6.16b, v6.16b, #12 - add v3.4s, v4.4s, v3.4s - tbl v4.16b, { v5.16b }, v0.16b - add v5.4s, v6.4s, v4.4s - eor v6.16b, v16.16b, v5.16b - ushr v16.4s, v6.4s, #12 + orr v3.16b, v3.16b, v19.16b + ext v19.16b, v6.16b, v6.16b, #12 + add v4.4s, v4.4s, v3.4s + ext v6.16b, v6.16b, v19.16b, #12 + mov v19.16b, v17.16b + eor v7.16b, v7.16b, v4.16b + rev64 v6.4s, v6.4s + mov v19.s[1], v5.s[2] + tbl v7.16b, { v7.16b }, v0.16b + add v16.4s, v16.4s, v7.4s + eor v20.16b, v16.16b, v3.16b + trn2 v3.4s, v6.4s, v19.4s + ushr v6.4s, v20.4s, #12 + shl v19.4s, v20.4s, #20 + add v4.4s, v4.4s, v3.4s + orr v6.16b, v19.16b, v6.16b + zip1 v19.2d, v17.2d, v2.2d + zip2 v2.4s, v2.4s, v17.4s + add v4.4s, v4.4s, v6.4s + mov v19.s[3], v5.s[3] + zip1 v17.4s, v2.4s, v5.4s + zip1 v2.4s, v5.4s, v2.4s + eor v7.16b, v7.16b, v4.16b + ext v20.16b, v19.16b, v19.16b, #12 + ext v4.16b, v4.16b, v4.16b, #12 + ext v2.16b, v2.16b, v17.16b, #8 + tbl v7.16b, { v7.16b }, v1.16b + add v16.4s, v16.4s, v7.4s + ext v7.16b, v7.16b, v7.16b, #8 + eor v21.16b, v16.16b, v6.16b + uzp1 v6.4s, v19.4s, v20.4s + ext v16.16b, v16.16b, v16.16b, #4 + ushr v19.4s, v21.4s, #7 + shl v20.4s, v21.4s, #25 + add v4.4s, v4.4s, v6.4s + orr v19.16b, v20.16b, v19.16b + add v4.4s, v4.4s, v19.4s + eor v7.16b, v7.16b, v4.16b + add v4.4s, v4.4s, v2.4s + tbl v7.16b, { v7.16b }, v0.16b + add v16.4s, v16.4s, v7.4s + eor v5.16b, v16.16b, v19.16b + ushr v17.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v17.16b + ext v17.16b, v18.16b, v18.16b, #4 + add v4.4s, v4.4s, v5.4s + uzp1 v18.4s, v17.4s, v17.4s + eor v7.16b, v7.16b, v4.16b + ext v18.16b, v18.16b, v17.16b, #8 + tbl v7.16b, { v7.16b }, v1.16b + uzp2 v18.4s, v18.4s, v3.4s + add v16.4s, v16.4s, v7.4s + add v4.4s, v4.4s, v18.4s + ext v7.16b, v7.16b, v7.16b, #8 + eor v5.16b, v16.16b, v5.16b + ext v4.16b, v4.16b, v4.16b, #4 + ext v16.16b, v16.16b, v16.16b, #12 + ushr v19.4s, v5.4s, #7 + shl v5.4s, v5.4s, 
#25 + orr v5.16b, v5.16b, v19.16b + add v19.4s, v4.4s, v5.4s + eor v4.16b, v7.16b, v19.16b + ext v7.16b, v17.16b, v17.16b, #12 + tbl v20.16b, { v4.16b }, v0.16b + ext v4.16b, v17.16b, v7.16b, #12 + mov v7.16b, v2.16b + add v16.4s, v16.4s, v20.4s + rev64 v4.4s, v4.4s + mov v7.s[1], v6.s[2] + eor v5.16b, v16.16b, v5.16b + trn2 v4.4s, v4.4s, v7.4s + ushr v7.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v17.4s, v19.4s, v4.4s + zip1 v19.2d, v2.2d, v3.2d + zip2 v2.4s, v3.4s, v2.4s + orr v5.16b, v5.16b, v7.16b + mov v19.s[3], v6.s[3] + add v7.4s, v17.4s, v5.4s + eor v17.16b, v20.16b, v7.16b + ext v20.16b, v19.16b, v19.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + tbl v17.16b, { v17.16b }, v1.16b + add v16.4s, v16.4s, v17.4s + ext v17.16b, v17.16b, v17.16b, #8 + eor v21.16b, v16.16b, v5.16b + uzp1 v5.4s, v19.4s, v20.4s + ext v16.16b, v16.16b, v16.16b, #4 + ushr v19.4s, v21.4s, #7 + shl v20.4s, v21.4s, #25 + add v7.4s, v7.4s, v5.4s + orr v19.16b, v20.16b, v19.16b + add v7.4s, v7.4s, v19.4s + eor v17.16b, v17.16b, v7.16b + tbl v17.16b, { v17.16b }, v0.16b + add v3.4s, v16.4s, v17.4s + zip1 v16.4s, v2.4s, v6.4s + zip1 v2.4s, v6.4s, v2.4s + eor v6.16b, v3.16b, v19.16b + ext v16.16b, v2.16b, v16.16b, #8 + ushr v2.4s, v6.4s, #12 shl v6.4s, v6.4s, #20 - orr v6.16b, v6.16b, v16.16b - tbl v2.16b, { v26.16b, v27.16b }, v2.16b - add v3.4s, v3.4s, v6.4s - ext v19.16b, v2.16b, v2.16b, #12 - eor v4.16b, v4.16b, v3.16b - uzp1 v2.4s, v2.4s, v19.4s + add v7.4s, v7.4s, v16.4s + orr v2.16b, v6.16b, v2.16b + add v6.4s, v7.4s, v2.4s + ext v7.16b, v18.16b, v18.16b, #4 + eor v17.16b, v17.16b, v6.16b + uzp1 v18.4s, v7.4s, v7.4s + tbl v17.16b, { v17.16b }, v1.16b + ext v18.16b, v18.16b, v7.16b, #8 + add v3.4s, v3.4s, v17.4s + uzp2 v18.4s, v18.4s, v4.4s + eor v2.16b, v3.16b, v2.16b + add v6.4s, v6.4s, v18.4s ext v3.16b, v3.16b, v3.16b, #12 - tbl v4.16b, { v4.16b }, v1.16b - add v2.4s, v3.4s, v2.4s - add v3.4s, v5.4s, v4.4s - eor v5.16b, v6.16b, v3.16b - ushr v6.4s, v5.4s, #7 + ext v18.16b, v18.16b, v18.16b, #4 + ushr v19.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v19.16b, v2.16b, v19.16b + ext v2.16b, v17.16b, v17.16b, #8 + ext v17.16b, v7.16b, v7.16b, #12 + add v6.4s, v6.4s, v19.4s + eor v2.16b, v2.16b, v6.16b + tbl v20.16b, { v2.16b }, v0.16b + ext v2.16b, v7.16b, v17.16b, #12 + mov v7.16b, v16.16b + add v17.4s, v3.4s, v20.4s + rev64 v3.4s, v2.4s + mov v7.s[1], v5.s[2] + eor v19.16b, v17.16b, v19.16b + trn2 v3.4s, v3.4s, v7.4s + ushr v21.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + add v6.4s, v6.4s, v3.4s + orr v19.16b, v19.16b, v21.16b + add v21.4s, v6.4s, v19.4s + eor v6.16b, v20.16b, v21.16b + zip1 v20.2d, v16.2d, v4.2d + zip2 v4.4s, v4.4s, v16.4s + tbl v22.16b, { v6.16b }, v1.16b + mov v20.s[3], v5.s[3] + add v17.4s, v17.4s, v22.4s + ext v6.16b, v20.16b, v20.16b, #12 + eor v19.16b, v17.16b, v19.16b + uzp1 v6.4s, v20.4s, v6.4s + ext v20.16b, v21.16b, v21.16b, #12 + ext v17.16b, v17.16b, v17.16b, #4 + ushr v21.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + add v20.4s, v20.4s, v6.4s + orr v19.16b, v19.16b, v21.16b + ext v21.16b, v22.16b, v22.16b, #8 + add v20.4s, v20.4s, v19.4s + eor v21.16b, v21.16b, v20.16b + tbl v21.16b, { v21.16b }, v0.16b + add v16.4s, v17.4s, v21.4s + zip1 v17.4s, v4.4s, v5.4s + zip1 v4.4s, v5.4s, v4.4s + eor v5.16b, v16.16b, v19.16b + ext v4.16b, v4.16b, v17.16b, #8 + ushr v17.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v19.4s, v20.4s, v4.4s + ext v20.16b, v18.16b, v18.16b, #8 + zip1 v3.2d, v4.2d, v3.2d + orr v5.16b, v5.16b, v17.16b + zip2 v2.4s, v2.4s, v4.4s + uzp2 
v7.4s, v20.4s, v7.4s + mov v3.s[3], v6.s[3] + add v17.4s, v19.4s, v5.4s + ext v7.16b, v7.16b, v20.16b, #4 + eor v19.16b, v21.16b, v17.16b + ext v17.16b, v17.16b, v17.16b, #4 + tbl v19.16b, { v19.16b }, v1.16b + add v7.4s, v17.4s, v7.4s + add v16.4s, v16.4s, v19.4s + ext v17.16b, v19.16b, v19.16b, #8 + ext v19.16b, v18.16b, v18.16b, #12 + eor v5.16b, v16.16b, v5.16b + ext v16.16b, v16.16b, v16.16b, #12 + ext v18.16b, v18.16b, v19.16b, #12 + mov v19.16b, v4.16b + ushr v20.4s, v5.4s, #7 shl v5.4s, v5.4s, #25 - orr v5.16b, v5.16b, v6.16b - ext v4.16b, v4.16b, v4.16b, #8 - add v2.4s, v2.4s, v5.4s - eor v4.16b, v2.16b, v4.16b - ext v3.16b, v3.16b, v3.16b, #4 - tbl v0.16b, { v4.16b }, v0.16b - add v3.4s, v3.4s, v0.4s - eor v4.16b, v5.16b, v3.16b - ushr v5.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - add v2.4s, v2.4s, v7.4s - orr v4.16b, v4.16b, v5.16b - add v2.4s, v2.4s, v4.4s + rev64 v18.4s, v18.4s + mov v19.s[1], v6.s[2] + orr v5.16b, v5.16b, v20.16b + trn2 v18.4s, v18.4s, v19.4s + add v7.4s, v5.4s, v7.4s + eor v17.16b, v17.16b, v7.16b + add v7.4s, v7.4s, v18.4s + ext v18.16b, v3.16b, v3.16b, #12 + tbl v17.16b, { v17.16b }, v0.16b + uzp1 v3.4s, v3.4s, v18.4s + add v16.4s, v16.4s, v17.4s + eor v5.16b, v16.16b, v5.16b + ushr v19.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v19.16b + add v7.4s, v7.4s, v5.4s + eor v17.16b, v17.16b, v7.16b + ext v7.16b, v7.16b, v7.16b, #12 + tbl v17.16b, { v17.16b }, v1.16b + add v3.4s, v7.4s, v3.4s + add v16.4s, v16.4s, v17.4s + ext v7.16b, v17.16b, v17.16b, #8 + eor v5.16b, v16.16b, v5.16b + ext v16.16b, v16.16b, v16.16b, #4 + ushr v18.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v18.16b + add v3.4s, v3.4s, v5.4s + eor v7.16b, v7.16b, v3.16b + tbl v0.16b, { v7.16b }, v0.16b + zip1 v7.4s, v2.4s, v6.4s + zip1 v2.4s, v6.4s, v2.4s + add v4.4s, v16.4s, v0.4s + ext v2.16b, v2.16b, v7.16b, #8 + eor v5.16b, v4.16b, v5.16b + add v2.4s, v3.4s, v2.4s + ushr v6.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v3.16b, v5.16b, v6.16b + add v2.4s, v2.4s, v3.4s eor v0.16b, v0.16b, v2.16b - tbl v0.16b, { v0.16b }, v1.16b - add v1.4s, v3.4s, v0.4s - eor v3.16b, v4.16b, v1.16b ext v2.16b, v2.16b, v2.16b, #4 + tbl v0.16b, { v0.16b }, v1.16b + add v1.4s, v4.4s, v0.4s + ext v0.16b, v0.16b, v0.16b, #8 + eor v3.16b, v1.16b, v3.16b ext v1.16b, v1.16b, v1.16b, #12 ushr v4.4s, v3.4s, #7 shl v3.4s, v3.4s, #25 - ext v0.16b, v0.16b, v0.16b, #8 - eor v1.16b, v2.16b, v1.16b - orr v2.16b, v3.16b, v4.16b + stp q1, q0, [x0, #32] + orr v3.16b, v3.16b, v4.16b + stp q2, q3, [x0] + ret +.Lfunc_end1: + .size compress_pre, .Lfunc_end1-compress_pre + .cfi_endproc + + .globl zfs_blake3_compress_xof_sse41 + .p2align 2 + .type zfs_blake3_compress_xof_sse41,@function +zfs_blake3_compress_xof_sse41: + .cfi_startproc + hint #25 + .cfi_negate_ra_state + sub sp, sp, #96 + stp x29, x30, [sp, #64] + add x29, sp, #64 + stp x20, x19, [sp, #80] + .cfi_def_cfa w29, 32 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w30, -24 + .cfi_offset w29, -32 + mov x20, x0 + mov x19, x5 + mov w5, w4 + mov x4, x3 + mov w3, w2 + mov x2, x1 + mov x0, sp + mov x1, x20 + bl compress_pre + ldp q0, q1, [sp] + ldp q2, q3, [sp, #32] eor v0.16b, v2.16b, v0.16b - stp q1, q0, [x0] + eor v1.16b, v3.16b, v1.16b + ldp x29, x30, [sp, #64] + stp q0, q1, [x19] + ldr q0, [x20] + eor v0.16b, v0.16b, v2.16b + str q0, [x19, #32] + ldr q0, [x20, #16] + eor v0.16b, v0.16b, v3.16b + str q0, [x19, #48] + ldp x20, x19, [sp, #80] + add sp, sp, #96 + hint #29 ret -.Lfunc_end0: - .size zfs_blake3_compress_in_place_sse41, 
.Lfunc_end0-zfs_blake3_compress_in_place_sse41 +.Lfunc_end2: + .size zfs_blake3_compress_xof_sse41, .Lfunc_end2-zfs_blake3_compress_xof_sse41 .cfi_endproc .section .rodata.cst16,"aM",@progbits,16 .p2align 4 -.LCPI1_0: +.LCPI3_0: + .word 0 + .word 1 + .word 2 + .word 3 +.LCPI3_1: .byte 2 .byte 3 .byte 0 @@ -558,12 +628,7 @@ zfs_blake3_compress_in_place_sse41: .byte 15 .byte 12 .byte 13 -.LCPI1_1: - .word 1779033703 - .word 3144134277 - .word 1013904242 - .word 2773480762 -.LCPI1_2: +.LCPI3_2: .byte 1 .byte 2 .byte 3 @@ -580,540 +645,29 @@ zfs_blake3_compress_in_place_sse41: .byte 14 .byte 15 .byte 12 -.LCPI1_3: - .byte 0 - .byte 1 - .byte 2 - .byte 3 - .byte 20 - .byte 21 - .byte 22 - .byte 23 - .byte 8 - .byte 9 - .byte 10 - .byte 11 - .byte 28 - .byte 29 - .byte 30 - .byte 31 -.LCPI1_4: - .byte 0 - .byte 1 - .byte 2 - .byte 3 - .byte 4 - .byte 5 - .byte 6 - .byte 7 - .byte 8 - .byte 9 - .byte 10 - .byte 11 - .byte 28 - .byte 29 - .byte 30 - .byte 31 +.LCPI3_3: + .word 1779033703 + .word 3144134277 + .word 1013904242 + .word 2773480762 .text - .globl zfs_blake3_compress_xof_sse41 + .globl zfs_blake3_hash_many_sse41 .p2align 2 - .type zfs_blake3_compress_xof_sse41,@function -zfs_blake3_compress_xof_sse41: + .type zfs_blake3_hash_many_sse41,@function +zfs_blake3_hash_many_sse41: .cfi_startproc - ldp q7, q6, [x0] - ldp q17, q18, [x1] - add x12, x1, #32 - ld2 { v4.4s, v5.4s }, [x12] - lsr x10, x3, #32 - fmov s16, w3 - adrp x13, .LCPI1_0 - adrp x11, .LCPI1_1 - and w8, w2, #0xff - mov v16.s[1], w10 - ldr q0, [x13, :lo12:.LCPI1_0] - ldr q20, [x11, :lo12:.LCPI1_1] - adrp x11, .LCPI1_4 - and w9, w4, #0xff - ldr q2, [x11, :lo12:.LCPI1_4] - mov v16.s[2], w8 - uzp1 v21.4s, v17.4s, v18.4s - add v7.4s, v6.4s, v7.4s - adrp x12, .LCPI1_3 - mov v16.s[3], w9 - uzp2 v18.4s, v17.4s, v18.4s - add v7.4s, v7.4s, v21.4s - ext v17.16b, v5.16b, v5.16b, #12 - ldr q3, [x12, :lo12:.LCPI1_3] - ext v24.16b, v4.16b, v4.16b, #12 - eor v16.16b, v7.16b, v16.16b - mov v27.16b, v17.16b - uzp1 v19.4s, v21.4s, v21.4s - ext v25.16b, v21.16b, v21.16b, #12 - zip2 v28.4s, v18.4s, v17.4s - tbl v29.16b, { v16.16b }, v0.16b - mov v27.s[1], v24.s[2] - zip1 v23.2d, v17.2d, v18.2d - ext v19.16b, v19.16b, v21.16b, #8 - add v22.4s, v29.4s, v20.4s - ext v26.16b, v21.16b, v25.16b, #12 - tbl v20.16b, { v23.16b, v24.16b }, v2.16b - zip1 v21.4s, v28.4s, v24.4s - zip1 v23.4s, v24.4s, v28.4s - uzp2 v19.4s, v19.4s, v18.4s - eor v24.16b, v22.16b, v6.16b - ext v25.16b, v20.16b, v20.16b, #12 - ext v6.16b, v23.16b, v21.16b, #8 - add v7.4s, v7.4s, v18.4s - ext v18.16b, v19.16b, v19.16b, #4 - tbl v16.16b, { v26.16b, v27.16b }, v3.16b - uzp1 v21.4s, v20.4s, v25.4s - mov v26.16b, v6.16b - ext v23.16b, v18.16b, v18.16b, #12 - mov v26.s[1], v21.s[2] - adrp x10, .LCPI1_2 - ext v25.16b, v18.16b, v23.16b, #12 - uzp1 v23.4s, v18.4s, v18.4s - ldr q1, [x10, :lo12:.LCPI1_2] - ext v18.16b, v23.16b, v18.16b, #8 - ushr v23.4s, v24.4s, #12 - shl v24.4s, v24.4s, #20 - orr v23.16b, v24.16b, v23.16b - add v7.4s, v7.4s, v23.4s - eor v27.16b, v29.16b, v7.16b - add v4.4s, v7.4s, v4.4s - tbl v7.16b, { v25.16b, v26.16b }, v3.16b - tbl v26.16b, { v27.16b }, v1.16b - add v22.4s, v22.4s, v26.4s - uzp2 v18.4s, v18.4s, v16.4s - eor v23.16b, v23.16b, v22.16b - ext v5.16b, v18.16b, v18.16b, #4 - ushr v27.4s, v23.4s, #7 - shl v23.4s, v23.4s, #25 - uzp1 v25.4s, v5.4s, v5.4s - orr v23.16b, v23.16b, v27.16b - ext v28.16b, v4.16b, v4.16b, #12 - ext v4.16b, v25.16b, v5.16b, #8 - ext v25.16b, v26.16b, v26.16b, #8 - add v26.4s, v28.4s, v23.4s - eor v25.16b, v26.16b, v25.16b - ext v22.16b, 
v22.16b, v22.16b, #4 - tbl v25.16b, { v25.16b }, v0.16b - add v22.4s, v22.4s, v25.4s - eor v23.16b, v23.16b, v22.16b - add v17.4s, v26.4s, v17.4s - ushr v26.4s, v23.4s, #12 - shl v23.4s, v23.4s, #20 - orr v23.16b, v23.16b, v26.16b - add v17.4s, v17.4s, v23.4s - eor v25.16b, v25.16b, v17.16b - add v17.4s, v17.4s, v19.4s - tbl v19.16b, { v25.16b }, v1.16b - add v22.4s, v22.4s, v19.4s - eor v23.16b, v23.16b, v22.16b - ushr v25.4s, v23.4s, #7 - shl v23.4s, v23.4s, #25 - ext v17.16b, v17.16b, v17.16b, #4 - orr v23.16b, v23.16b, v25.16b - ext v19.16b, v19.16b, v19.16b, #8 - add v17.4s, v17.4s, v23.4s - eor v19.16b, v17.16b, v19.16b - ext v22.16b, v22.16b, v22.16b, #12 - tbl v19.16b, { v19.16b }, v0.16b - add v22.4s, v22.4s, v19.4s - eor v23.16b, v23.16b, v22.16b - ushr v25.4s, v23.4s, #12 - shl v23.4s, v23.4s, #20 - add v17.4s, v17.4s, v16.4s - orr v23.16b, v23.16b, v25.16b - add v17.4s, v17.4s, v23.4s - ext v25.16b, v17.16b, v17.16b, #12 - eor v17.16b, v19.16b, v17.16b - tbl v17.16b, { v17.16b }, v1.16b - add v19.4s, v22.4s, v17.4s - eor v22.16b, v23.16b, v19.16b - add v25.4s, v25.4s, v21.4s - zip1 v20.2d, v6.2d, v16.2d - ushr v23.4s, v22.4s, #7 - shl v22.4s, v22.4s, #25 - zip2 v24.4s, v16.4s, v6.4s - tbl v26.16b, { v20.16b, v21.16b }, v2.16b - orr v22.16b, v22.16b, v23.16b - zip1 v16.4s, v24.4s, v21.4s - zip1 v20.4s, v21.4s, v24.4s - ext v21.16b, v26.16b, v26.16b, #12 - ext v17.16b, v17.16b, v17.16b, #8 - add v25.4s, v25.4s, v22.4s - ext v16.16b, v20.16b, v16.16b, #8 - uzp1 v21.4s, v26.4s, v21.4s - eor v26.16b, v25.16b, v17.16b - ext v19.16b, v19.16b, v19.16b, #4 - tbl v26.16b, { v26.16b }, v0.16b - mov v29.16b, v16.16b - add v19.4s, v19.4s, v26.4s - ext v27.16b, v5.16b, v5.16b, #12 - mov v29.s[1], v21.s[2] - eor v22.16b, v22.16b, v19.16b - ext v28.16b, v5.16b, v27.16b, #12 - ushr v27.4s, v22.4s, #12 - shl v22.4s, v22.4s, #20 - add v6.4s, v25.4s, v6.4s - orr v22.16b, v22.16b, v27.16b - add v6.4s, v6.4s, v22.4s - eor v26.16b, v26.16b, v6.16b - add v6.4s, v6.4s, v18.4s - tbl v18.16b, { v26.16b }, v1.16b - add v19.4s, v19.4s, v18.4s - eor v22.16b, v22.16b, v19.16b - ushr v26.4s, v22.4s, #7 - shl v22.4s, v22.4s, #25 - ext v6.16b, v6.16b, v6.16b, #4 - orr v22.16b, v22.16b, v26.16b - ext v18.16b, v18.16b, v18.16b, #8 - add v6.4s, v6.4s, v22.4s - eor v18.16b, v6.16b, v18.16b - ext v19.16b, v19.16b, v19.16b, #12 - tbl v18.16b, { v18.16b }, v0.16b - add v19.4s, v19.4s, v18.4s - eor v22.16b, v22.16b, v19.16b - ushr v26.4s, v22.4s, #12 - shl v22.4s, v22.4s, #20 - add v6.4s, v6.4s, v7.4s - orr v22.16b, v22.16b, v26.16b - add v6.4s, v6.4s, v22.4s - ext v26.16b, v6.16b, v6.16b, #12 - eor v6.16b, v18.16b, v6.16b - uzp2 v4.4s, v4.4s, v7.4s - zip2 v25.4s, v7.4s, v16.4s - add v26.4s, v26.4s, v21.4s - zip1 v20.2d, v16.2d, v7.2d - tbl v6.16b, { v6.16b }, v1.16b - ext v24.16b, v4.16b, v4.16b, #4 - tbl v27.16b, { v20.16b, v21.16b }, v2.16b - zip1 v7.4s, v25.4s, v21.4s - zip1 v20.4s, v21.4s, v25.4s - add v18.4s, v19.4s, v6.4s - uzp1 v5.4s, v24.4s, v24.4s - ext v21.16b, v27.16b, v27.16b, #12 - ext v7.16b, v20.16b, v7.16b, #8 - eor v19.16b, v22.16b, v18.16b - ext v5.16b, v5.16b, v24.16b, #8 - tbl v17.16b, { v28.16b, v29.16b }, v3.16b - uzp1 v21.4s, v27.4s, v21.4s - mov v28.16b, v7.16b - ushr v22.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - ext v23.16b, v24.16b, v24.16b, #12 - uzp2 v5.4s, v5.4s, v17.4s - mov v28.s[1], v21.s[2] - orr v19.16b, v19.16b, v22.16b - ext v27.16b, v24.16b, v23.16b, #12 - ext v23.16b, v5.16b, v5.16b, #4 - ext v6.16b, v6.16b, v6.16b, #8 - ext v25.16b, v18.16b, v18.16b, #4 - add v18.4s, v26.4s, 
v19.4s - uzp1 v24.4s, v23.4s, v23.4s - eor v6.16b, v18.16b, v6.16b - ext v24.16b, v24.16b, v23.16b, #8 - add v16.4s, v18.4s, v16.4s - tbl v18.16b, { v27.16b, v28.16b }, v3.16b - tbl v27.16b, { v6.16b }, v0.16b - uzp2 v6.4s, v24.4s, v18.4s - add v24.4s, v25.4s, v27.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - orr v19.16b, v19.16b, v25.16b - add v16.4s, v16.4s, v19.4s - eor v25.16b, v27.16b, v16.16b - add v4.4s, v16.4s, v4.4s - tbl v16.16b, { v25.16b }, v1.16b - add v24.4s, v24.4s, v16.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - ext v4.16b, v4.16b, v4.16b, #4 - orr v19.16b, v19.16b, v25.16b - ext v16.16b, v16.16b, v16.16b, #8 - add v4.4s, v4.4s, v19.4s - eor v16.16b, v4.16b, v16.16b - ext v24.16b, v24.16b, v24.16b, #12 - tbl v25.16b, { v16.16b }, v0.16b - add v24.4s, v24.4s, v25.4s - eor v16.16b, v19.16b, v24.16b - ushr v19.4s, v16.4s, #12 - shl v16.4s, v16.4s, #20 - add v4.4s, v4.4s, v17.4s - orr v19.16b, v16.16b, v19.16b - add v27.4s, v4.4s, v19.4s - eor v25.16b, v25.16b, v27.16b - tbl v25.16b, { v25.16b }, v1.16b - add v24.4s, v24.4s, v25.4s - zip2 v26.4s, v17.4s, v7.4s - ext v4.16b, v27.16b, v27.16b, #12 - eor v19.16b, v19.16b, v24.16b - add v28.4s, v4.4s, v21.4s - zip1 v20.2d, v7.2d, v17.2d - zip1 v4.4s, v26.4s, v21.4s - zip1 v17.4s, v21.4s, v26.4s - ushr v26.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - orr v19.16b, v19.16b, v26.16b - ext v25.16b, v25.16b, v25.16b, #8 - add v27.4s, v28.4s, v19.4s - eor v25.16b, v27.16b, v25.16b - ext v24.16b, v24.16b, v24.16b, #4 - tbl v25.16b, { v25.16b }, v0.16b - add v24.4s, v24.4s, v25.4s - eor v19.16b, v19.16b, v24.16b - add v7.4s, v27.4s, v7.4s - ushr v27.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - orr v19.16b, v19.16b, v27.16b - add v7.4s, v7.4s, v19.4s - eor v25.16b, v25.16b, v7.16b - add v5.4s, v7.4s, v5.4s - tbl v7.16b, { v25.16b }, v1.16b - add v24.4s, v24.4s, v7.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - ext v5.16b, v5.16b, v5.16b, #4 - orr v19.16b, v19.16b, v25.16b - ext v7.16b, v7.16b, v7.16b, #8 - add v5.4s, v5.4s, v19.4s - eor v7.16b, v5.16b, v7.16b - ext v24.16b, v24.16b, v24.16b, #12 - tbl v7.16b, { v7.16b }, v0.16b - add v24.4s, v24.4s, v7.4s - eor v19.16b, v19.16b, v24.16b - ushr v25.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - tbl v16.16b, { v20.16b, v21.16b }, v2.16b - add v5.4s, v5.4s, v18.4s - orr v19.16b, v19.16b, v25.16b - ext v20.16b, v16.16b, v16.16b, #12 - ext v4.16b, v17.16b, v4.16b, #8 - add v5.4s, v5.4s, v19.4s - uzp1 v21.4s, v16.4s, v20.4s - mov v17.16b, v4.16b - ext v25.16b, v5.16b, v5.16b, #12 - mov v17.s[1], v21.s[2] - add v25.4s, v25.4s, v21.4s - zip1 v20.2d, v4.2d, v18.2d - ext v22.16b, v23.16b, v23.16b, #12 - zip2 v26.4s, v18.4s, v4.4s - tbl v18.16b, { v20.16b, v21.16b }, v2.16b - eor v5.16b, v7.16b, v5.16b - ext v16.16b, v23.16b, v22.16b, #12 - ext v22.16b, v6.16b, v6.16b, #4 - zip1 v27.4s, v26.4s, v21.4s - zip1 v20.4s, v21.4s, v26.4s - ext v21.16b, v18.16b, v18.16b, #12 - tbl v5.16b, { v5.16b }, v1.16b - ext v20.16b, v20.16b, v27.16b, #8 - uzp1 v27.4s, v18.4s, v21.4s - uzp1 v18.4s, v22.4s, v22.4s - add v21.4s, v24.4s, v5.4s - ext v18.16b, v18.16b, v22.16b, #8 - eor v19.16b, v19.16b, v21.16b - tbl v7.16b, { v16.16b, v17.16b }, v3.16b - uzp2 v18.4s, v18.4s, v17.4s - zip2 v16.4s, v16.4s, v20.4s - ushr v17.4s, v19.4s, #7 - shl v19.4s, v19.4s, #25 - orr v17.16b, v19.16b, v17.16b - ext v5.16b, v5.16b, v5.16b, #8 - add v19.4s, v25.4s, v17.4s - eor v5.16b, v19.16b, v5.16b - ext v21.16b, 
v21.16b, v21.16b, #4 - tbl v5.16b, { v5.16b }, v0.16b - add v4.4s, v19.4s, v4.4s - add v19.4s, v21.4s, v5.4s - eor v17.16b, v17.16b, v19.16b - ushr v21.4s, v17.4s, #12 - shl v17.4s, v17.4s, #20 - orr v17.16b, v17.16b, v21.16b - add v4.4s, v4.4s, v17.4s - eor v5.16b, v5.16b, v4.16b - tbl v5.16b, { v5.16b }, v1.16b - add v4.4s, v4.4s, v6.4s - add v6.4s, v19.4s, v5.4s - eor v17.16b, v17.16b, v6.16b - ushr v19.4s, v17.4s, #7 - shl v17.4s, v17.4s, #25 - ext v4.16b, v4.16b, v4.16b, #4 - orr v17.16b, v17.16b, v19.16b - ext v5.16b, v5.16b, v5.16b, #8 - add v4.4s, v4.4s, v17.4s - eor v5.16b, v4.16b, v5.16b - ext v6.16b, v6.16b, v6.16b, #12 - tbl v5.16b, { v5.16b }, v0.16b - add v6.4s, v6.4s, v5.4s - eor v17.16b, v17.16b, v6.16b - ushr v19.4s, v17.4s, #12 - shl v17.4s, v17.4s, #20 - add v4.4s, v4.4s, v7.4s - orr v17.16b, v17.16b, v19.16b - add v4.4s, v4.4s, v17.4s - eor v5.16b, v5.16b, v4.16b - tbl v5.16b, { v5.16b }, v1.16b - mov v29.16b, v20.16b - ext v4.16b, v4.16b, v4.16b, #12 - add v6.4s, v6.4s, v5.4s - mov v29.s[1], v27.s[2] - add v4.4s, v4.4s, v27.4s - zip1 v26.2d, v20.2d, v7.2d - zip1 v7.4s, v16.4s, v27.4s - zip1 v16.4s, v27.4s, v16.4s - eor v17.16b, v17.16b, v6.16b - ext v7.16b, v16.16b, v7.16b, #8 - ushr v16.4s, v17.4s, #7 - shl v17.4s, v17.4s, #25 - orr v16.16b, v17.16b, v16.16b - ext v5.16b, v5.16b, v5.16b, #8 - add v4.4s, v4.4s, v16.4s - eor v5.16b, v4.16b, v5.16b - ext v6.16b, v6.16b, v6.16b, #4 - tbl v5.16b, { v5.16b }, v0.16b - add v6.4s, v6.4s, v5.4s - eor v16.16b, v16.16b, v6.16b - ushr v17.4s, v16.4s, #12 - shl v16.4s, v16.4s, #20 - add v4.4s, v4.4s, v20.4s - orr v16.16b, v16.16b, v17.16b - add v4.4s, v4.4s, v16.4s - eor v5.16b, v5.16b, v4.16b - tbl v5.16b, { v5.16b }, v1.16b - add v6.4s, v6.4s, v5.4s - eor v16.16b, v16.16b, v6.16b - add v4.4s, v4.4s, v18.4s - ushr v17.4s, v16.4s, #7 - shl v16.4s, v16.4s, #25 - ext v23.16b, v22.16b, v22.16b, #12 - ext v4.16b, v4.16b, v4.16b, #4 - orr v16.16b, v16.16b, v17.16b - ext v28.16b, v22.16b, v23.16b, #12 - ext v5.16b, v5.16b, v5.16b, #8 - add v4.4s, v16.4s, v4.4s - tbl v3.16b, { v28.16b, v29.16b }, v3.16b - eor v5.16b, v4.16b, v5.16b - ext v6.16b, v6.16b, v6.16b, #12 - add v3.4s, v4.4s, v3.4s - tbl v4.16b, { v5.16b }, v0.16b - add v5.4s, v6.4s, v4.4s - eor v6.16b, v16.16b, v5.16b - ushr v16.4s, v6.4s, #12 - shl v6.4s, v6.4s, #20 - orr v6.16b, v6.16b, v16.16b - tbl v2.16b, { v26.16b, v27.16b }, v2.16b - add v3.4s, v3.4s, v6.4s - ext v19.16b, v2.16b, v2.16b, #12 - eor v4.16b, v4.16b, v3.16b - uzp1 v2.4s, v2.4s, v19.4s - ext v3.16b, v3.16b, v3.16b, #12 - tbl v4.16b, { v4.16b }, v1.16b - add v2.4s, v3.4s, v2.4s - add v3.4s, v5.4s, v4.4s - eor v5.16b, v6.16b, v3.16b - ushr v6.4s, v5.4s, #7 - shl v5.4s, v5.4s, #25 - orr v5.16b, v5.16b, v6.16b - ext v4.16b, v4.16b, v4.16b, #8 - add v2.4s, v2.4s, v5.4s - eor v4.16b, v2.16b, v4.16b - ext v3.16b, v3.16b, v3.16b, #4 - tbl v0.16b, { v4.16b }, v0.16b - add v3.4s, v3.4s, v0.4s - eor v4.16b, v5.16b, v3.16b - ushr v5.4s, v4.4s, #12 - shl v4.4s, v4.4s, #20 - add v2.4s, v2.4s, v7.4s - orr v4.16b, v4.16b, v5.16b - add v2.4s, v2.4s, v4.4s - eor v0.16b, v0.16b, v2.16b - tbl v0.16b, { v0.16b }, v1.16b - add v1.4s, v3.4s, v0.4s - eor v3.16b, v4.16b, v1.16b - ushr v4.4s, v3.4s, #7 - shl v3.4s, v3.4s, #25 - ext v2.16b, v2.16b, v2.16b, #4 - ext v0.16b, v0.16b, v0.16b, #8 - ext v1.16b, v1.16b, v1.16b, #12 - orr v3.16b, v3.16b, v4.16b - eor v2.16b, v2.16b, v1.16b - eor v3.16b, v3.16b, v0.16b - stp q2, q3, [x5] - ldr q2, [x0] - eor v1.16b, v2.16b, v1.16b - str q1, [x5, #32] - ldr q1, [x0, #16] - eor v0.16b, v1.16b, 
v0.16b - str q0, [x5, #48] - ret -.Lfunc_end1: - .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-zfs_blake3_compress_xof_sse41 - .cfi_endproc - - .section .rodata.cst16,"aM",@progbits,16 - .p2align 4 -.LCPI2_0: - .word 0 - .word 1 - .word 2 - .word 3 -.LCPI2_1: - .byte 2 - .byte 3 - .byte 0 - .byte 1 - .byte 6 - .byte 7 - .byte 4 - .byte 5 - .byte 10 - .byte 11 - .byte 8 - .byte 9 - .byte 14 - .byte 15 - .byte 12 - .byte 13 -.LCPI2_2: - .byte 1 - .byte 2 - .byte 3 - .byte 0 - .byte 5 - .byte 6 - .byte 7 - .byte 4 - .byte 9 - .byte 10 - .byte 11 - .byte 8 - .byte 13 - .byte 14 - .byte 15 - .byte 12 - .text - .globl zfs_blake3_hash_many_sse41 - .p2align 2 - .type zfs_blake3_hash_many_sse41,@function -zfs_blake3_hash_many_sse41: - .cfi_startproc - stp d15, d14, [sp, #-160]! + hint #34 + stp d15, d14, [sp, #-144]! stp d13, d12, [sp, #16] stp d11, d10, [sp, #32] stp d9, d8, [sp, #48] - stp x29, x30, [sp, #64] - stp x28, x27, [sp, #80] - stp x26, x25, [sp, #96] - stp x24, x23, [sp, #112] - stp x22, x21, [sp, #128] - stp x20, x19, [sp, #144] - mov x29, sp - sub sp, sp, #448 - .cfi_def_cfa w29, 160 + stp x29, x27, [sp, #64] + stp x26, x25, [sp, #80] + stp x24, x23, [sp, #96] + stp x22, x21, [sp, #112] + stp x20, x19, [sp, #128] + sub sp, sp, #368 + .cfi_def_cfa_offset 512 .cfi_offset w19, -8 .cfi_offset w20, -16 .cfi_offset w21, -24 @@ -1123,1341 +677,1722 @@ zfs_blake3_hash_many_sse41: .cfi_offset w25, -56 .cfi_offset w26, -64 .cfi_offset w27, -72 - .cfi_offset w28, -80 - .cfi_offset w30, -88 - .cfi_offset w29, -96 - .cfi_offset b8, -104 - .cfi_offset b9, -112 - .cfi_offset b10, -120 - .cfi_offset b11, -128 - .cfi_offset b12, -136 - .cfi_offset b13, -144 - .cfi_offset b14, -152 - .cfi_offset b15, -160 - ldr x26, [x29, #168] - ldrb w27, [x29, #160] - mov w19, w6 - mov x20, x4 - mov x22, x2 - mov x28, x1 + .cfi_offset w29, -80 + .cfi_offset b8, -88 + .cfi_offset b9, -96 + .cfi_offset b10, -104 + .cfi_offset b11, -112 + .cfi_offset b12, -120 + .cfi_offset b13, -128 + .cfi_offset b14, -136 + .cfi_offset b15, -144 + ldr x8, [sp, #520] + adrp x11, .LCPI3_1 + ldrb w9, [sp, #512] + adrp x10, .LCPI3_2 cmp x1, #4 - mov x24, x0 - str x3, [sp, #40] - b.lo .LBB2_8 - adrp x11, .LCPI2_0 - ldr q0, [x11, :lo12:.LCPI2_0] + b.lo .LBB3_6 + adrp x12, .LCPI3_0 sbfx w13, w5, #0, #1 + mov w15, #58983 + mov w16, #44677 + movk w15, #27145, lsl #16 + movk w16, #47975, lsl #16 + ldr q0, [x12, :lo12:.LCPI3_0] dup v1.4s, w13 - mov w10, #58983 - mov w11, #44677 - mov w12, #62322 + movi v13.4s, #64 + mov w13, #62322 + mov w14, #62778 + orr w12, w7, w6 and v0.16b, v1.16b, v0.16b - mov w13, #62778 - orr w8, w7, w19 - adrp x9, .LCPI2_1 - movk w10, #27145, lsl #16 - movk w11, #47975, lsl #16 - movk w12, #15470, lsl #16 - movk w13, #42319, lsl #16 - str q0, [sp, #16] + ldr q1, [x11, :lo12:.LCPI3_1] + movk w13, #15470, lsl #16 + movk w14, #42319, lsl #16 + dup v14.4s, w15 + stp q0, q1, [sp, #16] orr v0.4s, #128, lsl #24 - adrp x14, .LCPI2_2 str q0, [sp] -.LBB2_2: - ldr x2, [sp, #40] - mov x15, x2 - ld1r { v7.4s }, [x15], #4 - add x16, x2, #8 - add x17, x2, #12 - add x18, x2, #16 - add x0, x2, #20 - add x3, x2, #24 - add x2, x2, #28 - ld1r { v6.4s }, [x16] - ld1r { v17.4s }, [x17] - ld1r { v10.4s }, [x18] - ld1r { v11.4s }, [x0] - ld1r { v19.4s }, [x3] - ld1r { v18.4s }, [x15] - ld1r { v16.4s }, [x2] - cbz x22, .LBB2_7 + dup v0.4s, w16 + stp q0, q14, [sp, #48] + b .LBB3_3 +.LBB3_2: + zip1 v0.4s, v29.4s, v8.4s + add x15, x4, #4 + zip1 v1.4s, v30.4s, v31.4s + tst w5, #0x1 + zip1 v2.4s, v24.4s, v18.4s + csel x4, x15, x4, ne + zip1 
v3.4s, v25.4s, v26.4s + add x0, x0, #32 + zip2 v6.4s, v29.4s, v8.4s + sub x1, x1, #4 + zip1 v4.2d, v0.2d, v1.2d + cmp x1, #3 + zip2 v7.4s, v30.4s, v31.4s + zip1 v5.2d, v2.2d, v3.2d + zip2 v0.2d, v0.2d, v1.2d + zip2 v1.2d, v2.2d, v3.2d + zip2 v2.4s, v24.4s, v18.4s + zip2 v3.4s, v25.4s, v26.4s + stp q4, q5, [x8] + zip2 v4.2d, v6.2d, v7.2d + stp q0, q1, [x8, #32] + zip1 v0.2d, v6.2d, v7.2d + zip1 v1.2d, v2.2d, v3.2d + zip2 v2.2d, v2.2d, v3.2d + stp q0, q1, [x8, #64] + stp q4, q2, [x8, #96] + add x8, x8, #128 + b.ls .LBB3_6 +.LBB3_3: + mov x15, x3 + add x16, x3, #8 + add x17, x3, #12 + add x19, x3, #16 + add x20, x3, #20 + ld1r { v29.4s }, [x15], #4 + ld1r { v30.4s }, [x16] + add x16, x3, #24 + ld1r { v31.4s }, [x17] + add x17, x3, #28 + ld1r { v24.4s }, [x19] + ld1r { v18.4s }, [x20] + ld1r { v25.4s }, [x16] + ld1r { v8.4s }, [x15] + ld1r { v26.4s }, [x17] + cbz x2, .LBB3_2 ldr q1, [sp, #16] - dup v0.4s, w20 - ldp x15, x16, [x24] - ldp x17, x18, [x24, #16] + dup v0.4s, w4 + lsr x17, x4, #32 + mov x15, xzr + ldp x19, x20, [x0, #16] add v1.4s, v0.4s, v1.4s + mov x21, x2 movi v0.4s, #128, lsl #24 - str q1, [sp, #64] + mov w26, w12 + str q1, [sp, #96] eor v0.16b, v1.16b, v0.16b ldr q1, [sp] - lsr x2, x20, #32 - mov x0, xzr - mov w6, w8 cmgt v0.4s, v1.4s, v0.4s - dup v1.4s, w2 + dup v1.4s, w17 + ldp x16, x17, [x0] sub v0.4s, v1.4s, v0.4s - str q0, [sp, #48] -.LBB2_4: - mov w4, #16 - stp q16, q17, [sp, #192] - bfi x4, x0, #6, #58 - ldr q1, [x15, x4] - ldr q3, [x16, x4] - ldr q2, [x17, x4] - ldr q4, [x18, x4] - mov w4, #32 - bfi x4, x0, #6, #58 - ldr q5, [x15, x4] - ldr q20, [x16, x4] - ldr q21, [x17, x4] - ldr q22, [x18, x4] - mov w4, #48 - lsl x3, x0, #6 - bfi x4, x0, #6, #58 - add x0, x0, #1 - ldr q0, [x15, x3] - ldr q23, [x16, x3] - ldr q16, [x17, x3] - ldr q17, [x18, x3] - cmp x0, x22 - ldr q25, [x15, x4] - ldr q14, [x16, x4] - ldr q28, [x17, x4] - ldr q31, [x18, x4] - csel w4, w27, wzr, eq - orr w4, w4, w6 - mov x2, xzr - and w6, w4, #0xff - add x3, x3, #256 -.LBB2_5: - ldr x4, [x24, x2] - add x2, x2, #8 - cmp x2, #32 - add x4, x4, x3 - prfm pldl1keep, [x4] - b.ne .LBB2_5 - zip1 v29.4s, v0.4s, v23.4s - zip2 v23.4s, v0.4s, v23.4s - zip1 v0.4s, v16.4s, v17.4s - zip2 v24.4s, v16.4s, v17.4s - zip1 v9.4s, v1.4s, v3.4s - zip2 v26.4s, v1.4s, v3.4s - zip1 v27.4s, v2.4s, v4.4s - zip2 v17.4s, v2.4s, v4.4s - zip1 v12.4s, v21.4s, v22.4s - zip2 v13.4s, v21.4s, v22.4s - add v2.4s, v7.4s, v10.4s - add v1.4s, v18.4s, v11.4s - ext v7.16b, v0.16b, v29.16b, #8 - ext v22.16b, v24.16b, v23.16b, #8 - zip1 v30.4s, v5.4s, v20.4s - zip2 v20.4s, v5.4s, v20.4s - stp q1, q2, [sp, #112] - ext v2.16b, v29.16b, v7.16b, #8 - mov v29.d[1], v0.d[0] - ext v18.16b, v23.16b, v22.16b, #8 - mov v23.d[1], v24.d[0] - zip1 v21.4s, v25.4s, v14.4s - zip2 v4.4s, v25.4s, v14.4s - zip1 v14.4s, v28.4s, v31.4s - zip2 v15.4s, v28.4s, v31.4s - add v8.4s, v6.4s, v19.4s - ext v28.16b, v27.16b, v9.16b, #8 - ext v31.16b, v17.16b, v26.16b, #8 - stur q2, [x29, #-208] - mov v7.16b, v29.16b - ext v0.16b, v12.16b, v30.16b, #8 - stp q23, q29, [x29, #-80] - mov v2.16b, v19.16b - ext v19.16b, v13.16b, v20.16b, #8 - mov v29.16b, v9.16b - ext v25.16b, v9.16b, v28.16b, #8 - mov v29.d[1], v27.d[0] - ext v24.16b, v26.16b, v31.16b, #8 - mov v26.d[1], v17.d[0] - ext v17.16b, v15.16b, v4.16b, #8 - ext v27.16b, v30.16b, v0.16b, #8 - ext v0.16b, v20.16b, v19.16b, #8 - stp q0, q25, [sp, #80] - ext v0.16b, v4.16b, v17.16b, #8 - str q0, [sp, #224] - ldr q0, [sp, #128] - mov v6.16b, v23.16b - mov v22.16b, v4.16b - ldr q16, [x9, :lo12:.LCPI2_1] - add v17.4s, v0.4s, 
v7.4s - ldr q0, [sp, #112] - mov v30.d[1], v12.d[0] - add v7.4s, v8.4s, v29.4s - mov v20.d[1], v13.d[0] - add v4.4s, v0.4s, v6.4s - ldr q0, [sp, #64] - dup v3.4s, w12 - ext v28.16b, v14.16b, v21.16b, #8 - dup v1.4s, w10 - eor v19.16b, v17.16b, v0.16b - ldr q0, [sp, #48] - ext v23.16b, v21.16b, v28.16b, #8 - mov v21.d[1], v14.d[0] - tbl v14.16b, { v19.16b }, v16.16b - eor v12.16b, v4.16b, v0.16b - movi v0.4s, #64 - eor v13.16b, v7.16b, v0.16b - tbl v13.16b, { v13.16b }, v16.16b - add v6.4s, v13.4s, v3.4s - dup v5.4s, w11 - tbl v12.16b, { v12.16b }, v16.16b - add v1.4s, v14.4s, v1.4s - eor v9.16b, v6.16b, v2.16b - ldp q2, q0, [sp, #192] - add v5.4s, v12.4s, v5.4s - eor v19.16b, v1.16b, v10.16b - eor v10.16b, v5.16b, v11.16b - ushr v11.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - orr v11.16b, v19.16b, v11.16b - ushr v19.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - mov v22.d[1], v15.d[0] - orr v10.16b, v10.16b, v19.16b - ushr v19.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - add v15.4s, v0.4s, v2.4s - orr v9.16b, v9.16b, v19.16b - dup v19.4s, w6 - add v15.4s, v15.4s, v26.4s - eor v19.16b, v15.16b, v19.16b - tbl v3.16b, { v19.16b }, v16.16b - dup v19.4s, w13 - add v8.4s, v3.4s, v19.4s - ldur q31, [x29, #-208] - eor v19.16b, v8.16b, v2.16b - ushr v0.4s, v19.4s, #12 - shl v19.4s, v19.4s, #20 - orr v2.16b, v19.16b, v0.16b - ldr q19, [x14, :lo12:.LCPI2_2] - add v17.4s, v17.4s, v31.4s - add v17.4s, v17.4s, v11.4s - eor v14.16b, v14.16b, v17.16b - tbl v14.16b, { v14.16b }, v19.16b - add v1.4s, v1.4s, v14.4s - eor v11.16b, v1.16b, v11.16b - add v4.4s, v4.4s, v18.4s - ushr v0.4s, v11.4s, #7 - shl v11.4s, v11.4s, #25 - add v4.4s, v4.4s, v10.4s - orr v0.16b, v11.16b, v0.16b - eor v11.16b, v12.16b, v4.16b - tbl v11.16b, { v11.16b }, v19.16b - add v5.4s, v5.4s, v11.4s - eor v10.16b, v5.16b, v10.16b - add v7.4s, v7.4s, v25.4s - ushr v12.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - add v7.4s, v7.4s, v9.4s - orr v10.16b, v10.16b, v12.16b - eor v12.16b, v13.16b, v7.16b - tbl v12.16b, { v12.16b }, v19.16b - add v6.4s, v6.4s, v12.4s - eor v9.16b, v6.16b, v9.16b - ushr v13.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - orr v9.16b, v9.16b, v13.16b - add v13.4s, v15.4s, v24.4s - add v13.4s, v13.4s, v2.4s - eor v3.16b, v3.16b, v13.16b - tbl v3.16b, { v3.16b }, v19.16b - add v8.4s, v8.4s, v3.4s - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v30.4s - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v20.4s - orr v2.16b, v2.16b, v15.16b - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v21.4s - tbl v3.16b, { v3.16b }, v16.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v22.4s - mov v28.16b, v26.16b - stur q26, [x29, #-112] - mov v26.16b, v18.16b - mov v18.16b, v24.16b - stur q24, [x29, #-160] - add v6.4s, v6.4s, v3.4s - mov v24.16b, v20.16b - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - ldr q20, [sp, #80] - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v13.16b - stp q30, q22, [x29, #-192] - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - mov v30.16b, v27.16b - add v17.4s, v17.4s, v27.4s - ldr q27, [sp, #224] - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, 
v2.4s, #20 - eor v0.16b, v5.16b, v0.16b - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v20.4s - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v23.4s - orr v0.16b, v0.16b, v15.16b - tbl v3.16b, { v3.16b }, v19.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v27.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v13.16b - stur q21, [x29, #-144] - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - ldur q21, [x29, #-80] - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 + str q0, [sp, #80] +.LBB3_5: + add x23, x16, x15 + add x24, x17, x15 + add x22, x19, x15 + add x25, x20, x15 + subs x21, x21, #1 + add x15, x15, #64 + ldp q1, q2, [x23] + csel w27, w9, wzr, eq + orr w26, w27, w26 + and w26, w26, #0xff + ldp q4, q5, [x24] + dup v0.4s, w26 + mov w26, w6 + zip1 v22.4s, v1.4s, v4.4s + zip2 v20.4s, v1.4s, v4.4s + ldp q6, q7, [x22] + zip1 v17.4s, v2.4s, v5.4s + zip2 v23.4s, v2.4s, v5.4s + ldp q16, q21, [x25] + zip1 v19.4s, v6.4s, v16.4s + zip2 v1.4s, v6.4s, v16.4s + ldp q27, q28, [x23, #32] + zip1 v4.4s, v7.4s, v21.4s + zip2 v5.4s, v7.4s, v21.4s + zip2 v15.2d, v17.2d, v4.2d + ldp q9, q10, [x24, #32] + mov v17.d[1], v4.d[0] + add v4.4s, v30.4s, v25.4s + zip2 v11.2d, v23.2d, v5.2d + zip2 v3.4s, v27.4s, v9.4s + zip1 v7.4s, v27.4s, v9.4s + ldp q12, q6, [x22, #32] + mov v23.d[1], v5.d[0] + stp q11, q3, [sp, #256] + add v5.4s, v31.4s, v26.4s + add v4.4s, v4.4s, v17.4s + str q23, [sp, #352] + ldp q16, q2, [x25, #32] + add v5.4s, v5.4s, v23.4s + zip1 v3.4s, v12.4s, v16.4s eor v0.16b, v5.16b, v0.16b - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - orr v0.16b, v0.16b, v15.16b - add v17.4s, v17.4s, v21.4s - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v26.4s - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v18.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v29.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - eor v0.16b, v0.16b, v1.16b + zip1 v9.4s, v6.4s, v2.4s + zip2 v2.4s, v6.4s, v2.4s + stp q7, q3, [sp, #208] + zip2 v3.4s, v12.4s, v16.4s + zip1 v12.4s, v28.4s, v10.4s + zip2 v10.4s, v28.4s, v10.4s + stp q17, q2, [sp, #160] + zip2 v28.2d, v22.2d, v19.2d + mov v22.d[1], v19.d[0] + str q3, [sp, #240] + add v2.4s, v8.4s, v18.4s + eor v16.16b, v4.16b, v13.16b + dup v17.4s, w13 + mov v3.16b, v22.16b + stp q22, q28, [sp, #320] + zip2 v22.2d, v20.2d, v1.2d + mov v20.d[1], v1.d[0] + add v1.4s, v29.4s, v24.4s + add v4.4s, v4.4s, v15.4s add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - eor v3.16b, v3.16b, v13.16b - ldur q22, [x29, #-64] - ushr v15.4s, v0.4s, #12 + add v2.4s, v2.4s, v20.4s + stp q15, q20, [sp, #288] + add v1.4s, v1.4s, v3.4s + ldr q3, [sp, #96] + dup v20.4s, w14 + mov v23.16b, v22.16b + mov v15.16b, v10.16b + eor v6.16b, v1.16b, v3.16b + ldr q3, [sp, #80] + add v1.4s, v1.4s, v28.4s + ldr q28, [sp, 
#272] + str q23, [sp, #128] + eor v7.16b, v2.16b, v3.16b + ldp q27, q3, [sp, #32] + add v2.4s, v2.4s, v22.4s + tbl v6.16b, { v6.16b }, v27.16b + tbl v7.16b, { v7.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + tbl v0.16b, { v0.16b }, v27.16b + add v19.4s, v6.4s, v14.4s + add v21.4s, v7.4s, v3.4s + add v30.4s, v16.4s, v17.4s + add v31.4s, v0.4s, v20.4s + eor v24.16b, v19.16b, v24.16b + eor v17.16b, v21.16b, v18.16b + ushr v18.4s, v24.4s, #12 + shl v20.4s, v24.4s, #20 + eor v24.16b, v30.16b, v25.16b + eor v25.16b, v31.16b, v26.16b + ushr v26.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + ushr v29.4s, v24.4s, #12 + shl v24.4s, v24.4s, #20 + ushr v8.4s, v25.4s, #12 + shl v25.4s, v25.4s, #20 + orr v3.16b, v20.16b, v18.16b + ldr q18, [x10, :lo12:.LCPI3_2] + orr v13.16b, v17.16b, v26.16b + orr v24.16b, v24.16b, v29.16b + orr v14.16b, v25.16b, v8.16b + add v8.4s, v1.4s, v3.4s + add v29.4s, v2.4s, v13.4s + add v17.4s, v4.4s, v24.4s + add v20.4s, v5.4s, v14.4s + eor v1.16b, v6.16b, v8.16b + eor v2.16b, v7.16b, v29.16b + eor v4.16b, v16.16b, v17.16b + eor v0.16b, v0.16b, v20.16b + tbl v25.16b, { v1.16b }, v18.16b + tbl v16.16b, { v2.16b }, v18.16b + tbl v6.16b, { v4.16b }, v18.16b + tbl v4.16b, { v0.16b }, v18.16b + add v19.4s, v19.4s, v25.4s + add v21.4s, v21.4s, v16.4s + add v26.4s, v30.4s, v6.4s + add v7.4s, v31.4s, v4.4s + eor v0.16b, v19.16b, v3.16b + eor v1.16b, v21.16b, v13.16b + eor v2.16b, v26.16b, v24.16b + eor v3.16b, v7.16b, v14.16b + ushr v5.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v24.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + ushr v30.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v5.16b, v0.16b, v5.16b + orr v0.16b, v1.16b, v24.16b + ushr v31.4s, v3.4s, #7 + orr v2.16b, v2.16b, v30.16b + ldp q24, q30, [sp, #208] + shl v3.4s, v3.4s, #25 + zip2 v14.2d, v12.2d, v9.2d + mov v22.16b, v24.16b + orr v1.16b, v3.16b, v31.16b + zip2 v3.2d, v24.2d, v30.2d + mov v24.16b, v28.16b + mov v22.d[1], v30.d[0] + ldr q30, [sp, #240] + mov v31.16b, v12.16b + stp q22, q14, [sp, #224] + mov v24.d[1], v30.d[0] + add v12.4s, v8.4s, v22.4s + mov v31.d[1], v9.d[0] + add v22.4s, v29.4s, v24.4s + ldr q29, [sp, #176] + zip2 v28.2d, v28.2d, v30.2d + mov v9.16b, v24.16b + mov v15.d[1], v29.d[0] + zip2 v8.2d, v10.2d, v29.2d + add v10.4s, v12.4s, v0.4s + add v22.4s, v22.4s, v2.4s + str q9, [sp, #144] + add v20.4s, v20.4s, v15.4s + add v17.4s, v17.4s, v31.4s + stp q3, q8, [sp, #192] + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v27.16b + tbl v25.16b, { v25.16b }, v27.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v16.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - add v17.4s, v17.4s, v28.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v24.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 + ushr v13.4s, v2.4s, #12 shl v2.4s, v2.4s, #20 - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v22.4s - 
orr v2.16b, v2.16b, v15.16b - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v23.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - eor v3.16b, v3.16b, v13.16b - ldur q22, [x29, #-144] - ushr v15.4s, v0.4s, #7 + eor v5.16b, v21.16b, v5.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v10.4s, v10.4s, v3.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v22.4s, v22.4s, v28.4s + ushr v12.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v10.4s, v10.4s, v0.4s + orr v5.16b, v5.16b, v13.16b + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v8.4s + orr v1.16b, v1.16b, v12.16b + add v17.4s, v17.4s, v14.4s + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v18.16b + tbl v25.16b, { v25.16b }, v18.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v18.16b + tbl v16.16b, { v16.16b }, v18.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v19.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v31.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 + ushr v13.4s, v2.4s, #7 shl v2.4s, v2.4s, #25 - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v22.4s - orr v2.16b, v2.16b, v15.16b - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v30.4s - tbl v3.16b, { v3.16b }, v16.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v27.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - ldr q27, [sp, #96] - mov v21.16b, v26.16b - stur q26, [x29, #-96] - mov v28.16b, v31.16b - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v13.16b - ldp q31, q26, [x29, #-192] - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - add v17.4s, v17.4s, v20.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v0.16b, v5.16b, v0.16b - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v27.4s - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #12 + eor v5.16b, v21.16b, v5.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v22.4s, v22.4s, v23.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v17.4s, v17.4s, v11.4s + mov v30.16b, v28.16b + mov v28.16b, v23.16b + ldr q23, [sp, #304] + ushr v12.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v22.4s, v22.4s, v0.4s + mov v29.16b, v31.16b + ldr q31, [sp, #160] + orr v5.16b, v5.16b, v13.16b + add v17.4s, 
v17.4s, v2.4s + add v10.4s, v10.4s, v23.4s + orr v1.16b, v1.16b, v12.16b + str q29, [sp, #272] + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v31.4s + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v27.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v27.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v27.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v27.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v26.4s - orr v0.16b, v0.16b, v15.16b - tbl v3.16b, { v3.16b }, v19.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v31.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v13.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v0.16b, v5.16b, v0.16b - mov v18.16b, v24.16b - mov v24.16b, v20.16b - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #7 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v5.16b, v5.16b, v19.16b + add v22.4s, v22.4s, v24.4s + ldr q24, [sp, #320] + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v17.4s, v17.4s, v24.4s + ldr q24, [sp, #352] + ushr v13.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v22.4s, v22.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v24.4s + ldr q24, [sp, #336] + orr v1.16b, v1.16b, v13.16b + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v14.4s + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v18.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v18.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v18.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v18.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - ldur q20, [x29, #-160] - orr v0.16b, v0.16b, v15.16b - add v17.4s, v17.4s, v21.4s - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v18.4s - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v23.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v20.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - eor v3.16b, v3.16b, v13.16b - ldur q25, [x29, #-80] - ushr v15.4s, v0.4s, #12 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v5.16b, v19.16b, v5.16b + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + add v10.4s, v10.4s, v24.4s + orr 
v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v22.4s, v22.4s, v29.4s + ushr v13.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v10.4s, v10.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v8.4s + ldr q8, [sp, #288] + orr v1.16b, v1.16b, v13.16b + add v17.4s, v17.4s, v3.4s + ldr q3, [sp, #352] + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v27.16b + tbl v25.16b, { v25.16b }, v27.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v16.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - add v17.4s, v17.4s, v29.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v22.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 + ushr v13.4s, v2.4s, #12 shl v2.4s, v2.4s, #20 - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s + eor v5.16b, v21.16b, v5.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v10.4s, v10.4s, v30.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v22.4s, v22.4s, v8.4s + mov v24.16b, v30.16b + mov v30.16b, v15.16b + add v17.4s, v17.4s, v15.4s + ldr q15, [sp, #224] + ushr v12.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v10.4s, v10.4s, v0.4s + str q30, [sp, #176] + orr v5.16b, v5.16b, v13.16b + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v15.4s + orr v1.16b, v1.16b, v12.16b + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v18.16b + tbl v25.16b, { v25.16b }, v18.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s add v7.4s, v7.4s, v25.4s - orr v2.16b, v2.16b, v15.16b - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v26.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - ldur q25, [x29, #-112] - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - eor v3.16b, v3.16b, v13.16b - ushr v15.4s, v0.4s, #7 + tbl v6.16b, { v6.16b }, v18.16b + tbl v16.16b, { v16.16b }, v18.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v19.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v25.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 + ushr v13.4s, v2.4s, #7 shl v2.4s, v2.4s, #25 - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v30.4s - orr v2.16b, 
v2.16b, v15.16b - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v24.4s - tbl v3.16b, { v3.16b }, v16.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v31.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - ldur q25, [x29, #-64] - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v13.16b - ldr q31, [sp, #224] - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - add v17.4s, v17.4s, v27.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v0.16b, v5.16b, v0.16b - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v25.4s - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #12 + eor v5.16b, v21.16b, v5.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v22.4s, v22.4s, v9.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v17.4s, v17.4s, v14.4s + ushr v12.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v22.4s, v22.4s, v0.4s + orr v5.16b, v5.16b, v13.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v28.4s + orr v1.16b, v1.16b, v12.16b + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v11.4s + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v27.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v27.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v27.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v27.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v31.4s - orr v0.16b, v0.16b, v15.16b - tbl v3.16b, { v3.16b }, v19.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v28.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v13.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v0.16b, v5.16b, v0.16b - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #7 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v5.16b, v5.16b, v19.16b + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + add v22.4s, v22.4s, v29.4s + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v17.4s, v17.4s, v23.4s + ushr v13.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v22.4s, v22.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v31.4s + orr v1.16b, v1.16b, v13.16b + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v30.4s + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl 
v16.16b, { v16.16b }, v18.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v18.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v18.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v18.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - orr v0.16b, v0.16b, v15.16b - add v17.4s, v17.4s, v18.4s - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v22.4s - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v26.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v23.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - mov v21.16b, v29.16b - stur q29, [x29, #-128] - mov v29.16b, v30.16b - mov v30.16b, v27.16b - mov v27.16b, v18.16b - str q18, [sp, #176] - eor v0.16b, v0.16b, v1.16b - mov v18.16b, v22.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - eor v3.16b, v3.16b, v13.16b - ldur q22, [x29, #-96] - ushr v15.4s, v0.4s, #12 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v5.16b, v19.16b, v5.16b + add v10.4s, v10.4s, v3.4s + ldr q3, [sp, #192] + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v22.4s, v22.4s, v3.4s + ushr v13.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v10.4s, v10.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v15.4s + ldr q15, [sp, #128] + orr v1.16b, v1.16b, v13.16b + add v17.4s, v17.4s, v24.4s + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v27.16b + tbl v25.16b, { v25.16b }, v27.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v16.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - add v17.4s, v17.4s, v20.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v29.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 + ushr v13.4s, v2.4s, #12 shl v2.4s, v2.4s, #20 - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v22.4s - orr v2.16b, v2.16b, v15.16b - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v31.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - eor v3.16b, v3.16b, v13.16b - ushr v15.4s, v0.4s, #7 + eor v5.16b, v21.16b, v5.16b + ldp q23, q11, [sp, #320] + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v10.4s, v10.4s, v8.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #12 + shl v5.4s, 
v5.4s, #20 + add v22.4s, v22.4s, v23.4s + ushr v12.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v10.4s, v10.4s, v0.4s + mov v28.16b, v31.16b + mov v31.16b, v8.16b + ldr q8, [sp, #208] + orr v5.16b, v5.16b, v13.16b + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v11.4s + orr v1.16b, v1.16b, v12.16b + add v17.4s, v17.4s, v8.4s + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v18.16b + tbl v25.16b, { v25.16b }, v18.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v18.16b + tbl v16.16b, { v16.16b }, v18.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v19.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v21.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 + ushr v13.4s, v2.4s, #7 shl v2.4s, v2.4s, #25 - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v24.4s - orr v2.16b, v2.16b, v15.16b - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v30.4s - tbl v3.16b, { v3.16b }, v16.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v28.4s - add v6.4s, v6.4s, v3.4s - mov v22.16b, v24.16b - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - ldur q24, [x29, #-80] - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - mov v21.16b, v30.16b - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v13.16b - ldur q30, [x29, #-192] - mov v20.16b, v29.16b - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - ldur q29, [x29, #-112] - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - add v17.4s, v17.4s, v25.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v0.16b, v5.16b, v0.16b - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v24.4s - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #12 + eor v5.16b, v21.16b, v5.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v22.4s, v22.4s, v29.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v17.4s, v17.4s, v30.4s + ushr v12.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v22.4s, v22.4s, v0.4s + orr v5.16b, v5.16b, v13.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v9.4s + orr v1.16b, v1.16b, v12.16b + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v14.4s + ldr q14, [sp, #256] + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v27.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v27.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v27.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v27.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v3.16b, 
v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v30.4s - orr v0.16b, v0.16b, v15.16b - tbl v3.16b, { v3.16b }, v19.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v29.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v13.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v0.16b, v5.16b, v0.16b - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #7 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v5.16b, v5.16b, v19.16b + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + add v22.4s, v22.4s, v3.4s + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v17.4s, v17.4s, v15.4s + ushr v13.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v22.4s, v22.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v14.4s + orr v1.16b, v1.16b, v13.16b + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v8.4s + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v18.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v18.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v18.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v18.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - orr v0.16b, v0.16b, v15.16b - add v17.4s, v17.4s, v18.4s - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v20.4s - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v31.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v26.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - eor v3.16b, v3.16b, v13.16b - ushr v15.4s, v0.4s, #12 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v5.16b, v19.16b, v5.16b + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + add v10.4s, v10.4s, v28.4s + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v22.4s, v22.4s, v24.4s + ushr v13.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v10.4s, v10.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v11.4s + ldr q11, [sp, #304] + orr v1.16b, v1.16b, v13.16b + add v17.4s, v17.4s, v31.4s + ldr q31, [sp, #224] + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v27.16b + tbl v25.16b, { v25.16b }, v27.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add 
v21.4s, v21.4s, v6.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v16.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - add v17.4s, v17.4s, v23.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, v22.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 + ushr v13.4s, v2.4s, #12 shl v2.4s, v2.4s, #20 - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v27.4s - orr v2.16b, v2.16b, v15.16b - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v30.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - ldur q27, [x29, #-160] - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - eor v3.16b, v3.16b, v13.16b - ushr v15.4s, v0.4s, #7 + eor v5.16b, v21.16b, v5.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v10.4s, v10.4s, v23.4s + ldr q23, [sp, #240] + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v22.4s, v22.4s, v11.4s + mov v30.16b, v8.16b + mov v8.16b, v24.16b + ldr q24, [sp, #352] + ushr v12.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v10.4s, v10.4s, v0.4s + orr v5.16b, v5.16b, v13.16b + str q8, [sp, #112] + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v24.4s + orr v1.16b, v1.16b, v12.16b + add v17.4s, v17.4s, v31.4s + eor v4.16b, v4.16b, v10.16b + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v18.16b + tbl v25.16b, { v25.16b }, v18.16b + eor v6.16b, v6.16b, v20.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v18.16b + tbl v16.16b, { v16.16b }, v18.16b + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + mov v29.16b, v3.16b + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v19.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v27.4s - mov v28.16b, v25.16b - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 + ushr v13.4s, v2.4s, #7 shl v2.4s, v2.4s, #25 - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v21.4s - orr v2.16b, v2.16b, v15.16b - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v28.4s - tbl v3.16b, { v3.16b }, v16.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v29.4s - mov v25.16b, v31.16b - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - ldur q31, [x29, #-96] - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v13.16b - ldur q28, [x29, #-208] - mov v18.16b, v20.16b - str q20, [sp, #144] - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { 
v12.16b }, v16.16b - ldur q20, [x29, #-128] - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - add v17.4s, v17.4s, v24.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - eor v0.16b, v5.16b, v0.16b - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v31.4s - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #12 + eor v5.16b, v21.16b, v5.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v22.4s, v22.4s, v29.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v17.4s, v17.4s, v30.4s + ldr q30, [sp, #272] + ushr v12.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v22.4s, v22.4s, v0.4s + mov v3.16b, v28.16b + ldr q28, [sp, #176] + orr v5.16b, v5.16b, v13.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v30.4s + orr v1.16b, v1.16b, v12.16b + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v28.4s + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v27.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v27.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v27.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v27.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v28.4s - orr v0.16b, v0.16b, v15.16b - tbl v3.16b, { v3.16b }, v19.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v13.4s, v13.4s, v20.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v0.4s - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v13.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v1.16b, v2.16b - add v5.4s, v5.4s, v12.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - eor v0.16b, v5.16b, v0.16b - orr v2.16b, v2.16b, v15.16b - ushr v15.4s, v0.4s, #7 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v5.16b, v5.16b, v19.16b + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + add v22.4s, v22.4s, v8.4s + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v17.4s, v17.4s, v9.4s + ldr q9, [sp, #320] + ushr v13.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v22.4s, v22.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v17.4s, v17.4s, v2.4s + add v10.4s, v10.4s, v23.4s + orr v1.16b, v1.16b, v13.16b + eor v16.16b, v16.16b, v22.16b + add v20.4s, v20.4s, v31.4s + eor v6.16b, v6.16b, v17.16b + add v10.4s, v10.4s, v5.4s + tbl v16.16b, { v16.16b }, v18.16b + add v20.4s, v20.4s, v1.4s + tbl v6.16b, { v6.16b }, v18.16b + eor v25.16b, v25.16b, v10.16b + add v21.4s, v21.4s, v16.4s + eor v4.16b, v4.16b, v20.16b + add v26.4s, v26.4s, v6.4s + tbl v25.16b, { v25.16b }, v18.16b + eor v0.16b, v21.16b, v0.16b + tbl v4.16b, { v4.16b }, v18.16b + eor v2.16b, v26.16b, v2.16b + add v19.4s, v19.4s, v25.4s + ushr v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - orr v0.16b, v0.16b, v15.16b - add v17.4s, v17.4s, v18.4s - add v17.4s, v17.4s, v0.4s - add v4.4s, v4.4s, 
v22.4s - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v30.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v25.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v16.16b - eor v3.16b, v3.16b, v13.16b - add v17.4s, v17.4s, v26.4s - mov v26.16b, v21.16b - add v4.4s, v4.4s, v21.4s - ldur q21, [x29, #-144] - ushr v15.4s, v0.4s, #12 + add v7.4s, v7.4s, v4.4s + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v5.16b, v19.16b, v5.16b + add v10.4s, v10.4s, v14.4s + ldr q14, [sp, #288] + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v7.16b, v1.16b + orr v2.16b, v2.16b, v13.16b + ushr v12.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v22.4s, v22.4s, v14.4s + ushr v13.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + add v10.4s, v10.4s, v0.4s + orr v5.16b, v5.16b, v12.16b + add v22.4s, v22.4s, v2.4s + add v20.4s, v20.4s, v24.4s + orr v1.16b, v1.16b, v13.16b + eor v4.16b, v4.16b, v10.16b + add v17.4s, v17.4s, v9.4s + eor v25.16b, v25.16b, v22.16b + add v20.4s, v20.4s, v5.4s + tbl v4.16b, { v4.16b }, v27.16b + add v17.4s, v17.4s, v1.4s + tbl v25.16b, { v25.16b }, v27.16b + eor v6.16b, v6.16b, v20.16b + add v26.4s, v26.4s, v4.4s + eor v16.16b, v16.16b, v17.16b + add v7.4s, v7.4s, v25.4s + tbl v6.16b, { v6.16b }, v27.16b + eor v0.16b, v26.16b, v0.16b + tbl v16.16b, { v16.16b }, v27.16b + eor v2.16b, v7.16b, v2.16b + add v21.4s, v21.4s, v6.4s + ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v16.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v8.16b, v2.16b - add v17.4s, v17.4s, v0.4s - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 + add v19.4s, v19.4s, v16.4s + ushr v13.4s, v2.4s, #12 shl v2.4s, v2.4s, #20 - eor v14.16b, v14.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v7.4s, v7.4s, v21.4s - orr v2.16b, v2.16b, v15.16b - tbl v14.16b, { v14.16b }, v19.16b - eor v11.16b, v11.16b, v4.16b - add v7.4s, v7.4s, v9.4s - add v13.4s, v13.4s, v28.4s - add v1.4s, v1.4s, v14.4s - tbl v11.16b, { v11.16b }, v19.16b - eor v12.16b, v12.16b, v7.16b - add v13.4s, v13.4s, v2.4s - str q23, [sp, #160] - eor v0.16b, v0.16b, v1.16b - add v5.4s, v5.4s, v11.4s - tbl v12.16b, { v12.16b }, v19.16b - eor v3.16b, v3.16b, v13.16b - add v17.4s, v17.4s, v23.4s - ldur q23, [x29, #-64] - ushr v15.4s, v0.4s, #7 + eor v5.16b, v21.16b, v5.16b + orr v0.16b, v0.16b, v12.16b + eor v1.16b, v19.16b, v1.16b + add v10.4s, v10.4s, v11.4s + orr v2.16b, v2.16b, v13.16b + ushr v13.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + ushr v12.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v10.4s, v10.4s, v0.4s + add v22.4s, v22.4s, v15.4s + orr v5.16b, v5.16b, v13.16b + add v20.4s, v20.4s, v3.4s + mov v24.16b, v3.16b + ldr q3, [sp, #336] + orr v1.16b, v1.16b, v12.16b + eor v4.16b, v4.16b, v10.16b + add v22.4s, v22.4s, v2.4s + add v17.4s, v17.4s, v3.4s + add v20.4s, v20.4s, v5.4s + tbl v4.16b, { v4.16b }, v18.16b + eor v25.16b, v25.16b, v22.16b + add v17.4s, v17.4s, v1.4s + eor v6.16b, v6.16b, v20.16b + add v26.4s, v26.4s, v4.4s + tbl v25.16b, { v25.16b }, v18.16b + eor v16.16b, v16.16b, v17.16b + tbl v6.16b, { v6.16b }, v18.16b + eor v0.16b, 
v26.16b, v0.16b + add v7.4s, v7.4s, v25.4s + tbl v16.16b, { v16.16b }, v18.16b + add v21.4s, v21.4s, v6.4s + ushr v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 - eor v10.16b, v5.16b, v10.16b - add v6.4s, v6.4s, v12.4s - tbl v3.16b, { v3.16b }, v19.16b - orr v0.16b, v0.16b, v15.16b - ushr v15.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - eor v9.16b, v6.16b, v9.16b - add v8.4s, v8.4s, v3.4s - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v2.16b, v8.16b, v2.16b - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #7 + eor v2.16b, v7.16b, v2.16b + add v19.4s, v19.4s, v16.4s + eor v5.16b, v21.16b, v5.16b + orr v0.16b, v0.16b, v12.16b + ushr v12.4s, v2.4s, #7 shl v2.4s, v2.4s, #25 - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v23.4s - orr v2.16b, v2.16b, v15.16b - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v24.4s - tbl v3.16b, { v3.16b }, v16.16b - eor v14.16b, v14.16b, v4.16b - add v7.4s, v7.4s, v2.4s - add v6.4s, v6.4s, v3.4s - tbl v14.16b, { v14.16b }, v16.16b - eor v11.16b, v11.16b, v7.16b - add v13.4s, v13.4s, v20.4s - eor v10.16b, v6.16b, v10.16b - add v8.4s, v8.4s, v14.4s - tbl v11.16b, { v11.16b }, v16.16b - add v13.4s, v13.4s, v0.4s - ldr q20, [sp, #176] - ushr v15.4s, v10.4s, #12 - shl v10.4s, v10.4s, #20 - eor v9.16b, v8.16b, v9.16b - add v1.4s, v1.4s, v11.4s - eor v12.16b, v12.16b, v13.16b - orr v10.16b, v10.16b, v15.16b - ushr v15.4s, v9.4s, #12 - shl v9.4s, v9.4s, #20 - eor v2.16b, v1.16b, v2.16b - tbl v12.16b, { v12.16b }, v16.16b - orr v9.16b, v9.16b, v15.16b - ushr v15.4s, v2.4s, #12 - shl v2.4s, v2.4s, #20 - add v5.4s, v5.4s, v12.4s + eor v1.16b, v19.16b, v1.16b + ushr v13.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v22.4s, v22.4s, v8.4s + orr v2.16b, v2.16b, v12.16b + ushr v12.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + orr v5.16b, v5.16b, v13.16b + add v22.4s, v22.4s, v0.4s + add v10.4s, v10.4s, v29.4s + ldr q29, [sp, #208] add v17.4s, v17.4s, v31.4s - orr v2.16b, v2.16b, v15.16b - eor v0.16b, v5.16b, v0.16b - add v17.4s, v17.4s, v10.4s - add v4.4s, v4.4s, v20.4s - add v7.4s, v7.4s, v29.4s - ushr v15.4s, v0.4s, #12 + orr v1.16b, v1.16b, v12.16b + add v20.4s, v20.4s, v29.4s + eor v16.16b, v16.16b, v22.16b + add v10.4s, v10.4s, v5.4s + add v17.4s, v17.4s, v2.4s + add v20.4s, v20.4s, v1.4s + tbl v16.16b, { v16.16b }, v27.16b + eor v25.16b, v25.16b, v10.16b + eor v6.16b, v6.16b, v17.16b + eor v4.16b, v4.16b, v20.16b + add v21.4s, v21.4s, v16.4s + tbl v25.16b, { v25.16b }, v27.16b + tbl v6.16b, { v6.16b }, v27.16b + tbl v4.16b, { v4.16b }, v27.16b + eor v0.16b, v21.16b, v0.16b + add v19.4s, v19.4s, v25.4s + add v26.4s, v26.4s, v6.4s + add v7.4s, v7.4s, v4.4s + ushr v12.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v3.16b, v3.16b, v17.16b - add v4.4s, v4.4s, v9.4s - add v7.4s, v7.4s, v2.4s - orr v0.16b, v0.16b, v15.16b - mov v15.16b, v31.16b - add v17.4s, v17.4s, v22.4s - eor v31.16b, v14.16b, v4.16b - eor v22.16b, v11.16b, v7.16b - add v11.4s, v13.4s, v27.4s - tbl v3.16b, { v3.16b }, v19.16b - add v11.4s, v11.4s, v0.4s - tbl v31.16b, { v31.16b }, v19.16b - add v6.4s, v6.4s, v3.4s - eor v12.16b, v12.16b, v11.16b - tbl v22.16b, { v22.16b }, v19.16b - add v8.4s, v8.4s, v31.4s - eor v10.16b, v6.16b, v10.16b - add v30.4s, v11.4s, v30.4s - tbl v11.16b, { v12.16b }, v19.16b - add v1.4s, v1.4s, v22.4s - eor v9.16b, v8.16b, v9.16b - ushr v12.4s, v10.4s, #7 - shl v10.4s, v10.4s, #25 - add v5.4s, v5.4s, v11.4s - eor v2.16b, v1.16b, v2.16b - orr v10.16b, v10.16b, v12.16b - ushr v12.4s, v9.4s, #7 - shl v9.4s, v9.4s, #25 - eor v0.16b, 
v5.16b, v0.16b - orr v9.16b, v9.16b, v12.16b - ushr v12.4s, v2.4s, #7 + eor v5.16b, v5.16b, v19.16b + eor v2.16b, v26.16b, v2.16b + eor v1.16b, v7.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + ushr v12.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v22.4s, v22.4s, v14.4s + mov v8.16b, v31.16b + ushr v13.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + mov v31.16b, v14.16b + ushr v14.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + orr v5.16b, v5.16b, v12.16b + add v22.4s, v22.4s, v0.4s + add v10.4s, v10.4s, v28.4s + ldr q28, [sp, #352] + orr v2.16b, v2.16b, v13.16b + orr v1.16b, v1.16b, v14.16b + add v17.4s, v17.4s, v30.4s + add v20.4s, v20.4s, v3.4s + eor v16.16b, v16.16b, v22.16b + add v10.4s, v10.4s, v5.4s + add v17.4s, v17.4s, v2.4s + add v20.4s, v20.4s, v1.4s + tbl v16.16b, { v16.16b }, v18.16b + eor v25.16b, v25.16b, v10.16b + eor v6.16b, v6.16b, v17.16b + eor v4.16b, v4.16b, v20.16b + add v21.4s, v21.4s, v16.4s + tbl v25.16b, { v25.16b }, v18.16b + tbl v6.16b, { v6.16b }, v18.16b + tbl v4.16b, { v4.16b }, v18.16b + eor v0.16b, v21.16b, v0.16b + add v19.4s, v19.4s, v25.4s + add v26.4s, v26.4s, v6.4s + add v7.4s, v7.4s, v4.4s + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v5.16b, v19.16b, v5.16b + eor v2.16b, v26.16b, v2.16b + eor v1.16b, v7.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + ushr v12.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + add v10.4s, v10.4s, v23.4s + ushr v13.4s, v2.4s, #7 shl v2.4s, v2.4s, #25 + ushr v14.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + orr v5.16b, v5.16b, v12.16b + add v10.4s, v10.4s, v0.4s + add v20.4s, v20.4s, v24.4s + ldr q24, [sp, #144] + orr v2.16b, v2.16b, v13.16b + orr v1.16b, v1.16b, v14.16b + add v22.4s, v22.4s, v9.4s + add v17.4s, v17.4s, v11.4s + eor v4.16b, v4.16b, v10.16b + add v20.4s, v20.4s, v5.4s + add v22.4s, v22.4s, v2.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v27.16b + eor v6.16b, v6.16b, v20.16b + eor v25.16b, v25.16b, v22.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + tbl v6.16b, { v6.16b }, v27.16b + tbl v25.16b, { v25.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + eor v0.16b, v26.16b, v0.16b + add v21.4s, v21.4s, v6.4s + add v7.4s, v7.4s, v25.4s + add v19.4s, v19.4s, v16.4s + ushr v12.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v5.16b, v21.16b, v5.16b + eor v2.16b, v7.16b, v2.16b + eor v1.16b, v19.16b, v1.16b + orr v0.16b, v0.16b, v12.16b + add v10.4s, v10.4s, v15.4s + ushr v14.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + mov v30.16b, v3.16b + ldr q3, [sp, #256] + ushr v12.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + ushr v13.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + add v10.4s, v10.4s, v0.4s + orr v5.16b, v5.16b, v14.16b + add v20.4s, v20.4s, v3.4s orr v2.16b, v2.16b, v12.16b + orr v1.16b, v1.16b, v13.16b + add v22.4s, v22.4s, v24.4s + add v17.4s, v17.4s, v28.4s + eor v4.16b, v4.16b, v10.16b + add v20.4s, v20.4s, v5.4s + add v22.4s, v22.4s, v2.4s + add v17.4s, v17.4s, v1.4s + tbl v4.16b, { v4.16b }, v18.16b + eor v6.16b, v6.16b, v20.16b + eor v25.16b, v25.16b, v22.16b + eor v16.16b, v16.16b, v17.16b + add v26.4s, v26.4s, v4.4s + tbl v6.16b, { v6.16b }, v18.16b + tbl v25.16b, { v25.16b }, v18.16b + tbl v16.16b, { v16.16b }, v18.16b + eor v0.16b, v26.16b, v0.16b + add v21.4s, v21.4s, v6.4s + add v7.4s, v7.4s, v25.4s + add v19.4s, v19.4s, v16.4s ushr v12.4s, v0.4s, #7 shl v0.4s, v0.4s, #25 + eor v5.16b, v21.16b, v5.16b + eor v2.16b, v7.16b, v2.16b + eor v1.16b, v19.16b, v1.16b orr v0.16b, v0.16b, v12.16b - add v4.4s, v4.4s, v26.4s - add v17.4s, v17.4s, v0.4s - add v7.4s, v7.4s, v28.4s - mov v18.16b, v27.16b - eor v31.16b, 
v31.16b, v17.16b - add v4.4s, v4.4s, v10.4s - add v27.4s, v30.4s, v2.4s - eor v22.16b, v22.16b, v4.16b - add v7.4s, v7.4s, v9.4s - eor v3.16b, v3.16b, v27.16b - add v26.4s, v27.4s, v29.4s - tbl v27.16b, { v31.16b }, v16.16b - eor v28.16b, v11.16b, v7.16b - tbl v22.16b, { v22.16b }, v16.16b - add v1.4s, v1.4s, v27.4s - add v4.4s, v4.4s, v23.4s - ldr q23, [sp, #144] - tbl v28.16b, { v28.16b }, v16.16b - tbl v3.16b, { v3.16b }, v16.16b - add v5.4s, v5.4s, v22.4s - eor v0.16b, v0.16b, v1.16b - add v6.4s, v6.4s, v28.4s - add v29.4s, v8.4s, v3.4s - eor v30.16b, v5.16b, v10.16b - ushr v8.4s, v0.4s, #12 + ushr v12.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + mov v23.16b, v9.16b + ldr q9, [sp, #112] + ushr v13.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ushr v14.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + orr v5.16b, v5.16b, v12.16b + add v9.4s, v10.4s, v9.4s + orr v2.16b, v2.16b, v13.16b + orr v1.16b, v1.16b, v14.16b + ldr q14, [sp, #64] + add v22.4s, v22.4s, v31.4s + add v17.4s, v17.4s, v30.4s + add v20.4s, v20.4s, v8.4s + add v9.4s, v9.4s, v5.4s + add v22.4s, v22.4s, v0.4s + add v17.4s, v17.4s, v2.4s + add v20.4s, v20.4s, v1.4s + eor v25.16b, v25.16b, v9.16b + eor v16.16b, v16.16b, v22.16b + eor v6.16b, v6.16b, v17.16b + eor v4.16b, v4.16b, v20.16b + tbl v25.16b, { v25.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + tbl v6.16b, { v6.16b }, v27.16b + tbl v4.16b, { v4.16b }, v27.16b + add v19.4s, v19.4s, v25.4s + add v21.4s, v21.4s, v16.4s + add v26.4s, v26.4s, v6.4s + add v7.4s, v7.4s, v4.4s + eor v5.16b, v5.16b, v19.16b + eor v0.16b, v21.16b, v0.16b + eor v2.16b, v26.16b, v2.16b + eor v1.16b, v7.16b, v1.16b + ushr v30.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + ushr v10.4s, v0.4s, #12 shl v0.4s, v0.4s, #20 - eor v31.16b, v6.16b, v9.16b - orr v0.16b, v0.16b, v8.16b - ushr v8.4s, v30.4s, #12 - shl v30.4s, v30.4s, #20 - eor v2.16b, v29.16b, v2.16b - orr v30.16b, v30.16b, v8.16b - ushr v8.4s, v31.4s, #12 - shl v31.4s, v31.4s, #20 - add v17.4s, v17.4s, v25.4s - add v7.4s, v7.4s, v23.4s - orr v31.16b, v31.16b, v8.16b - ushr v8.4s, v2.4s, #12 + ushr v12.4s, v2.4s, #12 shl v2.4s, v2.4s, #20 - ldur q23, [x29, #-176] - orr v2.16b, v2.16b, v8.16b - add v17.4s, v17.4s, v0.4s - eor v27.16b, v27.16b, v17.16b - add v4.4s, v4.4s, v30.4s - add v25.4s, v26.4s, v2.4s - eor v22.16b, v22.16b, v4.16b - add v4.4s, v4.4s, v24.4s - add v7.4s, v7.4s, v31.4s - eor v3.16b, v3.16b, v25.16b - add v24.4s, v25.4s, v18.4s - tbl v25.16b, { v27.16b }, v19.16b + ushr v13.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + orr v5.16b, v5.16b, v30.16b + add v30.4s, v9.4s, v29.4s + add v22.4s, v22.4s, v23.4s + ldr q23, [sp, #192] + orr v0.16b, v0.16b, v10.16b + orr v2.16b, v2.16b, v12.16b + orr v1.16b, v1.16b, v13.16b add v17.4s, v17.4s, v23.4s - eor v23.16b, v28.16b, v7.16b - tbl v22.16b, { v22.16b }, v19.16b - add v1.4s, v1.4s, v25.4s - tbl v23.16b, { v23.16b }, v19.16b - tbl v3.16b, { v3.16b }, v19.16b - add v5.4s, v5.4s, v22.4s - eor v0.16b, v0.16b, v1.16b - add v6.4s, v6.4s, v23.4s - add v26.4s, v29.4s, v3.4s - eor v27.16b, v5.16b, v30.16b - ushr v29.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - eor v28.16b, v6.16b, v31.16b - orr v0.16b, v0.16b, v29.16b - ushr v29.4s, v27.4s, #7 - shl v27.4s, v27.4s, #25 + add v20.4s, v20.4s, v28.4s + add v23.4s, v30.4s, v5.4s + add v22.4s, v22.4s, v0.4s + add v17.4s, v17.4s, v2.4s + add v20.4s, v20.4s, v1.4s + eor v25.16b, v25.16b, v23.16b + eor v16.16b, v16.16b, v22.16b + eor v6.16b, v6.16b, v17.16b + eor v4.16b, v4.16b, v20.16b + tbl v25.16b, { v25.16b }, v18.16b + tbl v16.16b, { v16.16b }, v18.16b + tbl v6.16b, { 
v6.16b }, v18.16b + tbl v4.16b, { v4.16b }, v18.16b + add v19.4s, v19.4s, v25.4s + add v21.4s, v21.4s, v16.4s + add v26.4s, v26.4s, v6.4s + add v7.4s, v7.4s, v4.4s + eor v5.16b, v19.16b, v5.16b + eor v0.16b, v21.16b, v0.16b eor v2.16b, v26.16b, v2.16b - orr v27.16b, v27.16b, v29.16b - ushr v29.4s, v28.4s, #7 - shl v28.4s, v28.4s, #25 - ldur q18, [x29, #-128] - orr v28.16b, v28.16b, v29.16b - ushr v29.4s, v2.4s, #7 + eor v1.16b, v7.16b, v1.16b + ushr v28.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + ushr v30.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v31.4s, v2.4s, #7 shl v2.4s, v2.4s, #25 - add v7.4s, v7.4s, v15.4s - orr v2.16b, v2.16b, v29.16b - add v17.4s, v17.4s, v27.4s - add v4.4s, v4.4s, v28.4s - add v7.4s, v7.4s, v2.4s - eor v3.16b, v3.16b, v17.16b - add v17.4s, v17.4s, v20.4s - eor v20.16b, v25.16b, v4.16b - add v4.4s, v4.4s, v21.4s - eor v21.16b, v22.16b, v7.16b - add v7.4s, v7.4s, v18.4s - add v18.4s, v24.4s, v0.4s - eor v22.16b, v23.16b, v18.16b - ldr q23, [sp, #160] - tbl v3.16b, { v3.16b }, v16.16b - tbl v20.16b, { v20.16b }, v16.16b - add v6.4s, v6.4s, v3.4s - add v18.4s, v18.4s, v23.4s - tbl v21.16b, { v21.16b }, v16.16b - tbl v16.16b, { v22.16b }, v16.16b - add v22.4s, v26.4s, v20.4s - eor v23.16b, v6.16b, v27.16b - add v1.4s, v1.4s, v21.4s - eor v24.16b, v22.16b, v28.16b - ushr v25.4s, v23.4s, #12 - shl v23.4s, v23.4s, #20 - add v5.4s, v5.4s, v16.4s - eor v2.16b, v1.16b, v2.16b - orr v23.16b, v23.16b, v25.16b - ushr v25.4s, v24.4s, #12 - shl v24.4s, v24.4s, #20 - eor v0.16b, v5.16b, v0.16b - orr v24.16b, v24.16b, v25.16b - ushr v25.4s, v2.4s, #12 + ushr v8.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + orr v5.16b, v5.16b, v28.16b + ldr q28, [sp, #176] + orr v0.16b, v0.16b, v30.16b + orr v2.16b, v2.16b, v31.16b + orr v1.16b, v1.16b, v8.16b + add v23.4s, v23.4s, v28.4s + add v22.4s, v22.4s, v11.4s + add v17.4s, v17.4s, v15.4s + add v20.4s, v20.4s, v3.4s + ldr q3, [sp, #272] + add v23.4s, v23.4s, v0.4s + add v22.4s, v22.4s, v2.4s + add v17.4s, v17.4s, v1.4s + add v20.4s, v20.4s, v5.4s + eor v4.16b, v4.16b, v23.16b + eor v25.16b, v25.16b, v22.16b + eor v16.16b, v16.16b, v17.16b + eor v6.16b, v6.16b, v20.16b + tbl v4.16b, { v4.16b }, v27.16b + tbl v25.16b, { v25.16b }, v27.16b + tbl v16.16b, { v16.16b }, v27.16b + tbl v6.16b, { v6.16b }, v27.16b + add v26.4s, v26.4s, v4.4s + add v7.4s, v7.4s, v25.4s + add v19.4s, v19.4s, v16.4s + add v21.4s, v21.4s, v6.4s + eor v0.16b, v26.16b, v0.16b + eor v2.16b, v7.16b, v2.16b + eor v1.16b, v19.16b, v1.16b + eor v5.16b, v21.16b, v5.16b + add v3.4s, v22.4s, v3.4s + ldr q22, [sp, #160] + ushr v28.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + ushr v29.4s, v2.4s, #12 shl v2.4s, v2.4s, #20 + ushr v30.4s, v1.4s, #12 + shl v1.4s, v1.4s, #20 + ushr v31.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + add v17.4s, v17.4s, v22.4s + ldr q22, [sp, #240] + orr v0.16b, v0.16b, v28.16b + prfm pldl1keep, [x23, #256] + orr v2.16b, v2.16b, v29.16b + prfm pldl1keep, [x24, #256] + orr v1.16b, v1.16b, v30.16b + prfm pldl1keep, [x22, #256] + orr v5.16b, v5.16b, v31.16b + prfm pldl1keep, [x25, #256] + add v23.4s, v23.4s, v24.4s + add v20.4s, v20.4s, v22.4s + add v3.4s, v3.4s, v2.4s + add v17.4s, v17.4s, v1.4s + add v22.4s, v23.4s, v0.4s + add v20.4s, v20.4s, v5.4s + eor v23.16b, v25.16b, v3.16b + eor v16.16b, v16.16b, v17.16b + eor v4.16b, v4.16b, v22.16b + eor v6.16b, v6.16b, v20.16b + tbl v23.16b, { v23.16b }, v18.16b + tbl v16.16b, { v16.16b }, v18.16b + tbl v4.16b, { v4.16b }, v18.16b + tbl v6.16b, { v6.16b }, v18.16b + add v7.4s, v7.4s, v23.4s + add v19.4s, v19.4s, v16.4s + add 
v18.4s, v26.4s, v4.4s + add v21.4s, v21.4s, v6.4s + eor v2.16b, v7.16b, v2.16b + eor v1.16b, v19.16b, v1.16b + eor v0.16b, v18.16b, v0.16b + eor v5.16b, v21.16b, v5.16b + ushr v25.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ushr v24.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ushr v26.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + ushr v27.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v0.16b, v0.16b, v24.16b orr v2.16b, v2.16b, v25.16b - ushr v25.4s, v0.4s, #12 - shl v0.4s, v0.4s, #20 - orr v0.16b, v0.16b, v25.16b - add v25.4s, v7.4s, v2.4s - add v26.4s, v18.4s, v0.4s - eor v18.16b, v21.16b, v25.16b - add v17.4s, v17.4s, v23.4s - add v4.4s, v4.4s, v24.4s - eor v16.16b, v16.16b, v26.16b - tbl v21.16b, { v18.16b }, v19.16b - eor v3.16b, v3.16b, v17.16b - eor v7.16b, v20.16b, v4.16b - tbl v16.16b, { v16.16b }, v19.16b - add v1.4s, v1.4s, v21.4s - tbl v3.16b, { v3.16b }, v19.16b - tbl v20.16b, { v7.16b }, v19.16b - eor v2.16b, v1.16b, v2.16b - eor v7.16b, v1.16b, v17.16b - add v1.4s, v5.4s, v16.4s - eor v0.16b, v1.16b, v0.16b - eor v18.16b, v1.16b, v4.16b - add v1.4s, v6.4s, v3.4s - eor v4.16b, v1.16b, v23.16b - eor v6.16b, v25.16b, v1.16b - add v1.4s, v22.4s, v20.4s - eor v5.16b, v1.16b, v24.16b - eor v17.16b, v26.16b, v1.16b - ushr v1.4s, v4.4s, #7 + orr v1.16b, v1.16b, v26.16b + orr v5.16b, v5.16b, v27.16b + movi v13.4s, #64 + eor v29.16b, v19.16b, v22.16b + eor v8.16b, v21.16b, v3.16b + eor v30.16b, v17.16b, v18.16b + eor v31.16b, v20.16b, v7.16b + eor v24.16b, v5.16b, v23.16b + eor v18.16b, v0.16b, v16.16b + eor v25.16b, v2.16b, v6.16b + eor v26.16b, v1.16b, v4.16b + cbnz x21, .LBB3_5 + b .LBB3_2 +.LBB3_6: + cbz x1, .LBB3_14 + adrp x12, .LCPI3_3 + ldr q0, [x11, :lo12:.LCPI3_1] + orr w11, w7, w6 + ldr q2, [x10, :lo12:.LCPI3_2] + ldr q1, [x12, :lo12:.LCPI3_3] + and x12, x5, #0x1 +.LBB3_8: + movi v3.4s, #64 + lsr x13, x4, #32 + ldp q5, q4, [x3] + mov x15, x2 + mov w14, w11 + mov v3.s[0], w4 + ldr x10, [x0] + mov v3.s[1], w13 + b .LBB3_11 +.LBB3_9: + orr w14, w14, w9 +.LBB3_10: + ldp q6, q7, [x10] + mov v16.16b, v3.16b + and w14, w14, #0xff + add v5.4s, v5.4s, v4.4s + mov x15, x13 + mov v16.s[3], w14 + add x14, x10, #32 + uzp1 v17.4s, v6.4s, v7.4s + add x10, x10, #64 + add v5.4s, v5.4s, v17.4s + eor v16.16b, v5.16b, v16.16b + tbl v16.16b, { v16.16b }, v0.16b + add v18.4s, v16.4s, v1.4s + eor v19.16b, v18.16b, v4.16b + uzp2 v4.4s, v6.4s, v7.4s + ushr v6.4s, v19.4s, #12 + shl v7.4s, v19.4s, #20 + ld2 { v19.4s, v20.4s }, [x14] + add v5.4s, v5.4s, v4.4s + mov w14, w6 + orr v6.16b, v7.16b, v6.16b + add v5.4s, v5.4s, v6.4s + eor v7.16b, v16.16b, v5.16b + add v5.4s, v5.4s, v19.4s + tbl v7.16b, { v7.16b }, v2.16b + ext v5.16b, v5.16b, v5.16b, #12 + add v16.4s, v18.4s, v7.4s + ext v7.16b, v7.16b, v7.16b, #8 + eor v6.16b, v6.16b, v16.16b + ext v16.16b, v16.16b, v16.16b, #4 + ushr v18.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + orr v6.16b, v6.16b, v18.16b + ext v18.16b, v20.16b, v20.16b, #12 + add v5.4s, v5.4s, v6.4s + eor v7.16b, v5.16b, v7.16b + add v5.4s, v5.4s, v18.4s + tbl v7.16b, { v7.16b }, v0.16b + add v16.4s, v16.4s, v7.4s + eor v6.16b, v6.16b, v16.16b + ushr v21.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + orr v6.16b, v6.16b, v21.16b + uzp1 v21.4s, v17.4s, v17.4s + add v5.4s, v5.4s, v6.4s + ext v21.16b, v21.16b, v17.16b, #8 + eor v7.16b, v7.16b, v5.16b + uzp2 v21.4s, v21.4s, v4.4s + tbl v7.16b, { v7.16b }, v2.16b + add v5.4s, v5.4s, v21.4s + add v16.4s, v16.4s, v7.4s + ext v5.16b, v5.16b, v5.16b, #4 + ext v7.16b, v7.16b, v7.16b, #8 + eor v6.16b, v6.16b, v16.16b + ushr v22.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + orr 
v6.16b, v6.16b, v22.16b + add v22.4s, v5.4s, v6.4s + eor v5.16b, v22.16b, v7.16b + ext v7.16b, v16.16b, v16.16b, #12 + tbl v16.16b, { v5.16b }, v0.16b + ext v5.16b, v17.16b, v17.16b, #12 + add v7.4s, v7.4s, v16.4s + ext v5.16b, v17.16b, v5.16b, #12 + ext v17.16b, v19.16b, v19.16b, #12 + mov v19.16b, v18.16b + eor v6.16b, v6.16b, v7.16b + rev64 v5.4s, v5.4s + mov v19.s[1], v17.s[2] + ushr v20.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + trn2 v5.4s, v5.4s, v19.4s + orr v6.16b, v6.16b, v20.16b + zip1 v20.2d, v18.2d, v4.2d + zip2 v4.4s, v4.4s, v18.4s + add v19.4s, v6.4s, v5.4s + mov v20.s[3], v17.s[3] + add v19.4s, v19.4s, v22.4s + ext v22.16b, v20.16b, v20.16b, #12 + eor v16.16b, v16.16b, v19.16b + ext v19.16b, v19.16b, v19.16b, #12 + tbl v16.16b, { v16.16b }, v2.16b + add v7.4s, v7.4s, v16.4s + ext v16.16b, v16.16b, v16.16b, #8 + eor v6.16b, v6.16b, v7.16b + ext v7.16b, v7.16b, v7.16b, #4 + ushr v23.4s, v6.4s, #7 + shl v24.4s, v6.4s, #25 + uzp1 v6.4s, v20.4s, v22.4s + orr v20.16b, v24.16b, v23.16b + add v22.4s, v20.4s, v6.4s + add v19.4s, v22.4s, v19.4s + eor v16.16b, v19.16b, v16.16b + tbl v16.16b, { v16.16b }, v0.16b + add v7.4s, v7.4s, v16.4s + eor v18.16b, v20.16b, v7.16b + zip1 v20.4s, v4.4s, v17.4s + zip1 v4.4s, v17.4s, v4.4s + ushr v17.4s, v18.4s, #12 + shl v18.4s, v18.4s, #20 + ext v20.16b, v4.16b, v20.16b, #8 + orr v4.16b, v18.16b, v17.16b + ext v18.16b, v21.16b, v21.16b, #4 + add v17.4s, v4.4s, v20.4s + add v17.4s, v17.4s, v19.4s + uzp1 v19.4s, v18.4s, v18.4s + eor v16.16b, v16.16b, v17.16b + ext v19.16b, v19.16b, v18.16b, #8 + tbl v16.16b, { v16.16b }, v2.16b + uzp2 v19.4s, v19.4s, v5.4s + add v7.4s, v7.4s, v16.4s + add v17.4s, v17.4s, v19.4s + ext v16.16b, v16.16b, v16.16b, #8 + eor v4.16b, v4.16b, v7.16b + ext v17.16b, v17.16b, v17.16b, #4 + ext v7.16b, v7.16b, v7.16b, #12 + ushr v21.4s, v4.4s, #7 shl v4.4s, v4.4s, #25 - orr v1.16b, v4.16b, v1.16b - ushr v4.4s, v5.4s, #7 + orr v4.16b, v4.16b, v21.16b + ext v21.16b, v18.16b, v18.16b, #12 + add v17.4s, v17.4s, v4.4s + ext v18.16b, v18.16b, v21.16b, #12 + mov v21.16b, v20.16b + eor v16.16b, v17.16b, v16.16b + rev64 v18.4s, v18.4s + mov v21.s[1], v6.s[2] + tbl v16.16b, { v16.16b }, v0.16b + add v7.4s, v7.4s, v16.4s + eor v4.16b, v4.16b, v7.16b + ushr v22.4s, v4.4s, #12 + shl v23.4s, v4.4s, #20 + trn2 v4.4s, v18.4s, v21.4s + orr v18.16b, v23.16b, v22.16b + add v21.4s, v18.4s, v4.4s + add v17.4s, v21.4s, v17.4s + zip1 v21.2d, v20.2d, v5.2d + zip2 v5.4s, v5.4s, v20.4s + eor v16.16b, v16.16b, v17.16b + mov v21.s[3], v6.s[3] + ext v17.16b, v17.16b, v17.16b, #12 + zip1 v20.4s, v5.4s, v6.4s + tbl v16.16b, { v16.16b }, v2.16b + zip1 v5.4s, v6.4s, v5.4s + add v22.4s, v7.4s, v16.4s + ext v16.16b, v16.16b, v16.16b, #8 + ext v20.16b, v5.16b, v20.16b, #8 + eor v7.16b, v18.16b, v22.16b + ext v18.16b, v21.16b, v21.16b, #12 + ushr v23.4s, v7.4s, #7 + shl v24.4s, v7.4s, #25 + uzp1 v7.4s, v21.4s, v18.4s + orr v18.16b, v24.16b, v23.16b + add v21.4s, v18.4s, v7.4s + add v17.4s, v21.4s, v17.4s + ext v21.16b, v22.16b, v22.16b, #4 + eor v16.16b, v17.16b, v16.16b + tbl v16.16b, { v16.16b }, v0.16b + add v21.4s, v21.4s, v16.4s + eor v18.16b, v18.16b, v21.16b + ushr v6.4s, v18.4s, #12 + shl v18.4s, v18.4s, #20 + orr v5.16b, v18.16b, v6.16b + add v6.4s, v5.4s, v20.4s + add v6.4s, v6.4s, v17.4s + ext v17.16b, v19.16b, v19.16b, #4 + eor v16.16b, v16.16b, v6.16b + uzp1 v18.4s, v17.4s, v17.4s + tbl v16.16b, { v16.16b }, v2.16b + ext v18.16b, v18.16b, v17.16b, #8 + add v19.4s, v21.4s, v16.4s + uzp2 v18.4s, v18.4s, v4.4s + ext v16.16b, v16.16b, v16.16b, #8 + eor 
v5.16b, v5.16b, v19.16b + add v6.4s, v6.4s, v18.4s + ext v19.16b, v19.16b, v19.16b, #12 + ushr v21.4s, v5.4s, #7 shl v5.4s, v5.4s, #25 - orr v4.16b, v5.16b, v4.16b - ushr v5.4s, v2.4s, #7 - shl v2.4s, v2.4s, #25 - orr v2.16b, v2.16b, v5.16b - ushr v5.4s, v0.4s, #7 - shl v0.4s, v0.4s, #25 - orr v0.16b, v0.16b, v5.16b - eor v10.16b, v0.16b, v20.16b - eor v11.16b, v1.16b, v21.16b - eor v19.16b, v4.16b, v16.16b - cmp x0, x22 - eor v16.16b, v2.16b, v3.16b - mov w6, w19 - b.ne .LBB2_4 -.LBB2_7: - zip1 v0.4s, v7.4s, v18.4s - zip2 v1.4s, v7.4s, v18.4s - zip1 v2.4s, v6.4s, v17.4s - zip2 v3.4s, v6.4s, v17.4s - zip1 v4.4s, v10.4s, v11.4s - zip2 v5.4s, v10.4s, v11.4s - zip1 v6.4s, v19.4s, v16.4s - zip2 v7.4s, v19.4s, v16.4s - add x15, x20, #4 - tst w5, #0x1 - sub x28, x28, #4 - zip1 v16.2d, v0.2d, v2.2d - zip2 v0.2d, v0.2d, v2.2d - zip1 v2.2d, v1.2d, v3.2d - zip2 v1.2d, v1.2d, v3.2d - zip1 v3.2d, v4.2d, v6.2d - zip2 v4.2d, v4.2d, v6.2d - zip1 v6.2d, v5.2d, v7.2d - zip2 v5.2d, v5.2d, v7.2d - add x24, x24, #32 - csel x20, x15, x20, ne - cmp x28, #3 - stp q16, q3, [x26] - stp q0, q4, [x26, #32] - stp q2, q6, [x26, #64] - stp q1, q5, [x26, #96] - add x26, x26, #128 - b.hi .LBB2_2 -.LBB2_8: - cbz x28, .LBB2_16 - orr w8, w7, w19 - and x21, x5, #0x1 - stur w8, [x29, #-64] -.LBB2_10: - ldr x8, [sp, #40] - ldr x25, [x24] - ldur w4, [x29, #-64] - ldp q1, q0, [x8] - mov x8, x22 - stp q1, q0, [x29, #-48] -.LBB2_11: - subs x23, x8, #1 - b.eq .LBB2_13 - cbnz x8, .LBB2_14 - b .LBB2_15 -.LBB2_13: - orr w4, w4, w27 -.LBB2_14: - sub x0, x29, #48 - mov w2, #64 - mov x1, x25 - mov x3, x20 - bl zfs_blake3_compress_in_place_sse41 - add x25, x25, #64 - mov x8, x23 - mov w4, w19 - b .LBB2_11 -.LBB2_15: - ldp q0, q1, [x29, #-48] - add x20, x20, x21 - add x24, x24, #8 - subs x28, x28, #1 - stp q0, q1, [x26], #32 - b.ne .LBB2_10 -.LBB2_16: - add sp, sp, #448 - ldp x20, x19, [sp, #144] - ldp x22, x21, [sp, #128] - ldp x24, x23, [sp, #112] - ldp x26, x25, [sp, #96] - ldp x28, x27, [sp, #80] - ldp x29, x30, [sp, #64] + ext v6.16b, v6.16b, v6.16b, #4 + orr v5.16b, v5.16b, v21.16b + ext v21.16b, v17.16b, v17.16b, #12 + add v6.4s, v6.4s, v5.4s + ext v17.16b, v17.16b, v21.16b, #12 + mov v21.16b, v20.16b + eor v16.16b, v6.16b, v16.16b + rev64 v17.4s, v17.4s + mov v21.s[1], v7.s[2] + tbl v16.16b, { v16.16b }, v0.16b + add v19.4s, v19.4s, v16.4s + eor v5.16b, v5.16b, v19.16b + ushr v22.4s, v5.4s, #12 + shl v23.4s, v5.4s, #20 + trn2 v5.4s, v17.4s, v21.4s + orr v17.16b, v23.16b, v22.16b + add v21.4s, v17.4s, v5.4s + add v6.4s, v21.4s, v6.4s + eor v16.16b, v16.16b, v6.16b + ext v6.16b, v6.16b, v6.16b, #12 + tbl v21.16b, { v16.16b }, v2.16b + zip1 v16.2d, v20.2d, v4.2d + zip2 v4.4s, v4.4s, v20.4s + add v19.4s, v19.4s, v21.4s + mov v16.s[3], v7.s[3] + ext v21.16b, v21.16b, v21.16b, #8 + zip1 v20.4s, v4.4s, v7.4s + eor v17.16b, v17.16b, v19.16b + ext v22.16b, v16.16b, v16.16b, #12 + ext v19.16b, v19.16b, v19.16b, #4 + zip1 v4.4s, v7.4s, v4.4s + ushr v23.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + uzp1 v16.4s, v16.4s, v22.4s + ext v4.16b, v4.16b, v20.16b, #8 + orr v17.16b, v17.16b, v23.16b + add v22.4s, v17.4s, v16.4s + add v6.4s, v22.4s, v6.4s + eor v21.16b, v6.16b, v21.16b + tbl v21.16b, { v21.16b }, v0.16b + add v19.4s, v19.4s, v21.4s + eor v17.16b, v17.16b, v19.16b + ushr v7.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + orr v7.16b, v17.16b, v7.16b + add v17.4s, v7.4s, v4.4s + add v6.4s, v17.4s, v6.4s + ext v17.16b, v18.16b, v18.16b, #4 + eor v18.16b, v21.16b, v6.16b + uzp1 v20.4s, v17.4s, v17.4s + tbl v18.16b, { v18.16b }, v2.16b + ext 
v20.16b, v20.16b, v17.16b, #8 + add v19.4s, v19.4s, v18.4s + uzp2 v20.4s, v20.4s, v5.4s + ext v18.16b, v18.16b, v18.16b, #8 + eor v7.16b, v7.16b, v19.16b + add v6.4s, v6.4s, v20.4s + ushr v21.4s, v7.4s, #7 + shl v7.4s, v7.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v7.16b, v7.16b, v21.16b + add v21.4s, v6.4s, v7.4s + eor v6.16b, v21.16b, v18.16b + ext v18.16b, v19.16b, v19.16b, #12 + tbl v19.16b, { v6.16b }, v0.16b + ext v6.16b, v17.16b, v17.16b, #12 + add v18.4s, v18.4s, v19.4s + ext v6.16b, v17.16b, v6.16b, #12 + mov v17.16b, v4.16b + eor v7.16b, v7.16b, v18.16b + rev64 v6.4s, v6.4s + mov v17.s[1], v16.s[2] + ushr v22.4s, v7.4s, #12 + shl v7.4s, v7.4s, #20 + trn2 v6.4s, v6.4s, v17.4s + orr v7.16b, v7.16b, v22.16b + add v17.4s, v7.4s, v6.4s + add v17.4s, v17.4s, v21.4s + zip1 v21.2d, v4.2d, v5.2d + zip2 v4.4s, v5.4s, v4.4s + eor v19.16b, v19.16b, v17.16b + mov v21.s[3], v16.s[3] + ext v17.16b, v17.16b, v17.16b, #12 + tbl v19.16b, { v19.16b }, v2.16b + ext v22.16b, v21.16b, v21.16b, #12 + add v18.4s, v18.4s, v19.4s + ext v19.16b, v19.16b, v19.16b, #8 + eor v7.16b, v7.16b, v18.16b + ext v18.16b, v18.16b, v18.16b, #4 + ushr v23.4s, v7.4s, #7 + shl v24.4s, v7.4s, #25 + uzp1 v7.4s, v21.4s, v22.4s + orr v21.16b, v24.16b, v23.16b + add v22.4s, v21.4s, v7.4s + add v17.4s, v22.4s, v17.4s + eor v19.16b, v17.16b, v19.16b + tbl v19.16b, { v19.16b }, v0.16b + add v18.4s, v18.4s, v19.4s + eor v5.16b, v21.16b, v18.16b + zip1 v21.4s, v4.4s, v16.4s + zip1 v4.4s, v16.4s, v4.4s + ushr v16.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + ext v21.16b, v4.16b, v21.16b, #8 + orr v4.16b, v5.16b, v16.16b + ext v16.16b, v20.16b, v20.16b, #4 + mov v23.16b, v21.16b + add v5.4s, v4.4s, v21.4s + mov v23.s[1], v7.s[2] + add v5.4s, v5.4s, v17.4s + eor v17.16b, v19.16b, v5.16b + uzp1 v19.4s, v16.4s, v16.4s + tbl v17.16b, { v17.16b }, v2.16b + ext v19.16b, v19.16b, v16.16b, #8 + add v18.4s, v18.4s, v17.4s + uzp2 v19.4s, v19.4s, v6.4s + eor v4.16b, v4.16b, v18.16b + add v5.4s, v5.4s, v19.4s + ext v19.16b, v19.16b, v19.16b, #4 + ushr v20.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v20.16b, v4.16b, v20.16b + ext v4.16b, v17.16b, v17.16b, #8 + add v17.4s, v5.4s, v20.4s + ext v5.16b, v18.16b, v18.16b, #12 + eor v4.16b, v17.16b, v4.16b + tbl v18.16b, { v4.16b }, v0.16b + ext v4.16b, v16.16b, v16.16b, #12 + add v22.4s, v5.4s, v18.4s + ext v4.16b, v16.16b, v4.16b, #12 + eor v5.16b, v20.16b, v22.16b + rev64 v16.4s, v4.4s + ushr v20.4s, v5.4s, #12 + shl v24.4s, v5.4s, #20 + trn2 v5.4s, v16.4s, v23.4s + orr v16.16b, v24.16b, v20.16b + add v20.4s, v16.4s, v5.4s + add v17.4s, v20.4s, v17.4s + zip1 v20.2d, v21.2d, v6.2d + zip2 v6.4s, v6.4s, v21.4s + eor v18.16b, v18.16b, v17.16b + mov v20.s[3], v7.s[3] + ext v17.16b, v17.16b, v17.16b, #12 + zip1 v21.4s, v6.4s, v7.4s + tbl v18.16b, { v18.16b }, v2.16b + ext v24.16b, v20.16b, v20.16b, #12 + zip1 v6.4s, v7.4s, v6.4s + add v22.4s, v22.4s, v18.4s + ext v18.16b, v18.16b, v18.16b, #8 + ext v6.16b, v6.16b, v21.16b, #8 + eor v16.16b, v16.16b, v22.16b + ext v22.16b, v22.16b, v22.16b, #4 + zip1 v5.2d, v6.2d, v5.2d + zip2 v4.4s, v4.4s, v6.4s + ushr v25.4s, v16.4s, #7 + shl v26.4s, v16.4s, #25 + uzp1 v16.4s, v20.4s, v24.4s + orr v20.16b, v26.16b, v25.16b + mov v5.s[3], v16.s[3] + add v24.4s, v20.4s, v16.4s + add v17.4s, v24.4s, v17.4s + eor v18.16b, v17.16b, v18.16b + tbl v18.16b, { v18.16b }, v0.16b + add v22.4s, v22.4s, v18.4s + eor v20.16b, v20.16b, v22.16b + ushr v7.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + orr v7.16b, v20.16b, v7.16b + add v20.4s, v7.4s, v6.4s + 
add v17.4s, v20.4s, v17.4s + ext v20.16b, v19.16b, v19.16b, #8 + eor v18.16b, v18.16b, v17.16b + ext v17.16b, v17.16b, v17.16b, #4 + tbl v18.16b, { v18.16b }, v2.16b + add v21.4s, v22.4s, v18.4s + uzp2 v22.4s, v20.4s, v23.4s + ext v18.16b, v18.16b, v18.16b, #8 + eor v7.16b, v7.16b, v21.16b + ext v20.16b, v22.16b, v20.16b, #4 + ushr v22.4s, v7.4s, #7 + shl v7.4s, v7.4s, #25 + add v17.4s, v17.4s, v20.4s + ext v20.16b, v21.16b, v21.16b, #12 + ext v21.16b, v19.16b, v19.16b, #12 + orr v7.16b, v7.16b, v22.16b + ext v19.16b, v19.16b, v21.16b, #12 + add v17.4s, v17.4s, v7.4s + mov v21.16b, v6.16b + rev64 v19.4s, v19.4s + eor v18.16b, v17.16b, v18.16b + mov v21.s[1], v16.s[2] + tbl v18.16b, { v18.16b }, v0.16b + trn2 v19.4s, v19.4s, v21.4s + add v20.4s, v20.4s, v18.4s + eor v7.16b, v7.16b, v20.16b + ushr v22.4s, v7.4s, #12 + shl v7.4s, v7.4s, #20 + orr v7.16b, v7.16b, v22.16b + add v19.4s, v7.4s, v19.4s + add v17.4s, v19.4s, v17.4s + eor v18.16b, v18.16b, v17.16b + ext v17.16b, v17.16b, v17.16b, #12 + tbl v18.16b, { v18.16b }, v2.16b + add v19.4s, v20.4s, v18.4s + ext v20.16b, v5.16b, v5.16b, #12 + ext v18.16b, v18.16b, v18.16b, #8 + eor v7.16b, v7.16b, v19.16b + uzp1 v5.4s, v5.4s, v20.4s + ushr v21.4s, v7.4s, #7 + shl v7.4s, v7.4s, #25 + orr v7.16b, v7.16b, v21.16b + add v5.4s, v7.4s, v5.4s + add v5.4s, v5.4s, v17.4s + eor v17.16b, v5.16b, v18.16b + ext v18.16b, v19.16b, v19.16b, #4 + tbl v17.16b, { v17.16b }, v0.16b + add v18.4s, v18.4s, v17.4s + eor v6.16b, v7.16b, v18.16b + zip1 v7.4s, v4.4s, v16.4s + zip1 v4.4s, v16.4s, v4.4s + ushr v16.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + ext v4.16b, v4.16b, v7.16b, #8 + orr v6.16b, v6.16b, v16.16b + add v4.4s, v6.4s, v4.4s + add v4.4s, v4.4s, v5.4s + eor v5.16b, v17.16b, v4.16b + ext v4.16b, v4.16b, v4.16b, #4 + tbl v5.16b, { v5.16b }, v2.16b + add v7.4s, v18.4s, v5.4s + eor v6.16b, v6.16b, v7.16b + ext v7.16b, v7.16b, v7.16b, #12 + ushr v16.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + orr v6.16b, v6.16b, v16.16b + ext v16.16b, v5.16b, v5.16b, #8 + eor v5.16b, v4.16b, v7.16b + eor v4.16b, v6.16b, v16.16b +.LBB3_11: + subs x13, x15, #1 + b.eq .LBB3_9 + cbnz x15, .LBB3_10 + add x4, x4, x12 + add x0, x0, #8 + subs x1, x1, #1 + stp q5, q4, [x8], #32 + b.ne .LBB3_8 +.LBB3_14: + add sp, sp, #368 + ldp x20, x19, [sp, #128] + ldp x22, x21, [sp, #112] + ldp x24, x23, [sp, #96] + ldp x26, x25, [sp, #80] + ldp x29, x27, [sp, #64] ldp d9, d8, [sp, #48] ldp d11, d10, [sp, #32] ldp d13, d12, [sp, #16] - ldp d15, d14, [sp], #160 + ldp d15, d14, [sp], #144 ret -.Lfunc_end2: - .size zfs_blake3_hash_many_sse41, .Lfunc_end2-zfs_blake3_hash_many_sse41 +.Lfunc_end3: + .size zfs_blake3_hash_many_sse41, .Lfunc_end3-zfs_blake3_hash_many_sse41 .cfi_endproc .section ".note.GNU-stack","",@progbits -#endif +#endif \ No newline at end of file From ae0d0f0e047edc0da20f9dcf28d161e31a259751 Mon Sep 17 00:00:00 2001 From: Val Packett Date: Thu, 27 Apr 2023 13:49:03 -0300 Subject: [PATCH 075/180] PAM: support the authentication facility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the pam_sm_authenticate method, using the noop argument of lzc_load_key to do a passphrase check without actually loading the key. This allows using ZFS as the source of truth for user passwords, without storing any password hashes in /etc or using other PAM modules. 
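For illustration, the same no-op check can be exercised from a minimal libzfs_core caller; the sketch below is not part of this patch and assumes the wrapping key has already been derived from the passphrase (the module's PBKDF2 derivation path is elided here):

    #include <libzfs_core.h>
    #include <sys/dsl_crypt.h>	/* WRAPPING_KEY_LEN (wrapping key size) */

    /*
     * Minimal sketch: validate an already-derived wrapping key against the
     * dataset's keystore without loading it, mirroring the decrypt_mount()
     * noop path in the diff below.
     */
    static int
    check_key_noop(const char *ds_name, uint8_t *wkey)
    {
    	int err = libzfs_core_init();
    	if (err != 0)
    		return (err);
    	/* B_TRUE == noop: check the key only, do not keep it loaded. */
    	err = lzc_load_key(ds_name, B_TRUE, wkey, WRAPPING_KEY_LEN);
    	libzfs_core_fini();
    	return (err);
    }

In the patch itself this corresponds to calling decrypt_mount() with noop set to B_TRUE, which returns after the key check and never reaches zfs_mount().
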
Reviewed-by: Brian Behlendorf Reviewed-by: Felix Dörre Signed-off-by: Val Packett Closes #14789 --- contrib/pam_zfs_key/pam_zfs_key.c | 63 ++++++++++++++++++++++++------- 1 file changed, 49 insertions(+), 14 deletions(-) diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index 6ba5b5fba75f..27c7d63781c5 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -371,7 +371,7 @@ change_key(pam_handle_t *pamh, const char *ds_name, static int decrypt_mount(pam_handle_t *pamh, const char *ds_name, - const char *passphrase) + const char *passphrase, boolean_t noop) { zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); if (ds == NULL) { @@ -383,7 +383,7 @@ decrypt_mount(pam_handle_t *pamh, const char *ds_name, zfs_close(ds); return (-1); } - int ret = lzc_load_key(ds_name, B_FALSE, (uint8_t *)key->value, + int ret = lzc_load_key(ds_name, noop, (uint8_t *)key->value, WRAPPING_KEY_LEN); pw_free(key); if (ret) { @@ -391,12 +391,16 @@ decrypt_mount(pam_handle_t *pamh, const char *ds_name, zfs_close(ds); return (-1); } + if (noop) { + goto out; + } ret = zfs_mount(ds, NULL, 0); if (ret) { pam_syslog(pamh, LOG_ERR, "mount failed: %d", ret); zfs_close(ds); return (-1); } +out: zfs_close(ds); return (0); } @@ -443,13 +447,13 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, config->homes_prefix = strdup("rpool/home"); if (config->homes_prefix == NULL) { pam_syslog(pamh, LOG_ERR, "strdup failure"); - return (-1); + return (PAM_SERVICE_ERR); } config->runstatedir = strdup(RUNSTATEDIR "/pam_zfs_key"); if (config->runstatedir == NULL) { pam_syslog(pamh, LOG_ERR, "strdup failure"); free(config->homes_prefix); - return (-1); + return (PAM_SERVICE_ERR); } const char *name; if (pam_get_user(pamh, &name, NULL) != PAM_SUCCESS) { @@ -457,13 +461,13 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, "couldn't get username from PAM stack"); free(config->runstatedir); free(config->homes_prefix); - return (-1); + return (PAM_SERVICE_ERR); } struct passwd *entry = getpwnam(name); if (!entry) { free(config->runstatedir); free(config->homes_prefix); - return (-1); + return (PAM_USER_UNKNOWN); } config->uid = entry->pw_uid; config->username = name; @@ -484,7 +488,7 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, config->homedir = strdup(entry->pw_dir); } } - return (0); + return (PAM_SUCCESS); } static void @@ -644,12 +648,43 @@ PAM_EXTERN int pam_sm_authenticate(pam_handle_t *pamh, int flags, int argc, const char **argv) { - (void) flags, (void) argc, (void) argv; + (void) flags; - if (pw_fetch_lazy(pamh) == NULL) { - return (PAM_AUTH_ERR); + if (geteuid() != 0) { + pam_syslog(pamh, LOG_ERR, + "Cannot zfs_mount when not being root."); + return (PAM_SERVICE_ERR); + } + zfs_key_config_t config; + int config_err = zfs_key_config_load(pamh, &config, argc, argv); + if (config_err != PAM_SUCCESS) { + return (config_err); } + const pw_password_t *token = pw_fetch_lazy(pamh); + if (token == NULL) { + zfs_key_config_free(&config); + return (PAM_AUTH_ERR); + } + if (pam_zfs_init(pamh) != 0) { + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + char *dataset = zfs_key_config_get_dataset(&config); + if (!dataset) { + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + if (decrypt_mount(pamh, dataset, token->value, B_TRUE) == -1) { + free(dataset); + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_AUTH_ERR); + } + free(dataset); + 
pam_zfs_free(); + zfs_key_config_free(&config); return (PAM_SUCCESS); } @@ -673,7 +708,7 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags, return (PAM_PERM_DENIED); } zfs_key_config_t config; - if (zfs_key_config_load(pamh, &config, argc, argv) == -1) { + if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) { return (PAM_SERVICE_ERR); } if (config.uid < 1000) { @@ -754,7 +789,7 @@ pam_sm_open_session(pam_handle_t *pamh, int flags, return (PAM_SUCCESS); } zfs_key_config_t config; - if (zfs_key_config_load(pamh, &config, argc, argv) != 0) { + if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) { return (PAM_SESSION_ERR); } @@ -784,7 +819,7 @@ pam_sm_open_session(pam_handle_t *pamh, int flags, zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - if (decrypt_mount(pamh, dataset, token->value) == -1) { + if (decrypt_mount(pamh, dataset, token->value, B_FALSE) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); @@ -813,7 +848,7 @@ pam_sm_close_session(pam_handle_t *pamh, int flags, return (PAM_SUCCESS); } zfs_key_config_t config; - if (zfs_key_config_load(pamh, &config, argc, argv) != 0) { + if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) { return (PAM_SESSION_ERR); } if (config.uid < 1000) { From 2fd1c30423620a5b198ac1a5aa2cff8e1e57b7f3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 27 Apr 2023 15:32:58 -0400 Subject: [PATCH 076/180] Mark TX_COMMIT transaction with TXG_NOTHROTTLE. TX_COMMIT has no on-disk representation and does not produce any more dirty data. It should not wait for anything, and even just skipping the checks if not waiting gives improvement noticeable in profiler. Reviewed-by: Brian Behlendorf Reviewed-by: Prakash Surya Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14798 --- module/zfs/zil.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index d1631c2ac9db..ec9da706a806 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -3155,7 +3155,14 @@ static void zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw) { dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); - VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + /* + * Since we are not going to create any new dirty data, and we + * can even help with clearing the existing dirty data, we + * should not be subject to the dirty data based delays. We + * use TXG_NOTHROTTLE to bypass the delay mechanism. + */ + VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t)); itx->itx_sync = B_TRUE; From 5a83f761c7c7445dda39d3fd3c5aa2a7bcb353f1 Mon Sep 17 00:00:00 2001 From: Justin Hibbits Date: Thu, 27 Apr 2023 15:49:21 -0400 Subject: [PATCH 077/180] powerpc64: Support ELFv2 asm on Big Endian FreeBSD/powerpc64 is all ELFv2 since FreeBSD 13, even big endian. The existing sha256 and sha512 asm code assumes that BE is all ELFv1, and LE is ELFv2. Minor changes to add ELFv2 in the BE side gets this working correctly on FreeBSD with latest OpenZFS import. 
Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Signed-off-by: Justin Hibbits Closes #14779 --- include/os/freebsd/spl/sys/simd_powerpc.h | 2 +- lib/libspl/include/sys/simd.h | 2 +- module/icp/asm-ppc64/sha2/sha256-p8.S | 15 +++++++++++++++ module/icp/asm-ppc64/sha2/sha256-ppc.S | 15 +++++++++++++++ module/icp/asm-ppc64/sha2/sha512-p8.S | 16 ++++++++++++++++ module/icp/asm-ppc64/sha2/sha512-ppc.S | 15 +++++++++++++++ 6 files changed, 63 insertions(+), 2 deletions(-) diff --git a/include/os/freebsd/spl/sys/simd_powerpc.h b/include/os/freebsd/spl/sys/simd_powerpc.h index edaab81d15fc..cf3c712c6af2 100644 --- a/include/os/freebsd/spl/sys/simd_powerpc.h +++ b/include/os/freebsd/spl/sys/simd_powerpc.h @@ -49,7 +49,7 @@ #include #include -#define kfpu_allowed() 1 +#define kfpu_allowed() 0 #define kfpu_initialize(tsk) do {} while (0) #define kfpu_begin() do {} while (0) #define kfpu_end() do {} while (0) diff --git a/lib/libspl/include/sys/simd.h b/lib/libspl/include/sys/simd.h index a106967d0725..41f9df506468 100644 --- a/lib/libspl/include/sys/simd.h +++ b/lib/libspl/include/sys/simd.h @@ -551,7 +551,7 @@ zfs_sha512_available(void) #elif defined(__powerpc__) -#define kfpu_allowed() 1 +#define kfpu_allowed() 0 #define kfpu_initialize(tsk) do {} while (0) #define kfpu_begin() do {} while (0) #define kfpu_end() do {} while (0) diff --git a/module/icp/asm-ppc64/sha2/sha256-p8.S b/module/icp/asm-ppc64/sha2/sha256-p8.S index 6bbfe23b6e15..dc3c4cea669c 100644 --- a/module/icp/asm-ppc64/sha2/sha256-p8.S +++ b/module/icp/asm-ppc64/sha2/sha256-p8.S @@ -21,6 +21,7 @@ #if (defined(__PPC64__) && defined(__BIG_ENDIAN__)) +#if (!defined(_CALL_ELF) || _CALL_ELF == 1) .text .globl zfs_sha256_power8 @@ -33,6 +34,16 @@ zfs_sha256_power8: .previous .align 6 .zfs_sha256_power8: +#else +.abiversion 2 +.text + +.globl zfs_sha256_power8 +.type zfs_sha256_power8,@function +.align 6 +zfs_sha256_power8: +.localentry zfs_sha256_power8,0 +#endif stdu 1,-384(1) mflr 8 li 10,207 @@ -677,8 +688,12 @@ zfs_sha256_power8: .long 0 .byte 0,12,4,1,0x80,6,3,0 .long 0 +#if (!defined(_CALL_ELF) || _CALL_ELF == 1) .size .zfs_sha256_power8,.-.zfs_sha256_power8 .size zfs_sha256_power8,.-.zfs_sha256_power8 +#else +.size zfs_sha256_power8,.-zfs_sha256_power8 +#endif .align 6 .LPICmeup: mflr 0 diff --git a/module/icp/asm-ppc64/sha2/sha256-ppc.S b/module/icp/asm-ppc64/sha2/sha256-ppc.S index 2219e313c9c6..d039bc36ee11 100644 --- a/module/icp/asm-ppc64/sha2/sha256-ppc.S +++ b/module/icp/asm-ppc64/sha2/sha256-ppc.S @@ -21,6 +21,7 @@ #if (defined(__PPC64__) && defined(__BIG_ENDIAN__)) +#if (!defined(_CALL_ELF) || _CALL_ELF == 1) .text .globl zfs_sha256_ppc @@ -33,6 +34,16 @@ zfs_sha256_ppc: .previous .align 6 .zfs_sha256_ppc: +#else +.abiversion 2 +.text + +.globl zfs_sha256_ppc +.type zfs_sha256_ppc,@function +.align 6 +zfs_sha256_ppc: +.localentry zfs_sha256_ppc,0 +#endif stdu 1,-320(1) mflr 0 sldi 5,5,6 @@ -1312,8 +1323,12 @@ zfs_sha256_ppc: blr .long 0 .byte 0,12,0x14,0,0,0,0,0 +#if (!defined(_CALL_ELF) || _CALL_ELF == 1) .size .zfs_sha256_ppc,.-.zfs_sha256_ppc .size zfs_sha256_ppc,.-.zfs_sha256_ppc +#else +.size zfs_sha256_ppc,.-zfs_sha256_ppc +#endif .align 6 .LPICmeup: mflr 0 diff --git a/module/icp/asm-ppc64/sha2/sha512-p8.S b/module/icp/asm-ppc64/sha2/sha512-p8.S index 39a90ede3dc5..2409c53385d6 100644 --- a/module/icp/asm-ppc64/sha2/sha512-p8.S +++ b/module/icp/asm-ppc64/sha2/sha512-p8.S @@ -21,6 +21,7 @@ #if (defined(__PPC64__) && defined(__BIG_ENDIAN__)) +#if (!defined(_CALL_ELF) || _CALL_ELF == 1) .text .globl 
zfs_sha512_power8 @@ -33,6 +34,17 @@ zfs_sha512_power8: .previous .align 6 .zfs_sha512_power8: +#else +.abiversion 2 +.text + +.globl zfs_sha512_power8 +.type zfs_sha512_power8,@function +.align 6 +zfs_sha512_power8: +.localentry zfs_sha512_power8,0 +#endif + stdu 1,-384(1) mflr 8 li 10,207 @@ -679,8 +691,12 @@ zfs_sha512_power8: .long 0 .byte 0,12,4,1,0x80,6,3,0 .long 0 +#if (!defined(_CALL_ELF) || _CALL_ELF == 1) .size .zfs_sha512_power8,.-.zfs_sha512_power8 .size zfs_sha512_power8,.-.zfs_sha512_power8 +#else +.size zfs_sha512_power8,.-zfs_sha512_power8 +#endif .align 6 .LPICmeup: mflr 0 diff --git a/module/icp/asm-ppc64/sha2/sha512-ppc.S b/module/icp/asm-ppc64/sha2/sha512-ppc.S index 37070115c3ff..57213f68abc5 100644 --- a/module/icp/asm-ppc64/sha2/sha512-ppc.S +++ b/module/icp/asm-ppc64/sha2/sha512-ppc.S @@ -21,6 +21,7 @@ #if (defined(__PPC64__) && defined(__BIG_ENDIAN__)) +#if (!defined(_CALL_ELF) || _CALL_ELF == 1) .text .globl zfs_sha512_ppc @@ -33,6 +34,16 @@ zfs_sha512_ppc: .previous .align 6 .zfs_sha512_ppc: +#else +.abiversion 2 +.text + +.globl zfs_sha512_ppc +.type zfs_sha512_ppc,@function +.align 6 +zfs_sha512_ppc: +.localentry zfs_sha512_ppc,0 +#endif stdu 1,-384(1) mflr 0 sldi 5,5,7 @@ -1350,8 +1361,12 @@ zfs_sha512_ppc: blr .long 0 .byte 0,12,0x14,0,0,0,0,0 +#if (!defined(_CALL_ELF) || _CALL_ELF == 1) .size .zfs_sha512_ppc,.-.zfs_sha512_ppc .size zfs_sha512_ppc,.-.zfs_sha512_ppc +#else +.size zfs_sha512_ppc,.-zfs_sha512_ppc +#endif .align 6 .LPICmeup: mflr 0 From 0c93d86f01e509cdfab271fa285497cbda4e3a9f Mon Sep 17 00:00:00 2001 From: Serapheim Dimitropoulos Date: Mon, 1 May 2023 17:18:42 -0700 Subject: [PATCH 078/180] Correct ABD size for split block ZIOs Currently when layering the ABD buffer of each split block on top of an indirect vdev's ZIO ABD we don't specify the split block's ABD. This results in those ABDs being incorrectly sized by inheriting the size of their parent ABD which is larger than what each split block needs. The above behavior isn't causing any bugs currently but can lead to unexpected ABD sizes for people analyzing and/or working on the ZIO codepath. This patch fixes this behavior by properly setting the ABD size for split block ZIOs. Reviewed-by: Matthew Ahrens Reviewed-by: Igor Kozhukhov Reviewed-by: Brian Behlendorf Reviewed-by: Mark Maybee Reviewed-by: Brian Atkinson Signed-off-by: Serapheim Dimitropoulos Closes #14804 --- module/zfs/vdev_indirect.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index 8c11a574ae86..a16ad2f4e7cf 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -1370,9 +1370,10 @@ vdev_indirect_io_start(zio_t *zio) is != NULL; is = list_next(&iv->iv_splits, is)) { zio_nowait(zio_vdev_child_io(zio, NULL, is->is_vdev, is->is_target_offset, - abd_get_offset(zio->io_abd, - is->is_split_offset), is->is_size, - zio->io_type, zio->io_priority, 0, + abd_get_offset_size(zio->io_abd, + is->is_split_offset, is->is_size), + is->is_size, zio->io_type, + zio->io_priority, 0, vdev_indirect_child_io_done, zio)); } From e2a92d726e1849f646d137510796b11c26d45cae Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 2 May 2023 02:21:27 +0200 Subject: [PATCH 079/180] blake3: fix up bogus checksums in face of cpu migration This is a temporary measure until a better fix is sorted out. 
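The underlying hazard: the per-CPU BLAKE3 context is selected by the caller's current CPU index, and with preemption enabled the thread can be migrated mid-hash, so two threads may end up updating the same context and produce bogus checksums. The fix below pins the thread with kpreempt_disable()/kpreempt_enable() for as long as the context is in use. As a userspace analogue (illustrative only, not ZFS code; it uses the Linux-specific sched_getcpu()), migration between two reads of the CPU id is easy to observe:

    /* migrate.c: show that a thread's CPU id can change between reads. */
    /* build: cc -o migrate migrate.c */
    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
        for (int i = 0; i < 1000; i++) {
            int before = sched_getcpu();
            usleep(1000);   /* window in which the scheduler may move us */
            int after = sched_getcpu();
            if (before != after) {
                (void) printf("migrated: cpu %d -> cpu %d (iteration %d)\n",
                    before, after, i);
                return (0);
            }
        }
        (void) printf("no migration observed in 1000 tries (still possible)\n");
        return (0);
    }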
Reviewed-by: Richard Yao Reviewed-by: Alexander Motin Reviewed-by: Rich Ercolani Signed-off-by: Mateusz Guzik Sponsored by: Rubicon Communications, LLC ("Netgate") Closes #14785 Closes #14808 --- module/zfs/blake3_zfs.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/module/zfs/blake3_zfs.c b/module/zfs/blake3_zfs.c index bcc595bca8f2..7783282b671a 100644 --- a/module/zfs/blake3_zfs.c +++ b/module/zfs/blake3_zfs.c @@ -50,7 +50,8 @@ abd_checksum_blake3_native(abd_t *abd, uint64_t size, const void *ctx_template, ASSERT(ctx_template != NULL); #if defined(_KERNEL) - BLAKE3_CTX *ctx = blake3_per_cpu_ctx[CPU_SEQID_UNSTABLE]; + kpreempt_disable(); + BLAKE3_CTX *ctx = blake3_per_cpu_ctx[CPU_SEQID]; #else BLAKE3_CTX *ctx = kmem_alloc(sizeof (*ctx), KM_SLEEP); #endif @@ -59,7 +60,9 @@ abd_checksum_blake3_native(abd_t *abd, uint64_t size, const void *ctx_template, (void) abd_iterate_func(abd, 0, size, blake3_incremental, ctx); Blake3_Final(ctx, (uint8_t *)zcp); -#if !defined(_KERNEL) +#if defined(_KERNEL) + kpreempt_enable(); +#else memset(ctx, 0, sizeof (*ctx)); kmem_free(ctx, sizeof (*ctx)); #endif From 012829df0c99d843c9c873b9be57796eaecb155b Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 2 May 2023 09:21:47 -0700 Subject: [PATCH 080/180] Wrap clang specific pragma Clang specific pragmas need to be wrapped to prevent a build warning when compiling with gcc. Reviewed-by: Tino Reichardt Signed-off-by: Brian Behlendorf Closes #14814 --- include/os/linux/zfs/sys/trace_zil.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/os/linux/zfs/sys/trace_zil.h b/include/os/linux/zfs/sys/trace_zil.h index fb03d3149f8f..7bddd9d1f469 100644 --- a/include/os/linux/zfs/sys/trace_zil.h +++ b/include/os/linux/zfs/sys/trace_zil.h @@ -153,8 +153,10 @@ * itx_t *, ...); */ +#if defined(__clang__) #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wordered-compare-function-pointers" +#endif /* BEGIN CSTYLED */ DECLARE_EVENT_CLASS(zfs_zil_process_itx_class, TP_PROTO(zilog_t *zilog, itx_t *itx), @@ -172,7 +174,9 @@ DECLARE_EVENT_CLASS(zfs_zil_process_itx_class, ZILOG_TP_PRINTK_ARGS, ITX_TP_PRINTK_ARGS) ); /* END CSTYLED */ +#if defined(__clang__) #pragma clang diagnostic pop +#endif #define DEFINE_ZIL_PROCESS_ITX_EVENT(name) \ DEFINE_EVENT(zfs_zil_process_itx_class, name, \ From d96e29576c89e6e547cb82b477651d2b85ea0fed Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Wed, 3 May 2023 01:24:26 +0900 Subject: [PATCH 081/180] Use correct block pointer in block cloning case. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Pawel Jakub Dawidek Closes #14806 --- module/zfs/dbuf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index c7f76e8d96f8..8193fb244079 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1620,8 +1620,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, * If this is not true it indicates tampering and we report an error. 
*/ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) { - spa_log_error(db->db_objset->os_spa, &zb, - &db->db_blkptr->blk_birth); + spa_log_error(db->db_objset->os_spa, &zb, &bpp->blk_birth); zfs_panic_recover("unencrypted block in encrypted " "object set %llu", dmu_objset_id(db->db_objset)); err = SET_ERROR(EIO); From 9de5300c7fc0ff944e02d5d1a1ae5742234930e0 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Wed, 3 May 2023 18:00:14 +0200 Subject: [PATCH 082/180] Optimize check_filesystem() and process_error_log() Integrate check_clones() into check_filesystem() and implement a list instead of iterating recursively over the clones, thus eliminating the risk of a stack overflow. Also use kmem_zalloc() to allocate large structures in process_error_log() reducing its stack size from ~700 to ~128 bytes. Reviewed-by: Tino Reichardt Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #14744 --- module/zfs/spa_errlog.c | 138 ++++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 54 deletions(-) diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 3bc8619b51a8..e0604c4a84af 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -72,6 +72,11 @@ #define NAME_MAX_LEN 64 +typedef struct clones { + uint64_t clone_ds; + list_node_t node; +} clones_t; + /* * spa_upgrade_errlog_limit : A zfs module parameter that controls the number * of on-disk error log entries that will be converted to the new @@ -135,10 +140,6 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) } #ifdef _KERNEL -static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, - uint64_t *snap_obj_array, zbookmark_err_phys_t *zep, void* uaddr, - uint64_t *count); - static void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) { @@ -291,7 +292,7 @@ copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count) */ static int check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, - void *uaddr, uint64_t *count) + void *uaddr, uint64_t *count, list_t *clones_list) { dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; @@ -412,24 +413,10 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, dsl_dataset_rele(ds, FTAG); } - if (zap_clone != 0 && aff_snap_count > 0) { - error = check_clones(spa, zap_clone, snap_count, snap_obj_array, - zep, uaddr, count); - } - -out: - kmem_free(snap_obj_array, sizeof (*snap_obj_array)); - return (error); -} + if (zap_clone == 0 || aff_snap_count == 0) + return (0); -/* - * Clone checking. - */ -static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, - uint64_t *snap_obj_array, zbookmark_err_phys_t *zep, void* uaddr, - uint64_t *count) -{ - int error = 0; + /* Check clones. 
*/ zap_cursor_t *zc; zap_attribute_t *za; @@ -440,7 +427,6 @@ static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { - dsl_pool_t *dp = spa->spa_dsl_pool; dsl_dataset_t *clone; error = dsl_dataset_hold_obj(dp, za->za_first_integer, FTAG, &clone); @@ -463,17 +449,17 @@ static int check_clones(spa_t *spa, uint64_t zap_clone, uint64_t snap_count, if (!found) continue; - error = check_filesystem(spa, za->za_first_integer, zep, - uaddr, count); - - if (error != 0) - break; + clones_t *ct = kmem_zalloc(sizeof (*ct), KM_SLEEP); + ct->clone_ds = za->za_first_integer; + list_insert_tail(clones_list, ct); } zap_cursor_fini(zc); kmem_free(za, sizeof (*za)); kmem_free(zc, sizeof (*zc)); +out: + kmem_free(snap_obj_array, sizeof (*snap_obj_array)); return (error); } @@ -523,8 +509,30 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, uint64_t top_affected_fs; int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs); if (error == 0) { + clones_t *ct; + list_t clones_list; + + list_create(&clones_list, sizeof (clones_t), + offsetof(clones_t, node)); + error = check_filesystem(spa, top_affected_fs, zep, - uaddr, count); + uaddr, count, &clones_list); + + while ((ct = list_remove_head(&clones_list)) != NULL) { + error = check_filesystem(spa, ct->clone_ds, zep, + uaddr, count, &clones_list); + kmem_free(ct, sizeof (*ct)); + + if (error) { + while (!list_is_empty(&clones_list)) { + ct = list_remove_head(&clones_list); + kmem_free(ct, sizeof (*ct)); + } + break; + } + } + + list_destroy(&clones_list); } return (error); @@ -827,62 +835,84 @@ spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx) static int process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count) { - zap_cursor_t zc; - zap_attribute_t za; - if (obj == 0) return (0); + zap_cursor_t *zc; + zap_attribute_t *za; + + zc = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { + for (zap_cursor_init(zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { if (*count == 0) { - zap_cursor_fini(&zc); + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); return (SET_ERROR(ENOMEM)); } zbookmark_phys_t zb; - name_to_bookmark(za.za_name, &zb); + name_to_bookmark(za->za_name, &zb); int error = copyout_entry(&zb, uaddr, count); if (error != 0) { - zap_cursor_fini(&zc); + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); return (error); } } - zap_cursor_fini(&zc); + zap_cursor_fini(zc); + kmem_free(zc, sizeof (*zc)); + kmem_free(za, sizeof (*za)); return (0); } - for (zap_cursor_init(&zc, spa->spa_meta_objset, obj); - zap_cursor_retrieve(&zc, &za) == 0; - zap_cursor_advance(&zc)) { + for (zap_cursor_init(zc, spa->spa_meta_objset, obj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + + zap_cursor_t *head_ds_cursor; + zap_attribute_t *head_ds_attr; - zap_cursor_t head_ds_cursor; - zap_attribute_t head_ds_attr; + head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); - uint64_t head_ds_err_obj = za.za_first_integer; + uint64_t head_ds_err_obj = za->za_first_integer; uint64_t head_ds; - name_to_object(za.za_name, 
&head_ds); - for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset, - head_ds_err_obj); zap_cursor_retrieve(&head_ds_cursor, - &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) { + name_to_object(za->za_name, &head_ds); + for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset, + head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor, + head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) { zbookmark_err_phys_t head_ds_block; - name_to_errphys(head_ds_attr.za_name, &head_ds_block); + name_to_errphys(head_ds_attr->za_name, &head_ds_block); int error = process_error_block(spa, head_ds, &head_ds_block, uaddr, count); if (error != 0) { - zap_cursor_fini(&head_ds_cursor); - zap_cursor_fini(&zc); + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, + sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); + + zap_cursor_fini(zc); + kmem_free(za, sizeof (*za)); + kmem_free(zc, sizeof (*zc)); return (error); } } - zap_cursor_fini(&head_ds_cursor); + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); } - zap_cursor_fini(&zc); + zap_cursor_fini(zc); + kmem_free(za, sizeof (*za)); + kmem_free(zc, sizeof (*zc)); return (0); } From a46001adb9b143eebf43cd7ca4b508c044f80f00 Mon Sep 17 00:00:00 2001 From: buzzingwires <131118055+buzzingwires@users.noreply.github.com> Date: Wed, 3 May 2023 12:03:57 -0400 Subject: [PATCH 083/180] Allow zhack label repair to restore detached devices. This commit expands on the zhack label repair command in d04b5c9 by adding the -u option to undetach a device by regenerating uberblocks, in addition to the existing functionality of fixing checksums, now represented by -c. Previous behavior is retained in the case of no options. The changes are heavily inspired by Jeff Bonwick's labelfix utility, as archived at: https://gist.github.com/jjwhitney/baaa63144da89726e482 Additionally, it is now capable of properly determining the size of block devices and other media, as well as handling sizes which are not divisible by 2^18. This should make it viable for use on physical devices and partitions, in addition to files. These changes should make it possible to import zpools that have had their uberblocks erased, such as in the case of pools rendered inaccessible by erroneous detach commands. 
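For reference, the resulting usage looks as follows (the device path and pool name are only illustrative):

    # Repair corrupted label checksums only (the pre-existing behavior,
    # and the default when no flag is given):
    zhack label repair -c /dev/sdb1
    # Regenerate the uberblocks of a device that was wrongly detached:
    zhack label repair -u /dev/sdb1
    # Do both in a single call, then retry the import from that device:
    zhack label repair -cu /dev/sdb1
    zpool import -d /dev/sdb1 tank

Note that -u refuses to touch a label whose checksums are corrupted, so such a device needs -c (or the combined -cu) first.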
Reviewed-by: Brian Behlendorf Signed-off-by: buzzingwires Closes #14773 --- cmd/zhack.c | 508 ++++++++++++++---- man/man1/zhack.1 | 23 +- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 6 +- .../functional/cli_root/zhack/library.kshlib | 361 +++++++++++++ .../cli_root/zhack/zhack_label_checksum.ksh | 64 --- .../cli_root/zhack/zhack_label_repair_001.ksh | 30 ++ .../cli_root/zhack/zhack_label_repair_002.ksh | 31 ++ .../cli_root/zhack/zhack_label_repair_003.ksh | 33 ++ .../cli_root/zhack/zhack_label_repair_004.ksh | 30 ++ 10 files changed, 928 insertions(+), 161 deletions(-) create mode 100644 tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib delete mode 100755 tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_checksum.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_002.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_003.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_004.ksh diff --git a/cmd/zhack.c b/cmd/zhack.c index 0b6da31ec573..44611887dd25 100644 --- a/cmd/zhack.c +++ b/cmd/zhack.c @@ -58,6 +58,12 @@ static importargs_t g_importargs; static char *g_pool; static boolean_t g_readonly; +typedef enum { + ZHACK_REPAIR_OP_UNKNOWN = 0, + ZHACK_REPAIR_OP_CKSUM = (1 << 0), + ZHACK_REPAIR_OP_UNDETACH = (1 << 1) +} zhack_repair_op_t; + static __attribute__((noreturn)) void usage(void) { @@ -81,7 +87,10 @@ usage(void) " : should be a feature guid\n" "\n" " label repair \n" - " repair corrupted label checksums\n" + " repair labels of a specified device according to options\n" + " which may be combined to do their functions in one call\n" + " -c repair corrupted label checksums\n" + " -u restore the label on a detached device\n" "\n" " : path to vdev\n"); exit(1); @@ -485,119 +494,398 @@ zhack_do_feature(int argc, char **argv) return (0); } +#define ASHIFT_UBERBLOCK_SHIFT(ashift) \ + MIN(MAX(ashift, UBERBLOCK_SHIFT), \ + MAX_UBERBLOCK_SHIFT) +#define ASHIFT_UBERBLOCK_SIZE(ashift) \ + (1ULL << ASHIFT_UBERBLOCK_SHIFT(ashift)) + +#define REPAIR_LABEL_STATUS_CKSUM (1 << 0) +#define REPAIR_LABEL_STATUS_UB (1 << 1) + static int -zhack_repair_label_cksum(int argc, char **argv) +zhack_repair_read_label(const int fd, vdev_label_t *vl, + const uint64_t label_offset, const int l) { - zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; - const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION, - ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID }; - boolean_t labels_repaired[VDEV_LABELS] = {0}; - boolean_t repaired = B_FALSE; - vdev_label_t labels[VDEV_LABELS] = {{{0}}}; - struct stat st; - int fd; + const int err = pread64(fd, vl, sizeof (vdev_label_t), label_offset); - abd_init(); + if (err == -1) { + (void) fprintf(stderr, + "error: cannot read label %d: %s\n", + l, strerror(errno)); + return (err); + } else if (err != sizeof (vdev_label_t)) { + (void) fprintf(stderr, + "error: bad label %d read size\n", l); + return (err); + } - argc -= 1; - argv += 1; + return (0); +} - if (argc < 1) { - (void) fprintf(stderr, "error: missing device\n"); - usage(); - } +static void +zhack_repair_calc_cksum(const int byteswap, void *data, const uint64_t offset, + const uint64_t abdsize, zio_eck_t *eck, zio_cksum_t *cksum) +{ + zio_cksum_t verifier; + zio_cksum_t current_cksum; + zio_checksum_info_t *ci; + abd_t *abd; - if ((fd = open(argv[0], O_RDWR)) == -1) - 
fatal(NULL, FTAG, "cannot open '%s': %s", argv[0], - strerror(errno)); + ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0); - if (stat(argv[0], &st) != 0) - fatal(NULL, FTAG, "cannot stat '%s': %s", argv[0], - strerror(errno)); + if (byteswap) + byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); - for (int l = 0; l < VDEV_LABELS; l++) { - uint64_t label_offset, offset; - zio_cksum_t expected_cksum; - zio_cksum_t actual_cksum; - zio_cksum_t verifier; - zio_eck_t *eck; - nvlist_t *cfg; - int byteswap; + current_cksum = eck->zec_cksum; + eck->zec_cksum = verifier; + + ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL]; + abd = abd_get_from_buf(data, abdsize); + ci->ci_func[byteswap](abd, abdsize, NULL, cksum); + abd_free(abd); + + eck->zec_cksum = current_cksum; +} + +static int +zhack_repair_check_label(uberblock_t *ub, const int l, const char **cfg_keys, + const size_t cfg_keys_len, nvlist_t *cfg, nvlist_t *vdev_tree_cfg, + uint64_t *ashift) +{ + int err; + + if (ub->ub_txg != 0) { + (void) fprintf(stderr, + "error: label %d: UB TXG of 0 expected, but got %" + PRIu64 "\n", + l, ub->ub_txg); + (void) fprintf(stderr, "It would appear the device was not " + "properly removed.\n"); + return (1); + } + + for (int i = 0; i < cfg_keys_len; i++) { uint64_t val; - ssize_t err; - - vdev_label_t *vl = &labels[l]; - - label_offset = vdev_label_offset(st.st_size, l, 0); - err = pread64(fd, vl, sizeof (vdev_label_t), label_offset); - if (err == -1) { - (void) fprintf(stderr, "error: cannot read " - "label %d: %s\n", l, strerror(errno)); - continue; - } else if (err != sizeof (vdev_label_t)) { - (void) fprintf(stderr, "error: bad label %d read size " - "\n", l); - continue; + err = nvlist_lookup_uint64(cfg, cfg_keys[i], &val); + if (err) { + (void) fprintf(stderr, + "error: label %d, %d: " + "cannot find nvlist key %s\n", + l, i, cfg_keys[i]); + return (err); } + } - err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist, - VDEV_PHYS_SIZE - sizeof (zio_eck_t), &cfg, 0); - if (err) { - (void) fprintf(stderr, "error: cannot unpack nvlist " - "label %d\n", l); - continue; + err = nvlist_lookup_nvlist(cfg, + ZPOOL_CONFIG_VDEV_TREE, &vdev_tree_cfg); + if (err) { + (void) fprintf(stderr, + "error: label %d: cannot find nvlist key %s\n", + l, ZPOOL_CONFIG_VDEV_TREE); + return (err); + } + + err = nvlist_lookup_uint64(vdev_tree_cfg, + ZPOOL_CONFIG_ASHIFT, ashift); + if (err) { + (void) fprintf(stderr, + "error: label %d: cannot find nvlist key %s\n", + l, ZPOOL_CONFIG_ASHIFT); + return (err); + } + + if (*ashift == 0) { + (void) fprintf(stderr, + "error: label %d: nvlist key %s is zero\n", + l, ZPOOL_CONFIG_ASHIFT); + return (err); + } + + return (0); +} + +static int +zhack_repair_undetach(uberblock_t *ub, nvlist_t *cfg, const int l) +{ + /* + * Uberblock root block pointer has valid birth TXG. 
+ * Copying it to the label NVlist + */ + if (ub->ub_rootbp.blk_birth != 0) { + const uint64_t txg = ub->ub_rootbp.blk_birth; + ub->ub_txg = txg; + + if (nvlist_remove_all(cfg, ZPOOL_CONFIG_CREATE_TXG) != 0) { + (void) fprintf(stderr, + "error: label %d: " + "Failed to remove pool creation TXG\n", + l); + return (1); } - for (int i = 0; i < ARRAY_SIZE(cfg_keys); i++) { - err = nvlist_lookup_uint64(cfg, cfg_keys[i], &val); - if (err) { - (void) fprintf(stderr, "error: label %d: " - "cannot find nvlist key %s\n", - l, cfg_keys[i]); - continue; - } + if (nvlist_remove_all(cfg, ZPOOL_CONFIG_POOL_TXG) != 0) { + (void) fprintf(stderr, + "error: label %d: Failed to remove pool TXG to " + "be replaced.\n", + l); + return (1); } - void *data = (char *)vl + offsetof(vdev_label_t, vl_vdev_phys); - eck = (zio_eck_t *)((char *)(data) + VDEV_PHYS_SIZE) - 1; + if (nvlist_add_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, txg) != 0) { + (void) fprintf(stderr, + "error: label %d: " + "Failed to add pool TXG of %" PRIu64 "\n", + l, txg); + return (1); + } + } + + return (0); +} - offset = label_offset + offsetof(vdev_label_t, vl_vdev_phys); - ZIO_SET_CHECKSUM(&verifier, offset, 0, 0, 0); +static boolean_t +zhack_repair_write_label(const int l, const int fd, const int byteswap, + void *data, zio_eck_t *eck, const uint64_t offset, const uint64_t abdsize) +{ + zio_cksum_t actual_cksum; + zhack_repair_calc_cksum(byteswap, data, offset, abdsize, eck, + &actual_cksum); + zio_cksum_t expected_cksum = eck->zec_cksum; + ssize_t err; + + if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) + return (B_FALSE); + + eck->zec_cksum = actual_cksum; + + err = pwrite64(fd, data, abdsize, offset); + if (err == -1) { + (void) fprintf(stderr, "error: cannot write label %d: %s\n", + l, strerror(errno)); + return (B_FALSE); + } else if (err != abdsize) { + (void) fprintf(stderr, "error: bad write size label %d\n", l); + return (B_FALSE); + } else { + (void) fprintf(stderr, + "label %d: wrote %" PRIu64 " bytes at offset %" PRIu64 "\n", + l, abdsize, offset); + } - byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); - if (byteswap) - byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); + return (B_TRUE); +} - expected_cksum = eck->zec_cksum; - eck->zec_cksum = verifier; +static void +zhack_repair_write_uberblock(vdev_label_t *vl, const int l, + const uint64_t ashift, const int fd, const int byteswap, + const uint64_t label_offset, uint32_t *labels_repaired) +{ + void *ub_data = + (char *)vl + offsetof(vdev_label_t, vl_uberblock); + zio_eck_t *ub_eck = + (zio_eck_t *) + ((char *)(ub_data) + (ASHIFT_UBERBLOCK_SIZE(ashift))) - 1; - abd_t *abd = abd_get_from_buf(data, VDEV_PHYS_SIZE); - ci->ci_func[byteswap](abd, VDEV_PHYS_SIZE, NULL, &actual_cksum); - abd_free(abd); + if (ub_eck->zec_magic != 0) { + (void) fprintf(stderr, + "error: label %d: " + "Expected Uberblock checksum magic number to " + "be 0, but got %" PRIu64 "\n", + l, ub_eck->zec_magic); + (void) fprintf(stderr, "It would appear there's already " + "a checksum for the uberblock.\n"); + return; + } - if (byteswap) - byteswap_uint64_array(&expected_cksum, - sizeof (zio_cksum_t)); - if (ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) - continue; + ub_eck->zec_magic = byteswap ? 
BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC; - eck->zec_cksum = actual_cksum; + if (zhack_repair_write_label(l, fd, byteswap, + ub_data, ub_eck, + label_offset + offsetof(vdev_label_t, vl_uberblock), + ASHIFT_UBERBLOCK_SIZE(ashift))) + labels_repaired[l] |= REPAIR_LABEL_STATUS_UB; +} - err = pwrite64(fd, data, VDEV_PHYS_SIZE, offset); - if (err == -1) { - (void) fprintf(stderr, "error: cannot write " - "label %d: %s\n", l, strerror(errno)); - continue; - } else if (err != VDEV_PHYS_SIZE) { - (void) fprintf(stderr, "error: bad write size " - "label %d\n", l); - continue; +static void +zhack_repair_print_cksum(FILE *stream, const zio_cksum_t *cksum) +{ + (void) fprintf(stream, + "%016llx:%016llx:%016llx:%016llx", + (u_longlong_t)cksum->zc_word[0], + (u_longlong_t)cksum->zc_word[1], + (u_longlong_t)cksum->zc_word[2], + (u_longlong_t)cksum->zc_word[3]); +} + +static int +zhack_repair_test_cksum(const int byteswap, void *vdev_data, + zio_eck_t *vdev_eck, const uint64_t vdev_phys_offset, const int l) +{ + const zio_cksum_t expected_cksum = vdev_eck->zec_cksum; + zio_cksum_t actual_cksum; + zhack_repair_calc_cksum(byteswap, vdev_data, vdev_phys_offset, + VDEV_PHYS_SIZE, vdev_eck, &actual_cksum); + const uint64_t expected_magic = byteswap ? + BSWAP_64(ZEC_MAGIC) : ZEC_MAGIC; + const uint64_t actual_magic = vdev_eck->zec_magic; + int err = 0; + if (actual_magic != expected_magic) { + (void) fprintf(stderr, "error: label %d: " + "Expected " + "the nvlist checksum magic number to not be %" + PRIu64 " not %" PRIu64 "\n", + l, expected_magic, actual_magic); + err = ECKSUM; + } + if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { + (void) fprintf(stderr, "error: label %d: " + "Expected the nvlist checksum to be ", l); + (void) zhack_repair_print_cksum(stderr, + &expected_cksum); + (void) fprintf(stderr, " not "); + zhack_repair_print_cksum(stderr, &actual_cksum); + (void) fprintf(stderr, "\n"); + err = ECKSUM; + } + return (err); +} + +static void +zhack_repair_one_label(const zhack_repair_op_t op, const int fd, + vdev_label_t *vl, const uint64_t label_offset, const int l, + uint32_t *labels_repaired) +{ + ssize_t err; + uberblock_t *ub = (uberblock_t *)vl->vl_uberblock; + void *vdev_data = + (char *)vl + offsetof(vdev_label_t, vl_vdev_phys); + zio_eck_t *vdev_eck = + (zio_eck_t *)((char *)(vdev_data) + VDEV_PHYS_SIZE) - 1; + const uint64_t vdev_phys_offset = + label_offset + offsetof(vdev_label_t, vl_vdev_phys); + const char *cfg_keys[] = { ZPOOL_CONFIG_VERSION, + ZPOOL_CONFIG_POOL_STATE, ZPOOL_CONFIG_GUID }; + nvlist_t *cfg; + nvlist_t *vdev_tree_cfg = NULL; + uint64_t ashift; + int byteswap; + + err = zhack_repair_read_label(fd, vl, label_offset, l); + if (err) + return; + + if (vdev_eck->zec_magic == 0) { + (void) fprintf(stderr, "error: label %d: " + "Expected the nvlist checksum magic number to not be zero" + "\n", + l); + (void) fprintf(stderr, "There should already be a checksum " + "for the label.\n"); + return; + } + + byteswap = + (vdev_eck->zec_magic == BSWAP_64((uint64_t)ZEC_MAGIC)); + + if (byteswap) { + byteswap_uint64_array(&vdev_eck->zec_cksum, + sizeof (zio_cksum_t)); + vdev_eck->zec_magic = BSWAP_64(vdev_eck->zec_magic); + } + + if ((op & ZHACK_REPAIR_OP_CKSUM) == 0 && + zhack_repair_test_cksum(byteswap, vdev_data, vdev_eck, + vdev_phys_offset, l) != 0) { + (void) fprintf(stderr, "It would appear checksums are " + "corrupted. 
Try zhack repair label -c \n"); + return; + } + + err = nvlist_unpack(vl->vl_vdev_phys.vp_nvlist, + VDEV_PHYS_SIZE - sizeof (zio_eck_t), &cfg, 0); + if (err) { + (void) fprintf(stderr, + "error: cannot unpack nvlist label %d\n", l); + return; + } + + err = zhack_repair_check_label(ub, + l, cfg_keys, ARRAY_SIZE(cfg_keys), cfg, vdev_tree_cfg, &ashift); + if (err) + return; + + if ((op & ZHACK_REPAIR_OP_UNDETACH) != 0) { + char *buf; + size_t buflen; + + err = zhack_repair_undetach(ub, cfg, l); + if (err) + return; + + buf = vl->vl_vdev_phys.vp_nvlist; + buflen = VDEV_PHYS_SIZE - sizeof (zio_eck_t); + if (nvlist_pack(cfg, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) { + (void) fprintf(stderr, + "error: label %d: Failed to pack nvlist\n", l); + return; } - fsync(fd); + zhack_repair_write_uberblock(vl, + l, ashift, fd, byteswap, label_offset, labels_repaired); + } + + if (zhack_repair_write_label(l, fd, byteswap, vdev_data, vdev_eck, + vdev_phys_offset, VDEV_PHYS_SIZE)) + labels_repaired[l] |= REPAIR_LABEL_STATUS_CKSUM; + + fsync(fd); +} + +static const char * +zhack_repair_label_status(const uint32_t label_status, + const uint32_t to_check) +{ + return ((label_status & to_check) != 0 ? "repaired" : "skipped"); +} + +static int +zhack_label_repair(const zhack_repair_op_t op, const int argc, char **argv) +{ + uint32_t labels_repaired[VDEV_LABELS] = {0}; + vdev_label_t labels[VDEV_LABELS] = {{{0}}}; + struct stat64 st; + int fd; + off_t filesize; + uint32_t repaired = 0; + + abd_init(); + + if (argc < 1) { + (void) fprintf(stderr, "error: missing device\n"); + usage(); + } + + if ((fd = open(argv[0], O_RDWR)) == -1) + fatal(NULL, FTAG, "cannot open '%s': %s", argv[0], + strerror(errno)); + + if (fstat64_blk(fd, &st) != 0) + fatal(NULL, FTAG, "cannot stat '%s': %s", argv[0], + strerror(errno)); + + filesize = st.st_size; + (void) fprintf(stderr, "Calculated filesize to be %jd\n", + (intmax_t)filesize); + + if (filesize % sizeof (vdev_label_t) != 0) + filesize = + (filesize / sizeof (vdev_label_t)) * sizeof (vdev_label_t); - labels_repaired[l] = B_TRUE; + for (int l = 0; l < VDEV_LABELS; l++) { + zhack_repair_one_label(op, fd, &labels[l], + vdev_label_offset(filesize, l, 0), l, labels_repaired); } close(fd); @@ -605,17 +893,51 @@ zhack_repair_label_cksum(int argc, char **argv) abd_fini(); for (int l = 0; l < VDEV_LABELS; l++) { - (void) printf("label %d: %s\n", l, - labels_repaired[l] ? 
"repaired" : "skipped"); - repaired |= labels_repaired[l]; + const uint32_t lr = labels_repaired[l]; + (void) printf("label %d: ", l); + (void) printf("uberblock: %s ", + zhack_repair_label_status(lr, REPAIR_LABEL_STATUS_UB)); + (void) printf("checksum: %s\n", + zhack_repair_label_status(lr, REPAIR_LABEL_STATUS_CKSUM)); + repaired |= lr; } - if (repaired) + if (repaired > 0) return (0); return (1); } +static int +zhack_do_label_repair(int argc, char **argv) +{ + zhack_repair_op_t op = ZHACK_REPAIR_OP_UNKNOWN; + int c; + + optind = 1; + while ((c = getopt(argc, argv, "+cu")) != -1) { + switch (c) { + case 'c': + op |= ZHACK_REPAIR_OP_CKSUM; + break; + case 'u': + op |= ZHACK_REPAIR_OP_UNDETACH; + break; + default: + usage(); + break; + } + } + + argc -= optind; + argv += optind; + + if (op == ZHACK_REPAIR_OP_UNKNOWN) + op = ZHACK_REPAIR_OP_CKSUM; + + return (zhack_label_repair(op, argc, argv)); +} + static int zhack_do_label(int argc, char **argv) { @@ -632,7 +954,7 @@ zhack_do_label(int argc, char **argv) subcommand = argv[0]; if (strcmp(subcommand, "repair") == 0) { - err = zhack_repair_label_cksum(argc, argv); + err = zhack_do_label_repair(argc, argv); } else { (void) fprintf(stderr, "error: unknown subcommand: %s\n", subcommand); diff --git a/man/man1/zhack.1 b/man/man1/zhack.1 index 26b8156b4008..937f1e9168c2 100644 --- a/man/man1/zhack.1 +++ b/man/man1/zhack.1 @@ -98,10 +98,29 @@ feature is now required to read the pool MOS. .It Xo .Nm zhack .Cm label repair +.Op Fl cu .Ar device .Xc -Repair corrupted labels by rewriting the checksum using the presumed valid -contents of the label. +Repair labels of a specified +.Ar device +according to options. +.Pp +Flags may be combined to do their functions simultaneously. +. +.Pp +The +.Fl c +flag repairs corrupted label checksums +. +.Pp +The +.Fl u +flag restores the label on a detached device +.Pp +Example: +.Nm zhack Cm label repair Fl cu Ar device + Fix checksums and undetach a device +. .El . 
.Sh GLOBAL OPTIONS diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 55991cfeaf78..3730f2b27038 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -325,7 +325,8 @@ tests = ['zfs_wait_deleteq', 'zfs_wait_getsubopt'] tags = ['functional', 'cli_root', 'zfs_wait'] [tests/functional/cli_root/zhack] -tests = ['zhack_label_checksum'] +tests = ['zhack_label_repair_001', 'zhack_label_repair_002', + 'zhack_label_repair_003', 'zhack_label_repair_004'] pre = post = tags = ['functional', 'cli_root', 'zhack'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 74295b86ddc2..0112d28d0c19 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -250,6 +250,7 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/cli_root/zpool_upgrade/zpool_upgrade.cfg \ functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib \ functional/cli_root/zpool_wait/zpool_wait.kshlib \ + functional/cli_root/zhack/library.kshlib \ functional/cli_user/misc/misc.cfg \ functional/cli_user/zfs_list/zfs_list.cfg \ functional/cli_user/zfs_list/zfs_list.kshlib \ @@ -932,7 +933,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs/zfs_001_neg.ksh \ functional/cli_root/zfs/zfs_002_pos.ksh \ functional/cli_root/zfs/zfs_003_neg.ksh \ - functional/cli_root/zhack/zhack_label_checksum.ksh \ + functional/cli_root/zhack/zhack_label_repair_001.ksh \ + functional/cli_root/zhack/zhack_label_repair_002.ksh \ + functional/cli_root/zhack/zhack_label_repair_003.ksh \ + functional/cli_root/zhack/zhack_label_repair_004.ksh \ functional/cli_root/zpool_add/add_nested_replacing_spare.ksh \ functional/cli_root/zpool_add/add-o_ashift.ksh \ functional/cli_root/zpool_add/add_prop_ashift.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib b/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib new file mode 100644 index 000000000000..880a78861630 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/library.kshlib @@ -0,0 +1,361 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2021 by vStack. All rights reserved. +# + +. "$STF_SUITE"/include/libtest.shlib +. "$STF_SUITE"/include/blkdev.shlib + +# +# Description: +# +# Test whether zhack label repair commands can recover detached devices +# and corrupted checksums with a variety of sizes, and ensure +# the purposes of either command is cleanly separated from the others. +# +# Strategy: +# +# Tests are done on loopback devices with sizes divisible by label size and sizes that are not. +# +# Test one: +# +# 1. Create pool on a loopback device with some test data +# 2. Export the pool. +# 3. Corrupt all label checksums in the pool +# 4. Check that pool cannot be imported +# 5. Verify that it cannot be imported after using zhack label repair -u +# to ensure that the -u option will quit on corrupted checksums. +# 6. Use zhack label repair -c on device +# 7. Check that pool can be imported and that data is intact +# +# Test two: +# +# 1. Create pool on a loopback device with some test data +# 2. 
Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Verify that the remaining detached device cannot be imported +# 6. Verify that it cannot be imported after using zhack label repair -c +# to ensure that the -c option will not undetach a device. +# 7. Use zhack label repair -u on device +# 8. Verify that the detached device can be imported and that data is intact +# +# Test three: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Corrupt all label checksums on the remaining device +# 6. Verify that the remaining detached device cannot be imported +# 7. Verify that it cannot be imported after using zhack label repair -u +# to ensure that the -u option will quit on corrupted checksums. +# 8. Verify that it cannot be imported after using zhack label repair -c +# -c should repair the checksums, but not undetach a device. +# 9. Use zhack label repair -u on device +# 10. Verify that the detached device can be imported and that data is intact +# +# Test four: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Corrupt all label checksums on the remaining device +# 6. Verify that the remaining detached device cannot be imported +# 7. Use zhack label repair -cu on device to attempt to fix checksums and +# undetach the device in a single operation. +# 8. Verify that the detached device can be imported and that data is intact +# + +log_assert "Verify zhack label repair will repair label checksums and uberblocks" +log_onexit cleanup + +LABEL_SIZE="$((2**18))" +LABEL_NVLIST_END="$((LABEL_SIZE / 2))" +LABEL_CKSUM_SIZE="32" +LABEL_CKSUM_START="$(( LABEL_NVLIST_END - LABEL_CKSUM_SIZE ))" + +VIRTUAL_DISK=$TEST_BASE_DIR/disk +VIRTUAL_MIRROR_DISK=$TEST_BASE_DIR/mirrordisk + +VIRTUAL_DEVICE= +VIRTUAL_MIRROR_DEVICE= + +function cleanup_lo +{ + L_DEVICE="$1" + + if [[ -e $L_DEVICE ]]; then + if is_linux; then + log_must losetup -d "$L_DEVICE" + elif is_freebsd; then + log_must mdconfig -d -u "$L_DEVICE" + else + log_must lofiadm -d "$L_DEVICE" + fi + fi +} + +function cleanup +{ + poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL" + cleanup_lo "$VIRTUAL_DEVICE" + cleanup_lo "$VIRTUAL_MIRROR_DEVICE" + VIRTUAL_DEVICE= + VIRTUAL_MIRROR_DEVICE= + [[ -f "$VIRTUAL_DISK" ]] && log_must rm "$VIRTUAL_DISK" + [[ -f "$VIRTUAL_MIRROR_DISK" ]] && log_must rm "$VIRTUAL_MIRROR_DISK" +} + +RAND_MAX="$((2**15 - 1))" +function get_devsize +{ + if [ "$RANDOM" -gt "$(( RAND_MAX / 2 ))" ]; then + echo "$(( MINVDEVSIZE + RANDOM ))" + else + echo "$MINVDEVSIZE" + fi +} + +function pick_logop +{ + L_SHOULD_SUCCEED="$1" + + l_logop="log_mustnot" + if [ "$L_SHOULD_SUCCEED" == true ]; then + l_logop="log_must" + fi + + echo "$l_logop" +} + +function check_dataset +{ + L_SHOULD_SUCCEED="$1" + L_LOGOP="$(pick_logop "$L_SHOULD_SUCCEED")" + + "$L_LOGOP" mounted "$TESTPOOL"/"$TESTFS" + + "$L_LOGOP" test -f "$TESTDIR"/"test" +} + +function setup_dataset +{ + log_must zfs create "$TESTPOOL"/"$TESTFS" + + log_must mkdir -p "$TESTDIR" + log_must zfs set mountpoint="$TESTDIR" "$TESTPOOL"/"$TESTFS" + + log_must mounted "$TESTPOOL"/"$TESTFS" + + log_must touch "$TESTDIR"/"test" + log_must test -f "$TESTDIR"/"test" + + log_must zpool sync "$TESTPOOL" + + check_dataset true +} + +function 
get_practical_size +{ + L_SIZE="$1" + + if [ "$((L_SIZE % LABEL_SIZE))" -ne 0 ]; then + echo "$(((L_SIZE / LABEL_SIZE) * LABEL_SIZE))" + else + echo "$L_SIZE" + fi +} + +function corrupt_sized_label_checksum +{ + L_SIZE="$1" + L_LABEL="$2" + L_DEVICE="$3" + + L_PRACTICAL_SIZE="$(get_practical_size "$L_SIZE")" + + typeset -a L_OFFSETS=("$LABEL_CKSUM_START" \ + "$((LABEL_SIZE + LABEL_CKSUM_START))" \ + "$(((L_PRACTICAL_SIZE - LABEL_SIZE*2) + LABEL_CKSUM_START))" \ + "$(((L_PRACTICAL_SIZE - LABEL_SIZE) + LABEL_CKSUM_START))") + + dd if=/dev/urandom of="$L_DEVICE" \ + seek="${L_OFFSETS["$L_LABEL"]}" bs=1 count="$LABEL_CKSUM_SIZE" \ + conv=notrunc +} + +function corrupt_labels +{ + L_SIZE="$1" + L_DISK="$2" + + corrupt_sized_label_checksum "$L_SIZE" 0 "$L_DISK" + corrupt_sized_label_checksum "$L_SIZE" 1 "$L_DISK" + corrupt_sized_label_checksum "$L_SIZE" 2 "$L_DISK" + corrupt_sized_label_checksum "$L_SIZE" 3 "$L_DISK" +} + +function try_import_and_repair +{ + L_REPAIR_SHOULD_SUCCEED="$1" + L_IMPORT_SHOULD_SUCCEED="$2" + L_OP="$3" + L_POOLDISK="$4" + L_REPAIR_LOGOP="$(pick_logop "$L_REPAIR_SHOULD_SUCCEED")" + L_IMPORT_LOGOP="$(pick_logop "$L_IMPORT_SHOULD_SUCCEED")" + + log_mustnot zpool import "$TESTPOOL" -d "$L_POOLDISK" + + "$L_REPAIR_LOGOP" zhack label repair "$L_OP" "$L_POOLDISK" + + "$L_IMPORT_LOGOP" zpool import "$TESTPOOL" -d "$L_POOLDISK" + + check_dataset "$L_IMPORT_SHOULD_SUCCEED" +} + +function prepare_vdev +{ + L_SIZE="$1" + L_BACKFILE="$2" + + l_devname= + if truncate -s "$L_SIZE" "$L_BACKFILE"; then + if is_linux; then + l_devname="$(losetup -f "$L_BACKFILE" --show)" + elif is_freebsd; then + l_devname=/dev/"$(mdconfig -a -t vnode -f "$L_BACKFILE")" + else + l_devname="$(lofiadm -a "$L_BACKFILE")" + fi + fi + echo "$l_devname" +} + +function run_test_one +{ + L_SIZE="$1" + + VIRTUAL_DEVICE="$(prepare_vdev "$L_SIZE" "$VIRTUAL_DISK")" + log_must test -e "$VIRTUAL_DEVICE" + + log_must zpool create "$TESTPOOL" "$VIRTUAL_DEVICE" + + setup_dataset + + log_must zpool export "$TESTPOOL" + + corrupt_labels "$L_SIZE" "$VIRTUAL_DISK" + + try_import_and_repair false false "-u" "$VIRTUAL_DEVICE" + + try_import_and_repair true true "-c" "$VIRTUAL_DEVICE" + + cleanup + + log_pass "zhack label repair corruption test passed with a randomized size of $L_SIZE" +} + +function make_mirrored_pool +{ + L_SIZE="$1" + + VIRTUAL_DEVICE="$(prepare_vdev "$L_SIZE" "$VIRTUAL_DISK")" + log_must test -e "$VIRTUAL_DEVICE" + VIRTUAL_MIRROR_DEVICE="$(prepare_vdev "$L_SIZE" "$VIRTUAL_MIRROR_DISK")" + log_must test -e "$VIRTUAL_MIRROR_DEVICE" + + log_must zpool create "$TESTPOOL" "$VIRTUAL_DEVICE" + log_must zpool attach "$TESTPOOL" "$VIRTUAL_DEVICE" "$VIRTUAL_MIRROR_DEVICE" +} + +function export_and_cleanup_vdisk +{ + log_must zpool export "$TESTPOOL" + + cleanup_lo "$VIRTUAL_DEVICE" + + VIRTUAL_DEVICE= + + log_must rm "$VIRTUAL_DISK" +} + +function run_test_two +{ + L_SIZE="$1" + + make_mirrored_pool "$L_SIZE" + + setup_dataset + + log_must zpool detach "$TESTPOOL" "$VIRTUAL_MIRROR_DEVICE" + + export_and_cleanup_vdisk + + try_import_and_repair false false "-c" "$VIRTUAL_MIRROR_DEVICE" + + try_import_and_repair true true "-u" "$VIRTUAL_MIRROR_DEVICE" + + cleanup + + log_pass "zhack label repair detached test passed with a randomized size of $L_SIZE" +} + +function run_test_three +{ + L_SIZE="$1" + + make_mirrored_pool "$L_SIZE" + + setup_dataset + + log_must zpool detach "$TESTPOOL" "$VIRTUAL_MIRROR_DEVICE" + + export_and_cleanup_vdisk + + corrupt_labels "$L_SIZE" "$VIRTUAL_MIRROR_DISK" + + try_import_and_repair 
false false "-u" "$VIRTUAL_MIRROR_DEVICE" + + try_import_and_repair true false "-c" "$VIRTUAL_MIRROR_DEVICE" + + try_import_and_repair true true "-u" "$VIRTUAL_MIRROR_DEVICE" + + cleanup + + log_pass "zhack label repair corruption and detached test passed with a randomized size of $L_SIZE" +} + +function run_test_four +{ + L_SIZE="$1" + + make_mirrored_pool "$L_SIZE" + + setup_dataset + + log_must zpool detach "$TESTPOOL" "$VIRTUAL_MIRROR_DEVICE" + + export_and_cleanup_vdisk + + corrupt_labels "$L_SIZE" "$VIRTUAL_MIRROR_DISK" + + try_import_and_repair true true "-cu" "$VIRTUAL_MIRROR_DEVICE" + + cleanup + + log_pass "zhack label repair corruption and detached single-command test passed with a randomized size of $L_SIZE." +} diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_checksum.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_checksum.ksh deleted file mode 100755 index 67c7e7c4487d..000000000000 --- a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_checksum.ksh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/ksh - -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright (c) 2021 by vStack. All rights reserved. -# - -. $STF_SUITE/include/libtest.shlib -. $STF_SUITE/include/blkdev.shlib - -# -# Description: -# zhack label repair will calculate and rewrite label checksum if invalid -# -# Strategy: -# 1. Create pool with some number of vdevs and export it -# 2. Corrupt all labels checksums -# 3. Check that pool cannot be imported -# 4. Use zhack to repair labels checksums -# 5. Check that pool can be imported -# - -log_assert "Verify zhack label repair will repair labels checksums" -log_onexit cleanup - -VIRTUAL_DISK=$TEST_BASE_DIR/disk - -function cleanup -{ - poolexists $TESTPOOL && destroy_pool $TESTPOOL - [[ -f $VIRTUAL_DISK ]] && log_must rm $VIRTUAL_DISK -} - -log_must truncate -s $(($MINVDEVSIZE * 8)) $VIRTUAL_DISK - -log_must zpool create $TESTPOOL $VIRTUAL_DISK -log_must zpool export $TESTPOOL - -log_mustnot zhack label repair $VIRTUAL_DISK - -corrupt_label_checksum 0 $VIRTUAL_DISK -corrupt_label_checksum 1 $VIRTUAL_DISK -corrupt_label_checksum 2 $VIRTUAL_DISK -corrupt_label_checksum 3 $VIRTUAL_DISK - -log_mustnot zpool import $TESTPOOL -d $TEST_BASE_DIR - -log_must zhack label repair $VIRTUAL_DISK - -log_must zpool import $TESTPOOL -d $TEST_BASE_DIR - -cleanup - -log_pass "zhack label repair works correctly." diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh new file mode 100755 index 000000000000..2a511e9efcb6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_001.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# + +# +# Description: +# +# Test whether zhack label repair can recover +# corrupted checksums on devices of varied size, +# but not undetached devices. +# +# Strategy: +# +# 1. 
Create pool on a loopback device with some test data +# 2. Export the pool. +# 3. Corrupt all label checksums in the pool +# 4. Check that pool cannot be imported +# 5. Verify that it cannot be imported after using zhack label repair -u +# to ensure that the -u option will quit on corrupted checksums. +# 6. Use zhack label repair -c on device +# 7. Check that pool can be imported and that data is intact + +. "$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib + +run_test_one "$(get_devsize)" diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_002.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_002.ksh new file mode 100755 index 000000000000..4f1e61a39857 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_002.ksh @@ -0,0 +1,31 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# + +# +# Description: +# +# Test whether zhack label repair can recover +# detached drives on devices of varied size, but not +# repair corrupted checksums. +# +# Strategy: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Verify that the remaining detached device cannot be imported +# 6. Verify that it cannot be imported after using zhack label repair -c +# to ensure that the -c option will not undetach a device. +# 7. Use zhack label repair -u on device +# 8. Verify that the detached device can be imported and that data is intact + +. "$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib + +run_test_two "$(get_devsize)" diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_003.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_003.ksh new file mode 100755 index 000000000000..7e82363d2f46 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_003.ksh @@ -0,0 +1,33 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# + +# +# Description: +# +# Test whether zhack label repair can recover a device of varied size with +# corrupted checksums and which has been detached. +# +# Strategy: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Corrupt all label checksums on the remaining device +# 6. Verify that the remaining detached device cannot be imported +# 7. Verify that it cannot be imported after using zhack label repair -u +# to ensure that the -u option will quit on corrupted checksums. +# 8. Verify that it cannot be imported after using zhack label repair -c +# -c should repair the checksums, but not undetach a device. +# 9. Use zhack label repair -u on device +# 10. Verify that the detached device can be imported and that data is intact + +. 
"$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib + +run_test_three "$(get_devsize)" diff --git a/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_004.ksh b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_004.ksh new file mode 100755 index 000000000000..0b739402b199 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zhack/zhack_label_repair_004.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# + +# +# Description: +# +# Test whether zhack label repair can recover a device of varied size with +# corrupted checksums and which has been detached (in one command). +# +# Strategy: +# +# 1. Create pool on a loopback device with some test data +# 2. Detach either device from the mirror +# 3. Export the pool +# 4. Remove the non-detached device and its backing file +# 5. Corrupt all label checksums on the remaining device +# 6. Verify that the remaining detached device cannot be imported +# 7. Use zhack label repair -cu on device to attempt to fix checksums and +# undetach the device in a single operation. +# 8. Verify that the detached device can be imported and that data is intact + +. "$STF_SUITE"/tests/functional/cli_root/zhack/library.kshlib + +run_test_four "$(get_devsize)" From 82ac409acc77935ae366b800ee7cefb14939bbae Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 4 May 2023 03:10:32 +0500 Subject: [PATCH 084/180] zpool import -m also removing spare and cache when log device is missing spa_import() relies on a pool config fetched by spa_try_import() for spare/cache devices. Import flags are not passed to spa_tryimport(), which makes it return early due to a missing log device and missing retrieving the cache device and spare eventually. Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch the correct configuration regardless of the missing log device. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Ameer Hamza Closes #14794 --- module/zfs/spa.c | 10 +++ tests/runfiles/common.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../zpool_import/import_log_missing.ksh | 75 +++++++++++++++++++ 4 files changed, 87 insertions(+), 1 deletion(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/import_log_missing.ksh diff --git a/module/zfs/spa.c b/module/zfs/spa.c index dd4a442d97a1..c2a67fbc7c55 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -6378,6 +6378,16 @@ spa_tryimport(nvlist_t *tryconfig) spa->spa_config_source = SPA_CONFIG_SRC_SCAN; } + /* + * spa_import() relies on a pool config fetched by spa_try_import() + * for spare/cache devices. Import flags are not passed to + * spa_tryimport(), which makes it return early due to a missing log + * device and missing retrieving the cache device and spare eventually. + * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch + * the correct configuration regardless of the missing log device. 
+ */ + spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG; + error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); /* diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 3730f2b27038..e2137ac596d9 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -422,7 +422,7 @@ tests = ['zpool_import_001_pos', 'zpool_import_002_pos', 'import_cachefile_mirror_detached', 'import_cachefile_paths_changed', 'import_cachefile_shared_device', - 'import_devices_missing', + 'import_devices_missing', 'import_log_missing', 'import_paths_changed', 'import_rewind_config_changed', 'import_rewind_device_replaced'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 0112d28d0c19..9299a4ca9b47 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1056,6 +1056,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_import/import_cachefile_paths_changed.ksh \ functional/cli_root/zpool_import/import_cachefile_shared_device.ksh \ functional/cli_root/zpool_import/import_devices_missing.ksh \ + functional/cli_root/zpool_import/import_log_missing.ksh \ functional/cli_root/zpool_import/import_paths_changed.ksh \ functional/cli_root/zpool_import/import_rewind_config_changed.ksh \ functional/cli_root/zpool_import/import_rewind_device_replaced.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_log_missing.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_log_missing.ksh new file mode 100755 index 000000000000..f12cac78540f --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/import_log_missing.ksh @@ -0,0 +1,75 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +. $STF_SUITE/tests/functional/cli_root/zpool_import/zpool_import.kshlib + +# +# DESCRIPTION: +# Import with missing log device should not remove spare/cache. +# +# STRATEGY: +# 1. Create a pool. +# 2. Add spare, cache and log devices to the pool. +# 3. Export the pool. +# 4. Remove the log device. +# 5. Import the pool with -m flag. +# 6. Verify that spare and cache are still present in the pool. +# + +verify_runnable "global" + +log_onexit cleanup + +function test_missing_log +{ + typeset poolcreate="$1" + typeset cachevdev="$2" + typeset sparevdev="$3" + typeset logvdev="$4" + typeset missingvdev="$4" + + log_note "$0: pool '$poolcreate', adding $cachevdev, $sparevdev," \ + "$logvdev then moving away $missingvdev." 
+ + log_must zpool create $TESTPOOL1 $poolcreate + + log_must zpool add $TESTPOOL1 cache $cachevdev spare $sparevdev \ + log $logvdev + + log_must_busy zpool export $TESTPOOL1 + + log_must mv $missingvdev $BACKUP_DEVICE_DIR + + log_must zpool import -m -d $DEVICE_DIR $TESTPOOL1 + + CACHE_PRESENT=$(zpool status -v $TESTPOOL1 | grep $cachevdev) + + SPARE_PRESENT=$(zpool status -v $TESTPOOL1 | grep $sparevdev) + + if [ -z "$CACHE_PRESENT"] || [ -z "SPARE_PRESENT"] + then + log_fail "cache/spare vdev missing after importing with missing" \ + "log device" + fi + + # Cleanup + log_must zpool destroy $TESTPOOL1 + + log_note "" +} + +log_must mkdir -p $BACKUP_DEVICE_DIR + +test_missing_log "$VDEV0" "$VDEV1" "$VDEV2" "$VDEV3" + +log_pass "zpool import succeeded with missing log device" From 599df8204962036c7b1039a8613c13bdb69c2e61 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Sat, 6 May 2023 00:51:41 +0900 Subject: [PATCH 085/180] Plug memory leak in zfsdev_state. On kernel module unload, free all zfsdev state structures, except for zfsdev_state_listhead, which is statically allocated. Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14824 --- module/zfs/zfs_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 22e644f75f95..3b1e2ae5fb5d 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -7862,6 +7862,8 @@ zfs_kmod_fini(void) zfs_onexit_destroy(zs->zs_onexit); if (zs->zs_zevent) zfs_zevent_destroy(zs->zs_zevent); + if (zs != &zfsdev_state_listhead) + kmem_free(zs, sizeof (zfsdev_state_t)); } zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ From 6fa6bb051c2b83f90dc12a64e63d1cb2b0d12c96 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Sat, 6 May 2023 01:09:12 +0900 Subject: [PATCH 086/180] Simplify and optimize random_int_between(). Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14805 --- tests/zfs-tests/include/math.shlib | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tests/zfs-tests/include/math.shlib b/tests/zfs-tests/include/math.shlib index 38d9fecea7cf..da1e77e5fb97 100644 --- a/tests/zfs-tests/include/math.shlib +++ b/tests/zfs-tests/include/math.shlib @@ -118,9 +118,7 @@ function verify_ne # # A simple function to get a random number between two bounds (inclusive) # -# Probably not the most efficient for large ranges, but it's okay. -# -# Note since we're using $RANDOM, 32767 is the largest number we +# Note since we're using $RANDOM, $min+32767 is the largest number we # can accept as the upper bound. # # $1 lower bound @@ -129,11 +127,6 @@ function random_int_between { typeset -i min=$1 typeset -i max=$2 - typeset -i rand=0 - - while [[ $rand -lt $min ]] ; do - rand=$(( $RANDOM % $max + 1)) - done - echo $rand + echo $(( (RANDOM % (max - min + 1)) + min )) } From 190290a9ac3f2f0dd0021646f2fd787ea51b08bd Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 5 May 2023 12:17:55 -0400 Subject: [PATCH 087/180] Fix two abd_gang_add_gang() issues. - There is no reason to assert that added gang is not empty. It may be weird to add an empty gang, but it is legal. - When moving chain list from the added gang clear its size, or it will trigger assertion in abd_verify() when that gang is freed. Reviewed-by: Brian Atkinson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #14816 --- module/zfs/abd.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 26222d2efe3f..745ee8f02ed4 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -370,7 +370,20 @@ abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) * will retain all the free_on_free settings after being * added to the parents list. */ +#ifdef ZFS_DEBUG + /* + * If cabd had abd_parent, we have to drop it here. We can't + * transfer it to pabd, nor we can clear abd_size leaving it. + */ + if (cabd->abd_parent != NULL) { + (void) zfs_refcount_remove_many( + &cabd->abd_parent->abd_children, + cabd->abd_size, cabd); + cabd->abd_parent = NULL; + } +#endif pabd->abd_size += cabd->abd_size; + cabd->abd_size = 0; list_move_tail(&ABD_GANG(pabd).abd_gang_chain, &ABD_GANG(cabd).abd_gang_chain); ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); @@ -408,7 +421,6 @@ abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) */ if (abd_is_gang(cabd)) { ASSERT(!list_link_active(&cabd->abd_gang_link)); - ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); return (abd_gang_add_gang(pabd, cabd, free_on_free)); } ASSERT(!abd_is_gang(cabd)); From 245f4a346779e29fe995f69d8eb2b724cddf5277 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 8 May 2023 10:09:30 -0700 Subject: [PATCH 088/180] ZTS: add snapshot/snapshot_002_pos exception Add snapshot_002_pos to the known list of occasional failures for FreeBSD until it can be made entirely reliable. Reviewed-by: Tino Reichardt Signed-off-by: Brian Behlendorf Issue #14831 Closes #14832 --- tests/test-runner/bin/zts-report.py.in | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index f3cfca912a57..63470bc041c6 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -257,6 +257,7 @@ if sys.platform.startswith('freebsd'): 'resilver/resilver_restart_001': ['FAIL', known_reason], 'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622], 'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623], + 'snapshot/snapshot_002_pos': ['FAIL', '14831'], }) elif sys.platform.startswith('linux'): maybe.update({ From dd19821149cb7e3785249eb9be75dd9864c88d56 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 8 May 2023 11:17:41 -0700 Subject: [PATCH 089/180] zdb: consistent xattr output When using zdb to output the value of an xattr only interpret it as printable characters if the entire byte array is printable. Additionally, if the --parseable option is set always output the buffer contents as octal for easy parsing. 
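For illustration, here is a minimal standalone sketch of the two-pass approach described above (the helper name print_xattr_value and the sample data are invented for this example; the real change is the dump_znode_sa_xattr() hunk below): first decide whether every byte of the value is printable, then emit it either verbatim or entirely as octal escapes.

    #include <ctype.h>
    #include <stdio.h>

    /* Hypothetical helper mirroring the logic described above. */
    static void
    print_xattr_value(const unsigned char *value, unsigned int cnt,
        int parseable)
    {
        int can_print = !parseable;          /* -P always forces octal */

        for (unsigned int i = 0; i < cnt; i++) {
            if (!isprint(value[i])) {
                can_print = 0;               /* one binary byte: all octal */
                break;
            }
        }

        for (unsigned int i = 0; i < cnt; i++) {
            if (can_print)
                (void) putchar(value[i]);
            else
                (void) printf("\\%3.3o", value[i]);
        }
    }

    int
    main(void)
    {
        const unsigned char text[] = "hello";
        const unsigned char blob[] = { 'h', 'i', 0x01 };

        print_xattr_value(text, 5, 0);       /* prints: hello */
        (void) putchar('\n');
        print_xattr_value(blob, 3, 0);       /* prints: \150\151\001 */
        (void) putchar('\n');
        return (0);
    }
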
Reviewed-by: Olaf Faaland Signed-off-by: Brian Behlendorf Closes #14830 --- cmd/zdb/zdb.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index ec5d1acacf85..cea80b690841 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -3322,13 +3322,22 @@ dump_znode_sa_xattr(sa_handle_t *hdl) (void) printf("\tSA xattrs: %d bytes, %d entries\n\n", sa_xattr_size, sa_xattr_entries); while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) { + boolean_t can_print = !dump_opt['P']; uchar_t *value; uint_t cnt, idx; (void) printf("\t\t%s = ", nvpair_name(elem)); nvpair_value_byte_array(elem, &value, &cnt); + + for (idx = 0; idx < cnt; ++idx) { + if (!isprint(value[idx])) { + can_print = B_FALSE; + break; + } + } + for (idx = 0; idx < cnt; ++idx) { - if (isprint(value[idx])) + if (can_print) (void) putchar(value[idx]); else (void) printf("\\%3.3o", value[idx]); From 3095ca91c261756c509d0afb4422027753e68c90 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Mon, 8 May 2023 11:20:23 -0700 Subject: [PATCH 090/180] Verify block pointers before writing them out If a block pointer is corrupted (but the block containing it checksums correctly, e.g. due to a bug that overwrites random memory), we can often detect it before the block is read, with the `zfs_blkptr_verify()` function, which is used in `arc_read()`, `zio_free()`, etc. However, such corruption is not typically recoverable. To recover from it we would need to detect the memory error before the block pointer is written to disk. This PR verifies BP's that are contained in indirect blocks and dnodes before they are written to disk, in `dbuf_write_ready()`. This way, we'll get a panic before the on-disk data is corrupted. This will help us to diagnose what's causing the corruption, as well as being much easier to recover from. To minimize performance impact, only checks that can be done without holding the spa_config_lock are performed. Additionally, when corruption is detected, the raw words of the block pointer are logged. (Note that `dprintf_bp()` is a no-op by default, but if enabled it is not safe to use with invalid block pointers.) 
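To make the new locking contract easier to follow, here is a small self-contained model of the control flow (illustrative only, not the kernel code; fields_ok() and dvas_ok() are stand-ins for the real range and per-DVA checks, and no actual lock is taken): the field checks always run lock-free, the vdev-level checks run only for BLK_CONFIG_HELD and BLK_CONFIG_NEEDED, and BLK_CONFIG_SKIP returns early on hot paths such as dbuf_write_ready().

    #include <stdbool.h>
    #include <stdio.h>

    enum blk_config_flag {
        BLK_CONFIG_HELD,     /* caller already holds SCL_VDEV as writer */
        BLK_CONFIG_NEEDED,   /* SCL_VDEV taken as reader around the checks */
        BLK_CONFIG_SKIP      /* hot path: vdev-level checks are skipped */
    };

    /* Stand-in for the lock-free TYPE/CHECKSUM/COMPRESS/LSIZE/PSIZE checks. */
    static bool
    fields_ok(void)
    {
        return (true);
    }

    /* Stand-in for the per-DVA vdev/offset checks that need SCL_VDEV. */
    static bool
    dvas_ok(void)
    {
        return (true);
    }

    static bool
    blkptr_verify_model(enum blk_config_flag cfg)
    {
        bool ok = fields_ok();               /* always done, no lock needed */

        switch (cfg) {
        case BLK_CONFIG_HELD:                /* lock held by the caller */
        case BLK_CONFIG_NEEDED:              /* lock would be taken here */
            return (ok && dvas_ok());
        case BLK_CONFIG_SKIP:                /* e.g. dbuf_write_ready() */
        default:
            return (ok);
        }
    }

    int
    main(void)
    {
        printf("%d\n", blkptr_verify_model(BLK_CONFIG_SKIP));
        return (0);
    }
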
Reviewed-by: Rich Ercolani Reviewed-by: Brian Behlendorf Reviewed-by: Paul Zuchowski Reviewed-by: Alexander Motin Signed-off-by: Matthew Ahrens Closes #14817 --- cmd/zdb/zdb.c | 8 ++-- include/sys/zio.h | 8 +++- module/zfs/arc.c | 4 +- module/zfs/dbuf.c | 16 ++++++++ module/zfs/dsl_scan.c | 3 +- module/zfs/spa.c | 2 +- module/zfs/zio.c | 92 +++++++++++++++++++++++++++++++------------ 7 files changed, 98 insertions(+), 35 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index cea80b690841..5ab13b470dc0 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -8499,8 +8499,8 @@ zdb_read_block(char *thing, spa_t *spa) !(flags & ZDB_FLAG_DECOMPRESS)) { const blkptr_t *b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); - if (zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_ONLY) == - B_FALSE) { + if (zfs_blkptr_verify(spa, b, + BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) { abd_return_buf_copy(pabd, buf, lsize); borrowed = B_FALSE; buf = lbuf; @@ -8508,8 +8508,8 @@ zdb_read_block(char *thing, spa_t *spa) lbuf, lsize, psize, flags); b = (const blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset); - if (failed || zfs_blkptr_verify(spa, b, B_FALSE, - BLK_VERIFY_LOG) == B_FALSE) { + if (failed || zfs_blkptr_verify(spa, b, + BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) { printf("invalid block pointer at this DVA\n"); goto out; } diff --git a/include/sys/zio.h b/include/sys/zio.h index 3463682a1065..695bc09e6cb7 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -531,6 +531,12 @@ enum blk_verify_flag { BLK_VERIFY_HALT }; +enum blk_config_flag { + BLK_CONFIG_HELD, // SCL_VDEV held for writer + BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader + BLK_CONFIG_SKIP, // skip checks which require SCL_VDEV +}; + extern int zio_bookmark_compare(const void *, const void *); extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, @@ -646,7 +652,7 @@ extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, - boolean_t config_held, enum blk_verify_flag blk_verify); + enum blk_config_flag blk_config, enum blk_verify_flag blk_verify); /* * Initial setup and teardown. diff --git a/module/zfs/arc.c b/module/zfs/arc.c index c50228a2682f..bf8d99f94c39 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -5696,8 +5696,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, * and treat it as a checksum error. This allows an alternate blkptr * to be tried when one is available (e.g. ditto blocks). */ - if (!zfs_blkptr_verify(spa, bp, zio_flags & ZIO_FLAG_CONFIG_WRITER, - BLK_VERIFY_LOG)) { + if (!zfs_blkptr_verify(spa, bp, (zio_flags & ZIO_FLAG_CONFIG_WRITER) ? 
+ BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { rc = SET_ERROR(ECKSUM); goto done; } diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 8193fb244079..6a50f1927add 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4636,6 +4636,20 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) i += DNODE_MIN_SIZE; if (dnp->dn_type != DMU_OT_NONE) { fill++; + for (int j = 0; j < dnp->dn_nblkptr; + j++) { + (void) zfs_blkptr_verify(spa, + &dnp->dn_blkptr[j], + BLK_CONFIG_SKIP, + BLK_VERIFY_HALT); + } + if (dnp->dn_flags & + DNODE_FLAG_SPILL_BLKPTR) { + (void) zfs_blkptr_verify(spa, + DN_SPILL_BLKPTR(dnp), + BLK_CONFIG_SKIP, + BLK_VERIFY_HALT); + } i += dnp->dn_extra_slots * DNODE_MIN_SIZE; } @@ -4653,6 +4667,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; + (void) zfs_blkptr_verify(spa, ibp, + BLK_CONFIG_SKIP, BLK_VERIFY_HALT); fill += BP_GET_FILL(ibp); } } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index d6a9365df120..d398b6705551 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -1970,7 +1970,8 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, DMU_USERUSED_OBJECT, tx); } arc_buf_destroy(buf, &buf); - } else if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) { + } else if (!zfs_blkptr_verify(spa, bp, + BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { /* * Sanity check the block pointer contents, this is handled * by arc_read() for the cases above. diff --git a/module/zfs/spa.c b/module/zfs/spa.c index c2a67fbc7c55..16396170273c 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -2387,7 +2387,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, * When damaged consider it to be a metadata error since we cannot * trust the BP_GET_TYPE and BP_GET_LEVEL values. */ - if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) { + if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { atomic_inc_64(&sle->sle_meta_count); return (0); } diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 0924fb6f40bc..365d34832c3a 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -935,9 +935,35 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); + zfs_dbgmsg("bad blkptr at %px: " + "DVA[0]=%#llx/%#llx " + "DVA[1]=%#llx/%#llx " + "DVA[2]=%#llx/%#llx " + "prop=%#llx " + "pad=%#llx,%#llx " + "phys_birth=%#llx " + "birth=%#llx " + "fill=%#llx " + "cksum=%#llx/%#llx/%#llx/%#llx", + bp, + (long long)bp->blk_dva[0].dva_word[0], + (long long)bp->blk_dva[0].dva_word[1], + (long long)bp->blk_dva[1].dva_word[0], + (long long)bp->blk_dva[1].dva_word[1], + (long long)bp->blk_dva[2].dva_word[0], + (long long)bp->blk_dva[2].dva_word[1], + (long long)bp->blk_prop, + (long long)bp->blk_pad[0], + (long long)bp->blk_pad[1], + (long long)bp->blk_phys_birth, + (long long)bp->blk_birth, + (long long)bp->blk_fill, + (long long)bp->blk_cksum.zc_word[0], + (long long)bp->blk_cksum.zc_word[1], + (long long)bp->blk_cksum.zc_word[2], + (long long)bp->blk_cksum.zc_word[3]); switch (blk_verify) { case BLK_VERIFY_HALT: - dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); zfs_panic_recover("%s: %s", spa_name(spa), buf); break; case BLK_VERIFY_LOG: @@ -958,47 +984,54 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, * If everything checks out B_TRUE is returned. The zfs_blkptr_verify * argument controls the behavior when an invalid field is detected. 
* - * Modes for zfs_blkptr_verify: - * 1) BLK_VERIFY_ONLY (evaluate the block) - * 2) BLK_VERIFY_LOG (evaluate the block and log problems) - * 3) BLK_VERIFY_HALT (call zfs_panic_recover on error) + * Values for blk_verify_flag: + * BLK_VERIFY_ONLY: evaluate the block + * BLK_VERIFY_LOG: evaluate the block and log problems + * BLK_VERIFY_HALT: call zfs_panic_recover on error + * + * Values for blk_config_flag: + * BLK_CONFIG_HELD: caller holds SCL_VDEV for writer + * BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be + * obtained for reader + * BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better + * performance */ boolean_t -zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, - enum blk_verify_flag blk_verify) +zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, + enum blk_config_flag blk_config, enum blk_verify_flag blk_verify) { int errors = 0; if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid TYPE %llu", + "blkptr at %px has invalid TYPE %llu", bp, (longlong_t)BP_GET_TYPE(bp)); } if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid CHECKSUM %llu", + "blkptr at %px has invalid CHECKSUM %llu", bp, (longlong_t)BP_GET_CHECKSUM(bp)); } if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid COMPRESS %llu", + "blkptr at %px has invalid COMPRESS %llu", bp, (longlong_t)BP_GET_COMPRESS(bp)); } if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid LSIZE %llu", + "blkptr at %px has invalid LSIZE %llu", bp, (longlong_t)BP_GET_LSIZE(bp)); } if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid PSIZE %llu", + "blkptr at %px has invalid PSIZE %llu", bp, (longlong_t)BP_GET_PSIZE(bp)); } if (BP_IS_EMBEDDED(bp)) { if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p has invalid ETYPE %llu", + "blkptr at %px has invalid ETYPE %llu", bp, (longlong_t)BPE_GET_ETYPE(bp)); } } @@ -1010,10 +1043,19 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, if (!spa->spa_trust_config) return (errors == 0); - if (!config_held) - spa_config_enter(spa, SCL_VDEV, bp, RW_READER); - else + switch (blk_config) { + case BLK_CONFIG_HELD: ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); + break; + case BLK_CONFIG_NEEDED: + spa_config_enter(spa, SCL_VDEV, bp, RW_READER); + break; + case BLK_CONFIG_SKIP: + return (errors == 0); + default: + panic("invalid blk_config %u", blk_config); + } + /* * Pool-specific checks. 
* @@ -1028,20 +1070,20 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, if (vdevid >= spa->spa_root_vdev->vdev_children) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid VDEV %llu", + "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; if (vd == NULL) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid VDEV %llu", + "blkptr at %px DVA %u has invalid VDEV %llu", bp, i, (longlong_t)vdevid); continue; } if (vd->vdev_ops == &vdev_hole_ops) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has hole VDEV %llu", + "blkptr at %px DVA %u has hole VDEV %llu", bp, i, (longlong_t)vdevid); continue; } @@ -1059,13 +1101,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, asize = vdev_gang_header_asize(vd); if (offset + asize > vd->vdev_asize) { errors += zfs_blkptr_verify_log(spa, bp, blk_verify, - "blkptr at %p DVA %u has invalid OFFSET %llu", + "blkptr at %px DVA %u has invalid OFFSET %llu", bp, i, (longlong_t)offset); } } - if (errors > 0) - dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); - if (!config_held) + if (blk_config == BLK_CONFIG_NEEDED) spa_config_exit(spa, SCL_VDEV, bp); return (errors == 0); @@ -1203,7 +1243,7 @@ void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) { - (void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT); + (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); /* * The check for EMBEDDED is a performance optimization. We @@ -1282,8 +1322,8 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, { zio_t *zio; - (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER, - BLK_VERIFY_HALT); + (void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ? + BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT); if (BP_IS_EMBEDDED(bp)) return (zio_null(pio, spa, NULL, NULL, NULL, 0)); From 4eca03faaf6a1c05d739c738e3d5c0df2931da98 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Mon, 8 May 2023 22:35:03 +0200 Subject: [PATCH 091/180] Fixes in head_errlog feature with encryption For the head_errlog feature use dsl_dataset_hold_obj_flags() instead of dsl_dataset_hold_obj() in order to enable access to the encryption keys (if loaded). This enables reporting of errors in encrypted filesystems which are not mounted but have their keys loaded. Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #14837 --- man/man7/zpool-features.7 | 7 +- module/zfs/spa_errlog.c | 76 ++++++++----------- .../zfs_receive/zfs_receive_corrective.ksh | 19 ++++- .../zpool_status/zpool_status_005_pos.ksh | 6 +- 4 files changed, 56 insertions(+), 52 deletions(-) diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index efe9e833996a..2b7dcb63829c 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -562,13 +562,12 @@ This feature enables the upgraded version of errlog, which required an on-disk error log format change. Now the error log of each head dataset is stored separately in the zap object and keyed by the head id. -In case of encrypted filesystems with unloaded keys or unmounted encrypted -filesystems we are unable to check their snapshots or clones for errors and -these will not be reported. -In this case no filenames will be reported either. 
With this feature enabled, every dataset affected by an error block is listed in the output of .Nm zpool Cm status . +In case of encrypted filesystems with unloaded keys we are unable to check +their snapshots or clones for errors and these will not be reported. +An "access denied" error will be reported. .Pp \*[instant-never] . diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index e0604c4a84af..44950a769d3b 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -163,15 +163,15 @@ name_to_object(char *buf, uint64_t *obj) static int get_head_ds(spa_t *spa, uint64_t dsobj, uint64_t *head_ds) { dsl_dataset_t *ds; - int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, - dsobj, FTAG, &ds); + int error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, + dsobj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); ASSERT(head_ds); *head_ds = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } @@ -297,7 +297,8 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; - int error = dsl_dataset_hold_obj(dp, head_ds, FTAG, &ds); + int error = dsl_dataset_hold_obj_flags(dp, head_ds, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); @@ -306,23 +307,6 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, boolean_t check_snapshot = B_TRUE; error = find_birth_txg(ds, zep, &latest_txg); - /* - * If the filesystem is encrypted and the key is not loaded - * or the encrypted filesystem is not mounted the error will be EACCES. - * In that case report an error in the head filesystem and return. - */ - if (error == EACCES) { - dsl_dataset_rele(ds, FTAG); - zbookmark_phys_t zb; - zep_to_zb(head_ds, zep, &zb); - error = copyout_entry(&zb, uaddr, count); - if (error != 0) { - dsl_dataset_rele(ds, FTAG); - return (error); - } - return (0); - } - /* * If find_birth_txg() errors out otherwise, let txg_to_consider be * equal to the spa's syncing txg: if check_filesystem() errors out @@ -334,7 +318,7 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, zep_to_zb(head_ds, zep, &zb); error = copyout_entry(&zb, uaddr, count); if (error != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } check_snapshot = B_FALSE; @@ -352,14 +336,14 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); if (error != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } } if (snap_count == 0) { /* Filesystem without snapshots. */ - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (0); } @@ -371,20 +355,21 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; uint64_t zap_clone = dsl_dir_phys(ds->ds_dir)->dd_clones; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); /* Check only snapshots created from this file system. 
*/ while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && snap_obj_txg <= txg_to_consider) { - error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); + error = dsl_dataset_hold_obj_flags(dp, snap_obj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) goto out; if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != head_ds) { snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } @@ -404,13 +389,14 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, zep_to_zb(snap_obj, zep, &zb); error = copyout_entry(&zb, uaddr, count); if (error != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, + FTAG); goto out; } } snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); } if (zap_clone == 0 || aff_snap_count == 0) @@ -428,8 +414,8 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, zap_cursor_advance(zc)) { dsl_dataset_t *clone; - error = dsl_dataset_hold_obj(dp, za->za_first_integer, - FTAG, &clone); + error = dsl_dataset_hold_obj_flags(dp, za->za_first_integer, + DS_HOLD_FLAG_DECRYPT, FTAG, &clone); if (error != 0) break; @@ -444,7 +430,7 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, == snap_obj_array[i]) found = B_TRUE; } - dsl_dataset_rele(clone, FTAG); + dsl_dataset_rele_flags(clone, DS_HOLD_FLAG_DECRYPT, FTAG); if (!found) continue; @@ -474,14 +460,14 @@ find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, return (error); dsl_dataset_t *ds; - error = dsl_dataset_hold_obj(spa->spa_dsl_pool, oldest_dsobj, - FTAG, &ds); + error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); *top_affected_fs = dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); return (0); } @@ -744,7 +730,8 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, dsl_dataset_t *ds; objset_t *os; - int error = dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds); + int error = dsl_dataset_hold_obj_flags(dp, zb.zb_objset, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) continue; @@ -759,7 +746,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, * truly persistent, it should re-appear after a scan. 
*/ if (dmu_objset_from_ds(ds, &os) != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } @@ -767,7 +754,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, blkptr_t bp; if (dnode_hold(os, zep.zb_object, FTAG, &dn) != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); continue; } @@ -781,7 +768,7 @@ sync_upgrade_errlog(spa_t *spa, uint64_t spa_err_obj, uint64_t *newobj, rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); if (error != 0 || BP_IS_HOLE(&bp)) continue; @@ -1259,7 +1246,8 @@ find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, dsl_dataset_t *ds; dsl_pool_t *dp = spa->spa_dsl_pool; - int error = dsl_dataset_hold_obj(dp, old_head, FTAG, &ds); + int error = dsl_dataset_hold_obj_flags(dp, old_head, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); if (error != 0) return (error); @@ -1267,9 +1255,9 @@ find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, uint64_t prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; while (prev_obj != 0) { - dsl_dataset_rele(ds, FTAG); - if ((error = dsl_dataset_hold_obj(dp, prev_obj, - FTAG, &ds)) == 0 && + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + if ((error = dsl_dataset_hold_obj_flags(dp, prev_obj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds)) == 0 && dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj == new_head) break; @@ -1279,7 +1267,7 @@ find_txg_ancestor_snapshot(spa_t *spa, uint64_t new_head, uint64_t old_head, prev_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; prev_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; } - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); ASSERT(prev_obj != 0); *txg = prev_obj_txg; return (0); diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh index 9ebde1cd9d32..261fc5eed8cb 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_corrective.ksh @@ -163,7 +163,24 @@ corrupt_blocks_at_level "/$TESTPOOL/testfs5/$TESTFILE0" 0 log_must zfs unmount $TESTPOOL/testfs5 log_must zfs unload-key $TESTPOOL/testfs5 # test healing recv (on an encrypted dataset) using a raw send file -test_corrective_recv "$TESTPOOL/testfs5@snap1" $raw_backup +# This is a special case since with unloaded keys we cannot report errors +# in the filesystem. 
+log_must zpool scrub -w $TESTPOOL +log_must zpool status -v $TESTPOOL +log_mustnot eval "zpool status -v $TESTPOOL | \ + grep \"permission denied\"" +# make sure we will read the corruption from disk by flushing the ARC +log_must zinject -a +log_must eval "zfs recv -c $TESTPOOL/testfs5@snap1 < $raw_backup" + +log_must zpool scrub -w $TESTPOOL +log_must zpool status -v $TESTPOOL +log_mustnot eval "zpool status -v $TESTPOOL | \ + grep \"Permanent errors have been detected\"" +typeset cksum=$(md5digest $file) +[[ "$cksum" == "$checksum" ]] || \ + log_fail "Checksums differ ($cksum != $checksum)" + # non raw send file healing an encrypted dataset with an unloaded key will fail log_mustnot eval "zfs recv -c $TESTPOOL/testfs5@snap1 < $backup" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh index 04cd1892380d..ec4c67fb42f5 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_005_pos.ksh @@ -29,7 +29,7 @@ # Verify correct output with 'zpool status -v' after corrupting a file # # STRATEGY: -# 1. Create a pool, an ancrypted filesystem and a file +# 1. Create a pool, an encrypted filesystem and a file # 2. zinject checksum errors # 3. Unmount the filesystem and unload the key # 4. Scrub the pool @@ -76,8 +76,8 @@ log_must zpool sync $TESTPOOL2 log_must zpool scrub $TESTPOOL2 log_must zpool wait -t scrub $TESTPOOL2 log_must zpool status -v $TESTPOOL2 -log_must eval "zpool status -v $TESTPOOL2 | \ - grep \"Permanent errors have been detected\"" +log_mustnot eval "zpool status -v $TESTPOOL2 | \ + grep \"permission denied\"" log_mustnot eval "zpool status -v $TESTPOOL2 | grep '$file'" log_must eval "cat /$TESTPOOL2/pwd | zfs load-key $TESTPOOL2/$TESTFS1" From 6839ec6f1098c28ff7b772f1b31b832d05e6b567 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Tue, 9 May 2023 17:53:27 +0200 Subject: [PATCH 092/180] Enable the head_errlog feature to remove errors In case check_filesystem() does not error out and does not report an error, remove that error block from error lists and logs without requiring a scrub. This can happen when the original file and all snapshots/clones referencing it have been removed. Otherwise zpool status will still report that "Permanent errors have been detected..." without actually reporting any of them. To implement this change the functions introduced in corrective receive were modified to take into account the head_errlog feature. Before this change: ============================= pool: test state: ONLINE status: One or more devices has experienced an error resulting in data corruption. Applications may be affected. action: Restore the file in question if possible. Otherwise restore the entire pool from backup. see: https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-8A config: NAME STATE READ WRITE CKSUM test ONLINE 0 0 0 /home/user/vdev_a ONLINE 0 0 2 errors: Permanent errors have been detected in the following files: ============================= After this change: ============================= pool: test state: ONLINE status: One or more devices has experienced an unrecoverable error. An attempt was made to correct the error. Applications are unaffected. action: Determine if the device needs to be replaced, and clear the errors using 'zpool clear' or replace the device with 'zpool replace'. 
see: https://openzfs.github.io/openzfs-docs/msg/ZFS-8000-9P config: NAME STATE READ WRITE CKSUM test ONLINE 0 0 0 /home/user/vdev_a ONLINE 0 0 2 errors: No known data errors ============================= Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: George Amanakis Closes #14813 --- include/sys/spa.h | 3 +- man/man8/zpool-status.8 | 3 + module/zfs/dmu_recv.c | 2 +- module/zfs/spa_errlog.c | 175 ++++++++++++++---- .../zpool_status/zpool_status_007_pos.ksh | 13 ++ 5 files changed, 159 insertions(+), 37 deletions(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index b96a9ef1d42f..460ea2bfee4e 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1140,7 +1140,8 @@ extern const char *spa_state_to_name(spa_t *spa); struct zbookmark_phys; extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth); -extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb); +extern void spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, + const uint64_t *birth); extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, zio_t *zio, uint64_t state); extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index ed572e29f51f..8f9580cf086e 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -119,6 +119,9 @@ See .It Fl v Displays verbose data error information, printing out a complete list of all data errors since the last complete pool scrub. +If the head_errlog feature is enabled and files containing errors have been +removed then the respective filenames will not be reported in subsequent runs +of this command. .It Fl x Only display status for pools that are exhibiting errors or are otherwise unavailable. diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index c2ce5ce000ac..c22a95f8647f 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1353,7 +1353,7 @@ corrective_read_done(zio_t *zio) cr_cb_data_t *data = zio->io_private; /* Corruption corrected; update error log if needed */ if (zio->io_error == 0) - spa_remove_error(data->spa, &data->zb); + spa_remove_error(data->spa, &data->zb, &zio->io_bp->blk_birth); kmem_free(data, sizeof (cr_cb_data_t)); abd_free(zio->io_abd); } diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 44950a769d3b..31719063a227 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -493,6 +493,7 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, } uint64_t top_affected_fs; + uint64_t init_count = *count; int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs); if (error == 0) { clones_t *ct; @@ -520,6 +521,16 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, list_destroy(&clones_list); } + if (error == 0 && init_count == *count) { + /* + * If we reach this point, no errors have been detected + * in the checked filesystems/snapshots. Before returning mark + * the error block to be removed from the error lists and logs. + */ + zbookmark_phys_t zb; + zep_to_zb(head_ds, zep, &zb); + spa_remove_error(spa, &zb, &zep->zb_birth); + } return (error); } @@ -530,37 +541,111 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, * so that we can later remove the related log entries in sync context. 
*/ static void -spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb) +spa_add_healed_error(spa_t *spa, uint64_t obj, zbookmark_phys_t *healed_zb, + const uint64_t *birth) { char name[NAME_MAX_LEN]; if (obj == 0) return; - bookmark_to_name(healed_zb, name, sizeof (name)); - mutex_enter(&spa->spa_errlog_lock); - if (zap_contains(spa->spa_meta_objset, obj, name) == 0) { - /* - * Found an error matching healed zb, add zb to our - * tree of healed errors - */ - avl_tree_t *tree = &spa->spa_errlist_healed; - spa_error_entry_t search; - spa_error_entry_t *new; - avl_index_t where; - search.se_bookmark = *healed_zb; - mutex_enter(&spa->spa_errlist_lock); - if (avl_find(tree, &search, &where) != NULL) { - mutex_exit(&spa->spa_errlist_lock); - mutex_exit(&spa->spa_errlog_lock); - return; + boolean_t held_list = B_FALSE; + boolean_t held_log = B_FALSE; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + bookmark_to_name(healed_zb, name, sizeof (name)); + + if (zap_contains(spa->spa_meta_objset, healed_zb->zb_objset, + name) == 0) { + if (!MUTEX_HELD(&spa->spa_errlog_lock)) { + mutex_enter(&spa->spa_errlog_lock); + held_log = B_TRUE; + } + + /* + * Found an error matching healed zb, add zb to our + * tree of healed errors + */ + avl_tree_t *tree = &spa->spa_errlist_healed; + spa_error_entry_t search; + spa_error_entry_t *new; + avl_index_t where; + search.se_bookmark = *healed_zb; + if (!MUTEX_HELD(&spa->spa_errlist_lock)) { + mutex_enter(&spa->spa_errlist_lock); + held_list = B_TRUE; + } + if (avl_find(tree, &search, &where) != NULL) { + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + return; + } + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *healed_zb; + avl_insert(tree, new, where); + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); } - new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); - new->se_bookmark = *healed_zb; - avl_insert(tree, new, where); - mutex_exit(&spa->spa_errlist_lock); + return; } - mutex_exit(&spa->spa_errlog_lock); + + zbookmark_err_phys_t healed_zep; + healed_zep.zb_object = healed_zb->zb_object; + healed_zep.zb_level = healed_zb->zb_level; + healed_zep.zb_blkid = healed_zb->zb_blkid; + + if (birth != NULL) + healed_zep.zb_birth = *birth; + else + healed_zep.zb_birth = 0; + + errphys_to_name(&healed_zep, name, sizeof (name)); + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_errlog_last); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + if (zap_contains(spa->spa_meta_objset, za.za_first_integer, + name) == 0) { + if (!MUTEX_HELD(&spa->spa_errlog_lock)) { + mutex_enter(&spa->spa_errlog_lock); + held_log = B_TRUE; + } + + avl_tree_t *tree = &spa->spa_errlist_healed; + spa_error_entry_t search; + spa_error_entry_t *new; + avl_index_t where; + search.se_bookmark = *healed_zb; + + if (!MUTEX_HELD(&spa->spa_errlist_lock)) { + mutex_enter(&spa->spa_errlist_lock); + held_list = B_TRUE; + } + + if (avl_find(tree, &search, &where) != NULL) { + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + continue; + } + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *healed_zb; + new->se_zep = healed_zep; + avl_insert(tree, new, where); + + if (held_list) + mutex_exit(&spa->spa_errlist_lock); + if (held_log) + mutex_exit(&spa->spa_errlog_lock); + } + 
} + zap_cursor_fini(&zc); } /* @@ -598,12 +683,36 @@ spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) &cookie)) != NULL) { remove_error_from_list(spa, s, &se->se_bookmark); remove_error_from_list(spa, l, &se->se_bookmark); - bookmark_to_name(&se->se_bookmark, name, sizeof (name)); kmem_free(se, sizeof (spa_error_entry_t)); - (void) zap_remove(spa->spa_meta_objset, - spa->spa_errlog_last, name, tx); - (void) zap_remove(spa->spa_meta_objset, - spa->spa_errlog_scrub, name, tx); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + bookmark_to_name(&se->se_bookmark, name, sizeof (name)); + (void) zap_remove(spa->spa_meta_objset, + spa->spa_errlog_last, name, tx); + (void) zap_remove(spa->spa_meta_objset, + spa->spa_errlog_scrub, name, tx); + } else { + errphys_to_name(&se->se_zep, name, sizeof (name)); + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_errlog_last); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + zap_remove(spa->spa_meta_objset, + za.za_first_integer, name, tx); + } + zap_cursor_fini(&zc); + + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_errlog_scrub); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + zap_remove(spa->spa_meta_objset, + za.za_first_integer, name, tx); + } + zap_cursor_fini(&zc); + } } } @@ -612,14 +721,10 @@ spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) * later in spa_remove_healed_errors(). */ void -spa_remove_error(spa_t *spa, zbookmark_phys_t *zb) +spa_remove_error(spa_t *spa, zbookmark_phys_t *zb, const uint64_t *birth) { - char name[NAME_MAX_LEN]; - - bookmark_to_name(zb, name, sizeof (name)); - - spa_add_healed_error(spa, spa->spa_errlog_last, zb); - spa_add_healed_error(spa, spa->spa_errlog_scrub, zb); + spa_add_healed_error(spa, spa->spa_errlog_last, zb, birth); + spa_add_healed_error(spa, spa->spa_errlog_scrub, zb, birth); } static uint64_t diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh index c9849379f779..666ac9bfc9dd 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_007_pos.ksh @@ -39,6 +39,9 @@ # 7. Verify we report errors in the pool in 'zpool status -v' # 8. Promote clone1 # 9. Verify we report errors in the pool in 'zpool status -v' +# 10. Delete the corrupted file and origin snapshots. +# 11. Verify we do not report data errors anymore, without requiring +# a scrub. . 
$STF_SUITE/include/libtest.shlib @@ -95,4 +98,14 @@ log_mustnot eval "zpool status -v | grep '$TESTPOOL2/clonexx/$TESTFILE0'" log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'" log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'" +log_must rm /$TESTPOOL2/clone1/$TESTFILE0 +log_must zfs destroy -R $TESTPOOL2/clone1@snap1 +log_must zfs destroy -R $TESTPOOL2/clone1@snap2 +log_must zfs list -r $TESTPOOL2 +log_must zpool status -v $TESTPOOL2 +log_must zpool sync +log_must zpool status -v $TESTPOOL2 +log_must eval "zpool status -v $TESTPOOL2 | \ + grep \"No known data errors\"" + log_pass "Verify reporting errors when deleting corrupted files after scrub" From b035f2b2cb9b88b1330c4b48641b8793d6460c9b Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 9 May 2023 11:54:01 -0400 Subject: [PATCH 093/180] Remove single parent assertion from zio_nowait(). We only need to know if ZIO has any parent there. We do not care if it has more than one, but use of zio_unique_parent() == NULL asserts that. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14823 --- module/zfs/zio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 365d34832c3a..c17ca5e1d651 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -2341,7 +2341,7 @@ zio_nowait(zio_t *zio) ASSERT3P(zio->io_executor, ==, NULL); if (zio->io_child_type == ZIO_CHILD_LOGICAL && - zio_unique_parent(zio) == NULL) { + list_is_empty(&zio->io_parent_list)) { zio_t *pio; /* From d38c815fe27c033564d1f7cc769e74eba11cfb83 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Tue, 9 May 2023 17:54:41 +0200 Subject: [PATCH 094/180] Remove duplicate code in l2arc_evict() l2arc_evict() performs the adjustment of the size of buffers to be written on L2ARC unnecessarily. l2arc_write_size() is called right before l2arc_evict() and performs those adjustments. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: George Amanakis Closes #14828 --- module/zfs/arc.c | 22 +++++++------------ .../tests/functional/trim/trim_l2arc.ksh | 2 +- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index bf8d99f94c39..a78f664c4fe8 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8198,10 +8198,17 @@ l2arc_write_size(l2arc_dev_t *dev) * iteration can occur. */ dev_size = dev->l2ad_end - dev->l2ad_start; + + /* We need to add in the worst case scenario of log block overhead. */ tsize = size + l2arc_log_blk_overhead(size, dev); - if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) + if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { + /* + * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) + * times the writesize, whichever is greater. + */ tsize += MAX(64 * 1024 * 1024, (tsize * l2arc_trim_ahead) / 100); + } if (tsize >= dev_size) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " @@ -8836,19 +8843,6 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) buflist = &dev->l2ad_buflist; - /* - * We need to add in the worst case scenario of log block overhead. - */ - distance += l2arc_log_blk_overhead(distance, dev); - if (vd->vdev_has_trim && l2arc_trim_ahead > 0) { - /* - * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) - * times the write size, whichever is greater. 
- */ - distance += MAX(64 * 1024 * 1024, - (distance * l2arc_trim_ahead) / 100); - } - top: rerun = B_FALSE; if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { diff --git a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh index 0bbd08acdd3f..a93d0b3cc803 100755 --- a/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh +++ b/tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh @@ -67,7 +67,7 @@ typeset VDEV_MIN_MB=$((MINVDEVSIZE * 0.30 / 1024 / 1024)) log_must zpool create -f $TESTPOOL $TRIM_VDEV1 cache $TRIM_VDEV2 verify_vdevs "-le" "$VDEV_MIN_MB" $TRIM_VDEV2 -typeset fill_mb=$(( floor(2 * MINVDEVSIZE) )) +typeset fill_mb=$(( floor(3 * MINVDEVSIZE) )) export DIRECTORY=/$TESTPOOL export NUMJOBS=1 export FILE_SIZE=${fill_mb} From c8b3dda18638fca8b0cc580ad7cecf410606e646 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 9 May 2023 08:57:02 -0700 Subject: [PATCH 095/180] Debug auto_replace_001_pos failures Reduced the timeout to 60 seconds which should be more than sufficient and allow the test to be marked as FAILED rather than KILLED. Also dump the pool status on cleanup. Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #14829 --- tests/zfs-tests/include/libtest.shlib | 14 +++++++++++--- .../functional/fault/auto_replace_001_pos.ksh | 5 +++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 02e6a500a71a..8521f271be54 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -1951,6 +1951,7 @@ function check_pool_status # pool token keyword # is_pool_removing - to check if the pool removing is a vdev # is_pool_removed - to check if the pool remove is completed # is_pool_discarding - to check if the pool checkpoint is being discarded +# is_pool_replacing - to check if the pool is performing a replacement # function is_pool_resilvering #pool { @@ -1997,6 +1998,10 @@ function is_pool_discarding #pool { check_pool_status "$1" "checkpoint" "discarding" } +function is_pool_replacing #pool +{ + zpool status "$1" | grep -qE 'replacing-[0-9]+' +} function wait_for_degraded { @@ -2983,12 +2988,15 @@ function wait_freeing #pool # Wait for every device replace operation to complete # # $1 pool name +# $2 timeout # -function wait_replacing #pool +function wait_replacing #pool timeout { + typeset timeout=${2:-300} typeset pool=${1:-$TESTPOOL} - while zpool status $pool | grep -qE 'replacing-[0-9]+'; do - log_must sleep 1 + for (( timer = 0; timer < $timeout; timer++ )); do + is_pool_replacing $pool || break; + sleep 1; done } diff --git a/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh b/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh index 2846192d08eb..081e6c18430d 100755 --- a/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh @@ -54,6 +54,7 @@ fi function cleanup { + zpool status $TESTPOOL destroy_pool $TESTPOOL sed -i '/alias scsidebug/d' $VDEVID_CONF unload_scsi_debug @@ -99,8 +100,8 @@ block_device_wait insert_disk $SD $SD_HOST # Wait for the new disk to be online and replaced -log_must wait_vdev_state $TESTPOOL "scsidebug" "ONLINE" $MAXTIMEOUT -log_must wait_replacing $TESTPOOL +log_must wait_vdev_state $TESTPOOL "scsidebug" "ONLINE" 60 +log_must wait_replacing $TESTPOOL 60 # Validate auto-replace was successful log_must check_state $TESTPOOL "" "ONLINE" From 
903c3613d490d1321d587982abb5e4dda4a43308 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Tue, 9 May 2023 09:03:10 -0700 Subject: [PATCH 096/180] Add dmu_tx_hold_append() interface Provides an interface which callers can use to declare a write when the exact starting offset in not yet known. Since the full range being updated is not available only the first L0 block at the provided offset will be prefetched. Reviewed-by: Olaf Faaland Signed-off-by: Brian Behlendorf Closes #14819 --- include/sys/dmu.h | 3 ++ include/sys/dmu_tx.h | 1 + module/zfs/dmu_tx.c | 105 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 1b82ff620f27..a5a5c378279a 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -782,6 +782,9 @@ dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); +void dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); +void dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, + int len); void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index ca8514e5d2d0..aa55da626149 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -91,6 +91,7 @@ enum dmu_tx_hold_type { THT_SPACE, THT_SPILL, THT_CLONE, + THT_APPEND, THT_NUMTYPES }; diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 1c5608c4541b..c4e274bd4c42 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -294,6 +294,53 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) } } +static void +dmu_tx_count_append(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + dnode_t *dn = txh->txh_dnode; + int err = 0; + + if (len == 0) + return; + + (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); + + if (dn == NULL) + return; + + /* + * For i/o error checking, read the blocks that will be needed + * to perform the append; first level-0 block (if not aligned, i.e. + * if they are partial-block writes), no additional blocks are read. + */ + if (dn->dn_maxblkid == 0) { + if (off < dn->dn_datablksz && + (off > 0 || len < dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + } else { + zio_t *zio = zio_root(dn->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + + /* first level-0 block */ + uint64_t start = off >> dn->dn_datablkshift; + if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + + err = zio_wait(zio); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } +} + static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { @@ -334,6 +381,42 @@ dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) } } +/* + * Should be used when appending to an object and the exact offset is unknown. + * The write must occur at or beyond the specified offset. Only the L0 block + * at provided offset will be prefetched. 
+ */ +void +dmu_tx_hold_append(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_APPEND, off, DMU_OBJECT_END); + if (txh != NULL) { + dmu_tx_count_append(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + +void +dmu_tx_hold_append_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_APPEND, off, DMU_OBJECT_END); + if (txh != NULL) { + dmu_tx_count_append(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + /* * This function marks the transaction as being a "net free". The end * result is that refquotas will be disabled for this transaction, and @@ -668,6 +751,26 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) if (blkid == 0) match_offset = TRUE; break; + case THT_APPEND: + if (blkid >= beginblk && (blkid <= endblk || + txh->txh_arg2 == DMU_OBJECT_END)) + match_offset = TRUE; + + /* + * THT_WRITE used for bonus and spill blocks. + */ + ASSERT(blkid != DMU_BONUS_BLKID && + blkid != DMU_SPILL_BLKID); + + /* + * They might have to increase nlevels, + * thus dirtying the new TLIBs. Or the + * might have to change the block size, + * thus dirying the new lvl=0 blk=0. + */ + if (blkid == 0) + match_offset = TRUE; + break; case THT_FREE: /* * We will dirty all the level 1 blocks in @@ -1454,6 +1557,8 @@ dmu_tx_fini(void) EXPORT_SYMBOL(dmu_tx_create); EXPORT_SYMBOL(dmu_tx_hold_write); EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode); +EXPORT_SYMBOL(dmu_tx_hold_append); +EXPORT_SYMBOL(dmu_tx_hold_append_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_free); EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode); EXPORT_SYMBOL(dmu_tx_hold_zap); From d3db900a4e457c3a75e6cef8e9bac8d278ddc929 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 9 May 2023 17:55:19 -0700 Subject: [PATCH 097/180] pam: Fix "buffer overflow" in pam ZTS tests on F38 The pam ZTS tests were reporting a buffer overflow on F38, possibly due to F38 now setting _FORTIFY_SOURCE=3 by default. gdb and valgrind narrowed this down to a snprintf() buffer overflow in zfs_key_config_modify_session_counter(). I'm not clear why this particular snprintf() was being flagged as an overflow, but when I replaced it with an asprintf(), the test passed reliably. 
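As a minimal userland illustration of the pattern the fix switches to (a standalone sketch, not the pam_zfs_key code itself; counter_path_for() and the sample path are invented for the example), asprintf() sizes and allocates the buffer in one step, removing the hand-computed length arithmetic:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <stdlib.h>

    /* Invented helper: builds "<runtime_path>/<uid>" the asprintf() way. */
    static char *
    counter_path_for(const char *runtime_path, unsigned int uid)
    {
        char *path;

        /* asprintf() allocates a buffer of exactly the right size. */
        if (asprintf(&path, "%s/%u", runtime_path, uid) == -1)
            return (NULL);
        return (path);
    }

    int
    main(void)
    {
        char *p = counter_path_for("/run/user", 1000);

        if (p != NULL) {
            puts(p);         /* prints: /run/user/1000 */
            free(p);
        }
        return (0);
    }
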
Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #14802 Closes #14842 --- contrib/pam_zfs_key/pam_zfs_key.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index 27c7d63781c5..979546ab3090 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -587,16 +587,11 @@ zfs_key_config_modify_session_counter(pam_handle_t *pamh, errno); return (-1); } - size_t runtime_path_len = strlen(runtime_path); - size_t counter_path_len = runtime_path_len + 1 + 10; - char *counter_path = malloc(counter_path_len + 1); - if (!counter_path) { + + char *counter_path; + if (asprintf(&counter_path, "%s/%u", runtime_path, config->uid) == -1) return (-1); - } - counter_path[0] = 0; - strcat(counter_path, runtime_path); - snprintf(counter_path + runtime_path_len, counter_path_len, "/%d", - config->uid); + const int fd = open(counter_path, O_RDWR | O_CLOEXEC | O_CREAT | O_NOFOLLOW, S_IRUSR | S_IWUSR); From 14ba8ab97ddb3674351861ecf373125ac4e1dc63 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Wed, 10 May 2023 05:56:35 +0500 Subject: [PATCH 098/180] Prevent panic during concurrent snapshot rollback and zvol read Protect zvol_cdev_read with zv_suspend_lock to prevent concurrent release of the dnode, avoiding panic when a snapshot is rolled back in parallel during ongoing zvol read operation. Reviewed-by: Chunwei Chen Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #14839 --- module/os/freebsd/zfs/zvol_os.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 26578491fd67..2520507b98aa 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -832,6 +832,7 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag) (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) return (SET_ERROR(EIO)); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); ssize_t start_resid = zfs_uio_resid(&uio); lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), zfs_uio_resid(&uio), RL_READER); @@ -853,6 +854,7 @@ zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag) zfs_rangelock_exit(lr); int64_t nread = start_resid - zfs_uio_resid(&uio); dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); + rw_exit(&zv->zv_suspend_lock); return (error); } From 469019fb0b2b7ca4bd6c3de5c2f1056a4446f0e3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 11 May 2023 17:27:12 -0400 Subject: [PATCH 099/180] zil: Don't expect zio_shrink() to succeed. At least for RAIDZ zio_shrink() does not reduce zio size, but reduced wsz in that case likely results in writing uninitialized memory. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14853 --- module/zfs/zil.c | 1 + 1 file changed, 1 insertion(+) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index ec9da706a806..c37da89dd438 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1866,6 +1866,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); ASSERT3U(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); + wsz = lwb->lwb_write_zio->io_size; } else { wsz = lwb->lwb_sz; From 555ef90c5c1db5dcd1b47c02134c85b5a03dc6bc Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Sun, 30 Apr 2023 02:47:09 -0700 Subject: [PATCH 100/180] Additional block cloning fixes. 
Reimplement some of the block cloning vs dbuf logic, mostly to fix situation where we clone a block and in the same transaction group we want to partially overwrite the clone. Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14825 --- include/sys/dbuf.h | 23 +++++----- module/zfs/dbuf.c | 103 +++++++++++++++++++++++++++++++++++---------- module/zfs/dmu.c | 14 +----- 3 files changed, 95 insertions(+), 45 deletions(-) diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index fb26a83b1844..1800a7e31da0 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -61,16 +61,18 @@ extern "C" { /* * The simplified state transition diagram for dbufs looks like: * - * +----> READ ----+ - * | | - * | V - * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) - * | ^ ^ - * | | | - * +----> FILL ----+ | - * | | - * | | - * +--------> NOFILL -------+ + * +--> READ --+ + * | | + * | V + * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) + * ^ | ^ ^ + * | | | | + * | +--> FILL --+ | + * | | | + * | | | + * | +------> NOFILL -----+ + * | | + * +---------------+ * * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range * to find all dbufs in a range of a dnode and must be less than any other @@ -375,6 +377,7 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, uint64_t blkid, uint64_t *hash_out); int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); +void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 6a50f1927add..049a62c1c171 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1573,24 +1573,22 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, bpp = &bp; } } else { - struct dirty_leaf *dl; dbuf_dirty_record_t *dr; ASSERT3S(db->db_state, ==, DB_NOFILL); + /* + * Block cloning: If we have a pending block clone, + * we don't want to read the underlying block, but the content + * of the block being cloned, so we have the most recent data. + */ dr = list_head(&db->db_dirty_records); - if (dr == NULL) { + if (dr == NULL || !dr->dt.dl.dr_brtwrite) { err = EIO; goto early_unlock; - } else { - dl = &dr->dt.dl; - if (!dl->dr_brtwrite) { - err = EIO; - goto early_unlock; - } - bp = dl->dr_overridden_by; - bpp = &bp; } + bp = dr->dt.dl.dr_overridden_by; + bpp = &bp; } err = dbuf_read_hole(db, dn, bpp); @@ -1906,6 +1904,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; + boolean_t release; ASSERT(MUTEX_HELD(&db->db_mtx)); /* @@ -1926,8 +1925,10 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); + release = !dr->dt.dl.dr_brtwrite; dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; + dr->dt.dl.dr_brtwrite = B_FALSE; dr->dt.dl.dr_has_raw_params = B_FALSE; /* @@ -1938,7 +1939,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) * the buf thawed to save the effort of freezing & * immediately re-thawing it. 
*/ - if (!dr->dt.dl.dr_brtwrite) + if (release) arc_release(dr->dt.dl.dr_data, db); } @@ -2022,11 +2023,6 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); - if (dr->dt.dl.dr_brtwrite) { - ASSERT(db->db.db_data == NULL); - mutex_exit(&db->db_mtx); - continue; - } } else { /* * This dbuf is not dirty in the open context. @@ -2613,6 +2609,7 @@ static void dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + boolean_t undirty = B_FALSE; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); @@ -2625,7 +2622,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) */ mutex_enter(&db->db_mtx); - if (db->db_state == DB_CACHED) { + if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) { dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); /* * It's possible that it is already dirty but not cached, @@ -2633,10 +2630,21 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) * go through dmu_buf_will_dirty(). */ if (dr != NULL) { - /* This dbuf is already dirty and cached. */ - dbuf_redirty(dr); - mutex_exit(&db->db_mtx); - return; + if (dr->dt.dl.dr_brtwrite) { + /* + * Block cloning: If we are dirtying a cloned + * block, we cannot simply redirty it, because + * this dr has no data associated with it. + * We will go through a full undirtying below, + * before dirtying it again. + */ + undirty = B_TRUE; + } else { + /* This dbuf is already dirty and cached. */ + dbuf_redirty(dr); + mutex_exit(&db->db_mtx); + return; + } } } mutex_exit(&db->db_mtx); @@ -2645,7 +2653,20 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) flags |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); + + /* + * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we + * want to make sure dbuf_read() will read the pending cloned block and + * not the uderlying block that is being replaced. dbuf_undirty() will + * do dbuf_unoverride(), so we will end up with cloned block content, + * without overridden BP. + */ (void) dbuf_read(db, NULL, flags); + if (undirty) { + mutex_enter(&db->db_mtx); + VERIFY(!dbuf_undirty(db, tx)); + mutex_exit(&db->db_mtx); + } (void) dbuf_dirty(db, tx); } @@ -2668,6 +2689,28 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) return (dr != NULL); } +void +dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + /* + * Block cloning: We are going to clone into this block, so undirty + * modifications done to this block so far in this txg. This includes + * writes and clones into this block. 
+ */ + mutex_enter(&db->db_mtx); + VERIFY(!dbuf_undirty(db, tx)); + ASSERT(list_head(&db->db_dirty_records) == NULL); + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + } + mutex_exit(&db->db_mtx); + + dmu_buf_will_not_fill(db_fake, tx); +} + void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { @@ -2675,7 +2718,9 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) db->db_state = DB_NOFILL; DTRACE_SET_STATE(db, "allocating NOFILL buffer"); - dmu_buf_will_fill(db_fake, tx); + + dbuf_noread(db); + (void) dbuf_dirty(db, tx); } void @@ -2691,6 +2736,19 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + if (db->db_state == DB_NOFILL) { + /* + * Block cloning: We will be completely overwriting a block + * cloned in this transaction group, so let's undirty the + * pending clone and mark the block as uncached. This will be + * as if the clone was never done. + */ + mutex_enter(&db->db_mtx); + VERIFY(!dbuf_undirty(db, tx)); + mutex_exit(&db->db_mtx); + db->db_state = DB_UNCACHED; + } + dbuf_noread(db); (void) dbuf_dirty(db, tx); } @@ -5155,6 +5213,7 @@ EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); EXPORT_SYMBOL(dmu_buf_is_dirty); +EXPORT_SYMBOL(dmu_buf_will_clone); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index cda1472a77aa..f8accafd6d12 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2284,18 +2284,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); - mutex_enter(&db->db_mtx); - - VERIFY(!dbuf_undirty(db, tx)); - ASSERT(list_head(&db->db_dirty_records) == NULL); - if (db->db_buf != NULL) { - arc_buf_destroy(db->db_buf, db); - db->db_buf = NULL; - } - - mutex_exit(&db->db_mtx); - - dmu_buf_will_not_fill(dbuf, tx); + dmu_buf_will_clone(dbuf, tx); mutex_enter(&db->db_mtx); @@ -2305,7 +2294,6 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dl = &dr->dt.dl; dl->dr_overridden_by = *bp; dl->dr_brtwrite = B_TRUE; - dl->dr_override_state = DR_OVERRIDDEN; if (BP_IS_HOLE(bp)) { dl->dr_overridden_by.blk_birth = 0; From bd8c6bd66f9dde7534ae2f52237a1b208721cbf7 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Tue, 2 May 2023 14:24:43 -0700 Subject: [PATCH 101/180] Deny block cloning is dbuf size doesn't match BP size. I don't know an easy way to shrink down dbuf size, so just deny block cloning into dbufs that don't match our BP's size. This fixes the following situation: 1. Create a small file, eg. 1kB of random bytes. Its dbuf will be 1kB. 2. Create a larger file, eg. 2kB of random bytes. Its dbuf will be 2kB. 3. Truncate the large file to 0. Its dbuf will remain 2kB. 4. Clone the small file into the large file. Small file's BP lsize is 1kB, but the large file's dbuf is 2kB. 
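A rough userland sketch of that sequence, purely for illustration (the pool/mount name "tank" is hypothetical, and it assumes the clone request reaches zfs_clone_range(), e.g. via copy_file_range()):

  dd if=/dev/urandom of=/tank/small bs=1k count=1   # small file, 1kB dbuf
  dd if=/dev/urandom of=/tank/large bs=2k count=1   # large file, 2kB dbuf
  truncate -s 0 /tank/large                         # dbuf stays 2kB
  # cloning /tank/small into /tank/large would pair a 1kB BP with a 2kB dbuf

With this change dmu_brt_clone() detects the mismatch up front and returns EXDEV, so the clone request fails cleanly instead of attempting to fit a 1kB BP into a 2kB dbuf.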
Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14825 --- include/sys/dmu.h | 2 +- module/zfs/dmu.c | 29 +++++++++++++++++++++++++---- module/zfs/zfs_vnops.c | 8 ++++++-- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index a5a5c378279a..6a5fb5530498 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -1066,7 +1066,7 @@ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dmu_tx_t *tx, struct blkptr *bps, size_t *nbpsp); -void dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, +int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps, boolean_t replay); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index f8accafd6d12..c1f9d02f0dd8 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2257,7 +2257,7 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, return (error); } -void +int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay) { @@ -2267,7 +2267,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, struct dirty_leaf *dl; dbuf_dirty_record_t *dr; const blkptr_t *bp; - int numbufs; + int error = 0, i, numbufs; spa = os->os_spa; @@ -2275,7 +2275,26 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, &numbufs, &dbp)); ASSERT3U(nbps, ==, numbufs); - for (int i = 0; i < numbufs; i++) { + /* + * Before we start cloning make sure that the dbufs sizes much new BPs + * sizes. If they don't, that's a no-go, as we are not able to shrink + * dbufs. + */ + for (i = 0; i < numbufs; i++) { + dbuf = dbp[i]; + db = (dmu_buf_impl_t *)dbuf; + bp = &bps[i]; + + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) { + error = SET_ERROR(EXDEV); + goto out; + } + } + + for (i = 0; i < numbufs; i++) { dbuf = dbp[i]; db = (dmu_buf_impl_t *)dbuf; bp = &bps[i]; @@ -2319,8 +2338,10 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, brt_pending_add(spa, bp, tx); } } - +out: dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (error); } void diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index a6a27222bf4c..71955f90db03 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1309,8 +1309,12 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, ((len - 1) / inblksz + 1) * inblksz); } - dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, bps, nbps, - B_FALSE); + error = dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, + bps, nbps, B_FALSE); + if (error != 0) { + dmu_tx_commit(tx); + break; + } zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, &clear_setid_bits_txg, tx); From d0d91f185efd9149d8faceb89a9a0e5e54093fc8 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Tue, 2 May 2023 15:46:14 -0700 Subject: [PATCH 102/180] Don't use dmu_buf_is_dirty() for unassigned transaction. The dmu_buf_is_dirty() call doesn't make sense here for two reasons: 1. txg is 0 for unassigned tx, so it was a no-op. 2. It is equivalent of checking if we have dirty records and we are doing this few lines earlier. 
Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14825 --- include/sys/dmu.h | 2 +- module/zfs/dmu.c | 6 +----- module/zfs/zfs_vnops.c | 13 +++++-------- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 6a5fb5530498..5ee6704668a4 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -1065,7 +1065,7 @@ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, - uint64_t length, dmu_tx_t *tx, struct blkptr *bps, size_t *nbpsp); + uint64_t length, struct blkptr *bps, size_t *nbpsp); int dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps, boolean_t replay); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index c1f9d02f0dd8..4e42bb3ef90c 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2173,7 +2173,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, - dmu_tx_t *tx, blkptr_t *bps, size_t *nbpsp) + blkptr_t *bps, size_t *nbpsp) { dmu_buf_t **dbp, *dbuf; dmu_buf_impl_t *db; @@ -2235,10 +2235,6 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, error = SET_ERROR(EAGAIN); goto out; } - if (dmu_buf_is_dirty(dbuf, tx)) { - error = SET_ERROR(EAGAIN); - goto out; - } /* * Make sure we clone only data blocks. */ diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 71955f90db03..dca76227a4ac 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1246,16 +1246,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, break; } - /* - * Start a transaction. - */ - tx = dmu_tx_create(outos); - nbps = maxblocks; - error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, tx, bps, + error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps, &nbps); if (error != 0) { - dmu_tx_abort(tx); /* * If we are tyring to clone a block that was created * in the current transaction group. Return an error, @@ -1276,12 +1270,15 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, */ if (BP_IS_PROTECTED(&bps[0])) { if (inzfsvfs != outzfsvfs) { - dmu_tx_abort(tx); error = SET_ERROR(EXDEV); break; } } + /* + * Start a transaction. + */ + tx = dmu_tx_create(outos); dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE); db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl); DB_DNODE_ENTER(db); From b6d7370b9de5ebc7aae8ada702c3d05b81d28d77 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Wed, 3 May 2023 00:24:47 -0700 Subject: [PATCH 103/180] Don't call zfs_exit_two() before zfs_enter_two(). Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14825 --- module/zfs/zfs_vnops.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index dca76227a4ac..86706469acee 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1072,6 +1072,15 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, inzfsvfs = ZTOZSB(inzp); outzfsvfs = ZTOZSB(outzp); + + /* + * We need to call zfs_enter() potentially on two different datasets, + * so we need a dedicated function for that. 
+ */ + error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG); + if (error != 0) + return (error); + inos = inzfsvfs->z_os; outos = outzfsvfs->z_os; @@ -1083,14 +1092,6 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, return (SET_ERROR(EXDEV)); } - /* - * We need to call zfs_enter() potentially on two different datasets, - * so we need a dedicated function for that. - */ - error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG); - if (error != 0) - return (error); - ASSERT(!outzfsvfs->z_replay); error = zfs_verify_zp(inzp); From 9879930f7a42350a7f5cc0a0edc611c77ae1281e Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Wed, 3 May 2023 23:25:22 -0700 Subject: [PATCH 104/180] Remove badly placed comment. Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14825 --- module/zfs/dmu.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 4e42bb3ef90c..072076ffe91d 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2197,10 +2197,6 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, mutex_enter(&db->db_mtx); - /* - * If the block is not on the disk yet, it has no BP assigned. - * There is not much we can do... - */ if (!list_is_empty(&db->db_dirty_records)) { dbuf_dirty_record_t *dr; From fbbe5e96eff9afadf9def323a246d4dd876eb0bd Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Thu, 4 May 2023 16:14:19 -0700 Subject: [PATCH 105/180] Correct comment. Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14825 --- module/zfs/dmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 072076ffe91d..97379dfc1cd5 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2268,7 +2268,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT3U(nbps, ==, numbufs); /* - * Before we start cloning make sure that the dbufs sizes much new BPs + * Before we start cloning make sure that the dbufs sizes match new BPs * sizes. If they don't, that's a no-go, as we are not able to shrink * dbufs. */ From e6107668385044718b0a73330ed6423650806473 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Tue, 9 May 2023 22:32:30 -0700 Subject: [PATCH 106/180] Make sure we are not trying to clone a spill block. Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #14825 --- module/zfs/dmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 97379dfc1cd5..8a13b8f410a1 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2279,6 +2279,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_SPILL_BLKID); if (!BP_IS_HOLE(bp) && BP_GET_LSIZE(bp) != dbuf->db_size) { error = SET_ERROR(EXDEV); @@ -2293,6 +2294,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_blkid != DMU_SPILL_BLKID); ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); dmu_buf_will_clone(dbuf, tx); From 5b3b6e95c0f3aeea55932d91f469e8edd3c9cd0f Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 12 May 2023 09:07:58 -0700 Subject: [PATCH 107/180] ZTS: Add auto_replace_001_pos to exceptions The auto_replace_001_pos test case does not reliably pass on Fedora 37 and newer. 
Until the test case can be updated to make it reliable add it to the list of "maybe" exceptions on Linux. Signed-off-by: Brian Behlendorf Issue #14851 Closes #14852 --- tests/test-runner/bin/zts-report.py.in | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 63470bc041c6..3f7498f5c6bf 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -264,6 +264,7 @@ elif sys.platform.startswith('linux'): 'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason], 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason], 'fault/auto_online_002_pos': ['FAIL', 11889], + 'fault/auto_replace_001_pos': ['FAIL', 14851], 'fault/auto_spare_002_pos': ['FAIL', 11889], 'fault/auto_spare_multiple': ['FAIL', 11889], 'fault/auto_spare_shared': ['FAIL', 11889], From da211a4a337cce2917fa597d6930cff75f6cca2e Mon Sep 17 00:00:00 2001 From: Don Brady Date: Fri, 12 May 2023 10:12:28 -0600 Subject: [PATCH 108/180] Refine special_small_blocks property validation When the special_small_blocks property is being set during a pool create it enforces a limit of 128KiB even if the pool's record size is larger. If the recordsize property is being set during a pool create, then use that value instead of the default SPA_OLD_MAXBLOCKSIZE value. Reviewed-by: Brian Behlendorf Signed-off-by: Don Brady Closes #13815 Closes #14811 --- lib/libzfs/libzfs_dataset.c | 8 +++- tests/runfiles/common.run | 2 +- tests/zfs-tests/tests/Makefile.am | 2 + .../alloc_class/alloc_class_014_neg.ksh | 38 ++++++++++++++++ .../alloc_class/alloc_class_015_pos.ksh | 45 +++++++++++++++++++ 5 files changed, 93 insertions(+), 2 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index 138eca19acc3..fe9f3268d338 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -1034,6 +1034,7 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, nvlist_t *ret; int chosen_normal = -1; int chosen_utf = -1; + int set_maxbs = 0; if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) { (void) no_memory(hdl); @@ -1252,12 +1253,17 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, (void) zfs_error(hdl, EZFS_BADPROP, errbuf); goto error; } + /* save the ZFS_PROP_RECORDSIZE during create op */ + if (zpool_hdl == NULL && prop == ZFS_PROP_RECORDSIZE) { + set_maxbs = intval; + } break; } case ZFS_PROP_SPECIAL_SMALL_BLOCKS: { - int maxbs = SPA_OLD_MAXBLOCKSIZE; + int maxbs = + set_maxbs == 0 ? 
SPA_OLD_MAXBLOCKSIZE : set_maxbs; char buf[64]; if (zpool_hdl != NULL) { diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index e2137ac596d9..1665e20e0e39 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -37,7 +37,7 @@ tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos', 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos', - 'alloc_class_013_pos'] + 'alloc_class_013_pos', 'alloc_class_014_neg', 'alloc_class_015_pos'] tags = ['functional', 'alloc_class'] [tests/functional/append] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 9299a4ca9b47..a4932fc988ac 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -415,6 +415,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/alloc_class/alloc_class_011_neg.ksh \ functional/alloc_class/alloc_class_012_pos.ksh \ functional/alloc_class/alloc_class_013_pos.ksh \ + functional/alloc_class/alloc_class_014_neg.ksh \ + functional/alloc_class/alloc_class_015_pos.ksh \ functional/alloc_class/cleanup.ksh \ functional/alloc_class/setup.ksh \ functional/append/file_append.ksh \ diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh new file mode 100755 index 000000000000..1b52014fd2d9 --- /dev/null +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_014_neg.ksh @@ -0,0 +1,38 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +. $STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib + +# +# DESCRIPTION: +# Setting the special_small_blocks property greater than recordsize fails. +# + +verify_runnable "global" + +claim="Setting the special_small_blocks property greater than recordsize fails" + +log_assert $claim +log_onexit cleanup +log_must disk_setup + +for size in 512 4096 32768 131072 524288 1048576 +do + let bigger=$size*2 + log_mustnot zpool create -O recordsize=$size \ + -O special_small_blocks=$bigger \ + $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 +done + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh new file mode 100755 index 000000000000..49c468af6702 --- /dev/null +++ b/tests/zfs-tests/tests/functional/alloc_class/alloc_class_015_pos.ksh @@ -0,0 +1,45 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +. 
$STF_SUITE/tests/functional/alloc_class/alloc_class.kshlib + +# +# DESCRIPTION: +# Can set special_small_blocks property less than or equal to recordsize. +# + +verify_runnable "global" + +claim="Can set special_small_blocks property less than or equal to recordsize" + +log_assert $claim +log_onexit cleanup +log_must disk_setup + +for size in 8192 32768 131072 524288 1048576 +do + let smaller=$size/2 + log_must zpool create -O recordsize=$size \ + -O special_small_blocks=$smaller \ + $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + log_must zpool destroy -f "$TESTPOOL" + + log_must zpool create -O recordsize=$size \ + -O special_small_blocks=$size \ + $TESTPOOL raidz $ZPOOL_DISKS special mirror \ + $CLASS_DISK0 $CLASS_DISK1 + log_must zpool destroy -f "$TESTPOOL" +done + +log_pass $claim From 895e03135e4251be0872d96ce38f387bdc13faa2 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 12 May 2023 12:14:29 -0400 Subject: [PATCH 109/180] zil: Some micro-optimizations. Should not cause functional changes. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14854 --- module/zfs/zil.c | 75 +++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 52 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index c37da89dd438..81e1c3be1086 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -146,9 +146,6 @@ static uint64_t zil_slog_bulk = 768 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; -#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ - sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) - static int zil_bp_compare(const void *x1, const void *x2) { @@ -769,11 +766,6 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg, list_insert_tail(&zilog->zl_lwb_list, lwb); mutex_exit(&zilog->zl_lock); - ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); - VERIFY(list_is_empty(&lwb->lwb_waiters)); - VERIFY(list_is_empty(&lwb->lwb_itxs)); - return (lwb); } @@ -782,8 +774,8 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - VERIFY(list_is_empty(&lwb->lwb_waiters)); - VERIFY(list_is_empty(&lwb->lwb_itxs)); + ASSERT(list_is_empty(&lwb->lwb_waiters)); + ASSERT(list_is_empty(&lwb->lwb_itxs)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); @@ -1026,12 +1018,10 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first) if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); - while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { + while ((lwb = list_remove_head(&zilog->zl_lwb_list)) != NULL) { if (lwb->lwb_fastwrite) metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - - list_remove(&zilog->zl_lwb_list, lwb); if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); @@ -1387,6 +1377,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio) spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); @@ -1399,9 +1390,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio) */ lwb->lwb_buf = NULL; - ASSERT3U(lwb->lwb_issued_timestamp, >, 0); - zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 3 + - gethrtime() - lwb->lwb_issued_timestamp) / 4; + 
zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8; lwb->lwb_root_zio = NULL; @@ -1418,17 +1407,12 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } - while ((itx = list_head(&lwb->lwb_itxs)) != NULL) { - list_remove(&lwb->lwb_itxs, itx); + while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx); - } - while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) { + while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); - ASSERT(list_link_active(&zcw->zcw_node)); - list_remove(&lwb->lwb_waiters, zcw); - ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; /* @@ -1581,7 +1565,7 @@ zil_lwb_write_done(zio_t *zio) * write and/or fsync activity, as it has the potential to * coalesce multiple flush commands to a vdev into one. */ - if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) { + if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) { zil_lwb_flush_defer(lwb, nlwb); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); return; @@ -1589,7 +1573,7 @@ zil_lwb_write_done(zio_t *zio) while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); - if (vd != NULL) { + if (vd != NULL && !vd->vdev_nowritecache) { /* * The "ZIO_FLAG_DONT_PROPAGATE" is currently * always used within "zio_flush". This means, @@ -1980,8 +1964,6 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) zilog->zl_cur_used += (reclen + dlen); txg = lrc->lrc_txg; - ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen)); - cont: /* * If this record won't fit in the current log block, start a new one. @@ -1997,7 +1979,6 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) if (lwb == NULL) return (NULL); zil_lwb_write_open(zilog, lwb); - ASSERT(LWB_EMPTY(lwb)); lwb_sp = lwb->lwb_sz - lwb->lwb_nused; /* @@ -2184,7 +2165,7 @@ zil_itxg_clean(void *arg) itx_async_node_t *ian; list = &itxs->i_sync_list; - while ((itx = list_head(list)) != NULL) { + while ((itx = list_remove_head(list)) != NULL) { /* * In the general case, commit itxs will not be found * here, as they'll be committed to an lwb via @@ -2207,7 +2188,6 @@ zil_itxg_clean(void *arg) if (itx->itx_lr.lrc_txtype == TX_COMMIT) zil_commit_waiter_skip(itx->itx_private); - list_remove(list, itx); zil_itx_destroy(itx); } @@ -2215,8 +2195,7 @@ zil_itxg_clean(void *arg) t = &itxs->i_async_tree; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list = &ian->ia_list; - while ((itx = list_head(list)) != NULL) { - list_remove(list, itx); + while ((itx = list_remove_head(list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); @@ -2277,8 +2256,7 @@ zil_remove_async(zilog_t *zilog, uint64_t oid) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); } - while ((itx = list_head(&clean_list)) != NULL) { - list_remove(&clean_list, itx); + while ((itx = list_remove_head(&clean_list)) != NULL) { /* commit itxs should never be on the async lists. */ ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); zil_itx_destroy(itx); @@ -2580,7 +2558,7 @@ zil_commit_writer_stall(zilog_t *zilog) */ ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); txg_wait_synced(zilog->zl_dmu_pool, 0); - ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); } /* @@ -2605,7 +2583,7 @@ zil_process_commit_list(zilog_t *zilog) * Return if there's nothing to commit before we dirty the fs by * calling zil_create(). 
*/ - if (list_head(&zilog->zl_itx_commit_list) == NULL) + if (list_is_empty(&zilog->zl_itx_commit_list)) return; list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); @@ -2629,7 +2607,7 @@ zil_process_commit_list(zilog_t *zilog) plwb->lwb_state == LWB_STATE_FLUSH_DONE); } - while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) { + while ((itx = list_remove_head(&zilog->zl_itx_commit_list)) != NULL) { lr_t *lrc = &itx->itx_lr; uint64_t txg = lrc->lrc_txg; @@ -2643,8 +2621,6 @@ zil_process_commit_list(zilog_t *zilog) zilog_t *, zilog, itx_t *, itx); } - list_remove(&zilog->zl_itx_commit_list, itx); - boolean_t synced = txg <= spa_last_synced_txg(spa); boolean_t frozen = txg > spa_freeze_txg(spa); @@ -2730,20 +2706,16 @@ zil_process_commit_list(zilog_t *zilog) * normal. */ zil_commit_waiter_t *zcw; - while ((zcw = list_head(&nolwb_waiters)) != NULL) { + while ((zcw = list_remove_head(&nolwb_waiters)) != NULL) zil_commit_waiter_skip(zcw); - list_remove(&nolwb_waiters, zcw); - } /* * And finally, we have to destroy the itx's that * couldn't be committed to an lwb; this will also call * the itx's callback if one exists for the itx. */ - while ((itx = list_head(&nolwb_itxs)) != NULL) { - list_remove(&nolwb_itxs, itx); + while ((itx = list_remove_head(&nolwb_itxs)) != NULL) zil_itx_destroy(itx); - } } else { ASSERT(list_is_empty(&nolwb_waiters)); ASSERT3P(lwb, !=, NULL); @@ -2951,7 +2923,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) */ lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); - IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); /* * Since the lwb's zio hadn't been issued by the time this thread @@ -3429,7 +3401,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) blkptr_t blk = zh->zh_log; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); - ASSERT(list_head(&zilog->zl_lwb_list) == NULL); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); memset(zh, 0, sizeof (zil_header_t)); memset(zilog->zl_replayed_seq, 0, @@ -3473,7 +3445,7 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) * out the zil_header blkptr so that we don't end * up freeing the same block twice. */ - if (list_head(&zilog->zl_lwb_list) == NULL) + if (list_is_empty(&zilog->zl_lwb_list)) BP_ZERO(&zh->zh_log); } @@ -3674,7 +3646,7 @@ zil_close(zilog_t *zilog) if (!dmu_objset_is_snapshot(zilog->zl_os)) { zil_commit(zilog, 0); } else { - ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT0(zilog->zl_dirty_max_txg); ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); } @@ -3716,15 +3688,14 @@ zil_close(zilog_t *zilog) * We should have only one lwb left on the list; remove it now. */ mutex_enter(&zilog->zl_lock); - lwb = list_head(&zilog->zl_lwb_list); + lwb = list_remove_head(&zilog->zl_lwb_list); if (lwb != NULL) { - ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list)); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); if (lwb->lwb_fastwrite) metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); - list_remove(&zilog->zl_lwb_list, lwb); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zil_free_lwb(zilog, lwb); } From 7381ddf1abd16152646c921384c094ffbcae2271 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 12 May 2023 12:49:26 -0400 Subject: [PATCH 110/180] zil: Free lwb_buf after write completion. There is no sense to keep that memory allocated during the flush. 
Reviewed-by: Brian Behlendorf Reviewed-by: Prakash Surya Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14855 --- module/zfs/zil.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 81e1c3be1086..d887e4900d1d 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1376,20 +1376,10 @@ zil_lwb_flush_vdevs_done(zio_t *zio) spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); - /* - * If we have had an allocation failure and the txg is - * waiting to sync then we want zil_sync() to remove the lwb so - * that it's not picked up as the next new one in - * zil_process_commit_list(). zil_sync() will only remove the - * lwb if lwb_buf is null. - */ - lwb->lwb_buf = NULL; - zilog->zl_last_lwb_latency = (zilog->zl_last_lwb_latency * 7 + t) / 8; lwb->lwb_root_zio = NULL; @@ -1475,7 +1465,8 @@ zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg) IMPLY(lwb->lwb_issued_txg > 0, lwb->lwb_state == LWB_STATE_FLUSH_DONE); } - IMPLY(lwb->lwb_state == LWB_STATE_FLUSH_DONE, + IMPLY(lwb->lwb_state == LWB_STATE_WRITE_DONE || + lwb->lwb_state == LWB_STATE_FLUSH_DONE, lwb->lwb_buf == NULL); lwb = list_next(&zilog->zl_lwb_list, lwb); } @@ -1519,6 +1510,8 @@ zil_lwb_write_done(zio_t *zio) ASSERT(BP_GET_FILL(zio->io_bp) == 0); abd_free(zio->io_abd); + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + lwb->lwb_buf = NULL; mutex_enter(&zilog->zl_lock); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); @@ -3433,7 +3426,8 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; - if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) + if (lwb->lwb_state != LWB_STATE_FLUSH_DONE || + lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); zio_free(spa, txg, &lwb->lwb_blk); From c87798d8ff6a63158e80acbbce8b034518a1656e Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Fri, 12 May 2023 16:47:56 -0400 Subject: [PATCH 111/180] Fix use after free regression in spa_remove_healed_errors() 6839ec6f1098c28ff7b772f1b31b832d05e6b567 placed code in spa_remove_healed_errors() that uses a pointer after the kmem_free() call that frees it. 
Reported-by: Coverity (CID-1562375) Reviewed-by: Brian Behlendorf Reviewed-by: George Amanakis Signed-off-by: Richard Yao Closes #14860 --- module/zfs/spa_errlog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 31719063a227..5fe35278683a 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -683,7 +683,6 @@ spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) &cookie)) != NULL) { remove_error_from_list(spa, s, &se->se_bookmark); remove_error_from_list(spa, l, &se->se_bookmark); - kmem_free(se, sizeof (spa_error_entry_t)); if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { bookmark_to_name(&se->se_bookmark, name, sizeof (name)); @@ -713,6 +712,7 @@ spa_remove_healed_errors(spa_t *spa, avl_tree_t *s, avl_tree_t *l, dmu_tx_t *tx) } zap_cursor_fini(&zc); } + kmem_free(se, sizeof (spa_error_entry_t)); } } From ee7b71dbc919439b1db6352bcd95f121127b42dd Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Fri, 12 May 2023 17:10:14 -0400 Subject: [PATCH 112/180] Fix undefined behavior in spa_sync_props() 8eae2d214cfa53862833eeeda9a5c1e9d5ded47d caused Coverity to begin complaining about "Improper use of negative value" in two places in spa_sync_props() because Coverity correctly inferred from `prop == ZPOOL_PROP_INVAL` that prop could be -1 while both zpool_prop_to_name() and zpool_prop_get_type() use it an array index, which is undefined behavior. Assuming that the system does not panic from an attempt to read invalid memory, the case statement for ZPOOL_PROP_INVAL will ensure that only user properties will reach this code when prop is ZPOOL_PROP_INVAL, such that execution will continue safely. However, if we are unlucky enough to read invalid memory, then the system will panic. This issue predates the patch that caused coverity to begin complaining. Thankfully, our userland tools do not pass nonsense to us, so this bug should not be triggered unless a future userland tool attempts to set a property that we do not understand. Reported-by: Coverity (CID-1561129) Reported-by: Coverity (CID-1561130) Reviewed-by: Brian Behlendorf Reviewed-by: George Amanakis Signed-off-by: Richard Yao Closes #14860 --- module/zfs/spa.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 16396170273c..1ca114783ce4 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -8942,12 +8942,12 @@ spa_sync_props(void *arg, dmu_tx_t *tx) } /* normalize the property name */ - propname = zpool_prop_to_name(prop); - proptype = zpool_prop_get_type(prop); - if (prop == ZPOOL_PROP_INVAL && - zfs_prop_user(elemname)) { + if (prop == ZPOOL_PROP_INVAL) { propname = elemname; proptype = PROP_TYPE_STRING; + } else { + propname = zpool_prop_to_name(prop); + proptype = zpool_prop_get_type(prop); } if (nvpair_type(elem) == DATA_TYPE_STRING) { From e0d5007bcf7e4425d43ba2ad56489c7db5c4a4c5 Mon Sep 17 00:00:00 2001 From: Antonio Russo Date: Mon, 15 May 2023 17:11:33 -0600 Subject: [PATCH 113/180] test-runner: pass kmemleak and kmsg to Cmd.run test-runner.py orchestrates all of the ZTS executions. The `Cmd` object manages these process, and its `run` method specifically invokes these possibly long-running processes, possibly retrying in the event of a timeout. Since its inception, memory leak detection using the kmemleak infrastructure [1], and kernel logging [2] have been added to this run mechanism. 
However, the callback to cull a process beyond its timeout threshold, `kill_cmd`, has evaded modernization by both of these changes. As a result, this function fails to properly invoke `run`, leading to an untrapped exception and unreported test failure. This patch extends `kill_cmd` to receive these kernel devices through the `options` parameter, and regularizes all the `.run` calls from `Cmd`, and its subclasses, to accept that parameter. [1] Commit a69765ea5b563e0cd4d15fac4b1ac08c6ccf12d1 [2] Commit fc2c0256c55a2859d1988671b0896d22b75c8aba Reviewed-by: John Wren Kennedy Signed-off-by: Antonio Russo Closes #14849 --- tests/test-runner/bin/test-runner.py.in | 38 ++++++++++++++----------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/tests/test-runner/bin/test-runner.py.in b/tests/test-runner/bin/test-runner.py.in index c454bf8d7c6a..422ebd7bc8bf 100755 --- a/tests/test-runner/bin/test-runner.py.in +++ b/tests/test-runner/bin/test-runner.py.in @@ -181,7 +181,7 @@ Timeout: %d User: %s ''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user) - def kill_cmd(self, proc, keyboard_interrupt=False): + def kill_cmd(self, proc, options, kmemleak, keyboard_interrupt=False): """ Kill a running command due to timeout, or ^C from the keyboard. If sudo is required, this user was verified previously. @@ -211,7 +211,7 @@ User: %s if int(self.timeout) > runtime: self.killed = False self.reran = False - self.run(False) + self.run(options, dryrun=False, kmemleak=kmemleak) self.reran = True def update_cmd_privs(self, cmd, user): @@ -257,15 +257,19 @@ User: %s return out.lines, err.lines - def run(self, dryrun, kmemleak, kmsg): + def run(self, options, dryrun=None, kmemleak=None): """ This is the main function that runs each individual test. Determine whether or not the command requires sudo, and modify it if needed. Run the command, and update the result object. """ + if dryrun is None: + dryrun = options.dryrun if dryrun is True: print(self) return + if kmemleak is None: + kmemleak = options.kmemleak privcmd = self.update_cmd_privs(self.pathname, self.user) try: @@ -280,7 +284,7 @@ User: %s Log each test we run to /dev/kmsg (on Linux), so if there's a kernel warning we'll be able to match it up to a particular test. """ - if kmsg is True and exists("/dev/kmsg"): + if options.kmsg is True and exists("/dev/kmsg"): try: kp = Popen([SUDO, "sh", "-c", f"echo ZTS run {self.pathname} > /dev/kmsg"]) @@ -298,7 +302,9 @@ User: %s # Allow a special timeout value of 0 to mean infinity if int(self.timeout) == 0: self.timeout = sys.maxsize / (10 ** 9) - t = Timer(int(self.timeout), self.kill_cmd, [proc]) + t = Timer( + int(self.timeout), self.kill_cmd, [proc, options, kmemleak] + ) try: t.start() @@ -310,7 +316,7 @@ User: %s cmd = f'{SUDO} cat {KMEMLEAK_FILE}' self.result.kmemleak = check_output(cmd, shell=True) except KeyboardInterrupt: - self.kill_cmd(proc, True) + self.kill_cmd(proc, options, kmemleak, True) fail('\nRun terminated at user request.') finally: t.cancel() @@ -450,7 +456,7 @@ Tags: %s return True - def run(self, options): + def run(self, options, dryrun=None, kmemleak=None): """ Create Cmd instances for the pre/post/failsafe scripts. If the pre script doesn't pass, skip this Test. Run the post script regardless. 
@@ -472,14 +478,14 @@ Tags: %s cont = True if len(pretest.pathname): - pretest.run(options.dryrun, False, options.kmsg) + pretest.run(options, kmemleak=False) cont = pretest.result.result == 'PASS' pretest.log(options) if cont: - test.run(options.dryrun, options.kmemleak, options.kmsg) + test.run(options, kmemleak=kmemleak) if test.result.result == 'KILLED' and len(failsafe.pathname): - failsafe.run(options.dryrun, False, options.kmsg) + failsafe.run(options, kmemleak=False) failsafe.log(options, suppress_console=True) else: test.skip() @@ -487,7 +493,7 @@ Tags: %s test.log(options) if len(posttest.pathname): - posttest.run(options.dryrun, False, options.kmsg) + posttest.run(options, kmemleak=False) posttest.log(options) @@ -571,7 +577,7 @@ Tags: %s return len(self.tests) != 0 - def run(self, options): + def run(self, options, dryrun=None, kmemleak=None): """ Create Cmd instances for the pre/post/failsafe scripts. If the pre script doesn't pass, skip all the tests in this TestGroup. Run the @@ -590,7 +596,7 @@ Tags: %s cont = True if len(pretest.pathname): - pretest.run(options.dryrun, False, options.kmsg) + pretest.run(options, dryrun=dryrun, kmemleak=False) cont = pretest.result.result == 'PASS' pretest.log(options) @@ -603,9 +609,9 @@ Tags: %s failsafe = Cmd(self.failsafe, outputdir=odir, timeout=self.timeout, user=self.failsafe_user, identifier=self.identifier) if cont: - test.run(options.dryrun, options.kmemleak, options.kmsg) + test.run(options, dryrun=dryrun, kmemleak=kmemleak) if test.result.result == 'KILLED' and len(failsafe.pathname): - failsafe.run(options.dryrun, False, options.kmsg) + failsafe.run(options, dryrun=dryrun, kmemleak=False) failsafe.log(options, suppress_console=True) else: test.skip() @@ -613,7 +619,7 @@ Tags: %s test.log(options) if len(posttest.pathname): - posttest.run(options.dryrun, False, options.kmsg) + posttest.run(options, dryrun=dryrun, kmemleak=False) posttest.log(options) From e34e15ed6d1882d29e314321b7642305d99f1b78 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 18 May 2023 10:02:20 -0700 Subject: [PATCH 114/180] Add the ability to uninitialize zpool initialize functions well for touching every free byte...once. But if we want to do it again, we're currently out of luck. So let's add zpool initialize -u to clear it. 
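For illustration, the intended workflow looks roughly like this (the pool name "tank" is hypothetical; the zpool-initialize.8 update below documents the exact syntax):

  zpool initialize tank       # touch every free block once
  zpool initialize -u tank    # clear the recorded initialization state
  zpool initialize tank       # a full re-initialization is possible again

Note that -u refuses to run while initialization is still active on the targeted vdevs, so cancel (-c) or suspend (-s) it first, and -u cannot be combined with -w.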
Co-authored-by: Rich Ercolani Signed-off-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #12451 Closes #14873 --- cmd/zpool/zpool_main.c | 22 ++- include/sys/fs/zfs.h | 1 + include/sys/vdev_initialize.h | 1 + lib/libzfs/libzfs.abi | 3 +- lib/libzfs/libzfs_pool.c | 15 +- lib/libzfs_core/libzfs_core.abi | 3 +- man/man8/zpool-initialize.8 | 10 +- module/zfs/spa.c | 7 + module/zfs/vdev_initialize.c | 66 +++++++- module/zfs/zfs_ioctl.c | 3 +- tests/runfiles/common.run | 1 + tests/zfs-tests/tests/Makefile.am | 1 + .../zpool_initialize_uninit.ksh | 141 ++++++++++++++++++ 13 files changed, 258 insertions(+), 16 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 301c5f4bfc6f..3e08e031414d 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -398,7 +398,7 @@ get_usage(zpool_help_t idx) case HELP_REOPEN: return (gettext("\treopen [-n] \n")); case HELP_INITIALIZE: - return (gettext("\tinitialize [-c | -s] [-w] " + return (gettext("\tinitialize [-c | -s | -u] [-w] " "[ ...]\n")); case HELP_SCRUB: return (gettext("\tscrub [-s | -p] [-w] ...\n")); @@ -585,12 +585,13 @@ usage(boolean_t requested) } /* - * zpool initialize [-c | -s] [-w] [ ...] + * zpool initialize [-c | -s | -u] [-w] [ ...] * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool * if none specified. * * -c Cancel. Ends active initializing. * -s Suspend. Initializing can then be restarted with no flags. + * -u Uninitialize. Clears initialization state. * -w Wait. Blocks until initializing has completed. */ int @@ -606,12 +607,14 @@ zpool_do_initialize(int argc, char **argv) struct option long_options[] = { {"cancel", no_argument, NULL, 'c'}, {"suspend", no_argument, NULL, 's'}, + {"uninit", no_argument, NULL, 'u'}, {"wait", no_argument, NULL, 'w'}, {0, 0, 0, 0} }; pool_initialize_func_t cmd_type = POOL_INITIALIZE_START; - while ((c = getopt_long(argc, argv, "csw", long_options, NULL)) != -1) { + while ((c = getopt_long(argc, argv, "csuw", long_options, + NULL)) != -1) { switch (c) { case 'c': if (cmd_type != POOL_INITIALIZE_START && @@ -631,6 +634,15 @@ zpool_do_initialize(int argc, char **argv) } cmd_type = POOL_INITIALIZE_SUSPEND; break; + case 'u': + if (cmd_type != POOL_INITIALIZE_START && + cmd_type != POOL_INITIALIZE_UNINIT) { + (void) fprintf(stderr, gettext("-u cannot be " + "combined with other options\n")); + usage(B_FALSE); + } + cmd_type = POOL_INITIALIZE_UNINIT; + break; case 'w': wait = B_TRUE; break; @@ -657,8 +669,8 @@ zpool_do_initialize(int argc, char **argv) } if (wait && (cmd_type != POOL_INITIALIZE_START)) { - (void) fprintf(stderr, gettext("-w cannot be used with -c or " - "-s\n")); + (void) fprintf(stderr, gettext("-w cannot be used with -c, -s" + "or -u\n")); usage(B_FALSE); } diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 0734ff12280e..4c2097fb830e 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1265,6 +1265,7 @@ typedef enum pool_initialize_func { POOL_INITIALIZE_START, POOL_INITIALIZE_CANCEL, POOL_INITIALIZE_SUSPEND, + POOL_INITIALIZE_UNINIT, POOL_INITIALIZE_FUNCS } pool_initialize_func_t; diff --git a/include/sys/vdev_initialize.h b/include/sys/vdev_initialize.h index 4e63f063cb66..78702b7325a0 100644 --- a/include/sys/vdev_initialize.h +++ b/include/sys/vdev_initialize.h @@ -33,6 +33,7 @@ extern "C" { #endif extern void vdev_initialize(vdev_t *vd); +extern void vdev_uninitialize(vdev_t *vd); extern 
void vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state, list_t *vd_list); extern void vdev_initialize_stop_all(vdev_t *vd, diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 732863dcffc7..57b096ca6e96 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -5741,7 +5741,8 @@ - + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 4fb71b4e0dc8..a71cb24736a9 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2387,8 +2387,8 @@ xlate_init_err(int err) } /* - * Begin, suspend, or cancel the initialization (initializing of all free - * blocks) for the given vdevs in the given pool. + * Begin, suspend, cancel, or uninit (clear) the initialization (initializing + * of all free blocks) for the given vdevs in the given pool. */ static int zpool_initialize_impl(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, @@ -2414,11 +2414,16 @@ zpool_initialize_impl(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, vdev_guids, &errlist); if (err != 0) { - if (errlist != NULL) { - vd_errlist = fnvlist_lookup_nvlist(errlist, - ZPOOL_INITIALIZE_VDEVS); + if (errlist != NULL && nvlist_lookup_nvlist(errlist, + ZPOOL_INITIALIZE_VDEVS, &vd_errlist) == 0) { goto list_errors; } + + if (err == EINVAL && cmd_type == POOL_INITIALIZE_UNINIT) { + zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, + "uninitialize is not supported by kernel")); + } + (void) zpool_standard_error(zhp->zpool_hdl, err, dgettext(TEXT_DOMAIN, "operation failed")); goto out; diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index ec94a4650553..33d794e3f809 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -1249,7 +1249,8 @@ - + + diff --git a/man/man8/zpool-initialize.8 b/man/man8/zpool-initialize.8 index eae711bff429..a9c8fd35aec9 100644 --- a/man/man8/zpool-initialize.8 +++ b/man/man8/zpool-initialize.8 @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm zpool .Cm initialize -.Op Fl c Ns | Ns Fl s +.Op Fl c Ns | Ns Fl s | Ns Fl u .Op Fl w .Ar pool .Oo Ar device Oc Ns … @@ -60,6 +60,14 @@ initialized, the command will fail and no suspension will occur on any device. Initializing can then be resumed by running .Nm zpool Cm initialize with no flags on the relevant target devices. +.It Fl u , -uninit +Clears the initialization state on the specified devices, or all eligible +devices if none are specified. +If the devices are being actively initialized the command will fail. +After being cleared +.Nm zpool Cm initialize +with no flags can be used to re-initialize all unallocoated regions on +the relevant target devices. .It Fl w , -wait Wait until the devices have finished initializing before returning. 
.El diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 1ca114783ce4..51d6de9105fb 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -7421,6 +7421,10 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { mutex_exit(&vd->vdev_initialize_lock); return (SET_ERROR(ESRCH)); + } else if (cmd_type == POOL_INITIALIZE_UNINIT && + vd->vdev_initialize_thread != NULL) { + mutex_exit(&vd->vdev_initialize_lock); + return (SET_ERROR(EBUSY)); } switch (cmd_type) { @@ -7433,6 +7437,9 @@ spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, case POOL_INITIALIZE_SUSPEND: vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); break; + case POOL_INITIALIZE_UNINIT: + vdev_uninitialize(vd); + break; default: panic("invalid cmd_type %llu", (unsigned long long)cmd_type); } diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c index 75beb0cc3d12..ffdcef1972c3 100644 --- a/module/zfs/vdev_initialize.c +++ b/module/zfs/vdev_initialize.c @@ -96,6 +96,39 @@ vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) &initialize_state, tx)); } +static void +vdev_initialize_zap_remove_sync(void *arg, dmu_tx_t *tx) +{ + uint64_t guid = *(uint64_t *)arg; + + kmem_free(arg, sizeof (uint64_t)); + + vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + return; + + ASSERT3S(vd->vdev_initialize_state, ==, VDEV_INITIALIZE_NONE); + ASSERT3U(vd->vdev_leaf_zap, !=, 0); + + vd->vdev_initialize_last_offset = 0; + vd->vdev_initialize_action_time = 0; + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + int error; + + error = zap_remove(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, tx); + VERIFY(error == 0 || error == ENOENT); + + error = zap_remove(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_STATE, tx); + VERIFY(error == 0 || error == ENOENT); + + error = zap_remove(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, tx); + VERIFY(error == 0 || error == ENOENT); +} + static void vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) { @@ -123,8 +156,14 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); - dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, - guid, tx); + + if (new_state != VDEV_INITIALIZE_NONE) { + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_initialize_zap_update_sync, guid, tx); + } else { + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_initialize_zap_remove_sync, guid, tx); + } switch (new_state) { case VDEV_INITIALIZE_ACTIVE: @@ -145,6 +184,10 @@ vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) spa_history_log_internal(spa, "initialize", tx, "vdev=%s complete", vd->vdev_path); break; + case VDEV_INITIALIZE_NONE: + spa_history_log_internal(spa, "uninitialize", tx, + "vdev=%s", vd->vdev_path); + break; default: panic("invalid state %llu", (unsigned long long)new_state); } @@ -594,6 +637,24 @@ vdev_initialize(vdev_t *vd) vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri); } +/* + * Uninitializes a device. Caller must hold vdev_initialize_lock. + * Device must be a leaf and not already be initializing. 
+ */ +void +vdev_uninitialize(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_initialize_exit_wanted); + ASSERT(!vd->vdev_top->vdev_removing); + + vdev_initialize_change_state(vd, VDEV_INITIALIZE_NONE); +} + /* * Wait for the initialize thread to be terminated (cancelled or stopped). */ @@ -750,6 +811,7 @@ vdev_initialize_restart(vdev_t *vd) } EXPORT_SYMBOL(vdev_initialize); +EXPORT_SYMBOL(vdev_uninitialize); EXPORT_SYMBOL(vdev_initialize_stop); EXPORT_SYMBOL(vdev_initialize_stop_all); EXPORT_SYMBOL(vdev_initialize_stop_wait); diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index 3b1e2ae5fb5d..efaf6f9b390a 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -4070,7 +4070,8 @@ zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) if (!(cmd_type == POOL_INITIALIZE_CANCEL || cmd_type == POOL_INITIALIZE_START || - cmd_type == POOL_INITIALIZE_SUSPEND)) { + cmd_type == POOL_INITIALIZE_SUSPEND || + cmd_type == POOL_INITIALIZE_UNINIT)) { return (SET_ERROR(EINVAL)); } diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 1665e20e0e39..62d9cbeb6d90 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -446,6 +446,7 @@ tests = ['zpool_initialize_attach_detach_add_remove', 'zpool_initialize_start_and_cancel_neg', 'zpool_initialize_start_and_cancel_pos', 'zpool_initialize_suspend_resume', + 'zpool_initialize_uninit', 'zpool_initialize_unsupported_vdevs', 'zpool_initialize_verify_checksums', 'zpool_initialize_verify_initialized'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index a4932fc988ac..3e4120f52ca5 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1102,6 +1102,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh \ + functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh \ functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh new file mode 100755 index 000000000000..17f776cfbc20 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_uninit.ksh @@ -0,0 +1,141 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (C) 2023 Lawrence Livermore National Security, LLC. +# +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib + +# +# DESCRIPTION: +# Starting, stopping, uninitializing, and restart an initialize works. +# +# STRATEGY: +# 1. Create a one-disk pool. +# 2. Verify uninitialize succeeds for uninitialized pool. +# 3. Verify pool wide cancel|suspend + uninit +# a. Start initializing and verify that initializing is active. +# b. Verify uninitialize fails when actively initializing. +# c. Cancel or suspend initializing and verify that initializing is not active. +# d. Verify uninitialize succeeds after being cancelled. +# 4. Verify per-disk cancel|suspend + uninit +# + +DISK1="$(echo $DISKS | cut -d' ' -f1)" +DISK2="$(echo $DISKS | cut -d' ' -f2)" +DISK3="$(echo $DISKS | cut -d' ' -f3)" + +function status_check # pool disk1-state disk2-state disk3-state +{ + typeset pool="$1" + typeset disk1_state="$2" + typeset disk2_state="$3" + typeset disk3_state="$4" + + state=$(zpool status -i "$pool" | grep "$DISK1" | grep "$disk1_state") + if [[ -z "$state" ]]; then + log_fail "DISK1 state; expected='$disk1_state' got '$state'" + fi + + state=$(zpool status -i "$pool" | grep "$DISK2" | grep "$disk2_state") + if [[ -z "$state" ]]; then + log_fail "DISK2 state; expected='$disk2_state' got '$state'" + fi + + state=$(zpool status -i "$pool" | grep "$DISK3" | grep "$disk3_state") + if [[ -z "$state" ]]; then + log_fail "DISK3 state; expected='$disk3_state' got '$state'" + fi +} + +function status_check_all # pool disk-state +{ + typeset pool="$1" + typeset disk_state="$2" + + status_check "$pool" "$disk_state" "$disk_state" "$disk_state" +} + +# 1. Create a one-disk pool. +log_must zpool create -f $TESTPOOL $DISK1 $DISK2 $DISK3 +status_check_all $TESTPOOL "uninitialized" + +# 2. Verify uninitialize succeeds for uninitialized pool. +log_must zpool initialize -u $TESTPOOL +status_check_all $TESTPOOL "uninitialized" + +# 3. Verify pool wide cancel + uninit +log_must zpool initialize $TESTPOOL +status_check_all $TESTPOOL "[[:digit:]]* initialized" + +log_mustnot zpool initialize -u $TESTPOOL +status_check_all $TESTPOOL "[[:digit:]]* initialized" + +log_must zpool initialize -c $TESTPOOL +status_check_all $TESTPOOL "uninitialized" + +log_must zpool initialize -u $TESTPOOL +status_check_all $TESTPOOL "uninitialized" + +# 3. Verify pool wide suspend + uninit +log_must zpool initialize $TESTPOOL +status_check_all $TESTPOOL "[[:digit:]]* initialized" + +log_mustnot zpool initialize -u $TESTPOOL +status_check_all $TESTPOOL "[[:digit:]]* initialized" + +log_must zpool initialize -s $TESTPOOL +status_check_all $TESTPOOL "suspended" + +log_must zpool initialize -u $TESTPOOL +status_check_all $TESTPOOL "uninitialized" + +# 4. 
Verify per-disk cancel|suspend + uninit +log_must zpool initialize $TESTPOOL +status_check_all $TESTPOOL "[[:digit:]]* initialized" + +log_must zpool initialize -c $TESTPOOL $DISK1 +log_must zpool initialize -s $TESTPOOL $DISK2 +log_mustnot zpool initialize -u $TESTPOOL $DISK3 +status_check $TESTPOOL "uninitialized" "suspended" "[[:digit:]]* initialized" + +log_must zpool initialize -u $TESTPOOL $DISK1 +status_check $TESTPOOL "uninitialized" "suspended" "[[:digit:]]* initialized" + +log_must zpool initialize -u $TESTPOOL $DISK2 +status_check $TESTPOOL "uninitialized" "uninitialized" "[[:digit:]]* initialized" + +log_must zpool initialize $TESTPOOL $DISK1 +status_check $TESTPOOL "[[:digit:]]* initialized" "uninitialized" "[[:digit:]]* initialized" + +log_must zpool initialize $TESTPOOL $DISK2 +status_check_all $TESTPOOL "[[:digit:]]* initialized" + +log_must zpool initialize -s $TESTPOOL +status_check_all $TESTPOOL "suspended" + +log_must zpool initialize -u $TESTPOOL $DISK1 $DISK2 $DISK3 +status_check_all $TESTPOOL "uninitialized" + +log_pass "Initialize start + cancel/suspend + uninit + start works" From 482eeef804f0f325faddb102f112c0f1ec86a1b6 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Fri, 17 Dec 2021 21:35:28 +0100 Subject: [PATCH 115/180] Teach zpool scrub to scrub only blocks in error log Added a flag '-e' in zpool scrub to scrub only blocks in error log. A user can pause, resume and cancel the error scrub by passing additional command line arguments -p -s just like a regular scrub. This involves adding a new flag, creating new libzfs interfaces, a new ioctl, and the actual iteration and read-issuing logic. Error scrubbing is executed in multiple txg to make sure pool performance is not affected. Reviewed-by: Brian Behlendorf Reviewed-by: Tony Hutter Co-authored-by: TulsiJain tulsi.jain@delphix.com Signed-off-by: George Amanakis Closes #8995 Closes #12355 --- cmd/zpool/zpool_main.c | 111 ++- include/libzfs.h | 3 + include/libzfs_core.h | 2 + include/sys/dmu.h | 1 + include/sys/dsl_scan.h | 27 +- include/sys/fs/zfs.h | 19 +- include/sys/spa.h | 8 + include/sys/spa_impl.h | 4 + include/sys/sysevent/eventdefs.h | 5 + lib/libzfs/libzfs.abi | 3 +- lib/libzfs/libzfs_pool.c | 105 ++- lib/libzfs/libzfs_util.c | 14 +- lib/libzfs_core/libzfs_core.abi | 105 +++ lib/libzfs_core/libzfs_core.c | 7 + man/man4/zfs.4 | 3 + man/man8/zpool-scrub.8 | 19 + module/zfs/dsl_scan.c | 696 +++++++++++++++++- module/zfs/spa.c | 6 + module/zfs/spa_errlog.c | 82 ++- module/zfs/spa_misc.c | 25 +- module/zfs/zfs_ioctl.c | 46 ++ tests/runfiles/common.run | 4 +- tests/zfs-tests/cmd/libzfs_input_check.c | 15 + tests/zfs-tests/include/libtest.shlib | 18 + tests/zfs-tests/tests/Makefile.am | 4 + .../zpool_scrub/zpool_error_scrub_001_pos.ksh | 79 ++ .../zpool_scrub/zpool_error_scrub_002_pos.ksh | 99 +++ .../zpool_scrub/zpool_error_scrub_003_pos.ksh | 109 +++ .../zpool_scrub/zpool_error_scrub_004_pos.ksh | 54 ++ 29 files changed, 1602 insertions(+), 71 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 3e08e031414d..013dd4a23380 100644 --- a/cmd/zpool/zpool_main.c +++ 
b/cmd/zpool/zpool_main.c @@ -401,7 +401,7 @@ get_usage(zpool_help_t idx) return (gettext("\tinitialize [-c | -s | -u] [-w] " "[ ...]\n")); case HELP_SCRUB: - return (gettext("\tscrub [-s | -p] [-w] ...\n")); + return (gettext("\tscrub [-s | -p] [-w] [-e] ...\n")); case HELP_RESILVER: return (gettext("\tresilver ...\n")); case HELP_TRIM: @@ -7309,8 +7309,9 @@ wait_callback(zpool_handle_t *zhp, void *data) } /* - * zpool scrub [-s | -p] [-w] ... + * zpool scrub [-s | -p] [-w] [-e] ... * + * -e Only scrub blocks in the error log. * -s Stop. Stops any in-progress scrub. * -p Pause. Pause in-progress scrub. * -w Wait. Blocks until scrub has completed. @@ -7326,14 +7327,21 @@ zpool_do_scrub(int argc, char **argv) cb.cb_type = POOL_SCAN_SCRUB; cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; + boolean_t is_error_scrub = B_FALSE; + boolean_t is_pause = B_FALSE; + boolean_t is_stop = B_FALSE; + /* check options */ - while ((c = getopt(argc, argv, "spw")) != -1) { + while ((c = getopt(argc, argv, "spwe")) != -1) { switch (c) { + case 'e': + is_error_scrub = B_TRUE; + break; case 's': - cb.cb_type = POOL_SCAN_NONE; + is_stop = B_TRUE; break; case 'p': - cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; + is_pause = B_TRUE; break; case 'w': wait = B_TRUE; @@ -7345,11 +7353,21 @@ zpool_do_scrub(int argc, char **argv) } } - if (cb.cb_type == POOL_SCAN_NONE && - cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) { - (void) fprintf(stderr, gettext("invalid option combination: " - "-s and -p are mutually exclusive\n")); + if (is_pause && is_stop) { + (void) fprintf(stderr, gettext("invalid option " + "combination :-s and -p are mutually exclusive\n")); usage(B_FALSE); + } else { + if (is_error_scrub) + cb.cb_type = POOL_SCAN_ERRORSCRUB; + + if (is_pause) { + cb.cb_scrub_cmd = POOL_SCRUB_PAUSE; + } else if (is_stop) { + cb.cb_type = POOL_SCAN_NONE; + } else { + cb.cb_scrub_cmd = POOL_SCRUB_NORMAL; + } } if (wait && (cb.cb_type == POOL_SCAN_NONE || @@ -7573,6 +7591,70 @@ secs_to_dhms(uint64_t total, char *buf) } } +/* + * Print out detailed error scrub status. + */ +static void +print_err_scrub_status(pool_scan_stat_t *ps) +{ + time_t start, end, pause; + uint64_t total_secs_left; + uint64_t secs_left, mins_left, hours_left, days_left; + uint64_t examined, to_be_examined; + + if (ps == NULL || ps->pss_error_scrub_func != POOL_SCAN_ERRORSCRUB) { + return; + } + + (void) printf(gettext(" scrub: ")); + + start = ps->pss_error_scrub_start; + end = ps->pss_error_scrub_end; + pause = ps->pss_pass_error_scrub_pause; + examined = ps->pss_error_scrub_examined; + to_be_examined = ps->pss_error_scrub_to_be_examined; + + assert(ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB); + + if (ps->pss_error_scrub_state == DSS_FINISHED) { + total_secs_left = end - start; + days_left = total_secs_left / 60 / 60 / 24; + hours_left = (total_secs_left / 60 / 60) % 24; + mins_left = (total_secs_left / 60) % 60; + secs_left = (total_secs_left % 60); + + (void) printf(gettext("scrubbed %llu error blocks in %llu days " + "%02llu:%02llu:%02llu on %s"), (u_longlong_t)examined, + (u_longlong_t)days_left, (u_longlong_t)hours_left, + (u_longlong_t)mins_left, (u_longlong_t)secs_left, + ctime(&end)); + + return; + } else if (ps->pss_error_scrub_state == DSS_CANCELED) { + (void) printf(gettext("error scrub canceled on %s"), + ctime(&end)); + return; + } + assert(ps->pss_error_scrub_state == DSS_ERRORSCRUBBING); + + /* Error scrub is in progress. 
*/ + if (pause == 0) { + (void) printf(gettext("error scrub in progress since %s"), + ctime(&start)); + } else { + (void) printf(gettext("error scrub paused since %s"), + ctime(&pause)); + (void) printf(gettext("\terror scrub started on %s"), + ctime(&start)); + } + + double fraction_done = (double)examined / (to_be_examined + examined); + (void) printf(gettext("\t%.2f%% done, issued I/O for %llu error" + " blocks"), 100 * fraction_done, (u_longlong_t)examined); + + (void) printf("\n"); +} + /* * Print out detailed scrub status. */ @@ -7909,10 +7991,12 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) { uint64_t rebuild_end_time = 0, resilver_end_time = 0; boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE; + boolean_t have_errorscrub = B_FALSE; boolean_t active_resilver = B_FALSE; pool_checkpoint_stat_t *pcs = NULL; pool_scan_stat_t *ps = NULL; uint_t c; + time_t scrub_start = 0, errorscrub_start = 0; if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c) == 0) { @@ -7921,16 +8005,23 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) active_resilver = (ps->pss_state == DSS_SCANNING); } + have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); + scrub_start = ps->pss_start_time; + have_errorscrub = (ps->pss_error_scrub_func == + POOL_SCAN_ERRORSCRUB); + errorscrub_start = ps->pss_error_scrub_start; } boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0)); /* Always print the scrub status when available. */ - if (have_scrub) + if (have_scrub && scrub_start > errorscrub_start) print_scan_scrub_resilver_status(ps); + else if (have_errorscrub && errorscrub_start >= scrub_start) + print_err_scrub_status(ps); /* * When there is an active resilver or rebuild print its status. 
diff --git a/include/libzfs.h b/include/libzfs.h index 87d1ed738f2b..a7037e3e6266 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -125,11 +125,14 @@ typedef enum zfs_error { EZFS_THREADCREATEFAILED, /* thread create failed */ EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ EZFS_SCRUBBING, /* currently scrubbing */ + EZFS_ERRORSCRUBBING, /* currently error scrubbing */ + EZFS_ERRORSCRUB_PAUSED, /* error scrub currently paused */ EZFS_NO_SCRUB, /* no active scrub */ EZFS_DIFF, /* general failure of zfs diff */ EZFS_DIFFDATA, /* bad zfs diff data */ EZFS_POOLREADONLY, /* pool is in read-only mode */ EZFS_SCRUB_PAUSED, /* scrub currently paused */ + EZFS_SCRUB_PAUSED_TO_CANCEL, /* scrub currently paused */ EZFS_ACTIVE_POOL, /* pool is imported on a different system */ EZFS_CRYPTOFAILED, /* failed to setup encryption */ EZFS_NO_PENDING, /* cannot cancel, no operation is pending */ diff --git a/include/libzfs_core.h b/include/libzfs_core.h index 14a4857c35da..867c18b9c226 100644 --- a/include/libzfs_core.h +++ b/include/libzfs_core.h @@ -155,6 +155,8 @@ _LIBZFS_CORE_H int lzc_get_bootenv(const char *, nvlist_t **); _LIBZFS_CORE_H int lzc_get_vdev_prop(const char *, nvlist_t *, nvlist_t **); _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **); +_LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **); + #ifdef __cplusplus } #endif diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 5ee6704668a4..7e57d133c2ec 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -378,6 +378,7 @@ typedef struct dmu_buf { #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" +#define DMU_POOL_ERRORSCRUB "error_scrub" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 8925b5815a37..6753b4a8f359 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -29,6 +29,7 @@ #include #include +#include #include #include @@ -78,6 +79,21 @@ typedef enum dsl_scan_flags { #define DSL_SCAN_FLAGS_MASK (DSF_VISIT_DS_AGAIN) +typedef struct dsl_errorscrub_phys { + uint64_t dep_func; /* pool_scan_func_t */ + uint64_t dep_state; /* dsl_scan_state_t */ + uint64_t dep_cursor; /* serialized zap cursor for tracing progress */ + uint64_t dep_start_time; /* error scrub start time, unix timestamp */ + uint64_t dep_end_time; /* error scrub end time, unix timestamp */ + uint64_t dep_to_examine; /* total error blocks to be scrubbed */ + uint64_t dep_examined; /* blocks scrubbed so far */ + uint64_t dep_errors; /* error scrub I/O error count */ + uint64_t dep_paused_flags; /* flag for paused */ +} dsl_errorscrub_phys_t; + +#define ERRORSCRUB_PHYS_NUMINTS (sizeof (dsl_errorscrub_phys_t) \ + / sizeof (uint64_t)) + /* * Every pool will have one dsl_scan_t and this structure will contain * in-memory information about the scan and a pointer to the on-disk @@ -151,11 +167,15 @@ typedef struct dsl_scan { uint64_t scn_avg_zio_size_this_txg; uint64_t scn_zios_this_txg; + /* zap cursor for tracing error scrub progress */ + zap_cursor_t errorscrub_cursor; /* members needed for syncing scan status to disk */ dsl_scan_phys_t scn_phys; /* on disk representation of scan */ dsl_scan_phys_t scn_phys_cached; avl_tree_t scn_queue; /* queue of datasets to scan */ uint64_t scn_queues_pending; /* outstanding data to issue */ + /* members needed for syncing error scrub 
status to disk */ + dsl_errorscrub_phys_t errorscrub_phys; } dsl_scan_t; typedef struct dsl_scan_io_queue dsl_scan_io_queue_t; @@ -171,8 +191,12 @@ int dsl_scan_cancel(struct dsl_pool *); int dsl_scan(struct dsl_pool *, pool_scan_func_t); void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd); boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp); -int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd); +boolean_t dsl_errorscrubbing(const struct dsl_pool *dp); +boolean_t dsl_errorscrub_active(dsl_scan_t *scn); void dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg); +int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, + pool_scrub_cmd_t cmd); +void dsl_errorscrub_sync(struct dsl_pool *, dmu_tx_t *); boolean_t dsl_scan_resilvering(struct dsl_pool *dp); boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); @@ -184,6 +208,7 @@ void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, struct dmu_tx *tx); boolean_t dsl_scan_active(dsl_scan_t *scn); boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn); +boolean_t dsl_errorscrub_is_paused(const dsl_scan_t *scn); void dsl_scan_freed(spa_t *spa, const blkptr_t *bp); void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue); void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 4c2097fb830e..93193fa142da 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1036,6 +1036,7 @@ typedef enum pool_scan_func { POOL_SCAN_NONE, POOL_SCAN_SCRUB, POOL_SCAN_RESILVER, + POOL_SCAN_ERRORSCRUB, POOL_SCAN_FUNCS } pool_scan_func_t; @@ -1099,6 +1100,20 @@ typedef struct pool_scan_stat { uint64_t pss_pass_scrub_spent_paused; uint64_t pss_pass_issued; /* issued bytes per scan pass */ uint64_t pss_issued; /* total bytes checked by scanner */ + + /* error scrub values stored on disk */ + uint64_t pss_error_scrub_func; /* pool_scan_func_t */ + uint64_t pss_error_scrub_state; /* dsl_scan_state_t */ + uint64_t pss_error_scrub_start; /* error scrub start time */ + uint64_t pss_error_scrub_end; /* error scrub end time */ + uint64_t pss_error_scrub_examined; /* error blocks issued I/O */ + /* error blocks to be issued I/O */ + uint64_t pss_error_scrub_to_be_examined; + + /* error scrub values not stored on disk */ + /* error scrub pause time in milliseconds */ + uint64_t pss_pass_error_scrub_pause; + } pool_scan_stat_t; typedef struct pool_removal_stat { @@ -1120,6 +1135,7 @@ typedef enum dsl_scan_state { DSS_SCANNING, DSS_FINISHED, DSS_CANCELED, + DSS_ERRORSCRUBBING, DSS_NUM_STATES } dsl_scan_state_t; @@ -1360,7 +1376,7 @@ typedef enum { */ typedef enum zfs_ioc { /* - * Core features - 81/128 numbers reserved. + * Core features - 88/128 numbers reserved. */ #ifdef __FreeBSD__ ZFS_IOC_FIRST = 0, @@ -1455,6 +1471,7 @@ typedef enum zfs_ioc { ZFS_IOC_WAIT_FS, /* 0x5a54 */ ZFS_IOC_VDEV_GET_PROPS, /* 0x5a55 */ ZFS_IOC_VDEV_SET_PROPS, /* 0x5a56 */ + ZFS_IOC_POOL_SCRUB, /* 0x5a57 */ /* * Per-platform (Optional) - 8/128 numbers reserved. 
diff --git a/include/sys/spa.h b/include/sys/spa.h index 460ea2bfee4e..ed752967cca6 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1155,6 +1155,7 @@ extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_approx_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count); +extern uint64_t spa_get_last_errlog_size(spa_t *spa); extern void spa_errlog_rotate(spa_t *spa); extern void spa_errlog_drain(spa_t *spa); extern void spa_errlog_sync(spa_t *spa, uint64_t txg); @@ -1165,6 +1166,13 @@ extern void spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, extern void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx); extern void spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx); +extern int find_top_affected_fs(spa_t *spa, uint64_t head_ds, + zbookmark_err_phys_t *zep, uint64_t *top_affected_fs); +extern int find_birth_txg(struct dsl_dataset *ds, zbookmark_err_phys_t *zep, + uint64_t *birth_txg); +extern void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, + zbookmark_phys_t *zb); +extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep); /* vdev cache */ extern void vdev_cache_stat_init(void); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 5782c54bd78f..44afa763283a 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -295,6 +295,10 @@ struct spa { uint64_t spa_scan_pass_exam; /* examined bytes per pass */ uint64_t spa_scan_pass_issued; /* issued bytes per pass */ + /* error scrub pause time in milliseconds */ + uint64_t spa_scan_pass_errorscrub_pause; + /* total error scrub paused time in milliseconds */ + uint64_t spa_scan_pass_errorscrub_spent_paused; /* * We are in the middle of a resilver, and another resilver * is needed once this one completes. This is set iff any diff --git a/include/sys/sysevent/eventdefs.h b/include/sys/sysevent/eventdefs.h index eb1dfd16c0fd..a21085257967 100644 --- a/include/sys/sysevent/eventdefs.h +++ b/include/sys/sysevent/eventdefs.h @@ -123,6 +123,11 @@ extern "C" { #define ESC_ZFS_TRIM_CANCEL "trim_cancel" #define ESC_ZFS_TRIM_RESUME "trim_resume" #define ESC_ZFS_TRIM_SUSPEND "trim_suspend" +#define ESC_ZFS_ERRORSCRUB_START "errorscrub_start" +#define ESC_ZFS_ERRORSCRUB_FINISH "errorscrub_finish" +#define ESC_ZFS_ERRORSCRUB_ABORT "errorscrub_abort" +#define ESC_ZFS_ERRORSCRUB_RESUME "errorscrub_resume" +#define ESC_ZFS_ERRORSCRUB_PAUSED "errorscrub_paused" /* * datalink subclass definitions. 
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 57b096ca6e96..6e53bcb41a87 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -5717,7 +5717,8 @@ - + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index a71cb24736a9..d4af31c50cf8 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -2648,50 +2648,84 @@ zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds, int zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) { - zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; int err; libzfs_handle_t *hdl = zhp->zpool_hdl; - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - zc.zc_cookie = func; - zc.zc_flags = cmd; + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_uint64(args, "scan_type", (uint64_t)func); + fnvlist_add_uint64(args, "scan_command", (uint64_t)cmd); + + err = lzc_scrub(ZFS_IOC_POOL_SCRUB, zhp->zpool_name, args, NULL); + fnvlist_free(args); - if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0) + if (err == 0) { return (0); + } else if (err == ZFS_ERR_IOC_CMD_UNAVAIL) { + zfs_cmd_t zc = {"\0"}; + (void) strlcpy(zc.zc_name, zhp->zpool_name, + sizeof (zc.zc_name)); + zc.zc_cookie = func; + zc.zc_flags = cmd; - err = errno; + if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0) + return (0); + } - /* ECANCELED on a scrub means we resumed a paused scrub */ - if (err == ECANCELED && func == POOL_SCAN_SCRUB && - cmd == POOL_SCRUB_NORMAL) + /* + * An ECANCELED on a scrub means one of the following: + * 1. we resumed a paused scrub. + * 2. we resumed a paused error scrub. + * 3. Error scrub is not run because of no error log. + */ + if (err == ECANCELED && (func == POOL_SCAN_SCRUB || + func == POOL_SCAN_ERRORSCRUB) && cmd == POOL_SCRUB_NORMAL) return (0); - - if (err == ENOENT && func != POOL_SCAN_NONE && cmd == POOL_SCRUB_NORMAL) + /* + * The following cases have been handled here: + * 1. Paused a scrub/error scrub if there is none in progress. + */ + if (err == ENOENT && func != POOL_SCAN_NONE && cmd == + POOL_SCRUB_PAUSE) { return (0); + } + + ASSERT3U(func, >=, POOL_SCAN_NONE); + ASSERT3U(func, <, POOL_SCAN_FUNCS); - if (func == POOL_SCAN_SCRUB) { + if (func == POOL_SCAN_SCRUB || func == POOL_SCAN_ERRORSCRUB) { if (cmd == POOL_SCRUB_PAUSE) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot pause scrubbing %s"), - zc.zc_name); + zhp->zpool_name); } else { assert(cmd == POOL_SCRUB_NORMAL); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot scrub %s"), - zc.zc_name); + zhp->zpool_name); } } else if (func == POOL_SCAN_RESILVER) { assert(cmd == POOL_SCRUB_NORMAL); (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot restart resilver on %s"), zc.zc_name); + "cannot restart resilver on %s"), zhp->zpool_name); } else if (func == POOL_SCAN_NONE) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot cancel scrubbing %s"), zc.zc_name); + "cannot cancel scrubbing %s"), zhp->zpool_name); } else { assert(!"unexpected result"); } + /* + * With EBUSY, five cases are possible: + * + * Current state Requested + * 1. Normal Scrub Running Normal Scrub or Error Scrub + * 2. Normal Scrub Paused Error Scrub + * 3. Normal Scrub Paused Pause Normal Scrub + * 4. Error Scrub Running Normal Scrub or Error Scrub + * 5. Error Scrub Paused Pause Error Scrub + * 6. 
Resilvering Anything else + */ if (err == EBUSY) { nvlist_t *nvroot; pool_scan_stat_t *ps = NULL; @@ -2703,12 +2737,43 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); if (ps && ps->pss_func == POOL_SCAN_SCRUB && ps->pss_state == DSS_SCANNING) { - if (cmd == POOL_SCRUB_PAUSE) - return (zfs_error(hdl, EZFS_SCRUB_PAUSED, + if (ps->pss_pass_scrub_pause == 0) { + /* handles case 1 */ + assert(cmd == POOL_SCRUB_NORMAL); + return (zfs_error(hdl, EZFS_SCRUBBING, errbuf)); - else - return (zfs_error(hdl, EZFS_SCRUBBING, errbuf)); + } else { + if (func == POOL_SCAN_ERRORSCRUB) { + /* handles case 2 */ + ASSERT3U(cmd, ==, POOL_SCRUB_NORMAL); + return (zfs_error(hdl, + EZFS_SCRUB_PAUSED_TO_CANCEL, + errbuf)); + } else { + /* handles case 3 */ + ASSERT3U(func, ==, POOL_SCAN_SCRUB); + ASSERT3U(cmd, ==, POOL_SCRUB_PAUSE); + return (zfs_error(hdl, + EZFS_SCRUB_PAUSED, errbuf)); + } + } + } else if (ps && + ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB && + ps->pss_error_scrub_state == DSS_ERRORSCRUBBING) { + if (ps->pss_pass_error_scrub_pause == 0) { + /* handles case 4 */ + ASSERT3U(cmd, ==, POOL_SCRUB_NORMAL); + return (zfs_error(hdl, EZFS_ERRORSCRUBBING, + errbuf)); + } else { + /* handles case 5 */ + ASSERT3U(func, ==, POOL_SCAN_ERRORSCRUB); + ASSERT3U(cmd, ==, POOL_SCRUB_PAUSE); + return (zfs_error(hdl, EZFS_ERRORSCRUB_PAUSED, + errbuf)); + } } else { + /* handles case 6 */ return (zfs_error(hdl, EZFS_RESILVERING, errbuf)); } } else if (err == ENOENT) { diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index 4b8a20160e02..b94abea3d581 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -243,10 +243,20 @@ libzfs_error_description(libzfs_handle_t *hdl) "into a new one")); case EZFS_SCRUB_PAUSED: return (dgettext(TEXT_DOMAIN, "scrub is paused; " - "use 'zpool scrub' to resume")); + "use 'zpool scrub' to resume scrub")); + case EZFS_SCRUB_PAUSED_TO_CANCEL: + return (dgettext(TEXT_DOMAIN, "scrub is paused; " + "use 'zpool scrub' to resume or 'zpool scrub -s' to " + "cancel scrub")); case EZFS_SCRUBBING: return (dgettext(TEXT_DOMAIN, "currently scrubbing; " - "use 'zpool scrub -s' to cancel current scrub")); + "use 'zpool scrub -s' to cancel scrub")); + case EZFS_ERRORSCRUBBING: + return (dgettext(TEXT_DOMAIN, "currently error scrubbing; " + "use 'zpool scrub -s' to cancel error scrub")); + case EZFS_ERRORSCRUB_PAUSED: + return (dgettext(TEXT_DOMAIN, "error scrub is paused; " + "use 'zpool scrub -e' to resume error scrub")); case EZFS_NO_SCRUB: return (dgettext(TEXT_DOMAIN, "there is no active scrub")); case EZFS_DIFF: diff --git a/lib/libzfs_core/libzfs_core.abi b/lib/libzfs_core/libzfs_core.abi index 33d794e3f809..f2087186aa44 100644 --- a/lib/libzfs_core/libzfs_core.abi +++ b/lib/libzfs_core/libzfs_core.abi @@ -187,6 +187,7 @@ + @@ -1261,6 +1262,110 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index 254f14e04321..c63a16de5ab6 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -247,6 +247,13 @@ lzc_ioctl(zfs_ioc_t ioc, const char *name, return (error); } +int +lzc_scrub(zfs_ioc_t ioc, const char *name, + nvlist_t *source, nvlist_t **resultp) +{ + return (lzc_ioctl(ioc, name, source, resultp)); +} + int lzc_create(const 
char *fsname, enum lzc_dataset_type type, nvlist_t *props, uint8_t *wkeydata, uint_t wkeylen) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index d529147464fe..9ec940a94488 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1764,6 +1764,9 @@ Scrubs are processed by the sync thread. While scrubbing, it will spend at least this much time working on a scrub between TXG flushes. . +.It Sy zfs_scrub_error_blocks_per_txg Ns = Ns Sy 4096 Pq uint +Error blocks to be scrubbed in one txg. +. .It Sy zfs_scan_checkpoint_intval Ns = Ns Sy 7200 Ns s Po 2 hour Pc Pq uint To preserve progress across reboots, the sequential scan algorithm periodically needs to stop metadata scanning and issue all the verification I/O to disk. diff --git a/man/man8/zpool-scrub.8 b/man/man8/zpool-scrub.8 index 1fdbb8a5d56d..138226e4562c 100644 --- a/man/man8/zpool-scrub.8 +++ b/man/man8/zpool-scrub.8 @@ -38,6 +38,7 @@ .Cm scrub .Op Fl s Ns | Ns Fl p .Op Fl w +.Op Fl e .Ar pool Ns … . .Sh DESCRIPTION @@ -62,6 +63,13 @@ device whereas scrubbing examines all data to discover silent errors due to hardware faults or disk failure. .Pp +When scrubbing a pool with encrypted filesystems the keys do not need to be +loaded. +However, if the keys are not loaded and an unrepairable checksum error is +detected the file name cannot be included in the +.Nm zpool Cm status Fl v +verbose error report. +.Pp Because scrubbing and resilvering are I/O-intensive operations, ZFS only allows one at a time. .Pp @@ -92,9 +100,20 @@ Once resumed the scrub will pick up from the place where it was last checkpointed to disk. To resume a paused scrub issue .Nm zpool Cm scrub +or +.Nm zpool Cm scrub +.Fl e again. .It Fl w Wait until scrub has completed before returning. +.It Fl e +Only scrub files with known data errors as reported by +.Nm zpool Cm status Fl v . +The pool must have been scrubbed at least once with the +.Sy head_errlog +feature enabled to use this option. +Error scrubbing cannot be run simultaneously with regular scrubbing or +resilvering, nor can it be run when a regular scrub is paused. .El .Sh EXAMPLES .Ss Example 1 diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index d398b6705551..5e3559b251e3 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -54,6 +54,7 @@ #include #include #include +#include #ifdef _KERNEL #include #endif @@ -129,6 +130,7 @@ static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); static uint64_t dsl_scan_count_data_disks(spa_t *spa); +static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb); extern uint_t zfs_vdev_async_write_active_min_dirty_percent; static int zfs_scan_blkstats = 0; @@ -231,6 +233,9 @@ static int zfs_resilver_disable_defer = B_FALSE; */ static int zfs_free_bpobj_enabled = 1; +/* Error blocks to be scrubbed in one txg. 
*/ +unsigned long zfs_scrub_error_blocks_per_txg = 1 << 12; + /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, @@ -511,9 +516,17 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) "scrub_queue", sizeof (uint64_t), 1, &scn->scn_phys.scn_queue_obj); } else { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRORSCRUB, sizeof (uint64_t), + ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys); + + if (err != 0 && err != ENOENT) + return (err); + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys); + /* * Detect if the pool contains the signature of #2094. If it * does properly update the scn->scn_phys structure and notify @@ -663,6 +676,22 @@ dsl_scan_scrubbing(const dsl_pool_t *dp) scn_phys->scn_func == POOL_SCAN_SCRUB); } +boolean_t +dsl_errorscrubbing(const dsl_pool_t *dp) +{ + dsl_errorscrub_phys_t *errorscrub_phys = &dp->dp_scan->errorscrub_phys; + + return (errorscrub_phys->dep_state == DSS_ERRORSCRUBBING && + errorscrub_phys->dep_func == POOL_SCAN_ERRORSCRUB); +} + +boolean_t +dsl_errorscrub_is_paused(const dsl_scan_t *scn) +{ + return (dsl_errorscrubbing(scn->scn_dp) && + scn->errorscrub_phys.dep_paused_flags); +} + boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn) { @@ -670,6 +699,68 @@ dsl_scan_is_paused_scrub(const dsl_scan_t *scn) scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); } +static void +dsl_errorscrub_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) +{ + scn->errorscrub_phys.dep_cursor = + zap_cursor_serialize(&scn->errorscrub_cursor); + + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRORSCRUB, sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS, + &scn->errorscrub_phys, tx)); +} + +static void +dsl_errorscrub_setup_sync(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + pool_scan_func_t *funcp = arg; + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + ASSERT(!dsl_scan_is_running(scn)); + ASSERT(!dsl_errorscrubbing(scn->scn_dp)); + ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + + memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); + scn->errorscrub_phys.dep_func = *funcp; + scn->errorscrub_phys.dep_state = DSS_ERRORSCRUBBING; + scn->errorscrub_phys.dep_start_time = gethrestime_sec(); + scn->errorscrub_phys.dep_to_examine = spa_get_last_errlog_size(spa); + scn->errorscrub_phys.dep_examined = 0; + scn->errorscrub_phys.dep_errors = 0; + scn->errorscrub_phys.dep_cursor = 0; + zap_cursor_init_serialized(&scn->errorscrub_cursor, + spa->spa_meta_objset, spa->spa_errlog_last, + scn->errorscrub_phys.dep_cursor); + + vdev_config_dirty(spa->spa_root_vdev); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_START); + + dsl_errorscrub_sync_state(scn, tx); + + spa_history_log_internal(spa, "error scrub setup", tx, + "func=%u mintxg=%u maxtxg=%llu", + *funcp, 0, (u_longlong_t)tx->tx_txg); +} + +static int +dsl_errorscrub_setup_check(void *arg, dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + if (dsl_scan_is_running(scn) || (dsl_errorscrubbing(scn->scn_dp))) { + return (SET_ERROR(EBUSY)); + } + + if (spa_get_last_errlog_size(scn->scn_dp->dp_spa) == 0) { + return (ECANCELED); + } + return (0); +} + /* * Writes out a persistent dsl_scan_phys_t record to the pool directory. 
* Because we can be running in the block sorting algorithm, we do not always @@ -745,7 +836,8 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx) dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; - if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd)) + if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd) || + dsl_errorscrubbing(scn->scn_dp)) return (SET_ERROR(EBUSY)); return (0); @@ -754,6 +846,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx) void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { + (void) arg; dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; pool_scan_func_t *funcp = arg; dmu_object_type_t ot = 0; @@ -763,6 +856,14 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) ASSERT(!dsl_scan_is_running(scn)); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); memset(&scn->scn_phys, 0, sizeof (scn->scn_phys)); + + /* + * If we are starting a fresh scrub, we erase the error scrub + * information from disk. + */ + memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys)); + dsl_errorscrub_sync_state(scn, tx); + scn->scn_phys.scn_func = *funcp; scn->scn_phys.scn_state = DSS_SCANNING; scn->scn_phys.scn_min_txg = 0; @@ -856,8 +957,9 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) } /* - * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. - * Can also be called to resume a paused scrub. + * Called by ZFS_IOC_POOL_SCRUB and ZFS_IOC_POOL_SCAN ioctl to start a scrub, + * error scrub or resilver. Can also be called to resume a paused scrub or + * error scrub. */ int dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) @@ -883,6 +985,26 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) return (0); } + if (func == POOL_SCAN_ERRORSCRUB) { + if (dsl_errorscrub_is_paused(dp->dp_scan)) { + /* + * got error scrub start cmd, resume paused error scrub. + */ + int err = dsl_scrub_set_pause_resume(scn->scn_dp, + POOL_SCRUB_NORMAL); + if (err == 0) { + spa_event_notify(spa, NULL, NULL, + ESC_ZFS_ERRORSCRUB_RESUME); + return (ECANCELED); + } + return (SET_ERROR(err)); + } + + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_setup_check, dsl_errorscrub_setup_sync, + &func, 0, ZFS_SPACE_CHECK_RESERVED)); + } + if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { /* got scrub start cmd, resume paused scrub */ int err = dsl_scrub_set_pause_resume(scn->scn_dp, @@ -891,7 +1013,6 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); return (SET_ERROR(ECANCELED)); } - return (SET_ERROR(err)); } @@ -899,6 +1020,33 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); } +static void +dsl_errorscrub_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + if (complete) { + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_FINISH); + spa_history_log_internal(spa, "error scrub done", tx, + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); + } else { + spa_history_log_internal(spa, "error scrub canceled", tx, + "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa)); + } + + scn->errorscrub_phys.dep_state = complete ? 
DSS_FINISHED : DSS_CANCELED; + spa->spa_scrub_active = B_FALSE; + spa_errlog_rotate(spa); + scn->errorscrub_phys.dep_end_time = gethrestime_sec(); + zap_cursor_fini(&scn->errorscrub_cursor); + + if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) + spa->spa_errata = 0; + + ASSERT(!dsl_errorscrubbing(scn->scn_dp)); +} + static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { @@ -1045,6 +1193,92 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) ASSERT(!dsl_scan_is_running(scn)); } +static int +dsl_errorscrub_pause_resume_check(void *arg, dmu_tx_t *tx) +{ + pool_scrub_cmd_t *cmd = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_scan_t *scn = dp->dp_scan; + + if (*cmd == POOL_SCRUB_PAUSE) { + /* + * can't pause a error scrub when there is no in-progress + * error scrub. + */ + if (!dsl_errorscrubbing(dp)) + return (SET_ERROR(ENOENT)); + + /* can't pause a paused error scrub */ + if (dsl_errorscrub_is_paused(scn)) + return (SET_ERROR(EBUSY)); + } else if (*cmd != POOL_SCRUB_NORMAL) { + return (SET_ERROR(ENOTSUP)); + } + + return (0); +} + +static void +dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx) +{ + pool_scrub_cmd_t *cmd = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + if (*cmd == POOL_SCRUB_PAUSE) { + spa->spa_scan_pass_errorscrub_pause = gethrestime_sec(); + scn->errorscrub_phys.dep_paused_flags = B_TRUE; + dsl_errorscrub_sync_state(scn, tx); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED); + } else { + ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); + if (dsl_errorscrub_is_paused(scn)) { + /* + * We need to keep track of how much time we spend + * paused per pass so that we can adjust the error scrub + * rate shown in the output of 'zpool status'. 
+ */ + spa->spa_scan_pass_errorscrub_spent_paused += + gethrestime_sec() - + spa->spa_scan_pass_errorscrub_pause; + + spa->spa_scan_pass_errorscrub_pause = 0; + scn->errorscrub_phys.dep_paused_flags = B_FALSE; + + zap_cursor_init_serialized( + &scn->errorscrub_cursor, + spa->spa_meta_objset, spa->spa_errlog_last, + scn->errorscrub_phys.dep_cursor); + + dsl_errorscrub_sync_state(scn, tx); + } + } +} + +static int +dsl_errorscrub_cancel_check(void *arg, dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + /* can't cancel a error scrub when there is no one in-progress */ + if (!dsl_errorscrubbing(scn->scn_dp)) + return (SET_ERROR(ENOENT)); + return (0); +} + +static void +dsl_errorscrub_cancel_sync(void *arg, dmu_tx_t *tx) +{ + (void) arg; + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + dsl_errorscrub_done(scn, B_FALSE, tx); + dsl_errorscrub_sync_state(scn, tx); + spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, + ESC_ZFS_ERRORSCRUB_ABORT); +} + static int dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { @@ -1070,6 +1304,11 @@ dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) int dsl_scan_cancel(dsl_pool_t *dp) { + if (dsl_errorscrubbing(dp)) { + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_cancel_check, dsl_errorscrub_cancel_sync, + NULL, 3, ZFS_SPACE_CHECK_RESERVED)); + } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } @@ -1136,6 +1375,12 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) int dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) { + if (dsl_errorscrubbing(dp)) { + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_errorscrub_pause_resume_check, + dsl_errorscrub_pause_resume_sync, &cmd, 3, + ZFS_SPACE_CHECK_RESERVED)); + } return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, ZFS_SPACE_CHECK_RESERVED)); @@ -1422,6 +1667,42 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) return (B_FALSE); } +static boolean_t +dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) +{ + /* + * We suspend if: + * - we have scrubbed for at least the minimum time (default 1 sec + * for error scrub), someone is explicitly waiting for this txg + * to complete, or we have used up all of the time in the txg + * timeout (default 5 sec). + * or + * - the spa is shutting down because this pool is being exported + * or the machine is rebooting. 
+ */ + uint64_t curr_time_ns = gethrtime(); + uint64_t error_scrub_time_ns = curr_time_ns - scn->scn_sync_start_time; + uint64_t sync_time_ns = curr_time_ns - + scn->scn_dp->dp_spa->spa_sync_starttime; + int mintime = zfs_scrub_min_time_ms; + + if ((NSEC2MSEC(error_scrub_time_ns) > mintime && + (txg_sync_waiting(scn->scn_dp) || + NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || + spa_shutting_down(scn->scn_dp->dp_spa)) { + if (zb) { + dprintf("error scrub suspending at bookmark " + "%llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + } + return (B_TRUE); + } + return (B_FALSE); +} + typedef struct zil_scan_arg { dsl_pool_t *zsa_dp; zil_header_t *zsa_zh; @@ -3352,6 +3633,19 @@ dsl_scan_active(dsl_scan_t *scn) return ((used != 0) || (clones_left)); } +boolean_t +dsl_errorscrub_active(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + if (spa->spa_load_state != SPA_LOAD_NONE) + return (B_FALSE); + if (spa_shutting_down(spa)) + return (B_FALSE); + if (dsl_errorscrubbing(scn->scn_dp)) + return (B_TRUE); + return (B_FALSE); +} + static boolean_t dsl_scan_check_deferred(vdev_t *vd) { @@ -3568,6 +3862,387 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) return (0); } +static void +name_to_bookmark(char *buf, zbookmark_phys_t *zb) +{ + zb->zb_objset = zfs_strtonum(buf, &buf); + ASSERT(*buf == ':'); + zb->zb_object = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_level = (int)zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_blkid = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == '\0'); +} + +static void +name_to_object(char *buf, uint64_t *obj) +{ + *obj = zfs_strtonum(buf, &buf); + ASSERT(*buf == '\0'); +} + +static void +read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb) +{ + dsl_pool_t *dp = scn->scn_dp; + dsl_dataset_t *ds; + objset_t *os; + if (dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds) != 0) + return; + + if (dmu_objset_from_ds(ds, &os) != 0) { + dsl_dataset_rele(ds, FTAG); + return; + } + + /* + * If the key is not loaded dbuf_dnode_findbp() will error out with + * EACCES. However in that case dnode_hold() will eventually call + * dbuf_read()->zio_wait() which may call spa_log_error(). This will + * lead to a deadlock due to us holding the mutex spa_errlist_lock. + * Avoid this by checking here if the keys are loaded, if not return. + * If the keys are not loaded the head_errlog feature is meaningless + * as we cannot figure out the birth txg of the block pointer. + */ + if (dsl_dataset_get_keystatus(ds->ds_dir) == + ZFS_KEYSTATUS_UNAVAILABLE) { + dsl_dataset_rele(ds, FTAG); + return; + } + + dnode_t *dn; + blkptr_t bp; + + if (dnode_hold(os, zb.zb_object, FTAG, &dn) != 0) { + dsl_dataset_rele(ds, FTAG); + return; + } + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + int error = dbuf_dnode_findbp(dn, zb.zb_level, zb.zb_blkid, &bp, NULL, + NULL); + + if (error) { + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + return; + } + + if (!error && BP_IS_HOLE(&bp)) { + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); + return; + } + + int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | + ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB; + + /* If it's an intent log block, failure is expected. 
*/ + if (zb.zb_level == ZB_ZIL_LEVEL) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + ASSERT(!BP_IS_EMBEDDED(&bp)); + scan_exec_io(dp, &bp, zio_flags, &zb, NULL); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + dsl_dataset_rele(ds, FTAG); +} + +/* + * We keep track of the scrubbed error blocks in "count". This will be used + * when deciding whether we exceeded zfs_scrub_error_blocks_per_txg. This + * function is modelled after check_filesystem(). + */ +static int +scrub_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep, + int *count) +{ + dsl_dataset_t *ds; + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + int error = dsl_dataset_hold_obj(dp, fs, FTAG, &ds); + if (error != 0) + return (error); + + uint64_t latest_txg; + uint64_t txg_to_consider = spa->spa_syncing_txg; + boolean_t check_snapshot = B_TRUE; + + error = find_birth_txg(ds, zep, &latest_txg); + + /* + * If find_birth_txg() errors out, then err on the side of caution and + * proceed. In worst case scenario scrub all objects. If zep->zb_birth + * is 0 (e.g. in case of encryption with unloaded keys) also proceed to + * scrub all objects. + */ + if (error == 0 && zep->zb_birth == latest_txg) { + /* Block neither free nor re written. */ + zbookmark_phys_t zb; + zep_to_zb(fs, zep, &zb); + scn->scn_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + /* We have already acquired the config lock for spa */ + read_by_block_level(scn, zb); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined++; + scn->errorscrub_phys.dep_to_examine--; + (*count)++; + if ((*count) == zfs_scrub_error_blocks_per_txg || + dsl_error_scrub_check_suspend(scn, &zb)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EFAULT)); + } + + check_snapshot = B_FALSE; + } else if (error == 0) { + txg_to_consider = latest_txg; + } + + /* + * Retrieve the number of snapshots if the dataset is not a snapshot. + */ + uint64_t snap_count = 0; + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { + + error = zap_count(spa->spa_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + } + + if (snap_count == 0) { + /* Filesystem without snapshots. */ + dsl_dataset_rele(ds, FTAG); + return (0); + } + + uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + + dsl_dataset_rele(ds, FTAG); + + /* Check only snapshots created from this file system. */ + while (snap_obj != 0 && zep->zb_birth < snap_obj_txg && + snap_obj_txg <= txg_to_consider) { + + error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds); + if (error != 0) + return (error); + + if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) { + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsl_dataset_rele(ds, FTAG); + continue; + } + + boolean_t affected = B_TRUE; + if (check_snapshot) { + uint64_t blk_txg; + error = find_birth_txg(ds, zep, &blk_txg); + + /* + * Scrub the snapshot also when zb_birth == 0 or when + * find_birth_txg() returns an error. + */ + affected = (error == 0 && zep->zb_birth == blk_txg) || + (error != 0) || (zep->zb_birth == 0); + } + + /* Scrub snapshots. 
*/ + if (affected) { + zbookmark_phys_t zb; + zep_to_zb(snap_obj, zep, &zb); + scn->scn_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + /* We have already acquired the config lock for spa */ + read_by_block_level(scn, zb); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined++; + scn->errorscrub_phys.dep_to_examine--; + (*count)++; + if ((*count) == zfs_scrub_error_blocks_per_txg || + dsl_error_scrub_check_suspend(scn, &zb)) { + dsl_dataset_rele(ds, FTAG); + return (EFAULT); + } + } + snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsl_dataset_rele(ds, FTAG); + } + return (0); +} + +void +dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + /* + * Only process scans in sync pass 1. + */ + + if (spa_sync_pass(spa) > 1) + return; + + /* + * If the spa is shutting down, then stop scanning. This will + * ensure that the scan does not dirty any new data during the + * shutdown phase. + */ + if (spa_shutting_down(spa)) + return; + + if (!dsl_errorscrub_active(scn) || dsl_errorscrub_is_paused(scn)) { + return; + } + + if (dsl_scan_resilvering(scn->scn_dp)) { + /* cancel the error scrub if resilver started */ + dsl_scan_cancel(scn->scn_dp); + return; + } + + spa->spa_scrub_active = B_TRUE; + scn->scn_sync_start_time = gethrtime(); + + /* + * zfs_scan_suspend_progress can be set to disable scrub progress. + * See more detailed comment in dsl_scan_sync(). + */ + if (zfs_scan_suspend_progress) { + uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time; + int mintime = zfs_scrub_min_time_ms; + + while (zfs_scan_suspend_progress && + !txg_sync_waiting(scn->scn_dp) && + !spa_shutting_down(scn->scn_dp->dp_spa) && + NSEC2MSEC(scan_time_ns) < mintime) { + delay(hz); + scan_time_ns = gethrtime() - scn->scn_sync_start_time; + } + return; + } + + int i = 0; + zap_attribute_t *za; + zbookmark_phys_t *zb; + boolean_t limit_exceeded = B_FALSE; + + za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + zb = kmem_zalloc(sizeof (zbookmark_phys_t), KM_SLEEP); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) { + for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0; + zap_cursor_advance(&scn->errorscrub_cursor)) { + name_to_bookmark(za->za_name, zb); + + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + dsl_pool_config_enter(dp, FTAG); + read_by_block_level(scn, *zb); + dsl_pool_config_exit(dp, FTAG); + + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + scn->errorscrub_phys.dep_examined += 1; + scn->errorscrub_phys.dep_to_examine -= 1; + i++; + if (i == zfs_scrub_error_blocks_per_txg || + dsl_error_scrub_check_suspend(scn, zb)) { + limit_exceeded = B_TRUE; + break; + } + } + + if (!limit_exceeded) + dsl_errorscrub_done(scn, B_TRUE, tx); + + dsl_errorscrub_sync_state(scn, tx); + kmem_free(za, sizeof (*za)); + kmem_free(zb, sizeof (*zb)); + return; + } + + int error = 0; + for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0; + zap_cursor_advance(&scn->errorscrub_cursor)) { + + zap_cursor_t *head_ds_cursor; + zap_attribute_t *head_ds_attr; + zbookmark_err_phys_t head_ds_block; + + head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP); + head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP); + + uint64_t head_ds_err_obj = za->za_first_integer; + uint64_t head_ds; + name_to_object(za->za_name, &head_ds); + boolean_t config_held = 
B_FALSE; + uint64_t top_affected_fs; + + for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset, + head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor, + head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) { + + name_to_errphys(head_ds_attr->za_name, &head_ds_block); + + /* + * In case we are called from spa_sync the pool + * config is already held. + */ + if (!dsl_pool_config_held(dp)) { + dsl_pool_config_enter(dp, FTAG); + config_held = B_TRUE; + } + + error = find_top_affected_fs(spa, + head_ds, &head_ds_block, &top_affected_fs); + if (error) + break; + + error = scrub_filesystem(spa, top_affected_fs, + &head_ds_block, &i); + + if (error == SET_ERROR(EFAULT)) { + limit_exceeded = B_TRUE; + break; + } + } + + zap_cursor_fini(head_ds_cursor); + kmem_free(head_ds_cursor, sizeof (*head_ds_cursor)); + kmem_free(head_ds_attr, sizeof (*head_ds_attr)); + + if (config_held) + dsl_pool_config_exit(dp, FTAG); + } + + kmem_free(za, sizeof (*za)); + kmem_free(zb, sizeof (*zb)); + if (!limit_exceeded) + dsl_errorscrub_done(scn, B_TRUE, tx); + + dsl_errorscrub_sync_state(scn, tx); +} + /* * This is the primary entry point for scans that is called from syncing * context. Scans must happen entirely during syncing context so that we @@ -4109,7 +4784,14 @@ dsl_scan_scrub_done(zio_t *zio) if (zio->io_error && (zio->io_error != ECKSUM || !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { - atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors); + if (dsl_errorscrubbing(spa->spa_dsl_pool) && + !dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) { + atomic_inc_64(&spa->spa_dsl_pool->dp_scan + ->errorscrub_phys.dep_errors); + } else { + atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys + .scn_errors); + } } } @@ -4559,3 +5241,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW, "Process all resilvers immediately"); + +ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, U64, ZMOD_RW, + "Error blocks to be scrubbed in one txg"); +/* END CSTYLED */ diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 51d6de9105fb..1fc2c5e8c55d 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -8173,6 +8173,7 @@ spa_scan_stop(spa_t *spa) ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); + return (dsl_scan_cancel(spa->spa_dsl_pool)); } @@ -8198,6 +8199,10 @@ spa_scan(spa_t *spa, pool_scan_func_t func) return (0); } + if (func == POOL_SCAN_ERRORSCRUB && + !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) + return (SET_ERROR(ENOTSUP)); + return (dsl_scan(spa->spa_dsl_pool, func)); } @@ -9249,6 +9254,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) brt_sync(spa, txg); ddt_sync(spa, txg); dsl_scan_sync(dp, tx); + dsl_errorscrub_sync(dp, tx); svr_sync(spa, tx); spa_sync_upgrades(spa, tx); diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 5fe35278683a..2e5c22c11490 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -110,7 +110,7 @@ errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len) /* * Convert a string to a err_phys. 
*/ -static void +void name_to_errphys(char *buf, zbookmark_err_phys_t *zep) { zep->zb_object = zfs_strtonum(buf, &buf); @@ -139,8 +139,7 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) ASSERT(*buf == '\0'); } -#ifdef _KERNEL -static void +void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) { zb->zb_objset = dataset; @@ -148,7 +147,6 @@ zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb) zb->zb_level = zep->zb_level; zb->zb_blkid = zep->zb_blkid; } -#endif static void name_to_object(char *buf, uint64_t *obj) @@ -238,8 +236,7 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth) mutex_exit(&spa->spa_errlist_lock); } -#ifdef _KERNEL -static int +int find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, uint64_t *birth_txg) { @@ -267,6 +264,34 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep, return (error); } +/* + * This function finds the oldest affected filesystem containing an error + * block. + */ +int +find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, + uint64_t *top_affected_fs) +{ + uint64_t oldest_dsobj; + int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth, + &oldest_dsobj); + if (error != 0) + return (error); + + dsl_dataset_t *ds; + error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds); + if (error != 0) + return (error); + + *top_affected_fs = + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + return (0); +} + + +#ifdef _KERNEL /* * Copy the bookmark to the end of the user-space buffer which starts at * uaddr and has *count unused entries, and decrement *count by 1. @@ -288,7 +313,8 @@ copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count) * Each time the error block is referenced by a snapshot or clone, add a * zbookmark_phys_t entry to the userspace array at uaddr. The array is * filled from the back and the in-out parameter *count is modified to be the - * number of unused entries at the beginning of the array. + * number of unused entries at the beginning of the array. The function + * scrub_filesystem() is modelled after this one. 
*/ static int check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, @@ -449,28 +475,6 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, return (error); } -static int -find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, - uint64_t *top_affected_fs) -{ - uint64_t oldest_dsobj; - int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth, - &oldest_dsobj); - if (error != 0) - return (error); - - dsl_dataset_t *ds; - error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj, - DS_HOLD_FLAG_DECRYPT, FTAG, &ds); - if (error != 0) - return (error); - - *top_affected_fs = - dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj; - dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); - return (0); -} - static int process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, void *uaddr, uint64_t *count) @@ -536,6 +540,21 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep, } #endif +/* Return the number of errors in the error log */ +uint64_t +spa_get_last_errlog_size(spa_t *spa) +{ + uint64_t total = 0, count; + mutex_enter(&spa->spa_errlog_lock); + + if (spa->spa_errlog_last != 0 && + zap_count(spa->spa_meta_objset, spa->spa_errlog_last, + &count) == 0) + total += count; + mutex_exit(&spa->spa_errlog_lock); + return (total); +} + /* * If a healed bookmark matches an entry in the error log we stash it in a tree * so that we can later remove the related log entries in sync context. @@ -1447,6 +1466,7 @@ spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds, /* error handling */ EXPORT_SYMBOL(spa_log_error); EXPORT_SYMBOL(spa_approx_errlog_size); +EXPORT_SYMBOL(spa_get_last_errlog_size); EXPORT_SYMBOL(spa_get_errlog); EXPORT_SYMBOL(spa_errlog_rotate); EXPORT_SYMBOL(spa_errlog_drain); @@ -1456,6 +1476,10 @@ EXPORT_SYMBOL(spa_delete_dataset_errlog); EXPORT_SYMBOL(spa_swap_errlog); EXPORT_SYMBOL(sync_error_list); EXPORT_SYMBOL(spa_upgrade_errlog); +EXPORT_SYMBOL(find_top_affected_fs); +EXPORT_SYMBOL(find_birth_txg); +EXPORT_SYMBOL(zep_to_zb); +EXPORT_SYMBOL(name_to_errphys); #endif /* BEGIN CSTYLED */ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 54a0eeccf27b..89e1ce7165db 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2579,9 +2579,18 @@ spa_scan_stat_init(spa_t *spa) spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; else spa->spa_scan_pass_scrub_pause = 0; + + if (dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) + spa->spa_scan_pass_errorscrub_pause = spa->spa_scan_pass_start; + else + spa->spa_scan_pass_errorscrub_pause = 0; + spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; spa->spa_scan_pass_issued = 0; + + // error scrub stats + spa->spa_scan_pass_errorscrub_spent_paused = 0; } /* @@ -2592,8 +2601,10 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) { dsl_scan_t *scn = spa->spa_dsl_pool ? 
spa->spa_dsl_pool->dp_scan : NULL; - if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) + if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE && + scn->errorscrub_phys.dep_func == POOL_SCAN_NONE)) return (SET_ERROR(ENOENT)); + memset(ps, 0, sizeof (pool_scan_stat_t)); /* data stored on disk */ @@ -2616,6 +2627,18 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) ps->pss_issued = scn->scn_issued_before_pass + spa->spa_scan_pass_issued; + /* error scrub data stored on disk */ + ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func; + ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state; + ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time; + ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time; + ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined; + ps->pss_error_scrub_to_be_examined = + scn->errorscrub_phys.dep_to_examine; + + /* error scrub data not stored on disk */ + ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause; + return (0); } diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index efaf6f9b390a..f91a2f3bbca5 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -1685,6 +1685,47 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc) return (error); } +/* + * inputs: + * poolname name of the pool + * scan_type scan func (pool_scan_func_t) + * scan_command scrub pause/resume flag (pool_scrub_cmd_t) + */ +static const zfs_ioc_key_t zfs_keys_pool_scrub[] = { + {"scan_type", DATA_TYPE_UINT64, 0}, + {"scan_command", DATA_TYPE_UINT64, 0}, +}; + +static int +zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + spa_t *spa; + int error; + uint64_t scan_type, scan_cmd; + + if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0) + return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(innvl, "scan_command", &scan_cmd) != 0) + return (SET_ERROR(EINVAL)); + + if (scan_cmd >= POOL_SCRUB_FLAGS_END) + return (SET_ERROR(EINVAL)); + + if ((error = spa_open(poolname, &spa, FTAG)) != 0) + return (error); + + if (scan_cmd == POOL_SCRUB_PAUSE) { + error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); + } else if (scan_type == POOL_SCAN_NONE) { + error = spa_scan_stop(spa); + } else { + error = spa_scan(spa, scan_type); + } + + spa_close(spa, FTAG); + return (error); +} + static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { @@ -7218,6 +7259,11 @@ zfs_ioctl_init(void) POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_vdev_set_props, ARRAY_SIZE(zfs_keys_vdev_set_props)); + zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB, + zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_NONE, B_TRUE, B_TRUE, + zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub)); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 62d9cbeb6d90..9ed1a6d37a97 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -479,7 +479,9 @@ tags = ['functional', 'cli_root', 'zpool_resilver'] tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', - 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies'] + 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies', + 'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos', + 'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos'] tags = 
['functional', 'cli_root', 'zpool_scrub'] [tests/functional/cli_root/zpool_set] diff --git a/tests/zfs-tests/cmd/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check.c index a1dfaefd7105..c661718a296c 100644 --- a/tests/zfs-tests/cmd/libzfs_input_check.c +++ b/tests/zfs-tests/cmd/libzfs_input_check.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * Test the nvpair inputs for the non-legacy zfs ioctl commands. @@ -688,6 +689,17 @@ test_vdev_trim(const char *pool) nvlist_free(required); } +/* Test with invalid values */ +static void +test_scrub(const char *pool) +{ + nvlist_t *required = fnvlist_alloc(); + fnvlist_add_uint64(required, "scan_type", POOL_SCAN_FUNCS + 1); + fnvlist_add_uint64(required, "scan_command", POOL_SCRUB_FLAGS_END + 1); + IOC_INPUT_TEST(ZFS_IOC_POOL_SCRUB, pool, required, NULL, EINVAL); + nvlist_free(required); +} + static int zfs_destroy(const char *dataset) { @@ -868,6 +880,8 @@ zfs_ioc_input_tests(const char *pool) test_set_bootenv(pool); test_get_bootenv(pool); + test_scrub(pool); + /* * cleanup */ @@ -1022,6 +1036,7 @@ validate_ioc_values(void) CHECK(ZFS_IOC_BASE + 82 == ZFS_IOC_GET_BOOKMARK_PROPS); CHECK(ZFS_IOC_BASE + 83 == ZFS_IOC_WAIT); CHECK(ZFS_IOC_BASE + 84 == ZFS_IOC_WAIT_FS); + CHECK(ZFS_IOC_BASE + 87 == ZFS_IOC_POOL_SCRUB); CHECK(ZFS_IOC_PLATFORM_BASE + 1 == ZFS_IOC_EVENTS_NEXT); CHECK(ZFS_IOC_PLATFORM_BASE + 2 == ZFS_IOC_EVENTS_CLEAR); CHECK(ZFS_IOC_PLATFORM_BASE + 3 == ZFS_IOC_EVENTS_SEEK); diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 8521f271be54..133f8387ddaf 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -1969,6 +1969,12 @@ function is_pool_scrubbing #pool check_pool_status "$1" "scan" "scrub in progress since " $2 } +function is_pool_error_scrubbing #pool +{ + check_pool_status "$1" "scrub" "error scrub in progress since " $2 + return $? +} + function is_pool_scrubbed #pool { check_pool_status "$1" "scan" "scrub repaired" $2 @@ -1979,11 +1985,23 @@ function is_pool_scrub_stopped #pool check_pool_status "$1" "scan" "scrub canceled" $2 } +function is_pool_error_scrub_stopped #pool +{ + check_pool_status "$1" "scrub" "error scrub canceled on " $2 + return $? +} + function is_pool_scrub_paused #pool { check_pool_status "$1" "scan" "scrub paused since " $2 } +function is_pool_error_scrub_paused #pool +{ + check_pool_status "$1" "scrub" "error scrub paused since " $2 + return $? 
+} + function is_pool_removing #pool { check_pool_status "$1" "remove" "in progress since " diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 3e4120f52ca5..ad4aec543299 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1153,6 +1153,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \ + functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \ + functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \ + functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \ + functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh \ functional/cli_root/zpool_set/cleanup.ksh \ functional/cli_root/zpool_set/setup.ksh \ functional/cli_root/zpool/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh new file mode 100755 index 000000000000..e414cd1beaad --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh @@ -0,0 +1,79 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2019, Delphix. All rights reserved. +# Copyright (c) 2023, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg + +# +# DESCRIPTION: +# Verify scrub -e, -p, and -s show the right status. +# +# STRATEGY: +# 1. Create a pool and create a 10MB file in it. +# 2. Start a error scrub (-e) and verify it's doing a scrub. +# 3. Pause error scrub (-p) and verify it's paused. +# 4. Try to pause a paused error scrub (-p) and make sure that fails. +# 5. Resume the paused error scrub and verify again it's doing a scrub. +# 6. Verify zpool scrub -s succeed when the system is error scrubbing. +# + +verify_runnable "global" + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + log_must zinject -c all + rm -f /$TESTPOOL/10m_file +} + +log_onexit cleanup + +log_assert "Verify scrub -e, -p, and -s show the right status." 
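The -e, -p and -s invocations exercised below all reach the new ZFS_IOC_POOL_SCRUB ioctl shown earlier, which expects two uint64 entries, scan_type and scan_command, in its input nvlist. Here is a hedged userland sketch of building that input with the same fnvlist helpers libzfs_input_check.c uses; it assumes the ZFS development headers and libnvpair are available, POOL_SCRUB_NORMAL as the non-pause command is an assumption, and a real caller would go through libzfs rather than hand-rolling the ioctl:

#include <libnvpair.h>
#include <sys/fs/zfs.h>		/* pool_scan_func_t, pool_scrub_cmd_t */
#include <stdio.h>

/*
 * Hypothetical helper: build the input nvlist that ZFS_IOC_POOL_SCRUB
 * expects, mirroring zfs_keys_pool_scrub[] above.
 */
static nvlist_t *
make_error_scrub_args(void)
{
	nvlist_t *args = fnvlist_alloc();

	/* POOL_SCAN_ERRORSCRUB requires the head_errlog feature. */
	fnvlist_add_uint64(args, "scan_type", POOL_SCAN_ERRORSCRUB);
	fnvlist_add_uint64(args, "scan_command", POOL_SCRUB_NORMAL);
	return (args);
}

int
main(void)
{
	nvlist_t *args = make_error_scrub_args();

	nvlist_print(stdout, args);	/* libnvpair debug dump */
	nvlist_free(args);
	return (0);
}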
+ +log_must fio --rw=write --name=job --size=10M --filename=/$TESTPOOL/10m_file + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +log_must zinject -t data -e checksum -f 100 -am /$TESTPOOL/10m_file + +# create some error blocks +dd if=/$TESTPOOL/10m_file bs=1M count=1 || true + +# sync error blocks to disk +log_must sync_pool $TESTPOOL + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 +log_must zpool scrub -e $TESTPOOL +log_must is_pool_error_scrubbing $TESTPOOL true +log_must zpool scrub -p $TESTPOOL +log_must is_pool_error_scrub_paused $TESTPOOL true +log_mustnot zpool scrub -p $TESTPOOL +log_must is_pool_error_scrub_paused $TESTPOOL true +log_must zpool scrub -e $TESTPOOL +log_must is_pool_error_scrubbing $TESTPOOL true +log_must zpool scrub -s $TESTPOOL +log_must is_pool_error_scrub_stopped $TESTPOOL true + +log_pass "Verified scrub -e, -p, and -s show expected status." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh new file mode 100755 index 000000000000..daa11c3949c6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh @@ -0,0 +1,99 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2019, Delphix. All rights reserved. +# Copyright (c) 2023, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg + +# +# DESCRIPTION: +# Verify regular scrub and error scrub can't run at the same time. +# +# STRATEGY: +# 1. Create a pool and create a 10MB file in it. +# 2. Start a scrub and verify it's doing a scrub. +# 3. Start a error scrub (-e) and verify it fails. +# 4. Pause scrub (-p) and verify it's paused. +# 5. Start a error scrub (-e) verify it fails again. +# 6. Resume the paused scrub, verify it and cancel it. +# 7. Start a error scrub (-e) and verify it's doing error scrub. +# 8. Start a scrub and verify it fails. +# 9. Cancel error scrub (-e) and verify it is canceled. +# 10. Start scrub, verify it, cancel it and verify it. +# + +verify_runnable "global" + +function cleanup +{ + log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0 + log_must zinject -c all + rm -f /$TESTPOOL/10m_file +} + +log_onexit cleanup + +log_assert "Verify regular scrub and error scrub can't run at the same time." + +log_must fio --rw=write --name=job --size=10M --filename=/$TESTPOOL/10m_file + +log_must zpool export $TESTPOOL +log_must zpool import $TESTPOOL +log_must zinject -t data -e checksum -f 100 -am /$TESTPOOL/10m_file + +# create some error blocks before error scrub is requested. 
+dd if=/$TESTPOOL/10m_file bs=1M count=1 || true +# sync error blocks to disk +log_must sync_pool $TESTPOOL + +log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1 + +log_must zpool scrub $TESTPOOL +log_must is_pool_scrubbing $TESTPOOL true +log_mustnot zpool scrub -e $TESTPOOL +log_must zpool scrub -p $TESTPOOL +log_must is_pool_scrub_paused $TESTPOOL true +log_mustnot zpool scrub -e $TESTPOOL +log_must zpool scrub $TESTPOOL +log_must is_pool_scrubbing $TESTPOOL true +log_must zpool scrub -s $TESTPOOL +log_must is_pool_scrub_stopped $TESTPOOL true + +# create some error blocks before error scrub is requested. +dd if=/$TESTPOOL/10m_file bs=1M count=1 || true +# sync error blocks to disk +log_must sync_pool $TESTPOOL + +log_must zpool scrub -e $TESTPOOL +log_must is_pool_error_scrubbing $TESTPOOL true +log_mustnot zpool scrub $TESTPOOL +log_must zpool scrub -s $TESTPOOL +log_must is_pool_error_scrub_stopped $TESTPOOL true + +log_must zpool scrub $TESTPOOL +log_must is_pool_scrubbing $TESTPOOL true +log_must zpool scrub -s $TESTPOOL +log_must is_pool_scrub_stopped $TESTPOOL true + +log_pass "Verified regular scrub and error scrub can't run at the same time." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh new file mode 100755 index 000000000000..d0066fdbb4a3 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh @@ -0,0 +1,109 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2019, Delphix. All rights reserved. +# Copyright (c) 2023, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg + +# +# DESCRIPTION: +# Verify error scrub clears the errorlog, if errors no longer exist. +# +# STRATEGY: +# 1. Create a pool and create file in it. +# 2. Zinject errors and read using dd to log errors to disk. +# 3. Make sure file name is mentioned in the list of error files. +# 4. Start error scrub and wait for it finish. +# 5. Check scrub ran and errors are still reported. +# 6. Clear corruption and error scrub again. +# 7. Check scrub ran and errors are cleared. +# + +verify_runnable "global" + +function cleanup +{ + zinject -c all + rm -f /$TESTPOOL2/$TESTFILE0 + destroy_pool $TESTPOOL2 +} + +log_onexit cleanup + +log_assert "Verify error scrub clears the errorlog, if errors no longer exist." 
+ +truncate -s $MINVDEVSIZE $TESTDIR/vdev_a +log_must zpool create -f -O primarycache=none $TESTPOOL2 $TESTDIR/vdev_a +log_must zfs create $TESTPOOL2/$TESTFS1 +typeset file=/$TESTPOOL2/$TESTFS1/$TESTFILE0 +log_must dd if=/dev/urandom of=$file bs=2M count=10 + +lastfs="$(zfs list -r $TESTPOOL2 | tail -1 | awk '{print $1}')" +for i in {1..3}; do + log_must zfs snap $lastfs@snap$i + log_must zfs clone $lastfs@snap$i $TESTPOOL2/clone$i + lastfs="$(zfs list -r $TESTPOOL2/clone$i | tail -1 | awk '{print $1}')" +done + +log_must zinject -t data -e checksum -f 100 -a $file +dd if=$file of=/dev/null bs=2M count=10 + +# Important: sync error log to disk +log_must sync_pool $TESTPOOL2 + +# Check reported errors +log_must zpool status -v $TESTPOOL2 +log_must eval "zpool status -v $TESTPOOL2 | \ + grep \"Permanent errors have been detected\"" +log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1@snap1:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1@snap2:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone2/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'" + +# Check errors are reported if corruption persists +log_must zpool scrub -e -w $TESTPOOL2 +log_must eval "zpool status -v | grep 'error blocks'" +log_must zpool status -v $TESTPOOL2 +log_must eval "zpool status -v $TESTPOOL2 | \ + grep \"Permanent errors have been detected\"" +log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1@snap1:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone1@snap2:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone2/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'" +log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'" + +# Check errors are cleared +log_must zinject -c all +log_must zpool scrub -e -w $TESTPOOL2 +log_must zpool status -v $TESTPOOL2 +log_must eval "zpool status -v | grep 'error blocks'" +log_mustnot eval "zpool status -v | grep '$TESTFILE0'" + + +log_pass "Verify error scrub clears the errorlog, if errors no longer exist." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh new file mode 100755 index 000000000000..c88b9b0c8d33 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh @@ -0,0 +1,54 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (c) 2023, George Amanakis. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg + +# +# DESCRIPTION: +# Verify error scrub clears the errorlog, if errors no longer exist. +# +# STRATEGY: +# 1. Create a pool with head_errlog disabled. +# 2. Run an error scrub and verify it is not supported. +# + +verify_runnable "global" + +function cleanup +{ + rm -f /$TESTPOOL2/$TESTFILE0 + destroy_pool $TESTPOOL2 +} + +log_onexit cleanup + +log_assert "Verify error scrub cannot run without the head_errlog feature." + +truncate -s $MINVDEVSIZE $TESTDIR/vdev_a +log_must zpool create -f -o feature@head_errlog=disabled $TESTPOOL2 $TESTDIR/vdev_a +log_mustnot zpool scrub -ew $TESTPOOL2 + +log_pass "Verify error scrub cannot run without the head_errlog feature." + From 577e835f30c9b92ed8126eb4e8fb17cb0e411c04 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 19 May 2023 13:05:09 -0700 Subject: [PATCH 116/180] Probe vdevs before marking removed Before allowing the ZED to mark a vdev as REMOVED due to a hotplug event confirm that it is non-responsive with probe. Any device which can be successfully probed should be left ONLINE to prevent a healthy pool from being incorrectly SUSPENDED. This may occur for at least the following two scenarios. 1) Drive expansion (zpool online -e) in VMware environments. If, during the partition resize operation, a partition is removed and re-created then udev will send a removed event. 2) Re-scanning the namespaces of an NVMe device (nvme ns-rescan) may result in a udev remove and add event being delivered. Finally, update the ZED to only kick in a spare when the removal was successful. Reviewed-by: Ameer Hamza Reviewed-by: Tony Hutter Reviewed-by: Richard Yao Signed-off-by: Brian Behlendorf Issue #14859 Closes #14861 --- cmd/zed/agents/zfs_retire.c | 8 +++++--- module/zfs/vdev.c | 11 +++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c index 28714ec295bb..f83ae09259ab 100644 --- a/cmd/zed/agents/zfs_retire.c +++ b/cmd/zed/agents/zfs_retire.c @@ -445,14 +445,16 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, return; /* Remove the vdev since device is unplugged */ + int remove_status = 0; if (l2arc || (strcmp(class, "resource.fs.zfs.removed") == 0)) { - int status = zpool_vdev_remove_wanted(zhp, devname); + remove_status = zpool_vdev_remove_wanted(zhp, devname); fmd_hdl_debug(hdl, "zpool_vdev_remove_wanted '%s'" - ", ret:%d", devname, status); + ", err:%d", devname, libzfs_errno(zhdl)); } /* Replace the vdev with a spare if its not a l2arc */ - if (!l2arc && (!fmd_prop_get_int32(hdl, "spare_on_remove") || + if (!l2arc && !remove_status && + (!fmd_prop_get_int32(hdl, "spare_on_remove") || replace_with_spare(hdl, zhp, vdev) == B_FALSE)) { /* Could not handle with spare */ fmd_hdl_debug(hdl, "no spare for '%s'", devname); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 4bfd95861e02..c243dddb7e61 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4097,11 +4097,18 @@ vdev_remove_wanted(spa_t *spa, uint64_t guid) return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); /* - * If the vdev is already removed, then don't do anything. 
+ * If the vdev is already removed, or expanding which can trigger + * repartition add/remove events, then don't do anything. */ - if (vd->vdev_removed) + if (vd->vdev_removed || vd->vdev_expanding) return (spa_vdev_state_exit(spa, NULL, 0)); + /* + * Confirm the vdev has been removed, otherwise don't do anything. + */ + if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL))) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST))); + vd->vdev_remove_wanted = B_TRUE; spa_async_request(spa, SPA_ASYNC_REMOVE); From ad0a554614b096698d9969340c4c593690042d5b Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Fri, 19 May 2023 16:05:53 -0400 Subject: [PATCH 117/180] Hold db_mtx when updating db_state Commit 555ef90 did some general code refactoring for dmu_buf_will_not_fill() and dmu_buf_will_fill(). However, the db_mtx was not held when update db->db_state in those code block. The rest of the dbuf code always holds the db_mtx when updating db_state. This is important because cv_wait() db_changed is used to check for db_state changes. Updating dmu_buf_will_not_fill() and dmu_buf_will_fill() to hold the db_mtx when updating db_state. Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Brian Atkinson Closes #14875 --- module/zfs/dbuf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 049a62c1c171..272e712586fa 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2716,8 +2716,10 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + mutex_enter(&db->db_mtx); db->db_state = DB_NOFILL; DTRACE_SET_STATE(db, "allocating NOFILL buffer"); + mutex_exit(&db->db_mtx); dbuf_noread(db); (void) dbuf_dirty(db, tx); @@ -2736,6 +2738,7 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + mutex_enter(&db->db_mtx); if (db->db_state == DB_NOFILL) { /* * Block cloning: We will be completely overwriting a block @@ -2743,11 +2746,10 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) * pending clone and mark the block as uncached. This will be * as if the clone was never done. 
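The reason db_mtx has to cover these stores is that other dbuf paths cv_wait() on db_changed and then re-test db_state; a store made outside the lock can land between a waiter's check and its sleep, so the wakeup is missed. A minimal user-space pthread model of the invariant the patch restores follows (names echo dbuf.c loosely; this is not the kernel code):

#include <pthread.h>
#include <stdio.h>

/*
 * The state word is only changed with the mutex held, so a waiter
 * sleeping on the condition variable can never miss a transition
 * between its check and its cv_wait().
 */
static pthread_mutex_t db_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t db_changed = PTHREAD_COND_INITIALIZER;
static int db_state;			/* 0 = UNCACHED, 1 = CACHED */

static void *
filler(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&db_mtx);
	db_state = 1;			/* update only under the lock */
	pthread_cond_broadcast(&db_changed);
	pthread_mutex_unlock(&db_mtx);
	return (NULL);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, filler, NULL);

	pthread_mutex_lock(&db_mtx);
	while (db_state != 1)		/* recheck after every wakeup */
		pthread_cond_wait(&db_changed, &db_mtx);
	pthread_mutex_unlock(&db_mtx);

	pthread_join(t, NULL);
	printf("observed state change safely\n");
	return (0);
}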
*/ - mutex_enter(&db->db_mtx); VERIFY(!dbuf_undirty(db, tx)); - mutex_exit(&db->db_mtx); db->db_state = DB_UNCACHED; } + mutex_exit(&db->db_mtx); dbuf_noread(db); (void) dbuf_dirty(db, tx); From f8447cf22ec39b2ec3498f0205d4fde3d7efcb27 Mon Sep 17 00:00:00 2001 From: youzhongyang Date: Wed, 24 May 2023 15:23:42 -0400 Subject: [PATCH 118/180] Linux 6.4 compat: reclaimed_slab renamed to reclaimed Reviewed-by: Richard Yao Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Youzhong Yang Closes #14891 --- config/kernel-reclaim_state.m4 | 26 ++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/spl/spl-kmem-cache.c | 5 ++++- module/os/linux/zfs/arc_os.c | 4 ++++ 4 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 config/kernel-reclaim_state.m4 diff --git a/config/kernel-reclaim_state.m4 b/config/kernel-reclaim_state.m4 new file mode 100644 index 000000000000..9936b3c1001f --- /dev/null +++ b/config/kernel-reclaim_state.m4 @@ -0,0 +1,26 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_RECLAIMED], [ + dnl # + dnl # 6.4 API change + dnl # The reclaimed_slab of struct reclaim_state + dnl # is renamed to reclaimed + dnl # + ZFS_LINUX_TEST_SRC([reclaim_state_reclaimed], [ + #include + static const struct reclaim_state + rs __attribute__ ((unused)) = { + .reclaimed = 100, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_RECLAIMED], [ + AC_MSG_CHECKING([whether struct reclaim_state has reclaimed field]) + ZFS_LINUX_TEST_RESULT([reclaim_state_reclaimed], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_RECLAIM_STATE_RECLAIMED, 1, + [struct reclaim_state has reclaimed]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + diff --git a/config/kernel.m4 b/config/kernel.m4 index 439ffdf5a898..cb7e736c9a43 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -153,6 +153,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_IATTR_VFSID ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T + ZFS_AC_KERNEL_SRC_RECLAIMED case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -285,6 +286,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_IATTR_VFSID ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T + ZFS_AC_KERNEL_RECLAIMED case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index 963e7a1ec96a..745d03012f9d 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -182,8 +182,11 @@ kv_free(spl_kmem_cache_t *skc, void *ptr, int size) * of that infrastructure we are responsible for incrementing it. 
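The configure probe above defines HAVE_RECLAIM_STATE_RECLAIMED when struct reclaim_state has the renamed field, and the patch open-codes the resulting #ifdef at both call sites. A hypothetical alternative, not part of the change, would hide the ifdef behind one inline helper so the callers stay identical across kernel versions; the header locations here are assumptions:

#include <linux/sched.h>	/* current */
#include <linux/swap.h>		/* struct reclaim_state (assumed header) */

/*
 * Hypothetical kernel-side helper: the change above instead open-codes
 * this #ifdef in both spl-kmem-cache.c and arc_os.c.
 */
static inline void
spl_reclaim_account(unsigned long pages)
{
	if (current->reclaim_state == NULL)
		return;
#ifdef HAVE_RECLAIM_STATE_RECLAIMED
	current->reclaim_state->reclaimed += pages;		/* >= 6.4 */
#else
	current->reclaim_state->reclaimed_slab += pages;	/* < 6.4 */
#endif
}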
*/ if (current->reclaim_state) +#ifdef HAVE_RECLAIM_STATE_RECLAIMED + current->reclaim_state->reclaimed += size >> PAGE_SHIFT; +#else current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; - +#endif vfree(ptr); } diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index b7d6053529b4..29a8802b8367 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -219,7 +219,11 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) arc_reduce_target_size(ptob(sc->nr_to_scan)); arc_wait_for_eviction(ptob(sc->nr_to_scan), B_FALSE); if (current->reclaim_state != NULL) +#ifdef HAVE_RECLAIM_STATE_RECLAIMED + current->reclaim_state->reclaimed += sc->nr_to_scan; +#else current->reclaim_state->reclaimed_slab += sc->nr_to_scan; +#endif /* * We are experiencing memory pressure which the arc_evict_zthr was From 9d618615d1ede4dd40a69386bc300580550fd4d0 Mon Sep 17 00:00:00 2001 From: Akash B Date: Thu, 25 May 2023 00:58:09 +0530 Subject: [PATCH 119/180] Fix concurrent resilvers initiated at same time For draid vdevs it was possible to initiate both the sequential and healing resilver at same time. This fixes the following two scenarios. 1) There's a window where a sequential rebuild can be started via ZED even if a healing resilver has been scheduled. - This is fixed by adding additional check in spa_vdev_attach() for any scheduled resilver and return appropriate error code when a resilver is already in progress. 2) It was possible for zpool clear to start a healing resilver when it wasn't needed at all. This occurs because during a vdev_open() the device is presumed to be healthy not until the device is validated by vdev_validate() and it's set unavailable. However, by this point an async resilver will have already been requested if the DTL isn't empty. - This is fixed by cancelling the SPA_ASYNC_RESILVER request immediately at the end of vdev_reopen() when a resilver is unneeded. Finally, added a testcase in ZTS for verification. Reviewed-by: Brian Behlendorf Reviewed-by: Dipak Ghosh Signed-off-by: Akash B Closes #14881 Closes #14892 --- module/zfs/spa.c | 5 +- module/zfs/vdev.c | 13 ++- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../zpool_resilver_concurrent.ksh | 101 ++++++++++++++++++ 5 files changed, 120 insertions(+), 3 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 1fc2c5e8c55d..27bbb8f09259 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -33,6 +33,7 @@ * Copyright 2017 Joyent, Inc. * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2021, Colm Buckley + * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. */ /* @@ -6874,9 +6875,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); - if (dsl_scan_resilvering(spa_get_dsl(spa))) + if (dsl_scan_resilvering(spa_get_dsl(spa)) || + dsl_scan_resilver_scheduled(spa_get_dsl(spa))) { return (spa_vdev_exit(spa, NULL, txg, ZFS_ERR_RESILVER_IN_PROGRESS)); + } } else { if (vdev_rebuild_active(rvd)) return (spa_vdev_exit(spa, NULL, txg, diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index c243dddb7e61..58dcd9f79799 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -29,7 +29,7 @@ * Copyright (c) 2017, Intel Corporation. * Copyright (c) 2019, Datto Inc. All rights reserved. 
* Copyright (c) 2021, Klara Inc. - * Copyright [2021] Hewlett Packard Enterprise Development LP + * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP. */ #include @@ -2699,6 +2699,17 @@ vdev_reopen(vdev_t *vd) (void) vdev_validate(vd); } + /* + * Recheck if resilver is still needed and cancel any + * scheduled resilver if resilver is unneeded. + */ + if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) && + spa->spa_async_tasks & SPA_ASYNC_RESILVER) { + mutex_enter(&spa->spa_async_lock); + spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER; + mutex_exit(&spa->spa_async_lock); + } + /* * Reassess parent vdev's health. */ diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 9ed1a6d37a97..10525289a3bd 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -472,7 +472,8 @@ tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift'] tags = ['functional', 'cli_root', 'zpool_replace'] [tests/functional/cli_root/zpool_resilver] -tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart'] +tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart', + 'zpool_resilver_concurrent'] tags = ['functional', 'cli_root', 'zpool_resilver'] [tests/functional/cli_root/zpool_scrub] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index ad4aec543299..129893cd61f3 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1142,6 +1142,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_resilver/setup.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh \ functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh \ + functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh \ functional/cli_root/zpool_scrub/cleanup.ksh \ functional/cli_root/zpool_scrub/setup.ksh \ functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh new file mode 100755 index 000000000000..4c3b09796869 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_concurrent.ksh @@ -0,0 +1,101 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 Hewlett Packard Enterprise Development LP. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/redundancy/redundancy.kshlib + +# +# DESCRIPTION: +# Verify 'zpool clear' doesn't cause concurrent resilvers +# +# STRATEGY: +# 1. Create N(10) virtual disk files. +# 2. Create draid pool based on the virtual disk files. +# 3. Fill the filesystem with directories and files. +# 4. 
Force-fault 2 vdevs and verify distributed spare is kicked in. +# 5. Free the distributed spare by replacing the faulty drive. +# 6. Run zpool clear and verify that it does not initiate 2 resilvers +# concurrently while distributed spare gets kicked in. +# + +verify_runnable "global" + +typeset -ir devs=10 +typeset -ir nparity=1 +typeset -ir ndata=8 +typeset -ir dspare=1 + +function cleanup +{ + poolexists "$TESTPOOL" && destroy_pool "$TESTPOOL" + + for i in {0..$devs}; do + log_must rm -f "$BASEDIR/vdev$i" + done + + for dir in $BASEDIR; do + if [[ -d $dir ]]; then + log_must rm -rf $dir + fi + done + + zed_stop + zed_cleanup +} + +log_assert "Verify zpool clear on draid pool doesn't cause concurrent resilvers" +log_onexit cleanup + +setup_test_env $TESTPOOL draid${nparity}:${ndata}d:${dspare}s $devs + +# ZED needed for sequential resilver +zed_setup +log_must zed_start + +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev5 +log_must wait_vdev_state $TESTPOOL draid1-0-0 "ONLINE" 60 +log_must zpool wait -t resilver $TESTPOOL +log_must zpool offline -f $TESTPOOL $BASEDIR/vdev6 + +log_must zpool labelclear -f $BASEDIR/vdev5 +log_must zpool labelclear -f $BASEDIR/vdev6 + +log_must zpool replace -w $TESTPOOL $BASEDIR/vdev5 +sync_pool $TESTPOOL + +log_must zpool events -c +log_must zpool clear $TESTPOOL +log_must wait_vdev_state $TESTPOOL draid1-0-0 "ONLINE" 60 +log_must zpool wait -t resilver $TESTPOOL +log_must zpool wait -t scrub $TESTPOOL + +nof_resilver=$(zpool events | grep -c resilver_start) +if [ $nof_resilver = 1 ] ; then + log_must verify_pool $TESTPOOL + log_pass "zpool clear on draid pool doesn't cause concurrent resilvers" +else + log_fail "FAIL: sequential and healing resilver initiated concurrently" +fi From 79b20949b25c8db4d379f6486b0835a6613b480c Mon Sep 17 00:00:00 2001 From: Dimitri John Ledkov <19779+xnox@users.noreply.github.com> Date: Wed, 24 May 2023 20:31:28 +0100 Subject: [PATCH 120/180] systemd: Use non-absolute paths in Exec* lines Since systemd v239, Exec* binaries are resolved from PATH when they are not-absolute. Switch to this by default for ease of downstream maintenance. Many downstream distributions move individual binaries to locations that existing compile-time configurations cannot accommodate. 
Reviewed-by: Brian Behlendorf Signed-off-by: Dimitri John Ledkov Closes #14880 --- etc/systemd/system/zfs-import-cache.service.in | 2 +- etc/systemd/system/zfs-import-scan.service.in | 2 +- etc/systemd/system/zfs-mount.service.in | 2 +- etc/systemd/system/zfs-scrub@.service.in | 10 +++++----- etc/systemd/system/zfs-share.service.in | 2 +- etc/systemd/system/zfs-trim@.service.in | 10 +++++----- etc/systemd/system/zfs-volume-wait.service.in | 2 +- etc/systemd/system/zfs-zed.service.in | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/etc/systemd/system/zfs-import-cache.service.in b/etc/systemd/system/zfs-import-cache.service.in index fd822989da93..6d9a065e7e3a 100644 --- a/etc/systemd/system/zfs-import-cache.service.in +++ b/etc/systemd/system/zfs-import-cache.service.in @@ -15,7 +15,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=@sbindir@/zpool import -c @sysconfdir@/zfs/zpool.cache -aN $ZPOOL_IMPORT_OPTS +ExecStart=zpool import -c @sysconfdir@/zfs/zpool.cache -aN $ZPOOL_IMPORT_OPTS [Install] WantedBy=zfs-import.target diff --git a/etc/systemd/system/zfs-import-scan.service.in b/etc/systemd/system/zfs-import-scan.service.in index c5dd45d87e68..fb524f3b0889 100644 --- a/etc/systemd/system/zfs-import-scan.service.in +++ b/etc/systemd/system/zfs-import-scan.service.in @@ -14,7 +14,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=@sbindir@/zpool import -aN -o cachefile=none $ZPOOL_IMPORT_OPTS +ExecStart=zpool import -aN -o cachefile=none $ZPOOL_IMPORT_OPTS [Install] WantedBy=zfs-import.target diff --git a/etc/systemd/system/zfs-mount.service.in b/etc/systemd/system/zfs-mount.service.in index 66d894923f4a..fc4e1c49f1c5 100644 --- a/etc/systemd/system/zfs-mount.service.in +++ b/etc/systemd/system/zfs-mount.service.in @@ -12,7 +12,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=@sbindir@/zfs mount -a +ExecStart=zfs mount -a [Install] WantedBy=zfs.target diff --git a/etc/systemd/system/zfs-scrub@.service.in b/etc/systemd/system/zfs-scrub@.service.in index 8ffffeb0cf6c..2bb2757d5e97 100644 --- a/etc/systemd/system/zfs-scrub@.service.in +++ b/etc/systemd/system/zfs-scrub@.service.in @@ -8,8 +8,8 @@ ConditionPathIsDirectory=/sys/module/zfs [Service] EnvironmentFile=-@initconfdir@/zfs -ExecStart=/bin/sh -c '\ -if @sbindir@/zpool status %i | grep -q "scrub in progress"; then\ -exec @sbindir@/zpool wait -t scrub %i;\ -else exec @sbindir@/zpool scrub -w %i; fi' -ExecStop=-/bin/sh -c '@sbindir@/zpool scrub -p %i 2>/dev/null || true' +ExecStart=sh -c '\ +if zpool status %i | grep -q "scrub in progress"; then\ +exec zpool wait -t scrub %i;\ +else exec zpool scrub -w %i; fi' +ExecStop=-sh -c 'zpool scrub -p %i 2>/dev/null || true' diff --git a/etc/systemd/system/zfs-share.service.in b/etc/systemd/system/zfs-share.service.in index 1a6342a06fec..dd321f490fe6 100644 --- a/etc/systemd/system/zfs-share.service.in +++ b/etc/systemd/system/zfs-share.service.in @@ -14,7 +14,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=@sbindir@/zfs share -a +ExecStart=zfs share -a [Install] WantedBy=zfs.target diff --git a/etc/systemd/system/zfs-trim@.service.in b/etc/systemd/system/zfs-trim@.service.in index 423fb448c16f..f55e36cd8454 100644 --- a/etc/systemd/system/zfs-trim@.service.in +++ 
b/etc/systemd/system/zfs-trim@.service.in @@ -8,8 +8,8 @@ ConditionPathIsDirectory=/sys/module/zfs [Service] EnvironmentFile=-@initconfdir@/zfs -ExecStart=/bin/sh -c '\ -if @sbindir@/zpool status %i | grep -q "(trimming)"; then\ -exec @sbindir@/zpool wait -t trim %i;\ -else exec @sbindir@/zpool trim -w %i; fi' -ExecStop=-/bin/sh -c '@sbindir@/zpool trim -s %i 2>/dev/null || true' +ExecStart=sh -c '\ +if zpool status %i | grep -q "(trimming)"; then\ +exec zpool wait -t trim %i;\ +else exec zpool trim -w %i; fi' +ExecStop=-sh -c 'zpool trim -s %i 2>/dev/null || true' diff --git a/etc/systemd/system/zfs-volume-wait.service.in b/etc/systemd/system/zfs-volume-wait.service.in index 110c0f5f52ee..a86a3561e032 100644 --- a/etc/systemd/system/zfs-volume-wait.service.in +++ b/etc/systemd/system/zfs-volume-wait.service.in @@ -9,7 +9,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=@bindir@/zvol_wait +ExecStart=zvol_wait [Install] WantedBy=zfs-volumes.target diff --git a/etc/systemd/system/zfs-zed.service.in b/etc/systemd/system/zfs-zed.service.in index be2fc67348f9..ac58ad3eff7b 100644 --- a/etc/systemd/system/zfs-zed.service.in +++ b/etc/systemd/system/zfs-zed.service.in @@ -5,7 +5,7 @@ ConditionPathIsDirectory=/sys/module/zfs [Service] EnvironmentFile=-@initconfdir@/zfs -ExecStart=@sbindir@/zed -F +ExecStart=zed -F Restart=always [Install] From f63811f07213361d49878648bd597af88d06859c Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 25 May 2023 12:48:43 -0400 Subject: [PATCH 121/180] ZIL: Reduce scope of per-dataset zl_issuer_lock. Before this change ZIL copied all log data while holding the lock. It caused huge lock contention on workloads with many big parallel writes. This change splits the process into two parts: first, zil_lwb_assign() estimates the log space needed for all transactions, and zil_lwb_write_close() allocates blocks and zios while holding the lock, then, after the lock in dropped, zil_lwb_commit() copies the data, and zil_lwb_write_issue() issues the I/Os. Also while there slightly reduce scope of zl_lock. Reviewed-by: Paul Dagnelie Reviewed-by: Prakash Surya Reviewed-by: Richard Yao Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14841 --- include/sys/zil_impl.h | 7 +- module/zfs/zil.c | 428 +++++++++++++++++++++++++++-------------- 2 files changed, 285 insertions(+), 150 deletions(-) diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h index bb85bf6d1eb1..03a409c5257c 100644 --- a/include/sys/zil_impl.h +++ b/include/sys/zil_impl.h @@ -44,7 +44,7 @@ extern "C" { * must be held. * * After the lwb is "opened", it can transition into the "issued" state - * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must + * via zil_lwb_write_close(). Again, the zilog's "zl_issuer_lock" must * be held when making this transition. * * After the lwb's write zio completes, it transitions into the "write @@ -93,20 +93,23 @@ typedef struct lwb { blkptr_t lwb_blk; /* on disk address of this log blk */ boolean_t lwb_fastwrite; /* is blk marked for fastwrite? 
*/ boolean_t lwb_slog; /* lwb_blk is on SLOG device */ + boolean_t lwb_indirect; /* do not postpone zil_lwb_commit() */ int lwb_nused; /* # used bytes in buffer */ + int lwb_nfilled; /* # filled bytes in buffer */ int lwb_sz; /* size of block and buffer */ lwb_state_t lwb_state; /* the state of this lwb */ char *lwb_buf; /* log write buffer */ zio_t *lwb_write_zio; /* zio for the lwb buffer */ zio_t *lwb_root_zio; /* root zio for lwb write and flushes */ + hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */ uint64_t lwb_issued_txg; /* the txg when the write is issued */ uint64_t lwb_max_txg; /* highest txg in this lwb */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ + list_node_t lwb_issue_node; /* linkage of lwbs ready for issue */ list_t lwb_itxs; /* list of itx's */ list_t lwb_waiters; /* list of zil_commit_waiter's */ avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */ kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */ - hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */ } lwb_t; /* diff --git a/module/zfs/zil.c b/module/zfs/zil.c index d887e4900d1d..f2798270a8a2 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -146,6 +146,9 @@ static uint64_t zil_slog_bulk = 768 * 1024; static kmem_cache_t *zil_lwb_cache; static kmem_cache_t *zil_zcw_cache; +static void zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx); +static itx_t *zil_itx_clone(itx_t *oitx); + static int zil_bp_compare(const void *x1, const void *x2) { @@ -747,20 +750,21 @@ zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg, lwb->lwb_blk = *bp; lwb->lwb_fastwrite = fastwrite; lwb->lwb_slog = slog; + lwb->lwb_indirect = B_FALSE; + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { + lwb->lwb_nused = lwb->lwb_nfilled = sizeof (zil_chain_t); + lwb->lwb_sz = BP_GET_LSIZE(bp); + } else { + lwb->lwb_nused = lwb->lwb_nfilled = 0; + lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); + } lwb->lwb_state = LWB_STATE_CLOSED; lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); - lwb->lwb_max_txg = txg; lwb->lwb_write_zio = NULL; lwb->lwb_root_zio = NULL; lwb->lwb_issued_timestamp = 0; lwb->lwb_issued_txg = 0; - if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - lwb->lwb_nused = sizeof (zil_chain_t); - lwb->lwb_sz = BP_GET_LSIZE(bp); - } else { - lwb->lwb_nused = 0; - lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); - } + lwb->lwb_max_txg = txg; mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); @@ -1397,6 +1401,8 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } + mutex_exit(&zilog->zl_lock); + while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) zil_itx_destroy(itx); @@ -1429,8 +1435,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio) mutex_exit(&zcw->zcw_lock); } - mutex_exit(&zilog->zl_lock); - mutex_enter(&zilog->zl_lwb_io_lock); txg = lwb->lwb_issued_txg; ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); @@ -1666,46 +1670,41 @@ zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); + if (lwb->lwb_root_zio != NULL) + return; + + lwb->lwb_root_zio = zio_root(zilog->zl_spa, + zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); + + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, + BP_GET_LSIZE(&lwb->lwb_blk)); + + if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + prio = ZIO_PRIORITY_SYNC_WRITE; + else + prio = ZIO_PRIORITY_ASYNC_WRITE; + SET_BOOKMARK(&zb, 
lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ mutex_enter(&zilog->zl_lock); - if (lwb->lwb_root_zio == NULL) { - abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, - BP_GET_LSIZE(&lwb->lwb_blk)); - - if (!lwb->lwb_fastwrite) { - metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); - lwb->lwb_fastwrite = 1; - } - - if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) - prio = ZIO_PRIORITY_SYNC_WRITE; - else - prio = ZIO_PRIORITY_ASYNC_WRITE; - - lwb->lwb_root_zio = zio_root(zilog->zl_spa, - zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); - ASSERT3P(lwb->lwb_root_zio, !=, NULL); + if (!lwb->lwb_fastwrite) { + metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); + lwb->lwb_fastwrite = 1; + } - lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, - zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, - BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, - prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); + lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, zilog->zl_spa, 0, + &lwb->lwb_blk, lwb_abd, BP_GET_LSIZE(&lwb->lwb_blk), + zil_lwb_write_done, lwb, prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_FASTWRITE, &zb); - lwb->lwb_state = LWB_STATE_OPENED; + lwb->lwb_state = LWB_STATE_OPENED; - zil_lwb_set_zio_dependency(zilog, lwb); - zilog->zl_last_lwb_opened = lwb; - } + zil_lwb_set_zio_dependency(zilog, lwb); + zilog->zl_last_lwb_opened = lwb; mutex_exit(&zilog->zl_lock); - - ASSERT3P(lwb->lwb_root_zio, !=, NULL); - ASSERT3P(lwb->lwb_write_zio, !=, NULL); - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); } /* @@ -1736,11 +1735,11 @@ static const struct { static uint_t zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; /* - * Start a log block write and advance to the next log block. - * Calls are serialized. + * Close the log block for being issued and allocate the next one. + * Has to be called under zl_issuer_lock to chain more lwbs. */ static lwb_t * -zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) +zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb) { lwb_t *nlwb = NULL; zil_chain_t *zilc; @@ -1748,7 +1747,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) blkptr_t *bp; dmu_tx_t *tx; uint64_t txg; - uint64_t zil_blksz, wsz; + uint64_t zil_blksz; int i, error; boolean_t slog; @@ -1757,16 +1756,17 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) ASSERT3P(lwb->lwb_write_zio, !=, NULL); ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); - if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { - zilc = (zil_chain_t *)lwb->lwb_buf; - bp = &zilc->zc_next_blk; - } else { - zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); - bp = &zilc->zc_next_blk; + /* + * If this lwb includes indirect writes, we have to commit before + * creating the transaction, otherwise we may end up in dead lock. + */ + if (lwb->lwb_indirect) { + for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; + itx = list_next(&lwb->lwb_itxs, itx)) + zil_lwb_commit(zilog, lwb, itx); + lwb->lwb_nused = lwb->lwb_nfilled; } - ASSERT(lwb->lwb_nused <= lwb->lwb_sz); - /* * Allocate the next block and save its address in this block * before writing it in order to establish the log chain. 
@@ -1816,17 +1816,13 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) + zilc = (zil_chain_t *)lwb->lwb_buf; + else + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); + bp = &zilc->zc_next_blk; BP_ZERO(bp); error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog); - if (slog) { - ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); - ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, - lwb->lwb_nused); - } else { - ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); - ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, - lwb->lwb_nused); - } if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); bp->blk_cksum = lwb->lwb_blk.blk_cksum; @@ -1838,17 +1834,47 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE); } + lwb->lwb_state = LWB_STATE_ISSUED; + + dmu_tx_commit(tx); + + /* + * If there was an allocation failure then nlwb will be null which + * forces a txg_wait_synced(). + */ + return (nlwb); +} + +/* + * Finalize previously closed block and issue the write zio. + * Does not require locking. + */ +static void +zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) +{ + zil_chain_t *zilc; + int wsz; + + /* Actually fill the lwb with the data if not yet. */ + if (!lwb->lwb_indirect) { + for (itx_t *itx = list_head(&lwb->lwb_itxs); itx; + itx = list_next(&lwb->lwb_itxs, itx)) + zil_lwb_commit(zilog, lwb, itx); + lwb->lwb_nused = lwb->lwb_nfilled; + } + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { /* For Slim ZIL only write what is used. */ - wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); - ASSERT3U(wsz, <=, lwb->lwb_sz); + wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, int); + ASSERT3S(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_write_zio, wsz); wsz = lwb->lwb_write_zio->io_size; + zilc = (zil_chain_t *)lwb->lwb_buf; } else { wsz = lwb->lwb_sz; + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); } - zilc->zc_pad = 0; zilc->zc_nused = lwb->lwb_nused; zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; @@ -1858,22 +1884,20 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) */ memset(lwb->lwb_buf + lwb->lwb_nused, 0, wsz - lwb->lwb_nused); + if (lwb->lwb_slog) { + ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, + lwb->lwb_nused); + } else { + ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, + lwb->lwb_nused); + } spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); - zil_lwb_add_block(lwb, &lwb->lwb_blk); lwb->lwb_issued_timestamp = gethrtime(); - lwb->lwb_state = LWB_STATE_ISSUED; - zio_nowait(lwb->lwb_root_zio); zio_nowait(lwb->lwb_write_zio); - - dmu_tx_commit(tx); - - /* - * If there was an allocation failure then nlwb will be null which - * forces a txg_wait_synced(). - */ - return (nlwb); } /* @@ -1909,13 +1933,19 @@ zil_max_copied_data(zilog_t *zilog) sizeof (lr_write_t)); } +/* + * Estimate space needed in the lwb for the itx. Allocate more lwbs or + * split the itx as needed, but don't touch the actual transaction data. + * Has to be called under zl_issuer_lock to call zil_lwb_write_close() + * to chain more lwbs. 
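The split introduced here means space in an lwb is reserved while zl_issuer_lock is held (lwb_nused advances) and the bytes are copied later without the lock (lwb_nfilled catches up). Below is a deliberately tiny, single-threaded stand-in for that reserve-then-fill pattern; the buffer size, names, and full-buffer handling are simplifications, since the real code opens a new lwb instead of giving up:

#include <pthread.h>
#include <stdio.h>
#include <string.h>

/*
 * Toy reserve-then-fill buffer: the reservation (nused) moves under the
 * lock, the copy happens outside it, and nfilled catches up afterwards,
 * mirroring lwb_nused/lwb_nfilled above.
 */
#define	BUFSZ	128

static pthread_mutex_t issuer_lock = PTHREAD_MUTEX_INITIALIZER;
static char buf[BUFSZ];
static size_t nused;		/* reserved bytes, issuer_lock-protected */
static size_t nfilled;		/* bytes actually copied so far */

static char *
reserve(size_t len)
{
	char *dst = NULL;

	pthread_mutex_lock(&issuer_lock);
	if (nused + len <= BUFSZ) {	/* real code opens a new lwb here */
		dst = buf + nused;
		nused += len;
	}
	pthread_mutex_unlock(&issuer_lock);
	return (dst);
}

int
main(void)
{
	const char rec[] = "log record payload";
	char *dst = reserve(sizeof (rec));

	if (dst != NULL) {
		/* Copy without the lock, like zil_lwb_commit(). */
		memcpy(dst, rec, sizeof (rec));
		nfilled += sizeof (rec);
		printf("reserved %zu bytes, filled %zu: \"%s\"\n",
		    nused, nfilled, dst);
	}
	return (0);
}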
+ */ static lwb_t * -zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) +zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) { - lr_t *lrcb, *lrc; - lr_write_t *lrwb, *lrw; - char *lr_buf; - uint64_t dlen, dnow, dpad, lwb_sp, reclen, txg, max_log_data; + itx_t *citx; + lr_t *lr, *clr; + lr_write_t *lrw; + uint64_t dlen, dnow, lwb_sp, reclen, max_log_data; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); ASSERT3P(lwb, !=, NULL); @@ -1923,8 +1953,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) zil_lwb_write_open(zilog, lwb); - lrc = &itx->itx_lr; - lrw = (lr_write_t *)lrc; + lr = &itx->itx_lr; + lrw = (lr_write_t *)lr; /* * A commit itx doesn't represent any on-disk state; instead @@ -1938,24 +1968,23 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) * * For more details, see the comment above zil_commit(). */ - if (lrc->lrc_txtype == TX_COMMIT) { + if (lr->lrc_txtype == TX_COMMIT) { mutex_enter(&zilog->zl_lock); zil_commit_waiter_link_lwb(itx->itx_private, lwb); itx->itx_private = NULL; mutex_exit(&zilog->zl_lock); + list_insert_tail(&lwb->lwb_itxs, itx); return (lwb); } - if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); - dpad = dlen - lrw->lr_length; } else { - dlen = dpad = 0; + dlen = 0; } - reclen = lrc->lrc_reclen; + reclen = lr->lrc_reclen; zilog->zl_cur_used += (reclen + dlen); - txg = lrc->lrc_txg; cont: /* @@ -1968,7 +1997,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) lwb_sp < zil_max_waste_space(zilog) && (dlen % max_log_data == 0 || lwb_sp < reclen + dlen % max_log_data))) { - lwb = zil_lwb_write_issue(zilog, lwb); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb); if (lwb == NULL) return (NULL); zil_lwb_write_open(zilog, lwb); @@ -1987,19 +2017,99 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) } dnow = MIN(dlen, lwb_sp - reclen); - lr_buf = lwb->lwb_buf + lwb->lwb_nused; - memcpy(lr_buf, lrc, reclen); - lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ - lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ + if (dlen > dnow) { + ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); + ASSERT3U(itx->itx_wr_state, ==, WR_NEED_COPY); + citx = zil_itx_clone(itx); + clr = &citx->itx_lr; + lr_write_t *clrw = (lr_write_t *)clr; + clrw->lr_length = dnow; + lrw->lr_offset += dnow; + lrw->lr_length -= dnow; + } else { + citx = itx; + clr = lr; + } + + /* + * We're actually making an entry, so update lrc_seq to be the + * log record sequence number. Note that this is generally not + * equal to the itx sequence number because not all transactions + * are synchronous, and sometimes spa_sync() gets there first. + */ + clr->lrc_seq = ++zilog->zl_lr_seq; + + lwb->lwb_nused += reclen + dnow; + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); + ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); + + zil_lwb_add_txg(lwb, lr->lrc_txg); + list_insert_tail(&lwb->lwb_itxs, citx); + + dlen -= dnow; + if (dlen > 0) { + zilog->zl_cur_used += reclen; + goto cont; + } + + /* + * We have to really issue all queued LWBs before we may have to + * wait for a txg sync. Otherwise we may end up in a dead lock. 
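When a WR_NEED_COPY record does not fit in the space left, the code above clones the itx, caps the clone's lr_length at what fits (dnow), and advances the original's lr_offset and lr_length before continuing into the next lwb. The offset/length arithmetic on its own, with invented sizes:

#include <stdio.h>
#include <stdint.h>

/*
 * The lr_offset/lr_length bookkeeping from zil_lwb_assign(), in isolation:
 * each pass hands the next lwb only what fits and advances the original
 * record, the way the cloned itx is trimmed to dnow.
 */
int
main(void)
{
	uint64_t lr_offset = 0, lr_length = 10000;	/* invented sizes */
	uint64_t lwb_space[] = { 4096, 4096, 4096 };	/* free bytes per lwb */

	for (int i = 0; lr_length > 0 && i < 3; i++) {
		uint64_t dnow = lr_length < lwb_space[i] ?
		    lr_length : lwb_space[i];

		printf("lwb %d gets offset %llu length %llu\n", i,
		    (unsigned long long)lr_offset, (unsigned long long)dnow);

		lr_offset += dnow;	/* original itx advances */
		lr_length -= dnow;	/* remainder goes to the next lwb */
	}
	return (0);
}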
+ */ + if (lr->lrc_txtype == TX_WRITE) { + boolean_t frozen = lr->lrc_txg > spa_freeze_txg(zilog->zl_spa); + if (frozen || itx->itx_wr_state == WR_INDIRECT) { + lwb_t *tlwb; + while ((tlwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, tlwb); + } + if (itx->itx_wr_state == WR_INDIRECT) + lwb->lwb_indirect = B_TRUE; + if (frozen) + txg_wait_synced(zilog->zl_dmu_pool, lr->lrc_txg); + } + + return (lwb); +} + +/* + * Fill the actual transaction data into the lwb, following zil_lwb_assign(). + * Does not require locking. + */ +static void +zil_lwb_commit(zilog_t *zilog, lwb_t *lwb, itx_t *itx) +{ + lr_t *lr, *lrb; + lr_write_t *lrw, *lrwb; + char *lr_buf; + uint64_t dlen, reclen; + + lr = &itx->itx_lr; + lrw = (lr_write_t *)lr; + + if (lr->lrc_txtype == TX_COMMIT) + return; + + if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + dlen = P2ROUNDUP_TYPED( + lrw->lr_length, sizeof (uint64_t), uint64_t); + } else { + dlen = 0; + } + reclen = lr->lrc_reclen; + ASSERT3U(reclen + dlen, <=, lwb->lwb_nused - lwb->lwb_nfilled); + + lr_buf = lwb->lwb_buf + lwb->lwb_nfilled; + memcpy(lr_buf, lr, reclen); + lrb = (lr_t *)lr_buf; /* Like lr, but inside lwb. */ + lrwb = (lr_write_t *)lrb; /* Like lrw, but inside lwb. */ ZIL_STAT_BUMP(zilog, zil_itx_count); /* * If it's a write, fetch the data or get its blkptr as appropriate. */ - if (lrc->lrc_txtype == TX_WRITE) { - if (txg > spa_freeze_txg(zilog->zl_spa)) - txg_wait_synced(zilog->zl_dmu_pool, txg); + if (lr->lrc_txtype == TX_WRITE) { if (itx->itx_wr_state == WR_COPIED) { ZIL_STAT_BUMP(zilog, zil_itx_copied_count); ZIL_STAT_INCR(zilog, zil_itx_copied_bytes, @@ -2010,14 +2120,10 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) if (itx->itx_wr_state == WR_NEED_COPY) { dbuf = lr_buf + reclen; - lrcb->lrc_reclen += dnow; - if (lrwb->lr_length > dnow) - lrwb->lr_length = dnow; - lrw->lr_offset += dnow; - lrw->lr_length -= dnow; + lrb->lrc_reclen += dlen; ZIL_STAT_BUMP(zilog, zil_itx_needcopy_count); ZIL_STAT_INCR(zilog, zil_itx_needcopy_bytes, - dnow); + dlen); } else { ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT); dbuf = NULL; @@ -2044,9 +2150,11 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) error = zilog->zl_get_data(itx->itx_private, itx->itx_gen, lrwb, dbuf, lwb, lwb->lwb_write_zio); - if (dbuf != NULL && error == 0 && dnow == dlen) + if (dbuf != NULL && error == 0) { /* Zero any padding bytes in the last block. */ - memset((char *)dbuf + lrwb->lr_length, 0, dpad); + memset((char *)dbuf + lrwb->lr_length, 0, + dlen - lrwb->lr_length); + } /* * Typically, the only return values we should see from @@ -2074,39 +2182,26 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) error); zfs_fallthrough; case EIO: - txg_wait_synced(zilog->zl_dmu_pool, txg); + if (lwb->lwb_indirect) { + txg_wait_synced(zilog->zl_dmu_pool, + lr->lrc_txg); + } else { + lwb->lwb_write_zio->io_error = error; + } zfs_fallthrough; case ENOENT: zfs_fallthrough; case EEXIST: zfs_fallthrough; case EALREADY: - return (lwb); + return; } } } - /* - * We're actually making an entry, so update lrc_seq to be the - * log record sequence number. Note that this is generally not - * equal to the itx sequence number because not all transactions - * are synchronous, and sometimes spa_sync() gets there first. 
- */ - lrcb->lrc_seq = ++zilog->zl_lr_seq; - lwb->lwb_nused += reclen + dnow; - - zil_lwb_add_txg(lwb, txg); - - ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); - ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); - - dlen -= dnow; - if (dlen > 0) { - zilog->zl_cur_used += reclen; - goto cont; - } - - return (lwb); + lwb->lwb_nfilled += reclen + dlen; + ASSERT3S(lwb->lwb_nfilled, <=, lwb->lwb_nused); + ASSERT0(P2PHASE(lwb->lwb_nfilled, sizeof (uint64_t))); } itx_t * @@ -2131,6 +2226,16 @@ zil_itx_create(uint64_t txtype, size_t olrsize) return (itx); } +static itx_t * +zil_itx_clone(itx_t *oitx) +{ + itx_t *itx = zio_data_buf_alloc(oitx->itx_size); + memcpy(itx, oitx, oitx->itx_size); + itx->itx_callback = NULL; + itx->itx_callback_data = NULL; + return (itx); +} + void zil_itx_destroy(itx_t *itx) { @@ -2162,7 +2267,7 @@ zil_itxg_clean(void *arg) /* * In the general case, commit itxs will not be found * here, as they'll be committed to an lwb via - * zil_lwb_commit(), and free'd in that function. Having + * zil_lwb_assign(), and free'd in that function. Having * said that, it is still possible for commit itxs to be * found here, due to the following race: * @@ -2561,7 +2666,7 @@ zil_commit_writer_stall(zilog_t *zilog) * lwb will be issued to the zio layer to be written to disk. */ static void -zil_process_commit_list(zilog_t *zilog) +zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) { spa_t *spa = zilog->zl_spa; list_t nolwb_itxs; @@ -2663,18 +2768,23 @@ zil_process_commit_list(zilog_t *zilog) */ if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) { if (lwb != NULL) { - lwb = zil_lwb_commit(zilog, itx, lwb); - - if (lwb == NULL) + lwb = zil_lwb_assign(zilog, lwb, itx, ilwbs); + if (lwb == NULL) { list_insert_tail(&nolwb_itxs, itx); - else - list_insert_tail(&lwb->lwb_itxs, itx); + } else if ((zcw->zcw_lwb != NULL && + zcw->zcw_lwb != lwb) || zcw->zcw_done) { + /* + * Our lwb is done, leave the rest of + * itx list to somebody else who care. + */ + first = B_FALSE; + break; + } } else { if (lrc->lrc_txtype == TX_COMMIT) { zil_commit_waiter_link_nolwb( itx->itx_private, &nolwb_waiters); } - list_insert_tail(&nolwb_itxs, itx); } } else { @@ -2690,6 +2800,8 @@ zil_process_commit_list(zilog_t *zilog) * the ZIL write pipeline; see the comment within * zil_commit_writer_stall() for more details. */ + while ((lwb = list_remove_head(ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); /* @@ -2735,13 +2847,13 @@ zil_process_commit_list(zilog_t *zilog) * on the system, such that this function will be * immediately called again (not necessarily by the same * thread) and this lwb's zio will be issued via - * zil_lwb_commit(). This way, the lwb is guaranteed to + * zil_lwb_assign(). This way, the lwb is guaranteed to * be "full" when it is issued to disk, and we'll make * use of the lwb's size the best we can. * * 2. If there isn't sufficient ZIL activity occurring on * the system, such that this lwb's zio isn't issued via - * zil_lwb_commit(), zil_commit_waiter() will issue the + * zil_lwb_assign(), zil_commit_waiter() will issue the * lwb's zio. 
If this occurs, the lwb is not guaranteed * to be "full" by the time its zio is issued, and means * the size of the lwb was "too large" given the amount @@ -2773,10 +2885,15 @@ zil_process_commit_list(zilog_t *zilog) zfs_commit_timeout_pct / 100; if (sleep < zil_min_commit_timeout || lwb->lwb_sz - lwb->lwb_nused < lwb->lwb_sz / 8) { - lwb = zil_lwb_write_issue(zilog, lwb); + list_insert_tail(ilwbs, lwb); + lwb = zil_lwb_write_close(zilog, lwb); zilog->zl_cur_used = 0; - if (lwb == NULL) + if (lwb == NULL) { + while ((lwb = list_remove_head(ilwbs)) + != NULL) + zil_lwb_write_issue(zilog, lwb); zil_commit_writer_stall(zilog); + } } } } @@ -2799,9 +2916,13 @@ zil_process_commit_list(zilog_t *zilog) static void zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { + list_t ilwbs; + lwb_t *lwb; + ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); + list_create(&ilwbs, sizeof (lwb_t), offsetof(lwb_t, lwb_issue_node)); mutex_enter(&zilog->zl_issuer_lock); if (zcw->zcw_lwb != NULL || zcw->zcw_done) { @@ -2828,10 +2949,13 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) zil_get_commit_list(zilog); zil_prune_commit_list(zilog); - zil_process_commit_list(zilog); + zil_process_commit_list(zilog, zcw, &ilwbs); out: mutex_exit(&zilog->zl_issuer_lock); + while ((lwb = list_remove_head(&ilwbs)) != NULL) + zil_lwb_write_issue(zilog, lwb); + list_destroy(&ilwbs); } static void @@ -2858,7 +2982,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) return; /* - * In order to call zil_lwb_write_issue() we must hold the + * In order to call zil_lwb_write_close() we must hold the * zilog's "zl_issuer_lock". We can't simply acquire that lock, * since we're already holding the commit waiter's "zcw_lock", * and those two locks are acquired in the opposite order @@ -2876,8 +3000,10 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * the waiter is marked "done"), so without this check we could * wind up with a use-after-free error below. */ - if (zcw->zcw_done) + if (zcw->zcw_done) { + lwb = NULL; goto out; + } ASSERT3P(lwb, ==, zcw->zcw_lwb); @@ -2896,15 +3022,17 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * if it's ISSUED or OPENED, and block any other threads that might * attempt to issue this lwb. For that reason we hold the * zl_issuer_lock when checking the lwb_state; we must not call - * zil_lwb_write_issue() if the lwb had already been issued. + * zil_lwb_write_close() if the lwb had already been issued. * * See the comment above the lwb_state_t structure definition for * more details on the lwb states, and locking requirements. */ if (lwb->lwb_state == LWB_STATE_ISSUED || lwb->lwb_state == LWB_STATE_WRITE_DONE || - lwb->lwb_state == LWB_STATE_FLUSH_DONE) + lwb->lwb_state == LWB_STATE_FLUSH_DONE) { + lwb = NULL; goto out; + } ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); @@ -2914,7 +3042,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * since we've reached the commit waiter's timeout and it still * hasn't been issued. */ - lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb); + lwb_t *nlwb = zil_lwb_write_close(zilog, lwb); ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED); @@ -2934,7 +3062,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) if (nlwb == NULL) { /* - * When zil_lwb_write_issue() returns NULL, this + * When zil_lwb_write_close() returns NULL, this * indicates zio_alloc_zil() failed to allocate the * "next" lwb on-disk. 
When this occurs, the ZIL write * pipeline must be stalled; see the comment within the @@ -2956,12 +3084,16 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * lock, which occurs prior to calling dmu_tx_commit() */ mutex_exit(&zcw->zcw_lock); + zil_lwb_write_issue(zilog, lwb); + lwb = NULL; zil_commit_writer_stall(zilog); mutex_enter(&zcw->zcw_lock); } out: mutex_exit(&zilog->zl_issuer_lock); + if (lwb) + zil_lwb_write_issue(zilog, lwb); ASSERT(MUTEX_HELD(&zcw->zcw_lock)); } @@ -2976,7 +3108,7 @@ zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw) * waited "long enough" and the lwb is still in the "open" state. * * Given a sufficient amount of itxs being generated and written using - * the ZIL, the lwb's zio will be issued via the zil_lwb_commit() + * the ZIL, the lwb's zio will be issued via the zil_lwb_assign() * function. If this does not occur, this secondary responsibility will * ensure the lwb is issued even if there is not other synchronous * activity on the system. @@ -3656,7 +3788,7 @@ zil_close(zilog_t *zilog) /* * zl_lwb_max_issued_txg may be larger than lwb_max_txg. It depends * on the time when the dmu_tx transaction is assigned in - * zil_lwb_write_issue(). + * zil_lwb_write_close(). */ mutex_enter(&zilog->zl_lwb_io_lock); txg = MAX(zilog->zl_lwb_max_issued_txg, txg); From b6fbe61fa6a75747d9b65082ad4dbec05305d496 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 25 May 2023 16:51:53 -0400 Subject: [PATCH 122/180] zil: Add some more statistics. In addition to a number of actual log bytes written, account also a total written bytes including padding and total allocated bytes (bytes <= write <= alloc). It should allow to monitor zil traffic and space efficiency. Add dtrace probe for zil block size selection. Make zilstat report more information and fit it into less width. Reviewed-by: Ameer Hamza Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #14863 --- cmd/zilstat.in | 180 +++++++++++++----- include/os/linux/zfs/sys/trace_zil.h | 34 ++++ include/sys/zil.h | 12 +- module/zfs/dataset_kstats.c | 6 +- module/zfs/zil.c | 31 +++ .../cli_user/misc/zilstat_001_pos.ksh | 2 +- 6 files changed, 213 insertions(+), 52 deletions(-) diff --git a/cmd/zilstat.in b/cmd/zilstat.in index cf4e2e0dd0c8..e8678e20cafa 100755 --- a/cmd/zilstat.in +++ b/cmd/zilstat.in @@ -36,31 +36,49 @@ import argparse from argparse import RawTextHelpFormatter cols = { - # hdr: [size, scale, kstat name] + # hdr: [size, scale, kstat name] "time": [8, -1, "time"], "pool": [12, -1, "pool"], "ds": [12, -1, "dataset_name"], "obj": [12, -1, "objset"], - "zcc": [10, 1000, "zil_commit_count"], - "zcwc": [10, 1000, "zil_commit_writer_count"], - "ziic": [10, 1000, "zil_itx_indirect_count"], - "zic": [10, 1000, "zil_itx_count"], - "ziib": [10, 1024, "zil_itx_indirect_bytes"], - "zicc": [10, 1000, "zil_itx_copied_count"], - "zicb": [10, 1024, "zil_itx_copied_bytes"], - "zinc": [10, 1000, "zil_itx_needcopy_count"], - "zinb": [10, 1024, "zil_itx_needcopy_bytes"], - "zimnc": [10, 1000, "zil_itx_metaslab_normal_count"], - "zimnb": [10, 1024, "zil_itx_metaslab_normal_bytes"], - "zimsc": [10, 1000, "zil_itx_metaslab_slog_count"], - "zimsb": [10, 1024, "zil_itx_metaslab_slog_bytes"], + "cc": [5, 1000, "zil_commit_count"], + "cwc": [5, 1000, "zil_commit_writer_count"], + "ic": [5, 1000, "zil_itx_count"], + "iic": [5, 1000, "zil_itx_indirect_count"], + "iib": [5, 1024, "zil_itx_indirect_bytes"], + "icc": [5, 1000, "zil_itx_copied_count"], + "icb": [5, 1024, "zil_itx_copied_bytes"], + "inc": [5, 1000, "zil_itx_needcopy_count"], + "inb": [5, 1024, "zil_itx_needcopy_bytes"], + "idc": [5, 1000, "icc+inc"], + "idb": [5, 1024, "icb+inb"], + "iwc": [5, 1000, "iic+idc"], + "iwb": [5, 1024, "iib+idb"], + "imnc": [6, 1000, "zil_itx_metaslab_normal_count"], + "imnb": [6, 1024, "zil_itx_metaslab_normal_bytes"], + "imnw": [6, 1024, "zil_itx_metaslab_normal_write"], + "imna": [6, 1024, "zil_itx_metaslab_normal_alloc"], + "imsc": [6, 1000, "zil_itx_metaslab_slog_count"], + "imsb": [6, 1024, "zil_itx_metaslab_slog_bytes"], + "imsw": [6, 1024, "zil_itx_metaslab_slog_write"], + "imsa": [6, 1024, "zil_itx_metaslab_slog_alloc"], + "imc": [5, 1000, "imnc+imsc"], + "imb": [5, 1024, "imnb+imsb"], + "imw": [5, 1024, "imnw+imsw"], + "ima": [5, 1024, "imna+imsa"], + "se%": [3, 100, "imb/ima"], + "sen%": [4, 100, "imnb/imna"], + "ses%": [4, 100, "imsb/imsa"], + "te%": [3, 100, "imb/imw"], + "ten%": [4, 100, "imnb/imnw"], + "tes%": [4, 100, "imsb/imsw"], } -hdr = ["time", "pool", "ds", "obj", "zcc", "zcwc", "ziic", "zic", "ziib", \ - "zicc", "zicb", "zinc", "zinb", "zimnc", "zimnb", "zimsc", "zimsb"] +hdr = ["time", "ds", "cc", "ic", "idc", "idb", "iic", "iib", + "imnc", "imnw", "imsc", "imsw"] -ghdr = ["time", "zcc", "zcwc", "ziic", "zic", "ziib", "zicc", "zicb", - "zinc", "zinb", "zimnc", "zimnb", "zimsc", "zimsb"] +ghdr = ["time", "cc", "ic", "idc", "idb", "iic", "iib", + "imnc", "imnw", "imsc", "imsw"] cmd = ("Usage: zilstat [-hgdv] [-i interval] [-p pool_name]") @@ -105,7 +123,7 @@ def print_header(): global sep for col in hdr: new_col = col - if interval > 0 and col not in ['time', 'pool', 'ds', 'obj']: + if interval > 0 and cols[col][1] > 100: new_col += "/s" sys.stdout.write("%*s%s" % (cols[col][0], new_col, sep)) sys.stdout.write("\n") @@ -115,7 +133,7 @@ def print_values(v): global sep for col in hdr: val = v[cols[col][2]] - if col not in ['time', 'pool', 'ds', 'obj'] and interval > 0: + if interval 
> 0 and cols[col][1] > 100: val = v[cols[col][2]] // interval sys.stdout.write("%s%s" % ( prettynum(cols[col][0], cols[col][1], val), sep)) @@ -237,9 +255,7 @@ def init(): invalid = [] for ele in hdr: - if gFlag and ele not in ghdr: - invalid.append(ele) - elif ele not in cols: + if ele not in cols: invalid.append(ele) if len(invalid) > 0: @@ -403,17 +419,17 @@ def calculate_diff(): diff = copy.deepcopy(curr) for pool in curr: for objset in curr[pool]: - for col in hdr: - if col not in ['time', 'pool', 'ds', 'obj']: - key = cols[col][2] - # If prev is NULL, this is the - # first time we are here - if not prev: - diff[pool][objset][key] = 0 - else: - diff[pool][objset][key] \ - = curr[pool][objset][key] \ - - prev[pool][objset][key] + for key in curr[pool][objset]: + if not isinstance(diff[pool][objset][key], int): + continue + # If prev is NULL, this is the + # first time we are here + if not prev: + diff[pool][objset][key] = 0 + else: + diff[pool][objset][key] \ + = curr[pool][objset][key] \ + - prev[pool][objset][key] def zil_build_dict(pool = "GLOBAL"): global kstat @@ -425,10 +441,77 @@ def zil_build_dict(pool = "GLOBAL"): if objset not in curr[pool]: curr[pool][objset] = dict() curr[pool][objset][key] = val - curr[pool][objset]["pool"] = pool - curr[pool][objset]["objset"] = objset - curr[pool][objset]["time"] = time.strftime("%H:%M:%S", \ - time.localtime()) + +def zil_extend_dict(): + global diff + for pool in diff: + for objset in diff[pool]: + diff[pool][objset]["pool"] = pool + diff[pool][objset]["objset"] = objset + diff[pool][objset]["time"] = time.strftime("%H:%M:%S", \ + time.localtime()) + diff[pool][objset]["icc+inc"] = \ + diff[pool][objset]["zil_itx_copied_count"] + \ + diff[pool][objset]["zil_itx_needcopy_count"] + diff[pool][objset]["icb+inb"] = \ + diff[pool][objset]["zil_itx_copied_bytes"] + \ + diff[pool][objset]["zil_itx_needcopy_bytes"] + diff[pool][objset]["iic+idc"] = \ + diff[pool][objset]["zil_itx_indirect_count"] + \ + diff[pool][objset]["zil_itx_copied_count"] + \ + diff[pool][objset]["zil_itx_needcopy_count"] + diff[pool][objset]["iib+idb"] = \ + diff[pool][objset]["zil_itx_indirect_bytes"] + \ + diff[pool][objset]["zil_itx_copied_bytes"] + \ + diff[pool][objset]["zil_itx_needcopy_bytes"] + diff[pool][objset]["imnc+imsc"] = \ + diff[pool][objset]["zil_itx_metaslab_normal_count"] + \ + diff[pool][objset]["zil_itx_metaslab_slog_count"] + diff[pool][objset]["imnb+imsb"] = \ + diff[pool][objset]["zil_itx_metaslab_normal_bytes"] + \ + diff[pool][objset]["zil_itx_metaslab_slog_bytes"] + diff[pool][objset]["imnw+imsw"] = \ + diff[pool][objset]["zil_itx_metaslab_normal_write"] + \ + diff[pool][objset]["zil_itx_metaslab_slog_write"] + diff[pool][objset]["imna+imsa"] = \ + diff[pool][objset]["zil_itx_metaslab_normal_alloc"] + \ + diff[pool][objset]["zil_itx_metaslab_slog_alloc"] + if diff[pool][objset]["imna+imsa"] > 0: + diff[pool][objset]["imb/ima"] = 100 * \ + diff[pool][objset]["imnb+imsb"] // \ + diff[pool][objset]["imna+imsa"] + else: + diff[pool][objset]["imb/ima"] = 100 + if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0: + diff[pool][objset]["imnb/imna"] = 100 * \ + diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \ + diff[pool][objset]["zil_itx_metaslab_normal_alloc"] + else: + diff[pool][objset]["imnb/imna"] = 100 + if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0: + diff[pool][objset]["imsb/imsa"] = 100 * \ + diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \ + diff[pool][objset]["zil_itx_metaslab_slog_alloc"] + else: + 
diff[pool][objset]["imsb/imsa"] = 100 + if diff[pool][objset]["imnw+imsw"] > 0: + diff[pool][objset]["imb/imw"] = 100 * \ + diff[pool][objset]["imnb+imsb"] // \ + diff[pool][objset]["imnw+imsw"] + else: + diff[pool][objset]["imb/imw"] = 100 + if diff[pool][objset]["zil_itx_metaslab_normal_alloc"] > 0: + diff[pool][objset]["imnb/imnw"] = 100 * \ + diff[pool][objset]["zil_itx_metaslab_normal_bytes"] // \ + diff[pool][objset]["zil_itx_metaslab_normal_write"] + else: + diff[pool][objset]["imnb/imnw"] = 100 + if diff[pool][objset]["zil_itx_metaslab_slog_alloc"] > 0: + diff[pool][objset]["imsb/imsw"] = 100 * \ + diff[pool][objset]["zil_itx_metaslab_slog_bytes"] // \ + diff[pool][objset]["zil_itx_metaslab_slog_write"] + else: + diff[pool][objset]["imsb/imsw"] = 100 def sign_handler_epipe(sig, frame): print("Caught EPIPE signal: " + str(frame)) @@ -437,30 +520,31 @@ def sign_handler_epipe(sig, frame): def main(): global interval - global curr + global curr, diff hprint = False init() signal.signal(signal.SIGINT, signal.SIG_DFL) signal.signal(signal.SIGPIPE, sign_handler_epipe) + zil_process_kstat() + if not curr: + print ("Error: No stats to show") + sys.exit(0) + print_header() if interval > 0: + time.sleep(interval) while True: calculate_diff() if not diff: print ("Error: No stats to show") sys.exit(0) - if hprint == False: - print_header() - hprint = True + zil_extend_dict() print_dict(diff) time.sleep(interval) else: - zil_process_kstat() - if not curr: - print ("Error: No stats to show") - sys.exit(0) - print_header() - print_dict(curr) + diff = curr + zil_extend_dict() + print_dict(diff) if __name__ == '__main__': main() diff --git a/include/os/linux/zfs/sys/trace_zil.h b/include/os/linux/zfs/sys/trace_zil.h index 7bddd9d1f469..afa1a274e43c 100644 --- a/include/os/linux/zfs/sys/trace_zil.h +++ b/include/os/linux/zfs/sys/trace_zil.h @@ -215,6 +215,39 @@ DEFINE_EVENT(zfs_zil_commit_io_error_class, name, \ TP_ARGS(zilog, zcw)) DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(zfs_zil__commit__io__error); +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * zilog_t *, ..., + * uint64_t, ..., + * uint64_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_zil_block_size_class, + TP_PROTO(zilog_t *zilog, uint64_t res, uint64_t s1), + TP_ARGS(zilog, res, s1), + TP_STRUCT__entry( + ZILOG_TP_STRUCT_ENTRY + __field(uint64_t, res) + __field(uint64_t, s1) + ), + TP_fast_assign( + ZILOG_TP_FAST_ASSIGN + __entry->res = res; + __entry->s1 = s1; + ), + TP_printk( + ZILOG_TP_PRINTK_FMT " res %llu s1 %llu", + ZILOG_TP_PRINTK_ARGS, __entry->res, __entry->s1) +); + +#define DEFINE_ZIL_BLOCK_SIZE_EVENT(name) \ +DEFINE_EVENT(zfs_zil_block_size_class, name, \ + TP_PROTO(zilog_t *zilog, uint64_t res, uint64_t s1), \ + TP_ARGS(zilog, res, s1)) +DEFINE_ZIL_BLOCK_SIZE_EVENT(zfs_zil__block__size); + #endif /* _TRACE_ZIL_H */ #undef TRACE_INCLUDE_PATH @@ -228,6 +261,7 @@ DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(zfs_zil__commit__io__error); DEFINE_DTRACE_PROBE2(zil__process__commit__itx); DEFINE_DTRACE_PROBE2(zil__process__normal__itx); DEFINE_DTRACE_PROBE2(zil__commit__io__error); +DEFINE_DTRACE_PROBE3(zil__block__size); #endif /* HAVE_DECLARE_EVENT_CLASS */ #endif /* _KERNEL */ diff --git a/include/sys/zil.h b/include/sys/zil.h index cff8ebcad819..4747ecc067a9 100644 --- a/include/sys/zil.h +++ b/include/sys/zil.h @@ -489,18 +489,22 @@ typedef struct zil_stats { * Transactions which have been allocated to the "normal" * (i.e. not slog) storage pool. 
Note that "bytes" accumulate * the actual log record sizes - which do not include the actual - * data in case of indirect writes. + * data in case of indirect writes. bytes <= write <= alloc. */ kstat_named_t zil_itx_metaslab_normal_count; kstat_named_t zil_itx_metaslab_normal_bytes; + kstat_named_t zil_itx_metaslab_normal_write; + kstat_named_t zil_itx_metaslab_normal_alloc; /* * Transactions which have been allocated to the "slog" storage pool. * If there are no separate log devices, this is the same as the - * "normal" pool. + * "normal" pool. bytes <= write <= alloc. */ kstat_named_t zil_itx_metaslab_slog_count; kstat_named_t zil_itx_metaslab_slog_bytes; + kstat_named_t zil_itx_metaslab_slog_write; + kstat_named_t zil_itx_metaslab_slog_alloc; } zil_kstat_values_t; typedef struct zil_sums { @@ -515,8 +519,12 @@ typedef struct zil_sums { wmsum_t zil_itx_needcopy_bytes; wmsum_t zil_itx_metaslab_normal_count; wmsum_t zil_itx_metaslab_normal_bytes; + wmsum_t zil_itx_metaslab_normal_write; + wmsum_t zil_itx_metaslab_normal_alloc; wmsum_t zil_itx_metaslab_slog_count; wmsum_t zil_itx_metaslab_slog_bytes; + wmsum_t zil_itx_metaslab_slog_write; + wmsum_t zil_itx_metaslab_slog_alloc; } zil_sums_t; #define ZIL_STAT_INCR(zil, stat, val) \ diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c index 57b8faf213eb..767a461e0026 100644 --- a/module/zfs/dataset_kstats.c +++ b/module/zfs/dataset_kstats.c @@ -49,8 +49,12 @@ static dataset_kstat_values_t empty_dataset_kstats = { { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, - { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 } + { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 } } }; diff --git a/module/zfs/zil.c b/module/zfs/zil.c index f2798270a8a2..509fd39d3590 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -116,8 +116,12 @@ static zil_kstat_values_t zil_stats = { { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_alloc", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_write", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_alloc", KSTAT_DATA_UINT64 }, }; static zil_sums_t zil_sums_global; @@ -378,8 +382,12 @@ zil_sums_init(zil_sums_t *zs) wmsum_init(&zs->zil_itx_needcopy_bytes, 0); wmsum_init(&zs->zil_itx_metaslab_normal_count, 0); wmsum_init(&zs->zil_itx_metaslab_normal_bytes, 0); + wmsum_init(&zs->zil_itx_metaslab_normal_write, 0); + wmsum_init(&zs->zil_itx_metaslab_normal_alloc, 0); wmsum_init(&zs->zil_itx_metaslab_slog_count, 0); wmsum_init(&zs->zil_itx_metaslab_slog_bytes, 0); + wmsum_init(&zs->zil_itx_metaslab_slog_write, 0); + wmsum_init(&zs->zil_itx_metaslab_slog_alloc, 0); } void @@ -396,8 +404,12 @@ zil_sums_fini(zil_sums_t *zs) wmsum_fini(&zs->zil_itx_needcopy_bytes); wmsum_fini(&zs->zil_itx_metaslab_normal_count); wmsum_fini(&zs->zil_itx_metaslab_normal_bytes); + wmsum_fini(&zs->zil_itx_metaslab_normal_write); + 
wmsum_fini(&zs->zil_itx_metaslab_normal_alloc); wmsum_fini(&zs->zil_itx_metaslab_slog_count); wmsum_fini(&zs->zil_itx_metaslab_slog_bytes); + wmsum_fini(&zs->zil_itx_metaslab_slog_write); + wmsum_fini(&zs->zil_itx_metaslab_slog_alloc); } void @@ -425,10 +437,18 @@ zil_kstat_values_update(zil_kstat_values_t *zs, zil_sums_t *zil_sums) wmsum_value(&zil_sums->zil_itx_metaslab_normal_count); zs->zil_itx_metaslab_normal_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_normal_bytes); + zs->zil_itx_metaslab_normal_write.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_normal_write); + zs->zil_itx_metaslab_normal_alloc.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_normal_alloc); zs->zil_itx_metaslab_slog_count.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_count); zs->zil_itx_metaslab_slog_bytes.value.ui64 = wmsum_value(&zil_sums->zil_itx_metaslab_slog_bytes); + zs->zil_itx_metaslab_slog_write.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_slog_write); + zs->zil_itx_metaslab_slog_alloc.value.ui64 = + wmsum_value(&zil_sums->zil_itx_metaslab_slog_alloc); } /* @@ -1814,6 +1834,9 @@ zil_lwb_write_close(zilog_t *zilog, lwb_t *lwb) zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; for (i = 0; i < ZIL_PREV_BLKS; i++) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); + DTRACE_PROBE3(zil__block__size, zilog_t *, zilog, + uint64_t, zil_blksz, + uint64_t, zilog->zl_prev_blks[zilog->zl_prev_rotor]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) @@ -1888,10 +1911,18 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) ZIL_STAT_BUMP(zilog, zil_itx_metaslab_slog_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_bytes, lwb->lwb_nused); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_write, + wsz); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_slog_alloc, + BP_GET_LSIZE(&lwb->lwb_blk)); } else { ZIL_STAT_BUMP(zilog, zil_itx_metaslab_normal_count); ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_bytes, lwb->lwb_nused); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_write, + wsz); + ZIL_STAT_INCR(zilog, zil_itx_metaslab_normal_alloc, + BP_GET_LSIZE(&lwb->lwb_blk)); } spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); zil_lwb_add_block(lwb, &lwb->lwb_blk); diff --git a/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh index 9bf6a94cfc84..9deee67a56ca 100755 --- a/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_user/misc/zilstat_001_pos.ksh @@ -25,7 +25,7 @@ is_freebsd && ! python3 -c 'import sysctl' 2>/dev/null && log_unsupported "python3 sysctl module missing" set -A args "" "-s \",\"" "-v" \ - "-f time,zcwc,zimnb,zimsb" + "-f time,cwc,imnb,imsb" log_assert "zilstat generates output and doesn't return an error code" From 91a2325c4a0fbe01d0bf212e44fa9d85017837ce Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 25 May 2023 13:53:08 -0700 Subject: [PATCH 123/180] Update compatibility.d files Add an openzfs-2.2 compatibility file for the next release. Edon-R support has been enabled for FreeBSD removing the need for different FreeBSD and Linux files. Symlinks for the -linux and -freebsd names are created for any scripts expecting that convention. Additionally, a symlink for ubunutu-22.04 was added. 
Signed-off-by: Brian Behlendorf Closes #14833 --- cmd/zpool/Makefile.am | 6 +++- cmd/zpool/compatibility.d/openzfs-2.2 | 40 +++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 cmd/zpool/compatibility.d/openzfs-2.2 diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am index 3c7c8a9aebe2..de700eabf86b 100644 --- a/cmd/zpool/Makefile.am +++ b/cmd/zpool/Makefile.am @@ -145,6 +145,7 @@ dist_zpoolcompat_DATA = \ %D%/compatibility.d/openzfs-2.0-linux \ %D%/compatibility.d/openzfs-2.1-freebsd \ %D%/compatibility.d/openzfs-2.1-linux \ + %D%/compatibility.d/openzfs-2.2 \ %D%/compatibility.d/openzfsonosx-1.7.0 \ %D%/compatibility.d/openzfsonosx-1.8.1 \ %D%/compatibility.d/openzfsonosx-1.9.3 \ @@ -173,7 +174,10 @@ zpoolcompatlinks = \ "openzfsonosx-1.9.3 openzfsonosx-1.9.4" \ "openzfs-2.0-freebsd truenas-12.0" \ "zol-0.7 ubuntu-18.04" \ - "zol-0.8 ubuntu-20.04" + "zol-0.8 ubuntu-20.04" \ + "openzfs-2.1-linux ubuntu-22.04" \ + "openzfs-2.2 openzfs-2.2-linux" \ + "openzfs-2.2 openzfs-2.2-freebsd" zpoolconfdir = $(sysconfdir)/zfs/zpool.d INSTALL_DATA_HOOKS += zpool-install-data-hook diff --git a/cmd/zpool/compatibility.d/openzfs-2.2 b/cmd/zpool/compatibility.d/openzfs-2.2 new file mode 100644 index 000000000000..c9491cd8dc42 --- /dev/null +++ b/cmd/zpool/compatibility.d/openzfs-2.2 @@ -0,0 +1,40 @@ +# Features supported by OpenZFS 2.2 on Linux and FreeBSD +allocation_classes +async_destroy +blake3 +block_cloning +bookmark_v2 +bookmark_written +bookmarks +device_rebuild +device_removal +draid +edonr +embedded_data +empty_bpobj +enabled_txg +encryption +extensible_dataset +filesystem_limits +head_errlog +hole_birth +large_blocks +large_dnode +livelist +log_spacemap +lz4_compress +multi_vdev_crash_dump +obsolete_counts +project_quota +redacted_datasets +redaction_bookmarks +resilver_defer +sha512 +skein +spacemap_histogram +spacemap_v2 +userobj_accounting +vdev_zaps_v2 +zilsaxattr +zpool_checkpoint +zstd_compress From ff03dfd4d8cc533fcec5f63dd7cc5aa20f99cb18 Mon Sep 17 00:00:00 2001 From: Damiano Albani Date: Fri, 26 May 2023 01:10:54 +0200 Subject: [PATCH 124/180] Add missing files to Debian DKMS package Reviewed-by: Tino Reichardt Reviewed-by: Umer Saleem Reviewed-by: Brian Behlendorf Signed-off-by: Damiano Albani Closes #14887 Closes #14889 --- contrib/debian/rules.in | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/debian/rules.in b/contrib/debian/rules.in index 63892c6ca243..f0791cfabd38 100755 --- a/contrib/debian/rules.in +++ b/contrib/debian/rules.in @@ -7,8 +7,8 @@ NAME := $(shell awk '$$1 == "Name:" { print $$2; }' META) LINUX_MIN := $(shell awk '/Linux-Minimum:/{print $$2}' META) LINUX_NEXT := $(shell awk -F'[ .]' '/Linux-Maximum:/{print $$2 "." 
$$3+1}' META) -DKMSFILES := module include config zfs.release.in autogen.sh META AUTHORS \ - COPYRIGHT LICENSE README.md +DKMSFILES := module include config zfs.release.in autogen.sh copy-builtin META AUTHORS \ + COPYRIGHT LICENSE README.md CODE_OF_CONDUCT.md NEWS NOTICE RELEASES.md ifndef KVERS KVERS=$(shell uname -r) From bb736d98d133b4449a4e3bb97a914651677e6713 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Fri, 26 May 2023 18:53:00 +0200 Subject: [PATCH 125/180] Fix inconsistent definition of zfs_scrub_error_blocks_per_txg Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #14894 --- module/zfs/dsl_scan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 5e3559b251e3..6cad339104a4 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -234,7 +234,7 @@ static int zfs_resilver_disable_defer = B_FALSE; static int zfs_free_bpobj_enabled = 1; /* Error blocks to be scrubbed in one txg. */ -unsigned long zfs_scrub_error_blocks_per_txg = 1 << 12; +static uint_t zfs_scrub_error_blocks_per_txg = 1 << 12; /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { @@ -5242,6 +5242,6 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW, ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW, "Process all resilvers immediately"); -ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, U64, ZMOD_RW, +ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, UINT, ZMOD_RW, "Error blocks to be scrubbed in one txg"); /* END CSTYLED */ From 677c6f8457943fe5b56d7aa8807010a104563e4a Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Fri, 26 May 2023 13:03:12 -0400 Subject: [PATCH 126/180] btree: Implement faster binary search algorithm This implements a binary search algorithm for B-Trees that reduces branching to the absolute minimum necessary for a binary search algorithm. It also enables the compiler to inline the comparator to ensure that the only slowdown when doing binary search is from waiting for memory accesses. Additionally, it instructs the compiler to unroll the loop, which gives an additional 40% improve with Clang and 8% improvement with GCC. Consumers must opt into using the faster algorithm. At present, only B-Trees used inside kernel code have been modified to use the faster algorithm. Micro-benchmarks suggest that this can improve binary search performance by up to 3.5 times when compiling with Clang 16 and up to 1.9 times when compiling with GCC 12.2. 
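As a rough illustration of the technique, the standalone sketch below shows
the shape of the branch-reduced search loop described above; cmp_u64() and
find_u64() are hypothetical names, not part of the ZFS tree. The comparator
is inlined and its result is folded into the index arithmetic, so the search
body contains no data-dependent branch. The in-tree version is generated by
the ZFS_BTREE_FIND_IN_BUF_FUNC() macro in the diff that follows, which
additionally asks the compiler to unroll the loop and records the would-be
insertion point for misses.

#include <stddef.h>
#include <stdint.h>

/* Exactly -1, 0, or +1, like TREE_CMP(); small enough to always inline. */
static inline int
cmp_u64(const uint64_t *a, const uint64_t *b)
{
        return ((*a > *b) - (*a < *b));
}

/* Returns the matching element, or NULL if value is not in buf[0..nelems-1]. */
static uint64_t *
find_u64(uint64_t *buf, uint32_t nelems, const uint64_t *value)
{
        uint64_t *i = buf;

        if (nelems == 0)
                return (NULL);
        /*
         * Each iteration halves the candidate range. The comparison result
         * is turned into a 0/1 multiplier on the step instead of a branch,
         * which compilers typically lower to a conditional move.
         */
        while (nelems > 1) {
                uint32_t half = nelems / 2;

                nelems -= half;
                i += (cmp_u64(&i[half - 1], value) < 0) * half;
        }
        return (cmp_u64(i, value) == 0 ? i : NULL);
}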
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #14866 --- cmd/zdb/zdb.c | 8 ++-- include/sys/btree.h | 66 ++++++++++++++++++++++++++++++-- module/Kbuild.in | 14 +++++++ module/Makefile.bsd | 14 +++++++ module/zfs/btree.c | 22 +++++++---- module/zfs/dsl_scan.c | 7 +++- module/zfs/metaslab.c | 23 +++++++---- module/zfs/range_tree.c | 18 ++++++++- module/zfs/zap_micro.c | 6 ++- tests/zfs-tests/cmd/btree_test.c | 2 +- 10 files changed, 154 insertions(+), 26 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 5ab13b470dc0..61f1258f72b9 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -326,7 +326,7 @@ sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) int err; struct sublivelist_verify *sv = args; - zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, + zfs_btree_create(&sv->sv_pair, sublivelist_block_refcnt_compare, NULL, sizeof (sublivelist_verify_block_refcnt_t)); err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, @@ -390,7 +390,7 @@ sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle) { (void) args; sublivelist_verify_t sv; - zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); int err = sublivelist_verify_func(&sv, dle); zfs_btree_clear(&sv.sv_leftover); @@ -682,7 +682,7 @@ livelist_metaslab_validate(spa_t *spa) (void) printf("Verifying deleted livelist entries\n"); sublivelist_verify_t sv; - zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); iterate_deleted_livelists(spa, livelist_verify, &sv); @@ -716,7 +716,7 @@ livelist_metaslab_validate(spa_t *spa) mv.mv_start = m->ms_start; mv.mv_end = m->ms_start + m->ms_size; zfs_btree_create(&mv.mv_livelist_allocs, - livelist_block_compare, + livelist_block_compare, NULL, sizeof (sublivelist_verify_block_t)); mv_populate_livelist_allocs(&mv, &sv); diff --git a/include/sys/btree.h b/include/sys/btree.h index 883abb5181c9..6e05eee8f01d 100644 --- a/include/sys/btree.h +++ b/include/sys/btree.h @@ -105,8 +105,13 @@ typedef struct zfs_btree_index { boolean_t bti_before; } zfs_btree_index_t; -typedef struct btree { +typedef struct btree zfs_btree_t; +typedef void * (*bt_find_in_buf_f) (zfs_btree_t *, uint8_t *, uint32_t, + const void *, zfs_btree_index_t *); + +struct btree { int (*bt_compar) (const void *, const void *); + bt_find_in_buf_f bt_find_in_buf; size_t bt_elem_size; size_t bt_leaf_size; uint32_t bt_leaf_cap; @@ -115,7 +120,54 @@ typedef struct btree { uint64_t bt_num_nodes; zfs_btree_hdr_t *bt_root; zfs_btree_leaf_t *bt_bulk; // non-null if bulk loading -} zfs_btree_t; +}; + +/* + * Implementation of Shar's algorithm designed to accelerate binary search by + * eliminating impossible to predict branches. + * + * For optimality, this should be used to generate the search function in the + * same file as the comparator and the comparator should be marked + * `__attribute__((always_inline) inline` so that the compiler will inline it. + * + * Arguments are: + * + * NAME - The function name for this instance of the search function. Use it + * in a subsequent call to zfs_btree_create(). + * T - The element type stored inside the B-Tree. + * COMP - A comparator to compare two nodes, it must return exactly: -1, 0, + * or +1 -1 for <, 0 for ==, and +1 for >. 
For trivial comparisons, + * TREE_CMP() from avl.h can be used in a boilerplate function. + */ +/* BEGIN CSTYLED */ +#define ZFS_BTREE_FIND_IN_BUF_FUNC(NAME, T, COMP) \ +_Pragma("GCC diagnostic push") \ +_Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") \ +static void * \ +NAME(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, \ + const void *value, zfs_btree_index_t *where) \ +{ \ + T *i = (T *)buf; \ + (void) tree; \ + _Pragma("GCC unroll 9") \ + while (nelems > 1) { \ + uint32_t half = nelems / 2; \ + nelems -= half; \ + i += (COMP(&i[half - 1], value) < 0) * half; \ + } \ + \ + int comp = COMP(i, value); \ + where->bti_offset = (i - (T *)buf) + (comp < 0); \ + where->bti_before = (comp != 0); \ + \ + if (comp == 0) { \ + return (i); \ + } \ + \ + return (NULL); \ +} \ +_Pragma("GCC diagnostic pop") +/* END CSTYLED */ /* * Allocate and deallocate caches for btree nodes. @@ -129,13 +181,19 @@ void zfs_btree_fini(void); * tree - the tree to be initialized * compar - function to compare two nodes, it must return exactly: -1, 0, or +1 * -1 for <, 0 for ==, and +1 for > + * find - optional function to accelerate searches inside B-Tree nodes + * through Shar's algorithm and comparator inlining. Setting this to + * NULL will use a generic function. The function should be created + * using ZFS_BTREE_FIND_IN_BUF_FUNC() in the same file as compar. + * compar should be marked `__attribute__((always_inline)) inline` or + * performance is unlikely to improve very much. * size - the value of sizeof(struct my_type) * lsize - custom leaf size */ void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *), - size_t); + bt_find_in_buf_f, size_t); void zfs_btree_create_custom(zfs_btree_t *, int (*)(const void *, const void *), - size_t, size_t); + bt_find_in_buf_f, size_t, size_t); /* * Find a node with a matching value in the tree. Returns the matching node diff --git a/module/Kbuild.in b/module/Kbuild.in index 8d29f56c2fb8..29a55c9778b1 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -34,6 +34,20 @@ ifeq ($(CONFIG_KASAN),y) ZFS_MODULE_CFLAGS += -Wno-error=frame-larger-than= endif +# Generated binary search code is particularly bad with this optimization. +# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c +# is not affected when unrolling is done. +# Disable it until the following upstream issue is resolved: +# https://github.com/llvm/llvm-project/issues/62790 +ifeq ($(CONFIG_X86),y) +ifeq ($(CONFIG_CC_IS_CLANG),y) +CFLAGS_zfs/dsl_scan.o += -mllvm -x86-cmov-converter=false +CFLAGS_zfs/metaslab.o += -mllvm -x86-cmov-converter=false +CFLAGS_zfs/range_tree.o += -mllvm -x86-cmov-converter=false +CFLAGS_zfs/zap_micro.o += -mllvm -x86-cmov-converter=false +endif +endif + ifneq ($(KBUILD_EXTMOD),) @CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include @CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 365609fb8585..9464223f6ca6 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -400,6 +400,20 @@ beforeinstall: .include +# Generated binary search code is particularly bad with this optimization. +# Oddly, range_tree.c is not affected when unrolling is not done and dsl_scan.c +# is not affected when unrolling is done. 
+# Disable it until the following upstream issue is resolved: +# https://github.com/llvm/llvm-project/issues/62790 +.if ${CC} == "clang" +.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "amd64" +CFLAGS.dsl_scan.c= -mllvm -x86-cmov-converter=false +CFLAGS.metaslab.c= -mllvm -x86-cmov-converter=false +CFLAGS.range_tree.c= -mllvm -x86-cmov-converter=false +CFLAGS.zap_micro.c= -mllvm -x86-cmov-converter=false +.endif +.endif + CFLAGS.sysctl_os.c= -include ../zfs_config.h CFLAGS.xxhash.c+= -include ${SYSDIR}/sys/_null.h diff --git a/module/zfs/btree.c b/module/zfs/btree.c index 4c25afaa8199..af2b94a850be 100644 --- a/module/zfs/btree.c +++ b/module/zfs/btree.c @@ -193,14 +193,20 @@ zfs_btree_leaf_free(zfs_btree_t *tree, void *ptr) void zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *), - size_t size) + bt_find_in_buf_f bt_find_in_buf, size_t size) { - zfs_btree_create_custom(tree, compar, size, BTREE_LEAF_SIZE); + zfs_btree_create_custom(tree, compar, bt_find_in_buf, size, + BTREE_LEAF_SIZE); } +static void * +zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint32_t nelems, + const void *value, zfs_btree_index_t *where); + void zfs_btree_create_custom(zfs_btree_t *tree, int (*compar) (const void *, const void *), + bt_find_in_buf_f bt_find_in_buf, size_t size, size_t lsize) { size_t esize = lsize - offsetof(zfs_btree_leaf_t, btl_elems); @@ -208,6 +214,8 @@ zfs_btree_create_custom(zfs_btree_t *tree, ASSERT3U(size, <=, esize / 2); memset(tree, 0, sizeof (*tree)); tree->bt_compar = compar; + tree->bt_find_in_buf = (bt_find_in_buf == NULL) ? + zfs_btree_find_in_buf : bt_find_in_buf; tree->bt_elem_size = size; tree->bt_leaf_size = lsize; tree->bt_leaf_cap = P2ALIGN(esize / size, 2); @@ -303,7 +311,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) * element in the last leaf, it's in the last leaf or * it's not in the tree. */ - void *d = zfs_btree_find_in_buf(tree, + void *d = tree->bt_find_in_buf(tree, last_leaf->btl_elems + last_leaf->btl_hdr.bth_first * size, last_leaf->btl_hdr.bth_count, value, &idx); @@ -327,7 +335,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height; node = (zfs_btree_core_t *)node->btc_children[child], depth++) { ASSERT3P(node, !=, NULL); - void *d = zfs_btree_find_in_buf(tree, node->btc_elems, + void *d = tree->bt_find_in_buf(tree, node->btc_elems, node->btc_hdr.bth_count, value, &idx); EQUIV(d != NULL, !idx.bti_before); if (d != NULL) { @@ -347,7 +355,7 @@ zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) */ zfs_btree_leaf_t *leaf = (depth == 0 ? 
(zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node); - void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems + + void *d = tree->bt_find_in_buf(tree, leaf->btl_elems + leaf->btl_hdr.bth_first * size, leaf->btl_hdr.bth_count, value, &idx); @@ -671,7 +679,7 @@ zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; zfs_btree_index_t idx; ASSERT(zfs_btree_is_core(par_hdr)); - VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, par_hdr->bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); uint32_t offset = idx.bti_offset; @@ -897,7 +905,7 @@ zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) } zfs_btree_index_t idx; zfs_btree_core_t *parent = hdr->bth_parent; - VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + VERIFY3P(tree->bt_find_in_buf(tree, parent->btc_elems, parent->btc_hdr.bth_count, buf, &idx), ==, NULL); ASSERT(idx.bti_before); ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 6cad339104a4..9ee719a5eef6 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -4877,6 +4877,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, * with single operation. Plus it makes scrubs more sequential and reduces * chances that minor extent change move it within the B-tree. */ +__attribute__((always_inline)) inline static int ext_size_compare(const void *x, const void *y) { @@ -4885,13 +4886,17 @@ ext_size_compare(const void *x, const void *y) return (TREE_CMP(*a, *b)); } +ZFS_BTREE_FIND_IN_BUF_FUNC(ext_size_find_in_buf, uint64_t, + ext_size_compare) + static void ext_size_create(range_tree_t *rt, void *arg) { (void) rt; zfs_btree_t *size_tree = arg; - zfs_btree_create(size_tree, ext_size_compare, sizeof (uint64_t)); + zfs_btree_create(size_tree, ext_size_compare, ext_size_find_in_buf, + sizeof (uint64_t)); } static void diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 24d52a74933f..94b131fcdb79 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1342,6 +1342,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, * Comparison function for the private size-ordered tree using 32-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. */ +__attribute__((always_inline)) inline static int metaslab_rangesize32_compare(const void *x1, const void *x2) { @@ -1352,16 +1353,15 @@ metaslab_rangesize32_compare(const void *x1, const void *x2) uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); - if (likely(cmp)) - return (cmp); - return (TREE_CMP(r1->rs_start, r2->rs_start)); + return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } /* * Comparison function for the private size-ordered tree using 64-bit * ranges. Tree is sorted by size, larger sizes at the end of the tree. 
*/ +__attribute__((always_inline)) inline static int metaslab_rangesize64_compare(const void *x1, const void *x2) { @@ -1372,11 +1372,10 @@ metaslab_rangesize64_compare(const void *x1, const void *x2) uint64_t rs_size2 = r2->rs_end - r2->rs_start; int cmp = TREE_CMP(rs_size1, rs_size2); - if (likely(cmp)) - return (cmp); - return (TREE_CMP(r1->rs_start, r2->rs_start)); + return (cmp + !cmp * TREE_CMP(r1->rs_start, r2->rs_start)); } + typedef struct metaslab_rt_arg { zfs_btree_t *mra_bt; uint32_t mra_floor_shift; @@ -1412,6 +1411,13 @@ metaslab_size_tree_full_load(range_tree_t *rt) range_tree_walk(rt, metaslab_size_sorted_add, &arg); } + +ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize32_in_buf, + range_seg32_t, metaslab_rangesize32_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(metaslab_rt_find_rangesize64_in_buf, + range_seg64_t, metaslab_rangesize64_compare) + /* * Create any block allocator specific components. The current allocators * rely on using both a size-ordered range_tree_t and an array of uint64_t's. @@ -1424,19 +1430,22 @@ metaslab_rt_create(range_tree_t *rt, void *arg) size_t size; int (*compare) (const void *, const void *); + bt_find_in_buf_f bt_find; switch (rt->rt_type) { case RANGE_SEG32: size = sizeof (range_seg32_t); compare = metaslab_rangesize32_compare; + bt_find = metaslab_rt_find_rangesize32_in_buf; break; case RANGE_SEG64: size = sizeof (range_seg64_t); compare = metaslab_rangesize64_compare; + bt_find = metaslab_rt_find_rangesize64_in_buf; break; default: panic("Invalid range seg type %d", rt->rt_type); } - zfs_btree_create(size_tree, compare, size); + zfs_btree_create(size_tree, compare, bt_find, size); mrap->mra_floor_shift = metaslab_by_size_min_shift; } diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c index 894c30fcae16..5174e2c46633 100644 --- a/module/zfs/range_tree.c +++ b/module/zfs/range_tree.c @@ -151,6 +151,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) rt->rt_histogram[idx]--; } +__attribute__((always_inline)) inline static int range_tree_seg32_compare(const void *x1, const void *x2) { @@ -163,6 +164,7 @@ range_tree_seg32_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +__attribute__((always_inline)) inline static int range_tree_seg64_compare(const void *x1, const void *x2) { @@ -175,6 +177,7 @@ range_tree_seg64_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +__attribute__((always_inline)) inline static int range_tree_seg_gap_compare(const void *x1, const void *x2) { @@ -187,6 +190,15 @@ range_tree_seg_gap_compare(const void *x1, const void *x2) return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); } +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg32_find_in_buf, range_seg32_t, + range_tree_seg32_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg64_find_in_buf, range_seg64_t, + range_tree_seg64_compare) + +ZFS_BTREE_FIND_IN_BUF_FUNC(range_tree_seg_gap_find_in_buf, range_seg_gap_t, + range_tree_seg_gap_compare) + range_tree_t * range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, uint64_t gap) @@ -197,23 +209,27 @@ range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type, ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES); size_t size; int (*compare) (const void *, const void *); + bt_find_in_buf_f bt_find; switch (type) { case RANGE_SEG32: size = sizeof (range_seg32_t); compare = range_tree_seg32_compare; + bt_find = 
range_tree_seg32_find_in_buf; break; case RANGE_SEG64: size = sizeof (range_seg64_t); compare = range_tree_seg64_compare; + bt_find = range_tree_seg64_find_in_buf; break; case RANGE_SEG_GAP: size = sizeof (range_seg_gap_t); compare = range_tree_seg_gap_compare; + bt_find = range_tree_seg_gap_find_in_buf; break; default: panic("Invalid range seg type %d", type); } - zfs_btree_create(&rt->rt_root, compare, size); + zfs_btree_create(&rt->rt_root, compare, bt_find, size); rt->rt_ops = ops; rt->rt_gap = gap; diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c index d6ad8b2b8bc5..085d9cd8b4b6 100644 --- a/module/zfs/zap_micro.c +++ b/module/zfs/zap_micro.c @@ -285,6 +285,7 @@ zap_byteswap(void *buf, size_t size) } } +__attribute__((always_inline)) inline static int mze_compare(const void *arg1, const void *arg2) { @@ -295,6 +296,9 @@ mze_compare(const void *arg1, const void *arg2) (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd)); } +ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t, + mze_compare) + static void mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash) { @@ -461,7 +465,7 @@ mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) * 62 entries before we have to add 2KB B-tree core node. */ zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare, - sizeof (mzap_ent_t), 512); + mze_find_in_buf, sizeof (mzap_ent_t), 512); zap_name_t *zn = zap_name_alloc(zap); for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) { diff --git a/tests/zfs-tests/cmd/btree_test.c b/tests/zfs-tests/cmd/btree_test.c index 9a34bf559be0..fda9229915ce 100644 --- a/tests/zfs-tests/cmd/btree_test.c +++ b/tests/zfs-tests/cmd/btree_test.c @@ -501,7 +501,7 @@ main(int argc, char *argv[]) srandom(seed); zfs_btree_init(); - zfs_btree_create(&bt, zfs_btree_compare, sizeof (uint64_t)); + zfs_btree_create(&bt, zfs_btree_compare, NULL, sizeof (uint64_t)); /* * This runs the named negative test. None of them should From d3e0138a3d186d61a13b9b8450c3b0d1b0ba9398 Mon Sep 17 00:00:00 2001 From: Colm Date: Fri, 26 May 2023 10:04:19 -0700 Subject: [PATCH 127/180] Adding new read-only compatible zpool features to compatibility.d/grub2 GRUB2 is compatible with all "read-only compatible" features, so it is safe to add new features of this type to the grub2 compatibility list. We generally want to include all compatible features, to minimize the differences between grub2-compatible pools and no-compatibility pools. Adding new properties `livelist` and `zpool_checkpoint` accordingly. Also adding them to the man page which references this file as an example, for consistency. 
Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Signed-off-by: Colm Buckley Closes #14893 --- cmd/zpool/compatibility.d/grub2 | 2 ++ man/man7/zpool-features.7 | 2 ++ 2 files changed, 4 insertions(+) diff --git a/cmd/zpool/compatibility.d/grub2 b/cmd/zpool/compatibility.d/grub2 index 4e8f21362554..fec73a269a78 100644 --- a/cmd/zpool/compatibility.d/grub2 +++ b/cmd/zpool/compatibility.d/grub2 @@ -8,5 +8,7 @@ extensible_dataset filesystem_limits hole_birth large_blocks +livelist lz4_compress spacemap_histogram +zpool_checkpoint diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index 2b7dcb63829c..b901ce6c2935 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -228,8 +228,10 @@ extensible_dataset filesystem_limits hole_birth large_blocks +livelist lz4_compress spacemap_histogram +zpool_checkpoint .No example# Nm zpool Cm create Fl o Sy compatibility Ns = Ns Ar grub2 Ar bootpool Ar vdev .Ed From 365bae0eab3bf1c9ce29789094fb352a7f269974 Mon Sep 17 00:00:00 2001 From: Mike Swanson Date: Fri, 26 May 2023 15:37:15 -0700 Subject: [PATCH 128/180] Add compatibility symlinks for FreeBSD 12.{3,4} and 13.{0,1,2} Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Signed-off-by: Mike Swanson Closes #14902 --- cmd/zpool/Makefile.am | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am index de700eabf86b..d08b8e1791b6 100644 --- a/cmd/zpool/Makefile.am +++ b/cmd/zpool/Makefile.am @@ -169,6 +169,11 @@ zpoolcompatlinks = \ "freebsd-11.3 freebsd-12.0" \ "freebsd-11.3 freebsd-12.1" \ "freebsd-11.3 freebsd-12.2" \ + "freebsd-11.3 freebsd-12.3" \ + "freebsd-11.3 freebsd-12.4" \ + "openzfs-2.1-freebsd freebsd-13.0" \ + "openzfs-2.1-freebsd freebsd-13.1" \ + "openzfs-2.1-freebsd freebsd-13.2" \ "freebsd-11.3 freenas-11.3" \ "freenas-11.0 freenas-11.1" \ "openzfsonosx-1.9.3 openzfsonosx-1.9.4" \ From 20494d47d22a60964274c73db3b22bc385eb9667 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 26 May 2023 15:39:23 -0700 Subject: [PATCH 129/180] ZTS: Add zpool_resilver_concurrent exception The zpool_resilver_concurrent test case requires the ZED which is not used on FreeBSD. Add this test to the known list of skipped tested for FreeBSD. Signed-off-by: Brian Behlendorf Closes #14904 --- tests/test-runner/bin/zts-report.py.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 3f7498f5c6bf..3eeee35878f8 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -163,6 +163,8 @@ if sys.platform.startswith('freebsd'): known.update({ 'cli_root/zfs_receive/receive-o-x_props_override': ['FAIL', known_reason], + 'cli_root/zpool_resilver/zpool_resilver_concurrent': + ['SKIP', na_reason], 'cli_root/zpool_wait/zpool_wait_trim_basic': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason], From 0f03a411615a797425de488eecfaaf63fc41acfe Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Fri, 26 May 2023 18:47:52 -0400 Subject: [PATCH 130/180] Use __attribute__((malloc)) on memory allocation functions This informs the C compiler that pointers returned from these functions do not alias other functions, which allows it to do better code optimization and should make the compiled code smaller. 
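As a minimal, hypothetical sketch of what the attribute buys (example_alloc()
and renumber() below are illustrative names, not part of the patched headers):
the compiler may assume the returned pointer does not alias any pointer that
was valid before the call, so stores through it cannot modify other objects.
That lets it, for instance, vectorize the copy loop below without emitting
runtime overlap checks between dst and src.

#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

/* Same attribute style as the patched headers. */
__attribute__((malloc, alloc_size(1)))
static void *
example_alloc(size_t size)
{
        return (malloc(size));
}

static void
renumber(const uint64_t *src, uint64_t **out, size_t n)
{
        uint64_t *dst = example_alloc(n * sizeof (uint64_t));

        if (dst == NULL) {
                *out = NULL;
                return;
        }
        for (size_t i = 0; i < n; i++) {
                /*
                 * The malloc attribute tells the compiler dst cannot alias
                 * src, so no aliasing hazard has to be assumed here.
                 */
                dst[i] = src[i] + 1;
        }
        *out = dst;
}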
References: https://stackoverflow.com/a/53654773 https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-malloc-function-attribute https://clang.llvm.org/docs/AttributeReference.html#malloc Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #14827 --- include/os/freebsd/spl/sys/kmem.h | 3 ++- include/os/linux/spl/sys/kmem.h | 16 ++++++++-------- include/os/linux/spl/sys/vmem.h | 6 ++++-- include/sys/abd.h | 5 +++++ lib/libspl/include/umem.h | 7 ++++--- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/include/os/freebsd/spl/sys/kmem.h b/include/os/freebsd/spl/sys/kmem.h index 27d290863c0b..c633799318d5 100644 --- a/include/os/freebsd/spl/sys/kmem.h +++ b/include/os/freebsd/spl/sys/kmem.h @@ -75,7 +75,7 @@ typedef struct kmem_cache { extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache); extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); -__attribute__((alloc_size(1))) +__attribute__((malloc, alloc_size(1))) void *zfs_kmem_alloc(size_t size, int kmflags); void zfs_kmem_free(void *buf, size_t size); uint64_t kmem_size(void); @@ -83,6 +83,7 @@ kmem_cache_t *kmem_cache_create(const char *name, size_t bufsize, size_t align, int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags); void kmem_cache_destroy(kmem_cache_t *cache); +__attribute__((malloc)) void *kmem_cache_alloc(kmem_cache_t *cache, int flags); void kmem_cache_free(kmem_cache_t *cache, void *buf); boolean_t kmem_cache_reap_active(void); diff --git a/include/os/linux/spl/sys/kmem.h b/include/os/linux/spl/sys/kmem.h index 594425f7b297..8a203f7bb8e2 100644 --- a/include/os/linux/spl/sys/kmem.h +++ b/include/os/linux/spl/sys/kmem.h @@ -31,10 +31,10 @@ #include extern int kmem_debugging(void); -extern char *kmem_vasprintf(const char *fmt, va_list ap) - __attribute__((format(printf, 1, 0))); -extern char *kmem_asprintf(const char *fmt, ...) 
- __attribute__((format(printf, 1, 2))); +__attribute__((format(printf, 1, 0))) +extern char *kmem_vasprintf(const char *fmt, va_list ap); +__attribute__((format(printf, 1, 2))) +extern char *kmem_asprintf(const char *fmt, ...); extern char *kmem_strdup(const char *str); extern void kmem_strfree(char *str); @@ -186,10 +186,10 @@ extern unsigned int spl_kmem_alloc_max; #define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz)) #define kmem_cache_reap_active spl_kmem_cache_reap_active -extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line) - __attribute__((alloc_size(1))); -extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line) - __attribute__((alloc_size(1))); +__attribute__((malloc, alloc_size(1))) +extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line); +__attribute__((malloc, alloc_size(1))) +extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line); extern void spl_kmem_free(const void *ptr, size_t sz); /* diff --git a/include/os/linux/spl/sys/vmem.h b/include/os/linux/spl/sys/vmem.h index e77af2a7a48c..92585a17e263 100644 --- a/include/os/linux/spl/sys/vmem.h +++ b/include/os/linux/spl/sys/vmem.h @@ -91,8 +91,10 @@ typedef struct vmem { } vmem_t; #define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__) #define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz)) -extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line); -extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line) + __attribute__((malloc, alloc_size(1))); +extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line) + __attribute__((malloc, alloc_size(1))); extern void spl_vmem_free(const void *ptr, size_t sz); int spl_vmem_init(void); diff --git a/include/sys/abd.h b/include/sys/abd.h index 82c51cb05cbc..750f9986c1da 100644 --- a/include/sys/abd.h +++ b/include/sys/abd.h @@ -86,10 +86,15 @@ extern int zfs_abd_scatter_enabled; * Allocations and deallocations */ +__attribute__((malloc)) abd_t *abd_alloc(size_t, boolean_t); +__attribute__((malloc)) abd_t *abd_alloc_linear(size_t, boolean_t); +__attribute__((malloc)) abd_t *abd_alloc_gang(void); +__attribute__((malloc)) abd_t *abd_alloc_for_io(size_t, boolean_t); +__attribute__((malloc)) abd_t *abd_alloc_sametype(abd_t *, size_t); boolean_t abd_size_alloc_linear(size_t); void abd_gang_add(abd_t *, abd_t *, boolean_t); diff --git a/lib/libspl/include/umem.h b/lib/libspl/include/umem.h index 77c216721253..9039212baf14 100644 --- a/lib/libspl/include/umem.h +++ b/lib/libspl/include/umem.h @@ -83,7 +83,7 @@ const char *_umem_debug_init(void); const char *_umem_options_init(void); const char *_umem_logging_init(void); -__attribute__((alloc_size(1))) +__attribute__((malloc, alloc_size(1))) static inline void * umem_alloc(size_t size, int flags) { @@ -96,7 +96,7 @@ umem_alloc(size_t size, int flags) return (ptr); } -__attribute__((alloc_size(1))) +__attribute__((malloc, alloc_size(1))) static inline void * umem_alloc_aligned(size_t size, size_t align, int flags) { @@ -118,7 +118,7 @@ umem_alloc_aligned(size_t size, size_t align, int flags) return (ptr); } -__attribute__((alloc_size(1))) +__attribute__((malloc, alloc_size(1))) static inline void * umem_zalloc(size_t size, int flags) { @@ -188,6 +188,7 @@ umem_cache_destroy(umem_cache_t *cp) umem_free(cp, sizeof (umem_cache_t)); } +__attribute__((malloc)) static inline void * umem_cache_alloc(umem_cache_t *cp, int flags) { 
From e085e98d541ad96bfe16be98a235bbe4e00b2e08 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 29 May 2023 12:55:35 -0700 Subject: [PATCH 131/180] ZTS: zvol_misc_trim disable blk mq Disable the zvol_misc_fua.ksh and zvol_misc_trim.ksh test cases on impacted kernels. This issue is being actively worked in #14872 and as part of that fix this commit will be reverted. VERIFY(zh->zh_claim_txg == 0) failed PANIC at zil.c:904:zil_create() Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Issue #14872 Closes #14870 --- tests/test-runner/bin/zts-report.py.in | 2 ++ .../tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh | 9 +++++++++ .../tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh | 10 +++++++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 3eeee35878f8..ef1a46dca72a 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -279,6 +279,8 @@ elif sys.platform.startswith('linux'): 'mmp/mmp_inactive_import': ['FAIL', known_reason], 'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', 12621], 'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason], + 'zvol/zvol_misc/zvol_misc_fua': ['SKIP', 14872], + 'zvol/zvol_misc/zvol_misc_trim': ['SKIP', 14872], 'idmap_mount/idmap_mount_001': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_002': ['SKIP', idmap_reason], 'idmap_mount/idmap_mount_003': ['SKIP', idmap_reason], diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh index 9ebd5b149118..619d8d0e8f07 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_fua.ksh @@ -45,6 +45,15 @@ fi if ! is_linux ; then log_unsupported "Only linux supports dd with oflag=dsync for FUA writes" +else + if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then + log_unsupported "Disabled while issue #14872 is being worked" + fi + + # Disabled for the CentOS 9 kernel + if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then + log_unsupported "Disabled while issue #14872 is being worked" + fi fi typeset datafile1="$(mktemp zvol_misc_fua1.XXXXXX)" diff --git a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh index 46cac3ecb6c2..c0b191aafd45 100755 --- a/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh +++ b/tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_trim.ksh @@ -44,6 +44,15 @@ verify_runnable "global" if is_linux ; then + if [[ $(linux_version) -gt $(linux_version "6.2") ]]; then + log_unsupported "Disabled while issue #14872 is being worked" + fi + + # Disabled for the CentOS 9 kernel + if [[ $(linux_version) -eq $(linux_version "5.14") ]]; then + log_unsupported "Disabled while issue #14872 is being worked" + fi + # We need '--force' here since the prior tests may leave a filesystem # on the zvol, and blkdiscard will see that filesystem and print a # warning unless you force it. 
@@ -123,7 +132,6 @@ log_must zfs set compression=off $TESTPOOL/$TESTVOL # Remove old data from previous tests log_must $trimcmd $zvolpath - set_blk_mq 1 log_must_busy zpool export $TESTPOOL log_must zpool import $TESTPOOL From 928c81f4dfa994aad9a9406dee695ed954d77371 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lu=C3=ADs=20Henriques?= <73643340+lumigch@users.noreply.github.com> Date: Tue, 30 May 2023 23:15:24 +0100 Subject: [PATCH 132/180] Fix NULL pointer dereference when doing concurrent 'send' operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A NULL pointer will occur when doing a 'zfs send -S' on a dataset that is still being received. The problem is that the new 'send' will rightfully fail to own the datasets (i.e. dsl_dataset_own_force() will fail), but then dmu_send() will still do the dsl_dataset_disown(). Reviewed-by: Brian Behlendorf Signed-off-by: Luís Henriques Closes #14903 Closes #14890 --- module/zfs/dmu_send.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 5b7f5543ad09..b3ebdec6b45c 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -2793,6 +2793,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, } if (err == 0) { + owned = B_TRUE; err = zap_lookup(dspp.dp->dp_meta_objset, dspp.to_ds->ds_object, DS_FIELD_RESUME_TOGUID, 8, 1, @@ -2806,21 +2807,24 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, sizeof (dspp.saved_toname), dspp.saved_toname); } - if (err != 0) + /* Only disown if there was an error in the lookups */ + if (owned && (err != 0)) dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); kmem_strfree(name); } else { err = dsl_dataset_own(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); + if (err == 0) + owned = B_TRUE; } - owned = B_TRUE; } else { err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG, &dspp.to_ds); } if (err != 0) { + /* Note: dsl dataset is not owned at this point */ dsl_pool_rele(dspp.dp, FTAG); return (err); } From 2810dda80b8e1d629236b82c5bee6a4ef717e02e Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Wed, 31 May 2023 19:58:41 -0400 Subject: [PATCH 133/180] Revert "initramfs: use `mount.zfs` instead of `mount`" This broke mounting of snapshots on / for users. See https://github.com/openzfs/zfs/issues/9461#issuecomment-1376162949 for more context. Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #14908 --- contrib/initramfs/scripts/zfs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs index 7f977a30f75b..0a2bd2efda7a 100644 --- a/contrib/initramfs/scripts/zfs +++ b/contrib/initramfs/scripts/zfs @@ -344,7 +344,7 @@ mount_fs() # Need the _original_ datasets mountpoint! mountpoint=$(get_fs_value "$fs" mountpoint) - ZFS_CMD="mount.zfs -o zfsutil" + ZFS_CMD="mount -o zfsutil -t zfs" if [ "$mountpoint" = "legacy" ] || [ "$mountpoint" = "none" ]; then # Can't use the mountpoint property. Might be one of our # clones. Check the 'org.zol:mountpoint' property set in @@ -361,7 +361,7 @@ mount_fs() fi # Don't use mount.zfs -o zfsutils for legacy mountpoint if [ "$mountpoint" = "legacy" ]; then - ZFS_CMD="mount.zfs" + ZFS_CMD="mount -t zfs" fi # Last hail-mary: Hope 'rootmnt' is set! mountpoint="" @@ -944,7 +944,7 @@ mountroot() echo " not specified on the kernel command line." 
echo "" echo "Manually mount the root filesystem on $rootmnt and then exit." - echo "Hint: Try: mount.zfs -o zfsutil ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt" + echo "Hint: Try: mount -o zfsutil -t zfs ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt" shell fi From c47b708647d10e6391101492dbd0f63a386ccd10 Mon Sep 17 00:00:00 2001 From: Val Packett Date: Fri, 5 May 2023 20:00:48 -0300 Subject: [PATCH 134/180] PAM: do not fail to mount if the key's already loaded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we're expecting a working home directory on login, it would be rather frustrating to not have it mounted just because it e.g. failed to unmount once on logout. Reviewed-by: Brian Behlendorf Reviewed-by: Felix Dörre Signed-off-by: Val Packett Closes #14834 --- contrib/pam_zfs_key/pam_zfs_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index 979546ab3090..9d9076e1aa0d 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -386,7 +386,7 @@ decrypt_mount(pam_handle_t *pamh, const char *ds_name, int ret = lzc_load_key(ds_name, noop, (uint8_t *)key->value, WRAPPING_KEY_LEN); pw_free(key); - if (ret) { + if (ret && ret != EEXIST) { pam_syslog(pamh, LOG_ERR, "load_key failed: %d", ret); zfs_close(ds); return (-1); From bd4962b5ac42940a0c674b03ae9f47e36b13c908 Mon Sep 17 00:00:00 2001 From: Val Packett Date: Fri, 5 May 2023 21:56:39 -0300 Subject: [PATCH 135/180] PAM: use boolean_t for config flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we already use boolean_t in the file, we can use it here. Reviewed-by: Brian Behlendorf Reviewed-by: Felix Dörre Signed-off-by: Val Packett Closes #14834 --- contrib/pam_zfs_key/pam_zfs_key.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index 9d9076e1aa0d..b3086e038e5b 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -437,7 +437,7 @@ typedef struct { char *dsname; uid_t uid; const char *username; - int unmount_and_unload; + boolean_t unmount_and_unload; } zfs_key_config_t; static int @@ -471,7 +471,7 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, } config->uid = entry->pw_uid; config->username = name; - config->unmount_and_unload = 1; + config->unmount_and_unload = B_TRUE; config->dsname = NULL; config->homedir = NULL; for (int c = 0; c < argc; c++) { @@ -482,7 +482,7 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, free(config->runstatedir); config->runstatedir = strdup(argv[c] + 12); } else if (strcmp(argv[c], "nounmount") == 0) { - config->unmount_and_unload = 0; + config->unmount_and_unload = B_FALSE; } else if (strcmp(argv[c], "prop_mountpoint") == 0) { if (config->homedir == NULL) config->homedir = strdup(entry->pw_dir); From 850bccd3bc163a602700c4a4b15c8d52c0b6231c Mon Sep 17 00:00:00 2001 From: Val Packett Date: Fri, 5 May 2023 19:35:57 -0300 Subject: [PATCH 136/180] PAM: add 'recursive_homes' flag to use with 'prop_mountpoint' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It's not always desirable to have a fixed flat homes directory. 
With the 'recursive_homes' flag, 'prop_mountpoint' search would traverse the whole tree starting at 'homes' (which can now be '*' to mean all pools) to find a dataset with a mountpoint matching the home directory. Reviewed-by: Brian Behlendorf Reviewed-by: Felix Dörre Signed-off-by: Val Packett Closes #14834 --- contrib/pam_zfs_key/pam_zfs_key.c | 36 +++++++--- tests/runfiles/linux.run | 2 +- .../tests/functional/pam/cleanup.ksh | 1 + .../tests/functional/pam/pam_recursive.ksh | 72 +++++++++++++++++++ 4 files changed, 99 insertions(+), 12 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/pam/pam_recursive.ksh diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index b3086e038e5b..259ac7a8f191 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -438,6 +438,7 @@ typedef struct { uid_t uid; const char *username; boolean_t unmount_and_unload; + boolean_t recursive_homes; } zfs_key_config_t; static int @@ -472,6 +473,7 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, config->uid = entry->pw_uid; config->username = name; config->unmount_and_unload = B_TRUE; + config->recursive_homes = B_FALSE; config->dsname = NULL; config->homedir = NULL; for (int c = 0; c < argc; c++) { @@ -483,6 +485,8 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, config->runstatedir = strdup(argv[c] + 12); } else if (strcmp(argv[c], "nounmount") == 0) { config->unmount_and_unload = B_FALSE; + } else if (strcmp(argv[c], "recursive_homes") == 0) { + config->recursive_homes = B_TRUE; } else if (strcmp(argv[c], "prop_mountpoint") == 0) { if (config->homedir == NULL) config->homedir = strdup(entry->pw_dir); @@ -517,8 +521,12 @@ find_dsname_by_prop_value(zfs_handle_t *zhp, void *data) (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, sizeof (mountpoint), NULL, NULL, 0, B_FALSE); if (strcmp(target->homedir, mountpoint) != 0) { + if (target->recursive_homes) { + (void) zfs_iter_filesystems_v2(zhp, 0, + find_dsname_by_prop_value, target); + } zfs_close(zhp); - return (0); + return (target->dsname != NULL); } target->dsname = strdup(zfs_get_name(zhp)); @@ -531,17 +539,23 @@ zfs_key_config_get_dataset(zfs_key_config_t *config) { if (config->homedir != NULL && config->homes_prefix != NULL) { - zfs_handle_t *zhp = zfs_open(g_zfs, config->homes_prefix, - ZFS_TYPE_FILESYSTEM); - if (zhp == NULL) { - pam_syslog(NULL, LOG_ERR, "dataset %s not found", - config->homes_prefix); - return (NULL); - } + if (strcmp(config->homes_prefix, "*") == 0) { + (void) zfs_iter_root(g_zfs, + find_dsname_by_prop_value, config); + } else { + zfs_handle_t *zhp = zfs_open(g_zfs, + config->homes_prefix, ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) { + pam_syslog(NULL, LOG_ERR, + "dataset %s not found", + config->homes_prefix); + return (NULL); + } - (void) zfs_iter_filesystems_v2(zhp, 0, - find_dsname_by_prop_value, config); - zfs_close(zhp); + (void) zfs_iter_filesystems_v2(zhp, 0, + find_dsname_by_prop_value, config); + zfs_close(zhp); + } char *dsname = config->dsname; config->dsname = NULL; return (dsname); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 4df770d61f07..97fc250a7cbf 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -140,7 +140,7 @@ tests = ['umount_unlinked_drain'] tags = ['functional', 'mount'] [tests/functional/pam:Linux] -tests = ['pam_basic', 'pam_nounmount', 'pam_short_password'] +tests = ['pam_basic', 'pam_nounmount', 'pam_recursive', 'pam_short_password'] tags = 
['functional', 'pam'] [tests/functional/procfs:Linux] diff --git a/tests/zfs-tests/tests/functional/pam/cleanup.ksh b/tests/zfs-tests/tests/functional/pam/cleanup.ksh index 971c7fce64e5..dbcb175ed069 100755 --- a/tests/zfs-tests/tests/functional/pam/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/pam/cleanup.ksh @@ -25,5 +25,6 @@ rmconfig destroy_pool $TESTPOOL del_user ${username} +del_user ${username}rec del_group pamtestgroup log_must rm -rf "$runstatedir" $TESTDIRS diff --git a/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh b/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh new file mode 100755 index 000000000000..3714b179b852 --- /dev/null +++ b/tests/zfs-tests/tests/functional/pam/pam_recursive.ksh @@ -0,0 +1,72 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/tests/functional/pam/utilities.kshlib + +if [ -n "$ASAN_OPTIONS" ]; then + export LD_PRELOAD=$(ldd "$(command -v zfs)" | awk '/libasan\.so/ {print $3}') +fi + +username="${username}rec" + +# Set up a deeper hierarchy, a mountpoint that doesn't interfere with other tests, +# and a user which references that mountpoint +log_must zfs create "$TESTPOOL/pampam" +log_must zfs create -o mountpoint="$TESTDIR/rec" "$TESTPOOL/pampam/pam" +echo "recurpass" | zfs create -o encryption=aes-256-gcm -o keyformat=passphrase \ + -o keylocation=prompt "$TESTPOOL/pampam/pam/${username}" +log_must zfs unmount "$TESTPOOL/pampam/pam/${username}" +log_must zfs unload-key "$TESTPOOL/pampam/pam/${username}" +log_must add_user pamtestgroup ${username} "$TESTDIR/rec" + +function keystatus { + log_must [ "$(get_prop keystatus "$TESTPOOL/pampam/pam/${username}")" = "$1" ] +} + +log_mustnot ismounted "$TESTPOOL/pampam/pam/${username}" +keystatus unavailable + +function test_session { + echo "recurpass" | pamtester ${pamservice} ${username} open_session + references 1 + log_must ismounted "$TESTPOOL/pampam/pam/${username}" + keystatus available + + log_must pamtester ${pamservice} ${username} close_session + references 0 + log_mustnot ismounted "$TESTPOOL/pampam/pam/${username}" + keystatus unavailable +} + +genconfig "homes=$TESTPOOL/pampam/pam prop_mountpoint runstatedir=${runstatedir}" +test_session + +genconfig "homes=$TESTPOOL/pampam recursive_homes prop_mountpoint runstatedir=${runstatedir}" +test_session + +genconfig "homes=$TESTPOOL recursive_homes prop_mountpoint runstatedir=${runstatedir}" +test_session + +genconfig "homes=* recursive_homes prop_mountpoint runstatedir=${runstatedir}" +test_session + +log_pass "done." 
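For illustration (not part of the patch above), the new recursive_homes flag might
be combined with prop_mountpoint in a PAM service file roughly as follows; the
exact file (e.g. /etc/pam.d/login) and stacking are distribution-specific:

    auth       optional    pam_zfs_key.so homes=* recursive_homes prop_mountpoint
    session    optional    pam_zfs_key.so homes=* recursive_homes prop_mountpoint

Here homes=* asks the module to search every imported pool for a dataset whose
mountpoint property matches the user's home directory.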
From f2f3ec17edb5015c068c737f328654ae2c36a790 Mon Sep 17 00:00:00 2001 From: Val Packett Date: Fri, 5 May 2023 22:02:13 -0300 Subject: [PATCH 137/180] PAM: add 'forceunmount' flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Probably not always a good idea, but it's nice to have the option. It is a workaround for FreeBSD calling the PAM session end earier than the last process is actually done touching the mount, for example. Reviewed-by: Brian Behlendorf Reviewed-by: Felix Dörre Signed-off-by: Val Packett Closes #14834 --- contrib/pam_zfs_key/pam_zfs_key.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index 259ac7a8f191..c6abb34619d6 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -406,14 +406,14 @@ decrypt_mount(pam_handle_t *pamh, const char *ds_name, } static int -unmount_unload(pam_handle_t *pamh, const char *ds_name) +unmount_unload(pam_handle_t *pamh, const char *ds_name, boolean_t force) { zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); if (ds == NULL) { pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); return (-1); } - int ret = zfs_unmount(ds, NULL, 0); + int ret = zfs_unmount(ds, NULL, force ? MS_FORCE : 0); if (ret) { pam_syslog(pamh, LOG_ERR, "zfs_unmount failed with: %d", ret); zfs_close(ds); @@ -438,6 +438,7 @@ typedef struct { uid_t uid; const char *username; boolean_t unmount_and_unload; + boolean_t force_unmount; boolean_t recursive_homes; } zfs_key_config_t; @@ -473,6 +474,7 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, config->uid = entry->pw_uid; config->username = name; config->unmount_and_unload = B_TRUE; + config->force_unmount = B_FALSE; config->recursive_homes = B_FALSE; config->dsname = NULL; config->homedir = NULL; @@ -485,6 +487,8 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, config->runstatedir = strdup(argv[c] + 12); } else if (strcmp(argv[c], "nounmount") == 0) { config->unmount_and_unload = B_FALSE; + } else if (strcmp(argv[c], "forceunmount") == 0) { + config->force_unmount = B_TRUE; } else if (strcmp(argv[c], "recursive_homes") == 0) { config->recursive_homes = B_TRUE; } else if (strcmp(argv[c], "prop_mountpoint") == 0) { @@ -882,7 +886,7 @@ pam_sm_close_session(pam_handle_t *pamh, int flags, zfs_key_config_free(&config); return (PAM_SESSION_ERR); } - if (unmount_unload(pamh, dataset) == -1) { + if (unmount_unload(pamh, dataset, config.force_unmount) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); From e3ba6b93de32bc76e9e616472af1c7aafea585a5 Mon Sep 17 00:00:00 2001 From: Val Packett Date: Fri, 5 May 2023 22:34:58 -0300 Subject: [PATCH 138/180] PAM: add 'uid_min' and 'uid_max' options for changing the uid range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of a fixed >=1000 check, allow the configuration to override the minimum UID and add a maximum one as well. While here, add the uid range check to the authenticate method as well, and fix the return in the chauthtok method (seems very wrong to report success when we've done absolutely nothing). 
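For illustration (not part of this change), the bounds might be set in a service
file like so; the dataset name and UID range are placeholders:

    password   optional    pam_zfs_key.so homes=rpool/home uid_min=1000 uid_max=59999
    session    optional    pam_zfs_key.so homes=rpool/home uid_min=1000 uid_max=59999

Users outside the configured range are skipped by the session hooks and rejected
by the authenticate and chauthtok entry points, as described above.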
Reviewed-by: Brian Behlendorf Reviewed-by: Felix Dörre Signed-off-by: Val Packett Closes #14834 --- contrib/pam_zfs_key/pam_zfs_key.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index c6abb34619d6..6b7a41fa1739 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -435,6 +435,8 @@ typedef struct { char *runstatedir; char *homedir; char *dsname; + uid_t uid_min; + uid_t uid_max; uid_t uid; const char *username; boolean_t unmount_and_unload; @@ -471,6 +473,8 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, free(config->homes_prefix); return (PAM_USER_UNKNOWN); } + config->uid_min = 1000; + config->uid_max = MAXUID; config->uid = entry->pw_uid; config->username = name; config->unmount_and_unload = B_TRUE; @@ -485,6 +489,10 @@ zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, } else if (strncmp(argv[c], "runstatedir=", 12) == 0) { free(config->runstatedir); config->runstatedir = strdup(argv[c] + 12); + } else if (strncmp(argv[c], "uid_min=", 8) == 0) { + sscanf(argv[c] + 8, "%u", &config->uid_min); + } else if (strncmp(argv[c], "uid_max=", 8) == 0) { + sscanf(argv[c] + 8, "%u", &config->uid_max); } else if (strcmp(argv[c], "nounmount") == 0) { config->unmount_and_unload = B_FALSE; } else if (strcmp(argv[c], "forceunmount") == 0) { @@ -673,6 +681,10 @@ pam_sm_authenticate(pam_handle_t *pamh, int flags, if (config_err != PAM_SUCCESS) { return (config_err); } + if (config.uid < config.uid_min || config.uid > config.uid_max) { + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } const pw_password_t *token = pw_fetch_lazy(pamh); if (token == NULL) { @@ -724,9 +736,9 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags, if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) { return (PAM_SERVICE_ERR); } - if (config.uid < 1000) { + if (config.uid < config.uid_min || config.uid > config.uid_max) { zfs_key_config_free(&config); - return (PAM_SUCCESS); + return (PAM_SERVICE_ERR); } { if (pam_zfs_init(pamh) != 0) { @@ -806,7 +818,7 @@ pam_sm_open_session(pam_handle_t *pamh, int flags, return (PAM_SESSION_ERR); } - if (config.uid < 1000) { + if (config.uid < config.uid_min || config.uid > config.uid_max) { zfs_key_config_free(&config); return (PAM_SUCCESS); } @@ -864,7 +876,7 @@ pam_sm_close_session(pam_handle_t *pamh, int flags, if (zfs_key_config_load(pamh, &config, argc, argv) != PAM_SUCCESS) { return (PAM_SESSION_ERR); } - if (config.uid < 1000) { + if (config.uid < config.uid_min || config.uid > config.uid_max) { zfs_key_config_free(&config); return (PAM_SUCCESS); } From db994458bbee99968827d88afd823d36ef82af28 Mon Sep 17 00:00:00 2001 From: Val Packett Date: Fri, 5 May 2023 22:17:12 -0300 Subject: [PATCH 139/180] PAM: support password changes even when not mounted MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's usually no requirement that a user be logged in for changing their password, so let's not be surprising here. We need to use the fetch_lazy mechanism for the old password to avoid a double prompt for it, so that mechanism is now generalized a bit. 
Reviewed-by: Brian Behlendorf Reviewed-by: Felix Dörre Signed-off-by: Val Packett Closes #14834 --- contrib/pam_zfs_key/pam_zfs_key.c | 70 ++++++++++++------- tests/runfiles/linux.run | 3 +- .../functional/pam/pam_change_unmounted.ksh | 55 +++++++++++++++ .../functional/pam/pam_short_password.ksh | 2 +- 4 files changed, 102 insertions(+), 28 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c index 6b7a41fa1739..08a8640669b3 100644 --- a/contrib/pam_zfs_key/pam_zfs_key.c +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -67,6 +67,7 @@ pam_syslog(pam_handle_t *pamh, int loglevel, const char *fmt, ...) #include static const char PASSWORD_VAR_NAME[] = "pam_zfs_key_authtok"; +static const char OLD_PASSWORD_VAR_NAME[] = "pam_zfs_key_oldauthtok"; static libzfs_handle_t *g_zfs; @@ -160,10 +161,10 @@ pw_free(pw_password_t *pw) } static pw_password_t * -pw_fetch(pam_handle_t *pamh) +pw_fetch(pam_handle_t *pamh, int tok) { const char *token; - if (pam_get_authtok(pamh, PAM_AUTHTOK, &token, NULL) != PAM_SUCCESS) { + if (pam_get_authtok(pamh, tok, &token, NULL) != PAM_SUCCESS) { pam_syslog(pamh, LOG_ERR, "couldn't get password from PAM stack"); return (NULL); @@ -177,13 +178,13 @@ pw_fetch(pam_handle_t *pamh) } static const pw_password_t * -pw_fetch_lazy(pam_handle_t *pamh) +pw_fetch_lazy(pam_handle_t *pamh, int tok, const char *var_name) { - pw_password_t *pw = pw_fetch(pamh); + pw_password_t *pw = pw_fetch(pamh, tok); if (pw == NULL) { return (NULL); } - int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, pw, destroy_pw); + int ret = pam_set_data(pamh, var_name, pw, destroy_pw); if (ret != PAM_SUCCESS) { pw_free(pw); pam_syslog(pamh, LOG_ERR, "pam_set_data failed"); @@ -193,23 +194,23 @@ pw_fetch_lazy(pam_handle_t *pamh) } static const pw_password_t * -pw_get(pam_handle_t *pamh) +pw_get(pam_handle_t *pamh, int tok, const char *var_name) { const pw_password_t *authtok = NULL; - int ret = pam_get_data(pamh, PASSWORD_VAR_NAME, + int ret = pam_get_data(pamh, var_name, (const void**)(&authtok)); if (ret == PAM_SUCCESS) return (authtok); if (ret == PAM_NO_MODULE_DATA) - return (pw_fetch_lazy(pamh)); + return (pw_fetch_lazy(pamh, tok, var_name)); pam_syslog(pamh, LOG_ERR, "password not available"); return (NULL); } static int -pw_clear(pam_handle_t *pamh) +pw_clear(pam_handle_t *pamh, const char *var_name) { - int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, NULL, NULL); + int ret = pam_set_data(pamh, var_name, NULL, NULL); if (ret != PAM_SUCCESS) { pam_syslog(pamh, LOG_ERR, "clearing password failed"); return (-1); @@ -686,7 +687,8 @@ pam_sm_authenticate(pam_handle_t *pamh, int flags, return (PAM_SERVICE_ERR); } - const pw_password_t *token = pw_fetch_lazy(pamh); + const pw_password_t *token = pw_fetch_lazy(pamh, + PAM_AUTHTOK, PASSWORD_VAR_NAME); if (token == NULL) { zfs_key_config_free(&config); return (PAM_AUTH_ERR); @@ -740,6 +742,8 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags, zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } + const pw_password_t *old_token = pw_get(pamh, + PAM_OLDAUTHTOK, OLD_PASSWORD_VAR_NAME); { if (pam_zfs_init(pamh) != 0) { zfs_key_config_free(&config); @@ -751,49 +755,62 @@ pam_sm_chauthtok(pam_handle_t *pamh, int flags, zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - int key_loaded = is_key_loaded(pamh, dataset); - if (key_loaded == -1) { + if (!old_token) { + pam_syslog(pamh, LOG_ERR, + "old password from PAM stack is null"); 
free(dataset); pam_zfs_free(); zfs_key_config_free(&config); return (PAM_SERVICE_ERR); } - free(dataset); - pam_zfs_free(); - if (! key_loaded) { + if (decrypt_mount(pamh, dataset, + old_token->value, B_TRUE) == -1) { pam_syslog(pamh, LOG_ERR, - "key not loaded, returning try_again"); + "old token mismatch"); + free(dataset); + pam_zfs_free(); zfs_key_config_free(&config); return (PAM_PERM_DENIED); } } if ((flags & PAM_UPDATE_AUTHTOK) != 0) { - const pw_password_t *token = pw_get(pamh); + const pw_password_t *token = pw_get(pamh, PAM_AUTHTOK, + PASSWORD_VAR_NAME); if (token == NULL) { + pam_syslog(pamh, LOG_ERR, "new password unavailable"); + pam_zfs_free(); zfs_key_config_free(&config); - return (PAM_SERVICE_ERR); - } - if (pam_zfs_init(pamh) != 0) { - zfs_key_config_free(&config); + pw_clear(pamh, OLD_PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } char *dataset = zfs_key_config_get_dataset(&config); if (!dataset) { pam_zfs_free(); zfs_key_config_free(&config); + pw_clear(pamh, OLD_PASSWORD_VAR_NAME); + pw_clear(pamh, PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } - if (change_key(pamh, dataset, token->value) == -1) { + int was_loaded = is_key_loaded(pamh, dataset); + if (!was_loaded && decrypt_mount(pamh, dataset, + old_token->value, B_FALSE) == -1) { free(dataset); pam_zfs_free(); zfs_key_config_free(&config); + pw_clear(pamh, OLD_PASSWORD_VAR_NAME); + pw_clear(pamh, PASSWORD_VAR_NAME); return (PAM_SERVICE_ERR); } + int changed = change_key(pamh, dataset, token->value); + if (!was_loaded) { + unmount_unload(pamh, dataset, config.force_unmount); + } free(dataset); pam_zfs_free(); zfs_key_config_free(&config); - if (pw_clear(pamh) == -1) { + if (pw_clear(pamh, OLD_PASSWORD_VAR_NAME) == -1 || + pw_clear(pamh, PASSWORD_VAR_NAME) == -1 || changed == -1) { return (PAM_SERVICE_ERR); } } else { @@ -829,7 +846,8 @@ pam_sm_open_session(pam_handle_t *pamh, int flags, return (PAM_SUCCESS); } - const pw_password_t *token = pw_get(pamh); + const pw_password_t *token = pw_get(pamh, + PAM_AUTHTOK, PASSWORD_VAR_NAME); if (token == NULL) { zfs_key_config_free(&config); return (PAM_SESSION_ERR); @@ -853,7 +871,7 @@ pam_sm_open_session(pam_handle_t *pamh, int flags, free(dataset); pam_zfs_free(); zfs_key_config_free(&config); - if (pw_clear(pamh) == -1) { + if (pw_clear(pamh, PASSWORD_VAR_NAME) == -1) { return (PAM_SERVICE_ERR); } return (PAM_SUCCESS); diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 97fc250a7cbf..618eeb934017 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -140,7 +140,8 @@ tests = ['umount_unlinked_drain'] tags = ['functional', 'mount'] [tests/functional/pam:Linux] -tests = ['pam_basic', 'pam_nounmount', 'pam_recursive', 'pam_short_password'] +tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive', + 'pam_short_password'] tags = ['functional', 'pam'] [tests/functional/procfs:Linux] diff --git a/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh b/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh new file mode 100755 index 000000000000..91b202f7609d --- /dev/null +++ b/tests/zfs-tests/tests/functional/pam/pam_change_unmounted.ksh @@ -0,0 +1,55 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/tests/functional/pam/utilities.kshlib + +if [ -n "$ASAN_OPTIONS" ]; then + export LD_PRELOAD=$(ldd "$(command -v zfs)" | awk '/libasan\.so/ {print $3}') +fi + +log_mustnot ismounted "$TESTPOOL/pam/${username}" +keystatus unavailable + +genconfig "homes=$TESTPOOL/pam runstatedir=${runstatedir}" + +printf "testpass\nsecondpass\nsecondpass\n" | pamtester -v ${pamservice} ${username} chauthtok + +log_mustnot ismounted "$TESTPOOL/pam/${username}" +keystatus unavailable + +echo "secondpass" | pamtester ${pamservice} ${username} open_session +references 1 +log_must ismounted "$TESTPOOL/pam/${username}" +keystatus available + +printf "secondpass\ntestpass\ntestpass\n" | pamtester -v ${pamservice} ${username} chauthtok + +log_must ismounted "$TESTPOOL/pam/${username}" +log_must ismounted "$TESTPOOL/pam/${username}" +keystatus available + +log_must pamtester ${pamservice} ${username} close_session +references 0 +log_mustnot ismounted "$TESTPOOL/pam/${username}" +keystatus unavailable + +log_pass "done." diff --git a/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh b/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh index 443e07d7f003..079608583a72 100755 --- a/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh +++ b/tests/zfs-tests/tests/functional/pam/pam_short_password.ksh @@ -52,7 +52,7 @@ log_must ismounted "$TESTPOOL/pam/${username}" keystatus available # Change user and dataset password to short one. -printf "short\nshort\n" | pamtester ${pamservice} ${username} chauthtok +printf "testpass\nshort\nshort\n" | pamtester -v ${pamservice} ${username} chauthtok # Unmount and unload key. log_must pamtester ${pamservice} ${username} close_session From 4f583a827e3b40f3bdfba78db75e1fe1feff122c Mon Sep 17 00:00:00 2001 From: Val Packett Date: Thu, 11 May 2023 18:16:57 -0300 Subject: [PATCH 140/180] PAM: enable testing on FreeBSD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Brian Behlendorf Reviewed-by: Felix Dörre Signed-off-by: Val Packett Closes #14834 --- tests/runfiles/freebsd.run | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/runfiles/freebsd.run b/tests/runfiles/freebsd.run index c7ca1d769fc3..13696d645850 100644 --- a/tests/runfiles/freebsd.run +++ b/tests/runfiles/freebsd.run @@ -25,3 +25,8 @@ tags = ['functional'] [tests/functional/cli_root/zfs_jail:FreeBSD] tests = ['zfs_jail_001_pos'] tags = ['functional', 'cli_root', 'zfs_jail'] + +[tests/functional/pam:FreeBSD] +tests = ['pam_basic', 'pam_change_unmounted', 'pam_nounmount', 'pam_recursive', + 'pam_short_password'] +tags = ['functional', 'pam'] From 482da24e20735ebd67fb582ad8025ba517801503 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 2 Jun 2023 14:01:58 -0400 Subject: [PATCH 141/180] ZIL: Allow to replay blocks of any size. There seems to be no reason for ZIL blocks to be limited by 128KB other than replay code is written in such a way. This change does not increase the limit yet, just removes the artificial limitation. 
Avoided extra memcpy() may save us a second during replay. Reviewed-by: Brian Behlendorf Reviewed-by: Prakash Surya Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14910 --- module/zfs/zil.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 509fd39d3590..8672a61387a5 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -248,11 +248,10 @@ zil_kstats_global_update(kstat_t *ksp, int rw) */ static int zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, - blkptr_t *nbp, void *dst, char **end) + blkptr_t *nbp, char **begin, char **end, arc_buf_t **abuf) { zio_flag_t zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; - arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; @@ -269,7 +268,7 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, - &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; @@ -284,23 +283,23 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, */ cksum.zc_word[ZIL_ZC_SEQ]++; + uint64_t size = BP_GET_LSIZE(bp); if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = abuf->b_data; + zil_chain_t *zilc = (*abuf)->b_data; char *lr = (char *)(zilc + 1); - uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { + sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + zilc->zc_nused < sizeof (*zilc) || + zilc->zc_nused > size) { error = SET_ERROR(ECKSUM); } else { - ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); - memcpy(dst, lr, len); - *end = (char *)dst + len; + *begin = lr; + *end = lr + zilc->zc_nused - sizeof (*zilc); *nbp = zilc->zc_next_blk; } } else { - char *lr = abuf->b_data; - uint64_t size = BP_GET_LSIZE(bp); + char *lr = (*abuf)->b_data; zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (memcmp(&cksum, &zilc->zc_next_blk.blk_cksum, @@ -308,15 +307,11 @@ zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { - ASSERT3U(zilc->zc_nused, <=, - SPA_OLD_MAXBLOCKSIZE); - memcpy(dst, lr, zilc->zc_nused); - *end = (char *)dst + zilc->zc_nused; + *begin = lr; + *end = lr + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } - - arc_buf_destroy(abuf, &abuf); } return (error); @@ -468,7 +463,6 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, uint64_t blk_count = 0; uint64_t lr_count = 0; blkptr_t blk, next_blk = {{{{0}}}}; - char *lrbuf, *lrp; int error = 0; /* @@ -486,13 +480,13 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. 
*/ - lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; - char *end = NULL; + char *lrp, *end; + arc_buf_t *abuf = NULL; if (blk_seq > claim_blk_seq) break; @@ -508,8 +502,10 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, break; error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, - lrbuf, &end); + &lrp, &end, &abuf); if (error != 0) { + if (abuf) + arc_buf_destroy(abuf, &abuf); if (claimed) { char name[ZFS_MAX_DATASET_NAME_LEN]; @@ -522,7 +518,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, break; } - for (lrp = lrbuf; lrp < end; lrp += reclen) { + for (; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); @@ -536,6 +532,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, max_lr_seq = lr->lrc_seq; lr_count++; } + arc_buf_destroy(abuf, &abuf); } done: zilog->zl_parse_error = error; @@ -545,7 +542,6 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zilog->zl_parse_lr_count = lr_count; zil_bp_tree_fini(zilog); - zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); return (error); } From 6c29422e90dca7a7ef6a69c721238a1c33f393f9 Mon Sep 17 00:00:00 2001 From: Graham Perrin Date: Fri, 2 Jun 2023 19:25:13 +0100 Subject: [PATCH 142/180] zfs-create(8): ZFS for swap: caution, clarity Make the section heading more generic (the section relates to ZFS files as well as ZFS volumes). Swapping to a ZFS volume is prone to deadlock. Remove the related instruction, direct readers to OpenZFS FAQ. Related, but not linked from within the manual page: (Using a zvol for a swap device on Linux). Reviewed-by: Brian Behlendorf Signed-off-by: Graham Perrin Issue #7734 Closes #14756 --- man/man8/zfs-create.8 | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/man/man8/zfs-create.8 b/man/man8/zfs-create.8 index a7b6097c37f1..b3997d32767c 100644 --- a/man/man8/zfs-create.8 +++ b/man/man8/zfs-create.8 @@ -234,14 +234,11 @@ if the volume is not sparse. Print verbose information about the created dataset. .El .El -.Ss ZFS Volumes as Swap -ZFS volumes may be used as swap devices. -After creating the volume with the -.Nm zfs Cm create Fl V -enable the swap area using the -.Xr swapon 8 -command. -Swapping to files on ZFS filesystems is not supported. +.Ss ZFS for Swap +Swapping to a ZFS volume is prone to deadlock and not recommended. +See OpenZFS FAQ. +.Pp +Swapping to a file on a ZFS filesystem is not supported. . .Sh EXAMPLES .\" These are, respectively, examples 1, 10 from zfs.8 From dae3c549f59a4650edd07b86707166765c240310 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 5 Jun 2023 11:08:24 -0700 Subject: [PATCH 143/180] Linux 6.3 compat: META (#14930) Update the META file to reflect compatibility with the 6.3 kernel. Signed-off-by: Brian Behlendorf Reviewed-by: Tony Hutter --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 8779e512f7be..e4b476aff112 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.2 +Linux-Maximum: 6.3 Linux-Minimum: 3.10 From 5ba4025a8d94699d2638938a0cdf790113ff0531 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Mon, 5 Jun 2023 14:51:44 -0400 Subject: [PATCH 144/180] Introduce zfs_refcount_(add|remove)_few(). 
There are two places where we need to add/remove several references with semantics of zfs_refcount_(add|remove). But when debug/tracing is disabled, it is a crime to run multiple atomic_inc() in a loop, especially under congested pool-wide allocator lock. Introduced new functions implement the same semantics as the loop, but without overhead in production builds. Reviewed-by: Rich Ercolani Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14934 --- include/sys/zfs_refcount.h | 12 +++++++++--- module/zfs/dmu_zfetch.c | 3 +-- module/zfs/metaslab.c | 6 ++---- module/zfs/refcount.c | 18 ++++++++++++++++++ 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/include/sys/zfs_refcount.h b/include/sys/zfs_refcount.h index 42f846b8920a..4efa266a53c5 100644 --- a/include/sys/zfs_refcount.h +++ b/include/sys/zfs_refcount.h @@ -73,13 +73,15 @@ int64_t zfs_refcount_count(zfs_refcount_t *); int64_t zfs_refcount_add(zfs_refcount_t *, const void *); int64_t zfs_refcount_remove(zfs_refcount_t *, const void *); /* - * Note that (add|remove)_many add/remove one reference with "number" N, - * _not_ make N references with "number" 1, which is what vanilla - * zfs_refcount_(add|remove) would do if called N times. + * Note that (add|remove)_many adds/removes one reference with "number" N, + * _not_ N references with "number" 1, which is what (add|remove)_few does, + * or what vanilla zfs_refcount_(add|remove) called N times would do. * * Attempting to remove a reference with number N when none exists is a * panic on debug kernels with reference_tracking enabled. */ +void zfs_refcount_add_few(zfs_refcount_t *, uint64_t, const void *); +void zfs_refcount_remove_few(zfs_refcount_t *, uint64_t, const void *); int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, const void *); int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, const void *); void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *); @@ -108,6 +110,10 @@ typedef struct refcount { #define zfs_refcount_count(rc) atomic_load_64(&(rc)->rc_count) #define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count) #define zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count) +#define zfs_refcount_add_few(rc, number, holder) \ + atomic_add_64(&(rc)->rc_count, number) +#define zfs_refcount_remove_few(rc, number, holder) \ + atomic_add_64(&(rc)->rc_count, -number) #define zfs_refcount_add_many(rc, number, holder) \ atomic_add_64_nv(&(rc)->rc_count, number) #define zfs_refcount_remove_many(rc, number, holder) \ diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c index ffc012e6c217..b70459380c24 100644 --- a/module/zfs/dmu_zfetch.c +++ b/module/zfs/dmu_zfetch.c @@ -520,8 +520,7 @@ dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) issued = pf_end - pf_start + ipf_end - ipf_start; if (issued > 1) { /* More references on top of taken in dmu_zfetch_prepare(). */ - for (int i = 0; i < issued - 1; i++) - zfs_refcount_add(&zs->zs_refs, NULL); + zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL); } else if (issued == 0) { /* Some other thread has done our work, so drop the ref. 
*/ if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 94b131fcdb79..176247d63b76 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -5650,8 +5650,7 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, * We reserve the slots individually so that we can unreserve * them individually when an I/O completes. */ - for (int d = 0; d < slots; d++) - zfs_refcount_add(&mca->mca_alloc_slots, zio); + zfs_refcount_add_few(&mca->mca_alloc_slots, slots, zio); zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; return (B_TRUE); } @@ -5665,8 +5664,7 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; ASSERT(mc->mc_alloc_throttle_enabled); - for (int d = 0; d < slots; d++) - zfs_refcount_remove(&mca->mca_alloc_slots, zio); + zfs_refcount_remove_few(&mca->mca_alloc_slots, slots, zio); } static int diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index 62ec03e1035a..c9a504f67451 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -151,6 +151,15 @@ zfs_refcount_add(zfs_refcount_t *rc, const void *holder) return (zfs_refcount_add_many(rc, 1, holder)); } +void +zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder) +{ + if (!rc->rc_tracked) + (void) zfs_refcount_add_many(rc, number, holder); + else for (; number > 0; number--) + (void) zfs_refcount_add(rc, holder); +} + int64_t zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, const void *holder) @@ -204,6 +213,15 @@ zfs_refcount_remove(zfs_refcount_t *rc, const void *holder) return (zfs_refcount_remove_many(rc, 1, holder)); } +void +zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder) +{ + if (!rc->rc_tracked) + (void) zfs_refcount_remove_many(rc, number, holder); + else for (; number > 0; number--) + (void) zfs_refcount_remove(rc, holder); +} + void zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) { From 2b9f8ba6736419d38292bee218f2756997a02c8c Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 4 Jun 2023 11:14:20 +1000 Subject: [PATCH 145/180] znode: expose zfs_get_zplprop to libzpool There's no particular reason this function should be kernel-only, and I want to use it (indirectly) from zdb. I've moved it to zfs_znode.c because libzpool does not compile in zfs_vfsops.c, and this at least matches the header its imported from. Sponsored-By: Klara, Inc. 
Reviewed-by: Tino Reichardt Reviewed-by: WHR Signed-off-by: Rob Norris Closes #14642 --- include/sys/zfs_znode.h | 2 +- module/os/freebsd/zfs/zfs_vfsops.c | 86 ----------------------------- module/os/freebsd/zfs/zfs_znode.c | 87 ++++++++++++++++++++++++++++++ module/os/linux/zfs/zfs_vfsops.c | 85 ----------------------------- module/os/linux/zfs/zfs_znode.c | 85 +++++++++++++++++++++++++++++ 5 files changed, 173 insertions(+), 172 deletions(-) diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h index 012e7403e2a6..2f266f53247e 100644 --- a/include/sys/zfs_znode.h +++ b/include/sys/zfs_znode.h @@ -158,6 +158,7 @@ extern "C" { #define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); +extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); #ifdef _KERNEL #include @@ -280,7 +281,6 @@ extern void zfs_znode_delete(znode_t *, dmu_tx_t *); extern void zfs_remove_op_tables(void); extern int zfs_create_op_tables(void); extern dev_t zfs_cmpldev(uint64_t); -extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); extern int zfs_get_stats(objset_t *os, nvlist_t *nv); extern boolean_t zfs_get_vfs_flag_unmounted(objset_t *os); extern void zfs_znode_dmu_fini(znode_t *); diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index 30851f5273a2..33759fa26169 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -2216,92 +2216,6 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) return (0); } -/* - * Read a property stored within the master node. - */ -int -zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) -{ - uint64_t *cached_copy = NULL; - - /* - * Figure out where in the objset_t the cached copy would live, if it - * is available for the requested property. - */ - if (os != NULL) { - switch (prop) { - case ZFS_PROP_VERSION: - cached_copy = &os->os_version; - break; - case ZFS_PROP_NORMALIZE: - cached_copy = &os->os_normalization; - break; - case ZFS_PROP_UTF8ONLY: - cached_copy = &os->os_utf8only; - break; - case ZFS_PROP_CASE: - cached_copy = &os->os_casesensitivity; - break; - default: - break; - } - } - if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { - *value = *cached_copy; - return (0); - } - - /* - * If the property wasn't cached, look up the file system's value for - * the property. For the version property, we look up a slightly - * different string. - */ - const char *pname; - int error = ENOENT; - if (prop == ZFS_PROP_VERSION) { - pname = ZPL_VERSION_STR; - } else { - pname = zfs_prop_to_name(prop); - } - - if (os != NULL) { - ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); - error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); - } - - if (error == ENOENT) { - /* No value set, use the default value */ - switch (prop) { - case ZFS_PROP_VERSION: - *value = ZPL_VERSION; - break; - case ZFS_PROP_NORMALIZE: - case ZFS_PROP_UTF8ONLY: - *value = 0; - break; - case ZFS_PROP_CASE: - *value = ZFS_CASE_SENSITIVE; - break; - case ZFS_PROP_ACLTYPE: - *value = ZFS_ACLTYPE_NFSV4; - break; - default: - return (error); - } - error = 0; - } - - /* - * If one of the methods for getting the property value above worked, - * copy it into the objset_t's cache. - */ - if (error == 0 && cached_copy != NULL) { - *cached_copy = *value; - } - - return (error); -} - /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. 
diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c index d26d89544e7c..c4f2b722ef4e 100644 --- a/module/os/freebsd/zfs/zfs_znode.c +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -2069,6 +2069,93 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, return (error); } +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { + pname = ZPL_VERSION_STR; + } else { + pname = zfs_prop_to_name(prop); + } + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLTYPE: + *value = ZFS_ACLTYPE_NFSV4; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + + void zfs_znode_update_vfs(znode_t *zp) diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 48945b8af8c1..6b6293b9e482 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -2052,91 +2052,6 @@ zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) return (0); } -/* - * Read a property stored within the master node. - */ -int -zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) -{ - uint64_t *cached_copy = NULL; - - /* - * Figure out where in the objset_t the cached copy would live, if it - * is available for the requested property. - */ - if (os != NULL) { - switch (prop) { - case ZFS_PROP_VERSION: - cached_copy = &os->os_version; - break; - case ZFS_PROP_NORMALIZE: - cached_copy = &os->os_normalization; - break; - case ZFS_PROP_UTF8ONLY: - cached_copy = &os->os_utf8only; - break; - case ZFS_PROP_CASE: - cached_copy = &os->os_casesensitivity; - break; - default: - break; - } - } - if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { - *value = *cached_copy; - return (0); - } - - /* - * If the property wasn't cached, look up the file system's value for - * the property. For the version property, we look up a slightly - * different string. 
- */ - const char *pname; - int error = ENOENT; - if (prop == ZFS_PROP_VERSION) - pname = ZPL_VERSION_STR; - else - pname = zfs_prop_to_name(prop); - - if (os != NULL) { - ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); - error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); - } - - if (error == ENOENT) { - /* No value set, use the default value */ - switch (prop) { - case ZFS_PROP_VERSION: - *value = ZPL_VERSION; - break; - case ZFS_PROP_NORMALIZE: - case ZFS_PROP_UTF8ONLY: - *value = 0; - break; - case ZFS_PROP_CASE: - *value = ZFS_CASE_SENSITIVE; - break; - case ZFS_PROP_ACLTYPE: - *value = ZFS_ACLTYPE_OFF; - break; - default: - return (error); - } - error = 0; - } - - /* - * If one of the methods for getting the property value above worked, - * copy it into the objset_t's cache. - */ - if (error == 0 && cached_copy != NULL) { - *cached_copy = *value; - } - - return (error); -} - /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index c104cd661bf5..02b1af3edc4f 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -2254,6 +2254,91 @@ zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, return (error); } +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) + pname = ZPL_VERSION_STR; + else + pname = zfs_prop_to_name(prop); + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLTYPE: + *value = ZFS_ACLTYPE_OFF; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + #if defined(_KERNEL) EXPORT_SYMBOL(zfs_create_fs); EXPORT_SYMBOL(zfs_obj_to_path); From 8653f1de48ee5b41e2bf421785e1279a08c6b903 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 15 Mar 2023 18:18:10 +1100 Subject: [PATCH 146/180] zdb: add -B option to generate backup stream This is more-or-less like `zfs send`, but specifying the snapshot by its objset id for situations where it can't be referenced any other way. 
Sponsored-By: Klara, Inc. Reviewed-by: Tino Reichardt Reviewed-by: WHR Signed-off-by: Rob Norris Closes #14642 --- cmd/zdb/zdb.c | 97 ++++++++++++++++++- man/man8/zdb.8 | 25 ++++- module/zfs/dmu_send.c | 3 +- tests/runfiles/common.run | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../functional/cli_root/zdb/zdb_backup.ksh | 55 +++++++++++ 6 files changed, 174 insertions(+), 9 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 61f1258f72b9..105d36882291 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -33,6 +33,7 @@ * under sponsorship from the FreeBSD Foundation. * Copyright (c) 2021 Allan Jude * Copyright (c) 2021 Toomas Soome + * Copyright (c) 2023, Klara Inc. */ #include @@ -789,6 +790,9 @@ usage(void) "\t\t[[/] [ ...]]\n" "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ] [-K ]\n" "\t\t[[/] [ ...]\n" + "\t%s -B [-e [-V] [-p ...]] [-I ]\n" + "\t\t[-o =]... [-t ] [-U ] [-x ]\n" + "\t\t[-K ] / []\n" "\t%s [-v] \n" "\t%s -C [-A] [-U ]\n" "\t%s -l [-Aqu] \n" @@ -802,7 +806,7 @@ usage(void) "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " "\n\n", cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, - cmdname, cmdname, cmdname, cmdname); + cmdname, cmdname, cmdname, cmdname, cmdname); (void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n"); @@ -825,6 +829,8 @@ usage(void) (void) fprintf(stderr, " Options to control amount of output:\n"); (void) fprintf(stderr, " -b --block-stats " "block statistics\n"); + (void) fprintf(stderr, " -B --backup " + "backup stream\n"); (void) fprintf(stderr, " -c --checksum " "checksum all metadata (twice for all data) blocks\n"); (void) fprintf(stderr, " -C --config " @@ -4875,6 +4881,81 @@ dump_path(char *ds, char *path, uint64_t *retobj) return (err); } +static int +dump_backup_bytes(objset_t *os, void *buf, int len, void *arg) +{ + const char *p = (const char *)buf; + ssize_t nwritten; + + (void) os; + (void) arg; + + /* Write the data out, handling short writes and signals. 
*/ + while ((nwritten = write(STDOUT_FILENO, p, len)) < len) { + if (nwritten < 0) { + if (errno == EINTR) + continue; + return (errno); + } + p += nwritten; + len -= nwritten; + } + + return (0); +} + +static void +dump_backup(const char *pool, uint64_t objset_id, const char *flagstr) +{ + boolean_t embed = B_FALSE; + boolean_t large_block = B_FALSE; + boolean_t compress = B_FALSE; + boolean_t raw = B_FALSE; + + const char *c; + for (c = flagstr; c != NULL && *c != '\0'; c++) { + switch (*c) { + case 'e': + embed = B_TRUE; + break; + case 'L': + large_block = B_TRUE; + break; + case 'c': + compress = B_TRUE; + break; + case 'w': + raw = B_TRUE; + break; + default: + fprintf(stderr, "dump_backup: invalid flag " + "'%c'\n", *c); + return; + } + } + + if (isatty(STDOUT_FILENO)) { + fprintf(stderr, "dump_backup: stream cannot be written " + "to a terminal\n"); + return; + } + + offset_t off = 0; + dmu_send_outparams_t out = { + .dso_outfunc = dump_backup_bytes, + .dso_dryrun = B_FALSE, + }; + + int err = dmu_send_obj(pool, objset_id, /* fromsnap */0, embed, + large_block, compress, raw, /* saved */ B_FALSE, STDOUT_FILENO, + &off, &out); + if (err != 0) { + fprintf(stderr, "dump_backup: dmu_send_obj: %s\n", + strerror(err)); + return; + } +} + static int zdb_copy_object(objset_t *os, uint64_t srcobj, char *destfile) { @@ -8695,6 +8776,7 @@ main(int argc, char **argv) struct option long_options[] = { {"ignore-assertions", no_argument, NULL, 'A'}, {"block-stats", no_argument, NULL, 'b'}, + {"backup", no_argument, NULL, 'B'}, {"checksum", no_argument, NULL, 'c'}, {"config", no_argument, NULL, 'C'}, {"datasets", no_argument, NULL, 'd'}, @@ -8736,10 +8818,11 @@ main(int argc, char **argv) }; while ((c = getopt_long(argc, argv, - "AbcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ", + "AbBcCdDeEFGhiI:kK:lLmMNo:Op:PqrRsSt:uU:vVx:XYyZ", long_options, NULL)) != -1) { switch (c) { case 'b': + case 'B': case 'c': case 'C': case 'd': @@ -8887,7 +8970,7 @@ main(int argc, char **argv) verbose = MAX(verbose, 1); for (c = 0; c < 256; c++) { - if (dump_all && strchr("AeEFkKlLNOPrRSXy", c) == NULL) + if (dump_all && strchr("ABeEFkKlLNOPrRSXy", c) == NULL) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; @@ -9073,7 +9156,8 @@ main(int argc, char **argv) checkpoint_pool, error); } - } else if (target_is_spa || dump_opt['R'] || objset_id == 0) { + } else if (target_is_spa || dump_opt['R'] || dump_opt['B'] || + objset_id == 0) { zdb_set_skip_mmp(target); error = spa_open_rewind(target, &spa, FTAG, policy, NULL); @@ -9209,7 +9293,10 @@ main(int argc, char **argv) strerror(errno)); } } - if (os != NULL) { + if (dump_opt['B']) { + dump_backup(target, objset_id, + argc > 0 ? argv[0] : NULL); + } else if (os != NULL) { dump_objset(os); } else if (zopt_object_args > 0 && !dump_opt['m']) { dump_objset(spa->spa_meta_objset); diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index 26c67dabd705..031953c543a1 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -14,7 +14,7 @@ .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC. .\" Copyright (c) 2017 Intel Corporation. .\" -.Dd October 7, 2020 +.Dd June 4, 2023 .Dt ZDB 8 .Os . 
@@ -41,6 +41,13 @@ .Ar poolname Ns Op Ar / Ns Ar dataset Ns | Ns Ar objset-ID .Op Ar object Ns | Ns Ar range Ns … .Nm +.Fl B +.Op Fl e Oo Fl V Oc Oo Fl p Ar path Oc Ns … +.Op Fl U Ar cache +.Op Fl K Ar key +.Ar poolname Ns Ar / Ns Ar objset-ID +.Op Ar backup-flags +.Nm .Fl C .Op Fl A .Op Fl U Ar cache @@ -123,6 +130,22 @@ Display options: Display statistics regarding the number, size .Pq logical, physical and allocated and deduplication of blocks. +.It Fl B , -backup +Generate a backup stream, similar to +.Nm zfs Cm send , +but for the numeric objset ID, and without opening the dataset. +This can be useful in recovery scenarios if dataset metadata has become +corrupted but the dataset itself is readable. +The optional +.Ar flags +argument is a string of one or more of the letters +.Sy e , +.Sy L , +.Sy c , +and +.Sy w , +which correspond to the same flags in +.Xr zfs-send 8 . .It Fl c , -checksum Verify the checksum of all metadata blocks while printing block statistics .Po see diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index b3ebdec6b45c..2d37ed2cdfb5 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1955,7 +1955,7 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os, { dsl_dataset_t *to_ds = dspp->to_ds; dsl_pool_t *dp = dspp->dp; -#ifdef _KERNEL + if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) @@ -1964,7 +1964,6 @@ setup_featureflags(struct dmu_send_params *dspp, objset_t *os, if (version >= ZPL_VERSION_SA) *featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } -#endif /* raw sends imply large_block_ok */ if ((dspp->rawok || dspp->large_block_ok) && diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 10525289a3bd..342f56d50d04 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -128,7 +128,7 @@ tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', 'zdb_display_block', 'zdb_encrypted', 'zdb_label_checksum', 'zdb_object_range_neg', 'zdb_object_range_pos', 'zdb_objset_id', - 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2'] + 'zdb_decompress_zstd', 'zdb_recover', 'zdb_recover_2', 'zdb_backup'] pre = post = tags = ['functional', 'cli_root', 'zdb'] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 129893cd61f3..ff65dc1ac2b0 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -572,6 +572,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zdb/zdb_006_pos.ksh \ functional/cli_root/zdb/zdb_args_neg.ksh \ functional/cli_root/zdb/zdb_args_pos.ksh \ + functional/cli_root/zdb/zdb_backup.ksh \ functional/cli_root/zdb/zdb_block_size_histogram.ksh \ functional/cli_root/zdb/zdb_checksum.ksh \ functional/cli_root/zdb/zdb_decompress.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh new file mode 100755 index 000000000000..d98ab86ab667 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_backup.ksh @@ -0,0 +1,55 @@ +#!/bin/ksh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. 
A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2023, Klara Inc. +# + +. $STF_SUITE/include/libtest.shlib + +write_count=8 +blksize=131072 + +tmpfile=$TEST_BASE_DIR/tmpfile + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + rm $tmpfile.1 $tmpfile.2 +} + +log_onexit cleanup + +log_assert "Verify that zfs send and zdb -B produce the same stream" + +verify_runnable "global" +verify_disk_count "$DISKS" 2 + +default_mirror_setup_noexit $DISKS +file_write -o create -w -f $TESTDIR/file -b $blksize -c $write_count + +snap=$TESTPOOL/$TESTFS@snap +log_must zfs snapshot $snap +typeset -i objsetid=$(zfs get -Ho value objsetid $snap) + +sync_pool $TESTPOOL + +log_must eval "zfs send -ecL $snap > $tmpfile.1" +log_must eval "zdb -B $TESTPOOL/$objsetid ecL > $tmpfile.2" + +typeset sum1=$(cat $tmpfile.1 | md5sum) +typeset sum2=$(cat $tmpfile.2 | md5sum) + +log_must test "$sum1" = "$sum2" + +log_pass "zfs send and zdb -B produce the same stream" From bcd5321039c3de29c14eac1068d392c15ad7fe2c Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Tue, 6 Jun 2023 21:32:37 +0200 Subject: [PATCH 147/180] Fix the L2ARC write size calculating logic l2arc_write_size() should return the write size after adjusting for trim and overhead of the L2ARC log blocks. Also take into account the allocated size of log blocks when deciding when to stop writing buffers to L2ARC. Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #14939 --- module/zfs/arc.c | 45 +++++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index a78f664c4fe8..6f68c29fc7f5 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -965,7 +965,7 @@ static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev); /* L2ARC persistence write I/O routines. */ -static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, +static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb); /* L2ARC persistence auxiliary routines. */ @@ -8175,7 +8175,7 @@ l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) static uint64_t l2arc_write_size(l2arc_dev_t *dev) { - uint64_t size, dev_size, tsize; + uint64_t size; /* * Make sure our globals have meaningful values in case the user @@ -8192,35 +8192,40 @@ l2arc_write_size(l2arc_dev_t *dev) if (arc_warm == B_FALSE) size += l2arc_write_boost; - /* - * Make sure the write size does not exceed the size of the cache - * device. This is important in l2arc_evict(), otherwise infinite - * iteration can occur. - */ - dev_size = dev->l2ad_end - dev->l2ad_start; - /* We need to add in the worst case scenario of log block overhead. */ - tsize = size + l2arc_log_blk_overhead(size, dev); + size += l2arc_log_blk_overhead(size, dev); if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { /* * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100) * times the writesize, whichever is greater. */ - tsize += MAX(64 * 1024 * 1024, - (tsize * l2arc_trim_ahead) / 100); + size += MAX(64 * 1024 * 1024, + (size * l2arc_trim_ahead) / 100); } - if (tsize >= dev_size) { + /* + * Make sure the write size does not exceed the size of the cache + * device. This is important in l2arc_evict(), otherwise infinite + * iteration can occur. 
+ */ + if (size >= dev->l2ad_end - dev->l2ad_start) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " "(guid %llu), resetting them to the default (%d)", (u_longlong_t)l2arc_log_blk_overhead(size, dev), (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); + size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; if (arc_warm == B_FALSE) size += l2arc_write_boost; + + size += l2arc_log_blk_overhead(size, dev); + if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { + size += MAX(64 * 1024 * 1024, + (size * l2arc_trim_ahead) / 100); + } } return (size); @@ -9413,8 +9418,14 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) * arcstat_l2_{size,asize} kstats are updated * internally. */ - if (l2arc_log_blk_insert(dev, hdr)) - l2arc_log_blk_commit(dev, pio, cb); + if (l2arc_log_blk_insert(dev, hdr)) { + /* + * l2ad_hand has been accounted for in + * l2arc_log_blk_commit(). + */ + write_asize += + l2arc_log_blk_commit(dev, pio, cb); + } zio_nowait(wzio); } @@ -10564,7 +10575,7 @@ l2arc_dev_hdr_update(l2arc_dev_t *dev) * This function allocates some memory to temporarily hold the serialized * buffer to be written. This is then released in l2arc_write_done. */ -static void +static uint64_t l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) { l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; @@ -10675,6 +10686,8 @@ l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) dev->l2ad_log_ent_idx = 0; dev->l2ad_log_blk_payload_asize = 0; dev->l2ad_log_blk_payload_start = 0; + + return (asize); } /* From 93f8abeff08e9c4363ec4d53d501cf21830c95e1 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 7 Jun 2023 10:43:43 -0700 Subject: [PATCH 148/180] Linux: Never sleep in kmem_cache_alloc(..., KM_NOSLEEP) (#14926) When a kmem cache is exhausted and needs to be expanded a new slab is allocated. KM_SLEEP callers can block and wait for the allocation, but KM_NOSLEEP callers were incorrectly allowed to block as well. Resolve this by attempting an emergency allocation as a best effort. This may fail but that's fine since any KM_NOSLEEP consumer is required to handle an allocation failure. Signed-off-by: Brian Behlendorf Reviewed-by: Adam Moss Reviewed-by: Brian Atkinson Reviewed-by: Richard Yao Reviewed-by: Tony Hutter --- module/os/linux/spl/spl-kmem-cache.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index 745d03012f9d..3c30dfc577b4 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -1015,9 +1015,19 @@ spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) ASSERT0(flags & ~KM_PUBLIC_MASK); ASSERT(skc->skc_magic == SKC_MAGIC); ASSERT((skc->skc_flags & KMC_SLAB) == 0); - might_sleep(); + *obj = NULL; + /* + * Since we can't sleep attempt an emergency allocation to satisfy + * the request. The only alterative is to fail the allocation but + * it's preferable try. The use of KM_NOSLEEP is expected to be rare. + */ + if (flags & KM_NOSLEEP) + return (spl_emergency_alloc(skc, flags, obj)); + + might_sleep(); + /* * Before allocating a new slab wait for any reaping to complete and * then return so the local magazine can be rechecked for new objects. 
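For context, a brief sketch of the contract this change leans on: a KM_NOSLEEP consumer must treat allocation failure as an expected outcome, since the cache will no longer block to expand a slab on its behalf. The cache and function names below are hypothetical and only illustrate the expected caller pattern, assuming the standard SPL kmem_cache_alloc()/KM_NOSLEEP interface:

    /*
     * Hypothetical KM_NOSLEEP consumer: the allocation may fail fast
     * rather than sleep, so the NULL return has to be handled, e.g. by
     * falling back to a slower path or retrying from sleepable context.
     */
    static int
    example_grab_object(kmem_cache_t *cache, void **objp)
    {
            void *obj = kmem_cache_alloc(cache, KM_NOSLEEP);

            if (obj == NULL)
                    return (ENOMEM);        /* best-effort path failed */

            *objp = obj;
            return (0);
    }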
From 6c962690245a6a2a4dfc2350c71a249641139c26 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Wed, 7 Jun 2023 14:14:05 -0400 Subject: [PATCH 149/180] Revert "systemd: Use non-absolute paths in Exec* lines" This reverts commit 79b20949b25c8db4d379f6486b0835a6613b480c since it doesn't work with the systemd version shipped with RHEL7-based systems. Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #14943 Closes #14945 --- etc/systemd/system/zfs-import-cache.service.in | 2 +- etc/systemd/system/zfs-import-scan.service.in | 2 +- etc/systemd/system/zfs-mount.service.in | 2 +- etc/systemd/system/zfs-scrub@.service.in | 10 +++++----- etc/systemd/system/zfs-share.service.in | 2 +- etc/systemd/system/zfs-trim@.service.in | 10 +++++----- etc/systemd/system/zfs-volume-wait.service.in | 2 +- etc/systemd/system/zfs-zed.service.in | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/etc/systemd/system/zfs-import-cache.service.in b/etc/systemd/system/zfs-import-cache.service.in index 6d9a065e7e3a..fd822989da93 100644 --- a/etc/systemd/system/zfs-import-cache.service.in +++ b/etc/systemd/system/zfs-import-cache.service.in @@ -15,7 +15,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=zpool import -c @sysconfdir@/zfs/zpool.cache -aN $ZPOOL_IMPORT_OPTS +ExecStart=@sbindir@/zpool import -c @sysconfdir@/zfs/zpool.cache -aN $ZPOOL_IMPORT_OPTS [Install] WantedBy=zfs-import.target diff --git a/etc/systemd/system/zfs-import-scan.service.in b/etc/systemd/system/zfs-import-scan.service.in index fb524f3b0889..c5dd45d87e68 100644 --- a/etc/systemd/system/zfs-import-scan.service.in +++ b/etc/systemd/system/zfs-import-scan.service.in @@ -14,7 +14,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=zpool import -aN -o cachefile=none $ZPOOL_IMPORT_OPTS +ExecStart=@sbindir@/zpool import -aN -o cachefile=none $ZPOOL_IMPORT_OPTS [Install] WantedBy=zfs-import.target diff --git a/etc/systemd/system/zfs-mount.service.in b/etc/systemd/system/zfs-mount.service.in index fc4e1c49f1c5..66d894923f4a 100644 --- a/etc/systemd/system/zfs-mount.service.in +++ b/etc/systemd/system/zfs-mount.service.in @@ -12,7 +12,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=zfs mount -a +ExecStart=@sbindir@/zfs mount -a [Install] WantedBy=zfs.target diff --git a/etc/systemd/system/zfs-scrub@.service.in b/etc/systemd/system/zfs-scrub@.service.in index 2bb2757d5e97..8ffffeb0cf6c 100644 --- a/etc/systemd/system/zfs-scrub@.service.in +++ b/etc/systemd/system/zfs-scrub@.service.in @@ -8,8 +8,8 @@ ConditionPathIsDirectory=/sys/module/zfs [Service] EnvironmentFile=-@initconfdir@/zfs -ExecStart=sh -c '\ -if zpool status %i | grep -q "scrub in progress"; then\ -exec zpool wait -t scrub %i;\ -else exec zpool scrub -w %i; fi' -ExecStop=-sh -c 'zpool scrub -p %i 2>/dev/null || true' +ExecStart=/bin/sh -c '\ +if @sbindir@/zpool status %i | grep -q "scrub in progress"; then\ +exec @sbindir@/zpool wait -t scrub %i;\ +else exec @sbindir@/zpool scrub -w %i; fi' +ExecStop=-/bin/sh -c '@sbindir@/zpool scrub -p %i 2>/dev/null || true' diff --git a/etc/systemd/system/zfs-share.service.in b/etc/systemd/system/zfs-share.service.in index dd321f490fe6..1a6342a06fec 100644 --- a/etc/systemd/system/zfs-share.service.in +++ b/etc/systemd/system/zfs-share.service.in @@ -14,7 
+14,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=zfs share -a +ExecStart=@sbindir@/zfs share -a [Install] WantedBy=zfs.target diff --git a/etc/systemd/system/zfs-trim@.service.in b/etc/systemd/system/zfs-trim@.service.in index f55e36cd8454..423fb448c16f 100644 --- a/etc/systemd/system/zfs-trim@.service.in +++ b/etc/systemd/system/zfs-trim@.service.in @@ -8,8 +8,8 @@ ConditionPathIsDirectory=/sys/module/zfs [Service] EnvironmentFile=-@initconfdir@/zfs -ExecStart=sh -c '\ -if zpool status %i | grep -q "(trimming)"; then\ -exec zpool wait -t trim %i;\ -else exec zpool trim -w %i; fi' -ExecStop=-sh -c 'zpool trim -s %i 2>/dev/null || true' +ExecStart=/bin/sh -c '\ +if @sbindir@/zpool status %i | grep -q "(trimming)"; then\ +exec @sbindir@/zpool wait -t trim %i;\ +else exec @sbindir@/zpool trim -w %i; fi' +ExecStop=-/bin/sh -c '@sbindir@/zpool trim -s %i 2>/dev/null || true' diff --git a/etc/systemd/system/zfs-volume-wait.service.in b/etc/systemd/system/zfs-volume-wait.service.in index a86a3561e032..110c0f5f52ee 100644 --- a/etc/systemd/system/zfs-volume-wait.service.in +++ b/etc/systemd/system/zfs-volume-wait.service.in @@ -9,7 +9,7 @@ ConditionPathIsDirectory=/sys/module/zfs Type=oneshot RemainAfterExit=yes EnvironmentFile=-@initconfdir@/zfs -ExecStart=zvol_wait +ExecStart=@bindir@/zvol_wait [Install] WantedBy=zfs-volumes.target diff --git a/etc/systemd/system/zfs-zed.service.in b/etc/systemd/system/zfs-zed.service.in index ac58ad3eff7b..be2fc67348f9 100644 --- a/etc/systemd/system/zfs-zed.service.in +++ b/etc/systemd/system/zfs-zed.service.in @@ -5,7 +5,7 @@ ConditionPathIsDirectory=/sys/module/zfs [Service] EnvironmentFile=-@initconfdir@/zfs -ExecStart=zed -F +ExecStart=@sbindir@/zed -F Restart=always [Install] From 55b1842f92a24fe7192d129bca7b60882080d31a Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 9 Jun 2023 13:08:05 -0400 Subject: [PATCH 150/180] ZIL: Fix race introduced by f63811f0721. We are not allowed to access lwb after setting LWB_STATE_FLUSH_DONE state and dropping zl_lock, since it may be freed by zil_sync(). To free itxs and waiters after dropping the lock we need to move lwb_itxs and lwb_waiters lists elements to local storage. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
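The shape of the fix is the usual cure for this kind of use-after-free: while the lock is still held, move everything that must outlive the lwb onto lists in local storage, and only walk those local copies once the lock (and possibly the lwb itself) is gone. A condensed sketch of the pattern, with hypothetical names in place of the actual zil.c fields:

    typedef struct item {
            list_node_t node;
            /* payload */
    } item_t;

    static void
    drain_then_release(kmutex_t *lock, list_t *shared, boolean_t *donep)
    {
            list_t local;
            item_t *it;

            list_create(&local, sizeof (item_t), offsetof(item_t, node));

            mutex_enter(lock);
            list_move_tail(&local, shared);     /* steal the elements */
            *donep = B_TRUE;                    /* owner may free the parent now */
            mutex_exit(lock);

            /* Safe: only the private list is touched after the lock drops. */
            while ((it = list_remove_head(&local)) != NULL)
                    kmem_free(it, sizeof (*it));
            list_destroy(&local);
    }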
Closes #14957 Closes #14959 --- module/zfs/zil.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 8672a61387a5..8c1fe5f66838 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1393,9 +1393,14 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zil_commit_waiter_t *zcw; itx_t *itx; uint64_t txg; + list_t itxs, waiters; spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); + list_create(&itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); + list_create(&waiters, sizeof (zil_commit_waiter_t), + offsetof(zil_commit_waiter_t, zcw_node)); + hrtime_t t = gethrtime() - lwb->lwb_issued_timestamp; mutex_enter(&zilog->zl_lock); @@ -1404,9 +1409,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio) lwb->lwb_root_zio = NULL; - ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); - lwb->lwb_state = LWB_STATE_FLUSH_DONE; - if (zilog->zl_last_lwb_opened == lwb) { /* * Remember the highest committed log sequence number @@ -1417,15 +1419,21 @@ zil_lwb_flush_vdevs_done(zio_t *zio) zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } + list_move_tail(&itxs, &lwb->lwb_itxs); + list_move_tail(&waiters, &lwb->lwb_waiters); + + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); + lwb->lwb_state = LWB_STATE_FLUSH_DONE; + mutex_exit(&zilog->zl_lock); - while ((itx = list_remove_head(&lwb->lwb_itxs)) != NULL) + while ((itx = list_remove_head(&itxs)) != NULL) zil_itx_destroy(itx); + list_destroy(&itxs); - while ((zcw = list_remove_head(&lwb->lwb_waiters)) != NULL) { + while ((zcw = list_remove_head(&waiters)) != NULL) { mutex_enter(&zcw->zcw_lock); - ASSERT3P(zcw->zcw_lwb, ==, lwb); zcw->zcw_lwb = NULL; /* * We expect any ZIO errors from child ZIOs to have been @@ -1450,6 +1458,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio) mutex_exit(&zcw->zcw_lock); } + list_destroy(&waiters); mutex_enter(&zilog->zl_lwb_io_lock); txg = lwb->lwb_issued_txg; From b3ad3f48d9d215ce9bea1090d86ced17862ea441 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 9 Jun 2023 13:12:52 -0400 Subject: [PATCH 151/180] Use list_remove_head() where possible. ... instead of list_head() + list_remove(). On FreeBSD the list functions are not inlined, so in addition to more compact code this also saves another function call. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #14955 --- cmd/zed/agents/zfs_agents.c | 7 ++----- cmd/zed/agents/zfs_mod.c | 7 ++----- module/os/freebsd/zfs/zfs_acl.c | 4 +--- module/os/linux/zfs/zfs_acl.c | 4 +--- module/zfs/arc.c | 13 +++---------- module/zfs/bplist.c | 8 ++------ module/zfs/dmu_objset.c | 3 +-- module/zfs/dmu_tx.c | 3 +-- module/zfs/dsl_dataset.c | 3 +-- module/zfs/dsl_dir.c | 3 +-- module/zfs/dsl_scan.c | 4 +--- module/zfs/fm.c | 3 +-- module/zfs/refcount.c | 7 ++----- module/zfs/spa.c | 6 +++--- module/zfs/spa_misc.c | 3 +-- module/zfs/vdev_indirect.c | 11 +++++------ module/zfs/zfs_fm.c | 3 +-- module/zfs/zfs_fuid.c | 8 ++------ module/zfs/zfs_onexit.c | 3 +-- module/zfs/zvol.c | 7 ++----- 20 files changed, 34 insertions(+), 76 deletions(-) diff --git a/cmd/zed/agents/zfs_agents.c b/cmd/zed/agents/zfs_agents.c index a2daa77a61fe..8fabb8d081a5 100644 --- a/cmd/zed/agents/zfs_agents.c +++ b/cmd/zed/agents/zfs_agents.c @@ -369,9 +369,7 @@ zfs_agent_consumer_thread(void *arg) return (NULL); } - if ((event = (list_head(&agent_events))) != NULL) { - list_remove(&agent_events, event); - + if ((event = list_remove_head(&agent_events)) != NULL) { (void) pthread_mutex_unlock(&agent_lock); /* dispatch to all event subscribers */ @@ -434,8 +432,7 @@ zfs_agent_fini(void) (void) pthread_join(g_agents_tid, NULL); /* drain any pending events */ - while ((event = (list_head(&agent_events))) != NULL) { - list_remove(&agent_events, event); + while ((event = list_remove_head(&agent_events)) != NULL) { nvlist_free(event->ae_nvl); free(event); } diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 1c82bd4f0010..b07a02712295 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -1288,17 +1288,14 @@ zfs_slm_fini(void) tpool_destroy(g_tpool); } - while ((pool = (list_head(&g_pool_list))) != NULL) { - list_remove(&g_pool_list, pool); + while ((pool = list_remove_head(&g_pool_list)) != NULL) { zpool_close(pool->uap_zhp); free(pool); } list_destroy(&g_pool_list); - while ((device = (list_head(&g_device_list))) != NULL) { - list_remove(&g_device_list, device); + while ((device = list_remove_head(&g_device_list)) != NULL) free(device); - } list_destroy(&g_device_list); libzfs_fini(g_zfshdl); diff --git a/module/os/freebsd/zfs/zfs_acl.c b/module/os/freebsd/zfs/zfs_acl.c index a077076927a1..20466aeaaa05 100644 --- a/module/os/freebsd/zfs/zfs_acl.c +++ b/module/os/freebsd/zfs/zfs_acl.c @@ -495,10 +495,8 @@ zfs_acl_release_nodes(zfs_acl_t *aclp) { zfs_acl_node_t *aclnode; - while ((aclnode = list_head(&aclp->z_acl))) { - list_remove(&aclp->z_acl, aclnode); + while ((aclnode = list_remove_head(&aclp->z_acl))) zfs_acl_node_free(aclnode); - } aclp->z_acl_count = 0; aclp->z_acl_bytes = 0; } diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c index ff26f47f2e04..a1fd3c9856cc 100644 --- a/module/os/linux/zfs/zfs_acl.c +++ b/module/os/linux/zfs/zfs_acl.c @@ -493,10 +493,8 @@ zfs_acl_release_nodes(zfs_acl_t *aclp) { zfs_acl_node_t *aclnode; - while ((aclnode = list_head(&aclp->z_acl))) { - list_remove(&aclp->z_acl, aclnode); + while ((aclnode = list_remove_head(&aclp->z_acl))) zfs_acl_node_free(aclnode); - } aclp->z_acl_count = 0; aclp->z_acl_bytes = 0; } diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 6f68c29fc7f5..dcd4620fcd20 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -7866,8 +7866,7 @@ arc_fini(void) taskq_destroy(arc_prune_taskq); mutex_enter(&arc_prune_mtx); - while ((p = list_head(&arc_prune_list)) != NULL) { - list_remove(&arc_prune_list, p); + while ((p = 
list_remove_head(&arc_prune_list)) != NULL) { zfs_refcount_remove(&p->p_refcnt, &arc_prune_list); zfs_refcount_destroy(&p->p_refcnt); kmem_free(p, sizeof (*p)); @@ -8324,20 +8323,14 @@ l2arc_dev_get_next(void) static void l2arc_do_free_on_write(void) { - list_t *buflist; - l2arc_data_free_t *df, *df_prev; + l2arc_data_free_t *df; mutex_enter(&l2arc_free_on_write_mtx); - buflist = l2arc_free_on_write; - - for (df = list_tail(buflist); df; df = df_prev) { - df_prev = list_prev(buflist, df); + while ((df = list_remove_head(l2arc_free_on_write)) != NULL) { ASSERT3P(df->l2df_abd, !=, NULL); abd_free(df->l2df_abd); - list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } - mutex_exit(&l2arc_free_on_write_mtx); } diff --git a/module/zfs/bplist.c b/module/zfs/bplist.c index 1c1f7892bb7d..da7360f8ce10 100644 --- a/module/zfs/bplist.c +++ b/module/zfs/bplist.c @@ -65,9 +65,8 @@ bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - while ((bpe = list_head(&bpl->bpl_list))) { + while ((bpe = list_remove_head(&bpl->bpl_list))) { bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); mutex_exit(&bpl->bpl_lock); func(arg, &bpe->bpe_blk, tx); kmem_free(bpe, sizeof (*bpe)); @@ -82,10 +81,7 @@ bplist_clear(bplist_t *bpl) bplist_entry_t *bpe; mutex_enter(&bpl->bpl_lock); - while ((bpe = list_head(&bpl->bpl_list))) { - bplist_iterate_last_removed = bpe; - list_remove(&bpl->bpl_list, bpe); + while ((bpe = list_remove_head(&bpl->bpl_list))) kmem_free(bpe, sizeof (*bpe)); - } mutex_exit(&bpl->bpl_lock); } diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index c19ebf424953..778b18817eef 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1755,9 +1755,8 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; - while ((dr = list_head(list)) != NULL) { + while ((dr = list_remove_head(list)) != NULL) { ASSERT0(dr->dr_dbuf->db_level); - list_remove(list, dr); zio_nowait(dr->dr_zio); } diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index c4e274bd4c42..0eb8c17e331a 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1396,8 +1396,7 @@ dmu_tx_do_callbacks(list_t *cb_list, int error) { dmu_tx_callback_t *dcb; - while ((dcb = list_tail(cb_list)) != NULL) { - list_remove(cb_list, dcb); + while ((dcb = list_remove_tail(cb_list)) != NULL) { dcb->dcb_func(dcb->dcb_data, error); kmem_free(dcb, sizeof (dmu_tx_callback_t)); } diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 14e7ced4007c..d6db61729223 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -3782,8 +3782,7 @@ snaplist_destroy(list_t *l, const void *tag) if (l == NULL || !list_link_active(&l->list_head)) return; - while ((snap = list_tail(l)) != NULL) { - list_remove(l, snap); + while ((snap = list_remove_tail(l)) != NULL) { dsl_dataset_rele(snap->ds, tag); kmem_free(snap, sizeof (*snap)); } diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index eac9828a204a..bbe6a03d620f 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -1490,7 +1490,7 @@ dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) if (tr_cookie == NULL) return; - while ((tr = list_head(tr_list)) != NULL) { + while ((tr = list_remove_head(tr_list)) != NULL) { if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, @@ -1500,7 +1500,6 @@ 
dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) } else { arc_tempreserve_clear(tr->tr_size); } - list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); } diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 9ee719a5eef6..1dd44171c10e 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -3437,10 +3437,8 @@ scan_io_queues_run_one(void *arg) * If we were suspended in the middle of processing, * requeue any unfinished sios and exit. */ - while ((sio = list_head(&sio_list)) != NULL) { - list_remove(&sio_list, sio); + while ((sio = list_remove_head(&sio_list)) != NULL) scan_io_queue_insert_impl(queue, sio); - } queue->q_zio = NULL; mutex_exit(q_lock); diff --git a/module/zfs/fm.c b/module/zfs/fm.c index 76956572f8bd..77d87b694a43 100644 --- a/module/zfs/fm.c +++ b/module/zfs/fm.c @@ -148,8 +148,7 @@ zfs_zevent_drain(zevent_t *ev) list_remove(&zevent_list, ev); /* Remove references to this event in all private file data */ - while ((ze = list_head(&ev->ev_ze_list)) != NULL) { - list_remove(&ev->ev_ze_list, ze); + while ((ze = list_remove_head(&ev->ev_ze_list)) != NULL) { ze->ze_zevent = NULL; ze->ze_dropped++; } diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index c9a504f67451..601d27f8c47a 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -88,14 +88,11 @@ zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) reference_t *ref; ASSERT3U(rc->rc_count, ==, number); - while ((ref = list_head(&rc->rc_list))) { - list_remove(&rc->rc_list, ref); + while ((ref = list_remove_head(&rc->rc_list))) kmem_cache_free(reference_cache, ref); - } list_destroy(&rc->rc_list); - while ((ref = list_head(&rc->rc_removed))) { - list_remove(&rc->rc_removed, ref); + while ((ref = list_remove_head(&rc->rc_removed))) { kmem_cache_free(reference_history_cache, ref->ref_removed); kmem_cache_free(reference_cache, ref); } diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 27bbb8f09259..88ee4ea9f458 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1609,16 +1609,16 @@ spa_unload_log_sm_metadata(spa_t *spa) { void *cookie = NULL; spa_log_sm_t *sls; + log_summary_entry_t *e; + while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &cookie)) != NULL) { VERIFY0(sls->sls_mscount); kmem_free(sls, sizeof (spa_log_sm_t)); } - for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); - e != NULL; e = list_head(&spa->spa_log_summary)) { + while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { VERIFY0(e->lse_mscount); - list_remove(&spa->spa_log_summary, e); kmem_free(e, sizeof (log_summary_entry_t)); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 89e1ce7165db..014c539eb683 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -814,8 +814,7 @@ spa_remove(spa_t *spa) if (spa->spa_root) spa_strfree(spa->spa_root); - while ((dp = list_head(&spa->spa_config_list)) != NULL) { - list_remove(&spa->spa_config_list, dp); + while ((dp = list_remove_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index a16ad2f4e7cf..89667585345d 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -293,17 +293,16 @@ vdev_indirect_map_free(zio_t *zio) indirect_vsd_t *iv = zio->io_vsd; indirect_split_t *is; - while ((is = list_head(&iv->iv_splits)) != NULL) { + while ((is = list_remove_head(&iv->iv_splits)) != NULL) { for (int c = 0; c < 
is->is_children; c++) { indirect_child_t *ic = &is->is_child[c]; if (ic->ic_data != NULL) abd_free(ic->ic_data); } - list_remove(&iv->iv_splits, is); indirect_child_t *ic; - while ((ic = list_head(&is->is_unique_child)) != NULL) - list_remove(&is->is_unique_child, ic); + while ((ic = list_remove_head(&is->is_unique_child)) != NULL) + ; list_destroy(&is->is_unique_child); @@ -1659,8 +1658,8 @@ vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) for (indirect_split_t *is = list_head(&iv->iv_splits); is != NULL; is = list_next(&iv->iv_splits, is)) { indirect_child_t *ic; - while ((ic = list_head(&is->is_unique_child)) != NULL) - list_remove(&is->is_unique_child, ic); + while ((ic = list_remove_head(&is->is_unique_child)) != NULL) + ; is->is_unique_children = 0; } diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c index bdd0e96c327a..c42ef048dd74 100644 --- a/module/zfs/zfs_fm.c +++ b/module/zfs/zfs_fm.c @@ -1522,9 +1522,8 @@ zfs_ereport_fini(void) { recent_events_node_t *entry; - while ((entry = list_head(&recent_events_list)) != NULL) { + while ((entry = list_remove_head(&recent_events_list)) != NULL) { avl_remove(&recent_events_tree, entry); - list_remove(&recent_events_list, entry); kmem_free(entry, sizeof (*entry)); } avl_destroy(&recent_events_tree); diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c index 44aaae9c1264..add4241dcc99 100644 --- a/module/zfs/zfs_fuid.c +++ b/module/zfs/zfs_fuid.c @@ -699,19 +699,15 @@ zfs_fuid_info_free(zfs_fuid_info_t *fuidp) zfs_fuid_t *zfuid; zfs_fuid_domain_t *zdomain; - while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { - list_remove(&fuidp->z_fuids, zfuid); + while ((zfuid = list_remove_head(&fuidp->z_fuids)) != NULL) kmem_free(zfuid, sizeof (zfs_fuid_t)); - } if (fuidp->z_domain_table != NULL) kmem_free(fuidp->z_domain_table, (sizeof (char *)) * fuidp->z_domain_cnt); - while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { - list_remove(&fuidp->z_domains, zdomain); + while ((zdomain = list_remove_head(&fuidp->z_domains)) != NULL) kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); - } kmem_free(fuidp, sizeof (zfs_fuid_info_t)); } diff --git a/module/zfs/zfs_onexit.c b/module/zfs/zfs_onexit.c index 63acf7ab2e4d..7bf804b67790 100644 --- a/module/zfs/zfs_onexit.c +++ b/module/zfs/zfs_onexit.c @@ -87,8 +87,7 @@ zfs_onexit_destroy(zfs_onexit_t *zo) zfs_onexit_action_node_t *ap; mutex_enter(&zo->zo_lock); - while ((ap = list_head(&zo->zo_actions)) != NULL) { - list_remove(&zo->zo_actions, ap); + while ((ap = list_remove_head(&zo->zo_actions)) != NULL) { mutex_exit(&zo->zo_lock); ap->za_func(ap->za_data); kmem_free(ap, sizeof (zfs_onexit_action_node_t)); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 06bc75c634a6..cd4e6f0c7558 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -1203,8 +1203,7 @@ zvol_create_minors_recursive(const char *name) * Prefetch is completed, we can do zvol_os_create_minor * sequentially. 
*/ - while ((job = list_head(&minors_list)) != NULL) { - list_remove(&minors_list, job); + while ((job = list_remove_head(&minors_list)) != NULL) { if (!job->error) (void) zvol_os_create_minor(job->name); kmem_strfree(job->name); @@ -1311,10 +1310,8 @@ zvol_remove_minors_impl(const char *name) rw_exit(&zvol_state_lock); /* Drop zvol_state_lock before calling zvol_free() */ - while ((zv = list_head(&free_list)) != NULL) { - list_remove(&free_list, zv); + while ((zv = list_remove_head(&free_list)) != NULL) zvol_os_free(zv); - } } /* Remove minor for this specific volume only */ From 90ccfd426d9c9152c3f054d1335ab245d794d974 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 9 Jun 2023 13:14:05 -0400 Subject: [PATCH 152/180] Improve l2arc reporting in arc_summary. - Do not report L2ARC as FAULTED in presence of in-flight writes. - Report read and write I/Os, bytes and errors. - Remove few numbers not important to average user. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #12304 Closes #14946 --- cmd/arc_summary | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/cmd/arc_summary b/cmd/arc_summary index 5d10e903fcba..6b29a611dab3 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -842,7 +842,8 @@ def section_l2arc(kstats_dict): ('Free on write:', 'l2_free_on_write'), ('R/W clashes:', 'l2_rw_clash'), ('Bad checksums:', 'l2_cksum_bad'), - ('I/O errors:', 'l2_io_error')) + ('Read errors:', 'l2_io_error'), + ('Write errors:', 'l2_writes_error')) for title, value in l2_todo: prt_i1(title, f_hits(arc_stats[value])) @@ -878,28 +879,20 @@ def section_l2arc(kstats_dict): prt_i2('Miss ratio:', f_perc(arc_stats['l2_misses'], l2_access_total), f_hits(arc_stats['l2_misses'])) - prt_i1('Feeds:', f_hits(arc_stats['l2_feeds'])) print() - print('L2ARC writes:') - - if arc_stats['l2_writes_done'] != arc_stats['l2_writes_sent']: - prt_i2('Writes sent:', 'FAULTED', f_hits(arc_stats['l2_writes_sent'])) - prt_i2('Done ratio:', - f_perc(arc_stats['l2_writes_done'], - arc_stats['l2_writes_sent']), - f_hits(arc_stats['l2_writes_done'])) - prt_i2('Error ratio:', - f_perc(arc_stats['l2_writes_error'], - arc_stats['l2_writes_sent']), - f_hits(arc_stats['l2_writes_error'])) - else: - prt_i2('Writes sent:', '100 %', f_hits(arc_stats['l2_writes_sent'])) + print('L2ARC I/O:') + prt_i2('Reads:', + f_bytes(arc_stats['l2_read_bytes']), + f_hits(arc_stats['l2_hits'])) + prt_i2('Writes:', + f_bytes(arc_stats['l2_write_bytes']), + f_hits(arc_stats['l2_writes_sent'])) print() print('L2ARC evicts:') - prt_i1('Lock retries:', f_hits(arc_stats['l2_evict_lock_retry'])) - prt_i1('Upon reading:', f_hits(arc_stats['l2_evict_reading'])) + prt_i1('L1 cached:', f_hits(arc_stats['l2_evict_l1cached'])) + prt_i1('While reading:', f_hits(arc_stats['l2_evict_reading'])) print() From 6db4ed51d6c2cd8ec6a3ad318118f7a0d6b6dfe8 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 9 Jun 2023 11:10:01 -0700 Subject: [PATCH 153/180] ZTS: Skip checkpoint_discard_busy Until the ASSERT which is occasionally hit while running checkpoint_discard_busy is resolved skip this test case. 
Signed-off-by: Brian Behlendorf Issue #12053 Closes #14952 --- tests/test-runner/bin/zts-report.py.in | 1 + .../functional/pool_checkpoint/checkpoint_discard_busy.ksh | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index ef1a46dca72a..9517ce8073a5 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -152,6 +152,7 @@ known = { ['FAIL', rewind_reason], 'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason], 'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason], + 'pool_checkpoint/checkpoint_discard_busy': ['SKIP', 12053], 'privilege/setup': ['SKIP', na_reason], 'refreserv/refreserv_004_pos': ['FAIL', known_reason], 'rootpool/setup': ['SKIP', na_reason], diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh index f970935f5bd0..087aef9027ea 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh @@ -38,6 +38,8 @@ verify_runnable "global" +log_unsupported "Skipping, issue https://github.com/openzfs/zfs/issues/12053" + function test_cleanup { # reset memory limit to 16M From 70ea484e3ec56c529c6c5027ffc43840100ce224 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 9 Jun 2023 15:40:55 -0400 Subject: [PATCH 154/180] Finally drop long disabled vdev cache. It was a vdev level read cache, designed to aggregate many small reads by speculatively issuing bigger reads instead and caching the result. But since it has almost no idea about what is going on with exception of ZIO_FLAG_DONT_CACHE flag set by higher layers, it was found to make more harm than good, for which reason it was disabled for the past 12 years. These days we have much better instruments to enlarge the I/Os, such as speculative and prescient prefetches, I/O scheduler, I/O aggregation etc. Besides just the dead code removal this removes one extra mutex lock/unlock per write inside vdev_cache_write(), not otherwise disabled and trying to do some work. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #14953 --- cmd/arc_summary | 35 -- cmd/zdb/zdb.c | 7 +- include/os/linux/kernel/linux/mod_compat.h | 1 - include/sys/spa.h | 4 - include/sys/vdev.h | 6 - include/sys/vdev_impl.h | 20 - include/sys/zio.h | 1 - lib/libzpool/Makefile.am | 1 - man/man4/zfs.4 | 15 - man/man8/zpool-events.8 | 1 - module/Kbuild.in | 1 - module/Makefile.bsd | 1 - module/os/freebsd/zfs/sysctl_os.c | 2 - module/zfs/arc.c | 11 +- module/zfs/dmu_recv.c | 4 +- module/zfs/spa_misc.c | 2 - module/zfs/vdev.c | 7 +- module/zfs/vdev_cache.c | 436 --------------------- module/zfs/vdev_queue.c | 5 +- module/zfs/zio.c | 15 +- 20 files changed, 13 insertions(+), 562 deletions(-) delete mode 100644 module/zfs/vdev_cache.c diff --git a/cmd/arc_summary b/cmd/arc_summary index 6b29a611dab3..426e0207052d 100755 --- a/cmd/arc_summary +++ b/cmd/arc_summary @@ -64,7 +64,6 @@ SECTION_HELP = 'print info from one section ('+' '.join(SECTIONS)+')' SECTION_PATHS = {'arc': 'arcstats', 'dmu': 'dmu_tx', 'l2arc': 'arcstats', # L2ARC stuff lives in arcstats - 'vdev': 'vdev_cache_stats', 'zfetch': 'zfetchstats', 'zil': 'zil'} @@ -90,8 +89,6 @@ if sys.platform.startswith('freebsd'): # Requires py36-sysctl on FreeBSD import sysctl - VDEV_CACHE_SIZE = 'vdev.cache_size' - def is_value(ctl): return ctl.type != sysctl.CTLTYPE_NODE @@ -135,8 +132,6 @@ elif sys.platform.startswith('linux'): SPL_PATH = '/sys/module/spl/parameters' TUNABLES_PATH = '/sys/module/zfs/parameters' - VDEV_CACHE_SIZE = 'zfs_vdev_cache_size' - def load_kstats(section): path = os.path.join(KSTAT_PATH, section) with open(path) as f: @@ -952,35 +947,6 @@ def section_tunables(*_): print() -def section_vdev(kstats_dict): - """Collect information on VDEV caches""" - - # Currently [Nov 2017] the VDEV cache is disabled, because it is actually - # harmful. When this is the case, we just skip the whole entry. See - # https://github.com/openzfs/zfs/blob/master/module/zfs/vdev_cache.c - # for details - tunables = get_vdev_params() - - if tunables[VDEV_CACHE_SIZE] == '0': - print('VDEV cache disabled, skipping section\n') - return - - vdev_stats = isolate_section('vdev_cache_stats', kstats_dict) - - vdev_cache_total = int(vdev_stats['hits']) +\ - int(vdev_stats['misses']) +\ - int(vdev_stats['delegations']) - - prt_1('VDEV cache summary:', f_hits(vdev_cache_total)) - prt_i2('Hit ratio:', f_perc(vdev_stats['hits'], vdev_cache_total), - f_hits(vdev_stats['hits'])) - prt_i2('Miss ratio:', f_perc(vdev_stats['misses'], vdev_cache_total), - f_hits(vdev_stats['misses'])) - prt_i2('Delegations:', f_perc(vdev_stats['delegations'], vdev_cache_total), - f_hits(vdev_stats['delegations'])) - print() - - def section_zil(kstats_dict): """Collect information on the ZFS Intent Log. 
Some of the information taken from https://github.com/openzfs/zfs/blob/master/include/sys/zil.h @@ -1008,7 +974,6 @@ section_calls = {'arc': section_arc, 'l2arc': section_l2arc, 'spl': section_spl, 'tunables': section_tunables, - 'vdev': section_vdev, 'zil': section_zil} diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 105d36882291..04a10c4eedd7 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -8546,9 +8546,9 @@ zdb_read_block(char *thing, spa_t *spa) */ zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | - ZIO_FLAG_OPTIONAL, NULL, NULL)); + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | ZIO_FLAG_OPTIONAL, + NULL, NULL)); } error = zio_wait(zio); @@ -8642,7 +8642,6 @@ zdb_read_block(char *thing, spa_t *spa) zio_nowait(zio_vdev_child_io(czio, bp, vd, offset, pabd, psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h index 09d109d191bf..8e20a9613539 100644 --- a/include/os/linux/kernel/linux/mod_compat.h +++ b/include/os/linux/kernel/linux/mod_compat.h @@ -68,7 +68,6 @@ enum scope_prefix_types { zfs_trim, zfs_txg, zfs_vdev, - zfs_vdev_cache, zfs_vdev_file, zfs_vdev_mirror, zfs_vnops, diff --git a/include/sys/spa.h b/include/sys/spa.h index ed752967cca6..1fa2044008dc 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1174,10 +1174,6 @@ extern void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb); extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep); -/* vdev cache */ -extern void vdev_cache_stat_init(void); -extern void vdev_cache_stat_fini(void); - /* vdev mirror */ extern void vdev_mirror_stat_init(void); extern void vdev_mirror_stat_fini(void); diff --git a/include/sys/vdev.h b/include/sys/vdev.h index d529bbcdd9a4..26c834ff57cf 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -158,12 +158,6 @@ extern boolean_t vdev_allocatable(vdev_t *vd); extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio); extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd); -extern void vdev_cache_init(vdev_t *vd); -extern void vdev_cache_fini(vdev_t *vd); -extern boolean_t vdev_cache_read(zio_t *zio); -extern void vdev_cache_write(zio_t *zio); -extern void vdev_cache_purge(vdev_t *vd); - extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); extern zio_t *vdev_queue_io(zio_t *zio); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index ea3043c82a39..74b3737d8ee5 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -57,8 +57,6 @@ extern "C" { * Forward declarations that lots of things need. 
*/ typedef struct vdev_queue vdev_queue_t; -typedef struct vdev_cache vdev_cache_t; -typedef struct vdev_cache_entry vdev_cache_entry_t; struct abd; extern uint_t zfs_vdev_queue_depth_pct; @@ -132,23 +130,6 @@ typedef const struct vdev_ops { /* * Virtual device properties */ -struct vdev_cache_entry { - struct abd *ve_abd; - uint64_t ve_offset; - clock_t ve_lastused; - avl_node_t ve_offset_node; - avl_node_t ve_lastused_node; - uint32_t ve_hits; - uint16_t ve_missed_update; - zio_t *ve_fill_io; -}; - -struct vdev_cache { - avl_tree_t vc_offset_tree; - avl_tree_t vc_lastused_tree; - kmutex_t vc_lock; -}; - typedef struct vdev_queue_class { uint32_t vqc_active; @@ -443,7 +424,6 @@ struct vdev { boolean_t vdev_resilver_deferred; /* resilver deferred */ boolean_t vdev_kobj_flag; /* kobj event record */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ - vdev_cache_t vdev_cache; /* physical block cache */ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ vdev_aux_t vdev_label_aux; /* on-disk aux state */ diff --git a/include/sys/zio.h b/include/sys/zio.h index 695bc09e6cb7..6b1352a72b9a 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -190,7 +190,6 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_SPECULATIVE (1ULL << 8) #define ZIO_FLAG_CONFIG_WRITER (1ULL << 9) #define ZIO_FLAG_DONT_RETRY (1ULL << 10) -#define ZIO_FLAG_DONT_CACHE (1ULL << 11) #define ZIO_FLAG_NODATA (1ULL << 12) #define ZIO_FLAG_INDUCE_DAMAGE (1ULL << 13) #define ZIO_FLAG_IO_ALLOCATING (1ULL << 14) diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index ceac2963e647..58d7f07527aa 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -135,7 +135,6 @@ nodist_libzpool_la_SOURCES = \ module/zfs/uberblock.c \ module/zfs/unique.c \ module/zfs/vdev.c \ - module/zfs/vdev_cache.c \ module/zfs/vdev_draid.c \ module/zfs/vdev_draid_rand.c \ module/zfs/vdev_indirect.c \ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 9ec940a94488..5fbd9d7db93f 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2028,21 +2028,6 @@ Max vdev I/O aggregation size. .It Sy zfs_vdev_aggregation_limit_non_rotating Ns = Ns Sy 131072 Ns B Po 128 KiB Pc Pq uint Max vdev I/O aggregation size for non-rotating media. . -.It Sy zfs_vdev_cache_bshift Ns = Ns Sy 16 Po 64 KiB Pc Pq uint -Shift size to inflate reads to. -. -.It Sy zfs_vdev_cache_max Ns = Ns Sy 16384 Ns B Po 16 KiB Pc Pq uint -Inflate reads smaller than this value to meet the -.Sy zfs_vdev_cache_bshift -size -.Pq default Sy 64 KiB . -. -.It Sy zfs_vdev_cache_size Ns = Ns Sy 0 Pq uint -Total size of the per-disk cache in bytes. -.Pp -Currently this feature is disabled, as it has been found to not be helpful -for performance and in some cases harmful. -. 
.It Sy zfs_vdev_mirror_rotating_inc Ns = Ns Sy 0 Pq int A number by which the balancing algorithm increments the load calculation for the purpose of selecting the least busy mirror member when an I/O operation diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8 index 0ba93e4166e7..341f902fe66e 100644 --- a/man/man8/zpool-events.8 +++ b/man/man8/zpool-events.8 @@ -456,7 +456,6 @@ ZIO_FLAG_CANFAIL:0x00000080 ZIO_FLAG_SPECULATIVE:0x00000100 ZIO_FLAG_CONFIG_WRITER:0x00000200 ZIO_FLAG_DONT_RETRY:0x00000400 -ZIO_FLAG_DONT_CACHE:0x00000800 ZIO_FLAG_NODATA:0x00001000 ZIO_FLAG_INDUCE_DAMAGE:0x00002000 diff --git a/module/Kbuild.in b/module/Kbuild.in index 29a55c9778b1..485331ac655e 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -382,7 +382,6 @@ ZFS_OBJS := \ uberblock.o \ unique.o \ vdev.o \ - vdev_cache.o \ vdev_draid.o \ vdev_draid_rand.o \ vdev_indirect.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 9464223f6ca6..0c4d8bfe1159 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -308,7 +308,6 @@ SRCS+= abd.c \ uberblock.c \ unique.c \ vdev.c \ - vdev_cache.c \ vdev_draid.c \ vdev_draid_rand.c \ vdev_indirect.c \ diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c index cc616f33db96..8ae2f23c3ecf 100644 --- a/module/os/freebsd/zfs/sysctl_os.c +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -872,8 +872,6 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, "Enable to bypass vdev_validate()."); /* END CSTYLED */ -/* vdev_cache.c */ - /* vdev_mirror.c */ /* vdev_queue.c */ diff --git a/module/zfs/arc.c b/module/zfs/arc.c index dcd4620fcd20..3dbaaa76b4a5 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -6106,8 +6106,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, asize, abd, ZIO_CHECKSUM_OFF, l2arc_read_done, cb, priority, - zio_flags | ZIO_FLAG_DONT_CACHE | - ZIO_FLAG_CANFAIL | + zio_flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); acb->acb_zio_head = rzio; @@ -10177,8 +10176,7 @@ l2arc_dev_hdr_read(l2arc_dev_t *dev) err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | - ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SPECULATIVE, B_FALSE)); abd_free(abd); @@ -10498,11 +10496,10 @@ l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); cb->l2rcb_abd = abd_get_from_buf(lb, asize); pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | - ZIO_FLAG_DONT_RETRY); + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); return (pio); diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index c22a95f8647f..2fdd7c1ece73 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -1371,8 +1371,8 @@ do_corrective_recv(struct receive_writer_arg *rwa, struct drr_write *drrw, dnode_t *dn; abd_t *abd = rrd->abd; zio_cksum_t bp_cksum = bp->blk_cksum; - zio_flag_t flags = ZIO_FLAG_SPECULATIVE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_RETRY | 
ZIO_FLAG_CANFAIL; + zio_flag_t flags = ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL; if (rwa->raw) flags |= ZIO_FLAG_RAW; diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 014c539eb683..9ef948e9e434 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2438,7 +2438,6 @@ spa_init(spa_mode_t mode) zio_init(); dmu_init(); zil_init(); - vdev_cache_stat_init(); vdev_mirror_stat_init(); vdev_raidz_math_init(); vdev_file_init(); @@ -2462,7 +2461,6 @@ spa_fini(void) spa_evict_all(); vdev_file_fini(); - vdev_cache_stat_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); chksum_fini(); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 58dcd9f79799..612e66c3a8a8 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -715,7 +715,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) offsetof(struct vdev, vdev_dtl_node)); vd->vdev_stat.vs_timestamp = gethrtime(); vdev_queue_init(vd); - vdev_cache_init(vd); return (vd); } @@ -1096,7 +1095,6 @@ vdev_free(vdev_t *vd) * Clean up vdev structure. */ vdev_queue_fini(vd); - vdev_cache_fini(vd); if (vd->vdev_path) spa_strfree(vd->vdev_path); @@ -1720,8 +1718,7 @@ vdev_probe(vdev_t *vd, zio_t *zio) vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | - ZIO_FLAG_TRYHARD; + ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD; if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { /* @@ -2612,8 +2609,6 @@ vdev_close(vdev_t *vd) vd->vdev_ops->vdev_op_close(vd); - vdev_cache_purge(vd); - /* * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c deleted file mode 100644 index f0a17600d58e..000000000000 --- a/module/zfs/vdev_cache.c +++ /dev/null @@ -1,436 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or https://opensource.org/licenses/CDDL-1.0. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* - * Copyright (c) 2013, 2016 by Delphix. All rights reserved. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Virtual device read-ahead caching. - * - * This file implements a simple LRU read-ahead cache. When the DMU reads - * a given block, it will often want other, nearby blocks soon thereafter. - * We take advantage of this by reading a larger disk region and caching - * the result. In the best case, this can turn 128 back-to-back 512-byte - * reads into a single 64k read followed by 127 cache hits; this reduces - * latency dramatically. 
In the worst case, it can turn an isolated 512-byte - * read into a 64k read, which doesn't affect latency all that much but is - * terribly wasteful of bandwidth. A more intelligent version of the cache - * could keep track of access patterns and not do read-ahead unless it sees - * at least two temporally close I/Os to the same region. Currently, only - * metadata I/O is inflated. A further enhancement could take advantage of - * more semantic information about the I/O. And it could use something - * faster than an AVL tree; that was chosen solely for convenience. - * - * There are five cache operations: allocate, fill, read, write, evict. - * - * (1) Allocate. This reserves a cache entry for the specified region. - * We separate the allocate and fill operations so that multiple threads - * don't generate I/O for the same cache miss. - * - * (2) Fill. When the I/O for a cache miss completes, the fill routine - * places the data in the previously allocated cache entry. - * - * (3) Read. Read data from the cache. - * - * (4) Write. Update cache contents after write completion. - * - * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry - * if the total cache size exceeds zfs_vdev_cache_size. - */ - -/* - * These tunables are for performance analysis. - */ -/* - * All i/os smaller than zfs_vdev_cache_max will be turned into - * 1<ve_offset, ve2->ve_offset)); -} - -static int -vdev_cache_lastused_compare(const void *a1, const void *a2) -{ - const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; - const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; - - int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused); - if (likely(cmp)) - return (cmp); - - /* - * Among equally old entries, sort by offset to ensure uniqueness. - */ - return (vdev_cache_offset_compare(a1, a2)); -} - -/* - * Evict the specified entry from the cache. - */ -static void -vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) -{ - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT3P(ve->ve_fill_io, ==, NULL); - ASSERT3P(ve->ve_abd, !=, NULL); - - avl_remove(&vc->vc_lastused_tree, ve); - avl_remove(&vc->vc_offset_tree, ve); - abd_free(ve->ve_abd); - kmem_free(ve, sizeof (vdev_cache_entry_t)); -} - -/* - * Allocate an entry in the cache. At the point we don't have the data, - * we're just creating a placeholder so that multiple threads don't all - * go off and read the same blocks. - */ -static vdev_cache_entry_t * -vdev_cache_allocate(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - uint64_t offset = P2ALIGN(zio->io_offset, VCBS); - vdev_cache_entry_t *ve; - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - - if (zfs_vdev_cache_size == 0) - return (NULL); - - /* - * If adding a new entry would exceed the cache size, - * evict the oldest entry (LRU). 
- */ - if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > - zfs_vdev_cache_size) { - ve = avl_first(&vc->vc_lastused_tree); - if (ve->ve_fill_io != NULL) - return (NULL); - ASSERT3U(ve->ve_hits, !=, 0); - vdev_cache_evict(vc, ve); - } - - ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); - ve->ve_offset = offset; - ve->ve_lastused = ddi_get_lbolt(); - ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); - - avl_add(&vc->vc_offset_tree, ve); - avl_add(&vc->vc_lastused_tree, ve); - - return (ve); -} - -static void -vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) -{ - uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); - - ASSERT(MUTEX_HELD(&vc->vc_lock)); - ASSERT3P(ve->ve_fill_io, ==, NULL); - - if (ve->ve_lastused != ddi_get_lbolt()) { - avl_remove(&vc->vc_lastused_tree, ve); - ve->ve_lastused = ddi_get_lbolt(); - avl_add(&vc->vc_lastused_tree, ve); - } - - ve->ve_hits++; - abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); -} - -/* - * Fill a previously allocated cache entry with data. - */ -static void -vdev_cache_fill(zio_t *fio) -{ - vdev_t *vd = fio->io_vd; - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve = fio->io_private; - zio_t *pio; - - ASSERT3U(fio->io_size, ==, VCBS); - - /* - * Add data to the cache. - */ - mutex_enter(&vc->vc_lock); - - ASSERT3P(ve->ve_fill_io, ==, fio); - ASSERT3U(ve->ve_offset, ==, fio->io_offset); - ASSERT3P(ve->ve_abd, ==, fio->io_abd); - - ve->ve_fill_io = NULL; - - /* - * Even if this cache line was invalidated by a missed write update, - * any reads that were queued up before the missed update are still - * valid, so we can satisfy them from this line before we evict it. - */ - zio_link_t *zl = NULL; - while ((pio = zio_walk_parents(fio, &zl)) != NULL) - vdev_cache_hit(vc, ve, pio); - - if (fio->io_error || ve->ve_missed_update) - vdev_cache_evict(vc, ve); - - mutex_exit(&vc->vc_lock); -} - -/* - * Read data from the cache. Returns B_TRUE cache hit, B_FALSE on miss. - */ -boolean_t -vdev_cache_read(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, ve_search; - uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); - zio_t *fio; - uint64_t cache_phase __maybe_unused = P2PHASE(zio->io_offset, VCBS); - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); - - if (zfs_vdev_cache_size == 0) - return (B_FALSE); - - if (zio->io_flags & ZIO_FLAG_DONT_CACHE) - return (B_FALSE); - - if (zio->io_size > zfs_vdev_cache_max) - return (B_FALSE); - - /* - * If the I/O straddles two or more cache blocks, don't cache it. 
- */ - if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) - return (B_FALSE); - - ASSERT3U(cache_phase + zio->io_size, <=, VCBS); - - mutex_enter(&vc->vc_lock); - - ve_search.ve_offset = cache_offset; - ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL); - - if (ve != NULL) { - if (ve->ve_missed_update) { - mutex_exit(&vc->vc_lock); - return (B_FALSE); - } - - if ((fio = ve->ve_fill_io) != NULL) { - zio_vdev_io_bypass(zio); - zio_add_child(zio, fio); - mutex_exit(&vc->vc_lock); - VDCSTAT_BUMP(vdc_stat_delegations); - return (B_TRUE); - } - - vdev_cache_hit(vc, ve, zio); - zio_vdev_io_bypass(zio); - - mutex_exit(&vc->vc_lock); - VDCSTAT_BUMP(vdc_stat_hits); - return (B_TRUE); - } - - ve = vdev_cache_allocate(zio); - - if (ve == NULL) { - mutex_exit(&vc->vc_lock); - return (B_FALSE); - } - - fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, - ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, - ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); - - ve->ve_fill_io = fio; - zio_vdev_io_bypass(zio); - zio_add_child(zio, fio); - - mutex_exit(&vc->vc_lock); - zio_nowait(fio); - VDCSTAT_BUMP(vdc_stat_misses); - - return (B_TRUE); -} - -/* - * Update cache contents upon write completion. - */ -void -vdev_cache_write(zio_t *zio) -{ - vdev_cache_t *vc = &zio->io_vd->vdev_cache; - vdev_cache_entry_t *ve, ve_search; - uint64_t io_start = zio->io_offset; - uint64_t io_end = io_start + zio->io_size; - uint64_t min_offset = P2ALIGN(io_start, VCBS); - uint64_t max_offset = P2ROUNDUP(io_end, VCBS); - avl_index_t where; - - ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); - - mutex_enter(&vc->vc_lock); - - ve_search.ve_offset = min_offset; - ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); - - if (ve == NULL) - ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); - - while (ve != NULL && ve->ve_offset < max_offset) { - uint64_t start = MAX(ve->ve_offset, io_start); - uint64_t end = MIN(ve->ve_offset + VCBS, io_end); - - if (ve->ve_fill_io != NULL) { - ve->ve_missed_update = 1; - } else { - abd_copy_off(ve->ve_abd, zio->io_abd, - start - ve->ve_offset, start - io_start, - end - start); - } - ve = AVL_NEXT(&vc->vc_offset_tree, ve); - } - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_purge(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve; - - mutex_enter(&vc->vc_lock); - while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) - vdev_cache_evict(vc, ve); - mutex_exit(&vc->vc_lock); -} - -void -vdev_cache_init(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); - - avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_offset_node)); - - avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, - sizeof (vdev_cache_entry_t), - offsetof(struct vdev_cache_entry, ve_lastused_node)); -} - -void -vdev_cache_fini(vdev_t *vd) -{ - vdev_cache_t *vc = &vd->vdev_cache; - - vdev_cache_purge(vd); - - avl_destroy(&vc->vc_offset_tree); - avl_destroy(&vc->vc_lastused_tree); - - mutex_destroy(&vc->vc_lock); -} - -void -vdev_cache_stat_init(void) -{ - vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", - KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - if (vdc_ksp != NULL) { - vdc_ksp->ks_data = &vdc_stats; - kstat_install(vdc_ksp); - } -} - -void -vdev_cache_stat_fini(void) -{ - if (vdc_ksp != NULL) { - kstat_delete(vdc_ksp); - vdc_ksp = NULL; - } -} - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_max, UINT, 
ZMOD_RW, - "Inflate reads small than max"); - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_size, UINT, ZMOD_RD, - "Total size of the per-disk cache"); - -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_bshift, UINT, ZMOD_RW, - "Shift size to inflate reads too"); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 1a75d68abd9e..abb7d0662b8c 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -748,8 +748,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, abd, size, first->io_type, zio->io_priority, - flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, - vdev_queue_agg_io_done, NULL); + flags | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); aio->io_timestamp = first->io_timestamp; nio = first; @@ -907,7 +906,7 @@ vdev_queue_io(zio_t *zio) ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM); } - zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; + zio->io_flags |= ZIO_FLAG_DONT_QUEUE; zio->io_timestamp = gethrtime(); mutex_enter(&vq->vq_lock); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index c17ca5e1d651..d7b2217623e6 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -1617,12 +1617,6 @@ zio_read_bp_init(zio_t *zio) ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); } - if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - - if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) - zio->io_flags |= ZIO_FLAG_DONT_CACHE; - if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_pipeline = ZIO_DDT_READ_PIPELINE; @@ -3955,9 +3949,6 @@ zio_vdev_io_start(zio_t *zio) zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) - return (zio); - if ((zio = vdev_queue_io(zio)) == NULL) return (NULL); @@ -3994,9 +3985,6 @@ zio_vdev_io_done(zio_t *zio) vd->vdev_ops != &vdev_draid_spare_ops) { vdev_queue_io_done(zio); - if (zio->io_type == ZIO_TYPE_WRITE) - vdev_cache_write(zio); - if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_device_injections(vd, zio, EIO, EILSEQ); @@ -4106,8 +4094,7 @@ zio_vdev_io_assess(zio_t *zio) ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ zio->io_error = 0; - zio->io_flags |= ZIO_FLAG_IO_RETRY | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; + zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_AGGREGATE; zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, zio_requeue_io_start_cut_in_line); From feff9dfed3df1bbae5dd74959a6ad87d11f27ffb Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Sat, 10 Jun 2023 02:05:47 +0200 Subject: [PATCH 155/180] Fix the L2ARC write size calculating logic (2) While commit bcd5321 adjusts the write size based on the size of the log block, this happens after comparing the unadjusted write size to the evicted (target) size. In this case l2ad_hand will exceed l2ad_evict and violate an assertion at the end of l2arc_write_buffers(). Fix this by adding the max log block size to the allocated size of the buffer to be committed before comparing the result to the target size. Also reset the l2arc_trim_ahead ZFS module variable when the adjusted write size exceeds the size of the L2ARC device. 
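For clarity, a condensed sketch of the adjusted check described above (a simplification, not a literal excerpt; the names follow the l2arc_write_buffers() hunk below):

	/*
	 * Account for the worst-case log block before comparing against the
	 * evicted target size: if the buffer alone would still fit but the
	 * buffer plus a maximum-size log block would not, terminate the run
	 * now so that l2ad_hand can never pass l2ad_evict.
	 */
	if (write_asize + asize + sizeof (l2arc_log_blk_phys_t) > target_sz) {
		full = B_TRUE;	/* stop committing buffers for this run */
		break;
	}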
Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #14936 Closes #14954 --- module/zfs/arc.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 3dbaaa76b4a5..a23715309f2b 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8206,7 +8206,7 @@ l2arc_write_size(l2arc_dev_t *dev) * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ - if (size >= dev->l2ad_end - dev->l2ad_start) { + if (size > dev->l2ad_end - dev->l2ad_start) { cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " "plus the overhead of log blocks (persistent L2ARC, " "%llu bytes) exceeds the size of the cache device " @@ -8216,6 +8216,11 @@ l2arc_write_size(l2arc_dev_t *dev) size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; + if (l2arc_trim_ahead > 1) { + cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1"); + l2arc_trim_ahead = 1; + } + if (arc_warm == B_FALSE) size += l2arc_write_boost; @@ -8842,7 +8847,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) top: rerun = B_FALSE; - if (dev->l2ad_hand >= (dev->l2ad_end - distance)) { + if (dev->l2ad_hand + distance > dev->l2ad_end) { /* * When there is no space to accommodate upcoming writes, * evict to the end. Then bump the write and evict hands @@ -9036,7 +9041,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) */ ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end); if (!dev->l2ad_first) - ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict); + ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); } } @@ -9296,7 +9301,13 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); - if ((write_asize + asize) > target_sz) { + /* + * If the allocated size of this buffer plus the max + * size for the pending log block exceeds the evicted + * target size, terminate writing buffers for this run. + */ + if (write_asize + asize + + sizeof (l2arc_log_blk_phys_t) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); break; @@ -9412,7 +9423,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) */ if (l2arc_log_blk_insert(dev, hdr)) { /* - * l2ad_hand has been accounted for in + * l2ad_hand will be adjusted in * l2arc_log_blk_commit(). */ write_asize += From 8af1104f83eb44501b83218ed456e2d4b0ac3521 Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Wed, 14 Jun 2023 17:01:17 +0200 Subject: [PATCH 156/180] Store the L2ARC device ashift in the vdev label If this is not done, and the pool has an ashift other than the default (at the moment 9) then the following happens: 1) vdev_alloc() assigns the ashift of the pool to L2ARC device, but upon export it is not stored anywhere 2) at the first import, vdev_open() sees an vdev_ashift() of 0 and assigns the logical_ashift, which is 9 3) reading the contents of L2ARC, including the header fails 4) L2ARC buffers are not restored in ARC. 
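The remedy is essentially the one-line change from the vdev_label.c hunk below, shown here with a clarifying comment:

	/*
	 * When generating the label config for a cache (L2ARC) vdev, persist
	 * its ashift so that the first vdev_open() after an export/import
	 * cycle does not fall back to the logical ashift.
	 */
	if (flags & VDEV_CONFIG_L2CACHE)
		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);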
Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #14313 Closes #14963 --- module/zfs/vdev_label.c | 3 +++ .../l2arc/persist_l2arc_001_pos.ksh | 19 ++++++++----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 85c7134ca4c4..a5c76808f2d2 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -486,6 +486,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_isspare) fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); + if (flags & VDEV_CONFIG_L2CACHE) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && vd == vd->vdev_top) { fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, diff --git a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh index 6f7b9aff7c38..a9968723c3ca 100755 --- a/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/l2arc/persist_l2arc_001_pos.ksh @@ -27,15 +27,14 @@ # # STRATEGY: # 1. Create pool with a cache device. -# 2. Export and re-import pool without writing any data. -# 3. Create a random file in that pool and random read for 10 sec. -# 4. Export pool. -# 5. Read the amount of log blocks written from the header of the +# 2. Create a random file in that pool and random read for 10 sec. +# 3. Export pool. +# 4. Read the amount of log blocks written from the header of the # L2ARC device. -# 6. Import pool. -# 7. Read the amount of log blocks rebuilt in arcstats and compare to +# 5. Import pool. +# 6. Read the amount of log blocks rebuilt in arcstats and compare to # (5). -# 8. Check if the labels of the L2ARC device are intact. +# 7. Check if the labels of the L2ARC device are intact. # # * We can predict the minimum bytes of L2ARC restored if we subtract # from the effective size of the cache device the bytes l2arc_evict() @@ -77,10 +76,8 @@ export FILE_SIZE=$(( floor($fill_mb / $NUMJOBS) ))M log_must truncate -s ${cache_sz}M $VDEV_CACHE -log_must zpool create -f $TESTPOOL $VDEV cache $VDEV_CACHE - -log_must zpool export $TESTPOOL -log_must zpool import -d $VDIR $TESTPOOL +log_must zpool create -f -o ashift=12 $TESTPOOL $VDEV +log_must zpool add $TESTPOOL cache $VDEV_CACHE log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio From d057807ede05ce809e9ba1e2b47b12ada0d3b2ed Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 14 Jun 2023 11:02:27 -0400 Subject: [PATCH 157/180] Switch refcount tracking from lists to AVL-trees. With large number of tracked references list searches under the lock become too expensive, creating enormous lock contention. On my tests with ZFS_DEBUG enabled this increases write throughput with 32KB blocks from ~1.2GB/s to ~7.5GB/s. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #14970 --- include/sys/zfs_refcount.h | 16 ++-- module/zfs/refcount.c | 187 +++++++++++++++++++------------------ 2 files changed, 108 insertions(+), 95 deletions(-) diff --git a/include/sys/zfs_refcount.h b/include/sys/zfs_refcount.h index 4efa266a53c5..77965a0aa580 100644 --- a/include/sys/zfs_refcount.h +++ b/include/sys/zfs_refcount.h @@ -27,6 +27,7 @@ #define _SYS_ZFS_REFCOUNT_H #include +#include #include #include @@ -43,19 +44,22 @@ extern "C" { #ifdef ZFS_DEBUG typedef struct reference { - list_node_t ref_link; + union { + avl_node_t a; + list_node_t l; + } ref_link; const void *ref_holder; uint64_t ref_number; - uint8_t *ref_removed; + boolean_t ref_search; } reference_t; typedef struct refcount { + uint64_t rc_count; kmutex_t rc_mtx; - boolean_t rc_tracked; - list_t rc_list; + avl_tree_t rc_tree; list_t rc_removed; - uint64_t rc_count; - uint64_t rc_removed_count; + uint_t rc_removed_count; + boolean_t rc_tracked; } zfs_refcount_t; /* diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c index 601d27f8c47a..718bbb34a8d5 100644 --- a/module/zfs/refcount.c +++ b/module/zfs/refcount.c @@ -36,33 +36,40 @@ int reference_tracking_enable = B_FALSE; static uint_t reference_history = 3; /* tunable */ static kmem_cache_t *reference_cache; -static kmem_cache_t *reference_history_cache; void zfs_refcount_init(void) { reference_cache = kmem_cache_create("reference_cache", sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); - - reference_history_cache = kmem_cache_create("reference_history_cache", - sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); } void zfs_refcount_fini(void) { kmem_cache_destroy(reference_cache); - kmem_cache_destroy(reference_history_cache); +} + +static int +zfs_refcount_compare(const void *x1, const void *x2) +{ + const reference_t *r1 = (const reference_t *)x1; + const reference_t *r2 = (const reference_t *)x2; + + int cmp1 = TREE_CMP(r1->ref_holder, r2->ref_holder); + int cmp2 = TREE_CMP(r1->ref_number, r2->ref_number); + int cmp = cmp1 ? cmp1 : cmp2; + return ((cmp || r1->ref_search) ? 
cmp : TREE_PCMP(r1, r2)); } void zfs_refcount_create(zfs_refcount_t *rc) { mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); - list_create(&rc->rc_list, sizeof (reference_t), - offsetof(reference_t, ref_link)); + avl_create(&rc->rc_tree, zfs_refcount_compare, sizeof (reference_t), + offsetof(reference_t, ref_link.a)); list_create(&rc->rc_removed, sizeof (reference_t), - offsetof(reference_t, ref_link)); + offsetof(reference_t, ref_link.l)); rc->rc_count = 0; rc->rc_removed_count = 0; rc->rc_tracked = reference_tracking_enable; @@ -86,16 +93,15 @@ void zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) { reference_t *ref; + void *cookie = NULL; ASSERT3U(rc->rc_count, ==, number); - while ((ref = list_remove_head(&rc->rc_list))) + while ((ref = avl_destroy_nodes(&rc->rc_tree, &cookie)) != NULL) kmem_cache_free(reference_cache, ref); - list_destroy(&rc->rc_list); + avl_destroy(&rc->rc_tree); - while ((ref = list_remove_head(&rc->rc_removed))) { - kmem_cache_free(reference_history_cache, ref->ref_removed); + while ((ref = list_remove_head(&rc->rc_removed))) kmem_cache_free(reference_cache, ref); - } list_destroy(&rc->rc_removed); mutex_destroy(&rc->rc_mtx); } @@ -121,10 +127,10 @@ zfs_refcount_count(zfs_refcount_t *rc) int64_t zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { - reference_t *ref = NULL; + reference_t *ref; int64_t count; - if (!rc->rc_tracked) { + if (likely(!rc->rc_tracked)) { count = atomic_add_64_nv(&(rc)->rc_count, number); ASSERT3U(count, >=, number); return (count); @@ -133,8 +139,9 @@ zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) ref = kmem_cache_alloc(reference_cache, KM_SLEEP); ref->ref_holder = holder; ref->ref_number = number; + ref->ref_search = B_FALSE; mutex_enter(&rc->rc_mtx); - list_insert_head(&rc->rc_list, ref); + avl_add(&rc->rc_tree, ref); rc->rc_count += number; count = rc->rc_count; mutex_exit(&rc->rc_mtx); @@ -151,7 +158,7 @@ zfs_refcount_add(zfs_refcount_t *rc, const void *holder) void zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder) { - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) (void) zfs_refcount_add_many(rc, number, holder); else for (; number > 0; number--) (void) zfs_refcount_add(rc, holder); @@ -161,47 +168,42 @@ int64_t zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, const void *holder) { - reference_t *ref; + reference_t *ref, s; int64_t count; - if (!rc->rc_tracked) { + if (likely(!rc->rc_tracked)) { count = atomic_add_64_nv(&(rc)->rc_count, -number); ASSERT3S(count, >=, 0); return (count); } + s.ref_holder = holder; + s.ref_number = number; + s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); ASSERT3U(rc->rc_count, >=, number); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder && ref->ref_number == number) { - list_remove(&rc->rc_list, ref); - if (reference_history > 0) { - ref->ref_removed = - kmem_cache_alloc(reference_history_cache, - KM_SLEEP); - list_insert_head(&rc->rc_removed, ref); - rc->rc_removed_count++; - if (rc->rc_removed_count > reference_history) { - ref = list_tail(&rc->rc_removed); - list_remove(&rc->rc_removed, ref); - kmem_cache_free(reference_history_cache, - ref->ref_removed); - kmem_cache_free(reference_cache, ref); - rc->rc_removed_count--; - } - } else { - kmem_cache_free(reference_cache, ref); - } - rc->rc_count -= number; - count = rc->rc_count; - mutex_exit(&rc->rc_mtx); - return (count); + ref = avl_find(&rc->rc_tree, &s, 
NULL); + if (unlikely(ref == NULL)) { + panic("No such hold %p on refcount %llx", holder, + (u_longlong_t)(uintptr_t)rc); + return (-1); + } + avl_remove(&rc->rc_tree, ref); + if (reference_history > 0) { + list_insert_head(&rc->rc_removed, ref); + if (rc->rc_removed_count >= reference_history) { + ref = list_remove_tail(&rc->rc_removed); + kmem_cache_free(reference_cache, ref); + } else { + rc->rc_removed_count++; } + } else { + kmem_cache_free(reference_cache, ref); } - panic("No such hold %p on refcount %llx", holder, - (u_longlong_t)(uintptr_t)rc); - return (-1); + rc->rc_count -= number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + return (count); } int64_t @@ -213,7 +215,7 @@ zfs_refcount_remove(zfs_refcount_t *rc, const void *holder) void zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder) { - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) (void) zfs_refcount_remove_many(rc, number, holder); else for (; number > 0; number--) (void) zfs_refcount_remove(rc, holder); @@ -222,31 +224,38 @@ zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder) void zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) { - int64_t count, removed_count; - list_t list, removed; + avl_tree_t tree; + list_t removed; + reference_t *ref; + void *cookie = NULL; + uint64_t count; + uint_t removed_count; - list_create(&list, sizeof (reference_t), - offsetof(reference_t, ref_link)); + avl_create(&tree, zfs_refcount_compare, sizeof (reference_t), + offsetof(reference_t, ref_link.a)); list_create(&removed, sizeof (reference_t), - offsetof(reference_t, ref_link)); + offsetof(reference_t, ref_link.l)); mutex_enter(&src->rc_mtx); count = src->rc_count; removed_count = src->rc_removed_count; src->rc_count = 0; src->rc_removed_count = 0; - list_move_tail(&list, &src->rc_list); + avl_swap(&tree, &src->rc_tree); list_move_tail(&removed, &src->rc_removed); mutex_exit(&src->rc_mtx); mutex_enter(&dst->rc_mtx); dst->rc_count += count; dst->rc_removed_count += removed_count; - list_move_tail(&dst->rc_list, &list); + if (avl_is_empty(&dst->rc_tree)) + avl_swap(&dst->rc_tree, &tree); + else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL) + avl_add(&dst->rc_tree, ref); list_move_tail(&dst->rc_removed, &removed); mutex_exit(&dst->rc_mtx); - list_destroy(&list); + avl_destroy(&tree); list_destroy(&removed); } @@ -254,23 +263,19 @@ void zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number, const void *current_holder, const void *new_holder) { - reference_t *ref; - boolean_t found = B_FALSE; + reference_t *ref, s; - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) return; + s.ref_holder = current_holder; + s.ref_number = number; + s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == current_holder && - ref->ref_number == number) { - ref->ref_holder = new_holder; - found = B_TRUE; - break; - } - } - ASSERT(found); + ref = avl_find(&rc->rc_tree, &s, NULL); + ASSERT(ref); + ref->ref_holder = new_holder; + avl_update(&rc->rc_tree, ref); mutex_exit(&rc->rc_mtx); } @@ -290,21 +295,23 @@ zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder, boolean_t zfs_refcount_held(zfs_refcount_t *rc, const void *holder) { - reference_t *ref; + reference_t *ref, s; + avl_index_t idx; + boolean_t res; - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) return (zfs_refcount_count(rc) > 0); + s.ref_holder = holder; + 
s.ref_number = 0; + s.ref_search = B_TRUE; mutex_enter(&rc->rc_mtx); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder) { - mutex_exit(&rc->rc_mtx); - return (B_TRUE); - } - } + ref = avl_find(&rc->rc_tree, &s, &idx); + if (likely(ref == NULL)) + ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); + res = ref && ref->ref_holder == holder; mutex_exit(&rc->rc_mtx); - return (B_FALSE); + return (res); } /* @@ -315,21 +322,23 @@ zfs_refcount_held(zfs_refcount_t *rc, const void *holder) boolean_t zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder) { - reference_t *ref; + reference_t *ref, s; + avl_index_t idx; + boolean_t res; - if (!rc->rc_tracked) + if (likely(!rc->rc_tracked)) return (B_TRUE); mutex_enter(&rc->rc_mtx); - for (ref = list_head(&rc->rc_list); ref; - ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == holder) { - mutex_exit(&rc->rc_mtx); - return (B_FALSE); - } - } + s.ref_holder = holder; + s.ref_number = 0; + s.ref_search = B_TRUE; + ref = avl_find(&rc->rc_tree, &s, &idx); + if (likely(ref == NULL)) + ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER); + res = ref == NULL || ref->ref_holder != holder; mutex_exit(&rc->rc_mtx); - return (B_TRUE); + return (res); } EXPORT_SYMBOL(zfs_refcount_create); From e32e326c5b3c6ba4a7632abcf86f394233041148 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Wed, 14 Jun 2023 10:04:05 -0500 Subject: [PATCH 158/180] ZTS: Skip send_raw_ashift on FreeBSD On FreeBSD 14 this test runs slowly in the CI environment and is killed by the 10 minute timeout. Skip the test on FreeBSD until the slow down is resolved. Signed-off-by: Brian Behlendorf Issue #14961 --- tests/test-runner/bin/zts-report.py.in | 1 + tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 9517ce8073a5..cf438e0e6495 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -173,6 +173,7 @@ if sys.platform.startswith('freebsd'): 'link_count/link_count_001': ['SKIP', na_reason], 'casenorm/mixed_create_failure': ['FAIL', 13215], 'mmap/mmap_sync_001_pos': ['SKIP', na_reason], + 'rsend/send_raw_ashift': ['SKIP', 14961], }) elif sys.platform.startswith('linux'): known.update({ diff --git a/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh b/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh index 3cea334495d9..f238c361134f 100755 --- a/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh +++ b/tests/zfs-tests/tests/functional/rsend/send_raw_ashift.ksh @@ -37,6 +37,10 @@ verify_runnable "both" log_assert "Verify raw sending to pools with greater ashift succeeds" +if is_freebsd; then + log_unsupported "Runs too long on FreeBSD 14 (Issue #14961)" +fi + function cleanup { rm -f $BACKDIR/fs@* From ccec7fbe1c66c5b63a3af9d152403ce43344f4ab Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Thu, 15 Jun 2023 13:49:03 -0400 Subject: [PATCH 159/180] Remove ARC/ZIO physdone callbacks. Those callbacks were introduced many years ago as part of a bigger patch to smoothen the write throttling within a txg. They allow to account completion of individual physical writes within a logical one, improving cases when some of physical writes complete much sooner than others, gradually opening the write throttle. 
Few years after that ZFS got allocation throttling, working on a level of logical writes and limiting number of writes queued to vdevs at any point, and so limiting latency distribution between the physical writes and especially writes of multiple copies. The addition of scheduling deadline I proposed in #14925 should further reduce the latency distribution. Grown memory sizes over the past 10 years should also reduce importance of the smoothing. While the use of physdone callback may still in theory provide some smoother throttling, there are cases where we simply can not afford it. Since dirty data accounting is protected by pool-wide lock, in case of 6-wide RAIDZ, for example, it requires us to take it 8 times per logical block write, creating huge lock contention. My tests of this patch show radical reduction of the lock spinning time on workloads when smaller blocks are written to RAIDZ pools, when each of the disks receives 8-16KB chunks, but the total rate reaching 100K+ blocks per second. Same time attempts to measure any write time fluctuations didn't show anything noticeable. While there, remove also io_child_count/io_parent_count counters. They are used only for couple assertions that can be avoided. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14948 --- include/sys/arc.h | 5 +-- include/sys/arc_impl.h | 1 - include/sys/zio.h | 9 +--- module/zfs/arc.c | 22 ++-------- module/zfs/dbuf.c | 94 ++++------------------------------------- module/zfs/dmu.c | 4 +- module/zfs/dmu_objset.c | 2 +- module/zfs/zio.c | 32 +++----------- 8 files changed, 26 insertions(+), 143 deletions(-) diff --git a/include/sys/arc.h b/include/sys/arc.h index 836ed679dbac..9d67dab06ca3 100644 --- a/include/sys/arc.h +++ b/include/sys/arc.h @@ -304,9 +304,8 @@ int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, arc_write_done_func_t *child_ready, - arc_write_done_func_t *physdone, arc_write_done_func_t *done, - void *priv, zio_priority_t priority, int zio_flags, - const zbookmark_phys_t *zb); + arc_write_done_func_t *done, void *priv, zio_priority_t priority, + int zio_flags, const zbookmark_phys_t *zb); arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv); void arc_remove_prune_callback(arc_prune_t *p); diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index fd24d2f3c8bd..78774792f367 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -123,7 +123,6 @@ struct arc_write_callback { void *awcb_private; arc_write_done_func_t *awcb_ready; arc_write_done_func_t *awcb_children_ready; - arc_write_done_func_t *awcb_physdone; arc_write_done_func_t *awcb_done; arc_buf_t *awcb_buf; }; diff --git a/include/sys/zio.h b/include/sys/zio.h index 6b1352a72b9a..ec32211f6906 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -460,7 +460,6 @@ struct zio { /* Callback info */ zio_done_func_t *io_ready; zio_done_func_t *io_children_ready; - zio_done_func_t *io_physdone; zio_done_func_t *io_done; void *io_private; int64_t io_prev_space_delta; /* DMU private */ @@ -503,9 +502,6 @@ struct zio { int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; - uint64_t io_child_count; - uint64_t io_phys_children; - uint64_t io_parent_count; uint64_t *io_stall; zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; 
@@ -553,9 +549,8 @@ extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, - zio_done_func_t *physdone, zio_done_func_t *done, - void *priv, zio_priority_t priority, zio_flag_t flags, - const zbookmark_phys_t *zb); + zio_done_func_t *done, void *priv, zio_priority_t priority, + zio_flag_t flags, const zbookmark_phys_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, struct abd *data, uint64_t size, zio_done_func_t *done, void *priv, diff --git a/module/zfs/arc.c b/module/zfs/arc.c index a23715309f2b..7023f448182a 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -6675,18 +6675,6 @@ arc_write_children_ready(zio_t *zio) callback->awcb_children_ready(zio, buf, callback->awcb_private); } -/* - * The SPA calls this callback for each physical write that happens on behalf - * of a logical write. See the comment in dbuf_write_physdone() for details. - */ -static void -arc_write_physdone(zio_t *zio) -{ - arc_write_callback_t *cb = zio->io_private; - if (cb->awcb_physdone != NULL) - cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); -} - static void arc_write_done(zio_t *zio) { @@ -6776,9 +6764,9 @@ zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t uncached, boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, - arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, - arc_write_done_func_t *done, void *private, zio_priority_t priority, - int zio_flags, const zbookmark_phys_t *zb) + arc_write_done_func_t *children_ready, arc_write_done_func_t *done, + void *private, zio_priority_t priority, int zio_flags, + const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; @@ -6825,7 +6813,6 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; - callback->awcb_physdone = physdone; callback->awcb_done = done; callback->awcb_private = private; callback->awcb_buf = buf; @@ -6862,8 +6849,7 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, (children_ready != NULL) ? arc_write_children_ready : NULL, - arc_write_physdone, arc_write_done, callback, - priority, zio_flags, zb); + arc_write_done, callback, priority, zio_flags, zb); return (zio); } diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 272e712586fa..1ea075217fb1 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -4369,22 +4369,6 @@ dbuf_lightweight_ready(zio_t *zio) rw_exit(&parent_db->db_rwlock); } -static void -dbuf_lightweight_physdone(zio_t *zio) -{ - dbuf_dirty_record_t *dr = zio->io_private; - dsl_pool_t *dp = spa_get_dsl(zio->io_spa); - ASSERT3U(dr->dr_txg, ==, zio->io_txg); - - /* - * The callback will be called io_phys_children times. Retire one - * portion of our dirty space each time we are called. Any rounding - * error will be cleaned up by dbuf_lightweight_done(). 
- */ - int delta = dr->dr_accounted / zio->io_phys_children; - dsl_pool_undirty_space(dp, delta, zio->io_txg); -} - static void dbuf_lightweight_done(zio_t *zio) { @@ -4403,16 +4387,8 @@ dbuf_lightweight_done(zio_t *zio) dsl_dataset_block_born(ds, zio->io_bp, tx); } - /* - * See comment in dbuf_write_done(). - */ - if (zio->io_phys_children == 0) { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted, zio->io_txg); - } else { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted % zio->io_phys_children, zio->io_txg); - } + dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, + zio->io_txg); abd_free(dr->dt.dll.dr_abd); kmem_free(dr, sizeof (*dr)); @@ -4446,8 +4422,7 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd, dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd), &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL, - dbuf_lightweight_physdone, dbuf_lightweight_done, dr, - ZIO_PRIORITY_ASYNC_WRITE, + dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb); zio_nowait(dr->dr_zio); @@ -4789,37 +4764,6 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb) DB_DNODE_EXIT(db); } -/* - * The SPA will call this callback several times for each zio - once - * for every physical child i/o (zio->io_phys_children times). This - * allows the DMU to monitor the progress of each logical i/o. For example, - * there may be 2 copies of an indirect block, or many fragments of a RAID-Z - * block. There may be a long delay before all copies/fragments are completed, - * so this callback allows us to retire dirty space gradually, as the physical - * i/os complete. - */ -static void -dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) -{ - (void) buf; - dmu_buf_impl_t *db = arg; - objset_t *os = db->db_objset; - dsl_pool_t *dp = dmu_objset_pool(os); - dbuf_dirty_record_t *dr; - int delta = 0; - - dr = db->db_data_pending; - ASSERT3U(dr->dr_txg, ==, zio->io_txg); - - /* - * The callback will be called io_phys_children times. Retire one - * portion of our dirty space each time we are called. Any rounding - * error will be cleaned up by dbuf_write_done(). - */ - delta = dr->dr_accounted / zio->io_phys_children; - dsl_pool_undirty_space(dp, delta, zio->io_txg); -} - static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { @@ -4894,27 +4838,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); - /* - * If we didn't do a physical write in this ZIO and we - * still ended up here, it means that the space of the - * dbuf that we just released (and undirtied) above hasn't - * been marked as undirtied in the pool's accounting. - * - * Thus, we undirty that space in the pool's view of the - * world here. For physical writes this type of update - * happens in dbuf_write_physdone(). - * - * If we did a physical write, cleanup any rounding errors - * that came up due to writing multiple copies of a block - * on disk [see dbuf_write_physdone()]. 
- */ - if (zio->io_phys_children == 0) { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted, zio->io_txg); - } else { - dsl_pool_undirty_space(dmu_objset_pool(os), - dr->dr_accounted % zio->io_phys_children, zio->io_txg); - } + dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted, + zio->io_txg); kmem_free(dr, sizeof (dbuf_dirty_record_t)); } @@ -5162,7 +5087,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, contents, db->db.db_size, db->db.db_size, &zp, - dbuf_write_override_ready, NULL, NULL, + dbuf_write_override_ready, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); @@ -5176,7 +5101,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, - dbuf_write_nofill_ready, NULL, NULL, + dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); @@ -5195,9 +5120,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) dr->dr_zio = arc_write(pio, os->os_spa, txg, &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), &zp, dbuf_write_ready, - children_ready_cb, dbuf_write_physdone, - dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_MUSTSUCCEED, &zb); + children_ready_cb, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 8a13b8f410a1..dda869287c78 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1698,7 +1698,7 @@ dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, - dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, + dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); @@ -1864,7 +1864,7 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) zio_nowait(arc_write(pio, os->os_spa, txg, zgd->zgd_bp, dr->dt.dl.dr_data, !DBUF_IS_CACHEABLE(db), dbuf_is_l2cacheable(db), - &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, + &zp, dmu_sync_ready, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 778b18817eef..d134d4958f7c 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1698,7 +1698,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) zio = arc_write(pio, os->os_spa, tx->tx_txg, blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os), - &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, + &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* diff --git a/module/zfs/zio.c b/module/zfs/zio.c index d7b2217623e6..fb8164f0aea9 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -650,9 +650,6 @@ zio_add_child(zio_t *pio, zio_t *cio) list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); - pio->io_child_count++; - cio->io_parent_count++; - mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); } @@ -669,9 
+666,6 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); - pio->io_child_count--; - cio->io_parent_count--; - mutex_exit(&cio->io_lock); mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); @@ -1162,9 +1156,8 @@ zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *children_ready, - zio_done_func_t *physdone, zio_done_func_t *done, - void *private, zio_priority_t priority, zio_flag_t flags, - const zbookmark_phys_t *zb) + zio_done_func_t *done, void *private, zio_priority_t priority, + zio_flag_t flags, const zbookmark_phys_t *zb) { zio_t *zio; @@ -1184,7 +1177,6 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio->io_ready = ready; zio->io_children_ready = children_ready; - zio->io_physdone = physdone; zio->io_prop = *zp; /* @@ -1517,16 +1509,11 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, flags &= ~ZIO_FLAG_IO_ALLOCATING; } - zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, done, private, type, priority, flags, vd, offset, &pio->io_bookmark, ZIO_STAGE_VDEV_IO_START >> 1, pipeline); ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); - zio->io_physdone = pio->io_physdone; - if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) - zio->io_logical->io_phys_children++; - return (zio); } @@ -2711,7 +2698,7 @@ zio_gang_tree_assemble_done(zio_t *zio) blkptr_t *bp = zio->io_bp; ASSERT(gio == zio_unique_parent(zio)); - ASSERT(zio->io_child_count == 0); + ASSERT(list_is_empty(&zio->io_child_list)); if (zio->io_error) return; @@ -2969,7 +2956,7 @@ zio_write_gang_block(zio_t *pio, metaslab_class_t *mc) zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], has_data ? abd_get_offset(pio->io_abd, pio->io_size - resid) : NULL, lsize, lsize, &zp, - zio_write_gang_member_ready, NULL, NULL, + zio_write_gang_member_ready, NULL, zio_write_gang_done, &gn->gn_child[g], pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); @@ -3431,7 +3418,7 @@ zio_ddt_write(zio_t *zio) } else { cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, zio->io_orig_size, zio->io_orig_size, zp, - zio_ddt_child_write_ready, NULL, NULL, + zio_ddt_child_write_ready, NULL, zio_ddt_child_write_done, dde, zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); @@ -4134,13 +4121,6 @@ zio_vdev_io_assess(zio_t *zio) if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - if (vd != NULL && vd->vdev_ops->vdev_op_leaf && - zio->io_physdone != NULL) { - ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); - ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); - zio->io_physdone(zio->io_logical); - } - return (zio); } @@ -4890,7 +4870,7 @@ zio_done(zio_t *zio) return (NULL); } - ASSERT(zio->io_child_count == 0); + ASSERT(list_is_empty(&zio->io_child_list)); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); From 10e36e17612ba9c634b140ae76847bb62b5be68f Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Thu, 15 Jun 2023 21:45:36 +0200 Subject: [PATCH 160/180] Shorten arcstat_quiescence sleep time With the latest L2ARC fixes, 2 seconds is too long to wait for quiescence of arcstats like l2_size. Shorten this interval to avoid having the persistent L2ARC tests in ZTS prematurely terminated. 
Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #14981 --- tests/zfs-tests/include/libtest.shlib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index 133f8387ddaf..844caa17d8ed 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -3706,7 +3706,7 @@ function arcstat_quiescence # stat echo while $do_once || [ $stat1 -ne $stat2 ] || [ $stat2 -eq 0 ]; do typeset stat1=$(get_arcstat $stat) - sleep 2 + sleep 0.5 typeset stat2=$(get_arcstat $stat) do_once=false done From 8e8acabdcaeb831c777f71361722f4235b698a8d Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Sat, 17 Jun 2023 22:51:37 -0400 Subject: [PATCH 161/180] Fix memory leak in zil_parse(). 482da24e2 missed arc_buf_destroy() calls on log parse errors, possibly leaking up to 128KB of memory per dataset during ZIL replay. Reviewed-by: Brian Behlendorf Reviewed-by: Paul Dagnelie Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14987 --- module/zfs/zil.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 8c1fe5f66838..ee8dcce3b361 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -522,12 +522,16 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); - if (lr->lrc_seq > claim_lr_seq) + if (lr->lrc_seq > claim_lr_seq) { + arc_buf_destroy(abuf, &abuf); goto done; + } error = parse_lr_func(zilog, lr, arg, txg); - if (error != 0) + if (error != 0) { + arc_buf_destroy(abuf, &abuf); goto done; + } ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; lr_count++; From 35a6247c5fe788aa77e0b3c7e8010fedb9e60eb5 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Mon, 26 Jun 2023 16:57:12 -0400 Subject: [PATCH 162/180] Add a delay to tearing down threads. It's been observed that in certain workloads (zvol-related being a big one), ZFS will end up spending a large amount of time spinning up taskqs only to tear them down again almost immediately, then spin them up again... I noticed this when I looked at what my mostly-idle system was doing and wondered how on earth taskq creation/destroy was a bunch of time... So I added a configurable delay to avoid it tearing down tasks the first time it notices them idle, and the total number of threads at steady state went up, but the amount of time being burned just tearing down/turning up new ones almost vanished. 
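A condensed sketch of the idle grace period this adds (simplified from the spl-taskq.c hunk below; the spl_taskq_thread_timeout_ms == 0 and taskq-teardown special cases are omitted here):

	/*
	 * Instead of stopping a dynamic worker the first time it finds no
	 * work, remember when it first went idle and only let it exit once
	 * it has stayed idle for spl_taskq_thread_timeout_ms.
	 */
	if (no_work) {
		if (tq->lastshouldstop == 0)
			tq->lastshouldstop = jiffies;	/* first idle tick */
		else if (time_after(jiffies, tq->lastshouldstop +
		    msecs_to_jiffies(spl_taskq_thread_timeout_ms)))
			return (1);			/* idle long enough, stop */
	} else {
		tq->lastshouldstop = 0;			/* busy again, reset */
	}
	return (0);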
Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #14938 --- include/os/linux/spl/sys/taskq.h | 1 + man/man4/spl.4 | 15 ++++++++++++++ module/os/linux/spl/spl-taskq.c | 34 +++++++++++++++++++++++++++++++- 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index 2a6cd8283d16..6c1b4377a98a 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -104,6 +104,7 @@ typedef struct taskq { /* list node for the cpu hotplug callback */ struct hlist_node tq_hp_cb_node; boolean_t tq_hp_support; + unsigned long lastshouldstop; /* when to purge dynamic */ } taskq_t; typedef struct taskq_ent { diff --git a/man/man4/spl.4 b/man/man4/spl.4 index 02efaf16dc3a..82455fb53254 100644 --- a/man/man4/spl.4 +++ b/man/man4/spl.4 @@ -193,4 +193,19 @@ The proc file will walk the lists with lock held, reading it could cause a lock-up if the list grow too large without limiting the output. "(truncated)" will be shown if the list is larger than the limit. +. +.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 10000 Pq uint +(Linux-only) +How long a taskq has to have had no work before we tear it down. +Previously, we would tear down a dynamic taskq worker as soon +as we noticed it had no work, but it was observed that this led +to a lot of churn in tearing down things we then immediately +spawned anew. +In practice, it seems any nonzero value will remove the vast +majority of this churn, while the nontrivially larger value +was chosen to help filter out the little remaining churn on +a mostly idle system. +Setting this value to +.Sy 0 +will revert to the previous behavior. .El diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index 84497359ce2e..d18f935b167c 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -36,6 +36,12 @@ static int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); +static uint_t spl_taskq_thread_timeout_ms = 10000; +/* BEGIN CSTYLED */ +module_param(spl_taskq_thread_timeout_ms, uint, 0644); +/* END CSTYLED */ +MODULE_PARM_DESC(spl_taskq_thread_timeout_ms, + "Time to require a dynamic thread be idle before it gets cleaned up"); static int spl_taskq_thread_dynamic = 1; module_param(spl_taskq_thread_dynamic, int, 0444); @@ -848,12 +854,37 @@ taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) tqt_thread_list) == tqt) return (0); - return + int no_work = ((tq->tq_nspawn == 0) && /* No threads are being spawned */ (tq->tq_nactive == 0) && /* No threads are handling tasks */ (tq->tq_nthreads > 1) && /* More than 1 thread is running */ (!taskq_next_ent(tq)) && /* There are no pending tasks */ (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */ + + /* + * If we would have said stop before, let's instead wait a bit, maybe + * we'll see more work come our way soon... + */ + if (no_work) { + /* if it's 0, we want the old behavior. */ + /* if the taskq is being torn down, we also want to go away. 
*/ + if (spl_taskq_thread_timeout_ms == 0 || + !(tq->tq_flags & TASKQ_ACTIVE)) + return (1); + unsigned long lasttime = tq->lastshouldstop; + if (lasttime > 0) { + if (time_after(jiffies, lasttime + + msecs_to_jiffies(spl_taskq_thread_timeout_ms))) + return (1); + else + return (0); + } else { + tq->lastshouldstop = jiffies; + } + } else { + tq->lastshouldstop = 0; + } + return (0); } static int @@ -1091,6 +1122,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri, tq->tq_flags = (flags | TASKQ_ACTIVE); tq->tq_next_id = TASKQID_INITIAL; tq->tq_lowest_id = TASKQID_INITIAL; + tq->lastshouldstop = 0; INIT_LIST_HEAD(&tq->tq_free_list); INIT_LIST_HEAD(&tq->tq_pend_list); INIT_LIST_HEAD(&tq->tq_prio_list); From 8469b5aac0cee4f0e8b13018c3e83129554a6945 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 27 Jun 2023 12:09:48 -0400 Subject: [PATCH 163/180] Another set of vdev queue optimizations. Switch FIFO queues (SYNC/TRIM) and active queue of vdev queue from time-sorted AVL-trees to simple lists. AVL-trees are too expensive for such a simple task. To change I/O priority without searching through the trees, add io_queue_state field to struct zio. To not check number of queued I/Os for each priority add vq_cqueued bitmap to struct vdev_queue. Update it when adding/removing I/Os. Make vq_cactive a separate array instead of struct vdev_queue_class member. Together those allow to avoid lots of cache misses when looking for work in vdev_queue_class_to_issue(). Introduce deadline of ~0.5s for LBA-sorted queues. Before this I saw some I/Os waiting in a queue for up to 8 seconds and possibly more due to starvation. With this change I no longer see it. I had to slightly more complicate the comparison function, but since it uses all the same cache lines the difference is minimal. For a sequential I/Os the new code in vdev_queue_io_to_issue() actually often uses more simple avl_first(), falling back to avl_find() and avl_nearest() only when needed. Arrange members in struct zio to access only one cache line when searching through vdev queues. While there, remove io_alloc_node, reusing the io_queue_node instead. Those two are never used same time. Remove zfs_vdev_aggregate_trim parameter. It was disabled for 4 years since implemented, while still wasted time maintaining the offset-sorted tree of TRIM requests. Just remove the tree. Remove locking from txg_all_lists_empty(). It is racy by design, while 2 pair of locks/unlocks take noticeable time under the vdev queue lock. With these changes in my tests with volblocksize=4KB I measure vdev queue lock spin time reduction by 50% on read and 75% on write. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #14925 --- include/sys/vdev.h | 3 +- include/sys/vdev_impl.h | 17 +-- include/sys/zio.h | 15 +- man/man4/zfs.4 | 6 - module/zfs/spa_misc.c | 2 +- module/zfs/txg.c | 13 +- module/zfs/vdev.c | 16 +-- module/zfs/vdev_queue.c | 305 ++++++++++++++++++++++------------------ 8 files changed, 205 insertions(+), 172 deletions(-) diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 26c834ff57cf..03e1f438aaf9 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -164,8 +164,9 @@ extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); -extern int vdev_queue_length(vdev_t *vd); +extern uint32_t vdev_queue_length(vdev_t *vd); extern uint64_t vdev_queue_last_offset(vdev_t *vd); +extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 74b3737d8ee5..2b22b973ba49 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -130,27 +130,24 @@ typedef const struct vdev_ops { /* * Virtual device properties */ -typedef struct vdev_queue_class { - uint32_t vqc_active; - - /* - * Sorted by offset or timestamp, depending on if the queue is - * LBA-ordered vs FIFO. - */ - avl_tree_t vqc_queued_tree; +typedef union vdev_queue_class { + list_t vqc_list; + avl_tree_t vqc_tree; } vdev_queue_class_t; struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; - avl_tree_t vq_active_tree; avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; - avl_tree_t vq_trim_offset_tree; uint64_t vq_last_offset; zio_priority_t vq_last_prio; /* Last sent I/O priority. */ + uint32_t vq_cqueued; /* Classes with queued I/Os. */ + uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE]; + uint32_t vq_active; /* Number of active I/Os. */ uint32_t vq_ia_active; /* Active interactive I/Os. */ uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */ + list_t vq_active_list; /* List of active I/Os. */ hrtime_t vq_io_complete_ts; /* time last i/o completed */ hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ diff --git a/include/sys/zio.h b/include/sys/zio.h index ec32211f6906..85217b873dc8 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -436,6 +436,12 @@ typedef struct zio_link { list_node_t zl_child_node; } zio_link_t; +enum zio_qstate { + ZIO_QS_NONE = 0, + ZIO_QS_QUEUED, + ZIO_QS_ACTIVE, +}; + struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; @@ -479,6 +485,12 @@ struct zio { const zio_vsd_ops_t *io_vsd_ops; metaslab_class_t *io_metaslab_class; /* dva throttle class */ + enum zio_qstate io_queue_state; /* vdev queue state */ + union { + list_node_t l; + avl_node_t a; + } io_queue_node ____cacheline_aligned; /* allocator and vdev queues */ + avl_node_t io_offset_node; /* vdev offset queues */ uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ hrtime_t io_queued_timestamp; @@ -486,9 +498,6 @@ struct zio { hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). 
*/ - avl_node_t io_queue_node; - avl_node_t io_offset_node; - avl_node_t io_alloc_node; zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 5fbd9d7db93f..04bbbc5fdf59 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2016,12 +2016,6 @@ Historical statistics for this many latest TXGs will be available in Flush dirty data to disk at least every this many seconds (maximum TXG duration). . -.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint -Allow TRIM I/O operations to be aggregated. -This is normally not helpful because the extents to be trimmed -will have been already been aggregated by the metaslab. -This option is provided for debugging and performance analysis. -. .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint Max vdev I/O aggregation size. . diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 9ef948e9e434..8dc83445e198 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -730,7 +730,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); diff --git a/module/zfs/txg.c b/module/zfs/txg.c index ec61cabcaab2..a67c043446f5 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -895,15 +895,10 @@ txg_list_destroy(txg_list_t *tl) boolean_t txg_all_lists_empty(txg_list_t *tl) { - mutex_enter(&tl->tl_lock); - for (int i = 0; i < TXG_SIZE; i++) { - if (!txg_list_empty_impl(tl, i)) { - mutex_exit(&tl->tl_lock); - return (B_FALSE); - } - } - mutex_exit(&tl->tl_lock); - return (B_TRUE); + boolean_t res = B_TRUE; + for (int i = 0; i < TXG_SIZE; i++) + res &= (tl->tl_head[i] == NULL); + return (res); } /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 612e66c3a8a8..30551feb6322 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4608,11 +4608,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); - for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { - vsx->vsx_active_queue[t] = - vd->vdev_queue.vq_class[t].vqc_active; - vsx->vsx_pend_queue[t] = avl_numnodes( - &vd->vdev_queue.vq_class[t].vqc_queued_tree); + for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { + vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; + vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } @@ -5470,20 +5468,20 @@ vdev_deadman(vdev_t *vd, const char *tag) vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); - if (avl_numnodes(&vq->vq_active_tree) > 0) { + if (vq->vq_active > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; - zfs_dbgmsg("slow vdev: %s has %lu active IOs", - vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); + zfs_dbgmsg("slow vdev: %s has %u active IOs", + vd->vdev_path, vq->vq_active); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. 
*/ - fio = avl_first(&vq->vq_active_tree); + fio = list_head(&vq->vq_active_list); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index abb7d0662b8c..08d918467d03 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -228,13 +228,6 @@ uint_t zfs_vdev_queue_depth_pct = 300; */ uint_t zfs_vdev_def_queue_depth = 32; -/* - * Allow TRIM I/Os to be aggregated. This should normally not be needed since - * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted - * by the TRIM code in zfs_trim.c. - */ -static uint_t zfs_vdev_aggregate_trim = 0; - static int vdev_queue_offset_compare(const void *x1, const void *x2) { @@ -249,38 +242,60 @@ vdev_queue_offset_compare(const void *x1, const void *x2) return (TREE_PCMP(z1, z2)); } -static inline avl_tree_t * -vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) -{ - return (&vq->vq_class[p].vqc_queued_tree); -} - -static inline avl_tree_t * -vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) -{ - ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM); - if (t == ZIO_TYPE_READ) - return (&vq->vq_read_offset_tree); - else if (t == ZIO_TYPE_WRITE) - return (&vq->vq_write_offset_tree); - else - return (&vq->vq_trim_offset_tree); -} +#define VDQ_T_SHIFT 29 static int -vdev_queue_timestamp_compare(const void *x1, const void *x2) +vdev_queue_to_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); + int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT, + z2->io_timestamp >> VDQ_T_SHIFT); + int ocmp = TREE_CMP(z1->io_offset, z2->io_offset); + int cmp = tcmp ? 
tcmp : ocmp; - if (likely(cmp)) + if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE))) return (cmp); return (TREE_PCMP(z1, z2)); } +static inline boolean_t +vdev_queue_class_fifo(zio_priority_t p) +{ + return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE || + p == ZIO_PRIORITY_TRIM); +} + +static void +vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + vq->vq_cqueued |= 1U << p; + if (vdev_queue_class_fifo(p)) + list_insert_tail(&vq->vq_class[p].vqc_list, zio); + else + avl_add(&vq->vq_class[p].vqc_tree, zio); +} + +static void +vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + uint32_t empty; + if (vdev_queue_class_fifo(p)) { + list_t *list = &vq->vq_class[p].vqc_list; + list_remove(list, zio); + empty = list_is_empty(list); + } else { + avl_tree_t *tree = &vq->vq_class[p].vqc_tree; + avl_remove(tree, zio); + empty = avl_is_empty(tree); + } + vq->vq_cqueued &= ~(empty << p); +} + static uint_t vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { @@ -360,7 +375,7 @@ vdev_queue_max_async_writes(spa_t *spa) } static uint_t -vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) +vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -370,7 +385,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: - return (vdev_queue_max_async_writes(spa)); + return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa)); case ZIO_PRIORITY_SCRUB: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, @@ -414,10 +429,10 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { - spa_t *spa = vq->vq_vdev->vdev_spa; - zio_priority_t p, n; + uint32_t cq = vq->vq_cqueued; + zio_priority_t p, p1; - if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) + if (cq == 0 || vq->vq_active >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* @@ -425,14 +440,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * Do round-robin to reduce starvation due to zfs_vdev_max_active * and vq_nia_credit limits. */ - for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) { - p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE; - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_min_active(vq, p)) { - vq->vq_last_prio = p; - return (p); - } + p1 = vq->vq_last_prio + 1; + if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE) + p1 = 0; + for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; + } + for (p = 0; p < p1; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; } /* @@ -440,16 +459,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * maximum # outstanding i/os. 
*/ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_max_active(spa, vq, p)) { - vq->vq_last_prio = p; - return (p); - } + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_max_active(vq, p)) + break; } - /* No eligible queued i/os */ - return (ZIO_PRIORITY_NUM_QUEUEABLE); +found: + vq->vq_last_prio = p; + return (p); } void @@ -458,42 +475,30 @@ vdev_queue_init(vdev_t *vd) vdev_queue_t *vq = &vd->vdev_queue; zio_priority_t p; - mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); vq->vq_vdev = vd; - taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent); - - avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - int (*compfn) (const void *, const void *); - - /* - * The synchronous/trim i/o queues are dispatched in FIFO rather - * than LBA order. This provides more consistent latency for - * these i/os. - */ - if (p == ZIO_PRIORITY_SYNC_READ || - p == ZIO_PRIORITY_SYNC_WRITE || - p == ZIO_PRIORITY_TRIM) { - compfn = vdev_queue_timestamp_compare; + if (vdev_queue_class_fifo(p)) { + list_create(&vq->vq_class[p].vqc_list, + sizeof (zio_t), + offsetof(struct zio, io_queue_node.l)); } else { - compfn = vdev_queue_offset_compare; + avl_create(&vq->vq_class[p].vqc_tree, + vdev_queue_to_compare, sizeof (zio_t), + offsetof(struct zio, io_queue_node.a)); } - avl_create(vdev_queue_class_tree(vq, p), compfn, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); } + avl_create(&vq->vq_read_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(&vq->vq_write_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); vq->vq_last_offset = 0; + list_create(&vq->vq_active_list, sizeof (struct zio), + offsetof(struct zio, io_queue_node.l)); + mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); } void @@ -501,30 +506,39 @@ vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; - for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) - avl_destroy(vdev_queue_class_tree(vq, p)); - avl_destroy(&vq->vq_active_tree); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM)); + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if (vdev_queue_class_fifo(p)) + list_destroy(&vq->vq_class[p].vqc_list); + else + avl_destroy(&vq->vq_class[p].vqc_tree); + } + avl_destroy(&vq->vq_read_offset_tree); + avl_destroy(&vq->vq_write_offset_tree); + list_destroy(&vq->vq_active_list); mutex_destroy(&vq->vq_lock); } static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); + zio->io_queue_state = ZIO_QS_QUEUED; + vdev_queue_class_add(vq, zio); + if (zio->io_type == 
ZIO_TYPE_READ) + avl_add(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_add(&vq->vq_write_offset_tree, zio); } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); + vdev_queue_class_remove(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_remove(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_remove(&vq->vq_write_offset_tree, zio); + zio->io_queue_state = ZIO_QS_NONE; } static boolean_t @@ -546,14 +560,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active++; + vq->vq_cactive[zio->io_priority]++; + vq->vq_active++; if (vdev_queue_is_interactive(zio->io_priority)) { if (++vq->vq_ia_active == 1) vq->vq_nia_credit = 1; } else if (vq->vq_ia_active > 0) { vq->vq_nia_credit--; } - avl_add(&vq->vq_active_tree, zio); + zio->io_queue_state = ZIO_QS_ACTIVE; + list_insert_tail(&vq->vq_active_list, zio); } static void @@ -561,7 +577,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active--; + vq->vq_cactive[zio->io_priority]--; + vq->vq_active--; if (vdev_queue_is_interactive(zio->io_priority)) { if (--vq->vq_ia_active == 0) vq->vq_nia_credit = 0; @@ -569,7 +586,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) vq->vq_nia_credit = zfs_vdev_nia_credit; } else if (vq->vq_ia_active == 0) vq->vq_nia_credit++; - avl_remove(&vq->vq_active_tree, zio); + list_remove(&vq->vq_active_list, zio); + zio->io_queue_state = ZIO_QS_NONE; } static void @@ -602,29 +620,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) uint64_t maxgap = 0; uint64_t size; uint64_t limit; - int maxblocksize; boolean_t stretch = B_FALSE; - avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); - zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; uint64_t next_offset; abd_t *abd; + avl_tree_t *t; + + /* + * TRIM aggregation should not be needed since code in zfs_trim.c can + * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M). + */ + if (zio->io_type == ZIO_TYPE_TRIM) + return (NULL); + + if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) + return (NULL); - maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); if (vq->vq_vdev->vdev_nonrot) limit = zfs_vdev_aggregation_limit_non_rotating; else limit = zfs_vdev_aggregation_limit; - limit = MIN(limit, maxblocksize); - - if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0) - return (NULL); - - /* - * While TRIM commands could be aggregated based on offset this - * behavior is disabled until it's determined to be beneficial. 
- */ - if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) + if (limit == 0) return (NULL); + limit = MIN(limit, SPA_MAXBLOCKSIZE); /* * I/Os to distributed spares are directly dispatched to the dRAID @@ -635,8 +652,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) first = last = zio; - if (zio->io_type == ZIO_TYPE_READ) + if (zio->io_type == ZIO_TYPE_READ) { maxgap = zfs_vdev_read_gap_limit; + t = &vq->vq_read_offset_tree; + } else { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + t = &vq->vq_write_offset_tree; + } /* * We can aggregate I/Os that are sufficiently adjacent and of @@ -657,6 +679,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) * Walk backwards through sufficiently contiguous I/Os * recording the last non-optional I/O. */ + zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; while ((dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= limit && @@ -686,7 +709,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (IO_SPAN(first, dio) <= limit || (dio->io_flags & ZIO_FLAG_OPTIONAL)) && - IO_SPAN(first, dio) <= maxblocksize && + IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE && IO_GAP(last, dio) <= maxgap && dio->io_type == zio->io_type) { last = dio; @@ -740,7 +763,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) return (NULL); size = IO_SPAN(first, last); - ASSERT3U(size, <=, maxblocksize); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); abd = abd_alloc_gang(); if (abd == NULL) @@ -824,19 +847,30 @@ vdev_queue_io_to_issue(vdev_queue_t *vq) return (NULL); } - /* - * For LBA-ordered queues (async / scrub / initializing), issue the - * i/o which follows the most recently issued i/o in LBA (offset) order. - * - * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp. - */ - tree = vdev_queue_class_tree(vq, p); - vq->vq_io_search.io_timestamp = 0; - vq->vq_io_search.io_offset = vq->vq_last_offset - 1; - VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL); - zio = avl_nearest(tree, idx, AVL_AFTER); - if (zio == NULL) - zio = avl_first(tree); + if (vdev_queue_class_fifo(p)) { + zio = list_head(&vq->vq_class[p].vqc_list); + } else { + /* + * For LBA-ordered queues (async / scrub / initializing), + * issue the I/O which follows the most recently issued I/O + * in LBA (offset) order, but to avoid starvation only within + * the same 0.5 second interval as the first I/O. + */ + tree = &vq->vq_class[p].vqc_tree; + zio = aio = avl_first(tree); + if (zio->io_offset < vq->vq_last_offset) { + vq->vq_io_search.io_timestamp = zio->io_timestamp; + vq->vq_io_search.io_offset = vq->vq_last_offset; + zio = avl_find(tree, &vq->vq_io_search, &idx); + if (zio == NULL) { + zio = avl_nearest(tree, idx, AVL_AFTER); + if (zio == NULL || + (zio->io_timestamp >> VDQ_T_SHIFT) != + (aio->io_timestamp >> VDQ_T_SHIFT)) + zio = aio; + } + } + } ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); @@ -967,7 +1001,6 @@ void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; - avl_tree_t *tree; /* * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio @@ -1002,12 +1035,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * Otherwise, the zio is currently active and we cannot change its * priority. 
*/ - tree = vdev_queue_class_tree(vq, zio->io_priority); - if (avl_find(tree, zio, NULL) == zio) { - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + if (zio->io_queue_state == ZIO_QS_QUEUED) { + vdev_queue_class_remove(vq, zio); zio->io_priority = priority; - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { + vdev_queue_class_add(vq, zio); + } else if (zio->io_queue_state == ZIO_QS_NONE) { zio->io_priority = priority; } @@ -1020,10 +1052,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * vq_lock mutex use here, instead we prefer to keep it lock free for * performance. */ -int +uint32_t vdev_queue_length(vdev_t *vd) { - return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); + return (vd->vdev_queue.vq_active); } uint64_t @@ -1032,15 +1064,22 @@ vdev_queue_last_offset(vdev_t *vd) return (vd->vdev_queue.vq_last_offset); } +uint64_t +vdev_queue_class_length(vdev_t *vd, zio_priority_t p) +{ + vdev_queue_t *vq = &vd->vdev_queue; + if (vdev_queue_class_fifo(p)) + return (list_is_empty(&vq->vq_class[p].vqc_list) == 0); + else + return (avl_numnodes(&vq->vq_class[p].vqc_tree)); +} + ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW, "Max vdev I/O aggregation size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT, ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, UINT, ZMOD_RW, - "Allow TRIM I/O to be aggregated"); - ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW, "Aggregate read I/O over gap"); From bc9d0084ea82a96bfe52e6f7943a554f218f871e Mon Sep 17 00:00:00 2001 From: Laevos <5572812+Laevos@users.noreply.github.com> Date: Tue, 27 Jun 2023 16:58:32 -0700 Subject: [PATCH 164/180] Remove unnecessary commas in zpool-create.8 Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Laevos <5572812+Laevos@users.noreply.github.com> Closes #15011 --- man/man8/zpool-create.8 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/man/man8/zpool-create.8 b/man/man8/zpool-create.8 index da1a79c72c28..8449520944fb 100644 --- a/man/man8/zpool-create.8 +++ b/man/man8/zpool-create.8 @@ -87,13 +87,13 @@ currently in use by another subsystem. However this check is not robust enough to detect simultaneous attempts to use a new device in different pools, even if .Sy multihost Ns = Sy enabled . -The administrator must ensure, that simultaneous invocations of any combination +The administrator must ensure that simultaneous invocations of any combination of .Nm zpool Cm replace , .Nm zpool Cm create , .Nm zpool Cm add , or -.Nm zpool Cm labelclear , +.Nm zpool Cm labelclear do not refer to the same device. Using the same device in two pools will result in pool corruption. .Pp From b0cbc1aa9a1f2a3329f5e483ef9e297e1eca4833 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 27 Jun 2023 20:00:30 -0400 Subject: [PATCH 165/180] Use big transactions for small recordsize writes. When ZFS appends files in chunks bigger than recordsize, it borrows buffer from ARC and fills it before opening transaction. This supposed to help in case of page faults to not hold transaction open indefinitely. The problem appears when recordsize is set lower than default 128KB. 
Since each block is committed in separate transaction, per-transaction overhead becomes significant, and what is even worse, active use of of per-dataset and per-pool locks to protect space use accounting for each transaction badly hurts the code SMP scalability. The same transaction size limitation applies in case of file rewrite, but without even excuse of buffer borrowing. To address the issue, disable the borrowing mechanism if recordsize is smaller than default and the write request is 4x bigger than it. In such case writes up to 32MB are executed in single transaction, that dramatically reduces overhead and lock contention. Since the borrowing mechanism is not used for file rewrites, and it was never used by zvols, which seem to work fine, I don't think this change should create significant problems, partially because in addition to the borrowing mechanism there are also used pre-faults. My tests with 4/8 threads writing several files same time on datasets with 32KB recordsize in 1MB requests show reduction of CPU usage by the user threads by 25-35%. I would measure it in GB/s, but at that block size we are now limited by the lock contention of single write issue taskqueue, which is a separate problem we are going to work on. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14964 --- module/zfs/zfs_vnops.c | 106 ++++++++++++++++++----------------------- 1 file changed, 46 insertions(+), 60 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 86706469acee..7bdcc1639384 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -462,14 +462,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) return (SET_ERROR(EINVAL)); } - const uint64_t max_blksz = zfsvfs->z_max_blksz; - /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. - * Skip this if uio contains loaned arc_buf. */ - if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { + ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1); + if (zfs_uio_prefaultpages(pfbytes, uio)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFAULT)); } @@ -544,10 +542,31 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) break; } + uint64_t blksz; + if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) { + if (zp->z_blksz > zfsvfs->z_max_blksz && + !ISP2(zp->z_blksz)) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. + */ + blksz = 1 << highbit64(zp->z_blksz); + } else { + blksz = zfsvfs->z_max_blksz; + } + blksz = MIN(blksz, P2ROUNDUP(end_size, + SPA_MINBLOCKSIZE)); + blksz = MAX(blksz, zp->z_blksz); + } else { + blksz = zp->z_blksz; + } + arc_buf_t *abuf = NULL; - if (n >= max_blksz && woff >= zp->z_size && - P2PHASE(woff, max_blksz) == 0 && - zp->z_blksz == max_blksz) { + ssize_t nbytes = n; + if (n >= blksz && woff >= zp->z_size && + P2PHASE(woff, blksz) == 0 && + (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter @@ -555,18 +574,26 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). 
*/ - size_t cbytes; - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - max_blksz); + blksz); ASSERT(abuf != NULL); - ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = zfs_uiocopy(abuf->b_data, max_blksz, - UIO_WRITE, uio, &cbytes))) { + ASSERT(arc_buf_size(abuf) == blksz); + if ((error = zfs_uiocopy(abuf->b_data, blksz, + UIO_WRITE, uio, &nbytes))) { dmu_return_arcbuf(abuf); break; } - ASSERT3S(cbytes, ==, max_blksz); + ASSERT3S(nbytes, ==, blksz); + } else { + nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) - + P2PHASE(woff, blksz)); + if (pfbytes < nbytes) { + if (zfs_uio_prefaultpages(nbytes, uio)) { + error = SET_ERROR(EFAULT); + break; + } + pfbytes = nbytes; + } } /* @@ -576,8 +603,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); - dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, - MIN(n, max_blksz)); + dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); @@ -600,31 +626,10 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * shrink down lr_length to the appropriate size. */ if (lr->lr_length == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. - */ - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, - 1 << highbit64(zp->z_blksz)); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); + zfs_grow_blocksize(zp, blksz, tx); zfs_rangelock_reduce(lr, woff, n); } - /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? - */ - const ssize_t nbytes = - MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - ssize_t tx_bytes; if (abuf == NULL) { tx_bytes = zfs_uio_resid(uio); @@ -644,12 +649,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * zfs_uio_prefaultpages, or prefaultpages may * error, and we may break the loop early. */ - if (tx_bytes != zfs_uio_resid(uio)) - n -= tx_bytes - zfs_uio_resid(uio); - if (zfs_uio_prefaultpages(MIN(n, max_blksz), - uio)) { - break; - } + n -= tx_bytes - zfs_uio_resid(uio); + pfbytes -= tx_bytes - zfs_uio_resid(uio); continue; } #endif @@ -665,15 +666,6 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) } tx_bytes -= zfs_uio_resid(uio); } else { - /* Implied by abuf != NULL: */ - ASSERT3S(n, >=, max_blksz); - ASSERT0(P2PHASE(woff, max_blksz)); - /* - * We can simplify nbytes to MIN(n, max_blksz) since - * P2PHASE(woff, max_blksz) is 0, and knowing - * n >= max_blksz lets us simplify further: - */ - ASSERT3S(nbytes, ==, max_blksz); /* * Thus, we're writing a full block at a block-aligned * offset and extending the file past EOF. @@ -758,13 +750,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) break; ASSERT3S(tx_bytes, ==, nbytes); n -= nbytes; - - if (n > 0) { - if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { - error = SET_ERROR(EFAULT); - break; - } - } + pfbytes -= nbytes; } zfs_znode_update_vfs(zp); From a9d6b0690b1863f39a7efce08b1227f2e9e26abb Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 27 Jun 2023 20:03:37 -0400 Subject: [PATCH 166/180] ZIL: Fix another use-after-free. 
lwb->lwb_issued_txg can not be accessed after lwb_state is set to LWB_STATE_FLUSH_DONE and zl_lock is dropped, since the lwb may be freed by zil_sync(). We must save the txg number before that. This is similar to the 55b1842f92, but as I see the bug is not new. It existed for quite a while, just was not triggered due to smaller race window. Reviewed-by: Allan Jude Reviewed-by: Brian Atkinson Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #14988 Closes #14999 --- module/zfs/zil.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index ee8dcce3b361..ef6f52542ded 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1425,6 +1425,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio) list_move_tail(&itxs, &lwb->lwb_itxs); list_move_tail(&waiters, &lwb->lwb_waiters); + txg = lwb->lwb_issued_txg; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); lwb->lwb_state = LWB_STATE_FLUSH_DONE; @@ -1465,7 +1466,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio) list_destroy(&waiters); mutex_enter(&zilog->zl_lwb_io_lock); - txg = lwb->lwb_issued_txg; ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); zilog->zl_lwb_inflight[txg & TXG_MASK]--; if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0) From 62ace21a149c34cfe1b5870a76eb036fe805f869 Mon Sep 17 00:00:00 2001 From: Mateusz Piotrowski <0mp@FreeBSD.org> Date: Thu, 29 Jun 2023 19:54:43 +0200 Subject: [PATCH 167/180] zdb: Add missing poolname to -C synopsis Reviewed-by: Tino Reichardt Reviewed-by: Rob Norris Signed-off-by: Mateusz Piotrowski <0mp@FreeBSD.org> Sponsored-by: Klara Inc. Closes #15014 --- cmd/zdb/zdb.c | 2 +- man/man8/zdb.8 | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 04a10c4eedd7..9568d2bbfe38 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -794,7 +794,7 @@ usage(void) "\t\t[-o =]... [-t ] [-U ] [-x ]\n" "\t\t[-K ] / []\n" "\t%s [-v] \n" - "\t%s -C [-A] [-U ]\n" + "\t%s -C [-A] [-U ] []\n" "\t%s -l [-Aqu] \n" "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " "[-U ]\n\t\t [ [ ...]]\n" diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index 031953c543a1..52c8e452fa7c 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -14,7 +14,7 @@ .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC. .\" Copyright (c) 2017 Intel Corporation. .\" -.Dd June 4, 2023 +.Dd June 27, 2023 .Dt ZDB 8 .Os . @@ -51,6 +51,7 @@ .Fl C .Op Fl A .Op Fl U Ar cache +.Op Ar poolname .Nm .Fl E .Op Fl A From 77a3bb1f47e67c233eb1961b8746748c02bafde1 Mon Sep 17 00:00:00 2001 From: Yuri Pankov <113725409+yuripv@users.noreply.github.com> Date: Thu, 29 Jun 2023 20:50:52 +0200 Subject: [PATCH 168/180] spa.h: use IN_BASE instead of IN_FREEBSD_BASE Consistently get the proper default value for autotrim. Currently, only the kernel module is built with IN_FREEBSD_BASE, and libzfs get the wrong default value, leading to confusion and incorrect output when autotrim value was not set explicitly. Reviewed-by: Warner Losh Signed-off-by: Yuri Pankov Closes #15016 --- include/sys/spa.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index 1fa2044008dc..ac0847793c84 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -723,12 +723,12 @@ typedef enum spa_mode { * Send TRIM commands in-line during normal pool operation while deleting. * OFF: no * ON: yes - * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources. + * NB: IN_BASE is defined within the FreeBSD sources. 
*/ typedef enum { SPA_AUTOTRIM_OFF = 0, /* default */ SPA_AUTOTRIM_ON, -#ifdef IN_FREEBSD_BASE +#ifdef IN_BASE SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON, #else SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF, From 24554082bd93cb90400c4cb751275debda229009 Mon Sep 17 00:00:00 2001 From: vimproved <66446404+vimproved@users.noreply.github.com> Date: Thu, 29 Jun 2023 19:54:37 +0000 Subject: [PATCH 169/180] contrib: dracut: Conditionalize copying of libgcc_s.so.1 to glibc only The issue that this is designed to work around is only applicable to glibc, since it's caused by glibc's pthread_cancel() implementation using dlopen on libgcc_s.so.1 (and therefor not triggering dracut to include it in the initramfs). This commit adds an extra condition to the workaround that tests for glibc via "ldconfig -p | grep -qF 'libc.so.6'" (which should only be present on glibc systems). Reviewed-by: Brian Behlendorf Signed-off-by: Violet Purcell Closes #14992 --- contrib/dracut/90zfs/module-setup.sh.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/dracut/90zfs/module-setup.sh.in b/contrib/dracut/90zfs/module-setup.sh.in index e55cb60e1612..acad468edfd1 100755 --- a/contrib/dracut/90zfs/module-setup.sh.in +++ b/contrib/dracut/90zfs/module-setup.sh.in @@ -36,7 +36,7 @@ install() { { dfatal "Failed to install essential binaries"; exit 1; } # Adapted from https://github.com/zbm-dev/zfsbootmenu - if ! ldd "$(command -v zpool)" | grep -qF 'libgcc_s.so'; then + if ! ldd "$(command -v zpool)" | grep -qF 'libgcc_s.so' && ldconfig -p 2> /dev/null | grep -qF 'libc.so.6' ; then # On systems with gcc-config (Gentoo, Funtoo, etc.), use it to find libgcc_s if command -v gcc-config >/dev/null; then inst_simple "/usr/lib/gcc/$(s=$(gcc-config -c); echo "${s%-*}/${s##*-}")/libgcc_s.so.1" || From eda32dca92a854e5d23877166188c476c2ac75bd Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 30 Jun 2023 11:36:43 -0400 Subject: [PATCH 170/180] Fix remount when setting multiple properties. The previous code was checking zfs_is_namespace_prop() only for the last property on the list. If one was not "namespace", then remount wasn't called. To fix that move zfs_is_namespace_prop() inside the loop and remount if at least one of properties was "namespace". Reviewed-by: Umer Saleem Reviewed-by: Ameer Hamza Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15000 --- lib/libzfs/libzfs_dataset.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index fe9f3268d338..11d3eb6a3c60 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -1789,7 +1789,8 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) nvlist_t *nvl; int nvl_len = 0; int added_resv = 0; - zfs_prop_t prop = 0; + zfs_prop_t prop; + boolean_t nsprop = B_FALSE; nvpair_t *elem; (void) snprintf(errbuf, sizeof (errbuf), @@ -1836,6 +1837,7 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) elem = nvlist_next_nvpair(nvl, elem)) { prop = zfs_name_to_prop(nvpair_name(elem)); + nsprop |= zfs_is_namespace_prop(prop); assert(cl_idx < nvl_len); /* @@ -1934,8 +1936,7 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) * if one of the options handled by the generic * Linux namespace layer has been modified. 
*/ - if (zfs_is_namespace_prop(prop) && - zfs_is_mounted(zhp, NULL)) + if (nsprop && zfs_is_mounted(zhp, NULL)) ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); } } From 6052060c133d0caed0e1bc3ec2c057f8c33e5f7a Mon Sep 17 00:00:00 2001 From: Arshad Hussain Date: Fri, 30 Jun 2023 21:07:26 +0530 Subject: [PATCH 171/180] Don't use hard-coded 'size' value in snprintf() This patch changes the passing of "size" to snprintf from hard-coded (openended) to sizeof(errbuf). This is bringing to standard with rest of the code where- ever 'errbuf' is used. Reviewed-by: Brian Behlendorf Signed-off-by: Arshad Hussain Closes #15003 --- cmd/zfs/zfs_main.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index e28f1d04f350..5ed25d1ea720 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -6057,8 +6057,8 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) if (p != NULL) rid = p->pw_uid; else if (*endch != '\0') { - (void) snprintf(errbuf, 256, gettext( - "invalid user %s\n"), curr); + (void) snprintf(errbuf, sizeof (errbuf), + gettext("invalid user %s\n"), curr); allow_usage(un, B_TRUE, errbuf); } } else if (opts->group) { @@ -6071,8 +6071,9 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) if (g != NULL) rid = g->gr_gid; else if (*endch != '\0') { - (void) snprintf(errbuf, 256, gettext( - "invalid group %s\n"), curr); + (void) snprintf(errbuf, sizeof (errbuf), + gettext("invalid group %s\n"), + curr); allow_usage(un, B_TRUE, errbuf); } } else { @@ -6097,8 +6098,9 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) who_type = ZFS_DELEG_GROUP; rid = g->gr_gid; } else { - (void) snprintf(errbuf, 256, gettext( - "invalid user/group %s\n"), curr); + (void) snprintf(errbuf, sizeof (errbuf), + gettext("invalid user/group %s\n"), + curr); allow_usage(un, B_TRUE, errbuf); } } From fa7b2390d4982412f9dd27c151bb5ec2da89dcca Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 30 Jun 2023 11:47:13 -0400 Subject: [PATCH 172/180] Do not report bytes skipped by scan as issued. Scan process may skip blocks based on their birth time, DVA, etc. Traditionally those blocks were accounted as issued, that caused reporting of hugely over-inflated numbers, having nothing to do with actual disk I/O. This change utilizes never used field in struct dsl_scan_phys to account such skipped bytes, allowing to report how much data were actually scrubbed/resilvered and what is the actual I/O speed. While formally it is an on-disk format change, it should be compatible both ways, so should not need a feature flag. This should partially address the same issue as c85ac731a0e, but from a different perspective, complementing it. Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Reviewed-by: Akash B Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15007 --- cmd/zpool/zpool_main.c | 94 +++++++++++++++++------------ cmd/zpool_influxdb/zpool_influxdb.c | 2 +- include/sys/dsl_scan.h | 2 +- include/sys/fs/zfs.h | 3 +- include/sys/vdev_rebuild.h | 1 + man/man8/zpool-scrub.8 | 4 +- module/zfs/dsl_scan.c | 22 +++++-- module/zfs/spa_misc.c | 2 +- module/zfs/vdev_rebuild.c | 6 +- 9 files changed, 84 insertions(+), 52 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 013dd4a23380..10a3b5b14fc9 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -7662,11 +7662,11 @@ static void print_scan_scrub_resilver_status(pool_scan_stat_t *ps) { time_t start, end, pause; - uint64_t pass_scanned, scanned, pass_issued, issued, total; + uint64_t pass_scanned, scanned, pass_issued, issued, total_s, total_i; uint64_t elapsed, scan_rate, issue_rate; double fraction_done; - char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; - char srate_buf[7], irate_buf[7], time_buf[32]; + char processed_buf[7], scanned_buf[7], issued_buf[7], total_s_buf[7]; + char total_i_buf[7], srate_buf[7], irate_buf[7], time_buf[32]; printf(" "); printf_color(ANSI_BOLD, gettext("scan:")); @@ -7738,10 +7738,11 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) pass_scanned = ps->pss_pass_exam; issued = ps->pss_issued; pass_issued = ps->pss_pass_issued; - total = ps->pss_to_examine; + total_s = ps->pss_to_examine; + total_i = ps->pss_to_examine - ps->pss_skipped; /* we are only done with a block once we have issued the IO for it */ - fraction_done = (double)issued / total; + fraction_done = (double)issued / total_i; /* elapsed time for this pass, rounding up to 1 if it's 0 */ elapsed = time(NULL) - ps->pss_pass_start; @@ -7750,26 +7751,25 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) scan_rate = pass_scanned / elapsed; issue_rate = pass_issued / elapsed; - uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ? 
- ((total - issued) / issue_rate) : UINT64_MAX; - secs_to_dhms(total_secs_left, time_buf); /* format all of the numbers we will be reporting */ zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); zfs_nicebytes(issued, issued_buf, sizeof (issued_buf)); - zfs_nicebytes(total, total_buf, sizeof (total_buf)); - zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); - zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); + zfs_nicebytes(total_s, total_s_buf, sizeof (total_s_buf)); + zfs_nicebytes(total_i, total_i_buf, sizeof (total_i_buf)); /* do not print estimated time if we have a paused scrub */ - if (pause == 0) { - (void) printf(gettext("\t%s scanned at %s/s, " - "%s issued at %s/s, %s total\n"), - scanned_buf, srate_buf, issued_buf, irate_buf, total_buf); - } else { - (void) printf(gettext("\t%s scanned, %s issued, %s total\n"), - scanned_buf, issued_buf, total_buf); + (void) printf(gettext("\t%s / %s scanned"), scanned_buf, total_s_buf); + if (pause == 0 && scan_rate > 0) { + zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); + (void) printf(gettext(" at %s/s"), srate_buf); } + (void) printf(gettext(", %s / %s issued"), issued_buf, total_i_buf); + if (pause == 0 && issue_rate > 0) { + zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); + (void) printf(gettext(" at %s/s"), irate_buf); + } + (void) printf(gettext("\n")); if (is_resilver) { (void) printf(gettext("\t%s resilvered, %.2f%% done"), @@ -7782,16 +7782,16 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) if (pause == 0) { /* * Only provide an estimate iff: - * 1) the time remaining is valid, and + * 1) we haven't yet issued all we expected, and * 2) the issue rate exceeds 10 MB/s, and * 3) it's either: * a) a resilver which has started repairs, or * b) a scrub which has entered the issue phase. 
*/ - if (total_secs_left != UINT64_MAX && - issue_rate >= 10 * 1024 * 1024 && + if (total_i >= issued && issue_rate >= 10 * 1024 * 1024 && ((is_resilver && ps->pss_processed > 0) || (is_scrub && issued > 0))) { + secs_to_dhms((total_i - issued) / issue_rate, time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " @@ -7803,7 +7803,7 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) } static void -print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) +print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, uint_t c, char *vdev_name) { if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) return; @@ -7815,17 +7815,20 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) uint64_t bytes_scanned = vrs->vrs_bytes_scanned; uint64_t bytes_issued = vrs->vrs_bytes_issued; uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; - uint64_t bytes_est = vrs->vrs_bytes_est; + uint64_t bytes_est_s = vrs->vrs_bytes_est; + uint64_t bytes_est_i = vrs->vrs_bytes_est; + if (c > offsetof(vdev_rebuild_stat_t, vrs_pass_bytes_skipped) / 8) + bytes_est_i -= vrs->vrs_pass_bytes_skipped; uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / (vrs->vrs_pass_time_ms + 1)) * 1000; uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / (vrs->vrs_pass_time_ms + 1)) * 1000; double scan_pct = MIN((double)bytes_scanned * 100 / - (bytes_est + 1), 100); + (bytes_est_s + 1), 100); /* Format all of the numbers we will be reporting */ char bytes_scanned_buf[7], bytes_issued_buf[7]; - char bytes_rebuilt_buf[7], bytes_est_buf[7]; + char bytes_rebuilt_buf[7], bytes_est_s_buf[7], bytes_est_i_buf[7]; char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; zfs_nicebytes(bytes_scanned, bytes_scanned_buf, sizeof (bytes_scanned_buf)); @@ -7833,9 +7836,8 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) sizeof (bytes_issued_buf)); zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, sizeof (bytes_rebuilt_buf)); - zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf)); - zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); - zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); + zfs_nicebytes(bytes_est_s, bytes_est_s_buf, sizeof (bytes_est_s_buf)); + zfs_nicebytes(bytes_est_i, bytes_est_i_buf, sizeof (bytes_est_i_buf)); time_t start = vrs->vrs_start_time; time_t end = vrs->vrs_end_time; @@ -7858,17 +7860,29 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); - secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) / - MAX(scan_rate, 1), time_buf); + (void) printf(gettext("\t%s / %s scanned"), bytes_scanned_buf, + bytes_est_s_buf); + if (scan_rate > 0) { + zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); + (void) printf(gettext(" at %s/s"), scan_rate_buf); + } + (void) printf(gettext(", %s / %s issued"), bytes_issued_buf, + bytes_est_i_buf); + if (issue_rate > 0) { + zfs_nicebytes(issue_rate, issue_rate_buf, + sizeof (issue_rate_buf)); + (void) printf(gettext(" at %s/s"), issue_rate_buf); + } + (void) printf(gettext("\n")); - (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, " - "%s total\n"), bytes_scanned_buf, scan_rate_buf, - bytes_issued_buf, issue_rate_buf, bytes_est_buf); (void) printf(gettext("\t%s resilvered, %.2f%% done"), bytes_rebuilt_buf, scan_pct); if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { - if (scan_rate >= 10 * 1024 * 1024) { + if (bytes_est_s >= bytes_scanned && + scan_rate >= 10 * 1024 * 
1024) { + secs_to_dhms((bytes_est_s - bytes_scanned) / scan_rate, + time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " @@ -7900,7 +7914,7 @@ print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { char *name = zpool_vdev_name(g_zfs, zhp, child[c], VDEV_NAME_TYPE_ID); - print_rebuild_status_impl(vrs, name); + print_rebuild_status_impl(vrs, i, name); free(name); } } @@ -8005,13 +8019,15 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) active_resilver = (ps->pss_state == DSS_SCANNING); } - have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); scrub_start = ps->pss_start_time; - have_errorscrub = (ps->pss_error_scrub_func == - POOL_SCAN_ERRORSCRUB); - errorscrub_start = ps->pss_error_scrub_start; + if (c > offsetof(pool_scan_stat_t, + pss_pass_error_scrub_pause) / 8) { + have_errorscrub = (ps->pss_error_scrub_func == + POOL_SCAN_ERRORSCRUB); + errorscrub_start = ps->pss_error_scrub_start; + } } boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); diff --git a/cmd/zpool_influxdb/zpool_influxdb.c b/cmd/zpool_influxdb/zpool_influxdb.c index 80d08485891e..520e56926905 100644 --- a/cmd/zpool_influxdb/zpool_influxdb.c +++ b/cmd/zpool_influxdb/zpool_influxdb.c @@ -238,6 +238,7 @@ print_scan_status(nvlist_t *nvroot, const char *pool_name) print_kv("end_ts", ps->pss_end_time); print_kv(",errors", ps->pss_errors); print_kv(",examined", examined); + print_kv(",skipped", ps->pss_skipped); print_kv(",issued", ps->pss_issued); print_kv(",pass_examined", pass_exam); print_kv(",pass_issued", ps->pss_pass_issued); @@ -249,7 +250,6 @@ print_scan_status(nvlist_t *nvroot, const char *pool_name) print_kv(",remaining_t", remaining_time); print_kv(",start_ts", ps->pss_start_time); print_kv(",to_examine", ps->pss_to_examine); - print_kv(",to_process", ps->pss_to_process); printf(" %llu\n", (u_longlong_t)timestamp); return (0); } diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 6753b4a8f359..2e3452e5ebaa 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -61,7 +61,7 @@ typedef struct dsl_scan_phys { uint64_t scn_end_time; uint64_t scn_to_examine; /* total bytes to be scanned */ uint64_t scn_examined; /* bytes scanned so far */ - uint64_t scn_to_process; + uint64_t scn_skipped; /* bytes skipped by scanner */ uint64_t scn_processed; uint64_t scn_errors; /* scan I/O error count */ uint64_t scn_ddt_class_max; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 93193fa142da..bc940e8a7929 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1088,7 +1088,7 @@ typedef struct pool_scan_stat { uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total bytes located by scanner */ - uint64_t pss_to_process; /* total bytes to process */ + uint64_t pss_skipped; /* total bytes skipped by scanner */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ @@ -1152,6 +1152,7 @@ typedef struct vdev_rebuild_stat { uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */ uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ + uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */ } vdev_rebuild_stat_t; /* diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h index 
c4cfe0c56762..55ec6c570316 100644 --- a/include/sys/vdev_rebuild.h +++ b/include/sys/vdev_rebuild.h @@ -79,6 +79,7 @@ typedef struct vdev_rebuild { uint64_t vr_pass_start_time; uint64_t vr_pass_bytes_scanned; uint64_t vr_pass_bytes_issued; + uint64_t vr_pass_bytes_skipped; /* On-disk state updated by vdev_rebuild_zap_update_sync() */ vdev_rebuild_phys_t vr_rebuild_phys; diff --git a/man/man8/zpool-scrub.8 b/man/man8/zpool-scrub.8 index 138226e4562c..03f3ad4991f9 100644 --- a/man/man8/zpool-scrub.8 +++ b/man/man8/zpool-scrub.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd July 25, 2021 +.Dd June 22, 2023 .Dt ZPOOL-SCRUB 8 .Os . @@ -123,7 +123,7 @@ Status of pool with ongoing scrub: .No # Nm zpool Cm status ... scan: scrub in progress since Sun Jul 25 16:07:49 2021 - 403M scanned at 100M/s, 68.4M issued at 10.0M/s, 405M total + 403M / 405M scanned at 100M/s, 68.4M / 405M issued at 10.0M/s 0B repaired, 16.91% done, 00:00:04 to go ... .Ed diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 1dd44171c10e..50428bff3ef4 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -573,7 +573,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) * counter to how far we've scanned. We know we're consistent * up to here. */ - scn->scn_issued_before_pass = scn->scn_phys.scn_examined; + scn->scn_issued_before_pass = scn->scn_phys.scn_examined - + scn->scn_phys.scn_skipped; if (dsl_scan_is_running(scn) && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { @@ -4362,7 +4363,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) * Disabled by default, set zfs_scan_report_txgs to report * average performance over the last zfs_scan_report_txgs TXGs. */ - if (!dsl_scan_is_paused_scrub(scn) && zfs_scan_report_txgs != 0 && + if (zfs_scan_report_txgs != 0 && tx->tx_txg % zfs_scan_report_txgs == 0) { scn->scn_issued_before_pass += spa->spa_scan_pass_issued; spa_scan_stat_init(spa); @@ -4564,6 +4565,15 @@ count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all) all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); } +static void +count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all) +{ + if (BP_IS_EMBEDDED(bp)) + return; + atomic_add_64(&scn->scn_phys.scn_skipped, + all ? 
BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); +} + static void count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) { @@ -4709,7 +4719,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, count_block(dp->dp_blkstats, bp); if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) { - count_block_issued(spa, bp, B_TRUE); + count_block_skipped(scn, bp, B_TRUE); return (0); } @@ -4750,7 +4760,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (needs_io && !zfs_no_scrub_io) { dsl_scan_enqueue(dp, bp, zio_flags, zb); } else { - count_block_issued(spa, bp, B_TRUE); + count_block_skipped(scn, bp, B_TRUE); } /* do not relocate this block */ @@ -5119,9 +5129,9 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); range_tree_remove_fill(queue->q_exts_by_addr, start, size); - /* count the block as though we issued it */ + /* count the block as though we skipped it */ sio2bp(sio, &tmpbp); - count_block_issued(spa, &tmpbp, B_FALSE); + count_block_skipped(scn, &tmpbp, B_FALSE); sio_free(sio); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 8dc83445e198..06f640769043 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -2611,7 +2611,7 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; - ps->pss_to_process = scn->scn_phys.scn_to_process; + ps->pss_skipped = scn->scn_phys.scn_skipped; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 62aa61b3b9e7..75c3900cbb0c 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -571,8 +571,10 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) vdev_rebuild_blkptr_init(&blk, vd, start, size); uint64_t psize = BP_GET_PSIZE(&blk); - if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) + if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) { + vr->vr_pass_bytes_skipped += size; return (0); + } mutex_enter(&vr->vr_io_lock); @@ -786,6 +788,7 @@ vdev_rebuild_thread(void *arg) vr->vr_pass_start_time = gethrtime(); vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; + vr->vr_pass_bytes_skipped = 0; uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); @@ -1153,6 +1156,7 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) vr->vr_pass_start_time); vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; + vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped; mutex_exit(&tvd->vdev_rebuild_lock); } From b4a0873092353cefe0f1bc3ee6a50d5e16b35675 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 30 Jun 2023 11:54:00 -0400 Subject: [PATCH 173/180] Some ZIO micro-optimizations. - Pack struct zio_prop by 4 bytes from 84 to 80. - Skip new child ZIO locking while linking to parent. The newly allocated ZIO is not externally visible yet, so nobody should care. - Skip io_bp_copy writes when not used (write && non-debug). Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
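To illustrate the 4-byte saving behind the zp_type reordering in the diff below: a 4-byte enum placed between 1-byte members forces alignment padding on both sides, while grouping the 1-byte members first leaves only a single pad byte. The standalone sketch that follows uses stand-in type and field names (not the real zio_prop definition) and assumes a typical ABI where enums occupy 4 bytes; exact sizes are implementation-defined.

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-in for a 4-byte enum such as dmu_object_type_t. */
    typedef enum { EX_TYPE_A, EX_TYPE_B } ex_type_t;

    /* Old-style layout: the enum sits between 1-byte members. */
    typedef struct layout_old {
        uint8_t   a;      /* 1 byte, then 3 bytes of padding */
        ex_type_t type;   /* 4 bytes */
        uint8_t   b;      /* 1 byte */
        uint8_t   c;      /* 1 byte, then 2 bytes of tail padding */
    } layout_old_t;

    /* New-style layout: 1-byte members grouped ahead of the enum. */
    typedef struct layout_new {
        uint8_t   a;
        uint8_t   b;
        uint8_t   c;      /* 3 bytes, then 1 byte of padding */
        ex_type_t type;   /* 4 bytes */
    } layout_new_t;

    int
    main(void)
    {
        /* Typically prints 12 and 8 with 4-byte enums. */
        printf("old layout: %zu bytes\n", sizeof (layout_old_t));
        printf("new layout: %zu bytes\n", sizeof (layout_new_t));
        return (0);
    }
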
Closes #14985 --- include/sys/zio.h | 3 ++- module/zfs/zio.c | 52 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/include/sys/zio.h b/include/sys/zio.h index 85217b873dc8..f4da80783e56 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -341,9 +341,9 @@ typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; uint8_t zp_complevel; - dmu_object_type_t zp_type; uint8_t zp_level; uint8_t zp_copies; + dmu_object_type_t zp_type; boolean_t zp_dedup; boolean_t zp_dedup_verify; boolean_t zp_nopwrite; @@ -611,6 +611,7 @@ extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); +extern void zio_add_child_first(zio_t *pio, zio_t *cio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); diff --git a/module/zfs/zio.c b/module/zfs/zio.c index fb8164f0aea9..10279fde89df 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -626,8 +626,6 @@ zio_unique_parent(zio_t *cio) void zio_add_child(zio_t *pio, zio_t *cio) { - zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); - /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. @@ -636,6 +634,7 @@ zio_add_child(zio_t *pio, zio_t *cio) */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; @@ -644,8 +643,9 @@ zio_add_child(zio_t *pio, zio_t *cio) ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; + countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); @@ -654,6 +654,37 @@ zio_add_child(zio_t *pio, zio_t *cio) mutex_exit(&pio->io_lock); } +void +zio_add_child_first(zio_t *pio, zio_t *cio) +{ + /* + * Logical I/Os can have logical, gang, or vdev children. + * Gang I/Os can have gang or vdev children. + * Vdev I/Os can only have vdev children. + * The following ASSERT captures all of these constraints. 
+ */ + ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); + zl->zl_parent = pio; + zl->zl_child = cio; + + ASSERT(list_is_empty(&cio->io_parent_list)); + list_insert_head(&cio->io_parent_list, zl); + + mutex_enter(&pio->io_lock); + + ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + + uint64_t *countp = pio->io_children[cio->io_child_type]; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + countp[w] += !cio->io_state[w]; + + list_insert_head(&pio->io_child_list, zl); + + mutex_exit(&pio->io_lock); +} + static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { @@ -840,12 +871,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { - zio->io_bp = (blkptr_t *)bp; - zio->io_bp_copy = *bp; - zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || - zio->io_child_type == ZIO_CHILD_DDT) + zio->io_child_type == ZIO_CHILD_DDT) { + zio->io_bp_copy = *bp; zio->io_bp = &zio->io_bp_copy; /* so caller can free */ + } else { + zio->io_bp = (blkptr_t *)bp; + } + zio->io_bp_orig = *bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) @@ -880,7 +913,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; - zio_add_child(pio, zio); + zio_add_child_first(pio, zio); } taskq_init_ent(&zio->io_tqent); @@ -1601,7 +1634,6 @@ zio_read_bp_init(zio_t *zio) abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); } if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) @@ -4442,8 +4474,10 @@ zio_ready(zio_t *zio) zio->io_ready(zio); } +#ifdef ZFS_DEBUG if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; +#endif if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; From 233425a153af74b7d5ef9730684f3a1d61ff8f11 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 30 Jun 2023 11:59:39 -0400 Subject: [PATCH 174/180] Again fix race between zil_commit() and zil_suspend(). With zl_suspend read in zil_commit() not protected by any locks it is possible for new ZIL writes to be in progress while zil_destroy() called by zil_suspend() freeing them. This patch closes the race by taking zl_issuer_lock in zil_suspend() and adding the second zl_suspend check to zil_get_commit_list(), protected by the lock. It allows all already queued transactions to be logged normally, while blocks any new ones, calling txg_wait_synced() for the TXGs. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
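The change below is an instance of a common pattern: an unlocked fast-path read of a flag is only a hint, so the code that actually queues work must re-check the flag under the serializing lock. A minimal userspace sketch of the pattern follows; issuer_lock and suspended are generic stand-ins rather than the real zilog_t fields, a strictly conforming program would read the flag with C11 atomics, and the real change additionally hands back a TXG for the caller to txg_wait_synced() on instead of a plain yes/no.

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t issuer_lock = PTHREAD_MUTEX_INITIALIZER;
    static bool suspended = false;  /* read without the lock on the fast path */
    static int queued = 0;          /* protected by issuer_lock */

    /* Analogue of the commit path: the early read may be stale. */
    static bool
    commit_one(void)
    {
        if (suspended)          /* unlocked hint only */
            return (false);     /* caller falls back to waiting on the TXG */

        pthread_mutex_lock(&issuer_lock);
        /*
         * Re-check under the lock: suspension may have raced in after
         * the unlocked read above, and only this answer is final.
         */
        bool ok = !suspended;
        if (ok)
            queued++;
        pthread_mutex_unlock(&issuer_lock);
        return (ok);
    }

    /* Analogue of the suspend path: flip the flag under the same lock. */
    static void
    suspend(void)
    {
        pthread_mutex_lock(&issuer_lock);
        suspended = true;
        pthread_mutex_unlock(&issuer_lock);
    }

    int
    main(void)
    {
        printf("commit before suspend: %d\n", commit_one());  /* 1 */
        suspend();
        printf("commit after suspend:  %d\n", commit_one());  /* 0 */
        printf("queued: %d\n", queued);                       /* 1 */
        return (0);
    }
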
Closes #14979 --- module/zfs/zil.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index ef6f52542ded..00d66a2481d7 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -798,8 +798,8 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - ASSERT(list_is_empty(&lwb->lwb_waiters)); - ASSERT(list_is_empty(&lwb->lwb_itxs)); + VERIFY(list_is_empty(&lwb->lwb_waiters)); + VERIFY(list_is_empty(&lwb->lwb_itxs)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); @@ -2525,10 +2525,10 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg) * This function will traverse the queue of itxs that need to be * committed, and move them onto the ZIL's zl_itx_commit_list. */ -static void +static uint64_t zil_get_commit_list(zilog_t *zilog) { - uint64_t otxg, txg; + uint64_t otxg, txg, wtxg = 0; list_t *commit_list = &zilog->zl_itx_commit_list; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); @@ -2562,10 +2562,22 @@ zil_get_commit_list(zilog_t *zilog) */ ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); - list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); + list_t *sync_list = &itxg->itxg_itxs->i_sync_list; + if (unlikely(zilog->zl_suspend > 0)) { + /* + * ZIL was just suspended, but we lost the race. + * Allow all earlier itxs to be committed, but ask + * caller to do txg_wait_synced(txg) for any new. + */ + if (!list_is_empty(sync_list)) + wtxg = MAX(wtxg, txg); + } else { + list_move_tail(commit_list, sync_list); + } mutex_exit(&itxg->itxg_lock); } + return (wtxg); } /* @@ -2953,11 +2965,12 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * not issued, we rely on future calls to zil_commit_writer() to issue * the lwb, or the timeout mechanism found in zil_commit_waiter(). 
*/ -static void +static uint64_t zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { list_t ilwbs; lwb_t *lwb; + uint64_t wtxg = 0; ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); @@ -2987,7 +3000,7 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) ZIL_STAT_BUMP(zilog, zil_commit_writer_count); - zil_get_commit_list(zilog); + wtxg = zil_get_commit_list(zilog); zil_prune_commit_list(zilog); zil_process_commit_list(zilog, zcw, &ilwbs); @@ -2996,6 +3009,7 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) while ((lwb = list_remove_head(&ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); list_destroy(&ilwbs); + return (wtxg); } static void @@ -3511,7 +3525,7 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); zil_commit_itx_assign(zilog, zcw); - zil_commit_writer(zilog, zcw); + uint64_t wtxg = zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); if (zcw->zcw_zio_error != 0) { @@ -3526,6 +3540,8 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); txg_wait_synced(zilog->zl_dmu_pool, 0); + } else if (wtxg != 0) { + txg_wait_synced(zilog->zl_dmu_pool, wtxg); } zil_free_commit_waiter(zcw); @@ -3905,11 +3921,13 @@ zil_suspend(const char *osname, void **cookiep) return (error); zilog = dmu_objset_zil(os); + mutex_enter(&zilog->zl_issuer_lock); mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); + mutex_exit(&zilog->zl_issuer_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } @@ -3923,6 +3941,7 @@ zil_suspend(const char *osname, void **cookiep) if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); + mutex_exit(&zilog->zl_issuer_lock); dmu_objset_rele(os, suspend_tag); return (0); } @@ -3931,6 +3950,7 @@ zil_suspend(const char *osname, void **cookiep) dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; + mutex_exit(&zilog->zl_issuer_lock); if (zilog->zl_suspend > 1) { /* From 61ab05cac74830f2658cd16138c5876b4b31b4fa Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 1 Jul 2023 02:01:58 +1000 Subject: [PATCH 175/180] ddt_addref: remove unnecessary phys fill when refcount is 0 The previous comment wondered if this case could happen; it turns out that it really can't. This block can only be entered if dde_type and dde_class are "real"; that only happens when a ddt entry has been previously synced to a ddt store, that is, it was created on a previous txg. Since its gone through that sync, its dde_refcount must be >0. ddt_addref() is called from brt_pending_apply(), which is called at the beginning of spa_sync(), before pending DMU writes/frees are issued. Freeing a dedup block is the only thing that can decrement dde_refcount, so there's no way for it to drop to zero before applying the clone bumps it. Further, even if it _could_ go to zero, it wouldn't be necessary to fill the entry from the block. The phys content is not cleared until the free is issued, which happens when the refcount goes to zero, when the last real free comes through. The cloned block should be identical to what's in the phys already, so the fill should be a no-op anyway. I've replaced this with an assertion because this is all very dependent on the ordering in which BRT and DDT changes are applied, and that might change in the future. 
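A compressed model of that ordering argument, for illustration only (toy structures, not the real DDT/BRT code): the clone's addref runs in the pending-apply phase of a sync pass, frees are only issued afterwards, so an entry that already made it into the store still has a positive refcount when addref looks at it.

    #include <assert.h>
    #include <stdio.h>

    /* Toy dedup entry: once synced to the store, refcnt was > 0. */
    typedef struct {
        int      synced;
        unsigned refcnt;
    } toy_dde_t;

    /* Phase 1 of a toy sync pass: apply pending clones. */
    static void
    toy_addref(toy_dde_t *dde)
    {
        /* Frees have not been issued yet in this pass, so this holds. */
        if (dde->synced)
            assert(dde->refcnt > 0);
        dde->refcnt++;
    }

    /* Phase 2 of the same pass: issue the queued frees. */
    static void
    toy_free(toy_dde_t *dde)
    {
        assert(dde->refcnt > 0);
        dde->refcnt--;
    }

    int
    main(void)
    {
        toy_dde_t dde = { .synced = 1, .refcnt = 1 };
        toy_addref(&dde);   /* clone bumps it to 2 */
        toy_free(&dde);     /* later free drops it back to 1 */
        printf("refcnt = %u\n", dde.refcnt);
        return (0);
    }
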
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-By: Klara, Inc. Closes #15004 --- module/zfs/ddt.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 33fea0ba3d3c..1fb198219904 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -1209,10 +1209,19 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ASSERT3S(dde->dde_class, <, DDT_CLASSES); ddp = &dde->dde_phys[BP_GET_NDVAS(bp)]; - if (ddp->ddp_refcnt == 0) { - /* This should never happen? */ - ddt_phys_fill(ddp, bp); - } + + /* + * This entry already existed (dde_type is real), so it must + * have refcnt >0 at the start of this txg. We are called from + * brt_pending_apply(), before frees are issued, so the refcnt + * can't be lowered yet. Therefore, it must be >0. We assert + * this because if the order of BRT and DDT interactions were + * ever to change and the refcnt was ever zero here, then + * likely further action is required to fill out the DDT entry, + * and this is a place that is likely to be missed in testing. + */ + ASSERT3U(ddp->ddp_refcnt, >, 0); + ddt_phys_addref(ddp); result = B_TRUE; } else { From 2b10e32561dff234144c0b0d998c60359864ac71 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Fri, 30 Jun 2023 12:42:02 -0400 Subject: [PATCH 176/180] Pack our DDT ZAPs a bit denser. The DDT is really inefficient on 4k and up vdevs, because it always allocates 4k blocks, and while compression could save us somewhat at ashift 9, that stops being true. So let's change the default to 32 KiB, which seems like a reasonable compromise between improved space savings and inflated write sizes for DDT updates. Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #14654 --- man/man4/zfs.4 | 10 ++++++++++ module/zfs/ddt_zap.c | 13 ++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 04bbbc5fdf59..271b02b6ee42 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -239,6 +239,16 @@ relative to the pool. Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. . +.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +Default DDT ZAP data block size as a power of 2. Note that changing this after +creating a DDT on the pool will not affect existing DDTs, only newly created +ones. +. +.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +Default DDT ZAP indirect block size as a power of 2. Note that changing this +after creating a DDT on the pool will not affect existing DDTs, only newly +created ones. +. .It Sy zfs_default_bs Ns = Ns Sy 9 Po 512 B Pc Pq int Default dnode block size as a power of 2. . 
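Both new tunables are expressed as block shifts rather than byte counts; the quick sketch below (plain arithmetic, nothing ZFS-specific) shows how the old hard-coded shift of 12 and the new default of 15 map to the 4 KiB and 32 KiB figures quoted above.

    #include <stdio.h>

    int
    main(void)
    {
        unsigned int old_shift = 12, new_shift = 15;          /* block shifts */
        printf("old default: %u bytes\n", 1u << old_shift);   /* 4096  */
        printf("new default: %u bytes\n", 1u << new_shift);   /* 32768 */
        return (0);
    }
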
diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 27dbbc55f121..8f6397a6d108 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -31,8 +31,8 @@ #include #include -static const int ddt_zap_leaf_blockshift = 12; -static const int ddt_zap_indirect_blockshift = 12; +static unsigned int ddt_zap_default_bs = 15; +static unsigned int ddt_zap_default_ibs = 15; static int ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) @@ -43,7 +43,7 @@ ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) flags |= ZAP_FLAG_PRE_HASHED_KEY; *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, - ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, + ddt_zap_default_bs, ddt_zap_default_ibs, DMU_OT_NONE, 0, tx); return (*objectp == 0 ? SET_ERROR(ENOTSUP) : 0); @@ -166,3 +166,10 @@ const ddt_ops_t ddt_zap_ops = { ddt_zap_walk, ddt_zap_count, }; + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_bs, UINT, ZMOD_RW, + "DDT ZAP leaf blockshift"); +ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_ibs, UINT, ZMOD_RW, + "DDT ZAP indirect blockshift"); +/* END CSTYLED */ From ac8ae18d2255eab48a77e3fa4e9e6e3230bde015 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 30 Jun 2023 10:03:41 -0700 Subject: [PATCH 177/180] Revert "spa.h: use IN_BASE instead of IN_FREEBSD_BASE" This reverts commit 77a3bb1f47e67c233eb1961b8746748c02bafde1. Signed-off-by: Brian Behlendorf --- include/sys/spa.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/sys/spa.h b/include/sys/spa.h index ac0847793c84..1fa2044008dc 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -723,12 +723,12 @@ typedef enum spa_mode { * Send TRIM commands in-line during normal pool operation while deleting. * OFF: no * ON: yes - * NB: IN_BASE is defined within the FreeBSD sources. + * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources. */ typedef enum { SPA_AUTOTRIM_OFF = 0, /* default */ SPA_AUTOTRIM_ON, -#ifdef IN_BASE +#ifdef IN_FREEBSD_BASE SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON, #else SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF, From 945e39fc3a34dbffb9a630a99ae523f2e03e314b Mon Sep 17 00:00:00 2001 From: Prakash Surya Date: Fri, 30 Jun 2023 11:34:05 -0700 Subject: [PATCH 178/180] Enable tuning of ZVOL open timeout value The default timeout for ZVOL opens may not be sufficient for all cases, so we should enable the value to be more easily tuned to account for systems where the default value is insufficient. 
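For context, the timeout bounds how long the open path keeps retrying before giving up, which is why a slow-to-settle system can need a value larger than the old fixed 1000 ms. The sketch below shows the general shape of such a bounded retry loop in plain C; the helper names are invented for illustration and this is not the actual zvol_os.c logic.

    #include <stdbool.h>
    #include <stdio.h>
    #include <time.h>

    static unsigned int open_timeout_ms = 1000;   /* the tunable */

    /* Stand-in for an open attempt that can transiently fail. */
    static bool
    try_open(int attempt)
    {
        return (attempt >= 3);   /* pretend the device shows up eventually */
    }

    static bool
    open_with_retries(void)
    {
        struct timespec delay = { 0, 10 * 1000 * 1000 };   /* 10 ms */
        unsigned int waited_ms = 0;
        int attempt = 0;

        while (!try_open(attempt++)) {
            if (waited_ms >= open_timeout_ms)
                return (false);   /* give up: timeout exceeded */
            nanosleep(&delay, NULL);
            waited_ms += 10;
        }
        return (true);
    }

    int
    main(void)
    {
        printf("opened: %d\n", open_with_retries());
        return (0);
    }
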
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Matthew Ahrens Signed-off-by: Prakash Surya Closes #15023 --- module/os/linux/zfs/zvol_os.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index cdf32c78b4fe..38bc8e2c4eeb 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -54,7 +54,7 @@ static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS -static const unsigned int zvol_open_timeout_ms = 1000; +static unsigned int zvol_open_timeout_ms = 1000; #endif static unsigned int zvol_threads = 0; @@ -1612,4 +1612,9 @@ MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, "Process volblocksize blocks per thread"); #endif +#ifndef HAVE_BLKDEV_GET_ERESTARTSYS +module_param(zvol_open_timeout_ms, uint, 0644); +MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); +#endif + /* END CSTYLED */ From 009d3288dea524c7ad373b04b65bee8bb6f0bfea Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 30 Jun 2023 11:42:14 -0700 Subject: [PATCH 179/180] Tag 2.2.0-rc1 New features: - Fully adaptive ARC eviction (#14359) - Block cloning (#13392) - Scrub error log (#12812, #12355) - Linux container support (#14070, #14097, #12263) - BLAKE3 Checksums (#12918) - Corrective "zfs receive" (#9372) Signed-off-by: Brian Behlendorf --- META | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/META b/META index e4b476aff112..5f834d5cc7c4 100644 --- a/META +++ b/META @@ -1,8 +1,8 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.1.99 -Release: 1 +Version: 2.2.0 +Release: rc1 Release-Tags: relext License: CDDL Author: OpenZFS From ca960ce56ce1bfe207e4d80ba6e5ab67ea41b32f Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 30 Jun 2023 13:32:18 -0700 Subject: [PATCH 180/180] Update META Increase the version to 2.2.99 to indicate the master branch is newer than the 2.2.x release. This ensures packages built from master branch are considered to be newer than the last release. Signed-off-by: Brian Behlendorf --- META | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/META b/META index 5f834d5cc7c4..e6488a6fa6f0 100644 --- a/META +++ b/META @@ -1,8 +1,8 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.2.0 -Release: rc1 +Version: 2.2.99 +Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS