From 5a63c719198ff47a087fe387231460eeecad811a Mon Sep 17 00:00:00 2001
From: Matthew Ahrens
Date: Tue, 9 Jun 2020 10:41:01 -0700
Subject: [PATCH] File incorrectly zeroed when receiving incremental stream that toggles -L

Background:

By increasing the recordsize property above the default of 128KB, a
filesystem may have "large" blocks. By default, a send stream of such a
filesystem does not contain large WRITE records; instead, it decreases
objects' block sizes to 128KB and splits the large blocks into 128KB
blocks, allowing the large-block filesystem to be received by a system
that does not support the `large_blocks` feature. A send stream
generated by `zfs send -L` (or `--large-block`) preserves the large
block size on the receiving system, by using large WRITE records.

When receiving an incremental send stream for a filesystem with large
blocks, if the send stream's -L flag was toggled, a bug is encountered
in which the file's contents are incorrectly zeroed out. The contents
of any blocks that were not modified by this send stream will be lost.
"Toggled" means that the previous send used `-L`, but this incremental
does not use `-L` (-L to no-L); or that the previous send did not use
`-L`, but this incremental does use `-L` (no-L to -L).

Changes:

This commit addresses the problem with several changes to the semantics
of zfs send/receive:

1. "-L to no-L" incrementals are rejected. If the previous send used
   `-L`, but this incremental does not use `-L`, the `zfs receive` will
   fail with this error message:

       incremental send stream requires -L (--large-block), to match
       previous receive.

2. "no-L to -L" incrementals are handled correctly, preserving the
   smaller (128KB) block size of any already-received files that used
   large blocks on the sending system but were split by `zfs send`
   without the `-L` flag.

3. A new send stream format flag is added, `SWITCH_TO_LARGE_BLOCKS`.
   This feature indicates that we can correctly handle "no-L to -L"
   incrementals.
   This flag is currently not set on any send streams. In the future,
   we intend for incremental send streams of snapshots that have large
   blocks to use `-L` by default, and these streams will also have the
   `SWITCH_TO_LARGE_BLOCKS` feature set. This ensures that streams from
   the default use of `zfs send` won't encounter the bug mentioned
   above, because they can't be received by software with the bug.

Implementation notes:

To facilitate accessing the ZPL's generation number,
`zfs_space_delta_cb()` has been renamed to `zpl_get_file_info()` and
restructured to fill in a struct with ZPL-specific info including owner
and generation.

In the "no-L to -L" case, if this is a compressed send stream (from
`zfs send -cL`), large WRITE records that are being written to small
(128KB) blocksize files need to be decompressed so that they can be
split up and written as multiple blocks. The zio pipeline will
recompress each smaller block individually.

A new test case, `send-L_toggle`, is added, which tests the "no-L to -L"
case and verifies that we get an error for the "-L to no-L" case.

Reviewed-by: Paul Dagnelie Reviewed-by: Brian Behlendorf Signed-off-by: Matthew Ahrens Closes #6224 Closes #10383 --- cmd/zhack/zhack.c | 4 +- include/sys/dmu.h | 13 +- include/sys/dmu_objset.h | 2 + include/sys/fs/zfs.h | 1 + include/sys/zfs_ioctl.h | 18 +- include/sys/zfs_quota.h | 29 +- lib/libzfs/libzfs_sendrecv.c | 6 + module/os/freebsd/zfs/zfs_vfsops.c | 2 +- module/os/linux/zfs/zfs_vfsops.c | 2 +- module/zfs/dmu_objset.c | 49 ++- module/zfs/dmu_recv.c | 410 ++++++++++++++---- module/zfs/zfs_quota.c | 61 ++- tests/runfiles/common.run | 2 +- .../tests/functional/rsend/Makefile.am | 1 + .../tests/functional/rsend/send-L_toggle.ksh | 65 +++ 15 files changed, 500 insertions(+), 165 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/rsend/send-L_toggle.ksh diff --git a/cmd/zhack/zhack.c b/cmd/zhack/zhack.c index bb974133d1d0..4d958fe4365a 100644 --- a/cmd/zhack/zhack.c +++ b/cmd/zhack/zhack.c @@ -103,8 +103,8 @@ fatal(spa_t *spa, void *tag, const char *fmt, ...) /* ARGSUSED */ static int -space_delta_cb(dmu_object_type_t bonustype, void *data, - uint64_t *userp, uint64_t *groupp, uint64_t *projectp) +space_delta_cb(dmu_object_type_t bonustype, const void *data, + zfs_file_info_t *zoi) { /* * Is it a valid type of object to track? 
diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 139f3cbdfea6..5174bdc4528f 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -1013,10 +1013,17 @@ extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); -typedef int objset_used_cb_t(dmu_object_type_t bonustype, - void *bonus, uint64_t *userp, uint64_t *groupp, uint64_t *projectp); +typedef struct zfs_file_info { + uint64_t zfi_user; + uint64_t zfi_group; + uint64_t zfi_project; + uint64_t zfi_generation; +} zfs_file_info_t; + +typedef int file_info_cb_t(dmu_object_type_t bonustype, const void *data, + struct zfs_file_info *zoi); extern void dmu_objset_register_type(dmu_objset_type_t ost, - objset_used_cb_t *cb); + file_info_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index 9b6614e98b71..a77131ef1c59 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -254,6 +254,8 @@ boolean_t dmu_objset_projectquota_enabled(objset_t *os); boolean_t dmu_objset_projectquota_present(objset_t *os); boolean_t dmu_objset_projectquota_upgradable(objset_t *os); void dmu_objset_id_quota_upgrade(objset_t *os); +int dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, + const void *data, zfs_file_info_t *zfi); int dmu_fsname(const char *snapname, char *buf); diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index ecdfd42d01a8..575a4af51439 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1336,6 +1336,7 @@ typedef enum { ZFS_ERR_EXPORT_IN_PROGRESS, ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR, ZFS_ERR_STREAM_TRUNCATED, + ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH, } zfs_errno_t; /* diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index d4ffe70bbe36..78d33dedaf92 100644 --- a/include/sys/zfs_ioctl.h +++ 
b/include/sys/zfs_ioctl.h @@ -107,6 +107,22 @@ typedef enum drr_headertype { #define DMU_BACKUP_FEATURE_RAW (1 << 24) /* flag #25 is reserved for the ZSTD compression feature */ #define DMU_BACKUP_FEATURE_HOLDS (1 << 26) +/* + * The SWITCH_TO_LARGE_BLOCKS feature indicates that we can receive + * incremental LARGE_BLOCKS streams (those with WRITE records of >128KB) even + * if the previous send did not use LARGE_BLOCKS, and thus its large blocks + * were split into multiple 128KB WRITE records. (See + * flush_write_batch_impl() and receive_object()). Older software that does + * not support this flag may encounter a bug when switching to large blocks, + * which causes files to incorrectly be zeroed. + * + * This flag is currently not set on any send streams. In the future, we + * intend for incremental send streams of snapshots that have large blocks to + * use LARGE_BLOCKS by default, and these streams will also have the + * SWITCH_TO_LARGE_BLOCKS feature set. This ensures that streams from the + * default use of "zfs send" won't encounter the bug mentioned above. + */ +#define DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS (1 << 27) /* * Mask of all supported backup features @@ -116,7 +132,7 @@ typedef enum drr_headertype { DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_LARGE_BLOCKS | \ DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE | \ DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_HOLDS | \ - DMU_BACKUP_FEATURE_REDACTED) + DMU_BACKUP_FEATURE_REDACTED | DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS) /* Are all features in the given flag word currently supported? 
*/ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) diff --git a/include/sys/zfs_quota.h b/include/sys/zfs_quota.h index ec4dc8f16392..b215b8dd0013 100644 --- a/include/sys/zfs_quota.h +++ b/include/sys/zfs_quota.h @@ -24,23 +24,22 @@ #include #include -#include -extern int zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - uint64_t *userp, uint64_t *groupp, uint64_t *projectp); +struct zfsvfs; +struct zfs_file_info_t; -extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t *valuep); -extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); -extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, - const char *domain, uint64_t rid, uint64_t quota); +extern int zpl_get_file_info(dmu_object_type_t, + const void *, struct zfs_file_info *); -extern boolean_t zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, - uint64_t id); -extern boolean_t zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, - uint64_t id); -extern boolean_t zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, - uint64_t id); +extern int zfs_userspace_one(struct zfsvfs *, zfs_userquota_prop_t, + const char *, uint64_t, uint64_t *); +extern int zfs_userspace_many(struct zfsvfs *, zfs_userquota_prop_t, + uint64_t *, void *, uint64_t *); +extern int zfs_set_userquota(struct zfsvfs *, zfs_userquota_prop_t, + const char *, uint64_t, uint64_t); + +extern boolean_t zfs_id_overobjquota(struct zfsvfs *, uint64_t, uint64_t); +extern boolean_t zfs_id_overblockquota(struct zfsvfs *, uint64_t, uint64_t); +extern boolean_t zfs_id_overquota(struct zfsvfs *, uint64_t, uint64_t); #endif diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 39b5c6df1063..3fffc426c052 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -4802,6 +4802,12 @@ zfs_receive_one(libzfs_handle_t *hdl, 
int infd, const char *tosnap, ioctl_err == ECKSUM); (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); break; + case ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental send stream requires -L " + "(--large-block), to match previous receive.")); + (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); + break; case ENOTSUP: zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be upgraded to receive this stream.")); diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index 3c37d3faa982..317773e444a3 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -2202,7 +2202,7 @@ zfs_init(void) */ zfs_vnodes_adjust(); - dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); + dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); } diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 28ea34a00c60..ea5971b0c5ec 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -2131,7 +2131,7 @@ zfs_init(void) { zfsctl_init(); zfs_znode_init(); - dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); + dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); register_filesystem(&zpl_fs_type); } diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 16b93a4fe4f4..d305cee409ee 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1728,19 +1728,29 @@ dmu_objset_is_dirty(objset_t *os, uint64_t txg) return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK])); } -static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; +static file_info_cb_t *file_cbs[DMU_OST_NUMTYPES]; void -dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) +dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb) { - used_cbs[ost] = cb; + file_cbs[ost] = cb; +} + +int +dmu_get_file_info(objset_t *os, dmu_object_type_t 
bonustype, const void *data, + zfs_file_info_t *zfi) +{ + file_info_cb_t *cb = file_cbs[os->os_phys->os_type]; + if (cb == NULL) + return (EINVAL); + return (cb(bonustype, data, zfi)); } boolean_t dmu_objset_userused_enabled(objset_t *os) { return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && - used_cbs[os->os_phys->os_type] != NULL && + file_cbs[os->os_phys->os_type] != NULL && DMU_USERUSED_DNODE(os) != NULL); } @@ -1754,7 +1764,7 @@ dmu_objset_userobjused_enabled(objset_t *os) boolean_t dmu_objset_projectquota_enabled(objset_t *os) { - return (used_cbs[os->os_phys->os_type] != NULL && + return (file_cbs[os->os_phys->os_type] != NULL && DMU_PROJECTUSED_DNODE(os) != NULL && spa_feature_is_enabled(os->os_spa, SPA_FEATURE_PROJECT_QUOTA)); } @@ -2089,9 +2099,6 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) objset_t *os = dn->dn_objset; void *data = NULL; dmu_buf_impl_t *db = NULL; - uint64_t *user = NULL; - uint64_t *group = NULL; - uint64_t *project = NULL; int flags = dn->dn_id_flags; int error; boolean_t have_spill = B_FALSE; @@ -2145,23 +2152,23 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) return; } - if (before) { - ASSERT(data); - user = &dn->dn_olduid; - group = &dn->dn_oldgid; - project = &dn->dn_oldprojid; - } else if (data) { - user = &dn->dn_newuid; - group = &dn->dn_newgid; - project = &dn->dn_newprojid; - } - /* * Must always call the callback in case the object * type has changed and that type isn't an object type to track */ - error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, - user, group, project); + zfs_file_info_t zfi; + error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi); + + if (before) { + ASSERT(data); + dn->dn_olduid = zfi.zfi_user; + dn->dn_oldgid = zfi.zfi_group; + dn->dn_oldprojid = zfi.zfi_project; + } else if (data) { + dn->dn_newuid = zfi.zfi_user; + dn->dn_newgid = zfi.zfi_group; + dn->dn_newprojid = zfi.zfi_project; + } /* * Preserve 
existing uid/gid when the callback can't determine diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 29fbe854d793..17ebeb682d93 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -104,6 +104,7 @@ struct receive_writer_arg { boolean_t resumable; boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */ boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */ + boolean_t full; /* this is a full send stream */ uint64_t last_object; uint64_t last_offset; uint64_t max_object; /* highest object ID referenced in stream */ @@ -333,6 +334,21 @@ redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin) return (ret); } +/* + * If we previously received a stream with --large-block, we don't support + * receiving an incremental on top of it without --large-block. This avoids + * forcing a read-modify-write or trying to re-aggregate a string of WRITE + * records. + */ +static int +recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags) +{ + if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_BLOCKS) && + !(featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH)); + return (0); +} + static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid, uint64_t featureflags) @@ -445,6 +461,12 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, return (SET_ERROR(EINVAL)); } + error = recv_check_large_blocks(snap, featureflags); + if (error != 0) { + dsl_dataset_rele(snap, FTAG); + return (error); + } + dsl_dataset_rele(snap, FTAG); } else { /* if full, then must be forced */ @@ -479,7 +501,6 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, } return (0); - } /* @@ -725,6 +746,13 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) } } + error = recv_check_large_blocks(ds, featureflags); + if (error != 0) { + dsl_dataset_rele_flags(origin, dsflags, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return 
(error); + } + dsl_dataset_rele_flags(origin, dsflags, FTAG); } @@ -1050,6 +1078,13 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) } } } + + error = recv_check_large_blocks(ds, drc->drc_featureflags); + if (error != 0) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (error); + } + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (0); } @@ -1289,14 +1324,251 @@ save_resume_state(struct receive_writer_arg *rwa, rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; } +static int +receive_object_is_same_generation(objset_t *os, uint64_t object, + dmu_object_type_t old_bonus_type, dmu_object_type_t new_bonus_type, + const void *new_bonus, boolean_t *samegenp) +{ + zfs_file_info_t zoi; + int err; + + dmu_buf_t *old_bonus_dbuf; + err = dmu_bonus_hold(os, object, FTAG, &old_bonus_dbuf); + if (err != 0) + return (err); + err = dmu_get_file_info(os, old_bonus_type, old_bonus_dbuf->db_data, + &zoi); + dmu_buf_rele(old_bonus_dbuf, FTAG); + if (err != 0) + return (err); + uint64_t old_gen = zoi.zfi_generation; + + err = dmu_get_file_info(os, new_bonus_type, new_bonus, &zoi); + if (err != 0) + return (err); + uint64_t new_gen = zoi.zfi_generation; + + *samegenp = (old_gen == new_gen); + return (0); +} + +static int +receive_handle_existing_object(const struct receive_writer_arg *rwa, + const struct drr_object *drro, const dmu_object_info_t *doi, + const void *bonus_data, + uint64_t *object_to_hold, uint32_t *new_blksz) +{ + uint32_t indblksz = drro->drr_indblkshift ? + 1ULL << drro->drr_indblkshift : 0; + int nblkptr = deduce_nblkptr(drro->drr_bonustype, + drro->drr_bonuslen); + uint8_t dn_slots = drro->drr_dn_slots != 0 ? 
+ drro->drr_dn_slots : DNODE_MIN_SLOTS; + boolean_t do_free_range = B_FALSE; + int err; + + *object_to_hold = drro->drr_object; + + /* nblkptr should be bounded by the bonus size and type */ + if (rwa->raw && nblkptr != drro->drr_nblkptr) + return (SET_ERROR(EINVAL)); + + /* + * After the previous send stream, the sending system may + * have freed this object, and then happened to re-allocate + * this object number in a later txg. In this case, we are + * receiving a different logical file, and the block size may + * appear to be different. i.e. we may have a different + * block size for this object than what the send stream says. + * In this case we need to remove the object's contents, + * so that its structure can be changed and then its contents + * entirely replaced by subsequent WRITE records. + * + * If this is a -L (--large-block) incremental stream, and + * the previous stream was not -L, the block size may appear + * to increase. i.e. we may have a smaller block size for + * this object than what the send stream says. In this case + * we need to keep the object's contents and block size + * intact, so that we don't lose parts of the object's + * contents that are not changed by this incremental send + * stream. + * + * We can distinguish between the two above cases by using + * the ZPL's generation number (see + * receive_object_is_same_generation()). However, we only + * want to rely on the generation number when absolutely + * necessary, because with raw receives, the generation is + * encrypted. We also want to minimize dependence on the + * ZPL, so that other types of datasets can also be received + * (e.g. ZVOLs, although note that ZVOLS currently do not + * reallocate their objects or change their structure). + * Therefore, we check a number of different cases where we + * know it is safe to discard the object's contents, before + * using the ZPL's generation number to make the above + * distinction. 
+ */ + if (drro->drr_blksz != doi->doi_data_block_size) { + if (rwa->raw) { + /* + * RAW streams always have large blocks, so + * we are sure that the data is not needed + * due to changing --large-block to be on. + * Which is fortunate since the bonus buffer + * (which contains the ZPL generation) is + * encrypted, and the key might not be + * loaded. + */ + do_free_range = B_TRUE; + } else if (rwa->full) { + /* + * This is a full send stream, so it always + * replaces what we have. Even if the + * generation numbers happen to match, this + * can not actually be the same logical file. + * This is relevant when receiving a full + * send as a clone. + */ + do_free_range = B_TRUE; + } else if (drro->drr_type != + DMU_OT_PLAIN_FILE_CONTENTS || + doi->doi_type != DMU_OT_PLAIN_FILE_CONTENTS) { + /* + * PLAIN_FILE_CONTENTS are the only type of + * objects that have ever been stored with + * large blocks, so we don't need the special + * logic below. ZAP blocks can shrink (when + * there's only one block), so we don't want + * to hit the error below about block size + * only increasing. + */ + do_free_range = B_TRUE; + } else if (doi->doi_max_offset <= + doi->doi_data_block_size) { + /* + * There is only one block. We can free it, + * because its contents will be replaced by a + * WRITE record. This can not be the no-L -> + * -L case, because the no-L case would have + * resulted in multiple blocks. If we + * supported -L -> no-L, it would not be safe + * to free the file's contents. Fortunately, + * that is not allowed (see + * recv_check_large_blocks()). + */ + do_free_range = B_TRUE; + } else { + boolean_t is_same_gen; + err = receive_object_is_same_generation(rwa->os, + drro->drr_object, doi->doi_bonus_type, + drro->drr_bonustype, bonus_data, &is_same_gen); + if (err != 0) + return (SET_ERROR(EINVAL)); + + if (is_same_gen) { + /* + * This is the same logical file, and + * the block size must be increasing. 
+ * It could only decrease if + * --large-block was changed to be + * off, which is checked in + * recv_check_large_blocks(). + */ + if (drro->drr_blksz <= + doi->doi_data_block_size) + return (SET_ERROR(EINVAL)); + /* + * We keep the existing blocksize and + * contents. + */ + *new_blksz = + doi->doi_data_block_size; + } else { + do_free_range = B_TRUE; + } + } + } + + /* nblkptr can only decrease if the object was reallocated */ + if (nblkptr < doi->doi_nblkptr) + do_free_range = B_TRUE; + + /* number of slots can only change on reallocation */ + if (dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) + do_free_range = B_TRUE; + + /* + * For raw sends we also check a few other fields to + * ensure we are preserving the objset structure exactly + * as it was on the receive side: + * - A changed indirect block size + * - A smaller nlevels + */ + if (rwa->raw) { + if (indblksz != doi->doi_metadata_block_size) + do_free_range = B_TRUE; + if (drro->drr_nlevels < doi->doi_indirection) + do_free_range = B_TRUE; + } + + if (do_free_range) { + err = dmu_free_long_range(rwa->os, drro->drr_object, + 0, DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); + } + + /* + * The dmu does not currently support decreasing nlevels + * or changing the number of dnode slots on an object. For + * non-raw sends, this does not matter and the new object + * can just use the previous one's nlevels. For raw sends, + * however, the structure of the received dnode (including + * nlevels and dnode slots) must match that of the send + * side. Therefore, instead of using dmu_object_reclaim(), + * we must free the object completely and call + * dmu_object_claim_dnsize() instead. 
+ */ + if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) || + dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) { + err = dmu_free_long_object(rwa->os, drro->drr_object); + if (err != 0) + return (SET_ERROR(EINVAL)); + + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + *object_to_hold = DMU_NEW_OBJECT; + } + + /* + * For raw receives, free everything beyond the new incoming + * maxblkid. Normally this would be done with a DRR_FREE + * record that would come after this DRR_OBJECT record is + * processed. However, for raw receives we manually set the + * maxblkid from the drr_maxblkid and so we must first free + * everything above that blkid to ensure the DMU is always + * consistent with itself. We will never free the first block + * of the object here because a maxblkid of 0 could indicate + * an object with a single block or one with no blocks. This + * free may be skipped when dmu_free_long_range() was called + * above since it covers the entire object's contents. + */ + if (rwa->raw && *object_to_hold != DMU_NEW_OBJECT && !do_free_range) { + err = dmu_free_long_range(rwa->os, drro->drr_object, + (drro->drr_maxblkid + 1) * doi->doi_data_block_size, + DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); + } + return (0); +} + noinline static int receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, void *data) { dmu_object_info_t doi; dmu_tx_t *tx; - uint64_t object; int err; + uint32_t new_blksz = drro->drr_blksz; uint8_t dn_slots = drro->drr_dn_slots != 0 ? drro->drr_dn_slots : DNODE_MIN_SLOTS; @@ -1360,86 +1632,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, * Raw receives will also check that the indirect structure of the * dnode hasn't changed. */ + uint64_t object_to_hold; if (err == 0) { - uint32_t indblksz = drro->drr_indblkshift ? 
- 1ULL << drro->drr_indblkshift : 0; - int nblkptr = deduce_nblkptr(drro->drr_bonustype, - drro->drr_bonuslen); - boolean_t did_free = B_FALSE; - - object = drro->drr_object; - - /* nblkptr should be bounded by the bonus size and type */ - if (rwa->raw && nblkptr != drro->drr_nblkptr) - return (SET_ERROR(EINVAL)); - - /* - * Check for indicators that the object was freed and - * reallocated. For all sends, these indicators are: - * - A changed block size - * - A smaller nblkptr - * - A changed dnode size - * For raw sends we also check a few other fields to - * ensure we are preserving the objset structure exactly - * as it was on the receive side: - * - A changed indirect block size - * - A smaller nlevels - */ - if (drro->drr_blksz != doi.doi_data_block_size || - nblkptr < doi.doi_nblkptr || - dn_slots != doi.doi_dnodesize >> DNODE_SHIFT || - (rwa->raw && - (indblksz != doi.doi_metadata_block_size || - drro->drr_nlevels < doi.doi_indirection))) { - err = dmu_free_long_range(rwa->os, drro->drr_object, - 0, DMU_OBJECT_END); - if (err != 0) - return (SET_ERROR(EINVAL)); - else - did_free = B_TRUE; - } - - /* - * The dmu does not currently support decreasing nlevels - * or changing the number of dnode slots on an object. For - * non-raw sends, this does not matter and the new object - * can just use the previous one's nlevels. For raw sends, - * however, the structure of the received dnode (including - * nlevels and dnode slots) must match that of the send - * side. Therefore, instead of using dmu_object_reclaim(), - * we must free the object completely and call - * dmu_object_claim_dnsize() instead. 
- */ - if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) || - dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) { - err = dmu_free_long_object(rwa->os, drro->drr_object); - if (err != 0) - return (SET_ERROR(EINVAL)); - - txg_wait_synced(dmu_objset_pool(rwa->os), 0); - object = DMU_NEW_OBJECT; - } - - /* - * For raw receives, free everything beyond the new incoming - * maxblkid. Normally this would be done with a DRR_FREE - * record that would come after this DRR_OBJECT record is - * processed. However, for raw receives we manually set the - * maxblkid from the drr_maxblkid and so we must first free - * everything above that blkid to ensure the DMU is always - * consistent with itself. We will never free the first block - * of the object here because a maxblkid of 0 could indicate - * an object with a single block or one with no blocks. This - * free may be skipped when dmu_free_long_range() was called - * above since it covers the entire object's contents. - */ - if (rwa->raw && object != DMU_NEW_OBJECT && !did_free) { - err = dmu_free_long_range(rwa->os, drro->drr_object, - (drro->drr_maxblkid + 1) * doi.doi_data_block_size, - DMU_OBJECT_END); - if (err != 0) - return (SET_ERROR(EINVAL)); - } + err = receive_handle_existing_object(rwa, drro, &doi, data, + &object_to_hold, &new_blksz); } else if (err == EEXIST) { /* * The object requested is currently an interior slot of a @@ -1454,10 +1650,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, return (SET_ERROR(EINVAL)); /* object was freed and we are about to allocate a new one */ - object = DMU_NEW_OBJECT; + object_to_hold = DMU_NEW_OBJECT; } else { /* object is free and we are about to allocate a new one */ - object = DMU_NEW_OBJECT; + object_to_hold = DMU_NEW_OBJECT; } /* @@ -1492,27 +1688,27 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, } tx = dmu_tx_create(rwa->os); - dmu_tx_hold_bonus(tx, object); - dmu_tx_hold_write(tx, object, 0, 0); + 
dmu_tx_hold_bonus(tx, object_to_hold); + dmu_tx_hold_write(tx, object_to_hold, 0, 0); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } - if (object == DMU_NEW_OBJECT) { + if (object_to_hold == DMU_NEW_OBJECT) { /* Currently free, wants to be allocated */ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, - drro->drr_type, drro->drr_blksz, + drro->drr_type, new_blksz, drro->drr_bonustype, drro->drr_bonuslen, dn_slots << DNODE_SHIFT, tx); } else if (drro->drr_type != doi.doi_type || - drro->drr_blksz != doi.doi_data_block_size || + new_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* Currently allocated, but with different properties */ err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object, - drro->drr_type, drro->drr_blksz, + drro->drr_type, new_blksz, drro->drr_bonustype, drro->drr_bonuslen, dn_slots << DNODE_SHIFT, rwa->spill ? DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx); @@ -1578,6 +1774,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, * For non-new objects block size and indirect block * shift cannot change and nlevels can only increase. */ + ASSERT3U(new_blksz, ==, drro->drr_blksz); VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object, drro->drr_blksz, drro->drr_indblkshift, tx)); VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object, @@ -1707,6 +1904,40 @@ flush_write_batch_impl(struct receive_writer_arg *rwa) DRR_WRITE_PAYLOAD_SIZE(drrw)); } + /* + * If we are receiving an incremental large-block stream into + * a dataset that previously did a non-large-block receive, + * the WRITE record may be larger than the object's block + * size. dmu_assign_arcbuf_by_dnode() handles this as long + * as the arcbuf is not compressed, so decompress it here if + * necessary. 
+ */ + if (drrw->drr_logical_size != dn->dn_datablksz && + arc_get_compression(abuf) != ZIO_COMPRESS_OFF) { + ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz); + zbookmark_phys_t zb = { + .zb_objset = dmu_objset_id(rwa->os), + .zb_object = rwa->last_object, + .zb_level = 0, + .zb_blkid = + drrw->drr_offset >> dn->dn_datablkshift, + }; + + /* + * The size of loaned arc bufs is counted in + * arc_loaned_bytes. When we untransform + * (decompress) the buf, its size increases. To + * ensure that arc_loaned_bytes remains accurate, we + * need to return (un-loan) the buf (with its + * compressed size) and then re-loan it (with its + * new, uncompressed size). + */ + arc_return_buf(abuf, FTAG); + VERIFY0(arc_untransform(abuf, dmu_objset_spa(rwa->os), + &zb, B_FALSE)); + arc_loan_inuse_buf(abuf, FTAG); + } + err = dmu_assign_arcbuf_by_dnode(dn, drrw->drr_offset, abuf, tx); if (err != 0) { @@ -2710,6 +2941,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) rwa->resumable = drc->drc_resumable; rwa->raw = drc->drc_raw; rwa->spill = drc->drc_spill; + rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0); rwa->os->os_raw_receive = drc->drc_raw; list_create(&rwa->write_batch, sizeof (struct receive_record_arg), offsetof(struct receive_record_arg, node.bqn_node)); diff --git a/module/zfs/zfs_quota.c b/module/zfs/zfs_quota.c index 6c83f79c97b8..e61db5c7ab83 100644 --- a/module/zfs/zfs_quota.c +++ b/module/zfs/zfs_quota.c @@ -39,21 +39,17 @@ #include int -zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, - uint64_t *userp, uint64_t *groupp, uint64_t *projectp) +zpl_get_file_info(dmu_object_type_t bonustype, const void *data, + zfs_file_info_t *zoi) { - sa_hdr_phys_t sa; - sa_hdr_phys_t *sap = data; - uint64_t flags; - int hdrsize; - boolean_t swap = B_FALSE; - /* * Is it a valid type of object to track? 
*/ if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) return (SET_ERROR(ENOENT)); + zoi->zfi_project = ZFS_DEFAULT_PROJID; + /* * If we have a NULL data pointer * then assume the id's aren't changing and @@ -64,52 +60,55 @@ zfs_space_delta_cb(dmu_object_type_t bonustype, void *data, return (SET_ERROR(EEXIST)); if (bonustype == DMU_OT_ZNODE) { - znode_phys_t *znp = data; - *userp = znp->zp_uid; - *groupp = znp->zp_gid; - *projectp = ZFS_DEFAULT_PROJID; + const znode_phys_t *znp = data; + zoi->zfi_user = znp->zp_uid; + zoi->zfi_group = znp->zp_gid; + zoi->zfi_generation = znp->zp_gen; return (0); } + const sa_hdr_phys_t *sap = data; if (sap->sa_magic == 0) { /* * This should only happen for newly created files * that haven't had the znode data filled in yet. */ - *userp = 0; - *groupp = 0; - *projectp = ZFS_DEFAULT_PROJID; + zoi->zfi_user = 0; + zoi->zfi_group = 0; + zoi->zfi_generation = 0; return (0); } - sa = *sap; + sa_hdr_phys_t sa = *sap; + boolean_t swap = B_FALSE; if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { sa.sa_magic = SA_MAGIC; sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); swap = B_TRUE; - } else { - VERIFY3U(sa.sa_magic, ==, SA_MAGIC); } + VERIFY3U(sa.sa_magic, ==, SA_MAGIC); - hdrsize = sa_hdrsize(&sa); + int hdrsize = sa_hdrsize(&sa); VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); - *userp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_UID_OFFSET)); - *groupp = *((uint64_t *)((uintptr_t)data + hdrsize + SA_GID_OFFSET)); - flags = *((uint64_t *)((uintptr_t)data + hdrsize + SA_FLAGS_OFFSET)); + uintptr_t data_after_hdr = (uintptr_t)data + hdrsize; + zoi->zfi_user = *((uint64_t *)(data_after_hdr + SA_UID_OFFSET)); + zoi->zfi_group = *((uint64_t *)(data_after_hdr + SA_GID_OFFSET)); + zoi->zfi_generation = *((uint64_t *)(data_after_hdr + SA_GEN_OFFSET)); + uint64_t flags = *((uint64_t *)(data_after_hdr + SA_FLAGS_OFFSET)); if (swap) flags = BSWAP_64(flags); - if (flags & ZFS_PROJID) - *projectp = *((uint64_t *)((uintptr_t)data + hdrsize + - 
SA_PROJID_OFFSET)); - else - *projectp = ZFS_DEFAULT_PROJID; + if (flags & ZFS_PROJID) { + zoi->zfi_project = + *((uint64_t *)(data_after_hdr + SA_PROJID_OFFSET)); + } if (swap) { - *userp = BSWAP_64(*userp); - *groupp = BSWAP_64(*groupp); - *projectp = BSWAP_64(*projectp); + zoi->zfi_user = BSWAP_64(zoi->zfi_user); + zoi->zfi_group = BSWAP_64(zoi->zfi_group); + zoi->zfi_project = BSWAP_64(zoi->zfi_project); + zoi->zfi_generation = BSWAP_64(zoi->zfi_generation); } return (0); } @@ -468,7 +467,7 @@ zfs_id_overquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) zfs_id_overobjquota(zfsvfs, usedobj, id)); } -EXPORT_SYMBOL(zfs_space_delta_cb); +EXPORT_SYMBOL(zpl_get_file_info); EXPORT_SYMBOL(zfs_userspace_one); EXPORT_SYMBOL(zfs_userspace_many); EXPORT_SYMBOL(zfs_set_userquota); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index cbad90ad1467..d8c109eb7ddc 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -781,7 +781,7 @@ tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos', 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled', 'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize', - 'send-c_recv_dedup', 'send_encrypted_hierarchy', + 'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_hierarchy', 'send_encrypted_props', 'send_encrypted_truncated_files', 'send_freeobjects', 'send_realloc_files', 'send_realloc_encrypted_files', 'send_spill_block', 'send_holds', diff --git a/tests/zfs-tests/tests/functional/rsend/Makefile.am b/tests/zfs-tests/tests/functional/rsend/Makefile.am index 7728a6481da2..ab3a1c6c3cba 100644 --- a/tests/zfs-tests/tests/functional/rsend/Makefile.am +++ b/tests/zfs-tests/tests/functional/rsend/Makefile.am @@ -41,6 +41,7 @@ dist_pkgdata_SCRIPTS = \ send-c_volume.ksh \ send-c_zstreamdump.ksh \ send-cpL_varied_recsize.ksh \ + send-L_toggle.ksh \ send_freeobjects.ksh \ send_partial_dataset.ksh \ 
send_realloc_dnode_size.ksh \
diff --git a/tests/zfs-tests/tests/functional/rsend/send-L_toggle.ksh b/tests/zfs-tests/tests/functional/rsend/send-L_toggle.ksh
new file mode 100755
index 000000000000..483efcc60548
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/rsend/send-L_toggle.ksh
@@ -0,0 +1,77 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2020 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify that send -L can be turned on in an incremental.
+# Verify that send -L can not be turned off in an incremental.
+#
+# Each send/receive pipeline is run through "eval" so that log_must and
+# log_mustnot check the exit status of the whole pipeline (i.e. of
+# "zfs recv"), rather than wrapping only the "zfs send" on the left of
+# the pipe.
+#
+
+function cleanup
+{
+	log_must_busy zfs destroy -r $TESTPOOL/fs
+	log_must_busy zfs destroy -r $TESTPOOL/recv
+}
+
+verify_runnable "both"
+
+log_assert "Verify toggling send -L works as expected"
+log_onexit cleanup
+
+# A compressed filesystem with 1MB records, so the file has large blocks.
+log_must zfs create -o compression=on -o recordsize=1m $TESTPOOL/fs
+
+log_must dd if=/dev/urandom of=/$TESTPOOL/fs/file bs=1024 count=1500
+
+log_must zfs snapshot $TESTPOOL/fs@snap
+
+# Write further into the file without truncating what @snap captured.
+log_must dd if=/dev/urandom of=/$TESTPOOL/fs/file bs=1024 count=1500 conv=notrunc seek=2048
+
+log_must zfs snapshot $TESTPOOL/fs@snap2
+
+log_must zfs create $TESTPOOL/recv
+
+# Neither the full nor the incremental send uses -L.
+log_must eval "zfs send -c $TESTPOOL/fs@snap | zfs recv $TESTPOOL/recv/noL-noL"
+log_must eval "zfs send -c -i @snap $TESTPOOL/fs@snap2 | zfs recv $TESTPOOL/recv/noL-noL"
+log_must diff /$TESTPOOL/fs/file /$TESTPOOL/recv/noL-noL/file
+
+# Both the full and the incremental send use -L.
+log_must eval "zfs send -c -L $TESTPOOL/fs@snap | zfs recv $TESTPOOL/recv/L-L"
+log_must eval "zfs send -c -L -i @snap $TESTPOOL/fs@snap2 | zfs recv $TESTPOOL/recv/L-L"
+log_must diff /$TESTPOOL/fs/file /$TESTPOOL/recv/L-L/file
+
+# -L is turned on for the incremental; the received file must still match.
+log_must eval "zfs send -c $TESTPOOL/fs@snap | zfs recv $TESTPOOL/recv/noL-L"
+log_must eval "zfs send -c -L -i @snap $TESTPOOL/fs@snap2 | zfs recv $TESTPOOL/recv/noL-L"
+log_must diff /$TESTPOOL/fs/file /$TESTPOOL/recv/noL-L/file
+
+# -L is turned off for the incremental; the receive must be rejected and
+# the received file must still match the first snapshot.
+log_must eval "zfs send -c -L $TESTPOOL/fs@snap | zfs recv $TESTPOOL/recv/L-noL"
+log_mustnot eval "zfs send -c -i @snap $TESTPOOL/fs@snap2 | zfs recv $TESTPOOL/recv/L-noL"
+log_must diff /$TESTPOOL/fs/.zfs/snapshot/snap/file /$TESTPOOL/recv/L-noL/file
+
+log_pass "Verify toggling send -L works as expected"