diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 365f93ba4281..bd8e11558335 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -24,6 +24,7 @@ * Copyright 2012 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. */ #include @@ -5193,8 +5194,7 @@ zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) continue; } if (holding) { - if (zfs_hold(zhp, delim+1, tag, recursive, - B_FALSE, -1) != 0) + if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0) ++errors; } else { if (zfs_release(zhp, delim+1, tag, recursive) != 0) diff --git a/cmd/zhack/zhack.c b/cmd/zhack/zhack.c index 99d26719450e..64ab8edbbc02 100644 --- a/cmd/zhack/zhack.c +++ b/cmd/zhack/zhack.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. */ /* @@ -152,7 +153,7 @@ import_pool(const char *target, boolean_t readonly) g_importargs.poolname = g_pool; pools = zpool_search_import(g_zfs, &g_importargs); - if (pools == NULL || nvlist_next_nvpair(pools, NULL) == NULL) { + if (nvlist_empty(pools)) { if (!g_importargs.can_be_active) { g_importargs.can_be_active = B_TRUE; if (zpool_search_import(g_zfs, &g_importargs) != NULL || diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index e192ab17a16b..48da5b206186 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. 
*/ /* @@ -4830,7 +4831,7 @@ ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) error = user_release_one(fullname, tag); if (error) - fatal(0, "user_release_one(%s)", fullname, tag); + fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); diff --git a/include/libzfs.h b/include/libzfs.h index 3826c2cc8338..83e03407a7e6 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -24,6 +24,7 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. */ #ifndef _LIBZFS_H @@ -607,7 +608,8 @@ extern int zfs_send(zfs_handle_t *, const char *, const char *, extern int zfs_promote(zfs_handle_t *); extern int zfs_hold(zfs_handle_t *, const char *, const char *, - boolean_t, boolean_t, int); + boolean_t, int); +extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); extern int zfs_get_holds(zfs_handle_t *, nvlist_t **); extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index b0db7604da7e..12cd88339971 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ @@ -571,7 +571,7 @@ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size); -int dmu_free_object(objset_t *os, uint64_t object); +int dmu_free_long_object(objset_t *os, uint64_t object); /* * Convenience functions. 
diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h index f13a2a37ce84..bbff15df9bd7 100644 --- a/include/sys/dmu_impl.h +++ b/include/sys/dmu_impl.h @@ -21,7 +21,10 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + */ +/* * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_IMPL_H @@ -265,6 +268,9 @@ typedef struct dmu_sendarg { uint64_t dsa_toguid; int dsa_err; dmu_pendop_t dsa_pending_op; + boolean_t dsa_incremental; + uint64_t dsa_last_data_object; + uint64_t dsa_last_data_offset; } dmu_sendarg_t; #ifdef __cplusplus diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h index 7fe91bebef7e..edf362f7f702 100644 --- a/include/sys/dmu_objset.h +++ b/include/sys/dmu_objset.h @@ -136,6 +136,7 @@ struct objset { int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); +void dmu_objset_refresh_ownership(objset_t *os, void *tag); void dmu_objset_rele(objset_t *os, void *tag); void dmu_objset_disown(objset_t *os, void *tag); int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h index ee0885a60ff6..65514b7620aa 100644 --- a/include/sys/dmu_send.h +++ b/include/sys/dmu_send.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
*/ @@ -55,12 +55,14 @@ typedef struct dmu_recv_cookie { struct avl_tree *drc_guid_to_ds_map; zio_cksum_t drc_cksum; uint64_t drc_newsnapobj; + void *drc_owner; } dmu_recv_cookie_t; int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, boolean_t force, char *origin, dmu_recv_cookie_t *drc); int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp, int cleanup_fd, uint64_t *action_handlep); -int dmu_recv_end(dmu_recv_cookie_t *drc); +int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner); +boolean_t dmu_objset_is_receiving(objset_t *os); #endif /* _DMU_SEND_H */ diff --git a/include/sys/dnode.h b/include/sys/dnode.h index 9f9134d8cdbe..90bddb2cb885 100644 --- a/include/sys/dnode.h +++ b/include/sys/dnode.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _SYS_DNODE_H @@ -189,6 +189,8 @@ typedef struct dnode { /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */ uint32_t dn_dbufs_count; /* count of dn_dbufs */ + /* There are no level-0 blocks of this blkid or higher in dn_dbufs */ + uint64_t dn_unlisted_l0_blkid; /* protected by os_lock: */ list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h index 494f11b90296..9eba46b6d6f5 100644 --- a/include/sys/dsl_dataset.h +++ b/include/sys/dsl_dataset.h @@ -22,6 +22,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. 
*/ #ifndef _SYS_DSL_DATASET_H @@ -187,8 +188,6 @@ int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); void dsl_dataset_name(dsl_dataset_t *ds, char *name); boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag); -void dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, - minor_t minor); uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, @@ -248,7 +247,7 @@ void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag); boolean_t dsl_dataset_long_held(dsl_dataset_t *ds); int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, - dsl_dataset_t *origin_head, boolean_t force); + dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx); void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, dmu_tx_t *tx); int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, @@ -265,7 +264,7 @@ int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx); void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, zprop_source_t source, uint64_t value, dmu_tx_t *tx); -int dsl_dataset_rollback(const char *fsname); +int dsl_dataset_rollback(const char *fsname, void *owner); #ifdef ZFS_DEBUG #define dprintf_ds(ds, fmt, ...) do { \ diff --git a/include/sys/dsl_userhold.h b/include/sys/dsl_userhold.h index 56c6c8f47a87..071aeb86d1f1 100644 --- a/include/sys/dsl_userhold.h +++ b/include/sys/dsl_userhold.h @@ -23,6 +23,7 @@ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. 
*/ #ifndef _SYS_DSL_USERHOLD_H @@ -43,8 +44,7 @@ int dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist); int dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist); int dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl); -void dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj, - const char *htag); +void dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds); int dsl_dataset_user_hold_check_one(struct dsl_dataset *ds, const char *htag, boolean_t temphold, struct dmu_tx *tx); void dsl_dataset_user_hold_sync_one(struct dsl_dataset *ds, const char *htag, diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index e43c7c6bdeac..c5b7cb06060f 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -21,10 +21,12 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 Martin Matuska. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. 
*/ #include @@ -3152,18 +3154,14 @@ static int zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) { struct destroydata *dd = arg; - zfs_handle_t *szhp; char name[ZFS_MAXNAMELEN]; int rv = 0; (void) snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, dd->snapname); - szhp = make_dataset_handle(zhp->zfs_hdl, name); - if (szhp) { + if (lzc_exists(name)) verify(nvlist_add_boolean(dd->nvl, name) == 0); - zfs_close(szhp); - } if (zhp->zfs_type == ZFS_TYPE_VOLUME) { (void) zvol_remove_link(zhp->zfs_hdl, name); @@ -3192,7 +3190,7 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) verify(nvlist_alloc(&dd.nvl, NV_UNIQUE_NAME, 0) == 0); (void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd); - if (nvlist_next_nvpair(dd.nvl, NULL) == NULL) { + if (nvlist_empty(dd.nvl)) { ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"), zhp->zfs_name, snapname); @@ -3218,7 +3216,7 @@ zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer) if (ret == 0) return (0); - if (nvlist_next_nvpair(errlist, NULL) == NULL) { + if (nvlist_empty(errlist)) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot destroy snapshots")); @@ -4414,24 +4412,21 @@ struct holdarg { const char *snapname; const char *tag; boolean_t recursive; + int error; }; static int zfs_hold_one(zfs_handle_t *zhp, void *arg) { struct holdarg *ha = arg; - zfs_handle_t *szhp; char name[ZFS_MAXNAMELEN]; int rv = 0; (void) snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, ha->snapname); - szhp = make_dataset_handle(zhp->zfs_hdl, name); - if (szhp) { + if (lzc_exists(name)) fnvlist_add_string(ha->nvl, name, ha->tag); - zfs_close(szhp); - } if (ha->recursive) rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha); @@ -4441,27 +4436,55 @@ zfs_hold_one(zfs_handle_t *zhp, void *arg) int zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, - boolean_t recursive, boolean_t enoent_ok, int cleanup_fd) + 
boolean_t recursive, int cleanup_fd) { int ret; struct holdarg ha; - nvlist_t *errors; - libzfs_handle_t *hdl = zhp->zfs_hdl; - char errbuf[1024]; - nvpair_t *elem; ha.nvl = fnvlist_alloc(); ha.snapname = snapname; ha.tag = tag; ha.recursive = recursive; (void) zfs_hold_one(zfs_handle_dup(zhp), &ha); - ret = lzc_hold(ha.nvl, cleanup_fd, &errors); + + if (nvlist_empty(ha.nvl)) { + char errbuf[1024]; + + fnvlist_free(ha.nvl); + ret = ENOENT; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot hold snapshot '%s@%s'"), + zhp->zfs_name, snapname); + (void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf); + return (ret); + } + + ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl); fnvlist_free(ha.nvl); - if (ret == 0) + return (ret); +} + +int +zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds) +{ + int ret; + nvlist_t *errors; + libzfs_handle_t *hdl = zhp->zfs_hdl; + char errbuf[1024]; + nvpair_t *elem; + + errors = NULL; + ret = lzc_hold(holds, cleanup_fd, &errors); + + if (ret == 0) { + /* There may be errors even in the success case. 
*/ + fnvlist_free(errors); return (0); + } - if (nvlist_next_nvpair(errors, NULL) == NULL) { + if (nvlist_empty(errors)) { /* no hold-specific errors */ (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot hold")); @@ -4501,10 +4524,6 @@ zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, case EEXIST: (void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf); break; - case ENOENT: - if (enoent_ok) - return (ENOENT); - /* FALLTHROUGH */ default: (void) zfs_standard_error(hdl, fnvpair_value_int32(elem), errbuf); @@ -4515,30 +4534,26 @@ zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, return (ret); } -struct releasearg { - nvlist_t *nvl; - const char *snapname; - const char *tag; - boolean_t recursive; -}; - static int zfs_release_one(zfs_handle_t *zhp, void *arg) { struct holdarg *ha = arg; - zfs_handle_t *szhp; char name[ZFS_MAXNAMELEN]; int rv = 0; + nvlist_t *existing_holds; (void) snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, ha->snapname); - szhp = make_dataset_handle(zhp->zfs_hdl, name); - if (szhp) { - nvlist_t *holds = fnvlist_alloc(); - fnvlist_add_boolean(holds, ha->tag); - fnvlist_add_nvlist(ha->nvl, name, holds); - zfs_close(szhp); + if (lzc_get_holds(name, &existing_holds) != 0) { + ha->error = ENOENT; + } else if (!nvlist_exists(existing_holds, ha->tag)) { + ha->error = ESRCH; + } else { + nvlist_t *torelease = fnvlist_alloc(); + fnvlist_add_boolean(torelease, ha->tag); + fnvlist_add_nvlist(ha->nvl, name, torelease); + fnvlist_free(torelease); } if (ha->recursive) @@ -4553,25 +4568,44 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, { int ret; struct holdarg ha; - nvlist_t *errors; + nvlist_t *errors = NULL; nvpair_t *elem; libzfs_handle_t *hdl = zhp->zfs_hdl; + char errbuf[1024]; ha.nvl = fnvlist_alloc(); ha.snapname = snapname; ha.tag = tag; ha.recursive = recursive; + ha.error = 0; (void) zfs_release_one(zfs_handle_dup(zhp), &ha); + + if (nvlist_empty(ha.nvl)) { + 
fnvlist_free(ha.nvl); + ret = ha.error; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot release hold from snapshot '%s@%s'"), + zhp->zfs_name, snapname); + if (ret == ESRCH) { + (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); + } else { + (void) zfs_standard_error(hdl, ret, errbuf); + } + return (ret); + } + ret = lzc_release(ha.nvl, &errors); fnvlist_free(ha.nvl); - if (ret == 0) + if (ret == 0) { + /* There may be errors even in the success case. */ + fnvlist_free(errors); return (0); + } - if (nvlist_next_nvpair(errors, NULL) == NULL) { + if (nvlist_empty(errors)) { /* no hold-specific errors */ - char errbuf[1024]; - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release")); switch (errno) { @@ -4588,8 +4622,6 @@ zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, for (elem = nvlist_next_nvpair(errors, NULL); elem != NULL; elem = nvlist_next_nvpair(errors, elem)) { - char errbuf[1024]; - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot release hold from snapshot '%s'"), diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c index 28751b215d2c..525d632b8313 100644 --- a/lib/libzfs/libzfs_sendrecv.c +++ b/lib/libzfs/libzfs_sendrecv.c @@ -24,6 +24,7 @@ * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012 Pawel Jakub Dawidek . * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. 
* All rights reserved */ @@ -799,6 +800,7 @@ typedef struct send_dump_data { int outfd; boolean_t err; nvlist_t *fss; + nvlist_t *snapholds; avl_tree_t *fsavl; snapfilter_cb_t *filter_cb; void *filter_cb_arg; @@ -948,41 +950,19 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, return (0); } -static int -hold_for_send(zfs_handle_t *zhp, send_dump_data_t *sdd) +static void +gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) { - zfs_handle_t *pzhp; - int error = 0; - char *thissnap; - assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); - if (sdd->dryrun) - return (0); - /* - * zfs_send() only opens a cleanup_fd for sends that need it, + * zfs_send() only sets snapholds for sends that need them, * e.g. replication and doall. */ - if (sdd->cleanup_fd == -1) - return (0); - - thissnap = strchr(zhp->zfs_name, '@') + 1; - *(thissnap - 1) = '\0'; - pzhp = zfs_open(zhp->zfs_hdl, zhp->zfs_name, ZFS_TYPE_DATASET); - *(thissnap - 1) = '@'; - - /* - * It's OK if the parent no longer exists. The send code will - * handle that error. 
- */ - if (pzhp) { - error = zfs_hold(pzhp, thissnap, sdd->holdtag, - B_FALSE, B_TRUE, sdd->cleanup_fd); - zfs_close(pzhp); - } + if (sdd->snapholds == NULL) + return; - return (error); + fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); } static void * @@ -1038,28 +1018,23 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) send_dump_data_t *sdd = arg; progress_arg_t pa = { 0 }; pthread_t tid; - char *thissnap; int err; boolean_t isfromsnap, istosnap, fromorigin; boolean_t exclude = B_FALSE; + err = 0; thissnap = strchr(zhp->zfs_name, '@') + 1; isfromsnap = (sdd->fromsnap != NULL && strcmp(sdd->fromsnap, thissnap) == 0); if (!sdd->seenfrom && isfromsnap) { - err = hold_for_send(zhp, sdd); - if (err == 0) { - sdd->seenfrom = B_TRUE; - (void) strcpy(sdd->prevsnap, thissnap); - sdd->prevsnap_obj = zfs_prop_get_int(zhp, - ZFS_PROP_OBJSETID); - } else if (err == ENOENT) { - err = 0; - } + gather_holds(zhp, sdd); + sdd->seenfrom = B_TRUE; + (void) strcpy(sdd->prevsnap, thissnap); + sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); zfs_close(zhp); - return (err); + return (0); } if (sdd->seento || !sdd->seenfrom) { @@ -1110,14 +1085,7 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) return (0); } - err = hold_for_send(zhp, sdd); - if (err) { - if (err == ENOENT) - err = 0; - zfs_close(zhp); - return (err); - } - + gather_holds(zhp, sdd); fromorigin = sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate); @@ -1385,7 +1353,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, avl_tree_t *fsavl = NULL; static uint64_t holdseq; int spa_version; - pthread_t tid; + pthread_t tid = 0; int pipefd[2]; dedup_arg_t dda = { 0 }; int featureflags = 0; @@ -1458,11 +1426,8 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, *debugnvp = hdrnv; else nvlist_free(hdrnv); - if (err) { - fsavl_destroy(fsavl); - nvlist_free(fss); + if (err) goto stderr_out; - } } if (!flags->dryrun) { @@ -1486,8 +1451,6 @@ 
zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, } free(packbuf); if (err == -1) { - fsavl_destroy(fsavl); - nvlist_free(fss); err = errno; goto stderr_out; } @@ -1498,8 +1461,6 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, drr.drr_u.drr_end.drr_checksum = zc; err = write(outfd, &drr, sizeof (drr)); if (err == -1) { - fsavl_destroy(fsavl); - nvlist_free(fss); err = errno; goto stderr_out; } @@ -1511,7 +1472,7 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, /* dump each stream */ sdd.fromsnap = fromsnap; sdd.tosnap = tosnap; - if (flags->dedup) + if (tid != 0) sdd.outfd = pipefd[0]; else sdd.outfd = outfd; @@ -1548,36 +1509,71 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, err = errno; goto stderr_out; } + sdd.snapholds = fnvlist_alloc(); } else { sdd.cleanup_fd = -1; + sdd.snapholds = NULL; } - if (flags->verbose) { + if (flags->verbose || sdd.snapholds != NULL) { /* * Do a verbose no-op dry run to get all the verbose output - * before generating any data. Then do a non-verbose real - * run to generate the streams. + * or to gather snapshot holds before generating any data, + * then do a non-verbose real run to generate the streams. */ sdd.dryrun = B_TRUE; err = dump_filesystems(zhp, &sdd); - sdd.dryrun = flags->dryrun; - sdd.verbose = B_FALSE; - if (flags->parsable) { - (void) fprintf(stderr, "size\t%llu\n", - (longlong_t)sdd.size); - } else { - char buf[16]; - zfs_nicenum(sdd.size, buf, sizeof (buf)); - (void) fprintf(stderr, dgettext(TEXT_DOMAIN, - "total estimated size is %s\n"), buf); + + if (err != 0) + goto stderr_out; + + if (flags->verbose) { + if (flags->parsable) { + (void) fprintf(stderr, "size\t%llu\n", + (longlong_t)sdd.size); + } else { + char buf[16]; + zfs_nicenum(sdd.size, buf, sizeof (buf)); + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "total estimated size is %s\n"), buf); + } + } + + /* Ensure no snaps found is treated as an error. 
*/ + if (!sdd.seento) { + err = ENOENT; + goto err_out; } + + /* Skip the second run if dryrun was requested. */ + if (flags->dryrun) + goto err_out; + + if (sdd.snapholds != NULL) { + err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds); + if (err != 0) + goto stderr_out; + + fnvlist_free(sdd.snapholds); + sdd.snapholds = NULL; + } + + sdd.dryrun = B_FALSE; + sdd.verbose = B_FALSE; } + err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); nvlist_free(fss); - if (flags->dedup) { - (void) close(pipefd[0]); + /* Ensure no snaps found is treated as an error. */ + if (err == 0 && !sdd.seento) + err = ENOENT; + + if (tid != 0) { + if (err != 0) + (void) pthread_cancel(tid); (void) pthread_join(tid, NULL); + (void) close(pipefd[0]); } if (sdd.cleanup_fd != -1) { @@ -1605,9 +1601,13 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, stderr_out: err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); err_out: + fsavl_destroy(fsavl); + nvlist_free(fss); + fnvlist_free(sdd.snapholds); + if (sdd.cleanup_fd != -1) VERIFY(0 == close(sdd.cleanup_fd)); - if (flags->dedup) { + if (tid != 0) { (void) pthread_cancel(tid); (void) pthread_join(tid, NULL); (void) close(pipefd[0]); diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c index 44a2070d6028..53c813d4a28f 100644 --- a/lib/libzfs_core/libzfs_core.c +++ b/lib/libzfs_core/libzfs_core.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. */ /* @@ -286,7 +287,6 @@ lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist) nvlist_free(args); return (error); - } int @@ -346,11 +346,17 @@ lzc_exists(const char *dataset) * uncleanly, the holds will be released when the pool is next opened * or imported. * - * The return value will be 0 if all holds were created. 
Otherwise the return - * value will be the errno of a (unspecified) hold that failed, no holds will - * be created, and the errlist will have an entry for each hold that - * failed (name = snapshot). The value in the errlist will be the error - * code (int32). + * Holds for snapshots which don't exist will be skipped and have an entry + * added to errlist, but will not cause an overall failure. + * + * The return value will be 0 if all holds, for snapshots that existed, + * were successfully created. + * + * Otherwise the return value will be the errno of a (unspecified) hold that + * failed and no holds will be created. + * + * In all cases the errlist will have an entry for each hold that failed + * (name = snapshot), with its value being the error code (int32). */ int lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist) @@ -387,11 +393,17 @@ lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist) * The snapshots must all be in the same pool. * The value is a nvlist whose keys are the holds to remove. * - * The return value will be 0 if all holds were removed. - * Otherwise the return value will be the errno of a (unspecified) release - * that failed, no holds will be released, and the errlist will have an - * entry for each snapshot that has failed releases (name = snapshot). - * The value in the errlist will be the error code (int32) of a failed release. + * Holds which failed to release because they didn't exist will have an entry + * added to errlist, but will not cause an overall failure. + * + * The return value will be 0 if the nvl holds was empty or all holds that + * existed, were successfully removed. + * + * Otherwise the return value will be the errno of a (unspecified) hold that + * failed to release and no holds will be released. + * + * In all cases the errlist will have an entry for each hold that failed + * to release. 
*/ int lzc_release(nvlist_t *holds, nvlist_t **errlist) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index d655d66212ce..bd3b0bcde321 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,12 @@ static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, void *tag, dmu_buf_impl_t **dbp, int depth); static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); +/* + * Number of times that dbuf_free_range() took the slow path while doing + * a zfs receive. A nonzero value indicates a potential performance problem. + */ +uint64_t zfs_free_range_recv_miss; + static void dbuf_destroy(dmu_buf_impl_t *db); static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); @@ -831,9 +838,12 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) /* * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find - * empty blocks. Also, if we happen accross any level-1 dbufs in the + * empty blocks. Also, if we happen across any level-1 dbufs in the * range that have not already been marked dirty, mark them dirty so * they stay in memory. + * + * During a receive we expect to find no dbufs in the range to be + * freed; see comment below for details. */ void dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) @@ -849,7 +859,23 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) last_l1 = end >> epbs; } dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); + mutex_enter(&dn->dn_dbufs_mtx); + if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) { + /* There can't be any dbufs in this range; no need to search. 
*/ + mutex_exit(&dn->dn_dbufs_mtx); + return; + } else if (dmu_objset_is_receiving(dn->dn_objset)) { + /* + * If we are receiving, we expect there to be no dbufs in + * the range to be freed, because receive modifies each + * block at most once, and in offset order. If this is + * not the case, it can lead to performance problems, + * so note that we unexpectedly took the slow path. + */ + atomic_inc_64(&zfs_free_range_recv_miss); + } + for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); @@ -1747,6 +1773,9 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, return (odb); } list_insert_head(&dn->dn_dbufs, db); + if (db->db_level == 0 && db->db_blkid >= + dn->dn_unlisted_l0_blkid) + dn->dn_unlisted_l0_blkid = db->db_blkid + 1; db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index cbf4790b1799..eb1b8cd2364a 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -563,98 +563,95 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) * the end so that the file gets shorter over time (if we crashes in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. + * + * On input, *start should be the first offset that does not need to be + * freed (e.g. "offset + length"). On return, *start will be the first + * offset that should be freed. 
*/ static int -get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) +get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum) { - uint64_t len = *start - limit; - uint64_t blkcnt = 0; - uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1)); + uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); + /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); + uint64_t blks; - ASSERT(limit <= *start); + ASSERT3U(minimum, <=, *start); - if (len <= iblkrange * maxblks) { - *start = limit; + if (*start - minimum <= iblkrange * maxblks) { + *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); - while (*start > limit && blkcnt < maxblks) { + for (blks = 0; *start > minimum && blks < maxblks; blks++) { int err; - /* find next allocated L1 indirect */ + /* + * dnode_next_offset(BACKWARDS) will find an allocated L1 + * indirect block at or before the input offset. We must + * decrement *start so that it is at the end of the region + * to search. 
+ */ + (*start)--; err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); - /* if there are no more, then we are done */ + /* if there are no indirect blocks before start, we are done */ if (err == ESRCH) { - *start = limit; - return (0); - } else if (err) { + *start = minimum; + break; + } else if (err != 0) { return (err); } - blkcnt += 1; - /* reset offset to end of "next" block back */ + /* set start to the beginning of this L1 indirect */ *start = P2ALIGN(*start, iblkrange); - if (*start <= limit) - *start = limit; - else - *start -= 1; } + if (*start < minimum) + *start = minimum; return (0); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, - uint64_t length, boolean_t free_dnode) + uint64_t length) { - dmu_tx_t *tx; - uint64_t object_size, start, end, len; - boolean_t trunc = (length == DMU_OBJECT_END); - int align, err; - - align = 1 << dn->dn_datablkshift; - ASSERT(align > 0); - object_size = align == 1 ? dn->dn_datablksz : - (dn->dn_maxblkid + 1) << dn->dn_datablkshift; - - end = offset + length; - if (trunc || end > object_size) - end = object_size; - if (end <= offset) + uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; + int err; + + if (offset >= object_size) return (0); - length = end - offset; - while (length) { - start = end; - /* assert(offset <= start) */ - err = get_next_chunk(dn, &start, offset); + if (length == DMU_OBJECT_END || offset + length > object_size) + length = object_size - offset; + + while (length != 0) { + uint64_t chunk_end, chunk_begin; + dmu_tx_t *tx; + + chunk_end = chunk_begin = offset + length; + + /* move chunk_begin backwards to the beginning of this chunk */ + err = get_next_chunk(dn, &chunk_begin, offset); if (err) return (err); - len = trunc ? 
DMU_OBJECT_END : end - start; + ASSERT3U(chunk_begin, >=, offset); + ASSERT3U(chunk_begin, <=, chunk_end); tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, dn->dn_object, start, len); + dmu_tx_hold_free(tx, dn->dn_object, + chunk_begin, chunk_end - chunk_begin); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } - - dnode_free_range(dn, start, trunc ? -1 : len, tx); - - if (start == 0 && free_dnode) { - ASSERT(trunc); - dnode_free(dn, tx); - } - - length -= end - start; - + dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx); dmu_tx_commit(tx); - end = start; + + length -= chunk_end - chunk_begin; } return (0); } @@ -669,38 +666,42 @@ dmu_free_long_range(objset_t *os, uint64_t object, err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); - err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); + err = dmu_free_long_range_impl(os, dn, offset, length); + + /* + * It is important to zero out the maxblkid when freeing the entire + * file, so that (a) subsequent calls to dmu_free_long_range_impl() + * will take the fast path, and (b) dnode_reallocate() can verify + * that the entire file has been freed. 
+ */ + if (offset == 0 && length == DMU_OBJECT_END) + dn->dn_maxblkid = 0; + dnode_rele(dn, FTAG); return (err); } int -dmu_free_object(objset_t *os, uint64_t object) +dmu_free_long_object(objset_t *os, uint64_t object) { - dnode_t *dn; dmu_tx_t *tx; int err; - err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, - FTAG, &dn); + err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); - if (dn->dn_nlevels == 1) { - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, object); - dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err == 0) { - dnode_free_range(dn, 0, DMU_OBJECT_END, tx); - dnode_free(dn, tx); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, object); + dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err == 0) { + err = dmu_object_free(os, object, tx); + dmu_tx_commit(tx); } else { - err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE); + dmu_tx_abort(tx); } - dnode_rele(dn, FTAG); + return (err); } @@ -1980,7 +1981,7 @@ EXPORT_SYMBOL(dmu_buf_rele_array); EXPORT_SYMBOL(dmu_prefetch); EXPORT_SYMBOL(dmu_free_range); EXPORT_SYMBOL(dmu_free_long_range); -EXPORT_SYMBOL(dmu_free_object); +EXPORT_SYMBOL(dmu_free_long_object); EXPORT_SYMBOL(dmu_read); EXPORT_SYMBOL(dmu_write); EXPORT_SYMBOL(dmu_prealloc); diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 73807b6786bd..347e602e04a4 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -517,6 +517,38 @@ dmu_objset_rele(objset_t *os, void *tag) dsl_pool_rele(dp, tag); } +/* + * When we are called, os MUST refer to an objset associated with a dataset + * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner + * == tag. 
We will then release and reacquire ownership of the dataset while + * holding the pool config_rwlock so that no intervening namespace or ownership + * changes may occur. + * + * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to + * release the hold on its dataset and acquire a new one on the dataset of the + * same name so that it can be partially torn down and reconstructed. + */ +void +dmu_objset_refresh_ownership(objset_t *os, void *tag) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds, *newds; + char name[MAXNAMELEN]; + + ds = os->os_dsl_dataset; + VERIFY3P(ds, !=, NULL); + VERIFY3P(ds->ds_owner, ==, tag); + VERIFY(dsl_dataset_long_held(ds)); + + dsl_dataset_name(ds, name); + dp = dmu_objset_pool(os); + dsl_pool_config_enter(dp, FTAG); + dmu_objset_disown(os, tag); + VERIFY0(dsl_dataset_own(dp, name, tag, &newds)); + VERIFY3P(newds, ==, os->os_dsl_dataset); + dsl_pool_config_exit(dp, FTAG); +} + void dmu_objset_disown(objset_t *os, void *tag) { diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 2945be89b8d8..c78833aa6116 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -109,6 +109,32 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, { struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); + /* + * When we receive a free record, dbuf_free_range() assumes + * that the receiving system doesn't have any dbufs in the range + * being freed. This is always true because there is a one-record + * constraint: we only send one WRITE record for any given + * object+offset. We know that the one-record constraint is + * true because we always send data in increasing order by + * object,offset. + * + * If the increasing-order constraint ever changes, we should find + * another way to assert that the one-record constraint is still + * satisfied.
+ */ + ASSERT(object > dsp->dsa_last_data_object || + (object == dsp->dsa_last_data_object && + offset > dsp->dsa_last_data_offset)); + + /* + * If we are doing a non-incremental send, then there can't + * be any data in the dataset we're receiving into. Therefore + * a free record would simply be a no-op. Save space by not + * sending it to begin with. + */ + if (!dsp->dsa_incremental) + return (0); + if (length != -1ULL && offset + length < offset) length = -1ULL; @@ -175,6 +201,15 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type, { struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); + /* + * We send data in increasing object, offset order. + * See comment in dump_free() for details. + */ + ASSERT(object > dsp->dsa_last_data_object || + (object == dsp->dsa_last_data_object && + offset > dsp->dsa_last_data_offset)); + dsp->dsa_last_data_object = object; + dsp->dsa_last_data_offset = offset + blksz - 1; /* * If there is any kind of pending aggregation (currently either @@ -242,6 +277,10 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) { struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); + /* See comment in dump_free(). */ + if (!dsp->dsa_incremental) + return (0); + /* * If there is a pending op, but it's not PENDING_FREEOBJECTS, * push it out, since free block aggregation can only be done for @@ -318,9 +357,9 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) return (EINTR); - /* free anything past the end of the file */ + /* Free anything past the end of the file. 
*/ if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) return (EINTR); if (dsp->dsa_err != 0) return (EINTR); @@ -503,6 +542,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, dsp->dsa_toguid = ds->ds_phys->ds_guid; ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); dsp->dsa_pending_op = PENDING_NONE; + dsp->dsa_incremental = (fromtxg != 0); mutex_enter(&ds->ds_sendstream_lock); list_insert_head(&ds->ds_sendstreams, dsp); @@ -1213,7 +1253,7 @@ restore_freeobjects(struct restorearg *ra, objset_t *os, if (dmu_object_info(os, obj, NULL) != 0) continue; - err = dmu_free_object(os, obj); + err = dmu_free_long_object(os, obj); if (err != 0) return (err); } @@ -1577,7 +1617,7 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) if (error != 0) return (error); error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, - origin_head, drc->drc_force); + origin_head, drc->drc_force, drc->drc_owner, tx); if (error != 0) { dsl_dataset_rele(origin_head, FTAG); return (error); @@ -1629,6 +1669,9 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) dsl_dataset_rele(origin_head, FTAG); dsl_destroy_head_sync_impl(drc->drc_ds, tx); + + if (drc->drc_owner != NULL) + VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); } else { dsl_dataset_t *ds = drc->drc_ds; @@ -1730,10 +1773,22 @@ dmu_recv_new_end(dmu_recv_cookie_t *drc) } int -dmu_recv_end(dmu_recv_cookie_t *drc) +dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) { + drc->drc_owner = owner; + if (drc->drc_newfs) return (dmu_recv_new_end(drc)); else return (dmu_recv_existing_end(drc)); } + +/* + * Return TRUE if this objset is currently being received into. 
+ */ +boolean_t +dmu_objset_is_receiving(objset_t *os) +{ + return (os->os_dsl_dataset != NULL && + os->os_dsl_dataset->ds_owner == dmu_recv_tag); +} diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 3e46a02f8e57..daf99ea0d54f 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -604,8 +604,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) { dmu_tx_hold_t *txh; dnode_t *dn; - uint64_t start, end, i; - int err, shift; + int err; zio_t *zio; ASSERT(tx->tx_txg == 0); @@ -616,30 +615,47 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) return; dn = txh->txh_dnode; - /* first block */ - if (off != 0) - dmu_tx_count_write(txh, off, 1); - /* last block */ - if (len != DMU_OBJECT_END) - dmu_tx_count_write(txh, off+len, 1); - - dmu_tx_count_dnode(txh); - if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; + dmu_tx_count_dnode(txh); + /* - * For i/o error checking, read the first and last level-0 - * blocks, and all the level-1 blocks. The above count_write's - * have already taken care of the level-0 blocks. + * For i/o error checking, we read the first and last level-0 + * blocks if they are not aligned, and all the level-1 blocks. + * + * Note: dbuf_free_range() assumes that we have not instantiated + * any level-0 dbufs that will be completely freed. Therefore we must + * exercise care to not read or count the first and last blocks + * if they are blocksize-aligned. 
+ */ + if (dn->dn_datablkshift == 0) { + if (off != 0 || len < dn->dn_datablksz) + dmu_tx_count_write(txh, 0, dn->dn_datablksz); + } else { + /* first block will be modified if it is not aligned */ + if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) + dmu_tx_count_write(txh, off, 1); + /* last block will be modified if it is not aligned */ + if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) + dmu_tx_count_write(txh, off+len, 1); + } + + /* + * Check level-1 blocks. */ if (dn->dn_nlevels > 1) { - shift = dn->dn_datablkshift + dn->dn_indblkshift - + uint64_t i; + + int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; - start = off >> shift; - end = dn->dn_datablkshift ? ((off+len) >> shift) : 0; + uint64_t start = off >> shift; + uint64_t end = (off + len) >> shift; + + ASSERT(dn->dn_datablkshift != 0); + ASSERT(dn->dn_indblkshift != 0); zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c index d88134d72b01..53d50153ae0d 100644 --- a/module/zfs/dnode.c +++ b/module/zfs/dnode.c @@ -117,6 +117,7 @@ dnode_cons(void *arg, void *unused, int kmflag) dn->dn_id_flags = 0; dn->dn_dbufs_count = 0; + dn->dn_unlisted_l0_blkid = 0; list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); @@ -169,6 +170,7 @@ dnode_dest(void *arg, void *unused) ASSERT0(dn->dn_id_flags); ASSERT0(dn->dn_dbufs_count); + ASSERT0(dn->dn_unlisted_l0_blkid); list_destroy(&dn->dn_dbufs); } @@ -472,6 +474,7 @@ dnode_destroy(dnode_t *dn) dn->dn_newuid = 0; dn->dn_newgid = 0; dn->dn_id_flags = 0; + dn->dn_unlisted_l0_blkid = 0; dmu_zfetch_rele(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); @@ -703,6 +706,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) ASSERT(list_is_empty(&ndn->dn_dbufs)); list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs); ndn->dn_dbufs_count = odn->dn_dbufs_count; + ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid; ndn->dn_bonus = odn->dn_bonus; 
ndn->dn_have_spill = odn->dn_have_spill; ndn->dn_zio = odn->dn_zio; @@ -737,6 +741,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); odn->dn_dbufs_count = 0; + odn->dn_unlisted_l0_blkid = 0; odn->dn_bonus = NULL; odn->dn_zfetch.zf_dnode = NULL; @@ -1524,7 +1529,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) blkshift = dn->dn_datablkshift; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - if (len == -1ULL) { + if (len == DMU_OBJECT_END) { len = UINT64_MAX - off; trunc = TRUE; } diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c index 5c0ca4d96225..a937bcf481f8 100644 --- a/module/zfs/dsl_dataset.c +++ b/module/zfs/dsl_dataset.c @@ -1657,16 +1657,52 @@ dsl_dataset_rename_snapshot(const char *fsname, dsl_dataset_rename_snapshot_sync, &ddrsa, 1)); } +/* + * If we're doing an ownership handoff, we need to make sure that there is + * only one long hold on the dataset. We're not allowed to change anything here + * so we don't permanently release the long hold or regular hold here. We want + * to do this only when syncing to avoid the dataset unexpectedly going away + * when we release the long hold. 
+ */ +static int +dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) +{ + boolean_t held; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + if (owner != NULL) { + VERIFY3P(ds->ds_owner, ==, owner); + dsl_dataset_long_rele(ds, owner); + } + + held = dsl_dataset_long_held(ds); + + if (owner != NULL) + dsl_dataset_long_hold(ds, owner); + + if (held) + return (EBUSY); + + return (0); +} + +typedef struct dsl_dataset_rollback_arg { + const char *ddra_fsname; + void *ddra_owner; +} dsl_dataset_rollback_arg_t; + static int dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) { - const char *fsname = arg; + dsl_dataset_rollback_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int64_t unused_refres_delta; int error; - error = dsl_dataset_hold(dp, fsname, FTAG, &ds); + error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds); if (error != 0) return (error); @@ -1682,9 +1718,10 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) return (EINVAL); } - if (dsl_dataset_long_held(ds)) { + error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx); + if (error != 0) { dsl_dataset_rele(ds, FTAG); - return (EBUSY); + return (error); } /* @@ -1721,12 +1758,12 @@ dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) static void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) { - const char *fsname = arg; + dsl_dataset_rollback_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds, *clone; uint64_t cloneobj; - VERIFY0(dsl_dataset_hold(dp, fsname, FTAG, &ds)); + VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds)); cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback", ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx); @@ -1742,11 +1779,26 @@ dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) dsl_dataset_rele(ds, FTAG); } +/* + * If owner != NULL: + * + * - The existing dataset MUST be owned by the specified owner at entry + * - Upon return, dataset will still be held by the same owner, whether we + * succeed or 
not. + * + * This mode is required any time the existing filesystem is mounted. See + * notes above zfs_suspend_fs() for further details. + */ int -dsl_dataset_rollback(const char *fsname) +dsl_dataset_rollback(const char *fsname, void *owner) { + dsl_dataset_rollback_arg_t ddra; + + ddra.ddra_fsname = fsname; + ddra.ddra_owner = owner; + return (dsl_sync_task(fsname, dsl_dataset_rollback_check, - dsl_dataset_rollback_sync, (void *)fsname, 1)); + dsl_dataset_rollback_sync, (void *)&ddra, 1)); } struct promotenode { @@ -2264,7 +2316,7 @@ dsl_dataset_promote(const char *name, char *conflsnap) int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, - dsl_dataset_t *origin_head, boolean_t force) + dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx) { int64_t unused_refres_delta; @@ -2293,7 +2345,7 @@ dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, return (ETXTBSY); /* origin_head should have no long holds (e.g. is not mounted) */ - if (dsl_dataset_long_held(origin_head)) + if (dsl_dataset_handoff_check(origin_head, owner, tx)) return (EBUSY); /* check amount of any unconsumed refreservation */ diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c index 5c80c4eee353..4ec887755bd1 100644 --- a/module/zfs/dsl_destroy.c +++ b/module/zfs/dsl_destroy.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. 
*/ #include @@ -127,6 +128,7 @@ dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx) pair = nvlist_next_nvpair(dsda->dsda_errlist, NULL); if (pair != NULL) return (fnvpair_value_int32(pair)); + return (0); } @@ -902,7 +904,7 @@ dsl_destroy_head(const char *name) for (obj = 0; error == 0; error = dmu_object_next(os, &obj, FALSE, prev_snap_txg)) - (void) dmu_free_object(os, obj); + (void) dmu_free_long_object(os, obj); /* sync out all frees */ txg_wait_synced(dmu_objset_pool(os), 0); dmu_objset_disown(os, FTAG); diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index ccae3f2709f4..cd7343edbb4c 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 Martin Matuska. All rights reserved. */ #include @@ -971,12 +972,18 @@ dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), - ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, - &ddsqra->ddsqra_value, tx); + if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); - VERIFY0(dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); + } else { + newval = ddsqra->ddsqra_value; + spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", + zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); + } dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); @@ -1086,12 +1093,20 @@ dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); - dsl_prop_set_sync_impl(ds, 
zfs_prop_to_name(ZFS_PROP_RESERVATION), - ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, - &ddsqra->ddsqra_value, tx); + if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_RESERVATION), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); - VERIFY0(dsl_prop_get_int_ds(ds, - zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); + } else { + newval = ddsqra->ddsqra_value; + spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", + zfs_prop_to_name(ZFS_PROP_RESERVATION), + (longlong_t)newval); + } dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); dsl_dataset_rele(ds, FTAG); diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index b59e056bfb57..1c62eb0a69e7 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. 
*/ #include @@ -1034,23 +1035,34 @@ dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) zap_cursor_t zc; objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; + nvlist_t *holds; if (zapobj == 0) return; ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + holds = fnvlist_alloc(); + for (zap_cursor_init(&zc, mos, zapobj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { char *htag; - uint64_t dsobj; + nvlist_t *tags; htag = strchr(za.za_name, '-'); *htag = '\0'; ++htag; - dsobj = strtonum(za.za_name, NULL); - dsl_dataset_user_release_tmp(dp, dsobj, htag); + if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) { + tags = fnvlist_alloc(); + fnvlist_add_boolean(tags, htag); + fnvlist_add_nvlist(holds, za.za_name, tags); + fnvlist_free(tags); + } else { + fnvlist_add_boolean(tags, htag); + } } + dsl_dataset_user_release_tmp(dp, holds); + fnvlist_free(holds); zap_cursor_fini(&zc); } diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c index 7473d8f691d1..ca4b5142acc4 100644 --- a/module/zfs/dsl_prop.c +++ b/module/zfs/dsl_prop.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 Martin Matuska. All rights reserved. */ #include @@ -557,10 +558,6 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, } if (version < SPA_VERSION_RECVD_PROPS) { - zfs_prop_t prop = zfs_name_to_prop(propname); - if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION) - return; - if (source & ZPROP_SRC_NONE) source = ZPROP_SRC_NONE; else if (source & ZPROP_SRC_RECEIVED) diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c index 0419f3fab27a..ef02141af940 100644 --- a/module/zfs/dsl_userhold.c +++ b/module/zfs/dsl_userhold.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. 
+ * Copyright (c) 2013 Steven Hartland. All rights reserved. */ #include @@ -37,6 +38,7 @@ typedef struct dsl_dataset_user_hold_arg { nvlist_t *dduha_holds; + nvlist_t *dduha_chkholds; nvlist_t *dduha_errlist; minor_t dduha_minor; } dsl_dataset_user_hold_arg_t; @@ -53,6 +55,8 @@ dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag, objset_t *mos = dp->dp_meta_objset; int error = 0; + ASSERT(dsl_pool_config_held(dp)); + if (strlen(htag) > MAXNAMELEN) return (E2BIG); /* Tempholds have a more restricted length */ @@ -60,18 +64,15 @@ dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag, return (E2BIG); /* tags must be unique (if ds already exists) */ - if (ds != NULL) { - mutex_enter(&ds->ds_lock); - if (ds->ds_phys->ds_userrefs_obj != 0) { - uint64_t value; - error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, - htag, 8, 1, &value); - if (error == 0) - error = EEXIST; - else if (error == ENOENT) - error = 0; - } - mutex_exit(&ds->ds_lock); + if (ds != NULL && ds->ds_phys->ds_userrefs_obj != 0) { + uint64_t value; + + error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, + htag, 8, 1, &value); + if (error == 0) + error = (EEXIST); + else if (error == ENOENT) + error = 0; } return (error); @@ -83,51 +84,63 @@ dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx) dsl_dataset_user_hold_arg_t *dduha = arg; dsl_pool_t *dp = dmu_tx_pool(tx); nvpair_t *pair; - int rv = 0; if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) return (ENOTSUP); - for (pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { - int error = 0; + if (!dmu_tx_is_syncing(tx)) + return (0); + + for (pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); + pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { dsl_dataset_t *ds; - char *htag; + int error = 0; + char *htag, *name; /* must be a snapshot */ - if (strchr(nvpair_name(pair), '@') == NULL) + name = nvpair_name(pair); + if (strchr(name, 
'@') == NULL) error = EINVAL; if (error == 0) error = nvpair_value_string(pair, &htag); - if (error == 0) { - error = dsl_dataset_hold(dp, - nvpair_name(pair), FTAG, &ds); - } + + if (error == 0) + error = dsl_dataset_hold(dp, name, FTAG, &ds); + if (error == 0) { error = dsl_dataset_user_hold_check_one(ds, htag, dduha->dduha_minor != 0, tx); dsl_dataset_rele(ds, FTAG); } - if (error != 0) { - rv = error; - fnvlist_add_int32(dduha->dduha_errlist, - nvpair_name(pair), error); + if (error == 0) { + fnvlist_add_string(dduha->dduha_chkholds, name, htag); + } else { + /* + * We register ENOENT errors so they can be correctly + * reported if needed, such as when all holds fail. + */ + fnvlist_add_int32(dduha->dduha_errlist, name, error); + if (error != ENOENT) + return (error); } } - return (rv); + + return (0); } -void -dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag, - minor_t minor, uint64_t now, dmu_tx_t *tx) + +static void +dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds, + const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; uint64_t zapobj; - mutex_enter(&ds->ds_lock); + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + if (ds->ds_phys->ds_userrefs_obj == 0) { /* * This is the first user hold for this dataset. 
Create @@ -140,14 +153,26 @@ dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag, zapobj = ds->ds_phys->ds_userrefs_obj; } ds->ds_userrefs++; - mutex_exit(&ds->ds_lock); VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx)); if (minor != 0) { + char name[MAXNAMELEN]; + nvlist_t *tags; + VERIFY0(dsl_pool_user_hold(dp, ds->ds_object, htag, now, tx)); - dsl_register_onexit_hold_cleanup(ds, htag, minor); + (void) snprintf(name, sizeof (name), "%llx", + (u_longlong_t)ds->ds_object); + + if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) { + tags = fnvlist_alloc(); + fnvlist_add_boolean(tags, htag); + fnvlist_add_nvlist(tmpholds, name, tags); + fnvlist_free(tags); + } else { + fnvlist_add_boolean(tags, htag); + } } spa_history_log_internal_ds(ds, "hold", tx, @@ -155,183 +180,346 @@ dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag, htag, minor != 0, ds->ds_userrefs); } +typedef struct zfs_hold_cleanup_arg { + char zhca_spaname[MAXNAMELEN]; + uint64_t zhca_spa_load_guid; + nvlist_t *zhca_holds; +} zfs_hold_cleanup_arg_t; + +static void +dsl_dataset_user_release_onexit(void *arg) +{ + zfs_hold_cleanup_arg_t *ca = arg; + spa_t *spa; + int error; + + error = spa_open(ca->zhca_spaname, &spa, FTAG); + if (error != 0) { + zfs_dbgmsg("couldn't release holds on pool=%s " + "because pool is no longer loaded", + ca->zhca_spaname); + return; + } + if (spa_load_guid(spa) != ca->zhca_spa_load_guid) { + zfs_dbgmsg("couldn't release holds on pool=%s " + "because pool is no longer loaded (guid doesn't match)", + ca->zhca_spaname); + spa_close(spa, FTAG); + return; + } + + (void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds); + fnvlist_free(ca->zhca_holds); + kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); + spa_close(spa, FTAG); +} + +static void +dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor) +{ + zfs_hold_cleanup_arg_t *ca; + + if (minor == 0 || nvlist_empty(holds)) { + fnvlist_free(holds); + return; + } + + 
ASSERT(spa != NULL); + ca = kmem_alloc(sizeof (*ca), KM_PUSHPAGE); + + (void) strlcpy(ca->zhca_spaname, spa_name(spa), + sizeof (ca->zhca_spaname)); + ca->zhca_spa_load_guid = spa_load_guid(spa); + ca->zhca_holds = holds; + VERIFY0(zfs_onexit_add_cb(minor, + dsl_dataset_user_release_onexit, ca, NULL)); +} + +void +dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag, + minor_t minor, uint64_t now, dmu_tx_t *tx) +{ + nvlist_t *tmpholds; + + if (minor != 0) + tmpholds = fnvlist_alloc(); + else + tmpholds = NULL; + dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx); + dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor); +} + static void dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_user_hold_arg_t *dduha = arg; dsl_pool_t *dp = dmu_tx_pool(tx); - nvpair_t *pair; + nvlist_t *tmpholds; uint64_t now = gethrestime_sec(); + nvpair_t *pair; - for (pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { + if (dduha->dduha_minor != 0) + tmpholds = fnvlist_alloc(); + else + tmpholds = NULL; + for (pair = nvlist_next_nvpair(dduha->dduha_chkholds, NULL); + pair != NULL; + pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) { dsl_dataset_t *ds; + VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); - dsl_dataset_user_hold_sync_one(ds, fnvpair_value_string(pair), - dduha->dduha_minor, now, tx); + dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, + fnvpair_value_string(pair), dduha->dduha_minor, now, tx); dsl_dataset_rele(ds, FTAG); } + dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor); } /* + * The full semantics of this function are described in the comment above + * lzc_hold(). + * + * To summarize: * holds is nvl of snapname -> holdname * errlist will be filled in with snapname -> error - * if cleanup_minor is not 0, the holds will be temporary, cleaned up - * when the process exits. 
* - * if any fails, all will fail. * + * The snapshots must all be in the same pool. + * + * Holds for snapshots that don't exist will be skipped. + * + * If none of the snapshots for requested holds exist then ENOENT will be + * returned. + * + * If cleanup_minor is not 0, the holds will be temporary, which will be cleaned + * up when the process exits. + * + * On success all the holds, for snapshots that existed, will be created and 0 + * will be returned. + * + * On failure no holds will be created, the errlist will be filled in, + * and an errno will be returned. + * + * In all cases the errlist will contain entries for holds where the snapshot + * didn't exist. */ int dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist) { dsl_dataset_user_hold_arg_t dduha; nvpair_t *pair; + int ret; pair = nvlist_next_nvpair(holds, NULL); if (pair == NULL) return (0); dduha.dduha_holds = holds; + dduha.dduha_chkholds = fnvlist_alloc(); dduha.dduha_errlist = errlist; dduha.dduha_minor = cleanup_minor; - return (dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check, - dsl_dataset_user_hold_sync, &dduha, fnvlist_num_pairs(holds))); + ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check, + dsl_dataset_user_hold_sync, &dduha, fnvlist_num_pairs(holds)); + fnvlist_free(dduha.dduha_chkholds); + + return (ret); } +typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag, + dsl_dataset_t **dsp); + typedef struct dsl_dataset_user_release_arg { + dsl_holdfunc_t *ddura_holdfunc; nvlist_t *ddura_holds; nvlist_t *ddura_todelete; nvlist_t *ddura_errlist; + nvlist_t *ddura_chkholds; } dsl_dataset_user_release_arg_t; +/* Place a dataset hold on the snapshot identified by passed dsobj string */ static int -dsl_dataset_user_release_check_one(dsl_dataset_t *ds, - nvlist_t *holds, boolean_t *todelete) +dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag, + dsl_dataset_t **dsp) +{ + return (dsl_dataset_hold_obj(dp,
strtonum(dsobj, NULL), tag, dsp)); +} + +static int +dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura, + dsl_dataset_t *ds, nvlist_t *holds, const char *snapname) { uint64_t zapobj; + nvlist_t *holds_found; + objset_t *mos; + int numholds; nvpair_t *pair; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - int error; - int numholds = 0; - - *todelete = B_FALSE; if (!dsl_dataset_is_snapshot(ds)) return (EINVAL); + if (nvlist_empty(holds)) + return (0); + + numholds = 0; + mos = ds->ds_dir->dd_pool->dp_meta_objset; zapobj = ds->ds_phys->ds_userrefs_obj; - if (zapobj == 0) - return (ESRCH); + holds_found = fnvlist_alloc(); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { - /* Make sure the hold exists */ uint64_t tmp; - error = zap_lookup(mos, zapobj, nvpair_name(pair), 8, 1, &tmp); - if (error == ENOENT) - error = ESRCH; - if (error != 0) + int error; + const char *holdname = nvpair_name(pair); + + if (zapobj != 0) + error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp); + else + error = ENOENT; + + /* + * Non-existent holds are put on the errlist, but don't + * cause an overall failure. 
+ */ + if (error == ENOENT) { + if (ddura->ddura_errlist != NULL) { + char *errtag = kmem_asprintf("%s#%s", + snapname, holdname); + fnvlist_add_int32(ddura->ddura_errlist, errtag, + ENOENT); + strfree(errtag); + } + continue; + } + + if (error != 0) { + fnvlist_free(holds_found); return (error); + } + + fnvlist_add_boolean(holds_found, holdname); numholds++; } if (DS_IS_DEFER_DESTROY(ds) && ds->ds_phys->ds_num_children == 1 && ds->ds_userrefs == numholds) { /* we need to destroy the snapshot as well */ - - if (dsl_dataset_long_held(ds)) + if (dsl_dataset_long_held(ds)) { + fnvlist_free(holds_found); return (EBUSY); - *todelete = B_TRUE; + } + fnvlist_add_boolean(ddura->ddura_todelete, snapname); + } + + if (numholds != 0) { + fnvlist_add_nvlist(ddura->ddura_chkholds, snapname, + holds_found); } + fnvlist_free(holds_found); + return (0); } static int dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx) { - dsl_dataset_user_release_arg_t *ddura = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_user_release_arg_t *ddura; + dsl_holdfunc_t *holdfunc; + dsl_pool_t *dp; nvpair_t *pair; - int rv = 0; if (!dmu_tx_is_syncing(tx)) return (0); - for (pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { - const char *name = nvpair_name(pair); + dp = dmu_tx_pool(tx); + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + ddura = arg; + holdfunc = ddura->ddura_holdfunc; + + for (pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { int error; dsl_dataset_t *ds; nvlist_t *holds; + const char *snapname = nvpair_name(pair); error = nvpair_value_nvlist(pair, &holds); if (error != 0) - return (EINVAL); - - error = dsl_dataset_hold(dp, name, FTAG, &ds); + error = ((EINVAL)); + else + error = holdfunc(dp, snapname, FTAG, &ds); if (error == 0) { - boolean_t deleteme; - error = dsl_dataset_user_release_check_one(ds, - holds, &deleteme); - 
if (error == 0 && deleteme) { - fnvlist_add_boolean(ddura->ddura_todelete, - name); - } + error = dsl_dataset_user_release_check_one(ddura, ds, + holds, snapname); dsl_dataset_rele(ds, FTAG); } if (error != 0) { if (ddura->ddura_errlist != NULL) { fnvlist_add_int32(ddura->ddura_errlist, - name, error); + snapname, error); } - rv = error; + /* + * Non-existent snapshots are put on the errlist, + * but don't cause an overall failure. + */ + if (error != ENOENT) + return (error); } } - return (rv); + + return (0); } static void dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds, dmu_tx_t *tx) { + nvpair_t *pair; + dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; - uint64_t zapobj; - int error; - nvpair_t *pair; for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { - ds->ds_userrefs--; - error = dsl_pool_user_release(dp, ds->ds_object, - nvpair_name(pair), tx); + int error; + const char *holdname = nvpair_name(pair); + + /* Remove temporary hold if one exists. 
*/ + error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx); VERIFY(error == 0 || error == ENOENT); - zapobj = ds->ds_phys->ds_userrefs_obj; - VERIFY0(zap_remove(mos, zapobj, nvpair_name(pair), tx)); + + VERIFY0(zap_remove(mos, ds->ds_phys->ds_userrefs_obj, holdname, + tx)); + ds->ds_userrefs--; spa_history_log_internal_ds(ds, "release", tx, - "tag=%s refs=%lld", nvpair_name(pair), - (longlong_t)ds->ds_userrefs); + "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs); } } static void dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx) { + nvpair_t *pair; + dsl_dataset_user_release_arg_t *ddura = arg; + dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc; dsl_pool_t *dp = dmu_tx_pool(tx); - nvpair_t *pair; - for (pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + for (pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds, + pair)) { dsl_dataset_t *ds; + const char *name = nvpair_name(pair); + + VERIFY0(holdfunc(dp, name, FTAG, &ds)); - VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); dsl_dataset_user_release_sync_one(ds, fnvpair_value_nvlist(pair), tx); - if (nvlist_exists(ddura->ddura_todelete, - nvpair_name(pair))) { + if (nvlist_exists(ddura->ddura_todelete, name)) { ASSERT(ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && DS_IS_DEFER_DESTROY(ds)); @@ -342,162 +530,108 @@ dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx) } /* + * The full semantics of this function are described in the comment above + * lzc_release(). + * + * To summarize: + * Releases holds specified in the nvl holds. + * * holds is nvl of snapname -> { holdname, ... } * errlist will be filled in with snapname -> error * - * if any fails, all will fail. 
+ * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots, + * otherwise they should be the names of snapshots. + * + * As a release may cause snapshots to be destroyed this tries to ensure they + * aren't mounted. + * + * The release of non-existent holds is skipped. + * + * At least one hold must have been released for this function to succeed + * and return 0. */ -int -dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist) +static int +dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, + dsl_pool_t *tmpdp) { dsl_dataset_user_release_arg_t ddura; nvpair_t *pair; + char *pool; int error; pair = nvlist_next_nvpair(holds, NULL); if (pair == NULL) return (0); + /* + * The release may cause snapshots to be destroyed; make sure they + * are not mounted. + */ + if (tmpdp != NULL) { + /* Temporary holds are specified by dsobj string. */ + ddura.ddura_holdfunc = dsl_dataset_hold_obj_string; + pool = spa_name(tmpdp->dp_spa); +#ifdef _KERNEL + dsl_pool_config_enter(tmpdp, FTAG); + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + dsl_dataset_t *ds; + + error = dsl_dataset_hold_obj_string(tmpdp, + nvpair_name(pair), FTAG, &ds); + if (error == 0) { + char name[MAXNAMELEN]; + dsl_dataset_name(ds, name); + dsl_dataset_rele(ds, FTAG); + (void) zfs_unmount_snap(name); + } + } + dsl_pool_config_exit(tmpdp, FTAG); +#endif + } else { + /* Non-temporary holds are specified by name. 
*/ + ddura.ddura_holdfunc = dsl_dataset_hold; + pool = nvpair_name(pair); +#ifdef _KERNEL + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + (void) zfs_unmount_snap(nvpair_name(pair)); + } +#endif + } + ddura.ddura_holds = holds; ddura.ddura_errlist = errlist; ddura.ddura_todelete = fnvlist_alloc(); + ddura.ddura_chkholds = fnvlist_alloc(); - error = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_release_check, - dsl_dataset_user_release_sync, &ddura, fnvlist_num_pairs(holds)); + error = dsl_sync_task(pool, dsl_dataset_user_release_check, + dsl_dataset_user_release_sync, &ddura, + fnvlist_num_pairs(holds)); fnvlist_free(ddura.ddura_todelete); - return (error); -} - -typedef struct dsl_dataset_user_release_tmp_arg { - uint64_t ddurta_dsobj; - nvlist_t *ddurta_holds; - boolean_t ddurta_deleteme; -} dsl_dataset_user_release_tmp_arg_t; - -static int -dsl_dataset_user_release_tmp_check(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_user_release_tmp_arg_t *ddurta = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - int error; - - if (!dmu_tx_is_syncing(tx)) - return (0); + fnvlist_free(ddura.ddura_chkholds); - error = dsl_dataset_hold_obj(dp, ddurta->ddurta_dsobj, FTAG, &ds); - if (error) - return (error); - - error = dsl_dataset_user_release_check_one(ds, - ddurta->ddurta_holds, &ddurta->ddurta_deleteme); - dsl_dataset_rele(ds, FTAG); return (error); } -static void -dsl_dataset_user_release_tmp_sync(void *arg, dmu_tx_t *tx) -{ - dsl_dataset_user_release_tmp_arg_t *ddurta = arg; - dsl_pool_t *dp = dmu_tx_pool(tx); - dsl_dataset_t *ds; - - VERIFY0(dsl_dataset_hold_obj(dp, ddurta->ddurta_dsobj, FTAG, &ds)); - dsl_dataset_user_release_sync_one(ds, ddurta->ddurta_holds, tx); - if (ddurta->ddurta_deleteme) { - ASSERT(ds->ds_userrefs == 0 && - ds->ds_phys->ds_num_children == 1 && - DS_IS_DEFER_DESTROY(ds)); - dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx); - } - dsl_dataset_rele(ds, FTAG); -} - /* - * Called 
at spa_load time to release a stale temporary user hold. - * Also called by the onexit code. + * holds is nvl of snapname -> { holdname, ... } + * errlist will be filled in with snapname -> error */ -void -dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, const char *htag) -{ - dsl_dataset_user_release_tmp_arg_t ddurta; - -#ifdef _KERNEL - dsl_dataset_t *ds; - int error; - - /* Make sure it is not mounted. */ - dsl_pool_config_enter(dp, FTAG); - error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); - if (error == 0) { - char name[MAXNAMELEN]; - dsl_dataset_name(ds, name); - dsl_dataset_rele(ds, FTAG); - dsl_pool_config_exit(dp, FTAG); - zfs_unmount_snap(name); - } else { - dsl_pool_config_exit(dp, FTAG); - } -#endif - - ddurta.ddurta_dsobj = dsobj; - ddurta.ddurta_holds = fnvlist_alloc(); - fnvlist_add_boolean(ddurta.ddurta_holds, htag); - - (void) dsl_sync_task(spa_name(dp->dp_spa), - dsl_dataset_user_release_tmp_check, - dsl_dataset_user_release_tmp_sync, &ddurta, 1); - fnvlist_free(ddurta.ddurta_holds); -} - -typedef struct zfs_hold_cleanup_arg { - char zhca_spaname[MAXNAMELEN]; - uint64_t zhca_spa_load_guid; - uint64_t zhca_dsobj; - char zhca_htag[MAXNAMELEN]; -} zfs_hold_cleanup_arg_t; - -static void -dsl_dataset_user_release_onexit(void *arg) +int +dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist) { - zfs_hold_cleanup_arg_t *ca = arg; - spa_t *spa; - int error; - - error = spa_open(ca->zhca_spaname, &spa, FTAG); - if (error != 0) { - zfs_dbgmsg("couldn't release hold on pool=%s ds=%llu tag=%s " - "because pool is no longer loaded", - ca->zhca_spaname, ca->zhca_dsobj, ca->zhca_htag); - return; - } - if (spa_load_guid(spa) != ca->zhca_spa_load_guid) { - zfs_dbgmsg("couldn't release hold on pool=%s ds=%llu tag=%s " - "because pool is no longer loaded (guid doesn't match)", - ca->zhca_spaname, ca->zhca_dsobj, ca->zhca_htag); - spa_close(spa, FTAG); - return; - } - - dsl_dataset_user_release_tmp(spa_get_dsl(spa), - ca->zhca_dsobj, 
ca->zhca_htag); - kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); - spa_close(spa, FTAG); + return (dsl_dataset_user_release_impl(holds, errlist, NULL)); } +/* + * holds is nvl of snapdsobj -> { holdname, ... } + */ void -dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag, - minor_t minor) +dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds) { - zfs_hold_cleanup_arg_t *ca = kmem_alloc(sizeof (*ca), KM_PUSHPAGE); - spa_t *spa = dsl_dataset_get_spa(ds); - (void) strlcpy(ca->zhca_spaname, spa_name(spa), - sizeof (ca->zhca_spaname)); - ca->zhca_spa_load_guid = spa_load_guid(spa); - ca->zhca_dsobj = ds->ds_object; - (void) strlcpy(ca->zhca_htag, htag, sizeof (ca->zhca_htag)); - VERIFY0(zfs_onexit_add_cb(minor, - dsl_dataset_user_release_onexit, ca, NULL)); + ASSERT(dp != NULL); + (void) dsl_dataset_user_release_impl(holds, NULL, dp); } int diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c index be782ba80298..2a0e1b78db17 100644 --- a/module/zfs/zfs_ioctl.c +++ b/module/zfs/zfs_ioctl.c @@ -27,8 +27,9 @@ * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. - * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. */ /* @@ -3495,13 +3496,13 @@ zfs_ioc_rollback(zfs_cmd_t *zc) if (error == 0) { int resume_err; - error = dsl_dataset_rollback(zc->zc_name); + error = dsl_dataset_rollback(zc->zc_name, zsb); resume_err = zfs_resume_fs(zsb, zc->zc_name); error = error ? 
error : resume_err; } deactivate_super(zsb->z_sb); } else { - error = dsl_dataset_rollback(zc->zc_name); + error = dsl_dataset_rollback(zc->zc_name, NULL); } return (error); } @@ -4029,13 +4030,13 @@ zfs_ioc_recv(zfs_cmd_t *zc) * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ - end_err = dmu_recv_end(&drc); + end_err = dmu_recv_end(&drc, zsb); if (error == 0) error = zfs_resume_fs(zsb, tofs); error = error ? error : end_err; deactivate_super(zsb->z_sb); } else { - error = dmu_recv_end(&drc); + error = dmu_recv_end(&drc, NULL); } } @@ -4519,8 +4520,11 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) * objset_phys_t). Suspend/resume the fs will do that. */ error = zfs_suspend_fs(zsb); - if (error == 0) + if (error == 0) { + dmu_objset_refresh_ownership(zsb->z_os, + zsb); error = zfs_resume_fs(zsb, zc->zc_name); + } } if (error == 0) error = dmu_objset_userspace_upgrade(zsb->z_os); @@ -4859,16 +4863,6 @@ zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { - nvpair_t *pair; - - /* - * The release may cause the snapshot to be destroyed; make sure it - * is not mounted. - */ - for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(holds, pair)) - zfs_unmount_snap(nvpair_name(pair)); - return (dsl_dataset_user_release(holds, errlist)); } diff --git a/module/zfs/zfs_vfsops.c b/module/zfs/zfs_vfsops.c index eeac0391cb05..a7c2946575a4 100644 --- a/module/zfs/zfs_vfsops.c +++ b/module/zfs/zfs_vfsops.c @@ -1422,7 +1422,9 @@ EXPORT_SYMBOL(zfs_vget); * Block out VFS ops and close zfs_sb_t * * Note, if successful, then we return with the 'z_teardown_lock' and - * 'z_teardown_inactive_lock' write held. + * 'z_teardown_inactive_lock' write held. 
We leave ownership of the underlying + * dataset and objset intact so that they can be atomically handed off during + * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfs_sb_t *zsb) @@ -1432,64 +1434,76 @@ zfs_suspend_fs(zfs_sb_t *zsb) if ((error = zfs_sb_teardown(zsb, B_FALSE)) != 0) return (error); - dmu_objset_disown(zsb->z_os, zsb); - return (0); } EXPORT_SYMBOL(zfs_suspend_fs); /* - * Reopen zfs_sb_t and release VFS ops. + * Rebuild SA and release VOPs. Note that ownership of the underlying dataset + * is an invariant across any of the operations that can be performed while the + * filesystem was suspended. Whether it succeeded or failed, the preconditions + * are the same: the relevant objset and associated dataset are owned by + * zfsvfs, held, and long held on entry. */ int zfs_resume_fs(zfs_sb_t *zsb, const char *osname) { - int err, err2; + int err; + znode_t *zp; + uint64_t sa_obj = 0; ASSERT(RRW_WRITE_HELD(&zsb->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zsb->z_teardown_inactive_lock)); - err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zsb, &zsb->z_os); - if (err) { - zsb->z_os = NULL; - } else { - znode_t *zp; - uint64_t sa_obj = 0; + /* + * We already own this, so just hold and rele it to update the + * objset_t, as the one we had before may have been evicted. 
+ */ + VERIFY0(dmu_objset_hold(osname, zsb, &zsb->z_os)); + VERIFY3P(zsb->z_os->os_dsl_dataset->ds_owner, ==, zsb); + VERIFY(dsl_dataset_long_held(zsb->z_os->os_dsl_dataset)); + dmu_objset_rele(zsb->z_os, zsb); - err2 = zap_lookup(zsb->z_os, MASTER_NODE_OBJ, - ZFS_SA_ATTRS, 8, 1, &sa_obj); + /* + * Make sure version hasn't changed + */ - if ((err || err2) && zsb->z_version >= ZPL_VERSION_SA) - goto bail; + err = zfs_get_zplprop(zsb->z_os, ZFS_PROP_VERSION, + &zsb->z_version); + if (err) + goto bail; - if ((err = sa_setup(zsb->z_os, sa_obj, - zfs_attr_table, ZPL_END, &zsb->z_attr_table)) != 0) - goto bail; + err = zap_lookup(zsb->z_os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj); - VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0); - zsb->z_rollback_time = jiffies; + if (err && zsb->z_version >= ZPL_VERSION_SA) + goto bail; - /* - * Attempt to re-establish all the active inodes with their - * dbufs. If a zfs_rezget() fails, then we unhash the inode - * and mark it stale. This prevents a collision if a new - * inode/object is created which must use the same inode - * number. The stale inode will be be released when the - * VFS prunes the dentry holding the remaining references - * on the stale inode. - */ - mutex_enter(&zsb->z_znodes_lock); - for (zp = list_head(&zsb->z_all_znodes); zp; - zp = list_next(&zsb->z_all_znodes, zp)) { - err2 = zfs_rezget(zp); - if (err2) { - remove_inode_hash(ZTOI(zp)); - zp->z_is_stale = B_TRUE; - } - } - mutex_exit(&zsb->z_znodes_lock); + if ((err = sa_setup(zsb->z_os, sa_obj, + zfs_attr_table, ZPL_END, &zsb->z_attr_table)) != 0) + goto bail; + + if (zsb->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(zsb->z_os, + zfs_sa_upgrade); + + VERIFY(zfs_sb_setup(zsb, B_FALSE) == 0); + + zfs_set_fuid_feature(zsb); + + /* + * Attempt to re-establish all the active znodes with + * their dbufs. If a zfs_rezget() fails, then we'll let + * any potential callers discover that via ZFS_ENTER_VERIFY_VP + * when they try to use their znode. 
+ */ + mutex_enter(&zsb->z_znodes_lock); + for (zp = list_head(&zsb->z_all_znodes); zp; + zp = list_next(&zsb->z_all_znodes, zp)) { + (void) zfs_rezget(zp); } + mutex_exit(&zsb->z_znodes_lock); bail: /* release the VFS ops */ @@ -1506,6 +1520,7 @@ zfs_resume_fs(zfs_sb_t *zsb, const char *osname) } return (err); } + EXPORT_SYMBOL(zfs_resume_fs); int