Skip to content

Commit

Permalink
Fix send/recv lost spill block
Browse files Browse the repository at this point in the history
When receiving a DRR_OBJECT record the receive_object() function
needs to determine how to handle a spill block associated with the
object.  It may need to be removed or kept depending on how the
object was modified at the source.

This determination is currently accomplished using a heuristic which
takes in to account the DRR_OBJECT record and the existing object
properties.  This is a problem because there isn't quite enough
information available to do the right thing under all circumstances.
For example, when only the block size changes the spill block is
removed when it should be kept.

What's needed to resolve this is an additional flag in the DRR_OBJECT
which indicates if the object being received references a spill block.
The DRR_OBJECT_SPILL flag was added for this purpose.  When set then
the object references a spill block and it must be kept.  Either
it is update to date, or it will be replaced by a subsequent DRR_SPILL
record.  Conversely, if the object being received doesn't reference
a spill block then any existing spill block should always be removed.

Since previous versions of ZFS do not understand this new flag
additional DRR_SPILL records will be inserted in to the stream.
This has the advantage of being fully backward compatible.  Existing
ZFS systems receiving this stream will recreate the spill block if
it was incorrectly removed.  Updated ZFS versions will correctly
ignore the additional spill blocks which can be identified by
checking for the DRR_SPILL_UNMODIFIED flag.

The small downside to this approach is that is may increase the size
of the stream and of the received snapshot on previous versions of
ZFS.  Additionally, when receiving streams generated by previous
unpatched versions of ZFS spill blocks may still be lost.

OpenZFS-issue: https://www.illumos.org/issues/9952
FreeBSD-issue: https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=233277

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Tom Caputi <tcaputi@datto.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #8668
  • Loading branch information
behlendorf authored May 7, 2019
1 parent 9c53e51 commit caf9dd2
Show file tree
Hide file tree
Showing 19 changed files with 398 additions and 42 deletions.
3 changes: 2 additions & 1 deletion include/sys/dmu.h
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,8 @@ int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object,
dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype,
int bonuslen, int dnodesize, dmu_tx_t *txp);
int bonuslen, int dnodesize, boolean_t keep_spill, dmu_tx_t *tx);
int dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx);

/*
* Free an object from this objset.
Expand Down
1 change: 1 addition & 0 deletions include/sys/dmu_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ typedef struct dmu_sendarg {
objset_t *dsa_os;
zio_cksum_t dsa_zc;
uint64_t dsa_toguid;
uint64_t dsa_fromtxg;
int dsa_err;
dmu_pendop_t dsa_pending_op;
uint64_t dsa_featureflags;
Expand Down
1 change: 1 addition & 0 deletions include/sys/dmu_recv.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ typedef struct dmu_recv_cookie {
boolean_t drc_resumable;
boolean_t drc_raw;
boolean_t drc_clone;
boolean_t drc_spill;
struct avl_tree *drc_guid_to_ds_map;
nvlist_t *drc_keynvl;
zio_cksum_t drc_cksum;
Expand Down
7 changes: 4 additions & 3 deletions include/sys/dnode.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,8 @@ typedef struct dnode_phys {
};
} dnode_phys_t;

#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \
(((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT))
#define DN_SPILL_BLKPTR(dnp) ((blkptr_t *)((char *)(dnp) + \
(((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT)))

struct dnode {
/*
Expand Down Expand Up @@ -420,7 +420,8 @@ void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx);
void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx);
dmu_object_type_t bonustype, int bonuslen, int dn_slots,
boolean_t keep_spill, dmu_tx_t *tx);
void dnode_free(dnode_t *dn, dmu_tx_t *tx);
void dnode_byteswap(dnode_phys_t *dnp);
void dnode_buf_byteswap(void *buf, size_t size);
Expand Down
1 change: 1 addition & 0 deletions include/sys/fs/zfs.h
Original file line number Diff line number Diff line change
Expand Up @@ -1317,6 +1317,7 @@ typedef enum {
ZFS_ERR_WRONG_PARENT,
ZFS_ERR_FROM_IVSET_GUID_MISSING,
ZFS_ERR_FROM_IVSET_GUID_MISMATCH,
ZFS_ERR_SPILL_BLOCK_FLAG_MISSING,
} zfs_errno_t;

/*
Expand Down
28 changes: 25 additions & 3 deletions include/sys/zfs_ioctl.h
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ typedef enum drr_headertype {
/* flag #18 is reserved for a Delphix feature */
#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19)
#define DMU_BACKUP_FEATURE_RESUMING (1 << 20)
/* flag #21 is reserved for a Delphix feature */
/* flag #21 is reserved for the redacted send/receive feature */
#define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22)
#define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23)
#define DMU_BACKUP_FEATURE_RAW (1 << 24)
Expand Down Expand Up @@ -131,7 +131,7 @@ typedef enum dmu_send_resume_token_version {
*
* 64 56 48 40 32 24 16 8 0
* +-------+-------+-------+-------+-------+-------+-------+-------+
* | reserved | feature-flags |C|S|
* | reserved | feature-flags |C|S|
* +-------+-------+-------+-------+-------+-------+-------+-------+
*
* The low order two bits indicate the header type: SUBSTREAM (0x1)
Expand Down Expand Up @@ -160,16 +160,38 @@ typedef enum dmu_send_resume_token_version {
* cannot necessarily be received as a clone correctly.
*/
#define DRR_FLAG_FREERECORDS (1<<2)
/*
* When DRR_FLAG_SPILL_BLOCK is set it indicates the DRR_OBJECT_SPILL
* and DRR_SPILL_UNMODIFIED flags are meaningful in the send stream.
*
* When DRR_FLAG_SPILL_BLOCK is set, DRR_OBJECT records will have
* DRR_OBJECT_SPILL set if and only if they should have a spill block
* (either an existing one, or a new one in the send stream). When clear
* the object does not have a spill block and any existing spill block
* should be freed.
*
* Similarly, when DRR_FLAG_SPILL_BLOCK is set, DRR_SPILL records will
* have DRR_SPILL_UNMODIFIED set if and only if they were included for
* backward compatibility purposes, and can be safely ignored by new versions
* of zfs receive. Previous versions of ZFS which do not understand the
* DRR_FLAG_SPILL_BLOCK will process this record and recreate any missing
* spill blocks.
*/
#define DRR_FLAG_SPILL_BLOCK (1<<3)

/*
* flags in the drr_flags field in the DRR_WRITE, DRR_SPILL, DRR_OBJECT,
* DRR_WRITE_BYREF, and DRR_OBJECT_RANGE blocks
*/
#define DRR_CHECKSUM_DEDUP (1<<0) /* not used for DRR_SPILL blocks */
#define DRR_CHECKSUM_DEDUP (1<<0) /* not used for SPILL records */
#define DRR_RAW_BYTESWAP (1<<1)
#define DRR_OBJECT_SPILL (1<<2) /* OBJECT record has a spill block */
#define DRR_SPILL_UNMODIFIED (1<<2) /* SPILL record for unmodified block */

#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP)
#define DRR_IS_RAW_BYTESWAPPED(flags) ((flags) & DRR_RAW_BYTESWAP)
#define DRR_OBJECT_HAS_SPILL(flags) ((flags) & DRR_OBJECT_SPILL)
#define DRR_SPILL_IS_UNMODIFIED(flags) ((flags) & DRR_SPILL_UNMODIFIED)

/* deal with compressed drr_write replay records */
#define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0)
Expand Down
7 changes: 7 additions & 0 deletions lib/libzfs/libzfs_sendrecv.c
Original file line number Diff line number Diff line change
Expand Up @@ -4466,6 +4466,13 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
"of raw encrypted send streams."));
(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
break;
case ZFS_ERR_SPILL_BLOCK_FLAG_MISSING:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"Spill block flag missing for raw send.\n"
"The zfs software on the sending system must "
"be updated."));
(void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
break;
case EBUSY:
if (hastoken) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
Expand Down
16 changes: 15 additions & 1 deletion man/man5/zfs-module-parameters.5
Original file line number Diff line number Diff line change
Expand Up @@ -2337,6 +2337,21 @@ Allow sending of corrupt data (ignore read/checksum errors when sending data)
Use \fB1\fR for yes and \fB0\fR for no (default).
.RE

.sp
.ne 2
.na
\fBzfs_send_unmodified_spill_blocks\fR (int)
.ad
.RS 12n
Include unmodified spill blocks in the send stream. Under certain circumstances
previous versions of ZFS could incorrectly remove the spill block from an
existing object. Including unmodified copies of the spill blocks creates a
backwards compatible stream which will recreate a spill block if it was
incorrectly removed.
.sp
Use \fB1\fR for yes (default) and \fB0\fR for no.
.RE

.sp
.ne 2
.na
Expand All @@ -2355,7 +2370,6 @@ Default value: \fB16,777,216\fR.
\fBzfs_recv_queue_length\fR (int)
.ad
.RS 12n
.sp
The maximum number of bytes allowed in the \fBzfs receive\fR queue. This value
must be at least twice the maximum block size in use.
.sp
Expand Down
2 changes: 1 addition & 1 deletion module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -2466,7 +2466,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
ASSERT(db->db_level == 0);
ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
ASSERT(buf != NULL);
ASSERT(arc_buf_lsize(buf) == db->db.db_size);
ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
ASSERT(tx->tx_txg != 0);

arc_return_buf(buf, db);
Expand Down
31 changes: 28 additions & 3 deletions module/zfs/dmu_object.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
* Copyright 2014 HybridCluster. All rights reserved.
*/

#include <sys/dbuf.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
Expand Down Expand Up @@ -304,13 +305,13 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
bonuslen, DNODE_MIN_SIZE, tx));
bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
}

int
dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
dmu_tx_t *tx)
boolean_t keep_spill, dmu_tx_t *tx)
{
dnode_t *dn;
int dn_slots = dnodesize >> DNODE_SHIFT;
Expand All @@ -327,7 +328,30 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
if (err)
return (err);

dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);
dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
keep_spill, tx);

dnode_rele(dn, FTAG);
return (err);
}

int
dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
dnode_t *dn;
int err;

err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
FTAG, &dn);
if (err)
return (err);

rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
dbuf_rm_spill(dn, tx);
dnode_rm_spill(dn, tx);
}
rw_exit(&dn->dn_struct_rwlock);

dnode_rele(dn, FTAG);
return (err);
Expand Down Expand Up @@ -489,6 +513,7 @@ EXPORT_SYMBOL(dmu_object_claim);
EXPORT_SYMBOL(dmu_object_claim_dnsize);
EXPORT_SYMBOL(dmu_object_reclaim);
EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
EXPORT_SYMBOL(dmu_object_rm_spill);
EXPORT_SYMBOL(dmu_object_free);
EXPORT_SYMBOL(dmu_object_next);
EXPORT_SYMBOL(dmu_object_zapify);
Expand Down
67 changes: 58 additions & 9 deletions module/zfs/dmu_recv.c
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
/* embedded data is incompatible with encryption and raw recv */
if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
return (SET_ERROR(EINVAL));

/* raw receives require spill block allocation flag */
if (!(flags & DRR_FLAG_SPILL_BLOCK))
return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
} else {
dsflags |= DS_HOLD_FLAG_DECRYPT;
}
Expand Down Expand Up @@ -615,8 +619,13 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
(void) snprintf(recvname, sizeof (recvname), "%s/%s",
tofs, recv_clone_name);

if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0)
if (featureflags & DMU_BACKUP_FEATURE_RAW) {
/* raw receives require spill block allocation flag */
if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK))
return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
} else {
dsflags |= DS_HOLD_FLAG_DECRYPT;
}

if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
/* %recv does not exist; continue in tofs */
Expand Down Expand Up @@ -764,6 +773,9 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
return (SET_ERROR(EINVAL));
}

if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)
drc->drc_spill = B_TRUE;

drba.drba_origin = origin;
drba.drba_cookie = drc;
drba.drba_cred = CRED();
Expand Down Expand Up @@ -835,7 +847,8 @@ struct receive_writer_arg {
/* A map from guid to dataset to help handle dedup'd streams. */
avl_tree_t *guid_to_ds_map;
boolean_t resumable;
boolean_t raw;
boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */
boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */
uint64_t last_object;
uint64_t last_offset;
uint64_t max_object; /* highest object ID referenced in stream */
Expand Down Expand Up @@ -1151,10 +1164,19 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
drro->drr_raw_bonuslen)
return (SET_ERROR(EINVAL));
} else {
if (drro->drr_flags != 0 || drro->drr_raw_bonuslen != 0 ||
drro->drr_indblkshift != 0 || drro->drr_nlevels != 0 ||
drro->drr_nblkptr != 0)
/*
* The DRR_OBJECT_SPILL flag is valid when the DRR_BEGIN
* record indicates this by setting DRR_FLAG_SPILL_BLOCK.
*/
if (((drro->drr_flags & ~(DRR_OBJECT_SPILL))) ||
(!rwa->spill && DRR_OBJECT_HAS_SPILL(drro->drr_flags))) {
return (SET_ERROR(EINVAL));
}

if (drro->drr_raw_bonuslen != 0 || drro->drr_nblkptr != 0 ||
drro->drr_indblkshift != 0 || drro->drr_nlevels != 0) {
return (SET_ERROR(EINVAL));
}
}

err = dmu_object_info(rwa->os, drro->drr_object, &doi);
Expand Down Expand Up @@ -1312,7 +1334,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
}

if (object == DMU_NEW_OBJECT) {
/* currently free, want to be allocated */
/* Currently free, wants to be allocated */
err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen,
Expand All @@ -1321,11 +1343,19 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
drro->drr_blksz != doi.doi_data_block_size ||
drro->drr_bonustype != doi.doi_bonus_type ||
drro->drr_bonuslen != doi.doi_bonus_size) {
/* currently allocated, but with different properties */
/* Currently allocated, but with different properties */
err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen,
dn_slots << DNODE_SHIFT, tx);
dn_slots << DNODE_SHIFT, rwa->spill ?
DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx);
} else if (rwa->spill && !DRR_OBJECT_HAS_SPILL(drro->drr_flags)) {
/*
* Currently allocated, the existing version of this object
* may reference a spill block that is no longer allocated
* at the source and needs to be freed.
*/
err = dmu_object_rm_spill(rwa->os, drro->drr_object, tx);
}

if (err != 0) {
Expand Down Expand Up @@ -1665,6 +1695,17 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
return (SET_ERROR(EINVAL));

/*
* This is an unmodified spill block which was added to the stream
* to resolve an issue with incorrectly removing spill blocks. It
* should be ignored by current versions of the code which support
* the DRR_FLAG_SPILL_BLOCK flag.
*/
if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) {
dmu_return_arcbuf(abuf);
return (0);
}

if (rwa->raw) {
if (!DMU_OT_IS_VALID(drrs->drr_type) ||
drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS ||
Expand Down Expand Up @@ -1699,9 +1740,16 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
return (err);
}

if (db_spill->db_size < drrs->drr_length)
/*
* Spill blocks may both grow and shrink. When a change in size
* occurs any existing dbuf must be updated to match the logical
* size of the provided arc_buf_t.
*/
if (db_spill->db_size != drrs->drr_length) {
dmu_buf_will_fill(db_spill, tx);
VERIFY(0 == dbuf_spill_set_blksz(db_spill,
drrs->drr_length, tx));
}

if (rwa->byteswap && !arc_is_encrypted(abuf) &&
arc_get_compression(abuf) == ZIO_COMPRESS_OFF) {
Expand Down Expand Up @@ -2575,6 +2623,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
rwa->byteswap = drc->drc_byteswap;
rwa->resumable = drc->drc_resumable;
rwa->raw = drc->drc_raw;
rwa->spill = drc->drc_spill;
rwa->os->os_raw_receive = drc->drc_raw;

(void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc,
Expand Down
Loading

0 comments on commit caf9dd2

Please sign in to comment.