RAID-Z expansion feature

This feature allows disks to be added one at a time to a RAID-Z group, expanding its capacity incrementally. This feature is especially useful for small pools (typically with only one RAID-Z group), where there isn't sufficient hardware to add capacity by adding a whole new RAID-Z group (typically doubling the number of disks). == Initiating expansion == A new device (disk) can be attached to an existing RAIDZ vdev, by running `zpool attach POOL raidzP-N NEW_DEVICE`, e.g. `zpool attach tank raidz2-0 sda`. The new device will become part of the RAIDZ group. A "raidz expansion" will be initiated, and the new device will contribute additional space to the RAIDZ group once the expansion completes. The `feature@raidz_expansion` on-disk feature flag must be `enabled` to initiate an expansion, and it remains `active` for the life of the pool. In other words, pools with expanded RAIDZ vdevs cannot be imported by older releases of the ZFS software. == During expansion == The expansion entails reading all allocated space from existing disks in the RAIDZ group, and rewriting it to the new disks in the RAIDZ group (including the newly added device). The expansion progress can be monitored with `zpool status`. Data redundancy is maintained during (and after) the expansion. If a disk fails while the expansion is in progress, the expansion pauses until the health of the RAIDZ vdev is restored (e.g. by replacing the failed disk and waiting for reconstruction to complete). The pool remains accessible during expansion. Following a reboot or export/import, the expansion resumes where it left off. == After expansion == When the expansion completes, the additional space is available for use, and is reflected in the `available` zfs property (as seen in `zfs list`, `df`, etc). Expansion does not change the number of failures that can be tolerated without data loss (e.g. a RAIDZ2 is still a RAIDZ2 even after expansion). A RAIDZ vdev can be expanded multiple times.
After the expansion completes, old blocks remain with their old data-to-parity ratio (e.g. a 5-wide RAIDZ2 has 3 data to 2 parity), but distributed among the larger set of disks. New blocks will be written with the new data-to-parity ratio (e.g. a 5-wide RAIDZ2 which has been expanded once to 6-wide has 4 data to 2 parity). However, the RAIDZ vdev's "assumed parity ratio" does not change, so slightly less space than is expected may be reported for newly-written blocks, according to `zfs list`, `df`, `ls -s`, and similar tools. Sponsored-by: The FreeBSD Foundation Sponsored-by: iXsystems, Inc. Sponsored-by: vStack Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Mark Maybee <mark.maybee@delphix.com> Authored-by: Matthew Ahrens <mahrens@delphix.com> Contributions-by: Fedor Uporov <fuporov.vstack@gmail.com> Contributions-by: Stuart Maybee <stuart.maybee@comcast.net> Contributions-by: Thorsten Behrens <tbehrens@outlook.com> Contributions-by: Fmstrat <nospam@nowsci.com> Contributions-by: Don Brady <dev.fs.zfs@gmail.com> Signed-off-by: Don Brady <dev.fs.zfs@gmail.com> Closes #15022
openzfs · Nov 8, 2023 · 5caeef0 · 5caeef0
1 parent 9198de8
commit 5caeef0
Show file tree

Hide file tree

Showing 62 changed files with 5,737 additions and 873 deletions.
diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c
@@ -84,10 +84,10 @@ run_gen_bench_impl(const char *impl)
 
 			if (rto_opts.rto_expand) {
 				rm_bench = vdev_raidz_map_alloc_expanded(
-				    zio_bench.io_abd,
-				    zio_bench.io_size, zio_bench.io_offset,
+				    &zio_bench,
 				    rto_opts.rto_ashift, ncols+1, ncols,
-				    fn+1, rto_opts.rto_expand_offset);
+				    fn+1, rto_opts.rto_expand_offset,
+				    0, B_FALSE);
 			} else {
 				rm_bench = vdev_raidz_map_alloc(&zio_bench,
 				    BENCH_ASHIFT, ncols, fn+1);
@@ -172,10 +172,10 @@ run_rec_bench_impl(const char *impl)
 
 			if (rto_opts.rto_expand) {
 				rm_bench = vdev_raidz_map_alloc_expanded(
-				    zio_bench.io_abd,
-				    zio_bench.io_size, zio_bench.io_offset,
+				    &zio_bench,
 				    BENCH_ASHIFT, ncols+1, ncols,
-				    PARITY_PQR, rto_opts.rto_expand_offset);
+				    PARITY_PQR,
+				    rto_opts.rto_expand_offset, 0, B_FALSE);
 			} else {
 				rm_bench = vdev_raidz_map_alloc(&zio_bench,
 				    BENCH_ASHIFT, ncols, PARITY_PQR);

diff --git a/cmd/raidz_test/raidz_test.c b/cmd/raidz_test/raidz_test.c
@@ -327,14 +327,12 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
 
 	if (opts->rto_expand) {
 		opts->rm_golden =
-		    vdev_raidz_map_alloc_expanded(opts->zio_golden->io_abd,
-		    opts->zio_golden->io_size, opts->zio_golden->io_offset,
+		    vdev_raidz_map_alloc_expanded(opts->zio_golden,
 		    opts->rto_ashift, total_ncols+1, total_ncols,
-		    parity, opts->rto_expand_offset);
-		rm_test = vdev_raidz_map_alloc_expanded(zio_test->io_abd,
-		    zio_test->io_size, zio_test->io_offset,
+		    parity, opts->rto_expand_offset, 0, B_FALSE);
+		rm_test = vdev_raidz_map_alloc_expanded(zio_test,
 		    opts->rto_ashift, total_ncols+1, total_ncols,
-		    parity, opts->rto_expand_offset);
+		    parity, opts->rto_expand_offset, 0, B_FALSE);
 	} else {
 		opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden,
 		    opts->rto_ashift, total_ncols, parity);
@@ -361,187 +359,6 @@ init_raidz_golden_map(raidz_test_opts_t *opts, const int parity)
 	return (err);
 }
 
-/*
- * If reflow is not in progress, reflow_offset should be UINT64_MAX.
- * For each row, if the row is entirely before reflow_offset, it will
- * come from the new location.  Otherwise this row will come from the
- * old location.  Therefore, rows that straddle the reflow_offset will
- * come from the old location.
- *
- * NOTE: Until raidz expansion is implemented this function is only
- * needed by raidz_test.c to the multi-row raid_map_t functionality.
- */
-raidz_map_t *
-vdev_raidz_map_alloc_expanded(abd_t *abd, uint64_t size, uint64_t offset,
-    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
-    uint64_t nparity, uint64_t reflow_offset)
-{
-	/* The zio's size in units of the vdev's minimum sector size. */
-	uint64_t s = size >> ashift;
-	uint64_t q, r, bc, devidx, asize = 0, tot;
-
-	/*
-	 * "Quotient": The number of data sectors for this stripe on all but
-	 * the "big column" child vdevs that also contain "remainder" data.
-	 * AKA "full rows"
-	 */
-	q = s / (logical_cols - nparity);
-
-	/*
-	 * "Remainder": The number of partial stripe data sectors in this I/O.
-	 * This will add a sector to some, but not all, child vdevs.
-	 */
-	r = s - q * (logical_cols - nparity);
-
-	/* The number of "big columns" - those which contain remainder data. */
-	bc = (r == 0 ? 0 : r + nparity);
-
-	/*
-	 * The total number of data and parity sectors associated with
-	 * this I/O.
-	 */
-	tot = s + nparity * (q + (r == 0 ? 0 : 1));
-
-	/* How many rows contain data (not skip) */
-	uint64_t rows = howmany(tot, logical_cols);
-	int cols = MIN(tot, logical_cols);
-
-	raidz_map_t *rm = kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
-	    KM_SLEEP);
-	rm->rm_nrows = rows;
-
-	for (uint64_t row = 0; row < rows; row++) {
-		raidz_row_t *rr = kmem_alloc(offsetof(raidz_row_t,
-		    rr_col[cols]), KM_SLEEP);
-		rm->rm_row[row] = rr;
-
-		/* The starting RAIDZ (parent) vdev sector of the row. */
-		uint64_t b = (offset >> ashift) + row * logical_cols;
-
-		/*
-		 * If we are in the middle of a reflow, and any part of this
-		 * row has not been copied, then use the old location of
-		 * this row.
-		 */
-		int row_phys_cols = physical_cols;
-		if (b + (logical_cols - nparity) > reflow_offset >> ashift)
-			row_phys_cols--;
-
-		/* starting child of this row */
-		uint64_t child_id = b % row_phys_cols;
-		/* The starting byte offset on each child vdev. */
-		uint64_t child_offset = (b / row_phys_cols) << ashift;
-
-		/*
-		 * We set cols to the entire width of the block, even
-		 * if this row is shorter.  This is needed because parity
-		 * generation (for Q and R) needs to know the entire width,
-		 * because it treats the short row as though it was
-		 * full-width (and the "phantom" sectors were zero-filled).
-		 *
-		 * Another approach to this would be to set cols shorter
-		 * (to just the number of columns that we might do i/o to)
-		 * and have another mechanism to tell the parity generation
-		 * about the "entire width".  Reconstruction (at least
-		 * vdev_raidz_reconstruct_general()) would also need to
-		 * know about the "entire width".
-		 */
-		rr->rr_cols = cols;
-		rr->rr_bigcols = bc;
-		rr->rr_missingdata = 0;
-		rr->rr_missingparity = 0;
-		rr->rr_firstdatacol = nparity;
-		rr->rr_abd_empty = NULL;
-		rr->rr_nempty = 0;
-
-		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
-			if (child_id >= row_phys_cols) {
-				child_id -= row_phys_cols;
-				child_offset += 1ULL << ashift;
-			}
-			rr->rr_col[c].rc_devidx = child_id;
-			rr->rr_col[c].rc_offset = child_offset;
-			rr->rr_col[c].rc_orig_data = NULL;
-			rr->rr_col[c].rc_error = 0;
-			rr->rr_col[c].rc_tried = 0;
-			rr->rr_col[c].rc_skipped = 0;
-			rr->rr_col[c].rc_need_orig_restore = B_FALSE;
-
-			uint64_t dc = c - rr->rr_firstdatacol;
-			if (c < rr->rr_firstdatacol) {
-				rr->rr_col[c].rc_size = 1ULL << ashift;
-				rr->rr_col[c].rc_abd =
-				    abd_alloc_linear(rr->rr_col[c].rc_size,
-				    B_TRUE);
-			} else if (row == rows - 1 && bc != 0 && c >= bc) {
-				/*
-				 * Past the end, this for parity generation.
-				 */
-				rr->rr_col[c].rc_size = 0;
-				rr->rr_col[c].rc_abd = NULL;
-			} else {
-				/*
-				 * "data column" (col excluding parity)
-				 * Add an ASCII art diagram here
-				 */
-				uint64_t off;
-
-				if (c < bc || r == 0) {
-					off = dc * rows + row;
-				} else {
-					off = r * rows +
-					    (dc - r) * (rows - 1) + row;
-				}
-				rr->rr_col[c].rc_size = 1ULL << ashift;
-				rr->rr_col[c].rc_abd = abd_get_offset_struct(
-				    &rr->rr_col[c].rc_abdstruct,
-				    abd, off << ashift, 1 << ashift);
-			}
-
-			asize += rr->rr_col[c].rc_size;
-		}
-		/*
-		 * If all data stored spans all columns, there's a danger that
-		 * parity will always be on the same device and, since parity
-		 * isn't read during normal operation, that that device's I/O
-		 * bandwidth won't be used effectively. We therefore switch
-		 * the parity every 1MB.
-		 *
-		 * ...at least that was, ostensibly, the theory. As a practical
-		 * matter unless we juggle the parity between all devices
-		 * evenly, we won't see any benefit. Further, occasional writes
-		 * that aren't a multiple of the LCM of the number of children
-		 * and the minimum stripe width are sufficient to avoid pessimal
-		 * behavior. Unfortunately, this decision created an implicit
-		 * on-disk format requirement that we need to support for all
-		 * eternity, but only for single-parity RAID-Z.
-		 *
-		 * If we intend to skip a sector in the zeroth column for
-		 * padding we must make sure to note this swap. We will never
-		 * intend to skip the first column since at least one data and
-		 * one parity column must appear in each row.
-		 */
-		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
-		    (offset & (1ULL << 20))) {
-			ASSERT(rr->rr_cols >= 2);
-			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
-			devidx = rr->rr_col[0].rc_devidx;
-			uint64_t o = rr->rr_col[0].rc_offset;
-			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
-			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
-			rr->rr_col[1].rc_devidx = devidx;
-			rr->rr_col[1].rc_offset = o;
-		}
-
-	}
-	ASSERT3U(asize, ==, tot << ashift);
-
-	/* init RAIDZ parity ops */
-	rm->rm_ops = vdev_raidz_math_get_ops();
-
-	return (rm);
-}
-
 static raidz_map_t *
 init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
 {
@@ -561,10 +378,9 @@ init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity)
 	init_zio_abd(*zio);
 
 	if (opts->rto_expand) {
-		rm = vdev_raidz_map_alloc_expanded((*zio)->io_abd,
-		    (*zio)->io_size, (*zio)->io_offset,
+		rm = vdev_raidz_map_alloc_expanded(*zio,
 		    opts->rto_ashift, total_ncols+1, total_ncols,
-		    parity, opts->rto_expand_offset);
+		    parity, opts->rto_expand_offset, 0, B_FALSE);
 	} else {
 		rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift,
 		    total_ncols, parity);

diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h
@@ -119,7 +119,4 @@ void init_zio_abd(zio_t *zio);
 
 void run_raidz_benchmark(void);
 
-struct raidz_map *vdev_raidz_map_alloc_expanded(abd_t *, uint64_t, uint64_t,
-    uint64_t, uint64_t, uint64_t, uint64_t, uint64_t);
-
 #endif /* RAIDZ_TEST_H */
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
@@ -4134,6 +4134,11 @@ dump_uberblock(uberblock_t *ub, const char *header, const char *footer)
 	}
 	(void) printf("\tcheckpoint_txg = %llu\n",
 	    (u_longlong_t)ub->ub_checkpoint_txg);
+
+	(void) printf("\traidz_reflow state=%u off=%llu\n",
+	    (int)RRSS_GET_STATE(ub),
+	    (u_longlong_t)RRSS_GET_OFFSET(ub));
+
 	(void) printf("%s", footer ? footer : "");
 }