diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index e20d601340c6..aa4328783ed8 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1769,7 +1769,7 @@ completes in order to verify the checksums of all blocks which have been resilvered. This is enabled by default and strongly recommended. . -.It Sy zfs_rebuild_vdev_limit Ns = Ns Sy 33554432 Ns B Po 32 MiB Pc Pq u64 +.It Sy zfs_rebuild_vdev_limit Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64 Maximum amount of I/O that can be concurrently issued for a sequential resilver per leaf device, given in bytes. . @@ -1898,7 +1898,7 @@ When disabled, the memory limit may be exceeded by fast disks. Freezes a scrub/resilver in progress without actually pausing it. Intended for testing/debugging. . -.It Sy zfs_scan_vdev_limit Ns = Ns Sy 4194304 Ns B Po 4 MiB Pc Pq int +.It Sy zfs_scan_vdev_limit Ns = Ns Sy 16777216 Ns B Po 16 MiB Pc Pq int Maximum amount of data that can be concurrently issued at once for scrubs and resilvers per leaf device, given in bytes. . diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index f9e437f0c947..a680d95fd6e3 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -126,7 +127,7 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); -static uint64_t dsl_scan_count_data_disks(vdev_t *vd); +static uint64_t dsl_scan_count_data_disks(spa_t *spa); extern uint_t zfs_vdev_async_write_active_min_dirty_percent; static int zfs_scan_blkstats = 0; @@ -147,7 +148,7 @@ static int zfs_scan_strict_mem_lim = B_FALSE; * overload the drives with I/O, since that is protected by * zfs_vdev_scrub_max_active. 
*/ -static uint64_t zfs_scan_vdev_limit = 4 << 20; +static uint64_t zfs_scan_vdev_limit = 16 << 20; static uint_t zfs_scan_issue_strategy = 0; @@ -466,11 +467,12 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) /* * Calculate the max number of in-flight bytes for pool-wide - * scanning operations (minimum 1MB). Limits for the issuing - * phase are done per top-level vdev and are handled separately. + * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). + * Limits for the issuing phase are done per top-level vdev and + * are handled separately. */ - scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * - dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20); + scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, + zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa))); avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), offsetof(scan_ds_t, sds_node)); @@ -2811,8 +2813,9 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) } static uint64_t -dsl_scan_count_data_disks(vdev_t *rvd) +dsl_scan_count_data_disks(spa_t *spa) { + vdev_t *rvd = spa->spa_root_vdev; uint64_t i, leaves = 0; for (i = 0; i < rvd->vdev_children; i++) { @@ -3711,12 +3714,13 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) taskqid_t prefetch_tqid; /* - * Recalculate the max number of in-flight bytes for pool-wide - * scanning operations (minimum 1MB). Limits for the issuing - * phase are done per top-level vdev and are handled separately. + * Calculate the max number of in-flight bytes for pool-wide + * scanning operations (minimum 1MB, maximum 1/4 of arc_c_max). + * Limits for the issuing phase are done per top-level vdev and + * are handled separately. 
*/ - scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * - dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20); + scn->scn_maxinflight_bytes = MIN(arc_c_max / 4, MAX(1ULL << 20, + zfs_scan_vdev_limit * dsl_scan_count_data_disks(spa))); if (scnp->scn_ddt_bookmark.ddb_class <= scnp->scn_ddt_class_max) { diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 1f56275c853b..62aa61b3b9e7 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -34,6 +34,7 @@ #include #include #include +#include #include /* @@ -116,13 +117,12 @@ static uint64_t zfs_rebuild_max_segment = 1024 * 1024; * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep * the queue depth short. * - * 32MB was selected as the default value to achieve good performance with - * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s). A sequential - * rebuild was unable to saturate all of the drives using smaller values. - * With a value of 32MB the sequential resilver write rate was measured at - * 800MB/s sustained while rebuilding to a distributed spare. + * 64MB was observed to deliver the best performance and set as the default. + * Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c) + * and a rebuild rate of 1.2GB/s was measured to the distributed spare. + * Smaller values were unable to fully saturate the available pool I/O. 
*/ -static uint64_t zfs_rebuild_vdev_limit = 32 << 20; +static uint64_t zfs_rebuild_vdev_limit = 64 << 20; /* * Automatically start a pool scrub when the last active sequential resilver @@ -754,6 +754,7 @@ vdev_rebuild_thread(void *arg) { vdev_t *vd = arg; spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; int error = 0; /* @@ -786,9 +787,6 @@ vdev_rebuild_thread(void *arg) vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; - vr->vr_bytes_inflight_max = MAX(1ULL << 20, - zfs_rebuild_vdev_limit * vd->vdev_children); - uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); @@ -804,6 +802,17 @@ vdev_rebuild_thread(void *arg) metaslab_t *msp = vd->vdev_ms[i]; vr->vr_scan_msp = msp; + /* + * Calculate the max number of in-flight bytes for top-level + * vdev scanning operations (minimum 1MB, maximum 1/4 of + * arc_c_max shared by all top-level vdevs). Limits for the + * issuing phase are done per top-level vdev and are handled + * separately. + */ + uint64_t limit = (arc_c_max / 4) / MAX(rvd->vdev_children, 1); + vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20, + zfs_rebuild_vdev_limit * vd->vdev_children)); + /* * Removal of vdevs from the vdev tree may eliminate the need * for the rebuild, in which case it should be canceled. The