diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 index e41d6fd89b00..f4afe2525c89 100644 --- a/man/man5/zfs-module-parameters.5 +++ b/man/man5/zfs-module-parameters.5 @@ -4060,11 +4060,25 @@ Percentage of online CPUs (or CPU cores, etc) which will run a worker thread for I/O. These workers are responsible for I/O work such as compression and checksum calculations. Fractional number of CPUs will be rounded down. .sp -The default value of 75 was chosen to avoid using all CPUs which can result in -latency issues and inconsistent application performance, especially when high -compression is enabled. +The default value of 80 was chosen to avoid using all CPUs which can result in +latency issues and inconsistent application performance, especially when slower +compression and/or checksumming is enabled. .sp -Default value: \fB75\fR. +Default value: \fB80\fR. +.RE + +.sp +.ne 2 +.na +\fBzio_taskq_batch_tpq\fR (uint) +.ad +.RS 12n +Number of worker threads per taskq. Lower values improve I/O ordering and +CPU utilization, while higher values reduce lock contention. +.sp +If 0, about 6 worker threads per taskq are used, depending on system size. +.sp +Default value: \fB0\fR. .RE .sp diff --git a/module/zfs/spa.c b/module/zfs/spa.c index a30821e045fa..26995575adaa 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -108,6 +108,7 @@ int zfs_ccw_retry_interval = 300; typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ + ZTI_MODE_SCALE, /* Taskqs scale with CPUs. 
*/ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; @@ -115,6 +116,7 @@ typedef enum zti_modes { #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } +#define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) @@ -141,7 +143,8 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the - * particular taskq is chosen at random. + * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, + * but with number of taskqs also scaling with number of CPUs. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that @@ -150,9 +153,9 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ - { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ - { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */ - { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ + { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ + { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ + { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ @@ -164,7 +167,8 @@ static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); -uint_t 
zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ +uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ +uint_t zio_taskq_batch_tpq; /* threads per taskq */ boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ uint_t zio_taskq_basedc = 80; /* base duty cycle */ @@ -957,25 +961,12 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) uint_t value = ztip->zti_value; uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; - uint_t flags = 0; + uint_t cpus, flags = TASKQ_DYNAMIC; boolean_t batch = B_FALSE; - if (mode == ZTI_MODE_NULL) { - tqs->stqs_count = 0; - tqs->stqs_taskq = NULL; - return; - } - - ASSERT3U(count, >, 0); - - tqs->stqs_count = count; - tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); - switch (mode) { case ZTI_MODE_FIXED: - ASSERT3U(value, >=, 1); - value = MAX(value, 1); - flags |= TASKQ_DYNAMIC; + ASSERT3U(value, >, 0); break; case ZTI_MODE_BATCH: @@ -984,6 +975,48 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) value = MIN(zio_taskq_batch_pct, 100); break; + case ZTI_MODE_SCALE: + flags |= TASKQ_THREADS_CPU_PCT; + /* + * We want more taskqs to reduce lock contention, but we want + * less for better request ordering and CPU utilization. + */ + cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); + if (zio_taskq_batch_tpq > 0) { + count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / + zio_taskq_batch_tpq); + } else { + /* + * Prefer 6 threads per taskq, but no more taskqs + * than threads in them on large systems. For 80%: + * + * taskq taskq total + * cpus taskqs percent threads threads + * ------- ------- ------- ------- ------- + * 1 1 80% 1 1 + * 2 1 80% 1 1 + * 4 1 80% 3 3 + * 8 2 40% 3 6 + * 16 3 27% 4 12 + * 32 5 16% 5 25 + * 64 7 11% 7 49 + * 128 10 8% 10 100 + * 256 14 6% 15 210 + */ + count = 1 + cpus / 6; + while (count * count > cpus) + count--; + } + /* Limit each taskq within 100% to not trigger assertion. 
*/ + count = MAX(count, (zio_taskq_batch_pct + 99) / 100); + value = (zio_taskq_batch_pct + count / 2) / count; + break; + + case ZTI_MODE_NULL: + tqs->stqs_count = 0; + tqs->stqs_taskq = NULL; + return; + default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " "spa_activate()", @@ -991,12 +1024,20 @@ spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) break; } + ASSERT3U(count, >, 0); + tqs->stqs_count = count; + tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); + for (uint_t i = 0; i < count; i++) { taskq_t *tq; char name[32]; - (void) snprintf(name, sizeof (name), "%s_%s", - zio_type_name[t], zio_taskq_types[q]); + if (count > 1) + (void) snprintf(name, sizeof (name), "%s_%s_%u", + zio_type_name[t], zio_taskq_types[q], i); + else + (void) snprintf(name, sizeof (name), "%s_%s", + zio_type_name[t], zio_taskq_types[q]); if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) @@ -9863,6 +9904,9 @@ ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, "Percentage of CPUs to run an IO worker thread"); +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, + "Number of threads per IO worker taskqueue"); + ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW, "Allow importing pool with up to this number of missing top-level " "vdevs (in read-only mode)");