Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

illumos 3954, 4080 and 4081 #1772

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion include/sys/metaslab_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
*/

/*
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/

#ifndef _SYS_METASLAB_IMPL_H
Expand All @@ -45,6 +45,7 @@ struct metaslab_class {
metaslab_group_t *mc_rotor;
space_map_ops_t *mc_ops;
uint64_t mc_aliquot;
uint64_t mc_alloc_groups; /* # of allocatable groups */
uint64_t mc_alloc; /* total allocated space */
uint64_t mc_deferred; /* total deferred frees */
uint64_t mc_space; /* total space (alloc + free) */
Expand All @@ -58,6 +59,8 @@ struct metaslab_group {
uint64_t mg_aliquot;
uint64_t mg_bonus_area;
uint64_t mg_alloc_failures;
boolean_t mg_allocatable; /* can we allocate? */
uint64_t mg_free_capacity; /* percentage free */
int64_t mg_bias;
int64_t mg_activation_count;
metaslab_class_t *mg_class;
Expand Down
113 changes: 109 additions & 4 deletions module/zfs/metaslab.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,25 @@ int zfs_condense_pct = 200;
/*
* This value defines the number of allowed allocation failures per vdev.
* If a device reaches this threshold in a given txg then we consider skipping
* allocations on that device.
* allocations on that device. The value of zfs_mg_alloc_failures is computed
* in zio_init() unless it has been overridden in /etc/system.
*/
int zfs_mg_alloc_failures;
int zfs_mg_alloc_failures = 0;

/*
* The zfs_mg_noalloc_threshold defines which metaslab groups should
* be eligible for allocation. The value is defined as a percentage of
* a free space. Metaslab groups that have more free space than
* zfs_mg_noalloc_threshold are always eligible for allocations. Once
* a metaslab group's free space is less than or equal to the
* zfs_mg_noalloc_threshold the allocator will avoid allocating to that
* group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
* Once all groups in the pool reach zfs_mg_noalloc_threshold then all
* groups are allowed to accept allocations. Gang blocks are always
* eligible to allocate on any metaslab group. The default value of 0 means
* no metaslab group will be excluded based on this criterion.
*/
int zfs_mg_noalloc_threshold = 0;

/*
* Metaslab debugging: when set, keeps all space maps in core to verify frees.
Expand Down Expand Up @@ -222,6 +238,53 @@ metaslab_compare(const void *x1, const void *x2)
return (0);
}

/*
 * Update the allocatable flag and the metaslab group's free capacity.
 * The allocatable flag is set to true if the group's free capacity is
 * strictly greater than zfs_mg_noalloc_threshold (a percentage); at or
 * below the threshold the group is marked non-allocatable. If a metaslab
 * group transitions from allocatable to non-allocatable or vice versa
 * then the metaslab group's class is updated to reflect the transition
 * (via the mc_alloc_groups count).
 */
static void
metaslab_group_alloc_update(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_class_t *mc = mg->mg_class;
	vdev_stat_t *vs = &vd->vdev_stat;
	boolean_t was_allocatable;

	/* Only top-level vdevs own metaslab groups. */
	ASSERT(vd == vd->vdev_top);

	mutex_enter(&mg->mg_lock);
	was_allocatable = mg->mg_allocatable;

	/*
	 * Free capacity as a whole-number percentage; the "+ 1" in the
	 * divisor guards against division by zero when vs_space == 0.
	 */
	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
	    (vs->vs_space + 1);

	mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);

	/*
	 * The mc_alloc_groups maintains a count of the number of
	 * groups in this metaslab class that are still above the
	 * zfs_mg_noalloc_threshold. This is used by the allocating
	 * threads to determine if they should avoid allocations to
	 * a given group. The allocator will avoid allocations to a group
	 * if that group has reached or is below the zfs_mg_noalloc_threshold
	 * and there are still other groups that are above the threshold.
	 * When a group transitions from allocatable to non-allocatable or
	 * vice versa we update the metaslab class to reflect that change.
	 * When the mc_alloc_groups value drops to 0 that means that all
	 * groups have reached the zfs_mg_noalloc_threshold making all groups
	 * eligible for allocations. This effectively means that all devices
	 * are balanced again.
	 */
	if (was_allocatable && !mg->mg_allocatable)
		mc->mc_alloc_groups--;
	else if (!was_allocatable && mg->mg_allocatable)
		mc->mc_alloc_groups++;
	mutex_exit(&mg->mg_lock);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
Expand Down Expand Up @@ -272,6 +335,7 @@ metaslab_group_activate(metaslab_group_t *mg)
return;

mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
metaslab_group_alloc_update(mg);

if ((mgprev = mc->mc_rotor) == NULL) {
mg->mg_prev = mg;
Expand Down Expand Up @@ -356,6 +420,29 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
mutex_exit(&mg->mg_lock);
}

/*
 * Decide whether the given metaslab group may accept allocations.
 * A group is bypassed only when its free capacity has fallen to or
 * below zfs_mg_noalloc_threshold AND at least one other group in the
 * class still sits above that threshold. Non-normal classes (e.g. a
 * slog) are never excluded by this check.
 */
static boolean_t
metaslab_group_allocatable(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	spa_t *spa = vd->vdev_spa;
	metaslab_class_t *mc = mg->mg_class;

	/* Plenty of free space left: always eligible. */
	if (mg->mg_free_capacity > zfs_mg_noalloc_threshold)
		return (B_TRUE);

	/* Only the normal class participates in threshold-based skipping. */
	if (mc != spa_normal_class(spa))
		return (B_TRUE);

	/*
	 * Every group in the class is at or below the threshold, so the
	 * devices are balanced again and all of them may allocate.
	 */
	return (mc->mc_alloc_groups == 0);
}

/*
* ==========================================================================
* Common allocator routines
Expand Down Expand Up @@ -1312,6 +1399,8 @@ metaslab_sync_reassess(metaslab_group_t *mg)
int64_t failures = mg->mg_alloc_failures;
int m;

metaslab_group_alloc_update(mg);

/*
* Re-evaluate all metaslabs which have lower offsets than the
* bonus area.
Expand Down Expand Up @@ -1406,6 +1495,8 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
if (msp == NULL)
return (-1ULL);

mutex_enter(&msp->ms_lock);

/*
* If we've already reached the allowable number of failed
* allocation attempts on this metaslab group then we
Expand All @@ -1422,11 +1513,10 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
"asize %llu, failures %llu", spa_name(spa),
mg->mg_vd->vdev_id, txg, mg, psize, asize,
mg->mg_alloc_failures);
mutex_exit(&msp->ms_lock);
return (-1ULL);
}

mutex_enter(&msp->ms_lock);

/*
* If this metaslab is currently condensing then pick again as
* we can't manipulate this metaslab until it's committed
Expand Down Expand Up @@ -1591,6 +1681,21 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
} else {
allocatable = vdev_allocatable(vd);
}

/*
* Determine if the selected metaslab group is eligible
* for allocations. If we're ganging or have requested
* an allocation for the smallest gang block size
* then we don't want to avoid allocating to this
* metaslab group. If we're in this condition we should
* try to allocate from any device possible so that we
* don't inadvertently return ENOSPC and suspend the pool
* even though space is still available.
*/
if (allocatable && CAN_FASTGANG(flags) &&
psize > SPA_GANGBLOCKSIZE)
allocatable = metaslab_group_allocatable(mg);

if (!allocatable)
goto next;

Expand Down
2 changes: 1 addition & 1 deletion module/zfs/zfs_ioctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -5335,7 +5335,7 @@ zfs_ioctl_init(void)
zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);

zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);
zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);

Expand Down
5 changes: 3 additions & 2 deletions module/zfs/zio.c
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,8 @@ zio_init(void)
* The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
* to fail 3 times per txg or 8 failures, whichever is greater.
*/
zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
if (zfs_mg_alloc_failures == 0)
zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);

zio_inject_init();

Expand Down Expand Up @@ -2405,7 +2406,7 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, uint64_t size,
if (error) {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
new_bp, 1, txg, NULL,
METASLAB_FASTWRITE | METASLAB_GANG_AVOID);
METASLAB_FASTWRITE);
}

if (error == 0) {
Expand Down