Skip to content

Commit

Permalink
Fix zsb->z_hold_mtx deadlock
Browse files Browse the repository at this point in the history
The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
serialize access to a znode and its SA buffer while the object is being
created or destroyed.  This kind of locking would normally reside in the
znode itself but in this case that's impossible because the znode and SA
buffer may not yet exist.  Therefore the locking is handled externally
with an array of mutexs and AVLs trees which contain per-object locks.

In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
in to the correct AVL tree and finally the per-object lock is held.  In
zfs_znode_hold_exit() the process is reversed.  The per-object lock is
released, removed from the AVL tree and destroyed if there are no waiters.

This scheme has two important properties:

1) No memory allocations are performed while holding one of the z_hold_locks.
   This ensures evict(), which can be called from direct memory reclaim, will
   never block waiting on a z_hold_locks which just happens to have hashed
   to the same index.

2) All locks used to serialize access to an object are per-object and never
   shared.  This minimizes lock contention without creating a large number
   of dedicated locks.

On the downside it does require znode_lock_t structures to be frequently
allocated and freed.  However, because these are backed by a kmem cache
and very short lived this cost is minimal.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Issue #4106
  • Loading branch information
behlendorf committed Jan 15, 2016
1 parent 0720116 commit c96c36f
Show file tree
Hide file tree
Showing 4 changed files with 218 additions and 64 deletions.
5 changes: 3 additions & 2 deletions include/sys/zfs_vfsops.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,9 @@ typedef struct zfs_sb {
uint64_t z_groupquota_obj;
uint64_t z_replay_eof; /* New end of file - replay only */
sa_attr_type_t *z_attr_table; /* SA attr mapping->id */
uint64_t z_hold_mtx_size; /* znode hold locks size */
kmutex_t *z_hold_mtx; /* znode hold locks */
uint64_t z_hold_size; /* znode hold array size */
avl_tree_t *z_hold_trees; /* znode hold trees */
kmutex_t *z_hold_locks; /* znode hold locks */
} zfs_sb_t;

#define ZFS_SUPER_MAGIC 0x2fc12fc1
Expand Down
25 changes: 10 additions & 15 deletions include/sys/zfs_znode.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,12 @@ typedef struct znode {
struct inode z_inode; /* generic vfs inode */
} znode_t;

typedef struct znode_hold {
uint64_t zh_obj; /* object id */
kmutex_t zh_lock; /* lock serializing object access */
avl_node_t zh_node; /* avl tree linkage */
refcount_t zh_refcount; /* active consumer reference count */
} znode_hold_t;

/*
* Range locking rules
Expand Down Expand Up @@ -273,24 +279,12 @@ typedef struct znode {
/*
* Macros for dealing with dmu_buf_hold
*/
#define ZFS_OBJ_MTX_SZ 64
#define ZFS_OBJ_MTX_MAX (1024 * 1024)
#define ZFS_OBJ_MTX_SZ 64
#define ZFS_OBJ_MTX_MAX (1024 * 1024)
#define ZFS_OBJ_HASH(zsb, obj) ((obj) & ((zsb->z_hold_size) - 1))

extern unsigned int zfs_object_mutex_size;

#define ZFS_OBJ_HASH(zsb, obj_num) \
((obj_num) & ((zsb->z_hold_mtx_size) - 1))
#define ZFS_OBJ_MUTEX(zsb, obj_num) \
(&(zsb)->z_hold_mtx[ZFS_OBJ_HASH(zsb, obj_num)])
#define ZFS_OBJ_HOLD_ENTER(zsb, obj_num) \
mutex_enter(ZFS_OBJ_MUTEX((zsb), (obj_num)))
#define ZFS_OBJ_HOLD_TRYENTER(zsb, obj_num) \
mutex_tryenter(ZFS_OBJ_MUTEX((zsb), (obj_num)))
#define ZFS_OBJ_HOLD_EXIT(zsb, obj_num) \
mutex_exit(ZFS_OBJ_MUTEX((zsb), (obj_num)))
#define ZFS_OBJ_HOLD_OWNED(zsb, obj_num) \
mutex_owned(ZFS_OBJ_MUTEX((zsb), (obj_num)))

/* Encode ZFS stored time values from a struct timespec */
#define ZFS_TIME_ENCODE(tp, stmp) \
{ \
Expand Down Expand Up @@ -326,6 +320,7 @@ extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *);
extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t);
extern void zfs_znode_init(void);
extern void zfs_znode_fini(void);
extern int zfs_znode_hold_compare(const void *, const void *);
extern int zfs_zget(zfs_sb_t *, uint64_t, znode_t **);
extern int zfs_rezget(znode_t *);
extern void zfs_zinactive(znode_t *);
Expand Down
32 changes: 18 additions & 14 deletions module/zfs/zfs_vfsops.c
Original file line number Diff line number Diff line change
Expand Up @@ -663,7 +663,7 @@ zfs_sb_create(const char *osname, zfs_mntopts_t *zmo, zfs_sb_t **zsbp)
objset_t *os;
zfs_sb_t *zsb;
uint64_t zval;
int i, error;
int i, size, error;
uint64_t sa_obj;

zsb = kmem_zalloc(sizeof (zfs_sb_t), KM_SLEEP);
Expand All @@ -685,8 +685,7 @@ zfs_sb_create(const char *osname, zfs_mntopts_t *zmo, zfs_sb_t **zsbp)

/*
* Initialize the zfs-specific filesystem structure.
* Should probably make this a kmem cache, shuffle fields,
* and just bzero up to z_hold_mtx[].
* Should probably make this a kmem cache, shuffle fields.
*/
zsb->z_sb = NULL;
zsb->z_parent = zsb;
Expand Down Expand Up @@ -795,12 +794,15 @@ zfs_sb_create(const char *osname, zfs_mntopts_t *zmo, zfs_sb_t **zsbp)
rw_init(&zsb->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zsb->z_fuid_lock, NULL, RW_DEFAULT, NULL);

zsb->z_hold_mtx_size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1),
ZFS_OBJ_MTX_MAX);
zsb->z_hold_mtx = vmem_zalloc(sizeof (kmutex_t) * zsb->z_hold_mtx_size,
KM_SLEEP);
for (i = 0; i != zsb->z_hold_mtx_size; i++)
mutex_init(&zsb->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX);
zsb->z_hold_size = size;
zsb->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, KM_SLEEP);
zsb->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP);
for (i = 0; i != size; i++) {
avl_create(&zsb->z_hold_trees[i], zfs_znode_hold_compare,
sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node));
mutex_init(&zsb->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL);
}

*zsbp = zsb;
return (0);
Expand All @@ -809,7 +811,6 @@ zfs_sb_create(const char *osname, zfs_mntopts_t *zmo, zfs_sb_t **zsbp)
dmu_objset_disown(os, zsb);
*zsbp = NULL;

vmem_free(zsb->z_hold_mtx, sizeof (kmutex_t) * zsb->z_hold_mtx_size);
kmem_free(zsb, sizeof (zfs_sb_t));
return (error);
}
Expand Down Expand Up @@ -901,7 +902,7 @@ EXPORT_SYMBOL(zfs_sb_setup);
void
zfs_sb_free(zfs_sb_t *zsb)
{
int i;
int i, size = zsb->z_hold_size;

zfs_fuid_destroy(zsb);

Expand All @@ -911,9 +912,12 @@ zfs_sb_free(zfs_sb_t *zsb)
rrm_destroy(&zsb->z_teardown_lock);
rw_destroy(&zsb->z_teardown_inactive_lock);
rw_destroy(&zsb->z_fuid_lock);
for (i = 0; i != zsb->z_hold_mtx_size; i++)
mutex_destroy(&zsb->z_hold_mtx[i]);
vmem_free(zsb->z_hold_mtx, sizeof (kmutex_t) * zsb->z_hold_mtx_size);
for (i = 0; i != size; i++) {
avl_destroy(&zsb->z_hold_trees[i]);
mutex_destroy(&zsb->z_hold_locks[i]);
}
vmem_free(zsb->z_hold_trees, sizeof (avl_tree_t) * size);
vmem_free(zsb->z_hold_locks, sizeof (kmutex_t) * size);
zfs_mntopts_free(zsb->z_mntopts);
kmem_free(zsb, sizeof (zfs_sb_t));
}
Expand Down
Loading

0 comments on commit c96c36f

Please sign in to comment.