Skip to content

Commit 7029acd

Browse files
committed
io_uring/rsrc: get rid of per-ring io_rsrc_node list
Work in progress, but get rid of the per-ring serialization of resource nodes, like registered buffers and files. Main issue here is that one node can otherwise hold up a bunch of other nodes from getting freed, which is especially a problem for file resource nodes and networked workloads where some descriptors may not see activity in a long time. As an example, instantiate an io_uring ring fd and create a sparse registered file table. Even 2 will do. Then create a socket and register it as fixed file 0, F0. The number of open files in the app is now 5, with 0/1/2 being the usual stdin/out/err, 3 being the ring fd, and 4 being the socket. Register this socket (eg "the listener") in slot 0 of the registered file table. Now add an operation on the socket that uses slot 0. Finally, loop N times, where each loop creates a new socket, registers said socket as a file, then unregisters the socket, and finally closes the socket. This is roughly similar to what a basic accept loop would look like. At the end of this loop, it's not unreasonable to expect that there would still be 5 open files. Each socket created and registered in the loop is also unregistered and closed. But since the listener socket registered first still has references to its resource node due to still being active, each subsequent socket unregistration is stuck behind it for reclaim. Hence 5 + N files are still open at that point, where N is awaiting the final put held up by the listener socket. Rewrite the io_rsrc_node handling to NOT rely on serialization. Struct io_kiocb now gets explicit resource nodes assigned, with each holding a reference to the parent node. A parent node is either of type FILE or BUFFER, which are the two types of nodes that exist. A request can have two nodes assigned, if it's using both registered files and buffers. Since request issue and task_work completion is both under the ring private lock, no atomics are needed to handle these references. It's a simple unlocked inc/dec. 
As before, the registered buffer or file table each hold a reference as well to the registered nodes. Final put of the node will remove the node and free the underlying resource, eg unmap the buffer or put the file. Outside of removing the stall in resource reclaim described above, it has the following advantages: 1) It's a lot simpler than the previous scheme, and easier to follow. No need for specific quiesce handling anymore. 2) There are no resource node allocations in the fast path, all of that happens at resource registration time. 3) The structs related to resource handling can all get simplified quite a bit, like io_rsrc_node and io_rsrc_data. io_rsrc_put can go away completely. 4) Handling of resource tags is much simpler, and doesn't require persistent storage as it can simply get assigned up front at registration time. Just copy them in one-by-one at registration time and assign to the resource node. The only real downside is that a request is now explicitly limited to pinning 2 resources, one file and one buffer, where before just assigning a resource node to a request would pin all of them. The upside is that it's easier to follow now, as an individual resource is explicitly referenced and assigned to the request. With this in place, the above mentioned example will be using exactly 5 files at the end of the loop, not N. Signed-off-by: Jens Axboe <axboe@kernel.dk>
1 parent e410ffc commit 7029acd

File tree

13 files changed

+270
-465
lines changed

13 files changed

+270
-465
lines changed

include/linux/io_uring_types.h

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ struct io_wq_work {
5656
};
5757

5858
struct io_file_table {
59-
struct io_fixed_file *files;
59+
struct io_rsrc_node **nodes;
6060
unsigned long *bitmap;
6161
unsigned int alloc_hint;
6262
};
@@ -264,7 +264,6 @@ struct io_ring_ctx {
264264
* Fixed resources fast path, should be accessed only under
265265
* uring_lock, and updated through io_uring_register(2)
266266
*/
267-
struct io_rsrc_node *rsrc_node;
268267
atomic_t cancel_seq;
269268

270269
/*
@@ -277,7 +276,7 @@ struct io_ring_ctx {
277276
struct io_wq_work_list iopoll_list;
278277

279278
struct io_file_table file_table;
280-
struct io_mapped_ubuf **user_bufs;
279+
struct io_rsrc_node **user_bufs;
281280
unsigned nr_user_files;
282281
unsigned nr_user_bufs;
283282

@@ -372,10 +371,7 @@ struct io_ring_ctx {
372371
struct io_rsrc_data *buf_data;
373372

374373
/* protected by ->uring_lock */
375-
struct list_head rsrc_ref_list;
376374
struct io_alloc_cache rsrc_node_cache;
377-
struct wait_queue_head rsrc_quiesce_wq;
378-
unsigned rsrc_quiesce;
379375

380376
u32 pers_next;
381377
struct xarray personalities;
@@ -642,7 +638,7 @@ struct io_kiocb {
642638
__poll_t apoll_events;
643639
};
644640

645-
struct io_rsrc_node *rsrc_node;
641+
struct io_rsrc_node *rsrc_nodes[2];
646642

647643
atomic_t refs;
648644
bool cancel_seq_set;

io_uring/fdinfo.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *file)
176176
}
177177
seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
178178
for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
179-
struct io_mapped_ubuf *buf = ctx->user_bufs[i];
179+
struct io_mapped_ubuf *buf = ctx->user_bufs[i]->buf;
180180

181181
seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf, buf->len);
182182
}

io_uring/filetable.c

Lines changed: 19 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,14 @@ static int io_file_bitmap_get(struct io_ring_ctx *ctx)
3838

3939
bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
4040
{
41-
table->files = kvcalloc(nr_files, sizeof(table->files[0]),
42-
GFP_KERNEL_ACCOUNT);
43-
if (unlikely(!table->files))
41+
table->nodes = kvmalloc_array(nr_files, sizeof(struct io_src_node *),
42+
GFP_KERNEL_ACCOUNT | __GFP_ZERO);
43+
if (unlikely(!table->nodes))
4444
return false;
4545

4646
table->bitmap = bitmap_zalloc(nr_files, GFP_KERNEL_ACCOUNT);
4747
if (unlikely(!table->bitmap)) {
48-
kvfree(table->files);
48+
kvfree(table->nodes);
4949
return false;
5050
}
5151

@@ -54,18 +54,17 @@ bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files)
5454

5555
void io_free_file_tables(struct io_file_table *table)
5656
{
57-
kvfree(table->files);
57+
kvfree(table->nodes);
5858
bitmap_free(table->bitmap);
59-
table->files = NULL;
59+
table->nodes = NULL;
6060
table->bitmap = NULL;
6161
}
6262

6363
static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
6464
u32 slot_index)
6565
__must_hold(&req->ctx->uring_lock)
6666
{
67-
struct io_fixed_file *file_slot;
68-
int ret;
67+
struct io_rsrc_node *node;
6968

7069
if (io_is_uring_fops(file))
7170
return -EBADF;
@@ -74,22 +73,18 @@ static int io_install_fixed_file(struct io_ring_ctx *ctx, struct file *file,
7473
if (slot_index >= ctx->nr_user_files)
7574
return -EINVAL;
7675

77-
slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
78-
file_slot = io_fixed_file_slot(&ctx->file_table, slot_index);
79-
80-
if (file_slot->file_ptr) {
81-
ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
82-
io_slot_file(file_slot));
83-
if (ret)
84-
return ret;
76+
node = io_rsrc_node_alloc(ctx, IORING_RSRC_FILE);
77+
if (!node)
78+
return -ENOMEM;
8579

86-
file_slot->file_ptr = 0;
87-
} else {
80+
slot_index = array_index_nospec(slot_index, ctx->nr_user_files);
81+
if (ctx->file_table.nodes[slot_index])
82+
io_put_rsrc_node(ctx->file_table.nodes[slot_index]);
83+
else
8884
io_file_bitmap_set(&ctx->file_table, slot_index);
89-
}
9085

91-
*io_get_tag_slot(ctx->file_data, slot_index) = 0;
92-
io_fixed_file_set(file_slot, file);
86+
ctx->file_table.nodes[slot_index] = node;
87+
io_fixed_file_set(node, file);
9388
return 0;
9489
}
9590

@@ -134,25 +129,16 @@ int io_fixed_fd_install(struct io_kiocb *req, unsigned int issue_flags,
134129

135130
int io_fixed_fd_remove(struct io_ring_ctx *ctx, unsigned int offset)
136131
{
137-
struct io_fixed_file *file_slot;
138-
int ret;
139-
140132
if (unlikely(!ctx->file_data))
141133
return -ENXIO;
142134
if (offset >= ctx->nr_user_files)
143135
return -EINVAL;
144136

145137
offset = array_index_nospec(offset, ctx->nr_user_files);
146-
file_slot = io_fixed_file_slot(&ctx->file_table, offset);
147-
if (!file_slot->file_ptr)
138+
if (!ctx->file_table.nodes[offset])
148139
return -EBADF;
149-
150-
ret = io_queue_rsrc_removal(ctx->file_data, offset,
151-
io_slot_file(file_slot));
152-
if (ret)
153-
return ret;
154-
155-
file_slot->file_ptr = 0;
140+
io_put_rsrc_node(ctx->file_table.nodes[offset]);
141+
ctx->file_table.nodes[offset] = NULL;
156142
io_file_bitmap_clear(&ctx->file_table, offset);
157143
return 0;
158144
}

io_uring/filetable.h

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -34,36 +34,35 @@ static inline void io_file_bitmap_set(struct io_file_table *table, int bit)
3434
table->alloc_hint = bit + 1;
3535
}
3636

37-
static inline struct io_fixed_file *
38-
io_fixed_file_slot(struct io_file_table *table, unsigned i)
39-
{
40-
return &table->files[i];
41-
}
42-
4337
#define FFS_NOWAIT 0x1UL
4438
#define FFS_ISREG 0x2UL
4539
#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG)
4640

47-
static inline unsigned int io_slot_flags(struct io_fixed_file *slot)
41+
static inline unsigned int io_slot_flags(struct io_rsrc_node *node)
4842
{
49-
return (slot->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT;
43+
44+
return (node->file_ptr & ~FFS_MASK) << REQ_F_SUPPORT_NOWAIT_BIT;
5045
}
5146

52-
static inline struct file *io_slot_file(struct io_fixed_file *slot)
47+
static inline struct file *io_slot_file(struct io_rsrc_node *node)
5348
{
54-
return (struct file *)(slot->file_ptr & FFS_MASK);
49+
return (struct file *)(node->file_ptr & FFS_MASK);
5550
}
5651

5752
static inline struct file *io_file_from_index(struct io_file_table *table,
5853
int index)
5954
{
60-
return io_slot_file(io_fixed_file_slot(table, index));
55+
struct io_rsrc_node *node = table->nodes[index];
56+
57+
if (node)
58+
return io_slot_file(node);
59+
return NULL;
6160
}
6261

63-
static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
62+
static inline void io_fixed_file_set(struct io_rsrc_node *node,
6463
struct file *file)
6564
{
66-
file_slot->file_ptr = (unsigned long)file |
65+
node->file_ptr = (unsigned long)file |
6766
(io_file_get_flags(file) >> REQ_F_SUPPORT_NOWAIT_BIT);
6867
}
6968

io_uring/io_uring.c

Lines changed: 12 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -333,15 +333,13 @@ static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
333333
mutex_init(&ctx->uring_lock);
334334
init_waitqueue_head(&ctx->cq_wait);
335335
init_waitqueue_head(&ctx->poll_wq);
336-
init_waitqueue_head(&ctx->rsrc_quiesce_wq);
337336
spin_lock_init(&ctx->completion_lock);
338337
spin_lock_init(&ctx->timeout_lock);
339338
INIT_WQ_LIST(&ctx->iopoll_list);
340339
INIT_LIST_HEAD(&ctx->io_buffers_comp);
341340
INIT_LIST_HEAD(&ctx->defer_list);
342341
INIT_LIST_HEAD(&ctx->timeout_list);
343342
INIT_LIST_HEAD(&ctx->ltimeout_list);
344-
INIT_LIST_HEAD(&ctx->rsrc_ref_list);
345343
init_llist_head(&ctx->work_llist);
346344
INIT_LIST_HEAD(&ctx->tctx_list);
347345
ctx->submit_state.free_list.next = NULL;
@@ -1415,7 +1413,7 @@ static void io_free_batch_list(struct io_ring_ctx *ctx,
14151413
io_clean_op(req);
14161414
}
14171415
io_put_file(req);
1418-
io_put_rsrc_node(ctx, req->rsrc_node);
1416+
io_req_put_rsrc_nodes(req);
14191417
io_put_task(req->task);
14201418

14211419
node = req->comp_list.next;
@@ -1878,19 +1876,20 @@ inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
18781876
unsigned int issue_flags)
18791877
{
18801878
struct io_ring_ctx *ctx = req->ctx;
1881-
struct io_fixed_file *slot;
1879+
struct io_rsrc_node *node;
18821880
struct file *file = NULL;
18831881

18841882
io_ring_submit_lock(ctx, issue_flags);
18851883

18861884
if (unlikely((unsigned int)fd >= ctx->nr_user_files))
18871885
goto out;
18881886
fd = array_index_nospec(fd, ctx->nr_user_files);
1889-
slot = io_fixed_file_slot(&ctx->file_table, fd);
1890-
if (!req->rsrc_node)
1891-
__io_req_set_rsrc_node(req, ctx);
1892-
req->flags |= io_slot_flags(slot);
1893-
file = io_slot_file(slot);
1887+
node = ctx->file_table.nodes[fd];
1888+
if (node) {
1889+
io_req_assign_rsrc_node(req, node);
1890+
req->flags |= io_slot_flags(node);
1891+
file = io_slot_file(node);
1892+
}
18941893
out:
18951894
io_ring_submit_unlock(ctx, issue_flags);
18961895
return file;
@@ -2036,7 +2035,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
20362035
req->flags = (__force io_req_flags_t) sqe_flags;
20372036
req->cqe.user_data = READ_ONCE(sqe->user_data);
20382037
req->file = NULL;
2039-
req->rsrc_node = NULL;
2038+
req->rsrc_nodes[IORING_RSRC_FILE] = NULL;
2039+
req->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
20402040
req->task = current;
20412041
req->cancel_seq_set = false;
20422042

@@ -2718,15 +2718,10 @@ static void io_req_caches_free(struct io_ring_ctx *ctx)
27182718
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
27192719
{
27202720
io_sq_thread_finish(ctx);
2721-
/* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */
2722-
if (WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)))
2723-
return;
27242721

27252722
mutex_lock(&ctx->uring_lock);
2726-
if (ctx->buf_data)
2727-
__io_sqe_buffers_unregister(ctx);
2728-
if (ctx->file_data)
2729-
__io_sqe_files_unregister(ctx);
2723+
io_sqe_buffers_unregister(ctx);
2724+
io_sqe_files_unregister(ctx);
27302725
io_cqring_overflow_kill(ctx);
27312726
io_eventfd_unregister(ctx);
27322727
io_alloc_cache_free(&ctx->apoll_cache, kfree);
@@ -2743,11 +2738,6 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
27432738
if (ctx->submitter_task)
27442739
put_task_struct(ctx->submitter_task);
27452740

2746-
/* there are no registered resources left, nobody uses it */
2747-
if (ctx->rsrc_node)
2748-
io_rsrc_node_destroy(ctx, ctx->rsrc_node);
2749-
2750-
WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list));
27512741
WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list));
27522742

27532743
io_alloc_cache_free(&ctx->rsrc_node_cache, kfree);
@@ -3729,10 +3719,6 @@ static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
37293719
if (ret)
37303720
goto err;
37313721

3732-
ret = io_rsrc_init(ctx);
3733-
if (ret)
3734-
goto err;
3735-
37363722
p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
37373723
IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
37383724
IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |

io_uring/net.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1342,24 +1342,25 @@ static int io_send_zc_import(struct io_kiocb *req, unsigned int issue_flags)
13421342

13431343
if (sr->flags & IORING_RECVSEND_FIXED_BUF) {
13441344
struct io_ring_ctx *ctx = req->ctx;
1345-
struct io_mapped_ubuf *imu;
1345+
struct io_rsrc_node *node;
13461346
int idx;
13471347

13481348
ret = -EFAULT;
13491349
io_ring_submit_lock(ctx, issue_flags);
13501350
if (sr->buf_index < ctx->nr_user_bufs) {
13511351
idx = array_index_nospec(sr->buf_index, ctx->nr_user_bufs);
1352-
imu = READ_ONCE(ctx->user_bufs[idx]);
1353-
io_req_set_rsrc_node(sr->notif, ctx);
1352+
node = ctx->user_bufs[idx];
1353+
io_req_assign_rsrc_node(sr->notif, node);
13541354
ret = 0;
13551355
}
13561356
io_ring_submit_unlock(ctx, issue_flags);
13571357

13581358
if (unlikely(ret))
13591359
return ret;
13601360

1361-
ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter, imu,
1362-
(u64)(uintptr_t)sr->buf, sr->len);
1361+
ret = io_import_fixed(ITER_SOURCE, &kmsg->msg.msg_iter,
1362+
node->buf, (u64)(uintptr_t)sr->buf,
1363+
sr->len);
13631364
if (unlikely(ret))
13641365
return ret;
13651366
kmsg->msg.sg_from_iter = io_sg_from_iter;

io_uring/nop.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,15 +61,15 @@ int io_nop(struct io_kiocb *req, unsigned int issue_flags)
6161
}
6262
if (nop->flags & IORING_NOP_FIXED_BUFFER) {
6363
struct io_ring_ctx *ctx = req->ctx;
64-
struct io_mapped_ubuf *imu;
64+
struct io_rsrc_node *node;
6565
int idx;
6666

6767
ret = -EFAULT;
6868
io_ring_submit_lock(ctx, issue_flags);
6969
if (nop->buffer < ctx->nr_user_bufs) {
7070
idx = array_index_nospec(nop->buffer, ctx->nr_user_bufs);
71-
imu = READ_ONCE(ctx->user_bufs[idx]);
72-
io_req_set_rsrc_node(req, ctx);
71+
node = READ_ONCE(ctx->user_bufs[idx]);
72+
io_req_assign_rsrc_node(req, node);
7373
ret = 0;
7474
}
7575
io_ring_submit_unlock(ctx, issue_flags);

io_uring/notif.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,8 @@ struct io_kiocb *io_alloc_notif(struct io_ring_ctx *ctx)
117117
notif->file = NULL;
118118
notif->task = current;
119119
io_get_task_refs(1);
120-
notif->rsrc_node = NULL;
120+
notif->rsrc_nodes[IORING_RSRC_FILE] = NULL;
121+
notif->rsrc_nodes[IORING_RSRC_BUFFER] = NULL;
121122

122123
nd = io_notif_to_data(notif);
123124
nd->zc_report = false;

0 commit comments

Comments
 (0)