Commit e2d46f2

dhowells authored and brauner committed
netfs: Change the read result collector to only use one work item

Change the way netfslib collects read results to do all the collection for a particular read request using a single work item that walks along the subrequest queue as subrequests make progress or complete, unlocking folios progressively rather than doing the unlock in parallel as parallel requests come in.

The code is remodelled to be more like the write-side code, though only using a single stream. This makes it more directly comparable and thus easier to duplicate fixes between the two sides.

This has a number of advantages:

 (1) It's simpler. There doesn't need to be a complex donation mechanism to handle mismatches between the size and alignment of subrequests and folios. The collector unlocks folios as the subrequests covering each complete.

 (2) It should cause less scheduler overhead as there's a single work item in play unlocking pages in parallel when a read gets split up into a lot of subrequests instead of one per subrequest.

     Whilst the parallelism is nice in theory, in practice, the vast majority of loads are sequential reads of the whole file, so committing a bunch of threads to unlocking folios out of order doesn't help in those cases.

 (3) It should make it easier to implement content decryption. A folio cannot be decrypted until all the requests that contribute to it have completed - and, again, most loads are sequential and so, most of the time, we want to begin decryption sequentially (though it's great if the decryption can happen in parallel).

There is a disadvantage in that we're losing the ability to decrypt and unlock things on an as-things-arrive basis which may affect some applications.

Signed-off-by: David Howells <dhowells@redhat.com>
Link: https://lore.kernel.org/r/20241216204124.3752367-28-dhowells@redhat.com
cc: Jeff Layton <jlayton@kernel.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
1 parent eddf51f commit e2d46f2

19 files changed, +819 -763 lines
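
Before the per-file diffs, here is a minimal user-space sketch of the collection model the commit message describes: subrequests are queued on a single stream and one collector work item walks the queue strictly from the front, so progress (folio unlocking, in netfslib) is reported in file order even when subrequests complete out of order. Everything here - the names, the pthread plumbing, the printf - is an illustrative assumption, not netfslib code.

/* Illustrative sketch only - not netfslib code: one collector walks an
 * ordered queue of subrequests and retires them strictly from the front,
 * the way the commit describes folios being unlocked progressively. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct subreq {
	size_t start, len;	/* region of the read this subrequest covers */
	bool done;		/* set by whoever performs the I/O */
	struct subreq *next;
};

struct stream {
	pthread_mutex_t lock;
	pthread_cond_t progress;	/* signalled when a subrequest completes */
	struct subreq *front, *back;	/* front = oldest uncollected subrequest */
	bool all_queued;		/* no more subrequests will be added */
	size_t collected_to;
};

/* Issue side: append to the tail; the collector only consumes the front. */
static void queue_subreq(struct stream *s, struct subreq *sr)
{
	pthread_mutex_lock(&s->lock);
	sr->next = NULL;
	if (s->back)
		s->back->next = sr;
	else
		s->front = sr;
	s->back = sr;
	pthread_mutex_unlock(&s->lock);
}

/* Completion side: mark done and kick the single collector. */
static void complete_subreq(struct stream *s, struct subreq *sr)
{
	pthread_mutex_lock(&s->lock);
	sr->done = true;
	pthread_cond_signal(&s->progress);
	pthread_mutex_unlock(&s->lock);
}

/* Issuer signals that no further subrequests will be queued. */
static void mark_all_queued(struct stream *s)
{
	pthread_mutex_lock(&s->lock);
	s->all_queued = true;
	pthread_cond_signal(&s->progress);
	pthread_mutex_unlock(&s->lock);
}

/* The one "work item": retire only completed subrequests at the front, so
 * progress (folio unlocking, in netfslib) is reported sequentially. */
static void *collector(void *arg)
{
	struct stream *s = arg;

	pthread_mutex_lock(&s->lock);
	for (;;) {
		while (s->front && s->front->done) {
			struct subreq *sr = s->front;

			s->front = sr->next;
			if (!s->front)
				s->back = NULL;
			s->collected_to = sr->start + sr->len;
			printf("collected up to %zu\n", s->collected_to);
			free(sr);
		}
		if (s->all_queued && !s->front)
			break;
		pthread_cond_wait(&s->progress, &s->lock);
	}
	pthread_mutex_unlock(&s->lock);
	return NULL;
}

int main(void)
{
	struct stream s = { .all_queued = false };
	struct subreq *a = calloc(1, sizeof(*a)), *b = calloc(1, sizeof(*b));
	pthread_t tid;

	pthread_mutex_init(&s.lock, NULL);
	pthread_cond_init(&s.progress, NULL);
	a->start = 0;    a->len = 4096;
	b->start = 4096; b->len = 4096;
	queue_subreq(&s, a);
	queue_subreq(&s, b);
	mark_all_queued(&s);
	pthread_create(&tid, NULL, collector, &s);
	complete_subreq(&s, b);	/* completes out of order ...            */
	complete_subreq(&s, a);	/* ... but retirement is still in order. */
	pthread_join(tid, NULL);
	return 0;
}

Built with cc -pthread, it prints "collected up to 4096" and then "collected up to 8192" even though the second subrequest finishes first - the ordering property the single collector provides.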

fs/9p/vfs_addr.c (+1 -2)

@@ -81,8 +81,7 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
 		__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
 	if (pos + total >= i_size_read(rreq->inode))
 		__set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags);
-
-	if (!err) {
+	if (!err && total) {
 		subreq->transferred += total;
 		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
 	}

fs/afs/dir.c (+6 -2)

@@ -325,8 +325,10 @@ static ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file)
 	 * haven't read it yet.
 	 */
 	if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
-	    test_bit(AFS_VNODE_DIR_READ, &dvnode->flags))
+	    test_bit(AFS_VNODE_DIR_READ, &dvnode->flags)) {
+		ret = i_size;
 		goto valid;
+	}
 
 	up_read(&dvnode->validate_lock);
 	if (down_write_killable(&dvnode->validate_lock) < 0)
@@ -346,11 +348,13 @@ static ssize_t afs_read_dir(struct afs_vnode *dvnode, struct file *file)
 
 		set_bit(AFS_VNODE_DIR_VALID, &dvnode->flags);
 		set_bit(AFS_VNODE_DIR_READ, &dvnode->flags);
+	} else {
+		ret = i_size;
 	}
 
 	downgrade_write(&dvnode->validate_lock);
 valid:
-	return i_size;
+	return ret;
 
 error_unlock:
 	up_write(&dvnode->validate_lock);

fs/ceph/addr.c (+7 -2)

@@ -223,10 +223,13 @@ static void finish_netfs_read(struct ceph_osd_request *req)
 	     subreq->len, i_size_read(req->r_inode));
 
 	/* no object means success but no data */
-	if (err == -ENOENT)
+	if (err == -ENOENT) {
+		__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+		__set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
 		err = 0;
-	else if (err == -EBLOCKLISTED)
+	} else if (err == -EBLOCKLISTED) {
 		fsc->blocklisted = true;
+	}
 
 	if (err >= 0) {
 		if (sparse && err > 0)
@@ -242,6 +245,8 @@ static void finish_netfs_read(struct ceph_osd_request *req)
 			if (err > subreq->len)
 				err = subreq->len;
 		}
+		if (err > 0)
+			__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
 	}
 
 	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {

fs/netfs/buffered_read.c (+79 -81)

@@ -121,12 +121,6 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
 
 	subreq->io_iter = rreq->buffer.iter;
 
-	if (iov_iter_is_folioq(&subreq->io_iter)) {
-		subreq->curr_folioq = (struct folio_queue *)subreq->io_iter.folioq;
-		subreq->curr_folioq_slot = subreq->io_iter.folioq_slot;
-		subreq->curr_folio_order = subreq->curr_folioq->orders[subreq->curr_folioq_slot];
-	}
-
 	iov_iter_truncate(&subreq->io_iter, subreq->len);
 	rolling_buffer_advance(&rreq->buffer, subreq->len);
 	return subreq->len;
@@ -147,19 +141,6 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_request *rr
 
 }
 
-void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async)
-{
-	struct netfs_io_subrequest *subreq = priv;
-
-	if (transferred_or_error > 0) {
-		subreq->transferred += transferred_or_error;
-		subreq->error = 0;
-	} else {
-		subreq->error = transferred_or_error;
-	}
-	schedule_work(&subreq->work);
-}
-
 /*
  * Issue a read against the cache.
  * - Eats the caller's ref on subreq.
@@ -174,6 +155,47 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq,
 			   netfs_cache_read_terminated, subreq);
 }
 
+static void netfs_issue_read(struct netfs_io_request *rreq,
+			     struct netfs_io_subrequest *subreq)
+{
+	struct netfs_io_stream *stream = &rreq->io_streams[0];
+
+	__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+
+	/* We add to the end of the list whilst the collector may be walking
+	 * the list.  The collector only goes nextwards and uses the lock to
+	 * remove entries off of the front.
+	 */
+	spin_lock(&rreq->lock);
+	list_add_tail(&subreq->rreq_link, &stream->subrequests);
+	if (list_is_first(&subreq->rreq_link, &stream->subrequests)) {
+		stream->front = subreq;
+		if (!stream->active) {
+			stream->collected_to = stream->front->start;
+			/* Store list pointers before active flag */
+			smp_store_release(&stream->active, true);
+		}
+	}
+
+	spin_unlock(&rreq->lock);
+
+	switch (subreq->source) {
+	case NETFS_DOWNLOAD_FROM_SERVER:
+		rreq->netfs_ops->issue_read(subreq);
+		break;
+	case NETFS_READ_FROM_CACHE:
+		netfs_read_cache_to_pagecache(rreq, subreq);
+		break;
+	default:
+		__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+		subreq->error = 0;
+		iov_iter_zero(subreq->len, &subreq->io_iter);
+		subreq->transferred = subreq->len;
+		netfs_read_subreq_terminated(subreq);
+		break;
+	}
+}
+
 /*
  * Perform a read to the pagecache from a series of sources of different types,
  * slicing up the region to be read according to available cache blocks and
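
A note on the netfs_issue_read() hunk just above: subrequests are appended under rreq->lock while the collector walks the list from the front, and the stream is published by writing stream->front and stream->collected_to before flipping stream->active with smp_store_release(). As a hedged, user-space illustration of that release-publication pattern (C11 atomics; the reader side shown here is an assumption, not taken from this diff), it might look like:

/* Illustrative sketch, not kernel code: publish fields with a release store
 * so a reader that sees the flag with an acquire load also sees the fields. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct demo_stream {
	size_t collected_to;	/* written before publication */
	void *front;		/* written before publication */
	atomic_bool active;	/* publication flag */
};

/* Writer: fill in the fields, then publish (cf. smp_store_release above). */
static void demo_publish(struct demo_stream *s, void *front, size_t start)
{
	s->front = front;
	s->collected_to = start;
	atomic_store_explicit(&s->active, true, memory_order_release);
}

/* Reader: only look at the fields after the acquire load has seen "active". */
static bool demo_peek(struct demo_stream *s, size_t *collected_to)
{
	if (!atomic_load_explicit(&s->active, memory_order_acquire))
		return false;
	*collected_to = s->collected_to;
	return true;
}

The release store pairs with the acquire load, so a reader that observes active == true is guaranteed to also observe the fields written before publication.
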
@@ -186,8 +208,6 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
 	ssize_t size = rreq->len;
 	int ret = 0;
 
-	atomic_inc(&rreq->nr_outstanding);
-
 	do {
 		struct netfs_io_subrequest *subreq;
 		enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
@@ -202,14 +222,6 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
 		subreq->start = start;
 		subreq->len = size;
 
-		atomic_inc(&rreq->nr_outstanding);
-		spin_lock(&rreq->lock);
-		list_add_tail(&subreq->rreq_link, &rreq->subrequests);
-		subreq->prev_donated = rreq->prev_donated;
-		rreq->prev_donated = 0;
-		trace_netfs_sreq(subreq, netfs_sreq_trace_added);
-		spin_unlock(&rreq->lock);
-
 		source = netfs_cache_prepare_read(rreq, subreq, rreq->i_size);
 		subreq->source = source;
 		if (source == NETFS_DOWNLOAD_FROM_SERVER) {
@@ -237,85 +249,69 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
 			netfs_stat(&netfs_n_rh_download);
 			if (rreq->netfs_ops->prepare_read) {
 				ret = rreq->netfs_ops->prepare_read(subreq);
-				if (ret < 0)
-					goto prep_failed;
+				if (ret < 0) {
+					subreq->error = ret;
+					/* Not queued - release both refs. */
+					netfs_put_subrequest(subreq, false,
+							     netfs_sreq_trace_put_cancel);
+					netfs_put_subrequest(subreq, false,
+							     netfs_sreq_trace_put_cancel);
+					break;
+				}
 				trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
 			}
-
-			slice = netfs_prepare_read_iterator(subreq);
-			if (slice < 0)
-				goto prep_iter_failed;
-
-			rreq->netfs_ops->issue_read(subreq);
-			goto done;
+			goto issue;
 		}
 
 	fill_with_zeroes:
 		if (source == NETFS_FILL_WITH_ZEROES) {
 			subreq->source = NETFS_FILL_WITH_ZEROES;
 			trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
 			netfs_stat(&netfs_n_rh_zero);
-			slice = netfs_prepare_read_iterator(subreq);
-			if (slice < 0)
-				goto prep_iter_failed;
-			__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
-			subreq->error = 0;
-			netfs_read_subreq_terminated(subreq);
-			goto done;
+			goto issue;
 		}
 
 		if (source == NETFS_READ_FROM_CACHE) {
 			trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
-			slice = netfs_prepare_read_iterator(subreq);
-			if (slice < 0)
-				goto prep_iter_failed;
-			netfs_read_cache_to_pagecache(rreq, subreq);
-			goto done;
+			goto issue;
 		}
 
 		pr_err("Unexpected read source %u\n", source);
 		WARN_ON_ONCE(1);
 		break;
 
-	prep_iter_failed:
-		ret = slice;
-	prep_failed:
-		subreq->error = ret;
-		atomic_dec(&rreq->nr_outstanding);
-		netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
-		break;
-
-	done:
+	issue:
+		slice = netfs_prepare_read_iterator(subreq);
+		if (slice < 0) {
+			ret = slice;
+			subreq->error = ret;
+			trace_netfs_sreq(subreq, netfs_sreq_trace_cancel);
+			/* Not queued - release both refs. */
+			netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+			netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+			break;
+		}
 		size -= slice;
 		start += slice;
+		if (size <= 0) {
+			smp_wmb(); /* Write lists before ALL_QUEUED. */
+			set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
+		}
+
+		netfs_issue_read(rreq, subreq);
 		cond_resched();
 	} while (size > 0);
 
-	if (atomic_dec_and_test(&rreq->nr_outstanding))
-		netfs_rreq_terminated(rreq);
+	if (unlikely(size > 0)) {
+		smp_wmb(); /* Write lists before ALL_QUEUED. */
+		set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
+		netfs_wake_read_collector(rreq);
+	}
 
 	/* Defer error return as we may need to wait for outstanding I/O. */
 	cmpxchg(&rreq->error, 0, ret);
 }
 
-/*
- * Wait for the read operation to complete, successfully or otherwise.
- */
-static int netfs_wait_for_read(struct netfs_io_request *rreq)
-{
-	int ret;
-
-	trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
-	wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE);
-	ret = rreq->error;
-	if (ret == 0 && rreq->submitted < rreq->len) {
-		trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
-		ret = -EIO;
-	}
-
-	return ret;
-}
-
 /**
  * netfs_readahead - Helper to manage a read request
  * @ractl: The description of the readahead request
@@ -344,6 +340,8 @@ void netfs_readahead(struct readahead_control *ractl)
 	if (IS_ERR(rreq))
 		return;
 
+	__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags);
+
 	ret = netfs_begin_cache_read(rreq, ictx);
 	if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
 		goto cleanup_free;
@@ -460,7 +458,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio)
 	folio_put(sink);
 
 	ret = netfs_wait_for_read(rreq);
-	if (ret == 0) {
+	if (ret >= 0) {
 		flush_dcache_folio(folio);
 		folio_mark_uptodate(folio);
 	}
@@ -748,7 +746,7 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
 	netfs_read_to_pagecache(rreq);
 	ret = netfs_wait_for_read(rreq);
 	netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
-	return ret;
+	return ret < 0 ? ret : 0;
 
 error_put:
 	netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
