Skip to content

Commit 0a8068a

Browse files
fdmanana authored and kdave committed
btrfs: make ranged full fsyncs more efficient
Commit 0c713cb ("Btrfs: fix race between ranged fsync and writeback of adjacent ranges") fixed a bug where we could end up with file extent items in a log tree that represent file ranges that overlap due to a race between the hole detection of a ranged full fsync and writeback for a different file range. The problem was solved by forcing any ranged full fsync to become a non-ranged full fsync - setting the range start to 0 and the end offset to LLONG_MAX. This was a simple solution because the code that detected and marked holes was very complex, it used to be done at copy_items() and implied several searches on the fs/subvolume tree. The drawback of that solution was that we started to flush delalloc for the entire file and wait for all the ordered extents to complete for ranged full fsyncs (including ordered extents covering ranges completely outside the given range). Fortunately ranged full fsyncs are not the most common case (hopefully for most workloads). However a later fix for detecting and marking holes was made by commit 0e56315 ("Btrfs: fix missing hole after hole punching and fsync when using NO_HOLES") and it simplified a lot the detection of holes, and now copy_items() no longer does it and we do it in a much more simple way at btrfs_log_holes(). This makes it now possible to simply make the code that detects holes to operate only on the initial range and no longer need to operate on the whole file, while also avoiding the need to flush delalloc for the entire file and wait for ordered extents that cover ranges that don't overlap the given range. 
Another special care is that we must skip file extent items that fall entirely outside the fsync range when copying inode items from the fs/subvolume tree into the log tree - this is to avoid races with ordered extent completion for extents falling outside the fsync range, which could cause us to end up with file extent items in the log tree that have overlapping ranges - for example if the fsync range is [1Mb, 2Mb], when we copy inode items we could copy an extent item for the range [0, 512K], then release the search path and before moving to the next leaf, an ordered extent for a range of [256Kb, 512Kb] completes - this would cause us to copy the new extent item for range [256Kb, 512Kb] into the log tree after we have copied one for the range [0, 512Kb] - the extents overlap, resulting in a corruption. So this change just does these steps: 1) When the NO_HOLES feature is enabled it leaves the initial range intact - no longer sets it to [0, LLONG_MAX] when the full sync bit is set in the inode. If NO_HOLES is not enabled, always set the range to a full one, just like before this change, to avoid missing file extent items representing holes after replaying the log (for both full and fast fsyncs); 2) Make the hole detection code to operate only on the fsync range; 3) Make the code that copies items from the fs/subvolume tree to skip copying file extent items that cover a range completely outside the range of the fsync. Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
1 parent da44700 commit 0a8068a

File tree

2 files changed

+79
-27
lines changed

2 files changed

+79
-27
lines changed

fs/btrfs/file.c

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2102,19 +2102,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
21022102

21032103
atomic_inc(&root->log_batch);
21042104

2105-
/*
2106-
* If the inode needs a full sync, make sure we use a full range to
2107-
* avoid log tree corruption, due to hole detection racing with ordered
2108-
* extent completion for adjacent ranges, and assertion failures during
2109-
* hole detection. Do this while holding the inode lock, to avoid races
2110-
* with other tasks.
2111-
*/
2112-
if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2113-
&BTRFS_I(inode)->runtime_flags)) {
2114-
start = 0;
2115-
end = LLONG_MAX;
2116-
}
2117-
21182105
/*
21192106
* Before we acquired the inode's lock, someone may have dirtied more
21202107
* pages in the target range. We need to make sure that writeback for

fs/btrfs/tree-log.c

Lines changed: 79 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@ enum {
9696
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
9797
struct btrfs_root *root, struct btrfs_inode *inode,
9898
int inode_only,
99-
const loff_t start,
100-
const loff_t end,
99+
u64 start,
100+
u64 end,
101101
struct btrfs_log_ctx *ctx);
102102
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
103103
struct btrfs_root *root,
@@ -4534,28 +4534,37 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
45344534
static int btrfs_log_holes(struct btrfs_trans_handle *trans,
45354535
struct btrfs_root *root,
45364536
struct btrfs_inode *inode,
4537-
struct btrfs_path *path)
4537+
struct btrfs_path *path,
4538+
const u64 start,
4539+
const u64 end)
45384540
{
45394541
struct btrfs_fs_info *fs_info = root->fs_info;
45404542
struct btrfs_key key;
45414543
const u64 ino = btrfs_ino(inode);
45424544
const u64 i_size = i_size_read(&inode->vfs_inode);
4543-
u64 prev_extent_end = 0;
4545+
u64 prev_extent_end = start;
45444546
int ret;
45454547

45464548
if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
45474549
return 0;
45484550

45494551
key.objectid = ino;
45504552
key.type = BTRFS_EXTENT_DATA_KEY;
4551-
key.offset = 0;
4553+
key.offset = start;
45524554

45534555
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
45544556
if (ret < 0)
45554557
return ret;
45564558

4559+
if (ret > 0 && path->slots[0] > 0) {
4560+
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
4561+
if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
4562+
path->slots[0]--;
4563+
}
4564+
45574565
while (true) {
45584566
struct extent_buffer *leaf = path->nodes[0];
4567+
u64 extent_end;
45594568

45604569
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
45614570
ret = btrfs_next_leaf(root, path);
@@ -4572,9 +4581,18 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
45724581
if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
45734582
break;
45744583

4584+
extent_end = btrfs_file_extent_end(path);
4585+
if (extent_end <= start)
4586+
goto next_slot;
4587+
45754588
/* We have a hole, log it. */
45764589
if (prev_extent_end < key.offset) {
4577-
const u64 hole_len = key.offset - prev_extent_end;
4590+
u64 hole_len;
4591+
4592+
if (key.offset >= end)
4593+
hole_len = end - prev_extent_end;
4594+
else
4595+
hole_len = key.offset - prev_extent_end;
45784596

45794597
/*
45804598
* Release the path to avoid deadlocks with other code
@@ -4604,16 +4622,20 @@ static int btrfs_log_holes(struct btrfs_trans_handle *trans,
46044622
leaf = path->nodes[0];
46054623
}
46064624

4607-
prev_extent_end = btrfs_file_extent_end(path);
4625+
prev_extent_end = min(extent_end, end);
4626+
if (extent_end >= end)
4627+
break;
4628+
next_slot:
46084629
path->slots[0]++;
46094630
cond_resched();
46104631
}
46114632

4612-
if (prev_extent_end < i_size) {
4633+
if (prev_extent_end < end && prev_extent_end < i_size) {
46134634
u64 hole_len;
46144635

46154636
btrfs_release_path(path);
4616-
hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
4637+
hole_len = min(ALIGN(i_size, fs_info->sectorsize), end);
4638+
hole_len -= prev_extent_end;
46174639
ret = btrfs_insert_file_extent(trans, root->log_root,
46184640
ino, prev_extent_end, 0, 0,
46194641
hole_len, 0, hole_len,
@@ -4950,6 +4972,8 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
49504972
const u64 logged_isize,
49514973
const bool recursive_logging,
49524974
const int inode_only,
4975+
const u64 start,
4976+
const u64 end,
49534977
struct btrfs_log_ctx *ctx,
49544978
bool *need_log_inode_item)
49554979
{
@@ -4958,6 +4982,21 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
49584982
int ins_nr = 0;
49594983
int ret;
49604984

4985+
/*
4986+
* We must make sure we don't copy extent items that are entirely out of
4987+
* the range [start, end - 1]. This is not just an optimization to avoid
4988+
* copying but also needed to avoid a corruption where we end up with
4989+
* file extent items in the log tree that have overlapping ranges - this
4990+
* can happen if we race with ordered extent completion for ranges that
4991+
* are outside our target range. For example we copy an extent item and
4992+
* when we move to the next leaf, that extent was trimmed and a new one
4993+
* covering a subrange of it, but with a higher key, was inserted - we
4994+
* would then copy this other extent too, resulting in a log tree with
4995+
* 2 extent items that represent overlapping ranges.
4996+
*
4997+
* We can copy the entire extents at the range bondaries however, even
4998+
* if they cover an area outside the target range. That's ok.
4999+
*/
49615000
while (1) {
49625001
ret = btrfs_search_forward(root, min_key, path, trans->transid);
49635002
if (ret < 0)
@@ -5025,6 +5064,29 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
50255064
goto next_slot;
50265065
}
50275066

5067+
if (min_key->type == BTRFS_EXTENT_DATA_KEY) {
5068+
const u64 extent_end = btrfs_file_extent_end(path);
5069+
5070+
if (extent_end <= start) {
5071+
if (ins_nr > 0) {
5072+
ret = copy_items(trans, inode, dst_path,
5073+
path, ins_start_slot,
5074+
ins_nr, inode_only,
5075+
logged_isize);
5076+
if (ret < 0)
5077+
return ret;
5078+
ins_nr = 0;
5079+
}
5080+
goto next_slot;
5081+
}
5082+
if (extent_end >= end) {
5083+
ins_nr++;
5084+
if (ins_nr == 1)
5085+
ins_start_slot = path->slots[0];
5086+
break;
5087+
}
5088+
}
5089+
50285090
if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
50295091
ins_nr++;
50305092
goto next_slot;
@@ -5090,8 +5152,8 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
50905152
static int btrfs_log_inode(struct btrfs_trans_handle *trans,
50915153
struct btrfs_root *root, struct btrfs_inode *inode,
50925154
int inode_only,
5093-
const loff_t start,
5094-
const loff_t end,
5155+
u64 start,
5156+
u64 end,
50955157
struct btrfs_log_ctx *ctx)
50965158
{
50975159
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -5119,6 +5181,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
51195181
return -ENOMEM;
51205182
}
51215183

5184+
start = ALIGN_DOWN(start, fs_info->sectorsize);
5185+
end = ALIGN(end, fs_info->sectorsize);
5186+
51225187
min_key.objectid = ino;
51235188
min_key.type = BTRFS_INODE_ITEM_KEY;
51245189
min_key.offset = 0;
@@ -5234,8 +5299,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
52345299

52355300
err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
52365301
path, dst_path, logged_isize,
5237-
recursive_logging, inode_only, ctx,
5238-
&need_log_inode_item);
5302+
recursive_logging, inode_only,
5303+
start, end, ctx, &need_log_inode_item);
52395304
if (err)
52405305
goto out_unlock;
52415306

@@ -5248,7 +5313,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
52485313
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
52495314
btrfs_release_path(path);
52505315
btrfs_release_path(dst_path);
5251-
err = btrfs_log_holes(trans, root, inode, path);
5316+
err = btrfs_log_holes(trans, root, inode, path, start, end);
52525317
if (err)
52535318
goto out_unlock;
52545319
}

0 commit comments

Comments
 (0)