Skip to content

Commit cbfce4c

Browse files
Christoph Hellwigkdave
Christoph Hellwig
authored and committed
btrfs: optimize the logical to physical mapping for zoned writes
The current code to store the final logical to physical mapping for a zone append write in the extent tree is rather inefficient. It first has to split the ordered extent so that there is one ordered extent per bio, so that it can look up the ordered extent on I/O completion in btrfs_record_physical_zoned and store the physical LBA returned by the block driver in the ordered extent. btrfs_rewrite_logical_zoned then has to do a lookup in the chunk tree to see what physical address the logical address for this bio / ordered extent is mapped to, and then rewrite it in the extent tree. To optimize this process, we can store the physical address assigned in the chunk tree to the original logical address and a pointer to the btrfs_ordered_sum structure in the btrfs_bio structure, and then use this information to rewrite the logical address in the btrfs_ordered_sum structure directly at I/O completion time in btrfs_record_physical_zoned. btrfs_rewrite_logical_zoned then simply updates the logical address in the extent tree and the ordered_extent itself. The code in btrfs_rewrite_logical_zoned now runs for all data I/O completions in zoned file systems, which is fine as there is no remapping to do for non-append writes to conventional zones or for relocation, and the overhead for quickly breaking out of the loop is very low. Because zoned file systems now need the ordered_sums structure to record the actual write location returned by zone append, allocate dummy structures without the csum array for them when the I/O doesn't use checksums, and free them when completing the ordered_extent. Note that the btrfs_bio doesn't grow as the new fields are placed into a union that is so far not used for data writes and has plenty of space left in it. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
1 parent 5cfe76f commit cbfce4c

File tree

8 files changed

+73
-54
lines changed

8 files changed

+73
-54
lines changed

Diff for: fs/btrfs/bio.c

+5
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
431431
u64 zone_start = round_down(physical, dev->fs_info->zone_size);
432432

433433
ASSERT(btrfs_dev_is_sequential(dev, physical));
434+
btrfs_bio(bio)->orig_physical = physical;
434435
bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
435436
}
436437
btrfs_debug_in_rcu(dev->fs_info,
@@ -685,6 +686,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num)
685686
ret = btrfs_bio_csum(bbio);
686687
if (ret)
687688
goto fail_put_bio;
689+
} else if (use_append) {
690+
ret = btrfs_alloc_dummy_sum(bbio);
691+
if (ret)
692+
goto fail_put_bio;
688693
}
689694
}
690695

Diff for: fs/btrfs/bio.h

+14-3
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,27 @@ struct btrfs_bio {
3939

4040
union {
4141
/*
42-
* Data checksumming and original I/O information for internal
43-
* use in the btrfs_submit_bio machinery.
42+
* For data reads: checksumming and original I/O information.
43+
* (for internal use in the btrfs_submit_bio machinery only)
4444
*/
4545
struct {
4646
u8 *csum;
4747
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
4848
struct bvec_iter saved_iter;
4949
};
5050

51-
/* For metadata parentness verification. */
51+
/*
52+
* For data writes:
53+
* - pointer to the checksums for this bio
54+
* - original physical address from the allocator
55+
* (for zone append only)
56+
*/
57+
struct {
58+
struct btrfs_ordered_sum *sums;
59+
u64 orig_physical;
60+
};
61+
62+
/* For metadata reads: parentness verification. */
5263
struct btrfs_tree_parent_check parent_check;
5364
};
5465

Diff for: fs/btrfs/file-item.c

+30
Original file line numberDiff line numberDiff line change
@@ -818,11 +818,41 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio)
818818

819819
}
820820
this_sum_bytes = 0;
821+
822+
/*
823+
* The ->sums assignment is only needed for zoned writes, where a bio never
824+
* spans ordered extents. It is done unconditionally because that is
825+
* cheaper than a branch.
826+
*/
827+
bbio->sums = sums;
821828
btrfs_add_ordered_sum(ordered, sums);
822829
btrfs_put_ordered_extent(ordered);
823830
return 0;
824831
}
825832

833+
/*
834+
* Nodatasum I/O on zoned file systems still requires a btrfs_ordered_sum to
835+
* record the updated logical address on Zone Append completion.
836+
* Allocate just the structure with an empty sums array here for that case.
837+
*/
838+
blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio)
839+
{
840+
struct btrfs_ordered_extent *ordered =
841+
btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
842+
843+
if (WARN_ON_ONCE(!ordered))
844+
return BLK_STS_IOERR;
845+
846+
bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS);
847+
if (!bbio->sums)
848+
return BLK_STS_RESOURCE;
849+
bbio->sums->len = bbio->bio.bi_iter.bi_size;
850+
bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
851+
btrfs_add_ordered_sum(ordered, bbio->sums);
852+
btrfs_put_ordered_extent(ordered);
853+
return 0;
854+
}
855+
826856
/*
827857
* Remove one checksum overlapping a range.
828858
*

Diff for: fs/btrfs/file-item.h

+1
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
5050
struct btrfs_root *root,
5151
struct btrfs_ordered_sum *sums);
5252
blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio);
53+
blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio);
5354
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
5455
struct list_head *list, int search_commit,
5556
bool nowait);

Diff for: fs/btrfs/inode.c

+1-5
Original file line numberDiff line numberDiff line change
@@ -3301,14 +3301,10 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
33013301
goto out;
33023302
}
33033303

3304-
/* A valid ->physical implies a write on a sequential zone. */
3305-
if (ordered_extent->physical != (u64)-1) {
3304+
if (btrfs_is_zoned(fs_info)) {
33063305
btrfs_rewrite_logical_zoned(ordered_extent);
33073306
btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
33083307
ordered_extent->disk_num_bytes);
3309-
} else if (btrfs_is_data_reloc_root(inode->root)) {
3310-
btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
3311-
ordered_extent->disk_num_bytes);
33123308
}
33133309

33143310
if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {

Diff for: fs/btrfs/ordered-data.c

-1
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,6 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent(
209209
entry->compress_type = compress_type;
210210
entry->truncated_len = (u64)-1;
211211
entry->qgroup_rsv = ret;
212-
entry->physical = (u64)-1;
213212

214213
ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
215214
entry->flags = flags;

Diff for: fs/btrfs/ordered-data.h

-6
Original file line numberDiff line numberDiff line change
@@ -151,12 +151,6 @@ struct btrfs_ordered_extent {
151151
struct completion completion;
152152
struct btrfs_work flush_work;
153153
struct list_head work_list;
154-
155-
/*
156-
* Used to reverse-map physical address returned from ZONE_APPEND write
157-
* command in a workqueue context
158-
*/
159-
u64 physical;
160154
};
161155

162156
static inline void

Diff for: fs/btrfs/zoned.c

+22-39
Original file line numberDiff line numberDiff line change
@@ -1657,63 +1657,46 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
16571657
void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
16581658
{
16591659
const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
1660-
struct btrfs_ordered_extent *ordered;
1660+
struct btrfs_ordered_sum *sum = bbio->sums;
16611661

1662-
ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset);
1663-
if (WARN_ON(!ordered))
1664-
return;
1665-
1666-
ordered->physical = physical;
1667-
btrfs_put_ordered_extent(ordered);
1662+
if (physical < bbio->orig_physical)
1663+
sum->logical -= bbio->orig_physical - physical;
1664+
else
1665+
sum->logical += physical - bbio->orig_physical;
16681666
}
16691667

16701668
void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
16711669
{
16721670
struct btrfs_inode *inode = BTRFS_I(ordered->inode);
1673-
struct btrfs_fs_info *fs_info = inode->root->fs_info;
1674-
struct extent_map_tree *em_tree;
1671+
struct extent_map_tree *em_tree = &inode->extent_tree;
16751672
struct extent_map *em;
1676-
struct btrfs_ordered_sum *sum;
1677-
u64 orig_logical = ordered->disk_bytenr;
1678-
struct map_lookup *map;
1679-
u64 physical = ordered->physical;
1680-
u64 chunk_start_phys;
1681-
u64 logical;
1673+
struct btrfs_ordered_sum *sum =
1674+
list_first_entry(&ordered->list, typeof(*sum), list);
1675+
u64 logical = sum->logical;
16821676

1683-
em = btrfs_get_chunk_map(fs_info, orig_logical, 1);
1684-
if (IS_ERR(em))
1685-
return;
1686-
map = em->map_lookup;
1687-
chunk_start_phys = map->stripes[0].physical;
1688-
1689-
if (WARN_ON_ONCE(map->num_stripes > 1) ||
1690-
WARN_ON_ONCE((map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) != 0) ||
1691-
WARN_ON_ONCE(physical < chunk_start_phys) ||
1692-
WARN_ON_ONCE(physical > chunk_start_phys + em->orig_block_len)) {
1693-
free_extent_map(em);
1694-
return;
1695-
}
1696-
logical = em->start + (physical - map->stripes[0].physical);
1697-
free_extent_map(em);
1698-
1699-
if (orig_logical == logical)
1700-
return;
1677+
if (ordered->disk_bytenr == logical)
1678+
goto out;
17011679

17021680
ordered->disk_bytenr = logical;
17031681

1704-
em_tree = &inode->extent_tree;
17051682
write_lock(&em_tree->lock);
17061683
em = search_extent_mapping(em_tree, ordered->file_offset,
17071684
ordered->num_bytes);
17081685
em->block_start = logical;
17091686
free_extent_map(em);
17101687
write_unlock(&em_tree->lock);
17111688

1712-
list_for_each_entry(sum, &ordered->list, list) {
1713-
if (logical < orig_logical)
1714-
sum->logical -= orig_logical - logical;
1715-
else
1716-
sum->logical += logical - orig_logical;
1689+
out:
1690+
/*
1691+
* If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
1692+
* were allocated by btrfs_alloc_dummy_sum only to record the logical
1693+
* addresses and don't contain actual checksums. We thus must free them
1694+
* here so that we don't attempt to log the csums later.
1695+
*/
1696+
if ((inode->flags & BTRFS_INODE_NODATASUM) ||
1697+
test_bit(BTRFS_FS_STATE_NO_CSUMS, &inode->root->fs_info->fs_state)) {
1698+
list_del(&sum->list);
1699+
kfree(sum);
17171700
}
17181701
}
17191702

0 commit comments

Comments
 (0)