11// SPDX-License-Identifier: GPL-2.0
22
3+ #include <linux/blkdev.h>
34#include <linux/iversion.h>
5+ #include "compression.h"
46#include "ctree.h"
7+ #include "delalloc-space.h"
58#include "reflink.h"
69#include "transaction.h"
710
@@ -42,49 +45,131 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
4245 return ret ;
4346}
4447
/*
 * Copy the data of an inline extent into the page cache of @inode, into the
 * page that contains @file_offset, and mark that block range as delalloc.
 *
 * @inode:       inode that receives the data
 * @file_offset: destination file offset; asserted to be aligned to the
 *               inode's sector size
 * @inline_data: buffer holding the inline extent; the data bytes start
 *               btrfs_file_extent_calc_inline_size(0) bytes into it
 * @size:        total length of @inline_data (header plus data bytes)
 * @datal:       number of data bytes to place in the page (the decompressed
 *               length when the extent is compressed)
 * @comp_type:   compression type of the data; BTRFS_COMPRESS_NONE means the
 *               bytes are copied as is, anything else is decompressed
 *
 * Data space for one block is reserved up front and given back if any later
 * step fails.
 *
 * NOTE(review): the data is always written starting at byte 0 of the page,
 * which presumably relies on the block size being equal to the page size -
 * confirm.
 *
 * Returns 0 on success, -ENOMEM if the page cannot be obtained, or the error
 * returned by the reservation, delalloc marking or decompression step.
 */
48+ static int copy_inline_to_page (struct inode * inode ,
49+ const u64 file_offset ,
50+ char * inline_data ,
51+ const u64 size ,
52+ const u64 datal ,
53+ const u8 comp_type )
54+ {
55+ const u64 block_size = btrfs_inode_sectorsize (inode );
56+ const u64 range_end = file_offset + block_size - 1 ;
57+ const size_t inline_size = size - btrfs_file_extent_calc_inline_size (0 );
58+ char * data_start = inline_data + btrfs_file_extent_calc_inline_size (0 );
59+ struct extent_changeset * data_reserved = NULL ;
60+ struct page * page = NULL ;
61+ int ret ;
62+
63+ ASSERT (IS_ALIGNED (file_offset , block_size ));
64+
65+ /*
66+ * We have flushed and locked the ranges of the source and destination
67+ * inodes, we also have locked the inodes, so we are safe to do a
68+ * reservation here. Also we must not do the reservation while holding
69+ * a transaction open, otherwise we would deadlock.
70+ */
71+ ret = btrfs_delalloc_reserve_space (inode , & data_reserved , file_offset ,
72+ block_size );
73+ if (ret )
74+ goto out ;
75+
76+ page = find_or_create_page (inode -> i_mapping , file_offset >> PAGE_SHIFT ,
77+ btrfs_alloc_write_mask (inode -> i_mapping ));
78+ if (!page ) {
79+ ret = - ENOMEM ;
80+ goto out_unlock ;
81+ }
82+
83+ set_page_extent_mapped (page );
84+ clear_extent_bit (& BTRFS_I (inode )-> io_tree , file_offset , range_end ,
85+ EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG ,
86+ 0 , 0 , NULL );
87+ ret = btrfs_set_extent_delalloc (inode , file_offset , range_end , 0 , NULL );
88+ if (ret )
89+ goto out_unlock ;
90+
/*
 * An uncompressed payload is copied verbatim into the page; a compressed
 * one is decompressed directly into the page.
 */
91+ if (comp_type == BTRFS_COMPRESS_NONE ) {
92+ char * map ;
93+
94+ map = kmap (page );
95+ memcpy (map , data_start , datal );
96+ flush_dcache_page (page );
97+ kunmap (page );
98+ } else {
99+ ret = btrfs_decompress (comp_type , data_start , page , 0 ,
100+ inline_size , datal );
101+ if (ret )
102+ goto out_unlock ;
103+ flush_dcache_page (page );
104+ }
105+
106+ /*
107+ * If our inline data is smaller than the block/page size, then the
108+ * remainder of the block/page is equivalent to zeroes. We had something
109+ * like the following done:
110+ *
111+ * $ xfs_io -f -c "pwrite -S 0xab 0 500" file
112+ * $ sync # (or fsync)
113+ * $ xfs_io -c "falloc 0 4K" file
114+ * $ xfs_io -c "pwrite -S 0xcd 4K 4K"
115+ *
116+ * So what's in the range [500, 4095] corresponds to zeroes.
117+ */
118+ if (datal < block_size ) {
119+ char * map ;
120+
121+ map = kmap (page );
122+ memset (map + datal , 0 , block_size - datal );
123+ flush_dcache_page (page );
124+ kunmap (page );
125+ }
126+
127+ SetPageUptodate (page );
128+ ClearPageChecked (page );
129+ set_page_dirty (page );
130+ out_unlock :
/*
 * Reached on success and on every failure after the reservation: the
 * reserved data space is handed back only on error, while the extents
 * reservation is released in both cases.
 */
131+ if (page ) {
132+ unlock_page (page );
133+ put_page (page );
134+ }
135+ if (ret )
136+ btrfs_delalloc_release_space (inode , data_reserved , file_offset ,
137+ block_size , true);
138+ btrfs_delalloc_release_extents (BTRFS_I (inode ), block_size );
139+ out :
140+ extent_changeset_free (data_reserved );
141+
142+ return ret ;
143+ }
144+
45145/*
46- * Make sure we do not end up inserting an inline extent into a file that has
47- * already other (non-inline) extents. If a file has an inline extent it can
48- * not have any other extents and the (single) inline extent must start at the
49- * file offset 0. Failing to respect these rules will lead to file corruption,
50- * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
51- *
52- * We can have extents that have been already written to disk or we can have
53- * dirty ranges still in delalloc, in which case the extent maps and items are
54- * created only when we run delalloc, and the delalloc ranges might fall outside
55- * the range we are currently locking in the inode's io tree. So we check the
56- * inode's i_size because of that (i_size updates are done while holding the
57- * i_mutex, which we are holding here).
58- * We also check to see if the inode has a size not greater than "datal" but has
59- * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
60- * protected against such concurrent fallocate calls by the i_mutex).
61- *
62- * If the file has no extents but a size greater than datal, do not allow the
63- * copy because we would need turn the inline extent into a non-inline one (even
64- * with NO_HOLES enabled). If we find our destination inode only has one inline
65- * extent, just overwrite it with the source inline extent if its size is less
66- * than the source extent's size, or we could copy the source inline extent's
67- * data into the destination inode's inline extent if the later is greater then
68- * the former.
146+ * Deal with cloning of inline extents. We try to copy the inline extent from
147+ * the source inode to destination inode when possible. When not possible we
148+ * copy the inline extent's data into the respective page of the inode.
69149 */
70150static int clone_copy_inline_extent (struct inode * dst ,
71- struct btrfs_trans_handle * trans ,
72151 struct btrfs_path * path ,
73152 struct btrfs_key * new_key ,
74153 const u64 drop_start ,
75154 const u64 datal ,
76155 const u64 size ,
77- const char * inline_data )
156+ const u8 comp_type ,
157+ char * inline_data ,
158+ struct btrfs_trans_handle * * trans_out )
78159{
79160 struct btrfs_fs_info * fs_info = btrfs_sb (dst -> i_sb );
80161 struct btrfs_root * root = BTRFS_I (dst )-> root ;
81162 const u64 aligned_end = ALIGN (new_key -> offset + datal ,
82163 fs_info -> sectorsize );
164+ struct btrfs_trans_handle * trans = NULL ;
83165 int ret ;
84166 struct btrfs_key key ;
85167
86- if (new_key -> offset > 0 )
87- return - EOPNOTSUPP ;
168+ if (new_key -> offset > 0 ) {
169+ ret = copy_inline_to_page (dst , new_key -> offset , inline_data ,
170+ size , datal , comp_type );
171+ goto out ;
172+ }
88173
89174 key .objectid = btrfs_ino (BTRFS_I (dst ));
90175 key .type = BTRFS_EXTENT_DATA_KEY ;
@@ -103,81 +188,104 @@ static int clone_copy_inline_extent(struct inode *dst,
103188 btrfs_item_key_to_cpu (path -> nodes [0 ], & key , path -> slots [0 ]);
104189 if (key .objectid == btrfs_ino (BTRFS_I (dst )) &&
105190 key .type == BTRFS_EXTENT_DATA_KEY ) {
191+ /*
192+ * There's an implicit hole at file offset 0, copy the
193+ * inline extent's data to the page.
194+ */
106195 ASSERT (key .offset > 0 );
107- return - EOPNOTSUPP ;
196+ ret = copy_inline_to_page (dst , new_key -> offset ,
197+ inline_data , size , datal ,
198+ comp_type );
199+ goto out ;
108200 }
109201 } else if (i_size_read (dst ) <= datal ) {
110202 struct btrfs_file_extent_item * ei ;
111- u64 ext_len ;
112203
113- /*
114- * If the file size is <= datal, make sure there are no other
115- * extents following (can happen do to an fallocate call with
116- * the flag FALLOC_FL_KEEP_SIZE).
117- */
118204 ei = btrfs_item_ptr (path -> nodes [0 ], path -> slots [0 ],
119205 struct btrfs_file_extent_item );
120206 /*
121- * If it's an inline extent, it can not have other extents
122- * following it.
207+ * If it's an inline extent replace it with the source inline
208+ * extent, otherwise copy the source inline extent data into
209+ * the respective page at the destination inode.
123210 */
124211 if (btrfs_file_extent_type (path -> nodes [0 ], ei ) ==
125212 BTRFS_FILE_EXTENT_INLINE )
126213 goto copy_inline_extent ;
127214
128- ext_len = btrfs_file_extent_num_bytes (path -> nodes [0 ], ei );
129- if (ext_len > aligned_end )
130- return - EOPNOTSUPP ;
131-
132- ret = btrfs_next_item (root , path );
133- if (ret < 0 ) {
134- return ret ;
135- } else if (ret == 0 ) {
136- btrfs_item_key_to_cpu (path -> nodes [0 ], & key ,
137- path -> slots [0 ]);
138- if (key .objectid == btrfs_ino (BTRFS_I (dst )) &&
139- key .type == BTRFS_EXTENT_DATA_KEY )
140- return - EOPNOTSUPP ;
141- }
215+ ret = copy_inline_to_page (dst , new_key -> offset , inline_data ,
216+ size , datal , comp_type );
217+ goto out ;
142218 }
143219
144220copy_inline_extent :
221+ ret = 0 ;
145222 /*
146223 * We have no extent items, or we have an extent at offset 0 which may
147224 * or may not be inlined. All these cases are dealt with the same way.
148225 */
149226 if (i_size_read (dst ) > datal ) {
150227 /*
151- * If the destination inode has an inline extent.
152- * This would require copying the data from the source inline
153- * extent into the beginning of the destination's inline extent.
154- * But this is really complex, both extents can be compressed
155- * or just one of them, which would require decompressing and
156- * re-compressing data (which could increase the new compressed
157- * size, not allowing the compressed data to fit anymore in an
158- * inline extent).
159- * So just don't support this case for now (it should be rare,
160- * we are not really saving space when cloning inline extents).
228+ * At the destination offset 0 we have either a hole, a regular
229+ * extent or an inline extent larger than the one we want to
230+ * clone. Deal with all these cases by copying the inline extent
231+ * data into the respective page at the destination inode.
161232 */
162- return - EOPNOTSUPP ;
233+ ret = copy_inline_to_page (dst , new_key -> offset , inline_data ,
234+ size , datal , comp_type );
235+ goto out ;
163236 }
164237
165238 btrfs_release_path (path );
239+ /*
240+ * If we end up here it means we copy the inline extent into a leaf
241+ * of the destination inode. We know we will drop or adjust at most one
242+ * extent item in the destination root.
243+ *
244+ * 1 unit - adjusting old extent (we may have to split it)
245+ * 1 unit - add new extent
246+ * 1 unit - inode update
247+ */
248+ trans = btrfs_start_transaction (root , 3 );
249+ if (IS_ERR (trans )) {
250+ ret = PTR_ERR (trans );
251+ trans = NULL ;
252+ goto out ;
253+ }
166254 ret = btrfs_drop_extents (trans , root , dst , drop_start , aligned_end , 1 );
167255 if (ret )
168- return ret ;
256+ goto out ;
169257 ret = btrfs_insert_empty_item (trans , root , path , new_key , size );
170258 if (ret )
171- return ret ;
259+ goto out ;
172260
173261 write_extent_buffer (path -> nodes [0 ], inline_data ,
174262 btrfs_item_ptr_offset (path -> nodes [0 ],
175263 path -> slots [0 ]),
176264 size );
177265 inode_add_bytes (dst , datal );
178266 set_bit (BTRFS_INODE_NEEDS_FULL_SYNC , & BTRFS_I (dst )-> runtime_flags );
267+ out :
268+ if (!ret && !trans ) {
269+ /*
270+ * No transaction here means we copied the inline extent into a
271+ * page of the destination inode.
272+ *
273+ * 1 unit to update inode item
274+ */
275+ trans = btrfs_start_transaction (root , 1 );
276+ if (IS_ERR (trans )) {
277+ ret = PTR_ERR (trans );
278+ trans = NULL ;
279+ }
280+ }
281+ if (ret && trans ) {
282+ btrfs_abort_transaction (trans , ret );
283+ btrfs_end_transaction (trans );
284+ }
285+ if (!ret )
286+ * trans_out = trans ;
179287
180- return 0 ;
288+ return ret ;
181289}
182290
183291/**
@@ -196,7 +304,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
196304 const u64 destoff , int no_time_update )
197305{
198306 struct btrfs_fs_info * fs_info = btrfs_sb (inode -> i_sb );
199- struct btrfs_root * root = BTRFS_I (inode )-> root ;
200307 struct btrfs_path * path = NULL ;
201308 struct extent_buffer * leaf ;
202309 struct btrfs_trans_handle * trans ;
@@ -233,6 +340,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
233340 struct btrfs_key new_key ;
234341 u64 disko = 0 , diskl = 0 ;
235342 u64 datao = 0 , datal = 0 ;
343+ u8 comp ;
236344 u64 drop_start ;
237345
238346 /* Note the key will change type as we walk through the tree */
@@ -275,6 +383,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
275383
276384 extent = btrfs_item_ptr (leaf , slot ,
277385 struct btrfs_file_extent_item );
386+ comp = btrfs_file_extent_compression (leaf , extent );
278387 type = btrfs_file_extent_type (leaf , extent );
279388 if (type == BTRFS_FILE_EXTENT_REG ||
280389 type == BTRFS_FILE_EXTENT_PREALLOC ) {
@@ -369,29 +478,11 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
369478 if (key .offset != 0 || datal > fs_info -> sectorsize )
370479 return - EUCLEAN ;
371480
372- /*
373- * If our extent is inline, we know we will drop or
374- * adjust at most 1 extent item in the destination root.
375- *
376- * 1 - adjusting old extent (we may have to split it)
377- * 1 - add new extent
378- * 1 - inode update
379- */
380- trans = btrfs_start_transaction (root , 3 );
381- if (IS_ERR (trans )) {
382- ret = PTR_ERR (trans );
383- goto out ;
384- }
385-
386- ret = clone_copy_inline_extent (inode , trans , path ,
387- & new_key , drop_start ,
388- datal , size , buf );
389- if (ret ) {
390- if (ret != - EOPNOTSUPP )
391- btrfs_abort_transaction (trans , ret );
392- btrfs_end_transaction (trans );
481+ ret = clone_copy_inline_extent (inode , path , & new_key ,
482+ drop_start , datal , size ,
483+ comp , buf , & trans );
484+ if (ret )
393485 goto out ;
394- }
395486 }
396487
397488 btrfs_release_path (path );
@@ -526,6 +617,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
526617 struct inode * src = file_inode (file_src );
527618 struct btrfs_fs_info * fs_info = btrfs_sb (inode -> i_sb );
528619 int ret ;
620+ int wb_ret ;
529621 u64 len = olen ;
530622 u64 bs = fs_info -> sb -> s_blocksize ;
531623
@@ -566,6 +658,14 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
566658 btrfs_double_extent_lock (src , off , inode , destoff , len );
567659 ret = btrfs_clone (src , inode , off , olen , len , destoff , 0 );
568660 btrfs_double_extent_unlock (src , off , inode , destoff , len );
661+
662+ /*
663+ * We may have copied an inline extent into a page of the destination
664+ * range, so wait for writeback to complete before truncating pages
665+ * from the page cache. This is a rare case.
666+ */
667+ wb_ret = btrfs_wait_ordered_range (inode , destoff , len );
668+ ret = ret ? ret : wb_ret ;
569669 /*
570670 * Truncate page cache pages so that future reads will see the cloned
571671 * data immediately and not the previous data.
0 commit comments