Skip to content

Commit 298f7be

Browse files
Dave Chinner authored and djwong (Darrick J. Wong) committed
xfs: pin inode backing buffer to the inode log item
When we dirty an inode, we are going to have to write it to disk at some point in the near future. This requires the inode cluster backing buffer to be present in memory. Unfortunately, under severe memory pressure we can reclaim the inode backing buffer while the inode is dirty in memory, resulting in stalling the AIL pushing because it has to do a read-modify-write cycle on the cluster buffer. When we have no memory available, the read of the cluster buffer blocks the AIL pushing process, and this causes all sorts of issues for memory reclaim as it requires inode writeback to make forwards progress. Allocating a cluster buffer causes more memory pressure, and results in more cluster buffers to be reclaimed, resulting in more RMW cycles to be done in the AIL context and everything then backs up on AIL progress. Only the synchronous inode cluster writeback in the inode reclaim code provides some level of forwards progress guarantees that prevent OOM-killer rampages in this situation. Fix this by pinning the inode backing buffer to the inode log item when the inode is first dirtied (i.e. in xfs_trans_log_inode()). This may mean the first modification of an inode that has been held in cache for a long time may block on a cluster buffer read, but we can do that in transaction context and block safely until the buffer has been allocated and read. Once we have the cluster buffer, the inode log item takes a reference to it, pinning it in memory, and attaches it to the log item for future reference. This means we can always grab the cluster buffer from the inode log item when we need it. When the inode is finally cleaned and removed from the AIL, we can drop the reference the inode log item holds on the cluster buffer. Once all inodes on the cluster buffer are clean, the cluster buffer will be unpinned and it will be available for memory reclaim to reclaim again. 
This avoids the issues with needing to do RMW cycles in the AIL pushing context, and hence allows complete non-blocking inode flushing to be performed by the AIL pushing context. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Brian Foster <bfoster@redhat.com> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
1 parent e98084b commit 298f7be

File tree

5 files changed

+105
-24
lines changed

5 files changed

+105
-24
lines changed

Diff for: fs/xfs/libxfs/xfs_inode_buf.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,8 @@ xfs_imap_to_bp(
176176
}
177177

178178
*bpp = bp;
179-
*dipp = xfs_buf_offset(bp, imap->im_boffset);
179+
if (dipp)
180+
*dipp = xfs_buf_offset(bp, imap->im_boffset);
180181
return 0;
181182
}
182183

Diff for: fs/xfs/libxfs/xfs_trans_inode.c

+47-6
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
#include "xfs_shared.h"
99
#include "xfs_format.h"
1010
#include "xfs_log_format.h"
11+
#include "xfs_trans_resv.h"
12+
#include "xfs_mount.h"
1113
#include "xfs_inode.h"
1214
#include "xfs_trans.h"
1315
#include "xfs_trans_priv.h"
@@ -72,13 +74,19 @@ xfs_trans_ichgtime(
7274
}
7375

7476
/*
75-
* This is called to mark the fields indicated in fieldmask as needing
76-
* to be logged when the transaction is committed. The inode must
77-
* already be associated with the given transaction.
77+
* This is called to mark the fields indicated in fieldmask as needing to be
78+
* logged when the transaction is committed. The inode must already be
79+
* associated with the given transaction.
7880
*
79-
* The values for fieldmask are defined in xfs_inode_item.h. We always
80-
* log all of the core inode if any of it has changed, and we always log
81-
* all of the inline data/extents/b-tree root if any of them has changed.
81+
* The values for fieldmask are defined in xfs_inode_item.h. We always log all
82+
* of the core inode if any of it has changed, and we always log all of the
83+
* inline data/extents/b-tree root if any of them has changed.
84+
*
85+
* Grab and pin the cluster buffer associated with this inode to avoid RMW
86+
* cycles at inode writeback time. Avoid the need to add error handling to every
87+
* xfs_trans_log_inode() call by shutting down on read error. This will cause
88+
* transactions to fail and everything to error out, just like if we return a
89+
* read error in a dirty transaction and cancel it.
8290
*/
8391
void
8492
xfs_trans_log_inode(
@@ -131,6 +139,39 @@ xfs_trans_log_inode(
131139
spin_lock(&iip->ili_lock);
132140
iip->ili_fsync_fields |= flags;
133141

142+
if (!iip->ili_item.li_buf) {
143+
struct xfs_buf *bp;
144+
int error;
145+
146+
/*
147+
* We hold the ILOCK here, so this inode is not going to be
148+
* flushed while we are here. Further, because there is no
149+
* buffer attached to the item, we know that there is no IO in
150+
* progress, so nothing will clear the ili_fields while we read
151+
* in the buffer. Hence we can safely drop the spin lock and
152+
* read the buffer knowing that the state will not change from
153+
* here.
154+
*/
155+
spin_unlock(&iip->ili_lock);
156+
error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, NULL,
157+
&bp, 0);
158+
if (error) {
159+
xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR);
160+
return;
161+
}
162+
163+
/*
164+
* We need an explicit buffer reference for the log item but
165+
* don't want the buffer to remain attached to the transaction.
166+
* Hold the buffer but release the transaction reference.
167+
*/
168+
xfs_buf_hold(bp);
169+
xfs_trans_brelse(tp, bp);
170+
171+
spin_lock(&iip->ili_lock);
172+
iip->ili_item.li_buf = bp;
173+
}
174+
134175
/*
135176
* Always OR in the bits from the ili_last_fields field. This is to
136177
* coordinate with the xfs_iflush() and xfs_iflush_done() routines in

Diff for: fs/xfs/xfs_buf_item.c

+1-3
Original file line numberDiff line numberDiff line change
@@ -1143,11 +1143,9 @@ xfs_buf_inode_iodone(
11431143
if (ret == XBF_IOERROR_DONE)
11441144
return;
11451145
ASSERT(ret == XBF_IOERROR_FAIL);
1146-
spin_lock(&bp->b_mount->m_ail->ail_lock);
11471146
list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
1148-
xfs_set_li_failed(lip, bp);
1147+
set_bit(XFS_LI_FAILED, &lip->li_flags);
11491148
}
1150-
spin_unlock(&bp->b_mount->m_ail->ail_lock);
11511149
xfs_buf_ioerror(bp, 0);
11521150
xfs_buf_relse(bp);
11531151
return;

Diff for: fs/xfs/xfs_inode_item.c

+49-12
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,7 @@ xfs_inode_item_pin(
439439
struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
440440

441441
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
442+
ASSERT(lip->li_buf);
442443

443444
trace_xfs_inode_pin(ip, _RET_IP_);
444445
atomic_inc(&ip->i_pincount);
@@ -450,6 +451,12 @@ xfs_inode_item_pin(
450451
* item which was previously pinned with a call to xfs_inode_item_pin().
451452
*
452453
* Also wake up anyone in xfs_iunpin_wait() if the count goes to 0.
454+
*
455+
* Note that unpin can race with inode cluster buffer freeing marking the buffer
456+
* stale. In that case, flush completions are run from the buffer unpin call,
457+
* which may happen before the inode is unpinned. If we lose the race, there
458+
* will be no buffer attached to the log item, but the inode will be marked
459+
* XFS_ISTALE.
453460
*/
454461
STATIC void
455462
xfs_inode_item_unpin(
@@ -459,6 +466,7 @@ xfs_inode_item_unpin(
459466
struct xfs_inode *ip = INODE_ITEM(lip)->ili_inode;
460467

461468
trace_xfs_inode_unpin(ip, _RET_IP_);
469+
ASSERT(lip->li_buf || xfs_iflags_test(ip, XFS_ISTALE));
462470
ASSERT(atomic_read(&ip->i_pincount) > 0);
463471
if (atomic_dec_and_test(&ip->i_pincount))
464472
wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
@@ -629,10 +637,15 @@ xfs_inode_item_init(
629637
*/
630638
void
631639
xfs_inode_item_destroy(
632-
xfs_inode_t *ip)
640+
struct xfs_inode *ip)
633641
{
634-
kmem_free(ip->i_itemp->ili_item.li_lv_shadow);
635-
kmem_cache_free(xfs_ili_zone, ip->i_itemp);
642+
struct xfs_inode_log_item *iip = ip->i_itemp;
643+
644+
ASSERT(iip->ili_item.li_buf == NULL);
645+
646+
ip->i_itemp = NULL;
647+
kmem_free(iip->ili_item.li_lv_shadow);
648+
kmem_cache_free(xfs_ili_zone, iip);
636649
}
637650

638651

@@ -673,11 +686,10 @@ xfs_iflush_done(
673686
list_move_tail(&lip->li_bio_list, &tmp);
674687

675688
/* Do an unlocked check for needing the AIL lock. */
676-
if (lip->li_lsn == iip->ili_flush_lsn ||
689+
if (iip->ili_flush_lsn == lip->li_lsn ||
677690
test_bit(XFS_LI_FAILED, &lip->li_flags))
678691
need_ail++;
679692
}
680-
ASSERT(list_empty(&bp->b_li_list));
681693

682694
/*
683695
* We only want to pull the item from the AIL if it is actually there
@@ -690,7 +702,7 @@ xfs_iflush_done(
690702
/* this is an opencoded batch version of xfs_trans_ail_delete */
691703
spin_lock(&ailp->ail_lock);
692704
list_for_each_entry(lip, &tmp, li_bio_list) {
693-
xfs_clear_li_failed(lip);
705+
clear_bit(XFS_LI_FAILED, &lip->li_flags);
694706
if (lip->li_lsn == INODE_ITEM(lip)->ili_flush_lsn) {
695707
xfs_lsn_t lsn = xfs_ail_delete_one(ailp, lip);
696708
if (!tail_lsn && lsn)
@@ -706,14 +718,29 @@ xfs_iflush_done(
706718
* them is safely on disk.
707719
*/
708720
list_for_each_entry_safe(lip, n, &tmp, li_bio_list) {
721+
bool drop_buffer = false;
722+
709723
list_del_init(&lip->li_bio_list);
710724
iip = INODE_ITEM(lip);
711725

712726
spin_lock(&iip->ili_lock);
727+
728+
/*
729+
* Remove the reference to the cluster buffer if the inode is
730+
* clean in memory. Drop the buffer reference once we've dropped
731+
* the locks we hold.
732+
*/
733+
ASSERT(iip->ili_item.li_buf == bp);
734+
if (!iip->ili_fields) {
735+
iip->ili_item.li_buf = NULL;
736+
drop_buffer = true;
737+
}
713738
iip->ili_last_fields = 0;
739+
iip->ili_flush_lsn = 0;
714740
spin_unlock(&iip->ili_lock);
715-
716741
xfs_ifunlock(iip->ili_inode);
742+
if (drop_buffer)
743+
xfs_buf_rele(bp);
717744
}
718745
}
719746

@@ -725,12 +752,20 @@ xfs_iflush_done(
725752
*/
726753
void
727754
xfs_iflush_abort(
728-
struct xfs_inode *ip)
755+
struct xfs_inode *ip)
729756
{
730-
struct xfs_inode_log_item *iip = ip->i_itemp;
757+
struct xfs_inode_log_item *iip = ip->i_itemp;
758+
struct xfs_buf *bp = NULL;
731759

732760
if (iip) {
761+
/*
762+
* Clear the failed bit before removing the item from the AIL so
763+
* xfs_trans_ail_delete() doesn't try to clear and release the
764+
* buffer attached to the log item before we are done with it.
765+
*/
766+
clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags);
733767
xfs_trans_ail_delete(&iip->ili_item, 0);
768+
734769
/*
735770
* Clear the inode logging fields so no more flushes are
736771
* attempted.
@@ -739,12 +774,14 @@ xfs_iflush_abort(
739774
iip->ili_last_fields = 0;
740775
iip->ili_fields = 0;
741776
iip->ili_fsync_fields = 0;
777+
iip->ili_flush_lsn = 0;
778+
bp = iip->ili_item.li_buf;
779+
iip->ili_item.li_buf = NULL;
742780
spin_unlock(&iip->ili_lock);
743781
}
744-
/*
745-
* Release the inode's flush lock since we're done with it.
746-
*/
747782
xfs_ifunlock(ip);
783+
if (bp)
784+
xfs_buf_rele(bp);
748785
}
749786

750787
/*

Diff for: fs/xfs/xfs_trans_ail.c

+6-2
Original file line numberDiff line numberDiff line change
@@ -377,8 +377,12 @@ xfsaild_resubmit_item(
377377
}
378378

379379
/* protected by ail_lock */
380-
list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
381-
xfs_clear_li_failed(lip);
380+
list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
381+
if (bp->b_flags & _XBF_INODES)
382+
clear_bit(XFS_LI_FAILED, &lip->li_flags);
383+
else
384+
xfs_clear_li_failed(lip);
385+
}
382386

383387
xfs_buf_unlock(bp);
384388
return XFS_ITEM_SUCCESS;

0 commit comments

Comments
 (0)