Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f758eeab authored by Christoph Hellwig's avatar Christoph Hellwig Committed by Wu Fengguang
Browse files

writeback: split inode_wb_list_lock into bdi_writeback.list_lock



Split the global inode_wb_list_lock into a per-bdi_writeback list_lock,
as it's currently the most contended lock in the system for metadata
heavy workloads.  It won't help for single-filesystem workloads for
which we'll need the I/O-less balance_dirty_pages, but at least we
can dedicate a cpu to spinning on each bdi now for larger systems.

Based on earlier patches from Nick Piggin and Dave Chinner.

It reduces lock contentions to 1/4 in this test case:
10 HDD JBOD, 100 dd on each disk, XFS, 6GB ram

lock_stat version 0.3
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                              class name    con-bounces    contentions   waittime-min   waittime-max waittime-total    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
vanilla 2.6.39-rc3:
                      inode_wb_list_lock:         42590          44433           0.12         147.74      144127.35         252274         886792           0.08         121.34      917211.23
                      ------------------
                      inode_wb_list_lock              2          [<ffffffff81165da5>] bdev_inode_switch_bdi+0x29/0x85
                      inode_wb_list_lock             34          [<ffffffff8115bd0b>] inode_wb_list_del+0x22/0x49
                      inode_wb_list_lock          12893          [<ffffffff8115bb53>] __mark_inode_dirty+0x170/0x1d0
                      inode_wb_list_lock          10702          [<ffffffff8115afef>] writeback_single_inode+0x16d/0x20a
                      ------------------
                      inode_wb_list_lock              2          [<ffffffff81165da5>] bdev_inode_switch_bdi+0x29/0x85
                      inode_wb_list_lock             19          [<ffffffff8115bd0b>] inode_wb_list_del+0x22/0x49
                      inode_wb_list_lock           5550          [<ffffffff8115bb53>] __mark_inode_dirty+0x170/0x1d0
                      inode_wb_list_lock           8511          [<ffffffff8115b4ad>] writeback_sb_inodes+0x10f/0x157

2.6.39-rc3 + patch:
                &(&wb->list_lock)->rlock:         11383          11657           0.14         151.69       40429.51          90825         527918           0.11         145.90      556843.37
                ------------------------
                &(&wb->list_lock)->rlock             10          [<ffffffff8115b189>] inode_wb_list_del+0x5f/0x86
                &(&wb->list_lock)->rlock           1493          [<ffffffff8115b1ed>] writeback_inodes_wb+0x3d/0x150
                &(&wb->list_lock)->rlock           3652          [<ffffffff8115a8e9>] writeback_sb_inodes+0x123/0x16f
                &(&wb->list_lock)->rlock           1412          [<ffffffff8115a38e>] writeback_single_inode+0x17f/0x223
                ------------------------
                &(&wb->list_lock)->rlock              3          [<ffffffff8110b5af>] bdi_lock_two+0x46/0x4b
                &(&wb->list_lock)->rlock              6          [<ffffffff8115b189>] inode_wb_list_del+0x5f/0x86
                &(&wb->list_lock)->rlock           2061          [<ffffffff8115af97>] __mark_inode_dirty+0x173/0x1cf
                &(&wb->list_lock)->rlock           2629          [<ffffffff8115a8e9>] writeback_sb_inodes+0x123/0x16f

hughd@google.com: fix recursive lock when bdi_lock_two() is called with new the same as old
akpm@linux-foundation.org: cleanup bdev_inode_switch_bdi() comment

Signed-off-by: default avatarChristoph Hellwig <hch@lst.de>
Signed-off-by: default avatarHugh Dickins <hughd@google.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarWu Fengguang <fengguang.wu@intel.com>
parent 424b351f
Loading
Loading
Loading
Loading
+10 −6
Original line number Diff line number Diff line
@@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode)
{
	return &BDEV_I(inode)->bdev;
}

EXPORT_SYMBOL(I_BDEV);

/*
 * move the inode from it's current bdi to the a new bdi. if the inode is dirty
 * we need to move it onto the dirty list of @dst so that the inode is always
 * on the right list.
 * Move the inode from its current bdi to a new bdi. If the inode is dirty we
 * need to move it onto the dirty list of @dst so that the inode is always on
 * the right list.
 */
static void bdev_inode_switch_bdi(struct inode *inode,
			struct backing_dev_info *dst)
{
	spin_lock(&inode_wb_list_lock);
	struct backing_dev_info *old = inode->i_data.backing_dev_info;

	if (unlikely(dst == old))		/* deadlock avoidance */
		return;
	bdi_lock_two(&old->wb, &dst->wb);
	spin_lock(&inode->i_lock);
	inode->i_data.backing_dev_info = dst;
	if (inode->i_state & I_DIRTY)
		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_wb_list_lock);
	spin_unlock(&old->wb.list_lock);
	spin_unlock(&dst->wb.list_lock);
}

static sector_t max_block(struct block_device *bdev)
+49 −48
Original line number Diff line number Diff line
@@ -181,12 +181,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
 */
void inode_wb_list_del(struct inode *inode)
{
	spin_lock(&inode_wb_list_lock);
	struct backing_dev_info *bdi = inode_to_bdi(inode);

	spin_lock(&bdi->wb.list_lock);
	list_del_init(&inode->i_wb_list);
	spin_unlock(&inode_wb_list_lock);
	spin_unlock(&bdi->wb.list_lock);
}


/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
@@ -196,11 +197,9 @@ void inode_wb_list_del(struct inode *inode)
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode)
static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

	assert_spin_locked(&inode_wb_list_lock);
	assert_spin_locked(&wb->list_lock);
	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;

@@ -214,11 +213,9 @@ static void redirty_tail(struct inode *inode)
/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode)
static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

	assert_spin_locked(&inode_wb_list_lock);
	assert_spin_locked(&wb->list_lock);
	list_move(&inode->i_wb_list, &wb->b_more_io);
}

@@ -226,7 +223,7 @@ static void inode_sync_complete(struct inode *inode)
{
	/*
	 * Prevent speculative execution through
	 * spin_unlock(&inode_wb_list_lock);
	 * spin_unlock(&wb->list_lock);
	 */

	smp_mb();
@@ -302,7 +299,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 */
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
	assert_spin_locked(&inode_wb_list_lock);
	assert_spin_locked(&wb->list_lock);
	list_splice_init(&wb->b_more_io, &wb->b_io);
	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}
@@ -317,7 +314,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
/*
 * Wait for writeback on an inode to complete.
 */
static void inode_wait_for_writeback(struct inode *inode)
static void inode_wait_for_writeback(struct inode *inode,
				     struct bdi_writeback *wb)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;
@@ -325,15 +323,15 @@ static void inode_wait_for_writeback(struct inode *inode)
	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	while (inode->i_state & I_SYNC) {
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_wb_list_lock);
		spin_unlock(&wb->list_lock);
		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
		spin_lock(&inode_wb_list_lock);
		spin_lock(&wb->list_lock);
		spin_lock(&inode->i_lock);
	}
}

/*
 * Write out an inode's dirty pages.  Called under inode_wb_list_lock and
 * Write out an inode's dirty pages.  Called under wb->list_lock and
 * inode->i_lock.  Either the caller has an active reference on the inode or
 * the inode has I_WILL_FREE set.
 *
@@ -344,13 +342,14 @@ static void inode_wait_for_writeback(struct inode *inode)
 * livelocks, etc.
 */
static int
writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
		       struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	unsigned dirty;
	int ret;

	assert_spin_locked(&inode_wb_list_lock);
	assert_spin_locked(&wb->list_lock);
	assert_spin_locked(&inode->i_lock);

	if (!atomic_read(&inode->i_count))
@@ -368,14 +367,14 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
		 * completed a full scan of b_io.
		 */
		if (wbc->sync_mode != WB_SYNC_ALL) {
			requeue_io(inode);
			requeue_io(inode, wb);
			return 0;
		}

		/*
		 * It's a data-integrity sync.  We must wait.
		 */
		inode_wait_for_writeback(inode);
		inode_wait_for_writeback(inode, wb);
	}

	BUG_ON(inode->i_state & I_SYNC);
@@ -384,7 +383,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
	inode->i_state |= I_SYNC;
	inode->i_state &= ~I_DIRTY_PAGES;
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_wb_list_lock);
	spin_unlock(&wb->list_lock);

	ret = do_writepages(mapping, wbc);

@@ -415,7 +414,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
			ret = err;
	}

	spin_lock(&inode_wb_list_lock);
	spin_lock(&wb->list_lock);
	spin_lock(&inode->i_lock);
	inode->i_state &= ~I_SYNC;
	if (!(inode->i_state & I_FREEING)) {
@@ -438,7 +437,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
				/*
				 * slice used up: queue for next turn
				 */
				requeue_io(inode);
				requeue_io(inode, wb);
			} else {
				/*
				 * Writeback blocked by something other than
@@ -447,7 +446,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
				 * retrying writeback of the dirty page/inode
				 * that cannot be performed immediately.
				 */
				redirty_tail(inode);
				redirty_tail(inode, wb);
			}
		} else if (inode->i_state & I_DIRTY) {
			/*
@@ -456,7 +455,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
			 * submission or metadata updates after data IO
			 * completion.
			 */
			redirty_tail(inode);
			redirty_tail(inode, wb);
		} else {
			/*
			 * The inode is clean.  At this point we either have
@@ -521,7 +520,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
				 * superblock, move all inodes not belonging
				 * to it back onto the dirty list.
				 */
				redirty_tail(inode);
				redirty_tail(inode, wb);
				continue;
			}

@@ -541,7 +540,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
			spin_unlock(&inode->i_lock);
			requeue_io(inode);
			requeue_io(inode, wb);
			continue;
		}

@@ -557,19 +556,19 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
		__iget(inode);

		pages_skipped = wbc->pages_skipped;
		writeback_single_inode(inode, wbc);
		writeback_single_inode(inode, wb, wbc);
		if (wbc->pages_skipped != pages_skipped) {
			/*
			 * writeback is not making progress due to locked
			 * buffers.  Skip this inode for now.
			 */
			redirty_tail(inode);
			redirty_tail(inode, wb);
		}
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_wb_list_lock);
		spin_unlock(&wb->list_lock);
		iput(inode);
		cond_resched();
		spin_lock(&inode_wb_list_lock);
		spin_lock(&wb->list_lock);
		if (wbc->nr_to_write <= 0) {
			wbc->more_io = 1;
			return 1;
@@ -588,7 +587,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,

	if (!wbc->wb_start)
		wbc->wb_start = jiffies; /* livelock avoidance */
	spin_lock(&inode_wb_list_lock);
	spin_lock(&wb->list_lock);

	if (list_empty(&wb->b_io))
		queue_io(wb, wbc->older_than_this);
@@ -598,7 +597,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
		struct super_block *sb = inode->i_sb;

		if (!pin_sb_for_writeback(sb)) {
			requeue_io(inode);
			requeue_io(inode, wb);
			continue;
		}
		ret = writeback_sb_inodes(sb, wb, wbc, false);
@@ -607,7 +606,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
		if (ret)
			break;
	}
	spin_unlock(&inode_wb_list_lock);
	spin_unlock(&wb->list_lock);
	/* Leave any unwritten inodes on b_io */
}

@@ -616,11 +615,11 @@ static void __writeback_inodes_sb(struct super_block *sb,
{
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	spin_lock(&inode_wb_list_lock);
	spin_lock(&wb->list_lock);
	if (list_empty(&wb->b_io))
		queue_io(wb, wbc->older_than_this);
	writeback_sb_inodes(sb, wb, wbc, true);
	spin_unlock(&inode_wb_list_lock);
	spin_unlock(&wb->list_lock);
}

/*
@@ -762,15 +761,15 @@ static long wb_writeback(struct bdi_writeback *wb,
		 * become available for writeback. Otherwise
		 * we'll just busyloop.
		 */
		spin_lock(&inode_wb_list_lock);
		spin_lock(&wb->list_lock);
		if (!list_empty(&wb->b_more_io))  {
			inode = wb_inode(wb->b_more_io.prev);
			trace_wbc_writeback_wait(&wbc, wb->bdi);
			spin_lock(&inode->i_lock);
			inode_wait_for_writeback(inode);
			inode_wait_for_writeback(inode, wb);
			spin_unlock(&inode->i_lock);
		}
		spin_unlock(&inode_wb_list_lock);
		spin_unlock(&wb->list_lock);
	}

	return wrote;
@@ -1104,10 +1103,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
			}

			spin_unlock(&inode->i_lock);
			spin_lock(&inode_wb_list_lock);
			spin_lock(&bdi->wb.list_lock);
			inode->dirtied_when = jiffies;
			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
			spin_unlock(&inode_wb_list_lock);
			spin_unlock(&bdi->wb.list_lock);

			if (wakeup_bdi)
				bdi_wakeup_thread_delayed(bdi);
@@ -1309,6 +1308,7 @@ EXPORT_SYMBOL(sync_inodes_sb);
 */
int write_inode_now(struct inode *inode, int sync)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
	int ret;
	struct writeback_control wbc = {
		.nr_to_write = LONG_MAX,
@@ -1321,11 +1321,11 @@ int write_inode_now(struct inode *inode, int sync)
		wbc.nr_to_write = 0;

	might_sleep();
	spin_lock(&inode_wb_list_lock);
	spin_lock(&wb->list_lock);
	spin_lock(&inode->i_lock);
	ret = writeback_single_inode(inode, &wbc);
	ret = writeback_single_inode(inode, wb, &wbc);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_wb_list_lock);
	spin_unlock(&wb->list_lock);
	if (sync)
		inode_sync_wait(inode);
	return ret;
@@ -1345,13 +1345,14 @@ EXPORT_SYMBOL(write_inode_now);
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
	int ret;

	spin_lock(&inode_wb_list_lock);
	spin_lock(&wb->list_lock);
	spin_lock(&inode->i_lock);
	ret = writeback_single_inode(inode, wbc);
	ret = writeback_single_inode(inode, wb, wbc);
	spin_unlock(&inode->i_lock);
	spin_unlock(&inode_wb_list_lock);
	spin_unlock(&wb->list_lock);
	return ret;
}
EXPORT_SYMBOL(sync_inode);
+2 −3
Original line number Diff line number Diff line
@@ -37,7 +37,7 @@
 *   inode_lru, inode->i_lru
 * inode_sb_list_lock protects:
 *   sb->s_inodes, inode->i_sb_list
 * inode_wb_list_lock protects:
 * bdi->wb.list_lock protects:
 *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
 * inode_hash_lock protects:
 *   inode_hashtable, inode->i_hash
@@ -48,7 +48,7 @@
 *   inode->i_lock
 *     inode_lru_lock
 *
 * inode_wb_list_lock
 * bdi->wb.list_lock
 *   inode->i_lock
 *
 * inode_hash_lock
@@ -68,7 +68,6 @@ static LIST_HEAD(inode_lru);
static DEFINE_SPINLOCK(inode_lru_lock);

__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);

/*
 * iprune_sem provides exclusion between the icache shrinking and the
+2 −0
Original line number Diff line number Diff line
@@ -57,6 +57,7 @@ struct bdi_writeback {
	struct list_head b_dirty;	/* dirty inodes */
	struct list_head b_io;		/* parked for writeback */
	struct list_head b_more_io;	/* parked for more writeback */
	spinlock_t list_lock;		/* protects the b_* lists */
};

struct backing_dev_info {
@@ -106,6 +107,7 @@ int bdi_writeback_thread(void *data);
int bdi_has_dirty_io(struct backing_dev_info *bdi);
void bdi_arm_supers_timer(void);
void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);

extern spinlock_t bdi_lock;
extern struct list_head bdi_list;
+0 −2
Original line number Diff line number Diff line
@@ -9,8 +9,6 @@

struct backing_dev_info;

extern spinlock_t inode_wb_list_lock;

/*
 * fs/fs-writeback.c
 */
Loading