Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 7fc5854f authored by Tejun Heo's avatar Tejun Heo Committed by Jens Axboe
Browse files

writeback: synchronize sync(2) against cgroup writeback membership switches



sync_inodes_sb() can race against cgwb (cgroup writeback) membership
switches and fail to writeback some inodes.  For example, if an inode
switches to another wb while sync_inodes_sb() is in progress, the new
wb might not be visible to bdi_split_work_to_wbs() at all or the inode
might jump from a wb which hasn't issued writebacks yet to one which
already has.

This patch adds backing_dev_info->wb_switch_rwsem to synchronize cgwb
switch path against sync_inodes_sb() so that sync_inodes_sb() is
guaranteed to see all the target wbs and inodes can't jump wbs to
escape syncing.

v2: Fixed misplaced rwsem init.  Spotted by Jiufei.

Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Reported-by: default avatarJiufei Xue <xuejiufei@gmail.com>
Link: http://lkml.kernel.org/r/dc694ae2-f07f-61e1-7097-7c8411cee12d@gmail.com


Acked-by: default avatarJan Kara <jack@suse.cz>
Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent 698cef17
Loading
Loading
Loading
Loading
+38 −2
Original line number Diff line number Diff line
@@ -331,11 +331,22 @@ struct inode_switch_wbs_context {
	struct work_struct	work;
};

static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
	down_write(&bdi->wb_switch_rwsem);
}

static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
{
	up_write(&bdi->wb_switch_rwsem);
}

static void inode_switch_wbs_work_fn(struct work_struct *work)
{
	struct inode_switch_wbs_context *isw =
		container_of(work, struct inode_switch_wbs_context, work);
	struct inode *inode = isw->inode;
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	struct address_space *mapping = inode->i_mapping;
	struct bdi_writeback *old_wb = inode->i_wb;
	struct bdi_writeback *new_wb = isw->new_wb;
@@ -343,6 +354,12 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
	struct page *page;
	bool switched = false;

	/*
	 * If @inode switches cgwb membership while sync_inodes_sb() is
	 * being issued, sync_inodes_sb() might miss it.  Synchronize.
	 */
	down_read(&bdi->wb_switch_rwsem);

	/*
	 * By the time control reaches here, RCU grace period has passed
	 * since I_WB_SWITCH assertion and all wb stat update transactions
@@ -428,6 +445,8 @@ static void inode_switch_wbs_work_fn(struct work_struct *work)
	spin_unlock(&new_wb->list_lock);
	spin_unlock(&old_wb->list_lock);

	up_read(&bdi->wb_switch_rwsem);

	if (switched) {
		wb_wakeup(new_wb);
		wb_put(old_wb);
@@ -468,9 +487,18 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
	if (inode->i_state & I_WB_SWITCH)
		return;

	/*
	 * Avoid starting new switches while sync_inodes_sb() is in
	 * progress.  Otherwise, if the down_write protected issue path
	 * blocks heavily, we might end up starting a large number of
	 * switches which will block on the rwsem.
	 */
	if (!down_read_trylock(&bdi->wb_switch_rwsem))
		return;

	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
	if (!isw)
		return;
		goto out_unlock;

	/* find and pin the new wb */
	rcu_read_lock();
@@ -504,12 +532,14 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id)
	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
	 */
	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
	return;
	goto out_unlock;

out_free:
	if (isw->new_wb)
		wb_put(isw->new_wb);
	kfree(isw);
out_unlock:
	up_read(&bdi->wb_switch_rwsem);
}

/**
@@ -887,6 +917,9 @@ fs_initcall(cgroup_writeback_init);

#else	/* CONFIG_CGROUP_WRITEBACK */

static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }

static struct bdi_writeback *
locked_inode_to_wb_and_lock_list(struct inode *inode)
	__releases(&inode->i_lock)
@@ -2413,8 +2446,11 @@ void sync_inodes_sb(struct super_block *sb)
		return;
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
	bdi_down_write_wb_switch_rwsem(bdi);
	bdi_split_work_to_wbs(bdi, &work, false);
	wb_wait_for_completion(bdi, &done);
	bdi_up_write_wb_switch_rwsem(bdi);

	wait_sb_inodes(sb);
}
+1 −0
Original line number Diff line number Diff line
@@ -190,6 +190,7 @@ struct backing_dev_info {
	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
	struct rb_root cgwb_congested_tree; /* their congested states */
	struct mutex cgwb_release_mutex;  /* protect shutdown of wb structs */
	struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
#else
	struct bdi_writeback_congested *wb_congested;
#endif
+1 −0
Original line number Diff line number Diff line
@@ -689,6 +689,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
	bdi->cgwb_congested_tree = RB_ROOT;
	mutex_init(&bdi->cgwb_release_mutex);
	init_rwsem(&bdi->wb_switch_rwsem);

	ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
	if (!ret) {