Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 66f3b8e2 authored by Jens Axboe
Browse files

writeback: move dirty inodes from super_block to backing_dev_info



This is a first step at introducing per-bdi flusher threads. We should
have no change in behaviour, although sb_has_dirty_inodes() is now
ridiculously expensive, as there's no easy way to answer that question.
Not a huge problem, since it'll be deleted in subsequent patches.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
parent d8a8559c
Loading
Loading
Loading
Loading
+127 −70
Original line number Diff line number Diff line
@@ -25,6 +25,7 @@
#include <linux/buffer_head.h>
#include "internal.h"

#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)

/**
 * writeback_acquire - attempt to get exclusive writeback access to a device
@@ -165,12 +166,13 @@ void __mark_inode_dirty(struct inode *inode, int flags)
			goto out;

		/*
		 * If the inode was already on s_dirty/s_io/s_more_io, don't
		 * reposition it (that would break s_dirty time-ordering).
		 * If the inode was already on b_dirty/b_io/b_more_io, don't
		 * reposition it (that would break b_dirty time-ordering).
		 */
		if (!was_dirty) {
			inode->dirtied_when = jiffies;
			list_move(&inode->i_list, &sb->s_dirty);
			list_move(&inode->i_list,
					&inode_to_bdi(inode)->b_dirty);
		}
	}
out:
@@ -191,31 +193,30 @@ static int write_inode(struct inode *inode, int sync)
 * furthest end of its backing device's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the s_dirty list.  If that is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct backing_dev_info *bdi = inode_to_bdi(inode);

	if (!list_empty(&sb->s_dirty)) {
		struct inode *tail_inode;
	if (!list_empty(&bdi->b_dirty)) {
		struct inode *tail;

		tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list);
		if (time_before(inode->dirtied_when,
				tail_inode->dirtied_when))
		tail = list_entry(bdi->b_dirty.next, struct inode, i_list);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
	list_move(&inode->i_list, &sb->s_dirty);
	list_move(&inode->i_list, &bdi->b_dirty);
}

/*
 * requeue inode for re-scanning after sb->s_io list is exhausted.
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode)
{
	list_move(&inode->i_list, &inode->i_sb->s_more_io);
	list_move(&inode->i_list, &inode_to_bdi(inode)->b_more_io);
}

static void inode_sync_complete(struct inode *inode)
@@ -262,18 +263,50 @@ static void move_expired_inodes(struct list_head *delaying_queue,
/*
 * Queue all expired dirty inodes for io, eldest first.
 */
static void queue_io(struct super_block *sb,
static void queue_io(struct backing_dev_info *bdi,
		     unsigned long *older_than_this)
{
	list_splice_init(&sb->s_more_io, sb->s_io.prev);
	move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
	list_splice_init(&bdi->b_more_io, bdi->b_io.prev);
	move_expired_inodes(&bdi->b_dirty, &bdi->b_io, older_than_this);
}

/*
 * Report whether any inode linked on @list (through its i_list member)
 * belongs to super block @sb.  The list is walked with inode_lock held.
 *
 * Returns 1 as soon as a matching inode is found, 0 if none is.
 */
static int sb_on_inode_list(struct super_block *sb, struct list_head *list)
{
	struct inode *pos;
	int found = 0;

	spin_lock(&inode_lock);
	list_for_each_entry(pos, list, i_list) {
		if (pos->i_sb != sb)
			continue;
		/* first hit is enough, no need to scan the rest */
		found = 1;
		break;
	}
	spin_unlock(&inode_lock);

	return found;
}

int sb_has_dirty_inodes(struct super_block *sb)
{
	return !list_empty(&sb->s_dirty) ||
	       !list_empty(&sb->s_io) ||
	       !list_empty(&sb->s_more_io);
	struct backing_dev_info *bdi;
	int ret = 0;

	/*
	 * This is REALLY expensive right now, but it'll go away
	 * when the bdi writeback is introduced
	 */
	mutex_lock(&bdi_lock);
	list_for_each_entry(bdi, &bdi_list, bdi_list) {
		if (sb_on_inode_list(sb, &bdi->b_dirty) ||
		    sb_on_inode_list(sb, &bdi->b_io) ||
		    sb_on_inode_list(sb, &bdi->b_more_io)) {
			ret = 1;
			break;
		}
	}
	mutex_unlock(&bdi_lock);

	return ret;
}
EXPORT_SYMBOL(sb_has_dirty_inodes);

@@ -322,11 +355,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
	if (inode->i_state & I_SYNC) {
		/*
		 * If this inode is locked for writeback and we are not doing
		 * writeback-for-data-integrity, move it to s_more_io so that
		 * writeback-for-data-integrity, move it to b_more_io so that
		 * writeback can proceed with the other inodes on b_io.
		 *
		 * We'll have another go at writing back this inode when we
		 * completed a full scan of s_io.
		 * completed a full scan of b_io.
		 */
		if (!wait) {
			requeue_io(inode);
@@ -371,11 +404,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
			/*
			 * We didn't write back all the pages.  nfs_writepages()
			 * sometimes bails out without doing anything. Redirty
			 * the inode; Move it from s_io onto s_more_io/s_dirty.
			 * the inode; Move it from b_io onto b_more_io/b_dirty.
			 */
			/*
			 * akpm: if the caller was the kupdate function we put
			 * this inode at the head of s_dirty so it gets first
			 * this inode at the head of b_dirty so it gets first
			 * consideration.  Otherwise, move it to the tail, for
			 * the reasons described there.  I'm not really sure
			 * how much sense this makes.  Presumably I had a good
@@ -385,7 +418,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
			if (wbc->for_kupdate) {
				/*
				 * For the kupdate function we move the inode
				 * to s_more_io so it will get more writeout as
				 * to b_more_io so it will get more writeout as
				 * soon as the queue becomes uncongested.
				 */
				inode->i_state |= I_DIRTY_PAGES;
@@ -433,51 +466,34 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
	return ret;
}

/*
 * Write out a superblock's list of dirty inodes.  A wait will be performed
 * upon no inodes, all inodes or the final one, depending upon sync_mode.
 *
 * If older_than_this is non-NULL, then only write out inodes which
 * had their first dirtying at a time earlier than *older_than_this.
 *
 * If we're a pdflush thread, then implement pdflush collision avoidance
 * against the entire list.
 *
 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 * This function assumes that the blockdev superblock's inodes are backed by
 * a variety of queues, so all inodes are searched.  For other superblocks,
 * assume that all inodes are backed by the same queue.
 *
 * FIXME: this linear search could get expensive with many fileystems.  But
 * how to fix?  We need to go from an address_space to all inodes which share
 * a queue with that address_space.  (Easy: have a global "dirty superblocks"
 * list).
 *
 * The inodes to be written are parked on sb->s_io.  They are moved back onto
 * sb->s_dirty as they are selected for writing.  This way, none can be missed
 * on the writer throttling path, and we get decent balancing between many
 * throttled threads: we don't want them all piling up on inode_sync_wait.
 */
static void generic_sync_sb_inodes(struct super_block *sb,
				   struct writeback_control *wbc)
static void generic_sync_bdi_inodes(struct backing_dev_info *bdi,
				    struct writeback_control *wbc,
				    struct super_block *sb)
{
	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
	const unsigned long start = jiffies;	/* livelock avoidance */
	int sync = wbc->sync_mode == WB_SYNC_ALL;

	spin_lock(&inode_lock);
	if (!wbc->for_kupdate || list_empty(&sb->s_io))
		queue_io(sb, wbc->older_than_this);

	while (!list_empty(&sb->s_io)) {
		struct inode *inode = list_entry(sb->s_io.prev,
	if (!wbc->for_kupdate || list_empty(&bdi->b_io))
		queue_io(bdi, wbc->older_than_this);

	while (!list_empty(&bdi->b_io)) {
		struct inode *inode = list_entry(bdi->b_io.prev,
						struct inode, i_list);
		struct address_space *mapping = inode->i_mapping;
		struct backing_dev_info *bdi = mapping->backing_dev_info;
		long pages_skipped;

		/*
		 * super block given and doesn't match, skip this inode
		 */
		if (sb && sb != inode->i_sb) {
			redirty_tail(inode);
			continue;
		}

		if (!bdi_cap_writeback_dirty(bdi)) {
			redirty_tail(inode);
			if (sb_is_blkdev_sb(sb)) {
			if (is_blkdev_sb) {
				/*
				 * Dirty memory-backed blockdev: the ramdisk
				 * driver does this.  Skip just this inode
@@ -499,14 +515,14 @@ static void generic_sync_sb_inodes(struct super_block *sb,

		if (wbc->nonblocking && bdi_write_congested(bdi)) {
			wbc->encountered_congestion = 1;
			if (!sb_is_blkdev_sb(sb))
			if (!is_blkdev_sb)
				break;		/* Skip a congested fs */
			requeue_io(inode);
			continue;		/* Skip a congested blockdev */
		}

		if (wbc->bdi && bdi != wbc->bdi) {
			if (!sb_is_blkdev_sb(sb))
			if (!is_blkdev_sb)
				break;		/* fs has the wrong queue */
			requeue_io(inode);
			continue;		/* blockdev has wrong queue */
@@ -544,13 +560,57 @@ static void generic_sync_sb_inodes(struct super_block *sb,
			wbc->more_io = 1;
			break;
		}
		if (!list_empty(&sb->s_more_io))
		if (!list_empty(&bdi->b_more_io))
			wbc->more_io = 1;
	}

	if (sync) {
	spin_unlock(&inode_lock);
	/* Leave any unwritten inodes on b_io */
}

/*
 * Write out a superblock's list of dirty inodes.  A wait will be performed
 * upon no inodes, all inodes or the final one, depending upon sync_mode.
 *
 * If older_than_this is non-NULL, then only write out inodes which
 * had their first dirtying at a time earlier than *older_than_this.
 *
 * If we're a pdflush thread, then implement pdflush collision avoidance
 * against the entire list.
 *
 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 * This function assumes that the blockdev superblock's inodes are backed by
 * a variety of queues, so all inodes are searched.  For other superblocks,
 * assume that all inodes are backed by the same queue.
 *
 * FIXME: this linear search could get expensive with many filesystems.  But
 * how to fix?  We need to go from an address_space to all inodes which share
 * a queue with that address_space.  (Easy: have a global "dirty superblocks"
 * list).
 *
 * The inodes to be written are parked on bdi->b_io.  They are moved back onto
 * bdi->b_dirty as they are selected for writing.  This way, none can be missed
 * on the writer throttling path, and we get decent balancing between many
 * throttled threads: we don't want them all piling up on inode_sync_wait.
 */
static void generic_sync_sb_inodes(struct super_block *sb,
				   struct writeback_control *wbc)
{
	struct backing_dev_info *bdi;

	if (!wbc->bdi) {
		mutex_lock(&bdi_lock);
		list_for_each_entry(bdi, &bdi_list, bdi_list)
			generic_sync_bdi_inodes(bdi, wbc, sb);
		mutex_unlock(&bdi_lock);
	} else
		generic_sync_bdi_inodes(wbc->bdi, wbc, sb);

	if (wbc->sync_mode == WB_SYNC_ALL) {
		struct inode *inode, *old_inode = NULL;

		spin_lock(&inode_lock);

		/*
		 * Data integrity sync. Must wait for all pages under writeback,
		 * because there may have been pages dirtied before our sync
@@ -588,10 +648,7 @@ static void generic_sync_sb_inodes(struct super_block *sb,
		}
		spin_unlock(&inode_lock);
		iput(old_inode);
	} else
		spin_unlock(&inode_lock);

	return;		/* Leave any unwritten inodes on s_io */
	}
}

/*
@@ -599,8 +656,8 @@ static void generic_sync_sb_inodes(struct super_block *sb,
 *
 * Note:
 * We don't need to grab a reference to superblock here. If it has non-empty
 * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed
 * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all
 * ->b_dirty it hasn't been killed yet and kill_super() won't proceed
 * past sync_inodes_sb() until the ->b_dirty/b_io/b_more_io lists are all
 * empty. Since __sync_single_inode() regains inode_lock before it finally moves
 * inode from superblock lists we are OK.
 *
+0 −3
Original line number Diff line number Diff line
@@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
			s = NULL;
			goto out;
		}
		INIT_LIST_HEAD(&s->s_dirty);
		INIT_LIST_HEAD(&s->s_io);
		INIT_LIST_HEAD(&s->s_more_io);
		INIT_LIST_HEAD(&s->s_files);
		INIT_LIST_HEAD(&s->s_instances);
		INIT_HLIST_HEAD(&s->s_anon);
+9 −0
Original line number Diff line number Diff line
@@ -40,6 +40,8 @@ enum bdi_stat_item {
#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))

struct backing_dev_info {
	struct list_head bdi_list;

	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
	unsigned long state;	/* Always use atomic bitops on this */
	unsigned int capabilities; /* Device capabilities */
@@ -58,6 +60,10 @@ struct backing_dev_info {

	struct device *dev;

	struct list_head	b_dirty;	/* dirty inodes */
	struct list_head	b_io;		/* parked for writeback */
	struct list_head	b_more_io;	/* parked for more writeback */

#ifdef CONFIG_DEBUG_FS
	struct dentry *debug_dir;
	struct dentry *debug_stats;
@@ -72,6 +78,9 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
void bdi_unregister(struct backing_dev_info *bdi);

extern struct mutex bdi_lock;
extern struct list_head bdi_list;

static inline void __add_bdi_stat(struct backing_dev_info *bdi,
		enum bdi_stat_item item, s64 amount)
{
+1 −4
Original line number Diff line number Diff line
@@ -715,7 +715,7 @@ struct posix_acl;

struct inode {
	struct hlist_node	i_hash;
	struct list_head	i_list;
	struct list_head	i_list;		/* backing dev IO list */
	struct list_head	i_sb_list;
	struct list_head	i_dentry;
	unsigned long		i_ino;
@@ -1336,9 +1336,6 @@ struct super_block {
	struct xattr_handler	**s_xattr;

	struct list_head	s_inodes;	/* all inodes */
	struct list_head	s_dirty;	/* dirty inodes */
	struct list_head	s_io;		/* parked for writeback */
	struct list_head	s_more_io;	/* parked for more writeback */
	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
	struct list_head	s_files;
	/* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
+24 −0
Original line number Diff line number Diff line
@@ -22,6 +22,8 @@ struct backing_dev_info default_backing_dev_info = {
EXPORT_SYMBOL_GPL(default_backing_dev_info);

static struct class *bdi_class;
DEFINE_MUTEX(bdi_lock);
LIST_HEAD(bdi_list);

#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
@@ -211,6 +213,10 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
		goto exit;
	}

	mutex_lock(&bdi_lock);
	list_add_tail(&bdi->bdi_list, &bdi_list);
	mutex_unlock(&bdi_lock);

	bdi->dev = dev;
	bdi_debug_register(bdi, dev_name(dev));

@@ -225,9 +231,17 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
}
EXPORT_SYMBOL(bdi_register_dev);

/*
 * Unlink @bdi from the global bdi_list.  The removal is done with
 * bdi_lock held.
 */
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
	struct list_head *entry = &bdi->bdi_list;

	mutex_lock(&bdi_lock);
	list_del(entry);
	mutex_unlock(&bdi_lock);
}

void bdi_unregister(struct backing_dev_info *bdi)
{
	if (bdi->dev) {
		bdi_remove_from_list(bdi);
		bdi_debug_unregister(bdi);
		device_unregister(bdi->dev);
		bdi->dev = NULL;
@@ -245,6 +259,10 @@ int bdi_init(struct backing_dev_info *bdi)
	bdi->min_ratio = 0;
	bdi->max_ratio = 100;
	bdi->max_prop_frac = PROP_FRAC_BASE;
	INIT_LIST_HEAD(&bdi->bdi_list);
	INIT_LIST_HEAD(&bdi->b_io);
	INIT_LIST_HEAD(&bdi->b_dirty);
	INIT_LIST_HEAD(&bdi->b_more_io);

	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
@@ -259,6 +277,8 @@ int bdi_init(struct backing_dev_info *bdi)
err:
		while (i--)
			percpu_counter_destroy(&bdi->bdi_stat[i]);

		bdi_remove_from_list(bdi);
	}

	return err;
@@ -269,6 +289,10 @@ void bdi_destroy(struct backing_dev_info *bdi)
{
	int i;

	WARN_ON(!list_empty(&bdi->b_dirty));
	WARN_ON(!list_empty(&bdi->b_io));
	WARN_ON(!list_empty(&bdi->b_more_io));

	bdi_unregister(bdi);

	for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
Loading