
Commit 6d7f18f6 authored by Linus Torvalds

Merge branch 'writeback' of git://git.kernel.dk/linux-2.6-block

* 'writeback' of git://git.kernel.dk/linux-2.6-block:
  writeback: writeback_inodes_sb() should use bdi_start_writeback()
  writeback: don't delay inodes redirtied by a fast dirtier
  writeback: make the super_block pinning more efficient
  writeback: don't resort for a single super_block in move_expired_inodes()
  writeback: move inodes from one super_block together
  writeback: get rid of incorrect references to pdflush in comments
  writeback: improve readability of the wb_writeback() continue/break logic
  writeback: cleanup writeback_single_inode()
  writeback: kupdate writeback shall not stop when more io is possible
  writeback: stop background writeback when below background threshold
  writeback: balance_dirty_pages() shall write more than dirtied pages
  fs: Fix busyloop in wb_writeback()
parents 53cddfcc 56a131dc
fs/buffer.c +5 −5
@@ -280,7 +280,7 @@ void invalidate_bdev(struct block_device *bdev)
 EXPORT_SYMBOL(invalidate_bdev);
 
 /*
- * Kick pdflush then try to free up some ZONE_NORMAL memory.
+ * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
  */
 static void free_more_memory(void)
 {
@@ -1709,9 +1709,9 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 		/*
 		 * If it's a fully non-blocking write attempt and we cannot
 		 * lock the buffer then redirty the page.  Note that this can
-		 * potentially cause a busy-wait loop from pdflush and kswapd
-		 * activity, but those code paths have their own higher-level
-		 * throttling.
+		 * potentially cause a busy-wait loop from writeback threads
+		 * and kswapd activity, but those code paths have their own
+		 * higher-level throttling.
 		 */
 		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
 			lock_buffer(bh);
@@ -3208,7 +3208,7 @@ EXPORT_SYMBOL(block_sync_page);
  * still running obsolete flush daemons, so we terminate them here.
  *
  * Use of bdflush() is deprecated and will be removed in a future kernel.
- * The `pdflush' kernel threads fully replace bdflush daemons and this call.
+ * The `flush-X' kernel threads fully replace bdflush daemons and this call.
  */
 SYSCALL_DEFINE2(bdflush, int, func, long, data)
 {
fs/fs-writeback.c +113 −48
@@ -41,8 +41,9 @@ struct wb_writeback_args {
 	long nr_pages;
 	struct super_block *sb;
 	enum writeback_sync_modes sync_mode;
-	int for_kupdate;
-	int range_cyclic;
+	int for_kupdate:1;
+	int range_cyclic:1;
+	int for_background:1;
 };
 
 /*
@@ -257,6 +258,15 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 		.range_cyclic	= 1,
 	};
 
+	/*
+	 * We treat @nr_pages=0 as the special case to do background writeback,
+	 * ie. to sync pages until the background dirty threshold is reached.
+	 */
+	if (!nr_pages) {
+		args.nr_pages = LONG_MAX;
+		args.for_background = 1;
+	}
+
 	bdi_alloc_queue_work(bdi, &args);
 }
 
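The special case above means bdi_start_writeback() now serves two purposes from one entry point. A minimal sketch of two hypothetical call sites (illustrative only, not from this patch):

	bdi_start_writeback(bdi, 0);	/* background writeback: nr_pages becomes
					 * LONG_MAX and for_background is set, so
					 * flushing runs until the bdi drops below
					 * the background dirty threshold */

	bdi_start_writeback(bdi, 1024);	/* sized writeback: write out up to
					 * 1024 pages, then stop */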
@@ -310,7 +320,7 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
 	 * For inodes being constantly redirtied, dirtied_when can get stuck.
 	 * It _appears_ to be in the future, but is actually in distant past.
 	 * This test is necessary to prevent such wrapped-around relative times
-	 * from permanently stopping the whole pdflush writeback.
+	 * from permanently stopping the whole bdi writeback.
 	 */
 	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
 #endif
@@ -324,15 +334,40 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 			       struct list_head *dispatch_queue,
 				unsigned long *older_than_this)
 {
+	LIST_HEAD(tmp);
+	struct list_head *pos, *node;
+	struct super_block *sb = NULL;
+	struct inode *inode;
+	int do_sb_sort = 0;
+
 	while (!list_empty(delaying_queue)) {
-		struct inode *inode = list_entry(delaying_queue->prev,
-						struct inode, i_list);
+		inode = list_entry(delaying_queue->prev, struct inode, i_list);
 		if (older_than_this &&
 		    inode_dirtied_after(inode, *older_than_this))
 			break;
-		list_move(&inode->i_list, dispatch_queue);
+		if (sb && sb != inode->i_sb)
+			do_sb_sort = 1;
+		sb = inode->i_sb;
+		list_move(&inode->i_list, &tmp);
+	}
+
+	/* just one sb in list, splice to dispatch_queue and we're done */
+	if (!do_sb_sort) {
+		list_splice(&tmp, dispatch_queue);
+		return;
+	}
+
+	/* Move inodes from one superblock together */
+	while (!list_empty(&tmp)) {
+		inode = list_entry(tmp.prev, struct inode, i_list);
+		sb = inode->i_sb;
+		list_for_each_prev_safe(pos, node, &tmp) {
+			inode = list_entry(pos, struct inode, i_list);
+			if (inode->i_sb == sb)
+				list_move(&inode->i_list, dispatch_queue);
+		}
 	}
 }
 
 /*
  * Queue all expired dirty inodes for io, eldest first.
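The rewritten move_expired_inodes() avoids a real sort: expired inodes are first drained onto a temporary list, and only when more than one super_block was seen does a second pass peel off all inodes of one sb at a time. A stand-alone user-space sketch of the same grouping idea, using an array of hypothetical (sb, ino) pairs in place of the kernel's list_heads (the kernel walks tail-first; this sketch walks head-first):

	#include <stdio.h>

	struct ent { int sb; int ino; };	/* stand-ins for inode->i_sb / inode */

	/* Group entries by sb, preserving relative order, without sorting. */
	static void group_by_sb(const struct ent *in, int n, struct ent *out)
	{
		int done[16] = {0};		/* which input slots were consumed */
		int k = 0;

		for (int i = 0; i < n; i++) {
			if (done[i])
				continue;
			int sb = in[i].sb;	/* next ungrouped sb */
			for (int j = i; j < n; j++)	/* sweep: take all on that sb */
				if (!done[j] && in[j].sb == sb) {
					out[k++] = in[j];
					done[j] = 1;
				}
		}
	}

	int main(void)
	{
		struct ent q[] = { {1,10}, {2,20}, {1,11}, {2,21}, {1,12} };
		struct ent o[5];

		group_by_sb(q, 5, o);
		for (int i = 0; i < 5; i++)	/* prints sb = 1,1,1,2,2: grouped */
			printf("sb=%d ino=%d\n", o[i].sb, o[i].ino);
		return 0;
	}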
@@ -439,8 +474,18 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	spin_lock(&inode_lock);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
-		if (!(inode->i_state & I_DIRTY) &&
-		    mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
+		if ((inode->i_state & I_DIRTY_PAGES) && wbc->for_kupdate) {
+			/*
+			 * More pages get dirtied by a fast dirtier.
+			 */
+			goto select_queue;
+		} else if (inode->i_state & I_DIRTY) {
+			/*
+			 * At least XFS will redirty the inode during the
+			 * writeback (delalloc) and on io completion (isize).
+			 */
+			redirty_tail(inode);
+		} else if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 			/*
 			 * We didn't write back all the pages.  nfs_writepages()
 			 * sometimes bales out without doing anything. Redirty
@@ -462,6 +507,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 				 * soon as the queue becomes uncongested.
 				 */
 				inode->i_state |= I_DIRTY_PAGES;
+select_queue:
 				if (wbc->nr_to_write <= 0) {
 					/*
 					 * slice used up: queue for next turn
@@ -484,12 +530,6 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 				inode->i_state |= I_DIRTY_PAGES;
 				redirty_tail(inode);
 			}
-		} else if (inode->i_state & I_DIRTY) {
-			/*
-			 * Someone redirtied the inode while were writing back
-			 * the pages.
-			 */
-			redirty_tail(inode);
 		} else if (atomic_read(&inode->i_count)) {
 			/*
 			 * The inode is clean, inuse
@@ -506,6 +546,17 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
+static void unpin_sb_for_writeback(struct super_block **psb)
+{
+	struct super_block *sb = *psb;
+
+	if (sb) {
+		up_read(&sb->s_umount);
+		put_super(sb);
+		*psb = NULL;
+	}
+}
+
 /*
  * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
  * before calling writeback. So make sure that we do pin it, so it doesn't
@@ -515,10 +566,19 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
  * 1 if we failed.
  */
 static int pin_sb_for_writeback(struct writeback_control *wbc,
-				   struct inode *inode)
+				struct inode *inode, struct super_block **psb)
 {
 	struct super_block *sb = inode->i_sb;
 
+	/*
+	 * If this sb is already pinned, nothing more to do. If not and
+	 * *psb is non-NULL, unpin the old one first
+	 */
+	if (sb == *psb)
+		return 0;
+	else if (*psb)
+		unpin_sb_for_writeback(psb);
+
 	/*
 	 * Caller must already hold the ref for this
 	 */
@@ -532,7 +592,7 @@ static int pin_sb_for_writeback(struct writeback_control *wbc,
 	if (down_read_trylock(&sb->s_umount)) {
 		if (sb->s_root) {
 			spin_unlock(&sb_lock);
-			return 0;
+			goto pinned;
 		}
 		/*
 		 * umounted, drop rwsem again and fall through to failure
@@ -543,24 +603,15 @@ static int pin_sb_for_writeback(struct writeback_control *wbc,
 	sb->s_count--;
 	spin_unlock(&sb_lock);
 	return 1;
-}
-
-static void unpin_sb_for_writeback(struct writeback_control *wbc,
-				   struct inode *inode)
-{
-	struct super_block *sb = inode->i_sb;
-
-	if (wbc->sync_mode == WB_SYNC_ALL)
-		return;
 
-	up_read(&sb->s_umount);
-	put_super(sb);
+pinned:
+	*psb = sb;
+	return 0;
 }
 
 static void writeback_inodes_wb(struct bdi_writeback *wb,
 				struct writeback_control *wbc)
 {
-	struct super_block *sb = wbc->sb;
+	struct super_block *sb = wbc->sb, *pin_sb = NULL;
 	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
 	const unsigned long start = jiffies;	/* livelock avoidance */

@@ -619,7 +670,7 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 		if (inode_dirtied_after(inode, start))
 			break;
 
-		if (pin_sb_for_writeback(wbc, inode)) {
+		if (pin_sb_for_writeback(wbc, inode, &pin_sb)) {
 			requeue_io(inode);
 			continue;
 		}
@@ -628,7 +679,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 		__iget(inode);
 		pages_skipped = wbc->pages_skipped;
 		writeback_single_inode(inode, wbc);
-		unpin_sb_for_writeback(wbc, inode);
 		if (wbc->pages_skipped != pages_skipped) {
 			/*
 			 * writeback is not making progress due to locked
@@ -648,6 +698,8 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 			wbc->more_io = 1;
 	}
 
+	unpin_sb_for_writeback(&pin_sb);
+
 	spin_unlock(&inode_lock);
 	/* Leave any unwritten inodes on b_io */
 }
@@ -706,6 +758,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
+	struct inode *inode;
 
 	if (wbc.for_kupdate) {
 		wbc.older_than_this = &oldest_jif;
@@ -719,20 +772,16 @@ static long wb_writeback(struct bdi_writeback *wb,

 	for (;;) {
 		/*
-		 * Don't flush anything for non-integrity writeback where
-		 * no nr_pages was given
+		 * Stop writeback when nr_pages has been consumed
 		 */
-		if (!args->for_kupdate && args->nr_pages <= 0 &&
-		     args->sync_mode == WB_SYNC_NONE)
+		if (args->nr_pages <= 0)
 			break;
 
 		/*
-		 * If no specific pages were given and this is just a
-		 * periodic background writeout and we are below the
-		 * background dirty threshold, don't do anything
+		 * For background writeout, stop when we are below the
+		 * background dirty threshold
 		 */
-		if (args->for_kupdate && args->nr_pages <= 0 &&
-		    !over_bground_thresh())
+		if (args->for_background && !over_bground_thresh())
 			break;
 
 		wbc.more_io = 0;
@@ -744,13 +793,32 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
 		/*
-		 * If we ran out of stuff to write, bail unless more_io got set
+		 * If we consumed everything, see if we have more
 		 */
-		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
-			if (wbc.more_io && !wbc.for_kupdate)
-				continue;
-			break;
-		}
+		if (wbc.nr_to_write <= 0)
+			continue;
+		/*
+		 * Didn't write everything and we don't have more IO, bail
+		 */
+		if (!wbc.more_io)
+			break;
+		/*
+		 * Did we write something? Try for more
+		 */
+		if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+			continue;
+		/*
+		 * Nothing written. Wait for some inode to
+		 * become available for writeback. Otherwise
+		 * we'll just busyloop.
+		 */
+		spin_lock(&inode_lock);
+		if (!list_empty(&wb->b_more_io))  {
+			inode = list_entry(wb->b_more_io.prev,
+						struct inode, i_list);
+			inode_wait_for_writeback(inode);
+		}
+		spin_unlock(&inode_lock);
 	}
 
 	return wrote;
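Stripped to its control flow, the new loop makes one decision per chunk written; a condensed paraphrase of the logic above (not the literal kernel code):

	for (;;) {
		if (args->nr_pages <= 0)
			break;			/* page budget consumed */
		if (args->for_background && !over_bground_thresh())
			break;			/* background goal reached */

		/* ... write one MAX_WRITEBACK_PAGES chunk ... */

		if (wbc.nr_to_write <= 0)
			continue;		/* wrote a full chunk: go again */
		if (!wbc.more_io)
			break;			/* nothing left anywhere: done */
		if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
			continue;		/* partial progress: try for more */

		/* no progress at all: sleep on an inode from b_more_io
		 * instead of spinning (the busyloop fix in this series) */
	}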
@@ -1060,9 +1128,6 @@ EXPORT_SYMBOL(__mark_inode_dirty);
  * If older_than_this is non-NULL, then only write out inodes which
  * had their first dirtying at a time earlier than *older_than_this.
  *
- * If we're a pdlfush thread, then implement pdflush collision avoidance
- * against the entire list.
- *
  * If `bdi' is non-zero then we're being asked to writeback a specific queue.
  * This function assumes that the blockdev superblock's inodes are backed by
  * a variety of queues, so all inodes are searched.  For other superblocks,
@@ -1141,7 +1206,7 @@ void writeback_inodes_sb(struct super_block *sb)
 	nr_to_write = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-	bdi_writeback_all(sb, nr_to_write);
+	bdi_start_writeback(sb->s_bdi, nr_to_write);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
mm/page-writeback.c +17 −13
@@ -44,18 +44,21 @@ static long ratelimit_pages = 32;
 /*
  * When balance_dirty_pages decides that the caller needs to perform some
  * non-background writeback, this is how many pages it will attempt to write.
- * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
+ * It should be somewhat larger than dirtied pages to ensure that reasonably
  * large amounts of I/O are submitted.
  */
-static inline long sync_writeback_pages(void)
+static inline long sync_writeback_pages(unsigned long dirtied)
 {
-	return ratelimit_pages + ratelimit_pages / 2;
+	if (dirtied < ratelimit_pages)
+		dirtied = ratelimit_pages;
+
+	return dirtied + dirtied / 2;
 }
 
 /* The following parameters are exported via /proc/sys/vm */
 
 /*
- * Start background writeback (via pdflush) at this percentage
+ * Start background writeback (via writeback threads) at this percentage
  */
 int dirty_background_ratio = 10;
 
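With the static default ratelimit_pages = 32 shown in the hunk header (it is recalculated at runtime from the machine's memory size), the new scaling works out as, for example:

	/*
	 * sync_writeback_pages(8)    -> 48	(floored at ratelimit_pages,
	 *					 then scaled by 1.5)
	 * sync_writeback_pages(1000) -> 1500	(1.5x the pages actually dirtied)
	 */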
@@ -474,10 +477,11 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
  * the caller to perform writeback if the system is over `vm_dirty_ratio'.
- * If we're over `background_thresh' then pdflush is woken to perform some
- * writeout.
+ * If we're over `background_thresh' then the writeback threads are woken to
+ * perform some writeout.
  */
-static void balance_dirty_pages(struct address_space *mapping)
+static void balance_dirty_pages(struct address_space *mapping,
+				unsigned long write_chunk)
 {
 	long nr_reclaimable, bdi_nr_reclaimable;
 	long nr_writeback, bdi_nr_writeback;
@@ -485,7 +489,6 @@ static void balance_dirty_pages(struct address_space *mapping)
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
 	unsigned long pages_written = 0;
-	unsigned long write_chunk = sync_writeback_pages();
 	unsigned long pause = 1;
 
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
@@ -579,7 +582,7 @@ static void balance_dirty_pages(struct address_space *mapping)
 		bdi->dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
-		return;		/* pdflush is already working this queue */
+		return;
 
 	/*
 	 * In laptop mode, we wait until hitting the higher threshold before
@@ -590,10 +593,10 @@ static void balance_dirty_pages(struct address_space *mapping)
 	 * background_thresh, to keep the amount of dirty memory low.
 	 */
 	if ((laptop_mode && pages_written) ||
-	    (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
+	    (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
 			       + global_page_state(NR_UNSTABLE_NFS))
 					  > background_thresh)))
-		bdi_start_writeback(bdi, nr_writeback);
+		bdi_start_writeback(bdi, 0);
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -640,9 +643,10 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 	p =  &__get_cpu_var(bdp_ratelimits);
 	*p += nr_pages_dirtied;
 	if (unlikely(*p >= ratelimit)) {
+		ratelimit = sync_writeback_pages(*p);
 		*p = 0;
 		preempt_enable();
-		balance_dirty_pages(mapping);
+		balance_dirty_pages(mapping, ratelimit);
 		return;
 	}
 	preempt_enable();
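Here *p is this CPU's count of pages dirtied since it last triggered balancing, so the write chunk now tracks the actual dirtying rate instead of a fixed constant. An illustrative trace, assuming ratelimit == 32:

	/*
	 * task dirties 8 pages per call:  *p = 8, 16, 24, 32
	 * 4th call crosses the limit:     sync_writeback_pages(32) = 48,
	 *                                 balance_dirty_pages(mapping, 48)
	 *
	 * task dirties 500 pages at once: sync_writeback_pages(500) = 750,
	 *                                 balance_dirty_pages(mapping, 750)
	 */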
mm/shmem.c +3 −2
@@ -1046,8 +1046,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	 * sync from ever calling shmem_writepage; but a stacking filesystem
 	 * may use the ->writepage of its underlying filesystem, in which case
 	 * tmpfs should write out to swap only in response to memory pressure,
-	 * and not for pdflush or sync.  However, in those cases, we do still
-	 * want to check if there's a redundant swappage to be discarded.
+	 * and not for the writeback threads or sync.  However, in those cases,
+	 * we do still want to check if there's a redundant swappage to be
+	 * discarded.
 	 */
 	if (wbc->for_reclaim)
 		swap = get_swap_page();
mm/vmscan.c +4 −4
@@ -1709,10 +1709,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
  *
  * If the caller is !__GFP_FS then the probability of a failure is reasonably
  * high - the zone may be full of dirty or under-writeback pages, which this
- * caller can't do much about.  We kick pdflush and take explicit naps in the
- * hope that some of these pages can be written.  But if the allocating task
- * holds filesystem locks which prevent writeout this might not work, and the
- * allocation attempt will fail.
+ * caller can't do much about.  We kick the writeback threads and take explicit
+ * naps in the hope that some of these pages can be written.  But if the
+ * allocating task holds filesystem locks which prevent writeout this might not
+ * work, and the allocation attempt will fail.
  *
  * returns:	0, if no pages reclaimed
  * 		else, the number of pages reclaimed