Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 9aa05000 authored by Dave Chinner, committed by Ben Myers
Browse files

xfs: xfs_sync_data is redundant.



We don't do any data writeback from XFS any more - the VFS is
completely responsible for that, including for freeze. We can
replace the remaining caller with a VFS level function that
achieves the same thing, but without conflicting with current
writeback work.

This means we can remove the flush_work and xfs_flush_inodes() - the
VFS functionality completely replaces the internal flush queue for
doing this writeback work in a separate context to avoid stack
overruns.

This does have one complication - it cannot be called with page
locks held.  Hence move the flushing of delalloc space when ENOSPC
occurs back up into xfs_file_buffered_aio_write, where we don't hold
any locks that will stall writeback.

Unfortunately, writeback_inodes_sb_if_idle() is not sufficient to
trigger delalloc conversion fast enough to prevent spurious ENOSPC
when there are hundreds of writers, thousands of small files and GBs
of free RAM.  Hence we need to use sync_inodes_sb() to block callers
while we wait for writeback like the previous xfs_flush_inodes
implementation did.

That means we have to hold the s_umount lock here, but because this
call can nest inside i_mutex (the parent directory in the create
case, held by the VFS), we have to use down_read_trylock() to avoid
potential deadlocks. In practice, this trylock will succeed on
almost every attempt as unmount/remount type operations are
exceedingly rare.

Note: we always need to pass a count of zero to
generic_file_buffered_write() as the previously written byte count.
Before this patch we only did this by accident, by virtue of ret
always being zero when there are no errors. Make this explicit
rather than needing to specifically zero ret in the ENOSPC retry
case.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
parent cf2931db
Loading
Loading
Loading
Loading
+7 −6
Original line number Diff line number Diff line
@@ -728,15 +728,16 @@ xfs_file_buffered_aio_write(
write_retry:
	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
	ret = generic_file_buffered_write(iocb, iovp, nr_segs,
			pos, &iocb->ki_pos, count, ret);
			pos, &iocb->ki_pos, count, 0);

	/*
	 * if we just got an ENOSPC, flush the inode now we aren't holding any
	 * page locks and retry *once*
	 * If we just got an ENOSPC, try to write back all dirty inodes to
	 * convert delalloc space to free up some of the excess reserved
	 * metadata space.
	 */
	if (ret == -ENOSPC && !enospc) {
		enospc = 1;
		ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
		if (!ret)
		xfs_flush_inodes(ip->i_mount);
		goto write_retry;
	}

+7 −16
Original line number Diff line number Diff line
@@ -373,7 +373,7 @@ xfs_iomap_write_delay(
	xfs_extlen_t	extsz;
	int		nimaps;
	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
	int		prealloc, flushed = 0;
	int		prealloc;
	int		error;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -434,27 +434,18 @@ retry:
	}

	/*
	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT.  For
	 * ENOSPC, * flush all other inodes with delalloc blocks to free up
	 * some of the excess reserved metadata space. For both cases, retry
	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
	 * without EOF preallocation.
	 */
	if (nimaps == 0) {
		trace_xfs_delalloc_enospc(ip, offset, count);
		if (flushed)
			return XFS_ERROR(error ? error : ENOSPC);

		if (error == ENOSPC) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			xfs_flush_inodes(ip);
			xfs_ilock(ip, XFS_ILOCK_EXCL);
		}

		flushed = 1;
		error = 0;
		if (prealloc) {
			prealloc = 0;
			error = 0;
			goto retry;
		}
		return XFS_ERROR(error ? error : ENOSPC);
	}

	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
		return xfs_alert_fsblock_zero(ip, &imap[0]);
+0 −1
Original line number Diff line number Diff line
@@ -198,7 +198,6 @@ typedef struct xfs_mount {
#endif
	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
	struct work_struct	m_flush_work;	/* background inode flush */
	__int64_t		m_update_flags;	/* sb flags we need to update
						   on the next remount,rw */
	struct shrinker		m_inode_shrink;	/* inode reclaim shrinker */
+18 −3
Original line number Diff line number Diff line
@@ -882,6 +882,24 @@ xfs_destroy_mount_workqueues(
	destroy_workqueue(mp->m_unwritten_workqueue);
}

/*
 * Write back all dirty data on this filesystem and wait for the IO.
 *
 * Callers must not hold an XFS_ILOCK or a page lock. sync_inodes_sb() is used
 * so that each caller blocks until the IO completes, which effectively
 * throttles concurrent callers to the rate at which IO is completing.
 *
 * s_umount is only try-locked for read: if the lock cannot be taken
 * immediately, the flush is skipped rather than waited for.
 */
void
xfs_flush_inodes(
	struct xfs_mount	*mp)
{
	struct super_block	*super = mp->m_super;

	/* Could not get s_umount without blocking - do nothing. */
	if (!down_read_trylock(&super->s_umount))
		return;

	sync_inodes_sb(super);
	up_read(&super->s_umount);
}

/* Catch misguided souls that try to use this interface on XFS */
STATIC struct inode *
xfs_fs_alloc_inode(
@@ -1005,8 +1023,6 @@ xfs_fs_put_super(
{
	struct xfs_mount	*mp = XFS_M(sb);

	cancel_work_sync(&mp->m_flush_work);

	xfs_filestream_unmount(mp);
	xfs_unmountfs(mp);

@@ -1324,7 +1340,6 @@ xfs_fs_fill_super(
	spin_lock_init(&mp->m_sb_lock);
	mutex_init(&mp->m_growlock);
	atomic_set(&mp->m_active_trans, 0);
	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);

	mp->m_super = sb;
+1 −0
Original line number Diff line number Diff line
@@ -74,6 +74,7 @@ struct block_device;

extern __uint64_t xfs_max_file_offset(unsigned int);

extern void xfs_flush_inodes(struct xfs_mount *mp);
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
Loading