Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 5cc60aee authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'xfs-for-linus-4.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs

Pull xfs updates from Dave Chinner:
 "There is quite a varied bunch of stuff in this update, and some of it
  you will have already merged through the ext4 tree which imported the
  dax-4.10-iomap-pmd topic branch from the XFS tree.

  There is also a new direct IO implementation that uses the iomap
  infrastructure. It's much simpler, faster, and has lower IO latency
  than the existing direct IO infrastructure.

  Summary:
   - DAX PMD faults via iomap infrastructure
   - Direct-io support in iomap infrastructure
   - removal of now-redundant XFS inode iolock, replaced with VFS
     i_rwsem
   - synchronisation with fixes and changes in userspace libxfs code
   - extent tree lookup helpers
   - lots of little corruption detection improvements to verifiers
   - optimised CRC calculations
   - faster buffer cache lookups
   - deprecation of barrier/nobarrier mount options - we always use
     REQ_FUA/REQ_FLUSH where appropriate for data integrity now
   - cleanups to speculative preallocation
   - miscellaneous minor bug fixes and cleanups"

* tag 'xfs-for-linus-4.10-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc/linux-xfs: (63 commits)
  xfs: nuke unused tracepoint definitions
  xfs: use GPF_NOFS when allocating btree cursors
  xfs: use xfs_vn_setattr_size to check on new size
  xfs: deprecate barrier/nobarrier mount option
  xfs: Always flush caches when integrity is required
  xfs: ignore leaf attr ichdr.count in verifier during log replay
  xfs: use rhashtable to track buffer cache
  xfs: optimise CRC updates
  xfs: make xfs btree stats less huge
  xfs: don't cap maximum dedupe request length
  xfs: don't allow di_size with high bit set
  xfs: error out if trying to add attrs and anextents > 0
  xfs: don't crash if reading a directory results in an unexpected hole
  xfs: complain if we don't get nextents bmap records
  xfs: check for bogus values in btree block headers
  xfs: forbid AG btrees with level == 0
  xfs: several xattr functions can be void
  xfs: handle cow fork in xfs_bmap_trace_exlist
  xfs: pass state not whichfork to trace_xfs_extlist
  xfs: Move AGI buffer type setting to xfs_read_agi
  ...
parents 5c2992ee 9807b773
Loading
Loading
Loading
Loading
+4 −8
Original line number Diff line number Diff line
@@ -51,13 +51,6 @@ default behaviour.
	CRC enabled filesystems always use the attr2 format, and so
	will reject the noattr2 mount option if it is set.

  barrier (*)
  nobarrier
	Enables/disables the use of block layer write barriers for
	writes into the journal and for data integrity operations.
	This allows for drive level write caching to be enabled, for
	devices that support write barriers.

  discard
  nodiscard (*)
	Enable/disable the issuing of commands to let the block
@@ -228,7 +221,10 @@ default behaviour.
Deprecated Mount Options
========================

None at present.
  Name				Removal Schedule
  ----				----------------
  barrier			no earlier than v4.15
  nobarrier			no earlier than v4.15


Removed Mount Options
+1 −1
Original line number Diff line number Diff line
@@ -554,7 +554,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
 * filesystems that don't need it and also allows us to create the workqueue
 * late enough so the we can include s_id in the name of the workqueue.
 */
static int sb_init_dio_done_wq(struct super_block *sb)
int sb_init_dio_done_wq(struct super_block *sb)
{
	struct workqueue_struct *old;
	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
+3 −0
Original line number Diff line number Diff line
@@ -184,3 +184,6 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
		unsigned flags, struct iomap_ops *ops, void *data,
		iomap_actor_t actor);

/* direct-io.c: */
int sb_init_dio_done_wq(struct super_block *sb);
+373 −0
Original line number Diff line number Diff line
@@ -24,6 +24,7 @@
#include <linux/uio.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/dax.h>
#include "internal.h"

@@ -584,3 +585,375 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
	return 0;
}
EXPORT_SYMBOL_GPL(iomap_fiemap);

/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
#define IOMAP_DIO_WRITE		(1 << 30)
#define IOMAP_DIO_DIRTY		(1 << 31)

struct iomap_dio {
	struct kiocb		*iocb;
	iomap_dio_end_io_t	*end_io;
	loff_t			i_size;
	loff_t			size;
	atomic_t		ref;
	unsigned		flags;
	int			error;

	union {
		/* used during submission and for synchronous completion: */
		struct {
			struct iov_iter		*iter;
			struct task_struct	*waiter;
			struct request_queue	*last_queue;
			blk_qc_t		cookie;
		} submit;

		/* used for aio completion: */
		struct {
			struct work_struct	work;
		} aio;
	};
};

static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
	struct kiocb *iocb = dio->iocb;
	ssize_t ret;

	if (dio->end_io) {
		ret = dio->end_io(iocb,
				dio->error ? dio->error : dio->size,
				dio->flags);
	} else {
		ret = dio->error;
	}

	if (likely(!ret)) {
		ret = dio->size;
		/* check for short read */
		if (iocb->ki_pos + ret > dio->i_size &&
		    !(dio->flags & IOMAP_DIO_WRITE))
			ret = dio->i_size - iocb->ki_pos;
		iocb->ki_pos += ret;
	}

	inode_dio_end(file_inode(iocb->ki_filp));
	kfree(dio);

	return ret;
}

static void iomap_dio_complete_work(struct work_struct *work)
{
	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
	struct kiocb *iocb = dio->iocb;
	bool is_write = (dio->flags & IOMAP_DIO_WRITE);
	ssize_t ret;

	ret = iomap_dio_complete(dio);
	if (is_write && ret > 0)
		ret = generic_write_sync(iocb, ret);
	iocb->ki_complete(iocb, ret, 0);
}

/*
 * Set an error in the dio if none is set yet.  We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
	cmpxchg(&dio->error, 0, ret);
}

static void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);

	if (bio->bi_error)
		iomap_dio_set_error(dio, bio->bi_error);

	if (atomic_dec_and_test(&dio->ref)) {
		if (is_sync_kiocb(dio->iocb)) {
			struct task_struct *waiter = dio->submit.waiter;

			WRITE_ONCE(dio->submit.waiter, NULL);
			wake_up_process(waiter);
		} else if (dio->flags & IOMAP_DIO_WRITE) {
			struct inode *inode = file_inode(dio->iocb->ki_filp);

			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
		} else {
			iomap_dio_complete_work(&dio->aio.work);
		}
	}

	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		struct bio_vec *bvec;
		int i;

		bio_for_each_segment_all(bvec, bio, i)
			put_page(bvec->bv_page);
		bio_put(bio);
	}
}

static blk_qc_t
iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos,
		unsigned len)
{
	struct page *page = ZERO_PAGE(0);
	struct bio *bio;

	bio = bio_alloc(GFP_KERNEL, 1);
	bio->bi_bdev = iomap->bdev;
	bio->bi_iter.bi_sector =
		iomap->blkno + ((pos - iomap->offset) >> 9);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	get_page(page);
	if (bio_add_page(bio, page, len, 0) != len)
		BUG();
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);

	atomic_inc(&dio->ref);
	return submit_bio(bio);
}

static loff_t
iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
		void *data, struct iomap *iomap)
{
	struct iomap_dio *dio = data;
	unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
	unsigned fs_block_size = (1 << inode->i_blkbits), pad;
	unsigned align = iov_iter_alignment(dio->submit.iter);
	struct iov_iter iter;
	struct bio *bio;
	bool need_zeroout = false;
	int nr_pages, ret;

	if ((pos | length | align) & ((1 << blkbits) - 1))
		return -EINVAL;

	switch (iomap->type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		/*FALLTHRU*/
	case IOMAP_UNWRITTEN:
		if (!(dio->flags & IOMAP_DIO_WRITE)) {
			iov_iter_zero(length, dio->submit.iter);
			dio->size += length;
			return length;
		}
		dio->flags |= IOMAP_DIO_UNWRITTEN;
		need_zeroout = true;
		break;
	case IOMAP_MAPPED:
		if (iomap->flags & IOMAP_F_SHARED)
			dio->flags |= IOMAP_DIO_COW;
		if (iomap->flags & IOMAP_F_NEW)
			need_zeroout = true;
		break;
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}

	/*
	 * Operate on a partial iter trimmed to the extent we were called for.
	 * We'll update the iter in the dio once we're done with this extent.
	 */
	iter = *dio->submit.iter;
	iov_iter_truncate(&iter, length);

	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
	if (nr_pages <= 0)
		return nr_pages;

	if (need_zeroout) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos - pad, pad);
	}

	do {
		if (dio->error)
			return 0;

		bio = bio_alloc(GFP_KERNEL, nr_pages);
		bio->bi_bdev = iomap->bdev;
		bio->bi_iter.bi_sector =
			iomap->blkno + ((pos - iomap->offset) >> 9);
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, &iter);
		if (unlikely(ret)) {
			bio_put(bio);
			return ret;
		}

		if (dio->flags & IOMAP_DIO_WRITE) {
			bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE);
			task_io_account_write(bio->bi_iter.bi_size);
		} else {
			bio_set_op_attrs(bio, REQ_OP_READ, 0);
			if (dio->flags & IOMAP_DIO_DIRTY)
				bio_set_pages_dirty(bio);
		}

		dio->size += bio->bi_iter.bi_size;
		pos += bio->bi_iter.bi_size;

		nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);

		atomic_inc(&dio->ref);

		dio->submit.last_queue = bdev_get_queue(iomap->bdev);
		dio->submit.cookie = submit_bio(bio);
	} while (nr_pages);

	if (need_zeroout) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
	}

	iov_iter_advance(dio->submit.iter, length);
	return length;
}

ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops,
		iomap_dio_end_io_t end_io)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = file_inode(iocb->ki_filp);
	size_t count = iov_iter_count(iter);
	loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0;
	unsigned int flags = IOMAP_DIRECT;
	struct blk_plug plug;
	struct iomap_dio *dio;

	lockdep_assert_held(&inode->i_rwsem);

	if (!count)
		return 0;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return -ENOMEM;

	dio->iocb = iocb;
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->end_io = end_io;
	dio->error = 0;
	dio->flags = 0;

	dio->submit.iter = iter;
	if (is_sync_kiocb(iocb)) {
		dio->submit.waiter = current;
		dio->submit.cookie = BLK_QC_T_NONE;
		dio->submit.last_queue = NULL;
	}

	if (iov_iter_rw(iter) == READ) {
		if (pos >= dio->i_size)
			goto out_free_dio;

		if (iter->type == ITER_IOVEC)
			dio->flags |= IOMAP_DIO_DIRTY;
	} else {
		dio->flags |= IOMAP_DIO_WRITE;
		flags |= IOMAP_WRITE;
	}

	if (mapping->nrpages) {
		ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end);
		if (ret)
			goto out_free_dio;

		ret = invalidate_inode_pages2_range(mapping,
				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
		ret = 0;
	}

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	do {
		ret = iomap_apply(inode, pos, count, flags, ops, dio,
				iomap_dio_actor);
		if (ret <= 0) {
			/* magic error code to fall back to buffered I/O */
			if (ret == -ENOTBLK)
				ret = 0;
			break;
		}
		pos += ret;
	} while ((count = iov_iter_count(iter)) > 0);
	blk_finish_plug(&plug);

	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
			!inode->i_sb->s_dio_done_wq) {
		ret = sb_init_dio_done_wq(inode->i_sb);
		if (ret < 0)
			iomap_dio_set_error(dio, ret);
	}

	if (!atomic_dec_and_test(&dio->ref)) {
		if (!is_sync_kiocb(iocb))
			return -EIOCBQUEUED;

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			if (!(iocb->ki_flags & IOCB_HIPRI) ||
			    !dio->submit.last_queue ||
			    !blk_mq_poll(dio->submit.last_queue,
					 dio->submit.cookie))
				io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 */
	if (iov_iter_rw(iter) == WRITE && mapping->nrpages) {
		ret = invalidate_inode_pages2_range(mapping,
				iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
		WARN_ON_ONCE(ret);
	}

	return iomap_dio_complete(dio);

out_free_dio:
	kfree(dio);
	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);
+7 −3
Original line number Diff line number Diff line
@@ -2455,12 +2455,15 @@ xfs_agf_verify(
	      be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp)))
		return false;

	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
		return false;

	if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)
	    (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
	     be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
		return false;

	/*
@@ -2477,7 +2480,8 @@ xfs_agf_verify(
		return false;

	if (xfs_sb_version_hasreflink(&mp->m_sb) &&
	    be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)
	    (be32_to_cpu(agf->agf_refcount_level) < 1 ||
	     be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
		return false;

	return true;;
Loading