xfs: implement iomap based buffered write path (68a9f5e7) · Commits · e / devices / android_kernel_teracube_2e

fs/xfs/Kconfig

+1 −0

Original line number	Diff line number	Diff line
		@@ -4,6 +4,7 @@ config XFS_FS
		depends on (64BIT || LBDAF)
		select EXPORTFS
		select LIBCRC32C
		select FS_IOMAP
		help
		XFS is a high performance journaling filesystem which originated
		on the SGI IRIX platform. It is completely multi-threaded, can

fs/xfs/xfs_aops.c

+0 −212

Original line number	Diff line number	Diff line
		@@ -1427,216 +1427,6 @@ xfs_vm_direct_IO(
		xfs_get_blocks_direct, endio, NULL, flags);
		}

		/*
		 * Punch out the delalloc blocks we have already allocated over the byte
		 * range [start, end).
		 *
		 * Don't bother with xfs_setattr given that nothing can have made it to disk
		 * yet as the page is still locked at this point.
		 *
		 * Both offsets are converted to filesystem blocks with XFS_B_TO_FSB(); if the
		 * resulting block range is empty nothing is done. A punch failure is only
		 * reported (and only when the filesystem is not already shut down); it is
		 * not propagated to the caller.
		 */
		STATIC void
		xfs_vm_kill_delalloc_range(
			struct inode		*inode,
			loff_t			start,
			loff_t			end)
		{
			struct xfs_inode	*ip = XFS_I(inode);
			struct xfs_mount	*mp = ip->i_mount;
			xfs_fileoff_t		first_fsb = XFS_B_TO_FSB(mp, start);
			xfs_fileoff_t		last_fsb = XFS_B_TO_FSB(mp, end);
			int			error;

			/* Nothing to punch if the block range is empty. */
			if (last_fsb <= first_fsb)
				return;

			xfs_ilock(ip, XFS_ILOCK_EXCL);
			error = xfs_bmap_punch_delalloc_range(ip, first_fsb,
							      last_fsb - first_fsb);
			/* something screwed, just bail */
			if (error && !XFS_FORCED_SHUTDOWN(mp))
				xfs_alert(mp,
			"xfs_vm_write_failed: unable to clean up ino %lld",
					  ip->i_ino);
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
		}

		/*
		 * Clean up the buffers attached to @page after a write of @len bytes at file
		 * offset @pos has failed.
		 *
		 * Every buffer overlapping the in-page byte range of the write is examined.
		 * Buffers that are delalloc or unwritten, and that are either new, at or
		 * beyond the current inode size, or hit while xfs_mp_fail_writes() is set,
		 * have their delalloc reservation punched out (delalloc buffers only) and
		 * all of their state flags cleared.
		 */
		STATIC void
		xfs_vm_write_failed(
			struct inode		*inode,
			struct page		*page,
			loff_t			pos,
			unsigned		len)
		{
			loff_t			block_offset;
			loff_t			block_start;
			loff_t			block_end;
			loff_t			from = pos & (PAGE_SIZE - 1);
			loff_t			to = from + len;
			struct buffer_head	*bh, *head;
			struct xfs_mount	*mp = XFS_I(inode)->i_mount;

			/*
			 * The request pos offset might be 32 or 64 bit, which is fine on a
			 * 64-bit platform. However, for a 64-bit pos on a 32-bit platform,
			 * the high 32 bits would be masked off if we evaluated block_offset
			 * via (pos & PAGE_MASK), because PAGE_MASK is 0xfffff000 as an
			 * unsigned long. The result would be incorrect and would make the
			 * ASSERT below fail in most cases. To avoid this, compute the offset
			 * of the start of the page with shifts rather than a mask.
			 */
			block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT;

			ASSERT(block_offset + from == pos);

			head = page_buffers(page);
			block_start = 0;
			/*
			 * Walk the circular buffer list once: block_start is still zero on
			 * the first pass, which lets the loop body run for the head buffer.
			 */
			for (bh = head; bh != head || !block_start;
			     bh = bh->b_this_page, block_start = block_end,
					   block_offset += bh->b_size) {
				block_end = block_start + bh->b_size;

				/* skip buffers before the write */
				if (block_end <= from)
					continue;

				/* if the buffer is after the write, we're done */
				if (block_start >= to)
					break;

				/*
				 * Process delalloc and unwritten buffers beyond EOF. We can
				 * encounter unwritten buffers in the event that a file has
				 * post-EOF unwritten extents and an extending write happens to
				 * fail (e.g., an unaligned write that also involves a delalloc
				 * to the same page).
				 */
				if (!buffer_delay(bh) && !buffer_unwritten(bh))
					continue;

				/* Leave pre-existing buffers inside EOF alone. */
				if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) &&
				    block_offset < i_size_read(inode))
					continue;

				if (buffer_delay(bh))
					xfs_vm_kill_delalloc_range(inode, block_offset,
								   block_offset + bh->b_size);

				/*
				 * This buffer does not contain data anymore. make sure anyone
				 * who finds it knows that for certain.
				 */
				clear_buffer_delay(bh);
				clear_buffer_uptodate(bh);
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_dirty(bh);
				clear_buffer_unwritten(bh);
			}

		}

		/*
		 * This used to call block_write_begin(), but it unlocks and releases the page
		 * on error, and we need that page to be able to punch stale delalloc blocks out
		 * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at
		 * the appropriate point.
		 *
		 * On success the locked page is returned through @pagep and 0 is returned.
		 * On failure *pagep is set to NULL and a negative errno is returned:
		 * -ENOMEM if no page could be grabbed, -EIO if xfs_mp_fail_writes() is set,
		 * otherwise whatever __block_write_begin() reported.
		 *
		 * File sizes and offsets are kept in loff_t throughout so that they are not
		 * truncated on 32-bit platforms.
		 */
		STATIC int
		xfs_vm_write_begin(
			struct file		*file,
			struct address_space	*mapping,
			loff_t			pos,
			unsigned		len,
			unsigned		flags,
			struct page		**pagep,
			void			**fsdata)
		{
			pgoff_t			index = pos >> PAGE_SHIFT;
			struct page		*page;
			int			status;
			struct xfs_mount	*mp = XFS_I(mapping->host)->i_mount;

			ASSERT(len <= PAGE_SIZE);

			page = grab_cache_page_write_begin(mapping, index, flags);
			if (!page)
				return -ENOMEM;

			status = __block_write_begin(page, pos, len, xfs_get_blocks);
			if (xfs_mp_fail_writes(mp))
				status = -EIO;
			if (unlikely(status)) {
				struct inode	*inode = mapping->host;
				loff_t		isize = i_size_read(inode);

				xfs_vm_write_failed(inode, page, pos, len);
				unlock_page(page);

				/*
				 * If the write is beyond EOF, we only want to kill blocks
				 * allocated in this write, not blocks that were previously
				 * written successfully.
				 */
				if (xfs_mp_fail_writes(mp))
					isize = 0;
				if (pos + len > isize) {
					loff_t	start = max_t(loff_t, pos, isize);

					truncate_pagecache_range(inode, start, pos + len);
				}

				put_page(page);
				page = NULL;
			}

			*pagep = page;
			return status;
		}

		/*
		 * On failure, we only need to kill delalloc blocks beyond EOF in the range of
		 * this specific write because they will never be written. Previous writes
		 * beyond EOF where block allocation succeeded do not need to be trashed, so
		 * only new blocks from this write should be trashed. For blocks within
		 * EOF, generic_write_end() zeros them so they are safe to leave alone and be
		 * written with all the other valid data.
		 *
		 * Returns whatever generic_write_end() returned. The inode size is held in
		 * a loff_t so that it is not truncated on 32-bit platforms before being
		 * used as the start of the range to punch and truncate.
		 */
		STATIC int
		xfs_vm_write_end(
			struct file		*file,
			struct address_space	*mapping,
			loff_t			pos,
			unsigned		len,
			unsigned		copied,
			struct page		*page,
			void			*fsdata)
		{
			int			ret;

			ASSERT(len <= PAGE_SIZE);

			ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
			if (unlikely(ret < len)) {
				struct inode	*inode = mapping->host;
				loff_t		isize = i_size_read(inode);
				loff_t		to = pos + len;

				if (to > isize) {
					/* only kill blocks in this write beyond EOF */
					if (pos > isize)
						isize = pos;
					xfs_vm_kill_delalloc_range(inode, isize, to);
					truncate_pagecache_range(inode, isize, to);
				}
			}
			return ret;
		}

		STATIC sector_t
		xfs_vm_bmap(
		struct address_space *mapping,
		@@ -1747,8 +1537,6 @@ const struct address_space_operations xfs_address_space_operations = {
		.set_page_dirty = xfs_vm_set_page_dirty,
		.releasepage = xfs_vm_releasepage,
		.invalidatepage = xfs_vm_invalidatepage,
		.write_begin = xfs_vm_write_begin,
		.write_end = xfs_vm_write_end,
		.bmap = xfs_vm_bmap,
		.direct_IO = xfs_vm_direct_IO,
		.migratepage = buffer_migrate_page,

fs/xfs/xfs_file.c

+30 −41

Original line number	Diff line number	Diff line
		@@ -37,6 +37,7 @@
		#include "xfs_log.h"
		#include "xfs_icache.h"
		#include "xfs_pnfs.h"
		#include "xfs_iomap.h"

		#include <linux/dcache.h>
		#include <linux/falloc.h>
		@@ -79,57 +80,27 @@ xfs_rw_ilock_demote(
		inode_unlock(VFS_I(ip));
		}

		/*
		* xfs_iozero clears the specified range supplied via the page cache (except in
		* the DAX case). Writes through the page cache will allocate blocks over holes,
		* though the callers usually map the holes first and avoid them. If a block is
		* not completely zeroed, then it will be read from disk before being partially
		* zeroed.
		*
		* In the DAX case, we can just directly write to the underlying pages. This
		* will not allocate blocks, but will avoid holes and unwritten extents and so
		* not do unnecessary work.
		*/
		int
		xfs_iozero(
		struct xfs_inode *ip, /* inode */
		loff_t pos, /* offset in file */
		size_t count) /* size of data to zero */
		static int
		xfs_dax_zero_range(
		struct inode *inode,
		loff_t pos,
		size_t count)
		{
		struct page *page;
		struct address_space *mapping;
		int status = 0;


		mapping = VFS_I(ip)->i_mapping;
		do {
		unsigned offset, bytes;
		void *fsdata;

		offset = (pos & (PAGE_SIZE -1)); /* Within page */
		bytes = PAGE_SIZE - offset;
		if (bytes > count)
		bytes = count;

		if (IS_DAX(VFS_I(ip))) {
		status = dax_zero_page_range(VFS_I(ip), pos, bytes,
		status = dax_zero_page_range(inode, pos, bytes,
		xfs_get_blocks_direct);
		if (status)
		break;
		} else {
		status = pagecache_write_begin(NULL, mapping, pos, bytes,
		AOP_FLAG_UNINTERRUPTIBLE,
		&page, &fsdata);
		if (status)
		break;

		zero_user(page, offset, bytes);

		status = pagecache_write_end(NULL, mapping, pos, bytes,
		bytes, page, fsdata);
		WARN_ON(status <= 0); /* can't return less than zero! */
		status = 0;
		}
		pos += bytes;
		count -= bytes;
		} while (count);
		@@ -137,6 +108,24 @@ xfs_iozero(
		return status;
		}

		/*
		 * Zero @count bytes of the file starting at byte offset @pos.
		 *
		 * DAX inodes are handled by xfs_dax_zero_range(); every other inode goes
		 * through iomap_zero_range() using xfs_iomap_ops. Holes and unwritten extents
		 * will be left as-is as they already are zeroed.
		 *
		 * Returns the result of whichever helper was called.
		 */
		int
		xfs_iozero(
			struct xfs_inode	*ip,
			loff_t			pos,
			size_t			count)
		{
			struct inode		*inode = VFS_I(ip);

			if (!IS_DAX(inode))
				return iomap_zero_range(inode, pos, count, NULL,
							&xfs_iomap_ops);

			return xfs_dax_zero_range(inode, pos, count);
		}

		int
		xfs_update_prealloc_flags(
		struct xfs_inode *ip,
		@@ -841,7 +830,7 @@ xfs_file_buffered_aio_write(
		write_retry:
		trace_xfs_file_buffered_write(ip, iov_iter_count(from),
		iocb->ki_pos, 0);
		ret = generic_perform_write(file, from, iocb->ki_pos);
		ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
		if (likely(ret >= 0))
		iocb->ki_pos += ret;

		@@ -1553,7 +1542,7 @@ xfs_filemap_page_mkwrite(
		if (IS_DAX(inode)) {
		ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault);
		} else {
		ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
		ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops);
		ret = block_page_mkwrite_return(ret);
		}

fs/xfs/xfs_iomap.c

+144 −0

Original line number	Diff line number	Diff line
		@@ -967,3 +967,147 @@ xfs_bmbt_to_iomap(
		iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
		iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
		}

		static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps)
		{
		return !nimaps \|\|
		imap->br_startblock == HOLESTARTBLOCK \|\|
		imap->br_startblock == DELAYSTARTBLOCK;
		}

		/*
		 * iomap_begin callback: describe the extent backing the byte range
		 * [offset, offset + length) of @inode in @iomap.
		 *
		 * A single mapping is looked up with xfs_bmapi_read() under the exclusive
		 * ILOCK. For IOMAP_WRITE requests where imap_needs_alloc() says so, blocks
		 * are then set up via xfs_iomap_write_direct() (inode has an extent size
		 * hint) or xfs_iomap_write_delay() (otherwise). If no allocation is needed
		 * the mapping that was found is reported; with no mapping at all the
		 * requested range is reported as a hole.
		 *
		 * Returns 0 on success, -EIO if the filesystem has been shut down, or the
		 * error from the lookup / allocation helpers. The ILOCK is not held on
		 * return on any path visible here.
		 */
		static int
		xfs_file_iomap_begin(
		struct inode *inode,
		loff_t offset,
		loff_t length,
		unsigned flags,
		struct iomap *iomap)
		{
		struct xfs_inode *ip = XFS_I(inode);
		struct xfs_mount *mp = ip->i_mount;
		struct xfs_bmbt_irec imap;
		xfs_fileoff_t offset_fsb, end_fsb;
		int nimaps = 1, error = 0;

		if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

		xfs_ilock(ip, XFS_ILOCK_EXCL);

		/* Clamp the request so it does not extend past the maximum file size. */
		ASSERT(offset <= mp->m_super->s_maxbytes);
		if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes)
		length = mp->m_super->s_maxbytes - offset;
		/* Convert the byte range to filesystem-block units for the lookup. */
		offset_fsb = XFS_B_TO_FSBT(mp, offset);
		end_fsb = XFS_B_TO_FSB(mp, offset + length);

		/* nimaps is 1 on entry, so at most one mapping is requested. */
		error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
		&nimaps, XFS_BMAPI_ENTIRE);
		if (error) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		return error;
		}

		if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) {
		/*
		* We cap the maximum length we map here to MAX_WRITEBACK_PAGES
		* pages to keep the chunks of work done where somewhat symmetric
		* with the work writeback does. This is a completely arbitrary
		* number pulled out of thin air as a best guess for initial
		* testing.
		*
		* Note that the values needs to be less than 32-bits wide until
		* the lower level functions are updated.
		*/
		/* NOTE(review): the cap is the literal 1024 here, not the named constant. */
		length = min_t(loff_t, length, 1024 * PAGE_SIZE);
		if (xfs_get_extsz_hint(ip)) {
		/*
		* xfs_iomap_write_direct() expects the shared lock. It
		* is unlocked on return.
		*/
		xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
		error = xfs_iomap_write_direct(ip, offset, length, &imap,
		nimaps);
		} else {
		/* The delalloc path runs under the exclusive lock; drop it here. */
		error = xfs_iomap_write_delay(ip, offset, length, &imap);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		}

		if (error)
		return error;

		trace_xfs_iomap_alloc(ip, offset, length, 0, &imap);
		xfs_bmbt_to_iomap(ip, iomap, &imap);
		} else if (nimaps) {
		/* A mapping exists and needs no allocation: report it as found. */
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		trace_xfs_iomap_found(ip, offset, length, 0, &imap);
		xfs_bmbt_to_iomap(ip, iomap, &imap);
		} else {
		/* No mapping returned: report the whole requested range as a hole. */
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		trace_xfs_iomap_not_found(ip, offset, length, 0, &imap);
		iomap->blkno = IOMAP_NULL_BLOCK;
		iomap->type = IOMAP_HOLE;
		iomap->offset = offset;
		iomap->length = length;
		}

		return 0;
		}

		/*
		 * Give back the delalloc blocks in [offset + written, offset + length) that
		 * were reserved for a write but never written to.
		 *
		 * Returns 0 if there was nothing to trim, if the punch succeeded, or if it
		 * failed while the filesystem was already shut down; otherwise the punch
		 * error is logged and returned.
		 */
		static int
		xfs_file_iomap_end_delalloc(
			struct xfs_inode	*ip,
			loff_t			offset,
			loff_t			length,
			ssize_t			written)
		{
			struct xfs_mount	*mp = ip->i_mount;
			xfs_fileoff_t		first_unwritten_fsb;
			xfs_fileoff_t		reserved_end_fsb;
			xfs_filblks_t		punch_len;
			int			error;

			first_unwritten_fsb = XFS_B_TO_FSB(mp, offset + written);
			reserved_end_fsb = XFS_B_TO_FSB(mp, offset + length);

			/* The whole reserved range was written: nothing to trim back. */
			if (first_unwritten_fsb >= reserved_end_fsb)
				return 0;

			/*
			 * Trim back delalloc blocks if we didn't manage to write the whole
			 * range reserved.
			 *
			 * We don't need to care about racing delalloc as we hold i_mutex
			 * across the reserve/allocate/unreserve calls. If there are delalloc
			 * blocks in the range, they are ours.
			 */
			punch_len = reserved_end_fsb - first_unwritten_fsb;

			xfs_ilock(ip, XFS_ILOCK_EXCL);
			error = xfs_bmap_punch_delalloc_range(ip, first_unwritten_fsb,
							      punch_len);
			xfs_iunlock(ip, XFS_ILOCK_EXCL);

			if (!error || XFS_FORCED_SHUTDOWN(mp))
				return 0;

			xfs_alert(mp, "%s: unable to clean up ino %lld",
				  __func__, ip->i_ino);
			return error;
		}

		/*
		 * iomap_end callback. Only write operations over a delalloc mapping need any
		 * work: those are handed to xfs_file_iomap_end_delalloc(). Everything else
		 * returns 0 without doing anything.
		 */
		static int
		xfs_file_iomap_end(
			struct inode		*inode,
			loff_t			offset,
			loff_t			length,
			ssize_t			written,
			unsigned		flags,
			struct iomap		*iomap)
		{
			if (!(flags & IOMAP_WRITE))
				return 0;
			if (iomap->type != IOMAP_DELALLOC)
				return 0;

			return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, length,
							   written);
		}

		/*
		 * iomap operations table for XFS: xfs_file_iomap_begin() maps a range,
		 * xfs_file_iomap_end() cleans up after it.
		 */
		struct iomap_ops xfs_iomap_ops = {
		.iomap_begin = xfs_file_iomap_begin,
		.iomap_end = xfs_file_iomap_end,
		};

fs/xfs/xfs_iomap.h

+4 −1

Original line number	Diff line number	Diff line
		@@ -18,7 +18,8 @@
		#ifndef __XFS_IOMAP_H__
		#define __XFS_IOMAP_H__

		struct iomap;
		#include <linux/iomap.h>

		struct xfs_inode;
		struct xfs_bmbt_irec;

		@@ -33,4 +34,6 @@ int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
		void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
		struct xfs_bmbt_irec *);

		extern struct iomap_ops xfs_iomap_ops;

		#endif /* __XFS_IOMAP_H__*/