
Commit 66e8ac7b authored by Dave Chinner

Merge branch 'xfs-dax-support' into for-next

parents b9a350a1 cbe4dab1
fs/dax.c  +27 −7
@@ -309,14 +309,21 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
  out:
 	i_mmap_unlock_read(mapping);
 
-	if (bh->b_end_io)
-		bh->b_end_io(bh, 1);
-
 	return error;
 }
 
-static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			get_block_t get_block)
+/**
+ * __dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files. __dax_fault() assumes the caller has done all
+ * the necessary locking for the page fault to proceed successfully.
+ */
+int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+			get_block_t get_block, dax_iodone_t complete_unwritten)
 {
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
@@ -417,7 +424,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		page_cache_release(page);
 	}
 
+	/*
+	 * If we successfully insert the new mapping over an unwritten extent,
+	 * we need to ensure we convert the unwritten extent. If there is an
+	 * error inserting the mapping, the filesystem needs to leave it as
+	 * unwritten to prevent exposure of the stale underlying data to
+	 * userspace, but we still need to call the completion function so
+	 * the private resources on the mapping buffer can be released. We
+	 * indicate what the callback should do via the uptodate variable, same
+	 * as for normal BH based IO completions.
+	 */
 	error = dax_insert_mapping(inode, &bh, vma, vmf);
+	if (buffer_unwritten(&bh))
+		complete_unwritten(&bh, !error);
 
  out:
 	if (error == -ENOMEM)
@@ -434,6 +453,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	}
 	goto out;
 }
+EXPORT_SYMBOL(__dax_fault);
 
 /**
  * dax_fault - handle a page fault on a DAX file
@@ -445,7 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
  * fault handler for DAX files.
  */
 int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
-			get_block_t get_block)
+	      get_block_t get_block, dax_iodone_t complete_unwritten)
 {
 	int result;
 	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
@@ -454,7 +474,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
 	}
-	result = do_dax_fault(vma, vmf, get_block);
+	result = __dax_fault(vma, vmf, get_block, complete_unwritten);
 	if (vmf->flags & FAULT_FLAG_WRITE)
 		sb_end_pagefault(sb);
 
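
The signature change above is the heart of the API: dax_fault() and the unlocked __dax_fault() now take a dax_iodone_t, so the filesystem, not dax.c, decides how to finish unwritten extent conversion. A minimal sketch of a consumer under the new signature (the myfs_* names and bodies are hypothetical; only the dax_fault() prototype and callback convention come from this diff):

    /* hypothetical filesystem glue, sketched against:
     * int dax_fault(struct vm_area_struct *, struct vm_fault *,
     *               get_block_t, dax_iodone_t);
     */
    static void myfs_end_io_unwritten(struct buffer_head *bh, int uptodate)
    {
    	/* invoked by __dax_fault() only for unwritten buffers; uptodate
    	 * follows the usual BH convention: convert the extent on success,
    	 * only release private mapping state on failure */
    }

    static int myfs_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
    {
    	return dax_fault(vma, vmf, myfs_get_block, myfs_end_io_unwritten);
    }
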
fs/ext2/file.c  +2 −2
@@ -28,12 +28,12 @@
 #ifdef CONFIG_FS_DAX
 static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_fault(vma, vmf, ext2_get_block);
+	return dax_fault(vma, vmf, ext2_get_block, NULL);
 }
 
 static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_mkwrite(vma, vmf, ext2_get_block);
+	return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
 }
 
 static const struct vm_operations_struct ext2_dax_vm_ops = {
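
ext2 can pass NULL here because it never maps unwritten extents, and __dax_fault() only invokes the callback behind a buffer_unwritten() check, so the NULL is never dereferenced:

    	/* from __dax_fault() above: a NULL complete_unwritten is safe for
    	 * filesystems whose get_block never returns unwritten buffers */
    	if (buffer_unwritten(&bh))
    		complete_unwritten(&bh, !error);
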
fs/ext4/file.c  +14 −2
@@ -192,15 +192,27 @@ out:
 }
 
 #ifdef CONFIG_FS_DAX
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+	struct inode *inode = bh->b_assoc_map->host;
+	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+	int err;
+	if (!uptodate)
+		return;
+	WARN_ON(!buffer_unwritten(bh));
+	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
+
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_fault(vma, vmf, ext4_get_block);
+	return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
 					/* Is this the right get_block? */
 }
 
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	return dax_mkwrite(vma, vmf, ext4_get_block);
+	return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
 }
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
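
ext4_end_io_unwritten() has to recover the byte offset from the buffer_head alone: _ext4_get_block() (next file) stashes the logical block number in bh->b_private, and the completion shifts it back by i_blkbits. A standalone userspace check of that round-trip, with illustrative values (the in-tree XXX comment flags a 32-bit truncation risk in exactly this cast chain):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
    	unsigned int i_blkbits = 12;		/* 4 KiB blocks */
    	unsigned long iblock = 0x400000UL;	/* faulting logical block */

    	/* store side: bh->b_private = (void *)(unsigned long)iblock */
    	void *b_private = (void *)iblock;

    	/* load side: same expression as ext4_end_io_unwritten() */
    	long long offset = (long long)(uintptr_t)b_private << i_blkbits;

    	printf("iblock %#lx -> byte offset %lld (16 GiB)\n", iblock, offset);
    	return 0;
    }
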
fs/ext4/inode.c  +7 −14
@@ -656,18 +656,6 @@ has_zeroout:
 	return retval;
 }
 
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
-	struct inode *inode = bh->b_assoc_map->host;
-	/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
-	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
-	int err;
-	if (!uptodate)
-		return;
-	WARN_ON(!buffer_unwritten(bh));
-	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
 
@@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
 		map_bh(bh, inode->i_sb, map.m_pblk);
 		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
-		if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+		if (IS_DAX(inode) && buffer_unwritten(bh)) {
+			/*
+			 * dgc: I suspect unwritten conversion on ext4+DAX is
+			 * fundamentally broken here when there are concurrent
+			 * read/write in progress on this inode.
+			 */
+			WARN_ON_ONCE(io_end);
 			bh->b_assoc_map = inode->i_mapping;
 			bh->b_private = (void *)(unsigned long)iblock;
-			bh->b_end_io = ext4_end_io_unwritten;
 		}
 		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
 			set_buffer_defer_completion(bh);
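
Putting the ext4 hunks together, the DAX write-fault path now threads unwritten extent conversion through the new callback argument instead of bh->b_end_io. A condensed sketch of the resulting call chain (zeroing of new blocks happens inside the DAX fault path, not shown in these hunks):

    /*
     * ext4_dax_fault()
     *   -> dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten)
     *        -> __dax_fault()                // may map an unwritten block
     *             -> dax_insert_mapping()    // install the pte over it
     *             -> complete_unwritten(&bh, !error)
     *                  == ext4_end_io_unwritten(), which converts the
     *                     extent only if the mapping was inserted cleanly
     */
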
fs/xfs/xfs_aops.c  +110 −42
@@ -1349,7 +1349,7 @@ __xfs_get_blocks(
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create,
-	int			direct)
+	bool			direct)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
@@ -1414,6 +1414,7 @@ __xfs_get_blocks(
 			if (error)
 				return error;
 			new = 1;
+
 		} else {
 			/*
 			 * Delalloc reservations do not require a transaction,
@@ -1508,49 +1509,29 @@ xfs_get_blocks(
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, false);
 }
 
-STATIC int
+int
 xfs_get_blocks_direct(
 	struct inode		*inode,
 	sector_t		iblock,
 	struct buffer_head	*bh_result,
 	int			create)
 {
-	return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+	return __xfs_get_blocks(inode, iblock, bh_result, create, true);
 }
 
-/*
- * Complete a direct I/O write request.
- *
- * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
- * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
- * wholly within the EOF and so there is nothing for us to do. Note that in this
- * case the completion can be called in interrupt context, whereas if we have an
- * ioend we will always be called in task context (i.e. from a workqueue).
- */
-STATIC void
-xfs_end_io_direct_write(
-	struct kiocb		*iocb,
+static void
+__xfs_end_io_direct_write(
+	struct inode		*inode,
+	struct xfs_ioend	*ioend,
 	loff_t			offset,
-	ssize_t			size,
-	void			*private)
+	ssize_t			size)
 {
-	struct inode		*inode = file_inode(iocb->ki_filp);
-	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ioend	*ioend = private;
-
-	trace_xfs_gbmap_direct_endio(ip, offset, size,
-				     ioend ? ioend->io_type : 0, NULL);
-
-	if (!ioend) {
-		ASSERT(offset + size <= i_size_read(inode));
-		return;
-	}
+	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
 
-	if (XFS_FORCED_SHUTDOWN(mp))
+	if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error)
 		goto out_end_io;
 
 	/*
@@ -1587,10 +1568,10 @@ xfs_end_io_direct_write(
 	 * here can result in EOF moving backwards and Bad Things Happen when
 	 * that occurs.
 	 */
-	spin_lock(&ip->i_flags_lock);
+	spin_lock(&XFS_I(inode)->i_flags_lock);
 	if (offset + size > i_size_read(inode))
 		i_size_write(inode, offset + size);
-	spin_unlock(&ip->i_flags_lock);
+	spin_unlock(&XFS_I(inode)->i_flags_lock);
 
 	/*
 	 * If we are doing an append IO that needs to update the EOF on disk,
@@ -1607,6 +1588,98 @@ out_end_io:
 	return;
 }
 
+/*
+ * Complete a direct I/O write request.
+ *
+ * The ioend structure is passed from __xfs_get_blocks() to tell us what to do.
+ * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite
+ * wholly within the EOF and so there is nothing for us to do. Note that in this
+ * case the completion can be called in interrupt context, whereas if we have an
+ * ioend we will always be called in task context (i.e. from a workqueue).
+ */
+STATIC void
+xfs_end_io_direct_write(
+	struct kiocb		*iocb,
+	loff_t			offset,
+	ssize_t			size,
+	void			*private)
+{
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_ioend	*ioend = private;
+
+	trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size,
+				     ioend ? ioend->io_type : 0, NULL);
+
+	if (!ioend) {
+		ASSERT(offset + size <= i_size_read(inode));
+		return;
+	}
+
+	__xfs_end_io_direct_write(inode, ioend, offset, size);
+}
+
+/*
+ * For DAX we need a mapping buffer callback for unwritten extent conversion
+ * when page faults allocate blocks and then zero them. Note that in this
+ * case the mapping indicated by the ioend may extend beyond EOF. We most
+ * definitely do not want to extend EOF here, so we trim back the ioend size to
+ * EOF.
+ */
+#ifdef CONFIG_FS_DAX
+void
+xfs_end_io_dax_write(
+	struct buffer_head	*bh,
+	int			uptodate)
+{
+	struct xfs_ioend	*ioend = bh->b_private;
+	struct inode		*inode = ioend->io_inode;
+	ssize_t			size = ioend->io_size;
+
+	ASSERT(IS_DAX(ioend->io_inode));
+
+	/* if there was an error zeroing, then don't convert it */
+	if (!uptodate)
+		ioend->io_error = -EIO;
+
+	/*
+	 * Trim update to EOF, so we don't extend EOF during unwritten extent
+	 * conversion of partial EOF blocks.
+	 */
+	spin_lock(&XFS_I(inode)->i_flags_lock);
+	if (ioend->io_offset + size > i_size_read(inode))
+		size = i_size_read(inode) - ioend->io_offset;
+	spin_unlock(&XFS_I(inode)->i_flags_lock);
+
+	__xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
+
+}
+#else
+void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
+#endif
+
+static inline ssize_t
+xfs_vm_do_dio(
+	struct inode		*inode,
+	struct kiocb		*iocb,
+	struct iov_iter		*iter,
+	loff_t			offset,
+	void			(*endio)(struct kiocb	*iocb,
+					 loff_t		offset,
+					 ssize_t	size,
+					 void		*private),
+	int			flags)
+{
+	struct block_device	*bdev;
+
+	if (IS_DAX(inode))
+		return dax_do_io(iocb, inode, iter, offset,
+				 xfs_get_blocks_direct, endio, 0);
+
+	bdev = xfs_find_bdev_for_inode(inode);
+	return  __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+				     xfs_get_blocks_direct, endio, NULL, flags);
+}
+
 STATIC ssize_t
 xfs_vm_direct_IO(
 	struct kiocb		*iocb,
@@ -1614,16 +1687,11 @@ xfs_vm_direct_IO(
 	loff_t			offset)
 {
 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
-	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
 
-	if (iov_iter_rw(iter) == WRITE) {
-		return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-					    xfs_get_blocks_direct,
-					    xfs_end_io_direct_write, NULL,
-					    DIO_ASYNC_EXTEND);
-	}
-	return __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
-				    xfs_get_blocks_direct, NULL, NULL, 0);
+	if (iov_iter_rw(iter) == WRITE)
+		return xfs_vm_do_dio(inode, iocb, iter, offset,
+				     xfs_end_io_direct_write, DIO_ASYNC_EXTEND);
+	return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0);
 }
 
 /*
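
The EOF trim in xfs_end_io_dax_write() is worth sanity-checking on its own: a page fault can map a whole block whose tail lies beyond EOF, and converting the full range would otherwise move EOF. A standalone userspace check of the same arithmetic (values illustrative):

    #include <stdio.h>

    int main(void)
    {
    	/* stand-ins for i_size_read(inode), ioend->io_offset, ioend->io_size */
    	long long i_size    = 10000;	/* file size ends mid-block */
    	long long io_offset = 8192;	/* fault mapped the last 4 KiB block */
    	long long size      = 4096;

    	/* the trim from xfs_end_io_dax_write(): never extend EOF during
    	 * unwritten extent conversion of partial EOF blocks */
    	if (io_offset + size > i_size)
    		size = i_size - io_offset;

    	printf("convert [%lld, %lld), EOF stays at %lld\n",
    	       io_offset, io_offset + size, i_size);
    	return 0;
    }
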