Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit ba5843f5 authored by Jan Kara, committed by Theodore Ts'o
Browse files

ext4: use pre-zeroed blocks for DAX page faults



Make DAX fault path use pre-zeroed blocks to avoid races with extent
conversion and zeroing when two page faults to the same block happen.

Signed-off-by: Jan Kara <jack@suse.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
parent c86d8db3
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -2452,7 +2452,7 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_get_block_write(struct inode *inode, sector_t iblock,
			 struct buffer_head *bh_result, int create);
int ext4_get_block_dax(struct inode *inode, sector_t iblock,
int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
			    struct buffer_head *bh_result, int create);
int ext4_get_block(struct inode *inode, sector_t iblock,
				struct buffer_head *bh_result, int create);
+3 −17
Original line number Diff line number Diff line
@@ -193,18 +193,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
}

#ifdef CONFIG_FS_DAX
/*
 * Completion callback passed to the __dax_fault(), __dax_pmd_fault() and
 * __dax_mkwrite() helpers together with ext4_get_block_dax.
 *
 * @bh:       buffer head describing the mapped range; b_assoc_map supplies
 *            the owning inode and b_private carries the logical block number
 *            (both are stashed there by _ext4_get_block() for unwritten DAX
 *            buffers).
 * @uptodate: zero means the operation did not complete successfully, in
 *            which case nothing is converted.
 *
 * Calls ext4_convert_unwritten_extents() for b_size bytes starting at the
 * byte offset derived from the stashed block number.
 */
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
{
	struct inode *inode = bh->b_assoc_map->host;
	/* XXX: breaks on 32-bit > 16TB. Is that even supported? */
	/* b_private is a pointer-sized slot reused to hold a block number. */
	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
	int err;
	if (!uptodate)
		return;
	WARN_ON(!buffer_unwritten(bh));
	/*
	 * NOTE(review): the result is stored in err but never examined, and
	 * the function returns void, so a failed conversion is not reported.
	 */
	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
}

static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	int result;
@@ -225,8 +213,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
	if (IS_ERR(handle))
		result = VM_FAULT_SIGBUS;
	else
		result = __dax_fault(vma, vmf, ext4_get_block_dax,
						ext4_end_io_unwritten);
		result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL);

	if (write) {
		if (!IS_ERR(handle))
@@ -262,7 +249,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
		result = VM_FAULT_SIGBUS;
	else
		result = __dax_pmd_fault(vma, addr, pmd, flags,
				ext4_get_block_dax, ext4_end_io_unwritten);
				ext4_dax_mmap_get_block, NULL);

	if (write) {
		if (!IS_ERR(handle))
@@ -283,8 +270,7 @@ static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
	sb_start_pagefault(inode->i_sb);
	file_update_time(vma->vm_file);
	down_read(&EXT4_I(inode)->i_mmap_sem);
	err = __dax_mkwrite(vma, vmf, ext4_get_block_dax,
			    ext4_end_io_unwritten);
	err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL);
	up_read(&EXT4_I(inode)->i_mmap_sem);
	sb_end_pagefault(inode->i_sb);

+69 −17
Original line number Diff line number Diff line
@@ -723,16 +723,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,

		map_bh(bh, inode->i_sb, map.m_pblk);
		bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
		if (IS_DAX(inode) && buffer_unwritten(bh)) {
			/*
			 * dgc: I suspect unwritten conversion on ext4+DAX is
			 * fundamentally broken here when there are concurrent
			 * read/write in progress on this inode.
			 */
			WARN_ON_ONCE(io_end);
			bh->b_assoc_map = inode->i_mapping;
			bh->b_private = (void *)(unsigned long)iblock;
		}
		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
			set_buffer_defer_completion(bh);
		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
@@ -3097,18 +3087,80 @@ static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
	return ret;
}

int ext4_get_block_dax(struct inode *inode, sector_t iblock,
#ifdef CONFIG_FS_DAX
/*
 * get_block callback handed to __dax_fault(), __dax_pmd_fault() and
 * __dax_mkwrite() for DAX mmap faults.
 *
 * Maps the bh_result->b_size bytes starting at logical block @iblock.
 * When @create is set, blocks are requested with EXT4_GET_BLOCKS_PRE_IO |
 * EXT4_GET_BLOCKS_CREATE_ZERO inside a journal handle started here. If the
 * resulting mapping is flagged EXT4_MAP_UNWRITTEN, a second
 * ext4_map_blocks() call with EXT4_GET_BLOCKS_CONVERT |
 * EXT4_GET_BLOCKS_CREATE_ZERO is made under a fresh handle.
 *
 * Returns 0 when blocks were mapped (bh_result is filled in) or when
 * ext4_map_blocks() reported nothing mapped (bh_result is left untouched),
 * and a negative error code on failure.
 */
int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
			    struct buffer_head *bh_result, int create)
{
	int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
	int ret, err;
	int credits;
	struct ext4_map_blocks map;
	handle_t *handle = NULL;
	int flags = 0;

	if (create)
		flags |= EXT4_GET_BLOCKS_CREATE;
	ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
	ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
		   inode->i_ino, create);
	return _ext4_get_block(inode, iblock, bh_result, flags);
	map.m_lblk = iblock;
	/* Requested length in blocks, derived from the caller-supplied b_size. */
	map.m_len = bh_result->b_size >> inode->i_blkbits;
	/*
	 * Computed unconditionally: the credits are used for the allocation
	 * handle below and again for the unwritten-conversion handle.
	 */
	credits = ext4_chunk_trans_blocks(inode, map.m_len);
	if (create) {
		flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			return ret;
		}
	}

	/* For a pure lookup (create == 0) handle is still NULL and flags is 0. */
	ret = ext4_map_blocks(handle, inode, &map, flags);
	if (create) {
		err = ext4_journal_stop(handle);
		/* Keep the mapping error if there was one; else report the stop error. */
		if (ret >= 0 && err < 0)
			ret = err;
	}
	/* Nothing mapped (0) or an error (< 0): skip the conversion step. */
	if (ret <= 0)
		goto out;
	if (map.m_flags & EXT4_MAP_UNWRITTEN) {
		int err2;

		/*
		 * We are protected by i_mmap_sem so we know block cannot go
		 * away from under us even though we dropped i_data_sem.
		 * Convert extent to written and write zeros there.
		 *
		 * Note: We may get here even when create == 0.
		 */
		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			goto out;
		}

		err = ext4_map_blocks(handle, inode, &map,
		      EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
		if (err < 0)
			ret = err;
		err2 = ext4_journal_stop(handle);
		/* A journal-stop failure is reported only if nothing failed earlier. */
		if (err2 < 0 && ret > 0)
			ret = err2;
	}
out:
	/* A create request that ends with nothing mapped is unexpected. */
	WARN_ON_ONCE(ret == 0 && create);
	if (ret > 0) {
		map_bh(bh_result, inode->i_sb, map.m_pblk);
		bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
					map.m_flags;
		/*
		 * At least for now we have to clear BH_New so that DAX code
		 * doesn't attempt to zero blocks again in a racy way.
		 */
		bh_result->b_state &= ~(1 << BH_New);
		/* The mapped length is conveyed to the caller through b_size. */
		bh_result->b_size = map.m_len << inode->i_blkbits;
		/* Success is reported as 0, not as the positive block count. */
		ret = 0;
	}
	return ret;
}
#endif

static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
			    ssize_t size, void *private)
{