Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 0031462b authored by Mingming Cao's avatar Mingming Cao Committed by Theodore Ts'o
Browse files

ext4: Split uninitialized extents for direct I/O



When writing into an unitialized extent via direct I/O, and the direct
I/O doesn't exactly cover the unitialized extent, split the extent
into uninitialized and initialized extents before submitting the I/O.
This avoids needing to deal with an ENOSPC error in the end_io
callback that gets used for direct I/O.

When the IO is complete, the written extent will be marked as initialized.

Singed-Off-By: default avatarMingming Cao <cmm@us.ibm.com>
Signed-off-by: default avatar"Theodore Ts'o" <tytso@mit.edu>
parent 9f0ccfd8
Loading
Loading
Loading
Loading
+21 −1
Original line number Diff line number Diff line
@@ -128,6 +128,15 @@ struct mpage_da_data {
	int retval;
};

typedef struct ext4_io_end {
	struct inode		*inode;		/* file being written to */
	unsigned int		flag;		/* sync IO or AIO */
	int			error;		/* I/O error code */
	ext4_lblk_t		offset;		/* offset in the file */
	size_t			size;		/* size of the extent */
	struct work_struct	work;		/* data work queue */
} ext4_io_end_t;

/*
 * Special inodes numbers
 */
@@ -347,7 +356,16 @@ struct ext4_new_group_data {
	/* Call ext4_da_update_reserve_space() after successfully 
	   allocating the blocks */
#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE	0x0008

	/* caller is from the direct IO path, request to creation of an
	unitialized extents if not allocated, split the uninitialized
	extent if blocks has been preallocated already*/
#define EXT4_GET_BLOCKS_DIO			0x0010
#define EXT4_GET_BLOCKS_CONVERT			0x0020
#define EXT4_GET_BLOCKS_DIO_CREATE_EXT		(EXT4_GET_BLOCKS_DIO|\
					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
	/* Convert extent to initialized after direct IO complete */
#define EXT4_GET_BLOCKS_DIO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
					 EXT4_GET_BLOCKS_DIO_CREATE_EXT)

/*
 * ioctl commands
@@ -1700,6 +1718,8 @@ extern void ext4_ext_init(struct super_block *);
extern void ext4_ext_release(struct super_block *);
extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
			  loff_t len);
extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
			  loff_t len);
extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
			   sector_t block, unsigned int max_blocks,
			   struct buffer_head *bh, int flags);
+6 −1
Original line number Diff line number Diff line
@@ -220,6 +220,11 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
		(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
}

static inline void ext4_ext_mark_initialized(struct ext4_extent *ext)
{
	ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext));
}

extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex);
extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
@@ -235,7 +240,7 @@ extern int ext4_ext_try_to_merge(struct inode *inode,
				 struct ext4_ext_path *path,
				 struct ext4_extent *);
extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *);
extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int);
extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t,
							ext_prepare_callback, void *);
extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
+386 −37
Original line number Diff line number Diff line
@@ -723,7 +723,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
 * insert new index [@logical;@ptr] into the block at @curp;
 * check where to insert: before @curp or after @curp
 */
static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *curp,
				int logical, ext4_fsblk_t ptr)
{
@@ -1586,7 +1586,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
 */
int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
				struct ext4_ext_path *path,
				struct ext4_extent *newext)
				struct ext4_extent *newext, int flag)
{
	struct ext4_extent_header *eh;
	struct ext4_extent *ex, *fex;
@@ -1602,7 +1602,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
	BUG_ON(path[depth].p_hdr == NULL);

	/* try to insert block into found extent and return */
	if (ex && ext4_can_extents_be_merged(inode, ex, newext)) {
	if (ex && (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
		&& ext4_can_extents_be_merged(inode, ex, newext)) {
		ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n",
				ext4_ext_is_uninitialized(newext),
				ext4_ext_get_actual_len(newext),
@@ -1722,6 +1723,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,

merge:
	/* try to merge extents to the right */
	if (flag != EXT4_GET_BLOCKS_DIO_CREATE_EXT)
		ext4_ext_try_to_merge(inode, path, nearex);

	/* try to merge extents to the left */
@@ -2490,7 +2492,6 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
}

#define EXT4_EXT_ZERO_LEN 7

/*
 * This function is called by ext4_ext_get_blocks() if someone tries to write
 * to an uninitialized extent. It may result in splitting the uninitialized
@@ -2583,7 +2584,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
			ex3->ee_block = cpu_to_le32(iblock);
			ext4_ext_store_pblock(ex3, newblock);
			ex3->ee_len = cpu_to_le16(allocated);
			err = ext4_ext_insert_extent(handle, inode, path, ex3);
			err = ext4_ext_insert_extent(handle, inode, path,
							ex3, 0);
			if (err == -ENOSPC) {
				err =  ext4_ext_zeroout(inode, &orig_ex);
				if (err)
@@ -2639,7 +2641,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
		ext4_ext_store_pblock(ex3, newblock + max_blocks);
		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
		ext4_ext_mark_uninitialized(ex3);
		err = ext4_ext_insert_extent(handle, inode, path, ex3);
		err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
		if (err == -ENOSPC) {
			err =  ext4_ext_zeroout(inode, &orig_ex);
			if (err)
@@ -2757,7 +2759,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
	err = ext4_ext_dirty(handle, inode, path + depth);
	goto out;
insert:
	err = ext4_ext_insert_extent(handle, inode, path, &newex);
	err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
	if (err == -ENOSPC) {
		err =  ext4_ext_zeroout(inode, &orig_ex);
		if (err)
@@ -2784,6 +2786,320 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
	return err;
}

/*
 * This function is called by ext4_ext_get_blocks() from
 * ext4_get_blocks_dio_write() when DIO to write
 * to an uninitialized extent.
 *
 * Writing to an uninitized extent may result in splitting the uninitialized
 * extent into multiple /intialized unintialized extents (up to three)
 * There are three possibilities:
 *   a> There is no split required: Entire extent should be uninitialized
 *   b> Splits in two extents: Write is happening at either end of the extent
 *   c> Splits in three extents: Somone is writing in middle of the extent
 *
 * One of more index blocks maybe needed if the extent tree grow after
 * the unintialized extent split. To prevent ENOSPC occur at the IO
 * complete, we need to split the uninitialized extent before DIO submit
 * the IO. The uninitilized extent called at this time will be split
 * into three uninitialized extent(at most). After IO complete, the part
 * being filled will be convert to initialized by the end_io callback function
 * via ext4_convert_unwritten_extents().
 */
static int ext4_split_unwritten_extents(handle_t *handle,
					struct inode *inode,
					struct ext4_ext_path *path,
					ext4_lblk_t iblock,
					unsigned int max_blocks,
					int flags)
{
	struct ext4_extent *ex, newex, orig_ex;
	struct ext4_extent *ex1 = NULL;
	struct ext4_extent *ex2 = NULL;
	struct ext4_extent *ex3 = NULL;
	struct ext4_extent_header *eh;
	ext4_lblk_t ee_block;
	unsigned int allocated, ee_len, depth;
	ext4_fsblk_t newblock;
	int err = 0;
	int ret = 0;

	ext_debug("ext4_split_unwritten_extents: inode %lu,"
		  "iblock %llu, max_blocks %u\n", inode->i_ino,
		  (unsigned long long)iblock, max_blocks);
	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;
	ee_block = le32_to_cpu(ex->ee_block);
	ee_len = ext4_ext_get_actual_len(ex);
	allocated = ee_len - (iblock - ee_block);
	newblock = iblock - ee_block + ext_pblock(ex);
	ex2 = ex;
	orig_ex.ee_block = ex->ee_block;
	orig_ex.ee_len   = cpu_to_le16(ee_len);
	ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));

	/*
 	 * if the entire unintialized extent length less than
 	 * the size of extent to write, there is no need to split
 	 * uninitialized extent
 	 */
 	if (allocated <= max_blocks)
		return ret;

	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		goto out;
	/* ex1: ee_block to iblock - 1 : uninitialized */
	if (iblock > ee_block) {
		ex1 = ex;
		ex1->ee_len = cpu_to_le16(iblock - ee_block);
		ext4_ext_mark_uninitialized(ex1);
		ex2 = &newex;
	}
	/*
	 * for sanity, update the length of the ex2 extent before
	 * we insert ex3, if ex1 is NULL. This is to avoid temporary
	 * overlap of blocks.
	 */
	if (!ex1 && allocated > max_blocks)
		ex2->ee_len = cpu_to_le16(max_blocks);
	/* ex3: to ee_block + ee_len : uninitialised */
	if (allocated > max_blocks) {
		unsigned int newdepth;
		ex3 = &newex;
		ex3->ee_block = cpu_to_le32(iblock + max_blocks);
		ext4_ext_store_pblock(ex3, newblock + max_blocks);
		ex3->ee_len = cpu_to_le16(allocated - max_blocks);
		ext4_ext_mark_uninitialized(ex3);
		err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
		if (err == -ENOSPC) {
			err =  ext4_ext_zeroout(inode, &orig_ex);
			if (err)
				goto fix_extent_len;
			/* update the extent length and mark as initialized */
			ex->ee_block = orig_ex.ee_block;
			ex->ee_len   = orig_ex.ee_len;
			ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
			ext4_ext_dirty(handle, inode, path + depth);
			/* zeroed the full extent */
			/* blocks available from iblock */
			return allocated;

		} else if (err)
			goto fix_extent_len;
		/*
		 * The depth, and hence eh & ex might change
		 * as part of the insert above.
		 */
		newdepth = ext_depth(inode);
		/*
		 * update the extent length after successful insert of the
		 * split extent
		 */
		orig_ex.ee_len = cpu_to_le16(ee_len -
						ext4_ext_get_actual_len(ex3));
		depth = newdepth;
		ext4_ext_drop_refs(path);
		path = ext4_ext_find_extent(inode, iblock, path);
		if (IS_ERR(path)) {
			err = PTR_ERR(path);
			goto out;
		}
		eh = path[depth].p_hdr;
		ex = path[depth].p_ext;
		if (ex2 != &newex)
			ex2 = ex;

		err = ext4_ext_get_access(handle, inode, path + depth);
		if (err)
			goto out;

		allocated = max_blocks;
	}
	/*
	 * If there was a change of depth as part of the
	 * insertion of ex3 above, we need to update the length
	 * of the ex1 extent again here
	 */
	if (ex1 && ex1 != ex) {
		ex1 = ex;
		ex1->ee_len = cpu_to_le16(iblock - ee_block);
		ext4_ext_mark_uninitialized(ex1);
		ex2 = &newex;
	}
	/*
	 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
	 * uninitialised still.
	 */
	ex2->ee_block = cpu_to_le32(iblock);
	ext4_ext_store_pblock(ex2, newblock);
	ex2->ee_len = cpu_to_le16(allocated);
	ext4_ext_mark_uninitialized(ex2);
	if (ex2 != ex)
		goto insert;
	/* Mark modified extent as dirty */
	err = ext4_ext_dirty(handle, inode, path + depth);
	ext_debug("out here\n");
	goto out;
insert:
	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
	if (err == -ENOSPC) {
		err =  ext4_ext_zeroout(inode, &orig_ex);
		if (err)
			goto fix_extent_len;
		/* update the extent length and mark as initialized */
		ex->ee_block = orig_ex.ee_block;
		ex->ee_len   = orig_ex.ee_len;
		ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
		ext4_ext_dirty(handle, inode, path + depth);
		/* zero out the first half */
		return allocated;
	} else if (err)
		goto fix_extent_len;
out:
	ext4_ext_show_leaf(inode, path);
	return err ? err : allocated;

fix_extent_len:
	ex->ee_block = orig_ex.ee_block;
	ex->ee_len   = orig_ex.ee_len;
	ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
	ext4_ext_mark_uninitialized(ex);
	ext4_ext_dirty(handle, inode, path + depth);
	return err;
}
static int ext4_convert_unwritten_extents_dio(handle_t *handle,
					      struct inode *inode,
					      struct ext4_ext_path *path)
{
	struct ext4_extent *ex;
	struct ext4_extent_header *eh;
	int depth;
	int err = 0;
	int ret = 0;

	depth = ext_depth(inode);
	eh = path[depth].p_hdr;
	ex = path[depth].p_ext;

	err = ext4_ext_get_access(handle, inode, path + depth);
	if (err)
		goto out;
	/* first mark the extent as initialized */
	ext4_ext_mark_initialized(ex);

	/*
	 * We have to see if it can be merged with the extent
	 * on the left.
	 */
	if (ex > EXT_FIRST_EXTENT(eh)) {
		/*
		 * To merge left, pass "ex - 1" to try_to_merge(),
		 * since it merges towards right _only_.
		 */
		ret = ext4_ext_try_to_merge(inode, path, ex - 1);
		if (ret) {
			err = ext4_ext_correct_indexes(handle, inode, path);
			if (err)
				goto out;
			depth = ext_depth(inode);
			ex--;
		}
	}
	/*
	 * Try to Merge towards right.
	 */
	ret = ext4_ext_try_to_merge(inode, path, ex);
	if (ret) {
		err = ext4_ext_correct_indexes(handle, inode, path);
		if (err)
			goto out;
		depth = ext_depth(inode);
	}
	/* Mark modified extent as dirty */
	err = ext4_ext_dirty(handle, inode, path + depth);
out:
	ext4_ext_show_leaf(inode, path);
	return err;
}

static int
ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
			ext4_lblk_t iblock, unsigned int max_blocks,
			struct ext4_ext_path *path, int flags,
			unsigned int allocated, struct buffer_head *bh_result,
			ext4_fsblk_t newblock)
{
	int ret = 0;
	int err = 0;

	ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
		  "block %llu, max_blocks %u, flags %d, allocated %u",
		  inode->i_ino, (unsigned long long)iblock, max_blocks,
		  flags, allocated);
	ext4_ext_show_leaf(inode, path);

	/* DIO get_block() before submit the IO, split the extent */
	if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) {
		ret = ext4_split_unwritten_extents(handle,
						inode, path, iblock,
						max_blocks, flags);
		goto out;
	}
	/* DIO end_io complete, convert the filled extent to written */
	if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
		ret = ext4_convert_unwritten_extents_dio(handle, inode,
							path);
		goto out2;
	}
	/* buffered IO case */
	/*
	 * repeat fallocate creation request
	 * we already have an unwritten extent
	 */
	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
		goto map_out;

	/* buffered READ or buffered write_begin() lookup */
	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
		/*
		 * We have blocks reserved already.  We
		 * return allocated blocks so that delalloc
		 * won't do block reservation for us.  But
		 * the buffer head will be unmapped so that
		 * a read from the block returns 0s.
		 */
		set_buffer_unwritten(bh_result);
		goto out1;
	}

	/* buffered write, writepage time, convert*/
	ret = ext4_ext_convert_to_initialized(handle, inode,
						path, iblock,
						max_blocks);
out:
	if (ret <= 0) {
		err = ret;
		goto out2;
	} else
		allocated = ret;
	set_buffer_new(bh_result);
map_out:
	set_buffer_mapped(bh_result);
out1:
	if (allocated > max_blocks)
		allocated = max_blocks;
	ext4_ext_show_leaf(inode, path);
	bh_result->b_bdev = inode->i_sb->s_bdev;
	bh_result->b_blocknr = newblock;
out2:
	if (path) {
		ext4_ext_drop_refs(path);
		kfree(path);
	}
	return err ? err : allocated;
}
/*
 * Block allocation/map/preallocation routine for extents based files
 *
@@ -2889,33 +3205,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
							EXT4_EXT_CACHE_EXTENT);
				goto out;
			}
			if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
				goto out;
			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
				if (allocated > max_blocks)
					allocated = max_blocks;
				/*
				 * We have blocks reserved already.  We
				 * return allocated blocks so that delalloc
				 * won't do block reservation for us.  But
				 * the buffer head will be unmapped so that
				 * a read from the block returns 0s.
				 */
				set_buffer_unwritten(bh_result);
				bh_result->b_bdev = inode->i_sb->s_bdev;
				bh_result->b_blocknr = newblock;
				goto out2;
			}

			ret = ext4_ext_convert_to_initialized(handle, inode,
								path, iblock,
								max_blocks);
			if (ret <= 0) {
				err = ret;
				goto out2;
			} else
				allocated = ret;
			goto outnew;
			ret = ext4_ext_handle_uninitialized_extents(handle,
					inode, iblock, max_blocks, path,
					flags, allocated, bh_result, newblock);
			return ret;
		}
	}

@@ -2988,7 +3281,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
	newex.ee_len = cpu_to_le16(ar.len);
	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)  /* Mark uninitialized */
		ext4_ext_mark_uninitialized(&newex);
	err = ext4_ext_insert_extent(handle, inode, path, &newex);
	err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
	if (err) {
		/* free data blocks we just allocated */
		/* not a good idea to call discard here directly,
@@ -3002,7 +3295,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
	/* previous routine could use block we allocated */
	newblock = ext_pblock(&newex);
	allocated = ext4_ext_get_actual_len(&newex);
outnew:
	set_buffer_new(bh_result);

	/* Cache only when it is _not_ an uninitialized extent */
@@ -3200,6 +3492,63 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
	return ret > 0 ? ret2 : ret;
}

/*
 * This function convert a range of blocks to written extents
 * The caller of this function will pass the start offset and the size.
 * all unwritten extents within this range will be converted to
 * written extents.
 *
 * This function is called from the direct IO end io call back
 * function, to convert the fallocated extents after IO is completed.
 */
int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
				    loff_t len)
{
	handle_t *handle;
	ext4_lblk_t block;
	unsigned int max_blocks;
	int ret = 0;
	int ret2 = 0;
	struct buffer_head map_bh;
	unsigned int credits, blkbits = inode->i_blkbits;

	block = offset >> blkbits;
	/*
	 * We can't just convert len to max_blocks because
	 * If blocksize = 4096 offset = 3072 and len = 2048
	 */
	max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
							- block;
	/*
	 * credits to insert 1 extent into extent tree
	 */
	credits = ext4_chunk_trans_blocks(inode, max_blocks);
	while (ret >= 0 && ret < max_blocks) {
		block = block + ret;
		max_blocks = max_blocks - ret;
		handle = ext4_journal_start(inode, credits);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			break;
		}
		map_bh.b_state = 0;
		ret = ext4_get_blocks(handle, inode, block,
				      max_blocks, &map_bh,
				      EXT4_GET_BLOCKS_DIO_CONVERT_EXT);
		if (ret <= 0) {
			WARN_ON(ret <= 0);
			printk(KERN_ERR "%s: ext4_ext_get_blocks "
				    "returned error inode#%lu, block=%u, "
				    "max_blocks=%u", __func__,
				    inode->i_ino, block, max_blocks);
		}
		ext4_mark_inode_dirty(handle, inode);
		ret2 = ext4_journal_stop(handle);
		if (ret <= 0 || ret2 )
			break;
	}
	return ret > 0 ? ret2 : ret;
}
/*
 * Callback function called for each extent to gather FIEMAP information.
 */
+3 −0
Original line number Diff line number Diff line
@@ -1233,6 +1233,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
	clear_buffer_mapped(bh);
	clear_buffer_unwritten(bh);

	ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
		  "logical block %lu\n", inode->i_ino, flags, max_blocks,
		  (unsigned long)block);
	/*
	 * Try to see if we can get the block without requesting a new
	 * file system block.
+1 −1
Original line number Diff line number Diff line
@@ -75,7 +75,7 @@ static int finish_range(handle_t *handle, struct inode *inode,
				goto err_out;
		}
	}
	retval = ext4_ext_insert_extent(handle, inode, path, &newext);
	retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0);
err_out:
	if (path) {
		ext4_ext_drop_refs(path);
Loading