Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 27dd4385 authored by Lukas Czerner's avatar Lukas Czerner Committed by Theodore Ts'o
Browse files

ext4: introduce reserved space



Currently in ENOSPC condition when writing into unwritten space, or
punching a hole, we might need to split the extent and grow extent tree.
However since we can not allocate any new metadata blocks we'll have to
zero out unwritten part of extent or punched out part of extent, or in
the worst case return ENOSPC even though the user actually does not allocate
any space.

Also in delalloc path we do reserve metadata and data blocks for the
time we're going to write out, however metadata block reservation is
very tricky especially since we expect that logical connectivity implies
physical connectivity, however that might not be the case and hence we
might end up allocating more metadata blocks than previously reserved.
So in future, metadata reservation checks should be removed since we can
not assure that we do not under reserve.

And this is where reserved space comes into the picture. When mounting
the file system we slice off a little bit of the file system space (2%
or 4096 clusters, whichever is smaller) which can be then used for the
cases mentioned above to prevent costly zeroout, or unexpected ENOSPC.

The number of reserved clusters can be set via sysfs, however it can
never be bigger than number of free clusters in the file system.

Note that this patch fixes the failure of xfstest 274 as expected.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
parent f45a5ef9
Loading
Loading
Loading
Loading
+11 −0
Original line number Original line Diff line number Diff line
@@ -494,6 +494,17 @@ Files in /sys/fs/ext4/<devname>
 session_write_kbytes         This file is read-only and shows the number of
 session_write_kbytes         This file is read-only and shows the number of
                              kilobytes of data that have been written to this
                              kilobytes of data that have been written to this
                              filesystem since it was mounted.
                              filesystem since it was mounted.

 reserved_clusters            This is RW file and contains number of reserved
                              clusters in the file system which will be used
                              in the specific situations to avoid costly
                              zeroout, unexpected ENOSPC, or possible data
                              loss. The default is 2% or 4096 clusters,
                              whichever is smaller and this can be changed
                              however it can never exceed number of clusters
                             in the file system. If there is not enough space
                             for the reserved space when mounting the file
                             system, the mount will _not_ fail.
..............................................................................
..............................................................................


Ioctls
Ioctls
+13 −5
Original line number Original line Diff line number Diff line
@@ -499,20 +499,22 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
				  s64 nclusters, unsigned int flags)
				  s64 nclusters, unsigned int flags)
{
{
	s64 free_clusters, dirty_clusters, root_clusters;
	s64 free_clusters, dirty_clusters, rsv, resv_clusters;
	struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
	struct percpu_counter *fcc = &sbi->s_freeclusters_counter;
	struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;
	struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter;


	free_clusters  = percpu_counter_read_positive(fcc);
	free_clusters  = percpu_counter_read_positive(fcc);
	dirty_clusters = percpu_counter_read_positive(dcc);
	dirty_clusters = percpu_counter_read_positive(dcc);
	resv_clusters = atomic64_read(&sbi->s_resv_clusters);


	/*
	/*
	 * r_blocks_count should always be multiple of the cluster ratio so
	 * r_blocks_count should always be multiple of the cluster ratio so
	 * we are safe to do a plane bit shift only.
	 * we are safe to do a plane bit shift only.
	 */
	 */
	root_clusters = ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
	rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) +
	      resv_clusters;


	if (free_clusters - (nclusters + root_clusters + dirty_clusters) <
	if (free_clusters - (nclusters + rsv + dirty_clusters) <
					EXT4_FREECLUSTERS_WATERMARK) {
					EXT4_FREECLUSTERS_WATERMARK) {
		free_clusters  = percpu_counter_sum_positive(fcc);
		free_clusters  = percpu_counter_sum_positive(fcc);
		dirty_clusters = percpu_counter_sum_positive(dcc);
		dirty_clusters = percpu_counter_sum_positive(dcc);
@@ -520,7 +522,7 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
	/* Check whether we have space after accounting for current
	/* Check whether we have space after accounting for current
	 * dirty clusters & root reserved clusters.
	 * dirty clusters & root reserved clusters.
	 */
	 */
	if (free_clusters >= ((root_clusters + nclusters) + dirty_clusters))
	if (free_clusters >= (rsv + nclusters + dirty_clusters))
		return 1;
		return 1;


	/* Hm, nope.  Are (enough) root reserved clusters available? */
	/* Hm, nope.  Are (enough) root reserved clusters available? */
@@ -529,6 +531,12 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi,
	    capable(CAP_SYS_RESOURCE) ||
	    capable(CAP_SYS_RESOURCE) ||
	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {
	    (flags & EXT4_MB_USE_ROOT_BLOCKS)) {


		if (free_clusters >= (nclusters + dirty_clusters +
				      resv_clusters))
			return 1;
	}
	/* No free blocks. Let's see if we can dip into reserved pool */
	if (flags & EXT4_MB_USE_RESERVED) {
		if (free_clusters >= (nclusters + dirty_clusters))
		if (free_clusters >= (nclusters + dirty_clusters))
			return 1;
			return 1;
	}
	}
+8 −5
Original line number Original line Diff line number Diff line
@@ -121,6 +121,8 @@ typedef unsigned int ext4_group_t;
#define EXT4_MB_STREAM_ALLOC		0x0800
#define EXT4_MB_STREAM_ALLOC		0x0800
/* Use reserved root blocks if needed */
/* Use reserved root blocks if needed */
#define EXT4_MB_USE_ROOT_BLOCKS		0x1000
#define EXT4_MB_USE_ROOT_BLOCKS		0x1000
/* Use blocks from reserved pool */
#define EXT4_MB_USE_RESERVED		0x2000


struct ext4_allocation_request {
struct ext4_allocation_request {
	/* target inode for block we're allocating */
	/* target inode for block we're allocating */
@@ -557,9 +559,8 @@ enum {
#define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002
#define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002
#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
						 EXT4_GET_BLOCKS_CREATE)
						 EXT4_GET_BLOCKS_CREATE)
	/* Caller is from the delayed allocation writeout path,
	/* Caller is from the delayed allocation writeout path
	   so set the magic i_delalloc_reserve_flag after taking the
	 * finally doing the actual allocation of delayed blocks */
	   inode allocation semaphore for */
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
	/* caller is from the direct IO path, request to creation of an
	/* caller is from the direct IO path, request to creation of an
	unitialized extents if not allocated, split the uninitialized
	unitialized extents if not allocated, split the uninitialized
@@ -571,8 +572,9 @@ enum {
	/* Convert extent to initialized after IO complete */
	/* Convert extent to initialized after IO complete */
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
#define EXT4_GET_BLOCKS_IO_CONVERT_EXT		(EXT4_GET_BLOCKS_CONVERT|\
					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
					 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT)
	/* Punch out blocks of an extent */
	/* Eventual metadata allocation (due to growing extent tree)
#define EXT4_GET_BLOCKS_PUNCH_OUT_EXT		0x0020
	 * should not fail, so try to use reserved blocks for that.*/
#define EXT4_GET_BLOCKS_METADATA_NOFAIL		0x0020
	/* Don't normalize allocation size (used for fallocate) */
	/* Don't normalize allocation size (used for fallocate) */
#define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
#define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
	/* Request will not result in inode size update (user for fallocate) */
	/* Request will not result in inode size update (user for fallocate) */
@@ -1188,6 +1190,7 @@ struct ext4_sb_info {
	unsigned int s_mount_flags;
	unsigned int s_mount_flags;
	unsigned int s_def_mount_opt;
	unsigned int s_def_mount_opt;
	ext4_fsblk_t s_sb_block;
	ext4_fsblk_t s_sb_block;
	atomic64_t s_resv_clusters;
	kuid_t s_resuid;
	kuid_t s_resuid;
	kgid_t s_resgid;
	kgid_t s_resgid;
	unsigned short s_mount_state;
	unsigned short s_mount_state;
+18 −9
Original line number Original line Diff line number Diff line
@@ -1942,8 +1942,8 @@ prepend:
	 * There is no free space in the found leaf.
	 * There is no free space in the found leaf.
	 * We're gonna add a new leaf in the tree.
	 * We're gonna add a new leaf in the tree.
	 */
	 */
	if (flag & EXT4_GET_BLOCKS_PUNCH_OUT_EXT)
	if (flag & EXT4_GET_BLOCKS_METADATA_NOFAIL)
		flags = EXT4_MB_USE_ROOT_BLOCKS;
		flags = EXT4_MB_USE_RESERVED;
	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
	err = ext4_ext_create_new_leaf(handle, inode, flags, path, newext);
	if (err)
	if (err)
		goto cleanup;
		goto cleanup;
@@ -2729,12 +2729,14 @@ again:


			/*
			/*
			 * Split the extent in two so that 'end' is the last
			 * Split the extent in two so that 'end' is the last
			 * block in the first new extent
			 * block in the first new extent. Also we should not
			 * fail removing space due to ENOSPC so try to use
			 * reserved block if that happens.
			 */
			 */
			err = ext4_split_extent_at(handle, inode, path,
			err = ext4_split_extent_at(handle, inode, path,
					end + 1, split_flag,
					end + 1, split_flag,
					EXT4_GET_BLOCKS_PRE_IO |
					EXT4_GET_BLOCKS_PRE_IO |
						EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
					EXT4_GET_BLOCKS_METADATA_NOFAIL);


			if (err < 0)
			if (err < 0)
				goto out;
				goto out;
@@ -3209,7 +3211,8 @@ out:
static int ext4_ext_convert_to_initialized(handle_t *handle,
static int ext4_ext_convert_to_initialized(handle_t *handle,
					   struct inode *inode,
					   struct inode *inode,
					   struct ext4_map_blocks *map,
					   struct ext4_map_blocks *map,
					   struct ext4_ext_path *path)
					   struct ext4_ext_path *path,
					   int flags)
{
{
	struct ext4_sb_info *sbi;
	struct ext4_sb_info *sbi;
	struct ext4_extent_header *eh;
	struct ext4_extent_header *eh;
@@ -3435,7 +3438,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
	}
	}


	allocated = ext4_split_extent(handle, inode, path,
	allocated = ext4_split_extent(handle, inode, path,
				      &split_map, split_flag, 0);
				      &split_map, split_flag, flags);
	if (allocated < 0)
	if (allocated < 0)
		err = allocated;
		err = allocated;


@@ -3755,6 +3758,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
		  flags, allocated);
		  flags, allocated);
	ext4_ext_show_leaf(inode, path);
	ext4_ext_show_leaf(inode, path);


	/*
	 * When writing into uninitialized space, we should not fail to
	 * allocate metadata blocks for the new extent block if needed.
	 */
	flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL;

	trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
	trace_ext4_ext_handle_uninitialized_extents(inode, map, flags,
						    allocated, newblock);
						    allocated, newblock);


@@ -3818,7 +3827,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
	}
	}


	/* buffered write, writepage time, convert*/
	/* buffered write, writepage time, convert*/
	ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
	ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags);
	if (ret >= 0)
	if (ret >= 0)
		ext4_update_inode_fsync_trans(handle, inode, 1);
		ext4_update_inode_fsync_trans(handle, inode, 1);
out:
out:
+10 −1
Original line number Original line Diff line number Diff line
@@ -1688,12 +1688,21 @@ static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
	 */
	 */
	map.m_lblk = next;
	map.m_lblk = next;
	map.m_len = max_blocks;
	map.m_len = max_blocks;
	get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
	/*
	 * We're in delalloc path and it is possible that we're going to
	 * need more metadata blocks than previously reserved. However
	 * we must not fail because we're in writeback and there is
	 * nothing we can do about it so it might result in data loss.
	 * So use reserved blocks to allocate metadata if possible.
	 */
	get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
			   EXT4_GET_BLOCKS_METADATA_NOFAIL;
	if (ext4_should_dioread_nolock(mpd->inode))
	if (ext4_should_dioread_nolock(mpd->inode))
		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
		get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
	if (mpd->b_state & (1 << BH_Delay))
	if (mpd->b_state & (1 << BH_Delay))
		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
		get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;



	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
	blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
	if (blks < 0) {
	if (blks < 0) {
		struct super_block *sb = mpd->inode->i_sb;
		struct super_block *sb = mpd->inode->i_sb;
Loading