Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit bc42bda2 authored by Qu Wenruo's avatar Qu Wenruo Committed by David Sterba
Browse files

btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges



[BUG]
For the following case, btrfs can underflow qgroup reserved space
at an error path:
(Page size 4K, function name without "btrfs_" prefix)

         Task A                  |             Task B
----------------------------------------------------------------------
Buffered_write [0, 2K)           |
|- check_data_free_space()       |
|  |- qgroup_reserve_data()      |
|     Range aligned to page      |
|     range [0, 4K)          <<< |
|     4K bytes reserved      <<< |
|- copy pages to page cache      |
                                 | Buffered_write [2K, 4K)
                                 | |- check_data_free_space()
                                 | |  |- qgroup_reserve_data()
                                 | |     Range aligned to page
                                 | |     range [0, 4K)
                                 | |     Already reserved by A <<<
                                 | |     0 bytes reserved      <<<
                                 | |- delalloc_reserve_metadata()
                                 | |  And it *FAILED* (Maybe EQUOTA)
                                 | |- free_reserved_data_space()
                                      |- qgroup_free_data()
                                         Range aligned to page range
                                         [0, 4K)
                                         Freeing 4K
(Special thanks to Chandan for the detailed report and analysis)

[CAUSE]
Above Task B is freeing reserved data range [0, 4K) which is actually
reserved by Task A.

And at writeback time, page dirty by Task A will go through writeback
routine, which will free 4K reserved data space at file extent insert
time, causing the qgroup underflow.

[FIX]
For btrfs_qgroup_free_data(), add @reserved parameter to only free
data ranges reserved by previous btrfs_qgroup_reserve_data().
So in above case, Task B will try to free 0 byte, so no underflow.

Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent 364ecf36
Loading
Loading
Loading
Loading
+4 −2
Original line number Diff line number Diff line
@@ -2711,7 +2711,10 @@ enum btrfs_flush_state {
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
int btrfs_check_data_free_space(struct inode *inode,
			struct extent_changeset **reserved, u64 start, u64 len);
void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
void btrfs_free_reserved_data_space(struct inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len);
void btrfs_delalloc_release_space(struct inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len);
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
					    u64 len);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2730,7 +2733,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_space(struct inode *inode,
			struct extent_changeset **reserved, u64 start, u64 len);
void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
					      unsigned short type);
+7 −5
Original line number Diff line number Diff line
@@ -4389,7 +4389,8 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 * This one will handle the per-inode data rsv map for accurate reserved
 * space framework.
 */
void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
void btrfs_free_reserved_data_space(struct inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;

@@ -4399,7 +4400,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
	start = round_down(start, root->fs_info->sectorsize);

	btrfs_free_reserved_data_space_noquota(inode, start, len);
	btrfs_qgroup_free_data(inode, start, len);
	btrfs_qgroup_free_data(inode, reserved, start, len);
}

static void force_metadata_allocation(struct btrfs_fs_info *info)
@@ -6204,7 +6205,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
		return ret;
	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len);
	if (ret < 0)
		btrfs_free_reserved_data_space(inode, start, len);
		btrfs_free_reserved_data_space(inode, *reserved, start, len);
	return ret;
}

@@ -6223,10 +6224,11 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
 * list if there are no delalloc bytes left.
 * Also it will handle the qgroup reserved space.
 */
void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
void btrfs_delalloc_release_space(struct inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len)
{
	btrfs_delalloc_release_metadata(BTRFS_I(inode), len);
	btrfs_free_reserved_data_space(inode, start, len);
	btrfs_free_reserved_data_space(inode, reserved, start, len);
}

static int update_block_group(struct btrfs_trans_handle *trans,
+16 −13
Original line number Diff line number Diff line
@@ -1660,7 +1660,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
				reserve_bytes);
		if (ret) {
			if (!only_release_metadata)
				btrfs_free_reserved_data_space(inode, pos,
				btrfs_free_reserved_data_space(inode,
						data_reserved, pos,
						write_bytes);
			else
				btrfs_end_write_no_snapshoting(root);
@@ -1743,7 +1744,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
				__pos = round_down(pos,
						   fs_info->sectorsize) +
					(dirty_pages << PAGE_SHIFT);
				btrfs_delalloc_release_space(inode, __pos,
				btrfs_delalloc_release_space(inode,
						data_reserved, __pos,
						release_bytes);
			}
		}
@@ -1799,7 +1801,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
					release_bytes);
		} else {
			btrfs_delalloc_release_space(inode,
			btrfs_delalloc_release_space(inode, data_reserved,
					round_down(pos, fs_info->sectorsize),
					release_bytes);
		}
@@ -2918,8 +2920,8 @@ static long btrfs_fallocate(struct file *file, int mode,
			 * range, free reserved data space first, otherwise
			 * it'll result in false ENOSPC error.
			 */
			btrfs_free_reserved_data_space(inode, cur_offset,
				last_byte - cur_offset);
			btrfs_free_reserved_data_space(inode, data_reserved,
					cur_offset, last_byte - cur_offset);
		}
		free_extent_map(em);
		cur_offset = last_byte;
@@ -2938,7 +2940,8 @@ static long btrfs_fallocate(struct file *file, int mode,
					range->len, i_blocksize(inode),
					offset + len, &alloc_hint);
		else
			btrfs_free_reserved_data_space(inode, range->start,
			btrfs_free_reserved_data_space(inode,
					data_reserved, range->start,
					range->len);
		list_del(&range->list);
		kfree(range);
@@ -2977,8 +2980,8 @@ static long btrfs_fallocate(struct file *file, int mode,
	inode_unlock(inode);
	/* Let go of our reservation. */
	if (ret != 0)
		btrfs_free_reserved_data_space(inode, alloc_start,
				       alloc_end - cur_offset);
		btrfs_free_reserved_data_space(inode, data_reserved,
				alloc_start, alloc_end - cur_offset);
	extent_changeset_free(data_reserved);
	return ret;
}
+15 −14
Original line number Diff line number Diff line
@@ -345,7 +345,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
	 * And at reserve time, it's always aligned to page size, so
	 * just free one page here.
	 */
	btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
@@ -2935,7 +2935,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
		 * space for NOCOW range.
		 * As NOCOW won't cause a new delayed ref, just free the space
		 */
		btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
		btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
				       ordered_extent->len);
		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
		if (nolock)
@@ -4794,7 +4794,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
again:
	page = find_or_create_page(mapping, index, mask);
	if (!page) {
		btrfs_delalloc_release_space(inode,
		btrfs_delalloc_release_space(inode, data_reserved,
				round_down(from, blocksize),
				blocksize);
		ret = -ENOMEM;
@@ -4866,7 +4866,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,

out_unlock:
	if (ret)
		btrfs_delalloc_release_space(inode, block_start,
		btrfs_delalloc_release_space(inode, data_reserved, block_start,
					     blocksize);
	unlock_page(page);
	put_page(page);
@@ -5266,7 +5266,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
		 * Note, end is the bytenr of last byte, so we need + 1 here.
		 */
		if (state->state & EXTENT_DELALLOC)
			btrfs_qgroup_free_data(inode, start, end - start + 1);
			btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);

		clear_extent_bit(io_tree, start, end,
				 EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8792,8 +8792,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
		current->journal_info = NULL;
		if (ret < 0 && ret != -EIOCBQUEUED) {
			if (dio_data.reserve)
				btrfs_delalloc_release_space(inode, offset,
							     dio_data.reserve);
				btrfs_delalloc_release_space(inode, data_reserved,
					offset, dio_data.reserve);
			/*
			 * On error we might have left some ordered extents
			 * without submitting corresponding bios for them, so
@@ -8808,8 +8808,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
					dio_data.unsubmitted_oe_range_start,
					false);
		} else if (ret >= 0 && (size_t)ret < count)
			btrfs_delalloc_release_space(inode, offset,
						     count - (size_t)ret);
			btrfs_delalloc_release_space(inode, data_reserved,
					offset, count - (size_t)ret);
	}
out:
	if (wakeup)
@@ -9008,7 +9008,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
	 *    free the entire extent.
	 */
	if (PageDirty(page))
		btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
		btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
	if (!inode_evicting) {
		clear_extent_bit(tree, page_start, page_end,
				 EXTENT_LOCKED | EXTENT_DIRTY |
@@ -9130,8 +9130,8 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents++;
			spin_unlock(&BTRFS_I(inode)->lock);
			btrfs_delalloc_release_space(inode, page_start,
						PAGE_SIZE - reserved_space);
			btrfs_delalloc_release_space(inode, data_reserved,
					page_start, PAGE_SIZE - reserved_space);
		}
	}

@@ -9187,7 +9187,8 @@ int btrfs_page_mkwrite(struct vm_fault *vmf)
	}
	unlock_page(page);
out:
	btrfs_delalloc_release_space(inode, page_start, reserved_space);
	btrfs_delalloc_release_space(inode, data_reserved, page_start,
				     reserved_space);
out_noreserve:
	sb_end_pagefault(inode->i_sb);
	extent_changeset_free(data_reserved);
@@ -10557,7 +10558,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
			btrfs_end_transaction(trans);
	}
	if (cur_offset < end)
		btrfs_free_reserved_data_space(inode, cur_offset,
		btrfs_free_reserved_data_space(inode, NULL, cur_offset,
			end - cur_offset + 1);
	return ret;
}
+2 −2
Original line number Diff line number Diff line
@@ -1227,7 +1227,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->outstanding_extents++;
		spin_unlock(&BTRFS_I(inode)->lock);
		btrfs_delalloc_release_space(inode,
		btrfs_delalloc_release_space(inode, data_reserved,
				start_index << PAGE_SHIFT,
				(page_cnt - i_done) << PAGE_SHIFT);
	}
@@ -1255,7 +1255,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
		unlock_page(pages[i]);
		put_page(pages[i]);
	}
	btrfs_delalloc_release_space(inode,
	btrfs_delalloc_release_space(inode, data_reserved,
			start_index << PAGE_SHIFT,
			page_cnt << PAGE_SHIFT);
	extent_changeset_free(data_reserved);
Loading