Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 32159242 authored by Gui Hecheng's avatar Gui Hecheng Committed by Chris Mason
Browse files

btrfs: fix dead lock while running replace and defrag concurrently



This can be reproduced by fstests: btrfs/070

The scenario is like the following:

replace worker thread		defrag thread
---------------------		-------------
copy_nocow_pages_worker		btrfs_defrag_file
  copy_nocow_pages_for_inode	    ...
				  btrfs_writepages
  |A| lock_extent_bits		    extent_write_cache_pages
				|B|   lock_page
					__extent_writepage
		...			  writepage_delalloc
					    find_lock_delalloc_range
				|B| 	      lock_extent_bits
  find_or_create_page
    pagecache_get_page
  |A| lock_page

This leads to an ABBA pattern deadlock. To fix it,
o we just change it to an AABB pattern which means to @unlock_extent_bits()
  before we @lock_page(), and in this way the @extent_read_full_page_nolock()
  is no longer in an locked context, so change it back to @extent_read_full_page()
  to regain protection.

o Since we @unlock_extent_bits() earlier, then before @write_page_nocow(),
  the extent may not really point at the physical block we want, so we
  have to check it before write.

Signed-off-by: default avatarGui Hecheng <guihc.fnst@cn.fujitsu.com>
Tested-by: default avatarDavid Sterba <dsterba@suse.cz>
Signed-off-by: default avatarChris Mason <clm@fb.com>
parent 5f5bc6b1
Loading
Loading
Loading
Loading
+60 −30
Original line number Diff line number Diff line
@@ -3310,6 +3310,50 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
	scrub_pending_trans_workers_dec(sctx);
}

static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
				 u64 logical)
{
	struct extent_state *cached_state = NULL;
	struct btrfs_ordered_extent *ordered;
	struct extent_io_tree *io_tree;
	struct extent_map *em;
	u64 lockstart = start, lockend = start + len - 1;
	int ret = 0;

	io_tree = &BTRFS_I(inode)->io_tree;

	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
	if (ordered) {
		btrfs_put_ordered_extent(ordered);
		ret = 1;
		goto out_unlock;
	}

	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_unlock;
	}

	/*
	 * This extent does not actually cover the logical extent anymore,
	 * move on to the next inode.
	 */
	if (em->block_start > logical ||
	    em->block_start + em->block_len < logical + len) {
		free_extent_map(em);
		ret = 1;
		goto out_unlock;
	}
	free_extent_map(em);

out_unlock:
	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
			     GFP_NOFS);
	return ret;
}

static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
				      struct scrub_copy_nocow_ctx *nocow_ctx)
{
@@ -3318,13 +3362,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
	struct inode *inode;
	struct page *page;
	struct btrfs_root *local_root;
	struct btrfs_ordered_extent *ordered;
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	struct extent_io_tree *io_tree;
	u64 physical_for_dev_replace;
	u64 nocow_ctx_logical;
	u64 len = nocow_ctx->len;
	u64 lockstart = offset, lockend = offset + len - 1;
	unsigned long index;
	int srcu_index;
	int ret = 0;
@@ -3356,30 +3397,13 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,

	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
	io_tree = &BTRFS_I(inode)->io_tree;
	nocow_ctx_logical = nocow_ctx->logical;

	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
	if (ordered) {
		btrfs_put_ordered_extent(ordered);
		goto out_unlock;
	}

	em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_unlock;
	}

	/*
	 * This extent does not actually cover the logical extent anymore,
	 * move on to the next inode.
	 */
	if (em->block_start > nocow_ctx->logical ||
	    em->block_start + em->block_len < nocow_ctx->logical + len) {
		free_extent_map(em);
		goto out_unlock;
	ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
	if (ret) {
		ret = ret > 0 ? 0 : ret;
		goto out;
	}
	free_extent_map(em);

	while (len >= PAGE_CACHE_SIZE) {
		index = offset >> PAGE_CACHE_SHIFT;
@@ -3396,7 +3420,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
				goto next_page;
		} else {
			ClearPageError(page);
			err = extent_read_full_page_nolock(io_tree, page,
			err = extent_read_full_page(io_tree, page,
							   btrfs_get_extent,
							   nocow_ctx->mirror_num);
			if (err) {
@@ -3421,6 +3445,14 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
				goto next_page;
			}
		}

		ret = check_extent_to_block(inode, offset, len,
					    nocow_ctx_logical);
		if (ret) {
			ret = ret > 0 ? 0 : ret;
			goto next_page;
		}

		err = write_page_nocow(nocow_ctx->sctx,
				       physical_for_dev_replace, page);
		if (err)
@@ -3434,12 +3466,10 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,

		offset += PAGE_CACHE_SIZE;
		physical_for_dev_replace += PAGE_CACHE_SIZE;
		nocow_ctx_logical += PAGE_CACHE_SIZE;
		len -= PAGE_CACHE_SIZE;
	}
	ret = COPY_COMPLETE;
out_unlock:
	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
			     GFP_NOFS);
out:
	mutex_unlock(&inode->i_mutex);
	iput(inode);