Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 4a096752 authored by Chris Mason
Browse files

Btrfs: Data ordered fixes



* In btrfs_delete_inode, wait for ordered extents after calling
truncate_inode_pages.  This is much faster and more correct.

* Properly clear out the PageChecked bit everywhere we redirty the page.

* Change the writepage fixup handler to lock the page range and check to
see if an ordered extent has been inserted since the improperly dirtied
page was discovered.

* Wait for ordered extents outside the transaction.  This isn't required
for locking rules but does improve transaction latencies.

* Reduce contention on the alloc_mutex by dropping it while incrementing
refs on a node/leaf and while dropping refs on a leaf.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent e5a2217e
Loading
Loading
Loading
Loading
+15 −3
Original line number Diff line number Diff line
@@ -934,7 +934,6 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
	if (!root->ref_cows)
		return 0;

	mutex_lock(&root->fs_info->alloc_mutex);
	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);
	for (i = 0; i < nritems; i++) {
@@ -951,29 +950,36 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			if (disk_bytenr == 0)
				continue;

			mutex_lock(&root->fs_info->alloc_mutex);
			ret = __btrfs_inc_extent_ref(trans, root, disk_bytenr,
				    btrfs_file_extent_disk_num_bytes(buf, fi),
				    root->root_key.objectid, trans->transid,
				    key.objectid, key.offset);
			mutex_unlock(&root->fs_info->alloc_mutex);
			if (ret) {
				faili = i;
				WARN_ON(1);
				goto fail;
			}
		} else {
			bytenr = btrfs_node_blockptr(buf, i);
			btrfs_node_key_to_cpu(buf, &key, i);

			mutex_lock(&root->fs_info->alloc_mutex);
			ret = __btrfs_inc_extent_ref(trans, root, bytenr,
					   btrfs_level_size(root, level - 1),
					   root->root_key.objectid,
					   trans->transid,
					   level - 1, key.objectid);
			mutex_unlock(&root->fs_info->alloc_mutex);
			if (ret) {
				faili = i;
				WARN_ON(1);
				goto fail;
			}
		}
	}
	mutex_unlock(&root->fs_info->alloc_mutex);
	return 0;
fail:
	WARN_ON(1);
@@ -1004,7 +1010,6 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		}
	}
#endif
	mutex_unlock(&root->fs_info->alloc_mutex);
	return ret;
}

@@ -2180,6 +2185,8 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
	leaf_owner = btrfs_header_owner(leaf);
	leaf_generation = btrfs_header_generation(leaf);

	mutex_unlock(&root->fs_info->alloc_mutex);

	for (i = 0; i < nritems; i++) {
		u64 disk_bytenr;

@@ -2197,12 +2204,17 @@ static int noinline drop_leaf_ref(struct btrfs_trans_handle *trans,
		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		if (disk_bytenr == 0)
			continue;

		mutex_lock(&root->fs_info->alloc_mutex);
		ret = __btrfs_free_extent(trans, root, disk_bytenr,
				btrfs_file_extent_disk_num_bytes(leaf, fi),
				leaf_owner, leaf_generation,
				key.objectid, key.offset, 0);
		mutex_unlock(&root->fs_info->alloc_mutex);
		BUG_ON(ret);
	}

	mutex_lock(&root->fs_info->alloc_mutex);
	return 0;
}

+1 −0
Original line number Diff line number Diff line
@@ -75,6 +75,7 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages)
	for (i = 0; i < num_pages; i++) {
		if (!pages[i])
			break;
		ClearPageChecked(pages[i]);
		unlock_page(pages[i]);
		mark_page_accessed(pages[i]);
		page_cache_release(pages[i]);
+18 −5
Original line number Diff line number Diff line
@@ -418,7 +418,7 @@ void btrfs_writepage_fixup_worker(struct btrfs_work *work)

	fixup = container_of(work, struct btrfs_writepage_fixup, work);
	page = fixup->page;

again:
	lock_page(page);
	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
		ClearPageChecked(page);
@@ -430,9 +430,21 @@ void btrfs_writepage_fixup_worker(struct btrfs_work *work)
	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;

	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered)

	/* already ordered? We're done */
	if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
			     EXTENT_ORDERED, 0)) {
		goto out;
	}

	ordered = btrfs_lookup_ordered_extent(inode, page_start);
	if (ordered) {
		unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
			      page_end, GFP_NOFS);
		unlock_page(page);
		btrfs_start_ordered_extent(inode, ordered, 1);
		goto again;
	}

	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start, page_end,
			    GFP_NOFS);
@@ -1465,11 +1477,11 @@ void btrfs_delete_inode(struct inode *inode)
	unsigned long nr;
	int ret;

	btrfs_wait_ordered_range(inode, 0, (u64)-1);
	truncate_inode_pages(&inode->i_data, 0);
	if (is_bad_inode(inode)) {
		goto no_delete;
	}
	btrfs_wait_ordered_range(inode, 0, (u64)-1);

	btrfs_i_size_write(inode, 0);
	trans = btrfs_start_transaction(root, 1);
@@ -2707,6 +2719,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
		 1, 1, GFP_NOFS);
	__btrfs_releasepage(page, GFP_NOFS);

	ClearPageChecked(page);
	if (PagePrivate(page)) {
		invalidate_extent_lru(tree, page_offset(page),
				      PAGE_CACHE_SIZE);
@@ -2818,10 +2831,10 @@ static void btrfs_truncate(struct inode *inode)
		return;

	btrfs_truncate_page(inode->i_mapping, inode->i_size);
	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);

	trans = btrfs_start_transaction(root, 1);
	btrfs_set_trans_block_group(trans, inode);
	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
	btrfs_i_size_write(inode, inode->i_size);

	/* FIXME, add redo link to tree so we don't leak on crash */
+9 −2
Original line number Diff line number Diff line
@@ -336,7 +336,7 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
		orig_end = start + len - 1;
		wait_end = orig_end;
	}

again:
	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
@@ -369,6 +369,14 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
			break;
		end--;
	}
	if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
			   EXTENT_ORDERED | EXTENT_DELALLOC, 0)) {
		printk("inode %lu still ordered or delalloc after wait "
		       "%llu %llu\n", inode->i_ino,
		       (unsigned long long)start,
		       (unsigned long long)orig_end);
		goto again;
	}
}

/*
@@ -545,7 +553,6 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
			sector_sums = &ordered_sum->sums;
			for (i = 0; i < num_sectors; i++) {
				if (sector_sums[i].offset == offset) {
printk("find ordered sum inode %lu offset %Lu\n", inode->i_ino, offset);
					*sum = sector_sums[i].sum;
					ret = 0;
					goto out;