Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e6dcd2dc authored by Chris Mason
Browse files

Btrfs: New data=ordered implementation



The old data=ordered code would force commit to wait until
all the data extents from the transaction were fully on disk.  This
introduced large latencies into the commit and stalled new writers
in the transaction for a long time.

The new code changes the way data allocations and extents work:

* When delayed allocation is filled, data extents are reserved, and
  the extent bit EXTENT_ORDERED is set on the entire range of the extent.
  A struct btrfs_ordered_extent is allocated and inserted into a per-inode
  rbtree to track the pending extents.

* As each page is written EXTENT_ORDERED is cleared on the bytes corresponding
  to that page.

* When all of the bytes corresponding to a single struct btrfs_ordered_extent
  are written, the previously reserved extent is inserted into the FS
  btree and into the extent allocation trees.  The checksums for the file
  data are also updated.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent 77a41afb
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@

#include "extent_map.h"
#include "extent_io.h"
#include "ordered-data.h"

/* in memory btrfs inode */
struct btrfs_inode {
@@ -32,9 +33,8 @@ struct btrfs_inode {
	struct extent_io_tree io_failure_tree;
	struct mutex csum_mutex;
	struct inode vfs_inode;
	atomic_t ordered_writeback;
	struct btrfs_ordered_inode_tree ordered_tree;

	u64 ordered_trans;
	/*
	 * transid of the trans_handle that last modified this inode
	 */
+17 −2
Original line number Diff line number Diff line
@@ -25,6 +25,7 @@
#include <linux/fs.h>
#include <linux/completion.h>
#include <linux/backing-dev.h>
#include <linux/wait.h>
#include <asm/kmap_types.h>
#include "bit-radix.h"
#include "extent_io.h"
@@ -37,6 +38,7 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
extern struct kmem_cache *btrfs_transaction_cachep;
extern struct kmem_cache *btrfs_bit_radix_cachep;
extern struct kmem_cache *btrfs_path_cachep;
struct btrfs_ordered_sum;

#define BTRFS_MAGIC "_B5RfS_M"

@@ -510,6 +512,7 @@ struct btrfs_fs_info {
	u64 max_inline;
	u64 alloc_start;
	struct btrfs_transaction *running_transaction;
	wait_queue_head_t transaction_throttle;
	struct btrfs_super_block super_copy;
	struct btrfs_super_block super_for_commit;
	struct block_device *__bdev;
@@ -541,6 +544,7 @@ struct btrfs_fs_info {
	 */
	struct btrfs_workers workers;
	struct btrfs_workers endio_workers;
	struct btrfs_workers endio_write_workers;
	struct btrfs_workers submit_workers;
	struct task_struct *transaction_kthread;
	struct task_struct *cleaner_kthread;
@@ -1384,6 +1388,17 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
		       u64 owner, u64 owner_offset,
		       u64 empty_size, u64 hint_byte,
		       u64 search_end, struct btrfs_key *ins, u64 data);
/*
 * Second half of a split allocation: record an extent that was reserved
 * earlier.  @ins is the key describing that extent; the reference is
 * identified by root_objectid, ref_generation, owner and owner_offset.
 */
int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				u64 root_objectid, u64 ref_generation,
				u64 owner, u64 owner_offset,
				struct btrfs_key *ins);
/*
 * First half of a split allocation: reserve an extent and report it
 * through @ins.  Takes the same sizing and search arguments as
 * btrfs_alloc_extent(), minus the reference description.
 */
int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  u64 num_bytes, u64 min_alloc_size,
				  u64 empty_size, u64 hint_byte,
				  u64 search_end, struct btrfs_key *ins,
				  u64 data);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
@@ -1556,9 +1571,9 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
			     u64 bytenr, int mod);
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct inode *inode,
			   struct bio *bio, char *sums);
			   struct btrfs_ordered_sum *sums);
int btrfs_csum_one_bio(struct btrfs_root *root,
		       struct bio *bio, char **sums_ret);
		       struct bio *bio, struct btrfs_ordered_sum **sums_ret);
struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
+12 −1
Original line number Diff line number Diff line
@@ -407,6 +407,10 @@ static int end_workqueue_bio(struct bio *bio,
	end_io_wq->error = err;
	end_io_wq->work.func = end_workqueue_fn;
	end_io_wq->work.flags = 0;
	if (bio->bi_rw & (1 << BIO_RW))
		btrfs_queue_worker(&fs_info->endio_write_workers,
				   &end_io_wq->work);
	else
		btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);

#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
@@ -1286,6 +1290,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
	mutex_init(&fs_info->transaction_kthread_mutex);
	mutex_init(&fs_info->cleaner_mutex);
	mutex_init(&fs_info->volume_mutex);
	init_waitqueue_head(&fs_info->transaction_throttle);

#if 0
	ret = add_hasher(fs_info, "crc32c");
@@ -1325,9 +1330,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
	btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
	btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
	btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
	btrfs_init_workers(&fs_info->endio_write_workers,
			   fs_info->thread_pool_size);
	btrfs_start_workers(&fs_info->workers, 1);
	btrfs_start_workers(&fs_info->submit_workers, 1);
	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
	btrfs_start_workers(&fs_info->endio_write_workers,
			    fs_info->thread_pool_size);

	err = -EINVAL;
	if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
@@ -1447,6 +1456,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
	btrfs_stop_workers(&fs_info->workers);
	btrfs_stop_workers(&fs_info->endio_workers);
	btrfs_stop_workers(&fs_info->endio_write_workers);
	btrfs_stop_workers(&fs_info->submit_workers);
fail_iput:
	iput(fs_info->btree_inode);
@@ -1702,6 +1712,7 @@ int close_ctree(struct btrfs_root *root)

	btrfs_stop_workers(&fs_info->workers);
	btrfs_stop_workers(&fs_info->endio_workers);
	btrfs_stop_workers(&fs_info->endio_write_workers);
	btrfs_stop_workers(&fs_info->submit_workers);

	iput(fs_info->btree_inode);
+93 −39
Original line number Diff line number Diff line
@@ -1895,36 +1895,17 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
	return ret;
}

/*
 * finds a free extent and does all the dirty work required for allocation
 * returns the key for the extent through ins, and a tree buffer for
 * the first block of the extent through buf.
 *
 * returns 0 if everything worked, non-zero otherwise.
 */
int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  u64 num_bytes, u64 min_alloc_size,
		       u64 root_objectid, u64 ref_generation,
		       u64 owner, u64 owner_offset,
				  u64 empty_size, u64 hint_byte,
		       u64 search_end, struct btrfs_key *ins, u64 data)
				  u64 search_end, struct btrfs_key *ins,
				  u64 data)
{
	int ret;
	int pending_ret;
	u64 super_used;
	u64 root_used;
	u64 search_start = 0;
	u64 alloc_profile;
	u32 sizes[2];
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_root *extent_root = info->extent_root;
	struct btrfs_extent_item *extent_item;
	struct btrfs_extent_ref *ref;
	struct btrfs_path *path;
	struct btrfs_key keys[2];

	maybe_lock_mutex(root);

	if (data) {
		alloc_profile = info->avail_data_alloc_bits &
@@ -1974,11 +1955,48 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
	}
	if (ret) {
		printk("allocation failed flags %Lu\n", data);
	}
	if (ret) {
		BUG();
		goto out;
	}
	clear_extent_dirty(&root->fs_info->free_space_cache,
			   ins->objectid, ins->objectid + ins->offset - 1,
			   GFP_NOFS);
	return 0;
}

/*
 * Locked entry point for reserving an extent.
 *
 * Wraps __btrfs_reserve_extent() in a maybe_lock_mutex() /
 * maybe_unlock_mutex() pair on @root and forwards every argument
 * unchanged.  The reserved range is reported through @ins.
 *
 * Returns whatever __btrfs_reserve_extent() returned.
 */
int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 u64 num_bytes, u64 min_alloc_size,
			 u64 empty_size, u64 hint_byte,
			 u64 search_end, struct btrfs_key *ins,
			 u64 data)
{
	int err;

	maybe_lock_mutex(root);
	err = __btrfs_reserve_extent(trans, root, num_bytes,
				     min_alloc_size, empty_size,
				     hint_byte, search_end, ins, data);
	maybe_unlock_mutex(root);

	return err;
}

static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 u64 root_objectid, u64 ref_generation,
					 u64 owner, u64 owner_offset,
					 struct btrfs_key *ins)
{
	int ret;
	int pending_ret;
	u64 super_used;
	u64 root_used;
	u64 num_bytes = ins->offset;
	u32 sizes[2];
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_root *extent_root = info->extent_root;
	struct btrfs_extent_item *extent_item;
	struct btrfs_extent_ref *ref;
	struct btrfs_path *path;
	struct btrfs_key keys[2];

	/* block accounting for super block */
	spin_lock_irq(&info->delalloc_lock);
@@ -1990,10 +2008,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
	root_used = btrfs_root_used(&root->root_item);
	btrfs_set_root_used(&root->root_item, root_used + num_bytes);

	clear_extent_dirty(&root->fs_info->free_space_cache,
			   ins->objectid, ins->objectid + ins->offset - 1,
			   GFP_NOFS);

	if (root == extent_root) {
		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
				ins->objectid + ins->offset - 1,
@@ -2001,10 +2015,6 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
		goto update_block;
	}

	WARN_ON(trans->alloc_exclude_nr);
	trans->alloc_exclude_start = ins->objectid;
	trans->alloc_exclude_nr = ins->offset;

	memcpy(&keys[0], ins, sizeof(*ins));
	keys[1].offset = hash_extent_ref(root_objectid, ref_generation,
					 owner, owner_offset);
@@ -2054,6 +2064,51 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
		BUG();
	}
out:
	return ret;
}

/*
 * Locked entry point for recording a previously reserved extent.
 *
 * Wraps __btrfs_alloc_reserved_extent() in a maybe_lock_mutex() /
 * maybe_unlock_mutex() pair on @root.  @ins is the key of the extent
 * being recorded; the remaining arguments describe the reference and
 * are forwarded unchanged.
 *
 * Returns whatever __btrfs_alloc_reserved_extent() returned.
 */
int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				u64 root_objectid, u64 ref_generation,
				u64 owner, u64 owner_offset,
				struct btrfs_key *ins)
{
	int err;

	maybe_lock_mutex(root);
	err = __btrfs_alloc_reserved_extent(trans, root,
					    root_objectid, ref_generation,
					    owner, owner_offset, ins);
	maybe_unlock_mutex(root);

	return err;
}
/*
 * One-shot allocation: reserve a free extent and record it straight
 * away, both steps under a single maybe_lock_mutex() /
 * maybe_unlock_mutex() pair on @root.
 *
 * __btrfs_reserve_extent() runs first and reports the key of the new
 * extent through @ins; __btrfs_alloc_reserved_extent() then runs on
 * that same key.  A non-zero result from either step trips BUG_ON().
 *
 * Returns the result of the second step.
 */
int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root,
		       u64 num_bytes, u64 min_alloc_size,
		       u64 root_objectid, u64 ref_generation,
		       u64 owner, u64 owner_offset,
		       u64 empty_size, u64 hint_byte,
		       u64 search_end, struct btrfs_key *ins, u64 data)
{
	int err;

	maybe_lock_mutex(root);

	/* step 1: pick and reserve the range */
	err = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
				     empty_size, hint_byte, search_end,
				     ins, data);
	BUG_ON(err);

	/* step 2: record the range that was just reserved */
	err = __btrfs_alloc_reserved_extent(trans, root,
					    root_objectid, ref_generation,
					    owner, owner_offset, ins);
	BUG_ON(err);

	maybe_unlock_mutex(root);
	return err;
}
@@ -2288,8 +2343,8 @@ static int noinline walk_down_tree(struct btrfs_trans_handle *trans,
			mutex_lock(&root->fs_info->alloc_mutex);

			/* we've dropped the lock, double check */
			ret = drop_snap_lookup_refcount(root, bytenr,
						blocksize, &refs);
			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
						&refs);
			BUG_ON(ret);
			if (refs != 1) {
				parent = path->nodes[*level];
@@ -2584,7 +2639,6 @@ static int noinline relocate_inode_pages(struct inode *inode, u64 start,
	kfree(ra);
	trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
	if (trans) {
		btrfs_add_ordered_inode(inode);
		btrfs_end_transaction(trans, BTRFS_I(inode)->root);
		mark_inode_dirty(inode);
	}
+45 −7
Original line number Diff line number Diff line
@@ -793,6 +793,13 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
}
EXPORT_SYMBOL(set_extent_dirty);

/*
 * Set the EXTENT_ORDERED bit on the range start..end of @tree.
 *
 * Thin wrapper around set_extent_bit(); @mask is passed through as the
 * allocation mask.  Returns set_extent_bit()'s result.
 */
int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
		       gfp_t mask)
{
	int ret;

	ret = set_extent_bit(tree, start, end, EXTENT_ORDERED,
			     0, NULL, mask);
	return ret;
}
EXPORT_SYMBOL(set_extent_ordered);

int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		    int bits, gfp_t mask)
{
@@ -812,8 +819,8 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
			      mask);
			      EXTENT_DELALLOC | EXTENT_DIRTY,
			      0, NULL, mask);
}
EXPORT_SYMBOL(set_extent_delalloc);

@@ -825,6 +832,13 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
}
EXPORT_SYMBOL(clear_extent_dirty);

/*
 * Clear the EXTENT_ORDERED bit on the range start..end of @tree.
 *
 * Thin wrapper around clear_extent_bit(); @mask is passed through as
 * the allocation mask.  Returns clear_extent_bit()'s result.
 */
int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
			 gfp_t mask)
{
	int ret;

	ret = clear_extent_bit(tree, start, end, EXTENT_ORDERED,
			       1, 0, mask);
	return ret;
}
EXPORT_SYMBOL(clear_extent_ordered);

int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
		     gfp_t mask)
{
@@ -1395,10 +1409,9 @@ static int end_bio_extent_writepage(struct bio *bio,

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);

		if (tree->ops && tree->ops->writepage_end_io_hook) {
			ret = tree->ops->writepage_end_io_hook(page, start,
						       end, state);
						       end, state, uptodate);
			if (ret)
				uptodate = 0;
		}
@@ -1868,9 +1881,14 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
			unlock_extent(tree, cur, end, GFP_NOFS);
			break;
		}

		extent_offset = cur - em->start;
		if (extent_map_end(em) <= cur) {
printk("bad mapping em [%Lu %Lu] cur %Lu\n", em->start, extent_map_end(em), cur);
		}
		BUG_ON(extent_map_end(em) <= cur);
		if (end < cur) {
printk("2bad mapping end %Lu cur %Lu\n", end, cur);
		}
		BUG_ON(end < cur);

		iosize = min(extent_map_end(em) - cur, end - cur + 1);
@@ -1976,6 +1994,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
	u64 last_byte = i_size_read(inode);
	u64 block_start;
	u64 iosize;
	u64 unlock_start;
	sector_t sector;
	struct extent_map *em;
	struct block_device *bdev;
@@ -1988,7 +2007,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
	u64 nr_delalloc;
	u64 delalloc_end;


	WARN_ON(!PageLocked(page));
	page_offset = i_size & (PAGE_CACHE_SIZE - 1);
	if (page->index > end_index ||
@@ -2030,6 +2048,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
		delalloc_start = delalloc_end + 1;
	}
	lock_extent(tree, start, page_end, GFP_NOFS);
	unlock_start = start;

	end = page_end;
	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
@@ -2038,6 +2057,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,

	if (last_byte <= start) {
		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
		unlock_extent(tree, start, page_end, GFP_NOFS);
		if (tree->ops && tree->ops->writepage_end_io_hook)
			tree->ops->writepage_end_io_hook(page, start,
							 page_end, NULL, 1);
		unlock_start = page_end + 1;
		goto done;
	}

@@ -2047,6 +2071,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
	while (cur <= end) {
		if (cur >= last_byte) {
			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
			unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
			if (tree->ops && tree->ops->writepage_end_io_hook)
				tree->ops->writepage_end_io_hook(page, cur,
							 page_end, NULL, 1);
			unlock_start = page_end + 1;
			break;
		}
		em = epd->get_extent(inode, page, page_offset, cur,
@@ -2071,8 +2100,16 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
		    block_start == EXTENT_MAP_INLINE) {
			clear_extent_dirty(tree, cur,
					   cur + iosize - 1, GFP_NOFS);

			unlock_extent(tree, unlock_start, cur + iosize -1,
				      GFP_NOFS);
			if (tree->ops && tree->ops->writepage_end_io_hook)
				tree->ops->writepage_end_io_hook(page, cur,
							 cur + iosize - 1,
							 NULL, 1);
			cur = cur + iosize;
			page_offset += iosize;
			unlock_start = cur;
			continue;
		}

@@ -2119,7 +2156,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
		set_page_writeback(page);
		end_page_writeback(page);
	}
	unlock_extent(tree, start, page_end, GFP_NOFS);
	if (unlock_start <= page_end)
		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
	unlock_page(page);
	return 0;
}
Loading