Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 3f157a2f authored by Chris Mason's avatar Chris Mason
Browse files

Btrfs: Online btree defragmentation fixes



The btree defragger wasn't making forward progress because the new key wasn't
being saved by the btrfs_search_forward function.

This also disables the automatic btree defrag, it wasn't scaling well to
huge filesystems.  The auto-defrag needs to be done differently.

Signed-off-by: default avatarChris Mason <chris.mason@oracle.com>
parent 1b1e2135
Loading
Loading
Loading
Loading
+161 −9
Original line number Diff line number Diff line
@@ -63,10 +63,9 @@ void btrfs_free_path(struct btrfs_path *p)
void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
{
	int i;
	int keep = p->keep_locks;
	int skip = p->skip_locking;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
@@ -74,10 +73,8 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
	memset(p, 0, sizeof(*p));
	p->keep_locks = keep;
	p->skip_locking = skip;
}

struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
@@ -463,8 +460,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		if (parent_level == 1)
			btrfs_clear_buffer_defrag(cur);
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
@@ -2969,8 +2964,138 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
	return 1;
}

/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that are either in cache or have a minimum
 * transaction id.  This is used by the btree defrag code, but could
 * also be used to search for blocks that have changed since a given
 * transaction id.
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This does lock as it descends, and path->keep_locks should be set
 * to 1 by the caller.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path, int cache_only,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	u32 nritems;
	int level;
	int ret = 1;

again:
	cur = btrfs_lock_root_node(root);
	level = btrfs_header_level(cur);
	path->nodes[level] = cur;
	path->locks[level] = 1;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while(1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		bin_search(cur, min_key, level, &slot);

		/* at level = 0, we're done, setup the path and exit */
		if (level == 0) {
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		/*
		 * check this node pointer against the cache_only and
		 * min_trans parameters.  If it isn't in cache or is too
		 * old, skip to the next one.
		 */
		while(slot < nritems) {
			u64 blockptr;
			u64 gen;
			struct extent_buffer *tmp;
			blockptr = btrfs_node_blockptr(cur, slot);
			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			if (!cache_only)
				break;

			tmp = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));

			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
				free_extent_buffer(tmp);
				break;
			}
			if (tmp)
				free_extent_buffer(tmp);
			slot++;
		}
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			ret = btrfs_find_next_key(root, path, min_key, level,
						  cache_only, min_trans);
			if (ret == 0) {
				btrfs_release_path(root, path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			unlock_up(path, level, 1);
			goto out;
		}
		cur = read_node_slot(root, cur, slot);

		btrfs_tree_lock(cur);
		path->locks[level - 1] = 1;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1);
	}
out:
	if (ret == 0)
		memcpy(min_key, &found_key, sizeof(found_key));
	return ret;
}

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the cache_only and min_trans
 * parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int lowest_level)
			struct btrfs_key *key, int lowest_level,
			int cache_only, u64 min_trans)
{
	int level = lowest_level;
	int slot;
@@ -2982,6 +3107,7 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
@@ -2991,8 +3117,28 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
		}
		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else
		else {
			u64 blockptr = btrfs_node_blockptr(c, slot);
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (cache_only) {
				struct extent_buffer *cur;
				cur = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));
				if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
					slot++;
					if (cur)
						free_extent_buffer(cur);
					goto next;
				}
				free_extent_buffer(cur);
			}
			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
@@ -3095,6 +3241,12 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
	return 0;
}

/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
+6 −1
Original line number Diff line number Diff line
@@ -609,6 +609,7 @@ struct btrfs_root {
	u64 last_inode_alloc;
	int ref_cows;
	int track_dirty;
	u64 defrag_trans_start;
	struct btrfs_key defrag_progress;
	struct btrfs_key defrag_max;
	int defrag_running;
@@ -1412,7 +1413,11 @@ int btrfs_previous_item(struct btrfs_root *root,
struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int lowest_level);
			struct btrfs_key *key, int lowest_level,
			int cache_only, u64 min_trans);
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path, int cache_only,
			 u64 min_trans);

int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
+3 −58
Original line number Diff line number Diff line
@@ -295,7 +295,6 @@ int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
					     btrfs_header_generation(eb));
	BUG_ON(ret);
	btrfs_clear_buffer_defrag(eb);
	found_start = btrfs_header_bytenr(eb);
	if (found_start != start) {
		printk("warning: eb start incorrect %Lu buffer %Lu len %lu\n",
@@ -355,7 +354,6 @@ int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
	}
	eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);

	btrfs_clear_buffer_defrag(eb);
	found_start = btrfs_header_bytenr(eb);
	if (found_start != start) {
		ret = -EIO;
@@ -736,6 +734,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
	memset(&root->root_item, 0, sizeof(root->root_item));
	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
	memset(&root->root_kobj, 0, sizeof(root->root_kobj));
	root->defrag_trans_start = fs_info->generation;
	init_completion(&root->kobj_unregister);
	root->defrag_running = 0;
	root->defrag_level = 0;
@@ -1168,7 +1167,6 @@ static int transaction_kthread(void *arg)
			goto sleep;
		}
		mutex_unlock(&root->fs_info->trans_mutex);
		btrfs_defrag_dirty_roots(root->fs_info);
		trans = btrfs_start_transaction(root, 1);
		ret = btrfs_commit_transaction(trans, root);
sleep:
@@ -1434,12 +1432,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
						   tree_root,
						   "btrfs-transaction");
	if (!fs_info->transaction_kthread)
		goto fail_trans_kthread;
		goto fail_cleaner;


	return tree_root;

fail_trans_kthread:
fail_cleaner:
	kthread_stop(fs_info->cleaner_kthread);
fail_extent_root:
	free_extent_buffer(extent_root->node);
@@ -1662,7 +1660,6 @@ int close_ctree(struct btrfs_root *root)
	kthread_stop(root->fs_info->transaction_kthread);
	kthread_stop(root->fs_info->cleaner_kthread);

	btrfs_defrag_dirty_roots(root->fs_info);
	btrfs_clean_old_snapshots(root);
	trans = btrfs_start_transaction(root, 1);
	ret = btrfs_commit_transaction(trans, root);
@@ -1794,58 +1791,6 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
	return;
}

void btrfs_set_buffer_defrag(struct extent_buffer *buf)
{
	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
	struct inode *btree_inode = root->fs_info->btree_inode;
	set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start,
			buf->start + buf->len - 1, EXTENT_DEFRAG, GFP_NOFS);
}

void btrfs_set_buffer_defrag_done(struct extent_buffer *buf)
{
	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
	struct inode *btree_inode = root->fs_info->btree_inode;
	set_extent_bits(&BTRFS_I(btree_inode)->io_tree, buf->start,
			buf->start + buf->len - 1, EXTENT_DEFRAG_DONE,
			GFP_NOFS);
}

int btrfs_buffer_defrag(struct extent_buffer *buf)
{
	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
	struct inode *btree_inode = root->fs_info->btree_inode;
	return test_range_bit(&BTRFS_I(btree_inode)->io_tree,
		     buf->start, buf->start + buf->len - 1, EXTENT_DEFRAG, 0);
}

int btrfs_buffer_defrag_done(struct extent_buffer *buf)
{
	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
	struct inode *btree_inode = root->fs_info->btree_inode;
	return test_range_bit(&BTRFS_I(btree_inode)->io_tree,
		     buf->start, buf->start + buf->len - 1,
		     EXTENT_DEFRAG_DONE, 0);
}

int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf)
{
	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
	struct inode *btree_inode = root->fs_info->btree_inode;
	return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree,
		     buf->start, buf->start + buf->len - 1,
		     EXTENT_DEFRAG_DONE, GFP_NOFS);
}

int btrfs_clear_buffer_defrag(struct extent_buffer *buf)
{
	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
	struct inode *btree_inode = root->fs_info->btree_inode;
	return clear_extent_bits(&BTRFS_I(btree_inode)->io_tree,
		     buf->start, buf->start + buf->len - 1,
		     EXTENT_DEFRAG, GFP_NOFS);
}

int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
{
	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+0 −6
Original line number Diff line number Diff line
@@ -61,12 +61,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
int wait_on_tree_block_writeback(struct btrfs_root *root,
				 struct extent_buffer *buf);
void btrfs_set_buffer_defrag(struct extent_buffer *buf);
void btrfs_set_buffer_defrag_done(struct extent_buffer *buf);
int btrfs_buffer_defrag(struct extent_buffer *buf);
int btrfs_buffer_defrag_done(struct extent_buffer *buf);
int btrfs_clear_buffer_defrag(struct extent_buffer *buf);
int btrfs_clear_buffer_defrag_done(struct extent_buffer *buf);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
void btrfs_csum_final(u32 crc, char *result);
+0 −2
Original line number Diff line number Diff line
@@ -2095,8 +2095,6 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,

	set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
			 buf->start + buf->len - 1, GFP_NOFS);
	if (!btrfs_test_opt(root, SSD))
		btrfs_set_buffer_defrag(buf);
	trans->blocks_used++;
	return buf;
}
Loading