Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 318b067a authored by Linus Torvalds's avatar Linus Torvalds
Browse files
Pull btrfs updates from David Sterba:
 "This is the first batch with fixes and some nice performance
  improvements.

  Preliminary results show eg. more files/sec in fsmark, better perf on
  multi-threaded workloads (filebench, dbench), fewer context switches
  and overall better memory allocation characteristics (multiple
  benchmarks).

  Apart from general performance, there's an improvement for qgroups +
  balance workload that's been troubling our users.

  Note for stable: there are 20+ patches tagged for stable, out of 90.
  Not all of them apply cleanly on all stable versions but the conflicts
  are mostly due to simple cleanups and resolving should be obvious. The
  fixes are otherwise independent.

  Performance improvements:

   - transition between blocking and spinning modes of path is gone,
     which originally resulted to more unnecessary wakeups and updates
     to the path locks, the effects are measurable and improve latency
     and scalability

   - qgroups: first batch of changes that should speedup balancing with
     qgroups on, skip quota accounting on unchanged subtrees, overall
     gain is about 30+% in runtime

   - use rb-tree with cached first node for several structures, small
     improvement to avoid pointer chasing

  Fixes:

   - trim
      - fix: some blockgroups could have been missed if their logical
        address was past the total filesystem size (ie. after a lot of
        balancing)
      - better error reporting, after processing blockgroups and whole
        device
      - fix: continue trimming block groups after an error is
        encountered
      - check for trim support of the device earlier and avoid some
        unnecessary work
      - less interaction with transaction commit that improves latency
        on slower storage (eg. image files over NFS)

   - fsync
      - fix warning when replaying log after fsync of a O_TMPFILE
      - fix wrong dentries after fsync of file that got its parent
        replaced

   - qgroups: fix rescan that might misc some dirty groups

   - don't clean dirty pages during buffered writes, this could lead to
     lost updates in some corner cases

   - some block groups could have been delayed in creation, if the
     allocation triggered another one

   - error handling improvements

  Cleanups:

   - removed unused struct members and variables

   - function return type cleanups

   - delayed refs code refactoring

   - protect against deadlock that could be caused by crafted image that
     tries to allocate from a tree that's locked already"

* tag 'for-4.20-part1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (93 commits)
  btrfs: switch return_bigger to bool in find_ref_head
  btrfs: remove fs_info from btrfs_should_throttle_delayed_refs
  btrfs: remove fs_info from btrfs_check_space_for_delayed_refs
  btrfs: delayed-ref: pass delayed_refs directly to btrfs_delayed_ref_lock
  btrfs: delayed-ref: pass delayed_refs directly to btrfs_select_ref_head
  btrfs: qgroup: move the qgroup->members check out from (!qgroup)'s else branch
  btrfs: relocation: Remove redundant tree level check
  btrfs: relocation: Cleanup while loop using rbtree_postorder_for_each_entry_safe
  btrfs: qgroup: Avoid calling qgroup functions if qgroup is not enabled
  Btrfs: fix wrong dentries after fsync of file that got its parent replaced
  Btrfs: fix warning when replaying log after fsync of a tmpfile
  btrfs: drop min_size from evict_refill_and_join
  btrfs: assert on non-empty delayed iputs
  btrfs: make sure we create all new block groups
  btrfs: reset max_extent_size on clear in a bitmap
  btrfs: protect space cache inode alloc with GFP_NOFS
  btrfs: release metadata before running delayed refs
  Btrfs: kill btrfs_clear_path_blocking
  btrfs: dev-replace: remove pointless assert in write unlock
  btrfs: dev-replace: move replace members out of fs_info
  ...
parents 44adbac8 d9352794
Loading
Loading
Loading
Loading
+21 −18
Original line number Diff line number Diff line
@@ -112,11 +112,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
}

struct preftree {
	struct rb_root root;
	struct rb_root_cached root;
	unsigned int count;
};

#define PREFTREE_INIT	{ .root = RB_ROOT, .count = 0 }
#define PREFTREE_INIT	{ .root = RB_ROOT_CACHED, .count = 0 }

struct preftrees {
	struct preftree direct;    /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
@@ -225,14 +225,15 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
			      struct prelim_ref *newref,
			      struct share_check *sc)
{
	struct rb_root *root;
	struct rb_root_cached *root;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct prelim_ref *ref;
	int result;
	bool leftmost = true;

	root = &preftree->root;
	p = &root->rb_node;
	p = &root->rb_root.rb_node;

	while (*p) {
		parent = *p;
@@ -242,6 +243,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
			p = &(*p)->rb_left;
		} else if (result > 0) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			/* Identical refs, merge them and free @newref */
			struct extent_inode_elem *eie = ref->inode_list;
@@ -272,7 +274,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
	preftree->count++;
	trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
	rb_link_node(&newref->rbnode, parent, p);
	rb_insert_color(&newref->rbnode, root);
	rb_insert_color_cached(&newref->rbnode, root, leftmost);
}

/*
@@ -283,11 +285,11 @@ static void prelim_release(struct preftree *preftree)
{
	struct prelim_ref *ref, *next_ref;

	rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root,
					     rbnode)
	rbtree_postorder_for_each_entry_safe(ref, next_ref,
					     &preftree->root.rb_root, rbnode)
		free_pref(ref);

	preftree->root = RB_ROOT;
	preftree->root = RB_ROOT_CACHED;
	preftree->count = 0;
}

@@ -627,7 +629,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
	 * freeing the entire indirect tree when we're done.  In some test
	 * cases, the tree can grow quite large (~200k objects).
	 */
	while ((rnode = rb_first(&preftrees->indirect.root))) {
	while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
		struct prelim_ref *ref;

		ref = rb_entry(rnode, struct prelim_ref, rbnode);
@@ -637,7 +639,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
			goto out;
		}

		rb_erase(&ref->rbnode, &preftrees->indirect.root);
		rb_erase_cached(&ref->rbnode, &preftrees->indirect.root);
		preftrees->indirect.count--;

		if (ref->count == 0) {
@@ -717,9 +719,9 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
	struct preftree *tree = &preftrees->indirect_missing_keys;
	struct rb_node *node;

	while ((node = rb_first(&tree->root))) {
	while ((node = rb_first_cached(&tree->root))) {
		ref = rb_entry(node, struct prelim_ref, rbnode);
		rb_erase(node, &tree->root);
		rb_erase_cached(node, &tree->root);

		BUG_ON(ref->parent);	/* should not be a direct ref */
		BUG_ON(ref->key_for_search.type);
@@ -769,7 +771,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
		btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);

	spin_lock(&head->lock);
	for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) {
	for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
		node = rb_entry(n, struct btrfs_delayed_ref_node,
				ref_node);
		if (node->seq > seq)
@@ -1229,14 +1231,14 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
	if (ret)
		goto out;

	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root));
	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));

	ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
				    extent_item_pos, total_refs, sc, ignore_offset);
	if (ret)
		goto out;

	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root));
	WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root));

	/*
	 * This walks the tree of merged and resolved refs. Tree blocks are
@@ -1245,7 +1247,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
	 *
	 * We release the entire tree in one go before returning.
	 */
	node = rb_first(&preftrees.direct.root);
	node = rb_first_cached(&preftrees.direct.root);
	while (node) {
		ref = rb_entry(node, struct prelim_ref, rbnode);
		node = rb_next(&ref->rbnode);
@@ -1468,7 +1470,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
	struct seq_list elem = SEQ_LIST_INIT(elem);
	int ret = 0;
	struct share_check shared = {
		.root_objectid = root->objectid,
		.root_objectid = root->root_key.objectid,
		.inum = inum,
		.share_count = 0,
	};
@@ -2031,7 +2033,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
			/* path must be released before calling iterate()! */
			btrfs_debug(fs_root->fs_info,
				"following ref at offset %u for inode %llu in tree %llu",
				cur, found_key.objectid, fs_root->objectid);
				cur, found_key.objectid,
				fs_root->root_key.objectid);
			ret = iterate(parent, name_len,
				      (unsigned long)(iref + 1), eb, ctx);
			if (ret)
+4 −4
Original line number Diff line number Diff line
@@ -206,7 +206,7 @@ static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
static inline unsigned long btrfs_inode_hash(u64 objectid,
					     const struct btrfs_root *root)
{
	u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME);
	u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME);

#if BITS_PER_LONG == 32
	h = (h >> 32) ^ (h & 0xffffffff);
@@ -339,15 +339,15 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
	struct btrfs_root *root = inode->root;

	/* Output minus objectid, which is more meaningful */
	if (root->objectid >= BTRFS_LAST_FREE_OBJECTID)
	if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
		btrfs_warn_rl(root->fs_info,
	"csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d",
			root->objectid, btrfs_ino(inode),
			root->root_key.objectid, btrfs_ino(inode),
			logical_start, csum, csum_expected, mirror_num);
	else
		btrfs_warn_rl(root->fs_info,
	"csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d",
			root->objectid, btrfs_ino(inode),
			root->root_key.objectid, btrfs_ino(inode),
			logical_start, csum, csum_expected, mirror_num);
}

+3 −3
Original line number Diff line number Diff line
@@ -1594,6 +1594,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
{
	unsigned int num_pages;
	unsigned int i;
	size_t size;
	u64 dev_bytenr;
	int ret;

@@ -1608,9 +1609,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,

	num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
		    PAGE_SHIFT;
	block_ctx->mem_to_free = kcalloc(sizeof(*block_ctx->datav) +
						sizeof(*block_ctx->pagev),
					 num_pages, GFP_NOFS);
	size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev);
	block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS);
	if (!block_ctx->mem_to_free)
		return -ENOMEM;
	block_ctx->datav = block_ctx->mem_to_free;
+0 −2
Original line number Diff line number Diff line
@@ -528,7 +528,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
				 int mirror_num, unsigned long bio_flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_io_tree *tree;
	struct extent_map_tree *em_tree;
	struct compressed_bio *cb;
	unsigned long compressed_len;
@@ -545,7 +544,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
	int faili = 0;
	u32 *sums;

	tree = &BTRFS_I(inode)->io_tree;
	em_tree = &BTRFS_I(inode)->extent_tree;

	/* we need the actual starting offset of this extent in the file */
+11 −57
Original line number Diff line number Diff line
@@ -52,42 +52,6 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
	}
}

/*
 * reset all the locked nodes in the patch to spinning locks.
 *
 * held is used to keep lockdep happy, when lockdep is enabled
 * we set held to a blocking lock before we go around and
 * retake all the spinlocks in the path.  You can safely use NULL
 * for held
 */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
@@ -207,7 +171,7 @@ static void add_root_to_dirty_list(struct btrfs_root *root)
	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
@@ -1306,7 +1270,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		}
	}

	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

@@ -1815,8 +1778,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 0;
	ASSERT(level > 0);

	mid = path->nodes[level];

@@ -2483,7 +2445,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		BUG_ON(sret > 0);
		if (sret) {
@@ -2504,7 +2465,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		if (sret) {
			ret = sret;
@@ -2789,7 +2749,10 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		}
cow_done:
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);
		/*
		 * Leave path with blocking locks to avoid massive
		 * lock context switch, this is made on purpose.
		 */

		/*
		 * we have a lock on b and as long as we aren't changing
@@ -2871,8 +2834,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_WRITE_LOCK);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
@@ -2880,8 +2841,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_READ_LOCK);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
@@ -2900,7 +2859,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL, 0);

				BUG_ON(err > 0);
				if (err) {
@@ -2910,7 +2868,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, &write_lock_level);
					  min_write_lock_level, NULL);
			goto done;
		}
	}
@@ -2961,13 +2919,16 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,

again:
	b = get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		level = btrfs_header_level(b);
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
@@ -3013,8 +2974,6 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			if (!err) {
				btrfs_set_path_blocking(p);
				btrfs_tree_read_lock(b);
				btrfs_clear_path_blocking(p, b,
							  BTRFS_READ_LOCK);
			}
			b = tree_mod_log_rewind(fs_info, p, b, time_seq);
			if (!b) {
@@ -5198,7 +5157,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
		btrfs_clear_path_blocking(path, NULL, 0);
	}
out:
	path->keep_locks = keep_locks;
@@ -5783,8 +5741,6 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
@@ -5820,8 +5776,6 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
Loading