Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit fcebe456 authored by Josef Bacik's avatar Josef Bacik Committed by Chris Mason
Browse files

Btrfs: rework qgroup accounting



Currently qgroups account for space by intercepting delayed ref updates to fs
trees.  It does this by adding sequence numbers to delayed ref updates so that
it can figure out how the tree looked before the update so we can adjust the
counters properly.  The problem with this is that it does not allow delayed refs
to be merged, so if you say are defragging an extent with 5k snapshots pointing
to it we will thrash the delayed ref lock because we need to go back and
manually merge these things together.  Instead we want to process quota changes
when we know they are going to happen, like when we first allocate an extent, we
free a reference for an extent, we add new references etc.  This patch
accomplishes this by only adding qgroup operations for real ref changes.  We
only modify the sequence number when we need to lookup roots for bytenrs, this
reduces the amount of churn on the sequence number and allows us to merge
delayed refs as we add them most of the time.  This patch encompasses a bunch of
architectural changes

1) qgroup ref operations: instead of tracking qgroup operations through the
delayed refs we simply add new ref operations whenever we notice that we need to
when we've modified the refs themselves.

2) tree mod seq:  we no longer have this separation of major/minor counters.
this makes the sequence number stuff much more sane and we can remove some
locking that was needed to protect the counter.

3) delayed ref seq: we now read the tree mod seq number and use that as our
sequence.  This means each new delayed ref doesn't have it's own unique sequence
number, rather whenever we go to lookup backrefs we inc the sequence number so
we can make sure to keep any new operations from screwing up our world view at
that given point.  This allows us to merge delayed refs during runtime.

With all of these changes the delayed ref stuff is a little saner and the qgroup
accounting stuff no longer goes negative in some cases like it was before.
Thanks,

Signed-off-by: default avatarJosef Bacik <jbacik@fb.com>
Signed-off-by: default avatarChris Mason <clm@fb.com>
parent 5dca6eea
Loading
Loading
Loading
Loading
+5 −40
Original line number Diff line number Diff line
@@ -356,43 +356,13 @@ static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
}

/*
 * Increment the upper half of tree_mod_seq, set lower half zero.
 *
 * Must be called with fs_info->tree_mod_seq_lock held.
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
{
	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
	seq &= 0xffffffff00000000ull;
	seq += 1ull << 32;
	atomic64_set(&fs_info->tree_mod_seq, seq);
	return seq;
}

/*
 * Increment the lower half of tree_mod_seq.
 *
 * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
 * are generated should not technically require a spin lock here. (Rationale:
 * incrementing the minor while incrementing the major seq number is between its
 * atomic64_read and atomic64_set calls doesn't duplicate sequence numbers, it
 * just returns a unique sequence number as usual.) We have decided to leave
 * that requirement in here and rethink it once we notice it really imposes a
 * problem on some workload.
 */
static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * return the last minor in the previous major tree_mod_seq number
 */
u64 btrfs_tree_mod_seq_prev(u64 seq)
{
	return (seq & 0xffffffff00000000ull) - 1ull;
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
@@ -404,19 +374,16 @@ u64 btrfs_tree_mod_seq_prev(u64 seq)
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	u64 seq;

	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	seq = btrfs_inc_tree_mod_seq_minor(fs_info);
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return seq;
	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
@@ -489,9 +456,7 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)

	BUG_ON(!tm);

	spin_lock(&fs_info->tree_mod_seq_lock);
	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
+7 −52
Original line number Diff line number Diff line
@@ -1648,7 +1648,10 @@ struct btrfs_fs_info {

	/* holds configuration and tracking. Protected by qgroup_lock */
	struct rb_root qgroup_tree;
	struct rb_root qgroup_op_tree;
	spinlock_t qgroup_lock;
	spinlock_t qgroup_op_lock;
	atomic_t qgroup_op_seq;

	/*
	 * used to avoid frequently calling ulist_alloc()/ulist_free()
@@ -3300,9 +3303,9 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
			 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
			 struct btrfs_key *ins, int is_data);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref, int for_cow);
		  struct extent_buffer *buf, int full_backref, int no_quota);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref, int for_cow);
		  struct extent_buffer *buf, int full_backref, int no_quota);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				u64 bytenr, u64 num_bytes, u64 flags,
@@ -3310,7 +3313,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
int btrfs_free_extent(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
		      u64 owner, u64 offset, int for_cow);
		      u64 owner, u64 offset, int no_quota);

int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
@@ -3322,7 +3325,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 u64 bytenr, u64 num_bytes, u64 parent,
			 u64 root_objectid, u64 owner, u64 offset, int for_cow);
			 u64 root_objectid, u64 owner, u64 offset, int no_quota);

int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root);
@@ -3410,7 +3413,6 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info);
int __get_raid_index(u64 flags);

int btrfs_start_nocow_write(struct btrfs_root *root);
void btrfs_end_nocow_write(struct btrfs_root *root);
/* ctree.c */
@@ -3586,7 +3588,6 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem);
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem);
u64 btrfs_tree_mod_seq_prev(u64 seq);
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);

/* root-item.c */
@@ -4094,52 +4095,6 @@ void btrfs_reada_detach(void *handle);
int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
			 u64 start, int err);

/* qgroup.c */
struct qgroup_update {
	struct list_head list;
	struct btrfs_delayed_ref_node *node;
	struct btrfs_delayed_extent_op *extent_op;
};

int btrfs_quota_enable(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info);
int btrfs_quota_disable(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info);
int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info, u64 qgroupid,
			char *name);
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
			      struct btrfs_fs_info *fs_info, u64 qgroupid);
int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
		       struct btrfs_fs_info *fs_info, u64 qgroupid,
		       struct btrfs_qgroup_limit *limit);
int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
			    struct btrfs_delayed_ref_node *node,
			    struct btrfs_delayed_extent_op *extent_op);
int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info,
			     struct btrfs_delayed_ref_node *node,
			     struct btrfs_delayed_extent_op *extent_op);
int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
		      struct btrfs_fs_info *fs_info);
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
			 struct btrfs_qgroup_inherit *inherit);
int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);

void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);

static inline int is_fstree(u64 rootid)
{
	if (rootid == BTRFS_FS_TREE_OBJECTID ||
+23 −16
Original line number Diff line number Diff line
@@ -106,6 +106,10 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref2,
		return -1;
	if (ref1->type > ref2->type)
		return 1;
	if (ref1->no_quota > ref2->no_quota)
		return 1;
	if (ref1->no_quota < ref2->no_quota)
		return -1;
	/* merging of sequenced refs is not allowed */
	if (compare_seq) {
		if (ref1->seq < ref2->seq)
@@ -635,7 +639,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
		     struct btrfs_delayed_ref_head *head_ref,
		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
		     u64 num_bytes, u64 parent, u64 ref_root, int level,
		     int action, int for_cow)
		     int action, int no_quota)
{
	struct btrfs_delayed_ref_node *existing;
	struct btrfs_delayed_tree_ref *full_ref;
@@ -645,6 +649,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
	if (action == BTRFS_ADD_DELAYED_EXTENT)
		action = BTRFS_ADD_DELAYED_REF;

	if (is_fstree(ref_root))
		seq = atomic64_read(&fs_info->tree_mod_seq);
	delayed_refs = &trans->transaction->delayed_refs;

	/* first set the basic ref node struct up */
@@ -655,9 +661,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
	ref->action = action;
	ref->is_head = 0;
	ref->in_tree = 1;

	if (need_ref_seq(for_cow, ref_root))
		seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
	ref->no_quota = no_quota;
	ref->seq = seq;

	full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -697,7 +701,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
		     struct btrfs_delayed_ref_head *head_ref,
		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
		     u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
		     u64 offset, int action, int for_cow)
		     u64 offset, int action, int no_quota)
{
	struct btrfs_delayed_ref_node *existing;
	struct btrfs_delayed_data_ref *full_ref;
@@ -709,6 +713,9 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,

	delayed_refs = &trans->transaction->delayed_refs;

	if (is_fstree(ref_root))
		seq = atomic64_read(&fs_info->tree_mod_seq);

	/* first set the basic ref node struct up */
	atomic_set(&ref->refs, 1);
	ref->bytenr = bytenr;
@@ -717,9 +724,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
	ref->action = action;
	ref->is_head = 0;
	ref->in_tree = 1;

	if (need_ref_seq(for_cow, ref_root))
		seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
	ref->no_quota = no_quota;
	ref->seq = seq;

	full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -762,12 +767,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
			       u64 bytenr, u64 num_bytes, u64 parent,
			       u64 ref_root,  int level, int action,
			       struct btrfs_delayed_extent_op *extent_op,
			       int for_cow)
			       int no_quota)
{
	struct btrfs_delayed_tree_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
		no_quota = 0;

	BUG_ON(extent_op && extent_op->is_data);
	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
	if (!ref)
@@ -793,10 +801,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,

	add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
				   num_bytes, parent, ref_root, level, action,
				   for_cow);
				   no_quota);
	spin_unlock(&delayed_refs->lock);
	if (need_ref_seq(for_cow, ref_root))
		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);

	return 0;
}
@@ -810,12 +816,15 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
			       u64 parent, u64 ref_root,
			       u64 owner, u64 offset, int action,
			       struct btrfs_delayed_extent_op *extent_op,
			       int for_cow)
			       int no_quota)
{
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_delayed_ref_head *head_ref;
	struct btrfs_delayed_ref_root *delayed_refs;

	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
		no_quota = 0;

	BUG_ON(extent_op && !extent_op->is_data);
	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
	if (!ref)
@@ -841,10 +850,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,

	add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
				   num_bytes, parent, ref_root, owner, offset,
				   action, for_cow);
				   action, no_quota);
	spin_unlock(&delayed_refs->lock);
	if (need_ref_seq(for_cow, ref_root))
		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);

	return 0;
}
+3 −21
Original line number Diff line number Diff line
@@ -52,6 +52,7 @@ struct btrfs_delayed_ref_node {

	unsigned int action:8;
	unsigned int type:8;
	unsigned int no_quota:1;
	/* is this node still in the rbtree? */
	unsigned int is_head:1;
	unsigned int in_tree:1;
@@ -196,14 +197,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
			       u64 bytenr, u64 num_bytes, u64 parent,
			       u64 ref_root, int level, int action,
			       struct btrfs_delayed_extent_op *extent_op,
			       int for_cow);
			       int no_quota);
int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
			       struct btrfs_trans_handle *trans,
			       u64 bytenr, u64 num_bytes,
			       u64 parent, u64 ref_root,
			       u64 owner, u64 offset, int action,
			       struct btrfs_delayed_extent_op *extent_op,
			       int for_cow);
			       int no_quota);
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
				struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes,
@@ -230,25 +231,6 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_delayed_ref_root *delayed_refs,
			    u64 seq);

/*
 * delayed refs with a ref_seq > 0 must be held back during backref walking.
 * this only applies to items in one of the fs-trees. for_cow items never need
 * to be held back, so they won't get a ref_seq number.
 */
static inline int need_ref_seq(int for_cow, u64 rootid)
{
	if (for_cow)
		return 0;

	if (rootid == BTRFS_FS_TREE_OBJECTID)
		return 1;

	if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
		return 1;

	return 0;
}

/*
 * a node might live in a head or a regular ref, this lets you
 * test for the proper type to use.
+4 −0
Original line number Diff line number Diff line
@@ -49,6 +49,7 @@
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
#include "qgroup.h"

#ifdef CONFIG_X86
#include <asm/cpufeature.h>
@@ -2219,6 +2220,7 @@ int open_ctree(struct super_block *sb,
	spin_lock_init(&fs_info->free_chunk_lock);
	spin_lock_init(&fs_info->tree_mod_seq_lock);
	spin_lock_init(&fs_info->super_lock);
	spin_lock_init(&fs_info->qgroup_op_lock);
	spin_lock_init(&fs_info->buffer_lock);
	rwlock_init(&fs_info->tree_mod_log_lock);
	mutex_init(&fs_info->reloc_mutex);
@@ -2244,6 +2246,7 @@ int open_ctree(struct super_block *sb,
	atomic_set(&fs_info->async_submit_draining, 0);
	atomic_set(&fs_info->nr_async_bios, 0);
	atomic_set(&fs_info->defrag_running, 0);
	atomic_set(&fs_info->qgroup_op_seq, 0);
	atomic64_set(&fs_info->tree_mod_seq, 0);
	fs_info->sb = sb;
	fs_info->max_inline = 8192 * 1024;
@@ -2353,6 +2356,7 @@ int open_ctree(struct super_block *sb,
	spin_lock_init(&fs_info->qgroup_lock);
	mutex_init(&fs_info->qgroup_ioctl_lock);
	fs_info->qgroup_tree = RB_ROOT;
	fs_info->qgroup_op_tree = RB_ROOT;
	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
	fs_info->qgroup_seq = 1;
	fs_info->quota_enabled = 0;
Loading