Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 66a0e2d5 authored by Ilya Dryomov's avatar Ilya Dryomov
Browse files

crush: remove mutable part of CRUSH map



Then add it to the working state. It would be very nice if we didn't
have to take a lock to calculate a crush placement. By moving the
permutation array into the working data, we can treat the CRUSH map as
immutable.

Reflects ceph.git commit cbcd039651c0569551cb90d26ce27e1432671f2a.

Signed-off-by: default avatarIlya Dryomov <idryomov@gmail.com>
parent 1b6a78b5
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -175,6 +175,7 @@ struct ceph_osdmap {

	struct mutex crush_scratch_mutex;
	int crush_scratch_ary[CEPH_PG_MAX_SIZE * 3];
	void *crush_workspace;
};

static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd)
+34 −7
Original line number Diff line number Diff line
@@ -135,13 +135,6 @@ struct crush_bucket {
	__u32 size;      /* num items */
	__s32 *items;

	/*
	 * cached random permutation: used for uniform bucket and for
	 * the linear search fallback for the other bucket types.
	 */
	__u32 perm_x;  /* @x for which *perm is defined */
	__u32 perm_n;  /* num elements of *perm that are permuted/defined */
	__u32 *perm;
};

struct crush_bucket_uniform {
@@ -211,6 +204,21 @@ struct crush_map {
	 * device fails. */
	__u8 chooseleaf_stable;

	/*
	 * This value is calculated after decode or construction by
	 * the builder. It is exposed here (rather than having a
	 * 'build CRUSH working space' function) so that callers can
	 * reserve a static buffer, allocate space on the stack, or
	 * otherwise avoid calling into the heap allocator if they
	 * want to. The size of the working space depends on the map,
	 * while the size of the scratch vector passed to the mapper
	 * depends on the size of the desired result set.
	 *
	 * Nothing stops the caller from allocating both in one swell
	 * foop and passing in two points, though.
	 */
	size_t working_size;

#ifndef __KERNEL__
	/*
	 * version 0 (original) of straw_calc has various flaws.  version 1
@@ -248,4 +256,23 @@ static inline int crush_calc_tree_node(int i)
	return ((i+1) << 1)-1;
}

/*
 * These data structures are private to the CRUSH implementation. They
 * are exposed in this header file because builder needs their
 * definitions to calculate the total working size.
 *
 * Moving this out of the crush map allow us to treat the CRUSH map as
 * immutable within the mapper and removes the requirement for a CRUSH
 * map lock.
 */
struct crush_work_bucket {
	__u32 perm_x; /* @x for which *perm is defined */
	__u32 perm_n; /* num elements of *perm that are permuted/defined */
	__u32 *perm;  /* Permutation of the bucket's items */
};

struct crush_work {
	struct crush_work_bucket **work; /* Per-bucket working store */
};

#endif
+3 −1
Original line number Diff line number Diff line
@@ -15,6 +15,8 @@ extern int crush_do_rule(const struct crush_map *map,
			 int ruleno,
			 int x, int *result, int result_max,
			 const __u32 *weights, int weight_max,
			 int *scratch);
			 void *cwin, int *scratch);

void crush_init_workspace(const struct crush_map *map, void *v);

#endif
+0 −5
Original line number Diff line number Diff line
@@ -45,7 +45,6 @@ int crush_get_bucket_item_weight(const struct crush_bucket *b, int p)

void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
{
	kfree(b->h.perm);
	kfree(b->h.items);
	kfree(b);
}
@@ -54,14 +53,12 @@ void crush_destroy_bucket_list(struct crush_bucket_list *b)
{
	kfree(b->item_weights);
	kfree(b->sum_weights);
	kfree(b->h.perm);
	kfree(b->h.items);
	kfree(b);
}

void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
{
	kfree(b->h.perm);
	kfree(b->h.items);
	kfree(b->node_weights);
	kfree(b);
@@ -71,7 +68,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
{
	kfree(b->straws);
	kfree(b->item_weights);
	kfree(b->h.perm);
	kfree(b->h.items);
	kfree(b);
}
@@ -79,7 +75,6 @@ void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b)
{
	kfree(b->item_weights);
	kfree(b->h.perm);
	kfree(b->h.items);
	kfree(b);
}
+138 −68
Original line number Diff line number Diff line
@@ -54,7 +54,6 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
	return -1;
}


/*
 * bucket choose methods
 *
@@ -72,59 +71,60 @@ int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size
 * Since this is expensive, we optimize for the r=0 case, which
 * captures the vast majority of calls.
 */
static int bucket_perm_choose(struct crush_bucket *bucket,
static int bucket_perm_choose(const struct crush_bucket *bucket,
			      struct crush_work_bucket *work,
			      int x, int r)
{
	unsigned int pr = r % bucket->size;
	unsigned int i, s;

	/* start a new permutation if @x has changed */
	if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
	if (work->perm_x != (__u32)x || work->perm_n == 0) {
		dprintk("bucket %d new x=%d\n", bucket->id, x);
		bucket->perm_x = x;
		work->perm_x = x;

		/* optimize common r=0 case */
		if (pr == 0) {
			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
				bucket->size;
			bucket->perm[0] = s;
			bucket->perm_n = 0xffff;   /* magic value, see below */
			work->perm[0] = s;
			work->perm_n = 0xffff;   /* magic value, see below */
			goto out;
		}

		for (i = 0; i < bucket->size; i++)
			bucket->perm[i] = i;
		bucket->perm_n = 0;
	} else if (bucket->perm_n == 0xffff) {
			work->perm[i] = i;
		work->perm_n = 0;
	} else if (work->perm_n == 0xffff) {
		/* clean up after the r=0 case above */
		for (i = 1; i < bucket->size; i++)
			bucket->perm[i] = i;
		bucket->perm[bucket->perm[0]] = 0;
		bucket->perm_n = 1;
			work->perm[i] = i;
		work->perm[work->perm[0]] = 0;
		work->perm_n = 1;
	}

	/* calculate permutation up to pr */
	for (i = 0; i < bucket->perm_n; i++)
	for (i = 0; i < work->perm_n; i++)
		dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
	while (bucket->perm_n <= pr) {
		unsigned int p = bucket->perm_n;
	while (work->perm_n <= pr) {
		unsigned int p = work->perm_n;
		/* no point in swapping the final entry */
		if (p < bucket->size - 1) {
			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
				(bucket->size - p);
			if (i) {
				unsigned int t = bucket->perm[p + i];
				bucket->perm[p + i] = bucket->perm[p];
				bucket->perm[p] = t;
				unsigned int t = work->perm[p + i];
				work->perm[p + i] = work->perm[p];
				work->perm[p] = t;
			}
			dprintk(" perm_choose swap %d with %d\n", p, p+i);
		}
		bucket->perm_n++;
		work->perm_n++;
	}
	for (i = 0; i < bucket->size; i++)
		dprintk(" perm_choose  %d: %d\n", i, bucket->perm[i]);

	s = bucket->perm[pr];
	s = work->perm[pr];
out:
	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
		bucket->size, x, r, pr, s);
@@ -132,14 +132,14 @@ static int bucket_perm_choose(struct crush_bucket *bucket,
}

/* uniform */
static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
				 int x, int r)
static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
				 struct crush_work_bucket *work, int x, int r)
{
	return bucket_perm_choose(&bucket->h, x, r);
	return bucket_perm_choose(&bucket->h, work, x, r);
}

/* list */
static int bucket_list_choose(struct crush_bucket_list *bucket,
static int bucket_list_choose(const struct crush_bucket_list *bucket,
			      int x, int r)
{
	int i;
@@ -155,9 +155,10 @@ static int bucket_list_choose(struct crush_bucket_list *bucket,
		w *= bucket->sum_weights[i];
		w = w >> 16;
		/*dprintk(" scaled %llx\n", w);*/
		if (w < bucket->item_weights[i])
		if (w < bucket->item_weights[i]) {
			return bucket->h.items[i];
		}
	}

	dprintk("bad list sums for bucket %d\n", bucket->h.id);
	return bucket->h.items[0];
@@ -192,7 +193,7 @@ static int terminal(int x)
	return x & 1;
}

static int bucket_tree_choose(struct crush_bucket_tree *bucket,
static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
			      int x, int r)
{
	int n;
@@ -224,7 +225,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,

/* straw */

static int bucket_straw_choose(struct crush_bucket_straw *bucket,
static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
			       int x, int r)
{
	__u32 i;
@@ -301,7 +302,7 @@ static __u64 crush_ln(unsigned int xin)
 *
 */

static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
				int x, int r)
{
	unsigned int i, high = 0;
@@ -344,29 +345,35 @@ static int bucket_straw2_choose(struct crush_bucket_straw2 *bucket,
			high_draw = draw;
		}
	}

	return bucket->h.items[high];
}


static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
static int crush_bucket_choose(const struct crush_bucket *in,
			       struct crush_work_bucket *work,
			       int x, int r)
{
	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
	BUG_ON(in->size == 0);
	switch (in->alg) {
	case CRUSH_BUCKET_UNIFORM:
		return bucket_uniform_choose((struct crush_bucket_uniform *)in,
					  x, r);
		return bucket_uniform_choose(
			(const struct crush_bucket_uniform *)in,
			work, x, r);
	case CRUSH_BUCKET_LIST:
		return bucket_list_choose((struct crush_bucket_list *)in,
		return bucket_list_choose((const struct crush_bucket_list *)in,
					  x, r);
	case CRUSH_BUCKET_TREE:
		return bucket_tree_choose((struct crush_bucket_tree *)in,
		return bucket_tree_choose((const struct crush_bucket_tree *)in,
					  x, r);
	case CRUSH_BUCKET_STRAW:
		return bucket_straw_choose((struct crush_bucket_straw *)in,
		return bucket_straw_choose(
			(const struct crush_bucket_straw *)in,
			x, r);
	case CRUSH_BUCKET_STRAW2:
		return bucket_straw2_choose((struct crush_bucket_straw2 *)in,
		return bucket_straw2_choose(
			(const struct crush_bucket_straw2 *)in,
			x, r);
	default:
		dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
@@ -374,7 +381,6 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
	}
}


/*
 * true if device is marked "out" (failed, fully offloaded)
 * of the cluster
@@ -416,7 +422,8 @@ static int is_out(const struct crush_map *map,
 * @parent_r: r value passed from the parent
 */
static int crush_choose_firstn(const struct crush_map *map,
			       struct crush_bucket *bucket,
			       struct crush_work *work,
			       const struct crush_bucket *bucket,
			       const __u32 *weight, int weight_max,
			       int x, int numrep, int type,
			       int *out, int outpos,
@@ -434,7 +441,7 @@ static int crush_choose_firstn(const struct crush_map *map,
	int rep;
	unsigned int ftotal, flocal;
	int retry_descent, retry_bucket, skip_rep;
	struct crush_bucket *in = bucket;
	const struct crush_bucket *in = bucket;
	int r;
	int i;
	int item = 0;
@@ -473,9 +480,13 @@ static int crush_choose_firstn(const struct crush_map *map,
				if (local_fallback_retries > 0 &&
				    flocal >= (in->size>>1) &&
				    flocal > local_fallback_retries)
					item = bucket_perm_choose(in, x, r);
					item = bucket_perm_choose(
						in, work->work[-1-in->id],
						x, r);
				else
					item = crush_bucket_choose(in, x, r);
					item = crush_bucket_choose(
						in, work->work[-1-in->id],
						x, r);
				if (item >= map->max_devices) {
					dprintk("   bad item %d\n", item);
					skip_rep = 1;
@@ -518,7 +529,9 @@ static int crush_choose_firstn(const struct crush_map *map,
							sub_r = r >> (vary_r-1);
						else
							sub_r = 0;
						if (crush_choose_firstn(map,
						if (crush_choose_firstn(
							    map,
							    work,
							    map->buckets[-1-item],
							    weight, weight_max,
							    x, stable ? 1 : outpos+1, 0,
@@ -600,7 +613,8 @@ static int crush_choose_firstn(const struct crush_map *map,
 *
 */
static void crush_choose_indep(const struct crush_map *map,
			       struct crush_bucket *bucket,
			       struct crush_work *work,
			       const struct crush_bucket *bucket,
			       const __u32 *weight, int weight_max,
			       int x, int left, int numrep, int type,
			       int *out, int outpos,
@@ -610,7 +624,7 @@ static void crush_choose_indep(const struct crush_map *map,
			       int *out2,
			       int parent_r)
{
	struct crush_bucket *in = bucket;
	const struct crush_bucket *in = bucket;
	int endpos = outpos + left;
	int rep;
	unsigned int ftotal;
@@ -678,7 +692,9 @@ static void crush_choose_indep(const struct crush_map *map,
					break;
				}

				item = crush_bucket_choose(in, x, r);
				item = crush_bucket_choose(
					in, work->work[-1-in->id],
					x, r);
				if (item >= map->max_devices) {
					dprintk("   bad item %d\n", item);
					out[rep] = CRUSH_ITEM_NONE;
@@ -724,7 +740,9 @@ static void crush_choose_indep(const struct crush_map *map,

				if (recurse_to_leaf) {
					if (item < 0) {
						crush_choose_indep(map,
						crush_choose_indep(
							map,
							work,
							map->buckets[-1-item],
							weight, weight_max,
							x, 1, numrep, 0,
@@ -781,6 +799,53 @@ static void crush_choose_indep(const struct crush_map *map,
#endif
}


/*
 * This takes a chunk of memory and sets it up to be a shiny new
 * working area for a CRUSH placement computation. It must be called
 * on any newly allocated memory before passing it in to
 * crush_do_rule. It may be used repeatedly after that, so long as the
 * map has not changed. If the map /has/ changed, you must make sure
 * the working size is no smaller than what was allocated and re-run
 * crush_init_workspace.
 *
 * If you do retain the working space between calls to crush, make it
 * thread-local.
 */
void crush_init_workspace(const struct crush_map *map, void *v)
{
	struct crush_work *w = v;
	__s32 b;

	/*
	 * We work by moving through the available space and setting
	 * values and pointers as we go.
	 *
	 * It's a bit like Forth's use of the 'allot' word since we
	 * set the pointer first and then reserve the space for it to
	 * point to by incrementing the point.
	 */
	v += sizeof(struct crush_work *);
	w->work = v;
	v += map->max_buckets * sizeof(struct crush_work_bucket *);
	for (b = 0; b < map->max_buckets; ++b) {
		if (!map->buckets[b])
			continue;

		w->work[b] = v;
		switch (map->buckets[b]->alg) {
		default:
			v += sizeof(struct crush_work_bucket);
			break;
		}
		w->work[b]->perm_x = 0;
		w->work[b]->perm_n = 0;
		w->work[b]->perm = v;
		v += map->buckets[b]->size * sizeof(__u32);
	}
	BUG_ON(v - (void *)w != map->working_size);
}

/**
 * crush_do_rule - calculate a mapping with the given input and rule
 * @map: the crush_map
@@ -790,14 +855,16 @@ static void crush_choose_indep(const struct crush_map *map,
 * @result_max: maximum result size
 * @weight: weight vector (for map leaves)
 * @weight_max: size of weight vector
 * @cwin: pointer to at least map->working_size bytes of memory
 * @scratch: scratch vector for private use; must be >= 3 * result_max
 */
int crush_do_rule(const struct crush_map *map,
		  int ruleno, int x, int *result, int result_max,
		  const __u32 *weight, int weight_max,
		  int *scratch)
		  void *cwin, int *scratch)
{
	int result_len;
	struct crush_work *cw = cwin;
	int *a = scratch;
	int *b = scratch + result_max;
	int *c = scratch + result_max*2;
@@ -807,7 +874,7 @@ int crush_do_rule(const struct crush_map *map,
	int *o;
	int osize;
	int *tmp;
	struct crush_rule *rule;
	const struct crush_rule *rule;
	__u32 step;
	int i, j;
	int numrep;
@@ -840,7 +907,7 @@ int crush_do_rule(const struct crush_map *map,

	for (step = 0; step < rule->len; step++) {
		int firstn = 0;
		struct crush_rule_step *curstep = &rule->steps[step];
		const struct crush_rule_step *curstep = &rule->steps[step];

		switch (curstep->op) {
		case CRUSH_RULE_TAKE:
@@ -936,6 +1003,7 @@ int crush_do_rule(const struct crush_map *map,
						recurse_tries = choose_tries;
					osize += crush_choose_firstn(
						map,
						cw,
						map->buckets[bno],
						weight, weight_max,
						x, numrep,
@@ -956,6 +1024,7 @@ int crush_do_rule(const struct crush_map *map,
						    numrep : (result_max-osize));
					crush_choose_indep(
						map,
						cw,
						map->buckets[bno],
						weight, weight_max,
						x, out_size, numrep,
@@ -997,5 +1066,6 @@ int crush_do_rule(const struct crush_map *map,
			break;
		}
	}

	return result_len;
}
Loading