Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1429d7c9 authored by Jens Axboe's avatar Jens Axboe
Browse files

blk-mq: switch ctx pending map to the sparser blk_align_bitmap



Each hardware queue has a bitmap of software queues with pending
requests. When new IO is queued on a software queue, the bit is
set, and when IO is pruned on a hardware queue run, the bit is
cleared. This causes a lot of traffic. Switch this from the regular
BITS_PER_LONG bitmap to a sparser layout, similarly to what was
done for blk-mq tagging.

20% performance increase was observed for single threaded IO, and
about 15% performanc increase on multiple threads driving the
same device.

Signed-off-by: default avatarJens Axboe <axboe@fb.com>
parent e93ecf60
Loading
Loading
Loading
Loading
+91 −28
Original line number Diff line number Diff line
@@ -56,21 +56,40 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	unsigned int i;

	for (i = 0; i < hctx->nr_ctx_map; i++)
		if (hctx->ctx_map[i])
	for (i = 0; i < hctx->ctx_map.map_size; i++)
		if (hctx->ctx_map.map[i].word)
			return true;

	return false;
}

static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
					      struct blk_mq_ctx *ctx)
{
	return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
}

#define CTX_TO_BIT(hctx, ctx)	\
	((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	if (!test_bit(ctx->index_hw, hctx->ctx_map))
		set_bit(ctx->index_hw, hctx->ctx_map);
	struct blk_align_bitmap *bm = get_bm(hctx, ctx);

	if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
		set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
}

static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
				      struct blk_mq_ctx *ctx)
{
	struct blk_align_bitmap *bm = get_bm(hctx, ctx);

	clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
}

static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
@@ -614,6 +633,40 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
	return false;
}

/*
 * Process software queues that have been marked busy, splicing them
 * to the for-dispatch
 */
static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
	struct blk_mq_ctx *ctx;
	int i;

	for (i = 0; i < hctx->ctx_map.map_size; i++) {
		struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
		unsigned int off, bit;

		if (!bm->word)
			continue;

		bit = 0;
		off = i * hctx->ctx_map.bits_per_word;
		do {
			bit = find_next_bit(&bm->word, bm->depth, bit);
			if (bit >= bm->depth)
				break;

			ctx = hctx->ctxs[bit + off];
			clear_bit(bit, &bm->word);
			spin_lock(&ctx->lock);
			list_splice_tail_init(&ctx->rq_list, list);
			spin_unlock(&ctx->lock);

			bit++;
		} while (1);
	}
}

/*
 * Run this hardware queue, pulling any software queues mapped to it in.
 * Note that this function currently has various problems around ordering
@@ -623,10 +676,9 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct blk_mq_ctx *ctx;
	struct request *rq;
	LIST_HEAD(rq_list);
	int bit, queued;
	int queued;

	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));

@@ -638,14 +690,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
	/*
	 * Touch any software queue that has pending entries.
	 */
	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
		clear_bit(bit, hctx->ctx_map);
		ctx = hctx->ctxs[bit];

		spin_lock(&ctx->lock);
		list_splice_tail_init(&ctx->rq_list, &rq_list);
		spin_unlock(&ctx->lock);
	}
	flush_busy_ctxs(hctx, &rq_list);

	/*
	 * If we have previous entries on our dispatch list, grab them
@@ -658,14 +703,10 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
		spin_unlock(&hctx->lock);
	}

	/*
	 * Delete and return all entries from our dispatch list
	 */
	queued = 0;

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	queued = 0;
	while (!list_empty(&rq_list)) {
		int ret;

@@ -1158,7 +1199,7 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_list)) {
		list_splice_init(&ctx->rq_list, &tmp);
		clear_bit(ctx->index_hw, hctx->ctx_map);
		blk_mq_hctx_clear_pending(hctx, ctx);
	}
	spin_unlock(&ctx->lock);

@@ -1298,6 +1339,34 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
	return NULL;
}

static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
{
	kfree(bitmap->map);
}

static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
{
	unsigned int bpw = 8, total, num_maps, i;

	bitmap->bits_per_word = bpw;

	num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
	bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
					GFP_KERNEL, node);
	if (!bitmap->map)
		return -ENOMEM;

	bitmap->map_size = num_maps;

	total = nr_cpu_ids;
	for (i = 0; i < num_maps; i++) {
		bitmap->map[i].depth = min(total, bitmap->bits_per_word);
		total -= bitmap->map[i].depth;
	}

	return 0;
}

static int blk_mq_init_hw_queues(struct request_queue *q,
		struct blk_mq_tag_set *set)
{
@@ -1308,7 +1377,6 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
	 * Initialize hardware queues
	 */
	queue_for_each_hw_ctx(q, hctx, i) {
		unsigned int num_maps;
		int node;

		node = hctx->numa_node;
@@ -1339,13 +1407,9 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
		if (!hctx->ctxs)
			break;

		num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
		hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
						GFP_KERNEL, node);
		if (!hctx->ctx_map)
		if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
			break;

		hctx->nr_ctx_map = num_maps;
		hctx->nr_ctx = 0;

		if (set->ops->init_hctx &&
@@ -1368,7 +1432,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,

		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
		kfree(hctx->ctxs);
		kfree(hctx->ctx_map);
		blk_mq_free_bitmap(&hctx->ctx_map);
	}

	return 1;
@@ -1542,7 +1606,6 @@ void blk_mq_free_queue(struct request_queue *q)
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		kfree(hctx->ctx_map);
		kfree(hctx->ctxs);
		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
		if (q->mq_ops->exit_hctx)
+8 −2
Original line number Diff line number Diff line
@@ -11,6 +11,12 @@ struct blk_mq_cpu_notifier {
	void (*notify)(void *data, unsigned long action, unsigned int cpu);
};

struct blk_mq_ctxmap {
	unsigned int map_size;
	unsigned int bits_per_word;
	struct blk_align_bitmap *map;
};

struct blk_mq_hw_ctx {
	struct {
		spinlock_t		lock;
@@ -31,8 +37,8 @@ struct blk_mq_hw_ctx {

	void			*driver_data;

	unsigned int 		nr_ctx_map;
	unsigned long		*ctx_map;
	struct blk_mq_ctxmap	ctx_map;

	unsigned int		nr_ctx;
	struct blk_mq_ctx	**ctxs;