Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 78d91a75 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:
 "Here's a pull request for 4.11-rc, fixing a set of issues mostly
  centered around the new scheduling framework. These have been brewing
  for a while, but split up into what we absolutely need in 4.11, and
  what we can defer until 4.12. These are well tested, on both single
  queue and multiqueue setups, and with and without shared tags. They
  fix several hangs that have happened in testing.

  This is obviously larger than I would have preferred at this point in
  time, but I don't think we can shave much off this and still get the
  desired results.

  In detail, this pull request contains:

   - a set of five fixes for NVMe, mostly from Christoph and one from
     Roland.

   - a series from Bart, fixing issues with dm-mq and SCSI shared tags
     and scheduling. Note that one of those patches' commit messages may
     read like an optimization, but it is in fact an important fix for
     queue restarts in particular.

   - a series from Omar, most importantly fixing a hang with multiple
     hardware queues when we fail to get a driver tag. Another important
     fix in there is for resizing hardware queues, which nbd does when
     handling multiple sockets for one connection.

   - fixing an imbalance in putting the ctx for hctx request allocations
     from Minchan"

* 'for-linus' of git://git.kernel.dk/linux-block:
  blk-mq: Restart a single queue if tag sets are shared
  dm rq: Avoid that request processing stalls sporadically
  scsi: Avoid that SCSI queues get stuck
  blk-mq: Introduce blk_mq_delay_run_hw_queue()
  blk-mq: remap queues when adding/removing hardware queues
  blk-mq-sched: fix crash in switch error path
  blk-mq-sched: set up scheduler tags when bringing up new queues
  blk-mq-sched: refactor scheduler initialization
  blk-mq: use the right hctx when getting a driver tag fails
  nvmet: fix byte swap in nvmet_parse_io_cmd
  nvmet: fix byte swap in nvmet_execute_write_zeroes
  nvmet: add missing byte swap in nvmet_get_smart_log
  nvme: add missing byte swap in nvme_setup_discard
  nvme: Correct NVMF enum values to match NVMe-oF rev 1.0
  block: do not put mq context in blk_mq_alloc_request_hctx
parents c3df1c7c 6d8c6c0f
Loading
Loading
Loading
Loading
+131 −50
Original line number Diff line number Diff line
@@ -171,7 +171,8 @@ void blk_mq_sched_put_request(struct request *rq)

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct elevator_queue *e = hctx->queue->elevator;
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
	bool did_work = false;
	LIST_HEAD(rq_list);
@@ -203,10 +204,10 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		did_work = blk_mq_dispatch_rq_list(hctx, &rq_list);
		did_work = blk_mq_dispatch_rq_list(q, &rq_list);
	} else if (!has_sched_dispatch) {
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(hctx, &rq_list);
		blk_mq_dispatch_rq_list(q, &rq_list);
	}

	/*
@@ -222,7 +223,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
			if (!rq)
				break;
			list_add(&rq->queuelist, &rq_list);
		} while (blk_mq_dispatch_rq_list(hctx, &rq_list));
		} while (blk_mq_dispatch_rq_list(q, &rq_list));
	}
}

@@ -317,25 +318,68 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
	return true;
}

static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
		if (blk_mq_hctx_has_pending(hctx))
		if (blk_mq_hctx_has_pending(hctx)) {
			blk_mq_run_hw_queue(hctx, true);
			return true;
		}
	}
	return false;
}

void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	unsigned int i;
/**
 * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
 * @pos:    loop cursor.
 * @skip:   the list element that will not be examined. Iteration starts at
 *          @skip->next.
 * @head:   head of the list to examine. This list must have at least one
 *          element, namely @skip.
 * @member: name of the list_head structure within typeof(*pos).
 *
 * The cursor starts at @skip and is advanced in the loop condition before
 * every pass through the body. When the next node is @head itself, @head is
 * stepped over and the cursor moves to the entry that follows it, so the walk
 * wraps around the list. The loop stops as soon as the cursor is back at
 * @skip; the body therefore runs for every entry except @skip, and not at all
 * when @skip is the only entry.
 *
 * Entries are obtained with list_entry_rcu().
 * NOTE(review): presumably the caller must be inside an RCU read-side
 * critical section, as with the other RCU list iterators - confirm.
 */
#define list_for_each_entry_rcu_rr(pos, skip, head, member)		\
	for ((pos) = (skip);						\
	     (pos = (pos)->member.next != (head) ? list_entry_rcu(	\
			(pos)->member.next, typeof(*pos), member) :	\
	      list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
	     (pos) != (skip); )

	if (test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) {
		if (test_and_clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) {
			queue_for_each_hw_ctx(q, hctx, i)
				blk_mq_sched_restart_hctx(hctx);
/*
 * Called after a driver tag has been freed to check whether a hctx needs to
 * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
 * queues in a round-robin fashion if the tag set of @hctx is shared with other
 * hardware queues.
 */
void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
{
	struct blk_mq_tags *const tags = hctx->tags;
	struct blk_mq_tag_set *const set = hctx->queue->tag_set;
	struct request_queue *const queue = hctx->queue, *q;
	struct blk_mq_hw_ctx *hctx2;
	unsigned int i, j;

	if (set->flags & BLK_MQ_F_TAG_SHARED) {
		rcu_read_lock();
		list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
					   tag_set_list) {
			queue_for_each_hw_ctx(q, hctx2, i)
				if (hctx2->tags == tags &&
				    blk_mq_sched_restart_hctx(hctx2))
					goto done;
		}
		j = hctx->queue_num + 1;
		for (i = 0; i < queue->nr_hw_queues; i++, j++) {
			if (j == queue->nr_hw_queues)
				j = 0;
			hctx2 = queue->queue_hw_ctx[j];
			if (hctx2->tags == tags &&
			    blk_mq_sched_restart_hctx(hctx2))
				break;
		}
done:
		rcu_read_unlock();
	} else {
		blk_mq_sched_restart_hctx(hctx);
	}
@@ -431,11 +475,67 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
	}
}

int blk_mq_sched_setup(struct request_queue *q)
/*
 * Set up the scheduler tags of one hardware queue.
 *
 * Allocates hctx->sched_tags with blk_mq_alloc_rq_map() and then the requests
 * backing it with blk_mq_alloc_rqs(), both sized by q->nr_requests.
 *
 * Returns 0 on success, -ENOMEM when the tag map could not be allocated, or
 * the error reported by blk_mq_alloc_rqs(). In the last case
 * blk_mq_sched_free_tags() is called for @hctx before returning.
 */
static int blk_mq_sched_alloc_tags(struct request_queue *q,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	struct blk_mq_tag_set *tag_set = q->tag_set;
	int err;

	hctx->sched_tags = blk_mq_alloc_rq_map(tag_set, hctx_idx,
					       q->nr_requests,
					       tag_set->reserved_tags);
	if (!hctx->sched_tags)
		return -ENOMEM;

	err = blk_mq_alloc_rqs(tag_set, hctx->sched_tags, hctx_idx,
			       q->nr_requests);
	if (!err)
		return 0;

	/* Request allocation failed: undo the tag map set up above. */
	blk_mq_sched_free_tags(tag_set, hctx, hctx_idx);
	return err;
}

static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int ret, i;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_sched_free_tags(set, hctx, i);
}

/*
 * Scheduler part of hardware queue initialization.
 *
 * When @q has an elevator attached, allocate scheduler tags for @hctx and
 * return the result of that allocation. Without an elevator there is nothing
 * to set up and 0 is returned.
 */
int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			   unsigned int hctx_idx)
{
	if (q->elevator)
		return blk_mq_sched_alloc_tags(q, hctx, hctx_idx);

	return 0;
}

/*
 * Scheduler part of hardware queue teardown.
 *
 * Frees the scheduler tags of @hctx, but only when @q has an elevator
 * attached; otherwise this is a no-op.
 */
void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			    unsigned int hctx_idx)
{
	if (q->elevator)
		blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
}

int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;
	int ret;

	if (!e) {
		q->elevator = NULL;
		return 0;
	}

	/*
	 * Default to 256, since we don't split into sync/async like the
@@ -443,49 +543,30 @@ int blk_mq_sched_setup(struct request_queue *q)
	 */
	q->nr_requests = 2 * BLKDEV_MAX_RQ;

	/*
	 * We're switching to using an IO scheduler, so setup the hctx
	 * scheduler tags and switch the request map from the regular
	 * tags to scheduler tags. First allocate what we need, so we
	 * can safely fail and fallback, if needed.
	 */
	ret = 0;
	queue_for_each_hw_ctx(q, hctx, i) {
		hctx->sched_tags = blk_mq_alloc_rq_map(set, i,
				q->nr_requests, set->reserved_tags);
		if (!hctx->sched_tags) {
			ret = -ENOMEM;
			break;
		}
		ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests);
		ret = blk_mq_sched_alloc_tags(q, hctx, i);
		if (ret)
			break;
	}

	/*
	 * If we failed, free what we did allocate
	 */
	if (ret) {
		queue_for_each_hw_ctx(q, hctx, i) {
			if (!hctx->sched_tags)
				continue;
			blk_mq_sched_free_tags(set, hctx, i);
			goto err;
	}

		return ret;
	}
	ret = e->ops.mq.init_sched(q, e);
	if (ret)
		goto err;

	return 0;

err:
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
	return ret;
}

void blk_mq_sched_teardown(struct request_queue *q)
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_sched_free_tags(set, hctx, i);
	if (e->type->ops.mq.exit_sched)
		e->type->ops.mq.exit_sched(e);
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
}

int blk_mq_sched_init(struct request_queue *q)
+8 −17
Original line number Diff line number Diff line
@@ -19,7 +19,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
				struct request **merged_request);
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);

void blk_mq_sched_insert_request(struct request *rq, bool at_head,
				 bool run_queue, bool async, bool can_block);
@@ -32,8 +32,13 @@ void blk_mq_sched_move_to_dispatch(struct blk_mq_hw_ctx *hctx,
			struct list_head *rq_list,
			struct request *(*get_rq)(struct blk_mq_hw_ctx *));

int blk_mq_sched_setup(struct request_queue *q);
void blk_mq_sched_teardown(struct request_queue *q);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);

int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			   unsigned int hctx_idx);
void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			    unsigned int hctx_idx);

int blk_mq_sched_init(struct request_queue *q);

@@ -131,20 +136,6 @@ static inline void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}

/*
 * Mark a hardware queue and the request queue it belongs to as needing a
 * restart.
 */
static inline void blk_mq_sched_mark_restart_queue(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;

	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
	if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags))
		set_bit(QUEUE_FLAG_RESTART, &q->queue_flags);
}

static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
{
	return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+62 −23
Original line number Diff line number Diff line
@@ -321,7 +321,6 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,

	rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);

	blk_mq_put_ctx(alloc_data.ctx);
	blk_queue_exit(q);

	if (!rq)
@@ -349,7 +348,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
	if (sched_tag != -1)
		blk_mq_sched_completed_request(hctx, rq);
	blk_mq_sched_restart_queues(hctx);
	blk_mq_sched_restart(hctx);
	blk_queue_exit(q);
}

@@ -846,12 +845,8 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
		.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
	};

	if (rq->tag != -1) {
done:
		if (hctx)
			*hctx = data.hctx;
		return true;
	}
	if (rq->tag != -1)
		goto done;

	if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
		data.flags |= BLK_MQ_REQ_RESERVED;
@@ -863,10 +858,12 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
			atomic_inc(&data.hctx->nr_active);
		}
		data.hctx->tags->rqs[rq->tag] = rq;
		goto done;
	}

	return false;
done:
	if (hctx)
		*hctx = data.hctx;
	return rq->tag != -1;
}

static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
@@ -963,14 +960,17 @@ static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx)
	return true;
}

bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
{
	struct request_queue *q = hctx->queue;
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
	LIST_HEAD(driver_list);
	struct list_head *dptr;
	int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;

	if (list_empty(list))
		return false;

	/*
	 * Start off with dptr being NULL, so we start the first request
	 * immediately, even if we have more pending.
@@ -981,7 +981,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
	 * Now process all the entries, sending them to the driver.
	 */
	errors = queued = 0;
	while (!list_empty(list)) {
	do {
		struct blk_mq_queue_data bd;

		rq = list_first_entry(list, struct request, queuelist);
@@ -1052,7 +1052,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
		 */
		if (!dptr && list->next != list->prev)
			dptr = &driver_list;
	}
	} while (!list_empty(list));

	hctx->dispatched[queued_to_index(queued)]++;

@@ -1135,7 +1135,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
	return hctx->next_cpu;
}

void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
					unsigned long msecs)
{
	if (unlikely(blk_mq_hctx_stopped(hctx) ||
		     !blk_mq_hw_queue_mapped(hctx)))
@@ -1152,7 +1153,24 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
		put_cpu();
	}

	kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work);
	if (msecs == 0)
		kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx),
					 &hctx->run_work);
	else
		kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
						 &hctx->delayed_run_work,
						 msecs_to_jiffies(msecs));
}

/*
 * Exported entry point for running a hardware queue after a delay.
 *
 * Forwards to __blk_mq_delay_run_hw_queue() with async forced to true and
 * the caller's @msecs. At its scheduling step that helper queues
 * hctx->delayed_run_work with a delay of msecs_to_jiffies(@msecs) when
 * @msecs is non-zero, and hctx->run_work when @msecs is zero.
 */
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
	__blk_mq_delay_run_hw_queue(hctx, true, msecs);
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);

/*
 * Run a hardware queue without delay.
 *
 * Thin wrapper that forwards @hctx and @async to
 * __blk_mq_delay_run_hw_queue() with a delay of 0 milliseconds.
 */
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	__blk_mq_delay_run_hw_queue(hctx, async, 0);
}

void blk_mq_run_hw_queues(struct request_queue *q, bool async)
@@ -1255,6 +1273,15 @@ static void blk_mq_run_work_fn(struct work_struct *work)
	__blk_mq_run_hw_queue(hctx);
}

/*
 * Work handler for hctx->delayed_run_work: map the work item back to the
 * hardware queue that embeds it and run that queue.
 */
static void blk_mq_delayed_run_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx = container_of(work, struct blk_mq_hw_ctx,
						  delayed_run_work.work);

	__blk_mq_run_hw_queue(hctx);
}

static void blk_mq_delay_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx;
@@ -1924,6 +1951,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
				       hctx->fq->flush_rq, hctx_idx,
				       flush_start_tag + hctx_idx);

	blk_mq_sched_exit_hctx(q, hctx, hctx_idx);

	if (set->ops->exit_hctx)
		set->ops->exit_hctx(hctx, hctx_idx);

@@ -1960,6 +1989,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
		node = hctx->numa_node = set->numa_node;

	INIT_WORK(&hctx->run_work, blk_mq_run_work_fn);
	INIT_DELAYED_WORK(&hctx->delayed_run_work, blk_mq_delayed_run_work_fn);
	INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
	spin_lock_init(&hctx->lock);
	INIT_LIST_HEAD(&hctx->dispatch);
@@ -1990,9 +2020,12 @@ static int blk_mq_init_hctx(struct request_queue *q,
	    set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
		goto free_bitmap;

	if (blk_mq_sched_init_hctx(q, hctx, hctx_idx))
		goto exit_hctx;

	hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
	if (!hctx->fq)
		goto exit_hctx;
		goto sched_exit_hctx;

	if (set->ops->init_request &&
	    set->ops->init_request(set->driver_data,
@@ -2007,6 +2040,8 @@ static int blk_mq_init_hctx(struct request_queue *q,

 free_fq:
	kfree(hctx->fq);
 sched_exit_hctx:
	blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
 exit_hctx:
	if (set->ops->exit_hctx)
		set->ops->exit_hctx(hctx, hctx_idx);
@@ -2233,8 +2268,6 @@ void blk_mq_release(struct request_queue *q)
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	blk_mq_sched_teardown(q);

	/* hctx kobj stays in hctx */
	queue_for_each_hw_ctx(q, hctx, i) {
		if (!hctx)
@@ -2565,6 +2598,14 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
	return 0;
}

/*
 * (Re)build the queue mapping of a tag set.
 *
 * Uses the driver's ->map_queues() callback when one is provided and falls
 * back to blk_mq_map_queues() otherwise. Returns whatever the chosen mapping
 * function returns.
 */
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
	if (!set->ops->map_queues)
		return blk_mq_map_queues(set);

	return set->ops->map_queues(set);
}

/*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions. May adjust the
@@ -2619,10 +2660,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
	if (!set->mq_map)
		goto out_free_tags;

	if (set->ops->map_queues)
		ret = set->ops->map_queues(set);
	else
		ret = blk_mq_map_queues(set);
	ret = blk_mq_update_queue_map(set);
	if (ret)
		goto out_free_mq_map;

@@ -2714,6 +2752,7 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
		blk_mq_freeze_queue(q);

	set->nr_hw_queues = nr_hw_queues;
	blk_mq_update_queue_map(set);
	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		blk_mq_realloc_hw_ctxs(set, q);

+1 −1
Original line number Diff line number Diff line
@@ -31,7 +31,7 @@ void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx);
bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
+1 −1
Original line number Diff line number Diff line
@@ -816,7 +816,7 @@ static void blk_release_queue(struct kobject *kobj)

	if (q->elevator) {
		ioc_clear_queue(q);
		elevator_exit(q->elevator);
		elevator_exit(q, q->elevator);
	}

	blk_exit_rl(&q->root_rl);
Loading