
Commit 12f5b931 authored by Keith Busch, committed by Jens Axboe

blk-mq: Remove generation sequence



This patch simplifies the timeout handling by relying on the request
reference counting to ensure the iterator is operating on an inflight
and truly timed out request. Since the reference counting prevents the
tag from being reallocated, the block layer no longer needs to prevent
drivers from completing their requests while the timeout handler is
operating on them: a driver completing a request is allowed to proceed
to the next state without additional synchronization with the block
layer.
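In outline, the completion side now claims a request with a single
atomic state change, mirroring __blk_mq_complete_request() in the diff
below; whichever of the completion path and the timeout handler makes
the transition first wins, and the other simply backs off:

	/* Only one context wins the IN_FLIGHT -> COMPLETE transition. */
	if (cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) !=
			MQ_RQ_IN_FLIGHT)
		return;
	/* ...deliver the completion as usual... */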

This also removes the need for generation sequence numbers, since a
request can no longer be reallocated as a new instance while the
timeout handler is operating on it.

To enable this, a refcount is added to struct request so that request
users can be sure they're operating on the same request without it
changing while they're processing it. The request's tag won't be
released for reuse until both the timeout handler and the completion
path are done with it.
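In outline, the timeout path pins a candidate request before acting on
it, and only the last reference holder returns the tag, as
blk_mq_check_expired() in the diff below does:

	if (!blk_mq_req_expired(rq, next))	/* cheap check first */
		return;
	if (!refcount_inc_not_zero(&rq->ref))	/* completion already freed it */
		return;
	if (blk_mq_req_expired(rq, next))	/* re-check while pinned */
		blk_mq_rq_timed_out(rq, reserved);
	if (refcount_dec_and_test(&rq->ref))	/* last reference frees the tag */
		__blk_mq_free_request(rq);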

Signed-off-by: Keith Busch <keith.busch@intel.com>
[hch: slight cleanups, added back submission side hctx lock, use cmpxchg
 for completions]
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent ad103e79
block/blk-core.c: +0 −6
@@ -198,12 +198,6 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->internal_tag = -1;
 	rq->start_time_ns = ktime_get_ns();
 	rq->part = NULL;
-	seqcount_init(&rq->gstate_seq);
-	u64_stats_init(&rq->aborted_gstate_sync);
-	/*
-	 * See comment of blk_mq_init_request
-	 */
-	WRITE_ONCE(rq->gstate, MQ_RQ_GEN_INC);
 }
 EXPORT_SYMBOL(blk_rq_init);

block/blk-mq-debugfs.c: +0 −1
@@ -344,7 +344,6 @@ static const char *const rqf_name[] = {
 	RQF_NAME(STATS),
 	RQF_NAME(SPECIAL_PAYLOAD),
 	RQF_NAME(ZONE_WRITE_LOCKED),
-	RQF_NAME(MQ_TIMEOUT_EXPIRED),
 	RQF_NAME(MQ_POLL_SLEPT),
 };
 #undef RQF_NAME

block/blk-mq.c: +75 −183
@@ -332,6 +332,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 #endif
 
 	data->ctx->rq_dispatched[op_is_sync(op)]++;
+	refcount_set(&rq->ref, 1);
 	return rq;
 }
 
@@ -465,13 +466,27 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 
+static void __blk_mq_free_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+	const int sched_tag = rq->internal_tag;
+
+	if (rq->tag != -1)
+		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
+	if (sched_tag != -1)
+		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
+	blk_mq_sched_restart(hctx);
+	blk_queue_exit(q);
+}
+
 void blk_mq_free_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-	const int sched_tag = rq->internal_tag;
 
 	if (rq->rq_flags & RQF_ELVPRIV) {
 		if (e && e->type->ops.mq.finish_request)
@@ -494,13 +509,9 @@ void blk_mq_free_request(struct request *rq)
 	if (blk_rq_rl(rq))
 		blk_put_rl(blk_rq_rl(rq));
 
-	blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
-	if (rq->tag != -1)
-		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
-	if (sched_tag != -1)
-		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
-	blk_mq_sched_restart(hctx);
-	blk_queue_exit(q);
+	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
+	if (refcount_dec_and_test(&rq->ref))
+		__blk_mq_free_request(rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
@@ -547,8 +558,9 @@ static void __blk_mq_complete_request(struct request *rq)
 	bool shared = false;
 	int cpu;
 
-	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT);
-	blk_mq_rq_update_state(rq, MQ_RQ_COMPLETE);
+	if (cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) !=
+			MQ_RQ_IN_FLIGHT)
+		return;
 
 	if (rq->internal_tag != -1)
 		blk_mq_sched_completed_request(rq);
@@ -593,36 +605,6 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
 		*srcu_idx = srcu_read_lock(hctx->srcu);
 }
 
-static void blk_mq_rq_update_aborted_gstate(struct request *rq, u64 gstate)
-{
-	unsigned long flags;
-
-	/*
-	 * blk_mq_rq_aborted_gstate() is used from the completion path and
-	 * can thus be called from irq context.  u64_stats_fetch in the
-	 * middle of update on the same CPU leads to lockup.  Disable irq
-	 * while updating.
-	 */
-	local_irq_save(flags);
-	u64_stats_update_begin(&rq->aborted_gstate_sync);
-	rq->aborted_gstate = gstate;
-	u64_stats_update_end(&rq->aborted_gstate_sync);
-	local_irq_restore(flags);
-}
-
-static u64 blk_mq_rq_aborted_gstate(struct request *rq)
-{
-	unsigned int start;
-	u64 aborted_gstate;
-
-	do {
-		start = u64_stats_fetch_begin(&rq->aborted_gstate_sync);
-		aborted_gstate = rq->aborted_gstate;
-	} while (u64_stats_fetch_retry(&rq->aborted_gstate_sync, start));
-
-	return aborted_gstate;
-}
-
 /**
  * blk_mq_complete_request - end I/O on a request
  * @rq:		the request being processed
@@ -633,28 +615,9 @@ static u64 blk_mq_rq_aborted_gstate(struct request *rq)
  **/
 void blk_mq_complete_request(struct request *rq)
 {
-	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
-	int srcu_idx;
-
-	if (unlikely(blk_should_fake_timeout(q)))
+	if (unlikely(blk_should_fake_timeout(rq->q)))
 		return;
-
-	/*
-	 * If @rq->aborted_gstate equals the current instance, timeout is
-	 * claiming @rq and we lost.  This is synchronized through
-	 * hctx_lock().  See blk_mq_timeout_work() for details.
-	 *
-	 * Completion path never blocks and we can directly use RCU here
-	 * instead of hctx_lock() which can be either RCU or SRCU.
-	 * However, that would complicate paths which want to synchronize
-	 * against us.  Let stay in sync with the issue path so that
-	 * hctx_lock() covers both issue and completion paths.
-	 */
-	hctx_lock(hctx, &srcu_idx);
-	if (blk_mq_rq_aborted_gstate(rq) != rq->gstate)
-		__blk_mq_complete_request(rq);
-	hctx_unlock(hctx, srcu_idx);
+	__blk_mq_complete_request(rq);
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
@@ -683,25 +646,8 @@ void blk_mq_start_request(struct request *rq)
 
 	WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
 
-	/*
-	 * Mark @rq in-flight which also advances the generation number,
-	 * and register for timeout.  Protect with a seqcount to allow the
-	 * timeout path to read both @rq->gstate and @rq->deadline
-	 * coherently.
-	 *
-	 * This is the only place where a request is marked in-flight.  If
-	 * the timeout path reads an in-flight @rq->gstate, the
-	 * @rq->deadline it reads together under @rq->gstate_seq is
-	 * guaranteed to be the matching one.
-	 */
-	preempt_disable();
-	write_seqcount_begin(&rq->gstate_seq);
-
 	blk_add_timer(rq);
-	blk_mq_rq_update_state(rq, MQ_RQ_IN_FLIGHT);
-
-	write_seqcount_end(&rq->gstate_seq);
-	preempt_enable();
+	WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
 
 	if (q->dma_drain_size && blk_rq_bytes(rq)) {
 		/*
@@ -714,11 +660,6 @@ void blk_mq_start_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_start_request);
 
-/*
- * When we reach here because queue is busy, it's safe to change the state
- * to IDLE without checking @rq->aborted_gstate because we should still be
- * holding the RCU read lock and thus protected against timeout.
- */
 static void __blk_mq_requeue_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
@@ -728,8 +669,8 @@ static void __blk_mq_requeue_request(struct request *rq)
 	trace_block_rq_requeue(q, rq);
 	wbt_requeue(q->rq_wb, rq);
 
-	if (blk_mq_rq_state(rq) != MQ_RQ_IDLE) {
-		blk_mq_rq_update_state(rq, MQ_RQ_IDLE);
+	if (blk_mq_request_started(rq)) {
+		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 		if (q->dma_drain_size && blk_rq_bytes(rq))
 			rq->nr_phys_segments--;
 	}
@@ -827,33 +768,20 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
-struct blk_mq_timeout_data {
-	unsigned long next;
-	unsigned int next_set;
-	unsigned int nr_expired;
-};
-
 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
 	const struct blk_mq_ops *ops = req->q->mq_ops;
 	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
 
-	req->rq_flags |= RQF_MQ_TIMEOUT_EXPIRED;
-
 	if (ops->timeout)
 		ret = ops->timeout(req, reserved);
 
 	switch (ret) {
 	case BLK_EH_HANDLED:
-		__blk_mq_complete_request(req);
+		if (blk_mq_rq_state(req) == MQ_RQ_IN_FLIGHT)
+			__blk_mq_complete_request(req);
 		break;
 	case BLK_EH_RESET_TIMER:
-		/*
-		 * As nothing prevents from completion happening while
-		 * ->aborted_gstate is set, this may lead to ignored
-		 * completions and further spurious timeouts.
-		 */
-		blk_mq_rq_update_aborted_gstate(req, 0);
 		blk_add_timer(req);
 		break;
 	case BLK_EH_NOT_HANDLED:
@@ -864,64 +792,65 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 	}
 }
 
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
-		struct request *rq, void *priv, bool reserved)
+static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
 {
-	struct blk_mq_timeout_data *data = priv;
-	unsigned long gstate, deadline;
-	int start;
-
-	might_sleep();
+	unsigned long deadline;
 
-	if (rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED)
-		return;
+	if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
+		return false;
 
-	/* read coherent snapshots of @rq->state_gen and @rq->deadline */
-	while (true) {
-		start = read_seqcount_begin(&rq->gstate_seq);
-		gstate = READ_ONCE(rq->gstate);
-		deadline = blk_rq_deadline(rq);
-		if (!read_seqcount_retry(&rq->gstate_seq, start))
-			break;
-		cond_resched();
-	}
+	deadline = blk_rq_deadline(rq);
+	if (time_after_eq(jiffies, deadline))
+		return true;
 
-	/* if in-flight && overdue, mark for abortion */
-	if ((gstate & MQ_RQ_STATE_MASK) == MQ_RQ_IN_FLIGHT &&
-	    time_after_eq(jiffies, deadline)) {
-		blk_mq_rq_update_aborted_gstate(rq, gstate);
-		data->nr_expired++;
-		hctx->nr_expired++;
-	} else if (!data->next_set || time_after(data->next, deadline)) {
-		data->next = deadline;
-		data->next_set = 1;
-	}
+	if (*next == 0)
+		*next = deadline;
+	else if (time_after(*next, deadline))
+		*next = deadline;
+	return false;
 }
 
-static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
+static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
 		struct request *rq, void *priv, bool reserved)
 {
+	unsigned long *next = priv;
+
+	/*
+	 * Just do a quick check if it is expired before locking the request in
+	 * so we're not unnecessarilly synchronizing across CPUs.
+	 */
+	if (!blk_mq_req_expired(rq, next))
+		return;
+
 	/*
-	 * We marked @rq->aborted_gstate and waited for RCU.  If there were
-	 * completions that we lost to, they would have finished and
-	 * updated @rq->gstate by now; otherwise, the completion path is
-	 * now guaranteed to see @rq->aborted_gstate and yield.  If
-	 * @rq->aborted_gstate still matches @rq->gstate, @rq is ours.
+	 * We have reason to believe the request may be expired. Take a
+	 * reference on the request to lock this request lifetime into its
+	 * currently allocated context to prevent it from being reallocated in
+	 * the event the completion by-passes this timeout handler.
+	 *
+	 * If the reference was already released, then the driver beat the
+	 * timeout handler to posting a natural completion.
+	 */
+	if (!refcount_inc_not_zero(&rq->ref))
+		return;
+
+	/*
+	 * The request is now locked and cannot be reallocated underneath the
+	 * timeout handler's processing. Re-verify this exact request is truly
+	 * expired; if it is not expired, then the request was completed and
+	 * reallocated as a new request.
 	 */
-	if (!(rq->rq_flags & RQF_MQ_TIMEOUT_EXPIRED) &&
-	    READ_ONCE(rq->gstate) == rq->aborted_gstate)
+	if (blk_mq_req_expired(rq, next))
 		blk_mq_rq_timed_out(rq, reserved);
+	if (refcount_dec_and_test(&rq->ref))
+		__blk_mq_free_request(rq);
 }
 
 static void blk_mq_timeout_work(struct work_struct *work)
 {
 	struct request_queue *q =
 		container_of(work, struct request_queue, timeout_work);
-	struct blk_mq_timeout_data data = {
-		.next		= 0,
-		.next_set	= 0,
-		.nr_expired	= 0,
-	};
+	unsigned long next = 0;
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
@@ -941,39 +870,10 @@ static void blk_mq_timeout_work(struct work_struct *work)
 	if (!percpu_ref_tryget(&q->q_usage_counter))
 		return;
 
-	/* scan for the expired ones and set their ->aborted_gstate */
-	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
-
-	if (data.nr_expired) {
-		bool has_rcu = false;
-
-		/*
-		 * Wait till everyone sees ->aborted_gstate.  The
-		 * sequential waits for SRCUs aren't ideal.  If this ever
-		 * becomes a problem, we can add per-hw_ctx rcu_head and
-		 * wait in parallel.
-		 */
-		queue_for_each_hw_ctx(q, hctx, i) {
-			if (!hctx->nr_expired)
-				continue;
-
-			if (!(hctx->flags & BLK_MQ_F_BLOCKING))
-				has_rcu = true;
-			else
-				synchronize_srcu(hctx->srcu);
-
-			hctx->nr_expired = 0;
-		}
-		if (has_rcu)
-			synchronize_rcu();
-
-		/* terminate the ones we won */
-		blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
-	}
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
 
-	if (data.next_set) {
-		data.next = blk_rq_timeout(round_jiffies_up(data.next));
-		mod_timer(&q->timeout, data.next);
+	if (next != 0) {
+		mod_timer(&q->timeout, next);
 	} else {
 		/*
 		 * Request timeouts are handled as a forward rolling timer. If
@@ -2049,15 +1949,7 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
 			return ret;
 	}
 
-	seqcount_init(&rq->gstate_seq);
-	u64_stats_init(&rq->aborted_gstate_sync);
-	/*
-	 * start gstate with gen 1 instead of 0, otherwise it will be equal
-	 * to aborted_gstate, and be identified timed out by
-	 * blk_mq_terminate_expired.
-	 */
-	WRITE_ONCE(rq->gstate, MQ_RQ_GEN_INC);
+	WRITE_ONCE(rq->state, MQ_RQ_IDLE);
 
 	return 0;
 }

block/blk-mq.h: +2 −40
@@ -30,20 +30,6 @@ struct blk_mq_ctx {
 	struct kobject		kobj;
 } ____cacheline_aligned_in_smp;
 
-/*
- * Bits for request->gstate.  The lower two bits carry MQ_RQ_* state value
- * and the upper bits the generation number.
- */
-enum mq_rq_state {
-	MQ_RQ_IDLE		= 0,
-	MQ_RQ_IN_FLIGHT		= 1,
-	MQ_RQ_COMPLETE		= 2,
-
-	MQ_RQ_STATE_BITS	= 2,
-	MQ_RQ_STATE_MASK	= (1 << MQ_RQ_STATE_BITS) - 1,
-	MQ_RQ_GEN_INC		= 1 << MQ_RQ_STATE_BITS,
-};
-
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
@@ -107,33 +93,9 @@ void blk_mq_release(struct request_queue *q);
  * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
  * @rq: target request.
  */
-static inline int blk_mq_rq_state(struct request *rq)
+static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
 {
-	return READ_ONCE(rq->gstate) & MQ_RQ_STATE_MASK;
-}
-
-/**
- * blk_mq_rq_update_state() - set the current MQ_RQ_* state of a request
- * @rq: target request.
- * @state: new state to set.
- *
- * Set @rq's state to @state.  The caller is responsible for ensuring that
- * there are no other updaters.  A request can transition into IN_FLIGHT
- * only from IDLE and doing so increments the generation number.
- */
-static inline void blk_mq_rq_update_state(struct request *rq,
-					  enum mq_rq_state state)
-{
-	u64 old_val = READ_ONCE(rq->gstate);
-	u64 new_val = (old_val & ~MQ_RQ_STATE_MASK) | state;
-
-	if (state == MQ_RQ_IN_FLIGHT) {
-		WARN_ON_ONCE((old_val & MQ_RQ_STATE_MASK) != MQ_RQ_IDLE);
-		new_val += MQ_RQ_GEN_INC;
-	}
-
-	/* avoid exposing interim values */
-	WRITE_ONCE(rq->gstate, new_val);
+	return READ_ONCE(rq->state);
 }
 
 static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,

block/blk-timeout.c: +0 −1
@@ -214,7 +214,6 @@ void blk_add_timer(struct request *req)
 		req->timeout = q->rq_timeout;
 
 	blk_rq_set_deadline(req, jiffies + req->timeout);
-	req->rq_flags &= ~RQF_MQ_TIMEOUT_EXPIRED;
 
 	/*
 	 * Only the non-mq case needs to add the request to a protected list.