Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 64f1c21e authored by Jens Axboe
Browse files

blk-mq: make the polling code adaptive



The previous commit introduced the hybrid sleep/poll mode. Take
that one step further, and use the completion latencies to
automatically sleep for half the mean completion time. This is
a good approximation.

This changes the 'io_poll_delay' sysfs file a bit to expose the
various options. Depending on the value, the polling code will
behave differently:

-1	Never enter hybrid sleep mode
 0	Use half of the completion mean for the sleep delay
>0	Use this specific value as the sleep delay

Signed-off-by: Jens Axboe <axboe@fb.com>
Tested-By: Stephen Bates <sbates@raithlin.com>
Reviewed-By: Stephen Bates <sbates@raithlin.com>
parent 06426adf
Loading
Loading
Loading
Loading
+64 −3
Original line number Original line Diff line number Diff line
@@ -2132,6 +2132,11 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
	 */
	 */
	q->nr_requests = set->queue_depth;
	q->nr_requests = set->queue_depth;


	/*
	 * Default to classic polling
	 */
	q->poll_nsec = -1;

	if (set->ops->complete)
	if (set->ops->complete)
		blk_queue_softirq_done(q, set->ops->complete);
		blk_queue_softirq_done(q, set->ops->complete);


@@ -2469,14 +2474,70 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
}
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);


/*
 * Estimate how many nsecs to sleep before polling for completion of @rq.
 * Returns half the mean completion latency for this request's direction,
 * or 0 when no hybrid sleep should be attempted (no stats available yet,
 * or the op is neither a read nor a write).
 */
static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
				       struct blk_mq_hw_ctx *hctx,
				       struct request *rq)
{
	struct blk_rq_stat win[2];
	int dir;

	/*
	 * Stats collection isn't on yet: don't sleep this time around,
	 * but enable it so future polls can benefit.
	 */
	if (!blk_stat_enable(q))
		return 0;

	/*
	 * NOTE: doing this once per IO is wasteful; this could cache the
	 * current stats window and reuse it until it changes.
	 */
	memset(win, 0, sizeof(win));
	blk_hctx_stat_get(hctx, win);

	/*
	 * Optimistic guess: half the mean service time for this type of
	 * request. We can (and should) make this smarter — e.g. when the
	 * completion latencies are tightly clustered we can sleep closer
	 * to the mean. This matters most on devices whose completion
	 * latencies exceed ~10 usec.
	 */
	if (req_op(rq) == REQ_OP_READ)
		dir = BLK_STAT_READ;
	else if (req_op(rq) == REQ_OP_WRITE)
		dir = BLK_STAT_WRITE;
	else
		return 0;

	if (!win[dir].nr_samples)
		return 0;

	return (win[dir].mean + 1) / 2;
}

static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
				     struct blk_mq_hw_ctx *hctx,
				     struct request *rq)
				     struct request *rq)
{
{
	struct hrtimer_sleeper hs;
	struct hrtimer_sleeper hs;
	enum hrtimer_mode mode;
	enum hrtimer_mode mode;
	unsigned int nsecs;
	ktime_t kt;
	ktime_t kt;


	if (!q->poll_nsec || test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
	if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
		return false;

	/*
	 * poll_nsec can be:
	 *
	 * -1:	don't ever hybrid sleep
	 *  0:	use half of prev avg
	 * >0:	use this specific value
	 */
	if (q->poll_nsec == -1)
		return false;
	else if (q->poll_nsec > 0)
		nsecs = q->poll_nsec;
	else
		nsecs = blk_mq_poll_nsecs(q, hctx, rq);

	if (!nsecs)
		return false;
		return false;


	set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
	set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
@@ -2485,7 +2546,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
	 * This will be replaced with the stats tracking code, using
	 * This will be replaced with the stats tracking code, using
	 * 'avg_completion_time / 2' as the pre-sleep target.
	 * 'avg_completion_time / 2' as the pre-sleep target.
	 */
	 */
	kt = ktime_set(0, q->poll_nsec);
	kt = ktime_set(0, nsecs);


	mode = HRTIMER_MODE_REL;
	mode = HRTIMER_MODE_REL;
	hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
	hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
@@ -2520,7 +2581,7 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
	 * the IO isn't complete, we'll get called again and will go
	 * the IO isn't complete, we'll get called again and will go
	 * straight to the busy poll loop.
	 * straight to the busy poll loop.
	 */
	 */
	if (blk_mq_poll_hybrid_sleep(q, rq))
	if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
		return true;
		return true;


	hctx->poll_considered++;
	hctx->poll_considered++;
+18 −8
Original line number Original line Diff line number Diff line
@@ -352,24 +352,34 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)


static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
{
{
	return queue_var_show(q->poll_nsec / 1000, page);
	int val;

	if (q->poll_nsec == -1)
		val = -1;
	else
		val = q->poll_nsec / 1000;

	return sprintf(page, "%d\n", val);
}
}


static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
				size_t count)
				size_t count)
{
{
	unsigned long poll_usec;
	int err, val;
	ssize_t ret;


	if (!q->mq_ops || !q->mq_ops->poll)
	if (!q->mq_ops || !q->mq_ops->poll)
		return -EINVAL;
		return -EINVAL;


	ret = queue_var_store(&poll_usec, page, count);
	err = kstrtoint(page, 10, &val);
	if (ret < 0)
	if (err < 0)
		return ret;
		return err;


	q->poll_nsec = poll_usec * 1000;
	if (val == -1)
	return ret;
		q->poll_nsec = -1;
	else
		q->poll_nsec = val * 1000;

	return count;
}
}


static ssize_t queue_poll_show(struct request_queue *q, char *page)
static ssize_t queue_poll_show(struct request_queue *q, char *page)
+1 −1
Original line number Original line Diff line number Diff line
@@ -509,7 +509,7 @@ struct request_queue {
	unsigned int		request_fn_active;
	unsigned int		request_fn_active;


	unsigned int		rq_timeout;
	unsigned int		rq_timeout;
	unsigned int		poll_nsec;
	int			poll_nsec;
	struct timer_list	timeout;
	struct timer_list	timeout;
	struct work_struct	timeout_work;
	struct work_struct	timeout_work;
	struct list_head	timeout_list;
	struct list_head	timeout_list;