
Commit 1b8f2ffc authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:
 "A collection of fixes that should go into this series. This contains:

   - A set of NVMe fixes, pulled from Christoph. This includes a set of
     fixes for the fiber channel bits from James Smart, rdma queue depth
     fix from Marta, controller removal fixes from Ming, and some more
     APST quirk updates from Andy.

   - A blk-mq debugfs fix from Bart, fixing a problem with the
     untangling of the sysfs and debugfs blk-mq bits that was added in
     this series.

   - Error code fix in add_partition() from Dan.

   - A small series of fixes for the new blk-throttle code from Shaohua"

* 'for-linus' of git://git.kernel.dk/linux-block: (21 commits)
  blk-mq: Only register debugfs attributes for blk-mq queues
  nvme: Quirk APST on Intel 600P/P3100 devices
  nvme: only setup block integrity if supported by the driver
  nvme: replace is_flags field in nvme_ctrl_ops with a flags field
  nvme-pci: consistencly use ctrl->device for logging
  partitions/msdos: FreeBSD UFS2 file systems are not recognized
  block: fix an error code in add_partition()
  blk-throttle: force user to configure all settings for io.low
  blk-throttle: respect 0 bps/iops settings for io.low
  blk-throttle: output some debug info in trace
  blk-throttle: add hierarchy support for latency target and idle time
  nvme_fc: remove extra controller reference taken on reconnect
  nvme_fc: correct nvme status set on abort
  nvme_fc: set logging level on resets/deletes
  nvme_fc: revise comment on teardown
  nvme_fc: Support ctrl_loss_tmo
  nvme_fc: get rid of local reconnect_delay
  blk-mq: remove blk_mq_abort_requeue_list()
  nvme: avoid to use blk_mq_abort_requeue_list()
  nvme: use blk_mq_start_hw_queues() in nvme_kill_queues()
  ...
parents 6ce47829 8aa63829
block/blk-mq.c +0 −19
@@ -628,25 +628,6 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q,
 }
 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
 
-void blk_mq_abort_requeue_list(struct request_queue *q)
-{
-	unsigned long flags;
-	LIST_HEAD(rq_list);
-
-	spin_lock_irqsave(&q->requeue_lock, flags);
-	list_splice_init(&q->requeue_list, &rq_list);
-	spin_unlock_irqrestore(&q->requeue_lock, flags);
-
-	while (!list_empty(&rq_list)) {
-		struct request *rq;
-
-		rq = list_first_entry(&rq_list, struct request, queuelist);
-		list_del_init(&rq->queuelist);
-		blk_mq_end_request(rq, -EIO);
-	}
-}
-EXPORT_SYMBOL(blk_mq_abort_requeue_list);
-
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 {
 	if (tag < tags->nr_tags) {
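
The helper removed above is a textbook splice-under-lock drain: hold the lock only long enough to move the entire pending list onto a private list head, then walk the private copy with the lock dropped. A minimal userspace sketch of the same pattern (plain C with pthreads; the request struct and the printf standing in for blk_mq_end_request() are illustrative, not kernel API):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct request {
	int id;
	struct request *next;
};

static pthread_mutex_t requeue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct request *requeue_list;	/* guarded by requeue_lock */

static void requeue(int id)
{
	struct request *rq = malloc(sizeof(*rq));

	rq->id = id;
	pthread_mutex_lock(&requeue_lock);
	rq->next = requeue_list;		/* push under the lock */
	requeue_list = rq;
	pthread_mutex_unlock(&requeue_lock);
}

/* Same shape as the removed blk_mq_abort_requeue_list(): splice the whole
 * list out under the lock, then end each entry with the lock dropped. */
static void abort_requeue_list(void)
{
	struct request *list;

	pthread_mutex_lock(&requeue_lock);
	list = requeue_list;			/* take everything... */
	requeue_list = NULL;			/* ...leave an empty list behind */
	pthread_mutex_unlock(&requeue_lock);

	while (list) {				/* no lock held here */
		struct request *rq = list;

		list = rq->next;
		printf("ending request %d with -EIO\n", rq->id);
		free(rq);
	}
}

int main(void)
{
	requeue(1);
	requeue(2);
	abort_requeue_list();
	return 0;
}

Splicing keeps the critical section O(1) and lets the per-request completion work run unlocked, which is what the kernel version did with q->requeue_lock; judging from the commit titles above, the helper is removed because its only user (nvme) now restarts the hardware queues and lets pending requests fail naturally instead of aborting them.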
block/blk-sysfs.c +3 −3
@@ -887,10 +887,10 @@ int blk_register_queue(struct gendisk *disk)
 		goto unlock;
 	}
 
-	if (q->mq_ops)
+	if (q->mq_ops) {
 		__blk_mq_register_dev(dev, q);
-
-	blk_mq_debugfs_register(q);
+		blk_mq_debugfs_register(q);
+	}
 
 	kobject_uevent(&q->kobj, KOBJ_ADD);
 
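
Judging from the hunk, blk_mq_debugfs_register() used to run for every queue passing through blk_register_queue(), legacy ones included, even though the attributes it registers are only meaningful for blk-mq queues; the fix folds the call into the existing q->mq_ops branch. A toy illustration of that hazard class (illustrative types, not the kernel's):

#include <stdio.h>

struct mq_state {
	int nr_hw_queues;
};

struct queue {
	const char *name;
	struct mq_state *mq;	/* NULL for a legacy (non-mq) queue */
};

static void mq_register_dev(struct queue *q)
{
	printf("%s: mq sysfs registered\n", q->name);
}

static void mq_debugfs_register(struct queue *q)
{
	/* touches mq-only state: wrong for a legacy queue */
	printf("%s: debugfs for %d hw queues\n", q->name, q->mq->nr_hw_queues);
}

static void register_queue(struct queue *q)
{
	if (q->mq) {			/* the fix: both calls behind one check */
		mq_register_dev(q);
		mq_debugfs_register(q);
	}
	printf("%s: uevent ADD\n", q->name);
}

int main(void)
{
	struct mq_state st = { .nr_hw_queues = 4 };
	struct queue mq_q = { "mq-queue", &st };
	struct queue legacy = { "legacy-queue", NULL };

	register_queue(&mq_q);
	register_queue(&legacy);	/* debugfs step correctly skipped */
	return 0;
}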
block/blk-throttle.c +109 −63
@@ -22,11 +22,11 @@ static int throtl_quantum = 32;
 #define DFL_THROTL_SLICE_HD (HZ / 10)
 #define DFL_THROTL_SLICE_SSD (HZ / 50)
 #define MAX_THROTL_SLICE (HZ)
-#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */
-#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */
 #define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
-/* default latency target is 0, eg, guarantee IO latency by default */
-#define DFL_LATENCY_TARGET (0)
+#define MIN_THROTL_BPS (320 * 1024)
+#define MIN_THROTL_IOPS (10)
+#define DFL_LATENCY_TARGET (-1L)
+#define DFL_IDLE_THRESHOLD (0)
 
 #define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)
 
@@ -157,6 +157,7 @@ struct throtl_grp {
 	unsigned long last_check_time;
 
 	unsigned long latency_target; /* us */
+	unsigned long latency_target_conf; /* us */
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	unsigned long slice_end[2];
@@ -165,6 +166,7 @@
 	unsigned long checked_last_finish_time; /* ns / 1024 */
 	unsigned long avg_idletime; /* ns / 1024 */
 	unsigned long idletime_threshold; /* us */
+	unsigned long idletime_threshold_conf; /* us */
 
 	unsigned int bio_cnt; /* total bios */
 	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
@@ -201,8 +203,6 @@ struct throtl_data
 	unsigned int limit_index;
 	bool limit_valid[LIMIT_CNT];
 
-	unsigned long dft_idletime_threshold; /* us */
-
 	unsigned long low_upgrade_time;
 	unsigned long low_downgrade_time;
 
@@ -294,8 +294,14 @@ static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
 
 	td = tg->td;
 	ret = tg->bps[rw][td->limit_index];
-	if (ret == 0 && td->limit_index == LIMIT_LOW)
-		return tg->bps[rw][LIMIT_MAX];
+	if (ret == 0 && td->limit_index == LIMIT_LOW) {
+		/* intermediate node or iops isn't 0 */
+		if (!list_empty(&blkg->blkcg->css.children) ||
+		    tg->iops[rw][td->limit_index])
+			return U64_MAX;
+		else
+			return MIN_THROTL_BPS;
+	}
 
 	if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
 	    tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
@@ -315,10 +321,17 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
 
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
 		return UINT_MAX;
 
 	td = tg->td;
 	ret = tg->iops[rw][td->limit_index];
-	if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
-		return tg->iops[rw][LIMIT_MAX];
+	if (ret == 0 && tg->td->limit_index == LIMIT_LOW) {
+		/* intermediate node or bps isn't 0 */
+		if (!list_empty(&blkg->blkcg->css.children) ||
+		    tg->bps[rw][td->limit_index])
+			return UINT_MAX;
+		else
+			return MIN_THROTL_IOPS;
+	}
 
 	if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
 	    tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
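
These two hunks share one rationale: under io.low a value of 0 used to silently fall back to the max limit; now it means "no limit in this dimension" (U64_MAX/UINT_MAX) when the node is intermediate or the other dimension carries a limit, and a small floor (MIN_THROTL_BPS/MIN_THROTL_IOPS) otherwise, so an unconfigured group cannot be throttled to a standstill. A standalone sketch of the bps-side decision (plain C; the struct is a pared-down stand-in for throtl_grp):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MIN_THROTL_BPS (320 * 1024)

/* pared-down stand-in for throtl_grp */
struct group {
	uint64_t low_bps;	/* 0 = not configured */
	unsigned int low_iops;	/* 0 = not configured */
	bool has_children;	/* "intermediate node" in the hunk's comment */
};

/* Mirrors the new LIMIT_LOW branch of tg_bps_limit(). */
static uint64_t low_bps_limit(const struct group *g)
{
	if (g->low_bps)
		return g->low_bps;
	/* bps unset: unlimited if iops carries the limit or the node is
	 * intermediate; otherwise a floor so the group still makes progress */
	if (g->has_children || g->low_iops)
		return UINT64_MAX;
	return MIN_THROTL_BPS;
}

int main(void)
{
	struct group iops_only = { .low_bps = 0, .low_iops = 100 };
	struct group unconfigured = { 0 };

	printf("iops-only:    %llu\n",
	       (unsigned long long)low_bps_limit(&iops_only));	/* unlimited */
	printf("unconfigured: %llu\n",
	       (unsigned long long)low_bps_limit(&unconfigured));	/* 327680 */
	return 0;
}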
@@ -482,6 +495,9 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 	/* LIMIT_LOW will have default value 0 */
 
 	tg->latency_target = DFL_LATENCY_TARGET;
+	tg->latency_target_conf = DFL_LATENCY_TARGET;
+	tg->idletime_threshold = DFL_IDLE_THRESHOLD;
+	tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
 
 	return &tg->pd;
 }
@@ -510,8 +526,6 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
 	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
 		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
 	tg->td = td;
-
-	tg->idletime_threshold = td->dft_idletime_threshold;
 }
 
 /*
@@ -1349,7 +1363,7 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v)
 	return 0;
 }
 
-static void tg_conf_updated(struct throtl_grp *tg)
+static void tg_conf_updated(struct throtl_grp *tg, bool global)
 {
 	struct throtl_service_queue *sq = &tg->service_queue;
 	struct cgroup_subsys_state *pos_css;
@@ -1367,8 +1381,26 @@
 	 * restrictions in the whole hierarchy and allows them to bypass
 	 * blk-throttle.
 	 */
-	blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg))
-		tg_update_has_rules(blkg_to_tg(blkg));
+	blkg_for_each_descendant_pre(blkg, pos_css,
+			global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) {
+		struct throtl_grp *this_tg = blkg_to_tg(blkg);
+		struct throtl_grp *parent_tg;
+
+		tg_update_has_rules(this_tg);
+		/* ignore root/second level */
+		if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent ||
+		    !blkg->parent->parent)
+			continue;
+		parent_tg = blkg_to_tg(blkg->parent);
+		/*
+		 * make sure all children has lower idle time threshold and
+		 * higher latency target
+		 */
+		this_tg->idletime_threshold = min(this_tg->idletime_threshold,
+				parent_tg->idletime_threshold);
+		this_tg->latency_target = max(this_tg->latency_target,
+				parent_tg->latency_target);
+	}
 
 	/*
 	 * We're already holding queue_lock and know @tg is valid.  Let's
@@ -1413,7 +1445,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
 	else
 		*(unsigned int *)((void *)tg + of_cft(of)->private) = v;
 
-	tg_conf_updated(tg);
+	tg_conf_updated(tg, false);
 	ret = 0;
 out_finish:
 	blkg_conf_finish(&ctx);
@@ -1497,34 +1529,34 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
 	    tg->iops_conf[READ][off] == iops_dft &&
 	    tg->iops_conf[WRITE][off] == iops_dft &&
 	    (off != LIMIT_LOW ||
-	     (tg->idletime_threshold == tg->td->dft_idletime_threshold &&
-	      tg->latency_target == DFL_LATENCY_TARGET)))
+	     (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD &&
+	      tg->latency_target_conf == DFL_LATENCY_TARGET)))
 		return 0;
 
-	if (tg->bps_conf[READ][off] != bps_dft)
+	if (tg->bps_conf[READ][off] != U64_MAX)
 		snprintf(bufs[0], sizeof(bufs[0]), "%llu",
 			tg->bps_conf[READ][off]);
-	if (tg->bps_conf[WRITE][off] != bps_dft)
+	if (tg->bps_conf[WRITE][off] != U64_MAX)
 		snprintf(bufs[1], sizeof(bufs[1]), "%llu",
 			tg->bps_conf[WRITE][off]);
-	if (tg->iops_conf[READ][off] != iops_dft)
+	if (tg->iops_conf[READ][off] != UINT_MAX)
 		snprintf(bufs[2], sizeof(bufs[2]), "%u",
 			tg->iops_conf[READ][off]);
-	if (tg->iops_conf[WRITE][off] != iops_dft)
+	if (tg->iops_conf[WRITE][off] != UINT_MAX)
 		snprintf(bufs[3], sizeof(bufs[3]), "%u",
 			tg->iops_conf[WRITE][off]);
 	if (off == LIMIT_LOW) {
-		if (tg->idletime_threshold == ULONG_MAX)
+		if (tg->idletime_threshold_conf == ULONG_MAX)
 			strcpy(idle_time, " idle=max");
 		else
 			snprintf(idle_time, sizeof(idle_time), " idle=%lu",
-				tg->idletime_threshold);
+				tg->idletime_threshold_conf);
 
-		if (tg->latency_target == ULONG_MAX)
+		if (tg->latency_target_conf == ULONG_MAX)
 			strcpy(latency_time, " latency=max");
 		else
 			snprintf(latency_time, sizeof(latency_time),
-				" latency=%lu", tg->latency_target);
+				" latency=%lu", tg->latency_target_conf);
 	}
 
 	seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
@@ -1563,8 +1595,8 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 	v[2] = tg->iops_conf[READ][index];
 	v[3] = tg->iops_conf[WRITE][index];
 
-	idle_time = tg->idletime_threshold;
-	latency_time = tg->latency_target;
+	idle_time = tg->idletime_threshold_conf;
+	latency_time = tg->latency_target_conf;
 	while (true) {
 		char tok[27];	/* wiops=18446744073709551616 */
 		char *p;
@@ -1623,17 +1655,33 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
 		tg->iops_conf[READ][LIMIT_MAX]);
 	tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
 		tg->iops_conf[WRITE][LIMIT_MAX]);
+	tg->idletime_threshold_conf = idle_time;
+	tg->latency_target_conf = latency_time;
 
-	if (index == LIMIT_LOW) {
-		blk_throtl_update_limit_valid(tg->td);
-		if (tg->td->limit_valid[LIMIT_LOW])
+	/* force user to configure all settings for low limit  */
+	if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||
+	      tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) ||
+	    tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD ||
+	    tg->latency_target_conf == DFL_LATENCY_TARGET) {
+		tg->bps[READ][LIMIT_LOW] = 0;
+		tg->bps[WRITE][LIMIT_LOW] = 0;
+		tg->iops[READ][LIMIT_LOW] = 0;
+		tg->iops[WRITE][LIMIT_LOW] = 0;
+		tg->idletime_threshold = DFL_IDLE_THRESHOLD;
+		tg->latency_target = DFL_LATENCY_TARGET;
+	} else if (index == LIMIT_LOW) {
+		tg->idletime_threshold = tg->idletime_threshold_conf;
+		tg->latency_target = tg->latency_target_conf;
+	}
+
+	blk_throtl_update_limit_valid(tg->td);
+	if (tg->td->limit_valid[LIMIT_LOW]) {
+		if (index == LIMIT_LOW)
 			tg->td->limit_index = LIMIT_LOW;
-		tg->idletime_threshold = (idle_time == ULONG_MAX) ?
-			ULONG_MAX : idle_time;
-		tg->latency_target = (latency_time == ULONG_MAX) ?
-			ULONG_MAX : latency_time;
-	}
-	tg_conf_updated(tg);
+	} else
+		tg->td->limit_index = LIMIT_MAX;
+	tg_conf_updated(tg, index == LIMIT_LOW &&
+		tg->td->limit_valid[LIMIT_LOW]);
 	ret = 0;
 out_finish:
 	blkg_conf_finish(&ctx);
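
The tg_set_limit() hunk above enforces an all-or-nothing rule for io.low: the low limit only stays in effect when at least one of rbps/wbps/riops/wiops is non-zero and both idle and latency have been configured; any incomplete tuple is wiped back to defaults so a half-configured cgroup cannot stall the upgrade/downgrade machinery. A compact sketch of that validation (plain C with a simplified config struct; DFL_* values taken from the first hunk):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DFL_IDLE_THRESHOLD (0)
#define DFL_LATENCY_TARGET (-1L)

struct low_conf {
	uint64_t rbps, wbps;
	unsigned int riops, wiops;
	unsigned long idle;	/* us */
	long latency;		/* us */
};

/* Mirrors the check in tg_set_limit(): either the whole io.low tuple is
 * usable, or it is wiped back to defaults. Returns true if the low limit
 * stays valid. */
static bool apply_low_conf(struct low_conf *c)
{
	bool any_limit = c->rbps || c->wbps || c->riops || c->wiops;

	if (!any_limit || c->idle == DFL_IDLE_THRESHOLD ||
	    c->latency == DFL_LATENCY_TARGET) {
		c->rbps = c->wbps = 0;
		c->riops = c->wiops = 0;
		c->idle = DFL_IDLE_THRESHOLD;
		c->latency = DFL_LATENCY_TARGET;
		return false;
	}
	return true;
}

int main(void)
{
	/* bps set but no latency target: the whole low limit is dropped */
	struct low_conf partial = { .rbps = 10 << 20, .idle = 1000,
				    .latency = DFL_LATENCY_TARGET };
	/* complete tuple: kept */
	struct low_conf full = { .rbps = 10 << 20, .idle = 1000, .latency = 100 };

	printf("partial valid: %d\n", apply_low_conf(&partial)); /* 0 */
	printf("full valid:    %d\n", apply_low_conf(&full));    /* 1 */
	return 0;
}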
@@ -1722,17 +1770,25 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
 	/*
 	 * cgroup is idle if:
 	 * - single idle is too long, longer than a fixed value (in case user
-	 *   configure a too big threshold) or 4 times of slice
+	 *   configure a too big threshold) or 4 times of idletime threshold
 	 * - average think time is more than threshold
 	 * - IO latency is largely below threshold
 	 */
-	unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);
+	unsigned long time;
+	bool ret;
 
-	time = min_t(unsigned long, MAX_IDLE_TIME, time);
-	return (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
+	time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
+	ret = tg->latency_target == DFL_LATENCY_TARGET ||
+	      tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
+	      (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
 	      tg->avg_idletime > tg->idletime_threshold ||
 	      (tg->latency_target && tg->bio_cnt &&
 		tg->bad_bio_cnt * 5 < tg->bio_cnt);
+	throtl_log(&tg->service_queue,
+		"avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d",
+		tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt,
+		tg->bio_cnt, ret, tg->td->scale);
+	return ret;
 }
 
 static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
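
After the rewrite, throtl_tg_is_idle() treats a cgroup as idle when the latency target or idle threshold is simply unconfigured, when its last I/O completed more than min(5s, 4x idle threshold) ago, when its average think time exceeds the threshold, or when fewer than 20% of its bios miss the latency target (bad_bio_cnt * 5 < bio_cnt). A worked stand-alone version with made-up numbers (plain C; microsecond timestamps replace ktime_get_ns() >> 10):

#include <stdbool.h>
#include <stdio.h>

#define MAX_IDLE_TIME (5L * 1000 * 1000)	/* 5 s, in us */
#define DFL_LATENCY_TARGET (-1L)
#define DFL_IDLE_THRESHOLD (0UL)

struct tg {
	long latency_target;			/* us, -1 = unconfigured */
	unsigned long idletime_threshold;	/* us, 0 = unconfigured */
	unsigned long now, last_finish_time;	/* us, stand-in for ktime */
	unsigned long avg_idletime;		/* us */
	unsigned int bio_cnt, bad_bio_cnt;
};

/* Mirrors the new throtl_tg_is_idle() predicate. */
static bool tg_is_idle(const struct tg *tg)
{
	unsigned long time = 4 * tg->idletime_threshold;

	if (time > MAX_IDLE_TIME)
		time = MAX_IDLE_TIME;
	return tg->latency_target == DFL_LATENCY_TARGET ||
	       tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
	       tg->now - tg->last_finish_time > time ||
	       tg->avg_idletime > tg->idletime_threshold ||
	       (tg->latency_target > 0 && tg->bio_cnt &&
		tg->bad_bio_cnt * 5 < tg->bio_cnt);
}

int main(void)
{
	/* threshold 1000us, last I/O 10ms ago -> idle (10ms > 4ms window) */
	struct tg g = { .latency_target = 100, .idletime_threshold = 1000,
			.now = 20000, .last_finish_time = 10000,
			.avg_idletime = 200, .bio_cnt = 100, .bad_bio_cnt = 50 };

	printf("is_idle=%d\n", tg_is_idle(&g)); /* 1: single idle too long */
	return 0;
}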
@@ -1828,6 +1884,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
 	struct cgroup_subsys_state *pos_css;
 	struct blkcg_gq *blkg;
 
+	throtl_log(&td->service_queue, "upgrade to max");
 	td->limit_index = LIMIT_MAX;
 	td->low_upgrade_time = jiffies;
 	td->scale = 0;
@@ -1850,6 +1907,7 @@ static void throtl_downgrade_state(struct throtl_data *td, int new)
 {
 	td->scale /= 2;
 
+	throtl_log(&td->service_queue, "downgrade, scale %d", td->scale);
 	if (td->scale) {
 		td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
 		return;
@@ -2023,6 +2081,11 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
 		td->avg_buckets[i].valid = true;
 		last_latency = td->avg_buckets[i].latency;
 	}
+
+	for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
+		throtl_log(&td->service_queue,
+			"Latency bucket %d: latency=%ld, valid=%d", i,
+			td->avg_buckets[i].latency, td->avg_buckets[i].valid);
 }
 #else
 static inline void throtl_update_latency_buckets(struct throtl_data *td)
@@ -2354,19 +2417,14 @@ void blk_throtl_exit(struct request_queue *q)
 void blk_throtl_register_queue(struct request_queue *q)
 {
 	struct throtl_data *td;
-	struct cgroup_subsys_state *pos_css;
-	struct blkcg_gq *blkg;
 
 	td = q->td;
 	BUG_ON(!td);
 
-	if (blk_queue_nonrot(q)) {
+	if (blk_queue_nonrot(q))
 		td->throtl_slice = DFL_THROTL_SLICE_SSD;
-		td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD;
-	} else {
+	else
 		td->throtl_slice = DFL_THROTL_SLICE_HD;
-		td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD;
-	}
 #ifndef CONFIG_BLK_DEV_THROTTLING_LOW
 	/* if no low limit, use previous default */
 	td->throtl_slice = DFL_THROTL_SLICE_HD;
@@ -2375,18 +2433,6 @@ void blk_throtl_register_queue(struct request_queue *q)
 	td->track_bio_latency = !q->mq_ops && !q->request_fn;
 	if (!td->track_bio_latency)
 		blk_stat_enable_accounting(q);
-
-	/*
-	 * some tg are created before queue is fully initialized, eg, nonrot
-	 * isn't initialized yet
-	 */
-	rcu_read_lock();
-	blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
-		struct throtl_grp *tg = blkg_to_tg(blkg);
-
-		tg->idletime_threshold = td->dft_idletime_threshold;
-	}
-	rcu_read_unlock();
 }
 
 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
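
One more pattern from the blk-throttle diff worth isolating is the hierarchy walk in tg_conf_updated(): descendants are visited in pre-order, so each child can be clamped against its already-final parent, with idle thresholds only allowed to shrink and latency targets only allowed to grow down the tree. A toy stand-in (plain C, single parent pointer instead of the blkcg tree):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* toy stand-in for a blkcg tree node */
struct node {
	const char *name;
	struct node *parent;
	unsigned long idletime_threshold;	/* us */
	unsigned long latency_target;		/* us */
};

/* Mirrors the clamping in tg_conf_updated(): a child may not have a
 * higher idle threshold, or a lower latency target, than its parent. */
static void clamp_to_parent(struct node *n)
{
	if (!n->parent)
		return;
	n->idletime_threshold = MIN(n->idletime_threshold,
				    n->parent->idletime_threshold);
	n->latency_target = MAX(n->latency_target,
				n->parent->latency_target);
}

int main(void)
{
	struct node root  = { "root",  NULL,  1000, 100 };
	struct node child = { "child", &root, 5000, 10 }; /* looser than parent */

	clamp_to_parent(&child);	/* pre-order: root is already final */
	printf("%s: idle=%lu latency=%lu\n", child.name,
	       child.idletime_threshold, child.latency_target);
	/* child: idle=1000 latency=100 -- clamped to the parent's values */
	return 0;
}

Pre-order matters here: because a parent is always visited before its children, one pass over the tree is enough to propagate the constraint all the way down.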
block/partition-generic.c +3 −1
@@ -320,8 +320,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 
 	if (info) {
 		struct partition_meta_info *pinfo = alloc_part_info(disk);
-		if (!pinfo)
+		if (!pinfo) {
+			err = -ENOMEM;
 			goto out_free_stats;
+		}
 		memcpy(pinfo, info, sizeof(*info));
 		p->info = pinfo;
 	}
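
Dan Carpenter's fix above is the classic goto-unwind pitfall: the failure branch jumped to the cleanup label without setting err, so add_partition() evidently returned ERR_PTR(0), i.e. NULL, instead of ERR_PTR(-ENOMEM). A userspace sketch of the idiom (names and the staged unwind labels are illustrative):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct part {
	char *stats;
	char *info;
};

/* Sketch of the goto-unwind idiom: every failure branch must set err
 * *and* jump. Forgetting the assignment (the bug fixed above) leaves
 * err at 0, which the caller reads as success. */
static struct part *make_part(const char *info, int *errp)
{
	int err = 0;
	struct part *p = calloc(1, sizeof(*p));

	if (!p) {
		err = -ENOMEM;
		goto out;
	}
	p->stats = malloc(64);
	if (!p->stats) {
		err = -ENOMEM;
		goto out_free;
	}
	if (info) {
		p->info = strdup(info);
		if (!p->info) {
			err = -ENOMEM;	/* the line the fix adds */
			goto out_free_stats;
		}
	}
	*errp = 0;
	return p;

out_free_stats:
	free(p->stats);
out_free:
	free(p);
out:
	*errp = err;
	return NULL;
}

int main(void)
{
	int err;
	struct part *p = make_part("boot", &err);

	if (!p) {
		fprintf(stderr, "make_part failed: %d\n", err);
		return 1;
	}
	puts("partition created");
	free(p->info);
	free(p->stats);
	free(p);
	return 0;
}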
block/partitions/msdos.c +2 −0
@@ -300,6 +300,8 @@ static void parse_bsd(struct parsed_partitions *state,
 			continue;
 		bsd_start = le32_to_cpu(p->p_offset);
 		bsd_size = le32_to_cpu(p->p_size);
+		if (memcmp(flavour, "bsd\0", 4) == 0)
+			bsd_start += offset;
 		if (offset == bsd_start && size == bsd_size)
 			/* full parent partition, we have it already */
 			continue;
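
The two added lines read as follows: for a FreeBSD ("bsd"-flavour) disklabel the entry offsets are treated as relative to the containing MBR slice (which is what the fix implies about the on-disk format), so bsd_start is rebased to absolute sectors before the "full parent partition" comparison; the other flavours keep their offsets as-is. A self-contained sketch of that comparison, with made-up sector numbers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct bsd_part {
	uint32_t p_offset;
	uint32_t p_size;
};

/* Mirrors the adjusted check in parse_bsd(): offset/size describe the
 * parent slice in absolute sectors; a "bsd" label entry is rebased so
 * the comparison is like with like. */
static bool is_full_parent(const char *flavour, uint32_t offset, uint32_t size,
			   struct bsd_part p)
{
	uint32_t bsd_start = p.p_offset;
	uint32_t bsd_size = p.p_size;

	if (memcmp(flavour, "bsd\0", 4) == 0)
		bsd_start += offset;	/* rebase slice-relative offset */
	return offset == bsd_start && size == bsd_size;
}

int main(void)
{
	/* parent slice at absolute sector 2048, 1000000 sectors long */
	uint32_t offset = 2048, size = 1000000;
	/* the disklabel's "whole slice" entry, slice-relative */
	struct bsd_part c_part = { .p_offset = 0, .p_size = 1000000 };

	printf("full parent (bsd):     %d\n",
	       is_full_parent("bsd\0", offset, size, c_part));   /* 1: skipped */
	printf("full parent (openbsd): %d\n",
	       is_full_parent("openbsd", offset, size, c_part)); /* 0 */
	return 0;
}

Without the rebase, every slice-relative FreeBSD entry would sit "before" the parent slice in absolute terms and fail the containment checks, which matches the symptom in the commit title: FreeBSD UFS2 file systems were not recognized.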