
Commit f87b0f0d authored by Jens Axboe

Merge branch 'nvme-4.19' of git://git.infradead.org/nvme into for-4.19/block2

Pull NVMe changes from Christoph:

"This contains the support for TP4004, Asymmetric Namespace Access,
 which makes NVMe multipathing usable in practice."

* 'nvme-4.19' of git://git.infradead.org/nvme:
  nvmet: use Retain Async Event bit to clear AEN
  nvmet: support configuring ANA groups
  nvmet: add minimal ANA support
  nvmet: track and limit the number of namespaces per subsystem
  nvmet: keep a port pointer in nvmet_ctrl
  nvme: add ANA support
  nvme: remove nvme_req_needs_failover
  nvme: simplify the API for getting log pages
  nvme.h: add ANA definitions
  nvme.h: add support for the log specific field

Signed-off-by: Jens Axboe <axboe@kernel.dk>
parents 05b9ba4b b369b30c
drivers/nvme/host/core.c  +43 −26
@@ -252,7 +252,8 @@ void nvme_complete_rq(struct request *req)
	trace_nvme_complete_rq(req);

	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
-		if (nvme_req_needs_failover(req, status)) {
+		if ((req->cmd_flags & REQ_NVME_MPATH) &&
+		    blk_path_error(status)) {
			nvme_failover_req(req);
			return;
		}
@@ -1067,7 +1068,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
EXPORT_SYMBOL_GPL(nvme_set_queue_count);

#define NVME_AEN_SUPPORTED \
-	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT)
+	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE)

static void nvme_enable_aen(struct nvme_ctrl *ctrl)
{
@@ -2281,21 +2282,16 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
	return ret;
}

-int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
-		     u8 log_page, void *log,
-		     size_t size, u64 offset)
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
+		void *log, size_t size, u64 offset)
{
	struct nvme_command c = { };
	unsigned long dwlen = size / 4 - 1;

	c.get_log_page.opcode = nvme_admin_get_log_page;

-	if (ns)
-		c.get_log_page.nsid = cpu_to_le32(ns->head->ns_id);
-	else
-		c.get_log_page.nsid = cpu_to_le32(NVME_NSID_ALL);
-
+	c.get_log_page.nsid = cpu_to_le32(nsid);
	c.get_log_page.lid = log_page;
+	c.get_log_page.lsp = lsp;
	c.get_log_page.numdl = cpu_to_le16(dwlen & ((1 << 16) - 1));
	c.get_log_page.numdu = cpu_to_le16(dwlen >> 16);
	c.get_log_page.lpol = cpu_to_le32(lower_32_bits(offset));
@@ -2304,12 +2300,6 @@ int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
	return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size);
}

-static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log,
-			size_t size)
-{
-	return nvme_get_log_ext(ctrl, NULL, log_page, log, size, 0);
-}
-
static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
{
	int ret;
@@ -2320,8 +2310,8 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl)
	if (!ctrl->effects)
		return 0;

-	ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects,
-					sizeof(*ctrl->effects));
+	ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0,
+			ctrl->effects, sizeof(*ctrl->effects), 0);
	if (ret) {
		kfree(ctrl->effects);
		ctrl->effects = NULL;
@@ -2412,6 +2402,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
	nvme_set_queue_limits(ctrl, ctrl->admin_q);
	ctrl->sgls = le32_to_cpu(id->sgls);
	ctrl->kas = le16_to_cpu(id->kas);
	ctrl->max_namespaces = le32_to_cpu(id->mnan);

	if (id->rtd3e) {
		/* us -> s */
@@ -2471,8 +2462,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
		ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
	}

	ret = nvme_mpath_init(ctrl, id);
	kfree(id);

	if (ret < 0)
		return ret;

	if (ctrl->apst_enabled && !prev_apst_enabled)
		dev_pm_qos_expose_latency_tolerance(ctrl->device);
	else if (!ctrl->apst_enabled && prev_apst_enabled)
@@ -2691,6 +2686,10 @@ static struct attribute *nvme_ns_id_attrs[] = {
	&dev_attr_nguid.attr,
	&dev_attr_eui.attr,
	&dev_attr_nsid.attr,
#ifdef CONFIG_NVME_MULTIPATH
	&dev_attr_ana_grpid.attr,
	&dev_attr_ana_state.attr,
#endif
	NULL,
};

@@ -2713,6 +2712,14 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
		if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
			return 0;
	}
#ifdef CONFIG_NVME_MULTIPATH
	if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
		if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */
			return 0;
		if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
			return 0;
	}
#endif
	return a->mode;
}

@@ -3086,8 +3093,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)

	nvme_get_ctrl(ctrl);

-	kfree(id);
-
	device_add_disk(ctrl->device, ns->disk);
	if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
					&nvme_ns_id_attr_group))
@@ -3097,8 +3102,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
		pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
			ns->disk->disk_name);

-	nvme_mpath_add_disk(ns->head);
+	nvme_mpath_add_disk(ns, id);
	nvme_fault_inject_init(ns);
+	kfree(id);

	return;
 out_unlink_ns:
	mutex_lock(&ctrl->subsys->lock);
@@ -3240,7 +3247,8 @@ static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl)
	 * raced with us in reading the log page, which could cause us to miss
	 * updates.
	 */
-	error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size);
+	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CHANGED_NS, 0, log,
+			log_size, 0);
	if (error)
		dev_warn(ctrl->device,
			"reading changed ns log failed: %d\n", error);
@@ -3357,9 +3365,9 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
	if (!log)
		return;

-	if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log)))
-		dev_warn(ctrl->device,
-				"Get FW SLOT INFO log error\n");
+	if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
+			sizeof(*log), 0))
+		dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
	kfree(log);
}

@@ -3405,6 +3413,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
	case NVME_AER_NOTICE_FW_ACT_STARTING:
		queue_work(nvme_wq, &ctrl->fw_act_work);
		break;
#ifdef CONFIG_NVME_MULTIPATH
	case NVME_AER_NOTICE_ANA:
		if (!ctrl->ana_log_buf)
			break;
		queue_work(nvme_wq, &ctrl->ana_work);
		break;
#endif
	default:
		dev_warn(ctrl->device, "async event result %08x\n", result);
	}
@@ -3437,6 +3452,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);

void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_mpath_stop(ctrl);
	nvme_stop_keep_alive(ctrl);
	flush_work(&ctrl->async_event_work);
	flush_work(&ctrl->scan_work);
@@ -3474,6 +3490,7 @@ static void nvme_free_ctrl(struct device *dev)

	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
	kfree(ctrl->effects);
	nvme_mpath_uninit(ctrl);

	if (subsys) {
		mutex_lock(&subsys->lock);
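The log page API change above is the plumbing for the rest of the series: nvme_get_log_ext() and the private nvme_get_log() wrapper collapse into one exported helper that takes the namespace ID, the log specific field (LSP) and the offset explicitly at every call site. A minimal before/after sketch of a typical caller (illustrative only, reusing the command effects fetch from this hunk):

	/* 4.18: the wrapper hid nsid, lsp and offset */
	ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS,
			   ctrl->effects, sizeof(*ctrl->effects));

	/* 4.19: one helper, every parameter spelled out */
	ret = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_CMD_EFFECTS, 0,
			   ctrl->effects, sizeof(*ctrl->effects), 0);

The explicit LSP is what lets the ANA code ask for the Return Groups Only (RGO) flavor of the log without yet another helper variant.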
drivers/nvme/host/lightnvm.c  +3 −2
@@ -604,8 +604,9 @@ static int nvme_nvm_get_chk_meta(struct nvm_dev *ndev,
	while (left) {
		len = min_t(unsigned int, left, max_len);

-		ret = nvme_get_log_ext(ctrl, ns, NVME_NVM_LOG_REPORT_CHUNK,
-				dev_meta, len, offset);
+		ret = nvme_get_log(ctrl, ns->head->ns_id,
+				NVME_NVM_LOG_REPORT_CHUNK, 0, dev_meta, len,
+				offset);
		if (ret) {
			dev_err(ctrl->device, "Get REPORT CHUNK log error\n");
			break;
drivers/nvme/host/multipath.c  +327 −22
/*
- * Copyright (c) 2017 Christoph Hellwig.
+ * Copyright (c) 2017-2018 Christoph Hellwig.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
@@ -20,6 +20,11 @@ module_param(multipath, bool, 0444);
MODULE_PARM_DESC(multipath,
	"turn on native support for multiple controllers per subsystem");

inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
{
	return multipath && (ctrl->subsys->cmic & (1 << 3));
}

/*
 * If multipathing is enabled we need to always use the subsystem instance
 * number for numbering our devices to avoid conflicts between subsystems that
@@ -45,6 +50,7 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
void nvme_failover_req(struct request *req)
{
	struct nvme_ns *ns = req->q->queuedata;
	u16 status = nvme_req(req)->status;
	unsigned long flags;

	spin_lock_irqsave(&ns->head->requeue_lock, flags);
@@ -52,15 +58,35 @@ void nvme_failover_req(struct request *req)
	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
	blk_mq_end_request(req, 0);

+	switch (status & 0x7ff) {
+	case NVME_SC_ANA_TRANSITION:
+	case NVME_SC_ANA_INACCESSIBLE:
+	case NVME_SC_ANA_PERSISTENT_LOSS:
+		/*
+		 * If we got back an ANA error we know the controller is alive,
+		 * but not ready to serve this namespace.  The spec suggests
+		 * we should update our general state here, but due to the fact
+		 * that the admin and I/O queues are not serialized that is
+		 * fundamentally racy.  So instead just clear the current path,
+		 * mark the path as pending and kick off a re-read of the ANA
+		 * log page ASAP.
+		 */
+		nvme_mpath_clear_current_path(ns);
+		if (ns->ctrl->ana_log_buf) {
+			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+			queue_work(nvme_wq, &ns->ctrl->ana_work);
+		}
+		break;
+	default:
+		/*
+		 * Reset the controller for any non-ANA error as we don't know
+		 * what caused the error.
+		 */
+		nvme_reset_ctrl(ns->ctrl);
+		break;
+	}
+
	kblockd_schedule_work(&ns->head->requeue_work);
}

-bool nvme_req_needs_failover(struct request *req, blk_status_t error)
-{
-	if (!(req->cmd_flags & REQ_NVME_MPATH))
-		return false;
-	return blk_path_error(error);
-}

void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
@@ -75,25 +101,51 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
	up_read(&ctrl->namespaces_rwsem);
}

static const char *nvme_ana_state_names[] = {
	[0]				= "invalid state",
	[NVME_ANA_OPTIMIZED]		= "optimized",
	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
	[NVME_ANA_CHANGE]		= "change",
};

static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
{
-	struct nvme_ns *ns;
+	struct nvme_ns *ns, *fallback = NULL;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
-		if (ns->ctrl->state == NVME_CTRL_LIVE) {
+		if (ns->ctrl->state != NVME_CTRL_LIVE ||
+		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+			continue;
+		switch (ns->ana_state) {
+		case NVME_ANA_OPTIMIZED:
			rcu_assign_pointer(head->current_path, ns);
			return ns;
+		case NVME_ANA_NONOPTIMIZED:
+			fallback = ns;
+			break;
+		default:
+			break;
+		}
	}

-	return NULL;
+	if (fallback)
+		rcu_assign_pointer(head->current_path, fallback);
+	return fallback;
}

static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
	return ns->ctrl->state == NVME_CTRL_LIVE &&
		ns->ana_state == NVME_ANA_OPTIMIZED;
}

inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
	struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);

-	if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE))
+	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
		ns = __nvme_find_path(head);
	return ns;
}
@@ -142,7 +194,7 @@ static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)

	srcu_idx = srcu_read_lock(&head->srcu);
	ns = srcu_dereference(head->current_path, &head->srcu);
-	if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE))
+	if (likely(ns && nvme_path_is_optimized(ns)))
		found = ns->queue->poll_fn(q, qc);
	srcu_read_unlock(&head->srcu, srcu_idx);
	return found;
@@ -176,6 +228,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
	struct request_queue *q;
	bool vwc = false;

	mutex_init(&head->lock);
	bio_list_init(&head->requeue_list);
	spin_lock_init(&head->requeue_lock);
	INIT_WORK(&head->requeue_work, nvme_requeue_work);
@@ -220,29 +273,232 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
	return -ENOMEM;
}

-void nvme_mpath_add_disk(struct nvme_ns_head *head)
+static void nvme_mpath_set_live(struct nvme_ns *ns)
{
	struct nvme_ns_head *head = ns->head;

	lockdep_assert_held(&ns->head->lock);

	if (!head->disk)
		return;

-	mutex_lock(&head->subsys->lock);
	if (!(head->disk->flags & GENHD_FL_UP)) {
		device_add_disk(&head->subsys->dev, head->disk);
		if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
				&nvme_ns_id_attr_group))
			pr_warn("%s: failed to create sysfs group for identification\n",
				head->disk->disk_name);
			dev_warn(&head->subsys->dev,
				 "failed to create id group.\n");
	}

	kblockd_schedule_work(&ns->head->requeue_work);
}

static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
			void *))
{
	void *base = ctrl->ana_log_buf;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
	int error, i;

	lockdep_assert_held(&ctrl->ana_lock);

	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
		struct nvme_ana_group_desc *desc = base + offset;
		u32 nr_nsids = le32_to_cpu(desc->nnsids);
		size_t nsid_buf_size = nr_nsids * sizeof(__le32);

		if (WARN_ON_ONCE(desc->grpid == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state == 0))
			return -EINVAL;
		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
			return -EINVAL;

		offset += sizeof(*desc);
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
			return -EINVAL;

		error = cb(ctrl, desc, data);
		if (error)
			return error;

		offset += nsid_buf_size;
		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
			return -EINVAL;
	}

	return 0;
}

static inline bool nvme_state_is_live(enum nvme_ana_state state)
{
	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
}

static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
		struct nvme_ns *ns)
{
	enum nvme_ana_state old;

	mutex_lock(&ns->head->lock);
	old = ns->ana_state;
	ns->ana_grpid = le32_to_cpu(desc->grpid);
	ns->ana_state = desc->state;
	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);

	if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old))
		nvme_mpath_set_live(ns);
	mutex_unlock(&ns->head->lock);
}

static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
	unsigned *nr_change_groups = data;
	struct nvme_ns *ns;

	dev_info(ctrl->device, "ANA group %d: %s.\n",
			le32_to_cpu(desc->grpid),
			nvme_ana_state_names[desc->state]);

	if (desc->state == NVME_ANA_CHANGE)
		(*nr_change_groups)++;

	if (!nr_nsids)
		return 0;

	down_write(&ctrl->namespaces_rwsem);
	list_for_each_entry(ns, &ctrl->namespaces, list) {
		if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
			continue;
		nvme_update_ns_ana_state(desc, ns);
		if (++n == nr_nsids)
			break;
	}
	up_write(&ctrl->namespaces_rwsem);
	WARN_ON_ONCE(n < nr_nsids);
	return 0;
}

static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
{
	u32 nr_change_groups = 0;
	int error;

	mutex_lock(&ctrl->ana_lock);
	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA,
			groups_only ? NVME_ANA_LOG_RGO : 0,
			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
	if (error) {
		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
		goto out_unlock;
	}

	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
			nvme_update_ana_state);
	if (error)
		goto out_unlock;

	/*
	 * In theory we should have an ANATT timer per group as they might enter
	 * the change state at different times.  But that is a lot of overhead
	 * just to protect against a target that keeps entering new change
	 * states while never finishing previous ones.  But we'll still
	 * eventually time out once all groups are in change state, so this
	 * isn't a big deal.
	 *
	 * We also double the ANATT value to provide some slack for transports
	 * or AEN processing overhead.
	 */
	if (nr_change_groups)
		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
	else
		del_timer_sync(&ctrl->anatt_timer);
out_unlock:
	mutex_unlock(&ctrl->ana_lock);
	return error;
}

static void nvme_ana_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);

	nvme_read_ana_log(ctrl, false);
}

static void nvme_anatt_timeout(struct timer_list *t)
{
	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);

	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
	nvme_reset_ctrl(ctrl);
}

void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
	if (!nvme_ctrl_use_ana(ctrl))
		return;
	del_timer_sync(&ctrl->anatt_timer);
	cancel_work_sync(&ctrl->ana_work);
}

static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
}
DEVICE_ATTR_RO(ana_grpid);

static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
		char *buf)
{
	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);

	return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
}
DEVICE_ATTR_RO(ana_state);

static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
		struct nvme_ana_group_desc *desc, void *data)
{
	struct nvme_ns *ns = data;

	if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
		nvme_update_ns_ana_state(desc, ns);
		return -ENXIO; /* just break out of the loop */
	}

	return 0;
}

void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
{
	if (nvme_ctrl_use_ana(ns->ctrl)) {
		mutex_lock(&ns->ctrl->ana_lock);
		ns->ana_grpid = le32_to_cpu(id->anagrpid);
		nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
		mutex_unlock(&ns->ctrl->ana_lock);
	} else {
		mutex_lock(&ns->head->lock);
		ns->ana_state = NVME_ANA_OPTIMIZED; 
		nvme_mpath_set_live(ns);
		mutex_unlock(&ns->head->lock);
	}
-	mutex_unlock(&head->subsys->lock);
}

void nvme_mpath_remove_disk(struct nvme_ns_head *head)
{
	if (!head->disk)
		return;
	if (head->disk->flags & GENHD_FL_UP) {
		sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
				   &nvme_ns_id_attr_group);
		del_gendisk(head->disk);
	}
	blk_set_queue_dying(head->disk->queue);
	/* make sure all pending bios are cleaned up */
	kblockd_schedule_work(&head->requeue_work);
@@ -250,3 +506,52 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
	blk_cleanup_queue(head->disk->queue);
	put_disk(head->disk);
}

int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
	int error;

	if (!nvme_ctrl_use_ana(ctrl))
		return 0;

	ctrl->anacap = id->anacap;
	ctrl->anatt = id->anatt;
	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);

	mutex_init(&ctrl->ana_lock);
	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
	ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
	if (!(ctrl->anacap & (1 << 6)))
		ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);

	if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
		dev_err(ctrl->device,
			"ANA log page size (%zd) larger than MDTS (%d).\n",
			ctrl->ana_log_size,
			ctrl->max_hw_sectors << SECTOR_SHIFT);
		dev_err(ctrl->device, "disabling ANA support.\n");
		return 0;
	}

	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
	ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
	if (!ctrl->ana_log_buf)
		goto out;

	error = nvme_read_ana_log(ctrl, true);
	if (error)
		goto out_free_ana_log_buf;
	return 0;
out_free_ana_log_buf:
	kfree(ctrl->ana_log_buf);
out:
	return -ENOMEM;
}

void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
	kfree(ctrl->ana_log_buf);
}
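For reference while reading nvme_parse_ana_log() and nvme_mpath_init() above: the on-wire ANA log format comes from the "nvme.h: add ANA definitions" patch in this merge, whose hunk is not shown on this page. A sketch of those definitions as laid out by TP4004 (the authoritative copy lives in include/linux/nvme.h):

	enum nvme_ana_state {
		NVME_ANA_OPTIMIZED		= 0x01,
		NVME_ANA_NONOPTIMIZED		= 0x02,
		NVME_ANA_INACCESSIBLE		= 0x03,
		NVME_ANA_PERSISTENT_LOSS	= 0x04,
		NVME_ANA_CHANGE			= 0x0f,
	};

	struct nvme_ana_rsp_hdr {
		__le64	chgcnt;
		__le16	ngrps;
		__le16	rsvd10[3];
	};

	struct nvme_ana_group_desc {
		__le32	grpid;
		__le32	nnsids;
		__le64	chgcnt;
		__u8	state;
		__u8	rsvd17[15];
		__le32	nsids[];
	};

	/* flag for the log specific field of the ANA log */
	#define NVME_ANA_LOG_RGO	(1 << 0)

With the 16-byte header and 32-byte descriptors, the worst-case buffer computed in nvme_mpath_init() is easy to check: a hypothetical controller reporting NANAGRPID = 32 and MNAN = 1024, with ANACAP bit 6 clear (so the NSID lists are included), needs 16 + 32 * 32 + 1024 * 4 = 5136 bytes.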
drivers/nvme/host/nvme.h  +49 −12
@@ -183,6 +183,7 @@ struct nvme_ctrl {
	u16 oacs;
	u16 nssa;
	u16 nr_streams;
	u32 max_namespaces;
	atomic_t abort_limit;
	u8 vwc;
	u32 vs;
@@ -205,6 +206,19 @@ struct nvme_ctrl {
	struct work_struct fw_act_work;
	unsigned long events;

#ifdef CONFIG_NVME_MULTIPATH
	/* asymmetric namespace access: */
	u8 anacap;
	u8 anatt;
	u32 anagrpmax;
	u32 nanagrpid;
	struct mutex ana_lock;
	struct nvme_ana_rsp_hdr *ana_log_buf;
	size_t ana_log_size;
	struct timer_list anatt_timer;
	struct work_struct ana_work;
#endif

	/* Power saving configuration */
	u64 ps_max_latency_us;
	bool apst_enabled;
@@ -269,6 +283,7 @@ struct nvme_ns_head {
	struct bio_list		requeue_list;
	spinlock_t		requeue_lock;
	struct work_struct	requeue_work;
	struct mutex		lock;
#endif
	struct list_head	list;
	struct srcu_struct      srcu;
@@ -295,6 +310,10 @@ struct nvme_ns {
	struct nvme_ctrl *ctrl;
	struct request_queue *queue;
	struct gendisk *disk;
#ifdef CONFIG_NVME_MULTIPATH
	enum nvme_ana_state ana_state;
	u32 ana_grpid;
#endif
	struct list_head siblings;
	struct nvm_dev *ndev;
	struct kref kref;
@@ -309,6 +328,7 @@ struct nvme_ns {
	unsigned long flags;
#define NVME_NS_REMOVING	0
#define NVME_NS_DEAD     	1
#define NVME_NS_ANA_PENDING	2
	u16 noiob;

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
@@ -436,21 +456,24 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);

-int nvme_get_log_ext(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
-		u8 log_page, void *log, size_t size, u64 offset);
+int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
+		void *log, size_t size, u64 offset);

extern const struct attribute_group nvme_ns_id_attr_group;
extern const struct block_device_operations nvme_ns_head_ops;

#ifdef CONFIG_NVME_MULTIPATH
bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
			struct nvme_ctrl *ctrl, int *flags);
void nvme_failover_req(struct request *req);
-bool nvme_req_needs_failover(struct request *req, blk_status_t error);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head);
-void nvme_mpath_add_disk(struct nvme_ns_head *head);
+void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
void nvme_mpath_remove_disk(struct nvme_ns_head *head);
int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
void nvme_mpath_stop(struct nvme_ctrl *ctrl);

static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
{
@@ -469,7 +492,14 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
		kblockd_schedule_work(&head->requeue_work);
}

extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;

#else
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
{
	return false;
}
/*
 * Without the multipath code enabled, multiple controller per subsystems are
 * visible as devices and thus we cannot use the subsystem instance.
@@ -483,11 +513,6 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
static inline void nvme_failover_req(struct request *req)
{
}
-static inline bool nvme_req_needs_failover(struct request *req,
-					   blk_status_t error)
-{
-	return false;
-}
static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
}
@@ -496,7 +521,8 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
{
	return 0;
}
-static inline void nvme_mpath_add_disk(struct nvme_ns_head *head)
+static inline void nvme_mpath_add_disk(struct nvme_ns *ns,
+		struct nvme_id_ns *id)
{
}
static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
@@ -508,6 +534,17 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
{
}
static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
		struct nvme_id_ctrl *id)
{
	return 0;
}
static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
{
}
static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl)
{
}
#endif /* CONFIG_NVME_MULTIPATH */

#ifdef CONFIG_NVM
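Putting the host-side pieces of this series together, the runtime flow when a group changes state is roughly the following (a sketch of the call chain; all names appear in the hunks above):

	/*
	 * target: a port's ana_state changes; AEN NVME_AER_NOTICE_ANA fires
	 *         (and stays masked until the host reads the ANA log without
	 *         RAE, see nvmet_clear_aen() below)
	 * host:   nvme_handle_aen_notice()
	 *           -> queue_work(nvme_wq, &ctrl->ana_work)
	 *         nvme_ana_work() -> nvme_read_ana_log()
	 *           -> nvme_get_log(..., NVME_LOG_ANA, ...)
	 *           -> nvme_parse_ana_log() -> nvme_update_ns_ana_state()
	 *                -> nvme_mpath_set_live() + requeue kick
	 *         __nvme_find_path() then prefers OPTIMIZED paths, falls
	 *         back to NONOPTIMIZED ones, and skips ANA_PENDING paths
	 */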
drivers/nvme/target/admin-cmd.c  +99 −5
@@ -19,6 +19,19 @@
#include <asm/unaligned.h>
#include "nvmet.h"

/*
 * This helper clears the AEN mask bit unless the Retain Asynchronous
 * Event (RAE) bit is set.  Use it when processing log pages that are
 * associated with an AEN.
 */
static inline void nvmet_clear_aen(struct nvmet_req *req, u32 aen_bit)
{
	int rae = le32_to_cpu(req->cmd->common.cdw10[0]) & 1 << 15;

	if (!rae)
		clear_bit(aen_bit, &req->sq->ctrl->aen_masked);
}

u32 nvmet_get_log_page_len(struct nvme_command *cmd)
{
	u32 len = le16_to_cpu(cmd->get_log_page.numdu);
@@ -176,12 +189,76 @@ static void nvmet_execute_get_log_changed_ns(struct nvmet_req *req)
	if (!status)
		status = nvmet_zero_sgl(req, len, req->data_len - len);
	ctrl->nr_changed_ns = 0;
-	clear_bit(NVME_AEN_CFG_NS_ATTR, &ctrl->aen_masked);
+	nvmet_clear_aen(req, NVME_AEN_CFG_NS_ATTR);
	mutex_unlock(&ctrl->lock);
out:
	nvmet_req_complete(req, status);
}

static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid,
		struct nvme_ana_group_desc *desc)
{
	struct nvmet_ctrl *ctrl = req->sq->ctrl;
	struct nvmet_ns *ns;
	u32 count = 0;

	if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) {
		rcu_read_lock();
		list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link)
			if (ns->anagrpid == grpid)
				desc->nsids[count++] = cpu_to_le32(ns->nsid);
		rcu_read_unlock();
	}

	desc->grpid = cpu_to_le32(grpid);
	desc->nnsids = cpu_to_le32(count);
	desc->chgcnt = cpu_to_le64(nvmet_ana_chgcnt);
	desc->state = req->port->ana_state[grpid];
	memset(desc->rsvd17, 0, sizeof(desc->rsvd17));
	return sizeof(struct nvme_ana_group_desc) + count * sizeof(__le32);
}

static void nvmet_execute_get_log_page_ana(struct nvmet_req *req)
{
	struct nvme_ana_rsp_hdr hdr = { 0, };
	struct nvme_ana_group_desc *desc;
	size_t offset = sizeof(struct nvme_ana_rsp_hdr); /* start beyond hdr */
	size_t len;
	u32 grpid;
	u16 ngrps = 0;
	u16 status;

	status = NVME_SC_INTERNAL;
	desc = kmalloc(sizeof(struct nvme_ana_group_desc) +
			NVMET_MAX_NAMESPACES * sizeof(__le32), GFP_KERNEL);
	if (!desc)
		goto out;

	down_read(&nvmet_ana_sem);
	for (grpid = 1; grpid <= NVMET_MAX_ANAGRPS; grpid++) {
		if (!nvmet_ana_group_enabled[grpid])
			continue;
		len = nvmet_format_ana_group(req, grpid, desc);
		status = nvmet_copy_to_sgl(req, offset, desc, len);
		if (status)
			break;
		offset += len;
		ngrps++;
	}

	hdr.chgcnt = cpu_to_le64(nvmet_ana_chgcnt);
	hdr.ngrps = cpu_to_le16(ngrps);
	nvmet_clear_aen(req, NVME_AEN_CFG_ANA_CHANGE);
	up_read(&nvmet_ana_sem);

	kfree(desc);

	/* copy the header last once we know the number of groups */
	status = nvmet_copy_to_sgl(req, 0, &hdr, sizeof(hdr));
out:
	nvmet_req_complete(req, status);
}
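To make the sizing in the loop above concrete, a worked example for a hypothetical configuration of two enabled groups, group 1 holding three namespaces and group 2 none, with the RGO bit clear:

	  sizeof(struct nvme_ana_rsp_hdr)              16 bytes
	+ sizeof(struct nvme_ana_group_desc) + 3 * 4   44 bytes  (group 1)
	+ sizeof(struct nvme_ana_group_desc)           32 bytes  (group 2)
	                                               --------
	                                               92 bytes

The header, with hdr.ngrps = 2, is copied into the first 16 bytes only at the end, once the group count is known.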

static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
{
	struct nvmet_ctrl *ctrl = req->sq->ctrl;
@@ -213,8 +290,8 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
	 * the safest is to leave it as zeroes.
	 */

-	/* we support multiple ports and multiples hosts: */
-	id->cmic = (1 << 0) | (1 << 1);
+	/* we support multiple ports, multiple hosts and ANA: */
+	id->cmic = (1 << 0) | (1 << 1) | (1 << 3);

	/* no limit on data transfer sizes for now */
	id->mdts = 0;
@@ -252,6 +329,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
	id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);

	id->nn = cpu_to_le32(ctrl->subsys->max_nsid);
	id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES);
	id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM |
			NVME_CTRL_ONCS_WRITE_ZEROES);

@@ -281,6 +359,11 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)

	id->msdbd = ctrl->ops->msdbd;

	id->anacap = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3) | (1 << 4);
	id->anatt = 10; /* random value */
	id->anagrpmax = cpu_to_le32(NVMET_MAX_ANAGRPS);
	id->nanagrpid = cpu_to_le32(NVMET_MAX_ANAGRPS);

	/*
	 * Meh, we don't really support any power state.  Fake up the same
	 * values that qemu does.
@@ -322,8 +405,15 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
	 * nuse = ncap = nsze isn't always true, but we have no way to find
	 * that out from the underlying device.
	 */
-	id->ncap = id->nuse = id->nsze =
-		cpu_to_le64(ns->size >> ns->blksize_shift);
+	id->ncap = id->nsze = cpu_to_le64(ns->size >> ns->blksize_shift);
+	switch (req->port->ana_state[ns->anagrpid]) {
+	case NVME_ANA_INACCESSIBLE:
+	case NVME_ANA_PERSISTENT_LOSS:
+		break;
+	default:
+		id->nuse = id->nsze;
+		break;
+	}

	/*
	 * We just provide a single LBA format that matches what the
@@ -337,6 +427,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
	 * controllers, but also with any other user of the block device.
	 */
	id->nmic = (1 << 0);
	id->anagrpid = cpu_to_le32(ns->anagrpid);

	memcpy(&id->nguid, &ns->nguid, sizeof(id->nguid));

@@ -619,6 +710,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
		case NVME_LOG_CMD_EFFECTS:
			req->execute = nvmet_execute_get_log_cmd_effects_ns;
			return 0;
		case NVME_LOG_ANA:
			req->execute = nvmet_execute_get_log_page_ana;
			return 0;
		}
		break;
	case nvme_admin_identify: