
Commit b55f0097 authored by Jens Axboe

Merge branch 'nvme-5.4' of git://git.infradead.org/nvme into for-linus

Pull NVMe updates from Keith:

"This is a collection of bug fixes committed since the previous pull
 request that address deadlocks, double resets, memory leaks, and other
 regression."

* 'nvme-5.4' of git://git.infradead.org/nvme:
  nvme-pci: Set the prp2 correctly when using more than 4k page
  nvme-tcp: fix possible leakage during error flow
  nvmet-loop: fix possible leakage during error flow
  nvme-tcp: Initialize sk->sk_ll_usec only with NET_RX_BUSY_POLL
  nvme: Wait for reset state when required
  nvme: Prevent resets during paused controller state
  nvme: Restart request timers in resetting state
  nvme: Remove ADMIN_ONLY state
  nvme-pci: Free tagset if no IO queues
  nvme: retain split access workaround for capability reads
  nvme: fix possible deadlock when nvme_update_formats fails
parents 8b07a65a a4f40484
+68 −26
@@ -116,10 +116,26 @@ static void nvme_queue_scan(struct nvme_ctrl *ctrl)
	/*
	 * Only new queue scan work when admin and IO queues are both alive
	 */
	if (ctrl->state == NVME_CTRL_LIVE)
	if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset)
		queue_work(nvme_wq, &ctrl->scan_work);
}

/*
 * Use this function to proceed with scheduling reset_work for a controller
 * that had previously been set to the resetting state. This is intended for
 * code paths that can't be interrupted by other reset attempts. A hot removal
 * may prevent this from succeeding.
 */
int nvme_try_sched_reset(struct nvme_ctrl *ctrl)
{
	if (ctrl->state != NVME_CTRL_RESETTING)
		return -EBUSY;
	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
		return -EBUSY;
	return 0;
}
EXPORT_SYMBOL_GPL(nvme_try_sched_reset);
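A hedged usage sketch (only the exported helper is real; the wrapper name is hypothetical): a caller that already holds the controller in the RESETTING state, such as the PCI reset_done callback reworked later in this diff, re-arms the reset work and then waits for it to finish. It pairs with nvme_wait_reset(), added further down.

static void example_complete_reset(struct nvme_dev *dev)
{
	/*
	 * The controller was moved to RESETTING by an earlier prepare step,
	 * so this either queues reset_work or returns -EBUSY (the state has
	 * already been left, e.g. on hot removal, or the work is queued).
	 */
	if (!nvme_try_sched_reset(&dev->ctrl))
		flush_work(&dev->ctrl.reset_work);	/* wait for the reset to complete */
}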

int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
@@ -137,8 +153,7 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
	ret = nvme_reset_ctrl(ctrl);
	if (!ret) {
		flush_work(&ctrl->reset_work);
		if (ctrl->state != NVME_CTRL_LIVE &&
		    ctrl->state != NVME_CTRL_ADMIN_ONLY)
		if (ctrl->state != NVME_CTRL_LIVE)
			ret = -ENETRESET;
	}

@@ -315,15 +330,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,

	old_state = ctrl->state;
	switch (new_state) {
	case NVME_CTRL_ADMIN_ONLY:
		switch (old_state) {
		case NVME_CTRL_CONNECTING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_LIVE:
		switch (old_state) {
		case NVME_CTRL_NEW:
@@ -339,7 +345,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_LIVE:
		case NVME_CTRL_ADMIN_ONLY:
			changed = true;
			/* FALLTHRU */
		default:
@@ -359,7 +364,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
	case NVME_CTRL_DELETING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_ADMIN_ONLY:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_CONNECTING:
			changed = true;
@@ -381,8 +385,10 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
		break;
	}

	if (changed)
	if (changed) {
		ctrl->state = new_state;
		wake_up_all(&ctrl->state_wq);
	}

	spin_unlock_irqrestore(&ctrl->lock, flags);
	if (changed && ctrl->state == NVME_CTRL_LIVE)
@@ -391,6 +397,39 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
}
EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);

/*
 * Returns true for sink states that can't ever transition back to live.
 */
static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
{
	switch (ctrl->state) {
	case NVME_CTRL_NEW:
	case NVME_CTRL_LIVE:
	case NVME_CTRL_RESETTING:
	case NVME_CTRL_CONNECTING:
		return false;
	case NVME_CTRL_DELETING:
	case NVME_CTRL_DEAD:
		return true;
	default:
		WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
		return true;
	}
}

/*
 * Waits for the controller state to be resetting, or returns false if it is
 * not possible to ever transition to that state.
 */
bool nvme_wait_reset(struct nvme_ctrl *ctrl)
{
	wait_event(ctrl->state_wq,
		   nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) ||
		   nvme_state_terminal(ctrl));
	return ctrl->state == NVME_CTRL_RESETTING;
}
EXPORT_SYMBOL_GPL(nvme_wait_reset);
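A hedged sketch of the prepare half that pairs with the nvme_try_sched_reset() sketch above (the wrapper name is hypothetical): a teardown path first waits until the controller really is in RESETTING, so it cannot race with an in-flight reset, and only then disables the device. The pci.c portion of this series adds exactly this pattern as nvme_disable_prepare_reset().

static int example_prepare_reset(struct nvme_dev *dev, bool shutdown)
{
	if (!nvme_wait_reset(&dev->ctrl))
		return -EBUSY;	/* controller is DELETING or DEAD and can never reset again */
	nvme_dev_disable(dev, shutdown);
	return 0;
}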

static void nvme_free_ns_head(struct kref *ref)
{
	struct nvme_ns_head *head =
@@ -1306,8 +1345,6 @@ static void nvme_update_formats(struct nvme_ctrl *ctrl)
		if (ns->disk && nvme_revalidate_disk(ns->disk))
			nvme_set_queue_dying(ns);
	up_read(&ctrl->namespaces_rwsem);

	nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
}

static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
@@ -1323,6 +1360,7 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
		nvme_unfreeze(ctrl);
		nvme_mpath_unfreeze(ctrl->subsys);
		mutex_unlock(&ctrl->subsys->lock);
		nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
		mutex_unlock(&ctrl->scan_lock);
	}
	if (effects & NVME_CMD_EFFECTS_CCC)
@@ -2874,7 +2912,6 @@ static int nvme_dev_open(struct inode *inode, struct file *file)

	switch (ctrl->state) {
	case NVME_CTRL_LIVE:
	case NVME_CTRL_ADMIN_ONLY:
		break;
	default:
		return -EWOULDBLOCK;
@@ -3168,7 +3205,6 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
	static const char *const state_name[] = {
		[NVME_CTRL_NEW]		= "new",
		[NVME_CTRL_LIVE]	= "live",
		[NVME_CTRL_ADMIN_ONLY]	= "only-admin",
		[NVME_CTRL_RESETTING]	= "resetting",
		[NVME_CTRL_CONNECTING]	= "connecting",
		[NVME_CTRL_DELETING]	= "deleting",
@@ -3679,11 +3715,10 @@ static void nvme_scan_work(struct work_struct *work)
	struct nvme_id_ctrl *id;
	unsigned nn;

	if (ctrl->state != NVME_CTRL_LIVE)
	/* No tagset on a live ctrl means IO queues could not be created */
	if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset)
		return;

	WARN_ON_ONCE(!ctrl->tagset);

	if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) {
		dev_info(ctrl->device, "rescanning namespaces.\n");
		nvme_clear_changed_ns_log(ctrl);
@@ -3844,13 +3879,13 @@ static void nvme_fw_act_work(struct work_struct *work)
		if (time_after(jiffies, fw_act_timeout)) {
			dev_warn(ctrl->device,
				"Fw activation timeout, reset controller\n");
			nvme_reset_ctrl(ctrl);
			break;
			nvme_try_sched_reset(ctrl);
			return;
		}
		msleep(100);
	}

	if (ctrl->state != NVME_CTRL_LIVE)
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
		return;

	nvme_start_queues(ctrl);
@@ -3870,6 +3905,12 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
		nvme_queue_scan(ctrl);
		break;
	case NVME_AER_NOTICE_FW_ACT_STARTING:
		/*
		 * We are (ab)using the RESETTING state to prevent subsequent
		 * recovery actions from interfering with the controller's
		 * firmware activation.
		 */
		if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
			queue_work(nvme_wq, &ctrl->fw_act_work);
		break;
#ifdef CONFIG_NVME_MULTIPATH
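
As a rough illustration of the comment in the NVME_AER_NOTICE_FW_ACT_STARTING case above (hypothetical snippet, not part of the patch): while fw_act_work holds the controller in RESETTING, a competing reset attempt fails at the state transition, because after this series RESETTING is reachable only from NEW or LIVE.

	/*
	 * ctrl->state == NVME_CTRL_RESETTING here, set by the AEN handler
	 * above before fw_act_work was queued.
	 */
	int ret = nvme_reset_ctrl(ctrl);	/* the transition to RESETTING is rejected */

	WARN_ON(ret != -EBUSY);			/* so no new reset interferes with activation */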
@@ -3993,6 +4034,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
	INIT_WORK(&ctrl->async_event_work, nvme_async_event_work);
	INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work);
	INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work);
	init_waitqueue_head(&ctrl->state_wq);

	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
+1 −2
@@ -182,8 +182,7 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
		bool queue_live)
{
	if (likely(ctrl->state == NVME_CTRL_LIVE ||
		   ctrl->state == NVME_CTRL_ADMIN_ONLY))
	if (likely(ctrl->state == NVME_CTRL_LIVE))
		return true;
	return __nvmf_check_ready(ctrl, rq, queue_live);
}
+4 −1
@@ -15,6 +15,7 @@
#include <linux/sed-opal.h>
#include <linux/fault-inject.h>
#include <linux/rcupdate.h>
#include <linux/wait.h>

#include <trace/events/block.h>

@@ -161,7 +162,6 @@ static inline u16 nvme_req_qid(struct request *req)
enum nvme_ctrl_state {
	NVME_CTRL_NEW,
	NVME_CTRL_LIVE,
	NVME_CTRL_ADMIN_ONLY,    /* Only admin queue live */
	NVME_CTRL_RESETTING,
	NVME_CTRL_CONNECTING,
	NVME_CTRL_DELETING,
@@ -199,6 +199,7 @@ struct nvme_ctrl {
	struct cdev cdev;
	struct work_struct reset_work;
	struct work_struct delete_work;
	wait_queue_head_t state_wq;

	struct nvme_subsystem *subsys;
	struct list_head subsys_entry;
@@ -449,6 +450,7 @@ void nvme_complete_rq(struct request *req);
bool nvme_cancel_request(struct request *req, void *data, bool reserved);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
		enum nvme_ctrl_state new_state);
bool nvme_wait_reset(struct nvme_ctrl *ctrl);
int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
@@ -499,6 +501,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);

int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
+49 −34
@@ -773,7 +773,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
		struct bio_vec *bv)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
	unsigned int first_prp_len = dev->ctrl.page_size - bv->bv_offset;
	unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1);
	unsigned int first_prp_len = dev->ctrl.page_size - offset;

	iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
	if (dma_mapping_error(dev->dev, iod->first_dma))
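
To make the offset fix above concrete, a worked example with illustrative values (the 64 KiB kernel page size is an assumption for illustration, not something this diff states): with dev->ctrl.page_size = 4096 and a bvec starting at bv_offset = 5120, i.e. 1 KiB into the second controller page of a 64 KiB kernel page:

	/*
	 * Illustrative arithmetic only (unsigned int math):
	 *
	 *   old:  first_prp_len = 4096 - 5120        -> wraps to 4294966272
	 *   new:  offset        = 5120 & (4096 - 1)  -> 1024
	 *         first_prp_len = 4096 - 1024        -> 3072
	 *
	 * The "bv->bv_len > first_prp_len" test a few lines below this hunk
	 * (not shown) therefore compares against 3072 instead of a wrapped
	 * value, so PRP2 is set whenever the data really does spill into the
	 * next controller page.
	 */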
@@ -2263,10 +2264,7 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
	return true;
}

/*
 * return error value only when tagset allocation failed
 */
static int nvme_dev_add(struct nvme_dev *dev)
static void nvme_dev_add(struct nvme_dev *dev)
{
	int ret;

@@ -2296,7 +2294,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
		if (ret) {
			dev_warn(dev->ctrl.device,
				"IO queues tagset allocation failed %d\n", ret);
			return ret;
			return;
		}
		dev->ctrl.tagset = &dev->tagset;
	} else {
@@ -2307,7 +2305,6 @@ static int nvme_dev_add(struct nvme_dev *dev)
	}

	nvme_dbbuf_set(dev);
	return 0;
}

static int nvme_pci_enable(struct nvme_dev *dev)
@@ -2467,6 +2464,14 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
	mutex_unlock(&dev->shutdown_lock);
}

static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
{
	if (!nvme_wait_reset(&dev->ctrl))
		return -EBUSY;
	nvme_dev_disable(dev, shutdown);
	return 0;
}

static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
@@ -2490,14 +2495,20 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
	dma_pool_destroy(dev->prp_small_pool);
}

static void nvme_free_tagset(struct nvme_dev *dev)
{
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	dev->ctrl.tagset = NULL;
}

static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);

	nvme_dbbuf_dma_free(dev);
	put_device(dev->dev);
	if (dev->tagset.tags)
		blk_mq_free_tag_set(&dev->tagset);
	nvme_free_tagset(dev);
	if (dev->ctrl.admin_q)
		blk_put_queue(dev->ctrl.admin_q);
	kfree(dev->queues);
@@ -2508,6 +2519,11 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)

static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
{
	/*
	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
	 * may be holding this pci_dev's device lock.
	 */
	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
	nvme_get_ctrl(&dev->ctrl);
	nvme_dev_disable(dev, false);
	nvme_kill_queues(&dev->ctrl);
@@ -2521,7 +2537,6 @@ static void nvme_reset_work(struct work_struct *work)
		container_of(work, struct nvme_dev, ctrl.reset_work);
	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
	int result;
	enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;

	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) {
		result = -ENODEV;
@@ -2615,13 +2630,11 @@ static void nvme_reset_work(struct work_struct *work)
		dev_warn(dev->ctrl.device, "IO queues not created\n");
		nvme_kill_queues(&dev->ctrl);
		nvme_remove_namespaces(&dev->ctrl);
		new_state = NVME_CTRL_ADMIN_ONLY;
		nvme_free_tagset(dev);
	} else {
		nvme_start_queues(&dev->ctrl);
		nvme_wait_freeze(&dev->ctrl);
		/* hit this only when allocate tagset fails */
		if (nvme_dev_add(dev))
			new_state = NVME_CTRL_ADMIN_ONLY;
		nvme_dev_add(dev);
		nvme_unfreeze(&dev->ctrl);
	}

@@ -2629,9 +2642,9 @@ static void nvme_reset_work(struct work_struct *work)
	 * If only admin queue live, keep it to do further investigation or
	 * recovery.
	 */
	if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
		dev_warn(dev->ctrl.device,
			"failed to mark controller state %d\n", new_state);
			"failed to mark controller live state\n");
		result = -ENODEV;
		goto out;
	}
@@ -2672,7 +2685,7 @@ static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)

static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
{
	*val = readq(to_nvme_dev(ctrl)->bar + off);
	*val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
	return 0;
}

@@ -2836,19 +2849,28 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
static void nvme_reset_prepare(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_disable(dev, false);

	/*
	 * We don't need to check the return value from waiting for the reset
	 * state as pci_dev device lock is held, making it impossible to race
	 * with ->remove().
	 */
	nvme_disable_prepare_reset(dev, false);
	nvme_sync_queues(&dev->ctrl);
}

static void nvme_reset_done(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_reset_ctrl_sync(&dev->ctrl);

	if (!nvme_try_sched_reset(&dev->ctrl))
		flush_work(&dev->ctrl.reset_work);
}

static void nvme_shutdown(struct pci_dev *pdev)
{
	struct nvme_dev *dev = pci_get_drvdata(pdev);
	nvme_dev_disable(dev, true);
	nvme_disable_prepare_reset(dev, true);
}

/*
@@ -2901,7 +2923,7 @@ static int nvme_resume(struct device *dev)

	if (ndev->last_ps == U32_MAX ||
	    nvme_set_power_state(ctrl, ndev->last_ps) != 0)
		nvme_reset_ctrl(ctrl);
		return nvme_try_sched_reset(&ndev->ctrl);
	return 0;
}

@@ -2929,17 +2951,14 @@ static int nvme_suspend(struct device *dev)
	 */
	if (pm_suspend_via_firmware() || !ctrl->npss ||
	    !pcie_aspm_enabled(pdev) ||
	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) {
		nvme_dev_disable(ndev, true);
		return 0;
	}
	    (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND))
		return nvme_disable_prepare_reset(ndev, true);

	nvme_start_freeze(ctrl);
	nvme_wait_freeze(ctrl);
	nvme_sync_queues(ctrl);

	if (ctrl->state != NVME_CTRL_LIVE &&
	    ctrl->state != NVME_CTRL_ADMIN_ONLY)
	if (ctrl->state != NVME_CTRL_LIVE)
		goto unfreeze;

	ret = nvme_get_power_state(ctrl, &ndev->last_ps);
@@ -2965,9 +2984,8 @@ static int nvme_suspend(struct device *dev)
		 * Clearing npss forces a controller reset on resume. The
		 * correct value will be rediscovered then.
		 */
		nvme_dev_disable(ndev, true);
		ret = nvme_disable_prepare_reset(ndev, true);
		ctrl->npss = 0;
		ret = 0;
	}
unfreeze:
	nvme_unfreeze(ctrl);
@@ -2977,9 +2995,7 @@ static int nvme_suspend(struct device *dev)
static int nvme_simple_suspend(struct device *dev)
{
	struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));

	nvme_dev_disable(ndev, true);
	return 0;
	return nvme_disable_prepare_reset(ndev, true);
}

static int nvme_simple_resume(struct device *dev)
@@ -2987,8 +3003,7 @@ static int nvme_simple_resume(struct device *dev)
	struct pci_dev *pdev = to_pci_dev(dev);
	struct nvme_dev *ndev = pci_get_drvdata(pdev);

	nvme_reset_ctrl(&ndev->ctrl);
	return 0;
	return nvme_try_sched_reset(&ndev->ctrl);
}

static const struct dev_pm_ops nvme_dev_pm_ops = {
+8 −0
@@ -1701,6 +1701,14 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
	dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
		 rq->tag, nvme_rdma_queue_idx(queue));

	/*
	 * Restart the timer if a controller reset is already scheduled. Any
	 * timed out commands would be handled before entering the connecting
	 * state.
	 */
	if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
		return BLK_EH_RESET_TIMER;

	if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
		/*
		 * Teardown immediately if controller times out while starting