Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6a04aed6 authored by Wei Hu (Xavier), committed by Jason Gunthorpe
Browse files

RDMA/hns: Fix the chip hanging caused by sending mailbox&CMQ during reset



On the hip08 chip, the chip may hang and errors may occur when a mailbox
command or doorbell is sent during reset.  Fix this by prohibiting mailbox
and doorbell operations both while a reset is in progress and after a reset
has occurred, so that the hardware can work normally.

Fixes: a04ff739 ("RDMA/hns: Add command queue support for hip08 RoCE driver")
Signed-off-by: Wei Hu (Xavier) <xavier.huwei@huawei.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
parent d061effc
Loading
Loading
Loading
Loading
+24 −8
Original line number Diff line number Diff line
@@ -176,17 +176,33 @@ int hns_roce_cmd_mbox(struct hns_roce_dev *hr_dev, u64 in_param, u64 out_param,
		      unsigned long in_modifier, u8 op_modifier, u16 op,
		      unsigned long timeout)
{
	if (hr_dev->is_reset)
	int ret;

	/* If the hardware layer provides a reset-processing hook, consult it
	 * before issuing the mailbox: CMD_RST_PRC_SUCCESS makes this call
	 * return 0 without sending anything, CMD_RST_PRC_EBUSY makes it
	 * return -EBUSY, and any other result falls through to the normal
	 * send path below.
	 */
	if (hr_dev->hw->rst_prc_mbox) {
		ret = hr_dev->hw->rst_prc_mbox(hr_dev);
		if (ret == CMD_RST_PRC_SUCCESS)
			return 0;
		else if (ret == CMD_RST_PRC_EBUSY)
			return -EBUSY;
	}

	if (hr_dev->cmd.use_events)
		return hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param,
		ret = hns_roce_cmd_mbox_wait(hr_dev, in_param, out_param,
					     in_modifier, op_modifier, op,
					     timeout);
	else
		return hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param,
		ret = hns_roce_cmd_mbox_poll(hr_dev, in_param, out_param,
					     in_modifier, op_modifier, op,
					     timeout);

	/* A raw CMD_RST_PRC_EBUSY result from the send path is converted to
	 * the errno-style -EBUSY before it reaches the caller.
	 */
	if (ret == CMD_RST_PRC_EBUSY)
		return -EBUSY;

	/* On any other failure, run the reset hook once more; when it reports
	 * CMD_RST_PRC_SUCCESS the failure is masked and 0 is returned.
	 */
	if (ret && (hr_dev->hw->rst_prc_mbox &&
		    hr_dev->hw->rst_prc_mbox(hr_dev) == CMD_RST_PRC_SUCCESS))
		return 0;

	return ret;
}
EXPORT_SYMBOL_GPL(hns_roce_cmd_mbox);

+7 −0
Original line number Diff line number Diff line
@@ -237,6 +237,12 @@ enum {
	HNS_ROCE_RST_DIRECT_RETURN		= 0,
};

/* Result codes of the reset-processing check that runs around mailbox and
 * CMQ commands.
 */
enum {
	/* No reset handling applied; the command is sent as usual. */
	CMD_RST_PRC_OTHERS	= 0,
	/* The command is skipped and reported to the caller as success. */
	CMD_RST_PRC_SUCCESS	= 1,
	/* The command is skipped and the caller fails it as busy. */
	CMD_RST_PRC_EBUSY	= 2,
};

#define HNS_ROCE_CMD_SUCCESS			1

#define HNS_ROCE_PORT_DOWN			0
@@ -874,6 +880,7 @@ struct hns_roce_hw {
			 u64 out_param, u32 in_modifier, u8 op_modifier, u16 op,
			 u16 token, int event);
	int (*chk_mbox)(struct hns_roce_dev *hr_dev, unsigned long timeout);
	int (*rst_prc_mbox)(struct hns_roce_dev *hr_dev);
	int (*set_gid)(struct hns_roce_dev *hr_dev, u8 port, int gid_index,
		       const union ib_gid *gid, const struct ib_gid_attr *attr);
	int (*set_mac)(struct hns_roce_dev *hr_dev, u8 phy_port, u8 *addr);
+134 −5
Original line number Diff line number Diff line
@@ -712,6 +712,110 @@ static int hns_roce_v2_post_recv(struct ib_qp *ibqp,
	return ret;
}

/* Handle a command issued after at least one hardware reset has completed.
 *
 * The device is always marked as reset so that no further mailbox or CMQ
 * command is sent to hardware. The result is CMD_RST_PRC_EBUSY when the
 * instance is in HNS_ROCE_STATE_INIT or the reset is in
 * HNS_ROCE_STATE_RST_INIT, and CMD_RST_PRC_SUCCESS otherwise.
 */
static int hns_roce_v2_cmd_hw_reseted(struct hns_roce_dev *hr_dev,
				      unsigned long instance_stage,
				      unsigned long reset_stage)
{
	bool must_fail;

	/* From now on mailbox & CMQ commands must not reach the hardware. */
	hr_dev->is_reset = true;

	/* Inside .init_instance(), or at the HNAE3_INIT_CLIENT stage of the
	 * soft reset process, the command has to fail: the init path can
	 * then roll back (e.g. notify hardware to free resources) and exit
	 * with an error, which lets the NIC driver reschedule the soft reset
	 * process once again.
	 */
	must_fail = (instance_stage == HNS_ROCE_STATE_INIT) ||
		    (reset_stage == HNS_ROCE_STATE_RST_INIT);

	return must_fail ? CMD_RST_PRC_EBUSY : CMD_RST_PRC_SUCCESS;
}

/* Handle a command issued while a hardware reset has been detected.
 *
 * The hardware reset status is queried again; when it is no longer
 * reported, the device is marked as reset. The result is CMD_RST_PRC_EBUSY
 * while the device is not marked as reset, or when the call comes from the
 * init stages described below; otherwise it is CMD_RST_PRC_SUCCESS.
 */
static int hns_roce_v2_cmd_hw_resetting(struct hns_roce_dev *hr_dev,
					unsigned long instance_stage,
					unsigned long reset_stage)
{
	struct hns_roce_v2_priv *v2_priv =
				(struct hns_roce_v2_priv *)hr_dev->priv;
	struct hnae3_handle *nic_handle = v2_priv->handle;
	const struct hnae3_ae_ops *ae_ops = nic_handle->ae_algo->ops;

	/* Once a hardware reset is detected, mailbox & CMQ commands must not
	 * be sent to hardware any more.
	 */
	if (!ae_ops->get_hw_reset_stat(nic_handle))
		hr_dev->is_reset = true;

	if (!hr_dev->is_reset)
		return CMD_RST_PRC_EBUSY;

	/* Inside .init_instance(), or at the HNAE3_INIT_CLIENT stage of the
	 * soft reset process, exit with an error so that the init path can
	 * roll back (e.g. notify hardware to free resources) and the NIC
	 * driver can reschedule the soft reset process once again.
	 */
	if (reset_stage == HNS_ROCE_STATE_RST_INIT)
		return CMD_RST_PRC_EBUSY;

	if (instance_stage == HNS_ROCE_STATE_INIT)
		return CMD_RST_PRC_EBUSY;

	return CMD_RST_PRC_SUCCESS;
}

/* Handle a command issued while a software reset is detected in
 * .init_instance(): sending mailbox & CMQ commands must stop and the caller
 * must exit with an error, so the result is always CMD_RST_PRC_EBUSY.
 *
 * The device is additionally marked as reset when the reset count reported
 * by the NIC driver differs from the count stored in hr_dev.
 */
static int hns_roce_v2_cmd_sw_resetting(struct hns_roce_dev *hr_dev)
{
	struct hns_roce_v2_priv *v2_priv =
				(struct hns_roce_v2_priv *)hr_dev->priv;
	struct hnae3_handle *nic_handle = v2_priv->handle;
	const struct hnae3_ae_ops *ae_ops = nic_handle->ae_algo->ops;

	if (hr_dev->reset_cnt != ae_ops->ae_dev_reset_cnt(nic_handle))
		hr_dev->is_reset = true;

	return CMD_RST_PRC_EBUSY;
}

static int hns_roce_v2_rst_process_cmd(struct hns_roce_dev *hr_dev)
{
	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
	struct hnae3_handle *handle = priv->handle;
	const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
	unsigned long instance_stage;	/* the current instance stage */
	unsigned long reset_stage;	/* the current reset stage */
	unsigned long reset_cnt;
	bool sw_resetting;
	bool hw_resetting;

	if (hr_dev->is_reset)
		return CMD_RST_PRC_SUCCESS;

	/* Get information about reset from NIC driver or RoCE driver itself,
	 * the meaning of the following variables from NIC driver are described
	 * as below:
	 * reset_cnt -- The count value of completed hardware reset.
	 * hw_resetting -- Whether hardware device is resetting now.
	 * sw_resetting -- Whether NIC's software reset process is running now.
	 */
	instance_stage = handle->rinfo.instance_state;
	reset_stage = handle->rinfo.reset_state;
	reset_cnt = ops->ae_dev_reset_cnt(handle);
	hw_resetting = ops->get_hw_reset_stat(handle);
	sw_resetting = ops->ae_dev_resetting(handle);

	if (reset_cnt != hr_dev->reset_cnt)
		return hns_roce_v2_cmd_hw_reseted(hr_dev, instance_stage,
						  reset_stage);
	else if (hw_resetting)
		return hns_roce_v2_cmd_hw_resetting(hr_dev, instance_stage,
						    reset_stage);
	else if (sw_resetting && instance_stage == HNS_ROCE_STATE_INIT)
		return hns_roce_v2_cmd_sw_resetting(hr_dev);

	return 0;
}

static int hns_roce_cmq_space(struct hns_roce_v2_cmq_ring *ring)
{
	int ntu = ring->next_to_use;
@@ -892,7 +996,7 @@ static int hns_roce_cmq_csq_clean(struct hns_roce_dev *hr_dev)
	return clean;
}

static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
			       struct hns_roce_cmq_desc *desc, int num)
{
	struct hns_roce_v2_priv *priv = (struct hns_roce_v2_priv *)hr_dev->priv;
@@ -905,9 +1009,6 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
	int ret = 0;
	int ntc;

	if (hr_dev->is_reset)
		return 0;

	spin_lock_bh(&csq->lock);

	if (num > hns_roce_cmq_space(csq)) {
@@ -982,6 +1083,30 @@ static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
	return ret;
}

/* Send @num CMQ descriptors starting at @desc, unless a reset forbids it.
 *
 * The reset state is checked before sending and, if sending fails, once
 * more afterwards:
 *   - CMD_RST_PRC_SUCCESS: the command is skipped (or its failure is masked)
 *     and 0 is returned;
 *   - CMD_RST_PRC_EBUSY: -EBUSY is returned;
 *   - anything else: the result of __hns_roce_cmq_send() is returned.
 *
 * CMD_RST_PRC_EBUSY is an internal code and is never handed to callers; it
 * is translated to -EBUSY, matching what hns_roce_cmd_mbox() does.
 */
int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
		      struct hns_roce_cmq_desc *desc, int num)
{
	int retval;
	int ret;

	ret = hns_roce_v2_rst_process_cmd(hr_dev);
	if (ret == CMD_RST_PRC_SUCCESS)
		return 0;
	if (ret == CMD_RST_PRC_EBUSY)
		return -EBUSY;

	ret = __hns_roce_cmq_send(hr_dev, desc, num);
	if (ret) {
		/* The failure may be caused by a reset that started while
		 * the command was in flight, so check the reset state again.
		 */
		retval = hns_roce_v2_rst_process_cmd(hr_dev);
		if (retval == CMD_RST_PRC_SUCCESS)
			return 0;
		if (retval == CMD_RST_PRC_EBUSY)
			return -EBUSY;
	}

	return ret;
}

static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev)
{
	struct hns_roce_query_version *resp;
@@ -1857,6 +1982,9 @@ static int hns_roce_v2_chk_mbox(struct hns_roce_dev *hr_dev,

	status = hns_roce_v2_cmd_complete(hr_dev);
	if (status != 0x1) {
		if (status == CMD_RST_PRC_EBUSY)
			return status;

		dev_err(dev, "mailbox status 0x%x!\n", status);
		return -EBUSY;
	}
@@ -5977,6 +6105,7 @@ static const struct hns_roce_hw hns_roce_hw_v2 = {
	.hw_exit = hns_roce_v2_exit,
	.post_mbox = hns_roce_v2_post_mbox,
	.chk_mbox = hns_roce_v2_chk_mbox,
	.rst_prc_mbox = hns_roce_v2_rst_process_cmd,
	.set_gid = hns_roce_v2_set_gid,
	.set_mac = hns_roce_v2_set_mac,
	.write_mtpt = hns_roce_v2_write_mtpt,
+2 −0
Original line number Diff line number Diff line
@@ -96,6 +96,8 @@
#define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE	2
#define HNS_ROCE_V2_RSV_QPS			8

#define HNS_ROCE_V2_HW_RST_TIMEOUT             1000

#define HNS_ROCE_CONTEXT_HOP_NUM		1
#define HNS_ROCE_SCCC_HOP_NUM			1
#define HNS_ROCE_MTT_HOP_NUM			1