Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit bb7c778b authored by David S. Miller
Browse files

Merge branch 'qed-Error-recovery-process'

Michal Kalderon says:

====================
qed*: Error recovery process

Parity errors might happen in the device's memories due to momentary bit
flips which are caused by radiation.
Errors that are not correctable initiate a process kill event, which blocks
the device access towards the host and the network, and a recovery process
is started in the management FW and in the driver.

This series adds the support of this process in the qed core module and in
the qede driver (patches 2 & 3).
Patch 1 in the series revises the load sequence, to avoid PCI errors that
might be observed during a recovery process.

Changes in v2:
	- Addressed issue found in https://patchwork.ozlabs.org/patch/1030545/


	  The change was done by removing the enum and passing a boolean to
	  the related functions.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 8e067bb3 ccc67ef5
Loading
Loading
Loading
Loading
+4 −1
Original line number Original line Diff line number Diff line
@@ -554,7 +554,6 @@ struct qed_hwfn {
	u8				dp_level;
	u8				dp_level;
	char				name[NAME_SIZE];
	char				name[NAME_SIZE];


	bool				first_on_engine;
	bool				hw_init_done;
	bool				hw_init_done;


	u8				num_funcs_on_engine;
	u8				num_funcs_on_engine;
@@ -805,6 +804,9 @@ struct qed_dev {


	u32				mcp_nvm_resp;
	u32				mcp_nvm_resp;


	/* Recovery */
	bool recov_in_prog;

	/* Linux specific here */
	/* Linux specific here */
	struct  qede_dev		*edev;
	struct  qede_dev		*edev;
	struct  pci_dev			*pdev;
	struct  pci_dev			*pdev;
@@ -944,6 +946,7 @@ void qed_link_update(struct qed_hwfn *hwfn, struct qed_ptt *ptt);
u32 qed_unzip_data(struct qed_hwfn *p_hwfn,
u32 qed_unzip_data(struct qed_hwfn *p_hwfn,
		   u32 input_len, u8 *input_buf,
		   u32 input_len, u8 *input_buf,
		   u32 max_size, u8 *unzip_buf);
		   u32 max_size, u8 *unzip_buf);
void qed_schedule_recovery_handler(struct qed_hwfn *p_hwfn);
void qed_get_protocol_stats(struct qed_dev *cdev,
void qed_get_protocol_stats(struct qed_dev *cdev,
			    enum qed_mcp_protocol_type type,
			    enum qed_mcp_protocol_type type,
			    union qed_mcp_protocol_stats *stats);
			    union qed_mcp_protocol_stats *stats);
+95 −63
Original line number Original line Diff line number Diff line
@@ -1959,11 +1959,6 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
		     (p_hwfn->hw_info.personality == QED_PCI_FCOE) ? 1 : 0);
		     (p_hwfn->hw_info.personality == QED_PCI_FCOE) ? 1 : 0);
	STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_ROCE_RT_OFFSET, 0);
	STORE_RT_REG(p_hwfn, PRS_REG_SEARCH_ROCE_RT_OFFSET, 0);


	/* Cleanup chip from previous driver if such remains exist */
	rc = qed_final_cleanup(p_hwfn, p_ptt, rel_pf_id, false);
	if (rc)
		return rc;

	/* Sanity check before the PF init sequence that uses DMAE */
	/* Sanity check before the PF init sequence that uses DMAE */
	rc = qed_dmae_sanity(p_hwfn, p_ptt, "pf_phase");
	rc = qed_dmae_sanity(p_hwfn, p_ptt, "pf_phase");
	if (rc)
	if (rc)
@@ -2007,17 +2002,15 @@ static int qed_hw_init_pf(struct qed_hwfn *p_hwfn,
	return rc;
	return rc;
}
}


static int qed_change_pci_hwfn(struct qed_hwfn *p_hwfn,
int qed_pglueb_set_pfid_enable(struct qed_hwfn *p_hwfn,
			       struct qed_ptt *p_ptt,
			       struct qed_ptt *p_ptt, bool b_enable)
			       u8 enable)
{
{
	u32 delay_idx = 0, val, set_val = enable ? 1 : 0;
	u32 delay_idx = 0, val, set_val = b_enable ? 1 : 0;


	/* Change PF in PXP */
	/* Configure the PF's internal FID_enable for master transactions */
	qed_wr(p_hwfn, p_ptt,
	qed_wr(p_hwfn, p_ptt, PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER, set_val);
	       PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER, set_val);


	/* wait until value is set - try for 1 second every 50us */
	/* Wait until value is set - try for 1 second every 50us */
	for (delay_idx = 0; delay_idx < 20000; delay_idx++) {
	for (delay_idx = 0; delay_idx < 20000; delay_idx++) {
		val = qed_rd(p_hwfn, p_ptt,
		val = qed_rd(p_hwfn, p_ptt,
			     PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER);
			     PGLUE_B_REG_INTERNAL_PFID_ENABLE_MASTER);
@@ -2071,13 +2064,19 @@ static int qed_vf_start(struct qed_hwfn *p_hwfn,
	return 0;
	return 0;
}
}


static void qed_pglueb_clear_err(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
{
	qed_wr(p_hwfn, p_ptt, PGLUE_B_REG_WAS_ERROR_PF_31_0_CLR,
	       BIT(p_hwfn->abs_pf_id));
}

int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
{
{
	struct qed_load_req_params load_req_params;
	struct qed_load_req_params load_req_params;
	u32 load_code, resp, param, drv_mb_param;
	u32 load_code, resp, param, drv_mb_param;
	bool b_default_mtu = true;
	bool b_default_mtu = true;
	struct qed_hwfn *p_hwfn;
	struct qed_hwfn *p_hwfn;
	int rc = 0, mfw_rc, i;
	int rc = 0, i;
	u16 ether_type;
	u16 ether_type;


	if ((p_params->int_mode == QED_INT_MODE_MSI) && (cdev->num_hwfns > 1)) {
	if ((p_params->int_mode == QED_INT_MODE_MSI) && (cdev->num_hwfns > 1)) {
@@ -2092,7 +2091,7 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
	}
	}


	for_each_hwfn(cdev, i) {
	for_each_hwfn(cdev, i) {
		struct qed_hwfn *p_hwfn = &cdev->hwfns[i];
		p_hwfn = &cdev->hwfns[i];


		/* If management didn't provide a default, set one of our own */
		/* If management didn't provide a default, set one of our own */
		if (!p_hwfn->hw_info.mtu) {
		if (!p_hwfn->hw_info.mtu) {
@@ -2105,9 +2104,6 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
			continue;
			continue;
		}
		}


		/* Enable DMAE in PXP */
		rc = qed_change_pci_hwfn(p_hwfn, p_hwfn->p_main_ptt, true);

		rc = qed_calc_hw_mode(p_hwfn);
		rc = qed_calc_hw_mode(p_hwfn);
		if (rc)
		if (rc)
			return rc;
			return rc;
@@ -2144,12 +2140,43 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
			   "Load request was sent. Load code: 0x%x\n",
			   "Load request was sent. Load code: 0x%x\n",
			   load_code);
			   load_code);


		/* Only relevant for recovery:
		 * Clear the indication after LOAD_REQ is responded by the MFW.
		 */
		cdev->recov_in_prog = false;

		qed_mcp_set_capabilities(p_hwfn, p_hwfn->p_main_ptt);
		qed_mcp_set_capabilities(p_hwfn, p_hwfn->p_main_ptt);


		qed_reset_mb_shadow(p_hwfn, p_hwfn->p_main_ptt);
		qed_reset_mb_shadow(p_hwfn, p_hwfn->p_main_ptt);


		p_hwfn->first_on_engine = (load_code ==
		/* Clean up chip from previous driver if such remains exist.
					   FW_MSG_CODE_DRV_LOAD_ENGINE);
		 * This is not needed when the PF is the first one on the
		 * engine, since afterwards we are going to init the FW.
		 */
		if (load_code != FW_MSG_CODE_DRV_LOAD_ENGINE) {
			rc = qed_final_cleanup(p_hwfn, p_hwfn->p_main_ptt,
					       p_hwfn->rel_pf_id, false);
			if (rc) {
				DP_NOTICE(p_hwfn, "Final cleanup failed\n");
				goto load_err;
			}
		}

		/* Log and clear previous pglue_b errors if such exist */
		qed_pglueb_rbc_attn_handler(p_hwfn, p_hwfn->p_main_ptt);

		/* Enable the PF's internal FID_enable in the PXP */
		rc = qed_pglueb_set_pfid_enable(p_hwfn, p_hwfn->p_main_ptt,
						true);
		if (rc)
			goto load_err;

		/* Clear the pglue_b was_error indication.
		 * In E4 it must be done after the BME and the internal
		 * FID_enable for the PF are set, since VDMs may cause the
		 * indication to be set again.
		 */
		qed_pglueb_clear_err(p_hwfn, p_hwfn->p_main_ptt);


		switch (load_code) {
		switch (load_code) {
		case FW_MSG_CODE_DRV_LOAD_ENGINE:
		case FW_MSG_CODE_DRV_LOAD_ENGINE:
@@ -2180,39 +2207,29 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
			break;
			break;
		}
		}


		if (rc)
		if (rc) {
			DP_NOTICE(p_hwfn,
			DP_NOTICE(p_hwfn,
				  "init phase failed for loadcode 0x%x (rc %d)\n",
				  "init phase failed for loadcode 0x%x (rc %d)\n",
				  load_code, rc);
				  load_code, rc);
			goto load_err;
		}


		/* ACK mfw regardless of success or failure of initialization */
		rc = qed_mcp_load_done(p_hwfn, p_hwfn->p_main_ptt);
		mfw_rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt,
				     DRV_MSG_CODE_LOAD_DONE,
				     0, &load_code, &param);
		if (rc)
		if (rc)
			return rc;
			return rc;
		if (mfw_rc) {
			DP_NOTICE(p_hwfn, "Failed sending LOAD_DONE command\n");
			return mfw_rc;
		}

		/* Check if there is a DID mismatch between nvm-cfg/efuse */
		if (param & FW_MB_PARAM_LOAD_DONE_DID_EFUSE_ERROR)
			DP_NOTICE(p_hwfn,
				  "warning: device configuration is not supported on this board type. The device may not function as expected.\n");


		/* send DCBX attention request command */
		/* send DCBX attention request command */
		DP_VERBOSE(p_hwfn,
		DP_VERBOSE(p_hwfn,
			   QED_MSG_DCB,
			   QED_MSG_DCB,
			   "sending phony dcbx set command to trigger DCBx attention handling\n");
			   "sending phony dcbx set command to trigger DCBx attention handling\n");
		mfw_rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt,
		rc = qed_mcp_cmd(p_hwfn, p_hwfn->p_main_ptt,
				 DRV_MSG_CODE_SET_DCBX,
				 DRV_MSG_CODE_SET_DCBX,
				 1 << DRV_MB_PARAM_DCBX_NOTIFY_SHIFT,
				 1 << DRV_MB_PARAM_DCBX_NOTIFY_SHIFT,
				     &load_code, &param);
				 &resp, &param);
		if (mfw_rc) {
		if (rc) {
			DP_NOTICE(p_hwfn,
			DP_NOTICE(p_hwfn,
				  "Failed to send DCBX attention request\n");
				  "Failed to send DCBX attention request\n");
			return mfw_rc;
			return rc;
		}
		}


		p_hwfn->hw_init_done = true;
		p_hwfn->hw_init_done = true;
@@ -2261,6 +2278,12 @@ int qed_hw_init(struct qed_dev *cdev, struct qed_hw_init_params *p_params)
	}
	}


	return 0;
	return 0;

load_err:
	/* The MFW load lock should be released also when initialization fails.
	 */
	qed_mcp_load_done(p_hwfn, p_hwfn->p_main_ptt);
	return rc;
}
}


#define QED_HW_STOP_RETRY_LIMIT (10)
#define QED_HW_STOP_RETRY_LIMIT (10)
@@ -2273,6 +2296,9 @@ static void qed_hw_timers_stop(struct qed_dev *cdev,
	qed_wr(p_hwfn, p_ptt, TM_REG_PF_ENABLE_CONN, 0x0);
	qed_wr(p_hwfn, p_ptt, TM_REG_PF_ENABLE_CONN, 0x0);
	qed_wr(p_hwfn, p_ptt, TM_REG_PF_ENABLE_TASK, 0x0);
	qed_wr(p_hwfn, p_ptt, TM_REG_PF_ENABLE_TASK, 0x0);


	if (cdev->recov_in_prog)
		return;

	for (i = 0; i < QED_HW_STOP_RETRY_LIMIT; i++) {
	for (i = 0; i < QED_HW_STOP_RETRY_LIMIT; i++) {
		if ((!qed_rd(p_hwfn, p_ptt,
		if ((!qed_rd(p_hwfn, p_ptt,
			     TM_REG_PF_SCAN_ACTIVE_CONN)) &&
			     TM_REG_PF_SCAN_ACTIVE_CONN)) &&
@@ -2335,6 +2361,7 @@ int qed_hw_stop(struct qed_dev *cdev)
		p_hwfn->hw_init_done = false;
		p_hwfn->hw_init_done = false;


		/* Send unload command to MCP */
		/* Send unload command to MCP */
		if (!cdev->recov_in_prog) {
			rc = qed_mcp_unload_req(p_hwfn, p_ptt);
			rc = qed_mcp_unload_req(p_hwfn, p_ptt);
			if (rc) {
			if (rc) {
				DP_NOTICE(p_hwfn,
				DP_NOTICE(p_hwfn,
@@ -2342,6 +2369,7 @@ int qed_hw_stop(struct qed_dev *cdev)
					  rc);
					  rc);
				rc2 = -EINVAL;
				rc2 = -EINVAL;
			}
			}
		}


		qed_slowpath_irq_sync(p_hwfn);
		qed_slowpath_irq_sync(p_hwfn);


@@ -2382,7 +2410,8 @@ int qed_hw_stop(struct qed_dev *cdev)
		qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_DB_ENABLE, 0);
		qed_wr(p_hwfn, p_ptt, DORQ_REG_PF_DB_ENABLE, 0);
		qed_wr(p_hwfn, p_ptt, QM_REG_PF_EN, 0);
		qed_wr(p_hwfn, p_ptt, QM_REG_PF_EN, 0);


		qed_mcp_unload_done(p_hwfn, p_ptt);
		if (!cdev->recov_in_prog) {
			rc = qed_mcp_unload_done(p_hwfn, p_ptt);
			if (rc) {
			if (rc) {
				DP_NOTICE(p_hwfn,
				DP_NOTICE(p_hwfn,
					  "Failed sending a UNLOAD_DONE command. rc = %d.\n",
					  "Failed sending a UNLOAD_DONE command. rc = %d.\n",
@@ -2390,19 +2419,22 @@ int qed_hw_stop(struct qed_dev *cdev)
				rc2 = -EINVAL;
				rc2 = -EINVAL;
			}
			}
		}
		}
	}


	if (IS_PF(cdev)) {
	if (IS_PF(cdev) && !cdev->recov_in_prog) {
		p_hwfn = QED_LEADING_HWFN(cdev);
		p_hwfn = QED_LEADING_HWFN(cdev);
		p_ptt = QED_LEADING_HWFN(cdev)->p_main_ptt;
		p_ptt = QED_LEADING_HWFN(cdev)->p_main_ptt;


		/* Disable DMAE in PXP - in CMT, this should only be done for
		/* Clear the PF's internal FID_enable in the PXP.
		 * first hw-function, and only after all transactions have
		 * In CMT this should only be done for first hw-function, and
		 * stopped for all active hw-functions.
		 * only after all transactions have stopped for all active
		 * hw-functions.
		 */
		 */
		rc = qed_change_pci_hwfn(p_hwfn, p_ptt, false);
		rc = qed_pglueb_set_pfid_enable(p_hwfn, p_ptt, false);
		if (rc) {
		if (rc) {
			DP_NOTICE(p_hwfn,
			DP_NOTICE(p_hwfn,
				  "qed_change_pci_hwfn failed. rc = %d.\n", rc);
				  "qed_pglueb_set_pfid_enable() failed. rc = %d.\n",
				  rc);
			rc2 = -EINVAL;
			rc2 = -EINVAL;
		}
		}
	}
	}
@@ -2502,9 +2534,8 @@ static void qed_hw_hwfn_prepare(struct qed_hwfn *p_hwfn)
		       PGLUE_B_REG_PGL_ADDR_94_F0_BB, 0);
		       PGLUE_B_REG_PGL_ADDR_94_F0_BB, 0);
	}
	}


	/* Clean Previous errors if such exist */
	/* Clean previous pglue_b errors if such exist */
	qed_wr(p_hwfn, p_hwfn->p_main_ptt,
	qed_pglueb_clear_err(p_hwfn, p_hwfn->p_main_ptt);
	       PGLUE_B_REG_WAS_ERROR_PF_31_0_CLR, 1 << p_hwfn->abs_pf_id);


	/* enable internal target-read */
	/* enable internal target-read */
	qed_wr(p_hwfn, p_hwfn->p_main_ptt,
	qed_wr(p_hwfn, p_hwfn->p_main_ptt,
@@ -3440,6 +3471,7 @@ static int qed_hw_prepare_single(struct qed_hwfn *p_hwfn,
				 void __iomem *p_doorbells,
				 void __iomem *p_doorbells,
				 enum qed_pci_personality personality)
				 enum qed_pci_personality personality)
{
{
	struct qed_dev *cdev = p_hwfn->cdev;
	int rc = 0;
	int rc = 0;


	/* Split PCI bars evenly between hwfns */
	/* Split PCI bars evenly between hwfns */
@@ -3492,7 +3524,7 @@ static int qed_hw_prepare_single(struct qed_hwfn *p_hwfn,
	/* Sending a mailbox to the MFW should be done after qed_get_hw_info()
	/* Sending a mailbox to the MFW should be done after qed_get_hw_info()
	 * is called as it sets the ports number in an engine.
	 * is called as it sets the ports number in an engine.
	 */
	 */
	if (IS_LEAD_HWFN(p_hwfn)) {
	if (IS_LEAD_HWFN(p_hwfn) && !cdev->recov_in_prog) {
		rc = qed_mcp_initiate_pf_flr(p_hwfn, p_hwfn->p_main_ptt);
		rc = qed_mcp_initiate_pf_flr(p_hwfn, p_hwfn->p_main_ptt);
		if (rc)
		if (rc)
			DP_NOTICE(p_hwfn, "Failed to initiate PF FLR\n");
			DP_NOTICE(p_hwfn, "Failed to initiate PF FLR\n");
+12 −0
Original line number Original line Diff line number Diff line
@@ -472,6 +472,18 @@ int qed_get_queue_coalesce(struct qed_hwfn *p_hwfn, u16 *coal, void *handle);
int
int
qed_set_queue_coalesce(u16 rx_coal, u16 tx_coal, void *p_handle);
qed_set_queue_coalesce(u16 rx_coal, u16 tx_coal, void *p_handle);


/**
 * @brief qed_pglueb_set_pfid_enable - Enable or disable PCI BUS MASTER
 *
 * @param p_hwfn
 * @param p_ptt
 * @param b_enable - true/false
 *
 * @return int
 */
int qed_pglueb_set_pfid_enable(struct qed_hwfn *p_hwfn,
			       struct qed_ptt *p_ptt, bool b_enable);

/**
/**
 * @brief db_recovery_add - add doorbell information to the doorbell
 * @brief db_recovery_add - add doorbell information to the doorbell
 * recovery mechanism.
 * recovery mechanism.
+1 −1
Original line number Original line Diff line number Diff line
@@ -12827,7 +12827,7 @@ enum MFW_DRV_MSG_TYPE {
	MFW_DRV_MSG_LLDP_DATA_UPDATED,
	MFW_DRV_MSG_LLDP_DATA_UPDATED,
	MFW_DRV_MSG_DCBX_REMOTE_MIB_UPDATED,
	MFW_DRV_MSG_DCBX_REMOTE_MIB_UPDATED,
	MFW_DRV_MSG_DCBX_OPERATIONAL_MIB_UPDATED,
	MFW_DRV_MSG_DCBX_OPERATIONAL_MIB_UPDATED,
	MFW_DRV_MSG_RESERVED4,
	MFW_DRV_MSG_ERROR_RECOVERY,
	MFW_DRV_MSG_BW_UPDATE,
	MFW_DRV_MSG_BW_UPDATE,
	MFW_DRV_MSG_S_TAG_UPDATE,
	MFW_DRV_MSG_S_TAG_UPDATE,
	MFW_DRV_MSG_GET_LAN_STATS,
	MFW_DRV_MSG_GET_LAN_STATS,
+11 −0
Original line number Original line Diff line number Diff line
@@ -703,6 +703,17 @@ static int qed_dmae_execute_command(struct qed_hwfn *p_hwfn,
	int qed_status = 0;
	int qed_status = 0;
	u32 offset = 0;
	u32 offset = 0;


	if (p_hwfn->cdev->recov_in_prog) {
		DP_VERBOSE(p_hwfn,
			   NETIF_MSG_HW,
			   "Recovery is in progress. Avoid DMAE transaction [{src: addr 0x%llx, type %d}, {dst: addr 0x%llx, type %d}, size %d].\n",
			   src_addr, src_type, dst_addr, dst_type,
			   size_in_dwords);

		/* Let the flow complete w/o any error handling */
		return 0;
	}

	qed_dmae_opcode(p_hwfn,
	qed_dmae_opcode(p_hwfn,
			(src_type == QED_DMAE_ADDRESS_GRC),
			(src_type == QED_DMAE_ADDRESS_GRC),
			(dst_type == QED_DMAE_ADDRESS_GRC),
			(dst_type == QED_DMAE_ADDRESS_GRC),
Loading