Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 2b83d2f9 authored by Subhash Jadavani's avatar Subhash Jadavani
Browse files

scsi: ufs: add error recovery after DL NAC error



Some vendor's UFS device sends back to back NACs for the DL data frames
causing the host controller to raise the DFES error status. Sometimes
such UFS devices send back to back NAC without waiting for new
retransmitted DL frame from the host and in such cases it might be possible
the Host UniPro goes into bad state without raising the DFES error
interrupt. If this happens then all the pending commands would timeout
only after respective SW command (which is generally too large).

This change workarounds such device behaviour like this:
- As soon as SW sees the DL NAC error, it would schedule the error handler
- Error handler would sleep for 50ms to see if there any fatal errors
  raised by UFS controller.
   - If there are fatal errors then SW does normal error recovery.
   - If there are no fatal errors then SW sends the NOP command to device
     to check if link is alive.
       - If NOP command times out, SW does normal error recovery
       - If NOP command succeed, skip the error handling.

If DL NAC error is seen multiple times with some vendor's UFS devices then
enable this quirk to initiate quick error recovery and also silence related
error logs to reduce spamming of kernel logs.

Change-Id: Id2f0c05c414d700ba923513f8c9e3d1e6a8a749a
Signed-off-by: default avatarSubhash Jadavani <subhashj@codeaurora.org>
parent c6029bb6
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -18,6 +18,8 @@
static struct ufs_card_fix ufs_fixups[] = {
	/* UFS cards deviations table */
	UFS_FIX(UFS_VENDOR_SAMSUNG, UFS_ANY_MODEL, UFS_DEVICE_NO_VCCQ),
	UFS_FIX(UFS_VENDOR_SAMSUNG, UFS_ANY_MODEL,
		UFS_DEVICE_QUIRK_RECOVERY_FROM_DL_NAC_ERRORS),
	END_FIX
};

+25 −0
Original line number Diff line number Diff line
@@ -74,6 +74,31 @@ struct ufs_card_fix {
 */
#define UFS_DEVICE_NO_VCCQ (1 << 1)

/*
 * Some vendor's UFS device sends back to back NACs for the DL data frames
 * causing the host controller to raise the DFES error status. Sometimes
 * such UFS devices send back to back NAC without waiting for new
 * retransmitted DL frame from the host and in such cases it might be possible
 * the Host UniPro goes into bad state without raising the DFES error
 * interrupt. If this happens then all the pending commands would timeout
 * only after respective SW command (which is generally too large).
 *
 * We can workaround such device behaviour like this:
 * - As soon as SW sees the DL NAC error, it should schedule the error handler
 * - Error handler would sleep for 50ms to see if there are any fatal errors
 *   raised by UFS controller.
 *    - If there are fatal errors then SW does normal error recovery.
 *    - If there are no fatal errors then SW sends the NOP command to device
 *      to check if link is alive.
 *        - If NOP command times out, SW does normal error recovery
 *        - If NOP command succeed, skip the error handling.
 *
 * If DL NAC error is seen multiple times with some vendor's UFS devices then
 * enable this quirk to initiate quick error recovery and also silence related
 * error logs to reduce spamming of kernel logs.
 */
#define UFS_DEVICE_QUIRK_RECOVERY_FROM_DL_NAC_ERRORS (1 << 2)

struct ufs_hba;
void ufs_advertise_fixup_device(struct ufs_hba *hba);
#endif /* UFS_QUIRKS_H_ */
+131 −9
Original line number Diff line number Diff line
@@ -210,9 +210,11 @@ enum {
/* UFSHCD UIC layer error flags */
enum {
	UFSHCD_UIC_DL_PA_INIT_ERROR = (1 << 0), /* Data link layer error */
	UFSHCD_UIC_NL_ERROR = (1 << 1), /* Network layer error */
	UFSHCD_UIC_TL_ERROR = (1 << 2), /* Transport Layer error */
	UFSHCD_UIC_DME_ERROR = (1 << 3), /* DME error */
	UFSHCD_UIC_DL_NAC_RECEIVED_ERROR = (1 << 1), /* Data link layer error */
	UFSHCD_UIC_DL_TCx_REPLAY_ERROR = (1 << 2), /* Data link layer error */
	UFSHCD_UIC_NL_ERROR = (1 << 3), /* Network layer error */
	UFSHCD_UIC_TL_ERROR = (1 << 4), /* Transport Layer error */
	UFSHCD_UIC_DME_ERROR = (1 << 5), /* DME error */
};

/* Interrupt configuration options */
@@ -4580,7 +4582,7 @@ ufshcd_transfer_rsp_status(struct ufs_hba *hba, struct ufshcd_lrb *lrbp)
		break;
	} /* end of switch */

	if (host_byte(result) != DID_OK) {
	if ((host_byte(result) != DID_OK) && !hba->silence_err_logs) {
		print_prdt = (ocs == OCS_INVALID_PRDT_ATTR ||
			ocs == OCS_MISMATCH_DATA_BUF_SIZE);
		ufshcd_print_trs(hba, 1 << lrbp->task_tag, print_prdt);
@@ -4994,6 +4996,101 @@ static void ufshcd_complete_requests(struct ufs_hba *hba)
	ufshcd_transfer_req_compl(hba);
	ufshcd_tmc_handler(hba);
}

/**
 * ufshcd_quirk_dl_nac_errors - This function checks if error handling is
 *				to recover from the DL NAC errors or not.
 * @hba: per-adapter instance
 *
 * Returns true if error handling is required, false otherwise
 */
static bool ufshcd_quirk_dl_nac_errors(struct ufs_hba *hba)
{
	unsigned long flags;
	bool err_handling = true;

	spin_lock_irqsave(hba->host->host_lock, flags);
	/*
	 * UFS_DEVICE_QUIRK_RECOVERY_FROM_DL_NAC_ERRORS only workaround the
	 * device fatal error and/or DL NAC & REPLAY timeout errors.
	 */
	if (hba->saved_err & (CONTROLLER_FATAL_ERROR | SYSTEM_BUS_FATAL_ERROR))
		goto out;

	if ((hba->saved_err & DEVICE_FATAL_ERROR) ||
	    ((hba->saved_err & UIC_ERROR) &&
	     (hba->saved_uic_err & UFSHCD_UIC_DL_TCx_REPLAY_ERROR))) {
		/*
		 * we have to do error recovery but atleast silence the error
		 * logs.
		 */
		hba->silence_err_logs = true;
		goto out;
	}

	if ((hba->saved_err & UIC_ERROR) &&
	    (hba->saved_uic_err & UFSHCD_UIC_DL_NAC_RECEIVED_ERROR)) {
		int err;
		/*
		 * wait for 50ms to see if we can get any other errors or not.
		 */
		spin_unlock_irqrestore(hba->host->host_lock, flags);
		msleep(50);
		spin_lock_irqsave(hba->host->host_lock, flags);

		/*
		 * now check if we have got any other severe errors other than
		 * DL NAC error?
		 */
		if ((hba->saved_err & INT_FATAL_ERRORS) ||
		    ((hba->saved_err & UIC_ERROR) &&
		    (hba->saved_uic_err & ~UFSHCD_UIC_DL_NAC_RECEIVED_ERROR))) {
			if (((hba->saved_err & INT_FATAL_ERRORS) ==
				DEVICE_FATAL_ERROR) || (hba->saved_uic_err &
					~UFSHCD_UIC_DL_NAC_RECEIVED_ERROR))
				hba->silence_err_logs = true;
			goto out;
		}

		/*
		 * As DL NAC is the only error received so far, send out NOP
		 * command to confirm if link is still active or not.
		 *   - If we don't get any response then do error recovery.
		 *   - If we get response then clear the DL NAC error bit.
		 */

		/* silence the error logs from NOP command */
		hba->silence_err_logs = true;
		spin_unlock_irqrestore(hba->host->host_lock, flags);
		err = ufshcd_verify_dev_init(hba);
		spin_lock_irqsave(hba->host->host_lock, flags);
		hba->silence_err_logs = false;

		if (err) {
			hba->silence_err_logs = true;
			goto out;
		}

		/* Link seems to be alive hence ignore the DL NAC errors */
		if (hba->saved_uic_err == UFSHCD_UIC_DL_NAC_RECEIVED_ERROR)
			hba->saved_err &= ~UIC_ERROR;
		/* clear NAC error */
		hba->saved_uic_err &= ~UFSHCD_UIC_DL_NAC_RECEIVED_ERROR;
		if (!hba->saved_uic_err) {
			err_handling = false;
			goto out;
		}
		/*
		 * there seems to be some errors other than NAC, so do error
		 * recovery
		 */
		hba->silence_err_logs = true;
	}
out:
	spin_unlock_irqrestore(hba->host->host_lock, flags);
	return err_handling;
}

/**
 * ufshcd_err_handler - handle UFS errors that require s/w attention
 * @work: pointer to work structure
@@ -5023,6 +5120,17 @@ static void ufshcd_err_handler(struct work_struct *work)
	/* Complete requests that have door-bell cleared by h/w */
	ufshcd_complete_requests(hba);

	if (hba->dev_quirks & UFS_DEVICE_QUIRK_RECOVERY_FROM_DL_NAC_ERRORS) {
		bool ret;

		spin_unlock_irqrestore(hba->host->host_lock, flags);
		/* release the lock as ufshcd_quirk_dl_nac_errors() may sleep */
		ret = ufshcd_quirk_dl_nac_errors(hba);
		spin_lock_irqsave(hba->host->host_lock, flags);
		if (!ret)
			goto skip_err_handling;
	}

	/*
	 * Dump controller state before resetting. Transfer requests state
	 * will be dump as part of the request completion.
@@ -5030,17 +5138,21 @@ static void ufshcd_err_handler(struct work_struct *work)
	if (hba->saved_err & (INT_FATAL_ERRORS | UIC_ERROR)) {
		dev_err(hba->dev, "%s: saved_err 0x%x saved_uic_err 0x%x",
			__func__, hba->saved_err, hba->saved_uic_err);
		if (!hba->silence_err_logs) {
			ufshcd_print_host_regs(hba);
			ufshcd_print_pwr_info(hba);
			ufshcd_print_tmrs(hba, hba->outstanding_tasks);
		}
	}

	if (hba->vops && hba->vops->crypto_engine_get_err)
		crypto_engine_err = hba->vops->crypto_engine_get_err(hba);

	if ((hba->saved_err & INT_FATAL_ERRORS) || crypto_engine_err ||
	    ((hba->saved_err & UIC_ERROR) &&
	    (hba->saved_uic_err & (UFSHCD_UIC_DL_PA_INIT_ERROR))))
	    (hba->saved_uic_err & (UFSHCD_UIC_DL_PA_INIT_ERROR |
				   UFSHCD_UIC_DL_NAC_RECEIVED_ERROR |
				   UFSHCD_UIC_DL_TCx_REPLAY_ERROR))))
		needs_reset = true;

	/*
@@ -5125,6 +5237,7 @@ skip_pending_xfer_clear:
			hba->vops->crypto_engine_reset_err(hba);
	}

skip_err_handling:
	if (!needs_reset) {
		hba->ufshcd_state = UFSHCD_STATE_OPERATIONAL;
		if (hba->saved_err || hba->saved_uic_err)
@@ -5132,6 +5245,7 @@ skip_pending_xfer_clear:
			    __func__, hba->saved_err, hba->saved_uic_err);
	}

	hba->silence_err_logs = false;
	ufshcd_clear_eh_in_progress(hba);
out:
	spin_unlock_irqrestore(hba->host->host_lock, flags);
@@ -5175,8 +5289,16 @@ static void ufshcd_update_uic_error(struct ufs_hba *hba)
	if (reg)
		ufshcd_update_uic_reg_hist(&hba->ufs_stats.dl_err, reg);

	if (reg & UIC_DATA_LINK_LAYER_ERROR_PA_INIT)
	if (reg & UIC_DATA_LINK_LAYER_ERROR_PA_INIT) {
		hba->uic_error |= UFSHCD_UIC_DL_PA_INIT_ERROR;
	} else if (hba->dev_quirks &
		   UFS_DEVICE_QUIRK_RECOVERY_FROM_DL_NAC_ERRORS) {
		if (reg & UIC_DATA_LINK_LAYER_ERROR_NAC_RECEIVED)
			hba->uic_error |=
				UFSHCD_UIC_DL_NAC_RECEIVED_ERROR;
		else if (reg & UIC_DATA_LINK_LAYER_ERROR_TCx_REPLAY_TIMEOUT)
			hba->uic_error |= UFSHCD_UIC_DL_TCx_REPLAY_ERROR;
	}

	/* UIC NL/TL/DME errors needs software retry */
	reg = ufshcd_readl(hba, REG_UIC_ERROR_CODE_NETWORK_LAYER);
+2 −0
Original line number Diff line number Diff line
@@ -173,6 +173,8 @@ enum {
#define UIC_DATA_LINK_LAYER_ERROR		UFS_BIT(31)
#define UIC_DATA_LINK_LAYER_ERROR_CODE_MASK	0x7FFF
#define UIC_DATA_LINK_LAYER_ERROR_PA_INIT	0x2000
#define UIC_DATA_LINK_LAYER_ERROR_NAC_RECEIVED	0x0001
#define UIC_DATA_LINK_LAYER_ERROR_TCx_REPLAY_TIMEOUT 0x0002

/* UECN - Host UIC Error Code Network Layer 40h */
#define UIC_NETWORK_LAYER_ERROR			UFS_BIT(31)
+1 −0
Original line number Diff line number Diff line
@@ -682,6 +682,7 @@ struct ufs_hba {
	u32 uic_error;
	u32 saved_err;
	u32 saved_uic_err;
	bool silence_err_logs;

	/* Device management request data */
	struct ufs_dev_cmd dev_cmd;