Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 89d1e09b authored by Bryan O'Sullivan, committed by Roland Dreier
Browse files

IB/ipath: Fix and recover TXE piobuf and PBC parity errors



We can sometimes trigger parity errors due to processor speculative
reads to our write-combined memory (mostly seen on Woodcrest).   Add a
stats counter for these.

Factored out the sendbuffererror buffer cancellation code so it can be
used in the new handling; suppress likely subsequent error messages if
within two jiffies of the cancellation.

Also restore the two TXE lines that had been dropped from the
hwe_bitsextant initialization; this was noticed while debugging.

Signed-off-by: Bryan O'Sullivan <bryan.osullivan@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
parent 51084775
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -141,8 +141,9 @@ struct infinipath_stats {
	 * packets if ipath not configured, etc.)
	 */
	__u64 sps_krdrops;
	__u64 sps_txeparity; /* PIO buffer parity error, recovered */
	/* pad for future growth */
	__u64 __sps_pad[46];
	__u64 __sps_pad[45];
};

/*
+31 −1
Original line number Diff line number Diff line
@@ -451,7 +451,10 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
	 * make sure we get this much out, unless told to be quiet,
	 * or it's occurred within the last 5 seconds
	 */
	if ((hwerrs & ~dd->ipath_lasthwerror) ||
	if ((hwerrs & ~(dd->ipath_lasthwerror |
			((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
			  INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
			<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) ||
	    (ipath_debug & __IPATH_VERBDBG))
		dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
			 "(cleared)\n", (unsigned long long) hwerrs);
@@ -464,6 +467,33 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,

	ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
	if (ctrl & INFINIPATH_C_FREEZEMODE) {
		/*
		 * parity errors in send memory are recoverable,
		 * just cancel the send (if indicated in sendbuffererror),
		 * count the occurrence, unfreeze (if no other handled
		 * hardware error bits are set), and continue. They can
		 * occur if a processor speculative read is done to the PIO
		 * buffer while we are sending a packet, for example.
		 */
		if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
			       INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
			      << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
			ipath_stats.sps_txeparity++;
			ipath_dbg("Recovering from TXE parity error (%llu), "
			    	  "hwerrstatus=%llx\n",
				  (unsigned long long) ipath_stats.sps_txeparity,
				  (unsigned long long) hwerrs);
			ipath_disarm_senderrbufs(dd);
			hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
				     INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
				    << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
			if (!hwerrs) { /* else leave in freeze mode */
				ipath_write_kreg(dd,
						 dd->ipath_kregs->kr_control,
						 dd->ipath_control);
				return;
			}
		}
		if (hwerrs) {
			/*
			 * if any set that we aren't ignoring; only
+34 −3
Original line number Diff line number Diff line
@@ -370,7 +370,10 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
	 * make sure we get this much out, unless told to be quiet,
	 * or it's occurred within the last 5 seconds
	 */
	if ((hwerrs & ~dd->ipath_lasthwerror) ||
	if ((hwerrs & ~(dd->ipath_lasthwerror |
			((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
			  INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
			 << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) ||
	    (ipath_debug & __IPATH_VERBDBG))
		dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
			 "(cleared)\n", (unsigned long long) hwerrs);
@@ -383,6 +386,33 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,

	ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
	if (ctrl & INFINIPATH_C_FREEZEMODE) {
		/*
		 * parity errors in send memory are recoverable,
		 * just cancel the send (if indicated in sendbuffererror),
		 * count the occurrence, unfreeze (if no other handled
		 * hardware error bits are set), and continue. They can
		 * occur if a processor speculative read is done to the PIO
		 * buffer while we are sending a packet, for example.
		 */
		if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
			       INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
			      << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
			ipath_stats.sps_txeparity++;
			ipath_dbg("Recovering from TXE parity error (%llu), "
			    	  "hwerrstatus=%llx\n",
				  (unsigned long long) ipath_stats.sps_txeparity,
				  (unsigned long long) hwerrs);
			ipath_disarm_senderrbufs(dd);
			hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
				     INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
				    << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
			if (!hwerrs) { /* else leave in freeze mode */
				ipath_write_kreg(dd,
						 dd->ipath_kregs->kr_control,
						 dd->ipath_control);
			    return;
			}
		}
		if (hwerrs) {
			/*
			 * if any set that we aren't ignoring only make the
@@ -406,9 +436,8 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
		} else {
			ipath_dbg("Clearing freezemode on ignored hardware "
				  "error\n");
			ctrl &= ~INFINIPATH_C_FREEZEMODE;
			ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
					 ctrl);
			   		 dd->ipath_control);
		}
	}

@@ -880,6 +909,8 @@ static void ipath_init_pe_variables(struct ipath_devdata *dd)
	dd->ipath_hwe_bitsextant =
		(INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
		 INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
		(INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
		 INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
		(INFINIPATH_HWE_PCIEMEMPARITYERR_MASK <<
		 INFINIPATH_HWE_PCIEMEMPARITYERR_SHIFT) |
		INFINIPATH_HWE_PCIE1PLLFAILED |
+53 −45
Original line number Diff line number Diff line
@@ -37,6 +37,50 @@
#include "ipath_verbs.h"
#include "ipath_common.h"

/*
 * Cancel (disarm) every PIO send buffer whose bit is set in the chip's
 * sendbuffererror registers, so that the buffer can be re-used.  Meant
 * to be called whenever an error might be tied to a particular PIO
 * buffer.  When no error bit is set this function changes nothing.
 *
 * @dd: the device whose send buffers are examined
 */
void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
{
	/*
	 * NOTE(review): the 64-bit register values are stored in
	 * unsigned long words, so this presumably assumes a 64-bit
	 * unsigned long -- confirm the driver is 64-bit only.
	 */
	unsigned long errbits[4];
	u32 nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
	/* the upper two error words are only consulted above 128 buffers */
	int wide = nbufs > 128;
	int bufno;

	/*
	 * Finding nothing set here is not unusual: the buffers may
	 * already have been cancelled as part of hardware error
	 * handling.  Read these registers before errorclear is written.
	 */
	errbits[0] = ipath_read_kreg64(dd,
				       dd->ipath_kregs->kr_sendbuffererror);
	errbits[1] = ipath_read_kreg64(dd,
				       dd->ipath_kregs->kr_sendbuffererror + 1);
	if (wide) {
		errbits[2] = ipath_read_kreg64(dd,
			dd->ipath_kregs->kr_sendbuffererror + 2);
		errbits[3] = ipath_read_kreg64(dd,
			dd->ipath_kregs->kr_sendbuffererror + 3);
	}

	/* nothing flagged: leave ipath_lastcancel untouched */
	if (!(errbits[0] || errbits[1] ||
	      (wide && (errbits[2] || errbits[3]))))
		return;

	if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG)) {
		__IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,
				  "SendbufErrs %lx %lx", errbits[0],
				  errbits[1]);
		/* upper words are only dumped when packet debug is on */
		if ((ipath_debug & __IPATH_PKTDBG) && wide)
			printk(" %lx %lx ", errbits[2], errbits[3]);
		printk("\n");
	}

	/* disarm each flagged buffer individually */
	for (bufno = 0; bufno < nbufs; bufno++) {
		if (!test_bit(bufno, errbits))
			continue;
		ipath_disarm_piobufs(dd, bufno, 1);
	}

	/* no armlaunch for a bit */
	dd->ipath_lastcancel = jiffies + 3;
}


/* These are all rcv-related errors which we want to count for stats */
#define E_SUM_PKTERRS \
	(INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
@@ -68,53 +112,9 @@

static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
{
	unsigned long sbuf[4];
	u64 ignore_this_time = 0;
	u32 piobcnt;

	/* if possible that sendbuffererror could be valid */
	piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
	/* read these before writing errorclear */
	sbuf[0] = ipath_read_kreg64(
		dd, dd->ipath_kregs->kr_sendbuffererror);
	sbuf[1] = ipath_read_kreg64(
		dd, dd->ipath_kregs->kr_sendbuffererror + 1);
	if (piobcnt > 128) {
		sbuf[2] = ipath_read_kreg64(
			dd, dd->ipath_kregs->kr_sendbuffererror + 2);
		sbuf[3] = ipath_read_kreg64(
			dd, dd->ipath_kregs->kr_sendbuffererror + 3);
	}

	if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
		int i;

		ipath_cdbg(PKT, "SendbufErrs %lx %lx ", sbuf[0], sbuf[1]);
		if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
			printk("%lx %lx ", sbuf[2], sbuf[3]);
		for (i = 0; i < piobcnt; i++) {
			if (test_bit(i, sbuf)) {
				u32 __iomem *piobuf;
				if (i < dd->ipath_piobcnt2k)
					piobuf = (u32 __iomem *)
						(dd->ipath_pio2kbase +
						 i * dd->ipath_palign);
				else
					piobuf = (u32 __iomem *)
						(dd->ipath_pio4kbase +
						 (i - dd->ipath_piobcnt2k) *
						 dd->ipath_4kalign);

				ipath_cdbg(PKT,
					   "PIObuf[%u] @%p pbc is %x; ",
					   i, piobuf, readl(piobuf));

				ipath_disarm_piobufs(dd, i, 1);
			}
		}
		if (ipath_debug & __IPATH_PKTDBG)
			printk("\n");
	}
	ipath_disarm_senderrbufs(dd);
	if ((errs & E_SUM_LINK_PKTERRS) &&
	    !(dd->ipath_flags & IPATH_LINKACTIVE)) {
		/*
@@ -554,6 +554,14 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
			~(INFINIPATH_E_HARDWARE |
			  INFINIPATH_E_IBSTATUSCHANGED);
	}

	/* likely due to cancel, so suppress */
	if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) &&
		dd->ipath_lastcancel > jiffies) {
		ipath_dbg("Suppressed armlaunch/spktlen after error send cancel\n");
		errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN);
	}

	if (!errs)
		return 0;

+4 −2
Original line number Diff line number Diff line
@@ -427,6 +427,9 @@ struct ipath_devdata {
	unsigned long ipath_rcvctrl;
	/* shadow kr_sendctrl */
	unsigned long ipath_sendctrl;
	/* ports waiting for PIOavail intr */
	unsigned long ipath_portpiowait;
	unsigned long ipath_lastcancel; /* to not count armlaunch after cancel */

	/* value we put in kr_rcvhdrcnt */
	u32 ipath_rcvhdrcnt;
@@ -490,8 +493,6 @@ struct ipath_devdata {
	u32 ipath_htwidth;
	/* HT speed (200,400,800,1000) from HT config */
	u32 ipath_htspeed;
	/* ports waiting for PIOavail intr */
	unsigned long ipath_portpiowait;
	/*
	 * number of sequential ibcstatus change for polling active/quiet
	 * (i.e., link not coming up).
@@ -585,6 +586,7 @@ int ipath_enable_wc(struct ipath_devdata *dd);
void ipath_disable_wc(struct ipath_devdata *dd);
int ipath_count_units(int *npresentp, int *nupp, u32 *maxportsp);
void ipath_shutdown_device(struct ipath_devdata *);
void ipath_disarm_senderrbufs(struct ipath_devdata *);

struct file_operations;
int ipath_cdev_init(int minor, char *name, struct file_operations *fops,