Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 9783ab40 authored by Bryan O'Sullivan's avatar Bryan O'Sullivan Committed by Roland Dreier
Browse files

IB/ipath: Improve handling and reporting of parity errors



Mostly cleanup.

Signed-off-by: default avatarDave Olson <dave.olson@qlogic.com>
Signed-off-by: default avatarBryan O'Sullivan <bryan.osullivan@qlogic.com>
Signed-off-by: default avatarRoland Dreier <rolandd@cisco.com>
parent 820054b7
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
@@ -605,8 +605,9 @@ static void __devexit cleanup_device(struct ipath_devdata *dd)

		ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
			   dd->ipath_pageshadow);
		vfree(dd->ipath_pageshadow);
		tmpp = dd->ipath_pageshadow;
		dd->ipath_pageshadow = NULL;
		vfree(tmpp);
	}

	/*
+4 −0
Original line number Diff line number Diff line
@@ -626,6 +626,10 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd)
	} else
		memcpy(dd->ipath_serial, ifp->if_serial,
		       sizeof ifp->if_serial);
	if (!strstr(ifp->if_comment, "Tested successfully"))
		ipath_dev_err(dd, "Board SN %s did not pass functional "
			"test: %s\n", dd->ipath_serial,
			ifp->if_comment);

	ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n",
		   (unsigned long long) be64_to_cpu(dd->ipath_guid));
+83 −55
Original line number Diff line number Diff line
@@ -284,6 +284,14 @@ static const struct ipath_cregs ipath_ht_cregs = {
#define INFINIPATH_EXTS_MEMBIST_ENDTEST     0x0000000000004000
#define INFINIPATH_EXTS_MEMBIST_CORRECT     0x0000000000008000


/* TID entries (memory), HT-only */
#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL	/* 40 bits valid */
#define INFINIPATH_RT_VALID 0x8000000000000000ULL
#define INFINIPATH_RT_ADDR_SHIFT 0
#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL
#define INFINIPATH_RT_BUFSIZE_SHIFT 48

/*
 * masks and bits that are different in different chips, or present only
 * in one
@@ -402,6 +410,14 @@ static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = {
	INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
};

#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \
		        INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
		        << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)
#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \
			  << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)

static int ipath_ht_txe_recover(struct ipath_devdata *);

/**
 * ipath_ht_handle_hwerrors - display hardware errors.
 * @dd: the infinipath device
@@ -450,12 +466,11 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,

	/*
	 * make sure we get this much out, unless told to be quiet,
	 * it's a parity error we may recover from,
	 * or it's occurred within the last 5 seconds
	 */
	if ((hwerrs & ~(dd->ipath_lasthwerror |
			((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
			  INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
			<< INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT))) ||
	if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY |
		RXE_EAGER_PARITY)) ||
		(ipath_debug & __IPATH_VERBDBG))
		dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
			 "(cleared)\n", (unsigned long long) hwerrs);
@@ -467,7 +482,7 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
			      (hwerrs & ~dd->ipath_hwe_bitsextant));

	ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
	if (ctrl & INFINIPATH_C_FREEZEMODE) {
	if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) {
		/*
		 * parity errors in send memory are recoverable,
		 * just cancel the send (if indicated in * sendbuffererror),
@@ -476,50 +491,14 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
		 * occur if a processor speculative read is done to the PIO
		 * buffer while we are sending a packet, for example.
		 */
		if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
			       INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
			      << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
			ipath_stats.sps_txeparity++;
			ipath_dbg("Recovering from TXE parity error (%llu), "
			    	  "hwerrstatus=%llx\n",
				  (unsigned long long) ipath_stats.sps_txeparity,
				  (unsigned long long) hwerrs);
			ipath_disarm_senderrbufs(dd);
			hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
				     INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
				    << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
			if (!hwerrs) { /* else leave in freeze mode */
				ipath_write_kreg(dd,
						 dd->ipath_kregs->kr_control,
						 dd->ipath_control);
				return;
			}
		}
		if (hwerrs) {
			/*
			 * if any set that we aren't ignoring; only
			 * make the complaint once, in case it's stuck
			 * or recurring, and we get here multiple
			 * times.
			 */
			if (dd->ipath_flags & IPATH_INITTED) {
				ipath_dev_err(dd, "Fatal Hardware Error (freeze "
					      "mode), no longer usable, SN %.16s\n",
						  dd->ipath_serial);
				isfatal = 1;
			}
			*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
			/* mark as having had error */
			*dd->ipath_statusp |= IPATH_STATUS_HWERROR;
			/*
			 * mark as not usable, at a minimum until driver
			 * is reloaded, probably until reboot, since no
			 * other reset is possible.
			 */
			dd->ipath_flags &= ~IPATH_INITTED;
		} else {
			ipath_dbg("Clearing freezemode on ignored hardware "
				  "error\n");
		if ((hwerrs & TXE_PIO_PARITY) && ipath_ht_txe_recover(dd))
			hwerrs &= ~TXE_PIO_PARITY;
		if (hwerrs & RXE_EAGER_PARITY)
			ipath_dev_err(dd, "RXE parity, Eager TID error is not "
				"recoverable\n");
		if (!hwerrs) {
			ipath_dbg("Clearing freezemode on ignored or "
				  "recovered hardware error\n");
			ctrl &= ~INFINIPATH_C_FREEZEMODE;
			ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
					 ctrl);
@@ -587,7 +566,32 @@ static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
				 dd->ipath_hwerrmask);
	}

	if (hwerrs) {
		/*
		 * if any set that we aren't ignoring; only
		 * make the complaint once, in case it's stuck
		 * or recurring, and we get here multiple
		 * times.
		 */
		ipath_dev_err(dd, "%s hardware error\n", msg);
		if (dd->ipath_flags & IPATH_INITTED) {
			ipath_dev_err(dd, "Fatal Hardware Error (freeze "
					  "mode), no longer usable, SN %.16s\n",
					  dd->ipath_serial);
			isfatal = 1;
		}
		*dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
		/* mark as having had error */
		*dd->ipath_statusp |= IPATH_STATUS_HWERROR;
		/*
		 * mark as not usable, at a minimum until driver
		 * is reloaded, probably until reboot, since no
		 * other reset is possible.
		 */
		dd->ipath_flags &= ~IPATH_INITTED;
	}
	else
		*msg = 0; /* recovered from all of them */
	if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg)
		/*
		 * for status file; if no trailing brace is copied,
@@ -658,7 +662,8 @@ static int ipath_ht_boardname(struct ipath_devdata *dd, char *name,
	if (n)
		snprintf(name, namelen, "%s", n);

	if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 || dd->ipath_minrev > 3)) {
	if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 ||
		dd->ipath_minrev > 3)) {
		/*
		 * This version of the driver only supports Rev 3.2 and 3.3
		 */
@@ -1163,6 +1168,8 @@ static void ipath_ht_init_hwerrors(struct ipath_devdata *dd)

	if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST))
		ipath_dev_err(dd, "MemBIST did not complete!\n");
	if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT)
		ipath_dbg("MemBIST corrected\n");

	ipath_check_htlink(dd);

@@ -1366,6 +1373,9 @@ static void ipath_ht_put_tid(struct ipath_devdata *dd,
			     u64 __iomem *tidptr, u32 type,
			     unsigned long pa)
{
	if (!dd->ipath_kregbase)
		return;

	if (pa != dd->ipath_tidinvalid) {
		if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) {
			dev_info(&dd->pcidev->dev,
@@ -1382,10 +1392,10 @@ static void ipath_ht_put_tid(struct ipath_devdata *dd,
			pa |= lenvalid | INFINIPATH_RT_VALID;
		}
	}
	if (dd->ipath_kregbase)
	writeq(pa, tidptr);
}


/**
 * ipath_ht_clear_tid - clear all TID entries for a port, expected and eager
 * @dd: the infinipath device
@@ -1528,6 +1538,24 @@ static int ipath_ht_early_init(struct ipath_devdata *dd)
	return 0;
}


static int ipath_ht_txe_recover(struct ipath_devdata *dd)
{
	int cnt = ++ipath_stats.sps_txeparity;
	if (cnt >= IPATH_MAX_PARITY_ATTEMPTS)  {
		if (cnt == IPATH_MAX_PARITY_ATTEMPTS)
			ipath_dev_err(dd,
				"Too many attempts to recover from "
				"TXE parity, giving up\n");
		return 0;
	}
	dev_info(&dd->pcidev->dev,
		"Recovering from TXE PIO parity error\n");
	ipath_disarm_senderrbufs(dd, 1);
	return 1;
}


/**
 * ipath_init_ht_get_base_info - set chip-specific flags for user code
 * @dd: the infinipath device
+39 −19
Original line number Diff line number Diff line
@@ -321,6 +321,12 @@ static const struct ipath_hwerror_msgs ipath_6120_hwerror_msgs[] = {
	INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
};

#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \
		        INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
		        << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)

static int ipath_pe_txe_recover(struct ipath_devdata *);

/**
 * ipath_pe_handle_hwerrors - display hardware errors.
 * @dd: the infinipath device
@@ -394,25 +400,8 @@ static void ipath_pe_handle_hwerrors(struct ipath_devdata *dd, char *msg,
		 * occur if a processor speculative read is done to the PIO
		 * buffer while we are sending a packet, for example.
		 */
		if (hwerrs & ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
			       INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
			      << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)) {
			ipath_stats.sps_txeparity++;
			ipath_dbg("Recovering from TXE parity error (%llu), "
			    	  "hwerrstatus=%llx\n",
				  (unsigned long long) ipath_stats.sps_txeparity,
				  (unsigned long long) hwerrs);
			ipath_disarm_senderrbufs(dd);
			hwerrs &= ~((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF |
				     INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC)
				    << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT);
			if (!hwerrs) { /* else leave in freeze mode */
				ipath_write_kreg(dd,
						 dd->ipath_kregs->kr_control,
						 dd->ipath_control);
			    return;
			}
		}
		if ((hwerrs & TXE_PIO_PARITY) && ipath_pe_txe_recover(dd))
			hwerrs &= ~TXE_PIO_PARITY;
		if (hwerrs) {
			/*
			 * if any set that we aren't ignoring only make the
@@ -581,6 +570,8 @@ static void ipath_pe_init_hwerrors(struct ipath_devdata *dd)

	if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST))
		ipath_dev_err(dd, "MemBIST did not complete!\n");
	if (extsval & INFINIPATH_EXTS_MEMBIST_FOUND)
		ipath_dbg("MemBIST corrected\n");

	val = ~0ULL;	/* barring bugs, all hwerrors become interrupts, */

@@ -1330,6 +1321,35 @@ static void ipath_pe_free_irq(struct ipath_devdata *dd)
	dd->ipath_irq = 0;
}

/*
 * On platforms using this chip, and not having ordered WC stores, we
 * can get TXE parity errors due to speculative reads to the PIO buffers,
 * and this, due to a chip bug can result in (many) false parity error
 * reports.  So it's a debug print on those, and an info print on systems
 * where the speculative reads don't occur.
 * Because we can get lots of false errors, we have no upper limit
 * on recovery attempts on those platforms.
 */
static int ipath_pe_txe_recover(struct ipath_devdata *dd)
{
	if (ipath_unordered_wc())
		ipath_dbg("Recovering from TXE PIO parity error\n");
	else {
		int cnt = ++ipath_stats.sps_txeparity;
		if (cnt >= IPATH_MAX_PARITY_ATTEMPTS)  {
			if (cnt == IPATH_MAX_PARITY_ATTEMPTS)
				ipath_dev_err(dd,
					"Too many attempts to recover from "
					"TXE parity, giving up\n");
			return 0;
		}
		dev_info(&dd->pcidev->dev,
			"Recovering from TXE PIO parity error\n");
	}
	ipath_disarm_senderrbufs(dd, 1);
	return 1;
}

/**
 * ipath_init_iba6120_funcs - set up the chip-specific function pointers
 * @dd: the infinipath device
+4 −0
Original line number Diff line number Diff line
@@ -590,6 +590,10 @@ static int init_housekeeping(struct ipath_devdata *dd,
		goto done;
	}


	/* clear diagctrl register, in case diags were running and crashed */
	ipath_write_kreg (dd, dd->ipath_kregs->kr_hwdiagctrl, 0);

	/* clear the initial reset flag, in case first driver load */
	ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
			 INFINIPATH_E_RESET);
Loading