Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d0234215 authored by Brice Goglin, committed by David S. Miller
Browse files

myri10ge: improve parity error detection and recovery



Improve myri10ge parity error detection and recovery:
1) Don't restore PCI config space to a rebooted NIC until AFTER the
   host is quiescent.
2) Let myri10ge_close() know the NIC is dead, so it won't waste time
   waiting for a dead NIC to respond to MXGEFW_CMD_ETHERNET_DOWN.
3) When the NIC is quiet (link down, or otherwise idle link) use
   a PCI config space read to detect a rebooted NIC.  Otherwise
   we might never notice that a NIC rebooted.

Signed-off-by: Andrew Gallatin <gallatin@myri.com>
Signed-off-by: Brice Goglin <brice@myri.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent c9145a2d
Loading
Loading
Loading
Loading
+46 −17
Original line number Diff line number Diff line
@@ -75,7 +75,7 @@
#include "myri10ge_mcp.h"
#include "myri10ge_mcp_gen_header.h"

#define MYRI10GE_VERSION_STR "1.5.0-1.418"
#define MYRI10GE_VERSION_STR "1.5.0-1.432"

MODULE_DESCRIPTION("Myricom 10G driver (10GbE)");
MODULE_AUTHOR("Maintainer: help@myri.com");
@@ -188,6 +188,7 @@ struct myri10ge_slice_state {
	dma_addr_t fw_stats_bus;
	int watchdog_tx_done;
	int watchdog_tx_req;
	int watchdog_rx_done;
#ifdef CONFIG_MYRI10GE_DCA
	int cached_dca_tag;
	int cpu;
@@ -256,6 +257,7 @@ struct myri10ge_priv {
	u32 link_changes;
	u32 msg_enable;
	unsigned int board_number;
	int rebooted;
};

static char *myri10ge_fw_unaligned = "myri10ge_ethp_z8e.dat";
@@ -2552,17 +2554,22 @@ static int myri10ge_close(struct net_device *dev)
	netif_carrier_off(dev);

	netif_tx_stop_all_queues(dev);
	if (mgp->rebooted == 0) {
		old_down_cnt = mgp->down_cnt;
		mb();
	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0);
		status =
		    myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0);
		if (status)
		printk(KERN_ERR "myri10ge: %s: Couldn't bring down link\n",
			printk(KERN_ERR
			       "myri10ge: %s: Couldn't bring down link\n",
			       dev->name);

	wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt, HZ);
		wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt,
				   HZ);
		if (old_down_cnt == mgp->down_cnt)
		printk(KERN_ERR "myri10ge: %s never got down irq\n", dev->name);

			printk(KERN_ERR "myri10ge: %s never got down irq\n",
			       dev->name);
	}
	netif_tx_disable(dev);
	myri10ge_free_irq(mgp);
	for (i = 0; i < mgp->num_slices; i++)
@@ -3427,12 +3434,13 @@ static void myri10ge_watchdog(struct work_struct *work)
	    container_of(work, struct myri10ge_priv, watchdog_work);
	struct myri10ge_tx_buf *tx;
	u32 reboot;
	int status;
	int status, rebooted;
	int i;
	u16 cmd, vendor;

	mgp->watchdog_resets++;
	pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
	rebooted = 0;
	if ((cmd & PCI_COMMAND_MASTER) == 0) {
		/* Bus master DMA disabled?  Check to see
		 * if the card rebooted due to a parity error
@@ -3444,9 +3452,12 @@ static void myri10ge_watchdog(struct work_struct *work)
		       myri10ge_reset_recover ? " " : " not");
		if (myri10ge_reset_recover == 0)
			return;

		rtnl_lock();
		mgp->rebooted = 1;
		rebooted = 1;
		myri10ge_close(mgp->dev);
		myri10ge_reset_recover--;

		mgp->rebooted = 0;
		/*
		 * A rebooted nic will come back with config space as
		 * it was after power was applied to PCIe bus.
@@ -3494,8 +3505,10 @@ static void myri10ge_watchdog(struct work_struct *work)
		}
	}

	if (!rebooted) {
		rtnl_lock();
		myri10ge_close(mgp->dev);
	}
	status = myri10ge_load_firmware(mgp, 1);
	if (status != 0)
		printk(KERN_ERR "myri10ge: %s: failed to load firmware\n",
@@ -3516,12 +3529,14 @@ static void myri10ge_watchdog_timer(unsigned long arg)
{
	struct myri10ge_priv *mgp;
	struct myri10ge_slice_state *ss;
	int i, reset_needed;
	int i, reset_needed, busy_slice_cnt;
	u32 rx_pause_cnt;
	u16 cmd;

	mgp = (struct myri10ge_priv *)arg;

	rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause);
	busy_slice_cnt = 0;
	for (i = 0, reset_needed = 0;
	     i < mgp->num_slices && reset_needed == 0; ++i) {

@@ -3559,8 +3574,22 @@ static void myri10ge_watchdog_timer(unsigned long arg)
				reset_needed = 1;
			}
		}
		if (ss->watchdog_tx_done != ss->tx.done ||
		    ss->watchdog_rx_done != ss->rx_done.cnt) {
			busy_slice_cnt++;
		}
		ss->watchdog_tx_done = ss->tx.done;
		ss->watchdog_tx_req = ss->tx.req;
		ss->watchdog_rx_done = ss->rx_done.cnt;
	}
	/* if we've sent or received no traffic, poll the NIC to
	 * ensure it is still there.  Otherwise, we risk not noticing
	 * an error in a timely fashion */
	if (busy_slice_cnt == 0) {
		pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
		if ((cmd & PCI_COMMAND_MASTER) == 0) {
			reset_needed = 1;
		}
	}
	mgp->watchdog_pause = rx_pause_cnt;