Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit b3b30f5e authored by Jack Morgenstein's avatar Jack Morgenstein Committed by Roland Dreier
Browse files

IB/mthca: Recover from catastrophic errors



Trigger device remove and then add when a catastrophic error is
detected in hardware.  This, in turn, will cause a device reset, which
we hope will recover from the catastrophic condition.

Since this might interfere with debugging the root cause, add a
module option to suppress this behaviour.

Signed-off-by: Jack Morgenstein <jackm@mellanox.co.il>
Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
parent 07eeec06
Loading
Loading
Loading
Loading
+62 −0
Original line number Original line Diff line number Diff line
@@ -34,6 +34,7 @@


#include <linux/jiffies.h>
#include <linux/jiffies.h>
#include <linux/timer.h>
#include <linux/timer.h>
#include <linux/workqueue.h>


#include "mthca_dev.h"
#include "mthca_dev.h"


@@ -48,9 +49,41 @@ enum {


static DEFINE_SPINLOCK(catas_lock);
static DEFINE_SPINLOCK(catas_lock);


/* Devices awaiting a reset; filled by handle_catas(), drained by catas_reset(). */
static LIST_HEAD(catas_list);
/* Single-threaded workqueue created in mthca_catas_init() to run catas_work. */
static struct workqueue_struct *catas_wq;
/* The one work item, bound to catas_reset() in mthca_catas_init(). */
static struct work_struct catas_work;

/*
 * When nonzero, handle_catas() returns after dumping the error buffer
 * and does not queue the device for a reset.  Mode 0644, so it is
 * writable at runtime through the module parameter interface.
 */
static int catas_reset_disable;
module_param_named(catas_reset_disable, catas_reset_disable, int, 0644);
MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero");

/*
 * Work handler: restart every device that handle_catas() has queued on
 * catas_list since the last run.
 *
 * @work_ptr: unused (the work item is initialised with NULL data).
 *
 * mthca_device_mutex is held for the whole pass so the restarts are
 * serialised against mthca_init_one()/mthca_remove_one().
 */
static void catas_reset(void *work_ptr)
{
	struct mthca_dev *dev, *tmpdev;
	LIST_HEAD(tlist);
	int ret;

	mutex_lock(&mthca_device_mutex);

	/*
	 * Move the pending devices onto a private list so that catas_lock
	 * is not held while the devices are being restarted.
	 */
	spin_lock_irq(&catas_lock);
	list_splice_init(&catas_list, &tlist);
	spin_unlock_irq(&catas_lock);

	list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) {
		/* Cache pdev: it is the only handle that survives the restart. */
		struct pci_dev *pdev = dev->pdev;

		ret = __mthca_restart_one(pdev);
		/*
		 * The restart removes the old device and initialises a new
		 * one for pdev, so 'dev' is stale from here on and must not
		 * be dereferenced (not even by the mthca_err/mthca_dbg
		 * macros).
		 */
		if (ret) {
			printk(KERN_ERR "mthca %s: Reset failed (%d)\n",
			       pci_name(pdev), ret);
		} else {
			/* On success init has installed the new device as drvdata. */
			struct mthca_dev *d = pci_get_drvdata(pdev);

			mthca_dbg(d, "Reset succeeded\n");
		}
	}

	mutex_unlock(&mthca_device_mutex);
}

static void handle_catas(struct mthca_dev *dev)
static void handle_catas(struct mthca_dev *dev)
{
{
	struct ib_event event;
	struct ib_event event;
	unsigned long flags;
	const char *type;
	const char *type;
	int i;
	int i;


@@ -82,6 +115,14 @@ static void handle_catas(struct mthca_dev *dev)
	for (i = 0; i < dev->catas_err.size; ++i)
	for (i = 0; i < dev->catas_err.size; ++i)
		mthca_err(dev, "  buf[%02x]: %08x\n",
		mthca_err(dev, "  buf[%02x]: %08x\n",
			  i, swab32(readl(dev->catas_err.map + i)));
			  i, swab32(readl(dev->catas_err.map + i)));

	if (catas_reset_disable)
		return;

	spin_lock_irqsave(&catas_lock, flags);
	list_add(&dev->catas_err.list, &catas_list);
	queue_work(catas_wq, &catas_work);
	spin_unlock_irqrestore(&catas_lock, flags);
}
}


static void poll_catas(unsigned long dev_ptr)
static void poll_catas(unsigned long dev_ptr)
@@ -135,6 +176,7 @@ void mthca_start_catas_poll(struct mthca_dev *dev)
	dev->catas_err.timer.data     = (unsigned long) dev;
	dev->catas_err.timer.data     = (unsigned long) dev;
	dev->catas_err.timer.function = poll_catas;
	dev->catas_err.timer.function = poll_catas;
	dev->catas_err.timer.expires  = jiffies + MTHCA_CATAS_POLL_INTERVAL;
	dev->catas_err.timer.expires  = jiffies + MTHCA_CATAS_POLL_INTERVAL;
	INIT_LIST_HEAD(&dev->catas_err.list);
	add_timer(&dev->catas_err.timer);
	add_timer(&dev->catas_err.timer);
}
}


@@ -153,4 +195,24 @@ void mthca_stop_catas_poll(struct mthca_dev *dev)
				    dev->catas_err.addr),
				    dev->catas_err.addr),
				   dev->catas_err.size * 4);
				   dev->catas_err.size * 4);
	}
	}

	spin_lock_irq(&catas_lock);
	list_del(&dev->catas_err.list);
	spin_unlock_irq(&catas_lock);
}

/*
 * Module-load setup for catastrophic error recovery: bind catas_work to
 * catas_reset() and create the single-threaded "mthca_catas" workqueue
 * that runs it.
 *
 * Returns 0 on success, -ENOMEM if the workqueue cannot be created.
 */
int __init mthca_catas_init(void)
{
	INIT_WORK(&catas_work, catas_reset, NULL);

	catas_wq = create_singlethread_workqueue("mthca_catas");

	return catas_wq ? 0 : -ENOMEM;
}

/*
 * Tear down the workqueue created by mthca_catas_init().  Called from
 * mthca_init() when PCI driver registration fails and from
 * mthca_cleanup() on module unload.
 */
void mthca_catas_cleanup(void)
{
	destroy_workqueue(catas_wq);
}
}
+7 −0
Original line number Original line Diff line number Diff line
@@ -45,6 +45,7 @@
#include <linux/dma-mapping.h>
#include <linux/dma-mapping.h>
#include <linux/timer.h>
#include <linux/timer.h>
#include <linux/mutex.h>
#include <linux/mutex.h>
#include <linux/list.h>


#include <asm/semaphore.h>
#include <asm/semaphore.h>


@@ -283,8 +284,11 @@ struct mthca_catas_err {
	unsigned long		stop;
	unsigned long		stop;
	u32			size;
	u32			size;
	struct timer_list	timer;
	struct timer_list	timer;
	struct list_head	list;
};
};


extern struct mutex mthca_device_mutex;

struct mthca_dev {
struct mthca_dev {
	struct ib_device  ib_dev;
	struct ib_device  ib_dev;
	struct pci_dev   *pdev;
	struct pci_dev   *pdev;
@@ -450,6 +454,9 @@ void mthca_unregister_device(struct mthca_dev *dev);


void mthca_start_catas_poll(struct mthca_dev *dev);
void mthca_start_catas_poll(struct mthca_dev *dev);
void mthca_stop_catas_poll(struct mthca_dev *dev);
void mthca_stop_catas_poll(struct mthca_dev *dev);
int __mthca_restart_one(struct pci_dev *pdev);
int mthca_catas_init(void);
void mthca_catas_cleanup(void);


int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);
void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);
+67 −21
Original line number Original line Diff line number Diff line
@@ -80,6 +80,8 @@ static int tune_pci = 0;
module_param(tune_pci, int, 0444);
module_param(tune_pci, int, 0444);
MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero");
MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero");


struct mutex mthca_device_mutex;

static const char mthca_version[] __devinitdata =
static const char mthca_version[] __devinitdata =
	DRV_NAME ": Mellanox InfiniBand HCA driver v"
	DRV_NAME ": Mellanox InfiniBand HCA driver v"
	DRV_VERSION " (" DRV_RELDATE ")\n";
	DRV_VERSION " (" DRV_RELDATE ")\n";
@@ -978,28 +980,15 @@ static struct {
					MTHCA_FLAG_SINAI_OPT }
					MTHCA_FLAG_SINAI_OPT }
};
};


static int __devinit mthca_init_one(struct pci_dev *pdev,
static int __mthca_init_one(struct pci_dev *pdev, int hca_type)
				    const struct pci_device_id *id)
{
{
	static int mthca_version_printed = 0;
	int ddr_hidden = 0;
	int ddr_hidden = 0;
	int err;
	int err;
	struct mthca_dev *mdev;
	struct mthca_dev *mdev;


	if (!mthca_version_printed) {
		printk(KERN_INFO "%s", mthca_version);
		++mthca_version_printed;
	}

	printk(KERN_INFO PFX "Initializing %s\n",
	printk(KERN_INFO PFX "Initializing %s\n",
	       pci_name(pdev));
	       pci_name(pdev));


	if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) {
		printk(KERN_ERR PFX "%s has invalid driver data %lx\n",
		       pci_name(pdev), id->driver_data);
		return -ENODEV;
	}

	err = pci_enable_device(pdev);
	err = pci_enable_device(pdev);
	if (err) {
	if (err) {
		dev_err(&pdev->dev, "Cannot enable PCI device, "
		dev_err(&pdev->dev, "Cannot enable PCI device, "
@@ -1065,7 +1054,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev,


	mdev->pdev = pdev;
	mdev->pdev = pdev;


	mdev->mthca_flags = mthca_hca_table[id->driver_data].flags;
	mdev->mthca_flags = mthca_hca_table[hca_type].flags;
	if (ddr_hidden)
	if (ddr_hidden)
		mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN;
		mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN;


@@ -1099,13 +1088,13 @@ static int __devinit mthca_init_one(struct pci_dev *pdev,
	if (err)
	if (err)
		goto err_cmd;
		goto err_cmd;


	if (mdev->fw_ver < mthca_hca_table[id->driver_data].latest_fw) {
	if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) {
		mthca_warn(mdev, "HCA FW version %d.%d.%d is old (%d.%d.%d is current).\n",
		mthca_warn(mdev, "HCA FW version %d.%d.%d is old (%d.%d.%d is current).\n",
			   (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff,
			   (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff,
			   (int) (mdev->fw_ver & 0xffff),
			   (int) (mdev->fw_ver & 0xffff),
			   (int) (mthca_hca_table[id->driver_data].latest_fw >> 32),
			   (int) (mthca_hca_table[hca_type].latest_fw >> 32),
			   (int) (mthca_hca_table[id->driver_data].latest_fw >> 16) & 0xffff,
			   (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff,
			   (int) (mthca_hca_table[id->driver_data].latest_fw & 0xffff));
			   (int) (mthca_hca_table[hca_type].latest_fw & 0xffff));
		mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n");
		mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n");
	}
	}


@@ -1122,6 +1111,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev,
		goto err_unregister;
		goto err_unregister;


	pci_set_drvdata(pdev, mdev);
	pci_set_drvdata(pdev, mdev);
	mdev->hca_type = hca_type;


	return 0;
	return 0;


@@ -1166,7 +1156,7 @@ static int __devinit mthca_init_one(struct pci_dev *pdev,
	return err;
	return err;
}
}


static void __devexit mthca_remove_one(struct pci_dev *pdev)
static void __mthca_remove_one(struct pci_dev *pdev)
{
{
	struct mthca_dev *mdev = pci_get_drvdata(pdev);
	struct mthca_dev *mdev = pci_get_drvdata(pdev);
	u8 status;
	u8 status;
@@ -1211,6 +1201,51 @@ static void __devexit mthca_remove_one(struct pci_dev *pdev)
	}
	}
}
}


/*
 * Remove and re-initialise the HCA behind @pdev, keeping its HCA type.
 *
 * The caller is expected to hold mthca_device_mutex (catas_reset() does);
 * this function takes no lock itself.
 *
 * Returns -ENODEV if no device is attached to @pdev, otherwise the
 * result of __mthca_init_one().
 */
int __mthca_restart_one(struct pci_dev *pdev)
{
	struct mthca_dev *mdev;
	int hca_type;

	mdev = pci_get_drvdata(pdev);
	if (!mdev)
		return -ENODEV;

	/*
	 * Save the type before removal: __mthca_remove_one() tears the
	 * device down, so mdev must not be dereferenced afterwards.
	 */
	hca_type = mdev->hca_type;
	__mthca_remove_one(pdev);
	return __mthca_init_one(pdev, hca_type);
}

/*
 * PCI probe entry point.  Prints the driver banner the first time it is
 * called, validates the HCA type carried in @id->driver_data and hands
 * off to __mthca_init_one(), all under mthca_device_mutex.
 *
 * Returns -ENODEV for an out-of-range driver_data value, otherwise the
 * result of __mthca_init_one().
 */
static int __devinit mthca_init_one(struct pci_dev *pdev,
			     const struct pci_device_id *id)
{
	static int mthca_version_printed = 0;
	int ret;

	mutex_lock(&mthca_device_mutex);

	if (!mthca_version_printed) {
		printk(KERN_INFO "%s", mthca_version);
		++mthca_version_printed;
	}

	if (id->driver_data < ARRAY_SIZE(mthca_hca_table)) {
		ret = __mthca_init_one(pdev, id->driver_data);
	} else {
		/* driver_data indexes mthca_hca_table; reject anything past its end. */
		printk(KERN_ERR PFX "%s has invalid driver data %lx\n",
		       pci_name(pdev), id->driver_data);
		ret = -ENODEV;
	}

	mutex_unlock(&mthca_device_mutex);

	return ret;
}

/*
 * PCI remove entry point: run __mthca_remove_one() under
 * mthca_device_mutex, the same mutex catas_reset() holds while it
 * restarts devices.
 */
static void __devexit mthca_remove_one(struct pci_dev *pdev)
{
	mutex_lock(&mthca_device_mutex);
	__mthca_remove_one(pdev);
	mutex_unlock(&mthca_device_mutex);
}

static struct pci_device_id mthca_pci_table[] = {
static struct pci_device_id mthca_pci_table[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR),
	{ PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR),
	  .driver_data = TAVOR },
	  .driver_data = TAVOR },
@@ -1248,13 +1283,24 @@ static int __init mthca_init(void)
{
{
	int ret;
	int ret;


	mutex_init(&mthca_device_mutex);
	ret = mthca_catas_init();
	if (ret)
		return ret;

	ret = pci_register_driver(&mthca_driver);
	ret = pci_register_driver(&mthca_driver);
	return ret < 0 ? ret : 0;
	if (ret < 0) {
		mthca_catas_cleanup();
		return ret;
	}

	return 0;
}
}


static void __exit mthca_cleanup(void)
static void __exit mthca_cleanup(void)
{
{
	pci_unregister_driver(&mthca_driver);
	pci_unregister_driver(&mthca_driver);
	mthca_catas_cleanup();
}
}


module_init(mthca_init);
module_init(mthca_init);