Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit f6bc11e4 authored by Yishai Hadas's avatar Yishai Hadas Committed by David S. Miller
Browse files

net/mlx4_core: Enhance the catas flow to support device reset



This includes:

- resetting the chip when a fatal error is detected (the current code
  does not do this).

- exposing the ability to enter error state from outside the catas code
  by calling its functionality. (E.g. FW Command timeout, AER error).

- managing a persistent device state. This is needed to sync between
  reset flow cases.

Signed-off-by: default avatarYishai Hadas <yishaih@mellanox.com>
Signed-off-by: default avatarOr Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent ad9a0bf0
Loading
Loading
Loading
Loading
+94 −28
Original line number Diff line number Diff line
@@ -48,6 +48,83 @@ MODULE_PARM_DESC(internal_err_reset,
		 "Reset device on internal errors if non-zero"
		 " (default 1, in SRIOV mode default is 0)");

static int read_vendor_id(struct mlx4_dev *dev)
{
	u16 vendor_id = 0;
	int ret;

	ret = pci_read_config_word(dev->persist->pdev, 0, &vendor_id);
	if (ret) {
		mlx4_err(dev, "Failed to read vendor ID, ret=%d\n", ret);
		return ret;
	}

	if (vendor_id == 0xffff) {
		mlx4_err(dev, "PCI can't be accessed to read vendor id\n");
		return -EINVAL;
	}

	return 0;
}

static int mlx4_reset_master(struct mlx4_dev *dev)
{
	int err = 0;

	if (!pci_channel_offline(dev->persist->pdev)) {
		err = read_vendor_id(dev);
		/* If PCI can't be accessed to read vendor ID we assume that its
		 * link was disabled and chip was already reset.
		 */
		if (err)
			return 0;

		err = mlx4_reset(dev);
		if (err)
			mlx4_err(dev, "Fail to reset HCA\n");
	}

	return err;
}

void mlx4_enter_error_state(struct mlx4_dev_persistent *persist)
{
	int err;
	struct mlx4_dev *dev;

	if (!internal_err_reset)
		return;

	mutex_lock(&persist->device_state_mutex);
	if (persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
		goto out;

	dev = persist->dev;
	mlx4_err(dev, "device is going to be reset\n");
	err = mlx4_reset_master(dev);
	BUG_ON(err != 0);

	dev->persist->state |= MLX4_DEVICE_STATE_INTERNAL_ERROR;
	mlx4_err(dev, "device was reset successfully\n");
	mutex_unlock(&persist->device_state_mutex);

	/* At that step HW was already reset, now notify clients */
	mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
	return;

out:
	mutex_unlock(&persist->device_state_mutex);
}

static void mlx4_handle_error_state(struct mlx4_dev_persistent *persist)
{
	int err = 0;

	mlx4_enter_error_state(persist);
	err = mlx4_restart_one(persist->pdev);
	mlx4_info(persist->dev, "mlx4_restart_one was ended, ret=%d\n", err);
}

static void dump_err_buf(struct mlx4_dev *dev)
{
	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -66,21 +143,22 @@ static void poll_catas(unsigned long dev_ptr)
	struct mlx4_priv *priv = mlx4_priv(dev);

	if (readl(priv->catas_err.map)) {
		/* If the device is off-line, we cannot try to recover it */
		if (pci_channel_offline(dev->persist->pdev))
			mod_timer(&priv->catas_err.timer,
				  round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
		else {
		dump_err_buf(dev);
			mlx4_dispatch_event(dev, MLX4_DEV_EVENT_CATASTROPHIC_ERROR, 0);
		goto internal_err;
	}

			if (internal_err_reset)
				queue_work(dev->persist->catas_wq,
					   &dev->persist->catas_work);
	if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
		mlx4_warn(dev, "Internal error mark was detected on device\n");
		goto internal_err;
	}
	} else

	mod_timer(&priv->catas_err.timer,
		  round_jiffies(jiffies + MLX4_CATAS_POLL_INTERVAL));
	return;

internal_err:
	if (internal_err_reset)
		queue_work(dev->persist->catas_wq, &dev->persist->catas_work);
}

static void catas_reset(struct work_struct *work)
@@ -88,20 +166,8 @@ static void catas_reset(struct work_struct *work)
	struct mlx4_dev_persistent *persist =
		container_of(work, struct mlx4_dev_persistent,
			     catas_work);
	struct pci_dev *pdev = persist->pdev;
	int ret;

	/* If the device is off-line, we cannot reset it */
	if (pci_channel_offline(pdev))
		return;

	ret = mlx4_restart_one(pdev);
	/* 'priv' now is not valid */
	if (ret)
		pr_err("mlx4 %s: Reset failed (%d)\n",
		       pci_name(pdev), ret);
	else
		mlx4_dbg(persist->dev, "Reset succeeded\n");
	mlx4_handle_error_state(persist);
}

void mlx4_start_catas_poll(struct mlx4_dev *dev)
+6 −0
Original line number Diff line number Diff line
@@ -2624,6 +2624,11 @@ static int mlx4_load_one(struct pci_dev *pdev, int pci_dev_data,
		}
	}

	/* on load remove any previous indication of internal error,
	 * device is up.
	 */
	dev->persist->state = MLX4_DEVICE_STATE_UP;

slave_start:
	err = mlx4_cmd_init(dev);
	if (err) {
@@ -3108,6 +3113,7 @@ static int mlx4_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
	dev->persist->dev = dev;
	pci_set_drvdata(pdev, dev->persist);
	priv->pci_dev_data = id->driver_data;
	mutex_init(&dev->persist->device_state_mutex);

	ret =  __mlx4_init_one(pdev, id->driver_data, priv);
	if (ret) {
+1 −1
Original line number Diff line number Diff line
@@ -1178,7 +1178,7 @@ void mlx4_qp_event(struct mlx4_dev *dev, u32 qpn, int event_type);

void mlx4_srq_event(struct mlx4_dev *dev, u32 srqn, int event_type);

void mlx4_handle_catas_err(struct mlx4_dev *dev);
void mlx4_enter_error_state(struct mlx4_dev_persistent *persist);

int mlx4_SENSE_PORT(struct mlx4_dev *dev, int port,
		    enum mlx4_port_type *type);
+7 −0
Original line number Diff line number Diff line
@@ -411,6 +411,11 @@ enum {
	MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK	= 1 << 4,
};

enum {
	MLX4_DEVICE_STATE_UP			= 1 << 0,
	MLX4_DEVICE_STATE_INTERNAL_ERROR	= 1 << 1,
};

#define MSTR_SM_CHANGE_MASK (MLX4_EQ_PORT_INFO_MSTR_SM_SL_CHANGE_MASK | \
			     MLX4_EQ_PORT_INFO_MSTR_SM_LID_CHANGE_MASK)

@@ -753,6 +758,8 @@ struct mlx4_dev_persistent {
	enum mlx4_port_type curr_port_poss_type[MLX4_MAX_PORTS + 1];
	struct work_struct      catas_work;
	struct workqueue_struct *catas_wq;
	struct mutex	device_state_mutex; /* protect HW state */
	u8		state;
};

struct mlx4_dev {