Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 0cd93027 authored by Yishai Hadas, committed by David S. Miller
Browse files

net/mlx4_core: Reset flow activation upon SRIOV fatal command cases



When SRIOV commands executed over the comm-channel hit a fatal error
(e.g. a timeout or a closing command failure), the VF enters an error
state and the reset flow is activated.

To recognize whether the failure occurred on a closing command, the
operation code (opcode) of the given VHCR command is used. Once the device
has entered an error state, redundant error messages are no longer printed.

Signed-off-by: Yishai Hadas <yishaih@mellanox.com>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 55ad3592
Loading
Loading
Loading
Loading
+92 −33
Original line number Original line Diff line number Diff line
@@ -257,16 +257,30 @@ static int comm_pending(struct mlx4_dev *dev)
	return (swab32(status) >> 31) != priv->cmd.comm_toggle;
	return (swab32(status) >> 31) != priv->cmd.comm_toggle;
}
}


static void mlx4_comm_cmd_post(struct mlx4_dev *dev, u8 cmd, u16 param)
static int mlx4_comm_cmd_post(struct mlx4_dev *dev, u8 cmd, u16 param)
{
{
	struct mlx4_priv *priv = mlx4_priv(dev);
	struct mlx4_priv *priv = mlx4_priv(dev);
	u32 val;
	u32 val;


	/* To avoid writing to unknown addresses after the device state was
	 * changed to internal error and the function was reset,
	 * check the INTERNAL_ERROR flag which is updated under
	 * device_state_mutex lock.
	 */
	mutex_lock(&dev->persist->device_state_mutex);

	if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
		mutex_unlock(&dev->persist->device_state_mutex);
		return -EIO;
	}

	priv->cmd.comm_toggle ^= 1;
	priv->cmd.comm_toggle ^= 1;
	val = param | (cmd << 16) | (priv->cmd.comm_toggle << 31);
	val = param | (cmd << 16) | (priv->cmd.comm_toggle << 31);
	__raw_writel((__force u32) cpu_to_be32(val),
	__raw_writel((__force u32) cpu_to_be32(val),
		     &priv->mfunc.comm->slave_write);
		     &priv->mfunc.comm->slave_write);
	mmiowb();
	mmiowb();
	mutex_unlock(&dev->persist->device_state_mutex);
	return 0;
}
}


static int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param,
static int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param,
@@ -286,7 +300,13 @@ static int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param,


	/* Write command */
	/* Write command */
	down(&priv->cmd.poll_sem);
	down(&priv->cmd.poll_sem);
	mlx4_comm_cmd_post(dev, cmd, param);
	if (mlx4_comm_cmd_post(dev, cmd, param)) {
		/* Only in case the device state is INTERNAL_ERROR,
		 * mlx4_comm_cmd_post returns with an error
		 */
		err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
		goto out;
	}


	end = msecs_to_jiffies(timeout) + jiffies;
	end = msecs_to_jiffies(timeout) + jiffies;
	while (comm_pending(dev) && time_before(jiffies, end))
	while (comm_pending(dev) && time_before(jiffies, end))
@@ -298,18 +318,23 @@ static int mlx4_comm_cmd_poll(struct mlx4_dev *dev, u8 cmd, u16 param,
		 * is MLX4_DELAY_RESET_SLAVE*/
		 * is MLX4_DELAY_RESET_SLAVE*/
		if ((MLX4_COMM_CMD_RESET == cmd)) {
		if ((MLX4_COMM_CMD_RESET == cmd)) {
			err = MLX4_DELAY_RESET_SLAVE;
			err = MLX4_DELAY_RESET_SLAVE;
			goto out;
		} else {
		} else {
			mlx4_warn(dev, "Communication channel timed out\n");
			mlx4_warn(dev, "Communication channel command 0x%x timed out\n",
			err = -ETIMEDOUT;
				  cmd);
			err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
		}
		}
	}
	}


	if (err)
		mlx4_enter_error_state(dev->persist);
out:
	up(&priv->cmd.poll_sem);
	up(&priv->cmd.poll_sem);
	return err;
	return err;
}
}


static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op,
static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 vhcr_cmd,
			      u16 param, unsigned long timeout)
			      u16 param, u16 op, unsigned long timeout)
{
{
	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
	struct mlx4_cmd *cmd = &mlx4_priv(dev)->cmd;
	struct mlx4_cmd_context *context;
	struct mlx4_cmd_context *context;
@@ -327,32 +352,47 @@ static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op,


	reinit_completion(&context->done);
	reinit_completion(&context->done);


	mlx4_comm_cmd_post(dev, op, param);
	if (mlx4_comm_cmd_post(dev, vhcr_cmd, param)) {
		/* Only in case the device state is INTERNAL_ERROR,
		 * mlx4_comm_cmd_post returns with an error
		 */
		err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
		goto out;
	}


	if (!wait_for_completion_timeout(&context->done,
	if (!wait_for_completion_timeout(&context->done,
					 msecs_to_jiffies(timeout))) {
					 msecs_to_jiffies(timeout))) {
		mlx4_warn(dev, "communication channel command 0x%x timed out\n",
		mlx4_warn(dev, "communication channel command 0x%x (op=0x%x) timed out\n",
			  op);
			  vhcr_cmd, op);
		err = -EBUSY;
		goto out_reset;
		goto out;
	}
	}


	err = context->result;
	err = context->result;
	if (err && context->fw_status != CMD_STAT_MULTI_FUNC_REQ) {
	if (err && context->fw_status != CMD_STAT_MULTI_FUNC_REQ) {
		mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
		mlx4_err(dev, "command 0x%x failed: fw status = 0x%x\n",
			 op, context->fw_status);
			 vhcr_cmd, context->fw_status);
		goto out;
		if (mlx4_closing_cmd_fatal_error(op, context->fw_status))
			goto out_reset;
	}
	}


out:
	/* wait for comm channel ready
	/* wait for comm channel ready
	 * this is necessary for prevention the race
	 * this is necessary for prevention the race
	 * when switching between event to polling mode
	 * when switching between event to polling mode
	 * This section is skipped when the device is in the internal error
	 * state (MLX4_DEVICE_STATE_INTERNAL_ERROR): in that state, no commands
	 * are sent via the comm channel until the device has returned from reset.
	 */
	 */
	if (!(dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)) {
		end = msecs_to_jiffies(timeout) + jiffies;
		end = msecs_to_jiffies(timeout) + jiffies;
		while (comm_pending(dev) && time_before(jiffies, end))
		while (comm_pending(dev) && time_before(jiffies, end))
			cond_resched();
			cond_resched();
	}
	goto out;


out_reset:
	err = mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);
	mlx4_enter_error_state(dev->persist);
out:
	spin_lock(&cmd->context_lock);
	spin_lock(&cmd->context_lock);
	context->next = cmd->free_head;
	context->next = cmd->free_head;
	cmd->free_head = context - cmd->context;
	cmd->free_head = context - cmd->context;
@@ -363,10 +403,13 @@ static int mlx4_comm_cmd_wait(struct mlx4_dev *dev, u8 op,
}
}


int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param,
int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param,
		  unsigned long timeout)
		  u16 op, unsigned long timeout)
{
{
	if (dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
		return mlx4_status_to_errno(CMD_STAT_INTERNAL_ERR);

	if (mlx4_priv(dev)->cmd.use_events)
	if (mlx4_priv(dev)->cmd.use_events)
		return mlx4_comm_cmd_wait(dev, cmd, param, timeout);
		return mlx4_comm_cmd_wait(dev, cmd, param, op, timeout);
	return mlx4_comm_cmd_poll(dev, cmd, param, timeout);
	return mlx4_comm_cmd_poll(dev, cmd, param, timeout);
}
}


@@ -502,8 +545,11 @@ static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
			}
			}
			ret = mlx4_status_to_errno(vhcr->status);
			ret = mlx4_status_to_errno(vhcr->status);
		}
		}
		if (ret &&
		    dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
			ret = mlx4_internal_err_ret_value(dev, op, op_modifier);
	} else {
	} else {
		ret = mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_POST, 0,
		ret = mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_POST, 0, op,
				    MLX4_COMM_TIME + timeout);
				    MLX4_COMM_TIME + timeout);
		if (!ret) {
		if (!ret) {
			if (out_is_imm) {
			if (out_is_imm) {
@@ -517,9 +563,14 @@ static int mlx4_slave_cmd(struct mlx4_dev *dev, u64 in_param, u64 *out_param,
				}
				}
			}
			}
			ret = mlx4_status_to_errno(vhcr->status);
			ret = mlx4_status_to_errno(vhcr->status);
		} else
		} else {
			mlx4_err(dev, "failed execution of VHCR_POST command opcode 0x%x\n",
			if (dev->persist->state &
				 op);
			    MLX4_DEVICE_STATE_INTERNAL_ERROR)
				ret = mlx4_internal_err_ret_value(dev, op,
								  op_modifier);
			else
				mlx4_err(dev, "failed execution of VHCR_POST command opcode 0x%x\n", op);
		}
	}
	}


	mutex_unlock(&priv->cmd.slave_cmd_mutex);
	mutex_unlock(&priv->cmd.slave_cmd_mutex);
@@ -1559,6 +1610,8 @@ static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave,
				      ALIGN(sizeof(struct mlx4_vhcr_cmd),
				      ALIGN(sizeof(struct mlx4_vhcr_cmd),
					    MLX4_ACCESS_MEM_ALIGN), 1);
					    MLX4_ACCESS_MEM_ALIGN), 1);
		if (ret) {
		if (ret) {
			if (!(dev->persist->state &
			    MLX4_DEVICE_STATE_INTERNAL_ERROR))
				mlx4_err(dev, "%s: Failed reading vhcr ret: 0x%x\n",
				mlx4_err(dev, "%s: Failed reading vhcr ret: 0x%x\n",
					 __func__, ret);
					 __func__, ret);
			kfree(vhcr);
			kfree(vhcr);
@@ -1599,9 +1652,12 @@ static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave,
			goto out_status;
			goto out_status;
		}
		}


		if (mlx4_ACCESS_MEM(dev, inbox->dma, slave,
		ret = mlx4_ACCESS_MEM(dev, inbox->dma, slave,
				      vhcr->in_param,
				      vhcr->in_param,
				    MLX4_MAILBOX_SIZE, 1)) {
				      MLX4_MAILBOX_SIZE, 1);
		if (ret) {
			if (!(dev->persist->state &
			    MLX4_DEVICE_STATE_INTERNAL_ERROR))
				mlx4_err(dev, "%s: Failed reading inbox (cmd:0x%x)\n",
				mlx4_err(dev, "%s: Failed reading inbox (cmd:0x%x)\n",
					 __func__, cmd->opcode);
					 __func__, cmd->opcode);
			vhcr_cmd->status = CMD_STAT_INTERNAL_ERR;
			vhcr_cmd->status = CMD_STAT_INTERNAL_ERR;
@@ -1651,6 +1707,7 @@ static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave,
	}
	}


	if (err) {
	if (err) {
		if (!(dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR))
			mlx4_warn(dev, "vhcr command:0x%x slave:%d failed with error:%d, status %d\n",
			mlx4_warn(dev, "vhcr command:0x%x slave:%d failed with error:%d, status %d\n",
				  vhcr->op, slave, vhcr->errno, err);
				  vhcr->op, slave, vhcr->errno, err);
		vhcr_cmd->status = mlx4_errno_to_status(err);
		vhcr_cmd->status = mlx4_errno_to_status(err);
@@ -1667,6 +1724,8 @@ static int mlx4_master_process_vhcr(struct mlx4_dev *dev, int slave,
			/* If we failed to write back the outbox after the
			/* If we failed to write back the outbox after the
			 *command was successfully executed, we must fail this
			 *command was successfully executed, we must fail this
			 * slave, as it is now in undefined state */
			 * slave, as it is now in undefined state */
			if (!(dev->persist->state &
			    MLX4_DEVICE_STATE_INTERNAL_ERROR))
				mlx4_err(dev, "%s:Failed writing outbox\n", __func__);
				mlx4_err(dev, "%s:Failed writing outbox\n", __func__);
			goto out;
			goto out;
		}
		}
+9 −7
Original line number Original line Diff line number Diff line
@@ -1484,7 +1484,8 @@ static void mlx4_slave_exit(struct mlx4_dev *dev)
	struct mlx4_priv *priv = mlx4_priv(dev);
	struct mlx4_priv *priv = mlx4_priv(dev);


	mutex_lock(&priv->cmd.slave_cmd_mutex);
	mutex_lock(&priv->cmd.slave_cmd_mutex);
	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_TIME))
	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_CMD_NA_OP,
			  MLX4_COMM_TIME))
		mlx4_warn(dev, "Failed to close slave function\n");
		mlx4_warn(dev, "Failed to close slave function\n");
	mutex_unlock(&priv->cmd.slave_cmd_mutex);
	mutex_unlock(&priv->cmd.slave_cmd_mutex);
}
}
@@ -1648,7 +1649,7 @@ static int mlx4_init_slave(struct mlx4_dev *dev)
	mlx4_reset_vf_support(dev);
	mlx4_reset_vf_support(dev);
	mlx4_warn(dev, "Sending reset\n");
	mlx4_warn(dev, "Sending reset\n");
	ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0,
	ret_from_reset = mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0,
				       MLX4_COMM_TIME);
				       MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME);
	/* if we are in the middle of flr the slave will try
	/* if we are in the middle of flr the slave will try
	 * NUM_OF_RESET_RETRIES times before leaving.*/
	 * NUM_OF_RESET_RETRIES times before leaving.*/
	if (ret_from_reset) {
	if (ret_from_reset) {
@@ -1673,22 +1674,23 @@ static int mlx4_init_slave(struct mlx4_dev *dev)


	mlx4_warn(dev, "Sending vhcr0\n");
	mlx4_warn(dev, "Sending vhcr0\n");
	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR0, dma >> 48,
	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR0, dma >> 48,
						    MLX4_COMM_TIME))
			     MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME))
		goto err;
		goto err;
	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR1, dma >> 32,
	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR1, dma >> 32,
						    MLX4_COMM_TIME))
			     MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME))
		goto err;
		goto err;
	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR2, dma >> 16,
	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR2, dma >> 16,
						    MLX4_COMM_TIME))
			     MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME))
		goto err;
		goto err;
	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma, MLX4_COMM_TIME))
	if (mlx4_comm_cmd(dev, MLX4_COMM_CMD_VHCR_EN, dma,
			  MLX4_COMM_CMD_NA_OP, MLX4_COMM_TIME))
		goto err;
		goto err;


	mutex_unlock(&priv->cmd.slave_cmd_mutex);
	mutex_unlock(&priv->cmd.slave_cmd_mutex);
	return 0;
	return 0;


err:
err:
	mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, 0);
	mlx4_comm_cmd(dev, MLX4_COMM_CMD_RESET, 0, MLX4_COMM_CMD_NA_OP, 0);
err_offline:
err_offline:
	mutex_unlock(&priv->cmd.slave_cmd_mutex);
	mutex_unlock(&priv->cmd.slave_cmd_mutex);
	return -EIO;
	return -EIO;
+3 −0
Original line number Original line Diff line number Diff line
@@ -1350,6 +1350,9 @@ static int mlx4_QP_ATTACH(struct mlx4_dev *dev, struct mlx4_qp *qp,
		       MLX4_CMD_WRAPPED);
		       MLX4_CMD_WRAPPED);


	mlx4_free_cmd_mailbox(dev, mailbox);
	mlx4_free_cmd_mailbox(dev, mailbox);
	if (err && !attach &&
	    dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR)
		err = 0;
	return err;
	return err;
}
}


+3 −1
Original line number Original line Diff line number Diff line
@@ -123,6 +123,8 @@ enum mlx4_mpt_state {


#define MLX4_COMM_TIME		10000
#define MLX4_COMM_TIME		10000
#define MLX4_COMM_OFFLINE_TIME_OUT 30000
#define MLX4_COMM_OFFLINE_TIME_OUT 30000
#define MLX4_COMM_CMD_NA_OP    0x0



enum {
enum {
	MLX4_COMM_CMD_RESET,
	MLX4_COMM_CMD_RESET,
@@ -1173,7 +1175,7 @@ int mlx4_cmd_use_events(struct mlx4_dev *dev);
void mlx4_cmd_use_polling(struct mlx4_dev *dev);
void mlx4_cmd_use_polling(struct mlx4_dev *dev);


int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param,
int mlx4_comm_cmd(struct mlx4_dev *dev, u8 cmd, u16 param,
		  unsigned long timeout);
		  u16 op, unsigned long timeout);


void mlx4_cq_tasklet_cb(unsigned long data);
void mlx4_cq_tasklet_cb(unsigned long data);
void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn);
void mlx4_cq_completion(struct mlx4_dev *dev, u32 cqn);