Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit ba88ac9e authored by Moshe Shemesh's avatar Moshe Shemesh Committed by Greg Kroah-Hartman
Browse files

net/mlx4_en: Handle TX error CQE



[ Upstream commit ba603d9d7b1215c72513d7c7aa02b6775fd4891b ]

In case error CQE was found while polling TX CQ, the QP is in error
state and all posted WQEs will generate error CQEs without any data
transmitted. Fix it by reopening the channels, via same method used for
TX timeout handling.

In addition add some more info on error CQE and WQE for debug.

Fixes: bd2f631d ("net/mlx4_en: Notify user when TX ring in error state")
Signed-off-by: default avatarMoshe Shemesh <moshe@mellanox.com>
Signed-off-by: default avatarTariq Toukan <tariqt@nvidia.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 21ef20ff
Loading
Loading
Loading
Loading
+1 −0
Original line number Original line Diff line number Diff line
@@ -1741,6 +1741,7 @@ int mlx4_en_start_port(struct net_device *dev)
				mlx4_en_deactivate_cq(priv, cq);
				mlx4_en_deactivate_cq(priv, cq);
				goto tx_err;
				goto tx_err;
			}
			}
			clear_bit(MLX4_EN_TX_RING_STATE_RECOVERING, &tx_ring->state);
			if (t != TX_XDP) {
			if (t != TX_XDP) {
				tx_ring->tx_queue = netdev_get_tx_queue(dev, i);
				tx_ring->tx_queue = netdev_get_tx_queue(dev, i);
				tx_ring->recycle_ring = NULL;
				tx_ring->recycle_ring = NULL;
+33 −7
Original line number Original line Diff line number Diff line
@@ -385,6 +385,35 @@ int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
	return cnt;
	return cnt;
}
}


static void mlx4_en_handle_err_cqe(struct mlx4_en_priv *priv, struct mlx4_err_cqe *err_cqe,
				   u16 cqe_index, struct mlx4_en_tx_ring *ring)
{
	struct mlx4_en_dev *mdev = priv->mdev;
	struct mlx4_en_tx_info *tx_info;
	struct mlx4_en_tx_desc *tx_desc;
	u16 wqe_index;
	int desc_size;

	en_err(priv, "CQE error - cqn 0x%x, ci 0x%x, vendor syndrome: 0x%x syndrome: 0x%x\n",
	       ring->sp_cqn, cqe_index, err_cqe->vendor_err_syndrome, err_cqe->syndrome);
	print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, err_cqe, sizeof(*err_cqe),
		       false);

	wqe_index = be16_to_cpu(err_cqe->wqe_index) & ring->size_mask;
	tx_info = &ring->tx_info[wqe_index];
	desc_size = tx_info->nr_txbb << LOG_TXBB_SIZE;
	en_err(priv, "Related WQE - qpn 0x%x, wqe index 0x%x, wqe size 0x%x\n", ring->qpn,
	       wqe_index, desc_size);
	tx_desc = ring->buf + (wqe_index << LOG_TXBB_SIZE);
	print_hex_dump(KERN_WARNING, "", DUMP_PREFIX_OFFSET, 16, 1, tx_desc, desc_size, false);

	if (test_and_set_bit(MLX4_EN_STATE_FLAG_RESTARTING, &priv->state))
		return;

	en_err(priv, "Scheduling port restart\n");
	queue_work(mdev->workqueue, &priv->restart_task);
}

bool mlx4_en_process_tx_cq(struct net_device *dev,
bool mlx4_en_process_tx_cq(struct net_device *dev,
			   struct mlx4_en_cq *cq, int napi_budget)
			   struct mlx4_en_cq *cq, int napi_budget)
{
{
@@ -431,13 +460,10 @@ bool mlx4_en_process_tx_cq(struct net_device *dev,
		dma_rmb();
		dma_rmb();


		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
		if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
			     MLX4_CQE_OPCODE_ERROR)) {
			     MLX4_CQE_OPCODE_ERROR))
			struct mlx4_err_cqe *cqe_err = (struct mlx4_err_cqe *)cqe;
			if (!test_and_set_bit(MLX4_EN_TX_RING_STATE_RECOVERING, &ring->state))

				mlx4_en_handle_err_cqe(priv, (struct mlx4_err_cqe *)cqe, index,
			en_err(priv, "CQE error - vendor syndrome: 0x%x syndrome: 0x%x\n",
						       ring);
			       cqe_err->vendor_err_syndrome,
			       cqe_err->syndrome);
		}


		/* Skip over last polled CQE */
		/* Skip over last polled CQE */
		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;
		new_index = be16_to_cpu(cqe->wqe_index) & size_mask;
+5 −0
Original line number Original line Diff line number Diff line
@@ -271,6 +271,10 @@ struct mlx4_en_page_cache {
	} buf[MLX4_EN_CACHE_SIZE];
	} buf[MLX4_EN_CACHE_SIZE];
};
};


enum {
	MLX4_EN_TX_RING_STATE_RECOVERING,
};

struct mlx4_en_priv;
struct mlx4_en_priv;


struct mlx4_en_tx_ring {
struct mlx4_en_tx_ring {
@@ -317,6 +321,7 @@ struct mlx4_en_tx_ring {
	 * Only queue_stopped might be used if BQL is not properly working.
	 * Only queue_stopped might be used if BQL is not properly working.
	 */
	 */
	unsigned long		queue_stopped;
	unsigned long		queue_stopped;
	unsigned long		state;
	struct mlx4_hwq_resources sp_wqres;
	struct mlx4_hwq_resources sp_wqres;
	struct mlx4_qp		sp_qp;
	struct mlx4_qp		sp_qp;
	struct mlx4_qp_context	sp_context;
	struct mlx4_qp_context	sp_context;