Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 800c55cb authored by Netanel Belgazal, committed by David S. Miller
Browse files

net: ena: bug fix in lost tx packets detection mechanism



check_for_missing_tx_completions() is called from a timer
task and looks for lost tx packets.
The old implementation accumulated all the lost tx packets
and did not check whether those packets were retrieved at a later stage.
This led to a situation where the driver reset
the device for no reason.

Fixes: 1738cd3e ("Add a driver for Amazon Elastic Network Adapters (ENA)")
Signed-off-by: Netanel Belgazal <netanel@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent a2cc5198
Loading
Loading
Loading
Loading
+0 −1
Original line number Original line Diff line number Diff line
@@ -80,7 +80,6 @@ static const struct ena_stats ena_stats_tx_strings[] = {
	ENA_STAT_TX_ENTRY(tx_poll),
	ENA_STAT_TX_ENTRY(tx_poll),
	ENA_STAT_TX_ENTRY(doorbells),
	ENA_STAT_TX_ENTRY(doorbells),
	ENA_STAT_TX_ENTRY(prepare_ctx_err),
	ENA_STAT_TX_ENTRY(prepare_ctx_err),
	ENA_STAT_TX_ENTRY(missing_tx_comp),
	ENA_STAT_TX_ENTRY(bad_req_id),
	ENA_STAT_TX_ENTRY(bad_req_id),
};
};


+38 −28
Original line number Original line Diff line number Diff line
@@ -1995,6 +1995,7 @@ static netdev_tx_t ena_start_xmit(struct sk_buff *skb, struct net_device *dev)


	tx_info->tx_descs = nb_hw_desc;
	tx_info->tx_descs = nb_hw_desc;
	tx_info->last_jiffies = jiffies;
	tx_info->last_jiffies = jiffies;
	tx_info->print_once = 0;


	tx_ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use,
	tx_ring->next_to_use = ENA_TX_RING_IDX_NEXT(next_to_use,
		tx_ring->ring_size);
		tx_ring->ring_size);
@@ -2564,13 +2565,44 @@ static void ena_fw_reset_device(struct work_struct *work)
		"Reset attempt failed. Can not reset the device\n");
		"Reset attempt failed. Can not reset the device\n");
}
}


static void check_for_missing_tx_completions(struct ena_adapter *adapter)
static int check_missing_comp_in_queue(struct ena_adapter *adapter,
				       struct ena_ring *tx_ring)
{
{
	struct ena_tx_buffer *tx_buf;
	struct ena_tx_buffer *tx_buf;
	unsigned long last_jiffies;
	unsigned long last_jiffies;
	u32 missed_tx = 0;
	int i;

	for (i = 0; i < tx_ring->ring_size; i++) {
		tx_buf = &tx_ring->tx_buffer_info[i];
		last_jiffies = tx_buf->last_jiffies;
		if (unlikely(last_jiffies &&
			     time_is_before_jiffies(last_jiffies + TX_TIMEOUT))) {
			if (!tx_buf->print_once)
				netif_notice(adapter, tx_err, adapter->netdev,
					     "Found a Tx that wasn't completed on time, qid %d, index %d.\n",
					     tx_ring->qid, i);

			tx_buf->print_once = 1;
			missed_tx++;

			if (unlikely(missed_tx > MAX_NUM_OF_TIMEOUTED_PACKETS)) {
				netif_err(adapter, tx_err, adapter->netdev,
					  "The number of lost tx completions is above the threshold (%d > %d). Reset the device\n",
					  missed_tx, MAX_NUM_OF_TIMEOUTED_PACKETS);
				set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
				return -EIO;
			}
		}
	}

	return 0;
}

static void check_for_missing_tx_completions(struct ena_adapter *adapter)
{
	struct ena_ring *tx_ring;
	struct ena_ring *tx_ring;
	int i, j, budget;
	int i, budget, rc;
	u32 missed_tx;


	/* Make sure the driver doesn't turn the device in other process */
	/* Make sure the driver doesn't turn the device in other process */
	smp_rmb();
	smp_rmb();
@@ -2586,31 +2618,9 @@ static void check_for_missing_tx_completions(struct ena_adapter *adapter)
	for (i = adapter->last_monitored_tx_qid; i < adapter->num_queues; i++) {
	for (i = adapter->last_monitored_tx_qid; i < adapter->num_queues; i++) {
		tx_ring = &adapter->tx_ring[i];
		tx_ring = &adapter->tx_ring[i];


		for (j = 0; j < tx_ring->ring_size; j++) {
		rc = check_missing_comp_in_queue(adapter, tx_ring);
			tx_buf = &tx_ring->tx_buffer_info[j];
		if (unlikely(rc))
			last_jiffies = tx_buf->last_jiffies;
			return;
			if (unlikely(last_jiffies && time_is_before_jiffies(last_jiffies + TX_TIMEOUT))) {
				netif_notice(adapter, tx_err, adapter->netdev,
					     "Found a Tx that wasn't completed on time, qid %d, index %d.\n",
					     tx_ring->qid, j);

				u64_stats_update_begin(&tx_ring->syncp);
				missed_tx = tx_ring->tx_stats.missing_tx_comp++;
				u64_stats_update_end(&tx_ring->syncp);

				/* Clear last jiffies so the lost buffer won't
				 * be counted twice.
				 */
				tx_buf->last_jiffies = 0;

				if (unlikely(missed_tx > MAX_NUM_OF_TIMEOUTED_PACKETS)) {
					netif_err(adapter, tx_err, adapter->netdev,
						  "The number of lost tx completion is above the threshold (%d > %d). Reset the device\n",
						  missed_tx, MAX_NUM_OF_TIMEOUTED_PACKETS);
					set_bit(ENA_FLAG_TRIGGER_RESET, &adapter->flags);
				}
			}
		}


		budget--;
		budget--;
		if (!budget)
		if (!budget)
+12 −2
Original line number Original line Diff line number Diff line
@@ -146,7 +146,18 @@ struct ena_tx_buffer {
	u32 tx_descs;
	u32 tx_descs;
	/* num of buffers used by this skb */
	/* num of buffers used by this skb */
	u32 num_of_bufs;
	u32 num_of_bufs;
	/* Save the last jiffies to detect missing tx packets */

	/* Used to limit the number of prints when detecting missing tx packets */
	u32 print_once;
	/* Save the last jiffies to detect missing tx packets
	 *
	 * Set to a non-zero value in ena_start_xmit and set to zero in
	 * napi and timer_service_routine.
	 *
	 * While this value is not protected by a lock,
	 * a given packet is not expected to be handled by ena_start_xmit
	 * and by napi/timer_service at the same time.
	 */
	unsigned long last_jiffies;
	unsigned long last_jiffies;
	struct ena_com_buf bufs[ENA_PKT_MAX_BUFS];
	struct ena_com_buf bufs[ENA_PKT_MAX_BUFS];
} ____cacheline_aligned;
} ____cacheline_aligned;
@@ -170,7 +181,6 @@ struct ena_stats_tx {
	u64 napi_comp;
	u64 napi_comp;
	u64 tx_poll;
	u64 tx_poll;
	u64 doorbells;
	u64 doorbells;
	u64 missing_tx_comp;
	u64 bad_req_id;
	u64 bad_req_id;
};
};