
Commit 34db548b authored by Eric Dumazet, committed by David S. Miller

mlx4: add page recycling in receive path



Same technique as used by some Intel drivers, for arches where PAGE_SIZE = 4096

In most cases, pages are reused because they were consumed
before we could loop around the RX ring.

This brings back performance, and is even better:
a single TCP flow reaches 30Gbit on my hosts.

v2: added a full memset() in mlx4_en_free_frag(), as Tariq found it is needed
when switching to a large MTU, since priv->log_rx_info can change dynamically.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Tariq Toukan <tariqt@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent b5a54d9a
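
The technique described in the commit message boils down to two RX fragments of PAGE_SIZE / 2 sharing one page, with a fragment's half flipped back into the ring whenever the stack has already released it. A minimal sketch of that reuse decision, condensed from the mlx4_en_complete_rx_desc() hunk further down; the helper name is illustrative and is not part of the patch:

#include <linux/mm.h>
#include <linux/topology.h>

/* Sketch only: can this RX page be flipped to its other half and reused?
 * Mirrors the frag_stride == PAGE_SIZE / 2 case in
 * mlx4_en_complete_rx_desc().
 */
static bool mlx4_rx_page_reusable(struct page *page)
{
	/* Reuse only if the stack has already dropped its reference
	 * (refcount back to 1), the page did not come from the emergency
	 * reserves, and it sits on the local NUMA node.
	 */
	return page_count(page) == 1 &&
	       !page_is_pfmemalloc(page) &&
	       page_to_nid(page) == numa_mem_id();
}

When the check passes, the driver flips page_offset by PAGE_SIZE / 2 and takes an extra reference with page_ref_inc(); otherwise it unmaps the page and lets the refill path allocate a fresh one.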
drivers/net/ethernet/mellanox/mlx4/en_rx.c +82 −176
@@ -50,9 +50,8 @@

#include "mlx4_en.h"

static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
			    struct mlx4_en_rx_alloc *page_alloc,
			    const struct mlx4_en_frag_info *frag_info,
static int mlx4_alloc_page(struct mlx4_en_priv *priv,
			   struct mlx4_en_rx_alloc *frag,
			   gfp_t gfp)
{
	struct page *page;
@@ -63,145 +62,46 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
		return -ENOMEM;
	dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE, priv->dma_dir);
	if (unlikely(dma_mapping_error(priv->ddev, dma))) {
		put_page(page);
		__free_page(page);
		return -ENOMEM;
	}
	page_alloc->page = page;
	page_alloc->dma = dma;
	page_alloc->page_offset = 0;
	/* Not doing get_page() for each frag is a big win
	 * on asymetric workloads. Note we can not use atomic_set().
	 */
	page_ref_add(page, PAGE_SIZE / frag_info->frag_stride - 1);
	frag->page = page;
	frag->dma = dma;
	frag->page_offset = priv->rx_headroom;
	return 0;
}

static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
			       struct mlx4_en_rx_desc *rx_desc,
			       struct mlx4_en_rx_alloc *frags,
			       struct mlx4_en_rx_alloc *ring_alloc,
			       gfp_t gfp)
{
	struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
	const struct mlx4_en_frag_info *frag_info;
	struct page *page;
	int i;

	for (i = 0; i < priv->num_frags; i++) {
		frag_info = &priv->frag_info[i];
		page_alloc[i] = ring_alloc[i];
		page_alloc[i].page_offset += frag_info->frag_stride;

		if (page_alloc[i].page_offset + frag_info->frag_stride <=
		    PAGE_SIZE)
			continue;

		if (unlikely(mlx4_alloc_pages(priv, &page_alloc[i],
					      frag_info, gfp)))
			goto out;
	}

	for (i = 0; i < priv->num_frags; i++) {
		frags[i] = ring_alloc[i];
		frags[i].page_offset += priv->rx_headroom;
		rx_desc->data[i].addr = cpu_to_be64(frags[i].dma +
						    frags[i].page_offset);
		ring_alloc[i] = page_alloc[i];
	}

	return 0;

out:
	while (i--) {
		if (page_alloc[i].page != ring_alloc[i].page) {
			dma_unmap_page(priv->ddev, page_alloc[i].dma,
				       PAGE_SIZE, priv->dma_dir);
			page = page_alloc[i].page;
			/* Revert changes done by mlx4_alloc_pages */
			page_ref_sub(page, PAGE_SIZE /
					   priv->frag_info[i].frag_stride - 1);
			put_page(page);
		}
	}
	for (i = 0; i < priv->num_frags; i++, frags++) {
		if (!frags->page && mlx4_alloc_page(priv, frags, gfp))
			return -ENOMEM;
}

static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
			      struct mlx4_en_rx_alloc *frags,
			      int i)
{
	const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
	u32 next_frag_end = frags[i].page_offset + 2 * frag_info->frag_stride;


	if (next_frag_end > PAGE_SIZE)
		dma_unmap_page(priv->ddev, frags[i].dma, PAGE_SIZE,
			       priv->dma_dir);

	if (frags[i].page)
		put_page(frags[i].page);
}

static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
				  struct mlx4_en_rx_ring *ring)
{
	int i;
	struct mlx4_en_rx_alloc *page_alloc;

	for (i = 0; i < priv->num_frags; i++) {
		const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];

		if (mlx4_alloc_pages(priv, &ring->page_alloc[i],
				     frag_info, GFP_KERNEL | __GFP_COLD))
			goto out;

		en_dbg(DRV, priv, "  frag %d allocator: - frags:%d\n",
		       i, page_ref_count(ring->page_alloc[i].page));
		rx_desc->data[i].addr = cpu_to_be64(frags->dma +
						    frags->page_offset);
	}
	return 0;

out:
	while (i--) {
		struct page *page;

		page_alloc = &ring->page_alloc[i];
		dma_unmap_page(priv->ddev, page_alloc->dma,
			       PAGE_SIZE, priv->dma_dir);
		page = page_alloc->page;
		/* Revert changes done by mlx4_alloc_pages */
		page_ref_sub(page, PAGE_SIZE /
				   priv->frag_info[i].frag_stride - 1);
		put_page(page);
		page_alloc->page = NULL;
	}
	return -ENOMEM;
}

static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
				      struct mlx4_en_rx_ring *ring)
static void mlx4_en_free_frag(const struct mlx4_en_priv *priv,
			      struct mlx4_en_rx_alloc *frag)
{
	struct mlx4_en_rx_alloc *page_alloc;
	int i;

	for (i = 0; i < priv->num_frags; i++) {
		const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];

		page_alloc = &ring->page_alloc[i];
		en_dbg(DRV, priv, "Freeing allocator:%d count:%d\n",
		       i, page_count(page_alloc->page));

		dma_unmap_page(priv->ddev, page_alloc->dma,
	if (frag->page) {
		dma_unmap_page(priv->ddev, frag->dma,
			       PAGE_SIZE, priv->dma_dir);
		while (page_alloc->page_offset + frag_info->frag_stride <
		       PAGE_SIZE) {
			put_page(page_alloc->page);
			page_alloc->page_offset += frag_info->frag_stride;
		}
		page_alloc->page = NULL;
		__free_page(frag->page);
	}
	/* We need to clear all fields, otherwise a change of priv->log_rx_info
	 * could lead to see garbage later in frag->page.
	 */
	memset(frag, 0, sizeof(*frag));
}

static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv,
static void mlx4_en_init_rx_desc(const struct mlx4_en_priv *priv,
				 struct mlx4_en_rx_ring *ring, int index)
{
	struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
@@ -235,19 +135,22 @@ static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
					(index << priv->log_rx_info);

	if (ring->page_cache.index > 0) {
		/* XDP uses a single page per frame */
		if (!frags->page) {
			ring->page_cache.index--;
		frags[0].page = ring->page_cache.buf[ring->page_cache.index].page;
		frags[0].dma  = ring->page_cache.buf[ring->page_cache.index].dma;
		frags[0].page_offset = XDP_PACKET_HEADROOM;
		rx_desc->data[0].addr = cpu_to_be64(frags[0].dma +
						    frags[0].page_offset);
			frags->page = ring->page_cache.buf[ring->page_cache.index].page;
			frags->dma  = ring->page_cache.buf[ring->page_cache.index].dma;
		}
		frags->page_offset = XDP_PACKET_HEADROOM;
		rx_desc->data[0].addr = cpu_to_be64(frags->dma +
						    XDP_PACKET_HEADROOM);
		return 0;
	}

	return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc, gfp);
	return mlx4_en_alloc_frags(priv, rx_desc, frags, gfp);
}

static inline bool mlx4_en_is_ring_empty(struct mlx4_en_rx_ring *ring)
static bool mlx4_en_is_ring_empty(const struct mlx4_en_rx_ring *ring)
{
	return ring->prod == ring->cons;
}
@@ -257,7 +160,8 @@ static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
	*ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff);
}

static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
/* slow path */
static void mlx4_en_free_rx_desc(const struct mlx4_en_priv *priv,
				 struct mlx4_en_rx_ring *ring,
				 int index)
{
@@ -267,7 +171,7 @@ static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
	frags = ring->rx_info + (index << priv->log_rx_info);
	for (nr = 0; nr < priv->num_frags; nr++) {
		en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
		mlx4_en_free_frag(priv, frags, nr);
		mlx4_en_free_frag(priv, frags + nr);
	}
}

@@ -323,12 +227,12 @@ static void mlx4_en_free_rx_buf(struct mlx4_en_priv *priv,
	       ring->cons, ring->prod);

	/* Unmap and free Rx buffers */
	while (!mlx4_en_is_ring_empty(ring)) {
		index = ring->cons & ring->size_mask;
	for (index = 0; index < ring->size; index++) {
		en_dbg(DRV, priv, "Processing descriptor:%d\n", index);
		mlx4_en_free_rx_desc(priv, ring, index);
		++ring->cons;
	}
	ring->cons = 0;
	ring->prod = 0;
}

void mlx4_en_set_num_rx_rings(struct mlx4_en_dev *mdev)
@@ -380,9 +284,9 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,

	tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
					sizeof(struct mlx4_en_rx_alloc));
	ring->rx_info = vmalloc_node(tmp, node);
	ring->rx_info = vzalloc_node(tmp, node);
	if (!ring->rx_info) {
		ring->rx_info = vmalloc(tmp);
		ring->rx_info = vzalloc(tmp);
		if (!ring->rx_info) {
			err = -ENOMEM;
			goto err_ring;
@@ -452,16 +356,6 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
		/* Initialize all descriptors */
		for (i = 0; i < ring->size; i++)
			mlx4_en_init_rx_desc(priv, ring, i);

		/* Initialize page allocators */
		err = mlx4_en_init_allocator(priv, ring);
		if (err) {
			en_err(priv, "Failed initializing ring allocator\n");
			if (ring->stride <= TXBB_SIZE)
				ring->buf -= TXBB_SIZE;
			ring_ind--;
			goto err_allocator;
		}
	}
	err = mlx4_en_fill_rx_buffers(priv);
	if (err)
@@ -481,11 +375,9 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
		mlx4_en_free_rx_buf(priv, priv->rx_ring[ring_ind]);

	ring_ind = priv->rx_ring_num - 1;
err_allocator:
	while (ring_ind >= 0) {
		if (priv->rx_ring[ring_ind]->stride <= TXBB_SIZE)
			priv->rx_ring[ring_ind]->buf -= TXBB_SIZE;
		mlx4_en_destroy_allocator(priv, priv->rx_ring[ring_ind]);
		ring_ind--;
	}
	return err;
@@ -565,50 +457,68 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
	mlx4_en_free_rx_buf(priv, ring);
	if (ring->stride <= TXBB_SIZE)
		ring->buf -= TXBB_SIZE;
	mlx4_en_destroy_allocator(priv, ring);
}


static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
				    struct mlx4_en_rx_desc *rx_desc,
				    struct mlx4_en_rx_alloc *frags,
				    struct sk_buff *skb,
				    int length)
{
	struct skb_frag_struct *skb_frags_rx = skb_shinfo(skb)->frags;
	struct mlx4_en_frag_info *frag_info = priv->frag_info;
	const struct mlx4_en_frag_info *frag_info = priv->frag_info;
	unsigned int truesize = 0;
	int nr, frag_size;
	struct page *page;
	dma_addr_t dma;
	bool release;

	/* Collect used fragments while replacing them in the HW descriptors */
	for (nr = 0;;) {
	for (nr = 0;; frags++) {
		frag_size = min_t(int, length, frag_info->frag_size);

		if (unlikely(!frags[nr].page))
		page = frags->page;
		if (unlikely(!page))
			goto fail;

		dma = be64_to_cpu(rx_desc->data[nr].addr);
		dma_sync_single_for_cpu(priv->ddev, dma, frag_info->frag_size,
					DMA_FROM_DEVICE);
		dma = frags->dma;
		dma_sync_single_range_for_cpu(priv->ddev, dma, frags->page_offset,
					      frag_size, priv->dma_dir);

		__skb_fill_page_desc(skb, nr, frags[nr].page,
				     frags[nr].page_offset,
		__skb_fill_page_desc(skb, nr, page, frags->page_offset,
				     frag_size);

		skb->truesize += frag_info->frag_stride;
		frags[nr].page = NULL;
		truesize += frag_info->frag_stride;
		if (frag_info->frag_stride == PAGE_SIZE / 2) {
			frags->page_offset ^= PAGE_SIZE / 2;
			release = page_count(page) != 1 ||
				  page_is_pfmemalloc(page) ||
				  page_to_nid(page) != numa_mem_id();
		} else {
			u32 sz_align = ALIGN(frag_size, SMP_CACHE_BYTES);

			frags->page_offset += sz_align;
			release = frags->page_offset + frag_info->frag_size > PAGE_SIZE;
		}
		if (release) {
			dma_unmap_page(priv->ddev, dma, PAGE_SIZE, priv->dma_dir);
			frags->page = NULL;
		} else {
			page_ref_inc(page);
		}

		nr++;
		length -= frag_size;
		if (!length)
			break;
		frag_info++;
	}
	skb->truesize += truesize;
	return nr;

fail:
	while (nr > 0) {
		nr--;
		__skb_frag_unref(&skb_frags_rx[nr]);
		__skb_frag_unref(skb_shinfo(skb)->frags + nr);
	}
	return 0;
}
@@ -639,7 +549,8 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
	if (length <= SMALL_PACKET_SIZE) {
		/* We are copying all relevant data to the skb - temporarily
		 * sync buffers for the copy */
		dma = be64_to_cpu(rx_desc->data[0].addr);

		dma = frags[0].dma + frags[0].page_offset;
		dma_sync_single_for_cpu(priv->ddev, dma, length,
					DMA_FROM_DEVICE);
		skb_copy_to_linear_data(skb, va, length);
@@ -648,7 +559,7 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
		unsigned int pull_len;

		/* Move relevant fragments to skb */
		used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, frags,
		used_frags = mlx4_en_complete_rx_desc(priv, frags,
							skb, length);
		if (unlikely(!used_frags)) {
			kfree_skb(skb);
@@ -916,8 +827,10 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
			case XDP_TX:
				if (likely(!mlx4_en_xmit_frame(ring, frags, dev,
							length, cq->ring,
							&doorbell_pending)))
					goto consumed;
							&doorbell_pending))) {
					frags[0].page = NULL;
					goto next;
				}
				trace_xdp_exception(dev, xdp_prog, act);
				goto xdp_drop_no_cnt; /* Drop on xmit failure */
			default:
@@ -927,8 +840,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
			case XDP_DROP:
				ring->xdp_drop++;
xdp_drop_no_cnt:
				if (likely(mlx4_en_rx_recycle(ring, frags)))
					goto consumed;
				goto next;
			}
		}
@@ -974,8 +885,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
			if (!gro_skb)
				goto next;

			nr = mlx4_en_complete_rx_desc(priv,
				rx_desc, frags, gro_skb,
			nr = mlx4_en_complete_rx_desc(priv, frags, gro_skb,
						      length);
			if (!nr)
				goto next;
@@ -1084,10 +994,6 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud

		napi_gro_receive(&cq->napi, skb);
next:
		for (nr = 0; nr < priv->num_frags; nr++)
			mlx4_en_free_frag(priv, frags, nr);

consumed:
		++cq->mcq.cons_index;
		index = (cq->mcq.cons_index) & ring->size_mask;
		cqe = mlx4_en_get_cqe(cq->buf, index, priv->cqe_size) + factor;
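
On the v2 note in the commit message: each descriptor's frag state lives inside ring->rx_info at a stride of (1 << priv->log_rx_info) bytes, and log_rx_info is derived from the number of frags, which changes with the MTU. The helper below is hypothetical and exists only to make the addressing explicit; it is not part of the driver.

#include "mlx4_en.h"

/* Illustration only: locate the frag array of descriptor `index`.
 * An MTU change alters num_frags and therefore log_rx_info, remapping
 * which bytes of rx_info belong to which descriptor; a stale non-zero
 * frag->page left behind would later be read as a live page.  Hence the
 * vzalloc() of rx_info and the full memset() in mlx4_en_free_frag().
 */
static struct mlx4_en_rx_alloc *
mlx4_rx_desc_frags(const struct mlx4_en_priv *priv,
		   const struct mlx4_en_rx_ring *ring, int index)
{
	return (struct mlx4_en_rx_alloc *)
	       (ring->rx_info + (index << priv->log_rx_info));
}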
drivers/net/ethernet/mellanox/mlx4/mlx4_en.h +0 −1
@@ -327,7 +327,6 @@ struct mlx4_en_rx_desc {

struct mlx4_en_rx_ring {
	struct mlx4_hwq_resources wqres;
	struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
	u32 size ;	/* number of Rx descs*/
	u32 actual_size;
	u32 size_mask;
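
On the XDP path, each frame owns a whole page, and pages handed back on XDP_DROP (or reclaimed from completed XDP_TX frames) are kept in the small per-ring page_cache so that refill can bypass the page allocator. Below is a condensed sketch of the cache pop performed in mlx4_en_prepare_rx_desc() above; the helper name is illustrative and not part of the patch.

#include <linux/bpf.h>
#include "mlx4_en.h"

/* Sketch: re-arm an XDP RX descriptor from the per-ring page cache.
 * Returns false when the cache is empty and the caller must fall back to
 * mlx4_alloc_page().
 */
static bool mlx4_xdp_refill_from_cache(struct mlx4_en_rx_ring *ring,
				       struct mlx4_en_rx_alloc *frag)
{
	if (!ring->page_cache.index)
		return false;

	if (!frag->page) {
		/* The descriptor lost its page (consumed or sent via
		 * XDP_TX): take a recycled one from the cache.
		 */
		ring->page_cache.index--;
		frag->page = ring->page_cache.buf[ring->page_cache.index].page;
		frag->dma  = ring->page_cache.buf[ring->page_cache.index].dma;
	}
	frag->page_offset = XDP_PACKET_HEADROOM;
	return true;
}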