Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 67e303e0 authored by VSR Burru, committed by David S. Miller
Browse files

liquidio: improve UDP TX performance



Improve UDP TX performance by:
* reducing the ring size from 2K to 512
* replacing the numerous streaming DMA allocations for info buffers and
  gather lists with one large consistent DMA allocation per ring

BQL is not effective here.  We reduced the ring size because dma_map_single
incurs heavy overhead every so often.  With iommu=on, dma_map_single in the
PF Tx data path was taking a long time (~700 usec) for every ~250 packets.
Debugging the intel_iommu code showed that the PF driver was using too many
static IO virtual address mapping entries (for gather list entries and info
buffers): about 100K entries for two PFs, each using 8 rings.  Also, finding
an empty entry (in the rbtree of the device domain's iova mapping in the
kernel) during the Tx path becomes a bottleneck every so often; the loop that
finds the empty entry goes through over 40K iterations, which is too costly
and was the major overhead.  Overhead is low when this loop exits quickly.

Netperf benchmark numbers before and after patch:

PF UDP TX
+--------+--------+------------+------------+---------+
|        |        |  Before    |  After     |         |
| Number |        |  Patch     |  Patch     |         |
|  of    | Packet | Throughput | Throughput | Percent |
| Flows  |  Size  |  (Gbps)    |  (Gbps)    | Change  |
+--------+--------+------------+------------+---------+
|        |   360  |   0.52     |   0.93     |  +78.9  |
|   1    |  1024  |   1.62     |   2.84     |  +75.3  |
|        |  1518  |   2.44     |   4.21     |  +72.5  |
+--------+--------+------------+------------+---------+
|        |   360  |   0.45     |   1.59     | +253.3  |
|   4    |  1024  |   1.34     |   5.48     | +308.9  |
|        |  1518  |   2.27     |   8.31     | +266.1  |
+--------+--------+------------+------------+---------+
|        |   360  |   0.40     |   1.61     | +302.5  |
|   8    |  1024  |   1.64     |   4.24     | +158.5  |
|        |  1518  |   2.87     |   6.52     | +127.2  |
+--------+--------+------------+------------+---------+

VF UDP TX
+--------+--------+------------+------------+---------+
|        |        |  Before    |  After     |         |
| Number |        |  Patch     |  Patch     |         |
|  of    | Packet | Throughput | Throughput | Percent |
| Flows  |  Size  |  (Gbps)    |  (Gbps)    | Change  |
+--------+--------+------------+------------+---------+
|        |   360  |   1.28     |   1.49     |  +16.4  |
|   1    |  1024  |   4.44     |   4.39     |   -1.1  |
|        |  1518  |   6.08     |   6.51     |   +7.1  |
+--------+--------+------------+------------+---------+
|        |   360  |   2.35     |   2.35     |    0.0  |
|   4    |  1024  |   6.41     |   8.07     |  +25.9  |
|        |  1518  |   9.56     |   9.54     |   -0.2  |
+--------+--------+------------+------------+---------+
|        |   360  |   3.41     |   3.65     |   +7.0  |
|   8    |  1024  |   9.35     |   9.34     |   -0.1  |
|        |  1518  |   9.56     |   9.57     |   +0.1  |
+--------+--------+------------+------------+---------+

Signed-off-by: VSR Burru <veerasenareddy.burru@cavium.com>
Signed-off-by: Felix Manlunas <felix.manlunas@cavium.com>
Signed-off-by: Derek Chickles <derek.chickles@cavium.com>
Signed-off-by: Raghu Vatsavayi <raghu.vatsavayi@cavium.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 5be083ce
Loading
Loading
Loading
Loading
+55 −55
Original line number Diff line number Diff line
@@ -152,7 +152,7 @@ struct octnic_gather {
	 */
	struct octeon_sg_entry *sg;

	u64 sg_dma_ptr;
	dma_addr_t sg_dma_ptr;
};

struct handshake {
@@ -734,6 +734,9 @@ static void delete_glists(struct lio *lio)
	struct octnic_gather *g;
	int i;

	kfree(lio->glist_lock);
	lio->glist_lock = NULL;

	if (!lio->glist)
		return;

@@ -741,23 +744,26 @@ static void delete_glists(struct lio *lio)
		do {
			g = (struct octnic_gather *)
				list_delete_head(&lio->glist[i]);
			if (g) {
				if (g->sg) {
					dma_unmap_single(&lio->oct_dev->
							 pci_dev->dev,
							 g->sg_dma_ptr,
							 g->sg_size,
							 DMA_TO_DEVICE);
					kfree((void *)((unsigned long)g->sg -
						       g->adjust));
				}
			if (g)
				kfree(g);
			}
		} while (g);

		if (lio->glists_virt_base && lio->glists_virt_base[i]) {
			lio_dma_free(lio->oct_dev,
				     lio->glist_entry_size * lio->tx_qsize,
				     lio->glists_virt_base[i],
				     lio->glists_dma_base[i]);
		}
	}

	kfree((void *)lio->glist);
	kfree((void *)lio->glist_lock);
	kfree(lio->glists_virt_base);
	lio->glists_virt_base = NULL;

	kfree(lio->glists_dma_base);
	lio->glists_dma_base = NULL;

	kfree(lio->glist);
	lio->glist = NULL;
}

/**
@@ -772,13 +778,30 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)
	lio->glist_lock = kcalloc(num_iqs, sizeof(*lio->glist_lock),
				  GFP_KERNEL);
	if (!lio->glist_lock)
		return 1;
		return -ENOMEM;

	lio->glist = kcalloc(num_iqs, sizeof(*lio->glist),
			     GFP_KERNEL);
	if (!lio->glist) {
		kfree((void *)lio->glist_lock);
		return 1;
		kfree(lio->glist_lock);
		lio->glist_lock = NULL;
		return -ENOMEM;
	}

	lio->glist_entry_size =
		ROUNDUP8((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE);

	/* allocate memory to store virtual and dma base address of
	 * per glist consistent memory
	 */
	lio->glists_virt_base = kcalloc(num_iqs, sizeof(*lio->glists_virt_base),
					GFP_KERNEL);
	lio->glists_dma_base = kcalloc(num_iqs, sizeof(*lio->glists_dma_base),
				       GFP_KERNEL);

	if (!lio->glists_virt_base || !lio->glists_dma_base) {
		delete_glists(lio);
		return -ENOMEM;
	}

	for (i = 0; i < num_iqs; i++) {
@@ -788,6 +811,16 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)

		INIT_LIST_HEAD(&lio->glist[i]);

		lio->glists_virt_base[i] =
			lio_dma_alloc(oct,
				      lio->glist_entry_size * lio->tx_qsize,
				      &lio->glists_dma_base[i]);

		if (!lio->glists_virt_base[i]) {
			delete_glists(lio);
			return -ENOMEM;
		}

		for (j = 0; j < lio->tx_qsize; j++) {
			g = kzalloc_node(sizeof(*g), GFP_KERNEL,
					 numa_node);
@@ -796,43 +829,18 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)
			if (!g)
				break;

			g->sg_size = ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) *
				      OCT_SG_ENTRY_SIZE);
			g->sg = lio->glists_virt_base[i] +
				(j * lio->glist_entry_size);

			g->sg = kmalloc_node(g->sg_size + 8,
					     GFP_KERNEL, numa_node);
			if (!g->sg)
				g->sg = kmalloc(g->sg_size + 8, GFP_KERNEL);
			if (!g->sg) {
				kfree(g);
				break;
			}

			/* The gather component should be aligned on 64-bit
			 * boundary
			 */
			if (((unsigned long)g->sg) & 7) {
				g->adjust = 8 - (((unsigned long)g->sg) & 7);
				g->sg = (struct octeon_sg_entry *)
					((unsigned long)g->sg + g->adjust);
			}
			g->sg_dma_ptr = dma_map_single(&oct->pci_dev->dev,
						       g->sg, g->sg_size,
						       DMA_TO_DEVICE);
			if (dma_mapping_error(&oct->pci_dev->dev,
					      g->sg_dma_ptr)) {
				kfree((void *)((unsigned long)g->sg -
					       g->adjust));
				kfree(g);
				break;
			}
			g->sg_dma_ptr = lio->glists_dma_base[i] +
					(j * lio->glist_entry_size);

			list_add_tail(&g->list, &lio->glist[i]);
		}

		if (j != lio->tx_qsize) {
			delete_glists(lio);
			return 1;
			return -ENOMEM;
		}
	}

@@ -1885,9 +1893,6 @@ static void free_netsgbuf(void *buf)
		i++;
	}

	dma_sync_single_for_cpu(&lio->oct_dev->pci_dev->dev,
				g->sg_dma_ptr, g->sg_size, DMA_TO_DEVICE);

	iq = skb_iq(lio, skb);
	spin_lock(&lio->glist_lock[iq]);
	list_add_tail(&g->list, &lio->glist[iq]);
@@ -1933,9 +1938,6 @@ static void free_netsgbuf_with_resp(void *buf)
		i++;
	}

	dma_sync_single_for_cpu(&lio->oct_dev->pci_dev->dev,
				g->sg_dma_ptr, g->sg_size, DMA_TO_DEVICE);

	iq = skb_iq(lio, skb);

	spin_lock(&lio->glist_lock[iq]);
@@ -3273,8 +3275,6 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev)
			i++;
		}

		dma_sync_single_for_device(&oct->pci_dev->dev, g->sg_dma_ptr,
					   g->sg_size, DMA_TO_DEVICE);
		dptr = g->sg_dma_ptr;

		if (OCTEON_CN23XX_PF(oct))
+55 −49
Original line number Diff line number Diff line
@@ -108,6 +108,8 @@ struct octnic_gather {
	 * received from the IP layer.
	 */
	struct octeon_sg_entry *sg;

	dma_addr_t sg_dma_ptr;
};

struct octeon_device_priv {
@@ -490,6 +492,9 @@ static void delete_glists(struct lio *lio)
	struct octnic_gather *g;
	int i;

	kfree(lio->glist_lock);
	lio->glist_lock = NULL;

	if (!lio->glist)
		return;

@@ -497,17 +502,26 @@ static void delete_glists(struct lio *lio)
		do {
			g = (struct octnic_gather *)
			    list_delete_head(&lio->glist[i]);
			if (g) {
				if (g->sg)
					kfree((void *)((unsigned long)g->sg -
							g->adjust));
			if (g)
				kfree(g);
			}
		} while (g);

		if (lio->glists_virt_base && lio->glists_virt_base[i]) {
			lio_dma_free(lio->oct_dev,
				     lio->glist_entry_size * lio->tx_qsize,
				     lio->glists_virt_base[i],
				     lio->glists_dma_base[i]);
		}
	}

	kfree(lio->glists_virt_base);
	lio->glists_virt_base = NULL;

	kfree(lio->glists_dma_base);
	lio->glists_dma_base = NULL;

	kfree(lio->glist);
	kfree(lio->glist_lock);
	lio->glist = NULL;
}

/**
@@ -522,13 +536,30 @@ static int setup_glists(struct lio *lio, int num_iqs)
	lio->glist_lock =
	    kzalloc(sizeof(*lio->glist_lock) * num_iqs, GFP_KERNEL);
	if (!lio->glist_lock)
		return 1;
		return -ENOMEM;

	lio->glist =
	    kzalloc(sizeof(*lio->glist) * num_iqs, GFP_KERNEL);
	if (!lio->glist) {
		kfree(lio->glist_lock);
		return 1;
		lio->glist_lock = NULL;
		return -ENOMEM;
	}

	lio->glist_entry_size =
		ROUNDUP8((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE);

	/* allocate memory to store virtual and dma base address of
	 * per glist consistent memory
	 */
	lio->glists_virt_base = kcalloc(num_iqs, sizeof(*lio->glists_virt_base),
					GFP_KERNEL);
	lio->glists_dma_base = kcalloc(num_iqs, sizeof(*lio->glists_dma_base),
				       GFP_KERNEL);

	if (!lio->glists_virt_base || !lio->glists_dma_base) {
		delete_glists(lio);
		return -ENOMEM;
	}

	for (i = 0; i < num_iqs; i++) {
@@ -536,34 +567,33 @@ static int setup_glists(struct lio *lio, int num_iqs)

		INIT_LIST_HEAD(&lio->glist[i]);

		lio->glists_virt_base[i] =
			lio_dma_alloc(lio->oct_dev,
				      lio->glist_entry_size * lio->tx_qsize,
				      &lio->glists_dma_base[i]);

		if (!lio->glists_virt_base[i]) {
			delete_glists(lio);
			return -ENOMEM;
		}

		for (j = 0; j < lio->tx_qsize; j++) {
			g = kzalloc(sizeof(*g), GFP_KERNEL);
			if (!g)
				break;

			g->sg_size = ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) *
				      OCT_SG_ENTRY_SIZE);
			g->sg = lio->glists_virt_base[i] +
				(j * lio->glist_entry_size);

			g->sg = kmalloc(g->sg_size + 8, GFP_KERNEL);
			if (!g->sg) {
				kfree(g);
				break;
			}
			g->sg_dma_ptr = lio->glists_dma_base[i] +
					(j * lio->glist_entry_size);

			/* The gather component should be aligned on 64-bit
			 * boundary
			 */
			if (((unsigned long)g->sg) & 7) {
				g->adjust = 8 - (((unsigned long)g->sg) & 7);
				g->sg = (struct octeon_sg_entry *)
					((unsigned long)g->sg + g->adjust);
			}
			list_add_tail(&g->list, &lio->glist[i]);
		}

		if (j != lio->tx_qsize) {
			delete_glists(lio);
			return 1;
			return -ENOMEM;
		}
	}

@@ -1324,10 +1354,6 @@ static void free_netsgbuf(void *buf)
		i++;
	}

	dma_unmap_single(&lio->oct_dev->pci_dev->dev,
			 finfo->dptr, g->sg_size,
			 DMA_TO_DEVICE);

	iq = skb_iq(lio, skb);

	spin_lock(&lio->glist_lock[iq]);
@@ -1374,10 +1400,6 @@ static void free_netsgbuf_with_resp(void *buf)
		i++;
	}

	dma_unmap_single(&lio->oct_dev->pci_dev->dev,
			 finfo->dptr, g->sg_size,
			 DMA_TO_DEVICE);

	iq = skb_iq(lio, skb);

	spin_lock(&lio->glist_lock[iq]);
@@ -2382,23 +2404,7 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev)
			i++;
		}

		dptr = dma_map_single(&oct->pci_dev->dev,
				      g->sg, g->sg_size,
				      DMA_TO_DEVICE);
		if (dma_mapping_error(&oct->pci_dev->dev, dptr)) {
			dev_err(&oct->pci_dev->dev, "%s DMA mapping error 4\n",
				__func__);
			dma_unmap_single(&oct->pci_dev->dev, g->sg[0].ptr[0],
					 skb->len - skb->data_len,
					 DMA_TO_DEVICE);
			for (j = 1; j <= frags; j++) {
				frag = &skb_shinfo(skb)->frags[j - 1];
				dma_unmap_page(&oct->pci_dev->dev,
					       g->sg[j >> 2].ptr[j & 3],
					       frag->size, DMA_TO_DEVICE);
			}
			return NETDEV_TX_BUSY;
		}
		dptr = g->sg_dma_ptr;

		ndata.cmd.cmd3.dptr = dptr;
		finfo->dptr = dptr;
+3 −3
Original line number Diff line number Diff line
@@ -71,17 +71,17 @@
#define   CN23XX_MAX_RINGS_PER_VF          8

#define   CN23XX_MAX_INPUT_QUEUES	CN23XX_MAX_RINGS_PER_PF
#define   CN23XX_MAX_IQ_DESCRIPTORS	2048
#define   CN23XX_MAX_IQ_DESCRIPTORS	512
#define   CN23XX_DB_MIN                 1
#define   CN23XX_DB_MAX                 8
#define   CN23XX_DB_TIMEOUT             1

#define   CN23XX_MAX_OUTPUT_QUEUES	CN23XX_MAX_RINGS_PER_PF
#define   CN23XX_MAX_OQ_DESCRIPTORS	2048
#define   CN23XX_MAX_OQ_DESCRIPTORS	512
#define   CN23XX_OQ_BUF_SIZE		1536
#define   CN23XX_OQ_PKTSPER_INTR	128
/*#define CAVIUM_ONLY_CN23XX_RX_PERF*/
#define   CN23XX_OQ_REFIL_THRESHOLD	128
#define   CN23XX_OQ_REFIL_THRESHOLD	16

#define   CN23XX_OQ_INTR_PKT		64
#define   CN23XX_OQ_INTR_TIME		100
+2 −15
Original line number Diff line number Diff line
@@ -155,11 +155,6 @@ octeon_droq_destroy_ring_buffers(struct octeon_device *oct,
			recv_buffer_destroy(droq->recv_buf_list[i].buffer,
					    pg_info);

		if (droq->desc_ring && droq->desc_ring[i].info_ptr)
			lio_unmap_ring_info(oct->pci_dev,
					    (u64)droq->
					    desc_ring[i].info_ptr,
					    OCT_DROQ_INFO_SIZE);
		droq->recv_buf_list[i].buffer = NULL;
	}

@@ -211,10 +206,7 @@ int octeon_delete_droq(struct octeon_device *oct, u32 q_no)
	vfree(droq->recv_buf_list);

	if (droq->info_base_addr)
		cnnic_free_aligned_dma(oct->pci_dev, droq->info_list,
				       droq->info_alloc_size,
				       droq->info_base_addr,
				       droq->info_list_dma);
		lio_free_info_buffer(oct, droq);

	if (droq->desc_ring)
		lio_dma_free(oct, (droq->max_count * OCT_DROQ_DESC_SIZE),
@@ -294,12 +286,7 @@ int octeon_init_droq(struct octeon_device *oct,
	dev_dbg(&oct->pci_dev->dev, "droq[%d]: num_desc: %d\n", q_no,
		droq->max_count);

	droq->info_list =
		cnnic_numa_alloc_aligned_dma((droq->max_count *
					      OCT_DROQ_INFO_SIZE),
					     &droq->info_alloc_size,
					     &droq->info_base_addr,
					     numa_node);
	droq->info_list = lio_alloc_info_buffer(oct, droq);
	if (!droq->info_list) {
		dev_err(&oct->pci_dev->dev, "Cannot allocate memory for info list.\n");
		lio_dma_free(oct, (droq->max_count * OCT_DROQ_DESC_SIZE),
+2 −2
Original line number Diff line number Diff line
@@ -325,10 +325,10 @@ struct octeon_droq {
	size_t desc_ring_dma;

	/** Info ptr list are allocated at this virtual address. */
	size_t info_base_addr;
	void *info_base_addr;

	/** DMA mapped address of the info list */
	size_t info_list_dma;
	dma_addr_t info_list_dma;

	/** Allocated size of info list. */
	u32 info_alloc_size;
Loading