
Commit bc7b3a36 authored by Shirley Ma, committed by Roland Dreier

IPoIB: Handle 4K IB MTU for UD (datagram) mode



This patch enables IPoIB to use 4K UD messages (when the underlying
device and fabric support a 4K MTU) by using two scatter buffers when
PAGE_SIZE is less than or equal to the HCA IB MTU size.  The first
buffer holds the GRH and the IPoIB header, and the second buffer holds
the IPoIB payload, which is 4K-4 bytes.
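
The arithmetic behind the split, as a standalone sketch (an
illustrative user-space program, not part of the patch; the constants
and macros mirror what this patch adds to ipoib.h):

/*
 * Illustrative user-space sketch (not kernel code): why a 4K IB MTU
 * needs two receive buffers when PAGE_SIZE is 4096.
 */
#include <stdio.h>

#define PAGE_SIZE	4096	/* assumed 4 KB pages */
#define IB_GRH_BYTES	40	/* InfiniBand Global Route Header */
#define IPOIB_ENCAP_LEN	4	/* IPoIB encapsulation header */

/* Mirrors the macros this patch adds to ipoib.h */
#define IPOIB_UD_MTU(ib_mtu)		((ib_mtu) - IPOIB_ENCAP_LEN)
#define IPOIB_UD_BUF_SIZE(ib_mtu)	((ib_mtu) + IB_GRH_BYTES)
#define IPOIB_UD_HEAD_SIZE		(IB_GRH_BYTES + IPOIB_ENCAP_LEN)

static int ipoib_ud_need_sg(unsigned int ib_mtu)
{
	/* A second scatter buffer is needed once GRH + packet overflows a page */
	return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;
}

int main(void)
{
	unsigned int ib_mtu = 4096;	/* 4K IB MTU */

	printf("rx buffer needed:    %u bytes\n", IPOIB_UD_BUF_SIZE(ib_mtu)); /* 4136 */
	printf("need scatter/gather: %d\n", ipoib_ud_need_sg(ib_mtu));        /* 1    */
	printf("head buffer:         %u bytes\n", IPOIB_UD_HEAD_SIZE);        /* 44   */
	printf("IP payload (MTU):    %u bytes\n", IPOIB_UD_MTU(ib_mtu));      /* 4092 = 4K-4 */
	return 0;
}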

Signed-off-by: Shirley Ma <xma@us.ibm.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
parent bc5698f3
drivers/infiniband/ulp/ipoib/ipoib.h  +16 −4
@@ -56,11 +56,11 @@
 /* constants */
 
 enum {
-	IPOIB_PACKET_SIZE	  = 2048,
-	IPOIB_BUF_SIZE		  = IPOIB_PACKET_SIZE + IB_GRH_BYTES,
-
 	IPOIB_ENCAP_LEN		  = 4,
 
+	IPOIB_UD_HEAD_SIZE	  = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
+	IPOIB_UD_RX_SG		  = 2, /* max buffer needed for 4K mtu */
+
 	IPOIB_CM_MTU		  = 0x10000 - 0x10, /* padding to align header to 16 */
 	IPOIB_CM_BUF_SIZE	  = IPOIB_CM_MTU  + IPOIB_ENCAP_LEN,
 	IPOIB_CM_HEAD_SIZE	  = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
@@ -139,7 +139,7 @@ struct ipoib_mcast {

 struct ipoib_rx_buf {
 	struct sk_buff *skb;
-	u64		mapping;
+	u64		mapping[IPOIB_UD_RX_SG];
 };
 
 struct ipoib_tx_buf {
@@ -294,6 +294,7 @@ struct ipoib_dev_priv {

 	unsigned int admin_mtu;
 	unsigned int mcast_mtu;
+	unsigned int max_ib_mtu;
 
 	struct ipoib_rx_buf *rx_ring;

@@ -305,6 +306,9 @@ struct ipoib_dev_priv {
 	struct ib_send_wr    tx_wr;
 	unsigned	     tx_outstanding;
 
+	struct ib_recv_wr    rx_wr;
+	struct ib_sge	     rx_sge[IPOIB_UD_RX_SG];
+
 	struct ib_wc ibwc[IPOIB_NUM_WC];
 
 	struct list_head dead_ahs;
@@ -366,6 +370,14 @@ struct ipoib_neigh {
 	struct list_head    list;
 };
 
+#define IPOIB_UD_MTU(ib_mtu)		(ib_mtu - IPOIB_ENCAP_LEN)
+#define IPOIB_UD_BUF_SIZE(ib_mtu)	(ib_mtu + IB_GRH_BYTES)
+
+static inline int ipoib_ud_need_sg(unsigned int ib_mtu)
+{
+	return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;
+}
+
 /*
  * We stash a pointer to our private neighbour information after our
  * hardware address in neigh->ha.  The ALIGN() expression here makes
drivers/infiniband/ulp/ipoib/ipoib_ib.c  +88 −37
@@ -89,28 +89,59 @@ void ipoib_free_ah(struct kref *kref)
 	spin_unlock_irqrestore(&priv->lock, flags);
 }
 
+static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv,
+				  u64 mapping[IPOIB_UD_RX_SG])
+{
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE,
+				    DMA_FROM_DEVICE);
+		ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE,
+				  DMA_FROM_DEVICE);
+	} else
+		ib_dma_unmap_single(priv->ca, mapping[0],
+				    IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
+				    DMA_FROM_DEVICE);
+}
+
+static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv,
+				   struct sk_buff *skb,
+				   unsigned int length)
+{
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
+		unsigned int size;
+		/*
+		 * There is only two buffers needed for max_payload = 4K,
+		 * first buf size is IPOIB_UD_HEAD_SIZE
+		 */
+		skb->tail += IPOIB_UD_HEAD_SIZE;
+		skb->len  += length;
+
+		size = length - IPOIB_UD_HEAD_SIZE;
+
+		frag->size     = size;
+		skb->data_len += size;
+		skb->truesize += size;
+	} else
+		skb_put(skb, length);
+
+}
+
 static int ipoib_ib_post_receive(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
-	struct ib_sge list;
-	struct ib_recv_wr param;
 	struct ib_recv_wr *bad_wr;
 	int ret;
 
-	list.addr     = priv->rx_ring[id].mapping;
-	list.length   = IPOIB_BUF_SIZE;
-	list.lkey     = priv->mr->lkey;
+	priv->rx_wr.wr_id   = id | IPOIB_OP_RECV;
+	priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0];
+	priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1];
 
-	param.next    = NULL;
-	param.wr_id   = id | IPOIB_OP_RECV;
-	param.sg_list = &list;
-	param.num_sge = 1;
-
-	ret = ib_post_recv(priv->qp, &param, &bad_wr);
+	ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
 	if (unlikely(ret)) {
 		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
-		ib_dma_unmap_single(priv->ca, priv->rx_ring[id].mapping,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping);
 		dev_kfree_skb_any(priv->rx_ring[id].skb);
 		priv->rx_ring[id].skb = NULL;
 	}
@@ -118,15 +149,21 @@ static int ipoib_ib_post_receive(struct net_device *dev, int id)
 	return ret;
 }
 
-static int ipoib_alloc_rx_skb(struct net_device *dev, int id)
+static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct sk_buff *skb;
-	u64 addr;
+	int buf_size;
+	u64 *mapping;
 
-	skb = dev_alloc_skb(IPOIB_BUF_SIZE + 4);
-	if (!skb)
-		return -ENOMEM;
+	if (ipoib_ud_need_sg(priv->max_ib_mtu))
+		buf_size = IPOIB_UD_HEAD_SIZE;
+	else
+		buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+
+	skb = dev_alloc_skb(buf_size + 4);
+	if (unlikely(!skb))
+		return NULL;
 
 	/*
 	 * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
@@ -135,17 +172,32 @@ static int ipoib_alloc_rx_skb(struct net_device *dev, int id)
 	 */
 	skb_reserve(skb, 4);
 
-	addr = ib_dma_map_single(priv->ca, skb->data, IPOIB_BUF_SIZE,
-				 DMA_FROM_DEVICE);
-	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
-		dev_kfree_skb_any(skb);
-		return -EIO;
-	}
+	mapping = priv->rx_ring[id].mapping;
+	mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
+				       DMA_FROM_DEVICE);
+	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
+		goto error;
 
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		struct page *page = alloc_page(GFP_ATOMIC);
+		if (!page)
+			goto partial_error;
+		skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
+		mapping[1] =
+			ib_dma_map_page(priv->ca, skb_shinfo(skb)->frags[0].page,
+					0, PAGE_SIZE, DMA_FROM_DEVICE);
+		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1])))
+			goto partial_error;
+	}
+
 	priv->rx_ring[id].skb = skb;
-	priv->rx_ring[id].mapping = addr;
-
-	return 0;
+	return skb;
+
+partial_error:
+	ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE);
+error:
+	dev_kfree_skb_any(skb);
+	return NULL;
 }

static int ipoib_ib_post_receives(struct net_device *dev)
@@ -154,7 +206,7 @@ static int ipoib_ib_post_receives(struct net_device *dev)
 	int i;
 
 	for (i = 0; i < ipoib_recvq_size; ++i) {
-		if (ipoib_alloc_rx_skb(dev, i)) {
+		if (!ipoib_alloc_rx_skb(dev, i)) {
 			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
 			return -ENOMEM;
 		}
@@ -172,7 +224,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
 	struct sk_buff *skb;
-	u64 addr;
+	u64 mapping[IPOIB_UD_RX_SG];
 
 	ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
 		       wr_id, wc->status);
@@ -184,15 +236,13 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	}
 
 	skb  = priv->rx_ring[wr_id].skb;
-	addr = priv->rx_ring[wr_id].mapping;
 
 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
 		if (wc->status != IB_WC_WR_FLUSH_ERR)
 			ipoib_warn(priv, "failed recv event "
 				   "(status=%d, wrid=%d vend_err %x)\n",
 				   wc->status, wr_id, wc->vendor_err);
-		ib_dma_unmap_single(priv->ca, addr,
-				    IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
 		dev_kfree_skb_any(skb);
 		priv->rx_ring[wr_id].skb = NULL;
 		return;
@@ -205,11 +255,14 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
 		goto repost;
 
+	memcpy(mapping, priv->rx_ring[wr_id].mapping,
+	       IPOIB_UD_RX_SG * sizeof *mapping);
+
 	/*
 	 * If we can't allocate a new RX buffer, dump
 	 * this packet and reuse the old buffer.
 	 */
-	if (unlikely(ipoib_alloc_rx_skb(dev, wr_id))) {
+	if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) {
 		++dev->stats.rx_dropped;
 		goto repost;
 	}
@@ -217,9 +270,9 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 		       wc->byte_len, wc->slid);
 
-	ib_dma_unmap_single(priv->ca, addr, IPOIB_BUF_SIZE, DMA_FROM_DEVICE);
+	ipoib_ud_dma_unmap_rx(priv, mapping);
+	ipoib_ud_skb_put_frags(priv, skb, wc->byte_len);
 
-	skb_put(skb, wc->byte_len);
 	skb_pull(skb, IB_GRH_BYTES);
 
 	skb->protocol = ((struct ipoib_header *) skb->data)->proto;
@@ -733,10 +786,8 @@ int ipoib_ib_dev_stop(struct net_device *dev, int flush)
 				rx_req = &priv->rx_ring[i];
 				if (!rx_req->skb)
 					continue;
-				ib_dma_unmap_single(priv->ca,
-						    rx_req->mapping,
-						    IPOIB_BUF_SIZE,
-						    DMA_FROM_DEVICE);
+				ipoib_ud_dma_unmap_rx(priv,
+						      priv->rx_ring[i].mapping);
 				dev_kfree_skb_any(rx_req->skb);
 				rx_req->skb = NULL;
 			}
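
As an aside, the receive-completion accounting that
ipoib_ud_skb_put_frags() performs above can be modeled in plain C
(hypothetical helper names, not kernel code): the first
IPOIB_UD_HEAD_SIZE bytes of a completion stay in the skb's linear
area, and the remainder is attributed to the attached page fragment.

/* Plain-C model (not kernel code) of the two-buffer completion split. */
#include <assert.h>

#define IB_GRH_BYTES	   40
#define IPOIB_ENCAP_LEN	    4
#define IPOIB_UD_HEAD_SIZE (IB_GRH_BYTES + IPOIB_ENCAP_LEN)

struct rx_split {
	unsigned int linear;	/* bytes left in the first (head) buffer */
	unsigned int frag;	/* bytes scattered into the attached page */
};

/* Mirrors the length bookkeeping in ipoib_ud_skb_put_frags() */
static struct rx_split split_completion(unsigned int byte_len)
{
	struct rx_split s;

	s.linear = IPOIB_UD_HEAD_SIZE;
	s.frag	 = byte_len - IPOIB_UD_HEAD_SIZE;
	return s;
}

int main(void)
{
	/* Full 4K-MTU datagram: 40-byte GRH + 4-byte header + 4092-byte payload */
	struct rx_split s = split_completion(40 + 4096);

	assert(s.linear == 44);
	assert(s.frag == 4092);		/* fits within one 4 KB page */
	return 0;
}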
drivers/infiniband/ulp/ipoib/ipoib_main.c  +14 −5
@@ -195,7 +195,7 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
 		return 0;
 	}
 
-	if (new_mtu > IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN)
+	if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
 		return -EINVAL;
 
 	priv->admin_mtu = new_mtu;
@@ -971,10 +971,6 @@ static void ipoib_setup(struct net_device *dev)
 				    NETIF_F_LLTX		|
 				    NETIF_F_HIGHDMA);
 
-	/* MTU will be reset when mcast join happens */
-	dev->mtu		 = IPOIB_PACKET_SIZE - IPOIB_ENCAP_LEN;
-	priv->mcast_mtu		 = priv->admin_mtu = dev->mtu;
-
 	memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
 
 	netif_carrier_off(dev);
@@ -1107,6 +1103,7 @@ static struct net_device *ipoib_add_port(const char *format,
 {
 	struct ipoib_dev_priv *priv;
 	struct ib_device_attr *device_attr;
+	struct ib_port_attr attr;
 	int result = -ENOMEM;
 
 	priv = ipoib_intf_alloc(format);
@@ -1115,6 +1112,18 @@ static struct net_device *ipoib_add_port(const char *format,

 	SET_NETDEV_DEV(priv->dev, hca->dma_device);
 
+	if (!ib_query_port(hca, port, &attr))
+		priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+	else {
+		printk(KERN_WARNING "%s: ib_query_port %d failed\n",
+		       hca->name, port);
+		goto device_init_failed;
+	}
+
+	/* MTU will be reset when mcast join happens */
+	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu);
+	priv->mcast_mtu  = priv->admin_mtu = priv->dev->mtu;
+
 	result = ib_query_pkey(hca, port, 0, &priv->pkey);
 	if (result) {
 		printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
drivers/infiniband/ulp/ipoib/ipoib_multicast.c  +1 −2
@@ -567,8 +567,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
 		return;
 	}
 
-	priv->mcast_mtu = ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu) -
-		IPOIB_ENCAP_LEN;
+	priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
 
 	if (!ipoib_cm_admin_enabled(dev))
 		dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
drivers/infiniband/ulp/ipoib/ipoib_verbs.c  +14 −1
@@ -150,7 +150,7 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 			.max_send_wr  = ipoib_sendq_size,
 			.max_recv_wr  = ipoib_recvq_size,
 			.max_send_sge = 1,
-			.max_recv_sge = 1
+			.max_recv_sge = IPOIB_UD_RX_SG
 		},
 		.sq_sig_type = IB_SIGNAL_ALL_WR,
 		.qp_type     = IB_QPT_UD
@@ -215,6 +215,19 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 	priv->tx_wr.sg_list	= priv->tx_sge;
 	priv->tx_wr.send_flags	= IB_SEND_SIGNALED;
 
+	priv->rx_sge[0].lkey = priv->mr->lkey;
+	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
+		priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE;
+		priv->rx_sge[1].length = PAGE_SIZE;
+		priv->rx_sge[1].lkey = priv->mr->lkey;
+		priv->rx_wr.num_sge = IPOIB_UD_RX_SG;
+	} else {
+		priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+		priv->rx_wr.num_sge = 1;
+	}
+	priv->rx_wr.next = NULL;
+	priv->rx_wr.sg_list = priv->rx_sge;
+
 	return 0;
 
 out_free_cq:
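
For comparison, here is what the same two-SGE receive posting looks
like through the user-space verbs API (libibverbs) — a sketch assuming
the QP, MR, and both buffers already exist (and that both buffers are
registered under the same MR); it is not code from this patch.

#include <stdint.h>
#include <infiniband/verbs.h>

/*
 * User-space analogue of the kernel setup above: one SGE for the
 * GRH + IPoIB header, a second for the page-sized payload.
 */
static int post_two_sge_recv(struct ibv_qp *qp, struct ibv_mr *mr,
			     void *head, uint32_t head_len,
			     void *payload, uint32_t payload_len,
			     uint64_t wr_id)
{
	struct ibv_sge sge[2] = {
		{ .addr = (uintptr_t) head,    .length = head_len,    .lkey = mr->lkey },
		{ .addr = (uintptr_t) payload, .length = payload_len, .lkey = mr->lkey },
	};
	struct ibv_recv_wr wr = {
		.wr_id	 = wr_id,
		.next	 = NULL,
		.sg_list = sge,
		.num_sge = 2,	/* the HCA fills sge[0], then spills into sge[1] */
	};
	struct ibv_recv_wr *bad_wr;

	return ibv_post_recv(qp, &wr, &bad_wr);
}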