Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 839fcaba, authored by Michael S. Tsirkin; committed by Roland Dreier
Browse files

IPoIB: Connected mode experimental support



The following patch adds experimental support for IPoIB connected
mode, as defined by the draft from the IETF ipoib working group.  The
idea is to increase performance by increasing the MTU from the maximum
of 2K (theoretically 4K) supported by IPoIB on top of UD.  With this
code, I'm able to get 800MByte/sec or more with netperf without
options on a Mellanox 4x back-to-back DDR system.

Some notes on code:
1. SRQ is used for scalability to large cluster sizes
2. Only RC connections are used (UC does not support SRQ now)
3. Retry count is set to 0 since spec draft warns against retries
4. Each connection is used for data transfers in only 1 direction, so
   each connection is either active(TX) or passive (RX).  2 sides that
   want to communicate create 2 connections.
5. Each active (TX) connection has a separate CQ for send completions -
   this keeps the code simple without CQ resize and other tricks
6. To detect stale passive side connections (where the remote side is
   down), we keep an LRU list of passive connections (updated once per
   second per connection) and destroy a connection after it has been
   unused for several seconds. The LRU rule makes it possible to avoid
   scanning connections that have recently been active.

Signed-off-by: Michael S. Tsirkin <mst@mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
parent 9a6b090c
Loading
Loading
Loading
Loading
+15 −1
Original line number Diff line number Diff line
config INFINIBAND_IPOIB
	tristate "IP-over-InfiniBand"
	depends on INFINIBAND && NETDEVICES && INET
	depends on INFINIBAND && NETDEVICES && INET && (IPV6 || IPV6=n)
	---help---
	  Support for the IP-over-InfiniBand protocol (IPoIB). This
	  transports IP packets over InfiniBand so you can use your IB
@@ -8,6 +8,20 @@ config INFINIBAND_IPOIB

	  See Documentation/infiniband/ipoib.txt for more information

# Experimental connected-mode (RC) transport for IPoIB.  Off by default;
# enabled at runtime per interface via /sys/class/net/ibXXX/mode, after
# which the MTU can be raised well beyond the datagram-mode limit.
config INFINIBAND_IPOIB_CM
	bool "IP-over-InfiniBand Connected Mode support"
	depends on INFINIBAND_IPOIB && EXPERIMENTAL
	default n
	---help---
	  This option enables experimental support for IPoIB connected mode.
	  After enabling this option, you need to switch to connected mode through
	  /sys/class/net/ibXXX/mode to actually create connections, and then increase
	  the interface MTU with e.g. ifconfig ib0 mtu 65520.

	  WARNING: Enabling connected mode will trigger some
	  packet drops for multicast and UD mode traffic from this interface,
	  unless you limit mtu for these destinations to 2044.

config INFINIBAND_IPOIB_DEBUG
	bool "IP-over-InfiniBand debugging" if EMBEDDED
	depends on INFINIBAND_IPOIB
+1 −0
Original line number Diff line number Diff line
@@ -5,5 +5,6 @@ ib_ipoib-y := ipoib_main.o \
						   ipoib_multicast.o \
						   ipoib_verbs.o \
						   ipoib_vlan.o
ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM)		+= ipoib_cm.o
ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG)	+= ipoib_fs.o
+215 −0
Original line number Diff line number Diff line
@@ -62,6 +62,10 @@ enum {

	IPOIB_ENCAP_LEN 	  = 4,

	IPOIB_CM_MTU              = 0x10000 - 0x10, /* padding to align header to 16 */
	IPOIB_CM_BUF_SIZE         = IPOIB_CM_MTU  + IPOIB_ENCAP_LEN,
	IPOIB_CM_HEAD_SIZE 	  = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
	IPOIB_CM_RX_SG            = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
	IPOIB_RX_RING_SIZE 	  = 128,
	IPOIB_TX_RING_SIZE 	  = 64,
	IPOIB_MAX_QUEUE_SIZE	  = 8192,
@@ -81,6 +85,8 @@ enum {
	IPOIB_MCAST_RUN 	  = 6,
	IPOIB_STOP_REAPER         = 7,
	IPOIB_MCAST_STARTED       = 8,
	IPOIB_FLAG_NETIF_STOPPED  = 9,
	IPOIB_FLAG_ADMIN_CM 	  = 10,

	IPOIB_MAX_BACKOFF_SECONDS = 16,

@@ -90,6 +96,13 @@ enum {
	IPOIB_MCAST_FLAG_ATTACHED = 3,
};

/*
 * Work-request ID flag bits: bit 31 tags UD receive completions, bit 30
 * tags completions on the connected-mode SRQ.  With CM compiled out the
 * SRQ flag is 0, so the (wr_id & IPOIB_CM_OP_SRQ) test can never match.
 */
#define	IPOIB_OP_RECV   (1ul << 31)
#ifdef CONFIG_INFINIBAND_IPOIB_CM
#define	IPOIB_CM_OP_SRQ (1ul << 30)
#else
#define	IPOIB_CM_OP_SRQ (0)
#endif

/* structs */

struct ipoib_header {
@@ -113,6 +126,59 @@ struct ipoib_tx_buf {
	u64		mapping;
};

struct ib_cm_id;

/*
 * Peer parameters for a connected-mode connection — presumably carried
 * as private data during IB CM connection establishment (TODO confirm
 * against ipoib_cm.c).  Both fields are wire (big-endian) format.
 */
struct ipoib_cm_data {
	__be32 qpn; /* High byte MUST be ignored on receive */
	__be32 mtu;
};

/*
 * State for one passive (receive-side) connected-mode connection.
 * Presumably linked via 'list' into ipoib_cm_dev_priv.passive_ids;
 * 'jiffies' is a last-activity timestamp used to detect and reap
 * stale connections whose remote side has gone away (LRU scheme).
 */
struct ipoib_cm_rx {
	struct ib_cm_id     *id;
	struct ib_qp        *qp;
	struct list_head     list;
	struct net_device   *dev;
	unsigned long        jiffies;
};

/*
 * State for one active (send-side) connected-mode connection.  Each TX
 * connection owns a private CQ ('cq') for its send completions, which
 * avoids CQ-resize complexity, plus a private tx_ring with tx_head /
 * tx_tail indices.  'mtu' is this connection's MTU — presumably taken
 * from the peer's ipoib_cm_data (TODO confirm).
 */
struct ipoib_cm_tx {
	struct ib_cm_id     *id;
	struct ib_cq        *cq;
	struct ib_qp        *qp;
	struct list_head     list;
	struct net_device   *dev;
	struct ipoib_neigh  *neigh;
	struct ipoib_path   *path;
	struct ipoib_tx_buf *tx_ring;
	unsigned             tx_head;
	unsigned             tx_tail;
	unsigned long        flags;
	u32                  mtu;
	struct ib_wc         ibwc[IPOIB_NUM_WC];
};

/*
 * One SRQ receive buffer: the skb plus one mapping entry (presumably a
 * DMA address, mirroring ipoib_tx_buf's 'mapping') for each of its
 * IPOIB_CM_RX_SG scatter/gather pages.
 */
struct ipoib_cm_rx_buf {
	struct sk_buff *skb;
	u64 mapping[IPOIB_CM_RX_SG];
};

/*
 * Per-device connected-mode state, embedded as ipoib_dev_priv.cm.
 * A single SRQ (srq + srq_ring) serves receives for all passive
 * connections (for scalability on large clusters).  passive_ids holds
 * live passive connections; the delayed stale_task presumably walks it
 * to reap idle ones.  start_list/reap_list feed start_task/reap_task,
 * and skb_queue feeds skb_task.  rx_sge/rx_wr look like the template
 * scatter list and work request for posting SRQ receives — confirm in
 * ipoib_cm.c.
 */
struct ipoib_cm_dev_priv {
	struct ib_srq  	       *srq;
	struct ipoib_cm_rx_buf *srq_ring;
	struct ib_cm_id        *id;
	struct list_head        passive_ids;
	struct work_struct      start_task;
	struct work_struct      reap_task;
	struct work_struct      skb_task;
	struct delayed_work     stale_task;
	struct sk_buff_head     skb_queue;
	struct list_head        start_list;
	struct list_head        reap_list;
	struct ib_wc            ibwc[IPOIB_NUM_WC];
	struct ib_sge           rx_sge[IPOIB_CM_RX_SG];
	struct ib_recv_wr       rx_wr;
};

/*
 * Device private locking: tx_lock protects members used in TX fast
 * path (and we use LLTX so upper layers don't do extra locking).
@@ -179,6 +245,10 @@ struct ipoib_dev_priv {
	struct list_head child_intfs;
	struct list_head list;

#ifdef CONFIG_INFINIBAND_IPOIB_CM
	struct ipoib_cm_dev_priv cm;
#endif

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
	struct list_head fs_list;
	struct dentry *mcg_dentry;
@@ -212,6 +282,9 @@ struct ipoib_path {

struct ipoib_neigh {
	struct ipoib_ah    *ah;
#ifdef CONFIG_INFINIBAND_IPOIB_CM
	struct ipoib_cm_tx *cm;
#endif
	union ib_gid        dgid;
	struct sk_buff_head queue;

@@ -315,6 +388,146 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
void ipoib_pkey_poll(struct work_struct *work);
int ipoib_pkey_dev_delay_open(struct net_device *dev);

#ifdef CONFIG_INFINIBAND_IPOIB_CM

/*
 * Transport-capability bits carried in byte 0 of the hardware address
 * (checked against dev->dev_addr and neighbour->ha below).
 */
#define IPOIB_FLAGS_RC          0x80
#define IPOIB_FLAGS_UC          0x40

/* We don't support UC connections at the moment */
#define IPOIB_CM_SUPPORTED(ha)   (ha[0] & (IPOIB_FLAGS_RC))

/*
 * Nonzero when connected mode may be used on this device: the device
 * hardware address advertises RC support AND the administrator has
 * enabled CM (IPOIB_FLAG_ADMIN_CM — per Kconfig help, toggled through
 * /sys/class/net/ibXXX/mode).
 */
static inline int ipoib_cm_admin_enabled(struct net_device *dev)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	return IPOIB_CM_SUPPORTED(dev->dev_addr) &&
		test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
}

/*
 * Nonzero when connected mode may be used towards this particular
 * neighbour: its hardware address advertises RC support AND CM is
 * administratively enabled on the device.
 */
static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
{
	struct ipoib_dev_priv *priv = netdev_priv(dev);
	return IPOIB_CM_SUPPORTED(n->ha) &&
		test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
}

/*
 * Nonzero when the neighbour's connected-mode TX connection is
 * operationally up (IPOIB_FLAG_OPER_UP set in neigh->cm->flags).
 * NOTE(review): dereferences neigh->cm unconditionally — callers must
 * ensure a TX context exists (see ipoib_cm_get()) before calling.
 *
 * Fix: dropped the stray blank line between the declarator and the
 * opening brace, matching kernel coding style and the sibling helpers.
 */
static inline int ipoib_cm_up(struct ipoib_neigh *neigh)
{
	return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags);
}

/* Return the connected-mode TX context cached on the neighbour, if any. */
static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
{
	return neigh->cm;
}

/* Cache (or clear, with tx == NULL) the neighbour's connected-mode TX context. */
static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
{
	neigh->cm = tx;
}

void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx);
int ipoib_cm_dev_open(struct net_device *dev);
void ipoib_cm_dev_stop(struct net_device *dev);
int ipoib_cm_dev_init(struct net_device *dev);
int ipoib_cm_add_mode_attr(struct net_device *dev);
void ipoib_cm_dev_cleanup(struct net_device *dev);
struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
				    struct ipoib_neigh *neigh);
void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
			   unsigned int mtu);
void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
#else

struct ipoib_cm_tx;

/* Connected mode compiled out: never administratively enabled. */
static inline int ipoib_cm_admin_enabled(struct net_device *dev)
{
	return 0;
}
/*
 * Connected mode compiled out: CM is never enabled for any neighbour.
 *
 * Fix: dropped the stray blank line between the declarator and the
 * opening brace, matching kernel coding style and the sibling stubs.
 */
static inline int ipoib_cm_enabled(struct net_device *dev, struct neighbour *n)
{
	return 0;
}

/*
 * Connected mode compiled out: no CM connection can ever be
 * established, so a neighbour is never "up" in CM terms.
 *
 * Fix: dropped the stray blank line between the declarator and the
 * opening brace, matching kernel coding style and the sibling stubs.
 */
static inline int ipoib_cm_up(struct ipoib_neigh *neigh)
{
	return 0;
}

/* Connected mode compiled out: no TX context ever exists. */
static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
{
	return NULL;
}

/* Connected mode compiled out: nothing to cache. */
static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
{
}

/*
 * Connected mode compiled out: transmitting over a CM connection is a
 * no-op.
 *
 * Fix: removed the redundant bare `return;` at the end of this void
 * stub, for consistency with the other empty stubs (e.g. ipoib_cm_set).
 */
static inline
void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
{
}

/* Connected mode compiled out: trivially succeed so device open proceeds. */
static inline
int ipoib_cm_dev_open(struct net_device *dev)
{
	return 0;
}

/*
 * Connected mode compiled out: nothing to stop.
 *
 * Fix: removed the redundant bare `return;` at the end of this void
 * stub, for consistency with the other empty stubs (e.g. ipoib_cm_set).
 */
static inline
void ipoib_cm_dev_stop(struct net_device *dev)
{
}

/* Connected mode compiled out: always fail with -ENOSYS ("not implemented"). */
static inline
int ipoib_cm_dev_init(struct net_device *dev)
{
	return -ENOSYS;
}

/*
 * Connected mode compiled out: nothing to clean up.
 *
 * Fix: removed the redundant bare `return;` at the end of this void
 * stub, for consistency with the other empty stubs (e.g. ipoib_cm_set).
 */
static inline
void ipoib_cm_dev_cleanup(struct net_device *dev)
{
}

/* Connected mode compiled out: no TX connection can be created. */
static inline
struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
				    struct ipoib_neigh *neigh)
{
	return NULL;
}

/*
 * Connected mode compiled out: there are no TX connections to destroy.
 *
 * Fix: removed the redundant bare `return;` at the end of this void
 * stub, for consistency with the other empty stubs (e.g. ipoib_cm_set).
 */
static inline
void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
{
}

/* Connected mode compiled out: no mode attribute to add; trivially succeed. */
static inline
int ipoib_cm_add_mode_attr(struct net_device *dev)
{
	return 0;
}

/*
 * Connected mode compiled out: an oversized packet cannot be sent over
 * a CM connection, so simply free (drop) it.
 */
static inline void ipoib_cm_skb_too_long(struct net_device* dev, struct sk_buff *skb,
					 unsigned int mtu)
{
	dev_kfree_skb_any(skb);
}

/*
 * Connected mode compiled out: never reached, since IPOIB_CM_OP_SRQ is
 * defined as 0 in this configuration and so never matches a wr_id.
 */
static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
{
}

#endif

#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
void ipoib_create_debug_files(struct net_device *dev);
void ipoib_delete_debug_files(struct net_device *dev);
@@ -392,4 +605,6 @@ extern int ipoib_debug_level;

#define IPOIB_GID_ARG(gid)	IPOIB_GID_RAW_ARG((gid).raw)

/* Extract the 24-bit QPN from the first four (big-endian) bytes of a hardware address. */
#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)

#endif /* _IPOIB_H */
+1237 −0

File added.

Preview size limit exceeded, changes collapsed.

+20 −9
Original line number Diff line number Diff line
@@ -50,8 +50,6 @@ MODULE_PARM_DESC(data_debug_level,
		 "Enable data path debug tracing if > 0");
#endif

#define	IPOIB_OP_RECV	(1ul << 31)

static DEFINE_MUTEX(pkey_mutex);

struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
@@ -268,10 +266,11 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)

	spin_lock_irqsave(&priv->tx_lock, flags);
	++priv->tx_tail;
	if (netif_queue_stopped(dev) &&
	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags) &&
	    priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1)
	if (unlikely(test_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags)) &&
	    priv->tx_head - priv->tx_tail <= ipoib_sendq_size >> 1) {
		clear_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
		netif_wake_queue(dev);
	}
	spin_unlock_irqrestore(&priv->tx_lock, flags);

	if (wc->status != IB_WC_SUCCESS &&
@@ -283,7 +282,9 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)

static void ipoib_ib_handle_wc(struct net_device *dev, struct ib_wc *wc)
{
	if (wc->wr_id & IPOIB_OP_RECV)
	if (wc->wr_id & IPOIB_CM_OP_SRQ)
		ipoib_cm_handle_rx_wc(dev, wc);
	else if (wc->wr_id & IPOIB_OP_RECV)
		ipoib_ib_handle_rx_wc(dev, wc);
	else
		ipoib_ib_handle_tx_wc(dev, wc);
@@ -327,12 +328,12 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
	struct ipoib_tx_buf *tx_req;
	u64 addr;

	if (unlikely(skb->len > dev->mtu + INFINIBAND_ALEN)) {
	if (unlikely(skb->len > priv->mcast_mtu + INFINIBAND_ALEN)) {
		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
			   skb->len, dev->mtu + INFINIBAND_ALEN);
			   skb->len, priv->mcast_mtu + INFINIBAND_ALEN);
		++priv->stats.tx_dropped;
		++priv->stats.tx_errors;
		dev_kfree_skb_any(skb);
		ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
		return;
	}

@@ -372,6 +373,7 @@ void ipoib_send(struct net_device *dev, struct sk_buff *skb,
		if (priv->tx_head - priv->tx_tail == ipoib_sendq_size) {
			ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
			netif_stop_queue(dev);
			set_bit(IPOIB_FLAG_NETIF_STOPPED, &priv->flags);
		}
	}
}
@@ -424,6 +426,13 @@ int ipoib_ib_dev_open(struct net_device *dev)
		return -1;
	}

	ret = ipoib_cm_dev_open(dev);
	if (ret) {
		ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
		ipoib_ib_dev_stop(dev);
		return -1;
	}

	clear_bit(IPOIB_STOP_REAPER, &priv->flags);
	queue_delayed_work(ipoib_workqueue, &priv->ah_reap_task, HZ);

@@ -509,6 +518,8 @@ int ipoib_ib_dev_stop(struct net_device *dev)

	clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags);

	ipoib_cm_dev_stop(dev);

	/*
	 * Move our QP to the error state and then reinitialize in
	 * when all work requests have completed or have been flushed.
Loading