Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1e2b44e7 authored by Ka-Cheong Poon's avatar Ka-Cheong Poon Committed by David S. Miller
Browse files

rds: Enable RDS IPv6 support



This patch enables RDS to use IPv6 addresses. For RDS/TCP, the
listener is now an IPv6 endpoint which accepts both IPv4 and IPv6
connection requests.  RDS/RDMA/IB uses a private data (struct
rds_ib_connect_private) exchange between endpoints at RDS connection
establishment time to support RDMA. This private data exchange uses a
32 bit integer to represent an IP address. This needs to be changed in
order to support IPv6. A new private data struct
rds6_ib_connect_private is introduced to handle this. To ensure
backward compatibility, an IPv6 capable RDS stack uses another RDMA
listener port (RDS_CM_PORT) to accept IPv6 connection. And it
continues to use the original RDS_PORT for IPv4 RDS connections. When
it needs to communicate with an IPv6 peer, it uses the RDS_CM_PORT to
send the connection set up request.

v5: Fixed syntax problem (David Miller).

v4: Changed port history comments in rds.h (Sowmini Varadhan).

v3: Added support to set up IPv4 connection using mapped address
    (David Miller).
    Added support to set up connection between link local and non-link
    addresses.
    Various review comments from Santosh Shilimkar and Sowmini Varadhan.

v2: Fixed bound and peer address scope mismatched issue.
    Added back rds_connect() IPv6 changes.

Signed-off-by: default avatarKa-Cheong Poon <ka-cheong.poon@oracle.com>
Acked-by: default avatarSantosh Shilimkar <santosh.shilimkar@oracle.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent eee2fa6a
Loading
Loading
Loading
Loading
+77 −14
Original line number Diff line number Diff line
@@ -142,16 +142,33 @@ static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
			uaddr_len = sizeof(*sin6);
		}
	} else {
		/* If socket is not yet bound, set the return address family
		 * to be AF_UNSPEC (value 0) and the address size to be that
		 * of an IPv4 address.
		/* If socket is not yet bound and the socket is connected,
		 * set the return address family to be the same as the
		 * connected address, but with 0 address value.  If it is not
		 * connected, set the family to be AF_UNSPEC (value 0) and
		 * the address size to be that of an IPv4 address.
		 */
		if (ipv6_addr_any(&rs->rs_bound_addr)) {
			if (ipv6_addr_any(&rs->rs_conn_addr)) {
				sin = (struct sockaddr_in *)uaddr;
				memset(sin, 0, sizeof(*sin));
				sin->sin_family = AF_UNSPEC;
				return sizeof(*sin);
			}

			if (ipv6_addr_type(&rs->rs_conn_addr) &
			    IPV6_ADDR_MAPPED) {
				sin = (struct sockaddr_in *)uaddr;
				memset(sin, 0, sizeof(*sin));
				sin->sin_family = AF_INET;
				return sizeof(*sin);
			}

			sin6 = (struct sockaddr_in6 *)uaddr;
			memset(sin6, 0, sizeof(*sin6));
			sin6->sin6_family = AF_INET6;
			return sizeof(*sin6);
		}
		if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
			sin = (struct sockaddr_in *)uaddr;
			memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
@@ -484,16 +501,18 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
{
	struct sock *sk = sock->sk;
	struct sockaddr_in *sin;
	struct sockaddr_in6 *sin6;
	struct rds_sock *rs = rds_sk_to_rs(sk);
	int addr_type;
	int ret = 0;

	lock_sock(sk);

	switch (addr_len) {
	case sizeof(struct sockaddr_in):
	switch (uaddr->sa_family) {
	case AF_INET:
		sin = (struct sockaddr_in *)uaddr;
		if (sin->sin_family != AF_INET) {
			ret = -EAFNOSUPPORT;
		if (addr_len < sizeof(struct sockaddr_in)) {
			ret = -EINVAL;
			break;
		}
		if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
@@ -509,14 +528,58 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
		rs->rs_conn_port = sin->sin_port;
		break;

	case sizeof(struct sockaddr_in6):
		ret = -EPROTONOSUPPORT;
	case AF_INET6:
		sin6 = (struct sockaddr_in6 *)uaddr;
		if (addr_len < sizeof(struct sockaddr_in6)) {
			ret = -EINVAL;
			break;
		}
		addr_type = ipv6_addr_type(&sin6->sin6_addr);
		if (!(addr_type & IPV6_ADDR_UNICAST)) {
			__be32 addr4;

	default:
			if (!(addr_type & IPV6_ADDR_MAPPED)) {
				ret = -EPROTOTYPE;
				break;
			}

			/* It is a mapped address.  Need to do some sanity
			 * checks.
			 */
			addr4 = sin6->sin6_addr.s6_addr32[3];
			if (addr4 == htonl(INADDR_ANY) ||
			    addr4 == htonl(INADDR_BROADCAST) ||
			    IN_MULTICAST(ntohl(addr4))) {
				ret = -EPROTOTYPE;
				break;
			}
		}

		if (addr_type & IPV6_ADDR_LINKLOCAL) {
			/* If socket is arleady bound to a link local address,
			 * the peer address must be on the same link.
			 */
			if (sin6->sin6_scope_id == 0 ||
			    (!ipv6_addr_any(&rs->rs_bound_addr) &&
			     rs->rs_bound_scope_id &&
			     sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
				ret = -EINVAL;
				break;
			}
			/* Remember the connected address scope ID.  It will
			 * be checked against the binding local address when
			 * the socket is bound.
			 */
			rs->rs_bound_scope_id = sin6->sin6_scope_id;
		}
		rs->rs_conn_addr = sin6->sin6_addr;
		rs->rs_conn_port = sin6->sin6_port;
		break;

	default:
		ret = -EAFNOSUPPORT;
		break;
	}

	release_sock(sk);
	return ret;
+50 −9
Original line number Diff line number Diff line
@@ -127,9 +127,10 @@ static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr,
		if (!rhashtable_insert_fast(&bind_hash_table,
					    &rs->rs_bound_node, ht_parms)) {
			*port = rs->rs_bound_port;
			rs->rs_bound_scope_id = scope_id;
			ret = 0;
			rdsdebug("rs %p binding to %pI4:%d\n",
			  rs, &addr, (int)ntohs(*port));
			rdsdebug("rs %p binding to %pI6c:%d\n",
				 rs, addr, (int)ntohs(*port));
			break;
		} else {
			rs->rs_bound_addr = in6addr_any;
@@ -164,23 +165,53 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
	struct in6_addr v6addr, *binding_addr;
	struct rds_transport *trans;
	__u32 scope_id = 0;
	int addr_type;
	int ret = 0;
	__be16 port;

	/* We only allow an RDS socket to be bound to an IPv4 address. IPv6
	 * address support will be added later.
	/* We allow an RDS socket to be bound to either IPv4 or IPv6
	 * address.
	 */
	if (addr_len == sizeof(struct sockaddr_in)) {
	if (uaddr->sa_family == AF_INET) {
		struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;

		if (sin->sin_family != AF_INET ||
		    sin->sin_addr.s_addr == htonl(INADDR_ANY))
		if (addr_len < sizeof(struct sockaddr_in) ||
		    sin->sin_addr.s_addr == htonl(INADDR_ANY) ||
		    sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
		    IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
			return -EINVAL;
		ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr);
		binding_addr = &v6addr;
		port = sin->sin_port;
	} else if (addr_len == sizeof(struct sockaddr_in6)) {
		return -EPROTONOSUPPORT;
	} else if (uaddr->sa_family == AF_INET6) {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr;

		if (addr_len < sizeof(struct sockaddr_in6))
			return -EINVAL;
		addr_type = ipv6_addr_type(&sin6->sin6_addr);
		if (!(addr_type & IPV6_ADDR_UNICAST)) {
			__be32 addr4;

			if (!(addr_type & IPV6_ADDR_MAPPED))
				return -EINVAL;

			/* It is a mapped address.  Need to do some sanity
			 * checks.
			 */
			addr4 = sin6->sin6_addr.s6_addr32[3];
			if (addr4 == htonl(INADDR_ANY) ||
			    addr4 == htonl(INADDR_BROADCAST) ||
			    IN_MULTICAST(ntohl(addr4)))
				return -EINVAL;
		}
		/* The scope ID must be specified for link local address. */
		if (addr_type & IPV6_ADDR_LINKLOCAL) {
			if (sin6->sin6_scope_id == 0)
				return -EINVAL;
			scope_id = sin6->sin6_scope_id;
		}
		binding_addr = &sin6->sin6_addr;
		port = sin6->sin6_port;
	} else {
		return -EINVAL;
	}
@@ -191,6 +222,16 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
		ret = -EINVAL;
		goto out;
	}
	/* Socket is connected.  The binding address should have the same
	 * scope ID as the connected address, except the case when one is
	 * non-link local address (scope_id is 0).
	 */
	if (!ipv6_addr_any(&rs->rs_conn_addr) && scope_id &&
	    rs->rs_bound_scope_id &&
	    scope_id != rs->rs_bound_scope_id) {
		ret = -EINVAL;
		goto out;
	}

	ret = rds_add_bound(rs, binding_addr, &port, scope_id);
	if (ret)
+39 −15
Original line number Diff line number Diff line
/*
 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
@@ -36,6 +36,7 @@
#include <linux/export.h>
#include <net/ipv6.h>
#include <net/inet6_hashtables.h>
#include <net/addrconf.h>

#include "rds.h"
#include "loop.h"
@@ -200,6 +201,15 @@ static struct rds_connection *__rds_conn_create(struct net *net,
	conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
	conn->c_faddr = *faddr;
	conn->c_dev_if = dev_if;
	/* If the local address is link local, set c_bound_if to be the
	 * index used for this connection.  Otherwise, set it to 0 as
	 * the socket is not bound to an interface.  c_bound_if is used
	 * to look up a socket when a packet is received
	 */
	if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL)
		conn->c_bound_if = dev_if;
	else
		conn->c_bound_if = 0;

	rds_conn_net_set(conn, net);

@@ -486,7 +496,15 @@ void rds_conn_destroy(struct rds_connection *conn)
}
EXPORT_SYMBOL_GPL(rds_conn_destroy);

static void rds_conn_message_info(struct socket *sock, unsigned int len,
static void __rds_inc_msg_cp(struct rds_incoming *inc,
			     struct rds_info_iterator *iter,
			     void *saddr, void *daddr, int flip)
{
	rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
			  *(__be32 *)daddr, flip);
}

static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
				      struct rds_info_iterator *iter,
				      struct rds_info_lengths *lens,
				      int want_send)
@@ -524,17 +542,12 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,

				/* XXX too lazy to maintain counts.. */
				list_for_each_entry(rm, list, m_conn_item) {
					__be32 laddr;
					__be32 faddr;

					total++;
					laddr = conn->c_laddr.s6_addr32[3];
					faddr = conn->c_faddr.s6_addr32[3];
					if (total <= len)
						rds_inc_info_copy(&rm->m_inc,
						__rds_inc_msg_cp(&rm->m_inc,
								 iter,
								  laddr,
								  faddr,
								 &conn->c_laddr,
								 &conn->c_faddr,
								 0);
				}

@@ -548,6 +561,14 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
	lens->each = sizeof(struct rds_info_message);
}

static void rds_conn_message_info(struct socket *sock, unsigned int len,
				  struct rds_info_iterator *iter,
				  struct rds_info_lengths *lens,
				  int want_send)
{
	rds_conn_message_info_cmn(sock, len, iter, lens, want_send);
}

static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
				       struct rds_info_iterator *iter,
				       struct rds_info_lengths *lens)
@@ -655,6 +676,9 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
	struct rds_info_connection *cinfo = buffer;
	struct rds_connection *conn = cp->cp_conn;

	if (conn->c_isv6)
		return 0;

	cinfo->next_tx_seq = cp->cp_next_tx_seq;
	cinfo->next_rx_seq = cp->cp_next_rx_seq;
	cinfo->laddr = conn->c_laddr.s6_addr32[3];
+47 −8
Original line number Diff line number Diff line
/*
 * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
@@ -39,6 +39,7 @@
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <net/addrconf.h>

#include "rds_single_path.h"
#include "rds.h"
@@ -295,6 +296,8 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
	/* We will only ever look at IB transports */
	if (conn->c_trans != &rds_ib_transport)
		return 0;
	if (conn->c_isv6)
		return 0;

	iinfo->src_addr = conn->c_laddr.s6_addr32[3];
	iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
@@ -330,7 +333,6 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
				sizeof(struct rds_info_rdma_connection));
}


/*
 * Early RDS/IB was built to only bind to an address if there is an IPoIB
 * device with that address set.
@@ -346,8 +348,12 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
{
	int ret;
	struct rdma_cm_id *cm_id;
	struct sockaddr_in6 sin6;
	struct sockaddr_in sin;
	struct sockaddr *sa;
	bool isv4;

	isv4 = ipv6_addr_v4mapped(addr);
	/* Create a CMA ID and try to bind it. This catches both
	 * IB and iWARP capable NICs.
	 */
@@ -356,20 +362,53 @@ static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
	if (IS_ERR(cm_id))
		return PTR_ERR(cm_id);

	if (isv4) {
		memset(&sin, 0, sizeof(sin));
		sin.sin_family = AF_INET;
		sin.sin_addr.s_addr = addr->s6_addr32[3];
		sa = (struct sockaddr *)&sin;
	} else {
		memset(&sin6, 0, sizeof(sin6));
		sin6.sin6_family = AF_INET6;
		sin6.sin6_addr = *addr;
		sin6.sin6_scope_id = scope_id;
		sa = (struct sockaddr *)&sin6;

		/* XXX Do a special IPv6 link local address check here.  The
		 * reason is that rdma_bind_addr() always succeeds with IPv6
		 * link local address regardless it is indeed configured in a
		 * system.
		 */
		if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) {
			struct net_device *dev;

			if (scope_id == 0)
				return -EADDRNOTAVAIL;

			/* Use init_net for now as RDS is not network
			 * name space aware.
			 */
			dev = dev_get_by_index(&init_net, scope_id);
			if (!dev)
				return -EADDRNOTAVAIL;
			if (!ipv6_chk_addr(&init_net, addr, dev, 1)) {
				dev_put(dev);
				return -EADDRNOTAVAIL;
			}
			dev_put(dev);
		}
	}

	/* rdma_bind_addr will only succeed for IB & iWARP devices */
	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
	ret = rdma_bind_addr(cm_id, sa);
	/* due to this, we will claim to support iWARP devices unless we
	   check node_type. */
	if (ret || !cm_id->device ||
	    cm_id->device->node_type != RDMA_NODE_IB_CA)
		ret = -EADDRNOTAVAIL;

	rdsdebug("addr %pI6c ret %d node type %d\n",
		 addr, ret,
	rdsdebug("addr %pI6c%%%u ret %d node type %d\n",
		 addr, scope_id, ret,
		 cm_id->device ? cm_id->device->node_type : -1);

	rdma_destroy_id(cm_id);
+15 −5
Original line number Diff line number Diff line
@@ -678,7 +678,7 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
	return version;
}

/* Given an IPv6 address, find the IB net_device which hosts that address and
/* Given an IPv6 address, find the net_device which hosts that address and
 * return its index.  This is used by the rds_ib_cm_handle_connect() code to
 * find the interface index of where an incoming request comes from when
 * the request is using a link local address.
@@ -695,8 +695,7 @@ static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)

	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		if (dev->type == ARPHRD_INFINIBAND &&
		    ipv6_chk_addr(net, addr, dev, 0)) {
		if (ipv6_chk_addr(net, addr, dev, 1)) {
			idx = dev->ifindex;
			break;
		}
@@ -736,7 +735,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
		dp_cmn = &dp->ricp_v6.dp_cmn;
		saddr6 = &dp->ricp_v6.dp_saddr;
		daddr6 = &dp->ricp_v6.dp_daddr;
		/* If the local address is link local, need to find the
		/* If either address is link local, need to find the
		 * interface index in order to create a proper RDS
		 * connection.
		 */
@@ -748,6 +747,14 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
				err = -EOPNOTSUPP;
				goto out;
			}
		} else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) {
			/* Use our address to find the correct index. */
			ifindex = __rds_find_ifindex(&init_net, daddr6);
			/* No index found...  Need to bail out. */
			if (ifindex == 0) {
				err = -EOPNOTSUPP;
				goto out;
			}
		}
	} else {
		dp_cmn = &dp->ricp_v4.dp_cmn;
@@ -886,6 +893,9 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)

	/* XXX I wonder what affect the port space has */
	/* delegate cm event handler to rdma_transport */
	if (conn->c_isv6)
		handler = rds6_rdma_cm_event_handler;
	else
		handler = rds_rdma_cm_event_handler;
	ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
				     RDMA_PS_TCP, IB_QPT_RC);
Loading