Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 042a4197 authored by David S. Miller
Browse files

Merge branch 'for_net-next-5.1/rds-tos-v4' of...

Merge branch 'for_net-next-5.1/rds-tos-v4' of git://git.kernel.org/pub/scm/linux/kernel/git/ssantosh/linux



Santosh Shilimkar says:

====================
rds: add tos support

RDS applications make use of tos to classify database traffic.
This feature has been used in shipping products since 2.6.32-based
kernels. It is tied to the RDS v4.1 protocol version, and compatibility
is negotiated as part of connection setup.

The patchset keeps full backward compatibility using the existing connection
negotiation scheme. Currently the feature is exploited by the RDMA
transport; for the TCP transport, the user tos values are all mapped to
the same default class (0).

For RDMA transports, the RDMA CM service type API is used to
set up different SLs (service levels), and the IB fabric is configured
for tos mapping using the Subnet Manager (SL to VL mappings).
Similarly, for a RoCE fabric, user priority is mapped to different
DSCP code points, which are associated with different switch queues
in the fabric.

The original code was developed by Bang Nguyen in a downstream kernel back in
the 2.6.32 kernel days, and it has evolved significantly over time.

Thanks to Yanjun for testing with various combinations of hosts, such as
v3.1<->v4.1, v4.1<->v3.1, v4.1 upstream to shipping v4.1, etc.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents e90b1fd8 fd261ce6
Loading
Loading
Loading
Loading
+11 −0
Original line number Diff line number Diff line
@@ -69,6 +69,12 @@
#define RDS_TRANS_COUNT	3
#define	RDS_TRANS_NONE	(~0)

/* ioctl commands for SOL_RDS */
#define SIOCRDSSETTOS		(SIOCPROTOPRIVATE)
#define SIOCRDSGETTOS		(SIOCPROTOPRIVATE + 1)

typedef __u8	rds_tos_t;

/*
 * Control message types for SOL_RDS.
 *
@@ -149,6 +155,7 @@ struct rds_info_connection {
	__be32		faddr;
	__u8		transport[TRANSNAMSIZ];		/* null term ascii */
	__u8		flags;
	__u8		tos;
} __attribute__((packed));

struct rds6_info_connection {
@@ -171,6 +178,7 @@ struct rds_info_message {
	__be16		lport;
	__be16		fport;
	__u8		flags;
	__u8		tos;
} __attribute__((packed));

struct rds6_info_message {
@@ -214,6 +222,7 @@ struct rds_info_tcp_socket {
	__u32           last_sent_nxt;
	__u32           last_expected_una;
	__u32           last_seen_una;
	__u8		tos;
} __attribute__((packed));

struct rds6_info_tcp_socket {
@@ -240,6 +249,7 @@ struct rds_info_rdma_connection {
	__u32		max_send_sge;
	__u32		rdma_mr_max;
	__u32		rdma_mr_size;
	__u8		tos;
};

struct rds6_info_rdma_connection {
@@ -253,6 +263,7 @@ struct rds6_info_rdma_connection {
	__u32		max_send_sge;
	__u32		rdma_mr_max;
	__u32		rdma_mr_size;
	__u8		tos;
};

/* RDS message Receive Path Latency points */
+36 −1
Original line number Diff line number Diff line
@@ -254,7 +254,40 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,

static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct rds_sock *rs = rds_sk_to_rs(sock->sk);
	rds_tos_t utos, tos = 0;

	switch (cmd) {
	case SIOCRDSSETTOS:
		if (get_user(utos, (rds_tos_t __user *)arg))
			return -EFAULT;

		if (rs->rs_transport &&
		    rs->rs_transport->get_tos_map)
			tos = rs->rs_transport->get_tos_map(utos);
		else
			return -ENOIOCTLCMD;

		spin_lock_bh(&rds_sock_lock);
		if (rs->rs_tos || rs->rs_conn) {
			spin_unlock_bh(&rds_sock_lock);
			return -EINVAL;
		}
		rs->rs_tos = tos;
		spin_unlock_bh(&rds_sock_lock);
		break;
	case SIOCRDSGETTOS:
		spin_lock_bh(&rds_sock_lock);
		tos = rs->rs_tos;
		spin_unlock_bh(&rds_sock_lock);
		if (put_user(tos, (rds_tos_t __user *)arg))
			return -EFAULT;
		break;
	default:
		return -ENOIOCTLCMD;
	}

	return 0;
}

static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
@@ -650,6 +683,8 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
	spin_lock_init(&rs->rs_rdma_lock);
	rs->rs_rdma_keys = RB_ROOT;
	rs->rs_rx_traces = 0;
	rs->rs_tos = 0;
	rs->rs_conn = NULL;

	spin_lock_bh(&rds_sock_lock);
	list_add_tail(&rs->rs_item, &rds_sock_list);
+12 −9
Original line number Diff line number Diff line
@@ -84,7 +84,7 @@ static struct rds_connection *rds_conn_lookup(struct net *net,
					      const struct in6_addr *laddr,
					      const struct in6_addr *faddr,
					      struct rds_transport *trans,
					      int dev_if)
					      u8 tos, int dev_if)
{
	struct rds_connection *conn, *ret = NULL;

@@ -92,6 +92,7 @@ static struct rds_connection *rds_conn_lookup(struct net *net,
		if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
		    ipv6_addr_equal(&conn->c_laddr, laddr) &&
		    conn->c_trans == trans &&
		    conn->c_tos == tos &&
		    net == rds_conn_net(conn) &&
		    conn->c_dev_if == dev_if) {
			ret = conn;
@@ -139,6 +140,7 @@ static void __rds_conn_path_init(struct rds_connection *conn,
	atomic_set(&cp->cp_state, RDS_CONN_DOWN);
	cp->cp_send_gen = 0;
	cp->cp_reconnect_jiffies = 0;
	cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION;
	INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker);
	INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker);
	INIT_DELAYED_WORK(&cp->cp_conn_w, rds_connect_worker);
@@ -159,7 +161,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
						const struct in6_addr *laddr,
						const struct in6_addr *faddr,
						struct rds_transport *trans,
						gfp_t gfp,
						gfp_t gfp, u8 tos,
						int is_outgoing,
						int dev_if)
{
@@ -171,7 +173,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
	int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);

	rcu_read_lock();
	conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if);
	conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if);
	if (conn &&
	    conn->c_loopback &&
	    conn->c_trans != &rds_loop_transport &&
@@ -205,6 +207,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
	conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
	conn->c_faddr = *faddr;
	conn->c_dev_if = dev_if;
	conn->c_tos = tos;

#if IS_ENABLED(CONFIG_IPV6)
	/* If the local address is link local, set c_bound_if to be the
@@ -297,7 +300,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
		struct rds_connection *found;

		found = rds_conn_lookup(net, head, laddr, faddr, trans,
					dev_if);
					tos, dev_if);
		if (found) {
			struct rds_conn_path *cp;
			int i;
@@ -332,10 +335,10 @@ static struct rds_connection *__rds_conn_create(struct net *net,
struct rds_connection *rds_conn_create(struct net *net,
				       const struct in6_addr *laddr,
				       const struct in6_addr *faddr,
				       struct rds_transport *trans, gfp_t gfp,
				       int dev_if)
				       struct rds_transport *trans, u8 tos,
				       gfp_t gfp, int dev_if)
{
	return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if);
	return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if);
}
EXPORT_SYMBOL_GPL(rds_conn_create);

@@ -343,9 +346,9 @@ struct rds_connection *rds_conn_create_outgoing(struct net *net,
						const struct in6_addr *laddr,
						const struct in6_addr *faddr,
						struct rds_transport *trans,
						gfp_t gfp, int dev_if)
						u8 tos, gfp_t gfp, int dev_if)
{
	return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if);
	return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if);
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);

+11 −0
Original line number Diff line number Diff line
@@ -301,6 +301,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,

	iinfo->src_addr = conn->c_laddr.s6_addr32[3];
	iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
	iinfo->tos = conn->c_tos;

	memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
	memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
@@ -514,6 +515,15 @@ void rds_ib_exit(void)
	rds_ib_mr_exit();
}

/*
 * Translate a user-supplied tos value into the value used by the RDMA
 * transport. The mapping is the identity: the value is returned unchanged.
 * Should a custom mapping be wanted later, this hook is the place to
 * consult a user-configurable map.
 */
static u8 rds_ib_get_tos_map(u8 tos)
{
	u8 transport_tos = tos;

	return transport_tos;
}

struct rds_transport rds_ib_transport = {
	.laddr_check		= rds_ib_laddr_check,
	.xmit_path_complete	= rds_ib_xmit_path_complete,
@@ -536,6 +546,7 @@ struct rds_transport rds_ib_transport = {
	.sync_mr		= rds_ib_sync_mr,
	.free_mr		= rds_ib_free_mr,
	.flush_mrs		= rds_ib_flush_mrs,
	.get_tos_map		= rds_ib_get_tos_map,
	.t_owner		= THIS_MODULE,
	.t_name			= "infiniband",
	.t_unloading		= rds_ib_is_unloading,
+3 −1
Original line number Diff line number Diff line
@@ -67,7 +67,9 @@ struct rds_ib_conn_priv_cmn {
	u8			ricpc_protocol_major;
	u8			ricpc_protocol_minor;
	__be16			ricpc_protocol_minor_mask;	/* bitmask */
	__be32			ricpc_reserved1;
	u8			ricpc_dp_toss;
	u8			ripc_reserved1;
	__be16			ripc_reserved2;
	__be64			ricpc_ack_seq;
	__be32			ricpc_credit;	/* non-zero enables flow ctl */
};
Loading