Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6dd9a14e authored by David Ahern, committed by David S. Miller
Browse files

net: Allow accepted sockets to be bound to l3mdev domain



Allow accepted sockets to derive their sk_bound_dev_if setting from the
l3mdev domain in which the packets originated. A sysctl setting is added
to control the behavior which is similar to sk_mark and
sysctl_tcp_fwmark_accept.

This effectively allows a process to have a "VRF-global" listen socket,
with child sockets bound to the VRF device in which the packet originated.
A similar behavior can be achieved using sk_mark, but a solution using marks
is incomplete as it does not handle duplicate addresses in different L3
domains/VRFs. Allowing sockets to inherit the sk_bound_dev_if from l3mdev
domain provides a complete solution.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 1a852479
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -335,6 +335,14 @@ tcp_keepalive_intvl - INTEGER
	after probes started. Default value: 75sec i.e. connection
	will be aborted after ~11 minutes of retries.

tcp_l3mdev_accept - BOOLEAN
	Enables child sockets to inherit the L3 master device index.
	Enabling this option allows a "global" listen socket to work
	across L3 master domains (e.g., VRFs) with connected sockets
	derived from the listen socket to be bound to the L3 domain in
	which the packets originated. Only valid when the kernel was
	compiled with CONFIG_NET_L3_MASTER_DEV.

tcp_low_latency - BOOLEAN
	If set, the TCP stack makes decisions that prefer lower
	latency as opposed to higher throughput.  By default, this
+14 −0
Original line number Diff line number Diff line
@@ -28,6 +28,7 @@
#include <net/request_sock.h>
#include <net/netns/hash.h>
#include <net/tcp_states.h>
#include <net/l3mdev.h>

/** struct ip_options - IP Options
 *
@@ -113,6 +114,19 @@ static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
	return sk->sk_mark;
}

/*
 * Select the device index to record for an incoming connection request.
 *
 * If the listening socket @sk already has a non-zero sk_bound_dev_if, that
 * value is returned unchanged. Otherwise, when the kernel is built with
 * CONFIG_NET_L3_MASTER_DEV and the per-netns sysctl_tcp_l3mdev_accept
 * setting is non-zero, the result of l3mdev_master_ifindex_by_index() for
 * the interface the packet arrived on (@skb->skb_iif) is returned instead.
 * In every other case the socket's own (zero) sk_bound_dev_if is returned.
 */
static inline int inet_request_bound_dev_if(const struct sock *sk,
					    struct sk_buff *skb)
{
	int ifindex = sk->sk_bound_dev_if;

#ifdef CONFIG_NET_L3_MASTER_DEV
	/* Only an unbound listener may inherit the ingress L3 master device. */
	if (ifindex == 0) {
		struct net *net = sock_net(sk);

		if (net->ipv4.sysctl_tcp_l3mdev_accept != 0)
			ifindex = l3mdev_master_ifindex_by_index(net,
								 skb->skb_iif);
	}
#endif

	return ifindex;
}

struct inet_cork {
	unsigned int		flags;
	__be32			addr;
+3 −0
Original line number Diff line number Diff line
@@ -86,6 +86,9 @@ struct netns_ipv4 {

	int sysctl_fwmark_reflect;
	int sysctl_tcp_fwmark_accept;
#ifdef CONFIG_NET_L3_MASTER_DEV
	int sysctl_tcp_l3mdev_accept;
#endif
	int sysctl_tcp_mtu_probing;
	int sysctl_tcp_base_mss;
	int sysctl_tcp_probe_threshold;
+2 −2
Original line number Diff line number Diff line
@@ -351,7 +351,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
	treq->snt_synack.v64	= 0;
	treq->tfo_listener	= false;

	ireq->ir_iif = sk->sk_bound_dev_if;
	ireq->ir_iif = inet_request_bound_dev_if(sk, skb);

	/* We throwed the options of the initial SYN away, so we hope
	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -371,7 +371,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
	 * hasn't changed since we received the original syn, but I see
	 * no easy way to do this.
	 */
	flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
	flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
			   inet_sk_flowi_flags(sk),
			   opt->srr ? opt->faddr : ireq->ir_rmt_addr,
+11 −0
Original line number Diff line number Diff line
@@ -915,6 +915,17 @@ static struct ctl_table ipv4_net_table[] = {
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
#ifdef CONFIG_NET_L3_MASTER_DEV
	{
		.procname	= "tcp_l3mdev_accept",
		.data		= &init_net.ipv4.sysctl_tcp_l3mdev_accept,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &one,
	},
#endif
	{
		.procname	= "tcp_mtu_probing",
		.data		= &init_net.ipv4.sysctl_tcp_mtu_probing,
Loading