Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 48debfd7 authored by David S. Miller's avatar David S. Miller
Browse files

Merge branch 'net-Enable-nexthop-objects-with-IPv4-and-IPv6-routes'



David Ahern says:

====================
net: Enable nexthop objects with IPv4 and IPv6 routes

This is the final set of the initial nexthop object work. When I
started this idea almost 2 years ago, it took 18 seconds to inject
700k+ IPv4 routes with 1 hop and about 28 seconds for 4-paths. Some
of that time was due to inefficiencies in 'ip', but most of it was
kernel side with excessive synchronize_rcu calls in ipv4, and redundant
processing validating a nexthop spec (device, gateway, encap). Worse,
the time increased dramatically as the number of legs in the routes
increased; for example, taking over 72 seconds for 16-path routes.

After this set, with increased dirty memory limits (fib_sync_mem sysctl),
an improved ip and nexthop objects a full internet fib (743,799 routes
based on a pull in January 2019) can be pushed to the kernel in 4.3
seconds. Even better, the time to insert is "almost" constant with
increasing number of paths. The 'almost constant' time is due to
expanding the nexthop definitions when generating notifications. A
follow on patch will be sent adding a sysctl that allows an admin to
avoid the nexthop expansion and truly get constant route insert time
regardless of the number of paths in a route! (Useful once all programs
used for a deployment that care about routes understand nexthop objects).

To be clear, 'ip' is used for benchmarking for no other reason than
'ip -batch' is a trivial to use for the tests. FRR, for example, better
manages nexthops and route changes and the way those are pushed to the
kernel and thus will have less userspace processing times than 'ip -batch'.

Patches 1-10 iterate over fib6_nh with a nexthop invoke a processing
function per fib6_nh. Prior to nexthop objects, a fib6_info referenced
a single fib6_nh. Multipath routes were added as separate fib6_info for
each leg of the route and linked as siblings:

    f6i -> sibling -> sibling ... -> sibling
     |                                   |
     +--------- multipath route ---------+

With nexthop objects a single fib6_info references an external
nexthop which may have a series of fib6_nh:

     f6i ---> nexthop ---> fib6_nh
                           ...
                           fib6_nh

making IPv6 routes similar to IPv4. The side effect is that a single
fib6_info now indirectly references a series of fib6_nh so the code
needs to walk each entry and call the local, per-fib6_nh processing
function.

Patches 11 and 13 wire up use of nexthops with fib entries for IPv4
and IPv6. With these commits you can actually use nexthops with routes.

Patch 12 is an optimization for IPv4 when using nexthops in the most
predominant use case (no metrics).

Patches 14 handles replace of a nexthop config.

Patches 15-18 add update pmtu and redirect tests to use both old and
new routing.

Patches 19 and 20 add new tests for the nexthop infrastructure. The first
is single nexthop is used by multiple prefixes to communicate with remote
hosts. This is on top of the functional tests already committed. The
second verifies multipath selection.

v4
- changed return to 'goto out' in patch 9 since the rcu_read_lock is
  held (noticed by Wei)

v3
- removed found arg in patch 7 and changed rt6_nh_remove_exception_rt
  to return 1 when a match is found for an exception

v2
- changed ++i to i++ in patches 1 and 14 as noticed by DaveM
- improved commit message for patch 14 (nexthop replace)
- removed the skip_fib argument to remove_nexthop; vestige of an
  older design
====================

Reviewed-By: default avatarWei Wang <weiwan@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 948622f9 cab14d10
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -49,6 +49,7 @@ struct fib6_config {
	u16		fc_delete_all_nh : 1,
			fc_ignore_dev_down:1,
			__unused : 14;
	u32		fc_nh_id;

	struct in6_addr	fc_dst;
	struct in6_addr	fc_src;
+1 −0
Original line number Diff line number Diff line
@@ -40,6 +40,7 @@ struct fib_config {
	u32			fc_flags;
	u32			fc_priority;
	__be32			fc_prefsrc;
	u32			fc_nh_id;
	struct nlattr		*fc_mx;
	struct rtnexthop	*fc_mp;
	int			fc_mx_len;
+4 −0
Original line number Diff line number Diff line
@@ -305,4 +305,8 @@ static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash)
		res->nh = &nhi->fib6_nh;
	}
}

int nexthop_for_each_fib6_nh(struct nexthop *nh,
			     int (*cb)(struct fib6_nh *nh, void *arg),
			     void *arg);
#endif
+19 −0
Original line number Diff line number Diff line
@@ -671,6 +671,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
	[RTA_NH_ID]		= { .type = NLA_U32 },
};

int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla,
@@ -808,6 +809,18 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
			if (err < 0)
				goto errout;
			break;
		case RTA_NH_ID:
			cfg->fc_nh_id = nla_get_u32(attr);
			break;
		}
	}

	if (cfg->fc_nh_id) {
		if (cfg->fc_oif || cfg->fc_gw_family ||
		    cfg->fc_encap || cfg->fc_mp) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop specification and nexthop id are mutually exclusive");
			return -EINVAL;
		}
	}

@@ -834,6 +847,12 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
	if (err < 0)
		goto errout;

	if (cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) {
		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
		err = -EINVAL;
		goto errout;
	}

	tb = fib_get_table(net, cfg.fc_table);
	if (!tb) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
+80 −6
Original line number Diff line number Diff line
@@ -325,14 +325,32 @@ static inline unsigned int fib_devindex_hashfn(unsigned int val)
		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
}

static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
static unsigned int fib_info_hashfn_1(int init_val, u8 protocol, u8 scope,
				      u32 prefsrc, u32 priority)
{
	unsigned int val = init_val;

	val ^= (protocol << 8) | scope;
	val ^= prefsrc;
	val ^= priority;

	return val;
}

static unsigned int fib_info_hashfn_result(unsigned int val)
{
	unsigned int mask = (fib_info_hash_size - 1);
	unsigned int val = fi->fib_nhs;

	val ^= (fi->fib_protocol << 8) | fi->fib_scope;
	val ^= (__force u32)fi->fib_prefsrc;
	val ^= fi->fib_priority;
	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}

static inline unsigned int fib_info_hashfn(struct fib_info *fi)
{
	unsigned int val;

	val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol,
				fi->fib_scope, (__force u32)fi->fib_prefsrc,
				fi->fib_priority);

	if (fi->nh) {
		val ^= fib_devindex_hashfn(fi->nh->id);
@@ -342,7 +360,40 @@ static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
		} endfor_nexthops(fi)
	}

	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
	return fib_info_hashfn_result(val);
}

/* no metrics, only nexthop id */
static struct fib_info *fib_find_info_nh(struct net *net,
					 const struct fib_config *cfg)
{
	struct hlist_head *head;
	struct fib_info *fi;
	unsigned int hash;

	hash = fib_info_hashfn_1(fib_devindex_hashfn(cfg->fc_nh_id),
				 cfg->fc_protocol, cfg->fc_scope,
				 (__force u32)cfg->fc_prefsrc,
				 cfg->fc_priority);
	hash = fib_info_hashfn_result(hash);
	head = &fib_info_hash[hash];

	hlist_for_each_entry(fi, head, fib_hash) {
		if (!net_eq(fi->fib_net, net))
			continue;
		if (!fi->nh || fi->nh->id != cfg->fc_nh_id)
			continue;
		if (cfg->fc_protocol == fi->fib_protocol &&
		    cfg->fc_scope == fi->fib_scope &&
		    cfg->fc_prefsrc == fi->fib_prefsrc &&
		    cfg->fc_priority == fi->fib_priority &&
		    cfg->fc_type == fi->fib_type &&
		    cfg->fc_table == fi->fib_tb_id &&
		    !((cfg->fc_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK))
			return fi;
	}

	return NULL;
}

static struct fib_info *fib_find_info(struct fib_info *nfi)
@@ -789,6 +840,12 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi,
	if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
		return 1;

	if (cfg->fc_nh_id) {
		if (fi->nh && cfg->fc_nh_id == fi->nh->id)
			return 0;
		return 1;
	}

	if (cfg->fc_oif || cfg->fc_gw_family) {
		struct fib_nh *nh = fib_info_nh(fi, 0);

@@ -1302,6 +1359,23 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
		goto err_inval;
	}

	if (cfg->fc_nh_id) {
		if (!cfg->fc_mx) {
			fi = fib_find_info_nh(net, cfg);
			if (fi) {
				fi->fib_treeref++;
				return fi;
			}
		}

		nh = nexthop_find_by_id(net, cfg->fc_nh_id);
		if (!nh) {
			NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
			goto err_inval;
		}
		nhs = 0;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len, extack);
Loading