Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit ff1f56d9 authored by Daniel Borkmann's avatar Daniel Borkmann
Browse files

Merge branch 'bpf-fib-lookup-helper'



David Ahern says:

====================
Provide a helper for doing a FIB and neighbor lookup in the kernel
tables from an XDP program. The helper provides a fastpath for forwarding
packets. If the packet is a local delivery or for any reason is not a
simple lookup and forward, the packet is expected to continue up the stack
for full processing.

The response from a FIB and neighbor lookup is either the egress index
with the bpf_fib_lookup struct filled in with dmac and gateway or
0 meaning the packet should continue up the stack. In time we can
revisit this to return the FIB lookup result errno if it is one of the
special RTN_'s such as RTN_BLACKHOLE (-EINVAL) so that the XDP
programs can do an early drop if desired.

Patches 1-6 do some more refactoring to IPv6 with the end goal of
extracting a FIB lookup function that aligns with fib_lookup for IPv4,
basically returning a fib6_info without creating a dst based entry.

Patch 7 adds lookup functions to the ipv6 stub. These are needed since
bpf is built into the kernel and ipv6 may not be built or loaded.

Patch 8 adds the bpf helper and 9 adds a sample program.

v3
- remove ETH_ALEN and in6_addr from uapi header

v2
- removed pkt_access from bpf_func_proto as noticed by Daniel
- added a check that IPv6 forwarding is enabled
- added DaveM's ack on patches 1-7 and 9 based on v1 response and
  fact that no changes were made to them in v2

v1
- updated commit messages and cover letter
- added comment to sample program noting lack of verification on
  egress device supporting XDP

RFC v2
- fixed use of forward helper from cls_act as noted by Daniel
- in patch 1 rename fib6_lookup_1 as well for consistency
====================

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents 68625b76 fe616055
Loading
Loading
Loading
Loading
+14 −0
Original line number Diff line number Diff line
@@ -223,6 +223,20 @@ struct ipv6_stub {
				 const struct in6_addr *addr);
	int (*ipv6_dst_lookup)(struct net *net, struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6);

	struct fib6_table *(*fib6_get_table)(struct net *net, u32 id);
	struct fib6_info *(*fib6_lookup)(struct net *net, int oif,
					 struct flowi6 *fl6, int flags);
	struct fib6_info *(*fib6_table_lookup)(struct net *net,
					      struct fib6_table *table,
					      int oif, struct flowi6 *fl6,
					      int flags);
	struct fib6_info *(*fib6_multipath_select)(const struct net *net,
						   struct fib6_info *f6i,
						   struct flowi6 *fl6, int oif,
						   const struct sk_buff *skb,
						   int strict);

	void (*udpv6_encap_enable)(void);
	void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr,
			      const struct in6_addr *solicited_addr,
+18 −3
Original line number Diff line number Diff line
@@ -376,7 +376,22 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb,
				   int flags, pol_lookup_t lookup);

struct fib6_node *fib6_lookup(struct fib6_node *root,
/* called with rcu lock held; can return error pointer
 * caller needs to select path
 */
struct fib6_info *fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
			      int flags);

/* called with rcu lock held; caller needs to select path */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict);

struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb, int strict);

struct fib6_node *fib6_node_lookup(struct fib6_node *root,
				   const struct in6_addr *daddr,
				   const struct in6_addr *saddr);

+7 −7
Original line number Diff line number Diff line
@@ -12,10 +12,10 @@

TRACE_EVENT(fib6_table_lookup,

	TP_PROTO(const struct net *net, const struct rt6_info *rt,
	TP_PROTO(const struct net *net, const struct fib6_info *f6i,
		 struct fib6_table *table, const struct flowi6 *flp),

	TP_ARGS(net, rt, table, flp),
	TP_ARGS(net, f6i, table, flp),

	TP_STRUCT__entry(
		__field(	u32,	tb_id		)
@@ -48,20 +48,20 @@ TRACE_EVENT(fib6_table_lookup,
		in6 = (struct in6_addr *)__entry->dst;
		*in6 = flp->daddr;

		if (rt->rt6i_idev) {
			__assign_str(name, rt->rt6i_idev->dev->name);
		if (f6i->fib6_nh.nh_dev) {
			__assign_str(name, f6i->fib6_nh.nh_dev);
		} else {
			__assign_str(name, "");
		}
		if (rt == net->ipv6.ip6_null_entry) {
		if (f6i == net->ipv6.fib6_null_entry) {
			struct in6_addr in6_zero = {};

			in6 = (struct in6_addr *)__entry->gw;
			*in6 = in6_zero;

		} else if (rt) {
		} else if (f6i) {
			in6 = (struct in6_addr *)__entry->gw;
			*in6 = rt->rt6i_gateway;
			*in6 = f6i->fib6_nh.nh_gw;
		}
	),

+80 −1
Original line number Diff line number Diff line
@@ -1828,6 +1828,33 @@ union bpf_attr {
 * 	Return
 * 		0 on success, or a negative error in case of failure.
 *
 *
 * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags)
 *	Description
 *		Do FIB lookup in kernel tables using parameters in *params*.
 *		If lookup is successful and result shows packet is to be
 *		forwarded, the neighbor tables are searched for the nexthop.
 *		If successful (i.e., FIB lookup shows forwarding and nexthop
 *		is resolved), the nexthop address is returned in ipv4_dst,
 *		ipv6_dst or mpls_out based on family, smac is set to mac
 *		address of egress device, dmac is set to nexthop mac address,
 *		rt_metric is set to metric from route.
 *
 *             *plen* argument is the size of the passed in struct.
 *             *flags* argument can be one or more BPF_FIB_LOOKUP_ flags:
 *
 *             **BPF_FIB_LOOKUP_DIRECT** means do a direct table lookup vs
 *             full lookup using FIB rules
 *             **BPF_FIB_LOOKUP_OUTPUT** means do lookup from an egress
 *             perspective (default is ingress)
 *
 *             *ctx* is either **struct xdp_md** for XDP programs or
 *             **struct sk_buff** for tc cls_act programs.
 *
 *     Return
 *             Egress device index on success, 0 if packet needs to continue
 *             up the stack for further processing or a negative error in case
 *             of failure.
 */
#define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
@@ -1898,7 +1925,8 @@ union bpf_attr {
	FN(xdp_adjust_tail),		\
	FN(skb_get_xfrm_state),		\
	FN(get_stack),			\
	FN(skb_load_bytes_relative),
	FN(skb_load_bytes_relative),	\
	FN(fib_lookup),

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
 * function eBPF program intends to call
@@ -2321,4 +2349,55 @@ struct bpf_raw_tracepoint_args {
	__u64 args[0];
};

/* Flags accepted by the bpf_fib_lookup() helper.
 *
 * DIRECT:  Skip the FIB rules and go to FIB table associated with device
 * OUTPUT:  Do lookup from egress perspective; default is ingress
 *
 * The values are spelled as plain shifts rather than BIT(): this is a uapi
 * header and BIT() is a kernel-internal macro that is not available to
 * user space programs including this file.
 */
#define BPF_FIB_LOOKUP_DIRECT  (1U << 0)
#define BPF_FIB_LOOKUP_OUTPUT  (1U << 1)

/* In/out parameter block for the bpf_fib_lookup() helper.
 *
 * The caller fills in the input fields and passes the struct together with
 * its size. Several members are unions in which an input and an output
 * share storage, so an input value may be gone once the helper has
 * written its result.
 */
struct bpf_fib_lookup {
	/* input */
	__u8	family;   /* network family, AF_INET, AF_INET6, AF_MPLS */

	/* set if lookup is to consider L4 data - e.g., FIB rules */
	__u8	l4_protocol;
	__be16	sport;
	__be16	dport;

	/* total length of packet from network header - used for MTU check */
	__u16	tot_len;
	__u32	ifindex;  /* L3 device index for lookup */

	/* tos/flowlabel (inputs) and rt_metric (output) share storage:
	 * writing the route metric overwrites the tos/flowlabel input.
	 */
	union {
		/* inputs to lookup */
		__u8	tos;		/* AF_INET  */
		__be32	flowlabel;	/* AF_INET6 */

		/* output: metric of fib result */
		__u32 rt_metric;
	};

	/* input: source address (or incoming MPLS label), by family */
	union {
		__be32		mpls_in;
		__be32		ipv4_src;
		__u32		ipv6_src[4];  /* in6_addr; network order */
	};

	/* input to bpf_fib_lookup, *dst is destination address.
	 * output: bpf_fib_lookup sets to gateway address
	 * (the same storage is used for both, so the destination address
	 * is replaced when a gateway is written)
	 */
	union {
		/* return for MPLS lookups */
		__be32		mpls_out[4];  /* support up to 4 labels */
		__be32		ipv4_dst;
		__u32		ipv6_dst[4];  /* in6_addr; network order */
	};

	/* output */
	__be16	h_vlan_proto;
	__be16	h_vlan_TCI;
	__u8	smac[6];     /* ETH_ALEN; source MAC for the forwarded frame */
	__u8	dmac[6];     /* ETH_ALEN; destination (nexthop) MAC */
};

#endif /* _UAPI__LINUX_BPF_H__ */
+267 −0
Original line number Diff line number Diff line
@@ -60,6 +60,10 @@
#include <net/xfrm.h>
#include <linux/bpf_trace.h>
#include <net/xdp_sock.h>
#include <linux/inetdevice.h>
#include <net/ip_fib.h>
#include <net/flow.h>
#include <net/arp.h>

/**
 *	sk_filter_trim_cap - run a packet through a socket filter
@@ -4032,6 +4036,265 @@ static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = {
};
#endif

#if IS_ENABLED(CONFIG_INET) || IS_ENABLED(CONFIG_IPV6)
/* Fill in the forwarding outputs of @params once a nexthop has been
 * resolved: the VLAN fields are cleared, smac is taken from the egress
 * device address and dmac from the neighbour's hardware address.
 *
 * Returns the ifindex of @dev.
 */
static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params,
				  const struct neighbour *neigh,
				  const struct net_device *dev)
{
	/* no VLAN information is reported by this helper */
	params->h_vlan_proto = 0;
	params->h_vlan_TCI = 0;

	memcpy(params->smac, dev->dev_addr, ETH_ALEN);
	memcpy(params->dmac, neigh->ha, ETH_ALEN);

	return dev->ifindex;
}
#endif

#if IS_ENABLED(CONFIG_INET)
/* IPv4 FIB and neighbour lookup backing the bpf_fib_lookup() helper.
 *
 * @net:    network namespace the lookup is done in
 * @params: in/out parameters; ifindex, tos, l4_protocol, ipv4_src,
 *          ipv4_dst, sport and dport are read as inputs
 * @flags:  BPF_FIB_LOOKUP_DIRECT and/or BPF_FIB_LOOKUP_OUTPUT
 *
 * Returns -ENODEV when params->ifindex does not name a device in @net,
 * the egress device ifindex when a unicast route and a neighbour entry
 * for its nexthop are both found, and 0 in every other case.
 *
 * On a positive return smac, dmac, the VLAN fields, rt_metric and (for
 * gateway routes) ipv4_dst have been written. Note that ipv4_dst and
 * rt_metric are written before the neighbour lookup, so they may already
 * be modified when 0 is returned because no neighbour entry exists.
 */
static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
			       u32 flags)
{
	struct in_device *in_dev;
	struct neighbour *neigh;
	struct net_device *dev;
	struct fib_result res;
	struct fib_nh *nh;
	struct flowi4 fl4;
	int err;

	dev = dev_get_by_index_rcu(net, params->ifindex);
	if (unlikely(!dev))
		return -ENODEV;

	/* verify forwarding is enabled on this interface */
	in_dev = __in_dev_get_rcu(dev);
	if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev)))
		return 0;

	/* params->ifindex is the output interface for egress lookups and
	 * the input interface otherwise.
	 * NOTE(review): the literal 1 is presumably LOOPBACK_IFINDEX - confirm
	 */
	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
		fl4.flowi4_iif = 1;
		fl4.flowi4_oif = params->ifindex;
	} else {
		fl4.flowi4_iif = params->ifindex;
		fl4.flowi4_oif = 0;
	}
	/* tos must be read here: it shares a union with rt_metric, which is
	 * written further down
	 */
	fl4.flowi4_tos = params->tos & IPTOS_RT_MASK;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;

	fl4.flowi4_proto = params->l4_protocol;
	fl4.daddr = params->ipv4_dst;
	fl4.saddr = params->ipv4_src;
	fl4.fl4_sport = params->sport;
	fl4.fl4_dport = params->dport;

	if (flags & BPF_FIB_LOOKUP_DIRECT) {
		/* use the table reported for the device, falling back to
		 * RT_TABLE_MAIN when that id is 0
		 */
		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
		struct fib_table *tb;

		tb = fib_get_table(net, tbid);
		if (unlikely(!tb))
			return 0;

		/* NOTE(review): fl4 is not zero-initialized and mark, secid,
		 * tun_key and uid are only set on the else branch - confirm
		 * fib_table_lookup() does not read them
		 */
		err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF);
	} else {
		/* remaining flow fields are set only for the full lookup */
		fl4.flowi4_mark = 0;
		fl4.flowi4_secid = 0;
		fl4.flowi4_tun_key.tun_id = 0;
		fl4.flowi4_uid = sock_net_uid(net, NULL);

		err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF);
	}

	/* lookup errors and non-unicast results are not handled here */
	if (err || res.type != RTN_UNICAST)
		return 0;

	/* multipath route: let fib_select_path() pick the nexthop */
	if (res.fi->fib_nhs > 1)
		fib_select_path(net, &res, &fl4, NULL);

	nh = &res.fi->fib_nh[res.nh_sel];

	/* do not handle lwt encaps right now */
	if (nh->nh_lwtstate)
		return 0;

	dev = nh->nh_dev;
	if (unlikely(!dev))
		return 0;

	/* for gateway routes the neighbour to resolve is the gateway, so the
	 * destination in params is replaced by the gateway address
	 */
	if (nh->nh_gw)
		params->ipv4_dst = nh->nh_gw;

	/* overwrites the tos input (same union) */
	params->rt_metric = res.fi->fib_priority;

	/* xdp and cls_bpf programs are run in RCU-bh so
	 * rcu_read_lock_bh is not needed here
	 */
	neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst);
	if (neigh)
		return bpf_fib_set_fwd_params(params, neigh, dev);

	/* no neighbour entry for the nexthop */
	return 0;
}
#endif

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 FIB and neighbour lookup backing the bpf_fib_lookup() helper.
 *
 * @net:    network namespace the lookup is done in
 * @params: in/out parameters; ifindex, flowlabel, l4_protocol, ipv6_src,
 *          ipv6_dst, sport and dport are read as inputs
 * @flags:  BPF_FIB_LOOKUP_DIRECT and/or BPF_FIB_LOOKUP_OUTPUT
 *
 * All FIB entry points are reached through ipv6_stub.
 *
 * Returns -ENODEV when params->ifindex does not name a device in @net,
 * the egress device ifindex when a unicast route and a neighbour entry
 * for its nexthop are both found, and 0 in every other case.
 *
 * On a positive return smac, dmac, the VLAN fields, rt_metric and (for
 * gateway routes) ipv6_dst have been written. Note that ipv6_dst and
 * rt_metric are written before the neighbour lookup, so they may already
 * be modified when 0 is returned because no neighbour entry exists.
 */
static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
			       u32 flags)
{
	/* src and dst point into params; writing *dst updates ipv6_dst */
	struct in6_addr *src = (struct in6_addr *) params->ipv6_src;
	struct in6_addr *dst = (struct in6_addr *) params->ipv6_dst;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;
	struct fib6_info *f6i;
	struct flowi6 fl6;
	int strict = 0;
	int oif;

	/* link local addresses are never forwarded */
	if (rt6_need_strict(dst) || rt6_need_strict(src))
		return 0;

	dev = dev_get_by_index_rcu(net, params->ifindex);
	if (unlikely(!dev))
		return -ENODEV;

	/* idev is only checked for existence; the forwarding flag tested is
	 * the one in net->ipv6.devconf_all, not one read from idev
	 */
	idev = __in6_dev_get_safely(dev);
	if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
		return 0;

	/* params->ifindex is the output interface for egress lookups and
	 * the input interface otherwise; oif is set to params->ifindex in
	 * both cases and is what gets passed to the lookup functions.
	 * NOTE(review): the literal 1 is presumably LOOPBACK_IFINDEX - confirm
	 */
	if (flags & BPF_FIB_LOOKUP_OUTPUT) {
		fl6.flowi6_iif = 1;
		oif = fl6.flowi6_oif = params->ifindex;
	} else {
		oif = fl6.flowi6_iif = params->ifindex;
		fl6.flowi6_oif = 0;
		strict = RT6_LOOKUP_F_HAS_SADDR;
	}
	/* flowlabel must be read here: it shares a union with rt_metric,
	 * which is written further down
	 */
	fl6.flowlabel = params->flowlabel;
	fl6.flowi6_scope = 0;
	fl6.flowi6_flags = 0;
	fl6.mp_hash = 0;

	fl6.flowi6_proto = params->l4_protocol;
	fl6.daddr = *dst;
	fl6.saddr = *src;
	fl6.fl6_sport = params->sport;
	fl6.fl6_dport = params->dport;

	if (flags & BPF_FIB_LOOKUP_DIRECT) {
		/* use the table reported for the device, falling back to
		 * RT_TABLE_MAIN when that id is 0
		 */
		u32 tbid = l3mdev_fib_table_rcu(dev) ? : RT_TABLE_MAIN;
		struct fib6_table *tb;

		tb = ipv6_stub->fib6_get_table(net, tbid);
		if (unlikely(!tb))
			return 0;

		/* NOTE(review): fl6 is not zero-initialized and mark, secid,
		 * tun_key and uid are only set on the else branch - confirm
		 * fib6_table_lookup() does not read them
		 */
		f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict);
	} else {
		/* remaining flow fields are set only for the full lookup */
		fl6.flowi6_mark = 0;
		fl6.flowi6_secid = 0;
		fl6.flowi6_tun_key.tun_id = 0;
		fl6.flowi6_uid = sock_net_uid(net, NULL);

		f6i = ipv6_stub->fib6_lookup(net, oif, &fl6, strict);
	}

	/* error pointer, no result, or the namespace's null entry */
	if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry))
		return 0;

	/* reject routes and non-unicast routes are not handled here */
	if (unlikely(f6i->fib6_flags & RTF_REJECT ||
	    f6i->fib6_type != RTN_UNICAST))
		return 0;

	/* multipath route and no output interface set in the flow: let
	 * fib6_multipath_select() pick the nexthop (oif argument is 0 here)
	 */
	if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0)
		f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6,
						       fl6.flowi6_oif, NULL,
						       strict);

	/* lwt encaps are not handled */
	if (f6i->fib6_nh.nh_lwtstate)
		return 0;

	/* for gateway routes the neighbour to resolve is the gateway; this
	 * write goes through dst into params->ipv6_dst
	 */
	if (f6i->fib6_flags & RTF_GATEWAY)
		*dst = f6i->fib6_nh.nh_gw;

	/* NOTE(review): unlike the IPv4 path, nh_dev is not checked for NULL
	 * before it is used below - confirm it cannot be NULL here
	 */
	dev = f6i->fib6_nh.nh_dev;
	/* overwrites the flowlabel input (same union) */
	params->rt_metric = f6i->fib6_metric;

	/* xdp and cls_bpf programs are run in RCU-bh so rcu_read_lock_bh is
	 * not needed here. Can not use __ipv6_neigh_lookup_noref here
	 * because we need to get nd_tbl via the stub
	 */
	neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128,
				      ndisc_hashfn, dst, dev);
	if (neigh)
		return bpf_fib_set_fwd_params(params, neigh, dev);

	/* no neighbour entry for the nexthop */
	return 0;
}
#endif

/* bpf_fib_lookup() helper entry point for XDP programs.
 *
 * @ctx:    XDP buffer; the lookup runs in the network namespace of the
 *          device the buffer was received on (ctx->rxq->dev)
 * @params: in/out lookup parameters; params->family selects IPv4 or IPv6
 * @plen:   size of the struct passed by the program
 * @flags:  BPF_FIB_LOOKUP_DIRECT and/or BPF_FIB_LOOKUP_OUTPUT
 *
 * Returns -EINVAL when @plen is smaller than struct bpf_fib_lookup or when
 * @flags contains bits other than the two known flags, -ENOTSUPP when
 * params->family is not handled by this build, and otherwise the result of
 * the per-family lookup.
 */
BPF_CALL_4(bpf_xdp_fib_lookup, struct xdp_buff *, ctx,
	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
	if (plen < sizeof(*params))
		return -EINVAL;

	/* reject unknown flag bits so they stay available for future use */
	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
		return -EINVAL;

	switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
	case AF_INET:
		return bpf_ipv4_fib_lookup(dev_net(ctx->rxq->dev), params,
					   flags);
#endif
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return bpf_ipv6_fib_lookup(dev_net(ctx->rxq->dev), params,
					   flags);
#endif
	}

	/* Unsupported family is an error, matching bpf_skb_fib_lookup();
	 * returning 0 would be indistinguishable from "continue up the stack".
	 */
	return -ENOTSUPP;
}

/* Prototype describing bpf_xdp_fib_lookup(); this is what xdp_func_proto()
 * hands out for BPF_FUNC_fib_lookup. Arguments in order: program context,
 * pointer to the parameter memory, its size, and the flags word.
 */
static const struct bpf_func_proto bpf_xdp_fib_lookup_proto = {
	.func		= bpf_xdp_fib_lookup,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

/* bpf_fib_lookup() helper entry point for tc cls_act programs.
 *
 * @skb:    packet; the lookup runs in the network namespace of skb->dev
 * @params: in/out lookup parameters; params->family selects IPv4 or IPv6
 * @plen:   size of the struct passed by the program
 * @flags:  BPF_FIB_LOOKUP_DIRECT and/or BPF_FIB_LOOKUP_OUTPUT
 *
 * Returns -EINVAL when @plen is smaller than struct bpf_fib_lookup or when
 * @flags contains bits other than the two known flags, -ENOTSUPP when
 * params->family is not handled by this build, and otherwise the result of
 * the per-family lookup.
 */
BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb,
	   struct bpf_fib_lookup *, params, int, plen, u32, flags)
{
	if (plen < sizeof(*params))
		return -EINVAL;

	/* reject unknown flag bits so they stay available for future use */
	if (flags & ~(BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT))
		return -EINVAL;

	switch (params->family) {
#if IS_ENABLED(CONFIG_INET)
	case AF_INET:
		return bpf_ipv4_fib_lookup(dev_net(skb->dev), params, flags);
#endif
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return bpf_ipv6_fib_lookup(dev_net(skb->dev), params, flags);
#endif
	}
	return -ENOTSUPP;
}

/* Prototype describing bpf_skb_fib_lookup(); this is what
 * tc_cls_act_func_proto() hands out for BPF_FUNC_fib_lookup. Arguments in
 * order: program context, pointer to the parameter memory, its size, and
 * the flags word.
 */
static const struct bpf_func_proto bpf_skb_fib_lookup_proto = {
	.func		= bpf_skb_fib_lookup,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
	.arg4_type	= ARG_ANYTHING,
};

static const struct bpf_func_proto *
bpf_base_func_proto(enum bpf_func_id func_id)
{
@@ -4181,6 +4444,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
	case BPF_FUNC_skb_get_xfrm_state:
		return &bpf_skb_get_xfrm_state_proto;
#endif
	case BPF_FUNC_fib_lookup:
		return &bpf_skb_fib_lookup_proto;
	default:
		return bpf_base_func_proto(func_id);
	}
@@ -4206,6 +4471,8 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
		return &bpf_xdp_redirect_map_proto;
	case BPF_FUNC_xdp_adjust_tail:
		return &bpf_xdp_adjust_tail_proto;
	case BPF_FUNC_fib_lookup:
		return &bpf_xdp_fib_lookup_proto;
	default:
		return bpf_base_func_proto(func_id);
	}
Loading