Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 4895c771 authored by David S. Miller's avatar David S. Miller
Browse files

ipv4: Add FIB nexthop exceptions.



In a regime where we have subnetted route entries, we need a way to
persistently store destination-specific learned values such as
redirects and PMTU values.

This is implemented here via nexthop exceptions.

The initial implementation is a 2048 entry hash table with reclaiming
starting at chain length 5.  A more sophisticated scheme can be
devised if that proves necessary.

Signed-off-by: David S. Miller <davem@davemloft.net>
parent 6700c270
Loading
Loading
Loading
Loading
+18 −0
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@

#include <net/flow.h>
#include <linux/seq_file.h>
#include <linux/rcupdate.h>
#include <net/fib_rules.h>
#include <net/inetpeer.h>

@@ -46,6 +47,22 @@ struct fib_config {

struct fib_info;

/*
 * Per-destination exception hanging off a FIB nexthop.
 *
 * Records values learned for a single destination address (a redirected
 * gateway and/or a path MTU).  Entries are chained per hash bucket and
 * walked under RCU.
 */
struct fib_nh_exception {
	/* Next entry in the same hash chain. */
	struct fib_nh_exception __rcu	*fnhe_next;
	/* Destination address this exception applies to. */
	__be32				fnhe_daddr;
	/* Learned path MTU; 0 means none recorded. */
	u32				fnhe_pmtu;
	/*
	 * Redirected gateway; 0 means none recorded.  Kept in network byte
	 * order because it is copied to and from __be32 gateway fields.
	 */
	__be32				fnhe_gw;
	/* Jiffies value after which fnhe_pmtu is no longer applied. */
	unsigned long			fnhe_expires;
	/* Jiffies of the last lookup/update; the oldest entry is reclaimed. */
	unsigned long			fnhe_stamp;
};

/* One slot of a nexthop's exception hash table. */
struct fnhe_hash_bucket {
	/* Head of the RCU-protected singly linked list of exceptions. */
	struct fib_nh_exception __rcu	*chain;
};

/* Number of buckets allocated for each nexthop's exception table. */
#define FNHE_HASH_SIZE		2048
/*
 * Chain length threshold: once a chain is longer than this, the oldest
 * entry is reused instead of allocating a new one.
 */
#define FNHE_RECLAIM_DEPTH	5

struct fib_nh {
	struct net_device	*nh_dev;
	struct hlist_node	nh_hash;
@@ -63,6 +80,7 @@ struct fib_nh {
	__be32			nh_gw;
	__be32			nh_saddr;
	int			nh_saddr_genid;
	struct fnhe_hash_bucket	*nh_exceptions;
};

/*
+23 −0
Original line number Diff line number Diff line
@@ -140,6 +140,27 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
	},
};

/*
 * Free every exception entry of @nh and then the hash table itself.
 *
 * The caller must ensure nh->nh_exceptions is non-NULL.  This runs while the
 * owning fib_info is being torn down, not inside an RCU read-side critical
 * section, so the chain pointers are fetched with
 * rcu_dereference_protected(..., 1) rather than rcu_dereference().
 */
static void free_nh_exceptions(struct fib_nh *nh)
{
	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
	int i;

	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		fnhe = rcu_dereference_protected(hash[i].chain, 1);
		while (fnhe) {
			struct fib_nh_exception *next;

			/* Grab the successor before the node is released. */
			next = rcu_dereference_protected(fnhe->fnhe_next, 1);
			kfree(fnhe);

			fnhe = next;
		}
	}
	kfree(hash);
}

/* Release a nexthop info record */
static void free_fib_info_rcu(struct rcu_head *head)
{
@@ -148,6 +169,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
	change_nexthops(fi) {
		if (nexthop_nh->nh_dev)
			dev_put(nexthop_nh->nh_dev);
		if (nexthop_nh->nh_exceptions)
			free_nh_exceptions(nexthop_nh);
	} endfor_nexthops(fi);

	release_net(fi->fib_net);
+225 −31
Original line number Diff line number Diff line
@@ -1275,14 +1275,130 @@ static void rt_del(unsigned int hash, struct rtable *rt)
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
/*
 * Fill @fl4 with an output flow key for the addresses found in @iph.
 *
 * When @sk is non-NULL, its bound device, mark, TOS and protocol replace the
 * @oif, @mark, @tos and @prot arguments.  @flow_flags is always used as
 * passed in.
 */
static void __build_flow_key(struct flowi4 *fl4, struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		/* Socket settings take precedence over the caller's values. */
		if (inet_sk(sk)->hdrincl)
			prot = IPPROTO_RAW;
		else
			prot = sk->sk_protocol;
		tos = RT_CONN_FLAGS(sk);
		mark = sk->sk_mark;
		oif = sk->sk_bound_dev_if;
	}

	flowi4_init_output(fl4, oif, mark, tos, RT_SCOPE_UNIVERSE, prot,
			   flow_flags, iph->daddr, iph->saddr, 0, 0);
}

/*
 * Build a flow key from the IP header of @skb, using the receiving device,
 * header TOS/protocol and skb mark as defaults.  @sk may be NULL; it is
 * passed through to __build_flow_key().
 */
static void build_skb_flow_key(struct flowi4 *fl4, struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *hdr = ip_hdr(skb);

	__build_flow_key(fl4, sk, hdr, skb->dev->ifindex, RT_TOS(hdr->tos),
			 hdr->protocol, skb->mark, 0);
}

/*
 * Build a flow key purely from socket state.
 *
 * The destination is the socket's peer address unless the socket carries a
 * source-route option, in which case the option's first-hop address is used.
 */
static void build_sk_flow_key(struct flowi4 *fl4, struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	struct ip_options_rcu *opts;
	__be32 dst_addr;

	rcu_read_lock();

	/* inet_opt is RCU-managed, so it is only examined inside the lock. */
	opts = rcu_dereference(inet->inet_opt);
	if (opts && opts->opt.srr)
		dst_addr = opts->opt.faddr;
	else
		dst_addr = inet->inet_daddr;

	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   dst_addr, inet->inet_saddr, 0, 0);

	rcu_read_unlock();
}

/*
 * Build a flow key from @skb when one is supplied, otherwise from the
 * socket @sk alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, struct sock *sk,
				 struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}

	build_skb_flow_key(fl4, skb, sk);
}

/*
 * Taken with spin_lock_bh() around find_or_create_fnhe() and the update of
 * the entry it returns.
 */
static DEFINE_SPINLOCK(fnhe_lock);

/*
 * Return the entry with the earliest fnhe_stamp in the chain of bucket @hash.
 *
 * The chain head is dereferenced unconditionally, so the bucket must not be
 * empty.  @daddr is not used.
 */
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash, __be32 daddr)
{
	struct fib_nh_exception *best = rcu_dereference(hash->chain);
	struct fib_nh_exception *cur = rcu_dereference(best->fnhe_next);

	while (cur) {
		/* time_before() keeps the comparison safe across jiffies wrap. */
		if (time_before(cur->fnhe_stamp, best->fnhe_stamp))
			best = cur;
		cur = rcu_dereference(cur->fnhe_next);
	}

	return best;
}

/*
 * Find the exception entry for @daddr on nexthop @nh, creating it if needed.
 *
 * The per-nexthop hash table is allocated on first use.  If the target chain
 * already holds more than FNHE_RECLAIM_DEPTH entries, the least recently
 * stamped entry is recycled for @daddr instead of allocating a new one.
 *
 * Returns the entry with fnhe_stamp refreshed, or NULL if an allocation
 * failed.  Callers in this file invoke it with fnhe_lock held; allocations
 * are GFP_ATOMIC.
 */
static struct fib_nh_exception *find_or_create_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
	struct fib_nh_exception *fnhe;
	int depth;
	u32 hval;

	if (!hash) {
		hash = nh->nh_exceptions = kzalloc(FNHE_HASH_SIZE * sizeof(*hash),
						   GFP_ATOMIC);
		if (!hash)
			return NULL;
	}

	hval = (__force u32) daddr;
	hval ^= (hval >> 11) ^ (hval >> 22);
	/*
	 * Fold the hash into the table; without the mask the index can be far
	 * beyond the FNHE_HASH_SIZE buckets.  Relies on FNHE_HASH_SIZE being a
	 * power of two.
	 */
	hval &= FNHE_HASH_SIZE - 1;
	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			goto out;
		depth++;
	}

	if (depth > FNHE_RECLAIM_DEPTH) {
		/* 'hash' already points at the bucket; do not offset it again. */
		fnhe = fnhe_oldest(hash, daddr);
		/*
		 * The entry is being handed to a different destination: drop
		 * whatever was learned for the previous one so the caller's
		 * partial update does not leave stale values behind.
		 */
		fnhe->fnhe_pmtu = 0;
		fnhe->fnhe_gw = 0;
		fnhe->fnhe_expires = 0;
		goto out_daddr;
	}
	fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
	if (!fnhe)
		return NULL;

	/* Link the new entry fully before publishing it at the chain head. */
	fnhe->fnhe_next = hash->chain;
	rcu_assign_pointer(hash->chain, fnhe);

out_daddr:
	fnhe->fnhe_daddr = daddr;
out:
	fnhe->fnhe_stamp = jiffies;
	return fnhe;
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct rtable *rt;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
@@ -1296,7 +1412,6 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
		return;
	}

	rt = (struct rtable *) dst;
	if (rt->rt_gateway != old_gw)
		return;

@@ -1320,11 +1435,21 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
			goto reject_redirect;
	}

	n = ipv4_neigh_lookup(dst, NULL, &new_gw);
	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (n) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);
				struct fib_nh_exception *fnhe;

				spin_lock_bh(&fnhe_lock);
				fnhe = find_or_create_fnhe(nh, fl4->daddr);
				if (fnhe)
					fnhe->fnhe_gw = new_gw;
				spin_unlock_bh(&fnhe_lock);
			}
			rt->rt_gateway = new_gw;
			rt->rt_flags |= RTCF_REDIRECTED;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
@@ -1349,6 +1474,17 @@ static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buf
	;
}

/*
 * Redirect handler taking a generic dst entry: derive the flow key from
 * @sk/@skb and hand the route to __ip_do_redirect().
 */
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_do_redirect((struct rtable *) dst, skb, &fl4);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
@@ -1508,20 +1644,39 @@ out: kfree_skb(skb);
	return 0;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;

	dst_confirm(dst);
	struct fib_result res;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);
		struct fib_nh_exception *fnhe;

		spin_lock_bh(&fnhe_lock);
		fnhe = find_or_create_fnhe(nh, fl4->daddr);
		if (fnhe) {
			fnhe->fnhe_pmtu = mtu;
			fnhe->fnhe_expires = jiffies + ip_rt_mtu_expires;
		}
		spin_unlock_bh(&fnhe_lock);
	}
	rt->rt_pmtu = mtu;
	dst_set_expires(&rt->dst, ip_rt_mtu_expires);
}

/*
 * PMTU update handler taking a generic dst entry: derive the flow key from
 * @sk/@skb and apply @mtu through __ip_rt_update_pmtu().
 */
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu((struct rtable *) dst, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
@@ -1529,12 +1684,11 @@ void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
	struct flowi4 fl4;
	struct rtable *rt;

	flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
			   protocol, flow_flags,
			   iph->daddr, iph->saddr, 0, 0);
	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		ip_rt_update_pmtu(&rt->dst, NULL, skb, mtu);
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
@@ -1542,12 +1696,16 @@ EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	return ipv4_update_pmtu(skb, sock_net(sk), mtu,
				sk->sk_bound_dev_if, sk->sk_mark,
				inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
				inet_sk_flowi_flags(sk));
	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

@@ -1558,11 +1716,11 @@ void ipv4_redirect(struct sk_buff *skb, struct net *net,
	struct flowi4 fl4;
	struct rtable *rt;

	flowi4_init_output(&fl4, oif, mark, RT_TOS(iph->tos), RT_SCOPE_UNIVERSE,
			   protocol, flow_flags, iph->daddr, iph->saddr, 0, 0);
	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		ip_do_redirect(&rt->dst, NULL, skb);
		__ip_do_redirect(rt, skb, &fl4);
		ip_rt_put(rt);
	}
}
@@ -1570,12 +1728,16 @@ EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	return ipv4_redirect(skb, sock_net(sk), sk->sk_bound_dev_if,
			     sk->sk_mark,
			     inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			     inet_sk_flowi_flags(sk));
	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

@@ -1722,14 +1884,46 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
	dst_init_metrics(&rt->dst, fi->fib_metrics, true);
}

/*
 * Apply any exception recorded on nexthop @nh for @daddr to route @rt.
 *
 * If a matching entry exists, an unexpired learned PMTU is copied to the
 * route together with its remaining lifetime, a learned gateway overrides
 * rt_gateway, and the entry's stamp is refreshed.  The caller must ensure
 * nh->nh_exceptions is non-NULL.
 */
static void rt_bind_exception(struct rtable *rt, struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
	struct fib_nh_exception *fnhe;
	u32 hval;

	hval = (__force u32) daddr;
	hval ^= (hval >> 11) ^ (hval >> 22);
	/* Mask to the table size so the index stays within the buckets. */
	hval &= FNHE_HASH_SIZE - 1;

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr != daddr)
			continue;

		if (fnhe->fnhe_pmtu) {
			unsigned long expires = fnhe->fnhe_expires;
			/* Single snapshot so the check and the delta agree. */
			unsigned long now = jiffies;

			if (time_before(now, expires)) {
				rt->rt_pmtu = fnhe->fnhe_pmtu;
				/*
				 * dst_set_expires() takes a timeout relative to
				 * now, i.e. the time left until expiry.
				 */
				dst_set_expires(&rt->dst, expires - now);
			}
		}
		if (fnhe->fnhe_gw)
			rt->rt_gateway = fnhe->fnhe_gw;
		fnhe->fnhe_stamp = jiffies;
		break;
	}
}

static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = nh->nh_gw;
		if (unlikely(nh->nh_exceptions))
			rt_bind_exception(rt, nh, fl4->daddr);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = FIB_RES_NH(*res).nh_tclassid;