Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit fc66f95c authored by Eric Dumazet, committed by David S. Miller
Browse files

net dst: use a percpu_counter to track entries



struct dst_ops tracks the number of allocated dst entries in an atomic_t
field, which is subject to high cache line contention under stress workloads.

Switch to a percpu_counter, to reduce the number of times we need to dirty a
central location. Place it on a separate cache line to avoid dirtying
read only fields.

Stress test :

(Sending 160.000.000 UDP frames,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_TRIE, SLUB/NUMA)

Before:

real    0m51.179s
user    0m15.329s
sys     10m15.942s

After:

real	0m45.570s
user	0m15.525s
sys	9m56.669s

With a small reordering of struct neighbour fields (to separate refcnt from
other read-mostly fields), which is the subject of a following patch:

real	0m41.841s
user	0m15.261s
sys	8m45.949s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 0ed8ddf4
Loading
Loading
Loading
Loading
+36 −1
Original line number Diff line number Diff line
#ifndef _NET_DST_OPS_H
#define _NET_DST_OPS_H
#include <linux/types.h>
#include <linux/percpu_counter.h>

struct dst_entry;
struct kmem_cachep;
@@ -22,7 +23,41 @@ struct dst_ops {
	void			(*update_pmtu)(struct dst_entry *dst, u32 mtu);
	int			(*local_out)(struct sk_buff *skb);

	atomic_t		entries;
	struct kmem_cache	*kmem_cachep;

	struct percpu_counter	pcpuc_entries ____cacheline_aligned_in_smp;
};

/*
 * dst_entries_get_fast - cheap read of the dst entry counter
 * @dst: dst_ops whose pcpuc_entries counter is read
 *
 * Returns the result of percpu_counter_read_positive() on the counter,
 * converted to int.  Unlike dst_entries_get_slow(), no bottom-half
 * protection is taken around the read.
 *
 * NOTE(review): this is presumably an approximate value, since it does not
 * go through percpu_counter_sum_positive(); confirm against the
 * percpu_counter API before relying on an exact count.
 */
static inline int dst_entries_get_fast(struct dst_ops *dst)
{
	int entries = percpu_counter_read_positive(&dst->pcpuc_entries);

	return entries;
}

/*
 * dst_entries_get_slow - summed read of the dst entry counter
 * @dst: dst_ops whose pcpuc_entries counter is read
 *
 * Calls percpu_counter_sum_positive() on the counter with bottom halves
 * disabled for the duration of the call, and returns the result converted
 * to int.
 */
static inline int dst_entries_get_slow(struct dst_ops *dst)
{
	struct percpu_counter *counter = &dst->pcpuc_entries;
	int total;

	local_bh_disable();
	total = percpu_counter_sum_positive(counter);
	local_bh_enable();

	return total;
}

/*
 * dst_entries_add - adjust the dst entry counter
 * @dst: dst_ops whose pcpuc_entries counter is updated
 * @val: signed amount to add (may be negative to decrement)
 *
 * The update is performed through percpu_counter_add() with bottom halves
 * disabled around the call.
 */
static inline void dst_entries_add(struct dst_ops *dst, int val)
{
	struct percpu_counter *counter = &dst->pcpuc_entries;

	local_bh_disable();
	percpu_counter_add(counter, val);
	local_bh_enable();
}

/*
 * dst_entries_init - set up the dst entry counter
 * @dst: dst_ops whose pcpuc_entries counter is initialised
 *
 * Initialises the counter to zero via percpu_counter_init() and hands its
 * return value back unchanged, so callers can check it for failure.
 */
static inline int dst_entries_init(struct dst_ops *dst)
{
	struct percpu_counter *counter = &dst->pcpuc_entries;

	return percpu_counter_init(counter, 0);
}

/*
 * dst_entries_destroy - tear down the dst entry counter
 * @dst: dst_ops whose pcpuc_entries counter is destroyed
 *
 * Counterpart of dst_entries_init(); passes the counter to
 * percpu_counter_destroy().
 */
static inline void dst_entries_destroy(struct dst_ops *dst)
{
	struct percpu_counter *counter = &dst->pcpuc_entries;

	percpu_counter_destroy(counter);
}

#endif
+9 −2
Original line number Diff line number Diff line
@@ -106,7 +106,6 @@ static struct dst_ops fake_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.update_pmtu =		fake_update_pmtu,
	.entries =		ATOMIC_INIT(0),
};

/*
@@ -1003,15 +1002,22 @@ int __init br_netfilter_init(void)
{
	int ret;

	ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
	ret = dst_entries_init(&fake_dst_ops);
	if (ret < 0)
		return ret;

	ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
	if (ret < 0) {
		dst_entries_destroy(&fake_dst_ops);
		return ret;
	}
#ifdef CONFIG_SYSCTL
	brnf_sysctl_header = register_sysctl_paths(brnf_path, brnf_table);
	if (brnf_sysctl_header == NULL) {
		printk(KERN_WARNING
		       "br_netfilter: can't register to sysctl.\n");
		nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
		dst_entries_destroy(&fake_dst_ops);
		return -ENOMEM;
	}
#endif
@@ -1025,4 +1031,5 @@ void br_netfilter_fini(void)
#ifdef CONFIG_SYSCTL
	unregister_sysctl_table(brnf_sysctl_header);
#endif
	dst_entries_destroy(&fake_dst_ops);
}
+3 −3
Original line number Diff line number Diff line
@@ -168,7 +168,7 @@ void *dst_alloc(struct dst_ops *ops)
{
	struct dst_entry *dst;

	if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {
	if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
		if (ops->gc(ops))
			return NULL;
	}
@@ -183,7 +183,7 @@ void *dst_alloc(struct dst_ops *ops)
#if RT_CACHE_DEBUG >= 2
	atomic_inc(&dst_total);
#endif
	atomic_inc(&ops->entries);
	dst_entries_add(ops, 1);
	return dst;
}
EXPORT_SYMBOL(dst_alloc);
@@ -236,7 +236,7 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
		neigh_release(neigh);
	}

	atomic_dec(&dst->ops->entries);
	dst_entries_add(dst->ops, -1);

	if (dst->ops->destroy)
		dst->ops->destroy(dst);
+2 −1
Original line number Diff line number Diff line
@@ -132,7 +132,6 @@ static struct dst_ops dn_dst_ops = {
	.negative_advice =	dn_dst_negative_advice,
	.link_failure =		dn_dst_link_failure,
	.update_pmtu =		dn_dst_update_pmtu,
	.entries =		ATOMIC_INIT(0),
};

static __inline__ unsigned dn_hash(__le16 src, __le16 dst)
@@ -1758,6 +1757,7 @@ void __init dn_route_init(void)
	dn_dst_ops.kmem_cachep =
		kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
	dst_entries_init(&dn_dst_ops);
	setup_timer(&dn_route_timer, dn_dst_check_expire, 0);
	dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ;
	add_timer(&dn_route_timer);
@@ -1816,5 +1816,6 @@ void __exit dn_route_cleanup(void)
	dn_run_flush(0);

	proc_net_remove(&init_net, "decnet_cache");
	dst_entries_destroy(&dn_dst_ops);
}
+22 −14
Original line number Diff line number Diff line
@@ -159,7 +159,6 @@ static struct dst_ops ipv4_dst_ops = {
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.entries =		ATOMIC_INIT(0),
};

#define ECN_OR_COST(class)	TC_PRIO_##class
@@ -466,7 +465,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
@@ -945,6 +944,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
@@ -954,28 +954,28 @@ static int rt_garbage_collect(struct dst_ops *ops)
	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
@@ -1032,14 +1032,16 @@ static int rt_garbage_collect(struct dst_ops *ops)
		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
				dst_entries_get_fast(&ipv4_dst_ops), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
@@ -1049,11 +1051,12 @@ static int rt_garbage_collect(struct dst_ops *ops)
work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
			dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
#endif
out:	return 0;
}
@@ -2717,7 +2720,6 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_blackhole_dst_check,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.entries		=	ATOMIC_INIT(0),
};


@@ -3287,6 +3289,12 @@ int __init ip_rt_init(void)

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
Loading