
Commit 1c0d32fd authored by Eric Dumazet, committed by David S. Miller

net_sched: gen_estimator: complete rewrite of rate estimators



1) Old code was hard to maintain, due to complex lock chains.
   (We probably will be able to remove some kfree_rcu() in callers)

2) Using a single timer to update all estimators does not scale.

3) Code was buggy on 32bit kernel (WRITE_ONCE() on 64bit quantity
   is not supposed to work well)
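For background on item 3 (an illustration added for this write-up, not part of the original patch): on a 32bit kernel a 64bit load or store is performed as two 32bit accesses, so a concurrent reader can observe a half-updated value. WRITE_ONCE()/READ_ONCE() only stop the compiler from tearing or caching the access; they cannot make the hardware access atomic. A minimal sketch of the hazard:

	/* Hypothetical standalone sketch, not code from this patch. */
	#include <linux/compiler.h>	/* WRITE_ONCE / READ_ONCE */
	#include <linux/types.h>

	static u64 counter;	/* e.g. an estimated rate in bytes/sec */

	static void update_counter(u64 v)
	{
		/* On 32bit this compiles to two 32bit stores. */
		WRITE_ONCE(counter, v);
	}

	static u64 sample_counter(void)
	{
		/* A reader racing with update_counter() can observe the new
		 * low 32 bits combined with the old high 32 bits (a torn
		 * value).  This is why the rewrite publishes the 64bit
		 * averages under a seqcount instead.
		 */
		return READ_ONCE(counter);
	}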

In this rewrite:

- I removed the RB tree that had to be scanned in
  gen_estimator_active(). qdisc dumps should be much faster.

- Each estimator has its own timer.

- Estimations are maintained in net_rate_estimator structure,
  instead of dirtying the qdisc. Minor, but part of the simplification.

- Reading the estimator uses RCU and a seqcount to provide proper
  support for 32bit kernels.

- We reduce memory needs when estimators are not used, since
  we store a pointer instead of the bytes/packets counters.

- xt_rateest_mt() no longer has to grab a spinlock.
  (In the future, xt_rateest_tg() could be switched to per cpu counters)
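To make the new calling convention concrete, here is a minimal sketch of an API user (an illustration added for this write-up, not code from this patch; struct my_entity and the helper functions are hypothetical, only the gen_*_estimator() / gen_estimator_read() signatures come from the headers changed below):

	#include <linux/spinlock.h>
	#include <net/gen_stats.h>

	/* Hypothetical object: it now stores a single RCU-managed pointer
	 * instead of an embedded struct gnet_stats_rate_est64.
	 */
	struct my_entity {
		struct gnet_stats_basic_packed	bstats;
		struct net_rate_estimator __rcu	*rate_est;
		spinlock_t			lock;
	};

	/* Attach or replace an estimator configured by the netlink attribute @opt. */
	static int my_entity_start_estimator(struct my_entity *e, struct nlattr *opt)
	{
		return gen_replace_estimator(&e->bstats, NULL, &e->rate_est,
					     &e->lock, NULL, opt);
	}

	/* Sample the current rate; returns false if no estimator is attached.
	 * gen_estimator_read() uses RCU plus a seqcount internally, so the
	 * 64bit averages are read consistently even on 32bit kernels and
	 * without taking the entity lock.
	 */
	static bool my_entity_sample(struct my_entity *e,
				     struct gnet_stats_rate_est64 *sample)
	{
		return gen_estimator_read(&e->rate_est, sample);
	}

	/* Tear down: swap the pointer to NULL, stop the per-estimator timer
	 * and free the estimator after an RCU grace period.
	 */
	static void my_entity_stop_estimator(struct my_entity *e)
	{
		gen_kill_estimator(&e->rate_est);
	}

Because the estimator is now a single RCU-managed pointer, a NULL pointer is the "no estimator" case that gen_estimator_active() tests, and readers never block the data path.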

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent a6e16931
include/net/act_api.h  +1 −1

@@ -36,7 +36,7 @@ struct tc_action {
 	struct tcf_t			tcfa_tm;
 	struct gnet_stats_basic_packed	tcfa_bstats;
 	struct gnet_stats_queue		tcfa_qstats;
-	struct gnet_stats_rate_est64	tcfa_rate_est;
+	struct net_rate_estimator __rcu *tcfa_rate_est;
 	spinlock_t			tcfa_lock;
 	struct rcu_head			tcfa_rcu;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
include/net/gen_stats.h  +9 −8

@@ -11,6 +11,8 @@ struct gnet_stats_basic_cpu {
 	struct u64_stats_sync syncp;
 };
 
+struct net_rate_estimator;
+
 struct gnet_dump {
 	spinlock_t *      lock;
 	struct sk_buff *  skb;
@@ -42,8 +44,7 @@ void __gnet_stats_copy_basic(const seqcount_t *running,
 			     struct gnet_stats_basic_cpu __percpu *cpu,
 			     struct gnet_stats_basic_packed *b);
 int gnet_stats_copy_rate_est(struct gnet_dump *d,
-			     const struct gnet_stats_basic_packed *b,
-			     struct gnet_stats_rate_est64 *r);
+			     struct net_rate_estimator __rcu **ptr);
 int gnet_stats_copy_queue(struct gnet_dump *d,
 			  struct gnet_stats_queue __percpu *cpu_q,
 			  struct gnet_stats_queue *q, __u32 qlen);
@@ -53,16 +54,16 @@ int gnet_stats_finish_copy(struct gnet_dump *d);
 
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-		      struct gnet_stats_rate_est64 *rate_est,
+		      struct net_rate_estimator __rcu **rate_est,
 		      spinlock_t *stats_lock,
 		      seqcount_t *running, struct nlattr *opt);
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
-			struct gnet_stats_rate_est64 *rate_est);
+void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-			  struct gnet_stats_rate_est64 *rate_est,
+			  struct net_rate_estimator __rcu **ptr,
 			  spinlock_t *stats_lock,
 			  seqcount_t *running, struct nlattr *opt);
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
-			  const struct gnet_stats_rate_est64 *rate_est);
+bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
+bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
+			struct gnet_stats_rate_est64 *sample);
 #endif
include/net/netfilter/xt_rateest.h  +7 −3

@@ -1,19 +1,23 @@
 #ifndef _XT_RATEEST_H
 #define _XT_RATEEST_H
 
+#include <net/gen_stats.h>
+
 struct xt_rateest {
 	/* keep lock and bstats on same cache line to speedup xt_rateest_tg() */
 	struct gnet_stats_basic_packed	bstats;
 	spinlock_t			lock;
-	/* keep rstats and lock on same cache line to speedup xt_rateest_mt() */
-	struct gnet_stats_rate_est64	rstats;
+
 
 	/* following fields not accessed in hot path */
+	unsigned int			refcnt;
 	struct hlist_node		list;
 	char				name[IFNAMSIZ];
-	unsigned int			refcnt;
 	struct gnet_estimator		params;
 	struct rcu_head			rcu;
+
+	/* keep this field far away to speedup xt_rateest_mt() */
+	struct net_rate_estimator __rcu *rate_est;
 };
 
 struct xt_rateest *xt_rateest_lookup(const char *name);
include/net/sch_generic.h  +1 −1

@@ -76,7 +76,7 @@ struct Qdisc {
 
 	struct netdev_queue	*dev_queue;
 
-	struct gnet_stats_rate_est64	rate_est;
+	struct net_rate_estimator __rcu *rate_est;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
 	struct gnet_stats_queue	__percpu *cpu_qstats;
 
net/core/gen_estimator.c  +107 −192

@@ -7,6 +7,7 @@
  *		2 of the License, or (at your option) any later version.
  *
  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *		Eric Dumazet <edumazet@google.com>
  *
  * Changes:
  *              Jamal Hadi Salim - moved it to net/core and reshulfed
@@ -30,171 +31,79 @@
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <linux/init.h>
-#include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/seqlock.h>
 #include <net/sock.h>
 #include <net/gen_stats.h>
 
-/*
-   This code is NOT intended to be used for statistics collection,
-   its purpose is to provide a base for statistical multiplexing
-   for controlled load service.
-   If you need only statistics, run a user level daemon which
-   periodically reads byte counters.
-
-   Unfortunately, rate estimation is not a very easy task.
-   F.e. I did not find a simple way to estimate the current peak rate
-   and even failed to formulate the problem 8)8)
-
-   So I preferred not to built an estimator into the scheduler,
-   but run this task separately.
-   Ideally, it should be kernel thread(s), but for now it runs
-   from timers, which puts apparent top bounds on the number of rated
-   flows, has minimal overhead on small, but is enough
-   to handle controlled load service, sets of aggregates.
-
-   We measure rate over A=(1<<interval) seconds and evaluate EWMA:
-
-   avrate = avrate*(1-W) + rate*W
-
-   where W is chosen as negative power of 2: W = 2^(-ewma_log)
-
-   The resulting time constant is:
-
-   T = A/(-ln(1-W))
-
-
-   NOTES.
-
-   * avbps and avpps are scaled by 2^5.
-   * both values are reported as 32 bit unsigned values. bps can
-     overflow for fast links : max speed being 34360Mbit/sec
-   * Minimal interval is HZ/4=250msec (it is the greatest common divisor
-     for HZ=100 and HZ=1024 8)), maximal interval
-     is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
-     are too expensive, longer ones can be implemented
-     at user level painlessly.
+/* This code is NOT intended to be used for statistics collection,
+ * its purpose is to provide a base for statistical multiplexing
+ * for controlled load service.
+ * If you need only statistics, run a user level daemon which
+ * periodically reads byte counters.
  */
 
-#define EST_MAX_INTERVAL	5
-
-struct gen_estimator {
-	struct list_head	list;
+struct net_rate_estimator {
 	struct gnet_stats_basic_packed	*bstats;
-	struct gnet_stats_rate_est64	*rate_est;
 	spinlock_t		*stats_lock;
 	seqcount_t		*running;
-	int			ewma_log;
+	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+	u8			ewma_log;
+	u8			intvl_log; /* period : (250ms << intvl_log) */
+
+	seqcount_t		seq;
 	u32			last_packets;
-	unsigned long		avpps;
 	u64			last_bytes;
+
+	u64			avpps;
 	u64			avbps;
-	struct rcu_head		e_rcu;
-	struct rb_node		node;
-	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
-	struct rcu_head		head;
-};
 
-struct gen_estimator_head {
 	unsigned long           next_jiffies;
 	struct timer_list       timer;
-	struct list_head	list;
+	struct rcu_head		rcu;
 };
 
-static struct gen_estimator_head elist[EST_MAX_INTERVAL+1];
-
-/* Protects against NULL dereference */
-static DEFINE_RWLOCK(est_lock);
-
-/* Protects against soft lockup during large deletion */
-static struct rb_root est_root = RB_ROOT;
-static DEFINE_SPINLOCK(est_tree_lock);
-
-static void est_timer(unsigned long arg)
+static void est_fetch_counters(struct net_rate_estimator *e,
+			       struct gnet_stats_basic_packed *b)
 {
-	int idx = (int)arg;
-	struct gen_estimator *e;
-
-	rcu_read_lock();
-	list_for_each_entry_rcu(e, &elist[idx].list, list) {
-		struct gnet_stats_basic_packed b = {0};
-		unsigned long rate;
-		u64 brate;
-
-		if (e->stats_lock)
-			spin_lock(e->stats_lock);
-		read_lock(&est_lock);
-		if (e->bstats == NULL)
-			goto skip;
-
-		__gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
-
-		brate = (b.bytes - e->last_bytes)<<(7 - idx);
-		e->last_bytes = b.bytes;
-		e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
-		WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
-
-		rate = b.packets - e->last_packets;
-		rate <<= (7 - idx);
-		e->last_packets = b.packets;
-		e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
-		WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
-skip:
-		read_unlock(&est_lock);
-		if (e->stats_lock)
-			spin_unlock(e->stats_lock);
-	}
-
-	if (!list_empty(&elist[idx].list)) {
-		elist[idx].next_jiffies += ((HZ/4) << idx);
-
-		if (unlikely(time_after_eq(jiffies, elist[idx].next_jiffies))) {
-			/* Ouch... timer was delayed. */
-			elist[idx].next_jiffies = jiffies + 1;
-		}
-		mod_timer(&elist[idx].timer, elist[idx].next_jiffies);
-	}
-	rcu_read_unlock();
-}
-
-static void gen_add_node(struct gen_estimator *est)
-{
-	struct rb_node **p = &est_root.rb_node, *parent = NULL;
-
-	while (*p) {
-		struct gen_estimator *e;
-
-		parent = *p;
-		e = rb_entry(parent, struct gen_estimator, node);
-
-		if (est->bstats > e->bstats)
-			p = &parent->rb_right;
-		else
-			p = &parent->rb_left;
-	}
-	rb_link_node(&est->node, parent, p);
-	rb_insert_color(&est->node, &est_root);
-}
-
-static
-struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats,
-				    const struct gnet_stats_rate_est64 *rate_est)
-{
-	struct rb_node *p = est_root.rb_node;
-
-	while (p) {
-		struct gen_estimator *e;
-
-		e = rb_entry(p, struct gen_estimator, node);
-
-		if (bstats > e->bstats)
-			p = p->rb_right;
-		else if (bstats < e->bstats || rate_est != e->rate_est)
-			p = p->rb_left;
-		else
-			return e;
-	}
-	return NULL;
+	if (e->stats_lock)
+		spin_lock(e->stats_lock);
+
+	__gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats);
+
+	if (e->stats_lock)
+		spin_unlock(e->stats_lock);
+
+}
+
+static void est_timer(unsigned long arg)
+{
+	struct net_rate_estimator *est = (struct net_rate_estimator *)arg;
+	struct gnet_stats_basic_packed b;
+	u64 rate, brate;
+
+	est_fetch_counters(est, &b);
+	brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log);
+	brate -= (est->avbps >> est->ewma_log);
+
+	rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log);
+	rate -= (est->avpps >> est->ewma_log);
+
+	write_seqcount_begin(&est->seq);
+	est->avbps += brate;
+	est->avpps += rate;
+	write_seqcount_end(&est->seq);
+
+	est->last_bytes = b.bytes;
+	est->last_packets = b.packets;
+
+	est->next_jiffies += ((HZ/4) << est->intvl_log);
+
+	if (unlikely(time_after_eq(jiffies, est->next_jiffies))) {
+		/* Ouch... timer was delayed. */
+		est->next_jiffies = jiffies + 1;
+	}
+	mod_timer(&est->timer, est->next_jiffies);
 }
 
 /**
@@ -217,84 +126,76 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
  */
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-		      struct gnet_stats_rate_est64 *rate_est,
+		      struct net_rate_estimator __rcu **rate_est,
 		      spinlock_t *stats_lock,
 		      seqcount_t *running,
 		      struct nlattr *opt)
 {
-	struct gen_estimator *est;
 	struct gnet_estimator *parm = nla_data(opt);
-	struct gnet_stats_basic_packed b = {0};
-	int idx;
+	struct net_rate_estimator *old, *est;
+	struct gnet_stats_basic_packed b;
+	int intvl_log;
 
 	if (nla_len(opt) < sizeof(*parm))
 		return -EINVAL;
 
+	/* allowed timer periods are :
+	 * -2 : 250ms,   -1 : 500ms,    0 : 1 sec
+	 *  1 : 2 sec,    2 : 4 sec,    3 : 8 sec
+	 */
 	if (parm->interval < -2 || parm->interval > 3)
 		return -EINVAL;
 
 	est = kzalloc(sizeof(*est), GFP_KERNEL);
-	if (est == NULL)
+	if (!est)
 		return -ENOBUFS;
 
-	__gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
-
-	idx = parm->interval + 2;
+	seqcount_init(&est->seq);
+	intvl_log = parm->interval + 2;
 	est->bstats = bstats;
-	est->rate_est = rate_est;
 	est->stats_lock = stats_lock;
 	est->running  = running;
 	est->ewma_log = parm->ewma_log;
-	est->last_bytes = b.bytes;
-	est->avbps = rate_est->bps<<5;
-	est->last_packets = b.packets;
-	est->avpps = rate_est->pps<<10;
+	est->intvl_log = intvl_log;
 	est->cpu_bstats = cpu_bstats;
 
-	spin_lock_bh(&est_tree_lock);
-	if (!elist[idx].timer.function) {
-		INIT_LIST_HEAD(&elist[idx].list);
-		setup_timer(&elist[idx].timer, est_timer, idx);
+	est_fetch_counters(est, &b);
+	est->last_bytes = b.bytes;
+	est->last_packets = b.packets;
+	old = rcu_dereference_protected(*rate_est, 1);
+	if (old) {
+		del_timer_sync(&old->timer);
+		est->avbps = old->avbps;
+		est->avpps = old->avpps;
 	}
 
-	if (list_empty(&elist[idx].list)) {
-		elist[idx].next_jiffies = jiffies + ((HZ/4) << idx);
-		mod_timer(&elist[idx].timer, elist[idx].next_jiffies);
-	}
-	list_add_rcu(&est->list, &elist[idx].list);
-	gen_add_node(est);
-	spin_unlock_bh(&est_tree_lock);
+	est->next_jiffies = jiffies + ((HZ/4) << intvl_log);
+	setup_timer(&est->timer, est_timer, (unsigned long)est);
+	mod_timer(&est->timer, est->next_jiffies);
 
+	rcu_assign_pointer(*rate_est, est);
+	if (old)
+		kfree_rcu(old, rcu);
 	return 0;
 }
 EXPORT_SYMBOL(gen_new_estimator);
 
 /**
  * gen_kill_estimator - remove a rate estimator
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
  *
- * Removes the rate estimator specified by &bstats and &rate_est.
+ * Removes the rate estimator.
 *
 * Note : Caller should respect an RCU grace period before freeing stats_lock
 */
-void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
-			struct gnet_stats_rate_est64 *rate_est)
+void gen_kill_estimator(struct net_rate_estimator __rcu **rate_est)
 {
-	struct gen_estimator *e;
-
-	spin_lock_bh(&est_tree_lock);
-	while ((e = gen_find_node(bstats, rate_est))) {
-		rb_erase(&e->node, &est_root);
-
-		write_lock(&est_lock);
-		e->bstats = NULL;
-		write_unlock(&est_lock);
+	struct net_rate_estimator *est;
 
-		list_del_rcu(&e->list);
-		kfree_rcu(e, e_rcu);
+	est = xchg((__force struct net_rate_estimator **)rate_est, NULL);
+	if (est) {
+		del_timer_sync(&est->timer);
+		kfree_rcu(est, rcu);
 	}
-	spin_unlock_bh(&est_tree_lock);
 }
 EXPORT_SYMBOL(gen_kill_estimator);
 
@@ -314,33 +215,47 @@ EXPORT_SYMBOL(gen_kill_estimator);
  */
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
-			  struct gnet_stats_rate_est64 *rate_est,
+			  struct net_rate_estimator __rcu **rate_est,
 			  spinlock_t *stats_lock,
 			  seqcount_t *running, struct nlattr *opt)
 {
-	gen_kill_estimator(bstats, rate_est);
-	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
+	return gen_new_estimator(bstats, cpu_bstats, rate_est,
+				 stats_lock, running, opt);
 }
 EXPORT_SYMBOL(gen_replace_estimator);
 
 /**
  * gen_estimator_active - test if estimator is currently in use
- * @bstats: basic statistics
- * @rate_est: rate estimator statistics
+ * @rate_est: rate estimator
 *
 * Returns true if estimator is active, and false if not.
 */
-bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
-			  const struct gnet_stats_rate_est64 *rate_est)
+bool gen_estimator_active(struct net_rate_estimator __rcu **rate_est)
 {
-	bool res;
-
-	ASSERT_RTNL();
-
-	spin_lock_bh(&est_tree_lock);
-	res = gen_find_node(bstats, rate_est) != NULL;
-	spin_unlock_bh(&est_tree_lock);
-
-	return res;
+	return !!rcu_access_pointer(*rate_est);
+}
+EXPORT_SYMBOL(gen_estimator_active);
+
+bool gen_estimator_read(struct net_rate_estimator __rcu **rate_est,
+			struct gnet_stats_rate_est64 *sample)
+{
+	struct net_rate_estimator *est;
+	unsigned seq;
+
+	rcu_read_lock();
+	est = rcu_dereference(*rate_est);
+	if (!est) {
+		rcu_read_unlock();
+		return false;
+	}
+
+	do {
+		seq = read_seqcount_begin(&est->seq);
+		sample->bps = est->avbps >> 8;
+		sample->pps = est->avpps >> 8;
+	} while (read_seqcount_retry(&est->seq, seq));
+
+	rcu_read_unlock();
+	return true;
 }
-EXPORT_SYMBOL(gen_estimator_active);
+EXPORT_SYMBOL(gen_estimator_read);