
Commit edb09eb1 authored by Eric Dumazet, committed by David S. Miller

net: sched: do not acquire qdisc spinlock in qdisc/class stats dump

Large tc dumps (tc -s {qdisc|class} sh dev ethX) done by Google BwE host
agent [1] are problematic at scale:

For each qdisc/class found in the dump, we currently lock the root qdisc
spinlock in order to get stats. Sampling stats every 5 seconds from
thousands of HTB classes is a challenge when the root qdisc spinlock is
under high pressure. Not only do the dumps take time, they also slow
down the fast path (packet enqueue/dequeue) by 10% to 20% in some cases.

An audit of existing qdiscs showed that sch_fq_codel is the only qdisc
that might need the qdisc lock in fq_codel_dump_stats() and
fq_codel_dump_class_stats().

In v2 of this patch, I now use the Qdisc running seqcount to provide
consistent reads of packets/bytes counters, regardless of 32/64-bit arches.

I also changed rate estimators to use the same infrastructure
so that they no longer need to acquire the root qdisc lock.

[1]
http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43838.pdf
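
For background: on 32-bit architectures a 64-bit byte counter cannot be loaded
atomically, so a stats dump racing with the dequeue path could observe a torn
value. The running seqcount closes that race without a spinlock: readers simply
retry if a writer ran. A minimal sketch of the read-side pattern used by this
patch ("running" and "stats" are placeholder names for the qdisc seqcount and
its counters, not identifiers from the diff):

	unsigned int seq;
	u64 bytes, packets;

	do {
		/* snapshot the writer generation */
		seq = read_seqcount_begin(running);
		bytes   = stats->bytes;		/* could tear without the loop */
		packets = stats->packets;
	} while (read_seqcount_retry(running, seq));	/* writer ran: retry */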



Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Kevin Athey <kda@google.com>
Cc: Xiaotian Pei <xiaotian@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent f9eb8aea
Documentation/networking/gen_stats.txt  +1 −1
@@ -21,7 +21,7 @@ struct mystruct {
 	...
 };
 
-Update statistics:
+Update statistics, in dequeue() methods only, (while owning qdisc->running)
 mystruct->tstats.packet++;
 mystruct->qstats.backlog += skb->pkt_len;
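
The writer side of this rule is the dequeue path, which owns qdisc->running
between qdisc_run_begin() and qdisc_run_end() (see parent commit f9eb8aea). A
sketch of what a hypothetical "mystruct" qdisc's dequeue() looks like under
this convention; mystruct_dequeue() and its helper calls are illustrative, not
part of this hunk:

	static struct sk_buff *mystruct_dequeue(struct Qdisc *sch)
	{
		struct sk_buff *skb = __skb_dequeue(&sch->q);

		/* The core already owns sch->running here, so these
		 * updates cannot tear against concurrent stat dumpers.
		 */
		if (skb) {
			qdisc_qstats_backlog_dec(sch, skb);
			qdisc_bstats_update(sch, skb);	/* bytes + packets */
		}
		return skb;
	}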

include/net/gen_stats.h  +8 −4
@@ -33,10 +33,12 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type,
 				 spinlock_t *lock, struct gnet_dump *d,
 				 int padattr);
 
-int gnet_stats_copy_basic(struct gnet_dump *d,
+int gnet_stats_copy_basic(const seqcount_t *running,
+			  struct gnet_dump *d,
 			  struct gnet_stats_basic_cpu __percpu *cpu,
 			  struct gnet_stats_basic_packed *b);
-void __gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+void __gnet_stats_copy_basic(const seqcount_t *running,
+			     struct gnet_stats_basic_packed *bstats,
 			     struct gnet_stats_basic_cpu __percpu *cpu,
 			     struct gnet_stats_basic_packed *b);
 int gnet_stats_copy_rate_est(struct gnet_dump *d,
@@ -52,13 +54,15 @@ int gnet_stats_finish_copy(struct gnet_dump *d);
 int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 		      struct gnet_stats_rate_est64 *rate_est,
-		      spinlock_t *stats_lock, struct nlattr *opt);
+		      spinlock_t *stats_lock,
+		      seqcount_t *running, struct nlattr *opt);
 void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_rate_est64 *rate_est);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			  struct gnet_stats_rate_est64 *rate_est,
-			  spinlock_t *stats_lock, struct nlattr *opt);
+			  spinlock_t *stats_lock,
+			  seqcount_t *running, struct nlattr *opt);
 bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
 			  const struct gnet_stats_rate_est64 *rate_est);
 #endif
include/net/sch_generic.h  +8 −0
@@ -314,6 +314,14 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc)
 	return qdisc_lock(root);
 }
 
+static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc)
+{
+	struct Qdisc *root = qdisc_root_sleeping(qdisc);
+
+	ASSERT_RTNL();
+	return &root->running;
+}
+
 static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc)
 {
 	return qdisc->dev_queue->dev;
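
The remainder of the series (truncated from this page) converts the dump paths
to hand this seqcount to the stats helpers instead of taking the root lock. The
call pattern is roughly the following, sketched after the tc_fill_qdisc() call
site rather than copied from a hunk shown here:

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
				  &d, q->cpu_bstats, &q->bstats) < 0)
		goto nla_put_failure;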
net/core/gen_estimator.c  +16 −8
@@ -84,6 +84,7 @@ struct gen_estimator
 	struct gnet_stats_basic_packed	*bstats;
 	struct gnet_stats_rate_est64	*rate_est;
 	spinlock_t		*stats_lock;
+	seqcount_t		*running;
 	int			ewma_log;
 	u32			last_packets;
 	unsigned long		avpps;
@@ -121,25 +122,27 @@ static void est_timer(unsigned long arg)
 		unsigned long rate;
 		u64 brate;
 
-		spin_lock(e->stats_lock);
+		if (e->stats_lock)
+			spin_lock(e->stats_lock);
 		read_lock(&est_lock);
 		if (e->bstats == NULL)
 			goto skip;
 
-		__gnet_stats_copy_basic(&b, e->cpu_bstats, e->bstats);
+		__gnet_stats_copy_basic(e->running, &b, e->cpu_bstats, e->bstats);
 
 		brate = (b.bytes - e->last_bytes)<<(7 - idx);
 		e->last_bytes = b.bytes;
 		e->avbps += (brate >> e->ewma_log) - (e->avbps >> e->ewma_log);
-		e->rate_est->bps = (e->avbps+0xF)>>5;
+		WRITE_ONCE(e->rate_est->bps, (e->avbps + 0xF) >> 5);
 
 		rate = b.packets - e->last_packets;
 		rate <<= (7 - idx);
 		e->last_packets = b.packets;
 		e->avpps += (rate >> e->ewma_log) - (e->avpps >> e->ewma_log);
-		e->rate_est->pps = (e->avpps + 0xF) >> 5;
+		WRITE_ONCE(e->rate_est->pps, (e->avpps + 0xF) >> 5);
 skip:
 		read_unlock(&est_lock);
-		spin_unlock(e->stats_lock);
+		if (e->stats_lock)
+			spin_unlock(e->stats_lock);
 	}
 
@@ -194,6 +197,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
  * @cpu_bstats: bstats per cpu
  * @rate_est: rate estimator statistics
  * @stats_lock: statistics lock
+ * @running: qdisc running seqcount
  * @opt: rate estimator configuration TLV
  *
  * Creates a new rate estimator with &bstats as source and &rate_est
@@ -209,6 +213,7 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 		      struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 		      struct gnet_stats_rate_est64 *rate_est,
 		      spinlock_t *stats_lock,
+		      seqcount_t *running,
 		      struct nlattr *opt)
 {
 	struct gen_estimator *est;
@@ -226,12 +231,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
 	if (est == NULL)
 		return -ENOBUFS;
 
-	__gnet_stats_copy_basic(&b, cpu_bstats, bstats);
+	__gnet_stats_copy_basic(running, &b, cpu_bstats, bstats);
 
 	idx = parm->interval + 2;
 	est->bstats = bstats;
 	est->rate_est = rate_est;
 	est->stats_lock = stats_lock;
+	est->running  = running;
 	est->ewma_log = parm->ewma_log;
 	est->last_bytes = b.bytes;
 	est->avbps = rate_est->bps<<5;
@@ -291,6 +297,7 @@ EXPORT_SYMBOL(gen_kill_estimator);
 * @cpu_bstats: bstats per cpu
 * @rate_est: rate estimator statistics
 * @stats_lock: statistics lock
+ * @running: qdisc running seqcount (might be NULL)
 * @opt: rate estimator configuration TLV
 *
 * Replaces the configuration of a rate estimator by calling
@@ -301,10 +308,11 @@ EXPORT_SYMBOL(gen_kill_estimator);
 int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
 			  struct gnet_stats_basic_cpu __percpu *cpu_bstats,
 			  struct gnet_stats_rate_est64 *rate_est,
-			  spinlock_t *stats_lock, struct nlattr *opt)
+			  spinlock_t *stats_lock,
+			  seqcount_t *running, struct nlattr *opt)
 {
 	gen_kill_estimator(bstats, rate_est);
-	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, opt);
+	return gen_new_estimator(bstats, cpu_bstats, rate_est, stats_lock, running, opt);
 }
 EXPORT_SYMBOL(gen_replace_estimator);
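
Estimator users are updated in the rest of the series to match the new
signatures; a classful qdisc can now pass a NULL stats_lock together with the
root running seqcount, along these lines (illustrative sketch, not one of the
hunks shown on this page):

	err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
				NULL,	/* stats_lock no longer needed */
				qdisc_root_sleeping_running(sch),
				tca[TCA_RATE]);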

net/core/gen_stats.c  +23 −11
@@ -32,10 +32,11 @@ gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr)
 	return 0;
 
 nla_put_failure:
+	if (d->lock)
+		spin_unlock_bh(d->lock);
 	kfree(d->xstats);
 	d->xstats = NULL;
 	d->xstats_len = 0;
-	spin_unlock_bh(d->lock);
 	return -1;
 }
 
@@ -65,15 +66,16 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
 {
 	memset(d, 0, sizeof(*d));
 
-	spin_lock_bh(lock);
-	d->lock = lock;
 	if (type)
 		d->tail = (struct nlattr *)skb_tail_pointer(skb);
 	d->skb = skb;
 	d->compat_tc_stats = tc_stats_type;
 	d->compat_xstats = xstats_type;
 	d->padattr = padattr;
-
+	if (lock) {
+		d->lock = lock;
+		spin_lock_bh(lock);
+	}
 	if (d->tail)
 		return gnet_stats_copy(d, type, NULL, 0, padattr);
 
@@ -126,16 +128,23 @@ __gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats,
 }
 
 void
-__gnet_stats_copy_basic(struct gnet_stats_basic_packed *bstats,
+__gnet_stats_copy_basic(const seqcount_t *running,
+			struct gnet_stats_basic_packed *bstats,
 			struct gnet_stats_basic_cpu __percpu *cpu,
 			struct gnet_stats_basic_packed *b)
 {
+	unsigned int seq;
+
 	if (cpu) {
 		__gnet_stats_copy_basic_cpu(bstats, cpu);
-	} else {
+		return;
+	}
+	do {
+		if (running)
+			seq = read_seqcount_begin(running);
 		bstats->bytes = b->bytes;
 		bstats->packets = b->packets;
-	}
+	} while (running && read_seqcount_retry(running, seq));
 }
 EXPORT_SYMBOL(__gnet_stats_copy_basic);
 
@@ -152,13 +161,14 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
 * if the room in the socket buffer was not sufficient.
 */
 int
-gnet_stats_copy_basic(struct gnet_dump *d,
+gnet_stats_copy_basic(const seqcount_t *running,
+		      struct gnet_dump *d,
 		      struct gnet_stats_basic_cpu __percpu *cpu,
 		      struct gnet_stats_basic_packed *b)
 {
 	struct gnet_stats_basic_packed bstats = {0};
 
-	__gnet_stats_copy_basic(&bstats, cpu, b);
+	__gnet_stats_copy_basic(running, &bstats, cpu, b);
 
 	if (d->compat_tc_stats) {
 		d->tc_stats.bytes = bstats.bytes;
@@ -328,8 +338,9 @@ gnet_stats_copy_app(struct gnet_dump *d, void *st, int len)
 	return 0;
 
 err_out:
+	if (d->lock)
+		spin_unlock_bh(d->lock);
 	d->xstats_len = 0;
-	spin_unlock_bh(d->lock);
 	return -1;
 }
 EXPORT_SYMBOL(gnet_stats_copy_app);
@@ -363,10 +374,11 @@ gnet_stats_finish_copy(struct gnet_dump *d)
 			return -1;
 	}
 
+	if (d->lock)
+		spin_unlock_bh(d->lock);
 	kfree(d->xstats);
 	d->xstats = NULL;
 	d->xstats_len = 0;
-	spin_unlock_bh(d->lock);
 	return 0;
 }
 EXPORT_SYMBOL(gnet_stats_finish_copy);
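
Finally, sch_fq_codel, called out in the commit message as the one qdisc whose
dump callbacks still need serialization, takes the qdisc tree lock around its
per-flow walk itself in the full commit; roughly (sketch only, see
net/sched/sch_fq_codel.c in the complete diff for the exact code):

	sch_tree_lock(sch);
	for (i = 0; i < q->flows_cnt; i++) {
		/* fold per-flow qlen/backlog into the dumped stats */
	}
	sch_tree_unlock(sch);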