
Commit a97f4fe6 authored by David S. Miller

Merge branch 'fc-quic-pacing'



Eric Dumazet says:

====================
net_sched: sch_fq: enable in-kernel pacing for QUIC servers

Willem added GSO support to the UDP stack, greatly improving the
performance of QUIC servers.

We also want to enable in-kernel pacing, which is possible thanks to the
EDT (Earliest Departure Time) model, since each sendmsg() can provide a
timestamp for the skbs.

We have to change sch_fq to enable feeding packets in arbitrary EDT order,
and make sure that packet classification does not trust unconnected sockets.

Note that this patch series is also a prerequisite for a future TCP change
enabling per-flow delays/reorders/losses to implement high-performance
TCP emulators.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents a55a385d 37c0aead
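For context before the diff: with fq attached as the root qdisc (e.g. tc qdisc replace dev eth0 root fq), a QUIC server can hand each datagram an Earliest Departure Time via the existing SO_TXTIME/SCM_TXTIME API, which populates skb->tstamp. The sketch below is a minimal, non-authoritative illustration under those assumptions; enable_txtime() and send_at() are made-up helper names, not part of this series.

#include <stdint.h>
#include <string.h>
#include <time.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <linux/net_tstamp.h>	/* struct sock_txtime */

#ifndef SO_TXTIME
#define SO_TXTIME	61	/* older libc headers may lack these */
#endif
#ifndef SCM_TXTIME
#define SCM_TXTIME	SO_TXTIME
#endif

/* One-time setup: opt the socket in to per-packet transmit times.
 * fq compares EDTs against ktime_get_ns(), i.e. CLOCK_MONOTONIC.
 */
static int enable_txtime(int fd)
{
	struct sock_txtime cfg = {
		.clockid = CLOCK_MONOTONIC,
		.flags	 = 0,
	};

	return setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));
}

/* Send one datagram carrying its desired departure time as a cmsg;
 * the UDP stack copies it into skb->tstamp for sch_fq to honor.
 */
static ssize_t send_at(int fd, const void *buf, size_t len, uint64_t txtime_ns)
{
	char control[CMSG_SPACE(sizeof(txtime_ns))] = { 0 };
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= control,
		.msg_controllen	= sizeof(control),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type  = SCM_TXTIME;
	cm->cmsg_len   = CMSG_LEN(sizeof(txtime_ns));
	memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(txtime_ns));

	return sendmsg(fd, &msg, 0);
}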
net/sched/sch_fq.c: +96 −14
@@ -54,10 +54,23 @@
#include <net/tcp_states.h>
#include <net/tcp.h>

+struct fq_skb_cb {
+	u64	        time_to_send;
+};
+
+static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
+{
+	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
+	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
/*
- * Per flow structure, dynamically allocated
+ * Per flow structure, dynamically allocated.
+ * If packets have monotonically increasing time_to_send, they are placed in O(1)
+ * in a linear list (head, tail); otherwise they are placed in an rbtree (t_root).
 */
struct fq_flow {
+	struct rb_root	t_root;
	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
	union {
		struct sk_buff *tail;	/* last skb in the list */
@@ -257,6 +270,17 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
		 */
		sk = (struct sock *)((hash << 1) | 1UL);
		skb_orphan(skb);
+	} else if (sk->sk_state == TCP_CLOSE) {
+		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
+		/*
+		 * Sockets in TCP_CLOSE are not connected.
+		 * A typical use case is UDP sockets; they can send packets
+		 * with sendto() to many different destinations.
+		 * We probably could use a generic bit advertising
+		 * unconnected sockets, instead of sk_state == TCP_CLOSE,
+		 * if we care enough.
+		 */
+		sk = (struct sock *)((hash << 1) | 1UL);
+	}
	}

	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
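Why classification cannot trust sk here: an unconnected UDP socket may talk to many peers, so one socket is not one flow. Below is a toy userspace illustration of that point, using made-up TEST-NET addresses; nothing in it comes from the patch itself.

#include <arpa/inet.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);	/* sk_state stays TCP_CLOSE */
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(4433),
	};

	/* Same socket, two distinct flows: fq must hash the skb, not sk. */
	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
	sendto(fd, "a", 1, 0, (struct sockaddr *)&dst, sizeof(dst));

	inet_pton(AF_INET, "192.0.2.2", &dst.sin_addr);
	sendto(fd, "b", 1, 0, (struct sockaddr *)&dst, sizeof(dst));

	close(fd);
	return 0;
}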
@@ -277,7 +301,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
			 * If not, we need to refill credit with
			 * initial quantum
			 */
-			if (unlikely(skb->sk &&
+			if (unlikely(skb->sk == sk &&
				     f->socket_hash != sk->sk_hash)) {
				f->credit = q->initial_quantum;
				f->socket_hash = sk->sk_hash;
@@ -298,9 +322,11 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
		q->stat_allocation_errors++;
		return &q->internal;
	}
+	/* f->t_root is already zeroed after kmem_cache_zalloc() */
+
	fq_flow_set_detached(f);
	f->sk = sk;
-	if (skb->sk)
+	if (skb->sk == sk)
		f->socket_hash = sk->sk_hash;
	f->credit = q->initial_quantum;

@@ -312,14 +338,40 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
	return f;
}

+static struct sk_buff *fq_peek(struct fq_flow *flow)
+{
+	struct sk_buff *skb = skb_rb_first(&flow->t_root);
+	struct sk_buff *head = flow->head;
+
+	if (!skb)
+		return head;
+
+	if (!head)
+		return skb;
+
+	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
+		return skb;
+	return head;
+}
+
+static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
+			  struct sk_buff *skb)
+{
+	if (skb == flow->head) {
+		flow->head = skb->next;
+	} else {
+		rb_erase(&skb->rbnode, &flow->t_root);
+		skb->dev = qdisc_dev(sch);
+	}
+}
+
/* remove one skb from head of flow queue */
static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
{
-	struct sk_buff *skb = flow->head;
+	struct sk_buff *skb = fq_peek(flow);

	if (skb) {
-		flow->head = skb->next;
+		fq_erase_head(sch, flow, skb);
		skb_mark_not_on_list(skb);
		flow->qlen--;
		qdisc_qstats_backlog_dec(sch, skb);
@@ -330,15 +382,36 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)

static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
-	struct sk_buff *head = flow->head;
+	struct rb_node **p, *parent;
+	struct sk_buff *head, *aux;

-	skb->next = NULL;
-	if (!head)
-		flow->head = skb;
-	else
-		flow->tail->next = skb;
+	fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns();

-	flow->tail = skb;
+	head = flow->head;
+	if (!head ||
+	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
+		if (!head)
+			flow->head = skb;
+		else
+			flow->tail->next = skb;
+		flow->tail = skb;
+		skb->next = NULL;
+		return;
+	}
+
+	p = &flow->t_root.rb_node;
+	parent = NULL;
+
+	while (*p) {
+		parent = *p;
+		aux = rb_to_skb(parent);
+		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&skb->rbnode, parent, p);
+	rb_insert_color(&skb->rbnode, &flow->t_root);
}

static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
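A hedged usage sketch tying this hunk back to the SO_TXTIME example near the top of the page (send_at() is the same hypothetical helper, and now_ns() below is equally made up): EDTs submitted out of order take the rbtree path just added, and fq_peek() still releases packets in time_to_send order.

#include <stdint.h>
#include <time.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

/* "first" and "second" carry EDTs earlier than the list tail ("third"),
 * so both land in t_root; dequeue order is first, second, third.
 */
static void demo_out_of_order(int fd)
{
	uint64_t t = now_ns();

	send_at(fd, "third",  5, t + 30 * 1000000ull);
	send_at(fd, "first",  5, t + 10 * 1000000ull);
	send_at(fd, "second", 6, t + 20 * 1000000ull);
}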
@@ -450,9 +523,9 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
		goto begin;
	}

-	skb = f->head;
+	skb = fq_peek(f);
	if (skb) {
-		u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp),
+		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
					     f->time_next_packet);

		if (now < time_next_packet) {
@@ -533,6 +606,15 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)

static void fq_flow_purge(struct fq_flow *flow)
{
+	struct rb_node *p = rb_first(&flow->t_root);
+
+	while (p) {
+		struct sk_buff *skb = rb_to_skb(p);
+
+		p = rb_next(p);
+		rb_erase(&skb->rbnode, &flow->t_root);
+		rtnl_kfree_skbs(skb, skb);
+	}
	rtnl_kfree_skbs(flow->head, flow->tail);
	flow->head = NULL;
	flow->qlen = 0;