Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 9954729b authored by Willem de Bruijn's avatar Willem de Bruijn Committed by David S. Miller
Browse files

packet: rollover only to socket with headroom



Only migrate flows to sockets that have sufficient headroom, where
sufficient is defined as having at least 25% empty space.

The kernel has three different buffer types: a regular socket, a ring
with frames (TPACKET_V[12]) or a ring with blocks (TPACKET_V3). The
latter two do not expose a read pointer to the kernel, so headroom is
not computed easily. All three needs a different implementation to
estimate free space.

Tested:
  Ran bench_rollover for 10 sec with 1.5 Mpps of single flow input.

  bench_rollover has as many sockets as there are NIC receive queues
  in the system. Each socket is owned by a process that is pinned to
  one of the receive cpus. RFS is disabled. RPS is enabled with an
  identity mapping (cpu x -> cpu x), to count drops with softnettop.

    lpbb5:/export/hda3/willemb# ./bench_rollover -r -l 1000 -s
    Press [Enter] to exit

    cpu         rx       rx.k     drop.k   rollover     r.huge   r.failed
      0         16         16          0          0          0          0
      1         21         21          0          0          0          0
      2    5227502    5227502          0          0          0          0
      3         18         18          0          0          0          0
      4    6083289    6083289          0    5227496          0          0
      5         22         22          0          0          0          0
      6         21         21          0          0          0          0
      7          9          9          0          0          0          0

Signed-off-by: default avatarWillem de Bruijn <willemb@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 0648ab70
Loading
Loading
Loading
Loading
+59 −17
Original line number Diff line number Diff line
@@ -1234,27 +1234,68 @@ static void packet_free_pending(struct packet_sock *po)
	free_percpu(po->tx_ring.pending_refcnt);
}

static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
#define ROOM_POW_OFF	2
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2

static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
{
	struct sock *sk = &po->sk;
	bool has_room;
	int idx, len;

	if (po->prot_hook.func != tpacket_rcv)
		return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
			<= sk->sk_rcvbuf;
	len = po->rx_ring.frame_max + 1;
	idx = po->rx_ring.head;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

	spin_lock(&sk->sk_receive_queue.lock);
	if (po->tp_version == TPACKET_V3)
		has_room = prb_lookup_block(po, &po->rx_ring,
					    po->rx_ring.prb_bdqc.kactive_blk_num,
					    TP_STATUS_KERNEL);
static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
{
	int idx, len;

	len = po->rx_ring.prb_bdqc.knum_blocks;
	idx = po->rx_ring.prb_bdqc.kactive_blk_num;
	if (pow_off)
		idx += len >> pow_off;
	if (idx >= len)
		idx -= len;
	return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
}

static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
	struct sock *sk = &po->sk;
	int ret = ROOM_NONE;

	if (po->prot_hook.func != tpacket_rcv) {
		int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
					  - skb->truesize;
		if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
			return ROOM_NORMAL;
		else if (avail > 0)
			return ROOM_LOW;
		else
		has_room = packet_lookup_frame(po, &po->rx_ring,
					       po->rx_ring.head,
					       TP_STATUS_KERNEL);
			return ROOM_NONE;
	}

	spin_lock(&sk->sk_receive_queue.lock);
	if (po->tp_version == TPACKET_V3) {
		if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_v3_has_room(po, 0))
			ret = ROOM_LOW;
	} else {
		if (__tpacket_has_room(po, ROOM_POW_OFF))
			ret = ROOM_NORMAL;
		else if (__tpacket_has_room(po, 0))
			ret = ROOM_LOW;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return has_room;
	return ret;
}

static void packet_sock_destruct(struct sock *sk)
@@ -1325,12 +1366,13 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
	unsigned int i, j;

	po = pkt_sk(f->arr[idx]);
	if (try_self && packet_rcv_has_room(po, skb))
	if (try_self && packet_rcv_has_room(po, skb) != ROOM_NONE)
		return idx;

	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		if (i != idx && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
		if (i != idx &&
		    packet_rcv_has_room(pkt_sk(f->arr[i]), skb) == ROOM_NORMAL) {
			if (i != j)
				po->rollover->sock = i;
			return i;