Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e5a3e259 authored by Daniel Borkmann's avatar Daniel Borkmann
Browse files

Merge branch 'bpf-tcp-rtt-hook'



Stanislav Fomichev says:

====================
Congestion control team would like to have a periodic callback to
track some TCP statistics. Let's add a sock_ops callback that can be
selectively enabled on a socket by socket basis and is executed for
every RTT. BPF program frequency can be further controlled by calling
bpf_ktime_get_ns and bailing out early.

I run neper tcp_stream and tcp_rr tests with the sample program
from the last patch and didn't observe any noticeable performance
difference.

v2:
* add a comment about second accept() in selftest (Yonghong Song)
* refer to tcp_bpf.readme in sample program (Yonghong Song)
====================

Suggested-by: default avatarEric Dumazet <edumazet@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Priyaranjan Jha <priyarjha@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: default avatarSoheil Hassas Yeganeh <soheil@google.com>
Acked-by: default avatarYuchung Cheng <ycheng@google.com>
Acked-by: default avatarYonghong Song <yhs@fb.com>
Acked-by: default avatarLawrence Brakmo <brakmo@fb.com>
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
parents d2f5bbbc d78e3f06
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -2221,6 +2221,14 @@ static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
	return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1);
}

static inline void tcp_bpf_rtt(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTT_CB_FLAG))
		tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL);
}

#if IS_ENABLED(CONFIG_SMC)
extern struct static_key_false tcp_have_smc;
#endif
+11 −1
Original line number Diff line number Diff line
@@ -1770,6 +1770,7 @@ union bpf_attr {
 * 		* **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out)
 * 		* **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission)
 * 		* **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change)
 * 		* **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT)
 *
 * 		Therefore, this function can be used to clear a callback flag by
 * 		setting the appropriate bit to zero. e.g. to disable the RTO
@@ -3072,6 +3073,12 @@ struct bpf_tcp_sock {
				 * sum(delta(snd_una)), or how many bytes
				 * were acked.
				 */
	__u32 dsack_dups;	/* RFC4898 tcpEStatsStackDSACKDups
				 * total number of DSACK blocks received
				 */
	__u32 delivered;	/* Total data packets delivered incl. rexmits */
	__u32 delivered_ce;	/* Like the above but only ECE marked packets */
	__u32 icsk_retransmits;	/* Number of unrecovered [RTO] timeouts */
};

struct bpf_sock_tuple {
@@ -3314,7 +3321,8 @@ struct bpf_sock_ops {
#define BPF_SOCK_OPS_RTO_CB_FLAG	(1<<0)
#define BPF_SOCK_OPS_RETRANS_CB_FLAG	(1<<1)
#define BPF_SOCK_OPS_STATE_CB_FLAG	(1<<2)
#define BPF_SOCK_OPS_ALL_CB_FLAGS       0x7		/* Mask of all currently
#define BPF_SOCK_OPS_RTT_CB_FLAG	(1<<3)
#define BPF_SOCK_OPS_ALL_CB_FLAGS       0xF		/* Mask of all currently
							 * supported cb flags
							 */

@@ -3369,6 +3377,8 @@ enum {
	BPF_SOCK_OPS_TCP_LISTEN_CB,	/* Called on listen(2), right after
					 * socket transition to LISTEN state.
					 */
	BPF_SOCK_OPS_RTT_CB,		/* Called on every RTT.
					 */
};

/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect
+153 −54
Original line number Diff line number Diff line
@@ -5194,54 +5194,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
};
#endif /* CONFIG_IPV6_SEG6_BPF */

#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT)		\
do {									\
	switch (si->off) {						\
	case offsetof(md_type, snd_cwnd):				\
		CONVERT(snd_cwnd); break;				\
	case offsetof(md_type, srtt_us):				\
		CONVERT(srtt_us); break;				\
	case offsetof(md_type, snd_ssthresh):				\
		CONVERT(snd_ssthresh); break;				\
	case offsetof(md_type, rcv_nxt):				\
		CONVERT(rcv_nxt); break;				\
	case offsetof(md_type, snd_nxt):				\
		CONVERT(snd_nxt); break;				\
	case offsetof(md_type, snd_una):				\
		CONVERT(snd_una); break;				\
	case offsetof(md_type, mss_cache):				\
		CONVERT(mss_cache); break;				\
	case offsetof(md_type, ecn_flags):				\
		CONVERT(ecn_flags); break;				\
	case offsetof(md_type, rate_delivered):				\
		CONVERT(rate_delivered); break;				\
	case offsetof(md_type, rate_interval_us):			\
		CONVERT(rate_interval_us); break;			\
	case offsetof(md_type, packets_out):				\
		CONVERT(packets_out); break;				\
	case offsetof(md_type, retrans_out):				\
		CONVERT(retrans_out); break;				\
	case offsetof(md_type, total_retrans):				\
		CONVERT(total_retrans); break;				\
	case offsetof(md_type, segs_in):				\
		CONVERT(segs_in); break;				\
	case offsetof(md_type, data_segs_in):				\
		CONVERT(data_segs_in); break;				\
	case offsetof(md_type, segs_out):				\
		CONVERT(segs_out); break;				\
	case offsetof(md_type, data_segs_out):				\
		CONVERT(data_segs_out); break;				\
	case offsetof(md_type, lost_out):				\
		CONVERT(lost_out); break;				\
	case offsetof(md_type, sacked_out):				\
		CONVERT(sacked_out); break;				\
	case offsetof(md_type, bytes_received):				\
		CONVERT(bytes_received); break;				\
	case offsetof(md_type, bytes_acked):				\
		CONVERT(bytes_acked); break;				\
	}								\
} while (0)

#ifdef CONFIG_INET
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
			      int dif, int sdif, u8 family, u8 proto)
@@ -5592,7 +5544,8 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
				  struct bpf_insn_access_aux *info)
{
	if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked))
	if (off < 0 || off >= offsetofend(struct bpf_tcp_sock,
					  icsk_retransmits))
		return false;

	if (off % size != 0)
@@ -5623,8 +5576,19 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
				      offsetof(struct tcp_sock, FIELD)); \
	} while (0)

	CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock,
				       BPF_TCP_SOCK_GET_COMMON);
#define BPF_INET_SOCK_GET_COMMON(FIELD)					\
	do {								\
		BUILD_BUG_ON(FIELD_SIZEOF(struct inet_connection_sock,	\
					  FIELD) >			\
			     FIELD_SIZEOF(struct bpf_tcp_sock, FIELD));	\
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(			\
					struct inet_connection_sock,	\
					FIELD),				\
				      si->dst_reg, si->src_reg,		\
				      offsetof(				\
					struct inet_connection_sock,	\
					FIELD));			\
	} while (0)

	if (insn > insn_buf)
		return insn - insn_buf;
@@ -5640,6 +5604,81 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
				      offsetof(struct tcp_sock, rtt_min) +
				      offsetof(struct minmax_sample, v));
		break;
	case offsetof(struct bpf_tcp_sock, snd_cwnd):
		BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
		break;
	case offsetof(struct bpf_tcp_sock, srtt_us):
		BPF_TCP_SOCK_GET_COMMON(srtt_us);
		break;
	case offsetof(struct bpf_tcp_sock, snd_ssthresh):
		BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
		break;
	case offsetof(struct bpf_tcp_sock, rcv_nxt):
		BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
		break;
	case offsetof(struct bpf_tcp_sock, snd_nxt):
		BPF_TCP_SOCK_GET_COMMON(snd_nxt);
		break;
	case offsetof(struct bpf_tcp_sock, snd_una):
		BPF_TCP_SOCK_GET_COMMON(snd_una);
		break;
	case offsetof(struct bpf_tcp_sock, mss_cache):
		BPF_TCP_SOCK_GET_COMMON(mss_cache);
		break;
	case offsetof(struct bpf_tcp_sock, ecn_flags):
		BPF_TCP_SOCK_GET_COMMON(ecn_flags);
		break;
	case offsetof(struct bpf_tcp_sock, rate_delivered):
		BPF_TCP_SOCK_GET_COMMON(rate_delivered);
		break;
	case offsetof(struct bpf_tcp_sock, rate_interval_us):
		BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
		break;
	case offsetof(struct bpf_tcp_sock, packets_out):
		BPF_TCP_SOCK_GET_COMMON(packets_out);
		break;
	case offsetof(struct bpf_tcp_sock, retrans_out):
		BPF_TCP_SOCK_GET_COMMON(retrans_out);
		break;
	case offsetof(struct bpf_tcp_sock, total_retrans):
		BPF_TCP_SOCK_GET_COMMON(total_retrans);
		break;
	case offsetof(struct bpf_tcp_sock, segs_in):
		BPF_TCP_SOCK_GET_COMMON(segs_in);
		break;
	case offsetof(struct bpf_tcp_sock, data_segs_in):
		BPF_TCP_SOCK_GET_COMMON(data_segs_in);
		break;
	case offsetof(struct bpf_tcp_sock, segs_out):
		BPF_TCP_SOCK_GET_COMMON(segs_out);
		break;
	case offsetof(struct bpf_tcp_sock, data_segs_out):
		BPF_TCP_SOCK_GET_COMMON(data_segs_out);
		break;
	case offsetof(struct bpf_tcp_sock, lost_out):
		BPF_TCP_SOCK_GET_COMMON(lost_out);
		break;
	case offsetof(struct bpf_tcp_sock, sacked_out):
		BPF_TCP_SOCK_GET_COMMON(sacked_out);
		break;
	case offsetof(struct bpf_tcp_sock, bytes_received):
		BPF_TCP_SOCK_GET_COMMON(bytes_received);
		break;
	case offsetof(struct bpf_tcp_sock, bytes_acked):
		BPF_TCP_SOCK_GET_COMMON(bytes_acked);
		break;
	case offsetof(struct bpf_tcp_sock, dsack_dups):
		BPF_TCP_SOCK_GET_COMMON(dsack_dups);
		break;
	case offsetof(struct bpf_tcp_sock, delivered):
		BPF_TCP_SOCK_GET_COMMON(delivered);
		break;
	case offsetof(struct bpf_tcp_sock, delivered_ce):
		BPF_TCP_SOCK_GET_COMMON(delivered_ce);
		break;
	case offsetof(struct bpf_tcp_sock, icsk_retransmits):
		BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
		break;
	}

	return insn - insn_buf;
@@ -7913,9 +7952,6 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
			SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ);	      \
	} while (0)

	CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_sock_ops,
				       SOCK_OPS_GET_TCP_SOCK_FIELD);

	if (insn > insn_buf)
		return insn - insn_buf;

@@ -8085,6 +8121,69 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
		SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
					  struct sock, type);
		break;
	case offsetof(struct bpf_sock_ops, snd_cwnd):
		SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
		break;
	case offsetof(struct bpf_sock_ops, srtt_us):
		SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
		break;
	case offsetof(struct bpf_sock_ops, snd_ssthresh):
		SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
		break;
	case offsetof(struct bpf_sock_ops, rcv_nxt):
		SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
		break;
	case offsetof(struct bpf_sock_ops, snd_nxt):
		SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
		break;
	case offsetof(struct bpf_sock_ops, snd_una):
		SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
		break;
	case offsetof(struct bpf_sock_ops, mss_cache):
		SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
		break;
	case offsetof(struct bpf_sock_ops, ecn_flags):
		SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
		break;
	case offsetof(struct bpf_sock_ops, rate_delivered):
		SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
		break;
	case offsetof(struct bpf_sock_ops, rate_interval_us):
		SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
		break;
	case offsetof(struct bpf_sock_ops, packets_out):
		SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
		break;
	case offsetof(struct bpf_sock_ops, retrans_out):
		SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
		break;
	case offsetof(struct bpf_sock_ops, total_retrans):
		SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
		break;
	case offsetof(struct bpf_sock_ops, segs_in):
		SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
		break;
	case offsetof(struct bpf_sock_ops, data_segs_in):
		SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
		break;
	case offsetof(struct bpf_sock_ops, segs_out):
		SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
		break;
	case offsetof(struct bpf_sock_ops, data_segs_out):
		SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
		break;
	case offsetof(struct bpf_sock_ops, lost_out):
		SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
		break;
	case offsetof(struct bpf_sock_ops, sacked_out):
		SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
		break;
	case offsetof(struct bpf_sock_ops, bytes_received):
		SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
		break;
	case offsetof(struct bpf_sock_ops, bytes_acked):
		SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
		break;
	case offsetof(struct bpf_sock_ops, sk):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
						struct bpf_sock_ops_kern,
+4 −0
Original line number Diff line number Diff line
@@ -778,6 +778,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
				tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
			tp->rtt_seq = tp->snd_nxt;
			tp->mdev_max_us = tcp_rto_min_us(sk);

			tcp_bpf_rtt(sk);
		}
	} else {
		/* no previous measure. */
@@ -786,6 +788,8 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
		tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
		tp->mdev_max_us = tp->rttvar_us;
		tp->rtt_seq = tp->snd_nxt;

		tcp_bpf_rtt(sk);
	}
	tp->srtt_us = max(1U, srtt);
}
+1 −0
Original line number Diff line number Diff line
@@ -154,6 +154,7 @@ always += tcp_iw_kern.o
always += tcp_clamp_kern.o
always += tcp_basertt_kern.o
always += tcp_tos_reflect_kern.o
always += tcp_dumpstats_kern.o
always += xdp_redirect_kern.o
always += xdp_redirect_map_kern.o
always += xdp_redirect_cpu_kern.o
Loading