Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1cf4a0cc authored by Daniel Borkmann's avatar Daniel Borkmann
Browse files

Merge branch 'bpf-sockmap-fixes-and-improvements'



John Fastabend says:

====================
Set of bpf fixes and improvements to make sockmap with kTLS usable
with "real" applications. This set came as the fallout of pulling
kTLS+sockmap into Cilium[1] and running in container environment.

Roughly broken into three parts,

Patches 1-3: resolve/improve handling of size field in sk_msg_md
Patch     4: it became difficult to use this in Cilium when the
	     SK_PASS verdict was not correctly handled. So handle
	     the case correctly.
Patch   5-8: Set of issues found while running OpenSSL TX kTLS
	     enabled applications. This resolves the most obvious
	     issues and gets applications using kTLS TX up and
	     running with sock{map|hash}.

Other than the "sk_msg, zap ingress queue on psock down" (PATCH 6/8),
which can potentially cause a WARNING, the issues fixed in this
series do not cause kernel side warnings, BUG, etc. but instead
cause stalls and other odd behavior in the user space applications
when using kTLS with BPF policies applied.

Primarily tested with 'curl' compiled with latest openssl and
also 'openssl s_client/s_server' containers using Cilium network
plugin with docker/k8s. Some basic testing with httpd was also
enabled. Cilium CI tests will be added shortly to cover these
cases as well. We also have 'wrk' and other test and benchmarking
tools we can run now.

We have two more sets of patches currently under testing that
will be sent shortly to address a few more issues. First the
OpenSSL RX kTLS side breaks when both sk_msg and sk_skb_verdict
programs are used with kTLS, the sk_skb_verdict programs are
not enforced. Second, skmsg needs to call into the tcp stack to
indicate consumed data.
====================

Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
parents 77ea5f4c 28cb6f1e
Loading
Loading
Loading
Loading
+9 −3
Original line number Diff line number Diff line
@@ -36,9 +36,7 @@ struct sk_msg_sg {
	struct scatterlist		data[MAX_MSG_FRAGS + 1];
};

/* UAPI in filter.c depends on struct sk_msg_sg being first element. If
 * this is moved filter.c also must be updated.
 */
/* UAPI in filter.c depends on struct sk_msg_sg being first element. */
struct sk_msg {
	struct sk_msg_sg		sg;
	void				*data;
@@ -419,6 +417,14 @@ static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
		sk_psock_drop(sk, psock);
}

/* Wake the socket's reader. When the psock's strparser is enabled, the
 * original sk_data_ready callback was saved off (see the diff hunk where
 * sk->sk_data_ready is replaced), so invoke that saved handler; otherwise
 * fall back to the socket's current sk_data_ready. Centralizing this here
 * lets ingress paths notify readers without knowing whether a parser is
 * attached.
 */
static inline void sk_psock_data_ready(struct sock *sk, struct sk_psock *psock)
{
	if (psock->parser.enabled)
		psock->parser.saved_data_ready(sk);
	else
		sk->sk_data_ready(sk);
}

static inline void psock_set_prog(struct bpf_prog **pprog,
				  struct bpf_prog *prog)
{
+1 −0
Original line number Diff line number Diff line
@@ -286,6 +286,7 @@ struct ucred {
#define MSG_NOSIGNAL	0x4000	/* Do not generate SIGPIPE */
#define MSG_MORE	0x8000	/* Sender will send more */
#define MSG_WAITFORONE	0x10000	/* recvmmsg(): block until 1+ packets avail */
#define MSG_SENDPAGE_NOPOLICY 0x10000 /* sendpage() internal : do no apply policy */
#define MSG_SENDPAGE_NOTLAST 0x20000 /* sendpage() internal : not the last page */
#define MSG_BATCH	0x40000 /* sendmmsg(): more messages coming */
#define MSG_EOF         MSG_FIN
+9 −0
Original line number Diff line number Diff line
@@ -454,6 +454,15 @@ tls_offload_ctx_tx(const struct tls_context *tls_ctx)
	return (struct tls_offload_context_tx *)tls_ctx->priv_ctx_tx;
}

/* Return true if @sk has a software kTLS TX context attached.
 * Handles the no-TLS-context case by returning false rather than
 * dereferencing a NULL ctx.
 * NOTE(review): assumes tls_get_ctx() is safe to call on a socket that
 * may not have the TLS ULP installed (returning NULL in that case) --
 * confirm against tls_get_ctx()'s definition.
 */
static inline bool tls_sw_has_ctx_tx(const struct sock *sk)
{
	struct tls_context *ctx = tls_get_ctx(sk);

	if (!ctx)
		return false;
	return !!tls_sw_ctx_tx(ctx);
}

static inline struct tls_offload_context_rx *
tls_offload_ctx_rx(const struct tls_context *tls_ctx)
{
+17 −7
Original line number Diff line number Diff line
@@ -6313,6 +6313,9 @@ static bool sk_msg_is_valid_access(int off, int size,
	if (type == BPF_WRITE)
		return false;

	if (off % size != 0)
		return false;

	switch (off) {
	case offsetof(struct sk_msg_md, data):
		info->reg_type = PTR_TO_PACKET;
@@ -6324,16 +6327,20 @@ static bool sk_msg_is_valid_access(int off, int size,
		if (size != sizeof(__u64))
			return false;
		break;
	default:
	case bpf_ctx_range(struct sk_msg_md, family):
	case bpf_ctx_range(struct sk_msg_md, remote_ip4):
	case bpf_ctx_range(struct sk_msg_md, local_ip4):
	case bpf_ctx_range_till(struct sk_msg_md, remote_ip6[0], remote_ip6[3]):
	case bpf_ctx_range_till(struct sk_msg_md, local_ip6[0], local_ip6[3]):
	case bpf_ctx_range(struct sk_msg_md, remote_port):
	case bpf_ctx_range(struct sk_msg_md, local_port):
	case bpf_ctx_range(struct sk_msg_md, size):
		if (size != sizeof(__u32))
			return false;
	}

	if (off < 0 || off >= sizeof(struct sk_msg_md))
		return false;
	if (off % size != 0)
		break;
	default:
		return false;

	}
	return true;
}

@@ -7418,6 +7425,9 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type,
	int off;
#endif

	/* convert ctx uses the fact sg element is first in struct */
	BUILD_BUG_ON(offsetof(struct sk_msg, sg) != 0);

	switch (si->off) {
	case offsetof(struct sk_msg_md, data):
		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, data),
+20 −3
Original line number Diff line number Diff line
@@ -403,7 +403,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb)
	msg->skb = skb;

	sk_psock_queue_msg(psock, msg);
	sk->sk_data_ready(sk);
	sk_psock_data_ready(sk, psock);
	return copied;
}

@@ -572,6 +572,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
	rcu_assign_sk_user_data(sk, NULL);
	sk_psock_cork_free(psock);
	sk_psock_zap_ingress(psock);
	sk_psock_restore_proto(sk, psock);

	write_lock_bh(&sk->sk_callback_lock);
@@ -669,6 +670,22 @@ static void sk_psock_verdict_apply(struct sk_psock *psock,
	bool ingress;

	switch (verdict) {
	case __SK_PASS:
		sk_other = psock->sk;
		if (sock_flag(sk_other, SOCK_DEAD) ||
		    !sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
			goto out_free;
		}
		if (atomic_read(&sk_other->sk_rmem_alloc) <=
		    sk_other->sk_rcvbuf) {
			struct tcp_skb_cb *tcp = TCP_SKB_CB(skb);

			tcp->bpf.flags |= BPF_F_INGRESS;
			skb_queue_tail(&psock->ingress_skb, skb);
			schedule_work(&psock->work);
			break;
		}
		goto out_free;
	case __SK_REDIRECT:
		sk_other = tcp_skb_bpf_redirect_fetch(skb);
		if (unlikely(!sk_other))
@@ -735,7 +752,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb)
}

/* Called with socket lock held. */
static void sk_psock_data_ready(struct sock *sk)
static void sk_psock_strp_data_ready(struct sock *sk)
{
	struct sk_psock *psock;

@@ -783,7 +800,7 @@ void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock)
		return;

	parser->saved_data_ready = sk->sk_data_ready;
	sk->sk_data_ready = sk_psock_data_ready;
	sk->sk_data_ready = sk_psock_strp_data_ready;
	sk->sk_write_space = sk_psock_write_space;
	parser->enabled = true;
}
Loading