Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 1aa12bdf authored by John Fastabend's avatar John Fastabend Committed by Daniel Borkmann
Browse files

bpf: sockmap, add sock close() hook to remove socks



The selftests test_maps program was leaving dangling BPF sockmap
programs around because not all psock elements were removed from
the map. The elements in turn hold a reference on the BPF program
they are attached to causing BPF programs to stay open even after
test_maps has completed.

The original intent was that sk_state_change() would be called
when TCP socks went through TCP_CLOSE state. However, because
socks may be in SOCK_DEAD state or the sock may be a listening
socket the event is not always triggered.

To resolve this use the ULP infrastructure and register our own
proto close() handler. This fixes the above case.

Fixes: 174a79ff ("bpf: sockmap with sk redirect support")
Reported-by: default avatarPrashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: default avatarJohn Fastabend <john.fastabend@gmail.com>
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
parent b11a632c
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -1985,6 +1985,7 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);

enum {
	TCP_ULP_TLS,
	TCP_ULP_BPF,
};

struct tcp_ulp_ops {
@@ -2003,6 +2004,7 @@ struct tcp_ulp_ops {
int tcp_register_ulp(struct tcp_ulp_ops *type);
void tcp_unregister_ulp(struct tcp_ulp_ops *type);
int tcp_set_ulp(struct sock *sk, const char *name);
int tcp_set_ulp_id(struct sock *sk, const int ulp);
void tcp_get_available_ulp(char *buf, size_t len);
void tcp_cleanup_ulp(struct sock *sk);

+101 −67
Original line number Diff line number Diff line
@@ -86,9 +86,10 @@ struct smap_psock {
	struct work_struct tx_work;
	struct work_struct gc_work;

	struct proto *sk_proto;
	void (*save_close)(struct sock *sk, long timeout);
	void (*save_data_ready)(struct sock *sk);
	void (*save_write_space)(struct sock *sk);
	void (*save_state_change)(struct sock *sk);
};

static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
@@ -96,12 +97,102 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
	return rcu_dereference_sk_user_data(sk);
}

static struct proto tcp_bpf_proto;
static int bpf_tcp_init(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock)) {
		rcu_read_unlock();
		return -EINVAL;
	}

	if (unlikely(psock->sk_proto)) {
		rcu_read_unlock();
		return -EBUSY;
	}

	psock->save_close = sk->sk_prot->close;
	psock->sk_proto = sk->sk_prot;
	sk->sk_prot = &tcp_bpf_proto;
	rcu_read_unlock();
	return 0;
}

static void bpf_tcp_release(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);

	if (likely(psock)) {
		sk->sk_prot = psock->sk_proto;
		psock->sk_proto = NULL;
	}
	rcu_read_unlock();
}

static void smap_release_sock(struct smap_psock *psock, struct sock *sock);

static void bpf_tcp_close(struct sock *sk, long timeout)
{
	void (*close_fun)(struct sock *sk, long timeout);
	struct smap_psock_map_entry *e, *tmp;
	struct smap_psock *psock;
	struct sock *osk;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock)) {
		rcu_read_unlock();
		return sk->sk_prot->close(sk, timeout);
	}

	/* The psock may be destroyed anytime after exiting the RCU critial
	 * section so by the time we use close_fun the psock may no longer
	 * be valid. However, bpf_tcp_close is called with the sock lock
	 * held so the close hook and sk are still valid.
	 */
	close_fun = psock->save_close;

	write_lock_bh(&sk->sk_callback_lock);
	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		osk = cmpxchg(e->entry, sk, NULL);
		if (osk == sk) {
			list_del(&e->list);
			smap_release_sock(psock, sk);
		}
	}
	write_unlock_bh(&sk->sk_callback_lock);
	rcu_read_unlock();
	close_fun(sk, timeout);
}

enum __sk_action {
	__SK_DROP = 0,
	__SK_PASS,
	__SK_REDIRECT,
};

static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
	.name		= "bpf_tcp",
	.uid		= TCP_ULP_BPF,
	.user_visible	= false,
	.owner		= NULL,
	.init		= bpf_tcp_init,
	.release	= bpf_tcp_release,
};

static int bpf_tcp_ulp_register(void)
{
	tcp_bpf_proto = tcp_prot;
	tcp_bpf_proto.close = bpf_tcp_close;
	return tcp_register_ulp(&bpf_tcp_ulp_ops);
}

static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
{
	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
@@ -166,68 +257,6 @@ static void smap_report_sk_error(struct smap_psock *psock, int err)
	sk->sk_error_report(sk);
}

static void smap_release_sock(struct smap_psock *psock, struct sock *sock);

/* Called with lock_sock(sk) held */
static void smap_state_change(struct sock *sk)
{
	struct smap_psock_map_entry *e, *tmp;
	struct smap_psock *psock;
	struct socket_wq *wq;
	struct sock *osk;

	rcu_read_lock();

	/* Allowing transitions into an established syn_recv states allows
	 * for early binding sockets to a smap object before the connection
	 * is established.
	 */
	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
	case TCP_ESTABLISHED:
		break;
	case TCP_CLOSE_WAIT:
	case TCP_CLOSING:
	case TCP_LAST_ACK:
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
	case TCP_LISTEN:
		break;
	case TCP_CLOSE:
		/* Only release if the map entry is in fact the sock in
		 * question. There is a case where the operator deletes
		 * the sock from the map, but the TCP sock is closed before
		 * the psock is detached. Use cmpxchg to verify correct
		 * sock is removed.
		 */
		psock = smap_psock_sk(sk);
		if (unlikely(!psock))
			break;
		write_lock_bh(&sk->sk_callback_lock);
		list_for_each_entry_safe(e, tmp, &psock->maps, list) {
			osk = cmpxchg(e->entry, sk, NULL);
			if (osk == sk) {
				list_del(&e->list);
				smap_release_sock(psock, sk);
			}
		}
		write_unlock_bh(&sk->sk_callback_lock);
		break;
	default:
		psock = smap_psock_sk(sk);
		if (unlikely(!psock))
			break;
		smap_report_sk_error(psock, EPIPE);
		break;
	}

	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void smap_read_sock_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
@@ -322,10 +351,8 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
		return;
	sk->sk_data_ready = psock->save_data_ready;
	sk->sk_write_space = psock->save_write_space;
	sk->sk_state_change = psock->save_state_change;
	psock->save_data_ready = NULL;
	psock->save_write_space = NULL;
	psock->save_state_change = NULL;
	strp_stop(&psock->strp);
	psock->strp_enabled = false;
}
@@ -350,6 +377,7 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
	if (psock->refcnt)
		return;

	tcp_cleanup_ulp(sock);
	smap_stop_sock(psock, sock);
	clear_bit(SMAP_TX_RUNNING, &psock->state);
	rcu_assign_sk_user_data(sock, NULL);
@@ -427,10 +455,8 @@ static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
		return;
	psock->save_data_ready = sk->sk_data_ready;
	psock->save_write_space = sk->sk_write_space;
	psock->save_state_change = sk->sk_state_change;
	sk->sk_data_ready = smap_data_ready;
	sk->sk_write_space = smap_write_space;
	sk->sk_state_change = smap_state_change;
	psock->strp_enabled = true;
}

@@ -509,6 +535,10 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
	if (attr->value_size > KMALLOC_MAX_SIZE)
		return ERR_PTR(-E2BIG);

	err = bpf_tcp_ulp_register();
	if (err && err != -EEXIST)
		return ERR_PTR(err);

	stab = kzalloc(sizeof(*stab), GFP_USER);
	if (!stab)
		return ERR_PTR(-ENOMEM);
@@ -754,6 +784,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
			goto out_progs;
		}

		err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
		if (err)
			goto out_progs;

		set_bit(SMAP_TX_RUNNING, &psock->state);
	}