Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 43815482 authored by Eric Dumazet, committed by David S. Miller
Browse files

net: sock_def_readable() and friends RCU conversion



The sk_callback_lock rwlock actually protects the sk->sk_sleep pointer, so we
need two atomic operations (and the associated cache-line dirtying) per
incoming packet.

RCU conversion is pretty much needed:

1) Add a new structure, called "struct socket_wq" to hold all fields
that will need rcu_read_lock() protection (currently: a
wait_queue_head_t and a struct fasync_struct pointer).

[Future patch will add a list anchor for wakeup coalescing]

2) Attach one of such structure to each "struct socket" created in
sock_alloc_inode().

3) Respect RCU grace period when freeing a "struct socket_wq"

4) Change sk_sleep pointer in "struct sock" by sk_wq, pointer to "struct
socket_wq"

5) Change sk_sleep() function to use new sk->sk_wq instead of
sk->sk_sleep

6) Change sk_has_sleeper() to wq_has_sleeper() that must be used inside
a rcu_read_lock() section.

7) Change all sk_has_sleeper() callers to:
  - Use rcu_read_lock() instead of read_lock(&sk->sk_callback_lock)
  - Use wq_has_sleeper() to wake up tasks when needed.
  - Use rcu_read_unlock() instead of read_unlock(&sk->sk_callback_lock)

8) sock_wake_async() is modified to use rcu protection as well.

9) Exceptions:
  macvtap, drivers/net/tun.c, and af_unix use an integrated "struct socket_wq"
instead of a dynamically allocated one. They don't need RCU freeing.

Some cleanups or follow-ups are probably needed (for example, a possible
sk_callback_lock conversion to a spinlock).

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 83d7eb29
Loading
Loading
Loading
Loading
+9 −4
Original line number Diff line number Diff line
@@ -37,6 +37,7 @@
struct macvtap_queue {
	struct sock sk;
	struct socket sock;
	struct socket_wq wq;
	struct macvlan_dev *vlan;
	struct file *file;
	unsigned int flags;
@@ -242,12 +243,15 @@ static struct rtnl_link_ops macvtap_link_ops __read_mostly = {

static void macvtap_sock_write_space(struct sock *sk)
{
	wait_queue_head_t *wqueue;

	if (!sock_writeable(sk) ||
	    !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
		return;

	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
		wake_up_interruptible_poll(sk_sleep(sk), POLLOUT | POLLWRNORM | POLLWRBAND);
	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
		wake_up_interruptible_poll(wqueue, POLLOUT | POLLWRNORM | POLLWRBAND);
}

static int macvtap_open(struct inode *inode, struct file *file)
@@ -272,7 +276,8 @@ static int macvtap_open(struct inode *inode, struct file *file)
	if (!q)
		goto out;

	init_waitqueue_head(&q->sock.wait);
	q->sock.wq = &q->wq;
	init_waitqueue_head(&q->wq.wait);
	q->sock.type = SOCK_RAW;
	q->sock.state = SS_CONNECTED;
	q->sock.file = file;
@@ -308,7 +313,7 @@ static unsigned int macvtap_poll(struct file *file, poll_table * wait)
		goto out;

	mask = 0;
	poll_wait(file, &q->sock.wait, wait);
	poll_wait(file, &q->wq.wait, wait);

	if (!skb_queue_empty(&q->sk.sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;
+12 −9
Original line number Diff line number Diff line
@@ -109,7 +109,7 @@ struct tun_struct {

	struct tap_filter       txflt;
	struct socket		socket;

	struct socket_wq	wq;
#ifdef TUN_DEBUG
	int debug;
#endif
@@ -323,7 +323,7 @@ static void tun_net_uninit(struct net_device *dev)
	/* Inform the methods they need to stop using the dev.
	 */
	if (tfile) {
		wake_up_all(&tun->socket.wait);
		wake_up_all(&tun->wq.wait);
		if (atomic_dec_and_test(&tfile->count))
			__tun_detach(tun);
	}
@@ -398,7 +398,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
	/* Notify and wake up reader process */
	if (tun->flags & TUN_FASYNC)
		kill_fasync(&tun->fasync, SIGIO, POLL_IN);
	wake_up_interruptible_poll(&tun->socket.wait, POLLIN |
	wake_up_interruptible_poll(&tun->wq.wait, POLLIN |
				   POLLRDNORM | POLLRDBAND);
	return NETDEV_TX_OK;

@@ -498,7 +498,7 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)

	DBG(KERN_INFO "%s: tun_chr_poll\n", tun->dev->name);

	poll_wait(file, &tun->socket.wait, wait);
	poll_wait(file, &tun->wq.wait, wait);

	if (!skb_queue_empty(&sk->sk_receive_queue))
		mask |= POLLIN | POLLRDNORM;
@@ -773,7 +773,7 @@ static ssize_t tun_do_read(struct tun_struct *tun,

	DBG(KERN_INFO "%s: tun_chr_read\n", tun->dev->name);

	add_wait_queue(&tun->socket.wait, &wait);
	add_wait_queue(&tun->wq.wait, &wait);
	while (len) {
		current->state = TASK_INTERRUPTIBLE;

@@ -804,7 +804,7 @@ static ssize_t tun_do_read(struct tun_struct *tun,
	}

	current->state = TASK_RUNNING;
	remove_wait_queue(&tun->socket.wait, &wait);
	remove_wait_queue(&tun->wq.wait, &wait);

	return ret;
}
@@ -861,6 +861,7 @@ static struct rtnl_link_ops tun_link_ops __read_mostly = {
static void tun_sock_write_space(struct sock *sk)
{
	struct tun_struct *tun;
	wait_queue_head_t *wqueue;

	if (!sock_writeable(sk))
		return;
@@ -868,8 +869,9 @@ static void tun_sock_write_space(struct sock *sk)
	if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
		return;

	if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
		wake_up_interruptible_sync_poll(sk_sleep(sk), POLLOUT |
	wqueue = sk_sleep(sk);
	if (wqueue && waitqueue_active(wqueue))
		wake_up_interruptible_sync_poll(wqueue, POLLOUT |
						POLLWRNORM | POLLWRBAND);

	tun = tun_sk(sk)->tun;
@@ -1039,7 +1041,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
		if (!sk)
			goto err_free_dev;

		init_waitqueue_head(&tun->socket.wait);
		tun->socket.wq = &tun->wq;
		init_waitqueue_head(&tun->wq.wait);
		tun->socket.ops = &tun_socket_ops;
		sock_init_data(&tun->socket, sk);
		sk->sk_write_space = tun_sock_write_space;
+9 −5
Original line number Diff line number Diff line
@@ -59,6 +59,7 @@ typedef enum {
#include <linux/wait.h>
#include <linux/fcntl.h>	/* For O_CLOEXEC and O_NONBLOCK */
#include <linux/kmemcheck.h>
#include <linux/rcupdate.h>

struct poll_table_struct;
struct pipe_inode_info;
@@ -116,6 +117,12 @@ enum sock_shutdown_cmd {
	SHUT_RDWR	= 2,
};

struct socket_wq {
	wait_queue_head_t	wait;
	struct fasync_struct	*fasync_list;
	struct rcu_head		rcu;
} ____cacheline_aligned_in_smp;

/**
 *  struct socket - general BSD socket
 *  @state: socket state (%SS_CONNECTED, etc)
@@ -135,11 +142,8 @@ struct socket {
	kmemcheck_bitfield_end(type);

	unsigned long		flags;
	/*
	 * Please keep fasync_list & wait fields in the same cache line
	 */
	struct fasync_struct	*fasync_list;
	wait_queue_head_t	wait;

	struct socket_wq	*wq;

	struct file		*file;
	struct sock		*sk;
+11 −9
Original line number Diff line number Diff line
@@ -56,10 +56,12 @@ struct unix_sock {
	spinlock_t		lock;
	unsigned int		gc_candidate : 1;
	unsigned int		gc_maybe_cycle : 1;
        wait_queue_head_t       peer_wait;
	struct socket_wq	peer_wq;
};
#define unix_sk(__sk) ((struct unix_sock *)__sk)

#define peer_wait peer_wq.wait

#ifdef CONFIG_SYSCTL
extern int unix_sysctl_register(struct net *net);
extern void unix_sysctl_unregister(struct net *net);
+19 −19
Original line number Diff line number Diff line
@@ -159,7 +159,7 @@ struct sock_common {
  *	@sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
  *	@sk_lock:	synchronizer
  *	@sk_rcvbuf: size of receive buffer in bytes
  *	@sk_sleep: sock wait queue
  *	@sk_wq: sock wait queue and async head
  *	@sk_dst_cache: destination cache
  *	@sk_dst_lock: destination cache lock
  *	@sk_policy: flow policy
@@ -257,7 +257,7 @@ struct sock {
		struct sk_buff *tail;
		int len;
	} sk_backlog;
	wait_queue_head_t	*sk_sleep;
	struct socket_wq	*sk_wq;
	struct dst_entry	*sk_dst_cache;
#ifdef CONFIG_XFRM
	struct xfrm_policy	*sk_policy[2];
@@ -1219,7 +1219,7 @@ static inline void sk_set_socket(struct sock *sk, struct socket *sock)

static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
	return sk->sk_sleep;
	return &sk->sk_wq->wait;
}
/* Detach socket from process context.
 * Announce socket dead, detach it from wait queue and inode.
@@ -1233,14 +1233,14 @@ static inline void sock_orphan(struct sock *sk)
	write_lock_bh(&sk->sk_callback_lock);
	sock_set_flag(sk, SOCK_DEAD);
	sk_set_socket(sk, NULL);
	sk->sk_sleep  = NULL;
	sk->sk_wq  = NULL;
	write_unlock_bh(&sk->sk_callback_lock);
}

static inline void sock_graft(struct sock *sk, struct socket *parent)
{
	write_lock_bh(&sk->sk_callback_lock);
	sk->sk_sleep = &parent->wait;
	rcu_assign_pointer(sk->sk_wq, parent->wq);
	parent->sk = sk;
	sk_set_socket(sk, parent);
	security_sock_graft(sk, parent);
@@ -1392,12 +1392,12 @@ static inline int sk_has_allocations(const struct sock *sk)
}

/**
 * sk_has_sleeper - check if there are any waiting processes
 * @sk: socket
 * wq_has_sleeper - check if there are any waiting processes
 * @sk: struct socket_wq
 *
 * Returns true if socket has waiting processes
 * Returns true if socket_wq has waiting processes
 *
 * The purpose of the sk_has_sleeper and sock_poll_wait is to wrap the memory
 * The purpose of the wq_has_sleeper and sock_poll_wait is to wrap the memory
 * barrier call. They were added due to the race found within the tcp code.
 *
 * Consider following tcp code paths:
@@ -1410,9 +1410,10 @@ static inline int sk_has_allocations(const struct sock *sk)
 *   ...                 ...
 *   tp->rcv_nxt check   sock_def_readable
 *   ...                 {
 *   schedule               ...
 *                          if (sk_sleep(sk) && waitqueue_active(sk_sleep(sk)))
 *                              wake_up_interruptible(sk_sleep(sk))
 *   schedule               rcu_read_lock();
 *                          wq = rcu_dereference(sk->sk_wq);
 *                          if (wq && waitqueue_active(&wq->wait))
 *                              wake_up_interruptible(&wq->wait)
 *                          ...
 *                       }
 *
@@ -1421,19 +1422,18 @@ static inline int sk_has_allocations(const struct sock *sk)
 * could then endup calling schedule and sleep forever if there are no more
 * data on the socket.
 *
 * The sk_has_sleeper is always called right after a call to read_lock, so we
 * can use smp_mb__after_lock barrier.
 */
static inline int sk_has_sleeper(struct sock *sk)
static inline bool wq_has_sleeper(struct socket_wq *wq)
{

	/*
	 * We need to be sure we are in sync with the
	 * add_wait_queue modifications to the wait queue.
	 *
	 * This memory barrier is paired in the sock_poll_wait.
	 */
	smp_mb__after_lock();
	return sk_sleep(sk) && waitqueue_active(sk_sleep(sk));
	smp_mb();
	return wq && waitqueue_active(&wq->wait);
}

/**
@@ -1442,7 +1442,7 @@ static inline int sk_has_sleeper(struct sock *sk)
 * @wait_address:   socket wait queue
 * @p:              poll_table
 *
 * See the comments in the sk_has_sleeper function.
 * See the comments in the wq_has_sleeper function.
 */
static inline void sock_poll_wait(struct file *filp,
		wait_queue_head_t *wait_address, poll_table *p)
@@ -1453,7 +1453,7 @@ static inline void sock_poll_wait(struct file *filp,
		 * We need to be sure we are in sync with the
		 * socket flags modification.
		 *
		 * This memory barrier is paired in the sk_has_sleeper.
		 * This memory barrier is paired in the wq_has_sleeper.
		*/
		smp_mb();
	}
Loading