Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e370a723 authored by Eric Dumazet, committed by David S. Miller
Browse files

af_unix: improve STREAM behavior with fragmented memory



unix_stream_sendmsg() currently uses order-2 allocations,
and we had numerous reports this can fail.

The __GFP_REPEAT flag present in sock_alloc_send_pskb() is
not helping.

This patch extends the work done in commit eb6a2481
("af_unix: reduce high order page allocations") for
datagram sockets.

This opens the possibility of zero copy IO (splice() and
friends)

The trick is to not use skb_pull() anymore in recvmsg() path,
and instead add a @consumed field in UNIXCB() to track amount
of already read payload in the skb.

There is a performance regression for large sends
because of extra page allocations that will be addressed
in a follow-up patch, allowing sock_alloc_send_pskb()
to attempt high order page allocations.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 149479d0
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -35,6 +35,7 @@ struct unix_skb_parms {
#ifdef CONFIG_SECURITY_NETWORK
	u32			secid;		/* Security ID		*/
#endif
	u32			consumed;
};

/* Access the skb's cb[] area as an lvalue of type struct unix_skb_parms. */
#define UNIXCB(skb) 	(*(struct unix_skb_parms *)&((skb)->cb))
+30 −35
Original line number Diff line number Diff line
@@ -1596,6 +1596,10 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
	return err;
}

/* We use paged skbs for stream sockets, and limit the paged (fragment)
 * part of each skb to 32768 bytes, with a minimum of one full page.
 * The value is PAGE_SIZE shifted by get_order(32768), so it is never
 * smaller than PAGE_SIZE.
 */
#define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))

static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
@@ -1609,6 +1613,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
	struct scm_cookie tmp_scm;
	bool fds_sent = false;
	int max_level;
	int data_len;

	if (NULL == siocb->scm)
		siocb->scm = &tmp_scm;
@@ -1635,40 +1640,21 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
		goto pipe_err;

	while (sent < len) {
		/*
		 *	Optimisation for the fact that under 0.01% of X
		 *	messages typically need breaking up.
		 */

		size = len - sent;

		/* Keep two messages in the pipe so it schedules better */
		if (size > ((sk->sk_sndbuf >> 1) - 64))
			size = (sk->sk_sndbuf >> 1) - 64;
		size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);

		if (size > SKB_MAX_ALLOC)
			size = SKB_MAX_ALLOC;
		/* allow fallback to order-0 allocations */
		size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);

		/*
		 *	Grab a buffer
		 */

		skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT,
					  &err);
		data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));

		if (skb == NULL)
		skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
					   msg->msg_flags & MSG_DONTWAIT, &err);
		if (!skb)
			goto out_err;

		/*
		 *	If you pass two values to the sock_alloc_send_skb
		 *	it tries to grab the large buffer with GFP_NOFS
		 *	(which can fail easily), and if it fails grab the
		 *	fallback size buffer which is under a page and will
		 *	succeed. [Alan]
		 */
		size = min_t(int, size, skb_tailroom(skb));


		/* Only send the fds in the first buffer */
		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
		if (err < 0) {
@@ -1678,7 +1664,10 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
		max_level = err + 1;
		fds_sent = true;

		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
		skb_put(skb, size - data_len);
		skb->data_len = data_len;
		skb->len = size;
		err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, size);
		if (err) {
			kfree_skb(skb);
			goto out_err;
@@ -1890,6 +1879,11 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
	return timeo;
}

/* Return the number of payload bytes in @skb that have not been read yet:
 * the skb's total length minus the amount recorded in UNIXCB(skb).consumed.
 */
static unsigned int unix_skb_len(const struct sk_buff *skb)
{
	unsigned int already_read = UNIXCB(skb).consumed;

	return skb->len - already_read;
}

static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t size,
			       int flags)
@@ -1977,8 +1971,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
		}

		skip = sk_peek_offset(sk, flags);
		while (skip >= skb->len) {
			skip -= skb->len;
		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
@@ -2005,8 +1999,9 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, skb->len - skip, size);
		if (memcpy_toiovec(msg->msg_iov, skb->data + skip, chunk)) {
		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
					    msg->msg_iov, chunk)) {
			if (copied == 0)
				copied = -EFAULT;
			break;
@@ -2016,14 +2011,14 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			skb_pull(skb, chunk);
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp)
				unix_detach_fds(siocb->scm, skb);

			if (skb->len)
			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
@@ -2107,7 +2102,7 @@ long unix_inq_len(struct sock *sk)
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += skb->len;
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)