Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6e360f73 authored by David S. Miller
Browse files

Merge branch 'udp-msg_zerocopy'

Willem de Bruijn says:

====================
udp msg_zerocopy

Enable MSG_ZEROCOPY for udp sockets

Patch 1/3 is the main patch, a rework of RFC patch
  http://patchwork.ozlabs.org/patch/899630/
  more details in the patch commit message

Patch 2/3 is an optimization to remove a branch from the UDP hot path
  and refcount_inc/refcount_dec_and_test pair when zerocopy is used.
  This used to be included in the first patch in v2.

Patch 3/3 runs the already existing udp zerocopy tests
  as part of kselftest

See also the recent Linux Plumbers presentation
  https://linuxplumbersconf.org/event/2/contributions/106/attachments/104/128/willemdebruijn-lpc2018-udpgso-presentation-20181113.pdf



Changes:
  v1 -> v2
    - Fixup reverse christmas tree violation
  v2 -> v3
    - Split refcount avoidance optimization into separate patch
      - Fix refcount leak on error in fragmented case
        (thanks to Paolo Abeni for pointing this one out!)
      - Fix refcount inc on zero
  v3 -> v4
    - Move skb_zcopy_set below the only kfree_skb that might cause
      a premature uarg destroy before skb_zerocopy_put_abort
      - Move the entire skb_shinfo assignment block, to keep that
	cacheline access in one place
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents ce01a56b db63e489
Loading
Loading
Loading
Loading
+9 −4
Original line number Diff line number Diff line
@@ -481,10 +481,11 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
}

void sock_zerocopy_put(struct ubuf_info *uarg);
void sock_zerocopy_put_abort(struct ubuf_info *uarg);
void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref);

void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);

int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
			     struct msghdr *msg, int len,
			     struct ubuf_info *uarg);
@@ -1325,9 +1326,13 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
	return is_zcopy ? skb_uarg(skb) : NULL;
}

static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg,
				 bool *have_ref)
{
	if (skb && uarg && !skb_zcopy(skb)) {
		if (unlikely(have_ref && *have_ref))
			*have_ref = false;
		else
			sock_zerocopy_get(uarg);
		skb_shinfo(skb)->destructor_arg = uarg;
		skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
@@ -1373,7 +1378,7 @@ static inline void skb_zcopy_abort(struct sk_buff *skb)
	struct ubuf_info *uarg = skb_zcopy(skb);

	if (uarg) {
		sock_zerocopy_put_abort(uarg);
		sock_zerocopy_put_abort(uarg, false);
		skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
	}
}
+11 −4
Original line number Diff line number Diff line
@@ -1089,7 +1089,7 @@ void sock_zerocopy_put(struct ubuf_info *uarg)
}
EXPORT_SYMBOL_GPL(sock_zerocopy_put);

void sock_zerocopy_put_abort(struct ubuf_info *uarg)
void sock_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
	if (uarg) {
		struct sock *sk = skb_from_uarg(uarg)->sk;
@@ -1097,6 +1097,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
		atomic_dec(&sk->sk_zckey);
		uarg->len--;

		if (have_uref)
			sock_zerocopy_put(uarg);
	}
}
@@ -1105,6 +1106,12 @@ EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort);
extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
				   struct iov_iter *from, size_t length);

/*
 * Datagram variant of the zerocopy iterator helper.
 *
 * Forwards to __zerocopy_sg_from_iter() using the skb's own socket
 * (skb->sk) and the message's iov_iter as the data source, and returns
 * that helper's result unchanged.
 */
int skb_zerocopy_iter_dgram(struct sk_buff *skb, struct msghdr *msg, int len)
{
	struct iov_iter *from = &msg->msg_iter;
	struct sock *sk = skb->sk;

	return __zerocopy_sg_from_iter(sk, skb, from, len);
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_dgram);

int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
			     struct msghdr *msg, int len,
			     struct ubuf_info *uarg)
@@ -1131,7 +1138,7 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
		return err;
	}

	skb_zcopy_set(skb, uarg);
	skb_zcopy_set(skb, uarg, NULL);
	return skb->len - orig_len;
}
EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);
@@ -1151,7 +1158,7 @@ static int skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig,
			if (skb_copy_ubufs(nskb, GFP_ATOMIC))
				return -EIO;
		}
		skb_zcopy_set(nskb, skb_uarg(orig));
		skb_zcopy_set(nskb, skb_uarg(orig), NULL);
	}
	return 0;
}
+4 −1
Original line number Diff line number Diff line
@@ -1018,7 +1018,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,

	case SO_ZEROCOPY:
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (sk->sk_protocol != IPPROTO_TCP)
			if (!((sk->sk_type == SOCK_STREAM &&
			       sk->sk_protocol == IPPROTO_TCP) ||
			      (sk->sk_type == SOCK_DGRAM &&
			       sk->sk_protocol == IPPROTO_UDP)))
				ret = -ENOTSUPP;
		} else if (sk->sk_family != PF_RDS) {
			ret = -ENOTSUPP;
+29 −8
Original line number Diff line number Diff line
@@ -867,6 +867,7 @@ static int __ip_append_data(struct sock *sk,
			    unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ubuf_info *uarg = NULL;
	struct sk_buff *skb;

	struct ip_options *opt = cork->opt;
@@ -880,8 +881,8 @@ static int __ip_append_data(struct sock *sk,
	int csummode = CHECKSUM_NONE;
	struct rtable *rt = (struct rtable *)cork->dst;
	unsigned int wmem_alloc_delta = 0;
	bool paged, extra_uref;
	u32 tskey = 0;
	bool paged;

	skb = skb_peek_tail(queue);

@@ -916,6 +917,20 @@ static int __ip_append_data(struct sock *sk,
	    (!exthdrlen || (rt->dst.dev->features & NETIF_F_HW_ESP_TX_CSUM)))
		csummode = CHECKSUM_PARTIAL;

	if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) {
		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
		if (!uarg)
			return -ENOBUFS;
		extra_uref = true;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    csummode == CHECKSUM_PARTIAL) {
			paged = true;
		} else {
			uarg->zerocopy = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);
		}
	}

	cork->length += length;

	/* So, what's going on in the loop below?
@@ -1001,12 +1016,6 @@ static int __ip_append_data(struct sock *sk,
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/* only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes.
			 */
@@ -1039,6 +1048,13 @@ static int __ip_append_data(struct sock *sk,
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/* only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

@@ -1068,7 +1084,7 @@ static int __ip_append_data(struct sock *sk,
				err = -EFAULT;
				goto error;
			}
		} else {
		} else if (!uarg || !uarg->zerocopy) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
@@ -1098,6 +1114,10 @@ static int __ip_append_data(struct sock *sk,
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
@@ -1110,6 +1130,7 @@ static int __ip_append_data(struct sock *sk,
error_efault:
	err = -EFAULT;
error:
	sock_zerocopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
+1 −1
Original line number Diff line number Diff line
@@ -1423,7 +1423,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
	if (copied + copied_syn)
		goto out;
out_err:
	sock_zerocopy_put_abort(uarg);
	sock_zerocopy_put_abort(uarg, true);
	err = sk_stream_error(sk, flags, err);
	/* make sure we wake any epoll edge trigger waiter */
	if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 &&
Loading