Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 52267790, authored by Willem de Bruijn, committed by David S. Miller
Browse files

sock: add MSG_ZEROCOPY



The kernel supports zerocopy sendmsg in virtio and tap. Expand the
infrastructure to support other socket types. Introduce a completion
notification channel over the socket error queue. Notifications are
returned with ee_origin SO_EE_ORIGIN_ZEROCOPY. ee_errno is 0 to avoid
blocking the send/recv path on receiving notifications.

Add reference counting, to support the skb split, merge, resize and
clone operations possible with SOCK_STREAM and other socket types.

The patch does not yet modify any datapaths.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 3ece7826
Loading
Loading
Loading
Loading
+60 −0
Original line number Diff line number Diff line
@@ -429,6 +429,7 @@ enum {
	SKBTX_SCHED_TSTAMP = 1 << 6,
};

#define SKBTX_ZEROCOPY_FRAG	(SKBTX_DEV_ZEROCOPY | SKBTX_SHARED_FRAG)
#define SKBTX_ANY_SW_TSTAMP	(SKBTX_SW_TSTAMP    | \
				 SKBTX_SCHED_TSTAMP)
#define SKBTX_ANY_TSTAMP	(SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP)
@@ -445,8 +446,28 @@ struct ubuf_info {
	void (*callback)(struct ubuf_info *, bool zerocopy_success);
	void *ctx;
	unsigned long desc;
	u16 zerocopy:1;
	atomic_t refcnt;
};

#define skb_uarg(SKB)	((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))

struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);

/* Take an additional reference on @uarg by atomically incrementing its
 * refcnt. The matching release is presumably sock_zerocopy_put(), which
 * is only declared here - confirm against its definition.
 */
static inline void sock_zerocopy_get(struct ubuf_info *uarg)
{
	atomic_inc(&uarg->refcnt);
}

void sock_zerocopy_put(struct ubuf_info *uarg);
void sock_zerocopy_put_abort(struct ubuf_info *uarg);

void sock_zerocopy_callback(struct ubuf_info *uarg, bool success);

int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
			     struct msghdr *msg, int len,
			     struct ubuf_info *uarg);

/* This data is invariant across clones and lives at
 * the end of the header data, ie. at skb->end.
 */
@@ -1214,6 +1235,45 @@ static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb)
	return &skb_shinfo(skb)->hwtstamps;
}

/* Return the ubuf_info stored in @skb's shared info when the skb is
 * flagged SKBTX_DEV_ZEROCOPY. Returns NULL for a NULL @skb or when the
 * flag is not set.
 */
static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
{
	if (!skb)
		return NULL;

	if (!(skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY))
		return NULL;

	return skb_uarg(skb);
}

/* Attach @uarg to @skb as its zerocopy state.
 *
 * Nothing happens when @skb or @uarg is NULL, or when @skb already
 * carries a zerocopy ubuf_info. Otherwise a reference is taken on @uarg,
 * it is stored in the shared info's destructor_arg, and the
 * SKBTX_ZEROCOPY_FRAG bits are set in tx_flags.
 */
static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
{
	if (!skb || !uarg || skb_zcopy(skb))
		return;

	sock_zerocopy_get(uarg);
	skb_shinfo(skb)->destructor_arg = uarg;
	skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
}

/* Drop @skb's reference on its zerocopy structure, if it has one.
 *
 * The ubuf_info's zerocopy bit stays set only if it was already set and
 * @zerocopy is true. The reference is then released with
 * sock_zerocopy_put() and the SKBTX_ZEROCOPY_FRAG bits are cleared from
 * the skb's tx_flags. No-op when skb_zcopy() finds no ubuf_info.
 */
static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy)
{
	struct ubuf_info *uarg = skb_zcopy(skb);

	if (!uarg)
		return;

	/* fold the caller's verdict into the accumulated flag */
	uarg->zerocopy &= zerocopy;
	sock_zerocopy_put(uarg);
	skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
}

/* Abort a zerocopy operation and revert zckey on error in send syscall.
 *
 * When @skb carries a zerocopy ubuf_info, hand it to
 * sock_zerocopy_put_abort() and clear the SKBTX_ZEROCOPY_FRAG bits from
 * the skb's tx_flags; otherwise do nothing.
 */
static inline void skb_zcopy_abort(struct sk_buff *skb)
{
	struct ubuf_info *uarg = skb_zcopy(skb);

	if (!uarg)
		return;

	sock_zerocopy_put_abort(uarg);
	skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG;
}

/**
 *	skb_queue_empty - check if a queue is empty
 *	@list: queue head
+1 −0
Original line number Diff line number Diff line
@@ -287,6 +287,7 @@ struct ucred {
#define MSG_BATCH	0x40000 /* sendmmsg(): more messages coming */
#define MSG_EOF         MSG_FIN

#define MSG_ZEROCOPY	0x4000000	/* Use user data in kernel path */
#define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
#define MSG_CMSG_CLOEXEC 0x40000000	/* Set close_on_exec for file
					   descriptor received through
+2 −0
Original line number Diff line number Diff line
@@ -294,6 +294,7 @@ struct sock_common {
  *	@sk_stamp: time stamp of last packet received
  *	@sk_tsflags: SO_TIMESTAMPING socket options
  *	@sk_tskey: counter to disambiguate concurrent tstamp requests
  *	@sk_zckey: counter to order MSG_ZEROCOPY notifications
  *	@sk_socket: Identd and reporting IO signals
  *	@sk_user_data: RPC layer private data
  *	@sk_frag: cached page frag
@@ -462,6 +463,7 @@ struct sock {
	u16			sk_tsflags;
	u8			sk_shutdown;
	u32			sk_tskey;
	atomic_t		sk_zckey;
	struct socket		*sk_socket;
	void			*sk_user_data;
#ifdef CONFIG_SECURITY
+3 −0
Original line number Diff line number Diff line
@@ -18,10 +18,13 @@ struct sock_extended_err {
#define SO_EE_ORIGIN_ICMP	2
#define SO_EE_ORIGIN_ICMP6	3
#define SO_EE_ORIGIN_TXSTATUS	4
#define SO_EE_ORIGIN_ZEROCOPY	5
#define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS

#define SO_EE_OFFENDER(ee)	((struct sockaddr*)((ee)+1))

#define SO_EE_CODE_ZEROCOPY_COPIED	1

/**
 *	struct scm_timestamping - timestamps exposed through cmsg
 *
+34 −21
Original line number Diff line number Diff line
@@ -573,27 +573,12 @@ int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_datagram_from_iter);

/**
 *	zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
 *	@skb: buffer to copy
 *	@from: the source to copy from
 *
 *	The function will first copy up to headlen, and then pin the userspace
 *	pages and build frags through them.
 *
 *	Returns 0, -EFAULT or -EMSGSIZE.
 */
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
			    struct iov_iter *from, size_t length)
{
	int len = iov_iter_count(from);
	int copy = min_t(int, skb_headlen(skb), len);
	int frag = 0;
	int frag = skb_shinfo(skb)->nr_frags;

	/* copy up to skb headlen */
	if (skb_copy_datagram_from_iter(skb, 0, from, copy))
		return -EFAULT;

	while (iov_iter_count(from)) {
	while (length && iov_iter_count(from)) {
		struct page *pages[MAX_SKB_FRAGS];
		size_t start;
		ssize_t copied;
@@ -603,18 +588,24 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
		if (frag == MAX_SKB_FRAGS)
			return -EMSGSIZE;

		copied = iov_iter_get_pages(from, pages, ~0U,
		copied = iov_iter_get_pages(from, pages, length,
					    MAX_SKB_FRAGS - frag, &start);
		if (copied < 0)
			return -EFAULT;

		iov_iter_advance(from, copied);
		length -= copied;

		truesize = PAGE_ALIGN(copied + start);
		skb->data_len += copied;
		skb->len += copied;
		skb->truesize += truesize;
		if (sk && sk->sk_type == SOCK_STREAM) {
			sk->sk_wmem_queued += truesize;
			sk_mem_charge(sk, truesize);
		} else {
			refcount_add(truesize, &skb->sk->sk_wmem_alloc);
		}
		while (copied) {
			int size = min_t(int, copied, PAGE_SIZE - start);
			skb_fill_page_desc(skb, frag++, pages[n], start, size);
@@ -625,6 +616,28 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
	}
	return 0;
}
EXPORT_SYMBOL(__zerocopy_sg_from_iter);

/**
 *	zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter
 *	@skb: buffer to fill
 *	@from: the source iterator to take data from
 *
 *	Copies at most skb_headlen(@skb) bytes from @from into the linear
 *	part of @skb, then hands the remainder to __zerocopy_sg_from_iter()
 *	(with no socket and no length cap) to pin the userspace pages and
 *	build frags through them.
 *
 *	Returns 0, -EFAULT or -EMSGSIZE.
 */
int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from)
{
	int head_bytes = min_t(int, skb_headlen(skb), iov_iter_count(from));
	int err;

	/* the linear area is filled by a plain copy */
	err = skb_copy_datagram_from_iter(skb, 0, from, head_bytes);
	if (err)
		return -EFAULT;

	/* everything left in the iterator becomes page frags */
	return __zerocopy_sg_from_iter(NULL, skb, from, ~0U);
}
EXPORT_SYMBOL(zerocopy_sg_from_iter);

static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
Loading