Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 80c6d2b8 authored by David S. Miller
Browse files

Merge branch 'RDS-zerocopy-support'

Sowmini Varadhan says:

====================
RDS: zerocopy support

This is version 3 of the series, following up on review comments for
 http://patchwork.ozlabs.org/project/netdev/list/?series=28530



Review comments addressed:
Patch 4:
  - fix fragile use of skb->cb[], do not set ee_code incorrectly.
Patch 5:
  - remove needless bzero of skb->cb[], consolidate err cleanup

A brief overview of this feature follows.

This patch series provides support for MSG_ZEROCOPY
on a PF_RDS socket based on the APIs and infrastructure added
by Commit f214f915 ("tcp: enable MSG_ZEROCOPY")

For single threaded rds-stress testing using rds-tcp with the
ixgbe driver using 1M message sizes (-a 1M -q 1M) preliminary
results show that there is a significant reduction in latency: about
90 usec with zerocopy, compared with 200 usec without zerocopy.

This patchset modifies the above for zerocopy in the following manner.
- if the MSG_ZEROCOPY flag is specified with rds_sendmsg(), and,
- if the SO_ZEROCOPY  socket option has been set on the PF_RDS socket,
application pages sent down with rds_sendmsg are pinned. The pinning
uses the accounting infrastructure added by a91dbff5 ("sock: ulimit
on MSG_ZEROCOPY pages"). The message is unpinned when all references
to the message go down to 0, and the message is freed by rds_message_purge.

A multithreaded application using this infrastructure must send down
a unique 32 bit cookie as ancillary data with each sendmsg invocation.
The format of this ancillary data is described in Patch 5 of the series.
The cookie is passed up to the application on the sk_error_queue when
the message is unpinned, indicating to the application that it is now
safe to free/reuse the message buffer. The details of the completion
notification are provided in Patch 4 of this series.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents ee99b2d8 dfb8434b
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -466,6 +466,9 @@ struct ubuf_info {

#define skb_uarg(SKB)	((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))

int mm_account_pinned_pages(struct mmpin *mmp, size_t size);
void mm_unaccount_pinned_pages(struct mmpin *mmp);

struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
					struct ubuf_info *uarg);
+2 −0
Original line number Diff line number Diff line
@@ -20,11 +20,13 @@ struct sock_extended_err {
#define SO_EE_ORIGIN_ICMP6	3
#define SO_EE_ORIGIN_TXSTATUS	4
#define SO_EE_ORIGIN_ZEROCOPY	5
#define SO_EE_ORIGIN_ZCOOKIE	6
#define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS

#define SO_EE_OFFENDER(ee)	((struct sockaddr*)((ee)+1))

#define SO_EE_CODE_ZEROCOPY_COPIED	1
#define	SO_EE_ORIGIN_MAX_ZCOOKIES	8

/**
 *	struct scm_timestamping - timestamps exposed through cmsg
+1 −0
Original line number Diff line number Diff line
@@ -103,6 +103,7 @@
#define RDS_CMSG_MASKED_ATOMIC_FADD	8
#define RDS_CMSG_MASKED_ATOMIC_CSWP	9
#define RDS_CMSG_RXPATH_LATENCY		11
#define	RDS_CMSG_ZCOPY_COOKIE		12

#define RDS_INFO_FIRST			10000
#define RDS_INFO_COUNTERS		10000
+4 −2
Original line number Diff line number Diff line
@@ -890,7 +890,7 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
}
EXPORT_SYMBOL_GPL(skb_morph);

static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
	unsigned long max_pg, num_pg, new_pg, old_pg;
	struct user_struct *user;
@@ -919,14 +919,16 @@ static int mm_account_pinned_pages(struct mmpin *mmp, size_t size)

	return 0;
}
EXPORT_SYMBOL_GPL(mm_account_pinned_pages);

static void mm_unaccount_pinned_pages(struct mmpin *mmp)
void mm_unaccount_pinned_pages(struct mmpin *mmp)
{
	if (mmp->user) {
		atomic_long_sub(mmp->num_pg, &mmp->user->locked_vm);
		free_uid(mmp->user);
	}
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);

struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
{
+14 −11
Original line number Diff line number Diff line
@@ -1049,18 +1049,21 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
		break;

	case SO_ZEROCOPY:
		if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
			ret = -ENOTSUPP;
		else if (sk->sk_protocol != IPPROTO_TCP)
		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
			if (sk->sk_protocol != IPPROTO_TCP)
				ret = -ENOTSUPP;
			else if (sk->sk_state != TCP_CLOSE)
				ret = -EBUSY;
		else if (val < 0 || val > 1)
		} else if (sk->sk_family != PF_RDS) {
			ret = -ENOTSUPP;
		}
		if (!ret) {
			if (val < 0 || val > 1)
				ret = -EINVAL;
			else
				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
			break;

		}
	default:
		ret = -ENOPROTOOPT;
		break;
Loading