Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 5d659b1d authored by David S. Miller
Browse files

Merge branch 'tcp-mmap-rework-zerocopy-receive'



Eric Dumazet says:

====================
tcp: mmap: rework zerocopy receive

syzbot reported a lockdep issue caused by tcp mmap() support.

I implemented Andy Lutomirski's nice suggestions to resolve the
issue and increase scalability as well.

First patch is adding a new getsockopt() operation and changes mmap()
behavior.

Second patch changes tcp_mmap reference program.

v4: tcp mmap() support depends on CONFIG_MMU, as kbuild bot told us.

v3: change TCP_ZEROCOPY_RECEIVE to be a getsockopt() option
    instead of setsockopt(), feedback from Ka-Cheon Poon

v2: Added a missing page align of zc->length in tcp_zerocopy_receive()
    Properly clear zc->recv_skip_hint in case user request was completed.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
parents 589f84fb aacb0c2e
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -122,6 +122,7 @@ enum {
#define TCP_MD5SIG_EXT		32	/* TCP MD5 Signature with extensions */
#define TCP_FASTOPEN_KEY	33	/* Set the key for Fast Open (cookie) */
#define TCP_FASTOPEN_NO_COOKIE	34	/* Enable TFO without a TFO cookie */
#define TCP_ZEROCOPY_RECEIVE	35

struct tcp_repair_opt {
	__u32	opt_code;
@@ -276,4 +277,11 @@ struct tcp_diag_md5sig {
	__u8	tcpm_key[TCP_MD5SIG_MAXKEYLEN];
};

/* getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */

struct tcp_zerocopy_receive {
	__u64 address;		/* in: address of mapping */
	__u32 length;		/* in/out: number of bytes to map/mapped */
	__u32 recv_skip_hint;	/* out: amount of bytes to skip */
};
#endif /* _UAPI_LINUX_TCP_H */
+2 −0
Original line number Diff line number Diff line
@@ -994,7 +994,9 @@ const struct proto_ops inet_stream_ops = {
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
#ifdef CONFIG_MMU
	.mmap		   = tcp_mmap,
#endif
	.sendpage	   = inet_sendpage,
	.splice_read	   = tcp_splice_read,
	.read_sock	   = tcp_read_sock,
+104 −90
Original line number Diff line number Diff line
@@ -1726,73 +1726,74 @@ int tcp_set_rcvlowat(struct sock *sk, int val)
}
EXPORT_SYMBOL(tcp_set_rcvlowat);

/* mmap() on a TCP socket only reserves a read-only address range;
 * no pages are populated here.  Pages are inserted later, one call
 * at a time, via getsockopt(TCP_ZEROCOPY_RECEIVE).
 *
 * Writable or executable mappings are refused, and VM_MAYWRITE /
 * VM_MAYEXEC are cleared so mprotect() cannot add those rights back.
 */
#ifdef CONFIG_MMU
static const struct vm_operations_struct tcp_vm_ops = {
};

int tcp_mmap(struct file *file, struct socket *sock,
	     struct vm_area_struct *vma)
{
	if (vma->vm_flags & (VM_WRITE | VM_EXEC))
		return -EPERM;
	vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC);

	/* Instruct vm_insert_page() to not down_read(mmap_sem) */
	vma->vm_flags |= VM_MIXEDMAP;

	vma->vm_ops = &tcp_vm_ops;
	return 0;
}
EXPORT_SYMBOL(tcp_mmap);

static int tcp_zerocopy_receive(struct sock *sk,
				struct tcp_zerocopy_receive *zc)
{
	unsigned long address = (unsigned long)zc->address;
	const skb_frag_t *frags = NULL;
	u32 length = 0, seq, offset;
	struct vm_area_struct *vma;
	struct sk_buff *skb = NULL;
	struct tcp_sock *tp;
	struct sk_buff *skb;
	int ret;

	if (vma->vm_pgoff || !nr_pages)
	if (address & (PAGE_SIZE - 1) || address != zc->address)
		return -EINVAL;

	if (vma->vm_flags & VM_WRITE)
		return -EPERM;
	/* TODO: Maybe the following is not needed if pages are COW */
	vma->vm_flags &= ~VM_MAYWRITE;

	lock_sock(sk);

	ret = -ENOTCONN;
	if (sk->sk_state == TCP_LISTEN)
		goto out;
		return -ENOTCONN;

	sock_rps_record_flow(sk);

	if (tcp_inq(sk) < size) {
		ret = sock_flag(sk, SOCK_DONE) ? -EIO : -EAGAIN;
	down_read(&current->mm->mmap_sem);

	ret = -EINVAL;
	vma = find_vma(current->mm, address);
	if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops)
		goto out;
	}
	zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);

	tp = tcp_sk(sk);
	seq = tp->copied_seq;
	/* Abort if urgent data is in the area */
	if (unlikely(tp->urg_data)) {
		u32 urg_offset = tp->urg_seq - seq;
	zc->length = min_t(u32, zc->length, tcp_inq(sk));
	zc->length &= ~(PAGE_SIZE - 1);

		ret = -EINVAL;
		if (urg_offset < size)
			goto out;
	}
	ret = -ENOMEM;
	pages_array = kvmalloc_array(nr_pages, sizeof(struct page *),
				     GFP_KERNEL);
	if (!pages_array)
		goto out;
	zap_page_range(vma, address, zc->length);

	zc->recv_skip_hint = 0;
	ret = 0;
	while (length + PAGE_SIZE <= zc->length) {
		if (zc->recv_skip_hint < PAGE_SIZE) {
			if (skb) {
				skb = skb->next;
				offset = seq - TCP_SKB_CB(skb)->seq;
			} else {
				skb = tcp_recv_skb(sk, seq, &offset);
	ret = -EINVAL;
skb_start:
	/* We do not support anything not in page frags */
			}

			zc->recv_skip_hint = skb->len - offset;
			offset -= skb_headlen(skb);
	if ((int)offset < 0)
		goto out;
	if (skb_has_frag_list(skb))
		goto out;
	len = skb->data_len - offset;
			if ((int)offset < 0 || skb_has_frag_list(skb))
				break;
			frags = skb_shinfo(skb)->frags;
			while (offset) {
				if (frags->size > offset)
@@ -1800,44 +1801,38 @@ int tcp_mmap(struct file *file, struct socket *sock,
				offset -= frags->size;
				frags++;
			}
	while (nr < nr_pages) {
		if (len) {
			if (len < PAGE_SIZE)
				goto out;
			if (frags->size != PAGE_SIZE || frags->page_offset)
				goto out;
			pages_array[nr++] = skb_frag_page(frags);
			frags++;
			len -= PAGE_SIZE;
			seq += PAGE_SIZE;
			continue;
		}
		skb = skb->next;
		offset = seq - TCP_SKB_CB(skb)->seq;
		goto skb_start;
	}
	/* OK, we have a full set of pages ready to be inserted into vma */
	for (nr = 0; nr < nr_pages; nr++) {
		ret = vm_insert_page(vma, vma->vm_start + (nr << PAGE_SHIFT),
				     pages_array[nr]);
		if (frags->size != PAGE_SIZE || frags->page_offset)
			break;
		ret = vm_insert_page(vma, address + length,
				     skb_frag_page(frags));
		if (ret)
			goto out;
			break;
		length += PAGE_SIZE;
		seq += PAGE_SIZE;
		zc->recv_skip_hint -= PAGE_SIZE;
		frags++;
	}
	/* operation is complete, we can 'consume' all skbs */
out:
	up_read(&current->mm->mmap_sem);
	if (length) {
		tp->copied_seq = seq;
		tcp_rcv_space_adjust(sk);

		/* Clean up data we have read: This will do ACK frames. */
		tcp_recv_skb(sk, seq, &offset);
	tcp_cleanup_rbuf(sk, size);

		tcp_cleanup_rbuf(sk, length);
		ret = 0;
out:
	release_sock(sk);
	kvfree(pages_array);
		if (length == zc->length)
			zc->recv_skip_hint = 0;
	} else {
		if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE))
			ret = -EIO;
	}
	zc->length = length;
	return ret;
}
EXPORT_SYMBOL(tcp_mmap);
#endif

static void tcp_update_recv_tstamps(struct sk_buff *skb,
				    struct scm_timestamping *tss)
@@ -3472,6 +3467,25 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
		}
		return 0;
	}
#ifdef CONFIG_MMU
	/* Zerocopy receive is driven through getsockopt(): userspace
	 * passes a struct tcp_zerocopy_receive in optval and reads the
	 * updated length / recv_skip_hint back from the same buffer.
	 */
	case TCP_ZEROCOPY_RECEIVE: {
		struct tcp_zerocopy_receive zc;
		int err;

		if (get_user(len, optlen))
			return -EFAULT;
		/* Require an exact size match to avoid ABI ambiguity */
		if (len != sizeof(zc))
			return -EINVAL;
		if (copy_from_user(&zc, optval, len))
			return -EFAULT;
		lock_sock(sk);
		err = tcp_zerocopy_receive(sk, &zc);
		release_sock(sk);
		/* On success, report mapped length and skip hint back */
		if (!err && copy_to_user(optval, &zc, len))
			err = -EFAULT;
		return err;
	}
#endif
	default:
		return -ENOPROTOOPT;
	}
+2 −0
Original line number Diff line number Diff line
@@ -578,7 +578,9 @@ const struct proto_ops inet6_stream_ops = {
	.getsockopt	   = sock_common_getsockopt,	/* ok		*/
	.sendmsg	   = inet_sendmsg,		/* ok		*/
	.recvmsg	   = inet_recvmsg,		/* ok		*/
#ifdef CONFIG_MMU
	.mmap		   = tcp_mmap,
#endif
	.sendpage	   = inet_sendpage,
	.sendmsg_locked    = tcp_sendmsg_locked,
	.sendpage_locked   = tcp_sendpage_locked,
+38 −28
Original line number Diff line number Diff line
@@ -76,9 +76,10 @@
#include <time.h>
#include <sys/time.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <arpa/inet.h>
#include <poll.h>
#include <linux/tcp.h>
#include <assert.h>

#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY    0x4000000
@@ -134,11 +135,12 @@ void hash_zone(void *zone, unsigned int length)
void *child_thread(void *arg)
{
	unsigned long total_mmap = 0, total = 0;
	struct tcp_zerocopy_receive zc;
	unsigned long delta_usec;
	int flags = MAP_SHARED;
	struct timeval t0, t1;
	char *buffer = NULL;
	void *oaddr = NULL;
	void *addr = NULL;
	double throughput;
	struct rusage ru;
	int lu, fd;
@@ -153,41 +155,46 @@ void *child_thread(void *arg)
		perror("malloc");
		goto error;
	}
	if (zflg) {
		addr = mmap(NULL, chunk_size, PROT_READ, flags, fd, 0);
		if (addr == (void *)-1)
			zflg = 0;
	}
	while (1) {
		struct pollfd pfd = { .fd = fd, .events = POLLIN, };
		int sub;

		poll(&pfd, 1, 10000);
		if (zflg) {
			void *naddr;

			naddr = mmap(oaddr, chunk_size, PROT_READ, flags, fd, 0);
			if (naddr == (void *)-1) {
				if (errno == EAGAIN) {
					/* That is if SO_RCVLOWAT is buggy */
					usleep(1000);
					continue;
				}
				if (errno == EINVAL) {
					flags = MAP_SHARED;
					oaddr = NULL;
					goto fallback;
				}
				if (errno != EIO)
					perror("mmap()");
			socklen_t zc_len = sizeof(zc);
			int res;

			zc.address = (__u64)addr;
			zc.length = chunk_size;
			zc.recv_skip_hint = 0;
			res = getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
					 &zc, &zc_len);
			if (res == -1)
				break;

			if (zc.length) {
				assert(zc.length <= chunk_size);
				total_mmap += zc.length;
				if (xflg)
					hash_zone(addr, zc.length);
				total += zc.length;
			}
			total_mmap += chunk_size;
			if (zc.recv_skip_hint) {
				assert(zc.recv_skip_hint <= chunk_size);
				lu = read(fd, buffer, zc.recv_skip_hint);
				if (lu > 0) {
					if (xflg)
				hash_zone(naddr, chunk_size);
			total += chunk_size;
			if (!keepflag) {
				flags |= MAP_FIXED;
				oaddr = naddr;
						hash_zone(buffer, lu);
					total += lu;
				}
			}
			continue;
		}
fallback:
		sub = 0;
		while (sub < chunk_size) {
			lu = read(fd, buffer + sub, chunk_size - sub);
@@ -228,6 +235,8 @@ void *child_thread(void *arg)
error:
	free(buffer);
	close(fd);
	if (zflg)
		munmap(addr, chunk_size);
	pthread_exit(0);
}

@@ -371,7 +380,8 @@ int main(int argc, char *argv[])
		setup_sockaddr(cfg_family, host, &listenaddr);

		if (mss &&
		    setsockopt(fdlisten, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
		    setsockopt(fdlisten, IPPROTO_TCP, TCP_MAXSEG,
			       &mss, sizeof(mss)) == -1) {
			perror("setsockopt TCP_MAXSEG");
			exit(1);
		}
@@ -402,7 +412,7 @@ int main(int argc, char *argv[])
	setup_sockaddr(cfg_family, host, &addr);

	if (mss &&
	    setsockopt(fd, SOL_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
	    setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, sizeof(mss)) == -1) {
		perror("setsockopt TCP_MAXSEG");
		exit(1);
	}