Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 241699cd authored by Al Viro's avatar Al Viro
Browse files

new iov_iter flavour: pipe-backed



iov_iter variant for passing data into pipe.  copy_to_iter()
copies data into page(s) it has allocated and stuffs them into
the pipe; copy_page_to_iter() stuffs there a reference to the
page given to it.  Both will try to coalesce if possible.
iov_iter_zero() is similar to copy_to_iter(); iov_iter_get_pages()
and friends will do as copy_to_iter() would have and return the
pages where the data would've been copied.  iov_iter_advance()
will truncate everything past the spot it has advanced to.

New primitive: iov_iter_pipe(), used for initializing those.
pipe should be locked all along.

Running out of space acts as fault would for iovec-backed ones;
in other words, giving it to ->read_iter() may result in short
read if the pipe overflows, or -EFAULT if it happens with nothing
copied there.

In other words, ->read_iter() on those acts pretty much like
->splice_read().  Moreover, all generic_file_splice_read() users,
as well as many other ->splice_read() instances can be switched
to that scheme - that'll happen in the next commit.

Signed-off-by: default avatarAl Viro <viro@zeniv.linux.org.uk>
parent d82718e3
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -524,7 +524,7 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
}
EXPORT_SYMBOL(generic_file_splice_read);

static const struct pipe_buf_operations default_pipe_buf_ops = {
const struct pipe_buf_operations default_pipe_buf_ops = {
	.can_merge = 0,
	.confirm = generic_pipe_buf_confirm,
	.release = generic_pipe_buf_release,
+1 −0
Original line number Diff line number Diff line
@@ -85,4 +85,5 @@ extern void splice_shrink_spd(struct splice_pipe_desc *);
extern void spd_release_page(struct splice_pipe_desc *, unsigned int);

extern const struct pipe_buf_operations page_cache_pipe_buf_ops;
extern const struct pipe_buf_operations default_pipe_buf_ops;
#endif
+11 −3
Original line number Diff line number Diff line
@@ -13,6 +13,7 @@
#include <uapi/linux/uio.h>

struct page;
struct pipe_inode_info;

struct kvec {
	void *iov_base; /* and that should *never* hold a userland pointer */
@@ -23,6 +24,7 @@ enum {
	ITER_IOVEC = 0,
	ITER_KVEC = 2,
	ITER_BVEC = 4,
	ITER_PIPE = 8,
};

struct iov_iter {
@@ -33,8 +35,12 @@ struct iov_iter {
		const struct iovec *iov;
		const struct kvec *kvec;
		const struct bio_vec *bvec;
		struct pipe_inode_info *pipe;
	};
	union {
		unsigned long nr_segs;
		int idx;
	};
};

/*
@@ -64,7 +70,7 @@ static inline struct iovec iov_iter_iovec(const struct iov_iter *iter)
}

#define iov_for_each(iov, iter, start)				\
	if (!((start).type & ITER_BVEC))			\
	if (!((start).type & (ITER_BVEC | ITER_PIPE)))		\
	for (iter = (start);					\
	     (iter).count &&					\
	     ((iov = iov_iter_iovec(&(iter))), 1);		\
@@ -94,6 +100,8 @@ void iov_iter_kvec(struct iov_iter *i, int direction, const struct kvec *kvec,
			unsigned long nr_segs, size_t count);
void iov_iter_bvec(struct iov_iter *i, int direction, const struct bio_vec *bvec,
			unsigned long nr_segs, size_t count);
void iov_iter_pipe(struct iov_iter *i, int direction, struct pipe_inode_info *pipe,
			size_t count);
ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
			size_t maxsize, unsigned maxpages, size_t *start);
ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
@@ -109,7 +117,7 @@ static inline size_t iov_iter_count(struct iov_iter *i)

static inline bool iter_is_iovec(struct iov_iter *i)
{
	return !(i->type & (ITER_BVEC | ITER_KVEC));
	return !(i->type & (ITER_BVEC | ITER_KVEC | ITER_PIPE));
}

/*
+395 −2
Original line number Diff line number Diff line
@@ -3,8 +3,11 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <net/checksum.h>

#define PIPE_PARANOIA /* for now */

#define iterate_iovec(i, n, __v, __p, skip, STEP) {	\
	size_t left;					\
	size_t wanted = n;				\
@@ -290,6 +293,93 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
	return wanted - bytes;
}

#ifdef PIPE_PARANOIA
static bool sanity(const struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	int idx = i->idx;
	int next = pipe->curbuf + pipe->nrbufs;
	if (i->iov_offset) {
		struct pipe_buffer *p;
		if (unlikely(!pipe->nrbufs))
			goto Bad;	// pipe must be non-empty
		if (unlikely(idx != ((next - 1) & (pipe->buffers - 1))))
			goto Bad;	// must be at the last buffer...

		p = &pipe->bufs[idx];
		if (unlikely(p->offset + p->len != i->iov_offset))
			goto Bad;	// ... at the end of segment
	} else {
		if (idx != (next & (pipe->buffers - 1)))
			goto Bad;	// must be right after the last buffer
	}
	return true;
Bad:
	printk(KERN_ERR "idx = %d, offset = %zd\n", i->idx, i->iov_offset);
	printk(KERN_ERR "curbuf = %d, nrbufs = %d, buffers = %d\n",
			pipe->curbuf, pipe->nrbufs, pipe->buffers);
	for (idx = 0; idx < pipe->buffers; idx++)
		printk(KERN_ERR "[%p %p %d %d]\n",
			pipe->bufs[idx].ops,
			pipe->bufs[idx].page,
			pipe->bufs[idx].offset,
			pipe->bufs[idx].len);
	WARN_ON(1);
	return false;
}
#else
#define sanity(i) true
#endif

static inline int next_idx(int idx, struct pipe_inode_info *pipe)
{
	return (idx + 1) & (pipe->buffers - 1);
}

static size_t copy_page_to_iter_pipe(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	struct pipe_buffer *buf;
	size_t off;
	int idx;

	if (unlikely(bytes > i->count))
		bytes = i->count;

	if (unlikely(!bytes))
		return 0;

	if (!sanity(i))
		return 0;

	off = i->iov_offset;
	idx = i->idx;
	buf = &pipe->bufs[idx];
	if (off) {
		if (offset == off && buf->page == page) {
			/* merge with the last one */
			buf->len += bytes;
			i->iov_offset += bytes;
			goto out;
		}
		idx = next_idx(idx, pipe);
		buf = &pipe->bufs[idx];
	}
	if (idx == pipe->curbuf && pipe->nrbufs)
		return 0;
	pipe->nrbufs++;
	buf->ops = &page_cache_pipe_buf_ops;
	get_page(buf->page = page);
	buf->offset = offset;
	buf->len = bytes;
	i->iov_offset = offset + bytes;
	i->idx = idx;
out:
	i->count -= bytes;
	return bytes;
}

/*
 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 * bytes.  For each iovec, fault in each page that constitutes the iovec.
@@ -356,9 +446,98 @@ static void memzero_page(struct page *page, size_t offset, size_t len)
	kunmap_atomic(addr);
}

static inline bool allocated(struct pipe_buffer *buf)
{
	return buf->ops == &default_pipe_buf_ops;
}

static inline void data_start(const struct iov_iter *i, int *idxp, size_t *offp)
{
	size_t off = i->iov_offset;
	int idx = i->idx;
	if (off && (!allocated(&i->pipe->bufs[idx]) || off == PAGE_SIZE)) {
		idx = next_idx(idx, i->pipe);
		off = 0;
	}
	*idxp = idx;
	*offp = off;
}

static size_t push_pipe(struct iov_iter *i, size_t size,
			int *idxp, size_t *offp)
{
	struct pipe_inode_info *pipe = i->pipe;
	size_t off;
	int idx;
	ssize_t left;

	if (unlikely(size > i->count))
		size = i->count;
	if (unlikely(!size))
		return 0;

	left = size;
	data_start(i, &idx, &off);
	*idxp = idx;
	*offp = off;
	if (off) {
		left -= PAGE_SIZE - off;
		if (left <= 0) {
			pipe->bufs[idx].len += size;
			return size;
		}
		pipe->bufs[idx].len = PAGE_SIZE;
		idx = next_idx(idx, pipe);
	}
	while (idx != pipe->curbuf || !pipe->nrbufs) {
		struct page *page = alloc_page(GFP_USER);
		if (!page)
			break;
		pipe->nrbufs++;
		pipe->bufs[idx].ops = &default_pipe_buf_ops;
		pipe->bufs[idx].page = page;
		pipe->bufs[idx].offset = 0;
		if (left <= PAGE_SIZE) {
			pipe->bufs[idx].len = left;
			return size;
		}
		pipe->bufs[idx].len = PAGE_SIZE;
		left -= PAGE_SIZE;
		idx = next_idx(idx, pipe);
	}
	return size - left;
}

static size_t copy_pipe_to_iter(const void *addr, size_t bytes,
				struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	size_t n, off;
	int idx;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &idx, &off);
	if (unlikely(!n))
		return 0;
	for ( ; n; idx = next_idx(idx, pipe), off = 0) {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		memcpy_to_page(pipe->bufs[idx].page, off, addr, chunk);
		i->idx = idx;
		i->iov_offset = off + chunk;
		n -= chunk;
		addr += chunk;
	}
	i->count -= bytes;
	return bytes;
}

size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
{
	const char *from = addr;
	if (unlikely(i->type & ITER_PIPE))
		return copy_pipe_to_iter(addr, bytes, i);
	iterate_and_advance(i, bytes, v,
		__copy_to_user(v.iov_base, (from += v.iov_len) - v.iov_len,
			       v.iov_len),
@@ -374,6 +553,10 @@ EXPORT_SYMBOL(copy_to_iter);
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
	char *to = addr;
	if (unlikely(i->type & ITER_PIPE)) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, v,
		__copy_from_user((to += v.iov_len) - v.iov_len, v.iov_base,
				 v.iov_len),
@@ -389,6 +572,10 @@ EXPORT_SYMBOL(copy_from_iter);
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
{
	char *to = addr;
	if (unlikely(i->type & ITER_PIPE)) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, v,
		__copy_from_user_nocache((to += v.iov_len) - v.iov_len,
					 v.iov_base, v.iov_len),
@@ -409,14 +596,20 @@ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
		size_t wanted = copy_to_iter(kaddr + offset, bytes, i);
		kunmap_atomic(kaddr);
		return wanted;
	} else
	} else if (likely(!(i->type & ITER_PIPE)))
		return copy_page_to_iter_iovec(page, offset, bytes, i);
	else
		return copy_page_to_iter_pipe(page, offset, bytes, i);
}
EXPORT_SYMBOL(copy_page_to_iter);

size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	if (unlikely(i->type & ITER_PIPE)) {
		WARN_ON(1);
		return 0;
	}
	if (i->type & (ITER_BVEC|ITER_KVEC)) {
		void *kaddr = kmap_atomic(page);
		size_t wanted = copy_from_iter(kaddr + offset, bytes, i);
@@ -427,8 +620,34 @@ size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
}
EXPORT_SYMBOL(copy_page_from_iter);

static size_t pipe_zero(size_t bytes, struct iov_iter *i)
{
	struct pipe_inode_info *pipe = i->pipe;
	size_t n, off;
	int idx;

	if (!sanity(i))
		return 0;

	bytes = n = push_pipe(i, bytes, &idx, &off);
	if (unlikely(!n))
		return 0;

	for ( ; n; idx = next_idx(idx, pipe), off = 0) {
		size_t chunk = min_t(size_t, n, PAGE_SIZE - off);
		memzero_page(pipe->bufs[idx].page, off, chunk);
		i->idx = idx;
		i->iov_offset = off + chunk;
		n -= chunk;
	}
	i->count -= bytes;
	return bytes;
}

size_t iov_iter_zero(size_t bytes, struct iov_iter *i)
{
	if (unlikely(i->type & ITER_PIPE))
		return pipe_zero(bytes, i);
	iterate_and_advance(i, bytes, v,
		__clear_user(v.iov_base, v.iov_len),
		memzero_page(v.bv_page, v.bv_offset, v.bv_len),
@@ -443,6 +662,11 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
		struct iov_iter *i, unsigned long offset, size_t bytes)
{
	char *kaddr = kmap_atomic(page), *p = kaddr + offset;
	if (unlikely(i->type & ITER_PIPE)) {
		kunmap_atomic(kaddr);
		WARN_ON(1);
		return 0;
	}
	iterate_all_kinds(i, bytes, v,
		__copy_from_user_inatomic((p += v.iov_len) - v.iov_len,
					  v.iov_base, v.iov_len),
@@ -455,8 +679,51 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
}
EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);

static void pipe_advance(struct iov_iter *i, size_t size)
{
	struct pipe_inode_info *pipe = i->pipe;
	struct pipe_buffer *buf;
	int idx = i->idx;
	size_t off = i->iov_offset;
	
	if (unlikely(i->count < size))
		size = i->count;

	if (size) {
		if (off) /* make it relative to the beginning of buffer */
			size += off - pipe->bufs[idx].offset;
		while (1) {
			buf = &pipe->bufs[idx];
			if (size <= buf->len)
				break;
			size -= buf->len;
			idx = next_idx(idx, pipe);
		}
		buf->len = size;
		i->idx = idx;
		off = i->iov_offset = buf->offset + size;
	}
	if (off)
		idx = next_idx(idx, pipe);
	if (pipe->nrbufs) {
		int unused = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
		/* [curbuf,unused) is in use.  Free [idx,unused) */
		while (idx != unused) {
			buf = &pipe->bufs[idx];
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
			idx = next_idx(idx, pipe);
			pipe->nrbufs--;
		}
	}
}

void iov_iter_advance(struct iov_iter *i, size_t size)
{
	if (unlikely(i->type & ITER_PIPE)) {
		pipe_advance(i, size);
		return;
	}
	iterate_and_advance(i, size, v, 0, 0, 0)
}
EXPORT_SYMBOL(iov_iter_advance);
@@ -466,6 +733,8 @@ EXPORT_SYMBOL(iov_iter_advance);
 */
size_t iov_iter_single_seg_count(const struct iov_iter *i)
{
	if (unlikely(i->type & ITER_PIPE))
		return i->count;	// it is a silly place, anyway
	if (i->nr_segs == 1)
		return i->count;
	else if (i->type & ITER_BVEC)
@@ -501,6 +770,19 @@ void iov_iter_bvec(struct iov_iter *i, int direction,
}
EXPORT_SYMBOL(iov_iter_bvec);

void iov_iter_pipe(struct iov_iter *i, int direction,
			struct pipe_inode_info *pipe,
			size_t count)
{
	BUG_ON(direction != ITER_PIPE);
	i->type = direction;
	i->pipe = pipe;
	i->idx = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
	i->iov_offset = 0;
	i->count = count;
}
EXPORT_SYMBOL(iov_iter_pipe);

unsigned long iov_iter_alignment(const struct iov_iter *i)
{
	unsigned long res = 0;
@@ -509,6 +791,11 @@ unsigned long iov_iter_alignment(const struct iov_iter *i)
	if (!size)
		return 0;

	if (unlikely(i->type & ITER_PIPE)) {
		if (i->iov_offset && allocated(&i->pipe->bufs[i->idx]))
			return size | i->iov_offset;
		return size;
	}
	iterate_all_kinds(i, size, v,
		(res |= (unsigned long)v.iov_base | v.iov_len, 0),
		res |= v.bv_offset | v.bv_len,
@@ -525,6 +812,11 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
	if (!size)
		return 0;

	if (unlikely(i->type & ITER_PIPE)) {
		WARN_ON(1);
		return ~0U;
	}

	iterate_all_kinds(i, size, v,
		(res |= (!res ? 0 : (unsigned long)v.iov_base) |
			(size != v.iov_len ? size : 0), 0),
@@ -537,6 +829,47 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i)
}
EXPORT_SYMBOL(iov_iter_gap_alignment);

static inline size_t __pipe_get_pages(struct iov_iter *i,
				size_t maxsize,
				struct page **pages,
				int idx,
				size_t *start)
{
	struct pipe_inode_info *pipe = i->pipe;
	size_t n = push_pipe(i, maxsize, &idx, start);
	if (!n)
		return -EFAULT;

	maxsize = n;
	n += *start;
	while (n >= PAGE_SIZE) {
		get_page(*pages++ = pipe->bufs[idx].page);
		idx = next_idx(idx, pipe);
		n -= PAGE_SIZE;
	}

	return maxsize;
}

static ssize_t pipe_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
{
	unsigned npages;
	size_t capacity;
	int idx;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &idx, start);
	/* some of this one + all after this one */
	npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1;
	capacity = min(npages,maxpages) * PAGE_SIZE - *start;

	return __pipe_get_pages(i, min(maxsize, capacity), pages, idx, start);
}

ssize_t iov_iter_get_pages(struct iov_iter *i,
		   struct page **pages, size_t maxsize, unsigned maxpages,
		   size_t *start)
@@ -547,6 +880,8 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
	if (!maxsize)
		return 0;

	if (unlikely(i->type & ITER_PIPE))
		return pipe_get_pages(i, pages, maxsize, maxpages, start);
	iterate_all_kinds(i, maxsize, v, ({
		unsigned long addr = (unsigned long)v.iov_base;
		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
@@ -582,6 +917,37 @@ static struct page **get_pages_array(size_t n)
	return p;
}

static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
{
	struct page **p;
	size_t n;
	int idx;
	int npages;

	if (!sanity(i))
		return -EFAULT;

	data_start(i, &idx, start);
	/* some of this one + all after this one */
	npages = ((i->pipe->curbuf - idx - 1) & (i->pipe->buffers - 1)) + 1;
	n = npages * PAGE_SIZE - *start;
	if (maxsize > n)
		maxsize = n;
	else
		npages = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
	p = get_pages_array(npages);
	if (!p)
		return -ENOMEM;
	n = __pipe_get_pages(i, maxsize, p, idx, start);
	if (n > 0)
		*pages = p;
	else
		kvfree(p);
	return n;
}

ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
		   struct page ***pages, size_t maxsize,
		   size_t *start)
@@ -594,6 +960,8 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
	if (!maxsize)
		return 0;

	if (unlikely(i->type & ITER_PIPE))
		return pipe_get_pages_alloc(i, pages, maxsize, start);
	iterate_all_kinds(i, maxsize, v, ({
		unsigned long addr = (unsigned long)v.iov_base;
		size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
@@ -635,6 +1003,10 @@ size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
	__wsum sum, next;
	size_t off = 0;
	sum = *csum;
	if (unlikely(i->type & ITER_PIPE)) {
		WARN_ON(1);
		return 0;
	}
	iterate_and_advance(i, bytes, v, ({
		int err = 0;
		next = csum_and_copy_from_user(v.iov_base, 
@@ -673,6 +1045,10 @@ size_t csum_and_copy_to_iter(const void *addr, size_t bytes, __wsum *csum,
	__wsum sum, next;
	size_t off = 0;
	sum = *csum;
	if (unlikely(i->type & ITER_PIPE)) {
		WARN_ON(1);	/* for now */
		return 0;
	}
	iterate_and_advance(i, bytes, v, ({
		int err = 0;
		next = csum_and_copy_to_user((from += v.iov_len) - v.iov_len,
@@ -712,7 +1088,20 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages)
	if (!size)
		return 0;

	iterate_all_kinds(i, size, v, ({
	if (unlikely(i->type & ITER_PIPE)) {
		struct pipe_inode_info *pipe = i->pipe;
		size_t off;
		int idx;

		if (!sanity(i))
			return 0;

		data_start(i, &idx, &off);
		/* some of this one + all after this one */
		npages = ((pipe->curbuf - idx - 1) & (pipe->buffers - 1)) + 1;
		if (npages >= maxpages)
			return maxpages;
	} else iterate_all_kinds(i, size, v, ({
		unsigned long p = (unsigned long)v.iov_base;
		npages += DIV_ROUND_UP(p + v.iov_len, PAGE_SIZE)
			- p / PAGE_SIZE;
@@ -737,6 +1126,10 @@ EXPORT_SYMBOL(iov_iter_npages);
const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags)
{
	*new = *old;
	if (unlikely(new->type & ITER_PIPE)) {
		WARN_ON(1);
		return NULL;
	}
	if (new->type & ITER_BVEC)
		return new->bvec = kmemdup(new->bvec,
				    new->nr_segs * sizeof(struct bio_vec),