Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit fc218544 authored by Ilya Dryomov
Browse files

ceph: fix iov_iter issues in ceph_direct_read_write()

dio_get_pagev_size() and dio_get_pages_alloc() introduced in commit
b5b98989 ("ceph: combine as many iovec as possile into one OSD
request") assume that the passed iov_iter is ITER_IOVEC.  This isn't
the case with splice where it ends up poking into the guts of ITER_BVEC
or ITER_PIPE iterators, causing lockups and crashes easily reproduced
with generic/095.

Rather than trying to figure out gap alignment and stuff pages into
a page vector, add a helper for going from iov_iter to a bio_vec array
and make use of the new CEPH_OSD_DATA_TYPE_BVECS code.

Fixes: b5b98989 ("ceph: combine as many iovec as possile into one OSD request")
Link: http://tracker.ceph.com/issues/18130


Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Tested-by: Luis Henriques <lhenriques@suse.com>
parent 0010f705
Loading
Loading
Loading
Loading
+117 −78
Original line number Original line Diff line number Diff line
@@ -70,69 +70,104 @@ static __le32 ceph_flags_sys2wire(u32 flags)
 */
 */


/*
/*
 * Calculate the length sum of direct io vectors that can
 * How many pages to get in one call to iov_iter_get_pages().  This
 * be combined into one page vector.
 * determines the size of the on-stack array used as a buffer.
 */
 */
static size_t dio_get_pagev_size(const struct iov_iter *it)
#define ITER_GET_BVECS_PAGES	64

static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize,
				struct bio_vec *bvecs)
{
{
    const struct iovec *iov = it->iov;
	size_t size = 0;
    const struct iovec *iovend = iov + it->nr_segs;
	int bvec_idx = 0;
    size_t size;


    size = iov->iov_len - it->iov_offset;
	if (maxsize > iov_iter_count(iter))
    /*
		maxsize = iov_iter_count(iter);
     * An iov can be page vectored when both the current tail

     * and the next base are page aligned.
	while (size < maxsize) {
     */
		struct page *pages[ITER_GET_BVECS_PAGES];
    while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
		ssize_t bytes;
           (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
		size_t start;
        size += iov->iov_len;
		int idx = 0;

		bytes = iov_iter_get_pages(iter, pages, maxsize - size,
					   ITER_GET_BVECS_PAGES, &start);
		if (bytes < 0)
			return size ?: bytes;

		iov_iter_advance(iter, bytes);
		size += bytes;

		for ( ; bytes; idx++, bvec_idx++) {
			struct bio_vec bv = {
				.bv_page = pages[idx],
				.bv_len = min_t(int, bytes, PAGE_SIZE - start),
				.bv_offset = start,
			};

			bvecs[bvec_idx] = bv;
			bytes -= bv.bv_len;
			start = 0;
		}
	}
	}
    dout("dio_get_pagevlen len = %zu\n", size);

	return size;
	return size;
}
}


/*
/*
 * Allocate a page vector based on (@it, @nbytes).
 * iov_iter_get_pages() only considers one iov_iter segment, no matter
 * The return value is the tuple describing a page vector,
 * what maxsize or maxpages are given.  For ITER_BVEC that is a single
 * that is (@pages, @page_align, @num_pages).
 * page.
 *
 * Attempt to get up to @maxsize bytes worth of pages from @iter.
 * Return the number of bytes in the created bio_vec array, or an error.
 */
 */
static struct page **
static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize,
dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
				    struct bio_vec **bvecs, int *num_bvecs)
		    size_t *page_align, int *num_pages)
{
{
	struct iov_iter tmp_it = *it;
	struct bio_vec *bv;
	size_t align;
	size_t orig_count = iov_iter_count(iter);
	struct page **pages;
	ssize_t bytes;
	int ret = 0, idx, npages;
	int npages;


	align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
	iov_iter_truncate(iter, maxsize);
		(PAGE_SIZE - 1);
	npages = iov_iter_npages(iter, INT_MAX);
	npages = calc_pages_for(align, nbytes);
	iov_iter_reexpand(iter, orig_count);
	pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL);
	if (!pages)
		return ERR_PTR(-ENOMEM);


	for (idx = 0; idx < npages; ) {
	/*
		size_t start;
	 * __iter_get_bvecs() may populate only part of the array -- zero it
		ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
	 * out.
					 npages - idx, &start);
	 */
		if (ret < 0)
	bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO);
			goto fail;
	if (!bv)
		return -ENOMEM;

	bytes = __iter_get_bvecs(iter, maxsize, bv);
	if (bytes < 0) {
		/*
		 * No pages were pinned -- just free the array.
		 */
		kvfree(bv);
		return bytes;
	}


		iov_iter_advance(&tmp_it, ret);
	*bvecs = bv;
		nbytes -= ret;
	*num_bvecs = npages;
		idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
	return bytes;
}
}


	BUG_ON(nbytes != 0);
static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty)
	*num_pages = npages;
{
	*page_align = align;
	int i;
	dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);

	return pages;
	for (i = 0; i < num_bvecs; i++) {
fail:
		if (bvecs[i].bv_page) {
	ceph_put_page_vector(pages, idx, false);
			if (should_dirty)
	return ERR_PTR(ret);
				set_page_dirty_lock(bvecs[i].bv_page);
			put_page(bvecs[i].bv_page);
		}
	}
	kvfree(bvecs);
}
}


/*
/*
@@ -746,11 +781,12 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
	struct inode *inode = req->r_inode;
	struct inode *inode = req->r_inode;
	struct ceph_aio_request *aio_req = req->r_priv;
	struct ceph_aio_request *aio_req = req->r_priv;
	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
	int num_pages = calc_pages_for((u64)osd_data->alignment,
				       osd_data->length);


	dout("ceph_aio_complete_req %p rc %d bytes %llu\n",
	BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS);
	     inode, rc, osd_data->length);
	BUG_ON(!osd_data->num_bvecs);

	dout("ceph_aio_complete_req %p rc %d bytes %u\n",
	     inode, rc, osd_data->bvec_pos.iter.bi_size);


	if (rc == -EOLDSNAPC) {
	if (rc == -EOLDSNAPC) {
		struct ceph_aio_work *aio_work;
		struct ceph_aio_work *aio_work;
@@ -768,9 +804,10 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
	} else if (!aio_req->write) {
	} else if (!aio_req->write) {
		if (rc == -ENOENT)
		if (rc == -ENOENT)
			rc = 0;
			rc = 0;
		if (rc >= 0 && osd_data->length > rc) {
		if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) {
			int zoff = osd_data->alignment + rc;
			struct iov_iter i;
			int zlen = osd_data->length - rc;
			int zlen = osd_data->bvec_pos.iter.bi_size - rc;

			/*
			/*
			 * If read is satisfied by single OSD request,
			 * If read is satisfied by single OSD request,
			 * it can pass EOF. Otherwise read is within
			 * it can pass EOF. Otherwise read is within
@@ -785,13 +822,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
				aio_req->total_len = rc + zlen;
				aio_req->total_len = rc + zlen;
			}
			}


			if (zlen > 0)
			iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs,
				ceph_zero_page_vector_range(zoff, zlen,
				      osd_data->num_bvecs,
							    osd_data->pages);
				      osd_data->bvec_pos.iter.bi_size);
			iov_iter_advance(&i, rc);
			iov_iter_zero(zlen, &i);
		}
		}
	}
	}


	ceph_put_page_vector(osd_data->pages, num_pages, aio_req->should_dirty);
	put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs,
		  aio_req->should_dirty);
	ceph_osdc_put_request(req);
	ceph_osdc_put_request(req);


	if (rc < 0)
	if (rc < 0)
@@ -879,7 +919,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_vino vino;
	struct ceph_vino vino;
	struct ceph_osd_request *req;
	struct ceph_osd_request *req;
	struct page **pages;
	struct bio_vec *bvecs;
	struct ceph_aio_request *aio_req = NULL;
	struct ceph_aio_request *aio_req = NULL;
	int num_pages = 0;
	int num_pages = 0;
	int flags;
	int flags;
@@ -914,8 +954,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
	}
	}


	while (iov_iter_count(iter) > 0) {
	while (iov_iter_count(iter) > 0) {
		u64 size = dio_get_pagev_size(iter);
		u64 size = iov_iter_count(iter);
		size_t start = 0;
		ssize_t len;
		ssize_t len;


		if (write)
		if (write)
@@ -938,13 +977,14 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
			break;
			break;
		}
		}


		len = size;
		len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages);
		pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
		if (len < 0) {
		if (IS_ERR(pages)) {
			ceph_osdc_put_request(req);
			ceph_osdc_put_request(req);
			ret = PTR_ERR(pages);
			ret = len;
			break;
			break;
		}
		}
		if (len != size)
			osd_req_op_extent_update(req, 0, len);


		/*
		/*
		 * To simplify error handling, allow AIO when IO within i_size
		 * To simplify error handling, allow AIO when IO within i_size
@@ -977,8 +1017,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
			req->r_mtime = mtime;
			req->r_mtime = mtime;
		}
		}


		osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
		osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
						 false, false);


		if (aio_req) {
		if (aio_req) {
			aio_req->total_len += len;
			aio_req->total_len += len;
@@ -991,7 +1030,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
			list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
			list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);


			pos += len;
			pos += len;
			iov_iter_advance(iter, len);
			continue;
			continue;
		}
		}


@@ -1004,25 +1042,26 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
			if (ret == -ENOENT)
			if (ret == -ENOENT)
				ret = 0;
				ret = 0;
			if (ret >= 0 && ret < len && pos + ret < size) {
			if (ret >= 0 && ret < len && pos + ret < size) {
				struct iov_iter i;
				int zlen = min_t(size_t, len - ret,
				int zlen = min_t(size_t, len - ret,
						 size - pos - ret);
						 size - pos - ret);
				ceph_zero_page_vector_range(start + ret, zlen,

							    pages);
				iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages,
					      len);
				iov_iter_advance(&i, ret);
				iov_iter_zero(zlen, &i);
				ret += zlen;
				ret += zlen;
			}
			}
			if (ret >= 0)
			if (ret >= 0)
				len = ret;
				len = ret;
		}
		}


		ceph_put_page_vector(pages, num_pages, should_dirty);
		put_bvecs(bvecs, num_pages, should_dirty);

		ceph_osdc_put_request(req);
		ceph_osdc_put_request(req);
		if (ret < 0)
		if (ret < 0)
			break;
			break;


		pos += len;
		pos += len;
		iov_iter_advance(iter, len);

		if (!write && pos >= size)
		if (!write && pos >= size)
			break;
			break;