
Commit 402b27f9 authored by Roger Pau Monne, committed by Konrad Rzeszutek Wilk

xen-block: implement indirect descriptors



Indirect descriptors introduce a new block operation
(BLKIF_OP_INDIRECT) that passes grant references instead of segments
in the request. These grant references point to pages filled with
arrays of blkif_request_segment_aligned, which lets a single request
carry more segments.
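
For illustration, here is a minimal self-contained sketch of how such an indirect page is laid out and filled. The segment struct and the SEGS_PER_INDIRECT_FRAME/INDIRECT_PAGES macros mirror the ones added by this patch; the fill helper and its arguments are hypothetical stand-ins, not the in-tree frontend code (which lives in the collapsed xen-blkfront.c diff below):

#include <stdint.h>

#define PAGE_SIZE 4096
typedef uint32_t grant_ref_t;

/* One entry of an indirect page, 8 bytes (mirrors the struct added to blkif.h). */
struct blkif_request_segment_aligned {
	grant_ref_t gref;                  /* grant reference of the data frame   */
	uint8_t     first_sect, last_sect; /* inclusive sector range within frame */
	uint16_t    _pad;                  /* pad to 8 bytes                      */
} __attribute__((__packed__));

#define SEGS_PER_INDIRECT_FRAME \
	(PAGE_SIZE / sizeof(struct blkif_request_segment_aligned))   /* 512 */
#define INDIRECT_PAGES(segs) \
	(((segs) + SEGS_PER_INDIRECT_FRAME - 1) / SEGS_PER_INDIRECT_FRAME)

/* Hypothetical helper: describe an nseg-segment request, one full 4 KiB data
 * page per segment, by filling the already-granted indirect pages. */
static void fill_indirect_pages(struct blkif_request_segment_aligned *pages[],
				const grant_ref_t data_grefs[], unsigned int nseg)
{
	unsigned int n;

	for (n = 0; n < nseg; n++) {
		struct blkif_request_segment_aligned *seg =
			&pages[n / SEGS_PER_INDIRECT_FRAME][n % SEGS_PER_INDIRECT_FRAME];

		seg->gref       = data_grefs[n];
		seg->first_sect = 0;
		seg->last_sect  = (PAGE_SIZE >> 9) - 1;  /* sectors 0..7 of the frame */
	}
}

What goes into the request's indirect_grefs[] array are the grant references of the indirect pages themselves, not of the data pages they describe.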

The proposed implementation sets the maximum number of indirect
segments (blkif_request_segment_aligned entries held in the indirect
frames) to 256 in the backend and 32 in the frontend. The frontend
value has been chosen experimentally, and the backend value has been
set high enough that the frontend maximum can be raised later if
needed.
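
To put those numbers in perspective (assuming 4 KiB pages and the 8-byte segment entry sketched above), each indirect page holds 512 entries, so both limits fit in a single indirect page, while the wire format reserves room for 8 indirect pages per request:

/* Back-of-the-envelope arithmetic for the limits above (sketch, not kernel code). */
#define SEGS_PER_PAGE   (4096 / 8)                            /* 512 entries */
#define PAGES_FOR(segs) (((segs) + SEGS_PER_PAGE - 1) / SEGS_PER_PAGE)

_Static_assert(PAGES_FOR(32)  == 1, "frontend default: one indirect page");
_Static_assert(PAGES_FOR(256) == 1, "backend maximum: still one indirect page");
/* indirect_grefs[] has BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST == 8 slots,
 * i.e. up to 8 * 512 = 4096 segments, so the limits can grow without an
 * ABI change. */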

The migration code has changed from the previous implementation, which
simply remapped the segments on the shared ring. The maximum number of
segments allowed per request can now change depending on the backend,
so all requests on the ring and in the queue have to be requeued, and
the bios in them split if they exceed the new maximum number of
segments.
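
In other words, when the queued requests carry more segments than the new backend accepts, each one conceptually has to be broken into ceil(nseg / new_max) pieces. A toy model of that calculation (not the actual recovery code in the collapsed xen-blkfront.c diff, which splits bios):

#include <stdio.h>

/* Toy model: number of requests an old nseg-segment request becomes when
 * the new backend only accepts new_max segments per request. */
static unsigned int requests_needed(unsigned int nseg, unsigned int new_max)
{
	return (nseg + new_max - 1) / new_max;        /* ceil(nseg / new_max) */
}

int main(void)
{
	/* A 32-segment indirect request resumed against a backend that only
	 * supports the classic 11-segment requests needs 3 requests. */
	printf("%u\n", requests_needed(32, 11));
	return 0;
}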

[v2: Fixed minor comments by Konrad.]
[v1: Added padding to make the indirect request 64bit aligned.
 Added some BUGs, comments; fixed number of indirect pages in
 blkif_get_x86_{32/64}_req. Added description about the indirect operation
 in blkif.h]
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
[v3: Fixed spaces and tabs mix ups]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
parent 31552ee3
drivers/block/xen-blkback/blkback.c  +98 −36
@@ -59,7 +59,7 @@
 * IO workloads.
 */

static int xen_blkif_max_buffer_pages = 704;
static int xen_blkif_max_buffer_pages = 1024;
module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
MODULE_PARM_DESC(max_buffer_pages,
"Maximum number of free pages to keep in each block backend buffer");
@@ -75,7 +75,7 @@ MODULE_PARM_DESC(max_buffer_pages,
 * algorithm.
 */

static int xen_blkif_max_pgrants = 352;
static int xen_blkif_max_pgrants = 1056;
module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
MODULE_PARM_DESC(max_persistent_grants,
                 "Maximum number of grants to map persistently");
@@ -636,10 +636,6 @@ purge_gnt_list:
	return 0;
}

struct seg_buf {
	unsigned int offset;
	unsigned int nsec;
};
/*
 * Unmap the grant references, and also remove the M2P over-rides
 * used in the 'pending_req'.
@@ -818,29 +814,69 @@ out_of_memory:
	return -ENOMEM;
}

static int xen_blkbk_map_seg(struct blkif_request *req,
			     struct pending_req *pending_req,
static int xen_blkbk_map_seg(struct pending_req *pending_req,
			     struct seg_buf seg[],
			     struct page *pages[])
{
	int i, rc;
	grant_ref_t grefs[BLKIF_MAX_SEGMENTS_PER_REQUEST];

	for (i = 0; i < req->u.rw.nr_segments; i++)
		grefs[i] = req->u.rw.seg[i].gref;
	int rc;

	rc = xen_blkbk_map(pending_req->blkif, grefs,
	rc = xen_blkbk_map(pending_req->blkif, pending_req->grefs,
	                   pending_req->persistent_gnts,
	                   pending_req->grant_handles, pending_req->pages,
	                   req->u.rw.nr_segments,
			   pending_req->nr_pages,
	                   (pending_req->operation != BLKIF_OP_READ));
	if (rc)
		return rc;

	for (i = 0; i < req->u.rw.nr_segments; i++)
		seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
	return rc;
}

	return 0;
static int xen_blkbk_parse_indirect(struct blkif_request *req,
				    struct pending_req *pending_req,
				    struct seg_buf seg[],
				    struct phys_req *preq)
{
	struct persistent_gnt **persistent =
		pending_req->indirect_persistent_gnts;
	struct page **pages = pending_req->indirect_pages;
	struct xen_blkif *blkif = pending_req->blkif;
	int indirect_grefs, rc, n, nseg, i;
	struct blkif_request_segment_aligned *segments = NULL;

	nseg = pending_req->nr_pages;
	indirect_grefs = INDIRECT_PAGES(nseg);
	BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);

	rc = xen_blkbk_map(blkif, req->u.indirect.indirect_grefs,
			   persistent, pending_req->indirect_handles,
			   pages, indirect_grefs, true);
	if (rc)
		goto unmap;

	for (n = 0, i = 0; n < nseg; n++) {
		if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
			/* Map indirect segments */
			if (segments)
				kunmap_atomic(segments);
			segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]);
		}
		i = n % SEGS_PER_INDIRECT_FRAME;
		pending_req->grefs[n] = segments[i].gref;
		seg[n].nsec = segments[i].last_sect -
			segments[i].first_sect + 1;
		seg[n].offset = (segments[i].first_sect << 9);
		if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) ||
		    (segments[i].last_sect < segments[i].first_sect)) {
			rc = -EINVAL;
			goto unmap;
		}
		preq->nr_sects += seg[n].nsec;
	}

unmap:
	if (segments)
		kunmap_atomic(segments);
	xen_blkbk_unmap(blkif, pending_req->indirect_handles,
			pages, persistent, indirect_grefs);
	return rc;
}

static int dispatch_discard_io(struct xen_blkif *blkif,
@@ -1013,6 +1049,7 @@ __do_block_io_op(struct xen_blkif *blkif)
		case BLKIF_OP_WRITE:
		case BLKIF_OP_WRITE_BARRIER:
		case BLKIF_OP_FLUSH_DISKCACHE:
		case BLKIF_OP_INDIRECT:
			if (dispatch_rw_block_io(blkif, &req, pending_req))
				goto done;
			break;
@@ -1059,17 +1096,28 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
				struct pending_req *pending_req)
{
	struct phys_req preq;
	struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct seg_buf *seg = pending_req->seg;
	unsigned int nseg;
	struct bio *bio = NULL;
	struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct bio **biolist = pending_req->biolist;
	int i, nbio = 0;
	int operation;
	struct blk_plug plug;
	bool drain = false;
	struct page **pages = pending_req->pages;
	unsigned short req_operation;

	req_operation = req->operation == BLKIF_OP_INDIRECT ?
			req->u.indirect.indirect_op : req->operation;
	if ((req->operation == BLKIF_OP_INDIRECT) &&
	    (req_operation != BLKIF_OP_READ) &&
	    (req_operation != BLKIF_OP_WRITE)) {
		pr_debug(DRV_PFX "Invalid indirect operation (%u)\n",
			 req_operation);
		goto fail_response;
	}

	switch (req->operation) {
	switch (req_operation) {
	case BLKIF_OP_READ:
		blkif->st_rd_req++;
		operation = READ;
@@ -1091,33 +1139,47 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
	}

	/* Check that the number of segments is sane. */
	nseg = req->u.rw.nr_segments;
	nseg = req->operation == BLKIF_OP_INDIRECT ?
	       req->u.indirect.nr_segments : req->u.rw.nr_segments;

	if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
	    unlikely((req->operation != BLKIF_OP_INDIRECT) &&
		     (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
	    unlikely((req->operation == BLKIF_OP_INDIRECT) &&
		     (nseg > MAX_INDIRECT_SEGMENTS))) {
		pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
			 nseg);
		/* Haven't submitted any bio's yet. */
		goto fail_response;
	}

	preq.sector_number = req->u.rw.sector_number;
	preq.nr_sects      = 0;

	pending_req->blkif     = blkif;
	pending_req->id        = req->u.rw.id;
	pending_req->operation = req->operation;
	pending_req->operation = req_operation;
	pending_req->status    = BLKIF_RSP_OKAY;
	pending_req->nr_pages  = nseg;

	if (req->operation != BLKIF_OP_INDIRECT) {
		preq.dev               = req->u.rw.handle;
		preq.sector_number     = req->u.rw.sector_number;
		for (i = 0; i < nseg; i++) {
			pending_req->grefs[i] = req->u.rw.seg[i].gref;
			seg[i].nsec = req->u.rw.seg[i].last_sect -
				req->u.rw.seg[i].first_sect + 1;
			seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
			if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
		    (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
			    (req->u.rw.seg[i].last_sect <
			     req->u.rw.seg[i].first_sect))
				goto fail_response;
			preq.nr_sects += seg[i].nsec;

		}
	} else {
		preq.dev               = req->u.indirect.handle;
		preq.sector_number     = req->u.indirect.sector_number;
		if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
			goto fail_response;
	}

	if (xen_vbd_translate(&preq, blkif, operation) != 0) {
@@ -1154,7 +1216,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
	 * the hypercall to unmap the grants - that is all done in
	 * xen_blkbk_unmap.
	 */
	if (xen_blkbk_map_seg(req, pending_req, seg, pages))
	if (xen_blkbk_map_seg(pending_req, seg, pages))
		goto fail_flush;

	/*
@@ -1220,7 +1282,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
	                pending_req->nr_pages);
 fail_response:
	/* Haven't submitted any bio's yet. */
	make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
	make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
	free_req(blkif, pending_req);
	msleep(1); /* back off a bit */
	return -EIO;
drivers/block/xen-blkback/common.h  +93 −5
@@ -50,6 +50,19 @@
		 __func__, __LINE__, ##args)


/*
 * This is the maximum number of segments that would be allowed in indirect
 * requests. This value will also be passed to the frontend.
 */
#define MAX_INDIRECT_SEGMENTS 256

#define SEGS_PER_INDIRECT_FRAME \
	(PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
#define MAX_INDIRECT_PAGES \
	((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
#define INDIRECT_PAGES(_segs) \
	((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)

/* Not a real protocol.  Used to generate ring structs which contain
 * the elements common to all protocols only.  This way we get a
 * compiler-checkable way to use common struct elements, so we can
@@ -83,12 +96,31 @@ struct blkif_x86_32_request_other {
	uint64_t       id;           /* private guest value, echoed in resp  */
} __attribute__((__packed__));

struct blkif_x86_32_request_indirect {
	uint8_t        indirect_op;
	uint16_t       nr_segments;
	uint64_t       id;
	blkif_sector_t sector_number;
	blkif_vdev_t   handle;
	uint16_t       _pad1;
	grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
	/*
	 * The maximum number of indirect segments (and pages) that will
	 * be used is determined by MAX_INDIRECT_SEGMENTS, this value
	 * is also exported to the guest (via xenstore
	 * feature-max-indirect-segments entry), so the frontend knows how
	 * many indirect segments the backend supports.
	 */
	uint64_t       _pad2;        /* make it 64 byte aligned */
} __attribute__((__packed__));

struct blkif_x86_32_request {
	uint8_t        operation;    /* BLKIF_OP_???                         */
	union {
		struct blkif_x86_32_request_rw rw;
		struct blkif_x86_32_request_discard discard;
		struct blkif_x86_32_request_other other;
		struct blkif_x86_32_request_indirect indirect;
	} u;
} __attribute__((__packed__));

@@ -127,12 +159,32 @@ struct blkif_x86_64_request_other {
	uint64_t       id;           /* private guest value, echoed in resp  */
} __attribute__((__packed__));

struct blkif_x86_64_request_indirect {
	uint8_t        indirect_op;
	uint16_t       nr_segments;
	uint32_t       _pad1;        /* offsetof(blkif_..,u.indirect.id)==8   */
	uint64_t       id;
	blkif_sector_t sector_number;
	blkif_vdev_t   handle;
	uint16_t       _pad2;
	grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
	/*
	 * The maximum number of indirect segments (and pages) that will
	 * be used is determined by MAX_INDIRECT_SEGMENTS, this value
	 * is also exported to the guest (via xenstore
	 * feature-max-indirect-segments entry), so the frontend knows how
	 * many indirect segments the backend supports.
	 */
	uint32_t       _pad3;        /* make it 64 byte aligned */
} __attribute__((__packed__));

struct blkif_x86_64_request {
	uint8_t        operation;    /* BLKIF_OP_???                         */
	union {
		struct blkif_x86_64_request_rw rw;
		struct blkif_x86_64_request_discard discard;
		struct blkif_x86_64_request_other other;
		struct blkif_x86_64_request_indirect indirect;
	} u;
} __attribute__((__packed__));

@@ -266,6 +318,11 @@ struct xen_blkif {
	wait_queue_head_t	waiting_to_free;
};

struct seg_buf {
	unsigned long offset;
	unsigned int nsec;
};

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
@@ -280,9 +337,16 @@ struct pending_req {
	unsigned short		operation;
	int			status;
	struct list_head	free_list;
	struct page		*pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct persistent_gnt	*persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	grant_handle_t		grant_handles[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	struct page		*pages[MAX_INDIRECT_SEGMENTS];
	struct persistent_gnt	*persistent_gnts[MAX_INDIRECT_SEGMENTS];
	grant_handle_t		grant_handles[MAX_INDIRECT_SEGMENTS];
	grant_ref_t		grefs[MAX_INDIRECT_SEGMENTS];
	/* Indirect descriptors */
	struct persistent_gnt	*indirect_persistent_gnts[MAX_INDIRECT_PAGES];
	struct page		*indirect_pages[MAX_INDIRECT_PAGES];
	grant_handle_t		indirect_handles[MAX_INDIRECT_PAGES];
	struct seg_buf		seg[MAX_INDIRECT_SEGMENTS];
	struct bio		*biolist[MAX_INDIRECT_SEGMENTS];
};


@@ -321,7 +385,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
static inline void blkif_get_x86_32_req(struct blkif_request *dst,
					struct blkif_x86_32_request *src)
{
	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
	dst->operation = src->operation;
	switch (src->operation) {
	case BLKIF_OP_READ:
@@ -344,6 +408,18 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
		dst->u.discard.sector_number = src->u.discard.sector_number;
		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
		break;
	case BLKIF_OP_INDIRECT:
		dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
		dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
		dst->u.indirect.handle = src->u.indirect.handle;
		dst->u.indirect.id = src->u.indirect.id;
		dst->u.indirect.sector_number = src->u.indirect.sector_number;
		barrier();
		j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
		for (i = 0; i < j; i++)
			dst->u.indirect.indirect_grefs[i] =
				src->u.indirect.indirect_grefs[i];
		break;
	default:
		/*
		 * Don't know how to translate this op. Only get the
@@ -357,7 +433,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
static inline void blkif_get_x86_64_req(struct blkif_request *dst,
					struct blkif_x86_64_request *src)
{
	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
	dst->operation = src->operation;
	switch (src->operation) {
	case BLKIF_OP_READ:
@@ -380,6 +456,18 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
		dst->u.discard.sector_number = src->u.discard.sector_number;
		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
		break;
	case BLKIF_OP_INDIRECT:
		dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
		dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
		dst->u.indirect.handle = src->u.indirect.handle;
		dst->u.indirect.id = src->u.indirect.id;
		dst->u.indirect.sector_number = src->u.indirect.sector_number;
		barrier();
		j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
		for (i = 0; i < j; i++)
			dst->u.indirect.indirect_grefs[i] =
				src->u.indirect.indirect_grefs[i];
		break;
	default:
		/*
		 * Don't know how to translate this op. Only get the
drivers/block/xen-blkback/xenbus.c  +7 −0
@@ -107,6 +107,8 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
	struct xen_blkif *blkif;
	int i;

	BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);

	blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL);
	if (!blkif)
		return ERR_PTR(-ENOMEM);
@@ -709,6 +711,11 @@ again:
				 dev->nodename);
		goto abort;
	}
	err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u",
			    MAX_INDIRECT_SEGMENTS);
	if (err)
		dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)",
			 dev->nodename, err);

	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
			    (unsigned long long)vbd_sz(&be->blkif->vbd));
drivers/block/xen-blkfront.c  +406 −84 (diff collapsed: preview size limit exceeded)

include/xen/interface/io/blkif.h  +53 −0
@@ -102,6 +102,30 @@ typedef uint64_t blkif_sector_t;
 */
#define BLKIF_OP_DISCARD           5

/*
 * Recognized if "feature-max-indirect-segments" is present in the backend
 * xenbus info. The "feature-max-indirect-segments" node contains the maximum
 * number of segments allowed by the backend per request. If the node is
 * present, the frontend might use blkif_request_indirect structs in order to
 * issue requests with more than BLKIF_MAX_SEGMENTS_PER_REQUEST (11). The
 * maximum number of indirect segments is fixed by the backend, but the
 * frontend can issue requests with any number of indirect segments as long as
 * it's less than the number provided by the backend. The indirect_grefs field
 * in blkif_request_indirect should be filled by the frontend with the
 * grant references of the pages that are holding the indirect segments.
 * These pages are filled with an array of blkif_request_segment_aligned
 * that hold the information about the segments. The number of indirect
 * pages to use is determined by the maximum number of segments
 * an indirect request contains. Every indirect page can contain a maximum
 * of 512 segments (PAGE_SIZE/sizeof(blkif_request_segment_aligned)),
 * so to calculate the number of indirect pages to use we have to do
 * ceil(indirect_segments/512).
 *
 * If a backend does not recognize BLKIF_OP_INDIRECT, it should *not*
 * create the "feature-max-indirect-segments" node!
 */
#define BLKIF_OP_INDIRECT          6

/*
 * Maximum scatter/gather segments per request.
 * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE.
@@ -109,6 +133,16 @@ typedef uint64_t blkif_sector_t;
 */
#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11

#define BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST 8

struct blkif_request_segment_aligned {
	grant_ref_t gref;        /* reference to I/O buffer frame        */
	/* @first_sect: first sector in frame to transfer (inclusive).   */
	/* @last_sect: last sector in frame to transfer (inclusive).     */
	uint8_t     first_sect, last_sect;
	uint16_t    _pad; /* padding to make it 8 bytes, so it's cache-aligned */
} __attribute__((__packed__));

struct blkif_request_rw {
	uint8_t        nr_segments;  /* number of segments                   */
	blkif_vdev_t   handle;       /* only for read/write requests         */
@@ -147,12 +181,31 @@ struct blkif_request_other {
	uint64_t     id;           /* private guest value, echoed in resp  */
} __attribute__((__packed__));

struct blkif_request_indirect {
	uint8_t        indirect_op;
	uint16_t       nr_segments;
#ifdef CONFIG_X86_64
	uint32_t       _pad1;        /* offsetof(blkif_...,u.indirect.id) == 8 */
#endif
	uint64_t       id;
	blkif_sector_t sector_number;
	blkif_vdev_t   handle;
	uint16_t       _pad2;
	grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
#ifdef CONFIG_X86_64
	uint32_t      _pad3;         /* make it 64 byte aligned */
#else
	uint64_t      _pad3;         /* make it 64 byte aligned */
#endif
} __attribute__((__packed__));

struct blkif_request {
	uint8_t        operation;    /* BLKIF_OP_???                         */
	union {
		struct blkif_request_rw rw;
		struct blkif_request_discard discard;
		struct blkif_request_other other;
		struct blkif_request_indirect indirect;
	} u;
} __attribute__((__packed__));