Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6a0cb1bc authored by Hannes Reinecke's avatar Hannes Reinecke Committed by Jens Axboe
Browse files

block: Implement support for zoned block devices



Implement zoned block device zone information reporting and reset.
Zone information are reported as struct blk_zone. This implementation
does not differentiate between host-aware and host-managed device
models and is valid for both. Two functions are provided:
blkdev_report_zones for discovering the zone configuration of a
zoned block device, and blkdev_reset_zones for resetting the write
pointer of sequential zones. The helper function blk_queue_zone_size
and bdev_zone_size are also provided for, as the name suggest,
obtaining the zone size (in 512B sectors) of the zones of the device.

Signed-off-by: default avatarHannes Reinecke <hare@suse.de>

[Damien: * Removed the zone cache
         * Implement report zones operation based on earlier proposal
           by Shaun Tancheff <shaun.tancheff@seagate.com>]
Signed-off-by: default avatarDamien Le Moal <damien.lemoal@hgst.com>
Reviewed-by: default avatarChristoph Hellwig <hch@lst.de>
Reviewed-by: default avatarMartin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: default avatarShaun Tancheff <shaun.tancheff@seagate.com>
Tested-by: default avatarShaun Tancheff <shaun.tancheff@seagate.com>
Signed-off-by: default avatarJens Axboe <axboe@fb.com>
parent 2d253440
Loading
Loading
Loading
Loading
+8 −0
Original line number Diff line number Diff line
@@ -89,6 +89,14 @@ config BLK_DEV_INTEGRITY
	T10/SCSI Data Integrity Field or the T13/ATA External Path
	Protection.  If in doubt, say N.

config BLK_DEV_ZONED
	bool "Zoned block device support"
	---help---
	Block layer zoned block device support. This option enables
	support for ZAC/ZBC host-managed and host-aware zoned block devices.

	Say yes here if you have a ZAC or ZBC storage device.

config BLK_DEV_THROTTLING
	bool "Block layer bio throttling support"
	depends on BLK_CGROUP=y
+1 −0
Original line number Diff line number Diff line
@@ -23,3 +23,4 @@ obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
obj-$(CONFIG_BLK_MQ_PCI)	+= blk-mq-pci.o
obj-$(CONFIG_BLK_DEV_ZONED)	+= blk-zoned.o

block/blk-zoned.c

0 → 100644
+257 −0
Original line number Diff line number Diff line
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/blkdev.h>

static inline sector_t blk_zone_start(struct request_queue *q,
				      sector_t sector)
{
	sector_t zone_mask = blk_queue_zone_size(q) - 1;

	return sector & ~zone_mask;
}

/*
 * Check that a zone report belongs to the partition.
 * If yes, fix its start sector and write pointer, copy it in the
 * zone information array and return true. Return false otherwise.
 */
static bool blkdev_report_zone(struct block_device *bdev,
			       struct blk_zone *rep,
			       struct blk_zone *zone)
{
	sector_t offset = get_start_sect(bdev);

	if (rep->start < offset)
		return false;

	rep->start -= offset;
	if (rep->start + rep->len > bdev->bd_part->nr_sects)
		return false;

	if (rep->type == BLK_ZONE_TYPE_CONVENTIONAL)
		rep->wp = rep->start + rep->len;
	else
		rep->wp -= offset;
	memcpy(zone, rep, sizeof(struct blk_zone));

	return true;
}

/**
 * blkdev_report_zones - Get zones information
 * @bdev:	Target block device
 * @sector:	Sector from which to report zones
 * @zones:	Array of zone structures where to return the zones information
 * @nr_zones:	Number of zone structures in the zone array
 * @gfp_mask:	Memory allocation flags (for bio_alloc)
 *
 * Description:
 *    Get zone information starting from the zone containing @sector.
 *    The number of zone information reported may be less than the number
 *    requested by @nr_zones. The number of zones actually reported is
 *    returned in @nr_zones.
 */
int blkdev_report_zones(struct block_device *bdev,
			sector_t sector,
			struct blk_zone *zones,
			unsigned int *nr_zones,
			gfp_t gfp_mask)
{
	struct request_queue *q = bdev_get_queue(bdev);
	struct blk_zone_report_hdr *hdr;
	unsigned int nrz = *nr_zones;
	struct page *page;
	unsigned int nr_rep;
	size_t rep_bytes;
	unsigned int nr_pages;
	struct bio *bio;
	struct bio_vec *bv;
	unsigned int i, n, nz;
	unsigned int ofst;
	void *addr;
	int ret = 0;

	if (!q)
		return -ENXIO;

	if (!blk_queue_is_zoned(q))
		return -EOPNOTSUPP;

	if (!nrz)
		return 0;

	if (sector > bdev->bd_part->nr_sects) {
		*nr_zones = 0;
		return 0;
	}

	/*
	 * The zone report has a header. So make room for it in the
	 * payload. Also make sure that the report fits in a single BIO
	 * that will not be split down the stack.
	 */
	rep_bytes = sizeof(struct blk_zone_report_hdr) +
		sizeof(struct blk_zone) * nrz;
	rep_bytes = (rep_bytes + PAGE_SIZE - 1) & PAGE_MASK;
	if (rep_bytes > (queue_max_sectors(q) << 9))
		rep_bytes = queue_max_sectors(q) << 9;

	nr_pages = min_t(unsigned int, BIO_MAX_PAGES,
			 rep_bytes >> PAGE_SHIFT);
	nr_pages = min_t(unsigned int, nr_pages,
			 queue_max_segments(q));

	bio = bio_alloc(gfp_mask, nr_pages);
	if (!bio)
		return -ENOMEM;

	bio->bi_bdev = bdev;
	bio->bi_iter.bi_sector = blk_zone_start(q, sector);
	bio_set_op_attrs(bio, REQ_OP_ZONE_REPORT, 0);

	for (i = 0; i < nr_pages; i++) {
		page = alloc_page(gfp_mask);
		if (!page) {
			ret = -ENOMEM;
			goto out;
		}
		if (!bio_add_page(bio, page, PAGE_SIZE, 0)) {
			__free_page(page);
			break;
		}
	}

	if (i == 0)
		ret = -ENOMEM;
	else
		ret = submit_bio_wait(bio);
	if (ret)
		goto out;

	/*
	 * Process the report result: skip the header and go through the
	 * reported zones to fixup and fixup the zone information for
	 * partitions. At the same time, return the zone information into
	 * the zone array.
	 */
	n = 0;
	nz = 0;
	nr_rep = 0;
	bio_for_each_segment_all(bv, bio, i) {

		if (!bv->bv_page)
			break;

		addr = kmap_atomic(bv->bv_page);

		/* Get header in the first page */
		ofst = 0;
		if (!nr_rep) {
			hdr = (struct blk_zone_report_hdr *) addr;
			nr_rep = hdr->nr_zones;
			ofst = sizeof(struct blk_zone_report_hdr);
		}

		/* Fixup and report zones */
		while (ofst < bv->bv_len &&
		       n < nr_rep && nz < nrz) {
			if (blkdev_report_zone(bdev, addr + ofst, &zones[nz]))
				nz++;
			ofst += sizeof(struct blk_zone);
			n++;
		}

		kunmap_atomic(addr);

		if (n >= nr_rep || nz >= nrz)
			break;

	}

out:
	bio_for_each_segment_all(bv, bio, i)
		__free_page(bv->bv_page);
	bio_put(bio);

	if (ret == 0)
		*nr_zones = nz;

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);

/**
 * blkdev_reset_zones - Reset zones write pointer
 * @bdev:	Target block device
 * @sector:	Start sector of the first zone to reset
 * @nr_sectors:	Number of sectors, at least the length of one zone
 * @gfp_mask:	Memory allocation flags (for bio_alloc)
 *
 * Description:
 *    Reset the write pointer of the zones contained in the range
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 */
int blkdev_reset_zones(struct block_device *bdev,
		       sector_t sector, sector_t nr_sectors,
		       gfp_t gfp_mask)
{
	struct request_queue *q = bdev_get_queue(bdev);
	sector_t zone_sectors;
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio;
	int ret;

	if (!q)
		return -ENXIO;

	if (!blk_queue_is_zoned(q))
		return -EOPNOTSUPP;

	if (end_sector > bdev->bd_part->nr_sects)
		/* Out of range */
		return -EINVAL;

	/* Check alignment (handle eventual smaller last zone) */
	zone_sectors = blk_queue_zone_size(q);
	if (sector & (zone_sectors - 1))
		return -EINVAL;

	if ((nr_sectors & (zone_sectors - 1)) &&
	    end_sector != bdev->bd_part->nr_sects)
		return -EINVAL;

	while (sector < end_sector) {

		bio = bio_alloc(gfp_mask, 0);
		bio->bi_iter.bi_sector = sector;
		bio->bi_bdev = bdev;
		bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0);

		ret = submit_bio_wait(bio);
		bio_put(bio);

		if (ret)
			return ret;

		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();

	}

	return 0;
}
EXPORT_SYMBOL_GPL(blkdev_reset_zones);
+31 −0
Original line number Diff line number Diff line
@@ -24,6 +24,7 @@
#include <linux/rcupdate.h>
#include <linux/percpu-refcount.h>
#include <linux/scatterlist.h>
#include <linux/blkzoned.h>

struct module;
struct scsi_ioctl_command;
@@ -302,6 +303,21 @@ struct queue_limits {
	enum blk_zoned_model	zoned;
};

#ifdef CONFIG_BLK_DEV_ZONED

struct blk_zone_report_hdr {
	unsigned int	nr_zones;
	u8		padding[60];
};

extern int blkdev_report_zones(struct block_device *bdev,
			       sector_t sector, struct blk_zone *zones,
			       unsigned int *nr_zones, gfp_t gfp_mask);
extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors,
			      sector_t nr_sectors, gfp_t gfp_mask);

#endif /* CONFIG_BLK_DEV_ZONED */

struct request_queue {
	/*
	 * Together with queue_head for cacheline sharing
@@ -654,6 +670,11 @@ static inline bool blk_queue_is_zoned(struct request_queue *q)
	}
}

static inline unsigned int blk_queue_zone_size(struct request_queue *q)
{
	return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0;
}

/*
 * We regard a request as sync, if either a read or a sync write
 */
@@ -1401,6 +1422,16 @@ static inline bool bdev_is_zoned(struct block_device *bdev)
	return false;
}

static inline unsigned int bdev_zone_size(struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);

	if (q)
		return blk_queue_zone_size(q);

	return 0;
}

static inline int queue_dma_alignment(struct request_queue *q)
{
	return q ? q->dma_alignment : 511;
+1 −0
Original line number Diff line number Diff line
@@ -70,6 +70,7 @@ header-y += bfs_fs.h
header-y += binfmts.h
header-y += blkpg.h
header-y += blktrace_api.h
header-y += blkzoned.h
header-y += bpf_common.h
header-y += bpf_perf_event.h
header-y += bpf.h
Loading