Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a6f0788e authored by Chaitanya Kulkarni's avatar Chaitanya Kulkarni Committed by Jens Axboe
Browse files

block: add support for REQ_OP_WRITE_ZEROES



This adds a new block layer operation to zero out a range of
LBAs. This allows zeroing to be implemented for devices that support
neither discard with a predictable zero pattern nor WRITE SAME of zeroes.
The prominent example of that is NVMe with the Write Zeroes command,
but in the future, this should also help with improving the way
zeroing discards work. For this operation, a suitable entry is exported
in sysfs which indicates the maximum number of bytes allowed in one
write zeroes operation by the device.

Signed-off-by: Chaitanya Kulkarni <chaitanya.kulkarni@hgst.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
parent e73c23ff
Loading
Loading
Loading
Loading
+13 −0
Original line number Diff line number Diff line
@@ -235,6 +235,19 @@ Description:
		write_same_max_bytes is 0, write same is not supported
		by the device.

What:		/sys/block/<disk>/queue/write_zeroes_max_bytes
Date:		November 2016
Contact:	Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com>
Description:
		Devices that support the write zeroes operation allow a
		single request to be issued to zero out a range of
		contiguous blocks on storage without having any payload
		in the request. This can be used to optimize writing zeroes
		to the devices. write_zeroes_max_bytes indicates how many
		bytes can be written in a single write zeroes command. If
		write_zeroes_max_bytes is 0, write zeroes is not supported
		by the device.

What:		/sys/block/<disk>/queue/zoned
Date:		September 2016
Contact:	Damien Le Moal <damien.lemoal@hgst.com>
+1 −0
Original line number Diff line number Diff line
@@ -674,6 +674,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
	switch (bio_op(bio)) {
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_ZEROES:
		break;
	case REQ_OP_WRITE_SAME:
		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
+4 −0
Original line number Diff line number Diff line
@@ -1950,6 +1950,10 @@ generic_make_request_checks(struct bio *bio)
		if (!bdev_is_zoned(bio->bi_bdev))
			goto not_supported;
		break;
	case REQ_OP_WRITE_ZEROES:
		if (!bdev_write_zeroes_sectors(bio->bi_bdev))
			goto not_supported;
		break;
	default:
		break;
	}
+56 −2
Original line number Diff line number Diff line
@@ -226,6 +226,55 @@ int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL(blkdev_issue_write_same);

/**
 * __blkdev_issue_write_zeroes - generate number of bios with WRITE ZEROES
 * @bdev:	blockdev to issue
 * @sector:	start sector
 * @nr_sects:	number of sectors to write
 * @gfp_mask:	memory allocation flags (for bio_alloc)
 * @biop:	pointer to anchor bio
 *
 * Description:
 *  Split the range [@sector, @sector + @nr_sects) into as many
 *  REQ_OP_WRITE_ZEROES bios as the device limit requires. The bios carry no
 *  data pages: only the start sector and the byte count are filled in.
 *
 *  On success the last bio that was set up is stored in @biop; it is not
 *  submitted here (presumably the caller submits it and next_bio() takes
 *  care of the earlier ones - confirm against next_bio()).
 *
 * Return:
 *  0 on success (including @nr_sects == 0, in which case @biop is left
 *  pointing at the bio that was passed in), -ENXIO if @bdev has no request
 *  queue, -EOPNOTSUPP if the device reports a write zeroes limit of 0.
 */
static int __blkdev_issue_write_zeroes(struct block_device *bdev,
		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
		struct bio **biop)
{
	struct bio *bio = *biop;
	unsigned int max_write_zeroes_sectors;
	struct request_queue *q = bdev_get_queue(bdev);

	if (!q)
		return -ENXIO;

	max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);

	if (max_write_zeroes_sectors == 0)
		return -EOPNOTSUPP;

	/*
	 * Ensure that max_write_zeroes_sectors doesn't overflow bi_size:
	 * the byte count below is computed as "sectors << 9" in 32 bits, so
	 * a larger limit would wrap the size while sector/nr_sects still
	 * advance by the full limit, silently skipping part of the range.
	 */
	if (max_write_zeroes_sectors > (~0U >> 9))
		max_write_zeroes_sectors = ~0U >> 9;

	while (nr_sects) {
		bio = next_bio(bio, 0, gfp_mask);
		bio->bi_iter.bi_sector = sector;
		bio->bi_bdev = bdev;
		bio_set_op_attrs(bio, REQ_OP_WRITE_ZEROES, 0);

		if (nr_sects > max_write_zeroes_sectors) {
			/* Full-sized chunk; more of the range remains. */
			bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
			nr_sects -= max_write_zeroes_sectors;
			sector += max_write_zeroes_sectors;
		} else {
			/* Final chunk covers whatever is left. */
			bio->bi_iter.bi_size = nr_sects << 9;
			nr_sects = 0;
		}
		/* Large ranges can need many bios; give others a chance to run. */
		cond_resched();
	}

	*biop = bio;
	return 0;
}

/**
 * __blkdev_issue_zeroout - generate number of zero filed write bios
 * @bdev:	blockdev to issue
@@ -259,6 +308,11 @@ int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
			goto out;
	}

	ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask,
			biop);
	if (ret == 0 || (ret && ret != -EOPNOTSUPP))
		goto out;

	ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
			ZERO_PAGE(0), biop);
	if (ret == 0 || (ret && ret != -EOPNOTSUPP))
@@ -304,8 +358,8 @@ EXPORT_SYMBOL(__blkdev_issue_zeroout);
 *  the discard request fail, if the discard flag is not set, or if
 *  discard_zeroes_data is not supported, this function will resort to
 *  zeroing the blocks manually, thus provisioning (allocating,
 *  anchoring) them. If the block device supports the WRITE SAME command
 *  blkdev_issue_zeroout() will use it to optimize the process of
 *  anchoring) them. If the block device supports WRITE ZEROES or WRITE SAME
 *  command(s), blkdev_issue_zeroout() will use it to optimize the process of
 *  clearing the block range. Otherwise the zeroing will be performed
 *  using regular WRITE calls.
 */
+13 −4
Original line number Diff line number Diff line
@@ -199,6 +199,10 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
	case REQ_OP_SECURE_ERASE:
		split = blk_bio_discard_split(q, *bio, bs, &nsegs);
		break;
	case REQ_OP_WRITE_ZEROES:
		split = NULL;
		nsegs = (*bio)->bi_phys_segments;
		break;
	case REQ_OP_WRITE_SAME:
		split = blk_bio_write_same_split(q, *bio, bs, &nsegs);
		break;
@@ -241,11 +245,15 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
	 * This should probably be returning 0, but blk_add_request_payload()
	 * (Christoph!!!!)
	 */
	if (bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE)
		return 1;

	if (bio_op(bio) == REQ_OP_WRITE_SAME)
	switch (bio_op(bio)) {
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_SAME:
	case REQ_OP_WRITE_ZEROES:
		return 1;
	default:
		break;
	}

	fbio = bio;
	cluster = blk_queue_cluster(q);
@@ -416,6 +424,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
	switch (bio_op(bio)) {
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_ZEROES:
		/*
		 * This is a hack - drivers should be neither modifying the
		 * biovec, nor relying on bi_vcnt - but because of
Loading