
Commit 9104d31a authored by Lars Ellenberg, committed by Jens Axboe

drbd: introduce WRITE_SAME support



We will support WRITE_SAME if
 * all peers support WRITE_SAME (both in kernel and DRBD version),
 * all peer devices support WRITE_SAME, and
 * logical_block_size is identical on all peers.

We may at some point introduce a fallback on the receiving side
for devices/kernels that do not support WRITE_SAME,
by open-coding a submit loop. But not yet.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
parent 60bac040
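
For illustration only (not part of this commit): once the handshake enables WRITE_SAME, an in-kernel caller can pattern-fill a long range of the replicated device through the generic block-layer helper, and drbd_send_dblock() below ships it as a single P_WSAME packet whose payload is one logical block rather than the whole range. A minimal sketch against a kernel of this era; the helper fill_range_with_pattern() is hypothetical, while blkdev_issue_write_same() is the stock block-layer API:

	#include <linux/blkdev.h>
	#include <linux/gfp.h>

	/* Hypothetical example: replicate one page-sized pattern across
	 * nr_sects sectors of e.g. /dev/drbd0.  The block layer turns this
	 * into a REQ_OP_WRITE_SAME bio that carries a single page. */
	static int fill_range_with_pattern(struct block_device *bdev,
					   sector_t sector, sector_t nr_sects,
					   struct page *pattern)
	{
		/* Returns -EOPNOTSUPP when max_write_same_sectors is 0,
		 * i.e. when the conditions listed above were not met and
		 * DRBD left WRITE_SAME disabled on its request queue. */
		return blkdev_issue_write_same(bdev, sector, nr_sects,
					       GFP_NOIO, pattern);
	}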
drivers/block/drbd/drbd_actlog.c  +8 −1
@@ -840,6 +840,13 @@ static int update_sync_bits(struct drbd_device *device,
	return count;
}

+static bool plausible_request_size(int size)
+{
+	return size > 0
+		&& size <= DRBD_MAX_BATCH_BIO_SIZE
+		&& IS_ALIGNED(size, 512);
+}
+
/* clear the bit corresponding to the piece of storage in question:
 * size bytes of data starting from sector.  Only clear the bits of the affected
 * one or more _aligned_ BM_BLOCK_SIZE blocks.
@@ -859,7 +866,7 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
	if ((mode == SET_OUT_OF_SYNC) && size == 0)
		return 0;

-	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+	if (!plausible_request_size(size)) {
		drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
				drbd_change_sync_fname[mode],
				(unsigned long long)sector, size);
drivers/block/drbd/drbd_debugfs.c  +3 −8
@@ -237,14 +237,9 @@ static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_re
	seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
	seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");

-	if (f & EE_IS_TRIM) {
-		seq_putc(m, sep);
-		sep = '|';
-		if (f & EE_IS_TRIM_USE_ZEROOUT)
-			seq_puts(m, "zero-out");
-		else
-			seq_puts(m, "trim");
-	}
+	if (f & EE_IS_TRIM)
+		__seq_print_rq_state_bit(m, f & EE_IS_TRIM_USE_ZEROOUT, &sep, "zero-out", "trim");
+	seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same");
	seq_putc(m, '\n');
}

drivers/block/drbd/drbd_int.h  +9 −4
@@ -468,6 +468,9 @@ enum {
	/* this is/was a write request */
	__EE_WRITE,

+	/* this is/was a write same request */
+	__EE_WRITE_SAME,
+
	/* this originates from application on peer
	 * (not some resync or verify or other DRBD internal request) */
	__EE_APPLICATION,
@@ -487,6 +490,7 @@ enum {
#define EE_IN_INTERVAL_TREE	(1<<__EE_IN_INTERVAL_TREE)
#define EE_SUBMITTED		(1<<__EE_SUBMITTED)
#define EE_WRITE		(1<<__EE_WRITE)
+#define EE_WRITE_SAME		(1<<__EE_WRITE_SAME)
#define EE_APPLICATION		(1<<__EE_APPLICATION)
#define EE_RS_THIN_REQ		(1<<__EE_RS_THIN_REQ)

@@ -1350,8 +1354,8 @@ struct bm_extent {
/* For now, don't allow more than half of what we can "activate" in one
 * activity log transaction to be discarded in one go. We may need to rework
 * drbd_al_begin_io() to allow for even larger discard ranges */
-#define DRBD_MAX_DISCARD_SIZE	(AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE)
-#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9)
+#define DRBD_MAX_BATCH_BIO_SIZE	 (AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE)
+#define DRBD_MAX_BBIO_SECTORS    (DRBD_MAX_BATCH_BIO_SIZE >> 9)

extern int  drbd_bm_init(struct drbd_device *device);
extern int  drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
@@ -1488,7 +1492,8 @@ enum determine_dev_size {
extern enum determine_dev_size
drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
extern void resync_after_online_grow(struct drbd_device *);
-extern void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev);
+extern void drbd_reconsider_queue_parameters(struct drbd_device *device,
+			struct drbd_backing_dev *bdev, struct o_qlim *o);
extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
					enum drbd_role new_role,
					int force);
@@ -1569,7 +1574,7 @@ extern int drbd_submit_peer_request(struct drbd_device *,
extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
						     sector_t, unsigned int,
-						     bool,
+						     unsigned int,
						     gfp_t) __must_hold(local);
extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
				 int);
drivers/block/drbd/drbd_main.c  +72 −10
@@ -920,6 +920,31 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
	}
}

+/* communicated if (agreed_features & DRBD_FF_WSAME) */
+void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct request_queue *q)
+{
+	if (q) {
+		p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
+		p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
+		p->qlim->alignment_offset = cpu_to_be32(queue_alignment_offset(q));
+		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
+		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
+		p->qlim->discard_enabled = blk_queue_discard(q);
+		p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q);
+		p->qlim->write_same_capable = !!q->limits.max_write_same_sectors;
+	} else {
+		q = device->rq_queue;
+		p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q));
+		p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q));
+		p->qlim->alignment_offset = 0;
+		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
+		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
+		p->qlim->discard_enabled = 0;
+		p->qlim->discard_zeroes_data = 0;
+		p->qlim->write_same_capable = 0;
+	}
+}
+
int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags)
{
	struct drbd_device *device = peer_device->device;
@@ -928,29 +953,37 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu
	sector_t d_size, u_size;
	int q_order_type;
	unsigned int max_bio_size;
+	unsigned int packet_size;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+
+	packet_size = sizeof(*p);
+	if (peer_device->connection->agreed_features & DRBD_FF_WSAME)
+		packet_size += sizeof(p->qlim[0]);
+
+	memset(p, 0, packet_size);
	if (get_ldev_if_state(device, D_NEGOTIATING)) {
+		struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
		D_ASSERT(device, device->ldev->backing_bdev);
		d_size = drbd_get_max_capacity(device->ldev);
		rcu_read_lock();
		u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
		rcu_read_unlock();
		q_order_type = drbd_queue_order_type(device);
-		max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
+		max_bio_size = queue_max_hw_sectors(q) << 9;
		max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
+		assign_p_sizes_qlim(device, p, q);
		put_ldev(device);
	} else {
		d_size = 0;
		u_size = 0;
		q_order_type = QUEUE_ORDERED_NONE;
		max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
+		assign_p_sizes_qlim(device, p, NULL);
	}

-	sock = &peer_device->connection->data;
-	p = drbd_prepare_command(peer_device, sock);
-	if (!p)
-		return -EIO;

	if (peer_device->connection->agreed_pro_version <= 94)
		max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
	else if (peer_device->connection->agreed_pro_version < 100)
@@ -962,7 +995,8 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu
	p->max_bio_size = cpu_to_be32(max_bio_size);
	p->queue_order_type = cpu_to_be16(q_order_type);
	p->dds_flags = cpu_to_be16(flags);
-	return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0);
+
+	return drbd_send_command(peer_device, sock, P_SIZES, packet_size, NULL, 0);
}

/**
@@ -1577,6 +1611,9 @@ static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
					 ? 0 : MSG_MORE);
		if (err)
			return err;
+		/* REQ_OP_WRITE_SAME has only one segment */
+		if (bio_op(bio) == REQ_OP_WRITE_SAME)
+			break;
	}
	return 0;
}
@@ -1595,6 +1632,9 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b
				      bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
		if (err)
			return err;
+		/* REQ_OP_WRITE_SAME has only one segment */
+		if (bio_op(bio) == REQ_OP_WRITE_SAME)
+			break;
	}
	return 0;
}
@@ -1626,6 +1666,7 @@ static u32 bio_flags_to_wire(struct drbd_connection *connection,
		return  (bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
			(bio->bi_rw & REQ_FUA ? DP_FUA : 0) |
			(bio->bi_rw & REQ_PREFLUSH ? DP_FLUSH : 0) |
+			(bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) |
			(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0);
	else
		return bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
@@ -1639,6 +1680,8 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
	struct drbd_device *device = peer_device->device;
	struct drbd_socket *sock;
	struct p_data *p;
+	struct p_wsame *wsame = NULL;
+	void *digest_out;
	unsigned int dp_flags = 0;
	int digest_size;
	int err;
@@ -1674,12 +1717,29 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *
		err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
		goto out;
	}
+	if (dp_flags & DP_WSAME) {
+		/* this will only work if DRBD_FF_WSAME is set AND the
+		 * handshake agreed that all nodes and backend devices are
+		 * WRITE_SAME capable and agree on logical_block_size */
+		wsame = (struct p_wsame*)p;
+		digest_out = wsame + 1;
+		wsame->size = cpu_to_be32(req->i.size);
+	} else
+		digest_out = p + 1;

	/* our digest is still only over the payload.
	 * TRIM does not carry any payload. */
	if (digest_size)
-		drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1);
-	err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size);
+		drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out);
+	if (wsame) {
+		err =
+		    __send_command(peer_device->connection, device->vnr, sock, P_WSAME,
+				   sizeof(*wsame) + digest_size, NULL,
+				   bio_iovec(req->master_bio).bv_len);
+	} else
+		err =
+		    __send_command(peer_device->connection, device->vnr, sock, P_DATA,
+				   sizeof(*p) + digest_size, NULL, req->i.size);
	if (!err) {
		/* For protocol A, we have to memcpy the payload into
		 * socket buffers, as we may complete right away
@@ -3660,6 +3720,8 @@ const char *cmdname(enum drbd_packet cmd)
	 * one PRO_VERSION */
	static const char *cmdnames[] = {
		[P_DATA]	        = "Data",
+		[P_WSAME]	        = "WriteSame",
		[P_TRIM]	        = "Trim",
		[P_DATA_REPLY]	        = "DataReply",
		[P_RS_DATA_REPLY]	= "RSDataReply",
		[P_BARRIER]	        = "Barrier",
drivers/block/drbd/drbd_nl.c  +79 −9
@@ -1174,6 +1174,17 @@ static void blk_queue_discard_granularity(struct request_queue *q, unsigned int
{
	q->limits.discard_granularity = granularity;
}

+static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
+{
+	/* when we introduced REQ_WRITE_SAME support, we also bumped
+	 * our maximum supported batch bio size used for discards. */
+	if (connection->agreed_features & DRBD_FF_WSAME)
+		return DRBD_MAX_BBIO_SECTORS;
+	/* before, with DRBD <= 8.4.6, we only allowed up to one AL_EXTENT_SIZE. */
+	return AL_EXTENT_SIZE >> 9;
+}
+
static void decide_on_discard_support(struct drbd_device *device,
			struct request_queue *q,
			struct request_queue *b,
@@ -1190,7 +1201,7 @@ static void decide_on_discard_support(struct drbd_device *device,
		can_do = false;
		drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n");
	}
-	if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & FF_TRIM)) {
+	if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) {
		can_do = false;
		drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n");
	}
@@ -1202,7 +1213,7 @@ static void decide_on_discard_support(struct drbd_device *device,
		 * you care, you need to use devices with similar
		 * topology on all peers. */
		blk_queue_discard_granularity(q, 512);
-		q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS;
+		q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
	} else {
		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
@@ -1223,8 +1234,67 @@ static void fixup_discard_if_not_supported(struct request_queue *q)
	}
}

+static void decide_on_write_same_support(struct drbd_device *device,
+			struct request_queue *q,
+			struct request_queue *b, struct o_qlim *o)
+{
+	struct drbd_peer_device *peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device->connection;
+	bool can_do = b ? b->limits.max_write_same_sectors : true;
+
+	if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_WSAME)) {
+		can_do = false;
+		drbd_info(peer_device, "peer does not support WRITE_SAME\n");
+	}
+
+	if (o) {
+		/* logical block size; queue_logical_block_size(NULL) is 512 */
+		unsigned int peer_lbs = be32_to_cpu(o->logical_block_size);
+		unsigned int me_lbs_b = queue_logical_block_size(b);
+		unsigned int me_lbs = queue_logical_block_size(q);
+
+		if (me_lbs_b != me_lbs) {
+			drbd_warn(device,
+				"logical block size of local backend does not match (drbd:%u, backend:%u); was this a late attach?\n",
+				me_lbs, me_lbs_b);
+			/* rather disable write same than trigger some BUG_ON later in the scsi layer. */
+			can_do = false;
+		}
+		if (me_lbs_b != peer_lbs) {
+			drbd_warn(peer_device, "logical block sizes do not match (me:%u, peer:%u); this may cause problems.\n",
+				me_lbs, peer_lbs);
+			if (can_do) {
+				drbd_dbg(peer_device, "logical block size mismatch: WRITE_SAME disabled.\n");
+				can_do = false;
+			}
+			me_lbs = max(me_lbs, me_lbs_b);
+			/* We cannot change the logical block size of an in-use queue.
+			 * We can only hope that access happens to be properly aligned.
+			 * If not, the peer will likely produce an IO error, and detach. */
+			if (peer_lbs > me_lbs) {
+				if (device->state.role != R_PRIMARY) {
+					blk_queue_logical_block_size(q, peer_lbs);
+					drbd_warn(peer_device, "logical block size set to %u\n", peer_lbs);
+				} else {
+					drbd_warn(peer_device,
+						"current Primary must NOT adjust logical block size (%u -> %u); hope for the best.\n",
+						me_lbs, peer_lbs);
+				}
+			}
+		}
+		if (can_do && !o->write_same_capable) {
+			/* If we introduce an open-coded write-same loop on the receiving side,
+			 * the peer would present itself as "capable". */
+			drbd_dbg(peer_device, "WRITE_SAME disabled (peer device not capable)\n");
+			can_do = false;
+		}
+	}
+
+	blk_queue_max_write_same_sectors(q, can_do ? DRBD_MAX_BBIO_SECTORS : 0);
+}
+
static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
-				   unsigned int max_bio_size)
+				   unsigned int max_bio_size, struct o_qlim *o)
{
	struct request_queue * const q = device->rq_queue;
	unsigned int max_hw_sectors = max_bio_size >> 9;
@@ -1244,15 +1314,15 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
		rcu_read_unlock();

		blk_set_stacking_limits(&q->limits);
-		blk_queue_max_write_same_sectors(q, 0);
	}

	blk_queue_logical_block_size(q, 512);
	blk_queue_max_hw_sectors(q, max_hw_sectors);
	/* This is the workaround for "bio would need to, but cannot, be split" */
	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
	blk_queue_segment_boundary(q, PAGE_SIZE-1);
	decide_on_discard_support(device, q, b, discard_zeroes_if_aligned);
+	decide_on_write_same_support(device, q, b, o);

	if (b) {
		blk_queue_stack_limits(q, b);

@@ -1266,7 +1336,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
	fixup_discard_if_not_supported(q);
}

-void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev)
+void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
{
	unsigned int now, new, local, peer;

@@ -1309,7 +1379,7 @@ void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_ba
	if (new != now)
		drbd_info(device, "max BIO size = %u\n", new);

-	drbd_setup_queue_param(device, bdev, new);
+	drbd_setup_queue_param(device, bdev, new, o);
}

/* Starts the worker thread */
@@ -1542,7 +1612,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
		drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH);

	if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned)
-		drbd_reconsider_queue_parameters(device, device->ldev);
+		drbd_reconsider_queue_parameters(device, device->ldev, NULL);

	drbd_md_sync(device);

@@ -1922,7 +1992,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
	device->read_cnt = 0;
	device->writ_cnt = 0;

-	drbd_reconsider_queue_parameters(device, device->ldev);
+	drbd_reconsider_queue_parameters(device, device->ldev, NULL);

	/* If I am currently not R_PRIMARY,
	 * but meta data primary indicator is set,
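
Observability note (not part of this commit): whether WRITE_SAME survived the negotiation can be read back from the DRBD device's own queue limits; assuming the standard block sysfs layout, /sys/block/drbd0/queue/write_same_max_bytes reads 0 when decide_on_write_same_support() disabled it, and DRBD_MAX_BATCH_BIO_SIZE when all checks passed.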