Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 584acdd4 authored by Markus Stockhausen's avatar Markus Stockhausen Committed by NeilBrown
Browse files

md/raid5: activate raid6 rmw feature



Glue it altogehter. The raid6 rmw path should work the same as the
already existing raid5 logic. So emulate the prexor handling/flags
and split functions as needed.

1) Enable xor_syndrome() in the async layer.

2) Split ops_run_prexor() into RAID4/5 and RAID6 logic. Xor the syndrome
at the start of a rmw run as we did it before for the single parity.

3) Take care of rmw run in ops_run_reconstruct6(). Again process only
the changed pages to get syndrome back into sync.

4) Enhance set_syndrome_sources() to fill NULL pages if we are in a rmw
run. The lower layers will calculate start & end pages from that and
call the xor_syndrome() correspondingly.

5) Adapt the several places where we ignored Q handling up to now.

Performance numbers for a single E5630 system with a mix of 10 7200k
desktop/server disks. 300 seconds random write with 8 threads onto a
3,2TB (10*400GB) RAID6 64K chunk without spare (group_thread_cnt=4)

bsize   rmw_level=1   rmw_level=0   rmw_level=1   rmw_level=0
        skip_copy=1   skip_copy=1   skip_copy=0   skip_copy=0
   4K      115 KB/s      141 KB/s      165 KB/s      140 KB/s
   8K      225 KB/s      275 KB/s      324 KB/s      274 KB/s
  16K      434 KB/s      536 KB/s      640 KB/s      534 KB/s
  32K      751 KB/s    1,051 KB/s    1,234 KB/s    1,045 KB/s
  64K    1,339 KB/s    1,958 KB/s    2,282 KB/s    1,962 KB/s
 128K    2,673 KB/s    3,862 KB/s    4,113 KB/s    3,898 KB/s
 256K    7,685 KB/s    7,539 KB/s    7,557 KB/s    7,638 KB/s
 512K   19,556 KB/s   19,558 KB/s   19,652 KB/s   19,688 Kb/s

Signed-off-by: default avatarMarkus Stockhausen <stockhausen@collogia.de>
Signed-off-by: default avatarNeilBrown <neilb@suse.de>
parent a582564b
Loading
Loading
Loading
Loading
+16 −3
Original line number Diff line number Diff line
@@ -124,6 +124,7 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
{
	void **srcs;
	int i;
	int start = -1, stop = disks - 3;

	if (submit->scribble)
		srcs = submit->scribble;
@@ -134,9 +135,20 @@ do_sync_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
		if (blocks[i] == NULL) {
			BUG_ON(i > disks - 3); /* P or Q can't be zero */
			srcs[i] = (void*)raid6_empty_zero_page;
		} else
		} else {
			srcs[i] = page_address(blocks[i]) + offset;
			if (i < disks - 2) {
				stop = i;
				if (start == -1)
					start = i;
			}
		}
	}
	if (submit->flags & ASYNC_TX_PQ_XOR_DST) {
		BUG_ON(!raid6_call.xor_syndrome);
		if (start >= 0)
			raid6_call.xor_syndrome(disks, start, stop, len, srcs);
	} else
		raid6_call.gen_syndrome(disks, len, srcs);
	async_tx_sync_epilog(submit);
}
@@ -178,7 +190,8 @@ async_gen_syndrome(struct page **blocks, unsigned int offset, int disks,
	if (device)
		unmap = dmaengine_get_unmap_data(device->dev, disks, GFP_NOIO);

	if (unmap &&
	/* XORing P/Q is only implemented in software */
	if (unmap && !(submit->flags & ASYNC_TX_PQ_XOR_DST) &&
	    (src_cnt <= dma_maxpq(device, 0) ||
	     dma_maxpq(device, DMA_PREP_CONTINUE) > 0) &&
	    is_dma_pq_aligned(device, offset, 0, len)) {
+78 −26
Original line number Diff line number Diff line
@@ -1317,7 +1317,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 * destination buffer is recorded in srcs[count] and the Q destination
 * is recorded in srcs[count+1]].
 */
static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
static int set_syndrome_sources(struct page **srcs,
				struct stripe_head *sh,
				int srctype)
{
	int disks = sh->disks;
	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
@@ -1332,7 +1334,14 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
	i = d0_idx;
	do {
		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
		struct r5dev *dev = &sh->dev[i];

		if (i == sh->qd_idx || i == sh->pd_idx ||
		    (srctype == SYNDROME_SRC_ALL) ||
		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
		     test_bit(R5_Wantdrain, &dev->flags)) ||
		    (srctype == SYNDROME_SRC_WRITTEN &&
		     dev->written))
			srcs[slot] = sh->dev[i].page;
		i = raid6_next_disk(i, disks);
	} while (i != d0_idx);
@@ -1373,7 +1382,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
	atomic_inc(&sh->count);

	if (target == qd_idx) {
		count = set_syndrome_sources(blocks, sh);
		count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
		blocks[count] = NULL; /* regenerating p is not necessary */
		BUG_ON(blocks[count+1] != dest); /* q should already be set */
		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
@@ -1481,7 +1490,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
				       &submit);

			count = set_syndrome_sources(blocks, sh);
			count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
					  ops_complete_compute, sh,
					  to_addr_conv(sh, percpu, 0));
@@ -1515,7 +1524,7 @@ static void ops_complete_prexor(void *stripe_head_ref)
}

static struct dma_async_tx_descriptor *
ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
		struct dma_async_tx_descriptor *tx)
{
	int disks = sh->disks;
@@ -1544,6 +1553,26 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
		struct dma_async_tx_descriptor *tx)
{
	struct page **blocks = to_addr_page(percpu, 0);
	int count;
	struct async_submit_ctl submit;

	pr_debug("%s: stripe %llu\n", __func__,
		(unsigned long long)sh->sector);

	count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);

	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);

	return tx;
}

static struct dma_async_tx_descriptor *
ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
{
@@ -1746,6 +1775,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
	int count, i, j = 0;
	struct stripe_head *head_sh = sh;
	int last_stripe;
	int synflags;
	unsigned long txflags;

	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);

@@ -1765,14 +1796,23 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,

again:
	blocks = to_addr_page(percpu, j);
	count = set_syndrome_sources(blocks, sh);

	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
		synflags = SYNDROME_SRC_WRITTEN;
		txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
	} else {
		synflags = SYNDROME_SRC_ALL;
		txflags = ASYNC_TX_ACK;
	}

	count = set_syndrome_sources(blocks, sh, synflags);
	last_stripe = !head_sh->batch_head ||
		list_first_entry(&sh->batch_list,
				 struct stripe_head, batch_list) == head_sh;

	if (last_stripe) {
		atomic_inc(&head_sh->count);
		init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
		init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
				  head_sh, to_addr_conv(sh, percpu, j));
	} else
		init_async_submit(&submit, 0, tx, NULL, NULL,
@@ -1843,7 +1883,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
		(unsigned long long)sh->sector, checkp);

	BUG_ON(sh->batch_head);
	count = set_syndrome_sources(srcs, sh);
	count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
	if (!checkp)
		srcs[count] = NULL;

@@ -1884,8 +1924,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
			async_tx_ack(tx);
	}

	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
		tx = ops_run_prexor(sh, percpu, tx);
	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
		if (level < 6)
			tx = ops_run_prexor5(sh, percpu, tx);
		else
			tx = ops_run_prexor6(sh, percpu, tx);
	}

	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
		tx = ops_run_biodrain(sh, tx);
@@ -2770,7 +2814,7 @@ static void
schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
			 int rcw, int expand)
{
	int i, pd_idx = sh->pd_idx, disks = sh->disks;
	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
	struct r5conf *conf = sh->raid_conf;
	int level = conf->level;

@@ -2806,13 +2850,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
				atomic_inc(&conf->pending_full_writes);
	} else {
		BUG_ON(level == 6);
		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
		BUG_ON(level == 6 &&
			(!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
			   test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));

		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (i == pd_idx)
			if (i == pd_idx || i == qd_idx)
				continue;

			if (dev->towrite &&
@@ -3476,28 +3522,27 @@ static void handle_stripe_dirtying(struct r5conf *conf,
	int rmw = 0, rcw = 0, i;
	sector_t recovery_cp = conf->mddev->recovery_cp;

	/* RAID6 requires 'rcw' in current implementation.
	 * Otherwise, check whether resync is now happening or should start.
	/* Check whether resync is now happening or should start.
	 * If yes, then the array is dirty (after unclean shutdown or
	 * initial creation), so parity in some stripes might be inconsistent.
	 * In this case, we need to always do reconstruct-write, to ensure
	 * that in case of drive failure or read-error correction, we
	 * generate correct data from the parity.
	 */
	if (conf->max_degraded == 2 ||
	if (conf->rmw_level == PARITY_DISABLE_RMW ||
	    (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
	     s->failed == 0)) {
		/* Calculate the real rcw later - for now make it
		 * look like rcw is cheaper
		 */
		rcw = 1; rmw = 2;
		pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
			 conf->max_degraded, (unsigned long long)recovery_cp,
		pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
			 conf->rmw_level, (unsigned long long)recovery_cp,
			 (unsigned long long)sh->sector);
	} else for (i = disks; i--; ) {
		/* would I have to read this buffer for read_modify_write */
		struct r5dev *dev = &sh->dev[i];
		if ((dev->towrite || i == sh->pd_idx) &&
		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
		    !test_bit(R5_LOCKED, &dev->flags) &&
		    !(test_bit(R5_UPTODATE, &dev->flags) ||
		      test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3507,7 +3552,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
				rmw += 2*disks;  /* cannot read it */
		}
		/* Would I have to read this buffer for reconstruct_write */
		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
		if (!test_bit(R5_OVERWRITE, &dev->flags) &&
		    i != sh->pd_idx && i != sh->qd_idx &&
		    !test_bit(R5_LOCKED, &dev->flags) &&
		    !(test_bit(R5_UPTODATE, &dev->flags) ||
		    test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3520,7 +3566,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
		(unsigned long long)sh->sector, rmw, rcw);
	set_bit(STRIPE_HANDLE, &sh->state);
	if (rmw < rcw && rmw > 0) {
	if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
		/* prefer read-modify-write, but need to get some data */
		if (conf->mddev->queue)
			blk_add_trace_msg(conf->mddev->queue,
@@ -3528,7 +3574,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
					  (unsigned long long)sh->sector, rmw);
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if ((dev->towrite || i == sh->pd_idx) &&
			if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
			    !test_bit(R5_LOCKED, &dev->flags) &&
			    !(test_bit(R5_UPTODATE, &dev->flags) ||
			    test_bit(R5_Wantcompute, &dev->flags)) &&
@@ -3547,7 +3593,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
			}
		}
	}
	if (rcw <= rmw && rcw > 0) {
	if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
		/* want reconstruct write, but need to get some data */
		int qread =0;
		rcw = 0;
@@ -6344,10 +6390,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
	}

	conf->level = mddev->new_level;
	if (conf->level == 6)
	if (conf->level == 6) {
		conf->max_degraded = 2;
		if (raid6_call.xor_syndrome)
			conf->rmw_level = PARITY_ENABLE_RMW;
		else
			conf->rmw_level = PARITY_DISABLE_RMW;
	} else {
		conf->max_degraded = 1;
		conf->rmw_level = PARITY_ENABLE_RMW;
	}
	conf->algorithm = mddev->new_layout;
	conf->reshape_progress = mddev->reshape_position;
	if (conf->reshape_progress != MaxSector) {
+18 −1
Original line number Diff line number Diff line
@@ -355,6 +355,23 @@ enum {
	STRIPE_OP_RECONSTRUCT,
	STRIPE_OP_CHECK,
};

/*
 * RAID parity calculation preferences
 */
enum {
	PARITY_DISABLE_RMW = 0,
	PARITY_ENABLE_RMW,
};

/*
 * Pages requested from set_syndrome_sources()
 */
enum {
	SYNDROME_SRC_ALL,
	SYNDROME_SRC_WANT_DRAIN,
	SYNDROME_SRC_WRITTEN,
};
/*
 * Plugging:
 *
@@ -411,7 +428,7 @@ struct r5conf {
	spinlock_t		hash_locks[NR_STRIPE_HASH_LOCKS];
	struct mddev		*mddev;
	int			chunk_sectors;
	int			level, algorithm;
	int			level, algorithm, rmw_level;
	int			max_degraded;
	int			raid_disks;
	int			max_nr_stripes;
+3 −0
Original line number Diff line number Diff line
@@ -60,12 +60,15 @@ struct dma_chan_ref {
 * dependency chain
 * @ASYNC_TX_FENCE: specify that the next operation in the dependency
 * chain uses this operation's result as an input
 * @ASYNC_TX_PQ_XOR_DST: do not overwrite the syndrome but XOR it with the
 * input data. Required for rmw case.
 */
enum async_tx_flags {
	ASYNC_TX_XOR_ZERO_DST	 = (1 << 0),
	ASYNC_TX_XOR_DROP_DST	 = (1 << 1),
	ASYNC_TX_ACK		 = (1 << 2),
	ASYNC_TX_FENCE		 = (1 << 3),
	ASYNC_TX_PQ_XOR_DST	 = (1 << 4),
};

/**