Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a5e0d731 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'md-3.9' of git://neil.brown.name/md

Pull md updates from NeilBrown:
 "Mostly little bugfixes.

  Only "feature" is a new RAID10 layout which slightly improves the
  number of sets of devices that can concurrently fail, without data
  loss."

* tag 'md-3.9' of git://neil.brown.name/md:
  md: expedite metadata update when switching  read-auto -> active
  md: remove CONFIG_MULTICORE_RAID456
  md/raid1,raid10: fix deadlock with freeze_array()
  md/raid0: improve error message when converting RAID4-with-spares to RAID0
  md: raid0: fix error return from create_stripe_zones.
  md: fix two bugs when attempting to resize RAID0 array.
  DM RAID: Add support for MD's RAID10 "far" and "offset" algorithms
  MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part 2)
  MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part 1)
  MD RAID10: Minor non-functional code changes
  md: raid1,10: Handle REQ_WRITE_SAME flag in write bios
  md: protect against crash upon fsync on ro array
parents 6dbe51c2 f3378b48
Loading
Loading
Loading
Loading
+37 −7
Original line number Diff line number Diff line
@@ -30,6 +30,7 @@ The target is named "raid" and it accepts the following parameters:
  raid10        Various RAID10 inspired algorithms chosen by additional params
		- RAID10: Striped Mirrors (aka 'Striping on top of mirrors')
		- RAID1E: Integrated Adjacent Stripe Mirroring
		- RAID1E: Integrated Offset Stripe Mirroring
		-  and other similar RAID10 variants

  Reference: Chapter 4 of
@@ -64,15 +65,15 @@ The target is named "raid" and it accepts the following parameters:
		synchronisation state for each region.

        [raid10_copies   <# copies>]
        [raid10_format   near]
        [raid10_format   <near|far|offset>]
		These two options are used to alter the default layout of
		a RAID10 configuration.  The number of copies can be
		specified, but the default is 2.  There are other variations
		to how the copies are laid down - the default and only current
		option is "near".  Near copies are what most people think of
		with respect to mirroring.  If these options are left
		unspecified, or 'raid10_copies 2' and/or 'raid10_format near'
		are given, then the layouts for 2, 3 and 4 devices are:
		specified, but the default is 2.  There are also three
		variations to how the copies are laid down - the default
		is "near".  Near copies are what most people think of with
		respect to mirroring.  If these options are left unspecified,
		or 'raid10_copies 2' and/or 'raid10_format near' are given,
		then the layouts for 2, 3 and 4 devices are:
		2 drives         3 drives          4 drives
		--------         ----------        --------------
		A1  A1           A1  A1  A2        A1  A1  A2  A2
@@ -85,6 +86,33 @@ The target is named "raid" and it accepts the following parameters:
		3-device layout is what might be called a 'RAID1E - Integrated
		Adjacent Stripe Mirroring'.

		If 'raid10_copies 2' and 'raid10_format far', then the layouts
		for 2, 3 and 4 devices are:
		2 drives             3 drives             4 drives
		--------             --------------       --------------------
		A1  A2               A1   A2   A3         A1   A2   A3   A4
		A3  A4               A4   A5   A6         A5   A6   A7   A8
		A5  A6               A7   A8   A9         A9   A10  A11  A12
		..  ..               ..   ..   ..         ..   ..   ..   ..
		A2  A1               A3   A1   A2         A2   A1   A4   A3
		A4  A3               A6   A4   A5         A6   A5   A8   A7
		A6  A5               A9   A7   A8         A10  A9   A12  A11
		..  ..               ..   ..   ..         ..   ..   ..   ..

		If 'raid10_copies 2' and 'raid10_format offset', then the
		layouts for 2, 3 and 4 devices are:
		2 drives       3 drives           4 drives
		--------       ------------       -----------------
		A1  A2         A1  A2  A3         A1  A2  A3  A4
		A2  A1         A3  A1  A2         A2  A1  A4  A3
		A3  A4         A4  A5  A6         A5  A6  A7  A8
		A4  A3         A6  A4  A5         A6  A5  A8  A7
		A5  A6         A7  A8  A9         A9  A10 A11 A12
		A6  A5         A9  A7  A8         A10 A9  A12 A11
		..  ..         ..  ..  ..         ..  ..  ..  ..
		Here we see layouts closely akin to 'RAID1E - Integrated
		Offset Stripe Mirroring'.

<#raid_devs>: The number of devices composing the array.
	Each device consists of two entries.  The first is the device
	containing the metadata (if any); the second is the one containing the
@@ -142,3 +170,5 @@ Version History
1.3.0	Added support for RAID 10
1.3.1	Allow device replacement/rebuild for RAID 10
1.3.2   Fix/improve redundancy checking for RAID10
1.4.0	Non-functional change.  Removes arg from mapping function.
1.4.1   Add RAID10 "far" and "offset" algorithm support.
+0 −11
Original line number Diff line number Diff line
@@ -154,17 +154,6 @@ config MD_RAID456

	  If unsure, say Y.

config MULTICORE_RAID456
	bool "RAID-4/RAID-5/RAID-6 Multicore processing (EXPERIMENTAL)"
	depends on MD_RAID456
	depends on SMP
	depends on EXPERIMENTAL
	---help---
	  Enable the raid456 module to dispatch per-stripe raid operations to a
	  thread pool.

	  If unsure, say N.

config MD_MULTIPATH
	tristate "Multipath I/O support"
	depends on BLK_DEV_MD
+103 −20
Original line number Diff line number Diff line
@@ -91,15 +91,44 @@ static struct raid_type {
	{"raid6_nc", "RAID6 (N continue)",		2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
};

/*
 * Translate an MD RAID10 layout word into the dm-raid format name.
 *
 * Bits 16 and 17 of the layout stand for "offset" and "use_far_sets"
 * (refer to MD's raid10.c for details).  The layout is reported as
 * "offset" only when both bits are set.  Otherwise the low byte
 * decides: a value greater than 1 means "near", anything else is
 * reported as "far".
 *
 * Returns a pointer to a string literal; the caller must not modify
 * or free it.
 */
static char *raid10_md_layout_to_format(int layout)
{
	const int offset_far_sets = 0x10000 | 0x20000;
	const int low_byte = layout & 0xFF;

	if ((layout & offset_far_sets) == offset_far_sets)
		return "offset";

	return (low_byte > 1) ? "near" : "far";
}

/*
 * Extract the number of copies from an MD RAID10 layout word.
 *
 * The low byte is returned when it is greater than 1; otherwise the
 * second byte (bits 8-15) is returned.
 */
static unsigned raid10_md_layout_to_copies(int layout)
{
	const int low_byte = layout & 0xFF;
	const int second_byte = (layout >> 8) & 0xFF;

	return (low_byte > 1) ? low_byte : second_byte;
}

/*
 * Encode a dm-raid RAID10 format name and copy count as an MD layout word.
 *
 * @format: layout algorithm name: "near", "far" or "offset".
 * @copies: number of copies requested.
 *
 * The low byte of the result holds the "near" copy count and the next
 * byte the "far" copy count.  For "near", all copies are near copies
 * (with one far copy); for every other format, all copies are far
 * copies (with one near copy).  Bits 16 and 17 stand for "offset" and
 * "use_far_sets": "far" sets bit 17 (0x20000) and "offset" sets both
 * (0x30000).  A name that is none of the three is encoded with 'copies'
 * far copies and no flag bits.
 *
 * An earlier version of this body returned (1 << 8) | (copies & 0xFF)
 * unconditionally before looking at @format, which made the "far" and
 * "offset" encodings unreachable; that early return is removed here.
 */
static int raid10_format_to_md_layout(char *format, unsigned copies)
{
	unsigned n = 1, f = 1;

	if (!strcmp("near", format))
		n = copies;
	else
		f = copies;

	if (!strcmp("offset", format))
		return 0x30000 | (f << 8) | n;

	if (!strcmp("far", format))
		return 0x20000 | (f << 8) | n;

	return (f << 8) | n;
}

static struct raid_type *get_raid_type(char *name)
@@ -352,6 +381,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
{
	unsigned i, rebuild_cnt = 0;
	unsigned rebuilds_per_group, copies, d;
	unsigned group_size, last_group_start;

	for (i = 0; i < rs->md.raid_disks; i++)
		if (!test_bit(In_sync, &rs->dev[i].rdev.flags) ||
@@ -379,9 +409,6 @@ static int validate_raid_redundancy(struct raid_set *rs)
		 * as long as the failed devices occur in different mirror
		 * groups (i.e. different stripes).
		 *
		 * Right now, we only allow for "near" copies.  When other
		 * formats are added, we will have to check those too.
		 *
		 * When checking "near" format, make sure no adjacent devices
		 * have failed beyond what can be handled.  In addition to the
		 * simple case where the number of devices is a multiple of the
@@ -391,6 +418,7 @@ static int validate_raid_redundancy(struct raid_set *rs)
		 *          A    A    B    B    C
		 *          C    D    D    E    E
		 */
		if (!strcmp("near", raid10_md_layout_to_format(rs->md.layout))) {
			for (i = 0; i < rs->md.raid_disks * copies; i++) {
				if (!(i % copies))
					rebuilds_per_group = 0;
@@ -401,6 +429,32 @@ static int validate_raid_redundancy(struct raid_set *rs)
					goto too_many;
			}
			break;
		}

		/*
		 * When checking "far" and "offset" formats, we need to ensure
		 * that the device that holds its copy is not also dead or
		 * being rebuilt.  (Note that "far" and "offset" formats only
		 * support two copies right now.  These formats also only ever
		 * use the 'use_far_sets' variant.)
		 *
		 * This check is somewhat complicated by the need to account
		 * for arrays that are not a multiple of (far) copies.  This
		 * results in the need to treat the last (potentially larger)
		 * set differently.
		 */
		group_size = (rs->md.raid_disks / copies);
		last_group_start = (rs->md.raid_disks / group_size) - 1;
		last_group_start *= group_size;
		for (i = 0; i < rs->md.raid_disks; i++) {
			if (!(i % copies) && !(i > last_group_start))
				rebuilds_per_group = 0;
			if ((!rs->dev[i].rdev.sb_page ||
			     !test_bit(In_sync, &rs->dev[i].rdev.flags)) &&
			    (++rebuilds_per_group >= copies))
					goto too_many;
		}
		break;
	default:
		if (rebuild_cnt)
			return -EINVAL;
@@ -433,7 +487,7 @@ too_many:
 *
 * RAID10-only options:
 *    [raid10_copies <# copies>]        Number of copies.  (Default: 2)
 *    [raid10_format <near>]            Layout algorithm.  (Default: near)
 *    [raid10_format <near|far|offset>] Layout algorithm.  (Default: near)
 */
static int parse_raid_params(struct raid_set *rs, char **argv,
			     unsigned num_raid_params)
@@ -520,7 +574,9 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
				rs->ti->error = "'raid10_format' is an invalid parameter for this RAID type";
				return -EINVAL;
			}
			if (strcmp("near", argv[i])) {
			if (strcmp("near", argv[i]) &&
			    strcmp("far", argv[i]) &&
			    strcmp("offset", argv[i])) {
				rs->ti->error = "Invalid 'raid10_format' value given";
				return -EINVAL;
			}
@@ -644,6 +700,15 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
			return -EINVAL;
		}

		/*
		 * If the format is not "near", we only support
		 * two copies at the moment.
		 */
		if (strcmp("near", raid10_format) && (raid10_copies > 2)) {
			rs->ti->error = "Too many copies for given RAID10 format.";
			return -EINVAL;
		}

		/* (Len * #mirrors) / #devices */
		sectors_per_dev = rs->ti->len * raid10_copies;
		sector_div(sectors_per_dev, rs->md.raid_disks);
@@ -854,17 +919,30 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
	/*
	 * Reshaping is not currently allowed
	 */
	if ((le32_to_cpu(sb->level) != mddev->level) ||
	    (le32_to_cpu(sb->layout) != mddev->layout) ||
	    (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
		DMERR("Reshaping arrays not yet supported.");
	if (le32_to_cpu(sb->level) != mddev->level) {
		DMERR("Reshaping arrays not yet supported. (RAID level change)");
		return -EINVAL;
	}
	if (le32_to_cpu(sb->layout) != mddev->layout) {
		DMERR("Reshaping arrays not yet supported. (RAID layout change)");
		DMERR("  0x%X vs 0x%X", le32_to_cpu(sb->layout), mddev->layout);
		DMERR("  Old layout: %s w/ %d copies",
		      raid10_md_layout_to_format(le32_to_cpu(sb->layout)),
		      raid10_md_layout_to_copies(le32_to_cpu(sb->layout)));
		DMERR("  New layout: %s w/ %d copies",
		      raid10_md_layout_to_format(mddev->layout),
		      raid10_md_layout_to_copies(mddev->layout));
		return -EINVAL;
	}
	if (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors) {
		DMERR("Reshaping arrays not yet supported. (stripe sectors change)");
		return -EINVAL;
	}

	/* We can only change the number of devices in RAID1 right now */
	if ((rs->raid_type->level != 1) &&
	    (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
		DMERR("Reshaping arrays not yet supported.");
		DMERR("Reshaping arrays not yet supported. (device count change)");
		return -EINVAL;
	}

@@ -1329,7 +1407,8 @@ static void raid_status(struct dm_target *ti, status_type_t type,
			       raid10_md_layout_to_copies(rs->md.layout));

		if (rs->print_flags & DMPF_RAID10_FORMAT)
			DMEMIT(" raid10_format near");
			DMEMIT(" raid10_format %s",
			       raid10_md_layout_to_format(rs->md.layout));

		DMEMIT(" %d", rs->md.raid_disks);
		for (i = 0; i < rs->md.raid_disks; i++) {
@@ -1418,6 +1497,10 @@ static struct target_type raid_target = {

/*
 * Module init: log the three-part version of the raid target, then
 * register it and hand back whatever dm_register_target() returns.
 */
static int __init dm_raid_init(void)
{
	DMINFO("Loading target version %u.%u.%u",
	       raid_target.version[0], raid_target.version[1],
	       raid_target.version[2]);

	return dm_register_target(&raid_target);
}

+18 −1
Original line number Diff line number Diff line
@@ -307,6 +307,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
		bio_io_error(bio);
		return;
	}
	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
		bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
		return;
	}
	smp_rmb(); /* Ensure implications of  'active' are visible */
	rcu_read_lock();
	if (mddev->suspended) {
@@ -2994,6 +2998,9 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
		} else if (!sectors)
			sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
				rdev->data_offset;
		if (!my_mddev->pers->resize)
			/* Cannot change size for RAID0 or Linear etc */
			return -EINVAL;
	}
	if (sectors < my_mddev->dev_sectors)
		return -EINVAL; /* component must fit device */
@@ -6525,7 +6532,17 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
			mddev->ro = 0;
			sysfs_notify_dirent_safe(mddev->sysfs_state);
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			md_wakeup_thread(mddev->thread);
			/* mddev_unlock will wake thread */
			/* If a device failed while we were read-only, we
			 * need to make sure the metadata is updated now.
			 */
			if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
				mddev_unlock(mddev);
				wait_event(mddev->sb_wait,
					   !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
					   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
				mddev_lock(mddev);
			}
		} else {
			err = -EROFS;
			goto abort_unlock;
+10 −3
Original line number Diff line number Diff line
@@ -175,7 +175,13 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
			rdev1->new_raid_disk = j;
		}

		if (j < 0 || j >= mddev->raid_disks) {
		if (j < 0) {
			printk(KERN_ERR
			       "md/raid0:%s: remove inactive devices before converting to RAID0\n",
			       mdname(mddev));
			goto abort;
		}
		if (j >= mddev->raid_disks) {
			printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
			       "aborting!\n", mdname(mddev), j);
			goto abort;
@@ -289,7 +295,7 @@ abort:
	kfree(conf->strip_zone);
	kfree(conf->devlist);
	kfree(conf);
	*private_conf = NULL;
	*private_conf = ERR_PTR(err);
	return err;
}

@@ -411,7 +417,8 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
		  "%s does not support generic reshape\n", __func__);

	rdev_for_each(rdev, mddev)
		array_sectors += rdev->sectors;
		array_sectors += (rdev->sectors &
				  ~(sector_t)(mddev->chunk_sectors-1));

	return array_sectors;
}
Loading