Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 2943c833 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'md-3.3' of git://neil.brown.name/md

md update for 3.3

Big change is new hot-replacement.
A slot in an array can hold 2 devices - one that
wants-replacement and one that is the replacement.
Once the replacement is built - either from the
original or (in the case of errors) from elsewhere,
the wants-replacement device will be removed.

* tag 'md-3.3' of git://neil.brown.name/md: (36 commits)
  md/raid1: Mark device want_replacement when we see a write error.
  md/raid1: If there is a spare and a want_replacement device, start replacement.
  md/raid1: recognise replacements when assembling arrays.
  md/raid1: handle activation of replacement device when recovery completes.
  md/raid1: Allow a failed replacement device to be removed.
  md/raid1: Allocate spare to store replacement devices and their bios.
  md/raid1:  Replace use of mddev->raid_disks with conf->raid_disks.
  md/raid10: If there is a spare and a want_replacement device, start replacement.
  md/raid10: recognise replacements when assembling array.
  md/raid10: Allow replacement device to be replace old drive.
  md/raid10: handle recovery of replacement devices.
  md/raid10:  Handle replacement devices during resync.
  md/raid10: writes should get directed to replacement as well as original.
  md/raid10: allow removal of failed replacement devices.
  md/raid10: preferentially read from replacement device if possible.
  md/raid10:  change read_balance to return an rdev
  md/raid10: prepare data structures for handling replacement.
  md/raid5: Mark device want_replacement when we see a write error.
  md/raid5: If there is a spare and a want_replacement device, start replacement.
  md/raid5: recognise replacements when assembling array.
  ...
parents 98793265 19d67169
Loading
Loading
Loading
Loading
+18 −4
Original line number Diff line number Diff line
@@ -360,7 +360,7 @@ Each directory contains:
	A file recording the current state of the device in the array
	which can be a comma separated list of
	      faulty   - device has been kicked from active use due to
                         a detected fault or it has unacknowledged bad
			 a detected fault, or it has unacknowledged bad
			 blocks
	      in_sync  - device is a fully in-sync member of the array
	      writemostly - device will only be subject to read
@@ -374,6 +374,13 @@ Each directory contains:
			 This includes spares that are in the process
			 of being recovered to
	      write_error - device has ever seen a write error.
	      want_replacement - device is (mostly) working but probably
			 should be replaced, either due to errors or
			 due to user request.
	      replacement - device is a replacement for another active
			 device with same raid_disk.


	This list may grow in future.
	This can be written to.
	Writing "faulty"  simulates a failure on the device.
@@ -386,6 +393,13 @@ Each directory contains:
	Writing "in_sync" sets the in_sync flag.
	Writing "write_error" sets writeerrorseen flag.
	Writing "-write_error" clears writeerrorseen flag.
	Writing "want_replacement" is allowed at any time except to a
		replacement device or a spare.  It sets the flag.
	Writing "-want_replacement" is allowed at any time.  It clears
		the flag.
	Writing "replacement" or "-replacement" is only allowed before
		starting the array.  It sets or clears the flag.


	This file responds to select/poll. Any change to 'faulty'
	or 'blocked' causes an event.
+6 −6
Original line number Diff line number Diff line
@@ -1149,12 +1149,12 @@ void bitmap_daemon_work(struct mddev *mddev)
		return;
	}
	if (time_before(jiffies, bitmap->daemon_lastrun
			+ bitmap->mddev->bitmap_info.daemon_sleep))
			+ mddev->bitmap_info.daemon_sleep))
		goto done;

	bitmap->daemon_lastrun = jiffies;
	if (bitmap->allclean) {
		bitmap->mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
		mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
		goto done;
	}
	bitmap->allclean = 1;
@@ -1206,7 +1206,7 @@ void bitmap_daemon_work(struct mddev *mddev)
			 * sure that events_cleared is up-to-date.
			 */
			if (bitmap->need_sync &&
			    bitmap->mddev->bitmap_info.external == 0) {
			    mddev->bitmap_info.external == 0) {
				bitmap_super_t *sb;
				bitmap->need_sync = 0;
				sb = kmap_atomic(bitmap->sb_page, KM_USER0);
@@ -1270,8 +1270,8 @@ void bitmap_daemon_work(struct mddev *mddev)

 done:
	if (bitmap->allclean == 0)
		bitmap->mddev->thread->timeout =
			bitmap->mddev->bitmap_info.daemon_sleep;
		mddev->thread->timeout =
			mddev->bitmap_info.daemon_sleep;
	mutex_unlock(&mddev->bitmap_info.mutex);
}

@@ -1587,7 +1587,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
	}
	if (!*bmc) {
		struct page *page;
		*bmc = 1 | (needed ? NEEDED_MASK : 0);
		*bmc = 2 | (needed ? NEEDED_MASK : 0);
		bitmap_count_page(bitmap, offset, 1);
		page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap));
		set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
+80 −27
Original line number Diff line number Diff line
@@ -1713,6 +1713,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
		}
		if (sb->devflags & WriteMostly1)
			set_bit(WriteMostly, &rdev->flags);
		if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
			set_bit(Replacement, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);

@@ -1766,6 +1768,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
		sb->recovery_offset =
			cpu_to_le64(rdev->recovery_offset);
	}
	if (test_bit(Replacement, &rdev->flags))
		sb->feature_map |=
			cpu_to_le32(MD_FEATURE_REPLACEMENT);

	if (mddev->reshape_position != MaxSector) {
		sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
@@ -2559,6 +2564,15 @@ state_show(struct md_rdev *rdev, char *page)
		len += sprintf(page+len, "%swrite_error", sep);
		sep = ",";
	}
	if (test_bit(WantReplacement, &rdev->flags)) {
		len += sprintf(page+len, "%swant_replacement", sep);
		sep = ",";
	}
	if (test_bit(Replacement, &rdev->flags)) {
		len += sprintf(page+len, "%sreplacement", sep);
		sep = ",";
	}

	return len+sprintf(page+len, "\n");
}

@@ -2627,6 +2641,42 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
	} else if (cmd_match(buf, "-write_error")) {
		clear_bit(WriteErrorSeen, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "want_replacement")) {
		/* Any non-spare device that is not a replacement can
		 * become want_replacement at any time, but we then need to
		 * check if recovery is needed.
		 */
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Replacement, &rdev->flags))
			set_bit(WantReplacement, &rdev->flags);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);
		err = 0;
	} else if (cmd_match(buf, "-want_replacement")) {
		/* Clearing 'want_replacement' is always allowed.
		 * Once replacements starts it is too late though.
		 */
		err = 0;
		clear_bit(WantReplacement, &rdev->flags);
	} else if (cmd_match(buf, "replacement")) {
		/* Can only set a device as a replacement when array has not
		 * yet been started.  Once running, replacement is automatic
		 * from spares, or by assigning 'slot'.
		 */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			set_bit(Replacement, &rdev->flags);
			err = 0;
		}
	} else if (cmd_match(buf, "-replacement")) {
		/* Similarly, can only clear Replacement before start */
		if (rdev->mddev->pers)
			err = -EBUSY;
		else {
			clear_bit(Replacement, &rdev->flags);
			err = 0;
		}
	}
	if (!err)
		sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -2688,7 +2738,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
		if (rdev->mddev->pers->hot_remove_disk == NULL)
			return -EINVAL;
		err = rdev->mddev->pers->
			hot_remove_disk(rdev->mddev, rdev->raid_disk);
			hot_remove_disk(rdev->mddev, rdev);
		if (err)
			return err;
		sysfs_unlink_rdev(rdev->mddev, rdev);
@@ -2696,7 +2746,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);
	} else if (rdev->mddev->pers) {
		struct md_rdev *rdev2;
		/* Activating a spare .. or possibly reactivating
		 * if we ever get bitmaps working here.
		 */
@@ -2710,10 +2759,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
		if (rdev->mddev->pers->hot_add_disk == NULL)
			return -EINVAL;

		list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
			if (rdev2->raid_disk == slot)
				return -EEXIST;

		if (slot >= rdev->mddev->raid_disks &&
		    slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
			return -ENOSPC;
@@ -6053,8 +6098,15 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
	struct mddev *mddev = NULL;
	int ro;

	switch (cmd) {
	case RAID_VERSION:
	case GET_ARRAY_INFO:
	case GET_DISK_INFO:
		break;
	default:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	/*
	 * Commands dealing with the RAID driver but not any
@@ -6714,8 +6766,11 @@ static int md_seq_show(struct seq_file *seq, void *v)
			if (test_bit(Faulty, &rdev->flags)) {
				seq_printf(seq, "(F)");
				continue;
			} else if (rdev->raid_disk < 0)
			}
			if (rdev->raid_disk < 0)
				seq_printf(seq, "(S)"); /* spare */
			if (test_bit(Replacement, &rdev->flags))
				seq_printf(seq, "(R)");
			sectors += rdev->sectors;
		}

@@ -7337,13 +7392,12 @@ static int remove_and_add_spares(struct mddev *mddev)
		     ! test_bit(In_sync, &rdev->flags)) &&
		    atomic_read(&rdev->nr_pending)==0) {
			if (mddev->pers->hot_remove_disk(
				    mddev, rdev->raid_disk)==0) {
				    mddev, rdev) == 0) {
				sysfs_unlink_rdev(mddev, rdev);
				rdev->raid_disk = -1;
			}
		}

	if (mddev->degraded) {
	list_for_each_entry(rdev, &mddev->disks, same_set) {
		if (rdev->raid_disk >= 0 &&
		    !test_bit(In_sync, &rdev->flags) &&
@@ -7362,7 +7416,6 @@ static int remove_and_add_spares(struct mddev *mddev)
			}
		}
	}
	}
	return spares;
}

@@ -7474,7 +7527,7 @@ void md_check_recovery(struct mddev *mddev)
				    test_bit(Faulty, &rdev->flags) &&
				    atomic_read(&rdev->nr_pending)==0) {
					if (mddev->pers->hot_remove_disk(
						    mddev, rdev->raid_disk)==0) {
						    mddev, rdev) == 0) {
						sysfs_unlink_rdev(mddev, rdev);
						rdev->raid_disk = -1;
					}
+49 −33
Original line number Diff line number Diff line
@@ -72,34 +72,7 @@ struct md_rdev {
	 * This reduces the burden of testing multiple flags in many cases
	 */

	unsigned long	flags;
#define	Faulty		1		/* device is known to have a fault */
#define	In_sync		2		/* device is in_sync with rest of array */
#define	WriteMostly	4		/* Avoid reading if at all possible */
#define	AutoDetected	7		/* added by auto-detect */
#define Blocked		8		/* An error occurred but has not yet
					 * been acknowledged by the metadata
					 * handler, so don't allow writes
					 * until it is cleared */
#define WriteErrorSeen	9		/* A write error has been seen on this
					 * device
					 */
#define FaultRecorded	10		/* Intermediate state for clearing
					 * Blocked.  The Fault is/will-be
					 * recorded in the metadata, but that
					 * metadata hasn't been stored safely
					 * on disk yet.
					 */
#define BlockedBadBlocks 11		/* A writer is blocked because they
					 * found an unacknowledged bad-block.
					 * This can safely be cleared at any
					 * time, and the writer will re-check.
					 * It may be set at any time, and at
					 * worst the writer will timeout and
					 * re-check.  So setting it as
					 * accurately as possible is good, but
					 * not absolutely critical.
					 */
	unsigned long	flags;	/* bit set of 'enum flag_bits' bits. */
	wait_queue_head_t blocked_wait;

	int desc_nr;			/* descriptor index in the superblock */
@@ -152,6 +125,44 @@ struct md_rdev {
		sector_t size;		/* in sectors */
	} badblocks;
};
enum flag_bits {
	Faulty,			/* device is known to have a fault */
	In_sync,		/* device is in_sync with rest of array */
	WriteMostly,		/* Avoid reading if at all possible */
	AutoDetected,		/* added by auto-detect */
	Blocked,		/* An error occurred but has not yet
				 * been acknowledged by the metadata
				 * handler, so don't allow writes
				 * until it is cleared */
	WriteErrorSeen,		/* A write error has been seen on this
				 * device
				 */
	FaultRecorded,		/* Intermediate state for clearing
				 * Blocked.  The Fault is/will-be
				 * recorded in the metadata, but that
				 * metadata hasn't been stored safely
				 * on disk yet.
				 */
	BlockedBadBlocks,	/* A writer is blocked because they
				 * found an unacknowledged bad-block.
				 * This can safely be cleared at any
				 * time, and the writer will re-check.
				 * It may be set at any time, and at
				 * worst the writer will timeout and
				 * re-check.  So setting it as
				 * accurately as possible is good, but
				 * not absolutely critical.
				 */
	WantReplacement,	/* This device is a candidate to be
				 * hot-replaced, either because it has
				 * reported some faults, or because
				 * of explicit request.
				 */
	Replacement,		/* This device is a replacement for
				 * a want_replacement device with same
				 * raid_disk number.
				 */
};

#define BB_LEN_MASK	(0x00000000000001FFULL)
#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
@@ -428,7 +439,7 @@ struct md_personality
	 */
	void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
	int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
	int (*hot_remove_disk) (struct mddev *mddev, int number);
	int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
	int (*spare_active) (struct mddev *mddev);
	sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster);
	int (*resize) (struct mddev *mddev, sector_t sectors);
@@ -482,16 +493,21 @@ static inline char * mdname (struct mddev * mddev)
static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	char nm[20];
	if (!test_bit(Replacement, &rdev->flags)) {
		sprintf(nm, "rd%d", rdev->raid_disk);
		return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
	} else
		return 0;
}

static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	char nm[20];
	if (!test_bit(Replacement, &rdev->flags)) {
		sprintf(nm, "rd%d", rdev->raid_disk);
		sysfs_remove_link(&mddev->kobj, nm);
	}
}

/*
 * iterates through some rdev ringlist. It's safe to remove the
+3 −4
Original line number Diff line number Diff line
@@ -292,17 +292,16 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
	return err;
}

static int multipath_remove_disk(struct mddev *mddev, int number)
static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mpconf *conf = mddev->private;
	int err = 0;
	struct md_rdev *rdev;
	int number = rdev->raid_disk;
	struct multipath_info *p = conf->multipaths + number;

	print_multipath_conf(conf);

	rdev = p->rdev;
	if (rdev) {
	if (rdev == p->rdev) {
		if (test_bit(In_sync, &rdev->flags) ||
		    atomic_read(&rdev->nr_pending)) {
			printk(KERN_ERR "hot-remove-disk, slot %d is identified"
Loading