Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 6cce3b23 authored by NeilBrown's avatar NeilBrown Committed by Linus Torvalds
Browse files

[PATCH] md: write intent bitmap support for raid10



Signed-off-by: default avatarNeil Brown <neilb@suse.de>
Signed-off-by: default avatarAndrew Morton <akpm@osdl.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@osdl.org>
parent b15c2e57
Loading
Loading
Loading
Loading
+6 −4
Original line number Diff line number Diff line
@@ -714,9 +714,10 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_file == NULL) {
			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6) {
			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
			    && mddev->level != 10) {
				/* FIXME use a better test */
				printk(KERN_WARNING "md: bitmaps only support for raid1\n");
				printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
				return -EINVAL;
			}
			mddev->bitmap_offset = mddev->default_bitmap_offset;
@@ -1037,8 +1038,9 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)

		if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
		    mddev->bitmap_file == NULL ) {
			if (mddev->level != 1) {
				printk(KERN_WARNING "md: bitmaps only supported for raid1\n");
			if (mddev->level != 1 && mddev->level != 5 && mddev->level != 6
			    && mddev->level != 10) {
				printk(KERN_WARNING "md: bitmaps not supported for this level.\n");
				return -EINVAL;
			}
			mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
+157 −21
Original line number Diff line number Diff line
@@ -18,7 +18,9 @@
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "dm-bio-list.h"
#include <linux/raid/raid10.h>
#include <linux/raid/bitmap.h>

/*
 * RAID10 provides a combination of RAID0 and RAID1 functionality.
@@ -306,9 +308,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
	/*
	 * this branch is our 'one mirror IO has finished' event handler:
	 */
	if (!uptodate)
	if (!uptodate) {
		md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
	else
		/* an I/O failed, we can't clear the bitmap */
		set_bit(R10BIO_Degraded, &r10_bio->state);
	} else
		/*
		 * Set R10BIO_Uptodate in our master bio, so that
		 * we will return a good error code for to the higher
@@ -328,6 +332,11 @@ static int raid10_end_write_request(struct bio *bio, unsigned int bytes_done, in
	 * already.
	 */
	if (atomic_dec_and_test(&r10_bio->remaining)) {
		/* clear the bitmap if all writes complete successfully */
		bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
				r10_bio->sectors,
				!test_bit(R10BIO_Degraded, &r10_bio->state),
				0);
		md_write_end(r10_bio->mddev);
		raid_end_bio_io(r10_bio);
	}
@@ -486,8 +495,9 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
	rcu_read_lock();
	/*
	 * Check if we can balance. We can balance on the whole
	 * device if no resync is going on, or below the resync window.
	 * We take the first readable disk when above the resync window.
	 * device if no resync is going on (recovery is ok), or below
	 * the resync window. We take the first readable disk when
	 * above the resync window.
	 */
	if (conf->mddev->recovery_cp < MaxSector
	    && (this_sector + sectors >= conf->next_resync)) {
@@ -591,7 +601,10 @@ static void unplug_slaves(mddev_t *mddev)

static void raid10_unplug(request_queue_t *q)
{
	mddev_t *mddev = q->queuedata;

	unplug_slaves(q->queuedata);
	md_wakeup_thread(mddev->thread);
}

static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
@@ -647,12 +660,13 @@ static int raid10_issue_flush(request_queue_t *q, struct gendisk *disk,
 */
#define RESYNC_DEPTH 32

static void raise_barrier(conf_t *conf)
static void raise_barrier(conf_t *conf, int force)
{
	BUG_ON(force && !conf->barrier);
	spin_lock_irq(&conf->resync_lock);

	/* Wait until no block IO is waiting */
	wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
	/* Wait until no block IO is waiting (unless 'force') */
	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
			    conf->resync_lock,
			    raid10_unplug(conf->mddev->queue));

@@ -710,6 +724,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
	int i;
	int chunk_sects = conf->chunk_mask + 1;
	const int rw = bio_data_dir(bio);
	struct bio_list bl;
	unsigned long flags;

	if (unlikely(bio_barrier(bio))) {
		bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
@@ -767,6 +783,7 @@ static int make_request(request_queue_t *q, struct bio * bio)

	r10_bio->mddev = mddev;
	r10_bio->sector = bio->bi_sector;
	r10_bio->state = 0;

	if (rw == READ) {
		/*
@@ -811,13 +828,16 @@ static int make_request(request_queue_t *q, struct bio * bio)
		    !test_bit(Faulty, &rdev->flags)) {
			atomic_inc(&rdev->nr_pending);
			r10_bio->devs[i].bio = bio;
		} else
		} else {
			r10_bio->devs[i].bio = NULL;
			set_bit(R10BIO_Degraded, &r10_bio->state);
		}
	}
	rcu_read_unlock();

	atomic_set(&r10_bio->remaining, 1);
	atomic_set(&r10_bio->remaining, 0);

	bio_list_init(&bl);
	for (i = 0; i < conf->copies; i++) {
		struct bio *mbio;
		int d = r10_bio->devs[i].devnum;
@@ -835,13 +855,14 @@ static int make_request(request_queue_t *q, struct bio * bio)
		mbio->bi_private = r10_bio;

		atomic_inc(&r10_bio->remaining);
		generic_make_request(mbio);
		bio_list_add(&bl, mbio);
	}

	if (atomic_dec_and_test(&r10_bio->remaining)) {
		md_write_end(mddev);
		raid_end_bio_io(r10_bio);
	}
	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
	spin_lock_irqsave(&conf->device_lock, flags);
	bio_list_merge(&conf->pending_bio_list, &bl);
	blk_plug_device(mddev->queue);
	spin_unlock_irqrestore(&conf->device_lock, flags);

	return 0;
}
@@ -999,7 +1020,12 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
	if (!enough(conf))
		return 0;

	for (mirror=0; mirror < mddev->raid_disks; mirror++)
	if (rdev->saved_raid_disk >= 0 &&
	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
		mirror = rdev->saved_raid_disk;
	else
		mirror = 0;
	for ( ; mirror < mddev->raid_disks; mirror++)
		if ( !(p=conf->mirrors+mirror)->rdev) {

			blk_queue_stack_limits(mddev->queue,
@@ -1015,6 +1041,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
			p->head_position = 0;
			rdev->raid_disk = mirror;
			found = 1;
			if (rdev->saved_raid_disk != mirror)
				conf->fullsync = 1;
			rcu_assign_pointer(p->rdev, rdev);
			break;
		}
@@ -1282,6 +1310,26 @@ static void raid10d(mddev_t *mddev)
	for (;;) {
		char b[BDEVNAME_SIZE];
		spin_lock_irqsave(&conf->device_lock, flags);

		if (conf->pending_bio_list.head) {
			bio = bio_list_get(&conf->pending_bio_list);
			blk_remove_plug(mddev->queue);
			spin_unlock_irqrestore(&conf->device_lock, flags);
			/* flush any pending bitmap writes to disk before proceeding w/ I/O */
			if (bitmap_unplug(mddev->bitmap) != 0)
				printk("%s: bitmap file write failed!\n", mdname(mddev));

			while (bio) { /* submit pending writes */
				struct bio *next = bio->bi_next;
				bio->bi_next = NULL;
				generic_make_request(bio);
				bio = next;
			}
			unplug = 1;

			continue;
		}

		if (list_empty(head))
			break;
		r10_bio = list_entry(head->prev, r10bio_t, retry_list);
@@ -1388,6 +1436,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
	sector_t max_sector, nr_sectors;
	int disk;
	int i;
	int max_sync;
	int sync_blocks;

	sector_t sectors_skipped = 0;
	int chunks_skipped = 0;
@@ -1401,6 +1451,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
		max_sector = mddev->resync_max_sectors;
	if (sector_nr >= max_sector) {
		/* If we aborted, we need to abort the
		 * sync on the 'current' bitmap chucks (there can
		 * be several when recovering multiple devices).
		 * as we may have started syncing it but not finished.
		 * We can find the current address in
		 * mddev->curr_resync, but for recovery,
		 * we need to convert that to several
		 * virtual addresses.
		 */
		if (mddev->curr_resync < max_sector) { /* aborted */
			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
				bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
						&sync_blocks, 1);
			else for (i=0; i<conf->raid_disks; i++) {
				sector_t sect =
					raid10_find_virt(conf, mddev->curr_resync, i);
				bitmap_end_sync(mddev->bitmap, sect,
						&sync_blocks, 1);
			}
		} else /* completed sync */
			conf->fullsync = 0;

		bitmap_close_sync(mddev->bitmap);
		close_sync(conf);
		*skipped = 1;
		return sectors_skipped;
@@ -1425,8 +1498,6 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
	 */
	if (!go_faster && conf->nr_waiting)
		msleep_interruptible(1000);
	raise_barrier(conf);
	conf->next_resync = sector_nr;

	/* Again, very different code for resync and recovery.
	 * Both must result in an r10bio with a list of bios that
@@ -1443,6 +1514,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
	 * end_sync_write if we will want to write.
	 */

	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* recovery... the complicated one */
		int i, j, k;
@@ -1451,13 +1523,29 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
		for (i=0 ; i<conf->raid_disks; i++)
			if (conf->mirrors[i].rdev &&
			    !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) {
				int still_degraded = 0;
				/* want to reconstruct this device */
				r10bio_t *rb2 = r10_bio;
				sector_t sect = raid10_find_virt(conf, sector_nr, i);
				int must_sync;
				/* Unless we are doing a full sync, we only need
				 * to recover the block if it is set in the bitmap
				 */
				must_sync = bitmap_start_sync(mddev->bitmap, sect,
							      &sync_blocks, 1);
				if (sync_blocks < max_sync)
					max_sync = sync_blocks;
				if (!must_sync &&
				    !conf->fullsync) {
					/* yep, skip the sync_blocks here, but don't assume
					 * that there will never be anything to do here
					 */
					chunks_skipped = -1;
					continue;
				}

				r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
				spin_lock_irq(&conf->resync_lock);
				if (rb2) conf->barrier++;
				spin_unlock_irq(&conf->resync_lock);
				raise_barrier(conf, rb2 != NULL);
				atomic_set(&r10_bio->remaining, 0);

				r10_bio->master_bio = (struct bio*)rb2;
@@ -1465,8 +1553,21 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
					atomic_inc(&rb2->remaining);
				r10_bio->mddev = mddev;
				set_bit(R10BIO_IsRecover, &r10_bio->state);
				r10_bio->sector = raid10_find_virt(conf, sector_nr, i);
				r10_bio->sector = sect;

				raid10_find_phys(conf, r10_bio);
				/* Need to check if this section will still be
				 * degraded
				 */
				for (j=0; j<conf->copies;j++) {
					int d = r10_bio->devs[j].devnum;
					if (conf->mirrors[d].rdev == NULL ||
					    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
						still_degraded = 1;
				}
				must_sync = bitmap_start_sync(mddev->bitmap, sect,
							      &sync_blocks, still_degraded);

				for (j=0; j<conf->copies;j++) {
					int d = r10_bio->devs[j].devnum;
					if (conf->mirrors[d].rdev &&
@@ -1526,10 +1627,22 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
	} else {
		/* resync. Schedule a read for every block at this virt offset */
		int count = 0;

		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
				       &sync_blocks, mddev->degraded) &&
		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
			/* We can skip this block */
			*skipped = 1;
			return sync_blocks + sectors_skipped;
		}
		if (sync_blocks < max_sync)
			max_sync = sync_blocks;
		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);

		r10_bio->mddev = mddev;
		atomic_set(&r10_bio->remaining, 0);
		raise_barrier(conf, 0);
		conf->next_resync = sector_nr;

		r10_bio->master_bio = NULL;
		r10_bio->sector = sector_nr;
@@ -1582,6 +1695,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i
	}

	nr_sectors = 0;
	if (sector_nr + max_sync < max_sector)
		max_sector = sector_nr + max_sync;
	do {
		struct page *page;
		int len = PAGE_SIZE;
@@ -1821,6 +1936,26 @@ static int stop(mddev_t *mddev)
	return 0;
}

static void raid10_quiesce(mddev_t *mddev, int state)
{
	conf_t *conf = mddev_to_conf(mddev);

	switch(state) {
	case 1:
		raise_barrier(conf, 0);
		break;
	case 0:
		lower_barrier(conf);
		break;
	}
	if (mddev->thread) {
		if (mddev->bitmap)
			mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
		else
			mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
		md_wakeup_thread(mddev->thread);
	}
}

static mdk_personality_t raid10_personality =
{
@@ -1835,6 +1970,7 @@ static mdk_personality_t raid10_personality =
	.hot_remove_disk= raid10_remove_disk,
	.spare_active	= raid10_spare_active,
	.sync_request	= sync_request,
	.quiesce	= raid10_quiesce,
};

static int __init raid_init(void)
+8 −1
Original line number Diff line number Diff line
@@ -35,13 +35,19 @@ struct r10_private_data_s {
	sector_t chunk_mask;

	struct list_head	retry_list;
	/* for use when syncing mirrors: */
	/* queue pending writes and submit them on unplug */
	struct bio_list		pending_bio_list;


	spinlock_t		resync_lock;
	int nr_pending;
	int nr_waiting;
	int barrier;
	sector_t		next_resync;
	int			fullsync;  /* set to 1 if a full sync is needed,
					    * (fresh device added).
					    * Cleared when a sync completes.
					    */

	wait_queue_head_t	wait_barrier;

@@ -100,4 +106,5 @@ struct r10bio_s {
#define	R10BIO_Uptodate	0
#define	R10BIO_IsSync	1
#define	R10BIO_IsRecover 2
#define	R10BIO_Degraded 3
#endif