
Commit e9c7469b authored by Tejun Heo, committed by Jens Axboe

md: implement REQ_FLUSH/FUA support



This patch converts md to support REQ_FLUSH/FUA instead of the
now-deprecated REQ_HARDBARRIER.  In the core part (md.c), the
following changes are notable.

* Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA requests don't interfere with
  the processing of other requests, so there is no reason to mark the
  queue congested while a FLUSH/FUA request is in progress.

* REQ_FLUSH/FUA failures are final, so their users don't need retry
  logic.  The retry logic is removed.

* A preflush needs to be issued to all member devices, but FUA writes
  can be handled the same way as other writes: their processing can be
  deferred to the request_queue of each member device.
  md_barrier_request() is renamed to md_flush_request() and simplified
  accordingly (condensed in the sketch after this list).
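
  In condensed form, the flow md_flush_request() implements (the full
  version is in the md.c diff below) looks roughly like this.  This is
  an illustrative sketch, not verbatim kernel code:
  wait_for_no_pending_flush() and for_each_member_rdev() are stand-ins
  for the real write_lock and RCU iteration.

	/* A REQ_FLUSH bio first fans out empty WRITE_FLUSH bios to every
	 * member device; the completion of the last one schedules a
	 * worker (md_submit_flush_data) that re-issues the data portion
	 * with REQ_FLUSH cleared.  FUA needs no fan-out: the bit stays
	 * on the bio and is honoured by each member's own request_queue.
	 */
	void md_flush_request(mddev_t *mddev, struct bio *bio)
	{
		wait_for_no_pending_flush(mddev);  /* one flush at a time */
		mddev->flush_bio = bio;

		atomic_set(&mddev->flush_pending, 1);
		for_each_member_rdev(mddev, rdev) {
			struct bio *fb = bio_alloc(GFP_KERNEL, 0); /* empty preflush */
			fb->bi_bdev = rdev->bdev;
			fb->bi_end_io = md_end_flush;  /* drops flush_pending */
			atomic_inc(&mddev->flush_pending);
			submit_bio(WRITE_FLUSH, fb);
		}
		if (atomic_dec_and_test(&mddev->flush_pending))
			schedule_work(&mddev->flush_work);
	}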

For linear, raid0 and multipath, the core changes are enough.  raid1,
raid5 and raid10 need the following conversions.

* raid1: Handling of FLUSH/FUA bios can simply be deferred to the
  request_queues of the member devices.  Barrier-related logic is
  removed.

* raid5: Queue-draining logic is dropped.  The FUA bit is propagated
  through the bio drain and stripe reconstruction such that all the
  updated parts of the stripe are written out with FUA writes if any
  of the dirtying writes was FUA.  preread_active_stripes handling in
  make_request() is updated as suggested by Neil Brown.

* raid10: The FUA bit needs to be propagated to the write clones (see
  the sketch after this list).
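
  raid5.c and raid10.c are not part of this excerpt, so the following
  is only a hedged sketch of what propagating FUA to the write clones
  amounts to; the variable names and the exact flag mask are
  illustrative, not the commit's verbatim code.

	/* Illustrative only: cloning an incoming write for one mirror.
	 * The per-mirror clone must inherit the caller's REQ_FUA bit so
	 * the member device's request_queue can honour it.
	 */
	const unsigned long do_fua = bio->bi_rw & REQ_FUA;
	struct bio *mbio = bio_clone(bio, GFP_NOIO);

	mbio->bi_rw = WRITE | do_fua;	/* keep FUA on the clone */
	mbio->bi_bdev = rdev->bdev;	/* target member device */
	generic_make_request(mbio);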

linear, raid0, raid1, raid5 and raid10 have been tested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
parent 7bc9fdda
drivers/md/linear.c +2 −2
@@ -294,8 +294,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
 	dev_info_t *tmp_dev;
 	sector_t start_sector;
 
-	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
-		md_barrier_request(mddev, bio);
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
 		return 0;
 	}
 
drivers/md/md.c +25 −92
@@ -226,12 +226,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
 		return 0;
 	}
 	rcu_read_lock();
-	if (mddev->suspended || mddev->barrier) {
+	if (mddev->suspended) {
 		DEFINE_WAIT(__wait);
 		for (;;) {
 			prepare_to_wait(&mddev->sb_wait, &__wait,
 					TASK_UNINTERRUPTIBLE);
-			if (!mddev->suspended && !mddev->barrier)
+			if (!mddev->suspended)
 				break;
 			rcu_read_unlock();
 			schedule();
@@ -282,40 +282,29 @@ EXPORT_SYMBOL_GPL(mddev_resume);
 
 int mddev_congested(mddev_t *mddev, int bits)
 {
-	if (mddev->barrier)
-		return 1;
 	return mddev->suspended;
 }
 EXPORT_SYMBOL(mddev_congested);
 
 /*
- * Generic barrier handling for md
+ * Generic flush handling for md
  */
 
-#define POST_REQUEST_BARRIER ((void*)1)
-
-static void md_end_barrier(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio, int err)
 {
 	mdk_rdev_t *rdev = bio->bi_private;
 	mddev_t *mddev = rdev->mddev;
-	if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
-		set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
 
 	rdev_dec_pending(rdev, mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		if (mddev->barrier == POST_REQUEST_BARRIER) {
-			/* This was a post-request barrier */
-			mddev->barrier = NULL;
-			wake_up(&mddev->sb_wait);
-		} else
-			/* The pre-request barrier has finished */
-			schedule_work(&mddev->barrier_work);
+		/* The pre-request flush has finished */
+		schedule_work(&mddev->flush_work);
 	}
 	bio_put(bio);
 }
 
-static void submit_barriers(mddev_t *mddev)
+static void submit_flushes(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
 
@@ -332,60 +321,56 @@ static void submit_barriers(mddev_t *mddev)
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
 			bi = bio_alloc(GFP_KERNEL, 0);
-			bi->bi_end_io = md_end_barrier;
+			bi->bi_end_io = md_end_flush;
 			bi->bi_private = rdev;
 			bi->bi_bdev = rdev->bdev;
 			atomic_inc(&mddev->flush_pending);
-			submit_bio(WRITE_BARRIER, bi);
+			submit_bio(WRITE_FLUSH, bi);
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
 }
 
-static void md_submit_barrier(struct work_struct *ws)
+static void md_submit_flush_data(struct work_struct *ws)
 {
-	mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
-	struct bio *bio = mddev->barrier;
+	mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+	struct bio *bio = mddev->flush_bio;
 
 	atomic_set(&mddev->flush_pending, 1);
 
-	if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
-		bio_endio(bio, -EOPNOTSUPP);
-	else if (bio->bi_size == 0)
+	if (bio->bi_size == 0)
 		/* an empty barrier - all done */
 		bio_endio(bio, 0);
 	else {
-		bio->bi_rw &= ~REQ_HARDBARRIER;
+		bio->bi_rw &= ~REQ_FLUSH;
 		if (mddev->pers->make_request(mddev, bio))
 			generic_make_request(bio);
-		mddev->barrier = POST_REQUEST_BARRIER;
-		submit_barriers(mddev);
 	}
 	if (atomic_dec_and_test(&mddev->flush_pending)) {
-		mddev->barrier = NULL;
+		mddev->flush_bio = NULL;
 		wake_up(&mddev->sb_wait);
 	}
 }
 
-void md_barrier_request(mddev_t *mddev, struct bio *bio)
+void md_flush_request(mddev_t *mddev, struct bio *bio)
 {
 	spin_lock_irq(&mddev->write_lock);
 	wait_event_lock_irq(mddev->sb_wait,
-			    !mddev->barrier,
+			    !mddev->flush_bio,
 			    mddev->write_lock, /*nothing*/);
-	mddev->barrier = bio;
+	mddev->flush_bio = bio;
 	spin_unlock_irq(&mddev->write_lock);
 
 	atomic_set(&mddev->flush_pending, 1);
-	INIT_WORK(&mddev->barrier_work, md_submit_barrier);
+	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
 
-	submit_barriers(mddev);
+	submit_flushes(mddev);
 
 	if (atomic_dec_and_test(&mddev->flush_pending))
-		schedule_work(&mddev->barrier_work);
+		schedule_work(&mddev->flush_work);
 }
-EXPORT_SYMBOL(md_barrier_request);
+EXPORT_SYMBOL(md_flush_request);
 
 /* Support for plugging.
  * This mirrors the plugging support in request_queue, but does not
@@ -696,31 +681,6 @@ static void super_written(struct bio *bio, int error)
 	bio_put(bio);
 }
 
-static void super_written_barrier(struct bio *bio, int error)
-{
-	struct bio *bio2 = bio->bi_private;
-	mdk_rdev_t *rdev = bio2->bi_private;
-	mddev_t *mddev = rdev->mddev;
-
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
-	    error == -EOPNOTSUPP) {
-		unsigned long flags;
-		/* barriers don't appear to be supported :-( */
-		set_bit(BarriersNotsupp, &rdev->flags);
-		mddev->barriers_work = 0;
-		spin_lock_irqsave(&mddev->write_lock, flags);
-		bio2->bi_next = mddev->biolist;
-		mddev->biolist = bio2;
-		spin_unlock_irqrestore(&mddev->write_lock, flags);
-		wake_up(&mddev->sb_wait);
-		bio_put(bio);
-	} else {
-		bio_put(bio2);
-		bio->bi_private = rdev;
-		super_written(bio, error);
-	}
-}
-
 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 		   sector_t sector, int size, struct page *page)
 {
@@ -729,51 +689,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	 * and decrement it on completion, waking up sb_wait
 	 * if zero is reached.
 	 * If an error occurred, call md_error
-	 *
-	 * As we might need to resubmit the request if REQ_HARDBARRIER
-	 * causes ENOTSUPP, we allocate a spare bio...
 	 */
 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
-	int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
 
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_sector = sector;
 	bio_add_page(bio, page, size, 0);
 	bio->bi_private = rdev;
 	bio->bi_end_io = super_written;
-	bio->bi_rw = rw;
 
 	atomic_inc(&mddev->pending_writes);
-	if (!test_bit(BarriersNotsupp, &rdev->flags)) {
-		struct bio *rbio;
-		rw |= REQ_HARDBARRIER;
-		rbio = bio_clone(bio, GFP_NOIO);
-		rbio->bi_private = bio;
-		rbio->bi_end_io = super_written_barrier;
-		submit_bio(rw, rbio);
-	} else
-		submit_bio(rw, bio);
+	submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
+		   bio);
 }
 
 void md_super_wait(mddev_t *mddev)
 {
-	/* wait for all superblock writes that were scheduled to complete.
-	 * if any had to be retried (due to BARRIER problems), retry them
-	 */
+	/* wait for all superblock writes that were scheduled to complete */
 	DEFINE_WAIT(wq);
 	for(;;) {
 		prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
 		if (atomic_read(&mddev->pending_writes)==0)
 			break;
-		while (mddev->biolist) {
-			struct bio *bio;
-			spin_lock_irq(&mddev->write_lock);
-			bio = mddev->biolist;
-			mddev->biolist = bio->bi_next ;
-			bio->bi_next = NULL;
-			spin_unlock_irq(&mddev->write_lock);
-			submit_bio(bio->bi_rw, bio);
-		}
 		schedule();
 	}
 	finish_wait(&mddev->sb_wait, &wq);
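
The rewritten md_super_write() above no longer needs a fallback path:
REQ_FLUSH drains the device write cache before the superblock write is
processed, and REQ_FUA forces the write itself to stable media before
completion.  Assuming the WRITE_FLUSH_FUA convenience macro from the
same patch series is available in this tree, the submission could
equally be spelled:

	/* Assumed helper, analogous to the WRITE_FLUSH macro used by
	 * submit_flushes() above: it bundles the write, sync, flush and
	 * FUA flags into a single mask.
	 */
	submit_bio(WRITE_FLUSH_FUA, bio);
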
@@ -1070,7 +1007,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	clear_bit(Faulty, &rdev->flags);
 	clear_bit(In_sync, &rdev->flags);
 	clear_bit(WriteMostly, &rdev->flags);
-	clear_bit(BarriersNotsupp, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 0;
@@ -1485,7 +1421,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 	clear_bit(Faulty, &rdev->flags);
 	clear_bit(In_sync, &rdev->flags);
 	clear_bit(WriteMostly, &rdev->flags);
-	clear_bit(BarriersNotsupp, &rdev->flags);
 
 	if (mddev->raid_disks == 0) {
 		mddev->major_version = 1;
@@ -4506,7 +4441,6 @@ int md_run(mddev_t *mddev)
 	/* may be over-ridden by personality */
 	mddev->resync_max_sectors = mddev->dev_sectors;
 
-	mddev->barriers_work = 1;
 	mddev->ok_start_degraded = start_dirty_degraded;
 
 	if (start_readonly && mddev->ro == 0)
@@ -4685,7 +4619,6 @@ static void md_clean(mddev_t *mddev)
 	mddev->recovery = 0;
 	mddev->in_sync = 0;
 	mddev->degraded = 0;
-	mddev->barriers_work = 0;
 	mddev->safemode = 0;
 	mddev->bitmap_info.offset = 0;
 	mddev->bitmap_info.default_offset = 0;
drivers/md/md.h +6 −17
@@ -87,7 +87,6 @@ struct mdk_rdev_s
 #define	Faulty		1		/* device is known to have a fault */
 #define	In_sync		2		/* device is in_sync with rest of array */
 #define	WriteMostly	4		/* Avoid reading if at all possible */
-#define	BarriersNotsupp	5		/* REQ_HARDBARRIER is not supported */
 #define	AllReserved	6		/* If whole device is reserved for
 					 * one array */
 #define	AutoDetected	7		/* added by auto-detect */
@@ -273,13 +272,6 @@ struct mddev_s
 	int				degraded;	/* whether md should consider
 							 * adding a spare
 							 */
-	int				barriers_work;	/* initialised to true, cleared as soon
-							 * as a barrier request to slave
-							 * fails.  Only supported
-							 */
-	struct bio			*biolist; 	/* bios that need to be retried
-							 * because REQ_HARDBARRIER is not supported
-							 */
 
 	atomic_t			recovery_active; /* blocks scheduled, but not written */
 	wait_queue_head_t		recovery_wait;
@@ -339,16 +331,13 @@ struct mddev_s
 	struct attribute_group		*to_remove;
 	struct plug_handle		*plug; /* if used by personality */
 
-	/* Generic barrier handling.
-	 * If there is a pending barrier request, all other
-	 * writes are blocked while the devices are flushed.
-	 * The last to finish a flush schedules a worker to
-	 * submit the barrier request (without the barrier flag),
-	 * then submit more flush requests.
+	/* Generic flush handling.
+	 * The last to finish preflush schedules a worker to submit
+	 * the rest of the request (without the REQ_FLUSH flag).
 	 */
-	struct bio *barrier;
+	struct bio *flush_bio;
 	atomic_t flush_pending;
-	struct work_struct barrier_work;
+	struct work_struct flush_work;
 	struct work_struct event_work;	/* used by dm to report failure event */
 };
 
@@ -502,7 +491,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
 
 extern int mddev_congested(mddev_t *mddev, int bits);
-extern void md_barrier_request(mddev_t *mddev, struct bio *bio);
+extern void md_flush_request(mddev_t *mddev, struct bio *bio);
 extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 			   sector_t sector, int size, struct page *page);
 extern void md_super_wait(mddev_t *mddev);
drivers/md/multipath.c +2 −2
@@ -142,8 +142,8 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
 
-	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
-		md_barrier_request(mddev, bio);
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
 		return 0;
 	}
 
drivers/md/raid0.c +2 −2
@@ -483,8 +483,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
 	struct strip_zone *zone;
 	mdk_rdev_t *tmp_dev;
 
-	if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
-		md_barrier_request(mddev, bio);
+	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
+		md_flush_request(mddev, bio);
 		return 0;
 	}
 