
Commit dd553962 authored by Linus Torvalds
Pull MD fixes from Shaohua Li:
 "This fixes several corner cases for raid5 cache, which is merged into
  this cycle"

* tag 'md/4.10-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  md/r5cache: disable write back for degraded array
  md/r5cache: shift complex rmw from read path to write path
  md/r5cache: flush data only stripes in r5l_recovery_log()
  md/raid5: move comment of fetch_block to right location
  md/r5cache: read data into orig_page for prexor of cached data
  md/raid5-cache: delete meaningless code
parents 64a172d2 2e38a37f
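
For readers unfamiliar with the raid5 journal, the patches below revolve around its two caching modes, named in the diffs as R5C_JOURNAL_MODE_WRITE_THROUGH and R5C_JOURNAL_MODE_WRITE_BACK: in write-through mode the journal only closes the write hole, while in write-back mode dirty stripe data stays cached in the journal until it is flushed to the RAID disks. The central policy change in this series is that a degraded array must not run in write-back mode, because handling cached data on a degraded array introduces corner cases. A small user-space sketch of that policy (hypothetical names, not kernel code):

#include <stdio.h>

/* Mirrors the two journal modes named in the diffs below. */
enum journal_mode {
	JOURNAL_MODE_WRITE_THROUGH,
	JOURNAL_MODE_WRITE_BACK,
};

/* Hypothetical model: pick the effective mode given the degraded disk count. */
static enum journal_mode effective_mode(enum journal_mode requested, int degraded)
{
	/* A degraded array has lost redundancy; the series forces
	 * write-through to avoid write-back corner cases. */
	if (degraded > 0)
		return JOURNAL_MODE_WRITE_THROUGH;
	return requested;
}

int main(void)
{
	printf("healthy, want write-back  -> %d\n",
	       effective_mode(JOURNAL_MODE_WRITE_BACK, 0));	/* write-back */
	printf("degraded, want write-back -> %d\n",
	       effective_mode(JOURNAL_MODE_WRITE_BACK, 1));	/* write-through */
	return 0;
}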
drivers/md/md.c +5 −0
@@ -5291,6 +5291,11 @@ int md_run(struct mddev *mddev)
	if (start_readonly && mddev->ro == 0)
		mddev->ro = 2; /* read-only, but switch on first write */

	/*
	 * NOTE: some pers->run(), for example r5l_recovery_log(), wakes
	 * up mddev->thread. It is important to initialize critical
	 * resources for mddev->thread BEFORE calling pers->run().
	 */
	err = pers->run(mddev);
	if (err)
		pr_warn("md: pers->run() failed ...\n");
drivers/md/raid5-cache.c +88 −18
@@ -162,6 +162,8 @@ struct r5l_log {

	/* to submit async io_units, to fulfill ordering of flush */
	struct work_struct deferred_io_work;
	/* to disable write back in degraded mode */
	struct work_struct disable_writeback_work;
};

/*
@@ -611,6 +613,21 @@ static void r5l_submit_io_async(struct work_struct *work)
		r5l_do_submit_io(log, io);
}

static void r5c_disable_writeback_async(struct work_struct *work)
{
	struct r5l_log *log = container_of(work, struct r5l_log,
					   disable_writeback_work);
	struct mddev *mddev = log->rdev->mddev;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return;
	pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
		mdname(mddev));
	mddev_suspend(mddev);
	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
	mddev_resume(mddev);
}

static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
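
The new r5c_disable_writeback_async() above performs the mode switch from a workqueue item rather than in the error path itself, presumably because mddev_suspend() can sleep while raid5_error() (see the raid5.c hunks below, where r5c_update_on_rdev_error() schedules this work) runs in a context that must not block. A rough pthread model of the pattern, with made-up names: the non-blocking side only requests the change, a worker performs it:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int disable_requested;
static int write_back = 1;

static void on_disk_error(void)	/* must stay cheap, like raid5_error() */
{
	pthread_mutex_lock(&lock);
	disable_requested = 1;	/* analogous to schedule_work() */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

static void *writeback_disabler(void *arg)	/* analogous to the work item */
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!disable_requested)
		pthread_cond_wait(&cond, &lock);
	write_back = 0;	/* the blocking mode switch happens here */
	pthread_mutex_unlock(&lock);
	puts("worker: switched to write-through");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writeback_disabler, NULL);
	on_disk_error();
	pthread_join(&t, NULL);
	printf("write_back = %d\n", write_back);
	return 0;
}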
@@ -1393,8 +1410,6 @@ static void r5l_do_reclaim(struct r5l_log *log)
	next_checkpoint = r5c_calculate_new_cp(conf);
	spin_unlock_irq(&log->io_list_lock);

-	BUG_ON(reclaimable < 0);
-
	if (reclaimable == 0 || !write_super)
		return;

@@ -2062,7 +2077,7 @@ static int
r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
-	struct stripe_head *sh, *next;
+	struct stripe_head *sh;
	struct mddev *mddev = log->rdev->mddev;
	struct page *page;
	sector_t next_checkpoint = MaxSector;
@@ -2076,7 +2091,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,

	WARN_ON(list_empty(&ctx->cached_list));

-	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+	list_for_each_entry(sh, &ctx->cached_list, lru) {
		struct r5l_meta_block *mb;
		int i;
		int offset;
@@ -2126,14 +2141,39 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
		ctx->pos = write_pos;
		ctx->seq += 1;
		next_checkpoint = sh->log_start;
-		list_del_init(&sh->lru);
-		raid5_release_stripe(sh);
	}
	log->next_checkpoint = next_checkpoint;
	__free_page(page);
	return 0;
}

static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
						 struct r5l_recovery_ctx *ctx)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	struct stripe_head *sh, *next;

	if (ctx->data_only_stripes == 0)
		return;

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;

	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
		r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
		list_del_init(&sh->lru);
		raid5_release_stripe(sh);
	}

	md_wakeup_thread(conf->mddev->thread);
	/* reuse conf->wait_for_quiescent in recovery */
	wait_event(conf->wait_for_quiescent,
		   atomic_read(&conf->active_stripes) == 0);

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
}

static int r5l_recovery_log(struct r5l_log *log)
{
	struct mddev *mddev = log->rdev->mddev;
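
The new r5c_recovery_flush_data_only_stripes() above temporarily re-enters write-back mode, marks every cached stripe for write-out, and then sleeps on conf->wait_for_quiescent until active_stripes drains to zero before restoring write-through. The wait is the usual "block until a counter reaches zero" pattern; a user-space condition-variable model (made-up names, not kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t quiescent = PTHREAD_COND_INITIALIZER;
static int active_stripes = 3;

static void *stripe_writer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	active_stripes--;	/* one stripe finished writing out */
	if (active_stripes == 0)
		pthread_cond_signal(&quiescent);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t[3];
	int i;

	for (i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, stripe_writer, NULL);

	/* The flush path: block until every in-flight stripe has drained,
	 * mirroring wait_event(..., atomic_read(&active_stripes) == 0). */
	pthread_mutex_lock(&lock);
	while (active_stripes != 0)
		pthread_cond_wait(&quiescent, &lock);
	pthread_mutex_unlock(&lock);
	puts("all stripes drained; safe to switch back to write-through");

	for (i = 0; i < 3; i++)
		pthread_join(&t[i], NULL);
	return 0;
}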
@@ -2160,32 +2200,31 @@ static int r5l_recovery_log(struct r5l_log *log)
	pos = ctx.pos;
	ctx.seq += 10000;

-	if (ctx.data_only_stripes == 0) {
-		log->next_checkpoint = ctx.pos;
-		r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
-		ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
-	}
-
	if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
		pr_debug("md/raid:%s: starting from clean shutdown\n",
			 mdname(mddev));
-	else {
+	else
		pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
			 mdname(mddev), ctx.data_only_stripes,
			 ctx.data_parity_stripes);

-		if (ctx.data_only_stripes > 0)
-			if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
-				pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
-				       mdname(mddev));
-				return -EIO;
-			}
-	}
+	if (ctx.data_only_stripes == 0) {
+		log->next_checkpoint = ctx.pos;
+		r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
+		ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
+	} else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
+		pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
+		       mdname(mddev));
+		return -EIO;
+	}

	log->log_start = ctx.pos;
	log->seq = ctx.seq;
	log->last_checkpoint = pos;
	r5l_write_super(log, pos);

+	r5c_recovery_flush_data_only_stripes(log, &ctx);
	return 0;
}
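
r5l_ring_add(), used above to advance ctx.pos past the empty meta block, treats the journal as a circular buffer: advancing past the end wraps back to the start. A rough user-space model (simplified; the made-up last_block parameter stands in for the log device's size in sectors):

#include <stdio.h>
#include <stdint.h>

/* Hypothetical model of circular-log position arithmetic. */
static uint64_t ring_add(uint64_t last_block, uint64_t pos, uint64_t inc)
{
	pos += inc;
	if (pos >= last_block)
		pos -= last_block;	/* wrap around to the start of the ring */
	return pos;
}

int main(void)
{
	uint64_t last_block = 1024;	/* journal size in sectors (made up) */

	printf("%llu\n", (unsigned long long)ring_add(last_block, 1000, 8));	/* 1008 */
	printf("%llu\n", (unsigned long long)ring_add(last_block, 1020, 8));	/* 4: wrapped */
	return 0;
}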

@@ -2247,6 +2286,10 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
	    val > R5C_JOURNAL_MODE_WRITE_BACK)
		return -EINVAL;

	if (raid5_calc_degraded(conf) > 0 &&
	    val == R5C_JOURNAL_MODE_WRITE_BACK)
		return -EINVAL;

	mddev_suspend(mddev);
	conf->log->r5c_journal_mode = val;
	mddev_resume(mddev);
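
The new guard above refuses to switch a degraded array into write-back through the sysfs journal-mode attribute, matching the policy that r5c_update_on_rdev_error() (further down) enforces asynchronously when a disk fails. A compact user-space model of the store logic (hypothetical function, mirroring the two checks):

#include <errno.h>
#include <stdio.h>

enum { MODE_WRITE_THROUGH, MODE_WRITE_BACK };

/* Hypothetical model of r5c_journal_mode_store(): reject values outside the
 * known modes, and reject write-back while the array is degraded. */
static int journal_mode_store(int *current_mode, int val, int degraded)
{
	if (val < MODE_WRITE_THROUGH || val > MODE_WRITE_BACK)
		return -EINVAL;
	if (degraded > 0 && val == MODE_WRITE_BACK)
		return -EINVAL;
	*current_mode = val;	/* the kernel does this under mddev_suspend() */
	return 0;
}

int main(void)
{
	int mode = MODE_WRITE_THROUGH;

	printf("%d\n", journal_mode_store(&mode, MODE_WRITE_BACK, 0));	/* 0: ok */
	printf("%d\n", journal_mode_store(&mode, MODE_WRITE_BACK, 1));	/* -22: degraded */
	return 0;
}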
@@ -2301,6 +2344,16 @@ int r5c_try_caching_write(struct r5conf *conf,
		set_bit(STRIPE_R5C_CACHING, &sh->state);
	}

	/*
	 * When run in degraded mode, array is set to write-through mode.
	 * This check helps drain pending write safely in the transition to
	 * write-through mode.
	 */
	if (s->failed) {
		r5c_make_stripe_write_out(sh);
		return -EAGAIN;
	}

	for (i = disks; i--; ) {
		dev = &sh->dev[i];
		/* if non-overwrite, use writing-out phase */
@@ -2351,6 +2404,8 @@ void r5c_release_extra_page(struct stripe_head *sh)
			struct page *p = sh->dev[i].orig_page;

			sh->dev[i].orig_page = sh->dev[i].page;
			clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);

			if (!using_disk_info_extra_page)
				put_page(p);
		}
@@ -2555,6 +2610,19 @@ static int r5l_load_log(struct r5l_log *log)
	return ret;
}

void r5c_update_on_rdev_error(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;

	if (raid5_calc_degraded(conf) > 0 &&
	    conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
		schedule_work(&log->disable_writeback_work);
}

int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -2627,6 +2695,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
	spin_lock_init(&log->no_space_stripes_lock);

	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
	INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
	INIT_LIST_HEAD(&log->stripe_in_journal_list);
@@ -2659,6 +2728,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)

void r5l_exit_log(struct r5l_log *log)
{
	flush_work(&log->disable_writeback_work);
	md_unregister_thread(&log->reclaim_thread);
	mempool_destroy(log->meta_pool);
	bioset_free(log->bs);
drivers/md/raid5.c +94 −27
@@ -556,7 +556,7 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
-static int calc_degraded(struct r5conf *conf)
+int raid5_calc_degraded(struct r5conf *conf)
{
	int degraded, degraded2;
	int i;
@@ -619,7 +619,7 @@ static int has_failed(struct r5conf *conf)
	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

-	degraded = calc_degraded(conf);
+	degraded = raid5_calc_degraded(conf);
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
@@ -1015,6 +1015,16 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)

			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));

			if (!op_is_write(op) &&
			    test_bit(R5_InJournal, &sh->dev[i].flags))
				/*
				 * issuing read for a page in journal, this
				 * must be preparing for prexor in rmw; read
				 * the data into orig_page
				 */
				sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
			else
				sh->dev[i].vec.bv_page = sh->dev[i].page;
			bi->bi_vcnt = 1;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -2380,6 +2390,13 @@ static void raid5_end_read_request(struct bio * bi)
		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);

		if (test_bit(R5_InJournal, &sh->dev[i].flags))
			/*
			 * end read for a page in journal, this
			 * must be preparing for prexor in rmw
			 */
			set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);

		if (atomic_read(&rdev->read_errors))
			atomic_set(&rdev->read_errors, 0);
	} else {
@@ -2538,7 +2555,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)

	spin_lock_irqsave(&conf->device_lock, flags);
	clear_bit(In_sync, &rdev->flags);
-	mddev->degraded = calc_degraded(conf);
+	mddev->degraded = raid5_calc_degraded(conf);
	spin_unlock_irqrestore(&conf->device_lock, flags);
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);

@@ -2552,6 +2569,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
		bdevname(rdev->bdev, b),
		mdname(mddev),
		conf->raid_disks - mddev->degraded);
	r5c_update_on_rdev_error(mddev);
}

/*
@@ -2880,6 +2898,30 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
	return r_sector;
}

/*
 * There are cases where we want handle_stripe_dirtying() and
 * schedule_reconstruction() to delay towrite to some dev of a stripe.
 *
 * This function checks whether we want to delay the towrite. Specifically,
 * we delay the towrite when:
 *
 *   1. degraded stripe has a non-overwrite to the missing dev, AND this
 *      stripe has data in journal (for other devices).
 *
 *      In this case, when reading data for the non-overwrite dev, it is
 *      necessary to handle complex rmw of write back cache (prexor with
 *      orig_page, and xor with page). To keep read path simple, we would
 *      like to flush data in journal to RAID disks first, so complex rmw
 *      is handled in the write path (handle_stripe_dirtying).
 *
 */
static inline bool delay_towrite(struct r5dev *dev,
				   struct stripe_head_state *s)
{
	return !test_bit(R5_OVERWRITE, &dev->flags) &&
		!test_bit(R5_Insync, &dev->flags) && s->injournal;
}

static void
schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
			 int rcw, int expand)
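
delay_towrite() above is a pure predicate over device flags, so it is easy to model and test in isolation. A runnable sketch (flag bits reduced to booleans, made-up struct name): it fires only for a non-overwrite write aimed at a missing/out-of-sync device while the stripe still has journaled data:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical model of the r5dev flags delay_towrite() consults. */
struct r5dev_model {
	bool overwrite;	/* R5_OVERWRITE: the write covers the whole block */
	bool insync;	/* R5_Insync: the backing device is usable */
};

static bool delay_towrite(const struct r5dev_model *dev, int injournal)
{
	/* Delay only a non-overwrite to a missing/out-of-sync device while
	 * other devices of the stripe still have data in the journal. */
	return !dev->overwrite && !dev->insync && injournal > 0;
}

int main(void)
{
	struct r5dev_model missing_partial = { .overwrite = false, .insync = false };
	struct r5dev_model healthy_partial = { .overwrite = false, .insync = true };

	printf("%d\n", delay_towrite(&missing_partial, 2));	/* 1: delayed */
	printf("%d\n", delay_towrite(&healthy_partial, 2));	/* 0: proceeds */
	printf("%d\n", delay_towrite(&missing_partial, 0));	/* 0: nothing journaled */
	return 0;
}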
@@ -2900,7 +2942,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];

-			if (dev->towrite) {
+			if (dev->towrite && !delay_towrite(dev, s)) {
				set_bit(R5_LOCKED, &dev->flags);
				set_bit(R5_Wantdrain, &dev->flags);
				if (!expand)
@@ -3295,13 +3337,6 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
	return rv;
}

-/* fetch_block - checks the given member device to see if its data needs
- * to be read or computed to satisfy a request.
- *
- * Returns 1 when no more member devices need to be checked, otherwise returns
- * 0 to tell the loop in handle_stripe_fill to continue
- */
-
static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
			   int disk_idx, int disks)
{
@@ -3392,6 +3427,12 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
	return 0;
}

+/* fetch_block - checks the given member device to see if its data needs
+ * to be read or computed to satisfy a request.
+ *
+ * Returns 1 when no more member devices need to be checked, otherwise returns
+ * 0 to tell the loop in handle_stripe_fill to continue
+ */
static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
		       int disk_idx, int disks)
{
@@ -3478,10 +3519,26 @@ static void handle_stripe_fill(struct stripe_head *sh,
	 * midst of changing due to a write
	 */
	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
-	    !sh->reconstruct_state)
+	    !sh->reconstruct_state) {

		/*
		 * For degraded stripe with data in journal, do not handle
		 * read requests yet, instead, flush the stripe to raid
		 * disks first, this avoids handling complex rmw of write
		 * back cache (prexor with orig_page, and then xor with
		 * page) in the read path
		 */
		if (s->injournal && s->failed) {
			if (test_bit(STRIPE_R5C_CACHING, &sh->state))
				r5c_make_stripe_write_out(sh);
			goto out;
		}

		for (i = disks; i--; )
			if (fetch_block(sh, s, i, disks))
				break;
	}
out:
	set_bit(STRIPE_HANDLE, &sh->state);
}
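
The early-out added to handle_stripe_fill() above is the read-path half of the rmw shift: a stripe that is both degraded (s->failed) and has data in the journal (s->injournal) gets flushed to the raid disks instead of being read. A tiny decision model of that branch (hypothetical names, not kernel code):

#include <stdio.h>

/* Hypothetical model of the new early-out in handle_stripe_fill(). */
enum fill_action { FILL_READ_BLOCKS, FILL_WRITE_OUT_FIRST };

static enum fill_action fill_action(int failed, int injournal)
{
	if (injournal > 0 && failed > 0)
		return FILL_WRITE_OUT_FIRST;	/* avoid complex rmw in the read path */
	return FILL_READ_BLOCKS;
}

int main(void)
{
	printf("%d\n", fill_action(1, 2));	/* 1: write out first */
	printf("%d\n", fill_action(0, 2));	/* 0: normal reads */
	printf("%d\n", fill_action(1, 0));	/* 0: normal reads */
	return 0;
}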

@@ -3594,6 +3651,21 @@ static void handle_stripe_clean_event(struct r5conf *conf,
		break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
}

/*
 * For RMW in write back cache, we need extra page in prexor to store the
 * old data. This page is stored in dev->orig_page.
 *
 * This function checks whether we have data for prexor. The exact logic
 * is:
 *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
 */
static inline bool uptodate_for_rmw(struct r5dev *dev)
{
	return (test_bit(R5_UPTODATE, &dev->flags)) &&
		(!test_bit(R5_InJournal, &dev->flags) ||
		 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
}

static int handle_stripe_dirtying(struct r5conf *conf,
				  struct stripe_head *sh,
				  struct stripe_head_state *s,
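
uptodate_for_rmw() above replaces the old open-coded test (which compared dev->page against dev->orig_page) with the explicit R5_OrigPageUPTDODATE flag: prexor may only use orig_page when it actually holds the latest on-disk data. A runnable truth-table model of the predicate (flags reduced to booleans):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical model of uptodate_for_rmw() over the three flags involved. */
static bool uptodate_for_rmw(bool uptodate, bool in_journal, bool orig_uptodate)
{
	/* R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) */
	return uptodate && (!in_journal || orig_uptodate);
}

int main(void)
{
	/* Enumerate the truth table. */
	for (int u = 0; u <= 1; u++)
		for (int j = 0; j <= 1; j++)
			for (int o = 0; o <= 1; o++)
				printf("UPTODATE=%d InJournal=%d OrigPageUPTDODATE=%d -> %d\n",
				       u, j, o, uptodate_for_rmw(u, j, o));
	return 0;
}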
@@ -3622,12 +3694,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,
	} else for (i = disks; i--; ) {
		/* would I have to read this buffer for read_modify_write */
		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+		if (((dev->towrite && !delay_towrite(dev, s)) ||
+		     i == sh->pd_idx || i == sh->qd_idx ||
		     test_bit(R5_InJournal, &dev->flags)) &&
		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !((test_bit(R5_UPTODATE, &dev->flags) &&
-		       (!test_bit(R5_InJournal, &dev->flags) ||
-			dev->page != dev->orig_page)) ||
+		    !(uptodate_for_rmw(dev) ||
		      test_bit(R5_Wantcompute, &dev->flags))) {
			if (test_bit(R5_Insync, &dev->flags))
				rmw++;
@@ -3639,7 +3710,6 @@ static int handle_stripe_dirtying(struct r5conf *conf,
		    i != sh->pd_idx && i != sh->qd_idx &&
		    !test_bit(R5_LOCKED, &dev->flags) &&
		    !(test_bit(R5_UPTODATE, &dev->flags) ||
-		      test_bit(R5_InJournal, &dev->flags) ||
		      test_bit(R5_Wantcompute, &dev->flags))) {
			if (test_bit(R5_Insync, &dev->flags))
				rcw++;
@@ -3689,13 +3759,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,

		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
-			if ((dev->towrite ||
+			if (((dev->towrite && !delay_towrite(dev, s)) ||
			     i == sh->pd_idx || i == sh->qd_idx ||
			     test_bit(R5_InJournal, &dev->flags)) &&
			    !test_bit(R5_LOCKED, &dev->flags) &&
-			    !((test_bit(R5_UPTODATE, &dev->flags) &&
-			       (!test_bit(R5_InJournal, &dev->flags) ||
-				dev->page != dev->orig_page)) ||
+			    !(uptodate_for_rmw(dev) ||
			      test_bit(R5_Wantcompute, &dev->flags)) &&
			    test_bit(R5_Insync, &dev->flags)) {
				if (test_bit(STRIPE_PREREAD_ACTIVE,
@@ -3722,7 +3790,6 @@ static int handle_stripe_dirtying(struct r5conf *conf,
			    i != sh->pd_idx && i != sh->qd_idx &&
			    !test_bit(R5_LOCKED, &dev->flags) &&
			    !(test_bit(R5_UPTODATE, &dev->flags) ||
-			      test_bit(R5_InJournal, &dev->flags) ||
			      test_bit(R5_Wantcompute, &dev->flags))) {
				rcw++;
				if (test_bit(R5_Insync, &dev->flags) &&
@@ -7025,7 +7092,7 @@ static int raid5_run(struct mddev *mddev)
	/*
	 * 0 for a fully functional array, 1 or 2 for a degraded array.
	 */
-	mddev->degraded = calc_degraded(conf);
+	mddev->degraded = raid5_calc_degraded(conf);

	if (has_failed(conf)) {
		pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
@@ -7272,7 +7339,7 @@ static int raid5_spare_active(struct mddev *mddev)
		}
	}
	spin_lock_irqsave(&conf->device_lock, flags);
-	mddev->degraded = calc_degraded(conf);
+	mddev->degraded = raid5_calc_degraded(conf);
	spin_unlock_irqrestore(&conf->device_lock, flags);
	print_raid5_conf(conf);
	return count;
@@ -7632,7 +7699,7 @@ static int raid5_start_reshape(struct mddev *mddev)
		 * pre and post number of devices.
		 */
		spin_lock_irqsave(&conf->device_lock, flags);
-		mddev->degraded = calc_degraded(conf);
+		mddev->degraded = raid5_calc_degraded(conf);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}
	mddev->raid_disks = conf->raid_disks;
@@ -7720,7 +7787,7 @@ static void raid5_finish_reshape(struct mddev *mddev)
		} else {
			int d;
			spin_lock_irq(&conf->device_lock);
-			mddev->degraded = calc_degraded(conf);
+			mddev->degraded = raid5_calc_degraded(conf);
			spin_unlock_irq(&conf->device_lock);
			for (d = conf->raid_disks ;
			     d < conf->raid_disks - mddev->delta_disks;
drivers/md/raid5.h +7 −0
@@ -322,6 +322,11 @@ enum r5dev_flags {
			 * data and parity being written are in the journal
			 * device
			 */
	R5_OrigPageUPTDODATE,	/* with write back cache, we read old data into
				 * dev->orig_page for prexor. When this flag is
				 * set, orig_page contains latest data in the
				 * raid disk.
				 */
};

/*
@@ -753,6 +758,7 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
			int previous, int noblock, int noquiesce);
extern int raid5_calc_degraded(struct r5conf *conf);
extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
extern void r5l_exit_log(struct r5l_log *log);
extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
@@ -781,4 +787,5 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
extern void r5c_check_cached_full_stripe(struct r5conf *conf);
extern struct md_sysfs_entry r5c_journal_mode;
extern void r5c_update_on_rdev_error(struct mddev *mddev);
#endif