Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 39b99586 authored by Song Liu's avatar Song Liu Committed by Shaohua Li
Browse files

md/r5cache: improve journal device efficiency



It is important to be able to flush all stripes in raid5-cache.
Therefore, we need reserve some space on the journal device for
these flushes. If flush operation includes pending writes to the
stripe, we need to reserve (conf->raid_disk + 1) pages per stripe
for the flush out. This reduces the efficiency of journal space.
If we exclude these pending writes from flush operation, we only
need (conf->max_degraded + 1) pages per stripe.

With this patch, when log space is critical (R5C_LOG_CRITICAL=1),
pending writes will be excluded from stripe flush out. Therefore,
we can reduce reserved space for flush out and thus improve journal
device efficiency.

Signed-off-by: default avatarSong Liu <songliubraving@fb.com>
Signed-off-by: default avatarShaohua Li <shli@fb.com>
parent 03b047f4
Loading
Loading
Loading
Loading
+25 −11
Original line number Diff line number Diff line
@@ -389,17 +389,30 @@ void r5c_check_cached_full_stripe(struct r5conf *conf)
/*
 * Total log space (in sectors) needed to flush all data in cache
 *
 * Currently, writing-out phase automatically includes all pending writes
 * to the same sector. So the reclaim of each stripe takes up to
 * (conf->raid_disks + 1) pages of log space.
 * To avoid deadlock due to log space, it is necessary to reserve log
 * space to flush critical stripes (stripes that occupying log space near
 * last_checkpoint). This function helps check how much log space is
 * required to flush all cached stripes.
 *
 * To totally avoid deadlock due to log space, the code reserves
 * (conf->raid_disks + 1) pages for each stripe in cache, which is not
 * necessary in most cases.
 * To reduce log space requirements, two mechanisms are used to give cache
 * flush higher priorities:
 *    1. In handle_stripe_dirtying() and schedule_reconstruction(),
 *       stripes ALREADY in journal can be flushed w/o pending writes;
 *    2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal
 *       can be delayed (r5l_add_no_space_stripe).
 *
 * To improve this, we will need writing-out phase to be able to NOT include
 * pending writes, which will reduce the requirement to
 * (conf->max_degraded + 1) pages per stripe in cache.
 * In cache flush, the stripe goes through 1 and then 2. For a stripe that
 * already passed 1, flushing it requires at most (conf->max_degraded + 1)
 * pages of journal space. For stripes that has not passed 1, flushing it
 * requires (conf->raid_disks + 1) pages of journal space. There are at
 * most (conf->group_cnt + 1) stripe that passed 1. So total journal space
 * required to flush all cached stripes (in pages) is:
 *
 *     (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) +
 *     (group_cnt + 1) * (raid_disks + 1)
 * or
 *     (stripe_in_journal_count) * (max_degraded + 1) +
 *     (group_cnt + 1) * (raid_disks - max_degraded)
 */
static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
{
@@ -408,8 +421,9 @@ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf)
	if (!r5c_is_writeback(log))
		return 0;

	return BLOCK_SECTORS * (conf->raid_disks + 1) *
		atomic_read(&log->stripe_in_journal_count);
	return BLOCK_SECTORS *
		((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) +
		 (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1));
}

/*
+33 −9
Original line number Diff line number Diff line
@@ -2951,12 +2951,36 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
 *      like to flush data in journal to RAID disks first, so complex rmw
 *      is handled in the write patch (handle_stripe_dirtying).
 *
 *   2. when journal space is critical (R5C_LOG_CRITICAL=1)
 *
 *      It is important to be able to flush all stripes in raid5-cache.
 *      Therefore, we need reserve some space on the journal device for
 *      these flushes. If flush operation includes pending writes to the
 *      stripe, we need to reserve (conf->raid_disk + 1) pages per stripe
 *      for the flush out. If we exclude these pending writes from flush
 *      operation, we only need (conf->max_degraded + 1) pages per stripe.
 *      Therefore, excluding pending writes in these cases enables more
 *      efficient use of the journal device.
 *
 *      Note: To make sure the stripe makes progress, we only delay
 *      towrite for stripes with data already in journal (injournal > 0).
 *      When LOG_CRITICAL, stripes with injournal == 0 will be sent to
 *      no_space_stripes list.
 *
 */
static inline bool delay_towrite(struct r5dev *dev,
static inline bool delay_towrite(struct r5conf *conf,
				 struct r5dev *dev,
				 struct stripe_head_state *s)
{
	return !test_bit(R5_OVERWRITE, &dev->flags) &&
		!test_bit(R5_Insync, &dev->flags) && s->injournal;
	/* case 1 above */
	if (!test_bit(R5_OVERWRITE, &dev->flags) &&
	    !test_bit(R5_Insync, &dev->flags) && s->injournal)
		return true;
	/* case 2 above */
	if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
	    s->injournal > 0)
		return true;
	return false;
}

static void
@@ -2979,7 +3003,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];

			if (dev->towrite && !delay_towrite(dev, s)) {
			if (dev->towrite && !delay_towrite(conf, dev, s)) {
				set_bit(R5_LOCKED, &dev->flags);
				set_bit(R5_Wantdrain, &dev->flags);
				if (!expand)
@@ -3731,7 +3755,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,
	} else for (i = disks; i--; ) {
		/* would I have to read this buffer for read_modify_write */
		struct r5dev *dev = &sh->dev[i];
		if (((dev->towrite && !delay_towrite(dev, s)) ||
		if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
		     i == sh->pd_idx || i == sh->qd_idx ||
		     test_bit(R5_InJournal, &dev->flags)) &&
		    !test_bit(R5_LOCKED, &dev->flags) &&
@@ -3755,8 +3779,8 @@ static int handle_stripe_dirtying(struct r5conf *conf,
		}
	}

	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
		(unsigned long long)sh->sector, rmw, rcw);
	pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n",
		 (unsigned long long)sh->sector, sh->state, rmw, rcw);
	set_bit(STRIPE_HANDLE, &sh->state);
	if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
		/* prefer read-modify-write, but need to get some data */
@@ -3796,7 +3820,7 @@ static int handle_stripe_dirtying(struct r5conf *conf,

		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
			if (((dev->towrite && !delay_towrite(dev, s)) ||
			if (((dev->towrite && !delay_towrite(conf, dev, s)) ||
			     i == sh->pd_idx || i == sh->qd_idx ||
			     test_bit(R5_InJournal, &dev->flags)) &&
			    !test_bit(R5_LOCKED, &dev->flags) &&