
Commit dd553962 authored by Linus Torvalds
Pull MD fixes from Shaohua Li:
 "This fixes several corner cases for raid5 cache, which is merged into
  this cycle"

* tag 'md/4.10-rc6' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  md/r5cache: disable write back for degraded array
  md/r5cache: shift complex rmw from read path to write path
  md/r5cache: flush data only stripes in r5l_recovery_log()
  md/raid5: move comment of fetch_block to right location
  md/r5cache: read data into orig_page for prexor of cached data
  md/raid5-cache: delete meaningless code
parents 64a172d2 2e38a37f
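
For readers unfamiliar with the raid5 journal, the patches below revolve around its two caching modes, named in the diffs as R5C_JOURNAL_MODE_WRITE_THROUGH and R5C_JOURNAL_MODE_WRITE_BACK: in write-through mode the journal only closes the write hole, while in write-back mode dirty stripe data stays cached in the journal until it is flushed to the RAID disks. The central policy change in this series is that a degraded array must not run in write-back mode, because handling cached data on a degraded array introduces corner cases. A small user-space sketch of that policy (hypothetical names, not kernel code):

#include <stdio.h>

/* Mirrors the two journal modes named in the diffs below. */
enum journal_mode {
	JOURNAL_MODE_WRITE_THROUGH,
	JOURNAL_MODE_WRITE_BACK,
};

/* Hypothetical model: pick the effective mode given the degraded disk count. */
static enum journal_mode effective_mode(enum journal_mode requested, int degraded)
{
	/* A degraded array has lost redundancy; the series forces
	 * write-through to avoid write-back corner cases. */
	if (degraded > 0)
		return JOURNAL_MODE_WRITE_THROUGH;
	return requested;
}

int main(void)
{
	printf("healthy, want write-back  -> %d\n",
	       effective_mode(JOURNAL_MODE_WRITE_BACK, 0));	/* write-back */
	printf("degraded, want write-back -> %d\n",
	       effective_mode(JOURNAL_MODE_WRITE_BACK, 1));	/* write-through */
	return 0;
}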
drivers/md/md.c +5 −0
@@ -5291,6 +5291,11 @@ int md_run(struct mddev *mddev)
	if (start_readonly && mddev->ro == 0)
		mddev->ro = 2; /* read-only, but switch on first write */

	/*
	 * NOTE: some pers->run(), for example r5l_recovery_log(), wakes
	 * up mddev->thread. It is important to initialize critical
	 * resources for mddev->thread BEFORE calling pers->run().
	 */
	err = pers->run(mddev);
	if (err)
		pr_warn("md: pers->run() failed ...\n");
drivers/md/raid5-cache.c +88 −18
@@ -162,6 +162,8 @@ struct r5l_log {

	/* to submit async io_units, to fulfill ordering of flush */
	struct work_struct deferred_io_work;
	/* to disable write back in degraded mode */
	struct work_struct disable_writeback_work;
};

/*
@@ -611,6 +613,21 @@ static void r5l_submit_io_async(struct work_struct *work)
		r5l_do_submit_io(log, io);
}

static void r5c_disable_writeback_async(struct work_struct *work)
{
	struct r5l_log *log = container_of(work, struct r5l_log,
					   disable_writeback_work);
	struct mddev *mddev = log->rdev->mddev;

	if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
		return;
	pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
		mdname(mddev));
	mddev_suspend(mddev);
	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
	mddev_resume(mddev);
}

static void r5l_submit_current_io(struct r5l_log *log)
{
	struct r5l_io_unit *io = log->current_io;
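
The new r5c_disable_writeback_async() above performs the mode switch from a workqueue item rather than in the error path itself, presumably because mddev_suspend() can sleep while raid5_error() (see the raid5.c hunks below, where r5c_update_on_rdev_error() schedules this work) runs in a context that must not block. A rough pthread model of the pattern, with made-up names: the non-blocking side only requests the change, a worker performs it:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int disable_requested;
static int write_back = 1;

static void on_disk_error(void)	/* must stay cheap, like raid5_error() */
{
	pthread_mutex_lock(&lock);
	disable_requested = 1;	/* analogous to schedule_work() */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

static void *writeback_disabler(void *arg)	/* analogous to the work item */
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!disable_requested)
		pthread_cond_wait(&cond, &lock);
	write_back = 0;	/* the blocking mode switch happens here */
	pthread_mutex_unlock(&lock);
	puts("worker: switched to write-through");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writeback_disabler, NULL);
	on_disk_error();
	pthread_join(&t, NULL);
	printf("write_back = %d\n", write_back);
	return 0;
}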
@@ -1393,8 +1410,6 @@ static void r5l_do_reclaim(struct r5l_log *log)
	next_checkpoint = r5c_calculate_new_cp(conf);
	spin_unlock_irq(&log->io_list_lock);

-	BUG_ON(reclaimable < 0);
-
	if (reclaimable == 0 || !write_super)
		return;

@@ -2062,7 +2077,7 @@ static int
r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
				       struct r5l_recovery_ctx *ctx)
{
-	struct stripe_head *sh, *next;
+	struct stripe_head *sh;
	struct mddev *mddev = log->rdev->mddev;
	struct page *page;
	sector_t next_checkpoint = MaxSector;
@@ -2076,7 +2091,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,

	WARN_ON(list_empty(&ctx->cached_list));

-	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+	list_for_each_entry(sh, &ctx->cached_list, lru) {
		struct r5l_meta_block *mb;
		int i;
		int offset;
@@ -2126,14 +2141,39 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
		ctx->pos = write_pos;
		ctx->seq += 1;
		next_checkpoint = sh->log_start;
-		list_del_init(&sh->lru);
-		raid5_release_stripe(sh);
	}
	log->next_checkpoint = next_checkpoint;
	__free_page(page);
	return 0;
}

static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
						 struct r5l_recovery_ctx *ctx)
{
	struct mddev *mddev = log->rdev->mddev;
	struct r5conf *conf = mddev->private;
	struct stripe_head *sh, *next;

	if (ctx->data_only_stripes == 0)
		return;

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_BACK;

	list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
		r5c_make_stripe_write_out(sh);
		set_bit(STRIPE_HANDLE, &sh->state);
		list_del_init(&sh->lru);
		raid5_release_stripe(sh);
	}

	md_wakeup_thread(conf->mddev->thread);
	/* reuse conf->wait_for_quiescent in recovery */
	wait_event(conf->wait_for_quiescent,
		   atomic_read(&conf->active_stripes) == 0);

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
}

static int r5l_recovery_log(struct r5l_log *log)
{
	struct mddev *mddev = log->rdev->mddev;
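
The new r5c_recovery_flush_data_only_stripes() above temporarily re-enters write-back mode, marks every cached stripe for write-out, and then sleeps on conf->wait_for_quiescent until active_stripes drains to zero before restoring write-through. The wait is the usual "block until a counter reaches zero" pattern; a user-space condition-variable model (made-up names, not kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t quiescent = PTHREAD_COND_INITIALIZER;
static int active_stripes = 3;

static void *stripe_writer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	active_stripes--;	/* one stripe finished writing out */
	if (active_stripes == 0)
		pthread_cond_signal(&quiescent);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t[3];
	int i;

	for (i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, stripe_writer, NULL);

	/* The flush path: block until every in-flight stripe has drained,
	 * mirroring wait_event(..., atomic_read(&active_stripes) == 0). */
	pthread_mutex_lock(&lock);
	while (active_stripes != 0)
		pthread_cond_wait(&quiescent, &lock);
	pthread_mutex_unlock(&lock);
	puts("all stripes drained; safe to switch back to write-through");

	for (i = 0; i < 3; i++)
		pthread_join(&t[i], NULL);
	return 0;
}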
@@ -2160,32 +2200,31 @@ static int r5l_recovery_log(struct r5l_log *log)
	pos = ctx.pos;
	ctx.seq += 10000;

-	if (ctx.data_only_stripes == 0) {
-		log->next_checkpoint = ctx.pos;
-		r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
-		ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
-	}
-
	if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
		pr_debug("md/raid:%s: starting from clean shutdown\n",
			 mdname(mddev));
-	else {
+	else
		pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
			 mdname(mddev), ctx.data_only_stripes,
			 ctx.data_parity_stripes);

-		if (ctx.data_only_stripes > 0)
-			if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
-				pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
-				       mdname(mddev));
-				return -EIO;
-			}
-	}
+	if (ctx.data_only_stripes == 0) {
+		log->next_checkpoint = ctx.pos;
+		r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
+		ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
+	} else if (r5c_recovery_rewrite_data_only_stripes(log, &ctx)) {
+		pr_err("md/raid:%s: failed to rewrite stripes to journal\n",
+		       mdname(mddev));
+		return -EIO;
+	}

	log->log_start = ctx.pos;
	log->seq = ctx.seq;
	log->last_checkpoint = pos;
	r5l_write_super(log, pos);

+	r5c_recovery_flush_data_only_stripes(log, &ctx);
	return 0;
}
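
r5l_ring_add(), used above to advance ctx.pos past the empty meta block, treats the journal as a circular buffer: advancing past the end wraps back to the start. A rough user-space model (simplified; the made-up last_block parameter stands in for the log device's size in sectors):

#include <stdio.h>
#include <stdint.h>

/* Hypothetical model of circular-log position arithmetic. */
static uint64_t ring_add(uint64_t last_block, uint64_t pos, uint64_t inc)
{
	pos += inc;
	if (pos >= last_block)
		pos -= last_block;	/* wrap around to the start of the ring */
	return pos;
}

int main(void)
{
	uint64_t last_block = 1024;	/* journal size in sectors (made up) */

	printf("%llu\n", (unsigned long long)ring_add(last_block, 1000, 8));	/* 1008 */
	printf("%llu\n", (unsigned long long)ring_add(last_block, 1020, 8));	/* 4: wrapped */
	return 0;
}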

@@ -2247,6 +2286,10 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
	    val > R5C_JOURNAL_MODE_WRITE_BACK)
		return -EINVAL;

	if (raid5_calc_degraded(conf) > 0 &&
	    val == R5C_JOURNAL_MODE_WRITE_BACK)
		return -EINVAL;

	mddev_suspend(mddev);
	conf->log->r5c_journal_mode = val;
	mddev_resume(mddev);
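
The new guard above refuses to switch a degraded array into write-back through the sysfs journal-mode attribute, matching the policy that r5c_update_on_rdev_error() (further down) enforces asynchronously when a disk fails. A compact user-space model of the store logic (hypothetical function, mirroring the two checks):

#include <errno.h>
#include <stdio.h>

enum { MODE_WRITE_THROUGH, MODE_WRITE_BACK };

/* Hypothetical model of r5c_journal_mode_store(): reject values outside the
 * known modes, and reject write-back while the array is degraded. */
static int journal_mode_store(int *current_mode, int val, int degraded)
{
	if (val < MODE_WRITE_THROUGH || val > MODE_WRITE_BACK)
		return -EINVAL;
	if (degraded > 0 && val == MODE_WRITE_BACK)
		return -EINVAL;
	*current_mode = val;	/* the kernel does this under mddev_suspend() */
	return 0;
}

int main(void)
{
	int mode = MODE_WRITE_THROUGH;

	printf("%d\n", journal_mode_store(&mode, MODE_WRITE_BACK, 0));	/* 0: ok */
	printf("%d\n", journal_mode_store(&mode, MODE_WRITE_BACK, 1));	/* -22: degraded */
	return 0;
}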
@@ -2301,6 +2344,16 @@ int r5c_try_caching_write(struct r5conf *conf,
		set_bit(STRIPE_R5C_CACHING, &sh->state);
	}

	/*
	 * When run in degraded mode, array is set to write-through mode.
	 * This check helps drain pending write safely in the transition to
	 * write-through mode.
	 */
	if (s->failed) {
		r5c_make_stripe_write_out(sh);
		return -EAGAIN;
	}

	for (i = disks; i--; ) {
		dev = &sh->dev[i];
		/* if non-overwrite, use writing-out phase */
@@ -2351,6 +2404,8 @@ void r5c_release_extra_page(struct stripe_head *sh)
			struct page *p = sh->dev[i].orig_page;

			sh->dev[i].orig_page = sh->dev[i].page;
			clear_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);

			if (!using_disk_info_extra_page)
				put_page(p);
		}
@@ -2555,6 +2610,19 @@ static int r5l_load_log(struct r5l_log *log)
	return ret;
}

void r5c_update_on_rdev_error(struct mddev *mddev)
{
	struct r5conf *conf = mddev->private;
	struct r5l_log *log = conf->log;

	if (!log)
		return;

	if (raid5_calc_degraded(conf) > 0 &&
	    conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
		schedule_work(&log->disable_writeback_work);
}

int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
	struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -2627,6 +2695,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
	spin_lock_init(&log->no_space_stripes_lock);

	INIT_WORK(&log->deferred_io_work, r5l_submit_io_async);
	INIT_WORK(&log->disable_writeback_work, r5c_disable_writeback_async);

	log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
	INIT_LIST_HEAD(&log->stripe_in_journal_list);
@@ -2659,6 +2728,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)

void r5l_exit_log(struct r5l_log *log)
{
	flush_work(&log->disable_writeback_work);
	md_unregister_thread(&log->reclaim_thread);
	mempool_destroy(log->meta_pool);
	bioset_free(log->bs);
drivers/md/raid5.c +94 −27
@@ -556,7 +556,7 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
 * of the two sections, and some non-in_sync devices may
 * be insync in the section most affected by failed devices.
 */
-static int calc_degraded(struct r5conf *conf)
+int raid5_calc_degraded(struct r5conf *conf)
{
	int degraded, degraded2;
	int i;
@@ -619,7 +619,7 @@ static int has_failed(struct r5conf *conf)
	if (conf->mddev->reshape_position == MaxSector)
		return conf->mddev->degraded > conf->max_degraded;

-	degraded = calc_degraded(conf);
+	degraded = raid5_calc_degraded(conf);
	if (degraded > conf->max_degraded)
		return 1;
	return 0;
@@ -1015,6 +1015,16 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)

			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
				WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));

			if (!op_is_write(op) &&
			    test_bit(R5_InJournal, &sh->dev[i].flags))
				/*
				 * issuing read for a page in journal, this
				 * must be preparing for prexor in rmw; read
				 * the data into orig_page
				 */
				sh->dev[i].vec.bv_page = sh->dev[i].orig_page;
			else
				sh->dev[i].vec.bv_page = sh->dev[i].page;
			bi->bi_vcnt = 1;
			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
@@ -2380,6 +2390,13 @@ static void raid5_end_read_request(struct bio * bi)
		} else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
			clear_bit(R5_ReadNoMerge, &sh->dev[i].flags);

		if (test_bit(R5_InJournal, &sh->dev[i].flags))
			/*
			 * end read for a page in journal, this
			 * must be preparing for prexor in rmw
			 */
			set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags);

		if (atomic_read(&rdev->read_errors))
			atomic_set(&rdev->read_errors, 0);
	} else {
@@ -2538,7 +2555,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)

	spin_lock_irqsave(&conf->device_lock, flags);
	clear_bit(In_sync, &rdev->flags);
-	mddev->degraded = calc_degraded(conf);
+	mddev->degraded = raid5_calc_degraded(conf);
	spin_unlock_irqrestore(&conf->device_lock, flags);
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);

@@ -2552,6 +2569,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
		bdevname(rdev->bdev, b),
		mdname(mddev),
		conf->raid_disks - mddev->degraded);
	r5c_update_on_rdev_error(mddev);
}

/*
@@ -2880,6 +2898,30 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
	return r_sector;
}

/*
 * There are cases where we want handle_stripe_dirtying() and
 * schedule_reconstruction() to delay towrite to some dev of a stripe.
 *
 * This function checks whether we want to delay the towrite. Specifically,
 * we delay the towrite when:
 *
 *   1. degraded stripe has a non-overwrite to the missing dev, AND this
 *      stripe has data in journal (for other devices).
 *
 *      In this case, when reading data for the non-overwrite dev, it is
 *      necessary to handle complex rmw of write back cache (prexor with
 *      orig_page, and xor with page). To keep read path simple, we would
 *      like to flush data in journal to RAID disks first, so complex rmw
 *      is handled in the write path (handle_stripe_dirtying).
 *
 */
static inline bool delay_towrite(struct r5dev *dev,
				   struct stripe_head_state *s)
{
	return !test_bit(R5_OVERWRITE, &dev->flags) &&
		!test_bit(R5_Insync, &dev->flags) && s->injournal;
}

static void
schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
			 int rcw, int expand)
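
delay_towrite() above is a pure predicate over device flags, so it is easy to model and test in isolation. A runnable sketch (flag bits reduced to booleans, made-up struct name): it fires only for a non-overwrite write aimed at a missing/out-of-sync device while the stripe still has journaled data:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical model of the r5dev flags delay_towrite() consults. */
struct r5dev_model {
	bool overwrite;	/* R5_OVERWRITE: the write covers the whole block */
	bool insync;	/* R5_Insync: the backing device is usable */
};

static bool delay_towrite(const struct r5dev_model *dev, int injournal)
{
	/* Delay only a non-overwrite to a missing/out-of-sync device while
	 * other devices of the stripe still have data in the journal. */
	return !dev->overwrite && !dev->insync && injournal > 0;
}

int main(void)
{
	struct r5dev_model missing_partial = { .overwrite = false, .insync = false };
	struct r5dev_model healthy_partial = { .overwrite = false, .insync = true };

	printf("%d\n", delay_towrite(&missing_partial, 2));	/* 1: delayed */
	printf("%d\n", delay_towrite(&healthy_partial, 2));	/* 0: proceeds */
	printf("%d\n", delay_towrite(&missing_partial, 0));	/* 0: nothing journaled */
	return 0;
}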
@@ -2900,7 +2942,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];

-			if (dev->towrite) {
+			if (dev->towrite && !delay_towrite(dev, s)) {
				set_bit(R5_LOCKED, &dev->flags);
				set_bit(R5_Wantdrain, &dev->flags);
				if (!expand)
@@ -3295,13 +3337,6 @@ static int want_replace(struct stripe_head *sh, int disk_idx)
	return rv;
}

-/* fetch_block - checks the given member device to see if its data needs
- * to be read or computed to satisfy a request.
- *
- * Returns 1 when no more member devices need to be checked, otherwise returns
- * 0 to tell the loop in handle_stripe_fill to continue
- */
-
static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
			   int disk_idx, int disks)
{
@@ -3392,6 +3427,12 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
	return 0;
}

+/* fetch_block - checks the given member device to see if its data needs
+ * to be read or computed to satisfy a request.
+ *
+ * Returns 1 when no more member devices need to be checked, otherwise returns
+ * 0 to tell the loop in handle_stripe_fill to continue
+ */
static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
		       int disk_idx, int disks)
{
@@ -3478,10 +3519,26 @@ static void handle_stripe_fill(struct stripe_head *sh,
	 * midst of changing due to a write
	 */
	if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
-	    !sh->reconstruct_state)
+	    !sh->reconstruct_state) {

		/*
		 * For degraded stripe with data in journal, do not handle
		 * read requests yet, instead, flush the stripe to raid
		 * disks first, this avoids handling complex rmw of write
		 * back cache (prexor with orig_page, and then xor with
		 * page) in the read path
		 */
		if (s->injournal && s->failed) {
			if (test_bit(STRIPE_R5C_CACHING, &sh->state))
				r5c_make_stripe_write_out(sh);
			goto out;
		}

		for (i = disks; i--; )
			if (fetch_block(sh, s, i, disks))
				break;
	}
out:
	set_bit(STRIPE_HANDLE, &sh->state);
}
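
The early-out added to handle_stripe_fill() above is the read-path half of the rmw shift: a stripe that is both degraded (s->failed) and has data in the journal (s->injournal) gets flushed to the raid disks instead of being read. A tiny decision model of that branch (hypothetical names, not kernel code):

#include <stdio.h>

/* Hypothetical model of the new early-out in handle_stripe_fill(). */
enum fill_action { FILL_READ_BLOCKS, FILL_WRITE_OUT_FIRST };

static enum fill_action fill_action(int failed, int injournal)
{
	if (injournal > 0 && failed > 0)
		return FILL_WRITE_OUT_FIRST;	/* avoid complex rmw in the read path */
	return FILL_READ_BLOCKS;
}

int main(void)
{
	printf("%d\n", fill_action(1, 2));	/* 1: write out first */
	printf("%d\n", fill_action(0, 2));	/* 0: normal reads */
	printf("%d\n", fill_action(1, 0));	/* 0: normal reads */
	return 0;
}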

@@ -3594,6 +3651,21 @@ static void handle_stripe_clean_event(struct r5conf *conf,
		break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
}

/*
 * For RMW in write back cache, we need extra page in prexor to store the
 * old data. This page is stored in dev->orig_page.
 *
 * This function checks whether we have data for prexor. The exact logic
 * is:
 *       R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE)
 */
static inline bool uptodate_for_rmw(struct r5dev *dev)
{
	return (test_bit(R5_UPTODATE, &dev->flags)) &&
		(!test_bit(R5_InJournal, &dev->flags) ||
		 test_bit(R5_OrigPageUPTDODATE, &dev->flags));
}

static int handle_stripe_dirtying(struct r5conf *conf,
				  struct stripe_head *sh,
				  struct stripe_head_state *s,
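
uptodate_for_rmw() above replaces the old open-coded test (which compared dev->page against dev->orig_page) with the explicit R5_OrigPageUPTDODATE flag: prexor may only use orig_page when it actually holds the latest on-disk data. A runnable truth-table model of the predicate (flags reduced to booleans):

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical model of uptodate_for_rmw() over the three flags involved. */
static bool uptodate_for_rmw(bool uptodate, bool in_journal, bool orig_uptodate)
{
	/* R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) */
	return uptodate && (!in_journal || orig_uptodate);
}

int main(void)
{
	/* Enumerate the truth table. */
	for (int u = 0; u <= 1; u++)
		for (int j = 0; j <= 1; j++)
			for (int o = 0; o <= 1; o++)
				printf("UPTODATE=%d InJournal=%d OrigPageUPTDODATE=%d -> %d\n",
				       u, j, o, uptodate_for_rmw(u, j, o));
	return 0;
}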
@@ -3622,12 +3694,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,
	} else for (i = disks; i--; ) {
		/* would I have to read this buffer for read_modify_write */
		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+		if (((dev->towrite && !delay_towrite(dev, s)) ||
+		     i == sh->pd_idx || i == sh->qd_idx ||
		     test_bit(R5_InJournal, &dev->flags)) &&
		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !((test_bit(R5_UPTODATE, &dev->flags) &&
-		       (!test_bit(R5_InJournal, &dev->flags) ||
-			dev->page != dev->orig_page)) ||
+		    !(uptodate_for_rmw(dev) ||
		      test_bit(R5_Wantcompute, &dev->flags))) {
			if (test_bit(R5_Insync, &dev->flags))
				rmw++;
@@ -3639,7 +3710,6 @@ static int handle_stripe_dirtying(struct r5conf *conf,
		    i != sh->pd_idx && i != sh->qd_idx &&
		    !test_bit(R5_LOCKED, &dev->flags) &&
		    !(test_bit(R5_UPTODATE, &dev->flags) ||
-		      test_bit(R5_InJournal, &dev->flags) ||
		      test_bit(R5_Wantcompute, &dev->flags))) {
			if (test_bit(R5_Insync, &dev->flags))
				rcw++;
@@ -3689,13 +3759,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,

		for (i = disks; i--; ) {
			struct r5dev *dev = &sh->dev[i];
-			if ((dev->towrite ||
+			if (((dev->towrite && !delay_towrite(dev, s)) ||
			     i == sh->pd_idx || i == sh->qd_idx ||
			     test_bit(R5_InJournal, &dev->flags)) &&
			    !test_bit(R5_LOCKED, &dev->flags) &&
-			    !((test_bit(R5_UPTODATE, &dev->flags) &&
-			       (!test_bit(R5_InJournal, &dev->flags) ||
-				dev->page != dev->orig_page)) ||
+			    !(uptodate_for_rmw(dev) ||
			      test_bit(R5_Wantcompute, &dev->flags)) &&
			    test_bit(R5_Insync, &dev->flags)) {
				if (test_bit(STRIPE_PREREAD_ACTIVE,
@@ -3722,7 +3790,6 @@ static int handle_stripe_dirtying(struct r5conf *conf,
			    i != sh->pd_idx && i != sh->qd_idx &&
			    !test_bit(R5_LOCKED, &dev->flags) &&
			    !(test_bit(R5_UPTODATE, &dev->flags) ||
-			      test_bit(R5_InJournal, &dev->flags) ||
			      test_bit(R5_Wantcompute, &dev->flags))) {
				rcw++;
				if (test_bit(R5_Insync, &dev->flags) &&
@@ -7025,7 +7092,7 @@ static int raid5_run(struct mddev *mddev)
	/*
	 * 0 for a fully functional array, 1 or 2 for a degraded array.
	 */
-	mddev->degraded = calc_degraded(conf);
+	mddev->degraded = raid5_calc_degraded(conf);

	if (has_failed(conf)) {
		pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
@@ -7272,7 +7339,7 @@ static int raid5_spare_active(struct mddev *mddev)
		}
	}
	spin_lock_irqsave(&conf->device_lock, flags);
-	mddev->degraded = calc_degraded(conf);
+	mddev->degraded = raid5_calc_degraded(conf);
	spin_unlock_irqrestore(&conf->device_lock, flags);
	print_raid5_conf(conf);
	return count;
@@ -7632,7 +7699,7 @@ static int raid5_start_reshape(struct mddev *mddev)
		 * pre and post number of devices.
		 */
		spin_lock_irqsave(&conf->device_lock, flags);
-		mddev->degraded = calc_degraded(conf);
+		mddev->degraded = raid5_calc_degraded(conf);
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}
	mddev->raid_disks = conf->raid_disks;
@@ -7720,7 +7787,7 @@ static void raid5_finish_reshape(struct mddev *mddev)
		} else {
			int d;
			spin_lock_irq(&conf->device_lock);
-			mddev->degraded = calc_degraded(conf);
+			mddev->degraded = raid5_calc_degraded(conf);
			spin_unlock_irq(&conf->device_lock);
			for (d = conf->raid_disks ;
			     d < conf->raid_disks - mddev->delta_disks;
drivers/md/raid5.h +7 −0
@@ -322,6 +322,11 @@ enum r5dev_flags {
			 * data and parity being written are in the journal
			 * device
			 */
	R5_OrigPageUPTDODATE,	/* with write back cache, we read old data into
				 * dev->orig_page for prexor. When this flag is
				 * set, orig_page contains latest data in the
				 * raid disk.
				 */
};

/*
@@ -753,6 +758,7 @@ extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
			int previous, int noblock, int noquiesce);
extern int raid5_calc_degraded(struct r5conf *conf);
extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
extern void r5l_exit_log(struct r5l_log *log);
extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
@@ -781,4 +787,5 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
extern void r5c_check_cached_full_stripe(struct r5conf *conf);
extern struct md_sysfs_entry r5c_journal_mode;
extern void r5c_update_on_rdev_error(struct mddev *mddev);
#endif