Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 7f6b9285 authored by NeilBrown's avatar NeilBrown Committed by Greg Kroah-Hartman
Browse files

Revert "MD: fix lock contention for flush bios"



commit 4bc034d35377196c854236133b07730a777c4aba upstream.

This reverts commit 5a409b4f.

This patch has two problems.

1/ it makes multiple calls to submit_bio() from inside a make_request_fn.
 The bios thus submitted will be queued on current->bio_list and not
 submitted immediately.  As the bios are allocated from a mempool,
 this can theoretically result in a deadlock - all the pool of requests
 could be in various ->bio_list queues and a subsequent mempool_alloc
 could block waiting for one of them to be released.

2/ It aims to handle a case when there are many concurrent flush requests.
  It handles this by submitting many requests in parallel - all of which
  are identical and so most of which do nothing useful.
  It would be more efficient to just send one lower-level request, but
  allow that to satisfy multiple upper-level requests.

Fixes: 5a409b4f ("MD: fix lock contention for flush bios")
Cc: <stable@vger.kernel.org> # v4.19+
Tested-by: Xiao Ni <xni@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 7928396d
Loading
Loading
Loading
Loading
+55 −108
Original line number Diff line number Diff line
@@ -132,24 +132,6 @@ static inline int speed_max(struct mddev *mddev)
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static void * flush_info_alloc(gfp_t gfp_flags, void *data)
{
        return kzalloc(sizeof(struct flush_info), gfp_flags);
}
static void flush_info_free(void *flush_info, void *data)
{
        kfree(flush_info);
}

static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
{
	return kzalloc(sizeof(struct flush_bio), gfp_flags);
}
static void flush_bio_free(void *flush_bio, void *data)
{
	kfree(flush_bio);
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
@@ -429,54 +411,30 @@ static int md_congested(void *data, int bits)
/*
 * Generic flush handling for md
 */
static void submit_flushes(struct work_struct *ws)
{
	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
	struct mddev *mddev = fi->mddev;
	struct bio *bio = fi->bio;

	bio->bi_opf &= ~REQ_PREFLUSH;
	md_handle_request(mddev, bio);

	mempool_free(fi, mddev->flush_pool);
}

static void md_end_flush(struct bio *fbio)
static void md_end_flush(struct bio *bio)
{
	struct flush_bio *fb = fbio->bi_private;
	struct md_rdev *rdev = fb->rdev;
	struct flush_info *fi = fb->fi;
	struct bio *bio = fi->bio;
	struct mddev *mddev = fi->mddev;
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&fi->flush_pending)) {
		if (bio->bi_iter.bi_size == 0) {
			/* an empty barrier - all done */
			bio_endio(bio);
			mempool_free(fi, mddev->flush_pool);
		} else {
			INIT_WORK(&fi->flush_work, submit_flushes);
			queue_work(md_wq, &fi->flush_work);
	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
	bio_put(bio);
}

	mempool_free(fb, mddev->flush_bio_pool);
	bio_put(fbio);
}
static void md_submit_flush_data(struct work_struct *ws);

void md_flush_request(struct mddev *mddev, struct bio *bio)
static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;
	struct flush_info *fi;

	fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);

	fi->bio = bio;
	fi->mddev = mddev;
	atomic_set(&fi->flush_pending, 1);

	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
@@ -486,39 +444,58 @@ void md_flush_request(struct mddev *mddev, struct bio *bio)
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			struct flush_bio *fb;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();

			fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
			fb->fi = fi;
			fb->rdev = rdev;

			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
			bio_set_dev(bi, rdev->bdev);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = fb;
			bi->bi_private = rdev;
			bio_set_dev(bi, rdev->bdev);
			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;

			atomic_inc(&fi->flush_pending);
			atomic_inc(&mddev->flush_pending);
			submit_bio(bi);

			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	/*
	 * must reset flush_bio before calling into md_handle_request to avoid a
	 * deadlock, because other bios passed md_handle_request suspend check
	 * could wait for this and below md_handle_request could wait for those
	 * bios because of suspend check
	 */
	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);

	if (atomic_dec_and_test(&fi->flush_pending)) {
	if (bio->bi_iter.bi_size == 0) {
		/* an empty barrier - all done */
		bio_endio(bio);
			mempool_free(fi, mddev->flush_pool);
	} else {
			INIT_WORK(&fi->flush_work, submit_flushes);
			queue_work(md_wq, &fi->flush_work);
		bio->bi_opf &= ~REQ_PREFLUSH;
		md_handle_request(mddev, bio);
	}
}

void md_flush_request(struct mddev *mddev, struct bio *bio)
{
	spin_lock_irq(&mddev->lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio,
			    mddev->lock);
	mddev->flush_bio = bio;
	spin_unlock_irq(&mddev->lock);

	INIT_WORK(&mddev->flush_work, submit_flushes);
	queue_work(md_wq, &mddev->flush_work);
}
EXPORT_SYMBOL(md_flush_request);

@@ -566,6 +543,7 @@ void mddev_init(struct mddev *mddev)
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
@@ -5519,22 +5497,6 @@ int md_run(struct mddev *mddev)
		if (err)
			return err;
	}
	if (mddev->flush_pool == NULL) {
		mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
						flush_info_free, mddev);
		if (!mddev->flush_pool) {
			err = -ENOMEM;
			goto abort;
		}
	}
	if (mddev->flush_bio_pool == NULL) {
		mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
						flush_bio_free, mddev);
		if (!mddev->flush_bio_pool) {
			err = -ENOMEM;
			goto abort;
		}
	}

	spin_lock(&pers_lock);
	pers = find_pers(mddev->level, mddev->clevel);
@@ -5694,15 +5656,8 @@ int md_run(struct mddev *mddev)
	return 0;

abort:
	if (mddev->flush_bio_pool) {
		mempool_destroy(mddev->flush_bio_pool);
		mddev->flush_bio_pool = NULL;
	}
	if (mddev->flush_pool){
		mempool_destroy(mddev->flush_pool);
		mddev->flush_pool = NULL;
	}

	bioset_exit(&mddev->bio_set);
	bioset_exit(&mddev->sync_set);
	return err;
}
EXPORT_SYMBOL_GPL(md_run);
@@ -5906,14 +5861,6 @@ static void __md_stop(struct mddev *mddev)
		mddev->to_remove = &md_redundancy_group;
	module_put(pers->owner);
	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
	if (mddev->flush_bio_pool) {
		mempool_destroy(mddev->flush_bio_pool);
		mddev->flush_bio_pool = NULL;
	}
	if (mddev->flush_pool) {
		mempool_destroy(mddev->flush_pool);
		mddev->flush_pool = NULL;
	}
}

void md_stop(struct mddev *mddev)
+7 −15
Original line number Diff line number Diff line
@@ -252,19 +252,6 @@ enum mddev_sb_flags {
	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
};

#define NR_FLUSH_INFOS 8
#define NR_FLUSH_BIOS 64
struct flush_info {
	struct bio			*bio;
	struct mddev			*mddev;
	struct work_struct		flush_work;
	atomic_t			flush_pending;
};
struct flush_bio {
	struct flush_info *fi;
	struct md_rdev *rdev;
};

struct mddev {
	void				*private;
	struct md_personality		*pers;
@@ -470,8 +457,13 @@ struct mddev {
						   * metadata and bitmap writes
						   */

	mempool_t			*flush_pool;
	mempool_t			*flush_bio_pool;
	/* Generic flush handling.
	 * The last to finish preflush schedules a worker to submit
	 * the rest of the request (without the REQ_PREFLUSH flag).
	 */
	struct bio *flush_bio;
	atomic_t flush_pending;
	struct work_struct flush_work;
	struct work_struct event_work;	/* used by dm to report failure event */
	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
	struct md_cluster_info		*cluster_info;