Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit c2a4f318 authored by Kent Overstreet's avatar Kent Overstreet Committed by Linus Torvalds
Browse files

bcache: Fix a writeback performance regression



Background writeback works by scanning the btree for dirty data and
adding those keys into a fixed size buffer, then for each dirty key in
the keybuf writing it to the backing device.

When read_dirty() finishes and it's time to scan for more dirty data, we
need to wait for the outstanding writeback IO to finish - they still
take up slots in the keybuf (so that foreground writes can check for
them to avoid races) - without that wait, we'll continually rescan when
we'll be able to add at most a key or two to the keybuf, and that takes
locks that starves foreground IO.  Doh.

Signed-off-by: default avatarKent Overstreet <kmo@daterainc.com>
Cc: linux-stable <stable@vger.kernel.org> # >= v3.10
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 61cbd250
Loading
Loading
Loading
Loading
+3 −4
Original line number Diff line number Diff line
@@ -498,7 +498,7 @@ struct cached_dev {
	 */
	atomic_t		has_dirty;

	struct ratelimit	writeback_rate;
	struct bch_ratelimit	writeback_rate;
	struct delayed_work	writeback_rate_update;

	/*
@@ -507,10 +507,9 @@ struct cached_dev {
	 */
	sector_t		last_read;

	/* Number of writeback bios in flight */
	atomic_t		in_flight;
	/* Limit number of writeback bios in flight */
	struct semaphore	in_flight;
	struct closure_with_timer writeback;
	struct closure_waitlist	writeback_wait;

	struct keybuf		writeback_keys;

+10 −1
Original line number Diff line number Diff line
@@ -190,7 +190,16 @@ void bch_time_stats_update(struct time_stats *stats, uint64_t start_time)
	stats->last = now ?: 1;
}

unsigned bch_next_delay(struct ratelimit *d, uint64_t done)
/**
 * bch_next_delay() - increment @d by the amount of work done, and return how
 * long to delay until the next time to do some work.
 *
 * @d - the struct bch_ratelimit to update
 * @done - the amount of work done, in arbitrary units
 *
 * Returns the amount of time to delay by, in jiffies
 */
uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
{
	uint64_t now = local_clock();

+9 −3
Original line number Diff line number Diff line
@@ -450,17 +450,23 @@ read_attribute(name ## _last_ ## frequency_units)
	(ewma) >> factor;						\
})

struct ratelimit {
struct bch_ratelimit {
	/* Next time we want to do some work, in nanoseconds */
	uint64_t		next;

	/*
	 * Rate at which we want to do work, in units per nanosecond
	 * The units here correspond to the units passed to bch_next_delay()
	 */
	unsigned		rate;
};

static inline void ratelimit_reset(struct ratelimit *d)
static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
{
	d->next = local_clock();
}

unsigned bch_next_delay(struct ratelimit *d, uint64_t done);
uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done);

#define __DIV_SAFE(n, d, zero)						\
({									\
+21 −22
Original line number Diff line number Diff line
@@ -94,11 +94,15 @@ static void update_writeback_rate(struct work_struct *work)

static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
	uint64_t ret;

	if (atomic_read(&dc->disk.detaching) ||
	    !dc->writeback_percent)
		return 0;

	return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
	ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);

	return min_t(uint64_t, ret, HZ);
}

/* Background writeback */
@@ -208,7 +212,7 @@ static void refill_dirty(struct closure *cl)

	up_write(&dc->writeback_lock);

	ratelimit_reset(&dc->writeback_rate);
	bch_ratelimit_reset(&dc->writeback_rate);

	/* Punt to workqueue only so we don't recurse and blow the stack */
	continue_at(cl, read_dirty, dirty_wq);
@@ -318,9 +322,7 @@ static void write_dirty_finish(struct closure *cl)
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	atomic_dec_bug(&dc->in_flight);

	closure_wake_up(&dc->writeback_wait);
	up(&dc->in_flight);

	closure_return_with_destructor(cl, dirty_io_destructor);
}
@@ -349,7 +351,7 @@ static void write_dirty(struct closure *cl)

	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty_finish, dirty_wq);
	continue_at(cl, write_dirty_finish, system_wq);
}

static void read_dirty_endio(struct bio *bio, int error)
@@ -369,7 +371,7 @@ static void read_dirty_submit(struct closure *cl)

	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty, dirty_wq);
	continue_at(cl, write_dirty, system_wq);
}

static void read_dirty(struct closure *cl)
@@ -394,12 +396,9 @@ static void read_dirty(struct closure *cl)

		if (delay > 0 &&
		    (KEY_START(&w->key) != dc->last_read ||
		     jiffies_to_msecs(delay) > 50)) {
			w->private = NULL;

			closure_delay(&dc->writeback, delay);
			continue_at(cl, read_dirty, dirty_wq);
		}
		     jiffies_to_msecs(delay) > 50))
			while (delay)
				delay = schedule_timeout(delay);

		dc->last_read	= KEY_OFFSET(&w->key);

@@ -424,15 +423,10 @@ static void read_dirty(struct closure *cl)

		trace_bcache_writeback(&w->key);

		closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);
		down(&dc->in_flight);
		closure_call(&io->cl, read_dirty_submit, NULL, cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));

		atomic_inc(&dc->in_flight);

		if (!closure_wait_event(&dc->writeback_wait, cl,
					atomic_read(&dc->in_flight) < 64))
			continue_at(cl, read_dirty, dirty_wq);
	}

	if (0) {
@@ -442,7 +436,11 @@ static void read_dirty(struct closure *cl)
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	refill_dirty(cl);
	/*
	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
	 * freed) before refilling again
	 */
	continue_at(cl, refill_dirty, dirty_wq);
}

/* Init */
@@ -484,6 +482,7 @@ void bch_sectors_dirty_init(struct cached_dev *dc)

void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	sema_init(&dc->in_flight, 64);
	closure_init_unlocked(&dc->writeback);
	init_rwsem(&dc->writeback_lock);

@@ -513,7 +512,7 @@ void bch_writeback_exit(void)

int __init bch_writeback_init(void)
{
	dirty_wq = create_singlethread_workqueue("bcache_writeback");
	dirty_wq = create_workqueue("bcache_writeback");
	if (!dirty_wq)
		return -ENOMEM;