
Commit d0e3d023 authored by Jens Axboe

Merge branch 'bcache-for-3.11' of git://evilpiepirate.org/~kent/linux-bcache into for-3.11/drivers

parents 5f0e5afa 8e51e414
Documentation/bcache.txt +29 −18
@@ -46,29 +46,33 @@ you format your backing devices and cache device at the same time, you won't
 have to manually attach:
   make-bcache -B /dev/sda /dev/sdb -C /dev/sdc

-To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register:
+bcache-tools now ships udev rules, and bcache devices are known to the kernel
+immediately.  Without udev, you can manually register devices like this:

   echo /dev/sdb > /sys/fs/bcache/register
   echo /dev/sdc > /sys/fs/bcache/register

-To register your bcache devices automatically, you could add something like
-this to an init script:
+Registering the backing device makes the bcache device show up in /dev; you can
+now format it and use it as normal. But the first time using a new bcache
+device, it'll be running in passthrough mode until you attach it to a cache.
+See the section on attaching.

-  echo /dev/sd* > /sys/fs/bcache/register_quiet
+The devices show up as:

-It'll look for bcache superblocks and ignore everything that doesn't have one.
+  /dev/bcache<N>

-Registering the backing device makes the bcache show up in /dev; you can now
-format it and use it as normal. But the first time using a new bcache device,
-it'll be running in passthrough mode until you attach it to a cache. See the
-section on attaching.
+As well as (with udev):

-The devices show up at /dev/bcacheN, and can be controlled via sysfs from
-/sys/block/bcacheN/bcache:
+  /dev/bcache/by-uuid/<uuid>
+  /dev/bcache/by-label/<label>

+To get started:
+
   mkfs.ext4 /dev/bcache0
   mount /dev/bcache0 /mnt

+You can control bcache devices through sysfs at /sys/block/bcache<N>/bcache .
+
 Cache devices are managed as sets; multiple caches per set isn't supported yet
 but will allow for mirroring of metadata and dirty data in the future. Your new
 cache set shows up as /sys/fs/bcache/<UUID>
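
To make the flow above concrete, a hedged end-to-end sketch (device names
/dev/sdb and /dev/sdc are assumed, matching the examples above; the register
steps are unnecessary once the udev rules are installed):

  make-bcache -B /dev/sdb -C /dev/sdc
  echo /dev/sdb > /sys/fs/bcache/register
  echo /dev/sdc > /sys/fs/bcache/register
  ls /sys/fs/bcache            # the new cache set shows up as a <UUID> directory
  mkfs.ext4 /dev/bcache0
  mount /dev/bcache0 /mnt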
@@ -80,11 +84,11 @@ must be attached to your cache set to enable caching. Attaching a backing
 device to a cache set is done thusly, with the UUID of the cache set in
 /sys/fs/bcache:

-  echo <UUID> > /sys/block/bcache0/bcache/attach
+  echo <CSET-UUID> > /sys/block/bcache0/bcache/attach

 This only has to be done once. The next time you reboot, just reregister all
 your bcache devices. If a backing device has data in a cache somewhere, the
-/dev/bcache# device won't be created until the cache shows up - particularly
+/dev/bcache<N> device won't be created until the cache shows up - particularly
 important if you have writeback caching turned on.

 If you're booting up and your cache device is gone and never coming back, you
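
For that recovery path, a hedged sketch (assuming the backing device is sdb,
and using the running attribute described under SYSFS - BACKING DEVICE below):

  echo 1 > /sys/block/sdb/bcache/running

This forces the bcache device to come up without its cache; note that any
dirty data that existed only in the lost cache is gone.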
@@ -181,7 +185,7 @@ want for getting the best possible numbers when benchmarking.

    In practice this isn't an issue because as soon as a write comes along it'll
    cause the btree node to be split, and you need almost no write traffic for
-   this to not show up enough to be noticable (especially since bcache's btree
+   this to not show up enough to be noticeable (especially since bcache's btree
    nodes are huge and index large regions of the device). But when you're
    benchmarking, if you're trying to warm the cache by reading a bunch of data
    and there's no other traffic - that can be a problem.
@@ -191,6 +195,9 @@ want for getting the best possible numbers when benchmarking.

 SYSFS - BACKING DEVICE:

+Available at /sys/block/<bdev>/bcache, /sys/block/bcache*/bcache and
+(if attached) /sys/fs/bcache/<cset-uuid>/bdev*
+
 attach
   Echo the UUID of a cache set to this file to enable caching.

@@ -222,7 +229,7 @@ running
   it's in passthrough mode or caching).

 sequential_cutoff
-  A sequential IO will bypass the cache once it passes this threshhold; the
+  A sequential IO will bypass the cache once it passes this threshold; the
   most recent 128 IOs are tracked so sequential IO can be detected even when
   it isn't all done at once.
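
A hedged usage sketch: the cutoff takes a size, and the sysfs code accepts
human-readable suffixes, so making sequential IO beyond 4 MB bypass the cache
(4M is only an example value) would look like:

  echo 4M > /sys/block/bcache0/bcache/sequential_cutoff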

@@ -296,10 +303,12 @@ cache_miss_collisions
   since the synchronization for cache misses was rewritten)

 cache_readaheads
-  Count of times readahead occured.
+  Count of times readahead occurred.

 SYSFS - CACHE SET:

+Available at /sys/fs/bcache/<cset-uuid>
+
 average_key_size
   Average data per key in the btree.

@@ -362,7 +371,7 @@ unregister
 SYSFS - CACHE SET INTERNAL:

 This directory also exposes timings for a number of internal operations, with
-separate files for average duration, average frequency, last occurence and max
+separate files for average duration, average frequency, last occurrence and max
 duration: garbage collection, btree read, btree node sorts and btree splits.

 active_journal_entries
@@ -390,6 +399,8 @@ trigger_gc

 SYSFS - CACHE DEVICE:

+Available at /sys/block/<cdev>/bcache
+
 block_size
   Minimum granularity of writes - should match hardware sector size.

@@ -417,7 +428,7 @@ freelist_percent
   space.

 io_errors
-  Number of errors that have occured, decayed by io_error_halflife.
+  Number of errors that have occurred, decayed by io_error_halflife.

 metadata_written
   Sum of all non data writes (btree writes and all other metadata).
MAINTAINERS +1 −1
@@ -1621,7 +1621,7 @@ S: Maintained
 F:	drivers/net/hamradio/baycom*

 BCACHE (BLOCK LAYER CACHE)
-M:	Kent Overstreet <koverstreet@google.com>
+M:	Kent Overstreet <kmo@daterainc.com>
 L:	linux-bcache@vger.kernel.org
 W:	http://bcache.evilpiepirate.org
 S:	Maintained
drivers/md/bcache/alloc.c +25 −19
@@ -63,7 +63,9 @@
#include "bcache.h"
#include "btree.h"

#include <linux/kthread.h>
#include <linux/random.h>
#include <trace/events/bcache.h>

#define MAX_IN_FLIGHT_DISCARDS		8U

@@ -151,7 +153,7 @@ static void discard_finish(struct work_struct *w)
 	mutex_unlock(&ca->set->bucket_lock);

 	closure_wake_up(&ca->set->bucket_wait);
-	wake_up(&ca->set->alloc_wait);
+	wake_up_process(ca->alloc_thread);

 	closure_put(&ca->set->cl);
 }
@@ -350,38 +352,31 @@ static void invalidate_buckets(struct cache *ca)
 		break;
 	}

-	pr_debug("free %zu/%zu free_inc %zu/%zu unused %zu/%zu",
-		 fifo_used(&ca->free), ca->free.size,
-		 fifo_used(&ca->free_inc), ca->free_inc.size,
-		 fifo_used(&ca->unused), ca->unused.size);
+	trace_bcache_alloc_invalidate(ca);
 }

 #define allocator_wait(ca, cond)					\
 do {									\
-	DEFINE_WAIT(__wait);						\
-									\
 	while (1) {							\
-		prepare_to_wait(&ca->set->alloc_wait,			\
-				&__wait, TASK_INTERRUPTIBLE);		\
+		set_current_state(TASK_INTERRUPTIBLE);			\
 		if (cond)						\
 			break;						\
 									\
 		mutex_unlock(&(ca)->set->bucket_lock);			\
 		if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) {	\
-			finish_wait(&ca->set->alloc_wait, &__wait);	\
-			closure_return(cl);				\
+			closure_put(&ca->set->cl);			\
+			return 0;					\
 		}							\
 									\
 		schedule();						\
 		mutex_lock(&(ca)->set->bucket_lock);			\
 	}								\
-									\
-	finish_wait(&ca->set->alloc_wait, &__wait);			\
+	__set_current_state(TASK_RUNNING);				\
 } while (0)

-void bch_allocator_thread(struct closure *cl)
+static int bch_allocator_thread(void *arg)
 {
-	struct cache *ca = container_of(cl, struct cache, alloc);
+	struct cache *ca = arg;

 	mutex_lock(&ca->set->bucket_lock);

@@ -442,7 +437,7 @@ long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl)
 {
 	long r = -1;
 again:
-	wake_up(&ca->set->alloc_wait);
+	wake_up_process(ca->alloc_thread);

 	if (fifo_used(&ca->free) > ca->watermark[watermark] &&
 	    fifo_pop(&ca->free, r)) {
@@ -476,9 +471,7 @@ again:
 		return r;
 	}

-	pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu",
-		 atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free),
-		 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
+	trace_bcache_alloc_fail(ca);

 	if (cl) {
 		closure_wait(&ca->set->bucket_wait, cl);
@@ -552,6 +545,19 @@ int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,

 /* Init */

+int bch_cache_allocator_start(struct cache *ca)
+{
+	ca->alloc_thread = kthread_create(bch_allocator_thread,
+					  ca, "bcache_allocator");
+	if (IS_ERR(ca->alloc_thread))
+		return PTR_ERR(ca->alloc_thread);
+
+	closure_get(&ca->set->cl);
+	wake_up_process(ca->alloc_thread);
+
+	return 0;
+}
+
 void bch_cache_allocator_exit(struct cache *ca)
 {
 	struct discard *d;
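
For readers unfamiliar with the pattern the allocator now uses: kthread_create()
makes the thread, wake_up_process() starts it (and later replaces the old
waitqueue wakeups), and the thread parks itself with set_current_state() plus
schedule(). A minimal self-contained sketch of the same idiom (a hypothetical
demo module, not bcache code; should_do_work is an assumed placeholder
condition):

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched.h>

static struct task_struct *worker;
static bool should_do_work;	/* assumed placeholder condition */

static int worker_fn(void *arg)
{
	while (!kthread_should_stop()) {
		/*
		 * Same idiom as allocator_wait(): declare ourselves
		 * TASK_INTERRUPTIBLE *before* testing the condition, so a
		 * concurrent wake_up_process() can't be lost between the
		 * test and the schedule().
		 */
		set_current_state(TASK_INTERRUPTIBLE);
		if (!should_do_work && !kthread_should_stop()) {
			schedule();
			continue;
		}
		__set_current_state(TASK_RUNNING);

		should_do_work = false;
		/* ... do one unit of work here ... */
	}
	return 0;
}

static int __init demo_init(void)
{
	/* kthread_create() + wake_up_process(), as in bch_cache_allocator_start() */
	worker = kthread_create(worker_fn, NULL, "demo_worker");
	if (IS_ERR(worker))
		return PTR_ERR(worker);
	wake_up_process(worker);
	return 0;
}

static void __exit demo_exit(void)
{
	kthread_stop(worker);	/* wakes the thread and waits for it to exit */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Producers then call wake_up_process() after publishing work, which is exactly
what bch_bucket_alloc() does with ca->alloc_thread in the hunk above.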
drivers/md/bcache/bcache.h +19 −37
@@ -178,7 +178,6 @@
 #define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__

 #include <linux/bio.h>
-#include <linux/blktrace_api.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
@@ -388,8 +387,6 @@ struct keybuf_key {
 typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);

 struct keybuf {
-	keybuf_pred_fn		*key_predicate;
-
 	struct bkey		last_scanned;
 	spinlock_t		lock;

@@ -438,8 +435,10 @@ struct bcache_device {
 	/* If nonzero, we're detaching/unregistering from cache set */
 	atomic_t		detaching;

-	atomic_long_t		sectors_dirty;
-	unsigned long		sectors_dirty_gc;
+	uint64_t		nr_stripes;
+	unsigned		stripe_size_bits;
+	atomic_t		*stripe_sectors_dirty;
+
 	unsigned long		sectors_dirty_last;
 	long			sectors_dirty_derivative;

@@ -531,6 +530,7 @@ struct cached_dev {
 	unsigned		sequential_merge:1;
 	unsigned		verify:1;

+	unsigned		partial_stripes_expensive:1;
 	unsigned		writeback_metadata:1;
 	unsigned		writeback_running:1;
 	unsigned char		writeback_percent;
@@ -565,8 +565,7 @@ struct cache {

 	unsigned		watermark[WATERMARK_MAX];

-	struct closure		alloc;
-	struct workqueue_struct	*alloc_workqueue;
+	struct task_struct	*alloc_thread;

 	struct closure		prio;
 	struct prio_set		*disk_buckets;
@@ -703,9 +702,6 @@ struct cache_set {
 	/* For the btree cache */
 	struct shrinker		shrink;

-	/* For the allocator itself */
-	wait_queue_head_t	alloc_wait;
-
 	/* For the btree cache and anything allocation related */
 	struct mutex		bucket_lock;

@@ -823,10 +819,9 @@ struct cache_set {

 	/*
 	 * A btree node on disk could have too many bsets for an iterator to fit
-	 * on the stack - this is a single element mempool for btree_read_work()
+	 * on the stack - have to dynamically allocate them
 	 */
-	struct mutex		fill_lock;
-	struct btree_iter	*fill_iter;
+	mempool_t		*fill_iter;

 	/*
 	 * btree_sort() is a merge sort and requires temporary space - single
@@ -834,6 +829,7 @@ struct cache_set {
 	 */
 	struct mutex		sort_lock;
 	struct bset		*sort;
+	unsigned		sort_crit_factor;

 	/* List of buckets we're currently writing data to */
 	struct list_head	data_buckets;
@@ -906,8 +902,6 @@ static inline unsigned local_clock_us(void)
 	return local_clock() >> 10;
 }

-#define MAX_BSETS		4U
-
 #define BTREE_PRIO		USHRT_MAX
 #define INITIAL_PRIO		32768

@@ -1112,23 +1106,6 @@ static inline void __bkey_put(struct cache_set *c, struct bkey *k)
 		atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
 }

-/* Blktrace macros */
-
-#define blktrace_msg(c, fmt, ...)					\
-do {									\
-	struct request_queue *q = bdev_get_queue(c->bdev);		\
-	if (q)								\
-		blk_add_trace_msg(q, fmt, ##__VA_ARGS__);		\
-} while (0)
-
-#define blktrace_msg_all(s, fmt, ...)					\
-do {									\
-	struct cache *_c;						\
-	unsigned i;							\
-	for_each_cache(_c, (s), i)					\
-		blktrace_msg(_c, fmt, ##__VA_ARGS__);			\
-} while (0)
-
 static inline void cached_dev_put(struct cached_dev *dc)
 {
 	if (atomic_dec_and_test(&dc->count))
@@ -1173,10 +1150,16 @@ static inline uint8_t bucket_disk_gen(struct bucket *b)
 	static struct kobj_attribute ksysfs_##n =			\
 		__ATTR(n, S_IWUSR|S_IRUSR, show, store)

-/* Forward declarations */
+static inline void wake_up_allocators(struct cache_set *c)
+{
+	struct cache *ca;
+	unsigned i;
+
+	for_each_cache(ca, c, i)
+		wake_up_process(ca->alloc_thread);
+}

-void bch_writeback_queue(struct cached_dev *);
-void bch_writeback_add(struct cached_dev *, unsigned);
+/* Forward declarations */

 void bch_count_io_errors(struct cache *, int, const char *);
 void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
@@ -1193,7 +1176,6 @@ void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned);
 uint8_t bch_inc_gen(struct cache *, struct bucket *);
 void bch_rescale_priorities(struct cache_set *, int);
 bool bch_bucket_add_unused(struct cache *, struct bucket *);
-void bch_allocator_thread(struct closure *);

 long bch_bucket_alloc(struct cache *, unsigned, struct closure *);
 void bch_bucket_free(struct cache_set *, struct bkey *);
@@ -1241,9 +1223,9 @@ void bch_cache_set_stop(struct cache_set *);
 struct cache_set *bch_cache_set_alloc(struct cache_sb *);
 void bch_btree_cache_free(struct cache_set *);
 int bch_btree_cache_alloc(struct cache_set *);
-void bch_cached_dev_writeback_init(struct cached_dev *);
 void bch_moving_init_cache_set(struct cache_set *);

+int bch_cache_allocator_start(struct cache *ca);
 void bch_cache_allocator_exit(struct cache *ca);
 int bch_cache_allocator_init(struct cache *ca);

drivers/md/bcache/bset.c +35 −21
@@ -78,6 +78,7 @@ struct bkey *bch_keylist_pop(struct keylist *l)
 bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
 {
 	unsigned i;
+	char buf[80];

 	if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
 		goto bad;
@@ -102,7 +103,8 @@ bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)

 	return false;
 bad:
-	cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k));
+	bch_bkey_to_text(buf, sizeof(buf), k);
+	cache_bug(c, "spotted bad key %s: %s", buf, bch_ptr_status(c, k));
 	return true;
 }

@@ -162,10 +164,16 @@ bool bch_ptr_bad(struct btree *b, const struct bkey *k)
 #ifdef CONFIG_BCACHE_EDEBUG
 bug:
 	mutex_unlock(&b->c->bucket_lock);
+
+	{
+		char buf[80];
+
+		bch_bkey_to_text(buf, sizeof(buf), k);
 		btree_bug(b,
 "inconsistent pointer %s: bucket %zu pin %i prio %i gen %i last_gc %i mark %llu gc_gen %i",
-		  pkey(k), PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
+			  buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
 			  g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen);
+	}
 	return true;
 #endif
 }
@@ -1084,33 +1092,39 @@ void bch_btree_sort_into(struct btree *b, struct btree *new)
 	new->sets->size = 0;
 }

+#define SORT_CRIT	(4096 / sizeof(uint64_t))
+
 void bch_btree_sort_lazy(struct btree *b)
 {
-	if (b->nsets) {
-		unsigned i, j, keys = 0, total;
-
-		for (i = 0; i <= b->nsets; i++)
-			keys += b->sets[i].data->keys;
+	unsigned crit = SORT_CRIT;
+	int i;

-		total = keys;
+	/* Don't sort if nothing to do */
+	if (!b->nsets)
+		goto out;

-		for (j = 0; j < b->nsets; j++) {
-			if (keys * 2 < total ||
-			    keys < 1000) {
-				bch_btree_sort_partial(b, j);
+	/* If not a leaf node, always sort */
+	if (b->level) {
+		bch_btree_sort(b);
 		return;
 	}

-			keys -= b->sets[j].data->keys;
+	for (i = b->nsets - 1; i >= 0; --i) {
+		crit *= b->c->sort_crit_factor;
+
+		if (b->sets[i].data->keys < crit) {
+			bch_btree_sort_partial(b, i);
+			return;
+		}
+	}

-		/* Must sort if b->nsets == 3 or we'll overflow */
-		if (b->nsets >= (MAX_BSETS - 1) - b->level) {
+	/* Sort if we'd overflow */
+	if (b->nsets + 1 == MAX_BSETS) {
 		bch_btree_sort(b);
 		return;
 	}
-	}

+out:
 	bset_build_written_tree(b);
 }
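
To see what the new lazy-sort heuristic computes, a small userspace sketch
(the factor of 3 is an assumed stand-in for sort_crit_factor, whose real value
the kernel derives elsewhere). Walking from the newest bset to the oldest,
each older set gets a factor-times larger key budget; the first set that falls
under its budget triggers a partial resort of it and everything newer:

#include <stdio.h>
#include <stdint.h>

#define SORT_CRIT	(4096 / sizeof(uint64_t))	/* 512 keys */

int main(void)
{
	unsigned crit = SORT_CRIT;
	const unsigned factor = 3;	/* assumed sort_crit_factor */
	int nsets = 3, i;

	/* Mirrors the loop in the new bch_btree_sort_lazy() above. */
	for (i = nsets - 1; i >= 0; --i) {
		crit *= factor;
		printf("set %d: partial resort if it holds < %u keys\n",
		       i, crit);
	}
	return 0;
}

With these numbers the newest checked set is resorted when it holds fewer than
1536 keys, the next under 4608, and the oldest under 13824 - cheap merges of
small, fresh bsets happen eagerly, while a large old set is left alone until
the MAX_BSETS overflow check forces a full sort.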
