Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 53b381b3 authored by David Woodhouse, committed by Chris Mason
Browse files

Btrfs: RAID5 and RAID6



This builds on David Woodhouse's original Btrfs raid5/6 implementation.
The code has changed quite a bit, blame Chris Mason for any bugs.

Read/modify/write is done after the higher levels of the filesystem have
prepared a given bio.  This means the higher layers are not responsible
for building full stripes, and they don't need to query for the topology
of the extents that may get allocated during delayed allocation runs.
It also means different files can easily share the same stripe.

But, it does expose us to incorrect parity if we crash or lose power
while doing a read/modify/write cycle.  This will be addressed in a
later commit.

Scrub is unable to repair crc errors on raid5/6 chunks.

Discard does not work on raid5/6 (yet)

The stripe size is fixed at 64KiB per disk.  This will be tunable
in a later commit.

Signed-off-by: Chris Mason <chris.mason@fusionio.com>
parent 64a16701
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -6,6 +6,8 @@ config BTRFS_FS
	select ZLIB_DEFLATE
	select LZO_COMPRESS
	select LZO_DECOMPRESS
	select RAID6_PQ

	help
	  Btrfs is a new filesystem with extents, writable snapshotting,
	  support for multiple devices and many more features.
+1 −1
Original line number Diff line number Diff line
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o

btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
+34 −1
Original line number Diff line number Diff line
@@ -502,6 +502,7 @@ struct btrfs_super_block {
#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)

#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
#define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)

#define BTRFS_FEATURE_COMPAT_SUPP		0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
@@ -511,6 +512,7 @@ struct btrfs_super_block {
	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)

/*
@@ -952,8 +954,10 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
#define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
#define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
/* 64-bit constants like the other profile bits: block group flags are u64 */
#define BTRFS_BLOCK_GROUP_RAID5		(1ULL << 7)
#define BTRFS_BLOCK_GROUP_RAID6		(1ULL << 8)
#define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
#define BTRFS_NR_RAID_TYPES		5
#define BTRFS_NR_RAID_TYPES		7

#define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \
					 BTRFS_BLOCK_GROUP_SYSTEM |  \
@@ -961,6 +965,8 @@ struct btrfs_dev_replace_item {

#define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \
					 BTRFS_BLOCK_GROUP_RAID1 |   \
					 BTRFS_BLOCK_GROUP_RAID5 |   \
					 BTRFS_BLOCK_GROUP_RAID6 |   \
					 BTRFS_BLOCK_GROUP_DUP |     \
					 BTRFS_BLOCK_GROUP_RAID10)
/*
@@ -1185,6 +1191,10 @@ struct btrfs_block_group_cache {
	u64 flags;
	u64 sectorsize;
	u64 cache_generation;

	/* for raid56, this is a full stripe, without parity */
	unsigned long full_stripe_len;

	unsigned int ro:1;
	unsigned int dirty:1;
	unsigned int iref:1;
@@ -1225,6 +1235,20 @@ struct seq_list {
	u64 seq;
};

/*
 * One entry of the stripe hash table; used by the raid56 code to lock
 * stripes for read/modify/write.
 */
struct btrfs_stripe_hash {
	/* list head for this entry (presumably the stripes hashing here; confirm in raid56.c) */
	struct list_head hash_list;
	/* wait queue for this entry (presumably for tasks waiting on a locked stripe; confirm in raid56.c) */
	wait_queue_head_t wait;
	/* spinlock for this entry (presumably guards hash_list; confirm in raid56.c) */
	spinlock_t lock;
};

/*
 * Table of btrfs_stripe_hash entries, used by the raid56 code to lock
 * stripes for read/modify/write.  fs_info keeps a pointer to one of
 * these in its stripe_hash_table member.
 */
struct btrfs_stripe_hash_table {
	/*
	 * NOTE(review): presumably an array of
	 * (1 << BTRFS_STRIPE_HASH_TABLE_BITS) entries -- confirm against
	 * btrfs_alloc_stripe_hash_table().
	 */
	struct btrfs_stripe_hash *table;
};

#define BTRFS_STRIPE_HASH_TABLE_BITS 11

/* fs_info */
struct reloc_control;
struct btrfs_device;
@@ -1307,6 +1331,13 @@ struct btrfs_fs_info {
	struct mutex cleaner_mutex;
	struct mutex chunk_mutex;
	struct mutex volume_mutex;

	/* this is used during read/modify/write to make sure
	 * no two ios are trying to mod the same stripe at the same
	 * time
	 */
	struct btrfs_stripe_hash_table *stripe_hash_table;

	/*
	 * this protects the ordered operations list only while we are
	 * processing all of the entries on it.  This way we make
@@ -1395,6 +1426,8 @@ struct btrfs_fs_info {
	struct btrfs_workers flush_workers;
	struct btrfs_workers endio_workers;
	struct btrfs_workers endio_meta_workers;
	struct btrfs_workers endio_raid56_workers;
	struct btrfs_workers rmw_workers;
	struct btrfs_workers endio_meta_write_workers;
	struct btrfs_workers endio_write_workers;
	struct btrfs_workers endio_freespace_worker;
+53 −9
Original line number Diff line number Diff line
@@ -46,6 +46,7 @@
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"

#ifdef CONFIG_X86
#include <asm/cpufeature.h>
@@ -639,8 +640,15 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
		btree_readahead_hook(root, eb, eb->start, ret);
	}

	if (ret)
	if (ret) {
		/*
		 * our io error hook is going to dec the io pages
		 * again, we have to make sure it has something
		 * to decrement
		 */
		atomic_inc(&eb->io_pages);
		clear_extent_buffer_uptodate(eb);
	}
	free_extent_buffer(eb);
out:
	return ret;
@@ -654,6 +662,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
	eb = (struct extent_buffer *)page->private;
	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
	eb->read_mirror = failed_mirror;
	atomic_dec(&eb->io_pages);
	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
		btree_readahead_hook(root, eb, eb->start, -EIO);
	return -EIO;	/* we fixed nothing */
@@ -670,17 +679,23 @@ static void end_workqueue_bio(struct bio *bio, int err)
	end_io_wq->work.flags = 0;

	if (bio->bi_rw & REQ_WRITE) {
		if (end_io_wq->metadata == 1)
		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
					   &end_io_wq->work);
		else if (end_io_wq->metadata == 2)
		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
			btrfs_queue_worker(&fs_info->endio_freespace_worker,
					   &end_io_wq->work);
		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
			btrfs_queue_worker(&fs_info->endio_raid56_workers,
					   &end_io_wq->work);
		else
			btrfs_queue_worker(&fs_info->endio_write_workers,
					   &end_io_wq->work);
	} else {
		if (end_io_wq->metadata)
		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
			btrfs_queue_worker(&fs_info->endio_raid56_workers,
					   &end_io_wq->work);
		else if (end_io_wq->metadata)
			btrfs_queue_worker(&fs_info->endio_meta_workers,
					   &end_io_wq->work);
		else
@@ -695,6 +710,7 @@ static void end_workqueue_bio(struct bio *bio, int err)
 * 0 - if data
 * 1 - if normal metadata
 * 2 - if writing to the free space cache area
 * 3 - raid parity work
 */
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
			int metadata)
@@ -2165,6 +2181,12 @@ int open_ctree(struct super_block *sb,
	init_waitqueue_head(&fs_info->transaction_blocked_wait);
	init_waitqueue_head(&fs_info->async_submit_wait);

	ret = btrfs_alloc_stripe_hash_table(fs_info);
	if (ret) {
		err = -ENOMEM;
		goto fail_alloc;
	}

	__setup_root(4096, 4096, 4096, 4096, tree_root,
		     fs_info, BTRFS_ROOT_TREE_OBJECTID);

@@ -2332,6 +2354,12 @@ int open_ctree(struct super_block *sb,
	btrfs_init_workers(&fs_info->endio_meta_write_workers,
			   "endio-meta-write", fs_info->thread_pool_size,
			   &fs_info->generic_worker);
	btrfs_init_workers(&fs_info->endio_raid56_workers,
			   "endio-raid56", fs_info->thread_pool_size,
			   &fs_info->generic_worker);
	btrfs_init_workers(&fs_info->rmw_workers,
			   "rmw", fs_info->thread_pool_size,
			   &fs_info->generic_worker);
	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
			   fs_info->thread_pool_size,
			   &fs_info->generic_worker);
@@ -2350,6 +2378,8 @@ int open_ctree(struct super_block *sb,
	 */
	fs_info->endio_workers.idle_thresh = 4;
	fs_info->endio_meta_workers.idle_thresh = 4;
	fs_info->endio_raid56_workers.idle_thresh = 4;
	fs_info->rmw_workers.idle_thresh = 2;

	fs_info->endio_write_workers.idle_thresh = 2;
	fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2366,6 +2396,8 @@ int open_ctree(struct super_block *sb,
	ret |= btrfs_start_workers(&fs_info->fixup_workers);
	ret |= btrfs_start_workers(&fs_info->endio_workers);
	ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
	ret |= btrfs_start_workers(&fs_info->rmw_workers);
	ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
	ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
	ret |= btrfs_start_workers(&fs_info->endio_write_workers);
	ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
@@ -2710,6 +2742,8 @@ int open_ctree(struct super_block *sb,
	btrfs_stop_workers(&fs_info->workers);
	btrfs_stop_workers(&fs_info->endio_workers);
	btrfs_stop_workers(&fs_info->endio_meta_workers);
	btrfs_stop_workers(&fs_info->endio_raid56_workers);
	btrfs_stop_workers(&fs_info->rmw_workers);
	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
	btrfs_stop_workers(&fs_info->endio_write_workers);
	btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -2728,6 +2762,7 @@ int open_ctree(struct super_block *sb,
fail_srcu:
	cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
	btrfs_free_stripe_hash_table(fs_info);
	btrfs_close_devices(fs_info->fs_devices);
	return err;

@@ -3076,11 +3111,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
				     ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
				      == 0)))
					num_tolerated_disk_barrier_failures = 0;
				else if (num_tolerated_disk_barrier_failures > 1
					 &&
					 (flags & (BTRFS_BLOCK_GROUP_RAID1 |
						   BTRFS_BLOCK_GROUP_RAID10)))
				else if (num_tolerated_disk_barrier_failures > 1) {
					/* single-redundancy profiles tolerate one failure */
					if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
					    BTRFS_BLOCK_GROUP_RAID5 |
					    BTRFS_BLOCK_GROUP_RAID10)) {
						num_tolerated_disk_barrier_failures = 1;
					} else if (flags &
						   BTRFS_BLOCK_GROUP_RAID6) {
						/*
						 * RAID6, not RAID5: RAID5 is already
						 * matched by the mask above, so testing
						 * it again here could never be true.
						 */
						num_tolerated_disk_barrier_failures = 2;
					}
				}
			}
		}
		up_read(&sinfo->groups_sem);
@@ -3384,6 +3424,8 @@ int close_ctree(struct btrfs_root *root)
	btrfs_stop_workers(&fs_info->workers);
	btrfs_stop_workers(&fs_info->endio_workers);
	btrfs_stop_workers(&fs_info->endio_meta_workers);
	btrfs_stop_workers(&fs_info->endio_raid56_workers);
	btrfs_stop_workers(&fs_info->rmw_workers);
	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
	btrfs_stop_workers(&fs_info->endio_write_workers);
	btrfs_stop_workers(&fs_info->endio_freespace_worker);
@@ -3404,6 +3446,8 @@ int close_ctree(struct btrfs_root *root)
	bdi_destroy(&fs_info->bdi);
	cleanup_srcu_struct(&fs_info->subvol_srcu);

	btrfs_free_stripe_hash_table(fs_info);

	return 0;
}

+7 −0
Original line number Diff line number Diff line
@@ -25,6 +25,13 @@
#define BTRFS_SUPER_MIRROR_MAX	 3
#define BTRFS_SUPER_MIRROR_SHIFT 12

/*
 * Values for the 'metadata' argument of btrfs_bio_wq_end_io();
 * end_workqueue_bio() compares end_io_wq->metadata against these to
 * pick the worker pool that handles the bio completion.
 */
enum {
	/* file data */
	BTRFS_WQ_ENDIO_DATA = 0,
	/* normal metadata */
	BTRFS_WQ_ENDIO_METADATA = 1,
	/* writes to the free space cache area */
	BTRFS_WQ_ENDIO_FREE_SPACE = 2,
	/* raid parity work; queued on endio_raid56_workers for reads and writes */
	BTRFS_WQ_ENDIO_RAID56 = 3,
};

static inline u64 btrfs_sb_offset(int mirror)
{
	u64 start = 16 * 1024;
Loading